1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright (C) 2007 Oracle. All rights reserved. 4 */ 5 6 #include <linux/kernel.h> 7 #include <linux/bio.h> 8 #include <linux/file.h> 9 #include <linux/fs.h> 10 #include <linux/fsnotify.h> 11 #include <linux/pagemap.h> 12 #include <linux/highmem.h> 13 #include <linux/time.h> 14 #include <linux/string.h> 15 #include <linux/backing-dev.h> 16 #include <linux/mount.h> 17 #include <linux/namei.h> 18 #include <linux/writeback.h> 19 #include <linux/compat.h> 20 #include <linux/security.h> 21 #include <linux/xattr.h> 22 #include <linux/mm.h> 23 #include <linux/slab.h> 24 #include <linux/blkdev.h> 25 #include <linux/uuid.h> 26 #include <linux/btrfs.h> 27 #include <linux/uaccess.h> 28 #include <linux/iversion.h> 29 #include "ctree.h" 30 #include "disk-io.h" 31 #include "transaction.h" 32 #include "btrfs_inode.h" 33 #include "print-tree.h" 34 #include "volumes.h" 35 #include "locking.h" 36 #include "inode-map.h" 37 #include "backref.h" 38 #include "rcu-string.h" 39 #include "send.h" 40 #include "dev-replace.h" 41 #include "props.h" 42 #include "sysfs.h" 43 #include "qgroup.h" 44 #include "tree-log.h" 45 #include "compression.h" 46 47 #ifdef CONFIG_64BIT 48 /* If we have a 32-bit userspace and 64-bit kernel, then the UAPI 49 * structures are incorrect, as the timespec structure from userspace 50 * is 4 bytes too small. We define these alternatives here to teach 51 * the kernel about the 32-bit struct packing. 52 */ 53 struct btrfs_ioctl_timespec_32 { 54 __u64 sec; 55 __u32 nsec; 56 } __attribute__ ((__packed__)); 57 58 struct btrfs_ioctl_received_subvol_args_32 { 59 char uuid[BTRFS_UUID_SIZE]; /* in */ 60 __u64 stransid; /* in */ 61 __u64 rtransid; /* out */ 62 struct btrfs_ioctl_timespec_32 stime; /* in */ 63 struct btrfs_ioctl_timespec_32 rtime; /* out */ 64 __u64 flags; /* in */ 65 __u64 reserved[16]; /* in */ 66 } __attribute__ ((__packed__)); 67 68 #define BTRFS_IOC_SET_RECEIVED_SUBVOL_32 _IOWR(BTRFS_IOCTL_MAGIC, 37, \ 69 struct btrfs_ioctl_received_subvol_args_32) 70 #endif 71 72 #if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) 73 struct btrfs_ioctl_send_args_32 { 74 __s64 send_fd; /* in */ 75 __u64 clone_sources_count; /* in */ 76 compat_uptr_t clone_sources; /* in */ 77 __u64 parent_root; /* in */ 78 __u64 flags; /* in */ 79 __u64 reserved[4]; /* in */ 80 } __attribute__ ((__packed__)); 81 82 #define BTRFS_IOC_SEND_32 _IOW(BTRFS_IOCTL_MAGIC, 38, \ 83 struct btrfs_ioctl_send_args_32) 84 #endif 85 86 static int btrfs_clone(struct inode *src, struct inode *inode, 87 u64 off, u64 olen, u64 olen_aligned, u64 destoff, 88 int no_time_update); 89 90 /* Mask out flags that are inappropriate for the given type of inode. */ 91 static unsigned int btrfs_mask_fsflags_for_type(struct inode *inode, 92 unsigned int flags) 93 { 94 if (S_ISDIR(inode->i_mode)) 95 return flags; 96 else if (S_ISREG(inode->i_mode)) 97 return flags & ~FS_DIRSYNC_FL; 98 else 99 return flags & (FS_NODUMP_FL | FS_NOATIME_FL); 100 } 101 102 /* 103 * Export internal inode flags to the format expected by the FS_IOC_GETFLAGS 104 * ioctl. 105 */ 106 static unsigned int btrfs_inode_flags_to_fsflags(unsigned int flags) 107 { 108 unsigned int iflags = 0; 109 110 if (flags & BTRFS_INODE_SYNC) 111 iflags |= FS_SYNC_FL; 112 if (flags & BTRFS_INODE_IMMUTABLE) 113 iflags |= FS_IMMUTABLE_FL; 114 if (flags & BTRFS_INODE_APPEND) 115 iflags |= FS_APPEND_FL; 116 if (flags & BTRFS_INODE_NODUMP) 117 iflags |= FS_NODUMP_FL; 118 if (flags & BTRFS_INODE_NOATIME) 119 iflags |= FS_NOATIME_FL; 120 if (flags & BTRFS_INODE_DIRSYNC) 121 iflags |= FS_DIRSYNC_FL; 122 if (flags & BTRFS_INODE_NODATACOW) 123 iflags |= FS_NOCOW_FL; 124 125 if (flags & BTRFS_INODE_NOCOMPRESS) 126 iflags |= FS_NOCOMP_FL; 127 else if (flags & BTRFS_INODE_COMPRESS) 128 iflags |= FS_COMPR_FL; 129 130 return iflags; 131 } 132 133 /* 134 * Update inode->i_flags based on the btrfs internal flags. 135 */ 136 void btrfs_sync_inode_flags_to_i_flags(struct inode *inode) 137 { 138 struct btrfs_inode *binode = BTRFS_I(inode); 139 unsigned int new_fl = 0; 140 141 if (binode->flags & BTRFS_INODE_SYNC) 142 new_fl |= S_SYNC; 143 if (binode->flags & BTRFS_INODE_IMMUTABLE) 144 new_fl |= S_IMMUTABLE; 145 if (binode->flags & BTRFS_INODE_APPEND) 146 new_fl |= S_APPEND; 147 if (binode->flags & BTRFS_INODE_NOATIME) 148 new_fl |= S_NOATIME; 149 if (binode->flags & BTRFS_INODE_DIRSYNC) 150 new_fl |= S_DIRSYNC; 151 152 set_mask_bits(&inode->i_flags, 153 S_SYNC | S_APPEND | S_IMMUTABLE | S_NOATIME | S_DIRSYNC, 154 new_fl); 155 } 156 157 static int btrfs_ioctl_getflags(struct file *file, void __user *arg) 158 { 159 struct btrfs_inode *binode = BTRFS_I(file_inode(file)); 160 unsigned int flags = btrfs_inode_flags_to_fsflags(binode->flags); 161 162 if (copy_to_user(arg, &flags, sizeof(flags))) 163 return -EFAULT; 164 return 0; 165 } 166 167 /* Check if @flags are a supported and valid set of FS_*_FL flags */ 168 static int check_fsflags(unsigned int flags) 169 { 170 if (flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | \ 171 FS_NOATIME_FL | FS_NODUMP_FL | \ 172 FS_SYNC_FL | FS_DIRSYNC_FL | \ 173 FS_NOCOMP_FL | FS_COMPR_FL | 174 FS_NOCOW_FL)) 175 return -EOPNOTSUPP; 176 177 if ((flags & FS_NOCOMP_FL) && (flags & FS_COMPR_FL)) 178 return -EINVAL; 179 180 return 0; 181 } 182 183 static int btrfs_ioctl_setflags(struct file *file, void __user *arg) 184 { 185 struct inode *inode = file_inode(file); 186 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 187 struct btrfs_inode *binode = BTRFS_I(inode); 188 struct btrfs_root *root = binode->root; 189 struct btrfs_trans_handle *trans; 190 unsigned int fsflags, old_fsflags; 191 int ret; 192 u64 old_flags; 193 unsigned int old_i_flags; 194 umode_t mode; 195 196 if (!inode_owner_or_capable(inode)) 197 return -EPERM; 198 199 if (btrfs_root_readonly(root)) 200 return -EROFS; 201 202 if (copy_from_user(&fsflags, arg, sizeof(fsflags))) 203 return -EFAULT; 204 205 ret = check_fsflags(fsflags); 206 if (ret) 207 return ret; 208 209 ret = mnt_want_write_file(file); 210 if (ret) 211 return ret; 212 213 inode_lock(inode); 214 215 old_flags = binode->flags; 216 old_i_flags = inode->i_flags; 217 mode = inode->i_mode; 218 219 fsflags = btrfs_mask_fsflags_for_type(inode, fsflags); 220 old_fsflags = btrfs_inode_flags_to_fsflags(binode->flags); 221 if ((fsflags ^ old_fsflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) { 222 if (!capable(CAP_LINUX_IMMUTABLE)) { 223 ret = -EPERM; 224 goto out_unlock; 225 } 226 } 227 228 if (fsflags & FS_SYNC_FL) 229 binode->flags |= BTRFS_INODE_SYNC; 230 else 231 binode->flags &= ~BTRFS_INODE_SYNC; 232 if (fsflags & FS_IMMUTABLE_FL) 233 binode->flags |= BTRFS_INODE_IMMUTABLE; 234 else 235 binode->flags &= ~BTRFS_INODE_IMMUTABLE; 236 if (fsflags & FS_APPEND_FL) 237 binode->flags |= BTRFS_INODE_APPEND; 238 else 239 binode->flags &= ~BTRFS_INODE_APPEND; 240 if (fsflags & FS_NODUMP_FL) 241 binode->flags |= BTRFS_INODE_NODUMP; 242 else 243 binode->flags &= ~BTRFS_INODE_NODUMP; 244 if (fsflags & FS_NOATIME_FL) 245 binode->flags |= BTRFS_INODE_NOATIME; 246 else 247 binode->flags &= ~BTRFS_INODE_NOATIME; 248 if (fsflags & FS_DIRSYNC_FL) 249 binode->flags |= BTRFS_INODE_DIRSYNC; 250 else 251 binode->flags &= ~BTRFS_INODE_DIRSYNC; 252 if (fsflags & FS_NOCOW_FL) { 253 if (S_ISREG(mode)) { 254 /* 255 * It's safe to turn csums off here, no extents exist. 256 * Otherwise we want the flag to reflect the real COW 257 * status of the file and will not set it. 258 */ 259 if (inode->i_size == 0) 260 binode->flags |= BTRFS_INODE_NODATACOW 261 | BTRFS_INODE_NODATASUM; 262 } else { 263 binode->flags |= BTRFS_INODE_NODATACOW; 264 } 265 } else { 266 /* 267 * Revert back under same assumptions as above 268 */ 269 if (S_ISREG(mode)) { 270 if (inode->i_size == 0) 271 binode->flags &= ~(BTRFS_INODE_NODATACOW 272 | BTRFS_INODE_NODATASUM); 273 } else { 274 binode->flags &= ~BTRFS_INODE_NODATACOW; 275 } 276 } 277 278 /* 279 * The COMPRESS flag can only be changed by users, while the NOCOMPRESS 280 * flag may be changed automatically if compression code won't make 281 * things smaller. 282 */ 283 if (fsflags & FS_NOCOMP_FL) { 284 binode->flags &= ~BTRFS_INODE_COMPRESS; 285 binode->flags |= BTRFS_INODE_NOCOMPRESS; 286 287 ret = btrfs_set_prop(inode, "btrfs.compression", NULL, 0, 0); 288 if (ret && ret != -ENODATA) 289 goto out_drop; 290 } else if (fsflags & FS_COMPR_FL) { 291 const char *comp; 292 293 binode->flags |= BTRFS_INODE_COMPRESS; 294 binode->flags &= ~BTRFS_INODE_NOCOMPRESS; 295 296 comp = btrfs_compress_type2str(fs_info->compress_type); 297 if (!comp || comp[0] == 0) 298 comp = btrfs_compress_type2str(BTRFS_COMPRESS_ZLIB); 299 300 ret = btrfs_set_prop(inode, "btrfs.compression", 301 comp, strlen(comp), 0); 302 if (ret) 303 goto out_drop; 304 305 } else { 306 ret = btrfs_set_prop(inode, "btrfs.compression", NULL, 0, 0); 307 if (ret && ret != -ENODATA) 308 goto out_drop; 309 binode->flags &= ~(BTRFS_INODE_COMPRESS | BTRFS_INODE_NOCOMPRESS); 310 } 311 312 trans = btrfs_start_transaction(root, 1); 313 if (IS_ERR(trans)) { 314 ret = PTR_ERR(trans); 315 goto out_drop; 316 } 317 318 btrfs_sync_inode_flags_to_i_flags(inode); 319 inode_inc_iversion(inode); 320 inode->i_ctime = current_time(inode); 321 ret = btrfs_update_inode(trans, root, inode); 322 323 btrfs_end_transaction(trans); 324 out_drop: 325 if (ret) { 326 binode->flags = old_flags; 327 inode->i_flags = old_i_flags; 328 } 329 330 out_unlock: 331 inode_unlock(inode); 332 mnt_drop_write_file(file); 333 return ret; 334 } 335 336 /* 337 * Translate btrfs internal inode flags to xflags as expected by the 338 * FS_IOC_FSGETXATT ioctl. Filter only the supported ones, unknown flags are 339 * silently dropped. 340 */ 341 static unsigned int btrfs_inode_flags_to_xflags(unsigned int flags) 342 { 343 unsigned int xflags = 0; 344 345 if (flags & BTRFS_INODE_APPEND) 346 xflags |= FS_XFLAG_APPEND; 347 if (flags & BTRFS_INODE_IMMUTABLE) 348 xflags |= FS_XFLAG_IMMUTABLE; 349 if (flags & BTRFS_INODE_NOATIME) 350 xflags |= FS_XFLAG_NOATIME; 351 if (flags & BTRFS_INODE_NODUMP) 352 xflags |= FS_XFLAG_NODUMP; 353 if (flags & BTRFS_INODE_SYNC) 354 xflags |= FS_XFLAG_SYNC; 355 356 return xflags; 357 } 358 359 /* Check if @flags are a supported and valid set of FS_XFLAGS_* flags */ 360 static int check_xflags(unsigned int flags) 361 { 362 if (flags & ~(FS_XFLAG_APPEND | FS_XFLAG_IMMUTABLE | FS_XFLAG_NOATIME | 363 FS_XFLAG_NODUMP | FS_XFLAG_SYNC)) 364 return -EOPNOTSUPP; 365 return 0; 366 } 367 368 /* 369 * Set the xflags from the internal inode flags. The remaining items of fsxattr 370 * are zeroed. 371 */ 372 static int btrfs_ioctl_fsgetxattr(struct file *file, void __user *arg) 373 { 374 struct btrfs_inode *binode = BTRFS_I(file_inode(file)); 375 struct fsxattr fa; 376 377 memset(&fa, 0, sizeof(fa)); 378 fa.fsx_xflags = btrfs_inode_flags_to_xflags(binode->flags); 379 380 if (copy_to_user(arg, &fa, sizeof(fa))) 381 return -EFAULT; 382 383 return 0; 384 } 385 386 static int btrfs_ioctl_fssetxattr(struct file *file, void __user *arg) 387 { 388 struct inode *inode = file_inode(file); 389 struct btrfs_inode *binode = BTRFS_I(inode); 390 struct btrfs_root *root = binode->root; 391 struct btrfs_trans_handle *trans; 392 struct fsxattr fa; 393 unsigned old_flags; 394 unsigned old_i_flags; 395 int ret = 0; 396 397 if (!inode_owner_or_capable(inode)) 398 return -EPERM; 399 400 if (btrfs_root_readonly(root)) 401 return -EROFS; 402 403 memset(&fa, 0, sizeof(fa)); 404 if (copy_from_user(&fa, arg, sizeof(fa))) 405 return -EFAULT; 406 407 ret = check_xflags(fa.fsx_xflags); 408 if (ret) 409 return ret; 410 411 if (fa.fsx_extsize != 0 || fa.fsx_projid != 0 || fa.fsx_cowextsize != 0) 412 return -EOPNOTSUPP; 413 414 ret = mnt_want_write_file(file); 415 if (ret) 416 return ret; 417 418 inode_lock(inode); 419 420 old_flags = binode->flags; 421 old_i_flags = inode->i_flags; 422 423 /* We need the capabilities to change append-only or immutable inode */ 424 if (((old_flags & (BTRFS_INODE_APPEND | BTRFS_INODE_IMMUTABLE)) || 425 (fa.fsx_xflags & (FS_XFLAG_APPEND | FS_XFLAG_IMMUTABLE))) && 426 !capable(CAP_LINUX_IMMUTABLE)) { 427 ret = -EPERM; 428 goto out_unlock; 429 } 430 431 if (fa.fsx_xflags & FS_XFLAG_SYNC) 432 binode->flags |= BTRFS_INODE_SYNC; 433 else 434 binode->flags &= ~BTRFS_INODE_SYNC; 435 if (fa.fsx_xflags & FS_XFLAG_IMMUTABLE) 436 binode->flags |= BTRFS_INODE_IMMUTABLE; 437 else 438 binode->flags &= ~BTRFS_INODE_IMMUTABLE; 439 if (fa.fsx_xflags & FS_XFLAG_APPEND) 440 binode->flags |= BTRFS_INODE_APPEND; 441 else 442 binode->flags &= ~BTRFS_INODE_APPEND; 443 if (fa.fsx_xflags & FS_XFLAG_NODUMP) 444 binode->flags |= BTRFS_INODE_NODUMP; 445 else 446 binode->flags &= ~BTRFS_INODE_NODUMP; 447 if (fa.fsx_xflags & FS_XFLAG_NOATIME) 448 binode->flags |= BTRFS_INODE_NOATIME; 449 else 450 binode->flags &= ~BTRFS_INODE_NOATIME; 451 452 /* 1 item for the inode */ 453 trans = btrfs_start_transaction(root, 1); 454 if (IS_ERR(trans)) { 455 ret = PTR_ERR(trans); 456 goto out_unlock; 457 } 458 459 btrfs_sync_inode_flags_to_i_flags(inode); 460 inode_inc_iversion(inode); 461 inode->i_ctime = current_time(inode); 462 ret = btrfs_update_inode(trans, root, inode); 463 464 btrfs_end_transaction(trans); 465 466 out_unlock: 467 if (ret) { 468 binode->flags = old_flags; 469 inode->i_flags = old_i_flags; 470 } 471 472 inode_unlock(inode); 473 mnt_drop_write_file(file); 474 475 return ret; 476 } 477 478 static int btrfs_ioctl_getversion(struct file *file, int __user *arg) 479 { 480 struct inode *inode = file_inode(file); 481 482 return put_user(inode->i_generation, arg); 483 } 484 485 static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg) 486 { 487 struct inode *inode = file_inode(file); 488 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 489 struct btrfs_device *device; 490 struct request_queue *q; 491 struct fstrim_range range; 492 u64 minlen = ULLONG_MAX; 493 u64 num_devices = 0; 494 u64 total_bytes = btrfs_super_total_bytes(fs_info->super_copy); 495 int ret; 496 497 if (!capable(CAP_SYS_ADMIN)) 498 return -EPERM; 499 500 rcu_read_lock(); 501 list_for_each_entry_rcu(device, &fs_info->fs_devices->devices, 502 dev_list) { 503 if (!device->bdev) 504 continue; 505 q = bdev_get_queue(device->bdev); 506 if (blk_queue_discard(q)) { 507 num_devices++; 508 minlen = min_t(u64, q->limits.discard_granularity, 509 minlen); 510 } 511 } 512 rcu_read_unlock(); 513 514 if (!num_devices) 515 return -EOPNOTSUPP; 516 if (copy_from_user(&range, arg, sizeof(range))) 517 return -EFAULT; 518 if (range.start > total_bytes || 519 range.len < fs_info->sb->s_blocksize) 520 return -EINVAL; 521 522 range.len = min(range.len, total_bytes - range.start); 523 range.minlen = max(range.minlen, minlen); 524 ret = btrfs_trim_fs(fs_info, &range); 525 if (ret < 0) 526 return ret; 527 528 if (copy_to_user(arg, &range, sizeof(range))) 529 return -EFAULT; 530 531 return 0; 532 } 533 534 int btrfs_is_empty_uuid(u8 *uuid) 535 { 536 int i; 537 538 for (i = 0; i < BTRFS_UUID_SIZE; i++) { 539 if (uuid[i]) 540 return 0; 541 } 542 return 1; 543 } 544 545 static noinline int create_subvol(struct inode *dir, 546 struct dentry *dentry, 547 const char *name, int namelen, 548 u64 *async_transid, 549 struct btrfs_qgroup_inherit *inherit) 550 { 551 struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); 552 struct btrfs_trans_handle *trans; 553 struct btrfs_key key; 554 struct btrfs_root_item *root_item; 555 struct btrfs_inode_item *inode_item; 556 struct extent_buffer *leaf; 557 struct btrfs_root *root = BTRFS_I(dir)->root; 558 struct btrfs_root *new_root; 559 struct btrfs_block_rsv block_rsv; 560 struct timespec64 cur_time = current_time(dir); 561 struct inode *inode; 562 int ret; 563 int err; 564 u64 objectid; 565 u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID; 566 u64 index = 0; 567 uuid_le new_uuid; 568 569 root_item = kzalloc(sizeof(*root_item), GFP_KERNEL); 570 if (!root_item) 571 return -ENOMEM; 572 573 ret = btrfs_find_free_objectid(fs_info->tree_root, &objectid); 574 if (ret) 575 goto fail_free; 576 577 /* 578 * Don't create subvolume whose level is not zero. Or qgroup will be 579 * screwed up since it assumes subvolume qgroup's level to be 0. 580 */ 581 if (btrfs_qgroup_level(objectid)) { 582 ret = -ENOSPC; 583 goto fail_free; 584 } 585 586 btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP); 587 /* 588 * The same as the snapshot creation, please see the comment 589 * of create_snapshot(). 590 */ 591 ret = btrfs_subvolume_reserve_metadata(root, &block_rsv, 8, false); 592 if (ret) 593 goto fail_free; 594 595 trans = btrfs_start_transaction(root, 0); 596 if (IS_ERR(trans)) { 597 ret = PTR_ERR(trans); 598 btrfs_subvolume_release_metadata(fs_info, &block_rsv); 599 goto fail_free; 600 } 601 trans->block_rsv = &block_rsv; 602 trans->bytes_reserved = block_rsv.size; 603 604 ret = btrfs_qgroup_inherit(trans, 0, objectid, inherit); 605 if (ret) 606 goto fail; 607 608 leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0); 609 if (IS_ERR(leaf)) { 610 ret = PTR_ERR(leaf); 611 goto fail; 612 } 613 614 btrfs_mark_buffer_dirty(leaf); 615 616 inode_item = &root_item->inode; 617 btrfs_set_stack_inode_generation(inode_item, 1); 618 btrfs_set_stack_inode_size(inode_item, 3); 619 btrfs_set_stack_inode_nlink(inode_item, 1); 620 btrfs_set_stack_inode_nbytes(inode_item, 621 fs_info->nodesize); 622 btrfs_set_stack_inode_mode(inode_item, S_IFDIR | 0755); 623 624 btrfs_set_root_flags(root_item, 0); 625 btrfs_set_root_limit(root_item, 0); 626 btrfs_set_stack_inode_flags(inode_item, BTRFS_INODE_ROOT_ITEM_INIT); 627 628 btrfs_set_root_bytenr(root_item, leaf->start); 629 btrfs_set_root_generation(root_item, trans->transid); 630 btrfs_set_root_level(root_item, 0); 631 btrfs_set_root_refs(root_item, 1); 632 btrfs_set_root_used(root_item, leaf->len); 633 btrfs_set_root_last_snapshot(root_item, 0); 634 635 btrfs_set_root_generation_v2(root_item, 636 btrfs_root_generation(root_item)); 637 uuid_le_gen(&new_uuid); 638 memcpy(root_item->uuid, new_uuid.b, BTRFS_UUID_SIZE); 639 btrfs_set_stack_timespec_sec(&root_item->otime, cur_time.tv_sec); 640 btrfs_set_stack_timespec_nsec(&root_item->otime, cur_time.tv_nsec); 641 root_item->ctime = root_item->otime; 642 btrfs_set_root_ctransid(root_item, trans->transid); 643 btrfs_set_root_otransid(root_item, trans->transid); 644 645 btrfs_tree_unlock(leaf); 646 free_extent_buffer(leaf); 647 leaf = NULL; 648 649 btrfs_set_root_dirid(root_item, new_dirid); 650 651 key.objectid = objectid; 652 key.offset = 0; 653 key.type = BTRFS_ROOT_ITEM_KEY; 654 ret = btrfs_insert_root(trans, fs_info->tree_root, &key, 655 root_item); 656 if (ret) 657 goto fail; 658 659 key.offset = (u64)-1; 660 new_root = btrfs_read_fs_root_no_name(fs_info, &key); 661 if (IS_ERR(new_root)) { 662 ret = PTR_ERR(new_root); 663 btrfs_abort_transaction(trans, ret); 664 goto fail; 665 } 666 667 btrfs_record_root_in_trans(trans, new_root); 668 669 ret = btrfs_create_subvol_root(trans, new_root, root, new_dirid); 670 if (ret) { 671 /* We potentially lose an unused inode item here */ 672 btrfs_abort_transaction(trans, ret); 673 goto fail; 674 } 675 676 mutex_lock(&new_root->objectid_mutex); 677 new_root->highest_objectid = new_dirid; 678 mutex_unlock(&new_root->objectid_mutex); 679 680 /* 681 * insert the directory item 682 */ 683 ret = btrfs_set_inode_index(BTRFS_I(dir), &index); 684 if (ret) { 685 btrfs_abort_transaction(trans, ret); 686 goto fail; 687 } 688 689 ret = btrfs_insert_dir_item(trans, root, 690 name, namelen, BTRFS_I(dir), &key, 691 BTRFS_FT_DIR, index); 692 if (ret) { 693 btrfs_abort_transaction(trans, ret); 694 goto fail; 695 } 696 697 btrfs_i_size_write(BTRFS_I(dir), dir->i_size + namelen * 2); 698 ret = btrfs_update_inode(trans, root, dir); 699 BUG_ON(ret); 700 701 ret = btrfs_add_root_ref(trans, objectid, root->root_key.objectid, 702 btrfs_ino(BTRFS_I(dir)), index, name, namelen); 703 BUG_ON(ret); 704 705 ret = btrfs_uuid_tree_add(trans, root_item->uuid, 706 BTRFS_UUID_KEY_SUBVOL, objectid); 707 if (ret) 708 btrfs_abort_transaction(trans, ret); 709 710 fail: 711 kfree(root_item); 712 trans->block_rsv = NULL; 713 trans->bytes_reserved = 0; 714 btrfs_subvolume_release_metadata(fs_info, &block_rsv); 715 716 if (async_transid) { 717 *async_transid = trans->transid; 718 err = btrfs_commit_transaction_async(trans, 1); 719 if (err) 720 err = btrfs_commit_transaction(trans); 721 } else { 722 err = btrfs_commit_transaction(trans); 723 } 724 if (err && !ret) 725 ret = err; 726 727 if (!ret) { 728 inode = btrfs_lookup_dentry(dir, dentry); 729 if (IS_ERR(inode)) 730 return PTR_ERR(inode); 731 d_instantiate(dentry, inode); 732 } 733 return ret; 734 735 fail_free: 736 kfree(root_item); 737 return ret; 738 } 739 740 static int create_snapshot(struct btrfs_root *root, struct inode *dir, 741 struct dentry *dentry, 742 u64 *async_transid, bool readonly, 743 struct btrfs_qgroup_inherit *inherit) 744 { 745 struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); 746 struct inode *inode; 747 struct btrfs_pending_snapshot *pending_snapshot; 748 struct btrfs_trans_handle *trans; 749 int ret; 750 751 if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state)) 752 return -EINVAL; 753 754 pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_KERNEL); 755 if (!pending_snapshot) 756 return -ENOMEM; 757 758 pending_snapshot->root_item = kzalloc(sizeof(struct btrfs_root_item), 759 GFP_KERNEL); 760 pending_snapshot->path = btrfs_alloc_path(); 761 if (!pending_snapshot->root_item || !pending_snapshot->path) { 762 ret = -ENOMEM; 763 goto free_pending; 764 } 765 766 atomic_inc(&root->will_be_snapshotted); 767 smp_mb__after_atomic(); 768 /* wait for no snapshot writes */ 769 wait_event(root->subv_writers->wait, 770 percpu_counter_sum(&root->subv_writers->counter) == 0); 771 772 ret = btrfs_start_delalloc_inodes(root); 773 if (ret) 774 goto dec_and_free; 775 776 btrfs_wait_ordered_extents(root, U64_MAX, 0, (u64)-1); 777 778 btrfs_init_block_rsv(&pending_snapshot->block_rsv, 779 BTRFS_BLOCK_RSV_TEMP); 780 /* 781 * 1 - parent dir inode 782 * 2 - dir entries 783 * 1 - root item 784 * 2 - root ref/backref 785 * 1 - root of snapshot 786 * 1 - UUID item 787 */ 788 ret = btrfs_subvolume_reserve_metadata(BTRFS_I(dir)->root, 789 &pending_snapshot->block_rsv, 8, 790 false); 791 if (ret) 792 goto dec_and_free; 793 794 pending_snapshot->dentry = dentry; 795 pending_snapshot->root = root; 796 pending_snapshot->readonly = readonly; 797 pending_snapshot->dir = dir; 798 pending_snapshot->inherit = inherit; 799 800 trans = btrfs_start_transaction(root, 0); 801 if (IS_ERR(trans)) { 802 ret = PTR_ERR(trans); 803 goto fail; 804 } 805 806 spin_lock(&fs_info->trans_lock); 807 list_add(&pending_snapshot->list, 808 &trans->transaction->pending_snapshots); 809 spin_unlock(&fs_info->trans_lock); 810 if (async_transid) { 811 *async_transid = trans->transid; 812 ret = btrfs_commit_transaction_async(trans, 1); 813 if (ret) 814 ret = btrfs_commit_transaction(trans); 815 } else { 816 ret = btrfs_commit_transaction(trans); 817 } 818 if (ret) 819 goto fail; 820 821 ret = pending_snapshot->error; 822 if (ret) 823 goto fail; 824 825 ret = btrfs_orphan_cleanup(pending_snapshot->snap); 826 if (ret) 827 goto fail; 828 829 inode = btrfs_lookup_dentry(d_inode(dentry->d_parent), dentry); 830 if (IS_ERR(inode)) { 831 ret = PTR_ERR(inode); 832 goto fail; 833 } 834 835 d_instantiate(dentry, inode); 836 ret = 0; 837 fail: 838 btrfs_subvolume_release_metadata(fs_info, &pending_snapshot->block_rsv); 839 dec_and_free: 840 if (atomic_dec_and_test(&root->will_be_snapshotted)) 841 wake_up_var(&root->will_be_snapshotted); 842 free_pending: 843 kfree(pending_snapshot->root_item); 844 btrfs_free_path(pending_snapshot->path); 845 kfree(pending_snapshot); 846 847 return ret; 848 } 849 850 /* copy of may_delete in fs/namei.c() 851 * Check whether we can remove a link victim from directory dir, check 852 * whether the type of victim is right. 853 * 1. We can't do it if dir is read-only (done in permission()) 854 * 2. We should have write and exec permissions on dir 855 * 3. We can't remove anything from append-only dir 856 * 4. We can't do anything with immutable dir (done in permission()) 857 * 5. If the sticky bit on dir is set we should either 858 * a. be owner of dir, or 859 * b. be owner of victim, or 860 * c. have CAP_FOWNER capability 861 * 6. If the victim is append-only or immutable we can't do anything with 862 * links pointing to it. 863 * 7. If we were asked to remove a directory and victim isn't one - ENOTDIR. 864 * 8. If we were asked to remove a non-directory and victim isn't one - EISDIR. 865 * 9. We can't remove a root or mountpoint. 866 * 10. We don't allow removal of NFS sillyrenamed files; it's handled by 867 * nfs_async_unlink(). 868 */ 869 870 static int btrfs_may_delete(struct inode *dir, struct dentry *victim, int isdir) 871 { 872 int error; 873 874 if (d_really_is_negative(victim)) 875 return -ENOENT; 876 877 BUG_ON(d_inode(victim->d_parent) != dir); 878 audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE); 879 880 error = inode_permission(dir, MAY_WRITE | MAY_EXEC); 881 if (error) 882 return error; 883 if (IS_APPEND(dir)) 884 return -EPERM; 885 if (check_sticky(dir, d_inode(victim)) || IS_APPEND(d_inode(victim)) || 886 IS_IMMUTABLE(d_inode(victim)) || IS_SWAPFILE(d_inode(victim))) 887 return -EPERM; 888 if (isdir) { 889 if (!d_is_dir(victim)) 890 return -ENOTDIR; 891 if (IS_ROOT(victim)) 892 return -EBUSY; 893 } else if (d_is_dir(victim)) 894 return -EISDIR; 895 if (IS_DEADDIR(dir)) 896 return -ENOENT; 897 if (victim->d_flags & DCACHE_NFSFS_RENAMED) 898 return -EBUSY; 899 return 0; 900 } 901 902 /* copy of may_create in fs/namei.c() */ 903 static inline int btrfs_may_create(struct inode *dir, struct dentry *child) 904 { 905 if (d_really_is_positive(child)) 906 return -EEXIST; 907 if (IS_DEADDIR(dir)) 908 return -ENOENT; 909 return inode_permission(dir, MAY_WRITE | MAY_EXEC); 910 } 911 912 /* 913 * Create a new subvolume below @parent. This is largely modeled after 914 * sys_mkdirat and vfs_mkdir, but we only do a single component lookup 915 * inside this filesystem so it's quite a bit simpler. 916 */ 917 static noinline int btrfs_mksubvol(const struct path *parent, 918 const char *name, int namelen, 919 struct btrfs_root *snap_src, 920 u64 *async_transid, bool readonly, 921 struct btrfs_qgroup_inherit *inherit) 922 { 923 struct inode *dir = d_inode(parent->dentry); 924 struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); 925 struct dentry *dentry; 926 int error; 927 928 error = down_write_killable_nested(&dir->i_rwsem, I_MUTEX_PARENT); 929 if (error == -EINTR) 930 return error; 931 932 dentry = lookup_one_len(name, parent->dentry, namelen); 933 error = PTR_ERR(dentry); 934 if (IS_ERR(dentry)) 935 goto out_unlock; 936 937 error = btrfs_may_create(dir, dentry); 938 if (error) 939 goto out_dput; 940 941 /* 942 * even if this name doesn't exist, we may get hash collisions. 943 * check for them now when we can safely fail 944 */ 945 error = btrfs_check_dir_item_collision(BTRFS_I(dir)->root, 946 dir->i_ino, name, 947 namelen); 948 if (error) 949 goto out_dput; 950 951 down_read(&fs_info->subvol_sem); 952 953 if (btrfs_root_refs(&BTRFS_I(dir)->root->root_item) == 0) 954 goto out_up_read; 955 956 if (snap_src) { 957 error = create_snapshot(snap_src, dir, dentry, 958 async_transid, readonly, inherit); 959 } else { 960 error = create_subvol(dir, dentry, name, namelen, 961 async_transid, inherit); 962 } 963 if (!error) 964 fsnotify_mkdir(dir, dentry); 965 out_up_read: 966 up_read(&fs_info->subvol_sem); 967 out_dput: 968 dput(dentry); 969 out_unlock: 970 inode_unlock(dir); 971 return error; 972 } 973 974 /* 975 * When we're defragging a range, we don't want to kick it off again 976 * if it is really just waiting for delalloc to send it down. 977 * If we find a nice big extent or delalloc range for the bytes in the 978 * file you want to defrag, we return 0 to let you know to skip this 979 * part of the file 980 */ 981 static int check_defrag_in_cache(struct inode *inode, u64 offset, u32 thresh) 982 { 983 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 984 struct extent_map *em = NULL; 985 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 986 u64 end; 987 988 read_lock(&em_tree->lock); 989 em = lookup_extent_mapping(em_tree, offset, PAGE_SIZE); 990 read_unlock(&em_tree->lock); 991 992 if (em) { 993 end = extent_map_end(em); 994 free_extent_map(em); 995 if (end - offset > thresh) 996 return 0; 997 } 998 /* if we already have a nice delalloc here, just stop */ 999 thresh /= 2; 1000 end = count_range_bits(io_tree, &offset, offset + thresh, 1001 thresh, EXTENT_DELALLOC, 1); 1002 if (end >= thresh) 1003 return 0; 1004 return 1; 1005 } 1006 1007 /* 1008 * helper function to walk through a file and find extents 1009 * newer than a specific transid, and smaller than thresh. 1010 * 1011 * This is used by the defragging code to find new and small 1012 * extents 1013 */ 1014 static int find_new_extents(struct btrfs_root *root, 1015 struct inode *inode, u64 newer_than, 1016 u64 *off, u32 thresh) 1017 { 1018 struct btrfs_path *path; 1019 struct btrfs_key min_key; 1020 struct extent_buffer *leaf; 1021 struct btrfs_file_extent_item *extent; 1022 int type; 1023 int ret; 1024 u64 ino = btrfs_ino(BTRFS_I(inode)); 1025 1026 path = btrfs_alloc_path(); 1027 if (!path) 1028 return -ENOMEM; 1029 1030 min_key.objectid = ino; 1031 min_key.type = BTRFS_EXTENT_DATA_KEY; 1032 min_key.offset = *off; 1033 1034 while (1) { 1035 ret = btrfs_search_forward(root, &min_key, path, newer_than); 1036 if (ret != 0) 1037 goto none; 1038 process_slot: 1039 if (min_key.objectid != ino) 1040 goto none; 1041 if (min_key.type != BTRFS_EXTENT_DATA_KEY) 1042 goto none; 1043 1044 leaf = path->nodes[0]; 1045 extent = btrfs_item_ptr(leaf, path->slots[0], 1046 struct btrfs_file_extent_item); 1047 1048 type = btrfs_file_extent_type(leaf, extent); 1049 if (type == BTRFS_FILE_EXTENT_REG && 1050 btrfs_file_extent_num_bytes(leaf, extent) < thresh && 1051 check_defrag_in_cache(inode, min_key.offset, thresh)) { 1052 *off = min_key.offset; 1053 btrfs_free_path(path); 1054 return 0; 1055 } 1056 1057 path->slots[0]++; 1058 if (path->slots[0] < btrfs_header_nritems(leaf)) { 1059 btrfs_item_key_to_cpu(leaf, &min_key, path->slots[0]); 1060 goto process_slot; 1061 } 1062 1063 if (min_key.offset == (u64)-1) 1064 goto none; 1065 1066 min_key.offset++; 1067 btrfs_release_path(path); 1068 } 1069 none: 1070 btrfs_free_path(path); 1071 return -ENOENT; 1072 } 1073 1074 static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start) 1075 { 1076 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 1077 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 1078 struct extent_map *em; 1079 u64 len = PAGE_SIZE; 1080 1081 /* 1082 * hopefully we have this extent in the tree already, try without 1083 * the full extent lock 1084 */ 1085 read_lock(&em_tree->lock); 1086 em = lookup_extent_mapping(em_tree, start, len); 1087 read_unlock(&em_tree->lock); 1088 1089 if (!em) { 1090 struct extent_state *cached = NULL; 1091 u64 end = start + len - 1; 1092 1093 /* get the big lock and read metadata off disk */ 1094 lock_extent_bits(io_tree, start, end, &cached); 1095 em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len, 0); 1096 unlock_extent_cached(io_tree, start, end, &cached); 1097 1098 if (IS_ERR(em)) 1099 return NULL; 1100 } 1101 1102 return em; 1103 } 1104 1105 static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em) 1106 { 1107 struct extent_map *next; 1108 bool ret = true; 1109 1110 /* this is the last extent */ 1111 if (em->start + em->len >= i_size_read(inode)) 1112 return false; 1113 1114 next = defrag_lookup_extent(inode, em->start + em->len); 1115 if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE) 1116 ret = false; 1117 else if ((em->block_start + em->block_len == next->block_start) && 1118 (em->block_len > SZ_128K && next->block_len > SZ_128K)) 1119 ret = false; 1120 1121 free_extent_map(next); 1122 return ret; 1123 } 1124 1125 static int should_defrag_range(struct inode *inode, u64 start, u32 thresh, 1126 u64 *last_len, u64 *skip, u64 *defrag_end, 1127 int compress) 1128 { 1129 struct extent_map *em; 1130 int ret = 1; 1131 bool next_mergeable = true; 1132 bool prev_mergeable = true; 1133 1134 /* 1135 * make sure that once we start defragging an extent, we keep on 1136 * defragging it 1137 */ 1138 if (start < *defrag_end) 1139 return 1; 1140 1141 *skip = 0; 1142 1143 em = defrag_lookup_extent(inode, start); 1144 if (!em) 1145 return 0; 1146 1147 /* this will cover holes, and inline extents */ 1148 if (em->block_start >= EXTENT_MAP_LAST_BYTE) { 1149 ret = 0; 1150 goto out; 1151 } 1152 1153 if (!*defrag_end) 1154 prev_mergeable = false; 1155 1156 next_mergeable = defrag_check_next_extent(inode, em); 1157 /* 1158 * we hit a real extent, if it is big or the next extent is not a 1159 * real extent, don't bother defragging it 1160 */ 1161 if (!compress && (*last_len == 0 || *last_len >= thresh) && 1162 (em->len >= thresh || (!next_mergeable && !prev_mergeable))) 1163 ret = 0; 1164 out: 1165 /* 1166 * last_len ends up being a counter of how many bytes we've defragged. 1167 * every time we choose not to defrag an extent, we reset *last_len 1168 * so that the next tiny extent will force a defrag. 1169 * 1170 * The end result of this is that tiny extents before a single big 1171 * extent will force at least part of that big extent to be defragged. 1172 */ 1173 if (ret) { 1174 *defrag_end = extent_map_end(em); 1175 } else { 1176 *last_len = 0; 1177 *skip = extent_map_end(em); 1178 *defrag_end = 0; 1179 } 1180 1181 free_extent_map(em); 1182 return ret; 1183 } 1184 1185 /* 1186 * it doesn't do much good to defrag one or two pages 1187 * at a time. This pulls in a nice chunk of pages 1188 * to COW and defrag. 1189 * 1190 * It also makes sure the delalloc code has enough 1191 * dirty data to avoid making new small extents as part 1192 * of the defrag 1193 * 1194 * It's a good idea to start RA on this range 1195 * before calling this. 1196 */ 1197 static int cluster_pages_for_defrag(struct inode *inode, 1198 struct page **pages, 1199 unsigned long start_index, 1200 unsigned long num_pages) 1201 { 1202 unsigned long file_end; 1203 u64 isize = i_size_read(inode); 1204 u64 page_start; 1205 u64 page_end; 1206 u64 page_cnt; 1207 int ret; 1208 int i; 1209 int i_done; 1210 struct btrfs_ordered_extent *ordered; 1211 struct extent_state *cached_state = NULL; 1212 struct extent_io_tree *tree; 1213 struct extent_changeset *data_reserved = NULL; 1214 gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); 1215 1216 file_end = (isize - 1) >> PAGE_SHIFT; 1217 if (!isize || start_index > file_end) 1218 return 0; 1219 1220 page_cnt = min_t(u64, (u64)num_pages, (u64)file_end - start_index + 1); 1221 1222 ret = btrfs_delalloc_reserve_space(inode, &data_reserved, 1223 start_index << PAGE_SHIFT, 1224 page_cnt << PAGE_SHIFT); 1225 if (ret) 1226 return ret; 1227 i_done = 0; 1228 tree = &BTRFS_I(inode)->io_tree; 1229 1230 /* step one, lock all the pages */ 1231 for (i = 0; i < page_cnt; i++) { 1232 struct page *page; 1233 again: 1234 page = find_or_create_page(inode->i_mapping, 1235 start_index + i, mask); 1236 if (!page) 1237 break; 1238 1239 page_start = page_offset(page); 1240 page_end = page_start + PAGE_SIZE - 1; 1241 while (1) { 1242 lock_extent_bits(tree, page_start, page_end, 1243 &cached_state); 1244 ordered = btrfs_lookup_ordered_extent(inode, 1245 page_start); 1246 unlock_extent_cached(tree, page_start, page_end, 1247 &cached_state); 1248 if (!ordered) 1249 break; 1250 1251 unlock_page(page); 1252 btrfs_start_ordered_extent(inode, ordered, 1); 1253 btrfs_put_ordered_extent(ordered); 1254 lock_page(page); 1255 /* 1256 * we unlocked the page above, so we need check if 1257 * it was released or not. 1258 */ 1259 if (page->mapping != inode->i_mapping) { 1260 unlock_page(page); 1261 put_page(page); 1262 goto again; 1263 } 1264 } 1265 1266 if (!PageUptodate(page)) { 1267 btrfs_readpage(NULL, page); 1268 lock_page(page); 1269 if (!PageUptodate(page)) { 1270 unlock_page(page); 1271 put_page(page); 1272 ret = -EIO; 1273 break; 1274 } 1275 } 1276 1277 if (page->mapping != inode->i_mapping) { 1278 unlock_page(page); 1279 put_page(page); 1280 goto again; 1281 } 1282 1283 pages[i] = page; 1284 i_done++; 1285 } 1286 if (!i_done || ret) 1287 goto out; 1288 1289 if (!(inode->i_sb->s_flags & SB_ACTIVE)) 1290 goto out; 1291 1292 /* 1293 * so now we have a nice long stream of locked 1294 * and up to date pages, lets wait on them 1295 */ 1296 for (i = 0; i < i_done; i++) 1297 wait_on_page_writeback(pages[i]); 1298 1299 page_start = page_offset(pages[0]); 1300 page_end = page_offset(pages[i_done - 1]) + PAGE_SIZE; 1301 1302 lock_extent_bits(&BTRFS_I(inode)->io_tree, 1303 page_start, page_end - 1, &cached_state); 1304 clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, 1305 page_end - 1, EXTENT_DIRTY | EXTENT_DELALLOC | 1306 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0, 1307 &cached_state); 1308 1309 if (i_done != page_cnt) { 1310 spin_lock(&BTRFS_I(inode)->lock); 1311 BTRFS_I(inode)->outstanding_extents++; 1312 spin_unlock(&BTRFS_I(inode)->lock); 1313 btrfs_delalloc_release_space(inode, data_reserved, 1314 start_index << PAGE_SHIFT, 1315 (page_cnt - i_done) << PAGE_SHIFT, true); 1316 } 1317 1318 1319 set_extent_defrag(&BTRFS_I(inode)->io_tree, page_start, page_end - 1, 1320 &cached_state); 1321 1322 unlock_extent_cached(&BTRFS_I(inode)->io_tree, 1323 page_start, page_end - 1, &cached_state); 1324 1325 for (i = 0; i < i_done; i++) { 1326 clear_page_dirty_for_io(pages[i]); 1327 ClearPageChecked(pages[i]); 1328 set_page_extent_mapped(pages[i]); 1329 set_page_dirty(pages[i]); 1330 unlock_page(pages[i]); 1331 put_page(pages[i]); 1332 } 1333 btrfs_delalloc_release_extents(BTRFS_I(inode), page_cnt << PAGE_SHIFT, 1334 false); 1335 extent_changeset_free(data_reserved); 1336 return i_done; 1337 out: 1338 for (i = 0; i < i_done; i++) { 1339 unlock_page(pages[i]); 1340 put_page(pages[i]); 1341 } 1342 btrfs_delalloc_release_space(inode, data_reserved, 1343 start_index << PAGE_SHIFT, 1344 page_cnt << PAGE_SHIFT, true); 1345 btrfs_delalloc_release_extents(BTRFS_I(inode), page_cnt << PAGE_SHIFT, 1346 true); 1347 extent_changeset_free(data_reserved); 1348 return ret; 1349 1350 } 1351 1352 int btrfs_defrag_file(struct inode *inode, struct file *file, 1353 struct btrfs_ioctl_defrag_range_args *range, 1354 u64 newer_than, unsigned long max_to_defrag) 1355 { 1356 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 1357 struct btrfs_root *root = BTRFS_I(inode)->root; 1358 struct file_ra_state *ra = NULL; 1359 unsigned long last_index; 1360 u64 isize = i_size_read(inode); 1361 u64 last_len = 0; 1362 u64 skip = 0; 1363 u64 defrag_end = 0; 1364 u64 newer_off = range->start; 1365 unsigned long i; 1366 unsigned long ra_index = 0; 1367 int ret; 1368 int defrag_count = 0; 1369 int compress_type = BTRFS_COMPRESS_ZLIB; 1370 u32 extent_thresh = range->extent_thresh; 1371 unsigned long max_cluster = SZ_256K >> PAGE_SHIFT; 1372 unsigned long cluster = max_cluster; 1373 u64 new_align = ~((u64)SZ_128K - 1); 1374 struct page **pages = NULL; 1375 bool do_compress = range->flags & BTRFS_DEFRAG_RANGE_COMPRESS; 1376 1377 if (isize == 0) 1378 return 0; 1379 1380 if (range->start >= isize) 1381 return -EINVAL; 1382 1383 if (do_compress) { 1384 if (range->compress_type > BTRFS_COMPRESS_TYPES) 1385 return -EINVAL; 1386 if (range->compress_type) 1387 compress_type = range->compress_type; 1388 } 1389 1390 if (extent_thresh == 0) 1391 extent_thresh = SZ_256K; 1392 1393 /* 1394 * If we were not given a file, allocate a readahead context. As 1395 * readahead is just an optimization, defrag will work without it so 1396 * we don't error out. 1397 */ 1398 if (!file) { 1399 ra = kzalloc(sizeof(*ra), GFP_KERNEL); 1400 if (ra) 1401 file_ra_state_init(ra, inode->i_mapping); 1402 } else { 1403 ra = &file->f_ra; 1404 } 1405 1406 pages = kmalloc_array(max_cluster, sizeof(struct page *), GFP_KERNEL); 1407 if (!pages) { 1408 ret = -ENOMEM; 1409 goto out_ra; 1410 } 1411 1412 /* find the last page to defrag */ 1413 if (range->start + range->len > range->start) { 1414 last_index = min_t(u64, isize - 1, 1415 range->start + range->len - 1) >> PAGE_SHIFT; 1416 } else { 1417 last_index = (isize - 1) >> PAGE_SHIFT; 1418 } 1419 1420 if (newer_than) { 1421 ret = find_new_extents(root, inode, newer_than, 1422 &newer_off, SZ_64K); 1423 if (!ret) { 1424 range->start = newer_off; 1425 /* 1426 * we always align our defrag to help keep 1427 * the extents in the file evenly spaced 1428 */ 1429 i = (newer_off & new_align) >> PAGE_SHIFT; 1430 } else 1431 goto out_ra; 1432 } else { 1433 i = range->start >> PAGE_SHIFT; 1434 } 1435 if (!max_to_defrag) 1436 max_to_defrag = last_index - i + 1; 1437 1438 /* 1439 * make writeback starts from i, so the defrag range can be 1440 * written sequentially. 1441 */ 1442 if (i < inode->i_mapping->writeback_index) 1443 inode->i_mapping->writeback_index = i; 1444 1445 while (i <= last_index && defrag_count < max_to_defrag && 1446 (i < DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE))) { 1447 /* 1448 * make sure we stop running if someone unmounts 1449 * the FS 1450 */ 1451 if (!(inode->i_sb->s_flags & SB_ACTIVE)) 1452 break; 1453 1454 if (btrfs_defrag_cancelled(fs_info)) { 1455 btrfs_debug(fs_info, "defrag_file cancelled"); 1456 ret = -EAGAIN; 1457 break; 1458 } 1459 1460 if (!should_defrag_range(inode, (u64)i << PAGE_SHIFT, 1461 extent_thresh, &last_len, &skip, 1462 &defrag_end, do_compress)){ 1463 unsigned long next; 1464 /* 1465 * the should_defrag function tells us how much to skip 1466 * bump our counter by the suggested amount 1467 */ 1468 next = DIV_ROUND_UP(skip, PAGE_SIZE); 1469 i = max(i + 1, next); 1470 continue; 1471 } 1472 1473 if (!newer_than) { 1474 cluster = (PAGE_ALIGN(defrag_end) >> 1475 PAGE_SHIFT) - i; 1476 cluster = min(cluster, max_cluster); 1477 } else { 1478 cluster = max_cluster; 1479 } 1480 1481 if (i + cluster > ra_index) { 1482 ra_index = max(i, ra_index); 1483 if (ra) 1484 page_cache_sync_readahead(inode->i_mapping, ra, 1485 file, ra_index, cluster); 1486 ra_index += cluster; 1487 } 1488 1489 inode_lock(inode); 1490 if (do_compress) 1491 BTRFS_I(inode)->defrag_compress = compress_type; 1492 ret = cluster_pages_for_defrag(inode, pages, i, cluster); 1493 if (ret < 0) { 1494 inode_unlock(inode); 1495 goto out_ra; 1496 } 1497 1498 defrag_count += ret; 1499 balance_dirty_pages_ratelimited(inode->i_mapping); 1500 inode_unlock(inode); 1501 1502 if (newer_than) { 1503 if (newer_off == (u64)-1) 1504 break; 1505 1506 if (ret > 0) 1507 i += ret; 1508 1509 newer_off = max(newer_off + 1, 1510 (u64)i << PAGE_SHIFT); 1511 1512 ret = find_new_extents(root, inode, newer_than, 1513 &newer_off, SZ_64K); 1514 if (!ret) { 1515 range->start = newer_off; 1516 i = (newer_off & new_align) >> PAGE_SHIFT; 1517 } else { 1518 break; 1519 } 1520 } else { 1521 if (ret > 0) { 1522 i += ret; 1523 last_len += ret << PAGE_SHIFT; 1524 } else { 1525 i++; 1526 last_len = 0; 1527 } 1528 } 1529 } 1530 1531 if ((range->flags & BTRFS_DEFRAG_RANGE_START_IO)) { 1532 filemap_flush(inode->i_mapping); 1533 if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, 1534 &BTRFS_I(inode)->runtime_flags)) 1535 filemap_flush(inode->i_mapping); 1536 } 1537 1538 if (range->compress_type == BTRFS_COMPRESS_LZO) { 1539 btrfs_set_fs_incompat(fs_info, COMPRESS_LZO); 1540 } else if (range->compress_type == BTRFS_COMPRESS_ZSTD) { 1541 btrfs_set_fs_incompat(fs_info, COMPRESS_ZSTD); 1542 } 1543 1544 ret = defrag_count; 1545 1546 out_ra: 1547 if (do_compress) { 1548 inode_lock(inode); 1549 BTRFS_I(inode)->defrag_compress = BTRFS_COMPRESS_NONE; 1550 inode_unlock(inode); 1551 } 1552 if (!file) 1553 kfree(ra); 1554 kfree(pages); 1555 return ret; 1556 } 1557 1558 static noinline int btrfs_ioctl_resize(struct file *file, 1559 void __user *arg) 1560 { 1561 struct inode *inode = file_inode(file); 1562 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 1563 u64 new_size; 1564 u64 old_size; 1565 u64 devid = 1; 1566 struct btrfs_root *root = BTRFS_I(inode)->root; 1567 struct btrfs_ioctl_vol_args *vol_args; 1568 struct btrfs_trans_handle *trans; 1569 struct btrfs_device *device = NULL; 1570 char *sizestr; 1571 char *retptr; 1572 char *devstr = NULL; 1573 int ret = 0; 1574 int mod = 0; 1575 1576 if (!capable(CAP_SYS_ADMIN)) 1577 return -EPERM; 1578 1579 ret = mnt_want_write_file(file); 1580 if (ret) 1581 return ret; 1582 1583 if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) { 1584 mnt_drop_write_file(file); 1585 return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; 1586 } 1587 1588 vol_args = memdup_user(arg, sizeof(*vol_args)); 1589 if (IS_ERR(vol_args)) { 1590 ret = PTR_ERR(vol_args); 1591 goto out; 1592 } 1593 1594 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; 1595 1596 sizestr = vol_args->name; 1597 devstr = strchr(sizestr, ':'); 1598 if (devstr) { 1599 sizestr = devstr + 1; 1600 *devstr = '\0'; 1601 devstr = vol_args->name; 1602 ret = kstrtoull(devstr, 10, &devid); 1603 if (ret) 1604 goto out_free; 1605 if (!devid) { 1606 ret = -EINVAL; 1607 goto out_free; 1608 } 1609 btrfs_info(fs_info, "resizing devid %llu", devid); 1610 } 1611 1612 device = btrfs_find_device(fs_info, devid, NULL, NULL); 1613 if (!device) { 1614 btrfs_info(fs_info, "resizer unable to find device %llu", 1615 devid); 1616 ret = -ENODEV; 1617 goto out_free; 1618 } 1619 1620 if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { 1621 btrfs_info(fs_info, 1622 "resizer unable to apply on readonly device %llu", 1623 devid); 1624 ret = -EPERM; 1625 goto out_free; 1626 } 1627 1628 if (!strcmp(sizestr, "max")) 1629 new_size = device->bdev->bd_inode->i_size; 1630 else { 1631 if (sizestr[0] == '-') { 1632 mod = -1; 1633 sizestr++; 1634 } else if (sizestr[0] == '+') { 1635 mod = 1; 1636 sizestr++; 1637 } 1638 new_size = memparse(sizestr, &retptr); 1639 if (*retptr != '\0' || new_size == 0) { 1640 ret = -EINVAL; 1641 goto out_free; 1642 } 1643 } 1644 1645 if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) { 1646 ret = -EPERM; 1647 goto out_free; 1648 } 1649 1650 old_size = btrfs_device_get_total_bytes(device); 1651 1652 if (mod < 0) { 1653 if (new_size > old_size) { 1654 ret = -EINVAL; 1655 goto out_free; 1656 } 1657 new_size = old_size - new_size; 1658 } else if (mod > 0) { 1659 if (new_size > ULLONG_MAX - old_size) { 1660 ret = -ERANGE; 1661 goto out_free; 1662 } 1663 new_size = old_size + new_size; 1664 } 1665 1666 if (new_size < SZ_256M) { 1667 ret = -EINVAL; 1668 goto out_free; 1669 } 1670 if (new_size > device->bdev->bd_inode->i_size) { 1671 ret = -EFBIG; 1672 goto out_free; 1673 } 1674 1675 new_size = round_down(new_size, fs_info->sectorsize); 1676 1677 btrfs_info_in_rcu(fs_info, "new size for %s is %llu", 1678 rcu_str_deref(device->name), new_size); 1679 1680 if (new_size > old_size) { 1681 trans = btrfs_start_transaction(root, 0); 1682 if (IS_ERR(trans)) { 1683 ret = PTR_ERR(trans); 1684 goto out_free; 1685 } 1686 ret = btrfs_grow_device(trans, device, new_size); 1687 btrfs_commit_transaction(trans); 1688 } else if (new_size < old_size) { 1689 ret = btrfs_shrink_device(device, new_size); 1690 } /* equal, nothing need to do */ 1691 1692 out_free: 1693 kfree(vol_args); 1694 out: 1695 clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); 1696 mnt_drop_write_file(file); 1697 return ret; 1698 } 1699 1700 static noinline int btrfs_ioctl_snap_create_transid(struct file *file, 1701 const char *name, unsigned long fd, int subvol, 1702 u64 *transid, bool readonly, 1703 struct btrfs_qgroup_inherit *inherit) 1704 { 1705 int namelen; 1706 int ret = 0; 1707 1708 if (!S_ISDIR(file_inode(file)->i_mode)) 1709 return -ENOTDIR; 1710 1711 ret = mnt_want_write_file(file); 1712 if (ret) 1713 goto out; 1714 1715 namelen = strlen(name); 1716 if (strchr(name, '/')) { 1717 ret = -EINVAL; 1718 goto out_drop_write; 1719 } 1720 1721 if (name[0] == '.' && 1722 (namelen == 1 || (name[1] == '.' && namelen == 2))) { 1723 ret = -EEXIST; 1724 goto out_drop_write; 1725 } 1726 1727 if (subvol) { 1728 ret = btrfs_mksubvol(&file->f_path, name, namelen, 1729 NULL, transid, readonly, inherit); 1730 } else { 1731 struct fd src = fdget(fd); 1732 struct inode *src_inode; 1733 if (!src.file) { 1734 ret = -EINVAL; 1735 goto out_drop_write; 1736 } 1737 1738 src_inode = file_inode(src.file); 1739 if (src_inode->i_sb != file_inode(file)->i_sb) { 1740 btrfs_info(BTRFS_I(file_inode(file))->root->fs_info, 1741 "Snapshot src from another FS"); 1742 ret = -EXDEV; 1743 } else if (!inode_owner_or_capable(src_inode)) { 1744 /* 1745 * Subvolume creation is not restricted, but snapshots 1746 * are limited to own subvolumes only 1747 */ 1748 ret = -EPERM; 1749 } else { 1750 ret = btrfs_mksubvol(&file->f_path, name, namelen, 1751 BTRFS_I(src_inode)->root, 1752 transid, readonly, inherit); 1753 } 1754 fdput(src); 1755 } 1756 out_drop_write: 1757 mnt_drop_write_file(file); 1758 out: 1759 return ret; 1760 } 1761 1762 static noinline int btrfs_ioctl_snap_create(struct file *file, 1763 void __user *arg, int subvol) 1764 { 1765 struct btrfs_ioctl_vol_args *vol_args; 1766 int ret; 1767 1768 if (!S_ISDIR(file_inode(file)->i_mode)) 1769 return -ENOTDIR; 1770 1771 vol_args = memdup_user(arg, sizeof(*vol_args)); 1772 if (IS_ERR(vol_args)) 1773 return PTR_ERR(vol_args); 1774 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; 1775 1776 ret = btrfs_ioctl_snap_create_transid(file, vol_args->name, 1777 vol_args->fd, subvol, 1778 NULL, false, NULL); 1779 1780 kfree(vol_args); 1781 return ret; 1782 } 1783 1784 static noinline int btrfs_ioctl_snap_create_v2(struct file *file, 1785 void __user *arg, int subvol) 1786 { 1787 struct btrfs_ioctl_vol_args_v2 *vol_args; 1788 int ret; 1789 u64 transid = 0; 1790 u64 *ptr = NULL; 1791 bool readonly = false; 1792 struct btrfs_qgroup_inherit *inherit = NULL; 1793 1794 if (!S_ISDIR(file_inode(file)->i_mode)) 1795 return -ENOTDIR; 1796 1797 vol_args = memdup_user(arg, sizeof(*vol_args)); 1798 if (IS_ERR(vol_args)) 1799 return PTR_ERR(vol_args); 1800 vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0'; 1801 1802 if (vol_args->flags & 1803 ~(BTRFS_SUBVOL_CREATE_ASYNC | BTRFS_SUBVOL_RDONLY | 1804 BTRFS_SUBVOL_QGROUP_INHERIT)) { 1805 ret = -EOPNOTSUPP; 1806 goto free_args; 1807 } 1808 1809 if (vol_args->flags & BTRFS_SUBVOL_CREATE_ASYNC) 1810 ptr = &transid; 1811 if (vol_args->flags & BTRFS_SUBVOL_RDONLY) 1812 readonly = true; 1813 if (vol_args->flags & BTRFS_SUBVOL_QGROUP_INHERIT) { 1814 if (vol_args->size > PAGE_SIZE) { 1815 ret = -EINVAL; 1816 goto free_args; 1817 } 1818 inherit = memdup_user(vol_args->qgroup_inherit, vol_args->size); 1819 if (IS_ERR(inherit)) { 1820 ret = PTR_ERR(inherit); 1821 goto free_args; 1822 } 1823 } 1824 1825 ret = btrfs_ioctl_snap_create_transid(file, vol_args->name, 1826 vol_args->fd, subvol, ptr, 1827 readonly, inherit); 1828 if (ret) 1829 goto free_inherit; 1830 1831 if (ptr && copy_to_user(arg + 1832 offsetof(struct btrfs_ioctl_vol_args_v2, 1833 transid), 1834 ptr, sizeof(*ptr))) 1835 ret = -EFAULT; 1836 1837 free_inherit: 1838 kfree(inherit); 1839 free_args: 1840 kfree(vol_args); 1841 return ret; 1842 } 1843 1844 static noinline int btrfs_ioctl_subvol_getflags(struct file *file, 1845 void __user *arg) 1846 { 1847 struct inode *inode = file_inode(file); 1848 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 1849 struct btrfs_root *root = BTRFS_I(inode)->root; 1850 int ret = 0; 1851 u64 flags = 0; 1852 1853 if (btrfs_ino(BTRFS_I(inode)) != BTRFS_FIRST_FREE_OBJECTID) 1854 return -EINVAL; 1855 1856 down_read(&fs_info->subvol_sem); 1857 if (btrfs_root_readonly(root)) 1858 flags |= BTRFS_SUBVOL_RDONLY; 1859 up_read(&fs_info->subvol_sem); 1860 1861 if (copy_to_user(arg, &flags, sizeof(flags))) 1862 ret = -EFAULT; 1863 1864 return ret; 1865 } 1866 1867 static noinline int btrfs_ioctl_subvol_setflags(struct file *file, 1868 void __user *arg) 1869 { 1870 struct inode *inode = file_inode(file); 1871 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 1872 struct btrfs_root *root = BTRFS_I(inode)->root; 1873 struct btrfs_trans_handle *trans; 1874 u64 root_flags; 1875 u64 flags; 1876 int ret = 0; 1877 1878 if (!inode_owner_or_capable(inode)) 1879 return -EPERM; 1880 1881 ret = mnt_want_write_file(file); 1882 if (ret) 1883 goto out; 1884 1885 if (btrfs_ino(BTRFS_I(inode)) != BTRFS_FIRST_FREE_OBJECTID) { 1886 ret = -EINVAL; 1887 goto out_drop_write; 1888 } 1889 1890 if (copy_from_user(&flags, arg, sizeof(flags))) { 1891 ret = -EFAULT; 1892 goto out_drop_write; 1893 } 1894 1895 if (flags & BTRFS_SUBVOL_CREATE_ASYNC) { 1896 ret = -EINVAL; 1897 goto out_drop_write; 1898 } 1899 1900 if (flags & ~BTRFS_SUBVOL_RDONLY) { 1901 ret = -EOPNOTSUPP; 1902 goto out_drop_write; 1903 } 1904 1905 down_write(&fs_info->subvol_sem); 1906 1907 /* nothing to do */ 1908 if (!!(flags & BTRFS_SUBVOL_RDONLY) == btrfs_root_readonly(root)) 1909 goto out_drop_sem; 1910 1911 root_flags = btrfs_root_flags(&root->root_item); 1912 if (flags & BTRFS_SUBVOL_RDONLY) { 1913 btrfs_set_root_flags(&root->root_item, 1914 root_flags | BTRFS_ROOT_SUBVOL_RDONLY); 1915 } else { 1916 /* 1917 * Block RO -> RW transition if this subvolume is involved in 1918 * send 1919 */ 1920 spin_lock(&root->root_item_lock); 1921 if (root->send_in_progress == 0) { 1922 btrfs_set_root_flags(&root->root_item, 1923 root_flags & ~BTRFS_ROOT_SUBVOL_RDONLY); 1924 spin_unlock(&root->root_item_lock); 1925 } else { 1926 spin_unlock(&root->root_item_lock); 1927 btrfs_warn(fs_info, 1928 "Attempt to set subvolume %llu read-write during send", 1929 root->root_key.objectid); 1930 ret = -EPERM; 1931 goto out_drop_sem; 1932 } 1933 } 1934 1935 trans = btrfs_start_transaction(root, 1); 1936 if (IS_ERR(trans)) { 1937 ret = PTR_ERR(trans); 1938 goto out_reset; 1939 } 1940 1941 ret = btrfs_update_root(trans, fs_info->tree_root, 1942 &root->root_key, &root->root_item); 1943 if (ret < 0) { 1944 btrfs_end_transaction(trans); 1945 goto out_reset; 1946 } 1947 1948 ret = btrfs_commit_transaction(trans); 1949 1950 out_reset: 1951 if (ret) 1952 btrfs_set_root_flags(&root->root_item, root_flags); 1953 out_drop_sem: 1954 up_write(&fs_info->subvol_sem); 1955 out_drop_write: 1956 mnt_drop_write_file(file); 1957 out: 1958 return ret; 1959 } 1960 1961 static noinline int key_in_sk(struct btrfs_key *key, 1962 struct btrfs_ioctl_search_key *sk) 1963 { 1964 struct btrfs_key test; 1965 int ret; 1966 1967 test.objectid = sk->min_objectid; 1968 test.type = sk->min_type; 1969 test.offset = sk->min_offset; 1970 1971 ret = btrfs_comp_cpu_keys(key, &test); 1972 if (ret < 0) 1973 return 0; 1974 1975 test.objectid = sk->max_objectid; 1976 test.type = sk->max_type; 1977 test.offset = sk->max_offset; 1978 1979 ret = btrfs_comp_cpu_keys(key, &test); 1980 if (ret > 0) 1981 return 0; 1982 return 1; 1983 } 1984 1985 static noinline int copy_to_sk(struct btrfs_path *path, 1986 struct btrfs_key *key, 1987 struct btrfs_ioctl_search_key *sk, 1988 size_t *buf_size, 1989 char __user *ubuf, 1990 unsigned long *sk_offset, 1991 int *num_found) 1992 { 1993 u64 found_transid; 1994 struct extent_buffer *leaf; 1995 struct btrfs_ioctl_search_header sh; 1996 struct btrfs_key test; 1997 unsigned long item_off; 1998 unsigned long item_len; 1999 int nritems; 2000 int i; 2001 int slot; 2002 int ret = 0; 2003 2004 leaf = path->nodes[0]; 2005 slot = path->slots[0]; 2006 nritems = btrfs_header_nritems(leaf); 2007 2008 if (btrfs_header_generation(leaf) > sk->max_transid) { 2009 i = nritems; 2010 goto advance_key; 2011 } 2012 found_transid = btrfs_header_generation(leaf); 2013 2014 for (i = slot; i < nritems; i++) { 2015 item_off = btrfs_item_ptr_offset(leaf, i); 2016 item_len = btrfs_item_size_nr(leaf, i); 2017 2018 btrfs_item_key_to_cpu(leaf, key, i); 2019 if (!key_in_sk(key, sk)) 2020 continue; 2021 2022 if (sizeof(sh) + item_len > *buf_size) { 2023 if (*num_found) { 2024 ret = 1; 2025 goto out; 2026 } 2027 2028 /* 2029 * return one empty item back for v1, which does not 2030 * handle -EOVERFLOW 2031 */ 2032 2033 *buf_size = sizeof(sh) + item_len; 2034 item_len = 0; 2035 ret = -EOVERFLOW; 2036 } 2037 2038 if (sizeof(sh) + item_len + *sk_offset > *buf_size) { 2039 ret = 1; 2040 goto out; 2041 } 2042 2043 sh.objectid = key->objectid; 2044 sh.offset = key->offset; 2045 sh.type = key->type; 2046 sh.len = item_len; 2047 sh.transid = found_transid; 2048 2049 /* copy search result header */ 2050 if (copy_to_user(ubuf + *sk_offset, &sh, sizeof(sh))) { 2051 ret = -EFAULT; 2052 goto out; 2053 } 2054 2055 *sk_offset += sizeof(sh); 2056 2057 if (item_len) { 2058 char __user *up = ubuf + *sk_offset; 2059 /* copy the item */ 2060 if (read_extent_buffer_to_user(leaf, up, 2061 item_off, item_len)) { 2062 ret = -EFAULT; 2063 goto out; 2064 } 2065 2066 *sk_offset += item_len; 2067 } 2068 (*num_found)++; 2069 2070 if (ret) /* -EOVERFLOW from above */ 2071 goto out; 2072 2073 if (*num_found >= sk->nr_items) { 2074 ret = 1; 2075 goto out; 2076 } 2077 } 2078 advance_key: 2079 ret = 0; 2080 test.objectid = sk->max_objectid; 2081 test.type = sk->max_type; 2082 test.offset = sk->max_offset; 2083 if (btrfs_comp_cpu_keys(key, &test) >= 0) 2084 ret = 1; 2085 else if (key->offset < (u64)-1) 2086 key->offset++; 2087 else if (key->type < (u8)-1) { 2088 key->offset = 0; 2089 key->type++; 2090 } else if (key->objectid < (u64)-1) { 2091 key->offset = 0; 2092 key->type = 0; 2093 key->objectid++; 2094 } else 2095 ret = 1; 2096 out: 2097 /* 2098 * 0: all items from this leaf copied, continue with next 2099 * 1: * more items can be copied, but unused buffer is too small 2100 * * all items were found 2101 * Either way, it will stops the loop which iterates to the next 2102 * leaf 2103 * -EOVERFLOW: item was to large for buffer 2104 * -EFAULT: could not copy extent buffer back to userspace 2105 */ 2106 return ret; 2107 } 2108 2109 static noinline int search_ioctl(struct inode *inode, 2110 struct btrfs_ioctl_search_key *sk, 2111 size_t *buf_size, 2112 char __user *ubuf) 2113 { 2114 struct btrfs_fs_info *info = btrfs_sb(inode->i_sb); 2115 struct btrfs_root *root; 2116 struct btrfs_key key; 2117 struct btrfs_path *path; 2118 int ret; 2119 int num_found = 0; 2120 unsigned long sk_offset = 0; 2121 2122 if (*buf_size < sizeof(struct btrfs_ioctl_search_header)) { 2123 *buf_size = sizeof(struct btrfs_ioctl_search_header); 2124 return -EOVERFLOW; 2125 } 2126 2127 path = btrfs_alloc_path(); 2128 if (!path) 2129 return -ENOMEM; 2130 2131 if (sk->tree_id == 0) { 2132 /* search the root of the inode that was passed */ 2133 root = BTRFS_I(inode)->root; 2134 } else { 2135 key.objectid = sk->tree_id; 2136 key.type = BTRFS_ROOT_ITEM_KEY; 2137 key.offset = (u64)-1; 2138 root = btrfs_read_fs_root_no_name(info, &key); 2139 if (IS_ERR(root)) { 2140 btrfs_free_path(path); 2141 return PTR_ERR(root); 2142 } 2143 } 2144 2145 key.objectid = sk->min_objectid; 2146 key.type = sk->min_type; 2147 key.offset = sk->min_offset; 2148 2149 while (1) { 2150 ret = btrfs_search_forward(root, &key, path, sk->min_transid); 2151 if (ret != 0) { 2152 if (ret > 0) 2153 ret = 0; 2154 goto err; 2155 } 2156 ret = copy_to_sk(path, &key, sk, buf_size, ubuf, 2157 &sk_offset, &num_found); 2158 btrfs_release_path(path); 2159 if (ret) 2160 break; 2161 2162 } 2163 if (ret > 0) 2164 ret = 0; 2165 err: 2166 sk->nr_items = num_found; 2167 btrfs_free_path(path); 2168 return ret; 2169 } 2170 2171 static noinline int btrfs_ioctl_tree_search(struct file *file, 2172 void __user *argp) 2173 { 2174 struct btrfs_ioctl_search_args __user *uargs; 2175 struct btrfs_ioctl_search_key sk; 2176 struct inode *inode; 2177 int ret; 2178 size_t buf_size; 2179 2180 if (!capable(CAP_SYS_ADMIN)) 2181 return -EPERM; 2182 2183 uargs = (struct btrfs_ioctl_search_args __user *)argp; 2184 2185 if (copy_from_user(&sk, &uargs->key, sizeof(sk))) 2186 return -EFAULT; 2187 2188 buf_size = sizeof(uargs->buf); 2189 2190 inode = file_inode(file); 2191 ret = search_ioctl(inode, &sk, &buf_size, uargs->buf); 2192 2193 /* 2194 * In the origin implementation an overflow is handled by returning a 2195 * search header with a len of zero, so reset ret. 2196 */ 2197 if (ret == -EOVERFLOW) 2198 ret = 0; 2199 2200 if (ret == 0 && copy_to_user(&uargs->key, &sk, sizeof(sk))) 2201 ret = -EFAULT; 2202 return ret; 2203 } 2204 2205 static noinline int btrfs_ioctl_tree_search_v2(struct file *file, 2206 void __user *argp) 2207 { 2208 struct btrfs_ioctl_search_args_v2 __user *uarg; 2209 struct btrfs_ioctl_search_args_v2 args; 2210 struct inode *inode; 2211 int ret; 2212 size_t buf_size; 2213 const size_t buf_limit = SZ_16M; 2214 2215 if (!capable(CAP_SYS_ADMIN)) 2216 return -EPERM; 2217 2218 /* copy search header and buffer size */ 2219 uarg = (struct btrfs_ioctl_search_args_v2 __user *)argp; 2220 if (copy_from_user(&args, uarg, sizeof(args))) 2221 return -EFAULT; 2222 2223 buf_size = args.buf_size; 2224 2225 /* limit result size to 16MB */ 2226 if (buf_size > buf_limit) 2227 buf_size = buf_limit; 2228 2229 inode = file_inode(file); 2230 ret = search_ioctl(inode, &args.key, &buf_size, 2231 (char __user *)(&uarg->buf[0])); 2232 if (ret == 0 && copy_to_user(&uarg->key, &args.key, sizeof(args.key))) 2233 ret = -EFAULT; 2234 else if (ret == -EOVERFLOW && 2235 copy_to_user(&uarg->buf_size, &buf_size, sizeof(buf_size))) 2236 ret = -EFAULT; 2237 2238 return ret; 2239 } 2240 2241 /* 2242 * Search INODE_REFs to identify path name of 'dirid' directory 2243 * in a 'tree_id' tree. and sets path name to 'name'. 2244 */ 2245 static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info, 2246 u64 tree_id, u64 dirid, char *name) 2247 { 2248 struct btrfs_root *root; 2249 struct btrfs_key key; 2250 char *ptr; 2251 int ret = -1; 2252 int slot; 2253 int len; 2254 int total_len = 0; 2255 struct btrfs_inode_ref *iref; 2256 struct extent_buffer *l; 2257 struct btrfs_path *path; 2258 2259 if (dirid == BTRFS_FIRST_FREE_OBJECTID) { 2260 name[0]='\0'; 2261 return 0; 2262 } 2263 2264 path = btrfs_alloc_path(); 2265 if (!path) 2266 return -ENOMEM; 2267 2268 ptr = &name[BTRFS_INO_LOOKUP_PATH_MAX - 1]; 2269 2270 key.objectid = tree_id; 2271 key.type = BTRFS_ROOT_ITEM_KEY; 2272 key.offset = (u64)-1; 2273 root = btrfs_read_fs_root_no_name(info, &key); 2274 if (IS_ERR(root)) { 2275 ret = PTR_ERR(root); 2276 goto out; 2277 } 2278 2279 key.objectid = dirid; 2280 key.type = BTRFS_INODE_REF_KEY; 2281 key.offset = (u64)-1; 2282 2283 while (1) { 2284 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 2285 if (ret < 0) 2286 goto out; 2287 else if (ret > 0) { 2288 ret = btrfs_previous_item(root, path, dirid, 2289 BTRFS_INODE_REF_KEY); 2290 if (ret < 0) 2291 goto out; 2292 else if (ret > 0) { 2293 ret = -ENOENT; 2294 goto out; 2295 } 2296 } 2297 2298 l = path->nodes[0]; 2299 slot = path->slots[0]; 2300 btrfs_item_key_to_cpu(l, &key, slot); 2301 2302 iref = btrfs_item_ptr(l, slot, struct btrfs_inode_ref); 2303 len = btrfs_inode_ref_name_len(l, iref); 2304 ptr -= len + 1; 2305 total_len += len + 1; 2306 if (ptr < name) { 2307 ret = -ENAMETOOLONG; 2308 goto out; 2309 } 2310 2311 *(ptr + len) = '/'; 2312 read_extent_buffer(l, ptr, (unsigned long)(iref + 1), len); 2313 2314 if (key.offset == BTRFS_FIRST_FREE_OBJECTID) 2315 break; 2316 2317 btrfs_release_path(path); 2318 key.objectid = key.offset; 2319 key.offset = (u64)-1; 2320 dirid = key.objectid; 2321 } 2322 memmove(name, ptr, total_len); 2323 name[total_len] = '\0'; 2324 ret = 0; 2325 out: 2326 btrfs_free_path(path); 2327 return ret; 2328 } 2329 2330 static int btrfs_search_path_in_tree_user(struct inode *inode, 2331 struct btrfs_ioctl_ino_lookup_user_args *args) 2332 { 2333 struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; 2334 struct super_block *sb = inode->i_sb; 2335 struct btrfs_key upper_limit = BTRFS_I(inode)->location; 2336 u64 treeid = BTRFS_I(inode)->root->root_key.objectid; 2337 u64 dirid = args->dirid; 2338 unsigned long item_off; 2339 unsigned long item_len; 2340 struct btrfs_inode_ref *iref; 2341 struct btrfs_root_ref *rref; 2342 struct btrfs_root *root; 2343 struct btrfs_path *path; 2344 struct btrfs_key key, key2; 2345 struct extent_buffer *leaf; 2346 struct inode *temp_inode; 2347 char *ptr; 2348 int slot; 2349 int len; 2350 int total_len = 0; 2351 int ret; 2352 2353 path = btrfs_alloc_path(); 2354 if (!path) 2355 return -ENOMEM; 2356 2357 /* 2358 * If the bottom subvolume does not exist directly under upper_limit, 2359 * construct the path in from the bottom up. 2360 */ 2361 if (dirid != upper_limit.objectid) { 2362 ptr = &args->path[BTRFS_INO_LOOKUP_USER_PATH_MAX - 1]; 2363 2364 key.objectid = treeid; 2365 key.type = BTRFS_ROOT_ITEM_KEY; 2366 key.offset = (u64)-1; 2367 root = btrfs_read_fs_root_no_name(fs_info, &key); 2368 if (IS_ERR(root)) { 2369 ret = PTR_ERR(root); 2370 goto out; 2371 } 2372 2373 key.objectid = dirid; 2374 key.type = BTRFS_INODE_REF_KEY; 2375 key.offset = (u64)-1; 2376 while (1) { 2377 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 2378 if (ret < 0) { 2379 goto out; 2380 } else if (ret > 0) { 2381 ret = btrfs_previous_item(root, path, dirid, 2382 BTRFS_INODE_REF_KEY); 2383 if (ret < 0) { 2384 goto out; 2385 } else if (ret > 0) { 2386 ret = -ENOENT; 2387 goto out; 2388 } 2389 } 2390 2391 leaf = path->nodes[0]; 2392 slot = path->slots[0]; 2393 btrfs_item_key_to_cpu(leaf, &key, slot); 2394 2395 iref = btrfs_item_ptr(leaf, slot, struct btrfs_inode_ref); 2396 len = btrfs_inode_ref_name_len(leaf, iref); 2397 ptr -= len + 1; 2398 total_len += len + 1; 2399 if (ptr < args->path) { 2400 ret = -ENAMETOOLONG; 2401 goto out; 2402 } 2403 2404 *(ptr + len) = '/'; 2405 read_extent_buffer(leaf, ptr, 2406 (unsigned long)(iref + 1), len); 2407 2408 /* Check the read+exec permission of this directory */ 2409 ret = btrfs_previous_item(root, path, dirid, 2410 BTRFS_INODE_ITEM_KEY); 2411 if (ret < 0) { 2412 goto out; 2413 } else if (ret > 0) { 2414 ret = -ENOENT; 2415 goto out; 2416 } 2417 2418 leaf = path->nodes[0]; 2419 slot = path->slots[0]; 2420 btrfs_item_key_to_cpu(leaf, &key2, slot); 2421 if (key2.objectid != dirid) { 2422 ret = -ENOENT; 2423 goto out; 2424 } 2425 2426 temp_inode = btrfs_iget(sb, &key2, root, NULL); 2427 if (IS_ERR(temp_inode)) { 2428 ret = PTR_ERR(temp_inode); 2429 goto out; 2430 } 2431 ret = inode_permission(temp_inode, MAY_READ | MAY_EXEC); 2432 iput(temp_inode); 2433 if (ret) { 2434 ret = -EACCES; 2435 goto out; 2436 } 2437 2438 if (key.offset == upper_limit.objectid) 2439 break; 2440 if (key.objectid == BTRFS_FIRST_FREE_OBJECTID) { 2441 ret = -EACCES; 2442 goto out; 2443 } 2444 2445 btrfs_release_path(path); 2446 key.objectid = key.offset; 2447 key.offset = (u64)-1; 2448 dirid = key.objectid; 2449 } 2450 2451 memmove(args->path, ptr, total_len); 2452 args->path[total_len] = '\0'; 2453 btrfs_release_path(path); 2454 } 2455 2456 /* Get the bottom subvolume's name from ROOT_REF */ 2457 root = fs_info->tree_root; 2458 key.objectid = treeid; 2459 key.type = BTRFS_ROOT_REF_KEY; 2460 key.offset = args->treeid; 2461 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 2462 if (ret < 0) { 2463 goto out; 2464 } else if (ret > 0) { 2465 ret = -ENOENT; 2466 goto out; 2467 } 2468 2469 leaf = path->nodes[0]; 2470 slot = path->slots[0]; 2471 btrfs_item_key_to_cpu(leaf, &key, slot); 2472 2473 item_off = btrfs_item_ptr_offset(leaf, slot); 2474 item_len = btrfs_item_size_nr(leaf, slot); 2475 /* Check if dirid in ROOT_REF corresponds to passed dirid */ 2476 rref = btrfs_item_ptr(leaf, slot, struct btrfs_root_ref); 2477 if (args->dirid != btrfs_root_ref_dirid(leaf, rref)) { 2478 ret = -EINVAL; 2479 goto out; 2480 } 2481 2482 /* Copy subvolume's name */ 2483 item_off += sizeof(struct btrfs_root_ref); 2484 item_len -= sizeof(struct btrfs_root_ref); 2485 read_extent_buffer(leaf, args->name, item_off, item_len); 2486 args->name[item_len] = 0; 2487 2488 out: 2489 btrfs_free_path(path); 2490 return ret; 2491 } 2492 2493 static noinline int btrfs_ioctl_ino_lookup(struct file *file, 2494 void __user *argp) 2495 { 2496 struct btrfs_ioctl_ino_lookup_args *args; 2497 struct inode *inode; 2498 int ret = 0; 2499 2500 args = memdup_user(argp, sizeof(*args)); 2501 if (IS_ERR(args)) 2502 return PTR_ERR(args); 2503 2504 inode = file_inode(file); 2505 2506 /* 2507 * Unprivileged query to obtain the containing subvolume root id. The 2508 * path is reset so it's consistent with btrfs_search_path_in_tree. 2509 */ 2510 if (args->treeid == 0) 2511 args->treeid = BTRFS_I(inode)->root->root_key.objectid; 2512 2513 if (args->objectid == BTRFS_FIRST_FREE_OBJECTID) { 2514 args->name[0] = 0; 2515 goto out; 2516 } 2517 2518 if (!capable(CAP_SYS_ADMIN)) { 2519 ret = -EPERM; 2520 goto out; 2521 } 2522 2523 ret = btrfs_search_path_in_tree(BTRFS_I(inode)->root->fs_info, 2524 args->treeid, args->objectid, 2525 args->name); 2526 2527 out: 2528 if (ret == 0 && copy_to_user(argp, args, sizeof(*args))) 2529 ret = -EFAULT; 2530 2531 kfree(args); 2532 return ret; 2533 } 2534 2535 /* 2536 * Version of ino_lookup ioctl (unprivileged) 2537 * 2538 * The main differences from ino_lookup ioctl are: 2539 * 2540 * 1. Read + Exec permission will be checked using inode_permission() during 2541 * path construction. -EACCES will be returned in case of failure. 2542 * 2. Path construction will be stopped at the inode number which corresponds 2543 * to the fd with which this ioctl is called. If constructed path does not 2544 * exist under fd's inode, -EACCES will be returned. 2545 * 3. The name of bottom subvolume is also searched and filled. 2546 */ 2547 static int btrfs_ioctl_ino_lookup_user(struct file *file, void __user *argp) 2548 { 2549 struct btrfs_ioctl_ino_lookup_user_args *args; 2550 struct inode *inode; 2551 int ret; 2552 2553 args = memdup_user(argp, sizeof(*args)); 2554 if (IS_ERR(args)) 2555 return PTR_ERR(args); 2556 2557 inode = file_inode(file); 2558 2559 if (args->dirid == BTRFS_FIRST_FREE_OBJECTID && 2560 BTRFS_I(inode)->location.objectid != BTRFS_FIRST_FREE_OBJECTID) { 2561 /* 2562 * The subvolume does not exist under fd with which this is 2563 * called 2564 */ 2565 kfree(args); 2566 return -EACCES; 2567 } 2568 2569 ret = btrfs_search_path_in_tree_user(inode, args); 2570 2571 if (ret == 0 && copy_to_user(argp, args, sizeof(*args))) 2572 ret = -EFAULT; 2573 2574 kfree(args); 2575 return ret; 2576 } 2577 2578 /* Get the subvolume information in BTRFS_ROOT_ITEM and BTRFS_ROOT_BACKREF */ 2579 static int btrfs_ioctl_get_subvol_info(struct file *file, void __user *argp) 2580 { 2581 struct btrfs_ioctl_get_subvol_info_args *subvol_info; 2582 struct btrfs_fs_info *fs_info; 2583 struct btrfs_root *root; 2584 struct btrfs_path *path; 2585 struct btrfs_key key; 2586 struct btrfs_root_item *root_item; 2587 struct btrfs_root_ref *rref; 2588 struct extent_buffer *leaf; 2589 unsigned long item_off; 2590 unsigned long item_len; 2591 struct inode *inode; 2592 int slot; 2593 int ret = 0; 2594 2595 path = btrfs_alloc_path(); 2596 if (!path) 2597 return -ENOMEM; 2598 2599 subvol_info = kzalloc(sizeof(*subvol_info), GFP_KERNEL); 2600 if (!subvol_info) { 2601 btrfs_free_path(path); 2602 return -ENOMEM; 2603 } 2604 2605 inode = file_inode(file); 2606 fs_info = BTRFS_I(inode)->root->fs_info; 2607 2608 /* Get root_item of inode's subvolume */ 2609 key.objectid = BTRFS_I(inode)->root->root_key.objectid; 2610 key.type = BTRFS_ROOT_ITEM_KEY; 2611 key.offset = (u64)-1; 2612 root = btrfs_read_fs_root_no_name(fs_info, &key); 2613 if (IS_ERR(root)) { 2614 ret = PTR_ERR(root); 2615 goto out; 2616 } 2617 root_item = &root->root_item; 2618 2619 subvol_info->treeid = key.objectid; 2620 2621 subvol_info->generation = btrfs_root_generation(root_item); 2622 subvol_info->flags = btrfs_root_flags(root_item); 2623 2624 memcpy(subvol_info->uuid, root_item->uuid, BTRFS_UUID_SIZE); 2625 memcpy(subvol_info->parent_uuid, root_item->parent_uuid, 2626 BTRFS_UUID_SIZE); 2627 memcpy(subvol_info->received_uuid, root_item->received_uuid, 2628 BTRFS_UUID_SIZE); 2629 2630 subvol_info->ctransid = btrfs_root_ctransid(root_item); 2631 subvol_info->ctime.sec = btrfs_stack_timespec_sec(&root_item->ctime); 2632 subvol_info->ctime.nsec = btrfs_stack_timespec_nsec(&root_item->ctime); 2633 2634 subvol_info->otransid = btrfs_root_otransid(root_item); 2635 subvol_info->otime.sec = btrfs_stack_timespec_sec(&root_item->otime); 2636 subvol_info->otime.nsec = btrfs_stack_timespec_nsec(&root_item->otime); 2637 2638 subvol_info->stransid = btrfs_root_stransid(root_item); 2639 subvol_info->stime.sec = btrfs_stack_timespec_sec(&root_item->stime); 2640 subvol_info->stime.nsec = btrfs_stack_timespec_nsec(&root_item->stime); 2641 2642 subvol_info->rtransid = btrfs_root_rtransid(root_item); 2643 subvol_info->rtime.sec = btrfs_stack_timespec_sec(&root_item->rtime); 2644 subvol_info->rtime.nsec = btrfs_stack_timespec_nsec(&root_item->rtime); 2645 2646 if (key.objectid != BTRFS_FS_TREE_OBJECTID) { 2647 /* Search root tree for ROOT_BACKREF of this subvolume */ 2648 root = fs_info->tree_root; 2649 2650 key.type = BTRFS_ROOT_BACKREF_KEY; 2651 key.offset = 0; 2652 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 2653 if (ret < 0) { 2654 goto out; 2655 } else if (path->slots[0] >= 2656 btrfs_header_nritems(path->nodes[0])) { 2657 ret = btrfs_next_leaf(root, path); 2658 if (ret < 0) { 2659 goto out; 2660 } else if (ret > 0) { 2661 ret = -EUCLEAN; 2662 goto out; 2663 } 2664 } 2665 2666 leaf = path->nodes[0]; 2667 slot = path->slots[0]; 2668 btrfs_item_key_to_cpu(leaf, &key, slot); 2669 if (key.objectid == subvol_info->treeid && 2670 key.type == BTRFS_ROOT_BACKREF_KEY) { 2671 subvol_info->parent_id = key.offset; 2672 2673 rref = btrfs_item_ptr(leaf, slot, struct btrfs_root_ref); 2674 subvol_info->dirid = btrfs_root_ref_dirid(leaf, rref); 2675 2676 item_off = btrfs_item_ptr_offset(leaf, slot) 2677 + sizeof(struct btrfs_root_ref); 2678 item_len = btrfs_item_size_nr(leaf, slot) 2679 - sizeof(struct btrfs_root_ref); 2680 read_extent_buffer(leaf, subvol_info->name, 2681 item_off, item_len); 2682 } else { 2683 ret = -ENOENT; 2684 goto out; 2685 } 2686 } 2687 2688 if (copy_to_user(argp, subvol_info, sizeof(*subvol_info))) 2689 ret = -EFAULT; 2690 2691 out: 2692 btrfs_free_path(path); 2693 kzfree(subvol_info); 2694 return ret; 2695 } 2696 2697 /* 2698 * Return ROOT_REF information of the subvolume containing this inode 2699 * except the subvolume name. 2700 */ 2701 static int btrfs_ioctl_get_subvol_rootref(struct file *file, void __user *argp) 2702 { 2703 struct btrfs_ioctl_get_subvol_rootref_args *rootrefs; 2704 struct btrfs_root_ref *rref; 2705 struct btrfs_root *root; 2706 struct btrfs_path *path; 2707 struct btrfs_key key; 2708 struct extent_buffer *leaf; 2709 struct inode *inode; 2710 u64 objectid; 2711 int slot; 2712 int ret; 2713 u8 found; 2714 2715 path = btrfs_alloc_path(); 2716 if (!path) 2717 return -ENOMEM; 2718 2719 rootrefs = memdup_user(argp, sizeof(*rootrefs)); 2720 if (IS_ERR(rootrefs)) { 2721 btrfs_free_path(path); 2722 return PTR_ERR(rootrefs); 2723 } 2724 2725 inode = file_inode(file); 2726 root = BTRFS_I(inode)->root->fs_info->tree_root; 2727 objectid = BTRFS_I(inode)->root->root_key.objectid; 2728 2729 key.objectid = objectid; 2730 key.type = BTRFS_ROOT_REF_KEY; 2731 key.offset = rootrefs->min_treeid; 2732 found = 0; 2733 2734 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 2735 if (ret < 0) { 2736 goto out; 2737 } else if (path->slots[0] >= 2738 btrfs_header_nritems(path->nodes[0])) { 2739 ret = btrfs_next_leaf(root, path); 2740 if (ret < 0) { 2741 goto out; 2742 } else if (ret > 0) { 2743 ret = -EUCLEAN; 2744 goto out; 2745 } 2746 } 2747 while (1) { 2748 leaf = path->nodes[0]; 2749 slot = path->slots[0]; 2750 2751 btrfs_item_key_to_cpu(leaf, &key, slot); 2752 if (key.objectid != objectid || key.type != BTRFS_ROOT_REF_KEY) { 2753 ret = 0; 2754 goto out; 2755 } 2756 2757 if (found == BTRFS_MAX_ROOTREF_BUFFER_NUM) { 2758 ret = -EOVERFLOW; 2759 goto out; 2760 } 2761 2762 rref = btrfs_item_ptr(leaf, slot, struct btrfs_root_ref); 2763 rootrefs->rootref[found].treeid = key.offset; 2764 rootrefs->rootref[found].dirid = 2765 btrfs_root_ref_dirid(leaf, rref); 2766 found++; 2767 2768 ret = btrfs_next_item(root, path); 2769 if (ret < 0) { 2770 goto out; 2771 } else if (ret > 0) { 2772 ret = -EUCLEAN; 2773 goto out; 2774 } 2775 } 2776 2777 out: 2778 if (!ret || ret == -EOVERFLOW) { 2779 rootrefs->num_items = found; 2780 /* update min_treeid for next search */ 2781 if (found) 2782 rootrefs->min_treeid = 2783 rootrefs->rootref[found - 1].treeid + 1; 2784 if (copy_to_user(argp, rootrefs, sizeof(*rootrefs))) 2785 ret = -EFAULT; 2786 } 2787 2788 kfree(rootrefs); 2789 btrfs_free_path(path); 2790 2791 return ret; 2792 } 2793 2794 static noinline int btrfs_ioctl_snap_destroy(struct file *file, 2795 void __user *arg) 2796 { 2797 struct dentry *parent = file->f_path.dentry; 2798 struct btrfs_fs_info *fs_info = btrfs_sb(parent->d_sb); 2799 struct dentry *dentry; 2800 struct inode *dir = d_inode(parent); 2801 struct inode *inode; 2802 struct btrfs_root *root = BTRFS_I(dir)->root; 2803 struct btrfs_root *dest = NULL; 2804 struct btrfs_ioctl_vol_args *vol_args; 2805 int namelen; 2806 int err = 0; 2807 2808 if (!S_ISDIR(dir->i_mode)) 2809 return -ENOTDIR; 2810 2811 vol_args = memdup_user(arg, sizeof(*vol_args)); 2812 if (IS_ERR(vol_args)) 2813 return PTR_ERR(vol_args); 2814 2815 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; 2816 namelen = strlen(vol_args->name); 2817 if (strchr(vol_args->name, '/') || 2818 strncmp(vol_args->name, "..", namelen) == 0) { 2819 err = -EINVAL; 2820 goto out; 2821 } 2822 2823 err = mnt_want_write_file(file); 2824 if (err) 2825 goto out; 2826 2827 2828 err = down_write_killable_nested(&dir->i_rwsem, I_MUTEX_PARENT); 2829 if (err == -EINTR) 2830 goto out_drop_write; 2831 dentry = lookup_one_len(vol_args->name, parent, namelen); 2832 if (IS_ERR(dentry)) { 2833 err = PTR_ERR(dentry); 2834 goto out_unlock_dir; 2835 } 2836 2837 if (d_really_is_negative(dentry)) { 2838 err = -ENOENT; 2839 goto out_dput; 2840 } 2841 2842 inode = d_inode(dentry); 2843 dest = BTRFS_I(inode)->root; 2844 if (!capable(CAP_SYS_ADMIN)) { 2845 /* 2846 * Regular user. Only allow this with a special mount 2847 * option, when the user has write+exec access to the 2848 * subvol root, and when rmdir(2) would have been 2849 * allowed. 2850 * 2851 * Note that this is _not_ check that the subvol is 2852 * empty or doesn't contain data that we wouldn't 2853 * otherwise be able to delete. 2854 * 2855 * Users who want to delete empty subvols should try 2856 * rmdir(2). 2857 */ 2858 err = -EPERM; 2859 if (!btrfs_test_opt(fs_info, USER_SUBVOL_RM_ALLOWED)) 2860 goto out_dput; 2861 2862 /* 2863 * Do not allow deletion if the parent dir is the same 2864 * as the dir to be deleted. That means the ioctl 2865 * must be called on the dentry referencing the root 2866 * of the subvol, not a random directory contained 2867 * within it. 2868 */ 2869 err = -EINVAL; 2870 if (root == dest) 2871 goto out_dput; 2872 2873 err = inode_permission(inode, MAY_WRITE | MAY_EXEC); 2874 if (err) 2875 goto out_dput; 2876 } 2877 2878 /* check if subvolume may be deleted by a user */ 2879 err = btrfs_may_delete(dir, dentry, 1); 2880 if (err) 2881 goto out_dput; 2882 2883 if (btrfs_ino(BTRFS_I(inode)) != BTRFS_FIRST_FREE_OBJECTID) { 2884 err = -EINVAL; 2885 goto out_dput; 2886 } 2887 2888 inode_lock(inode); 2889 err = btrfs_delete_subvolume(dir, dentry); 2890 inode_unlock(inode); 2891 if (!err) 2892 d_delete(dentry); 2893 2894 out_dput: 2895 dput(dentry); 2896 out_unlock_dir: 2897 inode_unlock(dir); 2898 out_drop_write: 2899 mnt_drop_write_file(file); 2900 out: 2901 kfree(vol_args); 2902 return err; 2903 } 2904 2905 static int btrfs_ioctl_defrag(struct file *file, void __user *argp) 2906 { 2907 struct inode *inode = file_inode(file); 2908 struct btrfs_root *root = BTRFS_I(inode)->root; 2909 struct btrfs_ioctl_defrag_range_args *range; 2910 int ret; 2911 2912 ret = mnt_want_write_file(file); 2913 if (ret) 2914 return ret; 2915 2916 if (btrfs_root_readonly(root)) { 2917 ret = -EROFS; 2918 goto out; 2919 } 2920 2921 switch (inode->i_mode & S_IFMT) { 2922 case S_IFDIR: 2923 if (!capable(CAP_SYS_ADMIN)) { 2924 ret = -EPERM; 2925 goto out; 2926 } 2927 ret = btrfs_defrag_root(root); 2928 break; 2929 case S_IFREG: 2930 /* 2931 * Note that this does not check the file descriptor for write 2932 * access. This prevents defragmenting executables that are 2933 * running and allows defrag on files open in read-only mode. 2934 */ 2935 if (!capable(CAP_SYS_ADMIN) && 2936 inode_permission(inode, MAY_WRITE)) { 2937 ret = -EPERM; 2938 goto out; 2939 } 2940 2941 range = kzalloc(sizeof(*range), GFP_KERNEL); 2942 if (!range) { 2943 ret = -ENOMEM; 2944 goto out; 2945 } 2946 2947 if (argp) { 2948 if (copy_from_user(range, argp, 2949 sizeof(*range))) { 2950 ret = -EFAULT; 2951 kfree(range); 2952 goto out; 2953 } 2954 /* compression requires us to start the IO */ 2955 if ((range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)) { 2956 range->flags |= BTRFS_DEFRAG_RANGE_START_IO; 2957 range->extent_thresh = (u32)-1; 2958 } 2959 } else { 2960 /* the rest are all set to zero by kzalloc */ 2961 range->len = (u64)-1; 2962 } 2963 ret = btrfs_defrag_file(file_inode(file), file, 2964 range, BTRFS_OLDEST_GENERATION, 0); 2965 if (ret > 0) 2966 ret = 0; 2967 kfree(range); 2968 break; 2969 default: 2970 ret = -EINVAL; 2971 } 2972 out: 2973 mnt_drop_write_file(file); 2974 return ret; 2975 } 2976 2977 static long btrfs_ioctl_add_dev(struct btrfs_fs_info *fs_info, void __user *arg) 2978 { 2979 struct btrfs_ioctl_vol_args *vol_args; 2980 int ret; 2981 2982 if (!capable(CAP_SYS_ADMIN)) 2983 return -EPERM; 2984 2985 if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) 2986 return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; 2987 2988 vol_args = memdup_user(arg, sizeof(*vol_args)); 2989 if (IS_ERR(vol_args)) { 2990 ret = PTR_ERR(vol_args); 2991 goto out; 2992 } 2993 2994 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; 2995 ret = btrfs_init_new_device(fs_info, vol_args->name); 2996 2997 if (!ret) 2998 btrfs_info(fs_info, "disk added %s", vol_args->name); 2999 3000 kfree(vol_args); 3001 out: 3002 clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); 3003 return ret; 3004 } 3005 3006 static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg) 3007 { 3008 struct inode *inode = file_inode(file); 3009 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 3010 struct btrfs_ioctl_vol_args_v2 *vol_args; 3011 int ret; 3012 3013 if (!capable(CAP_SYS_ADMIN)) 3014 return -EPERM; 3015 3016 ret = mnt_want_write_file(file); 3017 if (ret) 3018 return ret; 3019 3020 vol_args = memdup_user(arg, sizeof(*vol_args)); 3021 if (IS_ERR(vol_args)) { 3022 ret = PTR_ERR(vol_args); 3023 goto err_drop; 3024 } 3025 3026 /* Check for compatibility reject unknown flags */ 3027 if (vol_args->flags & ~BTRFS_VOL_ARG_V2_FLAGS_SUPPORTED) { 3028 ret = -EOPNOTSUPP; 3029 goto out; 3030 } 3031 3032 if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) { 3033 ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; 3034 goto out; 3035 } 3036 3037 if (vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID) { 3038 ret = btrfs_rm_device(fs_info, NULL, vol_args->devid); 3039 } else { 3040 vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0'; 3041 ret = btrfs_rm_device(fs_info, vol_args->name, 0); 3042 } 3043 clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); 3044 3045 if (!ret) { 3046 if (vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID) 3047 btrfs_info(fs_info, "device deleted: id %llu", 3048 vol_args->devid); 3049 else 3050 btrfs_info(fs_info, "device deleted: %s", 3051 vol_args->name); 3052 } 3053 out: 3054 kfree(vol_args); 3055 err_drop: 3056 mnt_drop_write_file(file); 3057 return ret; 3058 } 3059 3060 static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) 3061 { 3062 struct inode *inode = file_inode(file); 3063 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 3064 struct btrfs_ioctl_vol_args *vol_args; 3065 int ret; 3066 3067 if (!capable(CAP_SYS_ADMIN)) 3068 return -EPERM; 3069 3070 ret = mnt_want_write_file(file); 3071 if (ret) 3072 return ret; 3073 3074 if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) { 3075 ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; 3076 goto out_drop_write; 3077 } 3078 3079 vol_args = memdup_user(arg, sizeof(*vol_args)); 3080 if (IS_ERR(vol_args)) { 3081 ret = PTR_ERR(vol_args); 3082 goto out; 3083 } 3084 3085 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; 3086 ret = btrfs_rm_device(fs_info, vol_args->name, 0); 3087 3088 if (!ret) 3089 btrfs_info(fs_info, "disk deleted %s", vol_args->name); 3090 kfree(vol_args); 3091 out: 3092 clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); 3093 out_drop_write: 3094 mnt_drop_write_file(file); 3095 3096 return ret; 3097 } 3098 3099 static long btrfs_ioctl_fs_info(struct btrfs_fs_info *fs_info, 3100 void __user *arg) 3101 { 3102 struct btrfs_ioctl_fs_info_args *fi_args; 3103 struct btrfs_device *device; 3104 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 3105 int ret = 0; 3106 3107 fi_args = kzalloc(sizeof(*fi_args), GFP_KERNEL); 3108 if (!fi_args) 3109 return -ENOMEM; 3110 3111 rcu_read_lock(); 3112 fi_args->num_devices = fs_devices->num_devices; 3113 3114 list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) { 3115 if (device->devid > fi_args->max_id) 3116 fi_args->max_id = device->devid; 3117 } 3118 rcu_read_unlock(); 3119 3120 memcpy(&fi_args->fsid, fs_info->fsid, sizeof(fi_args->fsid)); 3121 fi_args->nodesize = fs_info->nodesize; 3122 fi_args->sectorsize = fs_info->sectorsize; 3123 fi_args->clone_alignment = fs_info->sectorsize; 3124 3125 if (copy_to_user(arg, fi_args, sizeof(*fi_args))) 3126 ret = -EFAULT; 3127 3128 kfree(fi_args); 3129 return ret; 3130 } 3131 3132 static long btrfs_ioctl_dev_info(struct btrfs_fs_info *fs_info, 3133 void __user *arg) 3134 { 3135 struct btrfs_ioctl_dev_info_args *di_args; 3136 struct btrfs_device *dev; 3137 int ret = 0; 3138 char *s_uuid = NULL; 3139 3140 di_args = memdup_user(arg, sizeof(*di_args)); 3141 if (IS_ERR(di_args)) 3142 return PTR_ERR(di_args); 3143 3144 if (!btrfs_is_empty_uuid(di_args->uuid)) 3145 s_uuid = di_args->uuid; 3146 3147 rcu_read_lock(); 3148 dev = btrfs_find_device(fs_info, di_args->devid, s_uuid, NULL); 3149 3150 if (!dev) { 3151 ret = -ENODEV; 3152 goto out; 3153 } 3154 3155 di_args->devid = dev->devid; 3156 di_args->bytes_used = btrfs_device_get_bytes_used(dev); 3157 di_args->total_bytes = btrfs_device_get_total_bytes(dev); 3158 memcpy(di_args->uuid, dev->uuid, sizeof(di_args->uuid)); 3159 if (dev->name) { 3160 strncpy(di_args->path, rcu_str_deref(dev->name), 3161 sizeof(di_args->path) - 1); 3162 di_args->path[sizeof(di_args->path) - 1] = 0; 3163 } else { 3164 di_args->path[0] = '\0'; 3165 } 3166 3167 out: 3168 rcu_read_unlock(); 3169 if (ret == 0 && copy_to_user(arg, di_args, sizeof(*di_args))) 3170 ret = -EFAULT; 3171 3172 kfree(di_args); 3173 return ret; 3174 } 3175 3176 static struct page *extent_same_get_page(struct inode *inode, pgoff_t index) 3177 { 3178 struct page *page; 3179 3180 page = grab_cache_page(inode->i_mapping, index); 3181 if (!page) 3182 return ERR_PTR(-ENOMEM); 3183 3184 if (!PageUptodate(page)) { 3185 int ret; 3186 3187 ret = btrfs_readpage(NULL, page); 3188 if (ret) 3189 return ERR_PTR(ret); 3190 lock_page(page); 3191 if (!PageUptodate(page)) { 3192 unlock_page(page); 3193 put_page(page); 3194 return ERR_PTR(-EIO); 3195 } 3196 if (page->mapping != inode->i_mapping) { 3197 unlock_page(page); 3198 put_page(page); 3199 return ERR_PTR(-EAGAIN); 3200 } 3201 } 3202 3203 return page; 3204 } 3205 3206 static int gather_extent_pages(struct inode *inode, struct page **pages, 3207 int num_pages, u64 off) 3208 { 3209 int i; 3210 pgoff_t index = off >> PAGE_SHIFT; 3211 3212 for (i = 0; i < num_pages; i++) { 3213 again: 3214 pages[i] = extent_same_get_page(inode, index + i); 3215 if (IS_ERR(pages[i])) { 3216 int err = PTR_ERR(pages[i]); 3217 3218 if (err == -EAGAIN) 3219 goto again; 3220 pages[i] = NULL; 3221 return err; 3222 } 3223 } 3224 return 0; 3225 } 3226 3227 static int lock_extent_range(struct inode *inode, u64 off, u64 len, 3228 bool retry_range_locking) 3229 { 3230 /* 3231 * Do any pending delalloc/csum calculations on inode, one way or 3232 * another, and lock file content. 3233 * The locking order is: 3234 * 3235 * 1) pages 3236 * 2) range in the inode's io tree 3237 */ 3238 while (1) { 3239 struct btrfs_ordered_extent *ordered; 3240 lock_extent(&BTRFS_I(inode)->io_tree, off, off + len - 1); 3241 ordered = btrfs_lookup_first_ordered_extent(inode, 3242 off + len - 1); 3243 if ((!ordered || 3244 ordered->file_offset + ordered->len <= off || 3245 ordered->file_offset >= off + len) && 3246 !test_range_bit(&BTRFS_I(inode)->io_tree, off, 3247 off + len - 1, EXTENT_DELALLOC, 0, NULL)) { 3248 if (ordered) 3249 btrfs_put_ordered_extent(ordered); 3250 break; 3251 } 3252 unlock_extent(&BTRFS_I(inode)->io_tree, off, off + len - 1); 3253 if (ordered) 3254 btrfs_put_ordered_extent(ordered); 3255 if (!retry_range_locking) 3256 return -EAGAIN; 3257 btrfs_wait_ordered_range(inode, off, len); 3258 } 3259 return 0; 3260 } 3261 3262 static void btrfs_double_inode_unlock(struct inode *inode1, struct inode *inode2) 3263 { 3264 inode_unlock(inode1); 3265 inode_unlock(inode2); 3266 } 3267 3268 static void btrfs_double_inode_lock(struct inode *inode1, struct inode *inode2) 3269 { 3270 if (inode1 < inode2) 3271 swap(inode1, inode2); 3272 3273 inode_lock_nested(inode1, I_MUTEX_PARENT); 3274 inode_lock_nested(inode2, I_MUTEX_CHILD); 3275 } 3276 3277 static void btrfs_double_extent_unlock(struct inode *inode1, u64 loff1, 3278 struct inode *inode2, u64 loff2, u64 len) 3279 { 3280 unlock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1); 3281 unlock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1); 3282 } 3283 3284 static int btrfs_double_extent_lock(struct inode *inode1, u64 loff1, 3285 struct inode *inode2, u64 loff2, u64 len, 3286 bool retry_range_locking) 3287 { 3288 int ret; 3289 3290 if (inode1 < inode2) { 3291 swap(inode1, inode2); 3292 swap(loff1, loff2); 3293 } 3294 ret = lock_extent_range(inode1, loff1, len, retry_range_locking); 3295 if (ret) 3296 return ret; 3297 ret = lock_extent_range(inode2, loff2, len, retry_range_locking); 3298 if (ret) 3299 unlock_extent(&BTRFS_I(inode1)->io_tree, loff1, 3300 loff1 + len - 1); 3301 return ret; 3302 } 3303 3304 struct cmp_pages { 3305 int num_pages; 3306 struct page **src_pages; 3307 struct page **dst_pages; 3308 }; 3309 3310 static void btrfs_cmp_data_free(struct cmp_pages *cmp) 3311 { 3312 int i; 3313 struct page *pg; 3314 3315 for (i = 0; i < cmp->num_pages; i++) { 3316 pg = cmp->src_pages[i]; 3317 if (pg) { 3318 unlock_page(pg); 3319 put_page(pg); 3320 cmp->src_pages[i] = NULL; 3321 } 3322 pg = cmp->dst_pages[i]; 3323 if (pg) { 3324 unlock_page(pg); 3325 put_page(pg); 3326 cmp->dst_pages[i] = NULL; 3327 } 3328 } 3329 } 3330 3331 static int btrfs_cmp_data_prepare(struct inode *src, u64 loff, 3332 struct inode *dst, u64 dst_loff, 3333 u64 len, struct cmp_pages *cmp) 3334 { 3335 int ret; 3336 int num_pages = PAGE_ALIGN(len) >> PAGE_SHIFT; 3337 3338 cmp->num_pages = num_pages; 3339 3340 ret = gather_extent_pages(src, cmp->src_pages, num_pages, loff); 3341 if (ret) 3342 goto out; 3343 3344 ret = gather_extent_pages(dst, cmp->dst_pages, num_pages, dst_loff); 3345 3346 out: 3347 if (ret) 3348 btrfs_cmp_data_free(cmp); 3349 return ret; 3350 } 3351 3352 static int btrfs_cmp_data(u64 len, struct cmp_pages *cmp) 3353 { 3354 int ret = 0; 3355 int i; 3356 struct page *src_page, *dst_page; 3357 unsigned int cmp_len = PAGE_SIZE; 3358 void *addr, *dst_addr; 3359 3360 i = 0; 3361 while (len) { 3362 if (len < PAGE_SIZE) 3363 cmp_len = len; 3364 3365 BUG_ON(i >= cmp->num_pages); 3366 3367 src_page = cmp->src_pages[i]; 3368 dst_page = cmp->dst_pages[i]; 3369 ASSERT(PageLocked(src_page)); 3370 ASSERT(PageLocked(dst_page)); 3371 3372 addr = kmap_atomic(src_page); 3373 dst_addr = kmap_atomic(dst_page); 3374 3375 flush_dcache_page(src_page); 3376 flush_dcache_page(dst_page); 3377 3378 if (memcmp(addr, dst_addr, cmp_len)) 3379 ret = -EBADE; 3380 3381 kunmap_atomic(addr); 3382 kunmap_atomic(dst_addr); 3383 3384 if (ret) 3385 break; 3386 3387 len -= cmp_len; 3388 i++; 3389 } 3390 3391 return ret; 3392 } 3393 3394 static int extent_same_check_offsets(struct inode *inode, u64 off, u64 *plen, 3395 u64 olen) 3396 { 3397 u64 len = *plen; 3398 u64 bs = BTRFS_I(inode)->root->fs_info->sb->s_blocksize; 3399 3400 if (off + olen > inode->i_size || off + olen < off) 3401 return -EINVAL; 3402 3403 /* if we extend to eof, continue to block boundary */ 3404 if (off + len == inode->i_size) 3405 *plen = len = ALIGN(inode->i_size, bs) - off; 3406 3407 /* Check that we are block aligned - btrfs_clone() requires this */ 3408 if (!IS_ALIGNED(off, bs) || !IS_ALIGNED(off + len, bs)) 3409 return -EINVAL; 3410 3411 return 0; 3412 } 3413 3414 static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 olen, 3415 struct inode *dst, u64 dst_loff, 3416 struct cmp_pages *cmp) 3417 { 3418 int ret; 3419 u64 len = olen; 3420 bool same_inode = (src == dst); 3421 u64 same_lock_start = 0; 3422 u64 same_lock_len = 0; 3423 3424 ret = extent_same_check_offsets(src, loff, &len, olen); 3425 if (ret) 3426 return ret; 3427 3428 ret = extent_same_check_offsets(dst, dst_loff, &len, olen); 3429 if (ret) 3430 return ret; 3431 3432 if (same_inode) { 3433 /* 3434 * Single inode case wants the same checks, except we 3435 * don't want our length pushed out past i_size as 3436 * comparing that data range makes no sense. 3437 * 3438 * extent_same_check_offsets() will do this for an 3439 * unaligned length at i_size, so catch it here and 3440 * reject the request. 3441 * 3442 * This effectively means we require aligned extents 3443 * for the single-inode case, whereas the other cases 3444 * allow an unaligned length so long as it ends at 3445 * i_size. 3446 */ 3447 if (len != olen) 3448 return -EINVAL; 3449 3450 /* Check for overlapping ranges */ 3451 if (dst_loff + len > loff && dst_loff < loff + len) 3452 return -EINVAL; 3453 3454 same_lock_start = min_t(u64, loff, dst_loff); 3455 same_lock_len = max_t(u64, loff, dst_loff) + len - same_lock_start; 3456 } 3457 3458 again: 3459 ret = btrfs_cmp_data_prepare(src, loff, dst, dst_loff, olen, cmp); 3460 if (ret) 3461 return ret; 3462 3463 if (same_inode) 3464 ret = lock_extent_range(src, same_lock_start, same_lock_len, 3465 false); 3466 else 3467 ret = btrfs_double_extent_lock(src, loff, dst, dst_loff, len, 3468 false); 3469 /* 3470 * If one of the inodes has dirty pages in the respective range or 3471 * ordered extents, we need to flush dellaloc and wait for all ordered 3472 * extents in the range. We must unlock the pages and the ranges in the 3473 * io trees to avoid deadlocks when flushing delalloc (requires locking 3474 * pages) and when waiting for ordered extents to complete (they require 3475 * range locking). 3476 */ 3477 if (ret == -EAGAIN) { 3478 /* 3479 * Ranges in the io trees already unlocked. Now unlock all 3480 * pages before waiting for all IO to complete. 3481 */ 3482 btrfs_cmp_data_free(cmp); 3483 if (same_inode) { 3484 btrfs_wait_ordered_range(src, same_lock_start, 3485 same_lock_len); 3486 } else { 3487 btrfs_wait_ordered_range(src, loff, len); 3488 btrfs_wait_ordered_range(dst, dst_loff, len); 3489 } 3490 goto again; 3491 } 3492 ASSERT(ret == 0); 3493 if (WARN_ON(ret)) { 3494 /* ranges in the io trees already unlocked */ 3495 btrfs_cmp_data_free(cmp); 3496 return ret; 3497 } 3498 3499 /* pass original length for comparison so we stay within i_size */ 3500 ret = btrfs_cmp_data(olen, cmp); 3501 if (ret == 0) 3502 ret = btrfs_clone(src, dst, loff, olen, len, dst_loff, 1); 3503 3504 if (same_inode) 3505 unlock_extent(&BTRFS_I(src)->io_tree, same_lock_start, 3506 same_lock_start + same_lock_len - 1); 3507 else 3508 btrfs_double_extent_unlock(src, loff, dst, dst_loff, len); 3509 3510 btrfs_cmp_data_free(cmp); 3511 3512 return ret; 3513 } 3514 3515 #define BTRFS_MAX_DEDUPE_LEN SZ_16M 3516 3517 static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen, 3518 struct inode *dst, u64 dst_loff) 3519 { 3520 int ret; 3521 struct cmp_pages cmp; 3522 int num_pages = PAGE_ALIGN(BTRFS_MAX_DEDUPE_LEN) >> PAGE_SHIFT; 3523 bool same_inode = (src == dst); 3524 u64 i, tail_len, chunk_count; 3525 3526 if (olen == 0) 3527 return 0; 3528 3529 if (same_inode) 3530 inode_lock(src); 3531 else 3532 btrfs_double_inode_lock(src, dst); 3533 3534 /* don't make the dst file partly checksummed */ 3535 if ((BTRFS_I(src)->flags & BTRFS_INODE_NODATASUM) != 3536 (BTRFS_I(dst)->flags & BTRFS_INODE_NODATASUM)) { 3537 ret = -EINVAL; 3538 goto out_unlock; 3539 } 3540 3541 tail_len = olen % BTRFS_MAX_DEDUPE_LEN; 3542 chunk_count = div_u64(olen, BTRFS_MAX_DEDUPE_LEN); 3543 if (chunk_count == 0) 3544 num_pages = PAGE_ALIGN(tail_len) >> PAGE_SHIFT; 3545 3546 /* 3547 * If deduping ranges in the same inode, locking rules make it 3548 * mandatory to always lock pages in ascending order to avoid deadlocks 3549 * with concurrent tasks (such as starting writeback/delalloc). 3550 */ 3551 if (same_inode && dst_loff < loff) 3552 swap(loff, dst_loff); 3553 3554 /* 3555 * We must gather up all the pages before we initiate our extent 3556 * locking. We use an array for the page pointers. Size of the array is 3557 * bounded by len, which is in turn bounded by BTRFS_MAX_DEDUPE_LEN. 3558 */ 3559 cmp.src_pages = kvmalloc_array(num_pages, sizeof(struct page *), 3560 GFP_KERNEL | __GFP_ZERO); 3561 cmp.dst_pages = kvmalloc_array(num_pages, sizeof(struct page *), 3562 GFP_KERNEL | __GFP_ZERO); 3563 if (!cmp.src_pages || !cmp.dst_pages) { 3564 ret = -ENOMEM; 3565 goto out_free; 3566 } 3567 3568 for (i = 0; i < chunk_count; i++) { 3569 ret = btrfs_extent_same_range(src, loff, BTRFS_MAX_DEDUPE_LEN, 3570 dst, dst_loff, &cmp); 3571 if (ret) 3572 goto out_free; 3573 3574 loff += BTRFS_MAX_DEDUPE_LEN; 3575 dst_loff += BTRFS_MAX_DEDUPE_LEN; 3576 } 3577 3578 if (tail_len > 0) 3579 ret = btrfs_extent_same_range(src, loff, tail_len, dst, 3580 dst_loff, &cmp); 3581 3582 out_free: 3583 kvfree(cmp.src_pages); 3584 kvfree(cmp.dst_pages); 3585 3586 out_unlock: 3587 if (same_inode) 3588 inode_unlock(src); 3589 else 3590 btrfs_double_inode_unlock(src, dst); 3591 3592 return ret; 3593 } 3594 3595 int btrfs_dedupe_file_range(struct file *src_file, loff_t src_loff, 3596 struct file *dst_file, loff_t dst_loff, 3597 u64 olen) 3598 { 3599 struct inode *src = file_inode(src_file); 3600 struct inode *dst = file_inode(dst_file); 3601 u64 bs = BTRFS_I(src)->root->fs_info->sb->s_blocksize; 3602 3603 if (WARN_ON_ONCE(bs < PAGE_SIZE)) { 3604 /* 3605 * Btrfs does not support blocksize < page_size. As a 3606 * result, btrfs_cmp_data() won't correctly handle 3607 * this situation without an update. 3608 */ 3609 return -EINVAL; 3610 } 3611 3612 return btrfs_extent_same(src, src_loff, olen, dst, dst_loff); 3613 } 3614 3615 static int clone_finish_inode_update(struct btrfs_trans_handle *trans, 3616 struct inode *inode, 3617 u64 endoff, 3618 const u64 destoff, 3619 const u64 olen, 3620 int no_time_update) 3621 { 3622 struct btrfs_root *root = BTRFS_I(inode)->root; 3623 int ret; 3624 3625 inode_inc_iversion(inode); 3626 if (!no_time_update) 3627 inode->i_mtime = inode->i_ctime = current_time(inode); 3628 /* 3629 * We round up to the block size at eof when determining which 3630 * extents to clone above, but shouldn't round up the file size. 3631 */ 3632 if (endoff > destoff + olen) 3633 endoff = destoff + olen; 3634 if (endoff > inode->i_size) 3635 btrfs_i_size_write(BTRFS_I(inode), endoff); 3636 3637 ret = btrfs_update_inode(trans, root, inode); 3638 if (ret) { 3639 btrfs_abort_transaction(trans, ret); 3640 btrfs_end_transaction(trans); 3641 goto out; 3642 } 3643 ret = btrfs_end_transaction(trans); 3644 out: 3645 return ret; 3646 } 3647 3648 static void clone_update_extent_map(struct btrfs_inode *inode, 3649 const struct btrfs_trans_handle *trans, 3650 const struct btrfs_path *path, 3651 const u64 hole_offset, 3652 const u64 hole_len) 3653 { 3654 struct extent_map_tree *em_tree = &inode->extent_tree; 3655 struct extent_map *em; 3656 int ret; 3657 3658 em = alloc_extent_map(); 3659 if (!em) { 3660 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags); 3661 return; 3662 } 3663 3664 if (path) { 3665 struct btrfs_file_extent_item *fi; 3666 3667 fi = btrfs_item_ptr(path->nodes[0], path->slots[0], 3668 struct btrfs_file_extent_item); 3669 btrfs_extent_item_to_extent_map(inode, path, fi, false, em); 3670 em->generation = -1; 3671 if (btrfs_file_extent_type(path->nodes[0], fi) == 3672 BTRFS_FILE_EXTENT_INLINE) 3673 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 3674 &inode->runtime_flags); 3675 } else { 3676 em->start = hole_offset; 3677 em->len = hole_len; 3678 em->ram_bytes = em->len; 3679 em->orig_start = hole_offset; 3680 em->block_start = EXTENT_MAP_HOLE; 3681 em->block_len = 0; 3682 em->orig_block_len = 0; 3683 em->compress_type = BTRFS_COMPRESS_NONE; 3684 em->generation = trans->transid; 3685 } 3686 3687 while (1) { 3688 write_lock(&em_tree->lock); 3689 ret = add_extent_mapping(em_tree, em, 1); 3690 write_unlock(&em_tree->lock); 3691 if (ret != -EEXIST) { 3692 free_extent_map(em); 3693 break; 3694 } 3695 btrfs_drop_extent_cache(inode, em->start, 3696 em->start + em->len - 1, 0); 3697 } 3698 3699 if (ret) 3700 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags); 3701 } 3702 3703 /* 3704 * Make sure we do not end up inserting an inline extent into a file that has 3705 * already other (non-inline) extents. If a file has an inline extent it can 3706 * not have any other extents and the (single) inline extent must start at the 3707 * file offset 0. Failing to respect these rules will lead to file corruption, 3708 * resulting in EIO errors on read/write operations, hitting BUG_ON's in mm, etc 3709 * 3710 * We can have extents that have been already written to disk or we can have 3711 * dirty ranges still in delalloc, in which case the extent maps and items are 3712 * created only when we run delalloc, and the delalloc ranges might fall outside 3713 * the range we are currently locking in the inode's io tree. So we check the 3714 * inode's i_size because of that (i_size updates are done while holding the 3715 * i_mutex, which we are holding here). 3716 * We also check to see if the inode has a size not greater than "datal" but has 3717 * extents beyond it, due to an fallocate with FALLOC_FL_KEEP_SIZE (and we are 3718 * protected against such concurrent fallocate calls by the i_mutex). 3719 * 3720 * If the file has no extents but a size greater than datal, do not allow the 3721 * copy because we would need turn the inline extent into a non-inline one (even 3722 * with NO_HOLES enabled). If we find our destination inode only has one inline 3723 * extent, just overwrite it with the source inline extent if its size is less 3724 * than the source extent's size, or we could copy the source inline extent's 3725 * data into the destination inode's inline extent if the later is greater then 3726 * the former. 3727 */ 3728 static int clone_copy_inline_extent(struct inode *dst, 3729 struct btrfs_trans_handle *trans, 3730 struct btrfs_path *path, 3731 struct btrfs_key *new_key, 3732 const u64 drop_start, 3733 const u64 datal, 3734 const u64 skip, 3735 const u64 size, 3736 char *inline_data) 3737 { 3738 struct btrfs_fs_info *fs_info = btrfs_sb(dst->i_sb); 3739 struct btrfs_root *root = BTRFS_I(dst)->root; 3740 const u64 aligned_end = ALIGN(new_key->offset + datal, 3741 fs_info->sectorsize); 3742 int ret; 3743 struct btrfs_key key; 3744 3745 if (new_key->offset > 0) 3746 return -EOPNOTSUPP; 3747 3748 key.objectid = btrfs_ino(BTRFS_I(dst)); 3749 key.type = BTRFS_EXTENT_DATA_KEY; 3750 key.offset = 0; 3751 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 3752 if (ret < 0) { 3753 return ret; 3754 } else if (ret > 0) { 3755 if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { 3756 ret = btrfs_next_leaf(root, path); 3757 if (ret < 0) 3758 return ret; 3759 else if (ret > 0) 3760 goto copy_inline_extent; 3761 } 3762 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 3763 if (key.objectid == btrfs_ino(BTRFS_I(dst)) && 3764 key.type == BTRFS_EXTENT_DATA_KEY) { 3765 ASSERT(key.offset > 0); 3766 return -EOPNOTSUPP; 3767 } 3768 } else if (i_size_read(dst) <= datal) { 3769 struct btrfs_file_extent_item *ei; 3770 u64 ext_len; 3771 3772 /* 3773 * If the file size is <= datal, make sure there are no other 3774 * extents following (can happen do to an fallocate call with 3775 * the flag FALLOC_FL_KEEP_SIZE). 3776 */ 3777 ei = btrfs_item_ptr(path->nodes[0], path->slots[0], 3778 struct btrfs_file_extent_item); 3779 /* 3780 * If it's an inline extent, it can not have other extents 3781 * following it. 3782 */ 3783 if (btrfs_file_extent_type(path->nodes[0], ei) == 3784 BTRFS_FILE_EXTENT_INLINE) 3785 goto copy_inline_extent; 3786 3787 ext_len = btrfs_file_extent_num_bytes(path->nodes[0], ei); 3788 if (ext_len > aligned_end) 3789 return -EOPNOTSUPP; 3790 3791 ret = btrfs_next_item(root, path); 3792 if (ret < 0) { 3793 return ret; 3794 } else if (ret == 0) { 3795 btrfs_item_key_to_cpu(path->nodes[0], &key, 3796 path->slots[0]); 3797 if (key.objectid == btrfs_ino(BTRFS_I(dst)) && 3798 key.type == BTRFS_EXTENT_DATA_KEY) 3799 return -EOPNOTSUPP; 3800 } 3801 } 3802 3803 copy_inline_extent: 3804 /* 3805 * We have no extent items, or we have an extent at offset 0 which may 3806 * or may not be inlined. All these cases are dealt the same way. 3807 */ 3808 if (i_size_read(dst) > datal) { 3809 /* 3810 * If the destination inode has an inline extent... 3811 * This would require copying the data from the source inline 3812 * extent into the beginning of the destination's inline extent. 3813 * But this is really complex, both extents can be compressed 3814 * or just one of them, which would require decompressing and 3815 * re-compressing data (which could increase the new compressed 3816 * size, not allowing the compressed data to fit anymore in an 3817 * inline extent). 3818 * So just don't support this case for now (it should be rare, 3819 * we are not really saving space when cloning inline extents). 3820 */ 3821 return -EOPNOTSUPP; 3822 } 3823 3824 btrfs_release_path(path); 3825 ret = btrfs_drop_extents(trans, root, dst, drop_start, aligned_end, 1); 3826 if (ret) 3827 return ret; 3828 ret = btrfs_insert_empty_item(trans, root, path, new_key, size); 3829 if (ret) 3830 return ret; 3831 3832 if (skip) { 3833 const u32 start = btrfs_file_extent_calc_inline_size(0); 3834 3835 memmove(inline_data + start, inline_data + start + skip, datal); 3836 } 3837 3838 write_extent_buffer(path->nodes[0], inline_data, 3839 btrfs_item_ptr_offset(path->nodes[0], 3840 path->slots[0]), 3841 size); 3842 inode_add_bytes(dst, datal); 3843 3844 return 0; 3845 } 3846 3847 /** 3848 * btrfs_clone() - clone a range from inode file to another 3849 * 3850 * @src: Inode to clone from 3851 * @inode: Inode to clone to 3852 * @off: Offset within source to start clone from 3853 * @olen: Original length, passed by user, of range to clone 3854 * @olen_aligned: Block-aligned value of olen 3855 * @destoff: Offset within @inode to start clone 3856 * @no_time_update: Whether to update mtime/ctime on the target inode 3857 */ 3858 static int btrfs_clone(struct inode *src, struct inode *inode, 3859 const u64 off, const u64 olen, const u64 olen_aligned, 3860 const u64 destoff, int no_time_update) 3861 { 3862 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 3863 struct btrfs_root *root = BTRFS_I(inode)->root; 3864 struct btrfs_path *path = NULL; 3865 struct extent_buffer *leaf; 3866 struct btrfs_trans_handle *trans; 3867 char *buf = NULL; 3868 struct btrfs_key key; 3869 u32 nritems; 3870 int slot; 3871 int ret; 3872 const u64 len = olen_aligned; 3873 u64 last_dest_end = destoff; 3874 3875 ret = -ENOMEM; 3876 buf = kvmalloc(fs_info->nodesize, GFP_KERNEL); 3877 if (!buf) 3878 return ret; 3879 3880 path = btrfs_alloc_path(); 3881 if (!path) { 3882 kvfree(buf); 3883 return ret; 3884 } 3885 3886 path->reada = READA_FORWARD; 3887 /* clone data */ 3888 key.objectid = btrfs_ino(BTRFS_I(src)); 3889 key.type = BTRFS_EXTENT_DATA_KEY; 3890 key.offset = off; 3891 3892 while (1) { 3893 u64 next_key_min_offset = key.offset + 1; 3894 3895 /* 3896 * note the key will change type as we walk through the 3897 * tree. 3898 */ 3899 path->leave_spinning = 1; 3900 ret = btrfs_search_slot(NULL, BTRFS_I(src)->root, &key, path, 3901 0, 0); 3902 if (ret < 0) 3903 goto out; 3904 /* 3905 * First search, if no extent item that starts at offset off was 3906 * found but the previous item is an extent item, it's possible 3907 * it might overlap our target range, therefore process it. 3908 */ 3909 if (key.offset == off && ret > 0 && path->slots[0] > 0) { 3910 btrfs_item_key_to_cpu(path->nodes[0], &key, 3911 path->slots[0] - 1); 3912 if (key.type == BTRFS_EXTENT_DATA_KEY) 3913 path->slots[0]--; 3914 } 3915 3916 nritems = btrfs_header_nritems(path->nodes[0]); 3917 process_slot: 3918 if (path->slots[0] >= nritems) { 3919 ret = btrfs_next_leaf(BTRFS_I(src)->root, path); 3920 if (ret < 0) 3921 goto out; 3922 if (ret > 0) 3923 break; 3924 nritems = btrfs_header_nritems(path->nodes[0]); 3925 } 3926 leaf = path->nodes[0]; 3927 slot = path->slots[0]; 3928 3929 btrfs_item_key_to_cpu(leaf, &key, slot); 3930 if (key.type > BTRFS_EXTENT_DATA_KEY || 3931 key.objectid != btrfs_ino(BTRFS_I(src))) 3932 break; 3933 3934 if (key.type == BTRFS_EXTENT_DATA_KEY) { 3935 struct btrfs_file_extent_item *extent; 3936 int type; 3937 u32 size; 3938 struct btrfs_key new_key; 3939 u64 disko = 0, diskl = 0; 3940 u64 datao = 0, datal = 0; 3941 u8 comp; 3942 u64 drop_start; 3943 3944 extent = btrfs_item_ptr(leaf, slot, 3945 struct btrfs_file_extent_item); 3946 comp = btrfs_file_extent_compression(leaf, extent); 3947 type = btrfs_file_extent_type(leaf, extent); 3948 if (type == BTRFS_FILE_EXTENT_REG || 3949 type == BTRFS_FILE_EXTENT_PREALLOC) { 3950 disko = btrfs_file_extent_disk_bytenr(leaf, 3951 extent); 3952 diskl = btrfs_file_extent_disk_num_bytes(leaf, 3953 extent); 3954 datao = btrfs_file_extent_offset(leaf, extent); 3955 datal = btrfs_file_extent_num_bytes(leaf, 3956 extent); 3957 } else if (type == BTRFS_FILE_EXTENT_INLINE) { 3958 /* take upper bound, may be compressed */ 3959 datal = btrfs_file_extent_ram_bytes(leaf, 3960 extent); 3961 } 3962 3963 /* 3964 * The first search might have left us at an extent 3965 * item that ends before our target range's start, can 3966 * happen if we have holes and NO_HOLES feature enabled. 3967 */ 3968 if (key.offset + datal <= off) { 3969 path->slots[0]++; 3970 goto process_slot; 3971 } else if (key.offset >= off + len) { 3972 break; 3973 } 3974 next_key_min_offset = key.offset + datal; 3975 size = btrfs_item_size_nr(leaf, slot); 3976 read_extent_buffer(leaf, buf, 3977 btrfs_item_ptr_offset(leaf, slot), 3978 size); 3979 3980 btrfs_release_path(path); 3981 path->leave_spinning = 0; 3982 3983 memcpy(&new_key, &key, sizeof(new_key)); 3984 new_key.objectid = btrfs_ino(BTRFS_I(inode)); 3985 if (off <= key.offset) 3986 new_key.offset = key.offset + destoff - off; 3987 else 3988 new_key.offset = destoff; 3989 3990 /* 3991 * Deal with a hole that doesn't have an extent item 3992 * that represents it (NO_HOLES feature enabled). 3993 * This hole is either in the middle of the cloning 3994 * range or at the beginning (fully overlaps it or 3995 * partially overlaps it). 3996 */ 3997 if (new_key.offset != last_dest_end) 3998 drop_start = last_dest_end; 3999 else 4000 drop_start = new_key.offset; 4001 4002 /* 4003 * 1 - adjusting old extent (we may have to split it) 4004 * 1 - add new extent 4005 * 1 - inode update 4006 */ 4007 trans = btrfs_start_transaction(root, 3); 4008 if (IS_ERR(trans)) { 4009 ret = PTR_ERR(trans); 4010 goto out; 4011 } 4012 4013 if (type == BTRFS_FILE_EXTENT_REG || 4014 type == BTRFS_FILE_EXTENT_PREALLOC) { 4015 /* 4016 * a | --- range to clone ---| b 4017 * | ------------- extent ------------- | 4018 */ 4019 4020 /* subtract range b */ 4021 if (key.offset + datal > off + len) 4022 datal = off + len - key.offset; 4023 4024 /* subtract range a */ 4025 if (off > key.offset) { 4026 datao += off - key.offset; 4027 datal -= off - key.offset; 4028 } 4029 4030 ret = btrfs_drop_extents(trans, root, inode, 4031 drop_start, 4032 new_key.offset + datal, 4033 1); 4034 if (ret) { 4035 if (ret != -EOPNOTSUPP) 4036 btrfs_abort_transaction(trans, 4037 ret); 4038 btrfs_end_transaction(trans); 4039 goto out; 4040 } 4041 4042 ret = btrfs_insert_empty_item(trans, root, path, 4043 &new_key, size); 4044 if (ret) { 4045 btrfs_abort_transaction(trans, ret); 4046 btrfs_end_transaction(trans); 4047 goto out; 4048 } 4049 4050 leaf = path->nodes[0]; 4051 slot = path->slots[0]; 4052 write_extent_buffer(leaf, buf, 4053 btrfs_item_ptr_offset(leaf, slot), 4054 size); 4055 4056 extent = btrfs_item_ptr(leaf, slot, 4057 struct btrfs_file_extent_item); 4058 4059 /* disko == 0 means it's a hole */ 4060 if (!disko) 4061 datao = 0; 4062 4063 btrfs_set_file_extent_offset(leaf, extent, 4064 datao); 4065 btrfs_set_file_extent_num_bytes(leaf, extent, 4066 datal); 4067 4068 if (disko) { 4069 inode_add_bytes(inode, datal); 4070 ret = btrfs_inc_extent_ref(trans, 4071 root, 4072 disko, diskl, 0, 4073 root->root_key.objectid, 4074 btrfs_ino(BTRFS_I(inode)), 4075 new_key.offset - datao); 4076 if (ret) { 4077 btrfs_abort_transaction(trans, 4078 ret); 4079 btrfs_end_transaction(trans); 4080 goto out; 4081 4082 } 4083 } 4084 } else if (type == BTRFS_FILE_EXTENT_INLINE) { 4085 u64 skip = 0; 4086 u64 trim = 0; 4087 4088 if (off > key.offset) { 4089 skip = off - key.offset; 4090 new_key.offset += skip; 4091 } 4092 4093 if (key.offset + datal > off + len) 4094 trim = key.offset + datal - (off + len); 4095 4096 if (comp && (skip || trim)) { 4097 ret = -EINVAL; 4098 btrfs_end_transaction(trans); 4099 goto out; 4100 } 4101 size -= skip + trim; 4102 datal -= skip + trim; 4103 4104 ret = clone_copy_inline_extent(inode, 4105 trans, path, 4106 &new_key, 4107 drop_start, 4108 datal, 4109 skip, size, buf); 4110 if (ret) { 4111 if (ret != -EOPNOTSUPP) 4112 btrfs_abort_transaction(trans, 4113 ret); 4114 btrfs_end_transaction(trans); 4115 goto out; 4116 } 4117 leaf = path->nodes[0]; 4118 slot = path->slots[0]; 4119 } 4120 4121 /* If we have an implicit hole (NO_HOLES feature). */ 4122 if (drop_start < new_key.offset) 4123 clone_update_extent_map(BTRFS_I(inode), trans, 4124 NULL, drop_start, 4125 new_key.offset - drop_start); 4126 4127 clone_update_extent_map(BTRFS_I(inode), trans, 4128 path, 0, 0); 4129 4130 btrfs_mark_buffer_dirty(leaf); 4131 btrfs_release_path(path); 4132 4133 last_dest_end = ALIGN(new_key.offset + datal, 4134 fs_info->sectorsize); 4135 ret = clone_finish_inode_update(trans, inode, 4136 last_dest_end, 4137 destoff, olen, 4138 no_time_update); 4139 if (ret) 4140 goto out; 4141 if (new_key.offset + datal >= destoff + len) 4142 break; 4143 } 4144 btrfs_release_path(path); 4145 key.offset = next_key_min_offset; 4146 4147 if (fatal_signal_pending(current)) { 4148 ret = -EINTR; 4149 goto out; 4150 } 4151 } 4152 ret = 0; 4153 4154 if (last_dest_end < destoff + len) { 4155 /* 4156 * We have an implicit hole (NO_HOLES feature is enabled) that 4157 * fully or partially overlaps our cloning range at its end. 4158 */ 4159 btrfs_release_path(path); 4160 4161 /* 4162 * 1 - remove extent(s) 4163 * 1 - inode update 4164 */ 4165 trans = btrfs_start_transaction(root, 2); 4166 if (IS_ERR(trans)) { 4167 ret = PTR_ERR(trans); 4168 goto out; 4169 } 4170 ret = btrfs_drop_extents(trans, root, inode, 4171 last_dest_end, destoff + len, 1); 4172 if (ret) { 4173 if (ret != -EOPNOTSUPP) 4174 btrfs_abort_transaction(trans, ret); 4175 btrfs_end_transaction(trans); 4176 goto out; 4177 } 4178 clone_update_extent_map(BTRFS_I(inode), trans, NULL, 4179 last_dest_end, 4180 destoff + len - last_dest_end); 4181 ret = clone_finish_inode_update(trans, inode, destoff + len, 4182 destoff, olen, no_time_update); 4183 } 4184 4185 out: 4186 btrfs_free_path(path); 4187 kvfree(buf); 4188 return ret; 4189 } 4190 4191 static noinline int btrfs_clone_files(struct file *file, struct file *file_src, 4192 u64 off, u64 olen, u64 destoff) 4193 { 4194 struct inode *inode = file_inode(file); 4195 struct inode *src = file_inode(file_src); 4196 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 4197 struct btrfs_root *root = BTRFS_I(inode)->root; 4198 int ret; 4199 u64 len = olen; 4200 u64 bs = fs_info->sb->s_blocksize; 4201 int same_inode = src == inode; 4202 4203 /* 4204 * TODO: 4205 * - split compressed inline extents. annoying: we need to 4206 * decompress into destination's address_space (the file offset 4207 * may change, so source mapping won't do), then recompress (or 4208 * otherwise reinsert) a subrange. 4209 * 4210 * - split destination inode's inline extents. The inline extents can 4211 * be either compressed or non-compressed. 4212 */ 4213 4214 if (btrfs_root_readonly(root)) 4215 return -EROFS; 4216 4217 if (file_src->f_path.mnt != file->f_path.mnt || 4218 src->i_sb != inode->i_sb) 4219 return -EXDEV; 4220 4221 if (S_ISDIR(src->i_mode) || S_ISDIR(inode->i_mode)) 4222 return -EISDIR; 4223 4224 if (!same_inode) { 4225 btrfs_double_inode_lock(src, inode); 4226 } else { 4227 inode_lock(src); 4228 } 4229 4230 /* don't make the dst file partly checksummed */ 4231 if ((BTRFS_I(src)->flags & BTRFS_INODE_NODATASUM) != 4232 (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { 4233 ret = -EINVAL; 4234 goto out_unlock; 4235 } 4236 4237 /* determine range to clone */ 4238 ret = -EINVAL; 4239 if (off + len > src->i_size || off + len < off) 4240 goto out_unlock; 4241 if (len == 0) 4242 olen = len = src->i_size - off; 4243 /* if we extend to eof, continue to block boundary */ 4244 if (off + len == src->i_size) 4245 len = ALIGN(src->i_size, bs) - off; 4246 4247 if (len == 0) { 4248 ret = 0; 4249 goto out_unlock; 4250 } 4251 4252 /* verify the end result is block aligned */ 4253 if (!IS_ALIGNED(off, bs) || !IS_ALIGNED(off + len, bs) || 4254 !IS_ALIGNED(destoff, bs)) 4255 goto out_unlock; 4256 4257 /* verify if ranges are overlapped within the same file */ 4258 if (same_inode) { 4259 if (destoff + len > off && destoff < off + len) 4260 goto out_unlock; 4261 } 4262 4263 if (destoff > inode->i_size) { 4264 ret = btrfs_cont_expand(inode, inode->i_size, destoff); 4265 if (ret) 4266 goto out_unlock; 4267 } 4268 4269 /* 4270 * Lock the target range too. Right after we replace the file extent 4271 * items in the fs tree (which now point to the cloned data), we might 4272 * have a worker replace them with extent items relative to a write 4273 * operation that was issued before this clone operation (i.e. confront 4274 * with inode.c:btrfs_finish_ordered_io). 4275 */ 4276 if (same_inode) { 4277 u64 lock_start = min_t(u64, off, destoff); 4278 u64 lock_len = max_t(u64, off, destoff) + len - lock_start; 4279 4280 ret = lock_extent_range(src, lock_start, lock_len, true); 4281 } else { 4282 ret = btrfs_double_extent_lock(src, off, inode, destoff, len, 4283 true); 4284 } 4285 ASSERT(ret == 0); 4286 if (WARN_ON(ret)) { 4287 /* ranges in the io trees already unlocked */ 4288 goto out_unlock; 4289 } 4290 4291 ret = btrfs_clone(src, inode, off, olen, len, destoff, 0); 4292 4293 if (same_inode) { 4294 u64 lock_start = min_t(u64, off, destoff); 4295 u64 lock_end = max_t(u64, off, destoff) + len - 1; 4296 4297 unlock_extent(&BTRFS_I(src)->io_tree, lock_start, lock_end); 4298 } else { 4299 btrfs_double_extent_unlock(src, off, inode, destoff, len); 4300 } 4301 /* 4302 * Truncate page cache pages so that future reads will see the cloned 4303 * data immediately and not the previous data. 4304 */ 4305 truncate_inode_pages_range(&inode->i_data, 4306 round_down(destoff, PAGE_SIZE), 4307 round_up(destoff + len, PAGE_SIZE) - 1); 4308 out_unlock: 4309 if (!same_inode) 4310 btrfs_double_inode_unlock(src, inode); 4311 else 4312 inode_unlock(src); 4313 return ret; 4314 } 4315 4316 int btrfs_clone_file_range(struct file *src_file, loff_t off, 4317 struct file *dst_file, loff_t destoff, u64 len) 4318 { 4319 return btrfs_clone_files(dst_file, src_file, off, len, destoff); 4320 } 4321 4322 static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp) 4323 { 4324 struct inode *inode = file_inode(file); 4325 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 4326 struct btrfs_root *root = BTRFS_I(inode)->root; 4327 struct btrfs_root *new_root; 4328 struct btrfs_dir_item *di; 4329 struct btrfs_trans_handle *trans; 4330 struct btrfs_path *path; 4331 struct btrfs_key location; 4332 struct btrfs_disk_key disk_key; 4333 u64 objectid = 0; 4334 u64 dir_id; 4335 int ret; 4336 4337 if (!capable(CAP_SYS_ADMIN)) 4338 return -EPERM; 4339 4340 ret = mnt_want_write_file(file); 4341 if (ret) 4342 return ret; 4343 4344 if (copy_from_user(&objectid, argp, sizeof(objectid))) { 4345 ret = -EFAULT; 4346 goto out; 4347 } 4348 4349 if (!objectid) 4350 objectid = BTRFS_FS_TREE_OBJECTID; 4351 4352 location.objectid = objectid; 4353 location.type = BTRFS_ROOT_ITEM_KEY; 4354 location.offset = (u64)-1; 4355 4356 new_root = btrfs_read_fs_root_no_name(fs_info, &location); 4357 if (IS_ERR(new_root)) { 4358 ret = PTR_ERR(new_root); 4359 goto out; 4360 } 4361 if (!is_fstree(new_root->objectid)) { 4362 ret = -ENOENT; 4363 goto out; 4364 } 4365 4366 path = btrfs_alloc_path(); 4367 if (!path) { 4368 ret = -ENOMEM; 4369 goto out; 4370 } 4371 path->leave_spinning = 1; 4372 4373 trans = btrfs_start_transaction(root, 1); 4374 if (IS_ERR(trans)) { 4375 btrfs_free_path(path); 4376 ret = PTR_ERR(trans); 4377 goto out; 4378 } 4379 4380 dir_id = btrfs_super_root_dir(fs_info->super_copy); 4381 di = btrfs_lookup_dir_item(trans, fs_info->tree_root, path, 4382 dir_id, "default", 7, 1); 4383 if (IS_ERR_OR_NULL(di)) { 4384 btrfs_free_path(path); 4385 btrfs_end_transaction(trans); 4386 btrfs_err(fs_info, 4387 "Umm, you don't have the default diritem, this isn't going to work"); 4388 ret = -ENOENT; 4389 goto out; 4390 } 4391 4392 btrfs_cpu_key_to_disk(&disk_key, &new_root->root_key); 4393 btrfs_set_dir_item_key(path->nodes[0], di, &disk_key); 4394 btrfs_mark_buffer_dirty(path->nodes[0]); 4395 btrfs_free_path(path); 4396 4397 btrfs_set_fs_incompat(fs_info, DEFAULT_SUBVOL); 4398 btrfs_end_transaction(trans); 4399 out: 4400 mnt_drop_write_file(file); 4401 return ret; 4402 } 4403 4404 static void get_block_group_info(struct list_head *groups_list, 4405 struct btrfs_ioctl_space_info *space) 4406 { 4407 struct btrfs_block_group_cache *block_group; 4408 4409 space->total_bytes = 0; 4410 space->used_bytes = 0; 4411 space->flags = 0; 4412 list_for_each_entry(block_group, groups_list, list) { 4413 space->flags = block_group->flags; 4414 space->total_bytes += block_group->key.offset; 4415 space->used_bytes += 4416 btrfs_block_group_used(&block_group->item); 4417 } 4418 } 4419 4420 static long btrfs_ioctl_space_info(struct btrfs_fs_info *fs_info, 4421 void __user *arg) 4422 { 4423 struct btrfs_ioctl_space_args space_args; 4424 struct btrfs_ioctl_space_info space; 4425 struct btrfs_ioctl_space_info *dest; 4426 struct btrfs_ioctl_space_info *dest_orig; 4427 struct btrfs_ioctl_space_info __user *user_dest; 4428 struct btrfs_space_info *info; 4429 static const u64 types[] = { 4430 BTRFS_BLOCK_GROUP_DATA, 4431 BTRFS_BLOCK_GROUP_SYSTEM, 4432 BTRFS_BLOCK_GROUP_METADATA, 4433 BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA 4434 }; 4435 int num_types = 4; 4436 int alloc_size; 4437 int ret = 0; 4438 u64 slot_count = 0; 4439 int i, c; 4440 4441 if (copy_from_user(&space_args, 4442 (struct btrfs_ioctl_space_args __user *)arg, 4443 sizeof(space_args))) 4444 return -EFAULT; 4445 4446 for (i = 0; i < num_types; i++) { 4447 struct btrfs_space_info *tmp; 4448 4449 info = NULL; 4450 rcu_read_lock(); 4451 list_for_each_entry_rcu(tmp, &fs_info->space_info, 4452 list) { 4453 if (tmp->flags == types[i]) { 4454 info = tmp; 4455 break; 4456 } 4457 } 4458 rcu_read_unlock(); 4459 4460 if (!info) 4461 continue; 4462 4463 down_read(&info->groups_sem); 4464 for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) { 4465 if (!list_empty(&info->block_groups[c])) 4466 slot_count++; 4467 } 4468 up_read(&info->groups_sem); 4469 } 4470 4471 /* 4472 * Global block reserve, exported as a space_info 4473 */ 4474 slot_count++; 4475 4476 /* space_slots == 0 means they are asking for a count */ 4477 if (space_args.space_slots == 0) { 4478 space_args.total_spaces = slot_count; 4479 goto out; 4480 } 4481 4482 slot_count = min_t(u64, space_args.space_slots, slot_count); 4483 4484 alloc_size = sizeof(*dest) * slot_count; 4485 4486 /* we generally have at most 6 or so space infos, one for each raid 4487 * level. So, a whole page should be more than enough for everyone 4488 */ 4489 if (alloc_size > PAGE_SIZE) 4490 return -ENOMEM; 4491 4492 space_args.total_spaces = 0; 4493 dest = kmalloc(alloc_size, GFP_KERNEL); 4494 if (!dest) 4495 return -ENOMEM; 4496 dest_orig = dest; 4497 4498 /* now we have a buffer to copy into */ 4499 for (i = 0; i < num_types; i++) { 4500 struct btrfs_space_info *tmp; 4501 4502 if (!slot_count) 4503 break; 4504 4505 info = NULL; 4506 rcu_read_lock(); 4507 list_for_each_entry_rcu(tmp, &fs_info->space_info, 4508 list) { 4509 if (tmp->flags == types[i]) { 4510 info = tmp; 4511 break; 4512 } 4513 } 4514 rcu_read_unlock(); 4515 4516 if (!info) 4517 continue; 4518 down_read(&info->groups_sem); 4519 for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) { 4520 if (!list_empty(&info->block_groups[c])) { 4521 get_block_group_info(&info->block_groups[c], 4522 &space); 4523 memcpy(dest, &space, sizeof(space)); 4524 dest++; 4525 space_args.total_spaces++; 4526 slot_count--; 4527 } 4528 if (!slot_count) 4529 break; 4530 } 4531 up_read(&info->groups_sem); 4532 } 4533 4534 /* 4535 * Add global block reserve 4536 */ 4537 if (slot_count) { 4538 struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv; 4539 4540 spin_lock(&block_rsv->lock); 4541 space.total_bytes = block_rsv->size; 4542 space.used_bytes = block_rsv->size - block_rsv->reserved; 4543 spin_unlock(&block_rsv->lock); 4544 space.flags = BTRFS_SPACE_INFO_GLOBAL_RSV; 4545 memcpy(dest, &space, sizeof(space)); 4546 space_args.total_spaces++; 4547 } 4548 4549 user_dest = (struct btrfs_ioctl_space_info __user *) 4550 (arg + sizeof(struct btrfs_ioctl_space_args)); 4551 4552 if (copy_to_user(user_dest, dest_orig, alloc_size)) 4553 ret = -EFAULT; 4554 4555 kfree(dest_orig); 4556 out: 4557 if (ret == 0 && copy_to_user(arg, &space_args, sizeof(space_args))) 4558 ret = -EFAULT; 4559 4560 return ret; 4561 } 4562 4563 static noinline long btrfs_ioctl_start_sync(struct btrfs_root *root, 4564 void __user *argp) 4565 { 4566 struct btrfs_trans_handle *trans; 4567 u64 transid; 4568 int ret; 4569 4570 trans = btrfs_attach_transaction_barrier(root); 4571 if (IS_ERR(trans)) { 4572 if (PTR_ERR(trans) != -ENOENT) 4573 return PTR_ERR(trans); 4574 4575 /* No running transaction, don't bother */ 4576 transid = root->fs_info->last_trans_committed; 4577 goto out; 4578 } 4579 transid = trans->transid; 4580 ret = btrfs_commit_transaction_async(trans, 0); 4581 if (ret) { 4582 btrfs_end_transaction(trans); 4583 return ret; 4584 } 4585 out: 4586 if (argp) 4587 if (copy_to_user(argp, &transid, sizeof(transid))) 4588 return -EFAULT; 4589 return 0; 4590 } 4591 4592 static noinline long btrfs_ioctl_wait_sync(struct btrfs_fs_info *fs_info, 4593 void __user *argp) 4594 { 4595 u64 transid; 4596 4597 if (argp) { 4598 if (copy_from_user(&transid, argp, sizeof(transid))) 4599 return -EFAULT; 4600 } else { 4601 transid = 0; /* current trans */ 4602 } 4603 return btrfs_wait_for_commit(fs_info, transid); 4604 } 4605 4606 static long btrfs_ioctl_scrub(struct file *file, void __user *arg) 4607 { 4608 struct btrfs_fs_info *fs_info = btrfs_sb(file_inode(file)->i_sb); 4609 struct btrfs_ioctl_scrub_args *sa; 4610 int ret; 4611 4612 if (!capable(CAP_SYS_ADMIN)) 4613 return -EPERM; 4614 4615 sa = memdup_user(arg, sizeof(*sa)); 4616 if (IS_ERR(sa)) 4617 return PTR_ERR(sa); 4618 4619 if (!(sa->flags & BTRFS_SCRUB_READONLY)) { 4620 ret = mnt_want_write_file(file); 4621 if (ret) 4622 goto out; 4623 } 4624 4625 ret = btrfs_scrub_dev(fs_info, sa->devid, sa->start, sa->end, 4626 &sa->progress, sa->flags & BTRFS_SCRUB_READONLY, 4627 0); 4628 4629 if (copy_to_user(arg, sa, sizeof(*sa))) 4630 ret = -EFAULT; 4631 4632 if (!(sa->flags & BTRFS_SCRUB_READONLY)) 4633 mnt_drop_write_file(file); 4634 out: 4635 kfree(sa); 4636 return ret; 4637 } 4638 4639 static long btrfs_ioctl_scrub_cancel(struct btrfs_fs_info *fs_info) 4640 { 4641 if (!capable(CAP_SYS_ADMIN)) 4642 return -EPERM; 4643 4644 return btrfs_scrub_cancel(fs_info); 4645 } 4646 4647 static long btrfs_ioctl_scrub_progress(struct btrfs_fs_info *fs_info, 4648 void __user *arg) 4649 { 4650 struct btrfs_ioctl_scrub_args *sa; 4651 int ret; 4652 4653 if (!capable(CAP_SYS_ADMIN)) 4654 return -EPERM; 4655 4656 sa = memdup_user(arg, sizeof(*sa)); 4657 if (IS_ERR(sa)) 4658 return PTR_ERR(sa); 4659 4660 ret = btrfs_scrub_progress(fs_info, sa->devid, &sa->progress); 4661 4662 if (copy_to_user(arg, sa, sizeof(*sa))) 4663 ret = -EFAULT; 4664 4665 kfree(sa); 4666 return ret; 4667 } 4668 4669 static long btrfs_ioctl_get_dev_stats(struct btrfs_fs_info *fs_info, 4670 void __user *arg) 4671 { 4672 struct btrfs_ioctl_get_dev_stats *sa; 4673 int ret; 4674 4675 sa = memdup_user(arg, sizeof(*sa)); 4676 if (IS_ERR(sa)) 4677 return PTR_ERR(sa); 4678 4679 if ((sa->flags & BTRFS_DEV_STATS_RESET) && !capable(CAP_SYS_ADMIN)) { 4680 kfree(sa); 4681 return -EPERM; 4682 } 4683 4684 ret = btrfs_get_dev_stats(fs_info, sa); 4685 4686 if (copy_to_user(arg, sa, sizeof(*sa))) 4687 ret = -EFAULT; 4688 4689 kfree(sa); 4690 return ret; 4691 } 4692 4693 static long btrfs_ioctl_dev_replace(struct btrfs_fs_info *fs_info, 4694 void __user *arg) 4695 { 4696 struct btrfs_ioctl_dev_replace_args *p; 4697 int ret; 4698 4699 if (!capable(CAP_SYS_ADMIN)) 4700 return -EPERM; 4701 4702 p = memdup_user(arg, sizeof(*p)); 4703 if (IS_ERR(p)) 4704 return PTR_ERR(p); 4705 4706 switch (p->cmd) { 4707 case BTRFS_IOCTL_DEV_REPLACE_CMD_START: 4708 if (sb_rdonly(fs_info->sb)) { 4709 ret = -EROFS; 4710 goto out; 4711 } 4712 if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) { 4713 ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; 4714 } else { 4715 ret = btrfs_dev_replace_by_ioctl(fs_info, p); 4716 clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); 4717 } 4718 break; 4719 case BTRFS_IOCTL_DEV_REPLACE_CMD_STATUS: 4720 btrfs_dev_replace_status(fs_info, p); 4721 ret = 0; 4722 break; 4723 case BTRFS_IOCTL_DEV_REPLACE_CMD_CANCEL: 4724 p->result = btrfs_dev_replace_cancel(fs_info); 4725 ret = 0; 4726 break; 4727 default: 4728 ret = -EINVAL; 4729 break; 4730 } 4731 4732 if (copy_to_user(arg, p, sizeof(*p))) 4733 ret = -EFAULT; 4734 out: 4735 kfree(p); 4736 return ret; 4737 } 4738 4739 static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg) 4740 { 4741 int ret = 0; 4742 int i; 4743 u64 rel_ptr; 4744 int size; 4745 struct btrfs_ioctl_ino_path_args *ipa = NULL; 4746 struct inode_fs_paths *ipath = NULL; 4747 struct btrfs_path *path; 4748 4749 if (!capable(CAP_DAC_READ_SEARCH)) 4750 return -EPERM; 4751 4752 path = btrfs_alloc_path(); 4753 if (!path) { 4754 ret = -ENOMEM; 4755 goto out; 4756 } 4757 4758 ipa = memdup_user(arg, sizeof(*ipa)); 4759 if (IS_ERR(ipa)) { 4760 ret = PTR_ERR(ipa); 4761 ipa = NULL; 4762 goto out; 4763 } 4764 4765 size = min_t(u32, ipa->size, 4096); 4766 ipath = init_ipath(size, root, path); 4767 if (IS_ERR(ipath)) { 4768 ret = PTR_ERR(ipath); 4769 ipath = NULL; 4770 goto out; 4771 } 4772 4773 ret = paths_from_inode(ipa->inum, ipath); 4774 if (ret < 0) 4775 goto out; 4776 4777 for (i = 0; i < ipath->fspath->elem_cnt; ++i) { 4778 rel_ptr = ipath->fspath->val[i] - 4779 (u64)(unsigned long)ipath->fspath->val; 4780 ipath->fspath->val[i] = rel_ptr; 4781 } 4782 4783 ret = copy_to_user((void __user *)(unsigned long)ipa->fspath, 4784 ipath->fspath, size); 4785 if (ret) { 4786 ret = -EFAULT; 4787 goto out; 4788 } 4789 4790 out: 4791 btrfs_free_path(path); 4792 free_ipath(ipath); 4793 kfree(ipa); 4794 4795 return ret; 4796 } 4797 4798 static int build_ino_list(u64 inum, u64 offset, u64 root, void *ctx) 4799 { 4800 struct btrfs_data_container *inodes = ctx; 4801 const size_t c = 3 * sizeof(u64); 4802 4803 if (inodes->bytes_left >= c) { 4804 inodes->bytes_left -= c; 4805 inodes->val[inodes->elem_cnt] = inum; 4806 inodes->val[inodes->elem_cnt + 1] = offset; 4807 inodes->val[inodes->elem_cnt + 2] = root; 4808 inodes->elem_cnt += 3; 4809 } else { 4810 inodes->bytes_missing += c - inodes->bytes_left; 4811 inodes->bytes_left = 0; 4812 inodes->elem_missed += 3; 4813 } 4814 4815 return 0; 4816 } 4817 4818 static long btrfs_ioctl_logical_to_ino(struct btrfs_fs_info *fs_info, 4819 void __user *arg, int version) 4820 { 4821 int ret = 0; 4822 int size; 4823 struct btrfs_ioctl_logical_ino_args *loi; 4824 struct btrfs_data_container *inodes = NULL; 4825 struct btrfs_path *path = NULL; 4826 bool ignore_offset; 4827 4828 if (!capable(CAP_SYS_ADMIN)) 4829 return -EPERM; 4830 4831 loi = memdup_user(arg, sizeof(*loi)); 4832 if (IS_ERR(loi)) 4833 return PTR_ERR(loi); 4834 4835 if (version == 1) { 4836 ignore_offset = false; 4837 size = min_t(u32, loi->size, SZ_64K); 4838 } else { 4839 /* All reserved bits must be 0 for now */ 4840 if (memchr_inv(loi->reserved, 0, sizeof(loi->reserved))) { 4841 ret = -EINVAL; 4842 goto out_loi; 4843 } 4844 /* Only accept flags we have defined so far */ 4845 if (loi->flags & ~(BTRFS_LOGICAL_INO_ARGS_IGNORE_OFFSET)) { 4846 ret = -EINVAL; 4847 goto out_loi; 4848 } 4849 ignore_offset = loi->flags & BTRFS_LOGICAL_INO_ARGS_IGNORE_OFFSET; 4850 size = min_t(u32, loi->size, SZ_16M); 4851 } 4852 4853 path = btrfs_alloc_path(); 4854 if (!path) { 4855 ret = -ENOMEM; 4856 goto out; 4857 } 4858 4859 inodes = init_data_container(size); 4860 if (IS_ERR(inodes)) { 4861 ret = PTR_ERR(inodes); 4862 inodes = NULL; 4863 goto out; 4864 } 4865 4866 ret = iterate_inodes_from_logical(loi->logical, fs_info, path, 4867 build_ino_list, inodes, ignore_offset); 4868 if (ret == -EINVAL) 4869 ret = -ENOENT; 4870 if (ret < 0) 4871 goto out; 4872 4873 ret = copy_to_user((void __user *)(unsigned long)loi->inodes, inodes, 4874 size); 4875 if (ret) 4876 ret = -EFAULT; 4877 4878 out: 4879 btrfs_free_path(path); 4880 kvfree(inodes); 4881 out_loi: 4882 kfree(loi); 4883 4884 return ret; 4885 } 4886 4887 void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info, 4888 struct btrfs_ioctl_balance_args *bargs) 4889 { 4890 struct btrfs_balance_control *bctl = fs_info->balance_ctl; 4891 4892 bargs->flags = bctl->flags; 4893 4894 if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) 4895 bargs->state |= BTRFS_BALANCE_STATE_RUNNING; 4896 if (atomic_read(&fs_info->balance_pause_req)) 4897 bargs->state |= BTRFS_BALANCE_STATE_PAUSE_REQ; 4898 if (atomic_read(&fs_info->balance_cancel_req)) 4899 bargs->state |= BTRFS_BALANCE_STATE_CANCEL_REQ; 4900 4901 memcpy(&bargs->data, &bctl->data, sizeof(bargs->data)); 4902 memcpy(&bargs->meta, &bctl->meta, sizeof(bargs->meta)); 4903 memcpy(&bargs->sys, &bctl->sys, sizeof(bargs->sys)); 4904 4905 spin_lock(&fs_info->balance_lock); 4906 memcpy(&bargs->stat, &bctl->stat, sizeof(bargs->stat)); 4907 spin_unlock(&fs_info->balance_lock); 4908 } 4909 4910 static long btrfs_ioctl_balance(struct file *file, void __user *arg) 4911 { 4912 struct btrfs_root *root = BTRFS_I(file_inode(file))->root; 4913 struct btrfs_fs_info *fs_info = root->fs_info; 4914 struct btrfs_ioctl_balance_args *bargs; 4915 struct btrfs_balance_control *bctl; 4916 bool need_unlock; /* for mut. excl. ops lock */ 4917 int ret; 4918 4919 if (!capable(CAP_SYS_ADMIN)) 4920 return -EPERM; 4921 4922 ret = mnt_want_write_file(file); 4923 if (ret) 4924 return ret; 4925 4926 again: 4927 if (!test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) { 4928 mutex_lock(&fs_info->balance_mutex); 4929 need_unlock = true; 4930 goto locked; 4931 } 4932 4933 /* 4934 * mut. excl. ops lock is locked. Three possibilities: 4935 * (1) some other op is running 4936 * (2) balance is running 4937 * (3) balance is paused -- special case (think resume) 4938 */ 4939 mutex_lock(&fs_info->balance_mutex); 4940 if (fs_info->balance_ctl) { 4941 /* this is either (2) or (3) */ 4942 if (!test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) { 4943 mutex_unlock(&fs_info->balance_mutex); 4944 /* 4945 * Lock released to allow other waiters to continue, 4946 * we'll reexamine the status again. 4947 */ 4948 mutex_lock(&fs_info->balance_mutex); 4949 4950 if (fs_info->balance_ctl && 4951 !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) { 4952 /* this is (3) */ 4953 need_unlock = false; 4954 goto locked; 4955 } 4956 4957 mutex_unlock(&fs_info->balance_mutex); 4958 goto again; 4959 } else { 4960 /* this is (2) */ 4961 mutex_unlock(&fs_info->balance_mutex); 4962 ret = -EINPROGRESS; 4963 goto out; 4964 } 4965 } else { 4966 /* this is (1) */ 4967 mutex_unlock(&fs_info->balance_mutex); 4968 ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; 4969 goto out; 4970 } 4971 4972 locked: 4973 BUG_ON(!test_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)); 4974 4975 if (arg) { 4976 bargs = memdup_user(arg, sizeof(*bargs)); 4977 if (IS_ERR(bargs)) { 4978 ret = PTR_ERR(bargs); 4979 goto out_unlock; 4980 } 4981 4982 if (bargs->flags & BTRFS_BALANCE_RESUME) { 4983 if (!fs_info->balance_ctl) { 4984 ret = -ENOTCONN; 4985 goto out_bargs; 4986 } 4987 4988 bctl = fs_info->balance_ctl; 4989 spin_lock(&fs_info->balance_lock); 4990 bctl->flags |= BTRFS_BALANCE_RESUME; 4991 spin_unlock(&fs_info->balance_lock); 4992 4993 goto do_balance; 4994 } 4995 } else { 4996 bargs = NULL; 4997 } 4998 4999 if (fs_info->balance_ctl) { 5000 ret = -EINPROGRESS; 5001 goto out_bargs; 5002 } 5003 5004 bctl = kzalloc(sizeof(*bctl), GFP_KERNEL); 5005 if (!bctl) { 5006 ret = -ENOMEM; 5007 goto out_bargs; 5008 } 5009 5010 if (arg) { 5011 memcpy(&bctl->data, &bargs->data, sizeof(bctl->data)); 5012 memcpy(&bctl->meta, &bargs->meta, sizeof(bctl->meta)); 5013 memcpy(&bctl->sys, &bargs->sys, sizeof(bctl->sys)); 5014 5015 bctl->flags = bargs->flags; 5016 } else { 5017 /* balance everything - no filters */ 5018 bctl->flags |= BTRFS_BALANCE_TYPE_MASK; 5019 } 5020 5021 if (bctl->flags & ~(BTRFS_BALANCE_ARGS_MASK | BTRFS_BALANCE_TYPE_MASK)) { 5022 ret = -EINVAL; 5023 goto out_bctl; 5024 } 5025 5026 do_balance: 5027 /* 5028 * Ownership of bctl and filesystem flag BTRFS_FS_EXCL_OP goes to 5029 * btrfs_balance. bctl is freed in reset_balance_state, or, if 5030 * restriper was paused all the way until unmount, in free_fs_info. 5031 * The flag should be cleared after reset_balance_state. 5032 */ 5033 need_unlock = false; 5034 5035 ret = btrfs_balance(fs_info, bctl, bargs); 5036 bctl = NULL; 5037 5038 if (arg) { 5039 if (copy_to_user(arg, bargs, sizeof(*bargs))) 5040 ret = -EFAULT; 5041 } 5042 5043 out_bctl: 5044 kfree(bctl); 5045 out_bargs: 5046 kfree(bargs); 5047 out_unlock: 5048 mutex_unlock(&fs_info->balance_mutex); 5049 if (need_unlock) 5050 clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); 5051 out: 5052 mnt_drop_write_file(file); 5053 return ret; 5054 } 5055 5056 static long btrfs_ioctl_balance_ctl(struct btrfs_fs_info *fs_info, int cmd) 5057 { 5058 if (!capable(CAP_SYS_ADMIN)) 5059 return -EPERM; 5060 5061 switch (cmd) { 5062 case BTRFS_BALANCE_CTL_PAUSE: 5063 return btrfs_pause_balance(fs_info); 5064 case BTRFS_BALANCE_CTL_CANCEL: 5065 return btrfs_cancel_balance(fs_info); 5066 } 5067 5068 return -EINVAL; 5069 } 5070 5071 static long btrfs_ioctl_balance_progress(struct btrfs_fs_info *fs_info, 5072 void __user *arg) 5073 { 5074 struct btrfs_ioctl_balance_args *bargs; 5075 int ret = 0; 5076 5077 if (!capable(CAP_SYS_ADMIN)) 5078 return -EPERM; 5079 5080 mutex_lock(&fs_info->balance_mutex); 5081 if (!fs_info->balance_ctl) { 5082 ret = -ENOTCONN; 5083 goto out; 5084 } 5085 5086 bargs = kzalloc(sizeof(*bargs), GFP_KERNEL); 5087 if (!bargs) { 5088 ret = -ENOMEM; 5089 goto out; 5090 } 5091 5092 btrfs_update_ioctl_balance_args(fs_info, bargs); 5093 5094 if (copy_to_user(arg, bargs, sizeof(*bargs))) 5095 ret = -EFAULT; 5096 5097 kfree(bargs); 5098 out: 5099 mutex_unlock(&fs_info->balance_mutex); 5100 return ret; 5101 } 5102 5103 static long btrfs_ioctl_quota_ctl(struct file *file, void __user *arg) 5104 { 5105 struct inode *inode = file_inode(file); 5106 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 5107 struct btrfs_ioctl_quota_ctl_args *sa; 5108 int ret; 5109 5110 if (!capable(CAP_SYS_ADMIN)) 5111 return -EPERM; 5112 5113 ret = mnt_want_write_file(file); 5114 if (ret) 5115 return ret; 5116 5117 sa = memdup_user(arg, sizeof(*sa)); 5118 if (IS_ERR(sa)) { 5119 ret = PTR_ERR(sa); 5120 goto drop_write; 5121 } 5122 5123 down_write(&fs_info->subvol_sem); 5124 5125 switch (sa->cmd) { 5126 case BTRFS_QUOTA_CTL_ENABLE: 5127 ret = btrfs_quota_enable(fs_info); 5128 break; 5129 case BTRFS_QUOTA_CTL_DISABLE: 5130 ret = btrfs_quota_disable(fs_info); 5131 break; 5132 default: 5133 ret = -EINVAL; 5134 break; 5135 } 5136 5137 kfree(sa); 5138 up_write(&fs_info->subvol_sem); 5139 drop_write: 5140 mnt_drop_write_file(file); 5141 return ret; 5142 } 5143 5144 static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg) 5145 { 5146 struct inode *inode = file_inode(file); 5147 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 5148 struct btrfs_root *root = BTRFS_I(inode)->root; 5149 struct btrfs_ioctl_qgroup_assign_args *sa; 5150 struct btrfs_trans_handle *trans; 5151 int ret; 5152 int err; 5153 5154 if (!capable(CAP_SYS_ADMIN)) 5155 return -EPERM; 5156 5157 ret = mnt_want_write_file(file); 5158 if (ret) 5159 return ret; 5160 5161 sa = memdup_user(arg, sizeof(*sa)); 5162 if (IS_ERR(sa)) { 5163 ret = PTR_ERR(sa); 5164 goto drop_write; 5165 } 5166 5167 trans = btrfs_join_transaction(root); 5168 if (IS_ERR(trans)) { 5169 ret = PTR_ERR(trans); 5170 goto out; 5171 } 5172 5173 if (sa->assign) { 5174 ret = btrfs_add_qgroup_relation(trans, sa->src, sa->dst); 5175 } else { 5176 ret = btrfs_del_qgroup_relation(trans, sa->src, sa->dst); 5177 } 5178 5179 /* update qgroup status and info */ 5180 err = btrfs_run_qgroups(trans); 5181 if (err < 0) 5182 btrfs_handle_fs_error(fs_info, err, 5183 "failed to update qgroup status and info"); 5184 err = btrfs_end_transaction(trans); 5185 if (err && !ret) 5186 ret = err; 5187 5188 out: 5189 kfree(sa); 5190 drop_write: 5191 mnt_drop_write_file(file); 5192 return ret; 5193 } 5194 5195 static long btrfs_ioctl_qgroup_create(struct file *file, void __user *arg) 5196 { 5197 struct inode *inode = file_inode(file); 5198 struct btrfs_root *root = BTRFS_I(inode)->root; 5199 struct btrfs_ioctl_qgroup_create_args *sa; 5200 struct btrfs_trans_handle *trans; 5201 int ret; 5202 int err; 5203 5204 if (!capable(CAP_SYS_ADMIN)) 5205 return -EPERM; 5206 5207 ret = mnt_want_write_file(file); 5208 if (ret) 5209 return ret; 5210 5211 sa = memdup_user(arg, sizeof(*sa)); 5212 if (IS_ERR(sa)) { 5213 ret = PTR_ERR(sa); 5214 goto drop_write; 5215 } 5216 5217 if (!sa->qgroupid) { 5218 ret = -EINVAL; 5219 goto out; 5220 } 5221 5222 trans = btrfs_join_transaction(root); 5223 if (IS_ERR(trans)) { 5224 ret = PTR_ERR(trans); 5225 goto out; 5226 } 5227 5228 if (sa->create) { 5229 ret = btrfs_create_qgroup(trans, sa->qgroupid); 5230 } else { 5231 ret = btrfs_remove_qgroup(trans, sa->qgroupid); 5232 } 5233 5234 err = btrfs_end_transaction(trans); 5235 if (err && !ret) 5236 ret = err; 5237 5238 out: 5239 kfree(sa); 5240 drop_write: 5241 mnt_drop_write_file(file); 5242 return ret; 5243 } 5244 5245 static long btrfs_ioctl_qgroup_limit(struct file *file, void __user *arg) 5246 { 5247 struct inode *inode = file_inode(file); 5248 struct btrfs_root *root = BTRFS_I(inode)->root; 5249 struct btrfs_ioctl_qgroup_limit_args *sa; 5250 struct btrfs_trans_handle *trans; 5251 int ret; 5252 int err; 5253 u64 qgroupid; 5254 5255 if (!capable(CAP_SYS_ADMIN)) 5256 return -EPERM; 5257 5258 ret = mnt_want_write_file(file); 5259 if (ret) 5260 return ret; 5261 5262 sa = memdup_user(arg, sizeof(*sa)); 5263 if (IS_ERR(sa)) { 5264 ret = PTR_ERR(sa); 5265 goto drop_write; 5266 } 5267 5268 trans = btrfs_join_transaction(root); 5269 if (IS_ERR(trans)) { 5270 ret = PTR_ERR(trans); 5271 goto out; 5272 } 5273 5274 qgroupid = sa->qgroupid; 5275 if (!qgroupid) { 5276 /* take the current subvol as qgroup */ 5277 qgroupid = root->root_key.objectid; 5278 } 5279 5280 ret = btrfs_limit_qgroup(trans, qgroupid, &sa->lim); 5281 5282 err = btrfs_end_transaction(trans); 5283 if (err && !ret) 5284 ret = err; 5285 5286 out: 5287 kfree(sa); 5288 drop_write: 5289 mnt_drop_write_file(file); 5290 return ret; 5291 } 5292 5293 static long btrfs_ioctl_quota_rescan(struct file *file, void __user *arg) 5294 { 5295 struct inode *inode = file_inode(file); 5296 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 5297 struct btrfs_ioctl_quota_rescan_args *qsa; 5298 int ret; 5299 5300 if (!capable(CAP_SYS_ADMIN)) 5301 return -EPERM; 5302 5303 ret = mnt_want_write_file(file); 5304 if (ret) 5305 return ret; 5306 5307 qsa = memdup_user(arg, sizeof(*qsa)); 5308 if (IS_ERR(qsa)) { 5309 ret = PTR_ERR(qsa); 5310 goto drop_write; 5311 } 5312 5313 if (qsa->flags) { 5314 ret = -EINVAL; 5315 goto out; 5316 } 5317 5318 ret = btrfs_qgroup_rescan(fs_info); 5319 5320 out: 5321 kfree(qsa); 5322 drop_write: 5323 mnt_drop_write_file(file); 5324 return ret; 5325 } 5326 5327 static long btrfs_ioctl_quota_rescan_status(struct file *file, void __user *arg) 5328 { 5329 struct inode *inode = file_inode(file); 5330 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 5331 struct btrfs_ioctl_quota_rescan_args *qsa; 5332 int ret = 0; 5333 5334 if (!capable(CAP_SYS_ADMIN)) 5335 return -EPERM; 5336 5337 qsa = kzalloc(sizeof(*qsa), GFP_KERNEL); 5338 if (!qsa) 5339 return -ENOMEM; 5340 5341 if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) { 5342 qsa->flags = 1; 5343 qsa->progress = fs_info->qgroup_rescan_progress.objectid; 5344 } 5345 5346 if (copy_to_user(arg, qsa, sizeof(*qsa))) 5347 ret = -EFAULT; 5348 5349 kfree(qsa); 5350 return ret; 5351 } 5352 5353 static long btrfs_ioctl_quota_rescan_wait(struct file *file, void __user *arg) 5354 { 5355 struct inode *inode = file_inode(file); 5356 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 5357 5358 if (!capable(CAP_SYS_ADMIN)) 5359 return -EPERM; 5360 5361 return btrfs_qgroup_wait_for_completion(fs_info, true); 5362 } 5363 5364 static long _btrfs_ioctl_set_received_subvol(struct file *file, 5365 struct btrfs_ioctl_received_subvol_args *sa) 5366 { 5367 struct inode *inode = file_inode(file); 5368 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 5369 struct btrfs_root *root = BTRFS_I(inode)->root; 5370 struct btrfs_root_item *root_item = &root->root_item; 5371 struct btrfs_trans_handle *trans; 5372 struct timespec64 ct = current_time(inode); 5373 int ret = 0; 5374 int received_uuid_changed; 5375 5376 if (!inode_owner_or_capable(inode)) 5377 return -EPERM; 5378 5379 ret = mnt_want_write_file(file); 5380 if (ret < 0) 5381 return ret; 5382 5383 down_write(&fs_info->subvol_sem); 5384 5385 if (btrfs_ino(BTRFS_I(inode)) != BTRFS_FIRST_FREE_OBJECTID) { 5386 ret = -EINVAL; 5387 goto out; 5388 } 5389 5390 if (btrfs_root_readonly(root)) { 5391 ret = -EROFS; 5392 goto out; 5393 } 5394 5395 /* 5396 * 1 - root item 5397 * 2 - uuid items (received uuid + subvol uuid) 5398 */ 5399 trans = btrfs_start_transaction(root, 3); 5400 if (IS_ERR(trans)) { 5401 ret = PTR_ERR(trans); 5402 trans = NULL; 5403 goto out; 5404 } 5405 5406 sa->rtransid = trans->transid; 5407 sa->rtime.sec = ct.tv_sec; 5408 sa->rtime.nsec = ct.tv_nsec; 5409 5410 received_uuid_changed = memcmp(root_item->received_uuid, sa->uuid, 5411 BTRFS_UUID_SIZE); 5412 if (received_uuid_changed && 5413 !btrfs_is_empty_uuid(root_item->received_uuid)) { 5414 ret = btrfs_uuid_tree_remove(trans, root_item->received_uuid, 5415 BTRFS_UUID_KEY_RECEIVED_SUBVOL, 5416 root->root_key.objectid); 5417 if (ret && ret != -ENOENT) { 5418 btrfs_abort_transaction(trans, ret); 5419 btrfs_end_transaction(trans); 5420 goto out; 5421 } 5422 } 5423 memcpy(root_item->received_uuid, sa->uuid, BTRFS_UUID_SIZE); 5424 btrfs_set_root_stransid(root_item, sa->stransid); 5425 btrfs_set_root_rtransid(root_item, sa->rtransid); 5426 btrfs_set_stack_timespec_sec(&root_item->stime, sa->stime.sec); 5427 btrfs_set_stack_timespec_nsec(&root_item->stime, sa->stime.nsec); 5428 btrfs_set_stack_timespec_sec(&root_item->rtime, sa->rtime.sec); 5429 btrfs_set_stack_timespec_nsec(&root_item->rtime, sa->rtime.nsec); 5430 5431 ret = btrfs_update_root(trans, fs_info->tree_root, 5432 &root->root_key, &root->root_item); 5433 if (ret < 0) { 5434 btrfs_end_transaction(trans); 5435 goto out; 5436 } 5437 if (received_uuid_changed && !btrfs_is_empty_uuid(sa->uuid)) { 5438 ret = btrfs_uuid_tree_add(trans, sa->uuid, 5439 BTRFS_UUID_KEY_RECEIVED_SUBVOL, 5440 root->root_key.objectid); 5441 if (ret < 0 && ret != -EEXIST) { 5442 btrfs_abort_transaction(trans, ret); 5443 btrfs_end_transaction(trans); 5444 goto out; 5445 } 5446 } 5447 ret = btrfs_commit_transaction(trans); 5448 out: 5449 up_write(&fs_info->subvol_sem); 5450 mnt_drop_write_file(file); 5451 return ret; 5452 } 5453 5454 #ifdef CONFIG_64BIT 5455 static long btrfs_ioctl_set_received_subvol_32(struct file *file, 5456 void __user *arg) 5457 { 5458 struct btrfs_ioctl_received_subvol_args_32 *args32 = NULL; 5459 struct btrfs_ioctl_received_subvol_args *args64 = NULL; 5460 int ret = 0; 5461 5462 args32 = memdup_user(arg, sizeof(*args32)); 5463 if (IS_ERR(args32)) 5464 return PTR_ERR(args32); 5465 5466 args64 = kmalloc(sizeof(*args64), GFP_KERNEL); 5467 if (!args64) { 5468 ret = -ENOMEM; 5469 goto out; 5470 } 5471 5472 memcpy(args64->uuid, args32->uuid, BTRFS_UUID_SIZE); 5473 args64->stransid = args32->stransid; 5474 args64->rtransid = args32->rtransid; 5475 args64->stime.sec = args32->stime.sec; 5476 args64->stime.nsec = args32->stime.nsec; 5477 args64->rtime.sec = args32->rtime.sec; 5478 args64->rtime.nsec = args32->rtime.nsec; 5479 args64->flags = args32->flags; 5480 5481 ret = _btrfs_ioctl_set_received_subvol(file, args64); 5482 if (ret) 5483 goto out; 5484 5485 memcpy(args32->uuid, args64->uuid, BTRFS_UUID_SIZE); 5486 args32->stransid = args64->stransid; 5487 args32->rtransid = args64->rtransid; 5488 args32->stime.sec = args64->stime.sec; 5489 args32->stime.nsec = args64->stime.nsec; 5490 args32->rtime.sec = args64->rtime.sec; 5491 args32->rtime.nsec = args64->rtime.nsec; 5492 args32->flags = args64->flags; 5493 5494 ret = copy_to_user(arg, args32, sizeof(*args32)); 5495 if (ret) 5496 ret = -EFAULT; 5497 5498 out: 5499 kfree(args32); 5500 kfree(args64); 5501 return ret; 5502 } 5503 #endif 5504 5505 static long btrfs_ioctl_set_received_subvol(struct file *file, 5506 void __user *arg) 5507 { 5508 struct btrfs_ioctl_received_subvol_args *sa = NULL; 5509 int ret = 0; 5510 5511 sa = memdup_user(arg, sizeof(*sa)); 5512 if (IS_ERR(sa)) 5513 return PTR_ERR(sa); 5514 5515 ret = _btrfs_ioctl_set_received_subvol(file, sa); 5516 5517 if (ret) 5518 goto out; 5519 5520 ret = copy_to_user(arg, sa, sizeof(*sa)); 5521 if (ret) 5522 ret = -EFAULT; 5523 5524 out: 5525 kfree(sa); 5526 return ret; 5527 } 5528 5529 static int btrfs_ioctl_get_fslabel(struct file *file, void __user *arg) 5530 { 5531 struct inode *inode = file_inode(file); 5532 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 5533 size_t len; 5534 int ret; 5535 char label[BTRFS_LABEL_SIZE]; 5536 5537 spin_lock(&fs_info->super_lock); 5538 memcpy(label, fs_info->super_copy->label, BTRFS_LABEL_SIZE); 5539 spin_unlock(&fs_info->super_lock); 5540 5541 len = strnlen(label, BTRFS_LABEL_SIZE); 5542 5543 if (len == BTRFS_LABEL_SIZE) { 5544 btrfs_warn(fs_info, 5545 "label is too long, return the first %zu bytes", 5546 --len); 5547 } 5548 5549 ret = copy_to_user(arg, label, len); 5550 5551 return ret ? -EFAULT : 0; 5552 } 5553 5554 static int btrfs_ioctl_set_fslabel(struct file *file, void __user *arg) 5555 { 5556 struct inode *inode = file_inode(file); 5557 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 5558 struct btrfs_root *root = BTRFS_I(inode)->root; 5559 struct btrfs_super_block *super_block = fs_info->super_copy; 5560 struct btrfs_trans_handle *trans; 5561 char label[BTRFS_LABEL_SIZE]; 5562 int ret; 5563 5564 if (!capable(CAP_SYS_ADMIN)) 5565 return -EPERM; 5566 5567 if (copy_from_user(label, arg, sizeof(label))) 5568 return -EFAULT; 5569 5570 if (strnlen(label, BTRFS_LABEL_SIZE) == BTRFS_LABEL_SIZE) { 5571 btrfs_err(fs_info, 5572 "unable to set label with more than %d bytes", 5573 BTRFS_LABEL_SIZE - 1); 5574 return -EINVAL; 5575 } 5576 5577 ret = mnt_want_write_file(file); 5578 if (ret) 5579 return ret; 5580 5581 trans = btrfs_start_transaction(root, 0); 5582 if (IS_ERR(trans)) { 5583 ret = PTR_ERR(trans); 5584 goto out_unlock; 5585 } 5586 5587 spin_lock(&fs_info->super_lock); 5588 strcpy(super_block->label, label); 5589 spin_unlock(&fs_info->super_lock); 5590 ret = btrfs_commit_transaction(trans); 5591 5592 out_unlock: 5593 mnt_drop_write_file(file); 5594 return ret; 5595 } 5596 5597 #define INIT_FEATURE_FLAGS(suffix) \ 5598 { .compat_flags = BTRFS_FEATURE_COMPAT_##suffix, \ 5599 .compat_ro_flags = BTRFS_FEATURE_COMPAT_RO_##suffix, \ 5600 .incompat_flags = BTRFS_FEATURE_INCOMPAT_##suffix } 5601 5602 int btrfs_ioctl_get_supported_features(void __user *arg) 5603 { 5604 static const struct btrfs_ioctl_feature_flags features[3] = { 5605 INIT_FEATURE_FLAGS(SUPP), 5606 INIT_FEATURE_FLAGS(SAFE_SET), 5607 INIT_FEATURE_FLAGS(SAFE_CLEAR) 5608 }; 5609 5610 if (copy_to_user(arg, &features, sizeof(features))) 5611 return -EFAULT; 5612 5613 return 0; 5614 } 5615 5616 static int btrfs_ioctl_get_features(struct file *file, void __user *arg) 5617 { 5618 struct inode *inode = file_inode(file); 5619 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 5620 struct btrfs_super_block *super_block = fs_info->super_copy; 5621 struct btrfs_ioctl_feature_flags features; 5622 5623 features.compat_flags = btrfs_super_compat_flags(super_block); 5624 features.compat_ro_flags = btrfs_super_compat_ro_flags(super_block); 5625 features.incompat_flags = btrfs_super_incompat_flags(super_block); 5626 5627 if (copy_to_user(arg, &features, sizeof(features))) 5628 return -EFAULT; 5629 5630 return 0; 5631 } 5632 5633 static int check_feature_bits(struct btrfs_fs_info *fs_info, 5634 enum btrfs_feature_set set, 5635 u64 change_mask, u64 flags, u64 supported_flags, 5636 u64 safe_set, u64 safe_clear) 5637 { 5638 const char *type = btrfs_feature_set_names[set]; 5639 char *names; 5640 u64 disallowed, unsupported; 5641 u64 set_mask = flags & change_mask; 5642 u64 clear_mask = ~flags & change_mask; 5643 5644 unsupported = set_mask & ~supported_flags; 5645 if (unsupported) { 5646 names = btrfs_printable_features(set, unsupported); 5647 if (names) { 5648 btrfs_warn(fs_info, 5649 "this kernel does not support the %s feature bit%s", 5650 names, strchr(names, ',') ? "s" : ""); 5651 kfree(names); 5652 } else 5653 btrfs_warn(fs_info, 5654 "this kernel does not support %s bits 0x%llx", 5655 type, unsupported); 5656 return -EOPNOTSUPP; 5657 } 5658 5659 disallowed = set_mask & ~safe_set; 5660 if (disallowed) { 5661 names = btrfs_printable_features(set, disallowed); 5662 if (names) { 5663 btrfs_warn(fs_info, 5664 "can't set the %s feature bit%s while mounted", 5665 names, strchr(names, ',') ? "s" : ""); 5666 kfree(names); 5667 } else 5668 btrfs_warn(fs_info, 5669 "can't set %s bits 0x%llx while mounted", 5670 type, disallowed); 5671 return -EPERM; 5672 } 5673 5674 disallowed = clear_mask & ~safe_clear; 5675 if (disallowed) { 5676 names = btrfs_printable_features(set, disallowed); 5677 if (names) { 5678 btrfs_warn(fs_info, 5679 "can't clear the %s feature bit%s while mounted", 5680 names, strchr(names, ',') ? "s" : ""); 5681 kfree(names); 5682 } else 5683 btrfs_warn(fs_info, 5684 "can't clear %s bits 0x%llx while mounted", 5685 type, disallowed); 5686 return -EPERM; 5687 } 5688 5689 return 0; 5690 } 5691 5692 #define check_feature(fs_info, change_mask, flags, mask_base) \ 5693 check_feature_bits(fs_info, FEAT_##mask_base, change_mask, flags, \ 5694 BTRFS_FEATURE_ ## mask_base ## _SUPP, \ 5695 BTRFS_FEATURE_ ## mask_base ## _SAFE_SET, \ 5696 BTRFS_FEATURE_ ## mask_base ## _SAFE_CLEAR) 5697 5698 static int btrfs_ioctl_set_features(struct file *file, void __user *arg) 5699 { 5700 struct inode *inode = file_inode(file); 5701 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 5702 struct btrfs_root *root = BTRFS_I(inode)->root; 5703 struct btrfs_super_block *super_block = fs_info->super_copy; 5704 struct btrfs_ioctl_feature_flags flags[2]; 5705 struct btrfs_trans_handle *trans; 5706 u64 newflags; 5707 int ret; 5708 5709 if (!capable(CAP_SYS_ADMIN)) 5710 return -EPERM; 5711 5712 if (copy_from_user(flags, arg, sizeof(flags))) 5713 return -EFAULT; 5714 5715 /* Nothing to do */ 5716 if (!flags[0].compat_flags && !flags[0].compat_ro_flags && 5717 !flags[0].incompat_flags) 5718 return 0; 5719 5720 ret = check_feature(fs_info, flags[0].compat_flags, 5721 flags[1].compat_flags, COMPAT); 5722 if (ret) 5723 return ret; 5724 5725 ret = check_feature(fs_info, flags[0].compat_ro_flags, 5726 flags[1].compat_ro_flags, COMPAT_RO); 5727 if (ret) 5728 return ret; 5729 5730 ret = check_feature(fs_info, flags[0].incompat_flags, 5731 flags[1].incompat_flags, INCOMPAT); 5732 if (ret) 5733 return ret; 5734 5735 ret = mnt_want_write_file(file); 5736 if (ret) 5737 return ret; 5738 5739 trans = btrfs_start_transaction(root, 0); 5740 if (IS_ERR(trans)) { 5741 ret = PTR_ERR(trans); 5742 goto out_drop_write; 5743 } 5744 5745 spin_lock(&fs_info->super_lock); 5746 newflags = btrfs_super_compat_flags(super_block); 5747 newflags |= flags[0].compat_flags & flags[1].compat_flags; 5748 newflags &= ~(flags[0].compat_flags & ~flags[1].compat_flags); 5749 btrfs_set_super_compat_flags(super_block, newflags); 5750 5751 newflags = btrfs_super_compat_ro_flags(super_block); 5752 newflags |= flags[0].compat_ro_flags & flags[1].compat_ro_flags; 5753 newflags &= ~(flags[0].compat_ro_flags & ~flags[1].compat_ro_flags); 5754 btrfs_set_super_compat_ro_flags(super_block, newflags); 5755 5756 newflags = btrfs_super_incompat_flags(super_block); 5757 newflags |= flags[0].incompat_flags & flags[1].incompat_flags; 5758 newflags &= ~(flags[0].incompat_flags & ~flags[1].incompat_flags); 5759 btrfs_set_super_incompat_flags(super_block, newflags); 5760 spin_unlock(&fs_info->super_lock); 5761 5762 ret = btrfs_commit_transaction(trans); 5763 out_drop_write: 5764 mnt_drop_write_file(file); 5765 5766 return ret; 5767 } 5768 5769 static int _btrfs_ioctl_send(struct file *file, void __user *argp, bool compat) 5770 { 5771 struct btrfs_ioctl_send_args *arg; 5772 int ret; 5773 5774 if (compat) { 5775 #if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) 5776 struct btrfs_ioctl_send_args_32 args32; 5777 5778 ret = copy_from_user(&args32, argp, sizeof(args32)); 5779 if (ret) 5780 return -EFAULT; 5781 arg = kzalloc(sizeof(*arg), GFP_KERNEL); 5782 if (!arg) 5783 return -ENOMEM; 5784 arg->send_fd = args32.send_fd; 5785 arg->clone_sources_count = args32.clone_sources_count; 5786 arg->clone_sources = compat_ptr(args32.clone_sources); 5787 arg->parent_root = args32.parent_root; 5788 arg->flags = args32.flags; 5789 memcpy(arg->reserved, args32.reserved, 5790 sizeof(args32.reserved)); 5791 #else 5792 return -ENOTTY; 5793 #endif 5794 } else { 5795 arg = memdup_user(argp, sizeof(*arg)); 5796 if (IS_ERR(arg)) 5797 return PTR_ERR(arg); 5798 } 5799 ret = btrfs_ioctl_send(file, arg); 5800 kfree(arg); 5801 return ret; 5802 } 5803 5804 long btrfs_ioctl(struct file *file, unsigned int 5805 cmd, unsigned long arg) 5806 { 5807 struct inode *inode = file_inode(file); 5808 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 5809 struct btrfs_root *root = BTRFS_I(inode)->root; 5810 void __user *argp = (void __user *)arg; 5811 5812 switch (cmd) { 5813 case FS_IOC_GETFLAGS: 5814 return btrfs_ioctl_getflags(file, argp); 5815 case FS_IOC_SETFLAGS: 5816 return btrfs_ioctl_setflags(file, argp); 5817 case FS_IOC_GETVERSION: 5818 return btrfs_ioctl_getversion(file, argp); 5819 case FITRIM: 5820 return btrfs_ioctl_fitrim(file, argp); 5821 case BTRFS_IOC_SNAP_CREATE: 5822 return btrfs_ioctl_snap_create(file, argp, 0); 5823 case BTRFS_IOC_SNAP_CREATE_V2: 5824 return btrfs_ioctl_snap_create_v2(file, argp, 0); 5825 case BTRFS_IOC_SUBVOL_CREATE: 5826 return btrfs_ioctl_snap_create(file, argp, 1); 5827 case BTRFS_IOC_SUBVOL_CREATE_V2: 5828 return btrfs_ioctl_snap_create_v2(file, argp, 1); 5829 case BTRFS_IOC_SNAP_DESTROY: 5830 return btrfs_ioctl_snap_destroy(file, argp); 5831 case BTRFS_IOC_SUBVOL_GETFLAGS: 5832 return btrfs_ioctl_subvol_getflags(file, argp); 5833 case BTRFS_IOC_SUBVOL_SETFLAGS: 5834 return btrfs_ioctl_subvol_setflags(file, argp); 5835 case BTRFS_IOC_DEFAULT_SUBVOL: 5836 return btrfs_ioctl_default_subvol(file, argp); 5837 case BTRFS_IOC_DEFRAG: 5838 return btrfs_ioctl_defrag(file, NULL); 5839 case BTRFS_IOC_DEFRAG_RANGE: 5840 return btrfs_ioctl_defrag(file, argp); 5841 case BTRFS_IOC_RESIZE: 5842 return btrfs_ioctl_resize(file, argp); 5843 case BTRFS_IOC_ADD_DEV: 5844 return btrfs_ioctl_add_dev(fs_info, argp); 5845 case BTRFS_IOC_RM_DEV: 5846 return btrfs_ioctl_rm_dev(file, argp); 5847 case BTRFS_IOC_RM_DEV_V2: 5848 return btrfs_ioctl_rm_dev_v2(file, argp); 5849 case BTRFS_IOC_FS_INFO: 5850 return btrfs_ioctl_fs_info(fs_info, argp); 5851 case BTRFS_IOC_DEV_INFO: 5852 return btrfs_ioctl_dev_info(fs_info, argp); 5853 case BTRFS_IOC_BALANCE: 5854 return btrfs_ioctl_balance(file, NULL); 5855 case BTRFS_IOC_TREE_SEARCH: 5856 return btrfs_ioctl_tree_search(file, argp); 5857 case BTRFS_IOC_TREE_SEARCH_V2: 5858 return btrfs_ioctl_tree_search_v2(file, argp); 5859 case BTRFS_IOC_INO_LOOKUP: 5860 return btrfs_ioctl_ino_lookup(file, argp); 5861 case BTRFS_IOC_INO_PATHS: 5862 return btrfs_ioctl_ino_to_path(root, argp); 5863 case BTRFS_IOC_LOGICAL_INO: 5864 return btrfs_ioctl_logical_to_ino(fs_info, argp, 1); 5865 case BTRFS_IOC_LOGICAL_INO_V2: 5866 return btrfs_ioctl_logical_to_ino(fs_info, argp, 2); 5867 case BTRFS_IOC_SPACE_INFO: 5868 return btrfs_ioctl_space_info(fs_info, argp); 5869 case BTRFS_IOC_SYNC: { 5870 int ret; 5871 5872 ret = btrfs_start_delalloc_roots(fs_info, -1); 5873 if (ret) 5874 return ret; 5875 ret = btrfs_sync_fs(inode->i_sb, 1); 5876 /* 5877 * The transaction thread may want to do more work, 5878 * namely it pokes the cleaner kthread that will start 5879 * processing uncleaned subvols. 5880 */ 5881 wake_up_process(fs_info->transaction_kthread); 5882 return ret; 5883 } 5884 case BTRFS_IOC_START_SYNC: 5885 return btrfs_ioctl_start_sync(root, argp); 5886 case BTRFS_IOC_WAIT_SYNC: 5887 return btrfs_ioctl_wait_sync(fs_info, argp); 5888 case BTRFS_IOC_SCRUB: 5889 return btrfs_ioctl_scrub(file, argp); 5890 case BTRFS_IOC_SCRUB_CANCEL: 5891 return btrfs_ioctl_scrub_cancel(fs_info); 5892 case BTRFS_IOC_SCRUB_PROGRESS: 5893 return btrfs_ioctl_scrub_progress(fs_info, argp); 5894 case BTRFS_IOC_BALANCE_V2: 5895 return btrfs_ioctl_balance(file, argp); 5896 case BTRFS_IOC_BALANCE_CTL: 5897 return btrfs_ioctl_balance_ctl(fs_info, arg); 5898 case BTRFS_IOC_BALANCE_PROGRESS: 5899 return btrfs_ioctl_balance_progress(fs_info, argp); 5900 case BTRFS_IOC_SET_RECEIVED_SUBVOL: 5901 return btrfs_ioctl_set_received_subvol(file, argp); 5902 #ifdef CONFIG_64BIT 5903 case BTRFS_IOC_SET_RECEIVED_SUBVOL_32: 5904 return btrfs_ioctl_set_received_subvol_32(file, argp); 5905 #endif 5906 case BTRFS_IOC_SEND: 5907 return _btrfs_ioctl_send(file, argp, false); 5908 #if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) 5909 case BTRFS_IOC_SEND_32: 5910 return _btrfs_ioctl_send(file, argp, true); 5911 #endif 5912 case BTRFS_IOC_GET_DEV_STATS: 5913 return btrfs_ioctl_get_dev_stats(fs_info, argp); 5914 case BTRFS_IOC_QUOTA_CTL: 5915 return btrfs_ioctl_quota_ctl(file, argp); 5916 case BTRFS_IOC_QGROUP_ASSIGN: 5917 return btrfs_ioctl_qgroup_assign(file, argp); 5918 case BTRFS_IOC_QGROUP_CREATE: 5919 return btrfs_ioctl_qgroup_create(file, argp); 5920 case BTRFS_IOC_QGROUP_LIMIT: 5921 return btrfs_ioctl_qgroup_limit(file, argp); 5922 case BTRFS_IOC_QUOTA_RESCAN: 5923 return btrfs_ioctl_quota_rescan(file, argp); 5924 case BTRFS_IOC_QUOTA_RESCAN_STATUS: 5925 return btrfs_ioctl_quota_rescan_status(file, argp); 5926 case BTRFS_IOC_QUOTA_RESCAN_WAIT: 5927 return btrfs_ioctl_quota_rescan_wait(file, argp); 5928 case BTRFS_IOC_DEV_REPLACE: 5929 return btrfs_ioctl_dev_replace(fs_info, argp); 5930 case BTRFS_IOC_GET_FSLABEL: 5931 return btrfs_ioctl_get_fslabel(file, argp); 5932 case BTRFS_IOC_SET_FSLABEL: 5933 return btrfs_ioctl_set_fslabel(file, argp); 5934 case BTRFS_IOC_GET_SUPPORTED_FEATURES: 5935 return btrfs_ioctl_get_supported_features(argp); 5936 case BTRFS_IOC_GET_FEATURES: 5937 return btrfs_ioctl_get_features(file, argp); 5938 case BTRFS_IOC_SET_FEATURES: 5939 return btrfs_ioctl_set_features(file, argp); 5940 case FS_IOC_FSGETXATTR: 5941 return btrfs_ioctl_fsgetxattr(file, argp); 5942 case FS_IOC_FSSETXATTR: 5943 return btrfs_ioctl_fssetxattr(file, argp); 5944 case BTRFS_IOC_GET_SUBVOL_INFO: 5945 return btrfs_ioctl_get_subvol_info(file, argp); 5946 case BTRFS_IOC_GET_SUBVOL_ROOTREF: 5947 return btrfs_ioctl_get_subvol_rootref(file, argp); 5948 case BTRFS_IOC_INO_LOOKUP_USER: 5949 return btrfs_ioctl_ino_lookup_user(file, argp); 5950 } 5951 5952 return -ENOTTY; 5953 } 5954 5955 #ifdef CONFIG_COMPAT 5956 long btrfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) 5957 { 5958 /* 5959 * These all access 32-bit values anyway so no further 5960 * handling is necessary. 5961 */ 5962 switch (cmd) { 5963 case FS_IOC32_GETFLAGS: 5964 cmd = FS_IOC_GETFLAGS; 5965 break; 5966 case FS_IOC32_SETFLAGS: 5967 cmd = FS_IOC_SETFLAGS; 5968 break; 5969 case FS_IOC32_GETVERSION: 5970 cmd = FS_IOC_GETVERSION; 5971 break; 5972 } 5973 5974 return btrfs_ioctl(file, cmd, (unsigned long) compat_ptr(arg)); 5975 } 5976 #endif 5977