/*
 * Copyright (C) 2007 Oracle. All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */

#include <linux/kernel.h>
#include <linux/bio.h>
#include <linux/buffer_head.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/fsnotify.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/time.h>
#include <linux/init.h>
#include <linux/string.h>
#include <linux/backing-dev.h>
#include <linux/mount.h>
#include <linux/mpage.h>
#include <linux/namei.h>
#include <linux/swap.h>
#include <linux/writeback.h>
#include <linux/statfs.h>
#include <linux/compat.h>
#include <linux/bit_spinlock.h>
#include <linux/security.h>
#include <linux/xattr.h>
#include <linux/vmalloc.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/uuid.h>
#include <linux/btrfs.h>
#include <linux/uaccess.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "btrfs_inode.h"
#include "print-tree.h"
#include "volumes.h"
#include "locking.h"
#include "inode-map.h"
#include "backref.h"
#include "rcu-string.h"
#include "send.h"
#include "dev-replace.h"
#include "props.h"
#include "sysfs.h"
#include "qgroup.h"

#ifdef CONFIG_64BIT
/* If we have a 32-bit userspace and 64-bit kernel, then the UAPI
 * structures are incorrect, as the timespec structure from userspace
 * is 4 bytes too small. We define these alternatives here to teach
 * the kernel about the 32-bit struct packing.
 */
struct btrfs_ioctl_timespec_32 {
	__u64 sec;
	__u32 nsec;
} __attribute__ ((__packed__));

struct btrfs_ioctl_received_subvol_args_32 {
	char	uuid[BTRFS_UUID_SIZE];	/* in */
	__u64	stransid;		/* in */
	__u64	rtransid;		/* out */
	struct btrfs_ioctl_timespec_32 stime; /* in */
	struct btrfs_ioctl_timespec_32 rtime; /* out */
	__u64	flags;			/* in */
	__u64	reserved[16];		/* in */
} __attribute__ ((__packed__));

#define BTRFS_IOC_SET_RECEIVED_SUBVOL_32 _IOWR(BTRFS_IOCTL_MAGIC, 37, \
				struct btrfs_ioctl_received_subvol_args_32)
#endif


static int btrfs_clone(struct inode *src, struct inode *inode,
		       u64 off, u64 olen, u64 olen_aligned, u64 destoff);

/* Mask out flags that are inappropriate for the given type of inode. */
static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags)
{
	if (S_ISDIR(mode))
		return flags;
	else if (S_ISREG(mode))
		return flags & ~FS_DIRSYNC_FL;
	else
		return flags & (FS_NODUMP_FL | FS_NOATIME_FL);
}
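/*
 * The on-disk inode flags (BTRFS_INODE_*) and the generic FS_*_FL flags
 * used by FS_IOC_GETFLAGS/FS_IOC_SETFLAGS are distinct namespaces; the
 * helpers below translate from the btrfs representation to the FS_*_FL
 * ioctl flags and to the in-memory VFS S_* flags.
 */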
/*
 * Export inode flags to the format expected by the FS_IOC_GETFLAGS ioctl.
 */
static unsigned int btrfs_flags_to_ioctl(unsigned int flags)
{
	unsigned int iflags = 0;

	if (flags & BTRFS_INODE_SYNC)
		iflags |= FS_SYNC_FL;
	if (flags & BTRFS_INODE_IMMUTABLE)
		iflags |= FS_IMMUTABLE_FL;
	if (flags & BTRFS_INODE_APPEND)
		iflags |= FS_APPEND_FL;
	if (flags & BTRFS_INODE_NODUMP)
		iflags |= FS_NODUMP_FL;
	if (flags & BTRFS_INODE_NOATIME)
		iflags |= FS_NOATIME_FL;
	if (flags & BTRFS_INODE_DIRSYNC)
		iflags |= FS_DIRSYNC_FL;
	if (flags & BTRFS_INODE_NODATACOW)
		iflags |= FS_NOCOW_FL;

	if ((flags & BTRFS_INODE_COMPRESS) && !(flags & BTRFS_INODE_NOCOMPRESS))
		iflags |= FS_COMPR_FL;
	else if (flags & BTRFS_INODE_NOCOMPRESS)
		iflags |= FS_NOCOMP_FL;

	return iflags;
}

/*
 * Update inode->i_flags based on the btrfs internal flags.
 */
void btrfs_update_iflags(struct inode *inode)
{
	struct btrfs_inode *ip = BTRFS_I(inode);
	unsigned int new_fl = 0;

	if (ip->flags & BTRFS_INODE_SYNC)
		new_fl |= S_SYNC;
	if (ip->flags & BTRFS_INODE_IMMUTABLE)
		new_fl |= S_IMMUTABLE;
	if (ip->flags & BTRFS_INODE_APPEND)
		new_fl |= S_APPEND;
	if (ip->flags & BTRFS_INODE_NOATIME)
		new_fl |= S_NOATIME;
	if (ip->flags & BTRFS_INODE_DIRSYNC)
		new_fl |= S_DIRSYNC;

	set_mask_bits(&inode->i_flags,
		      S_SYNC | S_APPEND | S_IMMUTABLE | S_NOATIME | S_DIRSYNC,
		      new_fl);
}

/*
 * Inherit flags from the parent inode.
 *
 * Currently only the compression flags and the cow flags are inherited.
 */
void btrfs_inherit_iflags(struct inode *inode, struct inode *dir)
{
	unsigned int flags;

	if (!dir)
		return;

	flags = BTRFS_I(dir)->flags;

	if (flags & BTRFS_INODE_NOCOMPRESS) {
		BTRFS_I(inode)->flags &= ~BTRFS_INODE_COMPRESS;
		BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
	} else if (flags & BTRFS_INODE_COMPRESS) {
		BTRFS_I(inode)->flags &= ~BTRFS_INODE_NOCOMPRESS;
		BTRFS_I(inode)->flags |= BTRFS_INODE_COMPRESS;
	}

	if (flags & BTRFS_INODE_NODATACOW) {
		BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW;
		if (S_ISREG(inode->i_mode))
			BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
	}

	btrfs_update_iflags(inode);
}

static int btrfs_ioctl_getflags(struct file *file, void __user *arg)
{
	struct btrfs_inode *ip = BTRFS_I(file_inode(file));
	unsigned int flags = btrfs_flags_to_ioctl(ip->flags);

	if (copy_to_user(arg, &flags, sizeof(flags)))
		return -EFAULT;
	return 0;
}

static int check_flags(unsigned int flags)
{
	if (flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | \
		      FS_NOATIME_FL | FS_NODUMP_FL | \
		      FS_SYNC_FL | FS_DIRSYNC_FL | \
		      FS_NOCOMP_FL | FS_COMPR_FL |
		      FS_NOCOW_FL))
		return -EOPNOTSUPP;

	if ((flags & FS_NOCOMP_FL) && (flags & FS_COMPR_FL))
		return -EINVAL;

	return 0;
}
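/*
 * Example (userspace, illustrative): btrfs_ioctl_setflags() below is
 * reached through the generic FS_IOC_SETFLAGS ioctl, which is what
 * chattr(1)/lsattr(1) use under the hood:
 *
 *	unsigned int flags;
 *
 *	ioctl(fd, FS_IOC_GETFLAGS, &flags);
 *	flags |= FS_NOATIME_FL;
 *	ioctl(fd, FS_IOC_SETFLAGS, &flags);
 */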
static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
{
	struct inode *inode = file_inode(file);
	struct btrfs_inode *ip = BTRFS_I(inode);
	struct btrfs_root *root = ip->root;
	struct btrfs_trans_handle *trans;
	unsigned int flags, oldflags;
	int ret;
	u64 ip_oldflags;
	unsigned int i_oldflags;
	umode_t mode;

	if (!inode_owner_or_capable(inode))
		return -EPERM;

	if (btrfs_root_readonly(root))
		return -EROFS;

	if (copy_from_user(&flags, arg, sizeof(flags)))
		return -EFAULT;

	ret = check_flags(flags);
	if (ret)
		return ret;

	ret = mnt_want_write_file(file);
	if (ret)
		return ret;

	mutex_lock(&inode->i_mutex);

	ip_oldflags = ip->flags;
	i_oldflags = inode->i_flags;
	mode = inode->i_mode;

	flags = btrfs_mask_flags(inode->i_mode, flags);
	oldflags = btrfs_flags_to_ioctl(ip->flags);
	if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) {
		if (!capable(CAP_LINUX_IMMUTABLE)) {
			ret = -EPERM;
			goto out_unlock;
		}
	}

	if (flags & FS_SYNC_FL)
		ip->flags |= BTRFS_INODE_SYNC;
	else
		ip->flags &= ~BTRFS_INODE_SYNC;
	if (flags & FS_IMMUTABLE_FL)
		ip->flags |= BTRFS_INODE_IMMUTABLE;
	else
		ip->flags &= ~BTRFS_INODE_IMMUTABLE;
	if (flags & FS_APPEND_FL)
		ip->flags |= BTRFS_INODE_APPEND;
	else
		ip->flags &= ~BTRFS_INODE_APPEND;
	if (flags & FS_NODUMP_FL)
		ip->flags |= BTRFS_INODE_NODUMP;
	else
		ip->flags &= ~BTRFS_INODE_NODUMP;
	if (flags & FS_NOATIME_FL)
		ip->flags |= BTRFS_INODE_NOATIME;
	else
		ip->flags &= ~BTRFS_INODE_NOATIME;
	if (flags & FS_DIRSYNC_FL)
		ip->flags |= BTRFS_INODE_DIRSYNC;
	else
		ip->flags &= ~BTRFS_INODE_DIRSYNC;
	if (flags & FS_NOCOW_FL) {
		if (S_ISREG(mode)) {
			/*
			 * It's safe to turn csums off here, no extents exist.
			 * Otherwise we want the flag to reflect the real COW
			 * status of the file and will not set it.
			 */
			if (inode->i_size == 0)
				ip->flags |= BTRFS_INODE_NODATACOW
					   | BTRFS_INODE_NODATASUM;
		} else {
			ip->flags |= BTRFS_INODE_NODATACOW;
		}
	} else {
		/*
		 * Revert back under the same assumptions as above
		 */
		if (S_ISREG(mode)) {
			if (inode->i_size == 0)
				ip->flags &= ~(BTRFS_INODE_NODATACOW
					     | BTRFS_INODE_NODATASUM);
		} else {
			ip->flags &= ~BTRFS_INODE_NODATACOW;
		}
	}
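	/*
	 * Note: the above is why "chattr +C" takes full effect (NODATACOW
	 * plus NODATASUM) only on empty regular files; on files that
	 * already have extents the flag is quietly left unset.
	 */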
	/*
	 * The COMPRESS flag can only be changed by users, while the
	 * NOCOMPRESS flag may be changed automatically if the compression
	 * code won't make things smaller.
	 */
	if (flags & FS_NOCOMP_FL) {
		ip->flags &= ~BTRFS_INODE_COMPRESS;
		ip->flags |= BTRFS_INODE_NOCOMPRESS;

		ret = btrfs_set_prop(inode, "btrfs.compression", NULL, 0, 0);
		if (ret && ret != -ENODATA)
			goto out_drop;
	} else if (flags & FS_COMPR_FL) {
		const char *comp;

		ip->flags |= BTRFS_INODE_COMPRESS;
		ip->flags &= ~BTRFS_INODE_NOCOMPRESS;

		if (root->fs_info->compress_type == BTRFS_COMPRESS_LZO)
			comp = "lzo";
		else
			comp = "zlib";
		ret = btrfs_set_prop(inode, "btrfs.compression",
				     comp, strlen(comp), 0);
		if (ret)
			goto out_drop;

	} else {
		ret = btrfs_set_prop(inode, "btrfs.compression", NULL, 0, 0);
		if (ret && ret != -ENODATA)
			goto out_drop;
		ip->flags &= ~(BTRFS_INODE_COMPRESS | BTRFS_INODE_NOCOMPRESS);
	}

	trans = btrfs_start_transaction(root, 1);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		goto out_drop;
	}

	btrfs_update_iflags(inode);
	inode_inc_iversion(inode);
	inode->i_ctime = CURRENT_TIME;
	ret = btrfs_update_inode(trans, root, inode);

	btrfs_end_transaction(trans, root);
out_drop:
	if (ret) {
		ip->flags = ip_oldflags;
		inode->i_flags = i_oldflags;
	}

out_unlock:
	mutex_unlock(&inode->i_mutex);
	mnt_drop_write_file(file);
	return ret;
}

static int btrfs_ioctl_getversion(struct file *file, int __user *arg)
{
	struct inode *inode = file_inode(file);

	return put_user(inode->i_generation, arg);
}

static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(file_inode(file)->i_sb);
	struct btrfs_device *device;
	struct request_queue *q;
	struct fstrim_range range;
	u64 minlen = ULLONG_MAX;
	u64 num_devices = 0;
	u64 total_bytes = btrfs_super_total_bytes(fs_info->super_copy);
	int ret;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	rcu_read_lock();
	list_for_each_entry_rcu(device, &fs_info->fs_devices->devices,
				dev_list) {
		if (!device->bdev)
			continue;
		q = bdev_get_queue(device->bdev);
		if (blk_queue_discard(q)) {
			num_devices++;
			minlen = min((u64)q->limits.discard_granularity,
				     minlen);
		}
	}
	rcu_read_unlock();

	if (!num_devices)
		return -EOPNOTSUPP;
	if (copy_from_user(&range, arg, sizeof(range)))
		return -EFAULT;
	if (range.start > total_bytes ||
	    range.len < fs_info->sb->s_blocksize)
		return -EINVAL;

	range.len = min(range.len, total_bytes - range.start);
	range.minlen = max(range.minlen, minlen);
	ret = btrfs_trim_fs(fs_info->tree_root, &range);
	if (ret < 0)
		return ret;

	if (copy_to_user(arg, &range, sizeof(range)))
		return -EFAULT;

	return 0;
}

int btrfs_is_empty_uuid(u8 *uuid)
{
	int i;

	for (i = 0; i < BTRFS_UUID_SIZE; i++) {
		if (uuid[i])
			return 0;
	}
	return 1;
}
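/*
 * Create a new subvolume below @dir: allocate and initialize the root
 * tree block and root item for the new tree, insert the root item into
 * the tree of tree roots, and finally link the new subvolume into @dir
 * with a dir item and a root backref.
 */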
static noinline int create_subvol(struct inode *dir,
				  struct dentry *dentry,
				  char *name, int namelen,
				  u64 *async_transid,
				  struct btrfs_qgroup_inherit *inherit)
{
	struct btrfs_trans_handle *trans;
	struct btrfs_key key;
	struct btrfs_root_item root_item;
	struct btrfs_inode_item *inode_item;
	struct extent_buffer *leaf;
	struct btrfs_root *root = BTRFS_I(dir)->root;
	struct btrfs_root *new_root;
	struct btrfs_block_rsv block_rsv;
	struct timespec cur_time = CURRENT_TIME;
	struct inode *inode;
	int ret;
	int err;
	u64 objectid;
	u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID;
	u64 index = 0;
	u64 qgroup_reserved;
	uuid_le new_uuid;

	ret = btrfs_find_free_objectid(root->fs_info->tree_root, &objectid);
	if (ret)
		return ret;

	btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP);
	/*
	 * The same as the snapshot creation, please see the comment
	 * of create_snapshot().
	 */
	ret = btrfs_subvolume_reserve_metadata(root, &block_rsv,
					       8, &qgroup_reserved, false);
	if (ret)
		return ret;

	trans = btrfs_start_transaction(root, 0);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		btrfs_subvolume_release_metadata(root, &block_rsv,
						 qgroup_reserved);
		return ret;
	}
	trans->block_rsv = &block_rsv;
	trans->bytes_reserved = block_rsv.size;

	ret = btrfs_qgroup_inherit(trans, root->fs_info, 0, objectid, inherit);
	if (ret)
		goto fail;

	leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0);
	if (IS_ERR(leaf)) {
		ret = PTR_ERR(leaf);
		goto fail;
	}

	memset_extent_buffer(leaf, 0, 0, sizeof(struct btrfs_header));
	btrfs_set_header_bytenr(leaf, leaf->start);
	btrfs_set_header_generation(leaf, trans->transid);
	btrfs_set_header_backref_rev(leaf, BTRFS_MIXED_BACKREF_REV);
	btrfs_set_header_owner(leaf, objectid);

	write_extent_buffer(leaf, root->fs_info->fsid, btrfs_header_fsid(),
			    BTRFS_FSID_SIZE);
	write_extent_buffer(leaf, root->fs_info->chunk_tree_uuid,
			    btrfs_header_chunk_tree_uuid(leaf),
			    BTRFS_UUID_SIZE);
	btrfs_mark_buffer_dirty(leaf);

	memset(&root_item, 0, sizeof(root_item));

	inode_item = &root_item.inode;
	btrfs_set_stack_inode_generation(inode_item, 1);
	btrfs_set_stack_inode_size(inode_item, 3);
	btrfs_set_stack_inode_nlink(inode_item, 1);
	btrfs_set_stack_inode_nbytes(inode_item, root->nodesize);
	btrfs_set_stack_inode_mode(inode_item, S_IFDIR | 0755);

	btrfs_set_root_flags(&root_item, 0);
	btrfs_set_root_limit(&root_item, 0);
	btrfs_set_stack_inode_flags(inode_item, BTRFS_INODE_ROOT_ITEM_INIT);

	btrfs_set_root_bytenr(&root_item, leaf->start);
	btrfs_set_root_generation(&root_item, trans->transid);
	btrfs_set_root_level(&root_item, 0);
	btrfs_set_root_refs(&root_item, 1);
	btrfs_set_root_used(&root_item, leaf->len);
	btrfs_set_root_last_snapshot(&root_item, 0);

	btrfs_set_root_generation_v2(&root_item,
				     btrfs_root_generation(&root_item));
	uuid_le_gen(&new_uuid);
	memcpy(root_item.uuid, new_uuid.b, BTRFS_UUID_SIZE);
	btrfs_set_stack_timespec_sec(&root_item.otime, cur_time.tv_sec);
	btrfs_set_stack_timespec_nsec(&root_item.otime, cur_time.tv_nsec);
	root_item.ctime = root_item.otime;
	btrfs_set_root_ctransid(&root_item, trans->transid);
	btrfs_set_root_otransid(&root_item, trans->transid);

	btrfs_tree_unlock(leaf);
	free_extent_buffer(leaf);
	leaf = NULL;

	btrfs_set_root_dirid(&root_item, new_dirid);

	key.objectid = objectid;
	key.offset = 0;
	key.type = BTRFS_ROOT_ITEM_KEY;
	ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key,
				&root_item);
	if (ret)
		goto fail;

	key.offset = (u64)-1;
	new_root = btrfs_read_fs_root_no_name(root->fs_info, &key);
	if (IS_ERR(new_root)) {
		btrfs_abort_transaction(trans, root, PTR_ERR(new_root));
		ret = PTR_ERR(new_root);
		goto fail;
	}

	btrfs_record_root_in_trans(trans, new_root);
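	/* Create the new subvolume's root directory inode. */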
	ret = btrfs_create_subvol_root(trans, new_root, root, new_dirid);
	if (ret) {
		/* We potentially lose an unused inode item here */
		btrfs_abort_transaction(trans, root, ret);
		goto fail;
	}

	/*
	 * insert the directory item
	 */
	ret = btrfs_set_inode_index(dir, &index);
	if (ret) {
		btrfs_abort_transaction(trans, root, ret);
		goto fail;
	}

	ret = btrfs_insert_dir_item(trans, root,
				    name, namelen, dir, &key,
				    BTRFS_FT_DIR, index);
	if (ret) {
		btrfs_abort_transaction(trans, root, ret);
		goto fail;
	}

	btrfs_i_size_write(dir, dir->i_size + namelen * 2);
	ret = btrfs_update_inode(trans, root, dir);
	BUG_ON(ret);

	ret = btrfs_add_root_ref(trans, root->fs_info->tree_root,
				 objectid, root->root_key.objectid,
				 btrfs_ino(dir), index, name, namelen);
	BUG_ON(ret);

	ret = btrfs_uuid_tree_add(trans, root->fs_info->uuid_root,
				  root_item.uuid, BTRFS_UUID_KEY_SUBVOL,
				  objectid);
	if (ret)
		btrfs_abort_transaction(trans, root, ret);

fail:
	trans->block_rsv = NULL;
	trans->bytes_reserved = 0;
	btrfs_subvolume_release_metadata(root, &block_rsv, qgroup_reserved);

	if (async_transid) {
		*async_transid = trans->transid;
		err = btrfs_commit_transaction_async(trans, root, 1);
		if (err)
			err = btrfs_commit_transaction(trans, root);
	} else {
		err = btrfs_commit_transaction(trans, root);
	}
	if (err && !ret)
		ret = err;

	if (!ret) {
		inode = btrfs_lookup_dentry(dir, dentry);
		if (IS_ERR(inode))
			return PTR_ERR(inode);
		d_instantiate(dentry, inode);
	}
	return ret;
}

static void btrfs_wait_nocow_write(struct btrfs_root *root)
{
	s64 writers;
	DEFINE_WAIT(wait);

	do {
		prepare_to_wait(&root->subv_writers->wait, &wait,
				TASK_UNINTERRUPTIBLE);

		writers = percpu_counter_sum(&root->subv_writers->counter);
		if (writers)
			schedule();

		finish_wait(&root->subv_writers->wait, &wait);
	} while (writers);
}
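/*
 * Snapshot creation: flush delalloc and wait for ordered extents so the
 * source tree is stable, then queue a btrfs_pending_snapshot on the
 * running transaction; the snapshot itself is taken while that
 * transaction commits.
 */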
static int create_snapshot(struct btrfs_root *root, struct inode *dir,
			   struct dentry *dentry, char *name, int namelen,
			   u64 *async_transid, bool readonly,
			   struct btrfs_qgroup_inherit *inherit)
{
	struct inode *inode;
	struct btrfs_pending_snapshot *pending_snapshot;
	struct btrfs_trans_handle *trans;
	int ret;

	if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
		return -EINVAL;

	atomic_inc(&root->will_be_snapshoted);
	smp_mb__after_atomic();
	btrfs_wait_nocow_write(root);

	ret = btrfs_start_delalloc_inodes(root, 0);
	if (ret)
		goto out;

	btrfs_wait_ordered_extents(root, -1);

	pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS);
	if (!pending_snapshot) {
		ret = -ENOMEM;
		goto out;
	}

	btrfs_init_block_rsv(&pending_snapshot->block_rsv,
			     BTRFS_BLOCK_RSV_TEMP);
	/*
	 * 1 - parent dir inode
	 * 2 - dir entries
	 * 1 - root item
	 * 2 - root ref/backref
	 * 1 - root of snapshot
	 * 1 - UUID item
	 */
	ret = btrfs_subvolume_reserve_metadata(BTRFS_I(dir)->root,
					&pending_snapshot->block_rsv, 8,
					&pending_snapshot->qgroup_reserved,
					false);
	if (ret)
		goto free;

	pending_snapshot->dentry = dentry;
	pending_snapshot->root = root;
	pending_snapshot->readonly = readonly;
	pending_snapshot->dir = dir;
	pending_snapshot->inherit = inherit;

	trans = btrfs_start_transaction(root, 0);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		goto fail;
	}

	spin_lock(&root->fs_info->trans_lock);
	list_add(&pending_snapshot->list,
		 &trans->transaction->pending_snapshots);
	spin_unlock(&root->fs_info->trans_lock);
	if (async_transid) {
		*async_transid = trans->transid;
		ret = btrfs_commit_transaction_async(trans,
				     root->fs_info->extent_root, 1);
		if (ret)
			ret = btrfs_commit_transaction(trans, root);
	} else {
		ret = btrfs_commit_transaction(trans,
					       root->fs_info->extent_root);
	}
	if (ret)
		goto fail;

	ret = pending_snapshot->error;
	if (ret)
		goto fail;

	ret = btrfs_orphan_cleanup(pending_snapshot->snap);
	if (ret)
		goto fail;

	/*
	 * If orphan cleanup did remove any orphans, it means the tree was
	 * modified and therefore the commit root is not the same as the
	 * current root anymore. This is a problem, because send uses the
	 * commit root and therefore can see inode items that don't exist
	 * in the current root anymore, and for example make calls to
	 * btrfs_iget, which will do tree lookups based on the current root
	 * and not on the commit root. Those lookups will fail, returning a
	 * -ESTALE error, and making send fail with that error. So make sure
	 * a send does not see any orphans we have just removed, and that it
	 * will see the same inodes regardless of whether a transaction
	 * commit happened before it started (meaning that the commit root
	 * will be the same as the current root) or not.
	 */
	if (readonly && pending_snapshot->snap->node !=
	    pending_snapshot->snap->commit_root) {
		trans = btrfs_join_transaction(pending_snapshot->snap);
		if (IS_ERR(trans) && PTR_ERR(trans) != -ENOENT) {
			ret = PTR_ERR(trans);
			goto fail;
		}
		if (!IS_ERR(trans)) {
			ret = btrfs_commit_transaction(trans,
						       pending_snapshot->snap);
			if (ret)
				goto fail;
		}
	}

	inode = btrfs_lookup_dentry(dentry->d_parent->d_inode, dentry);
	if (IS_ERR(inode)) {
		ret = PTR_ERR(inode);
		goto fail;
	}

	d_instantiate(dentry, inode);
	ret = 0;
fail:
	btrfs_subvolume_release_metadata(BTRFS_I(dir)->root,
					 &pending_snapshot->block_rsv,
					 pending_snapshot->qgroup_reserved);
free:
	kfree(pending_snapshot);
out:
	atomic_dec(&root->will_be_snapshoted);
	return ret;
}
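/*
 * Example (userspace, illustrative): a read-only snapshot, as created by
 * "btrfs subvolume snapshot -r", boils down to the following, where
 * args.fd refers to the source subvolume and the ioctl is issued on the
 * destination directory:
 *
 *	struct btrfs_ioctl_vol_args_v2 args;
 *
 *	memset(&args, 0, sizeof(args));
 *	args.fd = open("/mnt/subvol", O_RDONLY);
 *	args.flags = BTRFS_SUBVOL_RDONLY;
 *	strcpy(args.name, "snap");
 *	ioctl(destdir_fd, BTRFS_IOC_SNAP_CREATE_V2, &args);
 */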
/* copy of may_delete in fs/namei.c
 * Check whether we can remove a link victim from directory dir, check
 * whether the type of victim is right.
 * 1. We can't do it if dir is read-only (done in permission())
 * 2. We should have write and exec permissions on dir
 * 3. We can't remove anything from append-only dir
 * 4. We can't do anything with immutable dir (done in permission())
 * 5. If the sticky bit on dir is set we should either
 *	a. be owner of dir, or
 *	b. be owner of victim, or
 *	c. have CAP_FOWNER capability
 * 6. If the victim is append-only or immutable we can't do anything with
 *    links pointing to it.
 * 7. If we were asked to remove a directory and victim isn't one - ENOTDIR.
 * 8. If we were asked to remove a non-directory and victim isn't one - EISDIR.
 * 9. We can't remove a root or mountpoint.
 * 10. We don't allow removal of NFS sillyrenamed files; it's handled by
 *     nfs_async_unlink().
 */

static int btrfs_may_delete(struct inode *dir, struct dentry *victim, int isdir)
{
	int error;

	if (!victim->d_inode)
		return -ENOENT;

	BUG_ON(victim->d_parent->d_inode != dir);
	audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE);

	error = inode_permission(dir, MAY_WRITE | MAY_EXEC);
	if (error)
		return error;
	if (IS_APPEND(dir))
		return -EPERM;
	if (check_sticky(dir, victim->d_inode) || IS_APPEND(victim->d_inode) ||
	    IS_IMMUTABLE(victim->d_inode) || IS_SWAPFILE(victim->d_inode))
		return -EPERM;
	if (isdir) {
		if (!S_ISDIR(victim->d_inode->i_mode))
			return -ENOTDIR;
		if (IS_ROOT(victim))
			return -EBUSY;
	} else if (S_ISDIR(victim->d_inode->i_mode))
		return -EISDIR;
	if (IS_DEADDIR(dir))
		return -ENOENT;
	if (victim->d_flags & DCACHE_NFSFS_RENAMED)
		return -EBUSY;
	return 0;
}

/* copy of may_create in fs/namei.c */
static inline int btrfs_may_create(struct inode *dir, struct dentry *child)
{
	if (child->d_inode)
		return -EEXIST;
	if (IS_DEADDIR(dir))
		return -ENOENT;
	return inode_permission(dir, MAY_WRITE | MAY_EXEC);
}

/*
 * Create a new subvolume below @parent. This is largely modeled after
 * sys_mkdirat and vfs_mkdir, but we only do a single component lookup
 * inside this filesystem so it's quite a bit simpler.
 */
static noinline int btrfs_mksubvol(struct path *parent,
				   char *name, int namelen,
				   struct btrfs_root *snap_src,
				   u64 *async_transid, bool readonly,
				   struct btrfs_qgroup_inherit *inherit)
{
	struct inode *dir = parent->dentry->d_inode;
	struct dentry *dentry;
	int error;

	error = mutex_lock_killable_nested(&dir->i_mutex, I_MUTEX_PARENT);
	if (error == -EINTR)
		return error;

	dentry = lookup_one_len(name, parent->dentry, namelen);
	error = PTR_ERR(dentry);
	if (IS_ERR(dentry))
		goto out_unlock;

	error = -EEXIST;
	if (dentry->d_inode)
		goto out_dput;

	error = btrfs_may_create(dir, dentry);
	if (error)
		goto out_dput;

	/*
	 * even if this name doesn't exist, we may get hash collisions.
	 * check for them now when we can safely fail
	 */
	error = btrfs_check_dir_item_collision(BTRFS_I(dir)->root,
					       dir->i_ino, name,
					       namelen);
	if (error)
		goto out_dput;

	down_read(&BTRFS_I(dir)->root->fs_info->subvol_sem);

	if (btrfs_root_refs(&BTRFS_I(dir)->root->root_item) == 0)
		goto out_up_read;

	if (snap_src) {
		error = create_snapshot(snap_src, dir, dentry, name, namelen,
					async_transid, readonly, inherit);
	} else {
		error = create_subvol(dir, dentry, name, namelen,
				      async_transid, inherit);
	}
	if (!error)
		fsnotify_mkdir(dir, dentry);
out_up_read:
	up_read(&BTRFS_I(dir)->root->fs_info->subvol_sem);
out_dput:
	dput(dentry);
out_unlock:
	mutex_unlock(&dir->i_mutex);
	return error;
}
/*
 * When we're defragging a range, we don't want to kick it off again
 * if it is really just waiting for delalloc to send it down.
 * If we find a nice big extent or delalloc range for the bytes in the
 * file you want to defrag, we return 0 to let you know to skip this
 * part of the file.
 */
static int check_defrag_in_cache(struct inode *inode, u64 offset, u32 thresh)
{
	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
	struct extent_map *em = NULL;
	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
	u64 end;

	read_lock(&em_tree->lock);
	em = lookup_extent_mapping(em_tree, offset, PAGE_CACHE_SIZE);
	read_unlock(&em_tree->lock);

	if (em) {
		end = extent_map_end(em);
		free_extent_map(em);
		if (end - offset > thresh)
			return 0;
	}
	/* if we already have a nice delalloc here, just stop */
	thresh /= 2;
	end = count_range_bits(io_tree, &offset, offset + thresh,
			       thresh, EXTENT_DELALLOC, 1);
	if (end >= thresh)
		return 0;
	return 1;
}

/*
 * helper function to walk through a file and find extents
 * newer than a specific transid, and smaller than thresh.
 *
 * This is used by the defragging code to find new and small
 * extents
 */
static int find_new_extents(struct btrfs_root *root,
			    struct inode *inode, u64 newer_than,
			    u64 *off, u32 thresh)
{
	struct btrfs_path *path;
	struct btrfs_key min_key;
	struct extent_buffer *leaf;
	struct btrfs_file_extent_item *extent;
	int type;
	int ret;
	u64 ino = btrfs_ino(inode);

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	min_key.objectid = ino;
	min_key.type = BTRFS_EXTENT_DATA_KEY;
	min_key.offset = *off;

	while (1) {
		ret = btrfs_search_forward(root, &min_key, path, newer_than);
		if (ret != 0)
			goto none;
process_slot:
		if (min_key.objectid != ino)
			goto none;
		if (min_key.type != BTRFS_EXTENT_DATA_KEY)
			goto none;

		leaf = path->nodes[0];
		extent = btrfs_item_ptr(leaf, path->slots[0],
					struct btrfs_file_extent_item);

		type = btrfs_file_extent_type(leaf, extent);
		if (type == BTRFS_FILE_EXTENT_REG &&
		    btrfs_file_extent_num_bytes(leaf, extent) < thresh &&
		    check_defrag_in_cache(inode, min_key.offset, thresh)) {
			*off = min_key.offset;
			btrfs_free_path(path);
			return 0;
		}

		path->slots[0]++;
		if (path->slots[0] < btrfs_header_nritems(leaf)) {
			btrfs_item_key_to_cpu(leaf, &min_key, path->slots[0]);
			goto process_slot;
		}

		if (min_key.offset == (u64)-1)
			goto none;

		min_key.offset++;
		btrfs_release_path(path);
	}
none:
	btrfs_free_path(path);
	return -ENOENT;
}

static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start)
{
	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
	struct extent_map *em;
	u64 len = PAGE_CACHE_SIZE;

	/*
	 * hopefully we have this extent in the tree already, try without
	 * the full extent lock
	 */
	read_lock(&em_tree->lock);
	em = lookup_extent_mapping(em_tree, start, len);
	read_unlock(&em_tree->lock);

	if (!em) {
		struct extent_state *cached = NULL;
		u64 end = start + len - 1;

		/* get the big lock and read metadata off disk */
		lock_extent_bits(io_tree, start, end, 0, &cached);
		em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
		unlock_extent_cached(io_tree, start, end, &cached, GFP_NOFS);

		if (IS_ERR(em))
			return NULL;
	}

	return em;
}
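/*
 * Decide whether it is worth defragging towards the next extent: if the
 * next mapping is a hole or inline extent, or if both extents are already
 * large (>128k) and physically contiguous, merging buys us nothing.
 */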
static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em)
{
	struct extent_map *next;
	bool ret = true;

	/* this is the last extent */
	if (em->start + em->len >= i_size_read(inode))
		return false;

	next = defrag_lookup_extent(inode, em->start + em->len);
	if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE)
		ret = false;
	else if ((em->block_start + em->block_len == next->block_start) &&
		 (em->block_len > 128 * 1024 && next->block_len > 128 * 1024))
		ret = false;

	free_extent_map(next);
	return ret;
}

static int should_defrag_range(struct inode *inode, u64 start, u32 thresh,
			       u64 *last_len, u64 *skip, u64 *defrag_end,
			       int compress)
{
	struct extent_map *em;
	int ret = 1;
	bool next_mergeable = true;

	/*
	 * make sure that once we start defragging an extent, we keep on
	 * defragging it
	 */
	if (start < *defrag_end)
		return 1;

	*skip = 0;

	em = defrag_lookup_extent(inode, start);
	if (!em)
		return 0;

	/* this will cover holes, and inline extents */
	if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
		ret = 0;
		goto out;
	}

	next_mergeable = defrag_check_next_extent(inode, em);
	/*
	 * we hit a real extent, if it is big or the next extent is not a
	 * real extent, don't bother defragging it
	 */
	if (!compress && (*last_len == 0 || *last_len >= thresh) &&
	    (em->len >= thresh || !next_mergeable))
		ret = 0;
out:
	/*
	 * last_len ends up being a counter of how many bytes we've defragged.
	 * every time we choose not to defrag an extent, we reset *last_len
	 * so that the next tiny extent will force a defrag.
	 *
	 * The end result of this is that tiny extents before a single big
	 * extent will force at least part of that big extent to be defragged.
	 */
	if (ret) {
		*defrag_end = extent_map_end(em);
	} else {
		*last_len = 0;
		*skip = extent_map_end(em);
		*defrag_end = 0;
	}

	free_extent_map(em);
	return ret;
}
/*
 * it doesn't do much good to defrag one or two pages
 * at a time. This pulls in a nice chunk of pages
 * to COW and defrag.
 *
 * It also makes sure the delalloc code has enough
 * dirty data to avoid making new small extents as part
 * of the defrag
 *
 * It's a good idea to start RA on this range
 * before calling this.
 */
static int cluster_pages_for_defrag(struct inode *inode,
				    struct page **pages,
				    unsigned long start_index,
				    unsigned long num_pages)
{
	unsigned long file_end;
	u64 isize = i_size_read(inode);
	u64 page_start;
	u64 page_end;
	u64 page_cnt;
	int ret;
	int i;
	int i_done;
	struct btrfs_ordered_extent *ordered;
	struct extent_state *cached_state = NULL;
	struct extent_io_tree *tree;
	gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);

	file_end = (isize - 1) >> PAGE_CACHE_SHIFT;
	if (!isize || start_index > file_end)
		return 0;

	page_cnt = min_t(u64, (u64)num_pages, (u64)file_end - start_index + 1);

	ret = btrfs_delalloc_reserve_space(inode,
					   page_cnt << PAGE_CACHE_SHIFT);
	if (ret)
		return ret;
	i_done = 0;
	tree = &BTRFS_I(inode)->io_tree;

	/* step one, lock all the pages */
	for (i = 0; i < page_cnt; i++) {
		struct page *page;
again:
		page = find_or_create_page(inode->i_mapping,
					   start_index + i, mask);
		if (!page)
			break;

		page_start = page_offset(page);
		page_end = page_start + PAGE_CACHE_SIZE - 1;
		while (1) {
			lock_extent_bits(tree, page_start, page_end,
					 0, &cached_state);
			ordered = btrfs_lookup_ordered_extent(inode,
							      page_start);
			unlock_extent_cached(tree, page_start, page_end,
					     &cached_state, GFP_NOFS);
			if (!ordered)
				break;

			unlock_page(page);
			btrfs_start_ordered_extent(inode, ordered, 1);
			btrfs_put_ordered_extent(ordered);
			lock_page(page);
			/*
			 * we unlocked the page above, so we need to check if
			 * it was released or not.
			 */
			if (page->mapping != inode->i_mapping) {
				unlock_page(page);
				page_cache_release(page);
				goto again;
			}
		}

		if (!PageUptodate(page)) {
			btrfs_readpage(NULL, page);
			lock_page(page);
			if (!PageUptodate(page)) {
				unlock_page(page);
				page_cache_release(page);
				ret = -EIO;
				break;
			}
		}

		if (page->mapping != inode->i_mapping) {
			unlock_page(page);
			page_cache_release(page);
			goto again;
		}

		pages[i] = page;
		i_done++;
	}
	if (!i_done || ret)
		goto out;

	if (!(inode->i_sb->s_flags & MS_ACTIVE))
		goto out;

	/*
	 * so now we have a nice long stream of locked
	 * and up to date pages, let's wait on them
	 */
	for (i = 0; i < i_done; i++)
		wait_on_page_writeback(pages[i]);

	page_start = page_offset(pages[0]);
	page_end = page_offset(pages[i_done - 1]) + PAGE_CACHE_SIZE;

	lock_extent_bits(&BTRFS_I(inode)->io_tree,
			 page_start, page_end - 1, 0, &cached_state);
	clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start,
			 page_end - 1, EXTENT_DIRTY | EXTENT_DELALLOC |
			 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0,
			 &cached_state, GFP_NOFS);

	if (i_done != page_cnt) {
		spin_lock(&BTRFS_I(inode)->lock);
		BTRFS_I(inode)->outstanding_extents++;
		spin_unlock(&BTRFS_I(inode)->lock);
		btrfs_delalloc_release_space(inode,
				     (page_cnt - i_done) << PAGE_CACHE_SHIFT);
	}


	set_extent_defrag(&BTRFS_I(inode)->io_tree, page_start, page_end - 1,
			  &cached_state, GFP_NOFS);

	unlock_extent_cached(&BTRFS_I(inode)->io_tree,
			     page_start, page_end - 1, &cached_state,
			     GFP_NOFS);

	for (i = 0; i < i_done; i++) {
		clear_page_dirty_for_io(pages[i]);
		ClearPageChecked(pages[i]);
		set_page_extent_mapped(pages[i]);
		set_page_dirty(pages[i]);
		unlock_page(pages[i]);
		page_cache_release(pages[i]);
	}
	return i_done;
out:
	for (i = 0; i < i_done; i++) {
		unlock_page(pages[i]);
		page_cache_release(pages[i]);
	}
	btrfs_delalloc_release_space(inode, page_cnt << PAGE_CACHE_SHIFT);
	return ret;

}
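/*
 * Example (userspace, illustrative): btrfs_defrag_file() below is what
 * "btrfs filesystem defragment" ultimately reaches; recompressing a
 * whole file with lzo looks roughly like:
 *
 *	struct btrfs_ioctl_defrag_range_args range;
 *
 *	memset(&range, 0, sizeof(range));
 *	range.len = (u64)-1;
 *	range.flags = BTRFS_DEFRAG_RANGE_COMPRESS;
 *	range.compress_type = BTRFS_COMPRESS_LZO;
 *	ioctl(fd, BTRFS_IOC_DEFRAG_RANGE, &range);
 */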
int btrfs_defrag_file(struct inode *inode, struct file *file,
		      struct btrfs_ioctl_defrag_range_args *range,
		      u64 newer_than, unsigned long max_to_defrag)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct file_ra_state *ra = NULL;
	unsigned long last_index;
	u64 isize = i_size_read(inode);
	u64 last_len = 0;
	u64 skip = 0;
	u64 defrag_end = 0;
	u64 newer_off = range->start;
	unsigned long i;
	unsigned long ra_index = 0;
	int ret;
	int defrag_count = 0;
	int compress_type = BTRFS_COMPRESS_ZLIB;
	u32 extent_thresh = range->extent_thresh;
	unsigned long max_cluster = (256 * 1024) >> PAGE_CACHE_SHIFT;
	unsigned long cluster = max_cluster;
	u64 new_align = ~((u64)128 * 1024 - 1);
	struct page **pages = NULL;

	if (isize == 0)
		return 0;

	if (range->start >= isize)
		return -EINVAL;

	if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS) {
		if (range->compress_type > BTRFS_COMPRESS_TYPES)
			return -EINVAL;
		if (range->compress_type)
			compress_type = range->compress_type;
	}

	if (extent_thresh == 0)
		extent_thresh = 256 * 1024;

	/*
	 * if we were not given a file, allocate a readahead
	 * context
	 */
	if (!file) {
		ra = kzalloc(sizeof(*ra), GFP_NOFS);
		if (!ra)
			return -ENOMEM;
		file_ra_state_init(ra, inode->i_mapping);
	} else {
		ra = &file->f_ra;
	}

	pages = kmalloc_array(max_cluster, sizeof(struct page *),
			      GFP_NOFS);
	if (!pages) {
		ret = -ENOMEM;
		goto out_ra;
	}

	/* find the last page to defrag */
	if (range->start + range->len > range->start) {
		last_index = min_t(u64, isize - 1,
			 range->start + range->len - 1) >> PAGE_CACHE_SHIFT;
	} else {
		last_index = (isize - 1) >> PAGE_CACHE_SHIFT;
	}

	if (newer_than) {
		ret = find_new_extents(root, inode, newer_than,
				       &newer_off, 64 * 1024);
		if (!ret) {
			range->start = newer_off;
			/*
			 * we always align our defrag to help keep
			 * the extents in the file evenly spaced
			 */
			i = (newer_off & new_align) >> PAGE_CACHE_SHIFT;
		} else
			goto out_ra;
	} else {
		i = range->start >> PAGE_CACHE_SHIFT;
	}
	if (!max_to_defrag)
		max_to_defrag = last_index + 1;

	/*
	 * make writeback start from i, so the defrag range can be
	 * written sequentially.
	 */
	if (i < inode->i_mapping->writeback_index)
		inode->i_mapping->writeback_index = i;

	while (i <= last_index && defrag_count < max_to_defrag &&
	       (i < DIV_ROUND_UP(i_size_read(inode), PAGE_CACHE_SIZE))) {
		/*
		 * make sure we stop running if someone unmounts
		 * the FS
		 */
		if (!(inode->i_sb->s_flags & MS_ACTIVE))
			break;

		if (btrfs_defrag_cancelled(root->fs_info)) {
			printk(KERN_DEBUG "BTRFS: defrag_file cancelled\n");
			ret = -EAGAIN;
			break;
		}

		if (!should_defrag_range(inode, (u64)i << PAGE_CACHE_SHIFT,
					 extent_thresh, &last_len, &skip,
					 &defrag_end, range->flags &
					 BTRFS_DEFRAG_RANGE_COMPRESS)) {
			unsigned long next;
			/*
			 * the should_defrag function tells us how much to
			 * skip; bump our counter by the suggested amount
			 */
			next = DIV_ROUND_UP(skip, PAGE_CACHE_SIZE);
			i = max(i + 1, next);
			continue;
		}

		if (!newer_than) {
			cluster = (PAGE_CACHE_ALIGN(defrag_end) >>
				   PAGE_CACHE_SHIFT) - i;
			cluster = min(cluster, max_cluster);
		} else {
			cluster = max_cluster;
		}

		if (i + cluster > ra_index) {
			ra_index = max(i, ra_index);
			btrfs_force_ra(inode->i_mapping, ra, file, ra_index,
				       cluster);
			ra_index += max_cluster;
		}

		mutex_lock(&inode->i_mutex);
		if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)
			BTRFS_I(inode)->force_compress = compress_type;
		ret = cluster_pages_for_defrag(inode, pages, i, cluster);
		if (ret < 0) {
			mutex_unlock(&inode->i_mutex);
			goto out_ra;
		}

		defrag_count += ret;
		balance_dirty_pages_ratelimited(inode->i_mapping);
		mutex_unlock(&inode->i_mutex);

		if (newer_than) {
			if (newer_off == (u64)-1)
				break;

			if (ret > 0)
				i += ret;

			newer_off = max(newer_off + 1,
					(u64)i << PAGE_CACHE_SHIFT);

			ret = find_new_extents(root, inode,
					       newer_than, &newer_off,
					       64 * 1024);
			if (!ret) {
				range->start = newer_off;
				i = (newer_off & new_align) >> PAGE_CACHE_SHIFT;
			} else {
				break;
			}
		} else {
			if (ret > 0) {
				i += ret;
				last_len += ret << PAGE_CACHE_SHIFT;
			} else {
				i++;
				last_len = 0;
			}
		}
	}

	if ((range->flags & BTRFS_DEFRAG_RANGE_START_IO)) {
		filemap_flush(inode->i_mapping);
		if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
			     &BTRFS_I(inode)->runtime_flags))
			filemap_flush(inode->i_mapping);
	}

	if ((range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)) {
		/* the filemap_flush will queue IO into the worker threads, but
		 * we have to make sure the IO is actually started and that
		 * ordered extents get created before we return
		 */
		atomic_inc(&root->fs_info->async_submit_draining);
		while (atomic_read(&root->fs_info->nr_async_submits) ||
		       atomic_read(&root->fs_info->async_delalloc_pages)) {
			wait_event(root->fs_info->async_submit_wait,
			   (atomic_read(&root->fs_info->nr_async_submits) == 0 &&
			    atomic_read(&root->fs_info->async_delalloc_pages) == 0));
		}
		atomic_dec(&root->fs_info->async_submit_draining);
	}

	if (range->compress_type == BTRFS_COMPRESS_LZO) {
		btrfs_set_fs_incompat(root->fs_info, COMPRESS_LZO);
	}

	ret = defrag_count;

out_ra:
	if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS) {
		mutex_lock(&inode->i_mutex);
		BTRFS_I(inode)->force_compress = BTRFS_COMPRESS_NONE;
		mutex_unlock(&inode->i_mutex);
	}
	if (!file)
		kfree(ra);
	kfree(pages);
	return ret;
}
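/*
 * Example (userspace, illustrative): "btrfs filesystem resize 2:-1G /mnt"
 * arrives below with vol_args->name set to "2:-1G", i.e. shrink device 2
 * by 1GiB; a plain "max" grows the device to its full size.
 */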
static noinline int btrfs_ioctl_resize(struct file *file,
					void __user *arg)
{
	u64 new_size;
	u64 old_size;
	u64 devid = 1;
	struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
	struct btrfs_ioctl_vol_args *vol_args;
	struct btrfs_trans_handle *trans;
	struct btrfs_device *device = NULL;
	char *sizestr;
	char *retptr;
	char *devstr = NULL;
	int ret = 0;
	int mod = 0;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	ret = mnt_want_write_file(file);
	if (ret)
		return ret;

	if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
			1)) {
		mnt_drop_write_file(file);
		return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
	}

	mutex_lock(&root->fs_info->volume_mutex);
	vol_args = memdup_user(arg, sizeof(*vol_args));
	if (IS_ERR(vol_args)) {
		ret = PTR_ERR(vol_args);
		goto out;
	}

	vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';

	sizestr = vol_args->name;
	devstr = strchr(sizestr, ':');
	if (devstr) {
		sizestr = devstr + 1;
		*devstr = '\0';
		devstr = vol_args->name;
		ret = kstrtoull(devstr, 10, &devid);
		if (ret)
			goto out_free;
		if (!devid) {
			ret = -EINVAL;
			goto out_free;
		}
		btrfs_info(root->fs_info, "resizing devid %llu", devid);
	}

	device = btrfs_find_device(root->fs_info, devid, NULL, NULL);
	if (!device) {
		btrfs_info(root->fs_info, "resizer unable to find device %llu",
			   devid);
		ret = -ENODEV;
		goto out_free;
	}

	if (!device->writeable) {
		btrfs_info(root->fs_info,
			   "resizer unable to apply on readonly device %llu",
			   devid);
		ret = -EPERM;
		goto out_free;
	}

	if (!strcmp(sizestr, "max"))
		new_size = device->bdev->bd_inode->i_size;
	else {
		if (sizestr[0] == '-') {
			mod = -1;
			sizestr++;
		} else if (sizestr[0] == '+') {
			mod = 1;
			sizestr++;
		}
		new_size = memparse(sizestr, &retptr);
		if (*retptr != '\0' || new_size == 0) {
			ret = -EINVAL;
			goto out_free;
		}
	}

	if (device->is_tgtdev_for_dev_replace) {
		ret = -EPERM;
		goto out_free;
	}

	old_size = btrfs_device_get_total_bytes(device);

	if (mod < 0) {
		if (new_size > old_size) {
			ret = -EINVAL;
			goto out_free;
		}
		new_size = old_size - new_size;
	} else if (mod > 0) {
		if (new_size > ULLONG_MAX - old_size) {
			ret = -ERANGE;
			goto out_free;
		}
		new_size = old_size + new_size;
	}

	if (new_size < 256 * 1024 * 1024) {
		ret = -EINVAL;
		goto out_free;
	}
	if (new_size > device->bdev->bd_inode->i_size) {
		ret = -EFBIG;
		goto out_free;
	}

	do_div(new_size, root->sectorsize);
	new_size *= root->sectorsize;

	printk_in_rcu(KERN_INFO "BTRFS: new size for %s is %llu\n",
		      rcu_str_deref(device->name), new_size);

	if (new_size > old_size) {
		trans = btrfs_start_transaction(root, 0);
		if (IS_ERR(trans)) {
			ret = PTR_ERR(trans);
			goto out_free;
		}
		ret = btrfs_grow_device(trans, device, new_size);
		btrfs_commit_transaction(trans, root);
	} else if (new_size < old_size) {
		ret = btrfs_shrink_device(device, new_size);
	} /* equal, nothing needs to be done */

out_free:
	kfree(vol_args);
out:
	mutex_unlock(&root->fs_info->volume_mutex);
	atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0);
	mnt_drop_write_file(file);
	return ret;
}
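/*
 * Common helper behind the v1 and v2 subvolume/snapshot creation ioctls:
 * validate the name, then either create a fresh subvolume or snapshot the
 * subvolume referred to by @fd.
 */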
static noinline int btrfs_ioctl_snap_create_transid(struct file *file,
				char *name, unsigned long fd, int subvol,
				u64 *transid, bool readonly,
				struct btrfs_qgroup_inherit *inherit)
{
	int namelen;
	int ret = 0;

	ret = mnt_want_write_file(file);
	if (ret)
		goto out;

	namelen = strlen(name);
	if (strchr(name, '/')) {
		ret = -EINVAL;
		goto out_drop_write;
	}

	if (name[0] == '.' &&
	   (namelen == 1 || (name[1] == '.' && namelen == 2))) {
		ret = -EEXIST;
		goto out_drop_write;
	}

	if (subvol) {
		ret = btrfs_mksubvol(&file->f_path, name, namelen,
				     NULL, transid, readonly, inherit);
	} else {
		struct fd src = fdget(fd);
		struct inode *src_inode;
		if (!src.file) {
			ret = -EINVAL;
			goto out_drop_write;
		}

		src_inode = file_inode(src.file);
		if (src_inode->i_sb != file_inode(file)->i_sb) {
			btrfs_info(BTRFS_I(src_inode)->root->fs_info,
				   "Snapshot src from another FS");
			ret = -EXDEV;
		} else if (!inode_owner_or_capable(src_inode)) {
			/*
			 * Subvolume creation is not restricted, but snapshots
			 * are limited to the user's own subvolumes only
			 */
			ret = -EPERM;
		} else {
			ret = btrfs_mksubvol(&file->f_path, name, namelen,
					     BTRFS_I(src_inode)->root,
					     transid, readonly, inherit);
		}
		fdput(src);
	}
out_drop_write:
	mnt_drop_write_file(file);
out:
	return ret;
}

static noinline int btrfs_ioctl_snap_create(struct file *file,
					    void __user *arg, int subvol)
{
	struct btrfs_ioctl_vol_args *vol_args;
	int ret;

	vol_args = memdup_user(arg, sizeof(*vol_args));
	if (IS_ERR(vol_args))
		return PTR_ERR(vol_args);
	vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';

	ret = btrfs_ioctl_snap_create_transid(file, vol_args->name,
					      vol_args->fd, subvol,
					      NULL, false, NULL);

	kfree(vol_args);
	return ret;
}

static noinline int btrfs_ioctl_snap_create_v2(struct file *file,
					       void __user *arg, int subvol)
{
	struct btrfs_ioctl_vol_args_v2 *vol_args;
	int ret;
	u64 transid = 0;
	u64 *ptr = NULL;
	bool readonly = false;
	struct btrfs_qgroup_inherit *inherit = NULL;

	vol_args = memdup_user(arg, sizeof(*vol_args));
	if (IS_ERR(vol_args))
		return PTR_ERR(vol_args);
	vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0';

	if (vol_args->flags &
	    ~(BTRFS_SUBVOL_CREATE_ASYNC | BTRFS_SUBVOL_RDONLY |
	      BTRFS_SUBVOL_QGROUP_INHERIT)) {
		ret = -EOPNOTSUPP;
		goto free_args;
	}

	if (vol_args->flags & BTRFS_SUBVOL_CREATE_ASYNC)
		ptr = &transid;
	if (vol_args->flags & BTRFS_SUBVOL_RDONLY)
		readonly = true;
	if (vol_args->flags & BTRFS_SUBVOL_QGROUP_INHERIT) {
		if (vol_args->size > PAGE_CACHE_SIZE) {
			ret = -EINVAL;
			goto free_args;
		}
		inherit = memdup_user(vol_args->qgroup_inherit, vol_args->size);
		if (IS_ERR(inherit)) {
			ret = PTR_ERR(inherit);
			goto free_args;
		}
	}

	ret = btrfs_ioctl_snap_create_transid(file, vol_args->name,
					      vol_args->fd, subvol, ptr,
					      readonly, inherit);
	if (ret)
		goto free_inherit;

	if (ptr && copy_to_user(arg +
				offsetof(struct btrfs_ioctl_vol_args_v2,
					transid),
				ptr, sizeof(*ptr)))
		ret = -EFAULT;

free_inherit:
	kfree(inherit);
free_args:
	kfree(vol_args);
	return ret;
}
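/*
 * Example (userspace, illustrative): flipping a subvolume read-only, as
 * "btrfs property set <subvol> ro true" does, uses the pair of ioctls
 * below:
 *
 *	u64 flags;
 *
 *	ioctl(fd, BTRFS_IOC_SUBVOL_GETFLAGS, &flags);
 *	flags |= BTRFS_SUBVOL_RDONLY;
 *	ioctl(fd, BTRFS_IOC_SUBVOL_SETFLAGS, &flags);
 */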
static noinline int btrfs_ioctl_subvol_getflags(struct file *file,
						void __user *arg)
{
	struct inode *inode = file_inode(file);
	struct btrfs_root *root = BTRFS_I(inode)->root;
	int ret = 0;
	u64 flags = 0;

	if (btrfs_ino(inode) != BTRFS_FIRST_FREE_OBJECTID)
		return -EINVAL;

	down_read(&root->fs_info->subvol_sem);
	if (btrfs_root_readonly(root))
		flags |= BTRFS_SUBVOL_RDONLY;
	up_read(&root->fs_info->subvol_sem);

	if (copy_to_user(arg, &flags, sizeof(flags)))
		ret = -EFAULT;

	return ret;
}

static noinline int btrfs_ioctl_subvol_setflags(struct file *file,
						void __user *arg)
{
	struct inode *inode = file_inode(file);
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_trans_handle *trans;
	u64 root_flags;
	u64 flags;
	int ret = 0;

	if (!inode_owner_or_capable(inode))
		return -EPERM;

	ret = mnt_want_write_file(file);
	if (ret)
		goto out;

	if (btrfs_ino(inode) != BTRFS_FIRST_FREE_OBJECTID) {
		ret = -EINVAL;
		goto out_drop_write;
	}

	if (copy_from_user(&flags, arg, sizeof(flags))) {
		ret = -EFAULT;
		goto out_drop_write;
	}

	if (flags & BTRFS_SUBVOL_CREATE_ASYNC) {
		ret = -EINVAL;
		goto out_drop_write;
	}

	if (flags & ~BTRFS_SUBVOL_RDONLY) {
		ret = -EOPNOTSUPP;
		goto out_drop_write;
	}

	down_write(&root->fs_info->subvol_sem);

	/* nothing to do */
	if (!!(flags & BTRFS_SUBVOL_RDONLY) == btrfs_root_readonly(root))
		goto out_drop_sem;

	root_flags = btrfs_root_flags(&root->root_item);
	if (flags & BTRFS_SUBVOL_RDONLY) {
		btrfs_set_root_flags(&root->root_item,
				     root_flags | BTRFS_ROOT_SUBVOL_RDONLY);
	} else {
		/*
		 * Block RO -> RW transition if this subvolume is involved in
		 * send
		 */
		spin_lock(&root->root_item_lock);
		if (root->send_in_progress == 0) {
			btrfs_set_root_flags(&root->root_item,
				     root_flags & ~BTRFS_ROOT_SUBVOL_RDONLY);
			spin_unlock(&root->root_item_lock);
		} else {
			spin_unlock(&root->root_item_lock);
			btrfs_warn(root->fs_info,
			"Attempt to set subvolume %llu read-write during send",
				   root->root_key.objectid);
			ret = -EPERM;
			goto out_drop_sem;
		}
	}

	trans = btrfs_start_transaction(root, 1);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		goto out_reset;
	}

	ret = btrfs_update_root(trans, root->fs_info->tree_root,
				&root->root_key, &root->root_item);

	btrfs_commit_transaction(trans, root);
out_reset:
	if (ret)
		btrfs_set_root_flags(&root->root_item, root_flags);
out_drop_sem:
	up_write(&root->fs_info->subvol_sem);
out_drop_write:
	mnt_drop_write_file(file);
out:
	return ret;
}
/*
 * helper to check if the subvolume references other subvolumes
 */
static noinline int may_destroy_subvol(struct btrfs_root *root)
{
	struct btrfs_path *path;
	struct btrfs_dir_item *di;
	struct btrfs_key key;
	u64 dir_id;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	/* Make sure this root isn't set as the default subvol */
	dir_id = btrfs_super_root_dir(root->fs_info->super_copy);
	di = btrfs_lookup_dir_item(NULL, root->fs_info->tree_root, path,
				   dir_id, "default", 7, 0);
	if (di && !IS_ERR(di)) {
		btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key);
		if (key.objectid == root->root_key.objectid) {
			ret = -EPERM;
			btrfs_err(root->fs_info, "deleting default subvolume "
				  "%llu is not allowed", key.objectid);
			goto out;
		}
		btrfs_release_path(path);
	}

	key.objectid = root->root_key.objectid;
	key.type = BTRFS_ROOT_REF_KEY;
	key.offset = (u64)-1;

	ret = btrfs_search_slot(NULL, root->fs_info->tree_root,
				&key, path, 0, 0);
	if (ret < 0)
		goto out;
	BUG_ON(ret == 0);

	ret = 0;
	if (path->slots[0] > 0) {
		path->slots[0]--;
		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
		if (key.objectid == root->root_key.objectid &&
		    key.type == BTRFS_ROOT_REF_KEY)
			ret = -ENOTEMPTY;
	}
out:
	btrfs_free_path(path);
	return ret;
}

static noinline int key_in_sk(struct btrfs_key *key,
			      struct btrfs_ioctl_search_key *sk)
{
	struct btrfs_key test;
	int ret;

	test.objectid = sk->min_objectid;
	test.type = sk->min_type;
	test.offset = sk->min_offset;

	ret = btrfs_comp_cpu_keys(key, &test);
	if (ret < 0)
		return 0;

	test.objectid = sk->max_objectid;
	test.type = sk->max_type;
	test.offset = sk->max_offset;

	ret = btrfs_comp_cpu_keys(key, &test);
	if (ret > 0)
		return 0;
	return 1;
}

static noinline int copy_to_sk(struct btrfs_root *root,
			       struct btrfs_path *path,
			       struct btrfs_key *key,
			       struct btrfs_ioctl_search_key *sk,
			       size_t *buf_size,
			       char __user *ubuf,
			       unsigned long *sk_offset,
			       int *num_found)
{
	u64 found_transid;
	struct extent_buffer *leaf;
	struct btrfs_ioctl_search_header sh;
	unsigned long item_off;
	unsigned long item_len;
	int nritems;
	int i;
	int slot;
	int ret = 0;

	leaf = path->nodes[0];
	slot = path->slots[0];
	nritems = btrfs_header_nritems(leaf);

	if (btrfs_header_generation(leaf) > sk->max_transid) {
		i = nritems;
		goto advance_key;
	}
	found_transid = btrfs_header_generation(leaf);

	for (i = slot; i < nritems; i++) {
		item_off = btrfs_item_ptr_offset(leaf, i);
		item_len = btrfs_item_size_nr(leaf, i);

		btrfs_item_key_to_cpu(leaf, key, i);
		if (!key_in_sk(key, sk))
			continue;

		if (sizeof(sh) + item_len > *buf_size) {
			if (*num_found) {
				ret = 1;
				goto out;
			}

			/*
			 * return one empty item back for v1, which does not
			 * handle -EOVERFLOW
			 */

			*buf_size = sizeof(sh) + item_len;
			item_len = 0;
			ret = -EOVERFLOW;
		}

		if (sizeof(sh) + item_len + *sk_offset > *buf_size) {
			ret = 1;
			goto out;
		}

		sh.objectid = key->objectid;
		sh.offset = key->offset;
		sh.type = key->type;
		sh.len = item_len;
		sh.transid = found_transid;

		/* copy search result header */
		if (copy_to_user(ubuf + *sk_offset, &sh, sizeof(sh))) {
			ret = -EFAULT;
			goto out;
		}

		*sk_offset += sizeof(sh);

		if (item_len) {
			char __user *up = ubuf + *sk_offset;
			/* copy the item */
			if (read_extent_buffer_to_user(leaf, up,
						       item_off, item_len)) {
				ret = -EFAULT;
				goto out;
			}
			*sk_offset += item_len;
		}
		(*num_found)++;

		if (ret) /* -EOVERFLOW from above */
			goto out;

		if (*num_found >= sk->nr_items) {
			ret = 1;
			goto out;
		}
	}
advance_key:
	ret = 0;
	if (key->offset < (u64)-1 && key->offset < sk->max_offset)
		key->offset++;
	else if (key->type < (u8)-1 && key->type < sk->max_type) {
		key->offset = 0;
		key->type++;
	} else if (key->objectid < (u64)-1 && key->objectid < sk->max_objectid) {
		key->offset = 0;
		key->type = 0;
		key->objectid++;
	} else
		ret = 1;
out:
	/*
	 *  0: all items from this leaf copied, continue with next
	 *  1: * more items can be copied, but unused buffer is too small
	 *     * all items were found
	 *     Either way, it will stop the loop which iterates to the next
	 *     leaf
	 *  -EOVERFLOW: item was too large for buffer
	 *  -EFAULT: could not copy extent buffer back to userspace
	 */
	return ret;
}

static noinline int search_ioctl(struct inode *inode,
				 struct btrfs_ioctl_search_key *sk,
				 size_t *buf_size,
				 char __user *ubuf)
{
	struct btrfs_root *root;
	struct btrfs_key key;
	struct btrfs_path *path;
	struct btrfs_fs_info *info = BTRFS_I(inode)->root->fs_info;
	int ret;
	int num_found = 0;
	unsigned long sk_offset = 0;

	if (*buf_size < sizeof(struct btrfs_ioctl_search_header)) {
		*buf_size = sizeof(struct btrfs_ioctl_search_header);
		return -EOVERFLOW;
	}

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	if (sk->tree_id == 0) {
		/* search the root of the inode that was passed */
		root = BTRFS_I(inode)->root;
	} else {
		key.objectid = sk->tree_id;
		key.type = BTRFS_ROOT_ITEM_KEY;
		key.offset = (u64)-1;
		root = btrfs_read_fs_root_no_name(info, &key);
		if (IS_ERR(root)) {
			printk(KERN_ERR "BTRFS: could not find root %llu\n",
			       sk->tree_id);
			btrfs_free_path(path);
			return -ENOENT;
		}
	}

	key.objectid = sk->min_objectid;
	key.type = sk->min_type;
	key.offset = sk->min_offset;

	while (1) {
		ret = btrfs_search_forward(root, &key, path, sk->min_transid);
		if (ret != 0) {
			if (ret > 0)
				ret = 0;
			goto err;
		}
		ret = copy_to_sk(root, path, &key, sk, buf_size, ubuf,
				 &sk_offset, &num_found);
		btrfs_release_path(path);
		if (ret)
			break;

	}
	if (ret > 0)
		ret = 0;
err:
	sk->nr_items = num_found;
	btrfs_free_path(path);
	return ret;
}
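/*
 * Example (userspace, illustrative): dumping up to 4096 items of the
 * root tree with the v1 search ioctl (CAP_SYS_ADMIN is required):
 *
 *	struct btrfs_ioctl_search_args args;
 *
 *	memset(&args, 0, sizeof(args));
 *	args.key.tree_id = 1;
 *	args.key.max_objectid = (u64)-1;
 *	args.key.max_type = (u32)-1;
 *	args.key.max_offset = (u64)-1;
 *	args.key.max_transid = (u64)-1;
 *	args.key.nr_items = 4096;
 *	ioctl(fd, BTRFS_IOC_TREE_SEARCH, &args);
 */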
2152 */ 2153 if (ret == -EOVERFLOW) 2154 ret = 0; 2155 2156 if (ret == 0 && copy_to_user(&uargs->key, &sk, sizeof(sk))) 2157 ret = -EFAULT; 2158 return ret; 2159 } 2160 2161 static noinline int btrfs_ioctl_tree_search_v2(struct file *file, 2162 void __user *argp) 2163 { 2164 struct btrfs_ioctl_search_args_v2 __user *uarg; 2165 struct btrfs_ioctl_search_args_v2 args; 2166 struct inode *inode; 2167 int ret; 2168 size_t buf_size; 2169 const size_t buf_limit = 16 * 1024 * 1024; 2170 2171 if (!capable(CAP_SYS_ADMIN)) 2172 return -EPERM; 2173 2174 /* copy search header and buffer size */ 2175 uarg = (struct btrfs_ioctl_search_args_v2 __user *)argp; 2176 if (copy_from_user(&args, uarg, sizeof(args))) 2177 return -EFAULT; 2178 2179 buf_size = args.buf_size; 2180 2181 if (buf_size < sizeof(struct btrfs_ioctl_search_header)) 2182 return -EOVERFLOW; 2183 2184 /* limit result size to 16MB */ 2185 if (buf_size > buf_limit) 2186 buf_size = buf_limit; 2187 2188 inode = file_inode(file); 2189 ret = search_ioctl(inode, &args.key, &buf_size, 2190 (char *)(&uarg->buf[0])); 2191 if (ret == 0 && copy_to_user(&uarg->key, &args.key, sizeof(args.key))) 2192 ret = -EFAULT; 2193 else if (ret == -EOVERFLOW && 2194 copy_to_user(&uarg->buf_size, &buf_size, sizeof(buf_size))) 2195 ret = -EFAULT; 2196 2197 return ret; 2198 } 2199 2200 /* 2201 * Search INODE_REFs to identify path name of 'dirid' directory 2202 * in a 'tree_id' tree. and sets path name to 'name'. 2203 */ 2204 static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info, 2205 u64 tree_id, u64 dirid, char *name) 2206 { 2207 struct btrfs_root *root; 2208 struct btrfs_key key; 2209 char *ptr; 2210 int ret = -1; 2211 int slot; 2212 int len; 2213 int total_len = 0; 2214 struct btrfs_inode_ref *iref; 2215 struct extent_buffer *l; 2216 struct btrfs_path *path; 2217 2218 if (dirid == BTRFS_FIRST_FREE_OBJECTID) { 2219 name[0]='\0'; 2220 return 0; 2221 } 2222 2223 path = btrfs_alloc_path(); 2224 if (!path) 2225 return -ENOMEM; 2226 2227 ptr = &name[BTRFS_INO_LOOKUP_PATH_MAX]; 2228 2229 key.objectid = tree_id; 2230 key.type = BTRFS_ROOT_ITEM_KEY; 2231 key.offset = (u64)-1; 2232 root = btrfs_read_fs_root_no_name(info, &key); 2233 if (IS_ERR(root)) { 2234 printk(KERN_ERR "BTRFS: could not find root %llu\n", tree_id); 2235 ret = -ENOENT; 2236 goto out; 2237 } 2238 2239 key.objectid = dirid; 2240 key.type = BTRFS_INODE_REF_KEY; 2241 key.offset = (u64)-1; 2242 2243 while (1) { 2244 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 2245 if (ret < 0) 2246 goto out; 2247 else if (ret > 0) { 2248 ret = btrfs_previous_item(root, path, dirid, 2249 BTRFS_INODE_REF_KEY); 2250 if (ret < 0) 2251 goto out; 2252 else if (ret > 0) { 2253 ret = -ENOENT; 2254 goto out; 2255 } 2256 } 2257 2258 l = path->nodes[0]; 2259 slot = path->slots[0]; 2260 btrfs_item_key_to_cpu(l, &key, slot); 2261 2262 iref = btrfs_item_ptr(l, slot, struct btrfs_inode_ref); 2263 len = btrfs_inode_ref_name_len(l, iref); 2264 ptr -= len + 1; 2265 total_len += len + 1; 2266 if (ptr < name) { 2267 ret = -ENAMETOOLONG; 2268 goto out; 2269 } 2270 2271 *(ptr + len) = '/'; 2272 read_extent_buffer(l, ptr, (unsigned long)(iref + 1), len); 2273 2274 if (key.offset == BTRFS_FIRST_FREE_OBJECTID) 2275 break; 2276 2277 btrfs_release_path(path); 2278 key.objectid = key.offset; 2279 key.offset = (u64)-1; 2280 dirid = key.objectid; 2281 } 2282 memmove(name, ptr, total_len); 2283 name[total_len] = '\0'; 2284 ret = 0; 2285 out: 2286 btrfs_free_path(path); 2287 return ret; 2288 } 2289 2290 static noinline int 
btrfs_ioctl_ino_lookup(struct file *file, 2291 void __user *argp) 2292 { 2293 struct btrfs_ioctl_ino_lookup_args *args; 2294 struct inode *inode; 2295 int ret; 2296 2297 if (!capable(CAP_SYS_ADMIN)) 2298 return -EPERM; 2299 2300 args = memdup_user(argp, sizeof(*args)); 2301 if (IS_ERR(args)) 2302 return PTR_ERR(args); 2303 2304 inode = file_inode(file); 2305 2306 if (args->treeid == 0) 2307 args->treeid = BTRFS_I(inode)->root->root_key.objectid; 2308 2309 ret = btrfs_search_path_in_tree(BTRFS_I(inode)->root->fs_info, 2310 args->treeid, args->objectid, 2311 args->name); 2312 2313 if (ret == 0 && copy_to_user(argp, args, sizeof(*args))) 2314 ret = -EFAULT; 2315 2316 kfree(args); 2317 return ret; 2318 } 2319 2320 static noinline int btrfs_ioctl_snap_destroy(struct file *file, 2321 void __user *arg) 2322 { 2323 struct dentry *parent = file->f_path.dentry; 2324 struct dentry *dentry; 2325 struct inode *dir = parent->d_inode; 2326 struct inode *inode; 2327 struct btrfs_root *root = BTRFS_I(dir)->root; 2328 struct btrfs_root *dest = NULL; 2329 struct btrfs_ioctl_vol_args *vol_args; 2330 struct btrfs_trans_handle *trans; 2331 struct btrfs_block_rsv block_rsv; 2332 u64 root_flags; 2333 u64 qgroup_reserved; 2334 int namelen; 2335 int ret; 2336 int err = 0; 2337 2338 vol_args = memdup_user(arg, sizeof(*vol_args)); 2339 if (IS_ERR(vol_args)) 2340 return PTR_ERR(vol_args); 2341 2342 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; 2343 namelen = strlen(vol_args->name); 2344 if (strchr(vol_args->name, '/') || 2345 strncmp(vol_args->name, "..", namelen) == 0) { 2346 err = -EINVAL; 2347 goto out; 2348 } 2349 2350 err = mnt_want_write_file(file); 2351 if (err) 2352 goto out; 2353 2354 2355 err = mutex_lock_killable_nested(&dir->i_mutex, I_MUTEX_PARENT); 2356 if (err == -EINTR) 2357 goto out_drop_write; 2358 dentry = lookup_one_len(vol_args->name, parent, namelen); 2359 if (IS_ERR(dentry)) { 2360 err = PTR_ERR(dentry); 2361 goto out_unlock_dir; 2362 } 2363 2364 if (!dentry->d_inode) { 2365 err = -ENOENT; 2366 goto out_dput; 2367 } 2368 2369 inode = dentry->d_inode; 2370 dest = BTRFS_I(inode)->root; 2371 if (!capable(CAP_SYS_ADMIN)) { 2372 /* 2373 * Regular user. Only allow this with a special mount 2374 * option, when the user has write+exec access to the 2375 * subvol root, and when rmdir(2) would have been 2376 * allowed. 2377 * 2378 * Note that this is _not_ a check that the subvol is 2379 * empty or doesn't contain data that we wouldn't 2380 * otherwise be able to delete. 2381 * 2382 * Users who want to delete empty subvols should try 2383 * rmdir(2). 2384 */ 2385 err = -EPERM; 2386 if (!btrfs_test_opt(root, USER_SUBVOL_RM_ALLOWED)) 2387 goto out_dput; 2388 2389 /* 2390 * Do not allow deletion if the parent dir is the same 2391 * as the dir to be deleted. That means the ioctl 2392 * must be called on the dentry referencing the root 2393 * of the subvol, not a random directory contained 2394 * within it. 2395 */ 2396 err = -EINVAL; 2397 if (root == dest) 2398 goto out_dput; 2399 2400 err = inode_permission(inode, MAY_WRITE | MAY_EXEC); 2401 if (err) 2402 goto out_dput; 2403 } 2404 2405 /* check if subvolume may be deleted by a user */ 2406 err = btrfs_may_delete(dir, dentry, 1); 2407 if (err) 2408 goto out_dput; 2409 2410 if (btrfs_ino(inode) != BTRFS_FIRST_FREE_OBJECTID) { 2411 err = -EINVAL; 2412 goto out_dput; 2413 } 2414 2415 mutex_lock(&inode->i_mutex); 2416 2417 /* 2418 * Don't allow deleting a subvolume while a send is in progress.
This is 2419 * inside the i_mutex so the error handling that has to drop the bit 2420 * again is not run concurrently. 2421 */ 2422 spin_lock(&dest->root_item_lock); 2423 root_flags = btrfs_root_flags(&dest->root_item); 2424 if (dest->send_in_progress == 0) { 2425 btrfs_set_root_flags(&dest->root_item, 2426 root_flags | BTRFS_ROOT_SUBVOL_DEAD); 2427 spin_unlock(&dest->root_item_lock); 2428 } else { 2429 spin_unlock(&dest->root_item_lock); 2430 btrfs_warn(root->fs_info, 2431 "Attempt to delete subvolume %llu during send", 2432 dest->root_key.objectid); 2433 err = -EPERM; 2434 goto out_dput; 2435 } 2436 2437 d_invalidate(dentry); 2438 2439 down_write(&root->fs_info->subvol_sem); 2440 2441 err = may_destroy_subvol(dest); 2442 if (err) 2443 goto out_up_write; 2444 2445 btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP); 2446 /* 2447 * One for dir inode, two for dir entries, two for root 2448 * ref/backref. 2449 */ 2450 err = btrfs_subvolume_reserve_metadata(root, &block_rsv, 2451 5, &qgroup_reserved, true); 2452 if (err) 2453 goto out_up_write; 2454 2455 trans = btrfs_start_transaction(root, 0); 2456 if (IS_ERR(trans)) { 2457 err = PTR_ERR(trans); 2458 goto out_release; 2459 } 2460 trans->block_rsv = &block_rsv; 2461 trans->bytes_reserved = block_rsv.size; 2462 2463 ret = btrfs_unlink_subvol(trans, root, dir, 2464 dest->root_key.objectid, 2465 dentry->d_name.name, 2466 dentry->d_name.len); 2467 if (ret) { 2468 err = ret; 2469 btrfs_abort_transaction(trans, root, ret); 2470 goto out_end_trans; 2471 } 2472 2473 btrfs_record_root_in_trans(trans, dest); 2474 2475 memset(&dest->root_item.drop_progress, 0, 2476 sizeof(dest->root_item.drop_progress)); 2477 dest->root_item.drop_level = 0; 2478 btrfs_set_root_refs(&dest->root_item, 0); 2479 2480 if (!test_and_set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &dest->state)) { 2481 ret = btrfs_insert_orphan_item(trans, 2482 root->fs_info->tree_root, 2483 dest->root_key.objectid); 2484 if (ret) { 2485 btrfs_abort_transaction(trans, root, ret); 2486 err = ret; 2487 goto out_end_trans; 2488 } 2489 } 2490 2491 ret = btrfs_uuid_tree_rem(trans, root->fs_info->uuid_root, 2492 dest->root_item.uuid, BTRFS_UUID_KEY_SUBVOL, 2493 dest->root_key.objectid); 2494 if (ret && ret != -ENOENT) { 2495 btrfs_abort_transaction(trans, root, ret); 2496 err = ret; 2497 goto out_end_trans; 2498 } 2499 if (!btrfs_is_empty_uuid(dest->root_item.received_uuid)) { 2500 ret = btrfs_uuid_tree_rem(trans, root->fs_info->uuid_root, 2501 dest->root_item.received_uuid, 2502 BTRFS_UUID_KEY_RECEIVED_SUBVOL, 2503 dest->root_key.objectid); 2504 if (ret && ret != -ENOENT) { 2505 btrfs_abort_transaction(trans, root, ret); 2506 err = ret; 2507 goto out_end_trans; 2508 } 2509 } 2510 2511 out_end_trans: 2512 trans->block_rsv = NULL; 2513 trans->bytes_reserved = 0; 2514 ret = btrfs_end_transaction(trans, root); 2515 if (ret && !err) 2516 err = ret; 2517 inode->i_flags |= S_DEAD; 2518 out_release: 2519 btrfs_subvolume_release_metadata(root, &block_rsv, qgroup_reserved); 2520 out_up_write: 2521 up_write(&root->fs_info->subvol_sem); 2522 if (err) { 2523 spin_lock(&dest->root_item_lock); 2524 root_flags = btrfs_root_flags(&dest->root_item); 2525 btrfs_set_root_flags(&dest->root_item, 2526 root_flags & ~BTRFS_ROOT_SUBVOL_DEAD); 2527 spin_unlock(&dest->root_item_lock); 2528 } 2529 mutex_unlock(&inode->i_mutex); 2530 if (!err) { 2531 shrink_dcache_sb(root->fs_info->sb); 2532 btrfs_invalidate_inodes(dest); 2533 d_delete(dentry); 2534 ASSERT(dest->send_in_progress == 0); 2535 2536 /* the last ref */ 2537 if 
(dest->ino_cache_inode) { 2538 iput(dest->ino_cache_inode); 2539 dest->ino_cache_inode = NULL; 2540 } 2541 } 2542 out_dput: 2543 dput(dentry); 2544 out_unlock_dir: 2545 mutex_unlock(&dir->i_mutex); 2546 out_drop_write: 2547 mnt_drop_write_file(file); 2548 out: 2549 kfree(vol_args); 2550 return err; 2551 } 2552 2553 static int btrfs_ioctl_defrag(struct file *file, void __user *argp) 2554 { 2555 struct inode *inode = file_inode(file); 2556 struct btrfs_root *root = BTRFS_I(inode)->root; 2557 struct btrfs_ioctl_defrag_range_args *range; 2558 int ret; 2559 2560 ret = mnt_want_write_file(file); 2561 if (ret) 2562 return ret; 2563 2564 if (btrfs_root_readonly(root)) { 2565 ret = -EROFS; 2566 goto out; 2567 } 2568 2569 switch (inode->i_mode & S_IFMT) { 2570 case S_IFDIR: 2571 if (!capable(CAP_SYS_ADMIN)) { 2572 ret = -EPERM; 2573 goto out; 2574 } 2575 ret = btrfs_defrag_root(root); 2576 if (ret) 2577 goto out; 2578 ret = btrfs_defrag_root(root->fs_info->extent_root); 2579 break; 2580 case S_IFREG: 2581 if (!(file->f_mode & FMODE_WRITE)) { 2582 ret = -EINVAL; 2583 goto out; 2584 } 2585 2586 range = kzalloc(sizeof(*range), GFP_KERNEL); 2587 if (!range) { 2588 ret = -ENOMEM; 2589 goto out; 2590 } 2591 2592 if (argp) { 2593 if (copy_from_user(range, argp, 2594 sizeof(*range))) { 2595 ret = -EFAULT; 2596 kfree(range); 2597 goto out; 2598 } 2599 /* compression requires us to start the IO */ 2600 if ((range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)) { 2601 range->flags |= BTRFS_DEFRAG_RANGE_START_IO; 2602 range->extent_thresh = (u32)-1; 2603 } 2604 } else { 2605 /* the rest are all set to zero by kzalloc */ 2606 range->len = (u64)-1; 2607 } 2608 ret = btrfs_defrag_file(file_inode(file), file, 2609 range, 0, 0); 2610 if (ret > 0) 2611 ret = 0; 2612 kfree(range); 2613 break; 2614 default: 2615 ret = -EINVAL; 2616 } 2617 out: 2618 mnt_drop_write_file(file); 2619 return ret; 2620 } 2621 2622 static long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg) 2623 { 2624 struct btrfs_ioctl_vol_args *vol_args; 2625 int ret; 2626 2627 if (!capable(CAP_SYS_ADMIN)) 2628 return -EPERM; 2629 2630 if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running, 2631 1)) { 2632 return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; 2633 } 2634 2635 mutex_lock(&root->fs_info->volume_mutex); 2636 vol_args = memdup_user(arg, sizeof(*vol_args)); 2637 if (IS_ERR(vol_args)) { 2638 ret = PTR_ERR(vol_args); 2639 goto out; 2640 } 2641 2642 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; 2643 ret = btrfs_init_new_device(root, vol_args->name); 2644 2645 if (!ret) 2646 btrfs_info(root->fs_info, "disk added %s",vol_args->name); 2647 2648 kfree(vol_args); 2649 out: 2650 mutex_unlock(&root->fs_info->volume_mutex); 2651 atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0); 2652 return ret; 2653 } 2654 2655 static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) 2656 { 2657 struct btrfs_root *root = BTRFS_I(file_inode(file))->root; 2658 struct btrfs_ioctl_vol_args *vol_args; 2659 int ret; 2660 2661 if (!capable(CAP_SYS_ADMIN)) 2662 return -EPERM; 2663 2664 ret = mnt_want_write_file(file); 2665 if (ret) 2666 return ret; 2667 2668 vol_args = memdup_user(arg, sizeof(*vol_args)); 2669 if (IS_ERR(vol_args)) { 2670 ret = PTR_ERR(vol_args); 2671 goto err_drop; 2672 } 2673 2674 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; 2675 2676 if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running, 2677 1)) { 2678 ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; 2679 goto out; 2680 } 2681 2682 
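/*
 * For illustration only (not part of this file): both BTRFS_IOC_ADD_DEV
 * and BTRFS_IOC_RM_DEV take the same struct btrfs_ioctl_vol_args, with
 * the device path as a NUL-terminated string and the ioctl issued on
 * any fd inside the mounted filesystem.  A hedged userspace sketch,
 * assuming the uapi <linux/btrfs.h> definitions; error handling trimmed:
 *
 *      struct btrfs_ioctl_vol_args vol_args;
 *
 *      memset(&vol_args, 0, sizeof(vol_args));
 *      strncpy(vol_args.name, "/dev/sdb", BTRFS_PATH_NAME_MAX);
 *      if (ioctl(mnt_fd, BTRFS_IOC_RM_DEV, &vol_args) != 0)
 *              perror("BTRFS_IOC_RM_DEV");
 */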
mutex_lock(&root->fs_info->volume_mutex); 2683 ret = btrfs_rm_device(root, vol_args->name); 2684 mutex_unlock(&root->fs_info->volume_mutex); 2685 atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0); 2686 2687 if (!ret) 2688 btrfs_info(root->fs_info, "disk deleted %s",vol_args->name); 2689 2690 out: 2691 kfree(vol_args); 2692 err_drop: 2693 mnt_drop_write_file(file); 2694 return ret; 2695 } 2696 2697 static long btrfs_ioctl_fs_info(struct btrfs_root *root, void __user *arg) 2698 { 2699 struct btrfs_ioctl_fs_info_args *fi_args; 2700 struct btrfs_device *device; 2701 struct btrfs_device *next; 2702 struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; 2703 int ret = 0; 2704 2705 fi_args = kzalloc(sizeof(*fi_args), GFP_KERNEL); 2706 if (!fi_args) 2707 return -ENOMEM; 2708 2709 mutex_lock(&fs_devices->device_list_mutex); 2710 fi_args->num_devices = fs_devices->num_devices; 2711 memcpy(&fi_args->fsid, root->fs_info->fsid, sizeof(fi_args->fsid)); 2712 2713 list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) { 2714 if (device->devid > fi_args->max_id) 2715 fi_args->max_id = device->devid; 2716 } 2717 mutex_unlock(&fs_devices->device_list_mutex); 2718 2719 fi_args->nodesize = root->fs_info->super_copy->nodesize; 2720 fi_args->sectorsize = root->fs_info->super_copy->sectorsize; 2721 fi_args->clone_alignment = root->fs_info->super_copy->sectorsize; 2722 2723 if (copy_to_user(arg, fi_args, sizeof(*fi_args))) 2724 ret = -EFAULT; 2725 2726 kfree(fi_args); 2727 return ret; 2728 } 2729 2730 static long btrfs_ioctl_dev_info(struct btrfs_root *root, void __user *arg) 2731 { 2732 struct btrfs_ioctl_dev_info_args *di_args; 2733 struct btrfs_device *dev; 2734 struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; 2735 int ret = 0; 2736 char *s_uuid = NULL; 2737 2738 di_args = memdup_user(arg, sizeof(*di_args)); 2739 if (IS_ERR(di_args)) 2740 return PTR_ERR(di_args); 2741 2742 if (!btrfs_is_empty_uuid(di_args->uuid)) 2743 s_uuid = di_args->uuid; 2744 2745 mutex_lock(&fs_devices->device_list_mutex); 2746 dev = btrfs_find_device(root->fs_info, di_args->devid, s_uuid, NULL); 2747 2748 if (!dev) { 2749 ret = -ENODEV; 2750 goto out; 2751 } 2752 2753 di_args->devid = dev->devid; 2754 di_args->bytes_used = btrfs_device_get_bytes_used(dev); 2755 di_args->total_bytes = btrfs_device_get_total_bytes(dev); 2756 memcpy(di_args->uuid, dev->uuid, sizeof(di_args->uuid)); 2757 if (dev->name) { 2758 struct rcu_string *name; 2759 2760 rcu_read_lock(); 2761 name = rcu_dereference(dev->name); 2762 strncpy(di_args->path, name->str, sizeof(di_args->path)); 2763 rcu_read_unlock(); 2764 di_args->path[sizeof(di_args->path) - 1] = 0; 2765 } else { 2766 di_args->path[0] = '\0'; 2767 } 2768 2769 out: 2770 mutex_unlock(&fs_devices->device_list_mutex); 2771 if (ret == 0 && copy_to_user(arg, di_args, sizeof(*di_args))) 2772 ret = -EFAULT; 2773 2774 kfree(di_args); 2775 return ret; 2776 } 2777 2778 static struct page *extent_same_get_page(struct inode *inode, u64 off) 2779 { 2780 struct page *page; 2781 pgoff_t index; 2782 struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; 2783 2784 index = off >> PAGE_CACHE_SHIFT; 2785 2786 page = grab_cache_page(inode->i_mapping, index); 2787 if (!page) 2788 return NULL; 2789 2790 if (!PageUptodate(page)) { 2791 if (extent_read_full_page_nolock(tree, page, btrfs_get_extent, 2792 0)) 2793 return NULL; 2794 lock_page(page); 2795 if (!PageUptodate(page)) { 2796 unlock_page(page); 2797 page_cache_release(page); 2798 return NULL; 2799 } 2800 } 2801 
unlock_page(page); 2802 2803 return page; 2804 } 2805 2806 static inline void lock_extent_range(struct inode *inode, u64 off, u64 len) 2807 { 2808 /* do any pending delalloc/csum calc on src, one way or 2809 another, and lock file content */ 2810 while (1) { 2811 struct btrfs_ordered_extent *ordered; 2812 lock_extent(&BTRFS_I(inode)->io_tree, off, off + len - 1); 2813 ordered = btrfs_lookup_first_ordered_extent(inode, 2814 off + len - 1); 2815 if ((!ordered || 2816 ordered->file_offset + ordered->len <= off || 2817 ordered->file_offset >= off + len) && 2818 !test_range_bit(&BTRFS_I(inode)->io_tree, off, 2819 off + len - 1, EXTENT_DELALLOC, 0, NULL)) { 2820 if (ordered) 2821 btrfs_put_ordered_extent(ordered); 2822 break; 2823 } 2824 unlock_extent(&BTRFS_I(inode)->io_tree, off, off + len - 1); 2825 if (ordered) 2826 btrfs_put_ordered_extent(ordered); 2827 btrfs_wait_ordered_range(inode, off, len); 2828 } 2829 } 2830 2831 static void btrfs_double_unlock(struct inode *inode1, u64 loff1, 2832 struct inode *inode2, u64 loff2, u64 len) 2833 { 2834 unlock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1); 2835 unlock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1); 2836 2837 mutex_unlock(&inode1->i_mutex); 2838 mutex_unlock(&inode2->i_mutex); 2839 } 2840 2841 static void btrfs_double_lock(struct inode *inode1, u64 loff1, 2842 struct inode *inode2, u64 loff2, u64 len) 2843 { 2844 if (inode1 < inode2) { 2845 swap(inode1, inode2); 2846 swap(loff1, loff2); 2847 } 2848 2849 mutex_lock_nested(&inode1->i_mutex, I_MUTEX_PARENT); 2850 lock_extent_range(inode1, loff1, len); 2851 if (inode1 != inode2) { 2852 mutex_lock_nested(&inode2->i_mutex, I_MUTEX_CHILD); 2853 lock_extent_range(inode2, loff2, len); 2854 } 2855 } 2856 2857 static int btrfs_cmp_data(struct inode *src, u64 loff, struct inode *dst, 2858 u64 dst_loff, u64 len) 2859 { 2860 int ret = 0; 2861 struct page *src_page, *dst_page; 2862 unsigned int cmp_len = PAGE_CACHE_SIZE; 2863 void *addr, *dst_addr; 2864 2865 while (len) { 2866 if (len < PAGE_CACHE_SIZE) 2867 cmp_len = len; 2868 2869 src_page = extent_same_get_page(src, loff); 2870 if (!src_page) 2871 return -EINVAL; 2872 dst_page = extent_same_get_page(dst, dst_loff); 2873 if (!dst_page) { 2874 page_cache_release(src_page); 2875 return -EINVAL; 2876 } 2877 addr = kmap_atomic(src_page); 2878 dst_addr = kmap_atomic(dst_page); 2879 2880 flush_dcache_page(src_page); 2881 flush_dcache_page(dst_page); 2882 2883 if (memcmp(addr, dst_addr, cmp_len)) 2884 ret = BTRFS_SAME_DATA_DIFFERS; 2885 2886 kunmap_atomic(addr); 2887 kunmap_atomic(dst_addr); 2888 page_cache_release(src_page); 2889 page_cache_release(dst_page); 2890 2891 if (ret) 2892 break; 2893 2894 loff += cmp_len; 2895 dst_loff += cmp_len; 2896 len -= cmp_len; 2897 } 2898 2899 return ret; 2900 } 2901 2902 static int extent_same_check_offsets(struct inode *inode, u64 off, u64 len) 2903 { 2904 u64 bs = BTRFS_I(inode)->root->fs_info->sb->s_blocksize; 2905 2906 if (off + len > inode->i_size || off + len < off) 2907 return -EINVAL; 2908 /* Check that we are block aligned - btrfs_clone() requires this */ 2909 if (!IS_ALIGNED(off, bs) || !IS_ALIGNED(off + len, bs)) 2910 return -EINVAL; 2911 2912 return 0; 2913 } 2914 2915 static int btrfs_extent_same(struct inode *src, u64 loff, u64 len, 2916 struct inode *dst, u64 dst_loff) 2917 { 2918 int ret; 2919 2920 /* 2921 * btrfs_clone() can't handle extents in the same file 2922 * yet. 
Once that works, we can drop this check and replace it 2923 * with a check for the same inode, but overlapping extents. 2924 */ 2925 if (src == dst) 2926 return -EINVAL; 2927 2928 btrfs_double_lock(src, loff, dst, dst_loff, len); 2929 2930 ret = extent_same_check_offsets(src, loff, len); 2931 if (ret) 2932 goto out_unlock; 2933 2934 ret = extent_same_check_offsets(dst, dst_loff, len); 2935 if (ret) 2936 goto out_unlock; 2937 2938 /* don't make the dst file partly checksummed */ 2939 if ((BTRFS_I(src)->flags & BTRFS_INODE_NODATASUM) != 2940 (BTRFS_I(dst)->flags & BTRFS_INODE_NODATASUM)) { 2941 ret = -EINVAL; 2942 goto out_unlock; 2943 } 2944 2945 ret = btrfs_cmp_data(src, loff, dst, dst_loff, len); 2946 if (ret == 0) 2947 ret = btrfs_clone(src, dst, loff, len, len, dst_loff); 2948 2949 out_unlock: 2950 btrfs_double_unlock(src, loff, dst, dst_loff, len); 2951 2952 return ret; 2953 } 2954 2955 #define BTRFS_MAX_DEDUPE_LEN (16 * 1024 * 1024) 2956 2957 static long btrfs_ioctl_file_extent_same(struct file *file, 2958 struct btrfs_ioctl_same_args __user *argp) 2959 { 2960 struct btrfs_ioctl_same_args *same; 2961 struct btrfs_ioctl_same_extent_info *info; 2962 struct inode *src = file_inode(file); 2963 u64 off; 2964 u64 len; 2965 int i; 2966 int ret; 2967 unsigned long size; 2968 u64 bs = BTRFS_I(src)->root->fs_info->sb->s_blocksize; 2969 bool is_admin = capable(CAP_SYS_ADMIN); 2970 u16 count; 2971 2972 if (!(file->f_mode & FMODE_READ)) 2973 return -EINVAL; 2974 2975 ret = mnt_want_write_file(file); 2976 if (ret) 2977 return ret; 2978 2979 if (get_user(count, &argp->dest_count)) { 2980 ret = -EFAULT; 2981 goto out; 2982 } 2983 2984 size = offsetof(struct btrfs_ioctl_same_args __user, info[count]); 2985 2986 same = memdup_user(argp, size); 2987 2988 if (IS_ERR(same)) { 2989 ret = PTR_ERR(same); 2990 goto out; 2991 } 2992 2993 off = same->logical_offset; 2994 len = same->length; 2995 2996 /* 2997 * Limit the total length we will dedupe for each operation. 2998 * This is intended to bound the total time spent in this 2999 * ioctl to something sane. 3000 */ 3001 if (len > BTRFS_MAX_DEDUPE_LEN) 3002 len = BTRFS_MAX_DEDUPE_LEN; 3003 3004 if (WARN_ON_ONCE(bs < PAGE_CACHE_SIZE)) { 3005 /* 3006 * Btrfs does not support blocksize < page_size. As a 3007 * result, btrfs_cmp_data() won't correctly handle 3008 * this situation without an update. 
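 *
 * For illustration only (not part of this file): driving the dedupe
 * ioctl from userspace.  A hedged sketch assuming the uapi
 * <linux/btrfs.h> definitions; the caller sizes the trailing info[]
 * array itself and must check each info[i].status afterwards, where
 * BTRFS_SAME_DATA_DIFFERS means the ranges did not match:
 *
 *      struct btrfs_ioctl_same_args *same;
 *      size_t sz = sizeof(*same) +
 *                  sizeof(struct btrfs_ioctl_same_extent_info);
 *
 *      same = calloc(1, sz);
 *      same->logical_offset = 0;
 *      same->length = 1024 * 1024;
 *      same->dest_count = 1;
 *      same->info[0].fd = dst_fd;
 *      same->info[0].logical_offset = 0;
 *      if (ioctl(src_fd, BTRFS_IOC_FILE_EXTENT_SAME, same) == 0 &&
 *          same->info[0].status == 0)
 *              printf("deduped %llu bytes\n",
 *                     (unsigned long long)same->info[0].bytes_deduped);
 *      free(same);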
3009 */ 3010 ret = -EINVAL; 3011 goto out; 3012 } 3013 3014 ret = -EISDIR; 3015 if (S_ISDIR(src->i_mode)) 3016 goto out; 3017 3018 ret = -EACCES; 3019 if (!S_ISREG(src->i_mode)) 3020 goto out; 3021 3022 /* pre-format output fields to sane values */ 3023 for (i = 0; i < count; i++) { 3024 same->info[i].bytes_deduped = 0ULL; 3025 same->info[i].status = 0; 3026 } 3027 3028 for (i = 0, info = same->info; i < count; i++, info++) { 3029 struct inode *dst; 3030 struct fd dst_file = fdget(info->fd); 3031 if (!dst_file.file) { 3032 info->status = -EBADF; 3033 continue; 3034 } 3035 dst = file_inode(dst_file.file); 3036 3037 if (!(is_admin || (dst_file.file->f_mode & FMODE_WRITE))) { 3038 info->status = -EINVAL; 3039 } else if (file->f_path.mnt != dst_file.file->f_path.mnt) { 3040 info->status = -EXDEV; 3041 } else if (S_ISDIR(dst->i_mode)) { 3042 info->status = -EISDIR; 3043 } else if (!S_ISREG(dst->i_mode)) { 3044 info->status = -EACCES; 3045 } else { 3046 info->status = btrfs_extent_same(src, off, len, dst, 3047 info->logical_offset); 3048 if (info->status == 0) 3049 info->bytes_deduped += len; 3050 } 3051 fdput(dst_file); 3052 } 3053 3054 ret = copy_to_user(argp, same, size); 3055 if (ret) 3056 ret = -EFAULT; 3057 3058 out: 3059 mnt_drop_write_file(file); 3060 return ret; 3061 } 3062 3063 /* Helper to check and see if this root currently has a ref on the given disk 3064 * bytenr. If it does then we need to update the quota for this root. This 3065 * doesn't do anything if quotas aren't enabled. 3066 */ 3067 static int check_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, 3068 u64 disko) 3069 { 3070 struct seq_list tree_mod_seq_elem = {}; 3071 struct ulist *roots; 3072 struct ulist_iterator uiter; 3073 struct ulist_node *root_node = NULL; 3074 int ret; 3075 3076 if (!root->fs_info->quota_enabled) 3077 return 1; 3078 3079 btrfs_get_tree_mod_seq(root->fs_info, &tree_mod_seq_elem); 3080 ret = btrfs_find_all_roots(trans, root->fs_info, disko, 3081 tree_mod_seq_elem.seq, &roots); 3082 if (ret < 0) 3083 goto out; 3084 ret = 0; 3085 ULIST_ITER_INIT(&uiter); 3086 while ((root_node = ulist_next(roots, &uiter))) { 3087 if (root_node->val == root->objectid) { 3088 ret = 1; 3089 break; 3090 } 3091 } 3092 ulist_free(roots); 3093 out: 3094 btrfs_put_tree_mod_seq(root->fs_info, &tree_mod_seq_elem); 3095 return ret; 3096 } 3097 3098 static int clone_finish_inode_update(struct btrfs_trans_handle *trans, 3099 struct inode *inode, 3100 u64 endoff, 3101 const u64 destoff, 3102 const u64 olen) 3103 { 3104 struct btrfs_root *root = BTRFS_I(inode)->root; 3105 int ret; 3106 3107 inode_inc_iversion(inode); 3108 inode->i_mtime = inode->i_ctime = CURRENT_TIME; 3109 /* 3110 * We round up to the block size at eof when determining which 3111 * extents to clone above, but shouldn't round up the file size. 
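 *
 * Example: with a 4K block size, cloning olen = 10000 bytes to
 * destoff = 0 processes extents up to the block-aligned offset 12288,
 * but the destination's i_size must become 10000, not 12288; the
 * clamp below keeps endoff at destoff + olen before it is compared
 * against the current i_size.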
3112 */ 3113 if (endoff > destoff + olen) 3114 endoff = destoff + olen; 3115 if (endoff > inode->i_size) 3116 btrfs_i_size_write(inode, endoff); 3117 3118 ret = btrfs_update_inode(trans, root, inode); 3119 if (ret) { 3120 btrfs_abort_transaction(trans, root, ret); 3121 btrfs_end_transaction(trans, root); 3122 goto out; 3123 } 3124 ret = btrfs_end_transaction(trans, root); 3125 out: 3126 return ret; 3127 } 3128 3129 static void clone_update_extent_map(struct inode *inode, 3130 const struct btrfs_trans_handle *trans, 3131 const struct btrfs_path *path, 3132 const u64 hole_offset, 3133 const u64 hole_len) 3134 { 3135 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 3136 struct extent_map *em; 3137 int ret; 3138 3139 em = alloc_extent_map(); 3140 if (!em) { 3141 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 3142 &BTRFS_I(inode)->runtime_flags); 3143 return; 3144 } 3145 3146 if (path) { 3147 struct btrfs_file_extent_item *fi; 3148 3149 fi = btrfs_item_ptr(path->nodes[0], path->slots[0], 3150 struct btrfs_file_extent_item); 3151 btrfs_extent_item_to_extent_map(inode, path, fi, false, em); 3152 em->generation = -1; 3153 if (btrfs_file_extent_type(path->nodes[0], fi) == 3154 BTRFS_FILE_EXTENT_INLINE) 3155 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 3156 &BTRFS_I(inode)->runtime_flags); 3157 } else { 3158 em->start = hole_offset; 3159 em->len = hole_len; 3160 em->ram_bytes = em->len; 3161 em->orig_start = hole_offset; 3162 em->block_start = EXTENT_MAP_HOLE; 3163 em->block_len = 0; 3164 em->orig_block_len = 0; 3165 em->compress_type = BTRFS_COMPRESS_NONE; 3166 em->generation = trans->transid; 3167 } 3168 3169 while (1) { 3170 write_lock(&em_tree->lock); 3171 ret = add_extent_mapping(em_tree, em, 1); 3172 write_unlock(&em_tree->lock); 3173 if (ret != -EEXIST) { 3174 free_extent_map(em); 3175 break; 3176 } 3177 btrfs_drop_extent_cache(inode, em->start, 3178 em->start + em->len - 1, 0); 3179 } 3180 3181 if (ret) 3182 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 3183 &BTRFS_I(inode)->runtime_flags); 3184 } 3185 3186 /** 3187 * btrfs_clone() - clone a range from inode file to another 3188 * 3189 * @src: Inode to clone from 3190 * @inode: Inode to clone to 3191 * @off: Offset within source to start clone from 3192 * @olen: Original length, passed by user, of range to clone 3193 * @olen_aligned: Block-aligned value of olen, extent_same uses 3194 * identical values here 3195 * @destoff: Offset within @inode to start clone 3196 */ 3197 static int btrfs_clone(struct inode *src, struct inode *inode, 3198 const u64 off, const u64 olen, const u64 olen_aligned, 3199 const u64 destoff) 3200 { 3201 struct btrfs_root *root = BTRFS_I(inode)->root; 3202 struct btrfs_path *path = NULL; 3203 struct extent_buffer *leaf; 3204 struct btrfs_trans_handle *trans; 3205 char *buf = NULL; 3206 struct btrfs_key key; 3207 u32 nritems; 3208 int slot; 3209 int ret; 3210 int no_quota; 3211 const u64 len = olen_aligned; 3212 u64 last_disko = 0; 3213 u64 last_dest_end = destoff; 3214 3215 ret = -ENOMEM; 3216 buf = vmalloc(root->nodesize); 3217 if (!buf) 3218 return ret; 3219 3220 path = btrfs_alloc_path(); 3221 if (!path) { 3222 vfree(buf); 3223 return ret; 3224 } 3225 3226 path->reada = 2; 3227 /* clone data */ 3228 key.objectid = btrfs_ino(src); 3229 key.type = BTRFS_EXTENT_DATA_KEY; 3230 key.offset = off; 3231 3232 while (1) { 3233 /* 3234 * note the key will change type as we walk through the 3235 * tree. 
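 *
 * e.g. a search that starts at (ino, BTRFS_EXTENT_DATA_KEY, off) can,
 * after btrfs_next_leaf(), land on an item with a different type or a
 * different objectid, which is why the loop below re-checks both
 * before treating an item as part of the source range.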
3236 */ 3237 path->leave_spinning = 1; 3238 ret = btrfs_search_slot(NULL, BTRFS_I(src)->root, &key, path, 3239 0, 0); 3240 if (ret < 0) 3241 goto out; 3242 /* 3243 * First search, if no extent item that starts at offset off was 3244 * found but the previous item is an extent item, it's possible 3245 * it might overlap our target range, therefore process it. 3246 */ 3247 if (key.offset == off && ret > 0 && path->slots[0] > 0) { 3248 btrfs_item_key_to_cpu(path->nodes[0], &key, 3249 path->slots[0] - 1); 3250 if (key.type == BTRFS_EXTENT_DATA_KEY) 3251 path->slots[0]--; 3252 } 3253 3254 nritems = btrfs_header_nritems(path->nodes[0]); 3255 process_slot: 3256 no_quota = 1; 3257 if (path->slots[0] >= nritems) { 3258 ret = btrfs_next_leaf(BTRFS_I(src)->root, path); 3259 if (ret < 0) 3260 goto out; 3261 if (ret > 0) 3262 break; 3263 nritems = btrfs_header_nritems(path->nodes[0]); 3264 } 3265 leaf = path->nodes[0]; 3266 slot = path->slots[0]; 3267 3268 btrfs_item_key_to_cpu(leaf, &key, slot); 3269 if (key.type > BTRFS_EXTENT_DATA_KEY || 3270 key.objectid != btrfs_ino(src)) 3271 break; 3272 3273 if (key.type == BTRFS_EXTENT_DATA_KEY) { 3274 struct btrfs_file_extent_item *extent; 3275 int type; 3276 u32 size; 3277 struct btrfs_key new_key; 3278 u64 disko = 0, diskl = 0; 3279 u64 datao = 0, datal = 0; 3280 u8 comp; 3281 u64 drop_start; 3282 3283 extent = btrfs_item_ptr(leaf, slot, 3284 struct btrfs_file_extent_item); 3285 comp = btrfs_file_extent_compression(leaf, extent); 3286 type = btrfs_file_extent_type(leaf, extent); 3287 if (type == BTRFS_FILE_EXTENT_REG || 3288 type == BTRFS_FILE_EXTENT_PREALLOC) { 3289 disko = btrfs_file_extent_disk_bytenr(leaf, 3290 extent); 3291 diskl = btrfs_file_extent_disk_num_bytes(leaf, 3292 extent); 3293 datao = btrfs_file_extent_offset(leaf, extent); 3294 datal = btrfs_file_extent_num_bytes(leaf, 3295 extent); 3296 } else if (type == BTRFS_FILE_EXTENT_INLINE) { 3297 /* take upper bound, may be compressed */ 3298 datal = btrfs_file_extent_ram_bytes(leaf, 3299 extent); 3300 } 3301 3302 /* 3303 * The first search might have left us at an extent 3304 * item that ends before our target range's start, can 3305 * happen if we have holes and NO_HOLES feature enabled. 3306 */ 3307 if (key.offset + datal <= off) { 3308 path->slots[0]++; 3309 goto process_slot; 3310 } else if (key.offset >= off + len) { 3311 break; 3312 } 3313 3314 size = btrfs_item_size_nr(leaf, slot); 3315 read_extent_buffer(leaf, buf, 3316 btrfs_item_ptr_offset(leaf, slot), 3317 size); 3318 3319 btrfs_release_path(path); 3320 path->leave_spinning = 0; 3321 3322 memcpy(&new_key, &key, sizeof(new_key)); 3323 new_key.objectid = btrfs_ino(inode); 3324 if (off <= key.offset) 3325 new_key.offset = key.offset + destoff - off; 3326 else 3327 new_key.offset = destoff; 3328 3329 /* 3330 * Deal with a hole that doesn't have an extent item 3331 * that represents it (NO_HOLES feature enabled). 3332 * This hole is either in the middle of the cloning 3333 * range or at the beginning (fully overlaps it or 3334 * partially overlaps it). 
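 *
 * Example: with NO_HOLES, a source file with extents at [0, 64K) and
 * [192K, 256K) carries no item for the hole [64K, 192K).  When the
 * item at 192K is processed, new_key.offset is ahead of last_dest_end,
 * so drop_start falls back to last_dest_end and the gap is dropped
 * from the destination too, reproducing the hole there.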
3335 */ 3336 if (new_key.offset != last_dest_end) 3337 drop_start = last_dest_end; 3338 else 3339 drop_start = new_key.offset; 3340 3341 /* 3342 * 1 - adjusting old extent (we may have to split it) 3343 * 1 - add new extent 3344 * 1 - inode update 3345 */ 3346 trans = btrfs_start_transaction(root, 3); 3347 if (IS_ERR(trans)) { 3348 ret = PTR_ERR(trans); 3349 goto out; 3350 } 3351 3352 if (type == BTRFS_FILE_EXTENT_REG || 3353 type == BTRFS_FILE_EXTENT_PREALLOC) { 3354 /* 3355 * a | --- range to clone ---| b 3356 * | ------------- extent ------------- | 3357 */ 3358 3359 /* subtract range b */ 3360 if (key.offset + datal > off + len) 3361 datal = off + len - key.offset; 3362 3363 /* subtract range a */ 3364 if (off > key.offset) { 3365 datao += off - key.offset; 3366 datal -= off - key.offset; 3367 } 3368 3369 ret = btrfs_drop_extents(trans, root, inode, 3370 drop_start, 3371 new_key.offset + datal, 3372 1); 3373 if (ret) { 3374 if (ret != -EOPNOTSUPP) 3375 btrfs_abort_transaction(trans, 3376 root, ret); 3377 btrfs_end_transaction(trans, root); 3378 goto out; 3379 } 3380 3381 ret = btrfs_insert_empty_item(trans, root, path, 3382 &new_key, size); 3383 if (ret) { 3384 btrfs_abort_transaction(trans, root, 3385 ret); 3386 btrfs_end_transaction(trans, root); 3387 goto out; 3388 } 3389 3390 leaf = path->nodes[0]; 3391 slot = path->slots[0]; 3392 write_extent_buffer(leaf, buf, 3393 btrfs_item_ptr_offset(leaf, slot), 3394 size); 3395 3396 extent = btrfs_item_ptr(leaf, slot, 3397 struct btrfs_file_extent_item); 3398 3399 /* disko == 0 means it's a hole */ 3400 if (!disko) 3401 datao = 0; 3402 3403 btrfs_set_file_extent_offset(leaf, extent, 3404 datao); 3405 btrfs_set_file_extent_num_bytes(leaf, extent, 3406 datal); 3407 3408 /* 3409 * We need to look up the roots that point at 3410 * this bytenr and see if the new root does. If 3411 * it does not we need to make sure we update 3412 * quotas appropriately. 
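 *
 * (check_ref() returns 1 when quotas are disabled or when this root is
 * already among the roots referencing the extent at 'disko'; in that
 * case no_quota stays set and btrfs_inc_extent_ref() skips charging
 * the qgroup a second time.)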
3413 */ 3414 if (disko && root != BTRFS_I(src)->root && 3415 disko != last_disko) { 3416 no_quota = check_ref(trans, root, 3417 disko); 3418 if (no_quota < 0) { 3419 btrfs_abort_transaction(trans, 3420 root, 3421 ret); 3422 btrfs_end_transaction(trans, 3423 root); 3424 ret = no_quota; 3425 goto out; 3426 } 3427 } 3428 3429 if (disko) { 3430 inode_add_bytes(inode, datal); 3431 ret = btrfs_inc_extent_ref(trans, root, 3432 disko, diskl, 0, 3433 root->root_key.objectid, 3434 btrfs_ino(inode), 3435 new_key.offset - datao, 3436 no_quota); 3437 if (ret) { 3438 btrfs_abort_transaction(trans, 3439 root, 3440 ret); 3441 btrfs_end_transaction(trans, 3442 root); 3443 goto out; 3444 3445 } 3446 } 3447 } else if (type == BTRFS_FILE_EXTENT_INLINE) { 3448 u64 skip = 0; 3449 u64 trim = 0; 3450 u64 aligned_end = 0; 3451 3452 if (off > key.offset) { 3453 skip = off - key.offset; 3454 new_key.offset += skip; 3455 } 3456 3457 if (key.offset + datal > off + len) 3458 trim = key.offset + datal - (off + len); 3459 3460 if (comp && (skip || trim)) { 3461 ret = -EINVAL; 3462 btrfs_end_transaction(trans, root); 3463 goto out; 3464 } 3465 size -= skip + trim; 3466 datal -= skip + trim; 3467 3468 aligned_end = ALIGN(new_key.offset + datal, 3469 root->sectorsize); 3470 ret = btrfs_drop_extents(trans, root, inode, 3471 drop_start, 3472 aligned_end, 3473 1); 3474 if (ret) { 3475 if (ret != -EOPNOTSUPP) 3476 btrfs_abort_transaction(trans, 3477 root, ret); 3478 btrfs_end_transaction(trans, root); 3479 goto out; 3480 } 3481 3482 ret = btrfs_insert_empty_item(trans, root, path, 3483 &new_key, size); 3484 if (ret) { 3485 btrfs_abort_transaction(trans, root, 3486 ret); 3487 btrfs_end_transaction(trans, root); 3488 goto out; 3489 } 3490 3491 if (skip) { 3492 u32 start = 3493 btrfs_file_extent_calc_inline_size(0); 3494 memmove(buf+start, buf+start+skip, 3495 datal); 3496 } 3497 3498 leaf = path->nodes[0]; 3499 slot = path->slots[0]; 3500 write_extent_buffer(leaf, buf, 3501 btrfs_item_ptr_offset(leaf, slot), 3502 size); 3503 inode_add_bytes(inode, datal); 3504 } 3505 3506 /* If we have an implicit hole (NO_HOLES feature). */ 3507 if (drop_start < new_key.offset) 3508 clone_update_extent_map(inode, trans, 3509 NULL, drop_start, 3510 new_key.offset - drop_start); 3511 3512 clone_update_extent_map(inode, trans, path, 0, 0); 3513 3514 btrfs_mark_buffer_dirty(leaf); 3515 btrfs_release_path(path); 3516 3517 last_dest_end = ALIGN(new_key.offset + datal, 3518 root->sectorsize); 3519 ret = clone_finish_inode_update(trans, inode, 3520 last_dest_end, 3521 destoff, olen); 3522 if (ret) 3523 goto out; 3524 if (new_key.offset + datal >= destoff + len) 3525 break; 3526 } 3527 btrfs_release_path(path); 3528 key.offset++; 3529 } 3530 ret = 0; 3531 3532 if (last_dest_end < destoff + len) { 3533 /* 3534 * We have an implicit hole (NO_HOLES feature is enabled) that 3535 * fully or partially overlaps our cloning range at its end. 
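 *
 * Example: if the source's last extent item ends at 768K but the
 * cloning range extends to 1M (a NO_HOLES tail hole), the loop above
 * finishes with last_dest_end short of destoff + len; the drop and
 * inode update below punch the matching tail hole in the destination
 * instead of leaving stale extents behind.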
3536 */ 3537 btrfs_release_path(path); 3538 3539 /* 3540 * 1 - remove extent(s) 3541 * 1 - inode update 3542 */ 3543 trans = btrfs_start_transaction(root, 2); 3544 if (IS_ERR(trans)) { 3545 ret = PTR_ERR(trans); 3546 goto out; 3547 } 3548 ret = btrfs_drop_extents(trans, root, inode, 3549 last_dest_end, destoff + len, 1); 3550 if (ret) { 3551 if (ret != -EOPNOTSUPP) 3552 btrfs_abort_transaction(trans, root, ret); 3553 btrfs_end_transaction(trans, root); 3554 goto out; 3555 } 3556 clone_update_extent_map(inode, trans, NULL, last_dest_end, 3557 destoff + len - last_dest_end); 3558 ret = clone_finish_inode_update(trans, inode, destoff + len, 3559 destoff, olen); 3560 } 3561 3562 out: 3563 btrfs_free_path(path); 3564 vfree(buf); 3565 return ret; 3566 } 3567 3568 static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, 3569 u64 off, u64 olen, u64 destoff) 3570 { 3571 struct inode *inode = file_inode(file); 3572 struct btrfs_root *root = BTRFS_I(inode)->root; 3573 struct fd src_file; 3574 struct inode *src; 3575 int ret; 3576 u64 len = olen; 3577 u64 bs = root->fs_info->sb->s_blocksize; 3578 int same_inode = 0; 3579 3580 /* 3581 * TODO: 3582 * - split compressed inline extents. annoying: we need to 3583 * decompress into destination's address_space (the file offset 3584 * may change, so source mapping won't do), then recompress (or 3585 * otherwise reinsert) a subrange. 3586 * 3587 * - split destination inode's inline extents. The inline extents can 3588 * be either compressed or non-compressed. 3589 */ 3590 3591 /* the destination must be opened for writing */ 3592 if (!(file->f_mode & FMODE_WRITE) || (file->f_flags & O_APPEND)) 3593 return -EINVAL; 3594 3595 if (btrfs_root_readonly(root)) 3596 return -EROFS; 3597 3598 ret = mnt_want_write_file(file); 3599 if (ret) 3600 return ret; 3601 3602 src_file = fdget(srcfd); 3603 if (!src_file.file) { 3604 ret = -EBADF; 3605 goto out_drop_write; 3606 } 3607 3608 ret = -EXDEV; 3609 if (src_file.file->f_path.mnt != file->f_path.mnt) 3610 goto out_fput; 3611 3612 src = file_inode(src_file.file); 3613 3614 ret = -EINVAL; 3615 if (src == inode) 3616 same_inode = 1; 3617 3618 /* the src must be open for reading */ 3619 if (!(src_file.file->f_mode & FMODE_READ)) 3620 goto out_fput; 3621 3622 /* don't make the dst file partly checksummed */ 3623 if ((BTRFS_I(src)->flags & BTRFS_INODE_NODATASUM) != 3624 (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) 3625 goto out_fput; 3626 3627 ret = -EISDIR; 3628 if (S_ISDIR(src->i_mode) || S_ISDIR(inode->i_mode)) 3629 goto out_fput; 3630 3631 ret = -EXDEV; 3632 if (src->i_sb != inode->i_sb) 3633 goto out_fput; 3634 3635 if (!same_inode) { 3636 if (inode < src) { 3637 mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT); 3638 mutex_lock_nested(&src->i_mutex, I_MUTEX_CHILD); 3639 } else { 3640 mutex_lock_nested(&src->i_mutex, I_MUTEX_PARENT); 3641 mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD); 3642 } 3643 } else { 3644 mutex_lock(&src->i_mutex); 3645 } 3646 3647 /* determine range to clone */ 3648 ret = -EINVAL; 3649 if (off + len > src->i_size || off + len < off) 3650 goto out_unlock; 3651 if (len == 0) 3652 olen = len = src->i_size - off; 3653 /* if we extend to eof, continue to block boundary */ 3654 if (off + len == src->i_size) 3655 len = ALIGN(src->i_size, bs) - off; 3656 3657 /* verify the end result is block aligned */ 3658 if (!IS_ALIGNED(off, bs) || !IS_ALIGNED(off + len, bs) || 3659 !IS_ALIGNED(destoff, bs)) 3660 goto out_unlock; 3661 3662 /* verify if ranges are overlapped within the same 
file */ 3663 if (same_inode) { 3664 if (destoff + len > off && destoff < off + len) 3665 goto out_unlock; 3666 } 3667 3668 if (destoff > inode->i_size) { 3669 ret = btrfs_cont_expand(inode, inode->i_size, destoff); 3670 if (ret) 3671 goto out_unlock; 3672 } 3673 3674 /* 3675 * Lock the target range too. Right after we replace the file extent 3676 * items in the fs tree (which now point to the cloned data), we might 3677 * have a worker replace them with extent items relative to a write 3678 * operation that was issued before this clone operation (i.e. confront 3679 * with inode.c:btrfs_finish_ordered_io). 3680 */ 3681 if (same_inode) { 3682 u64 lock_start = min_t(u64, off, destoff); 3683 u64 lock_len = max_t(u64, off, destoff) + len - lock_start; 3684 3685 lock_extent_range(src, lock_start, lock_len); 3686 } else { 3687 lock_extent_range(src, off, len); 3688 lock_extent_range(inode, destoff, len); 3689 } 3690 3691 ret = btrfs_clone(src, inode, off, olen, len, destoff); 3692 3693 if (same_inode) { 3694 u64 lock_start = min_t(u64, off, destoff); 3695 u64 lock_end = max_t(u64, off, destoff) + len - 1; 3696 3697 unlock_extent(&BTRFS_I(src)->io_tree, lock_start, lock_end); 3698 } else { 3699 unlock_extent(&BTRFS_I(src)->io_tree, off, off + len - 1); 3700 unlock_extent(&BTRFS_I(inode)->io_tree, destoff, 3701 destoff + len - 1); 3702 } 3703 /* 3704 * Truncate page cache pages so that future reads will see the cloned 3705 * data immediately and not the previous data. 3706 */ 3707 truncate_inode_pages_range(&inode->i_data, destoff, 3708 PAGE_CACHE_ALIGN(destoff + len) - 1); 3709 out_unlock: 3710 if (!same_inode) { 3711 if (inode < src) { 3712 mutex_unlock(&src->i_mutex); 3713 mutex_unlock(&inode->i_mutex); 3714 } else { 3715 mutex_unlock(&inode->i_mutex); 3716 mutex_unlock(&src->i_mutex); 3717 } 3718 } else { 3719 mutex_unlock(&src->i_mutex); 3720 } 3721 out_fput: 3722 fdput(src_file); 3723 out_drop_write: 3724 mnt_drop_write_file(file); 3725 return ret; 3726 } 3727 3728 static long btrfs_ioctl_clone_range(struct file *file, void __user *argp) 3729 { 3730 struct btrfs_ioctl_clone_range_args args; 3731 3732 if (copy_from_user(&args, argp, sizeof(args))) 3733 return -EFAULT; 3734 return btrfs_ioctl_clone(file, args.src_fd, args.src_offset, 3735 args.src_length, args.dest_offset); 3736 } 3737 3738 /* 3739 * there are many ways the trans_start and trans_end ioctls can lead 3740 * to deadlocks. They should only be used by applications that 3741 * basically own the machine, and have a very in depth understanding 3742 * of all the possible deadlocks and enospc problems. 
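 *
 * For illustration only: the pairing such an application would use.
 * BTRFS_IOC_TRANS_START pins an open transaction to the file until
 * BTRFS_IOC_TRANS_END (or the final release of the fd) drops it; a
 * hedged sketch, with all the caveats above:
 *
 *      if (ioctl(fd, BTRFS_IOC_TRANS_START) == 0) {
 *              ... dependent writes ...
 *              ioctl(fd, BTRFS_IOC_TRANS_END);
 *      }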
3743 */ 3744 static long btrfs_ioctl_trans_start(struct file *file) 3745 { 3746 struct inode *inode = file_inode(file); 3747 struct btrfs_root *root = BTRFS_I(inode)->root; 3748 struct btrfs_trans_handle *trans; 3749 int ret; 3750 3751 ret = -EPERM; 3752 if (!capable(CAP_SYS_ADMIN)) 3753 goto out; 3754 3755 ret = -EINPROGRESS; 3756 if (file->private_data) 3757 goto out; 3758 3759 ret = -EROFS; 3760 if (btrfs_root_readonly(root)) 3761 goto out; 3762 3763 ret = mnt_want_write_file(file); 3764 if (ret) 3765 goto out; 3766 3767 atomic_inc(&root->fs_info->open_ioctl_trans); 3768 3769 ret = -ENOMEM; 3770 trans = btrfs_start_ioctl_transaction(root); 3771 if (IS_ERR(trans)) 3772 goto out_drop; 3773 3774 file->private_data = trans; 3775 return 0; 3776 3777 out_drop: 3778 atomic_dec(&root->fs_info->open_ioctl_trans); 3779 mnt_drop_write_file(file); 3780 out: 3781 return ret; 3782 } 3783 3784 static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp) 3785 { 3786 struct inode *inode = file_inode(file); 3787 struct btrfs_root *root = BTRFS_I(inode)->root; 3788 struct btrfs_root *new_root; 3789 struct btrfs_dir_item *di; 3790 struct btrfs_trans_handle *trans; 3791 struct btrfs_path *path; 3792 struct btrfs_key location; 3793 struct btrfs_disk_key disk_key; 3794 u64 objectid = 0; 3795 u64 dir_id; 3796 int ret; 3797 3798 if (!capable(CAP_SYS_ADMIN)) 3799 return -EPERM; 3800 3801 ret = mnt_want_write_file(file); 3802 if (ret) 3803 return ret; 3804 3805 if (copy_from_user(&objectid, argp, sizeof(objectid))) { 3806 ret = -EFAULT; 3807 goto out; 3808 } 3809 3810 if (!objectid) 3811 objectid = BTRFS_FS_TREE_OBJECTID; 3812 3813 location.objectid = objectid; 3814 location.type = BTRFS_ROOT_ITEM_KEY; 3815 location.offset = (u64)-1; 3816 3817 new_root = btrfs_read_fs_root_no_name(root->fs_info, &location); 3818 if (IS_ERR(new_root)) { 3819 ret = PTR_ERR(new_root); 3820 goto out; 3821 } 3822 3823 path = btrfs_alloc_path(); 3824 if (!path) { 3825 ret = -ENOMEM; 3826 goto out; 3827 } 3828 path->leave_spinning = 1; 3829 3830 trans = btrfs_start_transaction(root, 1); 3831 if (IS_ERR(trans)) { 3832 btrfs_free_path(path); 3833 ret = PTR_ERR(trans); 3834 goto out; 3835 } 3836 3837 dir_id = btrfs_super_root_dir(root->fs_info->super_copy); 3838 di = btrfs_lookup_dir_item(trans, root->fs_info->tree_root, path, 3839 dir_id, "default", 7, 1); 3840 if (IS_ERR_OR_NULL(di)) { 3841 btrfs_free_path(path); 3842 btrfs_end_transaction(trans, root); 3843 btrfs_err(new_root->fs_info, "Umm, you don't have the default dir" 3844 "item, this isn't going to work"); 3845 ret = -ENOENT; 3846 goto out; 3847 } 3848 3849 btrfs_cpu_key_to_disk(&disk_key, &new_root->root_key); 3850 btrfs_set_dir_item_key(path->nodes[0], di, &disk_key); 3851 btrfs_mark_buffer_dirty(path->nodes[0]); 3852 btrfs_free_path(path); 3853 3854 btrfs_set_fs_incompat(root->fs_info, DEFAULT_SUBVOL); 3855 btrfs_end_transaction(trans, root); 3856 out: 3857 mnt_drop_write_file(file); 3858 return ret; 3859 } 3860 3861 void btrfs_get_block_group_info(struct list_head *groups_list, 3862 struct btrfs_ioctl_space_info *space) 3863 { 3864 struct btrfs_block_group_cache *block_group; 3865 3866 space->total_bytes = 0; 3867 space->used_bytes = 0; 3868 space->flags = 0; 3869 list_for_each_entry(block_group, groups_list, list) { 3870 space->flags = block_group->flags; 3871 space->total_bytes += block_group->key.offset; 3872 space->used_bytes += 3873 btrfs_block_group_used(&block_group->item); 3874 } 3875 } 3876 3877 static long btrfs_ioctl_space_info(struct btrfs_root 
*root, void __user *arg) 3878 { 3879 struct btrfs_ioctl_space_args space_args; 3880 struct btrfs_ioctl_space_info space; 3881 struct btrfs_ioctl_space_info *dest; 3882 struct btrfs_ioctl_space_info *dest_orig; 3883 struct btrfs_ioctl_space_info __user *user_dest; 3884 struct btrfs_space_info *info; 3885 u64 types[] = {BTRFS_BLOCK_GROUP_DATA, 3886 BTRFS_BLOCK_GROUP_SYSTEM, 3887 BTRFS_BLOCK_GROUP_METADATA, 3888 BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA}; 3889 int num_types = 4; 3890 int alloc_size; 3891 int ret = 0; 3892 u64 slot_count = 0; 3893 int i, c; 3894 3895 if (copy_from_user(&space_args, 3896 (struct btrfs_ioctl_space_args __user *)arg, 3897 sizeof(space_args))) 3898 return -EFAULT; 3899 3900 for (i = 0; i < num_types; i++) { 3901 struct btrfs_space_info *tmp; 3902 3903 info = NULL; 3904 rcu_read_lock(); 3905 list_for_each_entry_rcu(tmp, &root->fs_info->space_info, 3906 list) { 3907 if (tmp->flags == types[i]) { 3908 info = tmp; 3909 break; 3910 } 3911 } 3912 rcu_read_unlock(); 3913 3914 if (!info) 3915 continue; 3916 3917 down_read(&info->groups_sem); 3918 for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) { 3919 if (!list_empty(&info->block_groups[c])) 3920 slot_count++; 3921 } 3922 up_read(&info->groups_sem); 3923 } 3924 3925 /* 3926 * Global block reserve, exported as a space_info 3927 */ 3928 slot_count++; 3929 3930 /* space_slots == 0 means they are asking for a count */ 3931 if (space_args.space_slots == 0) { 3932 space_args.total_spaces = slot_count; 3933 goto out; 3934 } 3935 3936 slot_count = min_t(u64, space_args.space_slots, slot_count); 3937 3938 alloc_size = sizeof(*dest) * slot_count; 3939 3940 /* we generally have at most 6 or so space infos, one for each raid 3941 * level. So, a whole page should be more than enough for everyone 3942 */ 3943 if (alloc_size > PAGE_CACHE_SIZE) 3944 return -ENOMEM; 3945 3946 space_args.total_spaces = 0; 3947 dest = kmalloc(alloc_size, GFP_NOFS); 3948 if (!dest) 3949 return -ENOMEM; 3950 dest_orig = dest; 3951 3952 /* now we have a buffer to copy into */ 3953 for (i = 0; i < num_types; i++) { 3954 struct btrfs_space_info *tmp; 3955 3956 if (!slot_count) 3957 break; 3958 3959 info = NULL; 3960 rcu_read_lock(); 3961 list_for_each_entry_rcu(tmp, &root->fs_info->space_info, 3962 list) { 3963 if (tmp->flags == types[i]) { 3964 info = tmp; 3965 break; 3966 } 3967 } 3968 rcu_read_unlock(); 3969 3970 if (!info) 3971 continue; 3972 down_read(&info->groups_sem); 3973 for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) { 3974 if (!list_empty(&info->block_groups[c])) { 3975 btrfs_get_block_group_info( 3976 &info->block_groups[c], &space); 3977 memcpy(dest, &space, sizeof(space)); 3978 dest++; 3979 space_args.total_spaces++; 3980 slot_count--; 3981 } 3982 if (!slot_count) 3983 break; 3984 } 3985 up_read(&info->groups_sem); 3986 } 3987 3988 /* 3989 * Add global block reserve 3990 */ 3991 if (slot_count) { 3992 struct btrfs_block_rsv *block_rsv = &root->fs_info->global_block_rsv; 3993 3994 spin_lock(&block_rsv->lock); 3995 space.total_bytes = block_rsv->size; 3996 space.used_bytes = block_rsv->size - block_rsv->reserved; 3997 spin_unlock(&block_rsv->lock); 3998 space.flags = BTRFS_SPACE_INFO_GLOBAL_RSV; 3999 memcpy(dest, &space, sizeof(space)); 4000 space_args.total_spaces++; 4001 } 4002 4003 user_dest = (struct btrfs_ioctl_space_info __user *) 4004 (arg + sizeof(struct btrfs_ioctl_space_args)); 4005 4006 if (copy_to_user(user_dest, dest_orig, alloc_size)) 4007 ret = -EFAULT; 4008 4009 kfree(dest_orig); 4010 out: 4011 if (ret == 0 && copy_to_user(arg, 
&space_args, sizeof(space_args))) 4012 ret = -EFAULT; 4013 4014 return ret; 4015 } 4016 4017 /* 4018 * there are many ways the trans_start and trans_end ioctls can lead 4019 * to deadlocks. They should only be used by applications that 4020 * basically own the machine, and have a very in depth understanding 4021 * of all the possible deadlocks and enospc problems. 4022 */ 4023 long btrfs_ioctl_trans_end(struct file *file) 4024 { 4025 struct inode *inode = file_inode(file); 4026 struct btrfs_root *root = BTRFS_I(inode)->root; 4027 struct btrfs_trans_handle *trans; 4028 4029 trans = file->private_data; 4030 if (!trans) 4031 return -EINVAL; 4032 file->private_data = NULL; 4033 4034 btrfs_end_transaction(trans, root); 4035 4036 atomic_dec(&root->fs_info->open_ioctl_trans); 4037 4038 mnt_drop_write_file(file); 4039 return 0; 4040 } 4041 4042 static noinline long btrfs_ioctl_start_sync(struct btrfs_root *root, 4043 void __user *argp) 4044 { 4045 struct btrfs_trans_handle *trans; 4046 u64 transid; 4047 int ret; 4048 4049 trans = btrfs_attach_transaction_barrier(root); 4050 if (IS_ERR(trans)) { 4051 if (PTR_ERR(trans) != -ENOENT) 4052 return PTR_ERR(trans); 4053 4054 /* No running transaction, don't bother */ 4055 transid = root->fs_info->last_trans_committed; 4056 goto out; 4057 } 4058 transid = trans->transid; 4059 ret = btrfs_commit_transaction_async(trans, root, 0); 4060 if (ret) { 4061 btrfs_end_transaction(trans, root); 4062 return ret; 4063 } 4064 out: 4065 if (argp) 4066 if (copy_to_user(argp, &transid, sizeof(transid))) 4067 return -EFAULT; 4068 return 0; 4069 } 4070 4071 static noinline long btrfs_ioctl_wait_sync(struct btrfs_root *root, 4072 void __user *argp) 4073 { 4074 u64 transid; 4075 4076 if (argp) { 4077 if (copy_from_user(&transid, argp, sizeof(transid))) 4078 return -EFAULT; 4079 } else { 4080 transid = 0; /* current trans */ 4081 } 4082 return btrfs_wait_for_commit(root, transid); 4083 } 4084 4085 static long btrfs_ioctl_scrub(struct file *file, void __user *arg) 4086 { 4087 struct btrfs_root *root = BTRFS_I(file_inode(file))->root; 4088 struct btrfs_ioctl_scrub_args *sa; 4089 int ret; 4090 4091 if (!capable(CAP_SYS_ADMIN)) 4092 return -EPERM; 4093 4094 sa = memdup_user(arg, sizeof(*sa)); 4095 if (IS_ERR(sa)) 4096 return PTR_ERR(sa); 4097 4098 if (!(sa->flags & BTRFS_SCRUB_READONLY)) { 4099 ret = mnt_want_write_file(file); 4100 if (ret) 4101 goto out; 4102 } 4103 4104 ret = btrfs_scrub_dev(root->fs_info, sa->devid, sa->start, sa->end, 4105 &sa->progress, sa->flags & BTRFS_SCRUB_READONLY, 4106 0); 4107 4108 if (copy_to_user(arg, sa, sizeof(*sa))) 4109 ret = -EFAULT; 4110 4111 if (!(sa->flags & BTRFS_SCRUB_READONLY)) 4112 mnt_drop_write_file(file); 4113 out: 4114 kfree(sa); 4115 return ret; 4116 } 4117 4118 static long btrfs_ioctl_scrub_cancel(struct btrfs_root *root, void __user *arg) 4119 { 4120 if (!capable(CAP_SYS_ADMIN)) 4121 return -EPERM; 4122 4123 return btrfs_scrub_cancel(root->fs_info); 4124 } 4125 4126 static long btrfs_ioctl_scrub_progress(struct btrfs_root *root, 4127 void __user *arg) 4128 { 4129 struct btrfs_ioctl_scrub_args *sa; 4130 int ret; 4131 4132 if (!capable(CAP_SYS_ADMIN)) 4133 return -EPERM; 4134 4135 sa = memdup_user(arg, sizeof(*sa)); 4136 if (IS_ERR(sa)) 4137 return PTR_ERR(sa); 4138 4139 ret = btrfs_scrub_progress(root, sa->devid, &sa->progress); 4140 4141 if (copy_to_user(arg, sa, sizeof(*sa))) 4142 ret = -EFAULT; 4143 4144 kfree(sa); 4145 return ret; 4146 } 4147 4148 static long btrfs_ioctl_get_dev_stats(struct btrfs_root *root, 4149 void __user 
*arg) 4150 { 4151 struct btrfs_ioctl_get_dev_stats *sa; 4152 int ret; 4153 4154 sa = memdup_user(arg, sizeof(*sa)); 4155 if (IS_ERR(sa)) 4156 return PTR_ERR(sa); 4157 4158 if ((sa->flags & BTRFS_DEV_STATS_RESET) && !capable(CAP_SYS_ADMIN)) { 4159 kfree(sa); 4160 return -EPERM; 4161 } 4162 4163 ret = btrfs_get_dev_stats(root, sa); 4164 4165 if (copy_to_user(arg, sa, sizeof(*sa))) 4166 ret = -EFAULT; 4167 4168 kfree(sa); 4169 return ret; 4170 } 4171 4172 static long btrfs_ioctl_dev_replace(struct btrfs_root *root, void __user *arg) 4173 { 4174 struct btrfs_ioctl_dev_replace_args *p; 4175 int ret; 4176 4177 if (!capable(CAP_SYS_ADMIN)) 4178 return -EPERM; 4179 4180 p = memdup_user(arg, sizeof(*p)); 4181 if (IS_ERR(p)) 4182 return PTR_ERR(p); 4183 4184 switch (p->cmd) { 4185 case BTRFS_IOCTL_DEV_REPLACE_CMD_START: 4186 if (root->fs_info->sb->s_flags & MS_RDONLY) { 4187 ret = -EROFS; 4188 goto out; 4189 } 4190 if (atomic_xchg( 4191 &root->fs_info->mutually_exclusive_operation_running, 4192 1)) { 4193 ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; 4194 } else { 4195 ret = btrfs_dev_replace_start(root, p); 4196 atomic_set( 4197 &root->fs_info->mutually_exclusive_operation_running, 4198 0); 4199 } 4200 break; 4201 case BTRFS_IOCTL_DEV_REPLACE_CMD_STATUS: 4202 btrfs_dev_replace_status(root->fs_info, p); 4203 ret = 0; 4204 break; 4205 case BTRFS_IOCTL_DEV_REPLACE_CMD_CANCEL: 4206 ret = btrfs_dev_replace_cancel(root->fs_info, p); 4207 break; 4208 default: 4209 ret = -EINVAL; 4210 break; 4211 } 4212 4213 if (copy_to_user(arg, p, sizeof(*p))) 4214 ret = -EFAULT; 4215 out: 4216 kfree(p); 4217 return ret; 4218 } 4219 4220 static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg) 4221 { 4222 int ret = 0; 4223 int i; 4224 u64 rel_ptr; 4225 int size; 4226 struct btrfs_ioctl_ino_path_args *ipa = NULL; 4227 struct inode_fs_paths *ipath = NULL; 4228 struct btrfs_path *path; 4229 4230 if (!capable(CAP_DAC_READ_SEARCH)) 4231 return -EPERM; 4232 4233 path = btrfs_alloc_path(); 4234 if (!path) { 4235 ret = -ENOMEM; 4236 goto out; 4237 } 4238 4239 ipa = memdup_user(arg, sizeof(*ipa)); 4240 if (IS_ERR(ipa)) { 4241 ret = PTR_ERR(ipa); 4242 ipa = NULL; 4243 goto out; 4244 } 4245 4246 size = min_t(u32, ipa->size, 4096); 4247 ipath = init_ipath(size, root, path); 4248 if (IS_ERR(ipath)) { 4249 ret = PTR_ERR(ipath); 4250 ipath = NULL; 4251 goto out; 4252 } 4253 4254 ret = paths_from_inode(ipa->inum, ipath); 4255 if (ret < 0) 4256 goto out; 4257 4258 for (i = 0; i < ipath->fspath->elem_cnt; ++i) { 4259 rel_ptr = ipath->fspath->val[i] - 4260 (u64)(unsigned long)ipath->fspath->val; 4261 ipath->fspath->val[i] = rel_ptr; 4262 } 4263 4264 ret = copy_to_user((void *)(unsigned long)ipa->fspath, 4265 (void *)(unsigned long)ipath->fspath, size); 4266 if (ret) { 4267 ret = -EFAULT; 4268 goto out; 4269 } 4270 4271 out: 4272 btrfs_free_path(path); 4273 free_ipath(ipath); 4274 kfree(ipa); 4275 4276 return ret; 4277 } 4278 4279 static int build_ino_list(u64 inum, u64 offset, u64 root, void *ctx) 4280 { 4281 struct btrfs_data_container *inodes = ctx; 4282 const size_t c = 3 * sizeof(u64); 4283 4284 if (inodes->bytes_left >= c) { 4285 inodes->bytes_left -= c; 4286 inodes->val[inodes->elem_cnt] = inum; 4287 inodes->val[inodes->elem_cnt + 1] = offset; 4288 inodes->val[inodes->elem_cnt + 2] = root; 4289 inodes->elem_cnt += 3; 4290 } else { 4291 inodes->bytes_missing += c - inodes->bytes_left; 4292 inodes->bytes_left = 0; 4293 inodes->elem_missed += 3; 4294 } 4295 4296 return 0; 4297 } 4298 4299 static long 
btrfs_ioctl_logical_to_ino(struct btrfs_root *root, 4300 void __user *arg) 4301 { 4302 int ret = 0; 4303 int size; 4304 struct btrfs_ioctl_logical_ino_args *loi; 4305 struct btrfs_data_container *inodes = NULL; 4306 struct btrfs_path *path = NULL; 4307 4308 if (!capable(CAP_SYS_ADMIN)) 4309 return -EPERM; 4310 4311 loi = memdup_user(arg, sizeof(*loi)); 4312 if (IS_ERR(loi)) { 4313 ret = PTR_ERR(loi); 4314 loi = NULL; 4315 goto out; 4316 } 4317 4318 path = btrfs_alloc_path(); 4319 if (!path) { 4320 ret = -ENOMEM; 4321 goto out; 4322 } 4323 4324 size = min_t(u32, loi->size, 64 * 1024); 4325 inodes = init_data_container(size); 4326 if (IS_ERR(inodes)) { 4327 ret = PTR_ERR(inodes); 4328 inodes = NULL; 4329 goto out; 4330 } 4331 4332 ret = iterate_inodes_from_logical(loi->logical, root->fs_info, path, 4333 build_ino_list, inodes); 4334 if (ret == -EINVAL) 4335 ret = -ENOENT; 4336 if (ret < 0) 4337 goto out; 4338 4339 ret = copy_to_user((void *)(unsigned long)loi->inodes, 4340 (void *)(unsigned long)inodes, size); 4341 if (ret) 4342 ret = -EFAULT; 4343 4344 out: 4345 btrfs_free_path(path); 4346 vfree(inodes); 4347 kfree(loi); 4348 4349 return ret; 4350 } 4351 4352 void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock, 4353 struct btrfs_ioctl_balance_args *bargs) 4354 { 4355 struct btrfs_balance_control *bctl = fs_info->balance_ctl; 4356 4357 bargs->flags = bctl->flags; 4358 4359 if (atomic_read(&fs_info->balance_running)) 4360 bargs->state |= BTRFS_BALANCE_STATE_RUNNING; 4361 if (atomic_read(&fs_info->balance_pause_req)) 4362 bargs->state |= BTRFS_BALANCE_STATE_PAUSE_REQ; 4363 if (atomic_read(&fs_info->balance_cancel_req)) 4364 bargs->state |= BTRFS_BALANCE_STATE_CANCEL_REQ; 4365 4366 memcpy(&bargs->data, &bctl->data, sizeof(bargs->data)); 4367 memcpy(&bargs->meta, &bctl->meta, sizeof(bargs->meta)); 4368 memcpy(&bargs->sys, &bctl->sys, sizeof(bargs->sys)); 4369 4370 if (lock) { 4371 spin_lock(&fs_info->balance_lock); 4372 memcpy(&bargs->stat, &bctl->stat, sizeof(bargs->stat)); 4373 spin_unlock(&fs_info->balance_lock); 4374 } else { 4375 memcpy(&bargs->stat, &bctl->stat, sizeof(bargs->stat)); 4376 } 4377 } 4378 4379 static long btrfs_ioctl_balance(struct file *file, void __user *arg) 4380 { 4381 struct btrfs_root *root = BTRFS_I(file_inode(file))->root; 4382 struct btrfs_fs_info *fs_info = root->fs_info; 4383 struct btrfs_ioctl_balance_args *bargs; 4384 struct btrfs_balance_control *bctl; 4385 bool need_unlock; /* for mut. excl. ops lock */ 4386 int ret; 4387 4388 if (!capable(CAP_SYS_ADMIN)) 4389 return -EPERM; 4390 4391 ret = mnt_want_write_file(file); 4392 if (ret) 4393 return ret; 4394 4395 again: 4396 if (!atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1)) { 4397 mutex_lock(&fs_info->volume_mutex); 4398 mutex_lock(&fs_info->balance_mutex); 4399 need_unlock = true; 4400 goto locked; 4401 } 4402 4403 /* 4404 * mut. excl. ops lock is locked. 
Three possibilities: 4405 * (1) some other op is running 4406 * (2) balance is running 4407 * (3) balance is paused -- special case (think resume) 4408 */ 4409 mutex_lock(&fs_info->balance_mutex); 4410 if (fs_info->balance_ctl) { 4411 /* this is either (2) or (3) */ 4412 if (!atomic_read(&fs_info->balance_running)) { 4413 mutex_unlock(&fs_info->balance_mutex); 4414 if (!mutex_trylock(&fs_info->volume_mutex)) 4415 goto again; 4416 mutex_lock(&fs_info->balance_mutex); 4417 4418 if (fs_info->balance_ctl && 4419 !atomic_read(&fs_info->balance_running)) { 4420 /* this is (3) */ 4421 need_unlock = false; 4422 goto locked; 4423 } 4424 4425 mutex_unlock(&fs_info->balance_mutex); 4426 mutex_unlock(&fs_info->volume_mutex); 4427 goto again; 4428 } else { 4429 /* this is (2) */ 4430 mutex_unlock(&fs_info->balance_mutex); 4431 ret = -EINPROGRESS; 4432 goto out; 4433 } 4434 } else { 4435 /* this is (1) */ 4436 mutex_unlock(&fs_info->balance_mutex); 4437 ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; 4438 goto out; 4439 } 4440 4441 locked: 4442 BUG_ON(!atomic_read(&fs_info->mutually_exclusive_operation_running)); 4443 4444 if (arg) { 4445 bargs = memdup_user(arg, sizeof(*bargs)); 4446 if (IS_ERR(bargs)) { 4447 ret = PTR_ERR(bargs); 4448 goto out_unlock; 4449 } 4450 4451 if (bargs->flags & BTRFS_BALANCE_RESUME) { 4452 if (!fs_info->balance_ctl) { 4453 ret = -ENOTCONN; 4454 goto out_bargs; 4455 } 4456 4457 bctl = fs_info->balance_ctl; 4458 spin_lock(&fs_info->balance_lock); 4459 bctl->flags |= BTRFS_BALANCE_RESUME; 4460 spin_unlock(&fs_info->balance_lock); 4461 4462 goto do_balance; 4463 } 4464 } else { 4465 bargs = NULL; 4466 } 4467 4468 if (fs_info->balance_ctl) { 4469 ret = -EINPROGRESS; 4470 goto out_bargs; 4471 } 4472 4473 bctl = kzalloc(sizeof(*bctl), GFP_NOFS); 4474 if (!bctl) { 4475 ret = -ENOMEM; 4476 goto out_bargs; 4477 } 4478 4479 bctl->fs_info = fs_info; 4480 if (arg) { 4481 memcpy(&bctl->data, &bargs->data, sizeof(bctl->data)); 4482 memcpy(&bctl->meta, &bargs->meta, sizeof(bctl->meta)); 4483 memcpy(&bctl->sys, &bargs->sys, sizeof(bctl->sys)); 4484 4485 bctl->flags = bargs->flags; 4486 } else { 4487 /* balance everything - no filters */ 4488 bctl->flags |= BTRFS_BALANCE_TYPE_MASK; 4489 } 4490 4491 do_balance: 4492 /* 4493 * Ownership of bctl and mutually_exclusive_operation_running 4494 * goes to btrfs_balance. bctl is freed in __cancel_balance, 4495 * or, if the restriper was paused all the way until unmount, in 4496 * free_fs_info. mutually_exclusive_operation_running is 4497 * cleared in __cancel_balance.
4498 */ 4499 need_unlock = false; 4500 4501 ret = btrfs_balance(bctl, bargs); 4502 4503 if (arg) { 4504 if (copy_to_user(arg, bargs, sizeof(*bargs))) 4505 ret = -EFAULT; 4506 } 4507 4508 out_bargs: 4509 kfree(bargs); 4510 out_unlock: 4511 mutex_unlock(&fs_info->balance_mutex); 4512 mutex_unlock(&fs_info->volume_mutex); 4513 if (need_unlock) 4514 atomic_set(&fs_info->mutually_exclusive_operation_running, 0); 4515 out: 4516 mnt_drop_write_file(file); 4517 return ret; 4518 } 4519 4520 static long btrfs_ioctl_balance_ctl(struct btrfs_root *root, int cmd) 4521 { 4522 if (!capable(CAP_SYS_ADMIN)) 4523 return -EPERM; 4524 4525 switch (cmd) { 4526 case BTRFS_BALANCE_CTL_PAUSE: 4527 return btrfs_pause_balance(root->fs_info); 4528 case BTRFS_BALANCE_CTL_CANCEL: 4529 return btrfs_cancel_balance(root->fs_info); 4530 } 4531 4532 return -EINVAL; 4533 } 4534 4535 static long btrfs_ioctl_balance_progress(struct btrfs_root *root, 4536 void __user *arg) 4537 { 4538 struct btrfs_fs_info *fs_info = root->fs_info; 4539 struct btrfs_ioctl_balance_args *bargs; 4540 int ret = 0; 4541 4542 if (!capable(CAP_SYS_ADMIN)) 4543 return -EPERM; 4544 4545 mutex_lock(&fs_info->balance_mutex); 4546 if (!fs_info->balance_ctl) { 4547 ret = -ENOTCONN; 4548 goto out; 4549 } 4550 4551 bargs = kzalloc(sizeof(*bargs), GFP_NOFS); 4552 if (!bargs) { 4553 ret = -ENOMEM; 4554 goto out; 4555 } 4556 4557 update_ioctl_balance_args(fs_info, 1, bargs); 4558 4559 if (copy_to_user(arg, bargs, sizeof(*bargs))) 4560 ret = -EFAULT; 4561 4562 kfree(bargs); 4563 out: 4564 mutex_unlock(&fs_info->balance_mutex); 4565 return ret; 4566 } 4567 4568 static long btrfs_ioctl_quota_ctl(struct file *file, void __user *arg) 4569 { 4570 struct btrfs_root *root = BTRFS_I(file_inode(file))->root; 4571 struct btrfs_ioctl_quota_ctl_args *sa; 4572 struct btrfs_trans_handle *trans = NULL; 4573 int ret; 4574 int err; 4575 4576 if (!capable(CAP_SYS_ADMIN)) 4577 return -EPERM; 4578 4579 ret = mnt_want_write_file(file); 4580 if (ret) 4581 return ret; 4582 4583 sa = memdup_user(arg, sizeof(*sa)); 4584 if (IS_ERR(sa)) { 4585 ret = PTR_ERR(sa); 4586 goto drop_write; 4587 } 4588 4589 down_write(&root->fs_info->subvol_sem); 4590 trans = btrfs_start_transaction(root->fs_info->tree_root, 2); 4591 if (IS_ERR(trans)) { 4592 ret = PTR_ERR(trans); 4593 goto out; 4594 } 4595 4596 switch (sa->cmd) { 4597 case BTRFS_QUOTA_CTL_ENABLE: 4598 ret = btrfs_quota_enable(trans, root->fs_info); 4599 break; 4600 case BTRFS_QUOTA_CTL_DISABLE: 4601 ret = btrfs_quota_disable(trans, root->fs_info); 4602 break; 4603 default: 4604 ret = -EINVAL; 4605 break; 4606 } 4607 4608 err = btrfs_commit_transaction(trans, root->fs_info->tree_root); 4609 if (err && !ret) 4610 ret = err; 4611 out: 4612 kfree(sa); 4613 up_write(&root->fs_info->subvol_sem); 4614 drop_write: 4615 mnt_drop_write_file(file); 4616 return ret; 4617 } 4618 4619 static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg) 4620 { 4621 struct btrfs_root *root = BTRFS_I(file_inode(file))->root; 4622 struct btrfs_ioctl_qgroup_assign_args *sa; 4623 struct btrfs_trans_handle *trans; 4624 int ret; 4625 int err; 4626 4627 if (!capable(CAP_SYS_ADMIN)) 4628 return -EPERM; 4629 4630 ret = mnt_want_write_file(file); 4631 if (ret) 4632 return ret; 4633 4634 sa = memdup_user(arg, sizeof(*sa)); 4635 if (IS_ERR(sa)) { 4636 ret = PTR_ERR(sa); 4637 goto drop_write; 4638 } 4639 4640 trans = btrfs_join_transaction(root); 4641 if (IS_ERR(trans)) { 4642 ret = PTR_ERR(trans); 4643 goto out; 4644 } 4645 4646 /* FIXME: check if the IDs really 
exist */ 4647 if (sa->assign) { 4648 ret = btrfs_add_qgroup_relation(trans, root->fs_info, 4649 sa->src, sa->dst); 4650 } else { 4651 ret = btrfs_del_qgroup_relation(trans, root->fs_info, 4652 sa->src, sa->dst); 4653 } 4654 4655 err = btrfs_end_transaction(trans, root); 4656 if (err && !ret) 4657 ret = err; 4658 4659 out: 4660 kfree(sa); 4661 drop_write: 4662 mnt_drop_write_file(file); 4663 return ret; 4664 } 4665 4666 static long btrfs_ioctl_qgroup_create(struct file *file, void __user *arg) 4667 { 4668 struct btrfs_root *root = BTRFS_I(file_inode(file))->root; 4669 struct btrfs_ioctl_qgroup_create_args *sa; 4670 struct btrfs_trans_handle *trans; 4671 int ret; 4672 int err; 4673 4674 if (!capable(CAP_SYS_ADMIN)) 4675 return -EPERM; 4676 4677 ret = mnt_want_write_file(file); 4678 if (ret) 4679 return ret; 4680 4681 sa = memdup_user(arg, sizeof(*sa)); 4682 if (IS_ERR(sa)) { 4683 ret = PTR_ERR(sa); 4684 goto drop_write; 4685 } 4686 4687 if (!sa->qgroupid) { 4688 ret = -EINVAL; 4689 goto out; 4690 } 4691 4692 trans = btrfs_join_transaction(root); 4693 if (IS_ERR(trans)) { 4694 ret = PTR_ERR(trans); 4695 goto out; 4696 } 4697 4698 /* FIXME: check if the IDs really exist */ 4699 if (sa->create) { 4700 ret = btrfs_create_qgroup(trans, root->fs_info, sa->qgroupid, 4701 NULL); 4702 } else { 4703 ret = btrfs_remove_qgroup(trans, root->fs_info, sa->qgroupid); 4704 } 4705 4706 err = btrfs_end_transaction(trans, root); 4707 if (err && !ret) 4708 ret = err; 4709 4710 out: 4711 kfree(sa); 4712 drop_write: 4713 mnt_drop_write_file(file); 4714 return ret; 4715 } 4716 4717 static long btrfs_ioctl_qgroup_limit(struct file *file, void __user *arg) 4718 { 4719 struct btrfs_root *root = BTRFS_I(file_inode(file))->root; 4720 struct btrfs_ioctl_qgroup_limit_args *sa; 4721 struct btrfs_trans_handle *trans; 4722 int ret; 4723 int err; 4724 u64 qgroupid; 4725 4726 if (!capable(CAP_SYS_ADMIN)) 4727 return -EPERM; 4728 4729 ret = mnt_want_write_file(file); 4730 if (ret) 4731 return ret; 4732 4733 sa = memdup_user(arg, sizeof(*sa)); 4734 if (IS_ERR(sa)) { 4735 ret = PTR_ERR(sa); 4736 goto drop_write; 4737 } 4738 4739 trans = btrfs_join_transaction(root); 4740 if (IS_ERR(trans)) { 4741 ret = PTR_ERR(trans); 4742 goto out; 4743 } 4744 4745 qgroupid = sa->qgroupid; 4746 if (!qgroupid) { 4747 /* take the current subvol as qgroup */ 4748 qgroupid = root->root_key.objectid; 4749 } 4750 4751 /* FIXME: check if the IDs really exist */ 4752 ret = btrfs_limit_qgroup(trans, root->fs_info, qgroupid, &sa->lim); 4753 4754 err = btrfs_end_transaction(trans, root); 4755 if (err && !ret) 4756 ret = err; 4757 4758 out: 4759 kfree(sa); 4760 drop_write: 4761 mnt_drop_write_file(file); 4762 return ret; 4763 } 4764 4765 static long btrfs_ioctl_quota_rescan(struct file *file, void __user *arg) 4766 { 4767 struct btrfs_root *root = BTRFS_I(file_inode(file))->root; 4768 struct btrfs_ioctl_quota_rescan_args *qsa; 4769 int ret; 4770 4771 if (!capable(CAP_SYS_ADMIN)) 4772 return -EPERM; 4773 4774 ret = mnt_want_write_file(file); 4775 if (ret) 4776 return ret; 4777 4778 qsa = memdup_user(arg, sizeof(*qsa)); 4779 if (IS_ERR(qsa)) { 4780 ret = PTR_ERR(qsa); 4781 goto drop_write; 4782 } 4783 4784 if (qsa->flags) { 4785 ret = -EINVAL; 4786 goto out; 4787 } 4788 4789 ret = btrfs_qgroup_rescan(root->fs_info); 4790 4791 out: 4792 kfree(qsa); 4793 drop_write: 4794 mnt_drop_write_file(file); 4795 return ret; 4796 } 4797 4798 static long btrfs_ioctl_quota_rescan_status(struct file *file, void __user *arg) 4799 { 4800 struct btrfs_root *root = 
BTRFS_I(file_inode(file))->root; 4801 struct btrfs_ioctl_quota_rescan_args *qsa; 4802 int ret = 0; 4803 4804 if (!capable(CAP_SYS_ADMIN)) 4805 return -EPERM; 4806 4807 qsa = kzalloc(sizeof(*qsa), GFP_NOFS); 4808 if (!qsa) 4809 return -ENOMEM; 4810 4811 if (root->fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) { 4812 qsa->flags = 1; 4813 qsa->progress = root->fs_info->qgroup_rescan_progress.objectid; 4814 } 4815 4816 if (copy_to_user(arg, qsa, sizeof(*qsa))) 4817 ret = -EFAULT; 4818 4819 kfree(qsa); 4820 return ret; 4821 } 4822 4823 static long btrfs_ioctl_quota_rescan_wait(struct file *file, void __user *arg) 4824 { 4825 struct btrfs_root *root = BTRFS_I(file_inode(file))->root; 4826 4827 if (!capable(CAP_SYS_ADMIN)) 4828 return -EPERM; 4829 4830 return btrfs_qgroup_wait_for_completion(root->fs_info); 4831 } 4832 4833 static long _btrfs_ioctl_set_received_subvol(struct file *file, 4834 struct btrfs_ioctl_received_subvol_args *sa) 4835 { 4836 struct inode *inode = file_inode(file); 4837 struct btrfs_root *root = BTRFS_I(inode)->root; 4838 struct btrfs_root_item *root_item = &root->root_item; 4839 struct btrfs_trans_handle *trans; 4840 struct timespec ct = CURRENT_TIME; 4841 int ret = 0; 4842 int received_uuid_changed; 4843 4844 if (!inode_owner_or_capable(inode)) 4845 return -EPERM; 4846 4847 ret = mnt_want_write_file(file); 4848 if (ret < 0) 4849 return ret; 4850 4851 down_write(&root->fs_info->subvol_sem); 4852 4853 if (btrfs_ino(inode) != BTRFS_FIRST_FREE_OBJECTID) { 4854 ret = -EINVAL; 4855 goto out; 4856 } 4857 4858 if (btrfs_root_readonly(root)) { 4859 ret = -EROFS; 4860 goto out; 4861 } 4862 4863 /* 4864 * 1 - root item 4865 * 2 - uuid items (received uuid + subvol uuid) 4866 */ 4867 trans = btrfs_start_transaction(root, 3); 4868 if (IS_ERR(trans)) { 4869 ret = PTR_ERR(trans); 4870 trans = NULL; 4871 goto out; 4872 } 4873 4874 sa->rtransid = trans->transid; 4875 sa->rtime.sec = ct.tv_sec; 4876 sa->rtime.nsec = ct.tv_nsec; 4877 4878 received_uuid_changed = memcmp(root_item->received_uuid, sa->uuid, 4879 BTRFS_UUID_SIZE); 4880 if (received_uuid_changed && 4881 !btrfs_is_empty_uuid(root_item->received_uuid)) 4882 btrfs_uuid_tree_rem(trans, root->fs_info->uuid_root, 4883 root_item->received_uuid, 4884 BTRFS_UUID_KEY_RECEIVED_SUBVOL, 4885 root->root_key.objectid); 4886 memcpy(root_item->received_uuid, sa->uuid, BTRFS_UUID_SIZE); 4887 btrfs_set_root_stransid(root_item, sa->stransid); 4888 btrfs_set_root_rtransid(root_item, sa->rtransid); 4889 btrfs_set_stack_timespec_sec(&root_item->stime, sa->stime.sec); 4890 btrfs_set_stack_timespec_nsec(&root_item->stime, sa->stime.nsec); 4891 btrfs_set_stack_timespec_sec(&root_item->rtime, sa->rtime.sec); 4892 btrfs_set_stack_timespec_nsec(&root_item->rtime, sa->rtime.nsec); 4893 4894 ret = btrfs_update_root(trans, root->fs_info->tree_root, 4895 &root->root_key, &root->root_item); 4896 if (ret < 0) { 4897 btrfs_end_transaction(trans, root); 4898 goto out; 4899 } 4900 if (received_uuid_changed && !btrfs_is_empty_uuid(sa->uuid)) { 4901 ret = btrfs_uuid_tree_add(trans, root->fs_info->uuid_root, 4902 sa->uuid, 4903 BTRFS_UUID_KEY_RECEIVED_SUBVOL, 4904 root->root_key.objectid); 4905 if (ret < 0 && ret != -EEXIST) { 4906 btrfs_abort_transaction(trans, root, ret); 4907 goto out; 4908 } 4909 } 4910 ret = btrfs_commit_transaction(trans, root); 4911 if (ret < 0) { 4912 btrfs_abort_transaction(trans, root, ret); 4913 goto out; 4914 } 4915 4916 out: 4917 up_write(&root->fs_info->subvol_sem); 4918 mnt_drop_write_file(file); 4919 return ret; 4920 } 4921 
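/*
 * A minimal userspace sketch (illustrative, not part of this file) of how a
 * receiver might drive _btrfs_ioctl_set_received_subvol() above through
 * BTRFS_IOC_SET_RECEIVED_SUBVOL. The names subvol_fd, src_uuid and
 * src_transid are assumptions for illustration only; subvol_fd must be an
 * open fd on the root of the target subvolume, and the caller must own the
 * inode or be otherwise capable, per the checks above.
 *
 *	struct btrfs_ioctl_received_subvol_args rs_args;
 *
 *	memset(&rs_args, 0, sizeof(rs_args));
 *	memcpy(rs_args.uuid, src_uuid, BTRFS_UUID_SIZE);
 *	rs_args.stransid = src_transid;
 *	if (ioctl(subvol_fd, BTRFS_IOC_SET_RECEIVED_SUBVOL, &rs_args) < 0)
 *		err(1, "BTRFS_IOC_SET_RECEIVED_SUBVOL");
 *	... on return, the kernel has filled rs_args.rtransid and rs_args.rtime ...
 */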
4922 #ifdef CONFIG_64BIT 4923 static long btrfs_ioctl_set_received_subvol_32(struct file *file, 4924 void __user *arg) 4925 { 4926 struct btrfs_ioctl_received_subvol_args_32 *args32 = NULL; 4927 struct btrfs_ioctl_received_subvol_args *args64 = NULL; 4928 int ret = 0; 4929 4930 args32 = memdup_user(arg, sizeof(*args32)); 4931 if (IS_ERR(args32)) { 4932 ret = PTR_ERR(args32); 4933 args32 = NULL; 4934 goto out; 4935 } 4936 4937 args64 = kmalloc(sizeof(*args64), GFP_NOFS); 4938 if (!args64) { 4939 ret = -ENOMEM; 4940 goto out; 4941 } 4942 4943 memcpy(args64->uuid, args32->uuid, BTRFS_UUID_SIZE); 4944 args64->stransid = args32->stransid; 4945 args64->rtransid = args32->rtransid; 4946 args64->stime.sec = args32->stime.sec; 4947 args64->stime.nsec = args32->stime.nsec; 4948 args64->rtime.sec = args32->rtime.sec; 4949 args64->rtime.nsec = args32->rtime.nsec; 4950 args64->flags = args32->flags; 4951 4952 ret = _btrfs_ioctl_set_received_subvol(file, args64); 4953 if (ret) 4954 goto out; 4955 4956 memcpy(args32->uuid, args64->uuid, BTRFS_UUID_SIZE); 4957 args32->stransid = args64->stransid; 4958 args32->rtransid = args64->rtransid; 4959 args32->stime.sec = args64->stime.sec; 4960 args32->stime.nsec = args64->stime.nsec; 4961 args32->rtime.sec = args64->rtime.sec; 4962 args32->rtime.nsec = args64->rtime.nsec; 4963 args32->flags = args64->flags; 4964 4965 ret = copy_to_user(arg, args32, sizeof(*args32)); 4966 if (ret) 4967 ret = -EFAULT; 4968 4969 out: 4970 kfree(args32); 4971 kfree(args64); 4972 return ret; 4973 } 4974 #endif 4975 4976 static long btrfs_ioctl_set_received_subvol(struct file *file, 4977 void __user *arg) 4978 { 4979 struct btrfs_ioctl_received_subvol_args *sa = NULL; 4980 int ret = 0; 4981 4982 sa = memdup_user(arg, sizeof(*sa)); 4983 if (IS_ERR(sa)) { 4984 ret = PTR_ERR(sa); 4985 sa = NULL; 4986 goto out; 4987 } 4988 4989 ret = _btrfs_ioctl_set_received_subvol(file, sa); 4990 4991 if (ret) 4992 goto out; 4993 4994 ret = copy_to_user(arg, sa, sizeof(*sa)); 4995 if (ret) 4996 ret = -EFAULT; 4997 4998 out: 4999 kfree(sa); 5000 return ret; 5001 } 5002 5003 static int btrfs_ioctl_get_fslabel(struct file *file, void __user *arg) 5004 { 5005 struct btrfs_root *root = BTRFS_I(file_inode(file))->root; 5006 size_t len; 5007 int ret; 5008 char label[BTRFS_LABEL_SIZE]; 5009 5010 spin_lock(&root->fs_info->super_lock); 5011 memcpy(label, root->fs_info->super_copy->label, BTRFS_LABEL_SIZE); 5012 spin_unlock(&root->fs_info->super_lock); 5013 5014 len = strnlen(label, BTRFS_LABEL_SIZE); 5015 5016 if (len == BTRFS_LABEL_SIZE) { 5017 btrfs_warn(root->fs_info, 5018 "label is too long, return the first %zu bytes", --len); 5019 } 5020 5021 ret = copy_to_user(arg, label, len); 5022 5023 return ret ? 
-EFAULT : 0; 5024 } 5025 5026 static int btrfs_ioctl_set_fslabel(struct file *file, void __user *arg) 5027 { 5028 struct btrfs_root *root = BTRFS_I(file_inode(file))->root; 5029 struct btrfs_super_block *super_block = root->fs_info->super_copy; 5030 struct btrfs_trans_handle *trans; 5031 char label[BTRFS_LABEL_SIZE]; 5032 int ret; 5033 5034 if (!capable(CAP_SYS_ADMIN)) 5035 return -EPERM; 5036 5037 if (copy_from_user(label, arg, sizeof(label))) 5038 return -EFAULT; 5039 5040 if (strnlen(label, BTRFS_LABEL_SIZE) == BTRFS_LABEL_SIZE) { 5041 btrfs_err(root->fs_info, "unable to set label with more than %d bytes", 5042 BTRFS_LABEL_SIZE - 1); 5043 return -EINVAL; 5044 } 5045 5046 ret = mnt_want_write_file(file); 5047 if (ret) 5048 return ret; 5049 5050 trans = btrfs_start_transaction(root, 0); 5051 if (IS_ERR(trans)) { 5052 ret = PTR_ERR(trans); 5053 goto out_unlock; 5054 } 5055 5056 spin_lock(&root->fs_info->super_lock); 5057 strcpy(super_block->label, label); 5058 spin_unlock(&root->fs_info->super_lock); 5059 ret = btrfs_commit_transaction(trans, root); 5060 5061 out_unlock: 5062 mnt_drop_write_file(file); 5063 return ret; 5064 } 5065 5066 #define INIT_FEATURE_FLAGS(suffix) \ 5067 { .compat_flags = BTRFS_FEATURE_COMPAT_##suffix, \ 5068 .compat_ro_flags = BTRFS_FEATURE_COMPAT_RO_##suffix, \ 5069 .incompat_flags = BTRFS_FEATURE_INCOMPAT_##suffix } 5070 5071 static int btrfs_ioctl_get_supported_features(struct file *file, 5072 void __user *arg) 5073 { 5074 static struct btrfs_ioctl_feature_flags features[3] = { 5075 INIT_FEATURE_FLAGS(SUPP), 5076 INIT_FEATURE_FLAGS(SAFE_SET), 5077 INIT_FEATURE_FLAGS(SAFE_CLEAR) 5078 }; 5079 5080 if (copy_to_user(arg, &features, sizeof(features))) 5081 return -EFAULT; 5082 5083 return 0; 5084 } 5085 5086 static int btrfs_ioctl_get_features(struct file *file, void __user *arg) 5087 { 5088 struct btrfs_root *root = BTRFS_I(file_inode(file))->root; 5089 struct btrfs_super_block *super_block = root->fs_info->super_copy; 5090 struct btrfs_ioctl_feature_flags features; 5091 5092 features.compat_flags = btrfs_super_compat_flags(super_block); 5093 features.compat_ro_flags = btrfs_super_compat_ro_flags(super_block); 5094 features.incompat_flags = btrfs_super_incompat_flags(super_block); 5095 5096 if (copy_to_user(arg, &features, sizeof(features))) 5097 return -EFAULT; 5098 5099 return 0; 5100 } 5101 5102 static int check_feature_bits(struct btrfs_root *root, 5103 enum btrfs_feature_set set, 5104 u64 change_mask, u64 flags, u64 supported_flags, 5105 u64 safe_set, u64 safe_clear) 5106 { 5107 const char *type = btrfs_feature_set_names[set]; 5108 char *names; 5109 u64 disallowed, unsupported; 5110 u64 set_mask = flags & change_mask; 5111 u64 clear_mask = ~flags & change_mask; 5112 5113 unsupported = set_mask & ~supported_flags; 5114 if (unsupported) { 5115 names = btrfs_printable_features(set, unsupported); 5116 if (names) { 5117 btrfs_warn(root->fs_info, 5118 "this kernel does not support the %s feature bit%s", 5119 names, strchr(names, ',') ? "s" : ""); 5120 kfree(names); 5121 } else 5122 btrfs_warn(root->fs_info, 5123 "this kernel does not support %s bits 0x%llx", 5124 type, unsupported); 5125 return -EOPNOTSUPP; 5126 } 5127 5128 disallowed = set_mask & ~safe_set; 5129 if (disallowed) { 5130 names = btrfs_printable_features(set, disallowed); 5131 if (names) { 5132 btrfs_warn(root->fs_info, 5133 "can't set the %s feature bit%s while mounted", 5134 names, strchr(names, ',') ? 
"s" : ""); 5135 kfree(names); 5136 } else 5137 btrfs_warn(root->fs_info, 5138 "can't set %s bits 0x%llx while mounted", 5139 type, disallowed); 5140 return -EPERM; 5141 } 5142 5143 disallowed = clear_mask & ~safe_clear; 5144 if (disallowed) { 5145 names = btrfs_printable_features(set, disallowed); 5146 if (names) { 5147 btrfs_warn(root->fs_info, 5148 "can't clear the %s feature bit%s while mounted", 5149 names, strchr(names, ',') ? "s" : ""); 5150 kfree(names); 5151 } else 5152 btrfs_warn(root->fs_info, 5153 "can't clear %s bits 0x%llx while mounted", 5154 type, disallowed); 5155 return -EPERM; 5156 } 5157 5158 return 0; 5159 } 5160 5161 #define check_feature(root, change_mask, flags, mask_base) \ 5162 check_feature_bits(root, FEAT_##mask_base, change_mask, flags, \ 5163 BTRFS_FEATURE_ ## mask_base ## _SUPP, \ 5164 BTRFS_FEATURE_ ## mask_base ## _SAFE_SET, \ 5165 BTRFS_FEATURE_ ## mask_base ## _SAFE_CLEAR) 5166 5167 static int btrfs_ioctl_set_features(struct file *file, void __user *arg) 5168 { 5169 struct btrfs_root *root = BTRFS_I(file_inode(file))->root; 5170 struct btrfs_super_block *super_block = root->fs_info->super_copy; 5171 struct btrfs_ioctl_feature_flags flags[2]; 5172 struct btrfs_trans_handle *trans; 5173 u64 newflags; 5174 int ret; 5175 5176 if (!capable(CAP_SYS_ADMIN)) 5177 return -EPERM; 5178 5179 if (copy_from_user(flags, arg, sizeof(flags))) 5180 return -EFAULT; 5181 5182 /* Nothing to do */ 5183 if (!flags[0].compat_flags && !flags[0].compat_ro_flags && 5184 !flags[0].incompat_flags) 5185 return 0; 5186 5187 ret = check_feature(root, flags[0].compat_flags, 5188 flags[1].compat_flags, COMPAT); 5189 if (ret) 5190 return ret; 5191 5192 ret = check_feature(root, flags[0].compat_ro_flags, 5193 flags[1].compat_ro_flags, COMPAT_RO); 5194 if (ret) 5195 return ret; 5196 5197 ret = check_feature(root, flags[0].incompat_flags, 5198 flags[1].incompat_flags, INCOMPAT); 5199 if (ret) 5200 return ret; 5201 5202 trans = btrfs_start_transaction(root, 0); 5203 if (IS_ERR(trans)) 5204 return PTR_ERR(trans); 5205 5206 spin_lock(&root->fs_info->super_lock); 5207 newflags = btrfs_super_compat_flags(super_block); 5208 newflags |= flags[0].compat_flags & flags[1].compat_flags; 5209 newflags &= ~(flags[0].compat_flags & ~flags[1].compat_flags); 5210 btrfs_set_super_compat_flags(super_block, newflags); 5211 5212 newflags = btrfs_super_compat_ro_flags(super_block); 5213 newflags |= flags[0].compat_ro_flags & flags[1].compat_ro_flags; 5214 newflags &= ~(flags[0].compat_ro_flags & ~flags[1].compat_ro_flags); 5215 btrfs_set_super_compat_ro_flags(super_block, newflags); 5216 5217 newflags = btrfs_super_incompat_flags(super_block); 5218 newflags |= flags[0].incompat_flags & flags[1].incompat_flags; 5219 newflags &= ~(flags[0].incompat_flags & ~flags[1].incompat_flags); 5220 btrfs_set_super_incompat_flags(super_block, newflags); 5221 spin_unlock(&root->fs_info->super_lock); 5222 5223 return btrfs_commit_transaction(trans, root); 5224 } 5225 5226 long btrfs_ioctl(struct file *file, unsigned int 5227 cmd, unsigned long arg) 5228 { 5229 struct btrfs_root *root = BTRFS_I(file_inode(file))->root; 5230 void __user *argp = (void __user *)arg; 5231 5232 switch (cmd) { 5233 case FS_IOC_GETFLAGS: 5234 return btrfs_ioctl_getflags(file, argp); 5235 case FS_IOC_SETFLAGS: 5236 return btrfs_ioctl_setflags(file, argp); 5237 case FS_IOC_GETVERSION: 5238 return btrfs_ioctl_getversion(file, argp); 5239 case FITRIM: 5240 return btrfs_ioctl_fitrim(file, argp); 5241 case BTRFS_IOC_SNAP_CREATE: 5242 return 
btrfs_ioctl_snap_create(file, argp, 0); 5243 case BTRFS_IOC_SNAP_CREATE_V2: 5244 return btrfs_ioctl_snap_create_v2(file, argp, 0); 5245 case BTRFS_IOC_SUBVOL_CREATE: 5246 return btrfs_ioctl_snap_create(file, argp, 1); 5247 case BTRFS_IOC_SUBVOL_CREATE_V2: 5248 return btrfs_ioctl_snap_create_v2(file, argp, 1); 5249 case BTRFS_IOC_SNAP_DESTROY: 5250 return btrfs_ioctl_snap_destroy(file, argp); 5251 case BTRFS_IOC_SUBVOL_GETFLAGS: 5252 return btrfs_ioctl_subvol_getflags(file, argp); 5253 case BTRFS_IOC_SUBVOL_SETFLAGS: 5254 return btrfs_ioctl_subvol_setflags(file, argp); 5255 case BTRFS_IOC_DEFAULT_SUBVOL: 5256 return btrfs_ioctl_default_subvol(file, argp); 5257 case BTRFS_IOC_DEFRAG: 5258 return btrfs_ioctl_defrag(file, NULL); 5259 case BTRFS_IOC_DEFRAG_RANGE: 5260 return btrfs_ioctl_defrag(file, argp); 5261 case BTRFS_IOC_RESIZE: 5262 return btrfs_ioctl_resize(file, argp); 5263 case BTRFS_IOC_ADD_DEV: 5264 return btrfs_ioctl_add_dev(root, argp); 5265 case BTRFS_IOC_RM_DEV: 5266 return btrfs_ioctl_rm_dev(file, argp); 5267 case BTRFS_IOC_FS_INFO: 5268 return btrfs_ioctl_fs_info(root, argp); 5269 case BTRFS_IOC_DEV_INFO: 5270 return btrfs_ioctl_dev_info(root, argp); 5271 case BTRFS_IOC_BALANCE: 5272 return btrfs_ioctl_balance(file, NULL); 5273 case BTRFS_IOC_CLONE: 5274 return btrfs_ioctl_clone(file, arg, 0, 0, 0); 5275 case BTRFS_IOC_CLONE_RANGE: 5276 return btrfs_ioctl_clone_range(file, argp); 5277 case BTRFS_IOC_TRANS_START: 5278 return btrfs_ioctl_trans_start(file); 5279 case BTRFS_IOC_TRANS_END: 5280 return btrfs_ioctl_trans_end(file); 5281 case BTRFS_IOC_TREE_SEARCH: 5282 return btrfs_ioctl_tree_search(file, argp); 5283 case BTRFS_IOC_TREE_SEARCH_V2: 5284 return btrfs_ioctl_tree_search_v2(file, argp); 5285 case BTRFS_IOC_INO_LOOKUP: 5286 return btrfs_ioctl_ino_lookup(file, argp); 5287 case BTRFS_IOC_INO_PATHS: 5288 return btrfs_ioctl_ino_to_path(root, argp); 5289 case BTRFS_IOC_LOGICAL_INO: 5290 return btrfs_ioctl_logical_to_ino(root, argp); 5291 case BTRFS_IOC_SPACE_INFO: 5292 return btrfs_ioctl_space_info(root, argp); 5293 case BTRFS_IOC_SYNC: { 5294 int ret; 5295 5296 ret = btrfs_start_delalloc_roots(root->fs_info, 0, -1); 5297 if (ret) 5298 return ret; 5299 ret = btrfs_sync_fs(file->f_dentry->d_sb, 1); 5300 /* 5301 * The transaction thread may want to do more work, 5302 * namely it pokes the cleaner kthread that will start 5303 * processing uncleaned subvols.
5304 */ 5305 wake_up_process(root->fs_info->transaction_kthread); 5306 return ret; 5307 } 5308 case BTRFS_IOC_START_SYNC: 5309 return btrfs_ioctl_start_sync(root, argp); 5310 case BTRFS_IOC_WAIT_SYNC: 5311 return btrfs_ioctl_wait_sync(root, argp); 5312 case BTRFS_IOC_SCRUB: 5313 return btrfs_ioctl_scrub(file, argp); 5314 case BTRFS_IOC_SCRUB_CANCEL: 5315 return btrfs_ioctl_scrub_cancel(root, argp); 5316 case BTRFS_IOC_SCRUB_PROGRESS: 5317 return btrfs_ioctl_scrub_progress(root, argp); 5318 case BTRFS_IOC_BALANCE_V2: 5319 return btrfs_ioctl_balance(file, argp); 5320 case BTRFS_IOC_BALANCE_CTL: 5321 return btrfs_ioctl_balance_ctl(root, arg); 5322 case BTRFS_IOC_BALANCE_PROGRESS: 5323 return btrfs_ioctl_balance_progress(root, argp); 5324 case BTRFS_IOC_SET_RECEIVED_SUBVOL: 5325 return btrfs_ioctl_set_received_subvol(file, argp); 5326 #ifdef CONFIG_64BIT 5327 case BTRFS_IOC_SET_RECEIVED_SUBVOL_32: 5328 return btrfs_ioctl_set_received_subvol_32(file, argp); 5329 #endif 5330 case BTRFS_IOC_SEND: 5331 return btrfs_ioctl_send(file, argp); 5332 case BTRFS_IOC_GET_DEV_STATS: 5333 return btrfs_ioctl_get_dev_stats(root, argp); 5334 case BTRFS_IOC_QUOTA_CTL: 5335 return btrfs_ioctl_quota_ctl(file, argp); 5336 case BTRFS_IOC_QGROUP_ASSIGN: 5337 return btrfs_ioctl_qgroup_assign(file, argp); 5338 case BTRFS_IOC_QGROUP_CREATE: 5339 return btrfs_ioctl_qgroup_create(file, argp); 5340 case BTRFS_IOC_QGROUP_LIMIT: 5341 return btrfs_ioctl_qgroup_limit(file, argp); 5342 case BTRFS_IOC_QUOTA_RESCAN: 5343 return btrfs_ioctl_quota_rescan(file, argp); 5344 case BTRFS_IOC_QUOTA_RESCAN_STATUS: 5345 return btrfs_ioctl_quota_rescan_status(file, argp); 5346 case BTRFS_IOC_QUOTA_RESCAN_WAIT: 5347 return btrfs_ioctl_quota_rescan_wait(file, argp); 5348 case BTRFS_IOC_DEV_REPLACE: 5349 return btrfs_ioctl_dev_replace(root, argp); 5350 case BTRFS_IOC_GET_FSLABEL: 5351 return btrfs_ioctl_get_fslabel(file, argp); 5352 case BTRFS_IOC_SET_FSLABEL: 5353 return btrfs_ioctl_set_fslabel(file, argp); 5354 case BTRFS_IOC_FILE_EXTENT_SAME: 5355 return btrfs_ioctl_file_extent_same(file, argp); 5356 case BTRFS_IOC_GET_SUPPORTED_FEATURES: 5357 return btrfs_ioctl_get_supported_features(file, argp); 5358 case BTRFS_IOC_GET_FEATURES: 5359 return btrfs_ioctl_get_features(file, argp); 5360 case BTRFS_IOC_SET_FEATURES: 5361 return btrfs_ioctl_set_features(file, argp); 5362 } 5363 5364 return -ENOTTY; 5365 } 5366
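/*
 * A minimal userspace sketch (illustrative, not part of this file) of the
 * BTRFS_IOC_START_SYNC / BTRFS_IOC_WAIT_SYNC pairing dispatched above:
 * START_SYNC kicks off an asynchronous commit and reports the transid it
 * covers, and WAIT_SYNC then blocks until that transid has committed. The
 * name fs_fd is an assumption; any open fd on the filesystem will do.
 *
 *	__u64 transid;
 *
 *	if (ioctl(fs_fd, BTRFS_IOC_START_SYNC, &transid) < 0)
 *		err(1, "BTRFS_IOC_START_SYNC");
 *	... do other work while the commit proceeds, then later ...
 *	if (ioctl(fs_fd, BTRFS_IOC_WAIT_SYNC, &transid) < 0)
 *		err(1, "BTRFS_IOC_WAIT_SYNC");
 */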