/*
 * Copyright (C) 2007 Oracle. All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */

#include <linux/kernel.h>
#include <linux/bio.h>
#include <linux/buffer_head.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/fsnotify.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/time.h>
#include <linux/init.h>
#include <linux/string.h>
#include <linux/backing-dev.h>
#include <linux/mount.h>
#include <linux/mpage.h>
#include <linux/namei.h>
#include <linux/swap.h>
#include <linux/writeback.h>
#include <linux/statfs.h>
#include <linux/compat.h>
#include <linux/bit_spinlock.h>
#include <linux/security.h>
#include <linux/xattr.h>
#include <linux/vmalloc.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/uuid.h>
#include <linux/btrfs.h>
#include <linux/uaccess.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "btrfs_inode.h"
#include "print-tree.h"
#include "volumes.h"
#include "locking.h"
#include "inode-map.h"
#include "backref.h"
#include "rcu-string.h"
#include "send.h"
#include "dev-replace.h"
#include "props.h"
#include "sysfs.h"
#include "qgroup.h"

#ifdef CONFIG_64BIT
/* If we have a 32-bit userspace and 64-bit kernel, then the UAPI
 * structures are incorrect, as the timespec structure from userspace
 * is 4 bytes too small. We define these alternatives here to teach
 * the kernel about the 32-bit struct packing.
 */
struct btrfs_ioctl_timespec_32 {
	__u64 sec;
	__u32 nsec;
} __attribute__ ((__packed__));

struct btrfs_ioctl_received_subvol_args_32 {
	char uuid[BTRFS_UUID_SIZE];	/* in */
	__u64 stransid;			/* in */
	__u64 rtransid;			/* out */
	struct btrfs_ioctl_timespec_32 stime; /* in */
	struct btrfs_ioctl_timespec_32 rtime; /* out */
	__u64 flags;			/* in */
	__u64 reserved[16];		/* in */
} __attribute__ ((__packed__));

#define BTRFS_IOC_SET_RECEIVED_SUBVOL_32 _IOWR(BTRFS_IOCTL_MAGIC, 37, \
				struct btrfs_ioctl_received_subvol_args_32)
#endif


static int btrfs_clone(struct inode *src, struct inode *inode,
		       u64 off, u64 olen, u64 olen_aligned, u64 destoff);

/* Mask out flags that are inappropriate for the given type of inode. */
static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags)
{
	if (S_ISDIR(mode))
		return flags;
	else if (S_ISREG(mode))
		return flags & ~FS_DIRSYNC_FL;
	else
		return flags & (FS_NODUMP_FL | FS_NOATIME_FL);
}
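/*
 * Illustrative note (not in the original source): btrfs_mask_flags()
 * quietly drops bits that make no sense for the inode type. For a
 * regular file FS_DIRSYNC_FL is stripped, and for anything that is
 * neither a directory nor a regular file only the NODUMP/NOATIME bits
 * survive, e.g.:
 *
 *	btrfs_mask_flags(S_IFREG, FS_DIRSYNC_FL | FS_NODUMP_FL)
 *		== FS_NODUMP_FL
 */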
/*
 * Export inode flags to the format expected by the FS_IOC_GETFLAGS ioctl.
 */
static unsigned int btrfs_flags_to_ioctl(unsigned int flags)
{
	unsigned int iflags = 0;

	if (flags & BTRFS_INODE_SYNC)
		iflags |= FS_SYNC_FL;
	if (flags & BTRFS_INODE_IMMUTABLE)
		iflags |= FS_IMMUTABLE_FL;
	if (flags & BTRFS_INODE_APPEND)
		iflags |= FS_APPEND_FL;
	if (flags & BTRFS_INODE_NODUMP)
		iflags |= FS_NODUMP_FL;
	if (flags & BTRFS_INODE_NOATIME)
		iflags |= FS_NOATIME_FL;
	if (flags & BTRFS_INODE_DIRSYNC)
		iflags |= FS_DIRSYNC_FL;
	if (flags & BTRFS_INODE_NODATACOW)
		iflags |= FS_NOCOW_FL;

	if ((flags & BTRFS_INODE_COMPRESS) && !(flags & BTRFS_INODE_NOCOMPRESS))
		iflags |= FS_COMPR_FL;
	else if (flags & BTRFS_INODE_NOCOMPRESS)
		iflags |= FS_NOCOMP_FL;

	return iflags;
}

/*
 * Update inode->i_flags based on the btrfs internal flags.
 */
void btrfs_update_iflags(struct inode *inode)
{
	struct btrfs_inode *ip = BTRFS_I(inode);
	unsigned int new_fl = 0;

	if (ip->flags & BTRFS_INODE_SYNC)
		new_fl |= S_SYNC;
	if (ip->flags & BTRFS_INODE_IMMUTABLE)
		new_fl |= S_IMMUTABLE;
	if (ip->flags & BTRFS_INODE_APPEND)
		new_fl |= S_APPEND;
	if (ip->flags & BTRFS_INODE_NOATIME)
		new_fl |= S_NOATIME;
	if (ip->flags & BTRFS_INODE_DIRSYNC)
		new_fl |= S_DIRSYNC;

	set_mask_bits(&inode->i_flags,
		      S_SYNC | S_APPEND | S_IMMUTABLE | S_NOATIME | S_DIRSYNC,
		      new_fl);
}

/*
 * Inherit flags from the parent inode.
 *
 * Currently only the compression flags and the cow flags are inherited.
 */
void btrfs_inherit_iflags(struct inode *inode, struct inode *dir)
{
	unsigned int flags;

	if (!dir)
		return;

	flags = BTRFS_I(dir)->flags;

	if (flags & BTRFS_INODE_NOCOMPRESS) {
		BTRFS_I(inode)->flags &= ~BTRFS_INODE_COMPRESS;
		BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
	} else if (flags & BTRFS_INODE_COMPRESS) {
		BTRFS_I(inode)->flags &= ~BTRFS_INODE_NOCOMPRESS;
		BTRFS_I(inode)->flags |= BTRFS_INODE_COMPRESS;
	}

	if (flags & BTRFS_INODE_NODATACOW) {
		BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW;
		if (S_ISREG(inode->i_mode))
			BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
	}

	btrfs_update_iflags(inode);
}

static int btrfs_ioctl_getflags(struct file *file, void __user *arg)
{
	struct btrfs_inode *ip = BTRFS_I(file_inode(file));
	unsigned int flags = btrfs_flags_to_ioctl(ip->flags);

	if (copy_to_user(arg, &flags, sizeof(flags)))
		return -EFAULT;
	return 0;
}

static int check_flags(unsigned int flags)
{
	if (flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL |
		      FS_NOATIME_FL | FS_NODUMP_FL |
		      FS_SYNC_FL | FS_DIRSYNC_FL |
		      FS_NOCOMP_FL | FS_COMPR_FL |
		      FS_NOCOW_FL))
		return -EOPNOTSUPP;

	if ((flags & FS_NOCOMP_FL) && (flags & FS_COMPR_FL))
		return -EINVAL;

	return 0;
}
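/*
 * Usage sketch (illustrative, not part of this file): the flags above are
 * the generic FS_IOC_GETFLAGS/FS_IOC_SETFLAGS interface, so a plain
 * read-modify-write from userspace drives btrfs_ioctl_setflags() below.
 * Error handling is omitted; "fd" is assumed to be an open file on btrfs.
 *
 *	#include <sys/ioctl.h>
 *	#include <linux/fs.h>
 *
 *	static int make_compressed(int fd)
 *	{
 *		unsigned int flags;
 *
 *		if (ioctl(fd, FS_IOC_GETFLAGS, &flags) < 0)
 *			return -1;
 *		flags &= ~FS_NOCOMP_FL;	// COMPR and NOCOMP conflict
 *		flags |= FS_COMPR_FL;
 *		return ioctl(fd, FS_IOC_SETFLAGS, &flags);
 *	}
 */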
static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
{
	struct inode *inode = file_inode(file);
	struct btrfs_inode *ip = BTRFS_I(inode);
	struct btrfs_root *root = ip->root;
	struct btrfs_trans_handle *trans;
	unsigned int flags, oldflags;
	int ret;
	u64 ip_oldflags;
	unsigned int i_oldflags;
	umode_t mode;

	if (!inode_owner_or_capable(inode))
		return -EPERM;

	if (btrfs_root_readonly(root))
		return -EROFS;

	if (copy_from_user(&flags, arg, sizeof(flags)))
		return -EFAULT;

	ret = check_flags(flags);
	if (ret)
		return ret;

	ret = mnt_want_write_file(file);
	if (ret)
		return ret;

	mutex_lock(&inode->i_mutex);

	ip_oldflags = ip->flags;
	i_oldflags = inode->i_flags;
	mode = inode->i_mode;

	flags = btrfs_mask_flags(inode->i_mode, flags);
	oldflags = btrfs_flags_to_ioctl(ip->flags);
	if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) {
		if (!capable(CAP_LINUX_IMMUTABLE)) {
			ret = -EPERM;
			goto out_unlock;
		}
	}

	if (flags & FS_SYNC_FL)
		ip->flags |= BTRFS_INODE_SYNC;
	else
		ip->flags &= ~BTRFS_INODE_SYNC;
	if (flags & FS_IMMUTABLE_FL)
		ip->flags |= BTRFS_INODE_IMMUTABLE;
	else
		ip->flags &= ~BTRFS_INODE_IMMUTABLE;
	if (flags & FS_APPEND_FL)
		ip->flags |= BTRFS_INODE_APPEND;
	else
		ip->flags &= ~BTRFS_INODE_APPEND;
	if (flags & FS_NODUMP_FL)
		ip->flags |= BTRFS_INODE_NODUMP;
	else
		ip->flags &= ~BTRFS_INODE_NODUMP;
	if (flags & FS_NOATIME_FL)
		ip->flags |= BTRFS_INODE_NOATIME;
	else
		ip->flags &= ~BTRFS_INODE_NOATIME;
	if (flags & FS_DIRSYNC_FL)
		ip->flags |= BTRFS_INODE_DIRSYNC;
	else
		ip->flags &= ~BTRFS_INODE_DIRSYNC;
	if (flags & FS_NOCOW_FL) {
		if (S_ISREG(mode)) {
			/*
			 * It's safe to turn csums off here, no extents exist.
			 * Otherwise we want the flag to reflect the real COW
			 * status of the file and will not set it.
			 */
			if (inode->i_size == 0)
				ip->flags |= BTRFS_INODE_NODATACOW
					   | BTRFS_INODE_NODATASUM;
		} else {
			ip->flags |= BTRFS_INODE_NODATACOW;
		}
	} else {
		/*
		 * Revert back under the same assumptions as above
		 */
		if (S_ISREG(mode)) {
			if (inode->i_size == 0)
				ip->flags &= ~(BTRFS_INODE_NODATACOW
					     | BTRFS_INODE_NODATASUM);
		} else {
			ip->flags &= ~BTRFS_INODE_NODATACOW;
		}
	}

	/*
	 * The COMPRESS flag can only be changed by users, while the NOCOMPRESS
	 * flag may be changed automatically if the compression code won't make
	 * things smaller.
	 */
	if (flags & FS_NOCOMP_FL) {
		ip->flags &= ~BTRFS_INODE_COMPRESS;
		ip->flags |= BTRFS_INODE_NOCOMPRESS;

		ret = btrfs_set_prop(inode, "btrfs.compression", NULL, 0, 0);
		if (ret && ret != -ENODATA)
			goto out_drop;
	} else if (flags & FS_COMPR_FL) {
		const char *comp;

		ip->flags |= BTRFS_INODE_COMPRESS;
		ip->flags &= ~BTRFS_INODE_NOCOMPRESS;

		if (root->fs_info->compress_type == BTRFS_COMPRESS_LZO)
			comp = "lzo";
		else
			comp = "zlib";
		ret = btrfs_set_prop(inode, "btrfs.compression",
				     comp, strlen(comp), 0);
		if (ret)
			goto out_drop;

	} else {
		ip->flags &= ~(BTRFS_INODE_COMPRESS | BTRFS_INODE_NOCOMPRESS);
	}

	trans = btrfs_start_transaction(root, 1);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		goto out_drop;
	}

	btrfs_update_iflags(inode);
	inode_inc_iversion(inode);
	inode->i_ctime = CURRENT_TIME;
	ret = btrfs_update_inode(trans, root, inode);

	btrfs_end_transaction(trans, root);
out_drop:
	if (ret) {
		ip->flags = ip_oldflags;
		inode->i_flags = i_oldflags;
	}

out_unlock:
	mutex_unlock(&inode->i_mutex);
	mnt_drop_write_file(file);
	return ret;
}
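/*
 * Illustrative note (not in the original source): because checksums can
 * only be dropped while no extents exist, FS_NOCOW_FL only takes full
 * effect on a regular file that is still empty. A shell session shows the
 * visible behaviour (assuming chattr/lsattr from e2fsprogs):
 *
 *	$ touch empty && chattr +C empty    # NOCOW + no csums from now on
 *	$ lsattr empty                      # shows 'C'
 *	$ chattr +C nonempty                # succeeds but COW status kept
 */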
static int btrfs_ioctl_getversion(struct file *file, int __user *arg)
{
	struct inode *inode = file_inode(file);

	return put_user(inode->i_generation, arg);
}

static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(file_inode(file)->i_sb);
	struct btrfs_device *device;
	struct request_queue *q;
	struct fstrim_range range;
	u64 minlen = ULLONG_MAX;
	u64 num_devices = 0;
	u64 total_bytes = btrfs_super_total_bytes(fs_info->super_copy);
	int ret;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	rcu_read_lock();
	list_for_each_entry_rcu(device, &fs_info->fs_devices->devices,
				dev_list) {
		if (!device->bdev)
			continue;
		q = bdev_get_queue(device->bdev);
		if (blk_queue_discard(q)) {
			num_devices++;
			minlen = min((u64)q->limits.discard_granularity,
				     minlen);
		}
	}
	rcu_read_unlock();

	if (!num_devices)
		return -EOPNOTSUPP;
	if (copy_from_user(&range, arg, sizeof(range)))
		return -EFAULT;
	if (range.start > total_bytes ||
	    range.len < fs_info->sb->s_blocksize)
		return -EINVAL;

	range.len = min(range.len, total_bytes - range.start);
	range.minlen = max(range.minlen, minlen);
	ret = btrfs_trim_fs(fs_info->tree_root, &range);
	if (ret < 0)
		return ret;

	if (copy_to_user(arg, &range, sizeof(range)))
		return -EFAULT;

	return 0;
}
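/*
 * Usage sketch (illustrative, not part of this file): FITRIM is the
 * standard VFS discard interface; the range is clamped against the
 * filesystem size and the largest discard granularity found above.
 * Error handling is omitted; "fd" is any open file or dir on the fs.
 *
 *	#include <sys/ioctl.h>
 *	#include <linux/fs.h>
 *	#include <limits.h>
 *
 *	static int trim_all(int fd)
 *	{
 *		struct fstrim_range range = {
 *			.start = 0,
 *			.len = ULLONG_MAX,
 *			.minlen = 0,	// kernel raises this as needed
 *		};
 *
 *		return ioctl(fd, FITRIM, &range);
 *	}
 */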
int btrfs_is_empty_uuid(u8 *uuid)
{
	int i;

	for (i = 0; i < BTRFS_UUID_SIZE; i++) {
		if (uuid[i])
			return 0;
	}
	return 1;
}

static noinline int create_subvol(struct inode *dir,
				  struct dentry *dentry,
				  char *name, int namelen,
				  u64 *async_transid,
				  struct btrfs_qgroup_inherit *inherit)
{
	struct btrfs_trans_handle *trans;
	struct btrfs_key key;
	struct btrfs_root_item root_item;
	struct btrfs_inode_item *inode_item;
	struct extent_buffer *leaf;
	struct btrfs_root *root = BTRFS_I(dir)->root;
	struct btrfs_root *new_root;
	struct btrfs_block_rsv block_rsv;
	struct timespec cur_time = CURRENT_TIME;
	struct inode *inode;
	int ret;
	int err;
	u64 objectid;
	u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID;
	u64 index = 0;
	u64 qgroup_reserved;
	uuid_le new_uuid;

	ret = btrfs_find_free_objectid(root->fs_info->tree_root, &objectid);
	if (ret)
		return ret;

	btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP);
	/*
	 * The same as snapshot creation; see the comment in
	 * create_snapshot().
	 */
	ret = btrfs_subvolume_reserve_metadata(root, &block_rsv,
					       8, &qgroup_reserved, false);
	if (ret)
		return ret;

	trans = btrfs_start_transaction(root, 0);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		btrfs_subvolume_release_metadata(root, &block_rsv,
						 qgroup_reserved);
		return ret;
	}
	trans->block_rsv = &block_rsv;
	trans->bytes_reserved = block_rsv.size;

	ret = btrfs_qgroup_inherit(trans, root->fs_info, 0, objectid, inherit);
	if (ret)
		goto fail;

	leaf = btrfs_alloc_free_block(trans, root, root->leafsize,
				      0, objectid, NULL, 0, 0, 0);
	if (IS_ERR(leaf)) {
		ret = PTR_ERR(leaf);
		goto fail;
	}

	memset_extent_buffer(leaf, 0, 0, sizeof(struct btrfs_header));
	btrfs_set_header_bytenr(leaf, leaf->start);
	btrfs_set_header_generation(leaf, trans->transid);
	btrfs_set_header_backref_rev(leaf, BTRFS_MIXED_BACKREF_REV);
	btrfs_set_header_owner(leaf, objectid);

	write_extent_buffer(leaf, root->fs_info->fsid, btrfs_header_fsid(),
			    BTRFS_FSID_SIZE);
	write_extent_buffer(leaf, root->fs_info->chunk_tree_uuid,
			    btrfs_header_chunk_tree_uuid(leaf),
			    BTRFS_UUID_SIZE);
	btrfs_mark_buffer_dirty(leaf);

	memset(&root_item, 0, sizeof(root_item));

	inode_item = &root_item.inode;
	btrfs_set_stack_inode_generation(inode_item, 1);
	btrfs_set_stack_inode_size(inode_item, 3);
	btrfs_set_stack_inode_nlink(inode_item, 1);
	btrfs_set_stack_inode_nbytes(inode_item, root->leafsize);
	btrfs_set_stack_inode_mode(inode_item, S_IFDIR | 0755);

	btrfs_set_root_flags(&root_item, 0);
	btrfs_set_root_limit(&root_item, 0);
	btrfs_set_stack_inode_flags(inode_item, BTRFS_INODE_ROOT_ITEM_INIT);

	btrfs_set_root_bytenr(&root_item, leaf->start);
	btrfs_set_root_generation(&root_item, trans->transid);
	btrfs_set_root_level(&root_item, 0);
	btrfs_set_root_refs(&root_item, 1);
	btrfs_set_root_used(&root_item, leaf->len);
	btrfs_set_root_last_snapshot(&root_item, 0);

	btrfs_set_root_generation_v2(&root_item,
				     btrfs_root_generation(&root_item));
	uuid_le_gen(&new_uuid);
	memcpy(root_item.uuid, new_uuid.b, BTRFS_UUID_SIZE);
	btrfs_set_stack_timespec_sec(&root_item.otime, cur_time.tv_sec);
	btrfs_set_stack_timespec_nsec(&root_item.otime, cur_time.tv_nsec);
	root_item.ctime = root_item.otime;
	btrfs_set_root_ctransid(&root_item, trans->transid);
	btrfs_set_root_otransid(&root_item, trans->transid);

	btrfs_tree_unlock(leaf);
	free_extent_buffer(leaf);
	leaf = NULL;

	btrfs_set_root_dirid(&root_item, new_dirid);

	key.objectid = objectid;
	key.offset = 0;
	btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
	ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key,
				&root_item);
	if (ret)
		goto fail;

	key.offset = (u64)-1;
	new_root = btrfs_read_fs_root_no_name(root->fs_info, &key);
	if (IS_ERR(new_root)) {
		btrfs_abort_transaction(trans, root, PTR_ERR(new_root));
		ret = PTR_ERR(new_root);
		goto fail;
	}

	btrfs_record_root_in_trans(trans, new_root);
	ret = btrfs_create_subvol_root(trans, new_root, root, new_dirid);
	if (ret) {
		/* We potentially lose an unused inode item here */
		btrfs_abort_transaction(trans, root, ret);
		goto fail;
	}

	/*
	 * insert the directory item
	 */
	ret = btrfs_set_inode_index(dir, &index);
	if (ret) {
		btrfs_abort_transaction(trans, root, ret);
		goto fail;
	}

	ret = btrfs_insert_dir_item(trans, root,
				    name, namelen, dir, &key,
				    BTRFS_FT_DIR, index);
	if (ret) {
		btrfs_abort_transaction(trans, root, ret);
		goto fail;
	}

	btrfs_i_size_write(dir, dir->i_size + namelen * 2);
	ret = btrfs_update_inode(trans, root, dir);
	BUG_ON(ret);

	ret = btrfs_add_root_ref(trans, root->fs_info->tree_root,
				 objectid, root->root_key.objectid,
				 btrfs_ino(dir), index, name, namelen);
	BUG_ON(ret);

	ret = btrfs_uuid_tree_add(trans, root->fs_info->uuid_root,
				  root_item.uuid, BTRFS_UUID_KEY_SUBVOL,
				  objectid);
	if (ret)
		btrfs_abort_transaction(trans, root, ret);

fail:
	trans->block_rsv = NULL;
	trans->bytes_reserved = 0;
	btrfs_subvolume_release_metadata(root, &block_rsv, qgroup_reserved);

	if (async_transid) {
		*async_transid = trans->transid;
		err = btrfs_commit_transaction_async(trans, root, 1);
		if (err)
			err = btrfs_commit_transaction(trans, root);
	} else {
		err = btrfs_commit_transaction(trans, root);
	}
	if (err && !ret)
		ret = err;

	if (!ret) {
		inode = btrfs_lookup_dentry(dir, dentry);
		if (IS_ERR(inode))
			return PTR_ERR(inode);
		d_instantiate(dentry, inode);
	}
	return ret;
}
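/*
 * Usage sketch (illustrative, not part of this file): create_subvol() is
 * reached via BTRFS_IOC_SUBVOL_CREATE. Error handling is omitted;
 * "dirfd" is assumed to be an open descriptor of the parent directory.
 *
 *	#include <sys/ioctl.h>
 *	#include <string.h>
 *	#include <linux/btrfs.h>
 *
 *	static int subvol_create(int dirfd, const char *name)
 *	{
 *		struct btrfs_ioctl_vol_args args;
 *
 *		memset(&args, 0, sizeof(args));
 *		strncpy(args.name, name, BTRFS_PATH_NAME_MAX);
 *		return ioctl(dirfd, BTRFS_IOC_SUBVOL_CREATE, &args);
 *	}
 */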
static void btrfs_wait_nocow_write(struct btrfs_root *root)
{
	s64 writers;
	DEFINE_WAIT(wait);

	do {
		prepare_to_wait(&root->subv_writers->wait, &wait,
				TASK_UNINTERRUPTIBLE);

		writers = percpu_counter_sum(&root->subv_writers->counter);
		if (writers)
			schedule();

		finish_wait(&root->subv_writers->wait, &wait);
	} while (writers);
}

static int create_snapshot(struct btrfs_root *root, struct inode *dir,
			   struct dentry *dentry, char *name, int namelen,
			   u64 *async_transid, bool readonly,
			   struct btrfs_qgroup_inherit *inherit)
{
	struct inode *inode;
	struct btrfs_pending_snapshot *pending_snapshot;
	struct btrfs_trans_handle *trans;
	int ret;

	if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
		return -EINVAL;

	atomic_inc(&root->will_be_snapshoted);
	smp_mb__after_atomic();
	btrfs_wait_nocow_write(root);

	ret = btrfs_start_delalloc_inodes(root, 0);
	if (ret)
		goto out;

	btrfs_wait_ordered_extents(root, -1);

	pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS);
	if (!pending_snapshot) {
		ret = -ENOMEM;
		goto out;
	}

	btrfs_init_block_rsv(&pending_snapshot->block_rsv,
			     BTRFS_BLOCK_RSV_TEMP);
	/*
	 * 1 - parent dir inode
	 * 2 - dir entries
	 * 1 - root item
	 * 2 - root ref/backref
	 * 1 - root of snapshot
	 * 1 - UUID item
	 */
	ret = btrfs_subvolume_reserve_metadata(BTRFS_I(dir)->root,
					&pending_snapshot->block_rsv, 8,
					&pending_snapshot->qgroup_reserved,
					false);
	if (ret)
		goto free;

	pending_snapshot->dentry = dentry;
	pending_snapshot->root = root;
	pending_snapshot->readonly = readonly;
	pending_snapshot->dir = dir;
	pending_snapshot->inherit = inherit;

	trans = btrfs_start_transaction(root, 0);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		goto fail;
	}

	spin_lock(&root->fs_info->trans_lock);
	list_add(&pending_snapshot->list,
		 &trans->transaction->pending_snapshots);
	spin_unlock(&root->fs_info->trans_lock);
	if (async_transid) {
		*async_transid = trans->transid;
		ret = btrfs_commit_transaction_async(trans,
					root->fs_info->extent_root, 1);
		if (ret)
			ret = btrfs_commit_transaction(trans, root);
	} else {
		ret = btrfs_commit_transaction(trans,
					       root->fs_info->extent_root);
	}
	if (ret)
		goto fail;

	ret = pending_snapshot->error;
	if (ret)
		goto fail;

	inode = btrfs_lookup_dentry(dentry->d_parent->d_inode, dentry);
	if (IS_ERR(inode)) {
		ret = PTR_ERR(inode);
		goto fail;
	}

	d_instantiate(dentry, inode);
	ret = 0;
fail:
	btrfs_subvolume_release_metadata(BTRFS_I(dir)->root,
					 &pending_snapshot->block_rsv,
					 pending_snapshot->qgroup_reserved);
free:
	kfree(pending_snapshot);
out:
	atomic_dec(&root->will_be_snapshoted);
	return ret;
}
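/*
 * Usage sketch (illustrative, not part of this file): create_snapshot()
 * is reached via BTRFS_IOC_SNAP_CREATE (and its v2 variant further down).
 * "src_fd" is an open descriptor of the subvolume to snapshot, "dirfd"
 * the destination directory; error handling is omitted.
 *
 *	#include <sys/ioctl.h>
 *	#include <string.h>
 *	#include <linux/btrfs.h>
 *
 *	static int snap_create(int dirfd, int src_fd, const char *name)
 *	{
 *		struct btrfs_ioctl_vol_args args;
 *
 *		memset(&args, 0, sizeof(args));
 *		args.fd = src_fd;
 *		strncpy(args.name, name, BTRFS_PATH_NAME_MAX);
 *		return ioctl(dirfd, BTRFS_IOC_SNAP_CREATE, &args);
 *	}
 */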
/* copy of check_sticky in fs/namei.c
 * It's inline, so penalty for filesystems that don't use sticky bit is
 * minimal.
 */
static inline int btrfs_check_sticky(struct inode *dir, struct inode *inode)
{
	kuid_t fsuid = current_fsuid();

	if (!(dir->i_mode & S_ISVTX))
		return 0;
	if (uid_eq(inode->i_uid, fsuid))
		return 0;
	if (uid_eq(dir->i_uid, fsuid))
		return 0;
	return !capable(CAP_FOWNER);
}

/* copy of may_delete in fs/namei.c
 * Check whether we can remove a link victim from directory dir, check
 * whether the type of victim is right.
 * 1. We can't do it if dir is read-only (done in permission())
 * 2. We should have write and exec permissions on dir
 * 3. We can't remove anything from append-only dir
 * 4. We can't do anything with immutable dir (done in permission())
 * 5. If the sticky bit on dir is set we should either
 *	a. be owner of dir, or
 *	b. be owner of victim, or
 *	c. have CAP_FOWNER capability
 * 6. If the victim is append-only or immutable we can't do anything with
 *    links pointing to it.
 * 7. If we were asked to remove a directory and victim isn't one - ENOTDIR.
 * 8. If we were asked to remove a non-directory and victim isn't one - EISDIR.
 * 9. We can't remove a root or mountpoint.
 * 10. We don't allow removal of NFS sillyrenamed files; it's handled by
 *     nfs_async_unlink().
 */

static int btrfs_may_delete(struct inode *dir, struct dentry *victim, int isdir)
{
	int error;

	if (!victim->d_inode)
		return -ENOENT;

	BUG_ON(victim->d_parent->d_inode != dir);
	audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE);

	error = inode_permission(dir, MAY_WRITE | MAY_EXEC);
	if (error)
		return error;
	if (IS_APPEND(dir))
		return -EPERM;
	if (btrfs_check_sticky(dir, victim->d_inode) ||
	    IS_APPEND(victim->d_inode) ||
	    IS_IMMUTABLE(victim->d_inode) || IS_SWAPFILE(victim->d_inode))
		return -EPERM;
	if (isdir) {
		if (!S_ISDIR(victim->d_inode->i_mode))
			return -ENOTDIR;
		if (IS_ROOT(victim))
			return -EBUSY;
	} else if (S_ISDIR(victim->d_inode->i_mode))
		return -EISDIR;
	if (IS_DEADDIR(dir))
		return -ENOENT;
	if (victim->d_flags & DCACHE_NFSFS_RENAMED)
		return -EBUSY;
	return 0;
}

/* copy of may_create in fs/namei.c */
static inline int btrfs_may_create(struct inode *dir, struct dentry *child)
{
	if (child->d_inode)
		return -EEXIST;
	if (IS_DEADDIR(dir))
		return -ENOENT;
	return inode_permission(dir, MAY_WRITE | MAY_EXEC);
}

/*
 * Create a new subvolume below @parent. This is largely modeled after
 * sys_mkdirat and vfs_mkdir, but we only do a single component lookup
 * inside this filesystem so it's quite a bit simpler.
 */
static noinline int btrfs_mksubvol(struct path *parent,
				   char *name, int namelen,
				   struct btrfs_root *snap_src,
				   u64 *async_transid, bool readonly,
				   struct btrfs_qgroup_inherit *inherit)
{
	struct inode *dir = parent->dentry->d_inode;
	struct dentry *dentry;
	int error;

	error = mutex_lock_killable_nested(&dir->i_mutex, I_MUTEX_PARENT);
	if (error == -EINTR)
		return error;

	dentry = lookup_one_len(name, parent->dentry, namelen);
	error = PTR_ERR(dentry);
	if (IS_ERR(dentry))
		goto out_unlock;

	error = -EEXIST;
	if (dentry->d_inode)
		goto out_dput;

	error = btrfs_may_create(dir, dentry);
	if (error)
		goto out_dput;

	/*
	 * even if this name doesn't exist, we may get hash collisions.
	 * check for them now when we can safely fail
	 */
	error = btrfs_check_dir_item_collision(BTRFS_I(dir)->root,
					       dir->i_ino, name,
					       namelen);
	if (error)
		goto out_dput;

	down_read(&BTRFS_I(dir)->root->fs_info->subvol_sem);

	if (btrfs_root_refs(&BTRFS_I(dir)->root->root_item) == 0)
		goto out_up_read;

	if (snap_src) {
		error = create_snapshot(snap_src, dir, dentry, name, namelen,
					async_transid, readonly, inherit);
	} else {
		error = create_subvol(dir, dentry, name, namelen,
				      async_transid, inherit);
	}
	if (!error)
		fsnotify_mkdir(dir, dentry);
out_up_read:
	up_read(&BTRFS_I(dir)->root->fs_info->subvol_sem);
out_dput:
	dput(dentry);
out_unlock:
	mutex_unlock(&dir->i_mutex);
	return error;
}
/*
 * When we're defragging a range, we don't want to kick it off again
 * if it is really just waiting for delalloc to send it down.
 * If we find a nice big extent or delalloc range for the bytes in the
 * file you want to defrag, we return 0 to let you know to skip this
 * part of the file.
 */
static int check_defrag_in_cache(struct inode *inode, u64 offset, int thresh)
{
	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
	struct extent_map *em = NULL;
	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
	u64 end;

	read_lock(&em_tree->lock);
	em = lookup_extent_mapping(em_tree, offset, PAGE_CACHE_SIZE);
	read_unlock(&em_tree->lock);

	if (em) {
		end = extent_map_end(em);
		free_extent_map(em);
		if (end - offset > thresh)
			return 0;
	}
	/* if we already have a nice delalloc here, just stop */
	thresh /= 2;
	end = count_range_bits(io_tree, &offset, offset + thresh,
			       thresh, EXTENT_DELALLOC, 1);
	if (end >= thresh)
		return 0;
	return 1;
}

/*
 * helper function to walk through a file and find extents
 * newer than a specific transid, and smaller than thresh.
 *
 * This is used by the defragging code to find new and small
 * extents
 */
static int find_new_extents(struct btrfs_root *root,
			    struct inode *inode, u64 newer_than,
			    u64 *off, int thresh)
{
	struct btrfs_path *path;
	struct btrfs_key min_key;
	struct extent_buffer *leaf;
	struct btrfs_file_extent_item *extent;
	int type;
	int ret;
	u64 ino = btrfs_ino(inode);

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	min_key.objectid = ino;
	min_key.type = BTRFS_EXTENT_DATA_KEY;
	min_key.offset = *off;

	while (1) {
		path->keep_locks = 1;
		ret = btrfs_search_forward(root, &min_key, path, newer_than);
		if (ret != 0)
			goto none;
		path->keep_locks = 0;
		btrfs_unlock_up_safe(path, 1);
process_slot:
		if (min_key.objectid != ino)
			goto none;
		if (min_key.type != BTRFS_EXTENT_DATA_KEY)
			goto none;

		leaf = path->nodes[0];
		extent = btrfs_item_ptr(leaf, path->slots[0],
					struct btrfs_file_extent_item);

		type = btrfs_file_extent_type(leaf, extent);
		if (type == BTRFS_FILE_EXTENT_REG &&
		    btrfs_file_extent_num_bytes(leaf, extent) < thresh &&
		    check_defrag_in_cache(inode, min_key.offset, thresh)) {
			*off = min_key.offset;
			btrfs_free_path(path);
			return 0;
		}

		path->slots[0]++;
		if (path->slots[0] < btrfs_header_nritems(leaf)) {
			btrfs_item_key_to_cpu(leaf, &min_key, path->slots[0]);
			goto process_slot;
		}

		if (min_key.offset == (u64)-1)
			goto none;

		min_key.offset++;
		btrfs_release_path(path);
	}
none:
	btrfs_free_path(path);
	return -ENOENT;
}
static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start)
{
	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
	struct extent_map *em;
	u64 len = PAGE_CACHE_SIZE;

	/*
	 * hopefully we have this extent in the tree already, try without
	 * the full extent lock
	 */
	read_lock(&em_tree->lock);
	em = lookup_extent_mapping(em_tree, start, len);
	read_unlock(&em_tree->lock);

	if (!em) {
		struct extent_state *cached = NULL;
		u64 end = start + len - 1;

		/* get the big lock and read metadata off disk */
		lock_extent_bits(io_tree, start, end, 0, &cached);
		em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
		unlock_extent_cached(io_tree, start, end, &cached, GFP_NOFS);

		if (IS_ERR(em))
			return NULL;
	}

	return em;
}

static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em)
{
	struct extent_map *next;
	bool ret = true;

	/* this is the last extent */
	if (em->start + em->len >= i_size_read(inode))
		return false;

	next = defrag_lookup_extent(inode, em->start + em->len);
	if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE ||
	    (em->block_start + em->block_len == next->block_start))
		ret = false;

	free_extent_map(next);
	return ret;
}

static int should_defrag_range(struct inode *inode, u64 start, int thresh,
			       u64 *last_len, u64 *skip, u64 *defrag_end,
			       int compress)
{
	struct extent_map *em;
	int ret = 1;
	bool next_mergeable = true;

	/*
	 * make sure that once we start defragging an extent, we keep on
	 * defragging it
	 */
	if (start < *defrag_end)
		return 1;

	*skip = 0;

	em = defrag_lookup_extent(inode, start);
	if (!em)
		return 0;

	/* this will cover holes, and inline extents */
	if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
		ret = 0;
		goto out;
	}

	next_mergeable = defrag_check_next_extent(inode, em);

	/*
	 * we hit a real extent, if it is big or the next extent is not a
	 * real extent, don't bother defragging it
	 */
	if (!compress && (*last_len == 0 || *last_len >= thresh) &&
	    (em->len >= thresh || !next_mergeable))
		ret = 0;
out:
	/*
	 * last_len ends up being a counter of how many bytes we've defragged.
	 * every time we choose not to defrag an extent, we reset *last_len
	 * so that the next tiny extent will force a defrag.
	 *
	 * The end result of this is that tiny extents before a single big
	 * extent will force at least part of that big extent to be defragged.
	 */
	if (ret) {
		*defrag_end = extent_map_end(em);
	} else {
		*last_len = 0;
		*skip = extent_map_end(em);
		*defrag_end = 0;
	}

	free_extent_map(em);
	return ret;
}
/*
 * it doesn't do much good to defrag one or two pages
 * at a time. This pulls in a nice chunk of pages
 * to COW and defrag.
 *
 * It also makes sure the delalloc code has enough
 * dirty data to avoid making new small extents as part
 * of the defrag
 *
 * It's a good idea to start RA on this range
 * before calling this.
 */
static int cluster_pages_for_defrag(struct inode *inode,
				    struct page **pages,
				    unsigned long start_index,
				    unsigned long num_pages)
{
	unsigned long file_end;
	u64 isize = i_size_read(inode);
	u64 page_start;
	u64 page_end;
	u64 page_cnt;
	int ret;
	int i;
	int i_done;
	struct btrfs_ordered_extent *ordered;
	struct extent_state *cached_state = NULL;
	struct extent_io_tree *tree;
	gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);

	file_end = (isize - 1) >> PAGE_CACHE_SHIFT;
	if (!isize || start_index > file_end)
		return 0;

	page_cnt = min_t(u64, (u64)num_pages, (u64)file_end - start_index + 1);

	ret = btrfs_delalloc_reserve_space(inode,
					   page_cnt << PAGE_CACHE_SHIFT);
	if (ret)
		return ret;
	i_done = 0;
	tree = &BTRFS_I(inode)->io_tree;

	/* step one, lock all the pages */
	for (i = 0; i < page_cnt; i++) {
		struct page *page;
again:
		page = find_or_create_page(inode->i_mapping,
					   start_index + i, mask);
		if (!page)
			break;

		page_start = page_offset(page);
		page_end = page_start + PAGE_CACHE_SIZE - 1;
		while (1) {
			lock_extent_bits(tree, page_start, page_end,
					 0, &cached_state);
			ordered = btrfs_lookup_ordered_extent(inode,
							      page_start);
			unlock_extent_cached(tree, page_start, page_end,
					     &cached_state, GFP_NOFS);
			if (!ordered)
				break;

			unlock_page(page);
			btrfs_start_ordered_extent(inode, ordered, 1);
			btrfs_put_ordered_extent(ordered);
			lock_page(page);
			/*
			 * we unlocked the page above, so we need to check
			 * whether it was released or not.
			 */
			if (page->mapping != inode->i_mapping) {
				unlock_page(page);
				page_cache_release(page);
				goto again;
			}
		}

		if (!PageUptodate(page)) {
			btrfs_readpage(NULL, page);
			lock_page(page);
			if (!PageUptodate(page)) {
				unlock_page(page);
				page_cache_release(page);
				ret = -EIO;
				break;
			}
		}

		if (page->mapping != inode->i_mapping) {
			unlock_page(page);
			page_cache_release(page);
			goto again;
		}

		pages[i] = page;
		i_done++;
	}
	if (!i_done || ret)
		goto out;

	if (!(inode->i_sb->s_flags & MS_ACTIVE))
		goto out;

	/*
	 * so now we have a nice long stream of locked
	 * and up to date pages, let's wait on them
	 */
	for (i = 0; i < i_done; i++)
		wait_on_page_writeback(pages[i]);

	page_start = page_offset(pages[0]);
	page_end = page_offset(pages[i_done - 1]) + PAGE_CACHE_SIZE;

	lock_extent_bits(&BTRFS_I(inode)->io_tree,
			 page_start, page_end - 1, 0, &cached_state);
	clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start,
			 page_end - 1, EXTENT_DIRTY | EXTENT_DELALLOC |
			 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0,
			 &cached_state, GFP_NOFS);

	if (i_done != page_cnt) {
		spin_lock(&BTRFS_I(inode)->lock);
		BTRFS_I(inode)->outstanding_extents++;
		spin_unlock(&BTRFS_I(inode)->lock);
		btrfs_delalloc_release_space(inode,
				     (page_cnt - i_done) << PAGE_CACHE_SHIFT);
	}


	set_extent_defrag(&BTRFS_I(inode)->io_tree, page_start, page_end - 1,
			  &cached_state, GFP_NOFS);

	unlock_extent_cached(&BTRFS_I(inode)->io_tree,
			     page_start, page_end - 1, &cached_state,
			     GFP_NOFS);

	for (i = 0; i < i_done; i++) {
		clear_page_dirty_for_io(pages[i]);
		ClearPageChecked(pages[i]);
		set_page_extent_mapped(pages[i]);
		set_page_dirty(pages[i]);
		unlock_page(pages[i]);
		page_cache_release(pages[i]);
	}
	return i_done;
out:
	for (i = 0; i < i_done; i++) {
		unlock_page(pages[i]);
		page_cache_release(pages[i]);
	}
	btrfs_delalloc_release_space(inode, page_cnt << PAGE_CACHE_SHIFT);
	return ret;

}
int btrfs_defrag_file(struct inode *inode, struct file *file,
		      struct btrfs_ioctl_defrag_range_args *range,
		      u64 newer_than, unsigned long max_to_defrag)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct file_ra_state *ra = NULL;
	unsigned long last_index;
	u64 isize = i_size_read(inode);
	u64 last_len = 0;
	u64 skip = 0;
	u64 defrag_end = 0;
	u64 newer_off = range->start;
	unsigned long i;
	unsigned long ra_index = 0;
	int ret;
	int defrag_count = 0;
	int compress_type = BTRFS_COMPRESS_ZLIB;
	int extent_thresh = range->extent_thresh;
	unsigned long max_cluster = (256 * 1024) >> PAGE_CACHE_SHIFT;
	unsigned long cluster = max_cluster;
	u64 new_align = ~((u64)128 * 1024 - 1);
	struct page **pages = NULL;

	if (isize == 0)
		return 0;

	if (range->start >= isize)
		return -EINVAL;

	if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS) {
		if (range->compress_type > BTRFS_COMPRESS_TYPES)
			return -EINVAL;
		if (range->compress_type)
			compress_type = range->compress_type;
	}

	if (extent_thresh == 0)
		extent_thresh = 256 * 1024;

	/*
	 * if we were not given a file, allocate a readahead
	 * context
	 */
	if (!file) {
		ra = kzalloc(sizeof(*ra), GFP_NOFS);
		if (!ra)
			return -ENOMEM;
		file_ra_state_init(ra, inode->i_mapping);
	} else {
		ra = &file->f_ra;
	}

	pages = kmalloc_array(max_cluster, sizeof(struct page *),
			      GFP_NOFS);
	if (!pages) {
		ret = -ENOMEM;
		goto out_ra;
	}

	/* find the last page to defrag */
	if (range->start + range->len > range->start) {
		last_index = min_t(u64, isize - 1,
			 range->start + range->len - 1) >> PAGE_CACHE_SHIFT;
	} else {
		last_index = (isize - 1) >> PAGE_CACHE_SHIFT;
	}

	if (newer_than) {
		ret = find_new_extents(root, inode, newer_than,
				       &newer_off, 64 * 1024);
		if (!ret) {
			range->start = newer_off;
			/*
			 * we always align our defrag to help keep
			 * the extents in the file evenly spaced
			 */
			i = (newer_off & new_align) >> PAGE_CACHE_SHIFT;
		} else
			goto out_ra;
	} else {
		i = range->start >> PAGE_CACHE_SHIFT;
	}
	if (!max_to_defrag)
		max_to_defrag = last_index + 1;

	/*
	 * make writeback start from i, so the defrag range can be
	 * written sequentially.
	 */
	if (i < inode->i_mapping->writeback_index)
		inode->i_mapping->writeback_index = i;

	while (i <= last_index && defrag_count < max_to_defrag &&
	       (i < (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >>
		PAGE_CACHE_SHIFT)) {
		/*
		 * make sure we stop running if someone unmounts
		 * the FS
		 */
		if (!(inode->i_sb->s_flags & MS_ACTIVE))
			break;

		if (btrfs_defrag_cancelled(root->fs_info)) {
			printk(KERN_DEBUG "BTRFS: defrag_file cancelled\n");
			ret = -EAGAIN;
			break;
		}

		if (!should_defrag_range(inode, (u64)i << PAGE_CACHE_SHIFT,
					 extent_thresh, &last_len, &skip,
					 &defrag_end, range->flags &
					 BTRFS_DEFRAG_RANGE_COMPRESS)) {
			unsigned long next;
			/*
			 * the should_defrag function tells us how much to
			 * skip; bump our counter by the suggested amount
			 */
			next = (skip + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
			i = max(i + 1, next);
			continue;
		}

		if (!newer_than) {
			cluster = (PAGE_CACHE_ALIGN(defrag_end) >>
				   PAGE_CACHE_SHIFT) - i;
			cluster = min(cluster, max_cluster);
		} else {
			cluster = max_cluster;
		}

		if (i + cluster > ra_index) {
			ra_index = max(i, ra_index);
			btrfs_force_ra(inode->i_mapping, ra, file, ra_index,
				       cluster);
			ra_index += max_cluster;
		}

		mutex_lock(&inode->i_mutex);
		if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)
			BTRFS_I(inode)->force_compress = compress_type;
		ret = cluster_pages_for_defrag(inode, pages, i, cluster);
		if (ret < 0) {
			mutex_unlock(&inode->i_mutex);
			goto out_ra;
		}

		defrag_count += ret;
		balance_dirty_pages_ratelimited(inode->i_mapping);
		mutex_unlock(&inode->i_mutex);

		if (newer_than) {
			if (newer_off == (u64)-1)
				break;

			if (ret > 0)
				i += ret;

			newer_off = max(newer_off + 1,
					(u64)i << PAGE_CACHE_SHIFT);

			ret = find_new_extents(root, inode,
					       newer_than, &newer_off,
					       64 * 1024);
			if (!ret) {
				range->start = newer_off;
				i = (newer_off & new_align) >> PAGE_CACHE_SHIFT;
			} else {
				break;
			}
		} else {
			if (ret > 0) {
				i += ret;
				last_len += ret << PAGE_CACHE_SHIFT;
			} else {
				i++;
				last_len = 0;
			}
		}
	}

	if ((range->flags & BTRFS_DEFRAG_RANGE_START_IO)) {
		filemap_flush(inode->i_mapping);
		if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
			     &BTRFS_I(inode)->runtime_flags))
			filemap_flush(inode->i_mapping);
	}

	if ((range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)) {
		/* the filemap_flush will queue IO into the worker threads, but
		 * we have to make sure the IO is actually started and that
		 * ordered extents get created before we return
		 */
		atomic_inc(&root->fs_info->async_submit_draining);
		while (atomic_read(&root->fs_info->nr_async_submits) ||
		       atomic_read(&root->fs_info->async_delalloc_pages)) {
			wait_event(root->fs_info->async_submit_wait,
			   (atomic_read(&root->fs_info->nr_async_submits) == 0 &&
			    atomic_read(&root->fs_info->async_delalloc_pages) == 0));
		}
		atomic_dec(&root->fs_info->async_submit_draining);
	}

	if (range->compress_type == BTRFS_COMPRESS_LZO) {
		btrfs_set_fs_incompat(root->fs_info, COMPRESS_LZO);
	}

	ret = defrag_count;

out_ra:
	if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS) {
		mutex_lock(&inode->i_mutex);
		BTRFS_I(inode)->force_compress = BTRFS_COMPRESS_NONE;
		mutex_unlock(&inode->i_mutex);
	}
	if (!file)
		kfree(ra);
	kfree(pages);
	return ret;
}
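/*
 * Usage sketch (illustrative, not part of this file): btrfs_defrag_file()
 * is driven by BTRFS_IOC_DEFRAG_RANGE. A zero extent_thresh falls back to
 * the 256k default chosen above. Error handling is omitted; "fd" is the
 * file to defragment.
 *
 *	#include <sys/ioctl.h>
 *	#include <string.h>
 *	#include <linux/btrfs.h>
 *
 *	static int defrag_whole_file(int fd)
 *	{
 *		struct btrfs_ioctl_defrag_range_args range;
 *
 *		memset(&range, 0, sizeof(range));
 *		range.len = (__u64)-1;		// to EOF
 *		range.flags = BTRFS_DEFRAG_RANGE_START_IO;
 *		return ioctl(fd, BTRFS_IOC_DEFRAG_RANGE, &range);
 *	}
 */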
static noinline int btrfs_ioctl_resize(struct file *file,
					void __user *arg)
{
	u64 new_size;
	u64 old_size;
	u64 devid = 1;
	struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
	struct btrfs_ioctl_vol_args *vol_args;
	struct btrfs_trans_handle *trans;
	struct btrfs_device *device = NULL;
	char *sizestr;
	char *retptr;
	char *devstr = NULL;
	int ret = 0;
	int mod = 0;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	ret = mnt_want_write_file(file);
	if (ret)
		return ret;

	if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
			1)) {
		mnt_drop_write_file(file);
		return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
	}

	mutex_lock(&root->fs_info->volume_mutex);
	vol_args = memdup_user(arg, sizeof(*vol_args));
	if (IS_ERR(vol_args)) {
		ret = PTR_ERR(vol_args);
		goto out;
	}

	vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';

	sizestr = vol_args->name;
	devstr = strchr(sizestr, ':');
	if (devstr) {
		sizestr = devstr + 1;
		*devstr = '\0';
		devstr = vol_args->name;
		ret = kstrtoull(devstr, 10, &devid);
		if (ret)
			goto out_free;
		if (!devid) {
			ret = -EINVAL;
			goto out_free;
		}
		btrfs_info(root->fs_info, "resizing devid %llu", devid);
	}

	device = btrfs_find_device(root->fs_info, devid, NULL, NULL);
	if (!device) {
		btrfs_info(root->fs_info, "resizer unable to find device %llu",
			   devid);
		ret = -ENODEV;
		goto out_free;
	}

	if (!device->writeable) {
		btrfs_info(root->fs_info,
			   "resizer unable to apply on readonly device %llu",
			   devid);
		ret = -EPERM;
		goto out_free;
	}

	if (!strcmp(sizestr, "max"))
		new_size = device->bdev->bd_inode->i_size;
	else {
		if (sizestr[0] == '-') {
			mod = -1;
			sizestr++;
		} else if (sizestr[0] == '+') {
			mod = 1;
			sizestr++;
		}
		new_size = memparse(sizestr, &retptr);
		if (*retptr != '\0' || new_size == 0) {
			ret = -EINVAL;
			goto out_free;
		}
	}

	if (device->is_tgtdev_for_dev_replace) {
		ret = -EPERM;
		goto out_free;
	}

	old_size = device->total_bytes;

	if (mod < 0) {
		if (new_size > old_size) {
			ret = -EINVAL;
			goto out_free;
		}
		new_size = old_size - new_size;
	} else if (mod > 0) {
		if (new_size > ULLONG_MAX - old_size) {
			ret = -ERANGE;
			goto out_free;
		}
		new_size = old_size + new_size;
	}

	if (new_size < 256 * 1024 * 1024) {
		ret = -EINVAL;
		goto out_free;
	}
	if (new_size > device->bdev->bd_inode->i_size) {
		ret = -EFBIG;
		goto out_free;
	}

	do_div(new_size, root->sectorsize);
	new_size *= root->sectorsize;

	printk_in_rcu(KERN_INFO "BTRFS: new size for %s is %llu\n",
		      rcu_str_deref(device->name), new_size);

	if (new_size > old_size) {
		trans = btrfs_start_transaction(root, 0);
		if (IS_ERR(trans)) {
			ret = PTR_ERR(trans);
			goto out_free;
		}
		ret = btrfs_grow_device(trans, device, new_size);
		btrfs_commit_transaction(trans, root);
	} else if (new_size < old_size) {
		ret = btrfs_shrink_device(device, new_size);
	} /* equal, nothing to do */

out_free:
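/*
 * Usage sketch (illustrative, not part of this file): the resize ioctl
 * takes its argument as text in vol_args->name, optionally prefixed with
 * "devid:", and accepts "max", absolute sizes, or +/- deltas parsed by
 * memparse() (k/m/g suffixes). Error handling omitted; "fd" is any open
 * file or dir on the filesystem.
 *
 *	#include <sys/ioctl.h>
 *	#include <string.h>
 *	#include <linux/btrfs.h>
 *
 *	static int grow_dev2_by_10g(int fd)
 *	{
 *		struct btrfs_ioctl_vol_args args;
 *
 *		memset(&args, 0, sizeof(args));
 *		strcpy(args.name, "2:+10g");	// devid 2, grow by 10GiB
 *		return ioctl(fd, BTRFS_IOC_RESIZE, &args);
 *	}
 */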
	kfree(vol_args);
out:
	mutex_unlock(&root->fs_info->volume_mutex);
	atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0);
	mnt_drop_write_file(file);
	return ret;
}

static noinline int btrfs_ioctl_snap_create_transid(struct file *file,
				char *name, unsigned long fd, int subvol,
				u64 *transid, bool readonly,
				struct btrfs_qgroup_inherit *inherit)
{
	int namelen;
	int ret = 0;

	ret = mnt_want_write_file(file);
	if (ret)
		goto out;

	namelen = strlen(name);
	if (strchr(name, '/')) {
		ret = -EINVAL;
		goto out_drop_write;
	}

	if (name[0] == '.' &&
	   (namelen == 1 || (name[1] == '.' && namelen == 2))) {
		ret = -EEXIST;
		goto out_drop_write;
	}

	if (subvol) {
		ret = btrfs_mksubvol(&file->f_path, name, namelen,
				     NULL, transid, readonly, inherit);
	} else {
		struct fd src = fdget(fd);
		struct inode *src_inode;
		if (!src.file) {
			ret = -EINVAL;
			goto out_drop_write;
		}

		src_inode = file_inode(src.file);
		if (src_inode->i_sb != file_inode(file)->i_sb) {
			btrfs_info(BTRFS_I(src_inode)->root->fs_info,
				   "Snapshot src from another FS");
			ret = -EXDEV;
		} else if (!inode_owner_or_capable(src_inode)) {
			/*
			 * Subvolume creation is not restricted, but snapshots
			 * are limited to own subvolumes only
			 */
			ret = -EPERM;
		} else {
			ret = btrfs_mksubvol(&file->f_path, name, namelen,
					     BTRFS_I(src_inode)->root,
					     transid, readonly, inherit);
		}
		fdput(src);
	}
out_drop_write:
	mnt_drop_write_file(file);
out:
	return ret;
}

static noinline int btrfs_ioctl_snap_create(struct file *file,
					    void __user *arg, int subvol)
{
	struct btrfs_ioctl_vol_args *vol_args;
	int ret;

	vol_args = memdup_user(arg, sizeof(*vol_args));
	if (IS_ERR(vol_args))
		return PTR_ERR(vol_args);
	vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';

	ret = btrfs_ioctl_snap_create_transid(file, vol_args->name,
					      vol_args->fd, subvol,
					      NULL, false, NULL);

	kfree(vol_args);
	return ret;
}

static noinline int btrfs_ioctl_snap_create_v2(struct file *file,
					       void __user *arg, int subvol)
{
	struct btrfs_ioctl_vol_args_v2 *vol_args;
	int ret;
	u64 transid = 0;
	u64 *ptr = NULL;
	bool readonly = false;
	struct btrfs_qgroup_inherit *inherit = NULL;

	vol_args = memdup_user(arg, sizeof(*vol_args));
	if (IS_ERR(vol_args))
		return PTR_ERR(vol_args);
	vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0';

	if (vol_args->flags &
	    ~(BTRFS_SUBVOL_CREATE_ASYNC | BTRFS_SUBVOL_RDONLY |
	      BTRFS_SUBVOL_QGROUP_INHERIT)) {
		ret = -EOPNOTSUPP;
		goto out;
	}

	if (vol_args->flags & BTRFS_SUBVOL_CREATE_ASYNC)
		ptr = &transid;
	if (vol_args->flags & BTRFS_SUBVOL_RDONLY)
		readonly = true;
	if (vol_args->flags & BTRFS_SUBVOL_QGROUP_INHERIT) {
		if (vol_args->size > PAGE_CACHE_SIZE) {
			ret = -EINVAL;
			goto out;
		}
		inherit = memdup_user(vol_args->qgroup_inherit, vol_args->size);
		if (IS_ERR(inherit)) {
			ret = PTR_ERR(inherit);
			goto out;
		}
	}

	ret = btrfs_ioctl_snap_create_transid(file, vol_args->name,
					      vol_args->fd, subvol, ptr,
					      readonly, inherit);

	if (ret == 0 && ptr &&
	    copy_to_user(arg +
			 offsetof(struct btrfs_ioctl_vol_args_v2,
				  transid),
			 ptr, sizeof(*ptr)))
		ret = -EFAULT;
out:
	kfree(vol_args);
	kfree(inherit);
	return ret;
}
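/*
 * Usage sketch (illustrative, not part of this file): the v2 variant adds
 * flags; BTRFS_SUBVOL_RDONLY yields a read-only snapshot, which is what
 * `btrfs subvolume snapshot -r` uses. Error handling is omitted.
 *
 *	#include <sys/ioctl.h>
 *	#include <string.h>
 *	#include <linux/btrfs.h>
 *
 *	static int snap_create_ro(int dirfd, int src_fd, const char *name)
 *	{
 *		struct btrfs_ioctl_vol_args_v2 args;
 *
 *		memset(&args, 0, sizeof(args));
 *		args.fd = src_fd;
 *		args.flags = BTRFS_SUBVOL_RDONLY;
 *		strncpy(args.name, name, BTRFS_SUBVOL_NAME_MAX);
 *		return ioctl(dirfd, BTRFS_IOC_SNAP_CREATE_V2, &args);
 *	}
 */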
static noinline int btrfs_ioctl_subvol_getflags(struct file *file,
						void __user *arg)
{
	struct inode *inode = file_inode(file);
	struct btrfs_root *root = BTRFS_I(inode)->root;
	int ret = 0;
	u64 flags = 0;

	if (btrfs_ino(inode) != BTRFS_FIRST_FREE_OBJECTID)
		return -EINVAL;

	down_read(&root->fs_info->subvol_sem);
	if (btrfs_root_readonly(root))
		flags |= BTRFS_SUBVOL_RDONLY;
	up_read(&root->fs_info->subvol_sem);

	if (copy_to_user(arg, &flags, sizeof(flags)))
		ret = -EFAULT;

	return ret;
}

static noinline int btrfs_ioctl_subvol_setflags(struct file *file,
						void __user *arg)
{
	struct inode *inode = file_inode(file);
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_trans_handle *trans;
	u64 root_flags;
	u64 flags;
	int ret = 0;

	if (!inode_owner_or_capable(inode))
		return -EPERM;

	ret = mnt_want_write_file(file);
	if (ret)
		goto out;

	if (btrfs_ino(inode) != BTRFS_FIRST_FREE_OBJECTID) {
		ret = -EINVAL;
		goto out_drop_write;
	}

	if (copy_from_user(&flags, arg, sizeof(flags))) {
		ret = -EFAULT;
		goto out_drop_write;
	}

	if (flags & BTRFS_SUBVOL_CREATE_ASYNC) {
		ret = -EINVAL;
		goto out_drop_write;
	}

	if (flags & ~BTRFS_SUBVOL_RDONLY) {
		ret = -EOPNOTSUPP;
		goto out_drop_write;
	}

	down_write(&root->fs_info->subvol_sem);

	/* nothing to do */
	if (!!(flags & BTRFS_SUBVOL_RDONLY) == btrfs_root_readonly(root))
		goto out_drop_sem;

	root_flags = btrfs_root_flags(&root->root_item);
	if (flags & BTRFS_SUBVOL_RDONLY) {
		btrfs_set_root_flags(&root->root_item,
				     root_flags | BTRFS_ROOT_SUBVOL_RDONLY);
	} else {
		/*
		 * Block RO -> RW transition if this subvolume is involved in
		 * send
		 */
		spin_lock(&root->root_item_lock);
		if (root->send_in_progress == 0) {
			btrfs_set_root_flags(&root->root_item,
				     root_flags & ~BTRFS_ROOT_SUBVOL_RDONLY);
			spin_unlock(&root->root_item_lock);
		} else {
			spin_unlock(&root->root_item_lock);
			btrfs_warn(root->fs_info,
			"Attempt to set subvolume %llu read-write during send",
				   root->root_key.objectid);
			ret = -EPERM;
			goto out_drop_sem;
		}
	}

	trans = btrfs_start_transaction(root, 1);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		goto out_reset;
	}

	ret = btrfs_update_root(trans, root->fs_info->tree_root,
				&root->root_key, &root->root_item);

	btrfs_commit_transaction(trans, root);
out_reset:
	if (ret)
		btrfs_set_root_flags(&root->root_item, root_flags);
out_drop_sem:
	up_write(&root->fs_info->subvol_sem);
out_drop_write:
	mnt_drop_write_file(file);
out:
	return ret;
}
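/*
 * Usage sketch (illustrative, not part of this file): toggling a
 * subvolume read-only from userspace pairs the two handlers above.
 * "fd" must refer to the subvolume root, or the ioctl returns -EINVAL.
 *
 *	#include <sys/ioctl.h>
 *	#include <linux/btrfs.h>
 *
 *	static int subvol_set_ro(int fd, int ro)
 *	{
 *		__u64 flags;
 *
 *		if (ioctl(fd, BTRFS_IOC_SUBVOL_GETFLAGS, &flags) < 0)
 *			return -1;
 *		if (ro)
 *			flags |= BTRFS_SUBVOL_RDONLY;
 *		else
 *			flags &= ~BTRFS_SUBVOL_RDONLY;
 *		return ioctl(fd, BTRFS_IOC_SUBVOL_SETFLAGS, &flags);
 *	}
 */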
/*
 * helper to check if the subvolume references other subvolumes
 */
static noinline int may_destroy_subvol(struct btrfs_root *root)
{
	struct btrfs_path *path;
	struct btrfs_dir_item *di;
	struct btrfs_key key;
	u64 dir_id;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	/* Make sure this root isn't set as the default subvol */
	dir_id = btrfs_super_root_dir(root->fs_info->super_copy);
	di = btrfs_lookup_dir_item(NULL, root->fs_info->tree_root, path,
				   dir_id, "default", 7, 0);
	if (di && !IS_ERR(di)) {
		btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key);
		if (key.objectid == root->root_key.objectid) {
			ret = -EPERM;
			btrfs_err(root->fs_info, "deleting default subvolume "
				  "%llu is not allowed", key.objectid);
			goto out;
		}
		btrfs_release_path(path);
	}

	key.objectid = root->root_key.objectid;
	key.type = BTRFS_ROOT_REF_KEY;
	key.offset = (u64)-1;

	ret = btrfs_search_slot(NULL, root->fs_info->tree_root,
				&key, path, 0, 0);
	if (ret < 0)
		goto out;
	BUG_ON(ret == 0);

	ret = 0;
	if (path->slots[0] > 0) {
		path->slots[0]--;
		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
		if (key.objectid == root->root_key.objectid &&
		    key.type == BTRFS_ROOT_REF_KEY)
			ret = -ENOTEMPTY;
	}
out:
	btrfs_free_path(path);
	return ret;
}

static noinline int key_in_sk(struct btrfs_key *key,
			      struct btrfs_ioctl_search_key *sk)
{
	struct btrfs_key test;
	int ret;

	test.objectid = sk->min_objectid;
	test.type = sk->min_type;
	test.offset = sk->min_offset;

	ret = btrfs_comp_cpu_keys(key, &test);
	if (ret < 0)
		return 0;

	test.objectid = sk->max_objectid;
	test.type = sk->max_type;
	test.offset = sk->max_offset;

	ret = btrfs_comp_cpu_keys(key, &test);
	if (ret > 0)
		return 0;
	return 1;
}
static noinline int copy_to_sk(struct btrfs_root *root,
			       struct btrfs_path *path,
			       struct btrfs_key *key,
			       struct btrfs_ioctl_search_key *sk,
			       size_t *buf_size,
			       char __user *ubuf,
			       unsigned long *sk_offset,
			       int *num_found)
{
	u64 found_transid;
	struct extent_buffer *leaf;
	struct btrfs_ioctl_search_header sh;
	unsigned long item_off;
	unsigned long item_len;
	int nritems;
	int i;
	int slot;
	int ret = 0;

	leaf = path->nodes[0];
	slot = path->slots[0];
	nritems = btrfs_header_nritems(leaf);

	if (btrfs_header_generation(leaf) > sk->max_transid) {
		i = nritems;
		goto advance_key;
	}
	found_transid = btrfs_header_generation(leaf);

	for (i = slot; i < nritems; i++) {
		item_off = btrfs_item_ptr_offset(leaf, i);
		item_len = btrfs_item_size_nr(leaf, i);

		btrfs_item_key_to_cpu(leaf, key, i);
		if (!key_in_sk(key, sk))
			continue;

		if (sizeof(sh) + item_len > *buf_size) {
			if (*num_found) {
				ret = 1;
				goto out;
			}

			/*
			 * return one empty item back for v1, which does not
			 * handle -EOVERFLOW
			 */

			*buf_size = sizeof(sh) + item_len;
			item_len = 0;
			ret = -EOVERFLOW;
		}

		if (sizeof(sh) + item_len + *sk_offset > *buf_size) {
			ret = 1;
			goto out;
		}

		sh.objectid = key->objectid;
		sh.offset = key->offset;
		sh.type = key->type;
		sh.len = item_len;
		sh.transid = found_transid;

		/* copy search result header */
		if (copy_to_user(ubuf + *sk_offset, &sh, sizeof(sh))) {
			ret = -EFAULT;
			goto out;
		}

		*sk_offset += sizeof(sh);

		if (item_len) {
			char __user *up = ubuf + *sk_offset;
			/* copy the item */
			if (read_extent_buffer_to_user(leaf, up,
						       item_off, item_len)) {
				ret = -EFAULT;
				goto out;
			}

			*sk_offset += item_len;
		}
		(*num_found)++;

		if (ret) /* -EOVERFLOW from above */
			goto out;

		if (*num_found >= sk->nr_items) {
			ret = 1;
			goto out;
		}
	}
advance_key:
	ret = 0;
	if (key->offset < (u64)-1 && key->offset < sk->max_offset)
		key->offset++;
	else if (key->type < (u8)-1 && key->type < sk->max_type) {
		key->offset = 0;
		key->type++;
	} else if (key->objectid < (u64)-1 && key->objectid < sk->max_objectid) {
		key->offset = 0;
		key->type = 0;
		key->objectid++;
	} else
		ret = 1;
out:
	/*
	 *  0: all items from this leaf copied, continue with next
	 *  1: * more items can be copied, but unused buffer is too small
	 *     * all items were found
	 *     Either way, it will stop the loop which iterates to the next
	 *     leaf
	 *  -EOVERFLOW: item was too large for buffer
	 *  -EFAULT: could not copy extent buffer back to userspace
	 */
	return ret;
}

static noinline int search_ioctl(struct inode *inode,
				 struct btrfs_ioctl_search_key *sk,
				 size_t *buf_size,
				 char __user *ubuf)
{
	struct btrfs_root *root;
	struct btrfs_key key;
	struct btrfs_path *path;
	struct btrfs_fs_info *info = BTRFS_I(inode)->root->fs_info;
	int ret;
	int num_found = 0;
	unsigned long sk_offset = 0;

	if (*buf_size < sizeof(struct btrfs_ioctl_search_header)) {
		*buf_size = sizeof(struct btrfs_ioctl_search_header);
		return -EOVERFLOW;
	}

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	if (sk->tree_id == 0) {
		/* search the root of the inode that was passed */
		root = BTRFS_I(inode)->root;
	} else {
		key.objectid = sk->tree_id;
		key.type = BTRFS_ROOT_ITEM_KEY;
		key.offset = (u64)-1;
		root = btrfs_read_fs_root_no_name(info, &key);
		if (IS_ERR(root)) {
			printk(KERN_ERR "BTRFS: could not find root %llu\n",
			       sk->tree_id);
			btrfs_free_path(path);
			return -ENOENT;
		}
	}

	key.objectid = sk->min_objectid;
	key.type = sk->min_type;
	key.offset = sk->min_offset;

	path->keep_locks = 1;

	while (1) {
		ret = btrfs_search_forward(root, &key, path, sk->min_transid);
		if (ret != 0) {
			if (ret > 0)
				ret = 0;
			goto err;
		}
		ret = copy_to_sk(root, path, &key, sk, buf_size, ubuf,
				 &sk_offset, &num_found);
		btrfs_release_path(path);
		if (ret)
			break;

	}
	if (ret > 0)
		ret = 0;
err:
	sk->nr_items = num_found;
	btrfs_free_path(path);
	return ret;
}
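/*
 * Usage sketch (illustrative, not part of this file): a typical
 * BTRFS_IOC_TREE_SEARCH call sets an inclusive [min,max] key window and
 * walks the (header, item) pairs packed into buf. This example counts
 * the ROOT_ITEMs in the root tree; error handling and the
 * resume-after-nr_items loop are omitted.
 *
 *	#include <sys/ioctl.h>
 *	#include <string.h>
 *	#include <linux/btrfs.h>
 *
 *	static int count_root_items(int fd)
 *	{
 *		struct btrfs_ioctl_search_args args;
 *		struct btrfs_ioctl_search_key *sk = &args.key;
 *
 *		memset(&args, 0, sizeof(args));
 *		sk->tree_id = 1;		// root tree
 *		sk->max_objectid = (__u64)-1;
 *		sk->max_offset = (__u64)-1;
 *		sk->max_transid = (__u64)-1;
 *		sk->min_type = sk->max_type = 132; // BTRFS_ROOT_ITEM_KEY
 *		sk->nr_items = 4096;
 *		if (ioctl(fd, BTRFS_IOC_TREE_SEARCH, &args) < 0)
 *			return -1;
 *		return sk->nr_items;	// rewritten to the found count
 *	}
 */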
2136 */
2137 if (ret == -EOVERFLOW)
2138 ret = 0;
2139
2140 if (ret == 0 && copy_to_user(&uargs->key, &sk, sizeof(sk)))
2141 ret = -EFAULT;
2142 return ret;
2143 }
2144
2145 static noinline int btrfs_ioctl_tree_search_v2(struct file *file,
2146 void __user *argp)
2147 {
2148 struct btrfs_ioctl_search_args_v2 __user *uarg;
2149 struct btrfs_ioctl_search_args_v2 args;
2150 struct inode *inode;
2151 int ret;
2152 size_t buf_size;
2153 const size_t buf_limit = 16 * 1024 * 1024;
2154
2155 if (!capable(CAP_SYS_ADMIN))
2156 return -EPERM;
2157
2158 /* copy search header and buffer size */
2159 uarg = (struct btrfs_ioctl_search_args_v2 __user *)argp;
2160 if (copy_from_user(&args, uarg, sizeof(args)))
2161 return -EFAULT;
2162
2163 buf_size = args.buf_size;
2164
2165 if (buf_size < sizeof(struct btrfs_ioctl_search_header))
2166 return -EOVERFLOW;
2167
2168 /* limit result size to 16MB */
2169 if (buf_size > buf_limit)
2170 buf_size = buf_limit;
2171
2172 inode = file_inode(file);
2173 ret = search_ioctl(inode, &args.key, &buf_size,
2174 (char __user *)(&uarg->buf[0]));
2175 if (ret == 0 && copy_to_user(&uarg->key, &args.key, sizeof(args.key)))
2176 ret = -EFAULT;
2177 else if (ret == -EOVERFLOW &&
2178 copy_to_user(&uarg->buf_size, &buf_size, sizeof(buf_size)))
2179 ret = -EFAULT;
2180
2181 return ret;
2182 }
2183
2184 /*
2185 * Search INODE_REFs to identify the path name of the directory
2186 * 'dirid' in the tree 'tree_id', and copy that path into 'name'.
2187 */
2188 static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info,
2189 u64 tree_id, u64 dirid, char *name)
2190 {
2191 struct btrfs_root *root;
2192 struct btrfs_key key;
2193 char *ptr;
2194 int ret = -1;
2195 int slot;
2196 int len;
2197 int total_len = 0;
2198 struct btrfs_inode_ref *iref;
2199 struct extent_buffer *l;
2200 struct btrfs_path *path;
2201
2202 if (dirid == BTRFS_FIRST_FREE_OBJECTID) {
2203 name[0]='\0';
2204 return 0;
2205 }
2206
2207 path = btrfs_alloc_path();
2208 if (!path)
2209 return -ENOMEM;
2210
2211 ptr = &name[BTRFS_INO_LOOKUP_PATH_MAX];
2212
2213 key.objectid = tree_id;
2214 key.type = BTRFS_ROOT_ITEM_KEY;
2215 key.offset = (u64)-1;
2216 root = btrfs_read_fs_root_no_name(info, &key);
2217 if (IS_ERR(root)) {
2218 printk(KERN_ERR "BTRFS: could not find root %llu\n", tree_id);
2219 ret = -ENOENT;
2220 goto out;
2221 }
2222
2223 key.objectid = dirid;
2224 key.type = BTRFS_INODE_REF_KEY;
2225 key.offset = (u64)-1;
2226
2227 while (1) {
2228 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2229 if (ret < 0)
2230 goto out;
2231 else if (ret > 0) {
2232 ret = btrfs_previous_item(root, path, dirid,
2233 BTRFS_INODE_REF_KEY);
2234 if (ret < 0)
2235 goto out;
2236 else if (ret > 0) {
2237 ret = -ENOENT;
2238 goto out;
2239 }
2240 }
2241
2242 l = path->nodes[0];
2243 slot = path->slots[0];
2244 btrfs_item_key_to_cpu(l, &key, slot);
2245
2246 iref = btrfs_item_ptr(l, slot, struct btrfs_inode_ref);
2247 len = btrfs_inode_ref_name_len(l, iref);
2248 ptr -= len + 1;
2249 total_len += len + 1;
2250 if (ptr < name) {
2251 ret = -ENAMETOOLONG;
2252 goto out;
2253 }
2254
2255 *(ptr + len) = '/';
2256 read_extent_buffer(l, ptr, (unsigned long)(iref + 1), len);
2257
2258 if (key.offset == BTRFS_FIRST_FREE_OBJECTID)
2259 break;
2260
2261 btrfs_release_path(path);
2262 key.objectid = key.offset;
2263 key.offset = (u64)-1;
2264 dirid = key.objectid;
2265 }
2266 memmove(name, ptr, total_len);
2267 name[total_len] = '\0';
2268 ret = 0;
2269 out:
2270 btrfs_free_path(path);
2271 return ret;
2272 }
2273
2274 static noinline int
btrfs_ioctl_ino_lookup(struct file *file,
2275 void __user *argp)
2276 {
2277 struct btrfs_ioctl_ino_lookup_args *args;
2278 struct inode *inode;
2279 int ret;
2280
2281 if (!capable(CAP_SYS_ADMIN))
2282 return -EPERM;
2283
2284 args = memdup_user(argp, sizeof(*args));
2285 if (IS_ERR(args))
2286 return PTR_ERR(args);
2287
2288 inode = file_inode(file);
2289
2290 if (args->treeid == 0)
2291 args->treeid = BTRFS_I(inode)->root->root_key.objectid;
2292
2293 ret = btrfs_search_path_in_tree(BTRFS_I(inode)->root->fs_info,
2294 args->treeid, args->objectid,
2295 args->name);
2296
2297 if (ret == 0 && copy_to_user(argp, args, sizeof(*args)))
2298 ret = -EFAULT;
2299
2300 kfree(args);
2301 return ret;
2302 }
2303
2304 static noinline int btrfs_ioctl_snap_destroy(struct file *file,
2305 void __user *arg)
2306 {
2307 struct dentry *parent = file->f_path.dentry;
2308 struct dentry *dentry;
2309 struct inode *dir = parent->d_inode;
2310 struct inode *inode;
2311 struct btrfs_root *root = BTRFS_I(dir)->root;
2312 struct btrfs_root *dest = NULL;
2313 struct btrfs_ioctl_vol_args *vol_args;
2314 struct btrfs_trans_handle *trans;
2315 struct btrfs_block_rsv block_rsv;
2316 u64 root_flags;
2317 u64 qgroup_reserved;
2318 int namelen;
2319 int ret;
2320 int err = 0;
2321
2322 vol_args = memdup_user(arg, sizeof(*vol_args));
2323 if (IS_ERR(vol_args))
2324 return PTR_ERR(vol_args);
2325
2326 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
2327 namelen = strlen(vol_args->name);
2328 if (strchr(vol_args->name, '/') ||
2329 strncmp(vol_args->name, "..", namelen) == 0) {
2330 err = -EINVAL;
2331 goto out;
2332 }
2333
2334 err = mnt_want_write_file(file);
2335 if (err)
2336 goto out;
2337
2338
2339 err = mutex_lock_killable_nested(&dir->i_mutex, I_MUTEX_PARENT);
2340 if (err == -EINTR)
2341 goto out_drop_write;
2342 dentry = lookup_one_len(vol_args->name, parent, namelen);
2343 if (IS_ERR(dentry)) {
2344 err = PTR_ERR(dentry);
2345 goto out_unlock_dir;
2346 }
2347
2348 if (!dentry->d_inode) {
2349 err = -ENOENT;
2350 goto out_dput;
2351 }
2352
2353 inode = dentry->d_inode;
2354 dest = BTRFS_I(inode)->root;
2355 if (!capable(CAP_SYS_ADMIN)) {
2356 /*
2357 * Regular user. Only allow this with a special mount
2358 * option, when the user has write+exec access to the
2359 * subvol root, and when rmdir(2) would have been
2360 * allowed.
2361 *
2362 * Note that this does _not_ check that the subvol is
2363 * empty or doesn't contain data that we wouldn't
2364 * otherwise be able to delete.
2365 *
2366 * Users who want to delete empty subvols should try
2367 * rmdir(2).
2368 */
2369 err = -EPERM;
2370 if (!btrfs_test_opt(root, USER_SUBVOL_RM_ALLOWED))
2371 goto out_dput;
2372
2373 /*
2374 * Do not allow deletion if the parent dir is the same
2375 * as the dir to be deleted. That means the ioctl
2376 * must be called on the dentry referencing the root
2377 * of the subvol, not a random directory contained
2378 * within it.
2379 */
2380 err = -EINVAL;
2381 if (root == dest)
2382 goto out_dput;
2383
2384 err = inode_permission(inode, MAY_WRITE | MAY_EXEC);
2385 if (err)
2386 goto out_dput;
2387 }
2388
2389 /* check if subvolume may be deleted by a user */
2390 err = btrfs_may_delete(dir, dentry, 1);
2391 if (err)
2392 goto out_dput;
2393
2394 if (btrfs_ino(inode) != BTRFS_FIRST_FREE_OBJECTID) {
2395 err = -EINVAL;
2396 goto out_dput;
2397 }
2398
2399 mutex_lock(&inode->i_mutex);
2400
2401 /*
2402 * Don't allow deleting a subvolume while a send is in progress.
This is 2403 * inside the i_mutex so the error handling that has to drop the bit 2404 * again is not run concurrently. 2405 */ 2406 spin_lock(&dest->root_item_lock); 2407 root_flags = btrfs_root_flags(&dest->root_item); 2408 if (dest->send_in_progress == 0) { 2409 btrfs_set_root_flags(&dest->root_item, 2410 root_flags | BTRFS_ROOT_SUBVOL_DEAD); 2411 spin_unlock(&dest->root_item_lock); 2412 } else { 2413 spin_unlock(&dest->root_item_lock); 2414 btrfs_warn(root->fs_info, 2415 "Attempt to delete subvolume %llu during send", 2416 dest->root_key.objectid); 2417 err = -EPERM; 2418 goto out_dput; 2419 } 2420 2421 err = d_invalidate(dentry); 2422 if (err) 2423 goto out_unlock; 2424 2425 down_write(&root->fs_info->subvol_sem); 2426 2427 err = may_destroy_subvol(dest); 2428 if (err) 2429 goto out_up_write; 2430 2431 btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP); 2432 /* 2433 * One for dir inode, two for dir entries, two for root 2434 * ref/backref. 2435 */ 2436 err = btrfs_subvolume_reserve_metadata(root, &block_rsv, 2437 5, &qgroup_reserved, true); 2438 if (err) 2439 goto out_up_write; 2440 2441 trans = btrfs_start_transaction(root, 0); 2442 if (IS_ERR(trans)) { 2443 err = PTR_ERR(trans); 2444 goto out_release; 2445 } 2446 trans->block_rsv = &block_rsv; 2447 trans->bytes_reserved = block_rsv.size; 2448 2449 ret = btrfs_unlink_subvol(trans, root, dir, 2450 dest->root_key.objectid, 2451 dentry->d_name.name, 2452 dentry->d_name.len); 2453 if (ret) { 2454 err = ret; 2455 btrfs_abort_transaction(trans, root, ret); 2456 goto out_end_trans; 2457 } 2458 2459 btrfs_record_root_in_trans(trans, dest); 2460 2461 memset(&dest->root_item.drop_progress, 0, 2462 sizeof(dest->root_item.drop_progress)); 2463 dest->root_item.drop_level = 0; 2464 btrfs_set_root_refs(&dest->root_item, 0); 2465 2466 if (!test_and_set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &dest->state)) { 2467 ret = btrfs_insert_orphan_item(trans, 2468 root->fs_info->tree_root, 2469 dest->root_key.objectid); 2470 if (ret) { 2471 btrfs_abort_transaction(trans, root, ret); 2472 err = ret; 2473 goto out_end_trans; 2474 } 2475 } 2476 2477 ret = btrfs_uuid_tree_rem(trans, root->fs_info->uuid_root, 2478 dest->root_item.uuid, BTRFS_UUID_KEY_SUBVOL, 2479 dest->root_key.objectid); 2480 if (ret && ret != -ENOENT) { 2481 btrfs_abort_transaction(trans, root, ret); 2482 err = ret; 2483 goto out_end_trans; 2484 } 2485 if (!btrfs_is_empty_uuid(dest->root_item.received_uuid)) { 2486 ret = btrfs_uuid_tree_rem(trans, root->fs_info->uuid_root, 2487 dest->root_item.received_uuid, 2488 BTRFS_UUID_KEY_RECEIVED_SUBVOL, 2489 dest->root_key.objectid); 2490 if (ret && ret != -ENOENT) { 2491 btrfs_abort_transaction(trans, root, ret); 2492 err = ret; 2493 goto out_end_trans; 2494 } 2495 } 2496 2497 out_end_trans: 2498 trans->block_rsv = NULL; 2499 trans->bytes_reserved = 0; 2500 ret = btrfs_end_transaction(trans, root); 2501 if (ret && !err) 2502 err = ret; 2503 inode->i_flags |= S_DEAD; 2504 out_release: 2505 btrfs_subvolume_release_metadata(root, &block_rsv, qgroup_reserved); 2506 out_up_write: 2507 up_write(&root->fs_info->subvol_sem); 2508 out_unlock: 2509 if (err) { 2510 spin_lock(&dest->root_item_lock); 2511 root_flags = btrfs_root_flags(&dest->root_item); 2512 btrfs_set_root_flags(&dest->root_item, 2513 root_flags & ~BTRFS_ROOT_SUBVOL_DEAD); 2514 spin_unlock(&dest->root_item_lock); 2515 } 2516 mutex_unlock(&inode->i_mutex); 2517 if (!err) { 2518 shrink_dcache_sb(root->fs_info->sb); 2519 btrfs_invalidate_inodes(dest); 2520 d_delete(dentry); 2521 
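/*
 * No new send can have started here: BTRFS_ROOT_SUBVOL_DEAD was set
 * above under root_item_lock and has not been cleared on this path.
 */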
ASSERT(dest->send_in_progress == 0); 2522 2523 /* the last ref */ 2524 if (dest->cache_inode) { 2525 iput(dest->cache_inode); 2526 dest->cache_inode = NULL; 2527 } 2528 } 2529 out_dput: 2530 dput(dentry); 2531 out_unlock_dir: 2532 mutex_unlock(&dir->i_mutex); 2533 out_drop_write: 2534 mnt_drop_write_file(file); 2535 out: 2536 kfree(vol_args); 2537 return err; 2538 } 2539 2540 static int btrfs_ioctl_defrag(struct file *file, void __user *argp) 2541 { 2542 struct inode *inode = file_inode(file); 2543 struct btrfs_root *root = BTRFS_I(inode)->root; 2544 struct btrfs_ioctl_defrag_range_args *range; 2545 int ret; 2546 2547 ret = mnt_want_write_file(file); 2548 if (ret) 2549 return ret; 2550 2551 if (btrfs_root_readonly(root)) { 2552 ret = -EROFS; 2553 goto out; 2554 } 2555 2556 switch (inode->i_mode & S_IFMT) { 2557 case S_IFDIR: 2558 if (!capable(CAP_SYS_ADMIN)) { 2559 ret = -EPERM; 2560 goto out; 2561 } 2562 ret = btrfs_defrag_root(root); 2563 if (ret) 2564 goto out; 2565 ret = btrfs_defrag_root(root->fs_info->extent_root); 2566 break; 2567 case S_IFREG: 2568 if (!(file->f_mode & FMODE_WRITE)) { 2569 ret = -EINVAL; 2570 goto out; 2571 } 2572 2573 range = kzalloc(sizeof(*range), GFP_KERNEL); 2574 if (!range) { 2575 ret = -ENOMEM; 2576 goto out; 2577 } 2578 2579 if (argp) { 2580 if (copy_from_user(range, argp, 2581 sizeof(*range))) { 2582 ret = -EFAULT; 2583 kfree(range); 2584 goto out; 2585 } 2586 /* compression requires us to start the IO */ 2587 if ((range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)) { 2588 range->flags |= BTRFS_DEFRAG_RANGE_START_IO; 2589 range->extent_thresh = (u32)-1; 2590 } 2591 } else { 2592 /* the rest are all set to zero by kzalloc */ 2593 range->len = (u64)-1; 2594 } 2595 ret = btrfs_defrag_file(file_inode(file), file, 2596 range, 0, 0); 2597 if (ret > 0) 2598 ret = 0; 2599 kfree(range); 2600 break; 2601 default: 2602 ret = -EINVAL; 2603 } 2604 out: 2605 mnt_drop_write_file(file); 2606 return ret; 2607 } 2608 2609 static long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg) 2610 { 2611 struct btrfs_ioctl_vol_args *vol_args; 2612 int ret; 2613 2614 if (!capable(CAP_SYS_ADMIN)) 2615 return -EPERM; 2616 2617 if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running, 2618 1)) { 2619 return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; 2620 } 2621 2622 mutex_lock(&root->fs_info->volume_mutex); 2623 vol_args = memdup_user(arg, sizeof(*vol_args)); 2624 if (IS_ERR(vol_args)) { 2625 ret = PTR_ERR(vol_args); 2626 goto out; 2627 } 2628 2629 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; 2630 ret = btrfs_init_new_device(root, vol_args->name); 2631 2632 kfree(vol_args); 2633 out: 2634 mutex_unlock(&root->fs_info->volume_mutex); 2635 atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0); 2636 return ret; 2637 } 2638 2639 static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) 2640 { 2641 struct btrfs_root *root = BTRFS_I(file_inode(file))->root; 2642 struct btrfs_ioctl_vol_args *vol_args; 2643 int ret; 2644 2645 if (!capable(CAP_SYS_ADMIN)) 2646 return -EPERM; 2647 2648 ret = mnt_want_write_file(file); 2649 if (ret) 2650 return ret; 2651 2652 vol_args = memdup_user(arg, sizeof(*vol_args)); 2653 if (IS_ERR(vol_args)) { 2654 ret = PTR_ERR(vol_args); 2655 goto out; 2656 } 2657 2658 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; 2659 2660 if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running, 2661 1)) { 2662 ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; 2663 goto out; 2664 } 2665 2666 mutex_lock(&root->fs_info->volume_mutex); 2667 ret = 
btrfs_rm_device(root, vol_args->name); 2668 mutex_unlock(&root->fs_info->volume_mutex); 2669 atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0); 2670 2671 out: 2672 kfree(vol_args); 2673 mnt_drop_write_file(file); 2674 return ret; 2675 } 2676 2677 static long btrfs_ioctl_fs_info(struct btrfs_root *root, void __user *arg) 2678 { 2679 struct btrfs_ioctl_fs_info_args *fi_args; 2680 struct btrfs_device *device; 2681 struct btrfs_device *next; 2682 struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; 2683 int ret = 0; 2684 2685 fi_args = kzalloc(sizeof(*fi_args), GFP_KERNEL); 2686 if (!fi_args) 2687 return -ENOMEM; 2688 2689 mutex_lock(&fs_devices->device_list_mutex); 2690 fi_args->num_devices = fs_devices->num_devices; 2691 memcpy(&fi_args->fsid, root->fs_info->fsid, sizeof(fi_args->fsid)); 2692 2693 list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) { 2694 if (device->devid > fi_args->max_id) 2695 fi_args->max_id = device->devid; 2696 } 2697 mutex_unlock(&fs_devices->device_list_mutex); 2698 2699 fi_args->nodesize = root->fs_info->super_copy->nodesize; 2700 fi_args->sectorsize = root->fs_info->super_copy->sectorsize; 2701 fi_args->clone_alignment = root->fs_info->super_copy->sectorsize; 2702 2703 if (copy_to_user(arg, fi_args, sizeof(*fi_args))) 2704 ret = -EFAULT; 2705 2706 kfree(fi_args); 2707 return ret; 2708 } 2709 2710 static long btrfs_ioctl_dev_info(struct btrfs_root *root, void __user *arg) 2711 { 2712 struct btrfs_ioctl_dev_info_args *di_args; 2713 struct btrfs_device *dev; 2714 struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; 2715 int ret = 0; 2716 char *s_uuid = NULL; 2717 2718 di_args = memdup_user(arg, sizeof(*di_args)); 2719 if (IS_ERR(di_args)) 2720 return PTR_ERR(di_args); 2721 2722 if (!btrfs_is_empty_uuid(di_args->uuid)) 2723 s_uuid = di_args->uuid; 2724 2725 mutex_lock(&fs_devices->device_list_mutex); 2726 dev = btrfs_find_device(root->fs_info, di_args->devid, s_uuid, NULL); 2727 2728 if (!dev) { 2729 ret = -ENODEV; 2730 goto out; 2731 } 2732 2733 di_args->devid = dev->devid; 2734 di_args->bytes_used = dev->bytes_used; 2735 di_args->total_bytes = dev->total_bytes; 2736 memcpy(di_args->uuid, dev->uuid, sizeof(di_args->uuid)); 2737 if (dev->name) { 2738 struct rcu_string *name; 2739 2740 rcu_read_lock(); 2741 name = rcu_dereference(dev->name); 2742 strncpy(di_args->path, name->str, sizeof(di_args->path)); 2743 rcu_read_unlock(); 2744 di_args->path[sizeof(di_args->path) - 1] = 0; 2745 } else { 2746 di_args->path[0] = '\0'; 2747 } 2748 2749 out: 2750 mutex_unlock(&fs_devices->device_list_mutex); 2751 if (ret == 0 && copy_to_user(arg, di_args, sizeof(*di_args))) 2752 ret = -EFAULT; 2753 2754 kfree(di_args); 2755 return ret; 2756 } 2757 2758 static struct page *extent_same_get_page(struct inode *inode, u64 off) 2759 { 2760 struct page *page; 2761 pgoff_t index; 2762 struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; 2763 2764 index = off >> PAGE_CACHE_SHIFT; 2765 2766 page = grab_cache_page(inode->i_mapping, index); 2767 if (!page) 2768 return NULL; 2769 2770 if (!PageUptodate(page)) { 2771 if (extent_read_full_page_nolock(tree, page, btrfs_get_extent, 2772 0)) 2773 return NULL; 2774 lock_page(page); 2775 if (!PageUptodate(page)) { 2776 unlock_page(page); 2777 page_cache_release(page); 2778 return NULL; 2779 } 2780 } 2781 unlock_page(page); 2782 2783 return page; 2784 } 2785 2786 static inline void lock_extent_range(struct inode *inode, u64 off, u64 len) 2787 { 2788 /* do any pending delalloc/csum calc on src, 
one way or 2789 another, and lock file content */ 2790 while (1) { 2791 struct btrfs_ordered_extent *ordered; 2792 lock_extent(&BTRFS_I(inode)->io_tree, off, off + len - 1); 2793 ordered = btrfs_lookup_first_ordered_extent(inode, 2794 off + len - 1); 2795 if ((!ordered || 2796 ordered->file_offset + ordered->len <= off || 2797 ordered->file_offset >= off + len) && 2798 !test_range_bit(&BTRFS_I(inode)->io_tree, off, 2799 off + len - 1, EXTENT_DELALLOC, 0, NULL)) { 2800 if (ordered) 2801 btrfs_put_ordered_extent(ordered); 2802 break; 2803 } 2804 unlock_extent(&BTRFS_I(inode)->io_tree, off, off + len - 1); 2805 if (ordered) 2806 btrfs_put_ordered_extent(ordered); 2807 btrfs_wait_ordered_range(inode, off, len); 2808 } 2809 } 2810 2811 static void btrfs_double_unlock(struct inode *inode1, u64 loff1, 2812 struct inode *inode2, u64 loff2, u64 len) 2813 { 2814 unlock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1); 2815 unlock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1); 2816 2817 mutex_unlock(&inode1->i_mutex); 2818 mutex_unlock(&inode2->i_mutex); 2819 } 2820 2821 static void btrfs_double_lock(struct inode *inode1, u64 loff1, 2822 struct inode *inode2, u64 loff2, u64 len) 2823 { 2824 if (inode1 < inode2) { 2825 swap(inode1, inode2); 2826 swap(loff1, loff2); 2827 } 2828 2829 mutex_lock_nested(&inode1->i_mutex, I_MUTEX_PARENT); 2830 lock_extent_range(inode1, loff1, len); 2831 if (inode1 != inode2) { 2832 mutex_lock_nested(&inode2->i_mutex, I_MUTEX_CHILD); 2833 lock_extent_range(inode2, loff2, len); 2834 } 2835 } 2836 2837 static int btrfs_cmp_data(struct inode *src, u64 loff, struct inode *dst, 2838 u64 dst_loff, u64 len) 2839 { 2840 int ret = 0; 2841 struct page *src_page, *dst_page; 2842 unsigned int cmp_len = PAGE_CACHE_SIZE; 2843 void *addr, *dst_addr; 2844 2845 while (len) { 2846 if (len < PAGE_CACHE_SIZE) 2847 cmp_len = len; 2848 2849 src_page = extent_same_get_page(src, loff); 2850 if (!src_page) 2851 return -EINVAL; 2852 dst_page = extent_same_get_page(dst, dst_loff); 2853 if (!dst_page) { 2854 page_cache_release(src_page); 2855 return -EINVAL; 2856 } 2857 addr = kmap_atomic(src_page); 2858 dst_addr = kmap_atomic(dst_page); 2859 2860 flush_dcache_page(src_page); 2861 flush_dcache_page(dst_page); 2862 2863 if (memcmp(addr, dst_addr, cmp_len)) 2864 ret = BTRFS_SAME_DATA_DIFFERS; 2865 2866 kunmap_atomic(addr); 2867 kunmap_atomic(dst_addr); 2868 page_cache_release(src_page); 2869 page_cache_release(dst_page); 2870 2871 if (ret) 2872 break; 2873 2874 loff += cmp_len; 2875 dst_loff += cmp_len; 2876 len -= cmp_len; 2877 } 2878 2879 return ret; 2880 } 2881 2882 static int extent_same_check_offsets(struct inode *inode, u64 off, u64 len) 2883 { 2884 u64 bs = BTRFS_I(inode)->root->fs_info->sb->s_blocksize; 2885 2886 if (off + len > inode->i_size || off + len < off) 2887 return -EINVAL; 2888 /* Check that we are block aligned - btrfs_clone() requires this */ 2889 if (!IS_ALIGNED(off, bs) || !IS_ALIGNED(off + len, bs)) 2890 return -EINVAL; 2891 2892 return 0; 2893 } 2894 2895 static int btrfs_extent_same(struct inode *src, u64 loff, u64 len, 2896 struct inode *dst, u64 dst_loff) 2897 { 2898 int ret; 2899 2900 /* 2901 * btrfs_clone() can't handle extents in the same file 2902 * yet. Once that works, we can drop this check and replace it 2903 * with a check for the same inode, but overlapping extents. 
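* For example, deduping the range at offset 0 against the range at
* offset 64K of the same file would make btrfs_clone() insert new
* file extent items into the very leaf range it is still iterating.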
2904 */ 2905 if (src == dst) 2906 return -EINVAL; 2907 2908 btrfs_double_lock(src, loff, dst, dst_loff, len); 2909 2910 ret = extent_same_check_offsets(src, loff, len); 2911 if (ret) 2912 goto out_unlock; 2913 2914 ret = extent_same_check_offsets(dst, dst_loff, len); 2915 if (ret) 2916 goto out_unlock; 2917 2918 /* don't make the dst file partly checksummed */ 2919 if ((BTRFS_I(src)->flags & BTRFS_INODE_NODATASUM) != 2920 (BTRFS_I(dst)->flags & BTRFS_INODE_NODATASUM)) { 2921 ret = -EINVAL; 2922 goto out_unlock; 2923 } 2924 2925 ret = btrfs_cmp_data(src, loff, dst, dst_loff, len); 2926 if (ret == 0) 2927 ret = btrfs_clone(src, dst, loff, len, len, dst_loff); 2928 2929 out_unlock: 2930 btrfs_double_unlock(src, loff, dst, dst_loff, len); 2931 2932 return ret; 2933 } 2934 2935 #define BTRFS_MAX_DEDUPE_LEN (16 * 1024 * 1024) 2936 2937 static long btrfs_ioctl_file_extent_same(struct file *file, 2938 struct btrfs_ioctl_same_args __user *argp) 2939 { 2940 struct btrfs_ioctl_same_args *same; 2941 struct btrfs_ioctl_same_extent_info *info; 2942 struct inode *src = file_inode(file); 2943 u64 off; 2944 u64 len; 2945 int i; 2946 int ret; 2947 unsigned long size; 2948 u64 bs = BTRFS_I(src)->root->fs_info->sb->s_blocksize; 2949 bool is_admin = capable(CAP_SYS_ADMIN); 2950 u16 count; 2951 2952 if (!(file->f_mode & FMODE_READ)) 2953 return -EINVAL; 2954 2955 ret = mnt_want_write_file(file); 2956 if (ret) 2957 return ret; 2958 2959 if (get_user(count, &argp->dest_count)) { 2960 ret = -EFAULT; 2961 goto out; 2962 } 2963 2964 size = offsetof(struct btrfs_ioctl_same_args __user, info[count]); 2965 2966 same = memdup_user(argp, size); 2967 2968 if (IS_ERR(same)) { 2969 ret = PTR_ERR(same); 2970 goto out; 2971 } 2972 2973 off = same->logical_offset; 2974 len = same->length; 2975 2976 /* 2977 * Limit the total length we will dedupe for each operation. 2978 * This is intended to bound the total time spent in this 2979 * ioctl to something sane. 2980 */ 2981 if (len > BTRFS_MAX_DEDUPE_LEN) 2982 len = BTRFS_MAX_DEDUPE_LEN; 2983 2984 if (WARN_ON_ONCE(bs < PAGE_CACHE_SIZE)) { 2985 /* 2986 * Btrfs does not support blocksize < page_size. As a 2987 * result, btrfs_cmp_data() won't correctly handle 2988 * this situation without an update. 
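* (For instance, 2K blocks on a machine with 4K pages would place two
* blocks in each page, while the page-at-a-time compare loop in
* btrfs_cmp_data() assumes a page never crosses a block boundary.)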
2989 */ 2990 ret = -EINVAL; 2991 goto out; 2992 } 2993 2994 ret = -EISDIR; 2995 if (S_ISDIR(src->i_mode)) 2996 goto out; 2997 2998 ret = -EACCES; 2999 if (!S_ISREG(src->i_mode)) 3000 goto out; 3001 3002 /* pre-format output fields to sane values */ 3003 for (i = 0; i < count; i++) { 3004 same->info[i].bytes_deduped = 0ULL; 3005 same->info[i].status = 0; 3006 } 3007 3008 for (i = 0, info = same->info; i < count; i++, info++) { 3009 struct inode *dst; 3010 struct fd dst_file = fdget(info->fd); 3011 if (!dst_file.file) { 3012 info->status = -EBADF; 3013 continue; 3014 } 3015 dst = file_inode(dst_file.file); 3016 3017 if (!(is_admin || (dst_file.file->f_mode & FMODE_WRITE))) { 3018 info->status = -EINVAL; 3019 } else if (file->f_path.mnt != dst_file.file->f_path.mnt) { 3020 info->status = -EXDEV; 3021 } else if (S_ISDIR(dst->i_mode)) { 3022 info->status = -EISDIR; 3023 } else if (!S_ISREG(dst->i_mode)) { 3024 info->status = -EACCES; 3025 } else { 3026 info->status = btrfs_extent_same(src, off, len, dst, 3027 info->logical_offset); 3028 if (info->status == 0) 3029 info->bytes_deduped += len; 3030 } 3031 fdput(dst_file); 3032 } 3033 3034 ret = copy_to_user(argp, same, size); 3035 if (ret) 3036 ret = -EFAULT; 3037 3038 out: 3039 mnt_drop_write_file(file); 3040 return ret; 3041 } 3042 3043 /* Helper to check and see if this root currently has a ref on the given disk 3044 * bytenr. If it does then we need to update the quota for this root. This 3045 * doesn't do anything if quotas aren't enabled. 3046 */ 3047 static int check_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, 3048 u64 disko) 3049 { 3050 struct seq_list tree_mod_seq_elem = {}; 3051 struct ulist *roots; 3052 struct ulist_iterator uiter; 3053 struct ulist_node *root_node = NULL; 3054 int ret; 3055 3056 if (!root->fs_info->quota_enabled) 3057 return 1; 3058 3059 btrfs_get_tree_mod_seq(root->fs_info, &tree_mod_seq_elem); 3060 ret = btrfs_find_all_roots(trans, root->fs_info, disko, 3061 tree_mod_seq_elem.seq, &roots); 3062 if (ret < 0) 3063 goto out; 3064 ret = 0; 3065 ULIST_ITER_INIT(&uiter); 3066 while ((root_node = ulist_next(roots, &uiter))) { 3067 if (root_node->val == root->objectid) { 3068 ret = 1; 3069 break; 3070 } 3071 } 3072 ulist_free(roots); 3073 out: 3074 btrfs_put_tree_mod_seq(root->fs_info, &tree_mod_seq_elem); 3075 return ret; 3076 } 3077 3078 static int clone_finish_inode_update(struct btrfs_trans_handle *trans, 3079 struct inode *inode, 3080 u64 endoff, 3081 const u64 destoff, 3082 const u64 olen) 3083 { 3084 struct btrfs_root *root = BTRFS_I(inode)->root; 3085 int ret; 3086 3087 inode_inc_iversion(inode); 3088 inode->i_mtime = inode->i_ctime = CURRENT_TIME; 3089 /* 3090 * We round up to the block size at eof when determining which 3091 * extents to clone above, but shouldn't round up the file size. 
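* E.g. with 4K blocks, cloning a 6K range that ends at the source's
* eof copies extents covering a full 8K, but the destination's
* i_size must end at destoff + 6K rather than the 8K block boundary.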
3092 */ 3093 if (endoff > destoff + olen) 3094 endoff = destoff + olen; 3095 if (endoff > inode->i_size) 3096 btrfs_i_size_write(inode, endoff); 3097 3098 ret = btrfs_update_inode(trans, root, inode); 3099 if (ret) { 3100 btrfs_abort_transaction(trans, root, ret); 3101 btrfs_end_transaction(trans, root); 3102 goto out; 3103 } 3104 ret = btrfs_end_transaction(trans, root); 3105 out: 3106 return ret; 3107 } 3108 3109 static void clone_update_extent_map(struct inode *inode, 3110 const struct btrfs_trans_handle *trans, 3111 const struct btrfs_path *path, 3112 const u64 hole_offset, 3113 const u64 hole_len) 3114 { 3115 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 3116 struct extent_map *em; 3117 int ret; 3118 3119 em = alloc_extent_map(); 3120 if (!em) { 3121 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 3122 &BTRFS_I(inode)->runtime_flags); 3123 return; 3124 } 3125 3126 if (path) { 3127 struct btrfs_file_extent_item *fi; 3128 3129 fi = btrfs_item_ptr(path->nodes[0], path->slots[0], 3130 struct btrfs_file_extent_item); 3131 btrfs_extent_item_to_extent_map(inode, path, fi, false, em); 3132 em->generation = -1; 3133 if (btrfs_file_extent_type(path->nodes[0], fi) == 3134 BTRFS_FILE_EXTENT_INLINE) 3135 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 3136 &BTRFS_I(inode)->runtime_flags); 3137 } else { 3138 em->start = hole_offset; 3139 em->len = hole_len; 3140 em->ram_bytes = em->len; 3141 em->orig_start = hole_offset; 3142 em->block_start = EXTENT_MAP_HOLE; 3143 em->block_len = 0; 3144 em->orig_block_len = 0; 3145 em->compress_type = BTRFS_COMPRESS_NONE; 3146 em->generation = trans->transid; 3147 } 3148 3149 while (1) { 3150 write_lock(&em_tree->lock); 3151 ret = add_extent_mapping(em_tree, em, 1); 3152 write_unlock(&em_tree->lock); 3153 if (ret != -EEXIST) { 3154 free_extent_map(em); 3155 break; 3156 } 3157 btrfs_drop_extent_cache(inode, em->start, 3158 em->start + em->len - 1, 0); 3159 } 3160 3161 if (unlikely(ret)) 3162 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 3163 &BTRFS_I(inode)->runtime_flags); 3164 } 3165 3166 /** 3167 * btrfs_clone() - clone a range from inode file to another 3168 * 3169 * @src: Inode to clone from 3170 * @inode: Inode to clone to 3171 * @off: Offset within source to start clone from 3172 * @olen: Original length, passed by user, of range to clone 3173 * @olen_aligned: Block-aligned value of olen, extent_same uses 3174 * identical values here 3175 * @destoff: Offset within @inode to start clone 3176 */ 3177 static int btrfs_clone(struct inode *src, struct inode *inode, 3178 const u64 off, const u64 olen, const u64 olen_aligned, 3179 const u64 destoff) 3180 { 3181 struct btrfs_root *root = BTRFS_I(inode)->root; 3182 struct btrfs_path *path = NULL; 3183 struct extent_buffer *leaf; 3184 struct btrfs_trans_handle *trans; 3185 char *buf = NULL; 3186 struct btrfs_key key; 3187 u32 nritems; 3188 int slot; 3189 int ret; 3190 int no_quota; 3191 const u64 len = olen_aligned; 3192 u64 last_disko = 0; 3193 u64 last_dest_end = destoff; 3194 3195 ret = -ENOMEM; 3196 buf = vmalloc(btrfs_level_size(root, 0)); 3197 if (!buf) 3198 return ret; 3199 3200 path = btrfs_alloc_path(); 3201 if (!path) { 3202 vfree(buf); 3203 return ret; 3204 } 3205 3206 path->reada = 2; 3207 /* clone data */ 3208 key.objectid = btrfs_ino(src); 3209 key.type = BTRFS_EXTENT_DATA_KEY; 3210 key.offset = off; 3211 3212 while (1) { 3213 /* 3214 * note the key will change type as we walk through the 3215 * tree. 
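* The loop below therefore re-reads the item key on every iteration
* and stops once it sees a key past BTRFS_EXTENT_DATA_KEY or one that
* belongs to a different inode.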
3216 */ 3217 path->leave_spinning = 1; 3218 ret = btrfs_search_slot(NULL, BTRFS_I(src)->root, &key, path, 3219 0, 0); 3220 if (ret < 0) 3221 goto out; 3222 /* 3223 * First search, if no extent item that starts at offset off was 3224 * found but the previous item is an extent item, it's possible 3225 * it might overlap our target range, therefore process it. 3226 */ 3227 if (key.offset == off && ret > 0 && path->slots[0] > 0) { 3228 btrfs_item_key_to_cpu(path->nodes[0], &key, 3229 path->slots[0] - 1); 3230 if (key.type == BTRFS_EXTENT_DATA_KEY) 3231 path->slots[0]--; 3232 } 3233 3234 nritems = btrfs_header_nritems(path->nodes[0]); 3235 process_slot: 3236 no_quota = 1; 3237 if (path->slots[0] >= nritems) { 3238 ret = btrfs_next_leaf(BTRFS_I(src)->root, path); 3239 if (ret < 0) 3240 goto out; 3241 if (ret > 0) 3242 break; 3243 nritems = btrfs_header_nritems(path->nodes[0]); 3244 } 3245 leaf = path->nodes[0]; 3246 slot = path->slots[0]; 3247 3248 btrfs_item_key_to_cpu(leaf, &key, slot); 3249 if (btrfs_key_type(&key) > BTRFS_EXTENT_DATA_KEY || 3250 key.objectid != btrfs_ino(src)) 3251 break; 3252 3253 if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY) { 3254 struct btrfs_file_extent_item *extent; 3255 int type; 3256 u32 size; 3257 struct btrfs_key new_key; 3258 u64 disko = 0, diskl = 0; 3259 u64 datao = 0, datal = 0; 3260 u8 comp; 3261 u64 drop_start; 3262 3263 extent = btrfs_item_ptr(leaf, slot, 3264 struct btrfs_file_extent_item); 3265 comp = btrfs_file_extent_compression(leaf, extent); 3266 type = btrfs_file_extent_type(leaf, extent); 3267 if (type == BTRFS_FILE_EXTENT_REG || 3268 type == BTRFS_FILE_EXTENT_PREALLOC) { 3269 disko = btrfs_file_extent_disk_bytenr(leaf, 3270 extent); 3271 diskl = btrfs_file_extent_disk_num_bytes(leaf, 3272 extent); 3273 datao = btrfs_file_extent_offset(leaf, extent); 3274 datal = btrfs_file_extent_num_bytes(leaf, 3275 extent); 3276 } else if (type == BTRFS_FILE_EXTENT_INLINE) { 3277 /* take upper bound, may be compressed */ 3278 datal = btrfs_file_extent_ram_bytes(leaf, 3279 extent); 3280 } 3281 3282 /* 3283 * The first search might have left us at an extent 3284 * item that ends before our target range's start, can 3285 * happen if we have holes and NO_HOLES feature enabled. 3286 */ 3287 if (key.offset + datal <= off) { 3288 path->slots[0]++; 3289 goto process_slot; 3290 } else if (key.offset >= off + len) { 3291 break; 3292 } 3293 3294 size = btrfs_item_size_nr(leaf, slot); 3295 read_extent_buffer(leaf, buf, 3296 btrfs_item_ptr_offset(leaf, slot), 3297 size); 3298 3299 btrfs_release_path(path); 3300 path->leave_spinning = 0; 3301 3302 memcpy(&new_key, &key, sizeof(new_key)); 3303 new_key.objectid = btrfs_ino(inode); 3304 if (off <= key.offset) 3305 new_key.offset = key.offset + destoff - off; 3306 else 3307 new_key.offset = destoff; 3308 3309 /* 3310 * Deal with a hole that doesn't have an extent item 3311 * that represents it (NO_HOLES feature enabled). 3312 * This hole is either in the middle of the cloning 3313 * range or at the beginning (fully overlaps it or 3314 * partially overlaps it). 
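* E.g. when cloning [0, 1M) from a file whose first extent item
* starts at 512K, the [0, 512K) gap has no extent item describing it,
* so the corresponding destination range must still be dropped via
* drop_start before the new extent item is inserted.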
3315 */ 3316 if (new_key.offset != last_dest_end) 3317 drop_start = last_dest_end; 3318 else 3319 drop_start = new_key.offset; 3320 3321 /* 3322 * 1 - adjusting old extent (we may have to split it) 3323 * 1 - add new extent 3324 * 1 - inode update 3325 */ 3326 trans = btrfs_start_transaction(root, 3); 3327 if (IS_ERR(trans)) { 3328 ret = PTR_ERR(trans); 3329 goto out; 3330 } 3331 3332 if (type == BTRFS_FILE_EXTENT_REG || 3333 type == BTRFS_FILE_EXTENT_PREALLOC) { 3334 /* 3335 * a | --- range to clone ---| b 3336 * | ------------- extent ------------- | 3337 */ 3338 3339 /* subtract range b */ 3340 if (key.offset + datal > off + len) 3341 datal = off + len - key.offset; 3342 3343 /* subtract range a */ 3344 if (off > key.offset) { 3345 datao += off - key.offset; 3346 datal -= off - key.offset; 3347 } 3348 3349 ret = btrfs_drop_extents(trans, root, inode, 3350 drop_start, 3351 new_key.offset + datal, 3352 1); 3353 if (ret) { 3354 if (ret != -EOPNOTSUPP) 3355 btrfs_abort_transaction(trans, 3356 root, ret); 3357 btrfs_end_transaction(trans, root); 3358 goto out; 3359 } 3360 3361 ret = btrfs_insert_empty_item(trans, root, path, 3362 &new_key, size); 3363 if (ret) { 3364 btrfs_abort_transaction(trans, root, 3365 ret); 3366 btrfs_end_transaction(trans, root); 3367 goto out; 3368 } 3369 3370 leaf = path->nodes[0]; 3371 slot = path->slots[0]; 3372 write_extent_buffer(leaf, buf, 3373 btrfs_item_ptr_offset(leaf, slot), 3374 size); 3375 3376 extent = btrfs_item_ptr(leaf, slot, 3377 struct btrfs_file_extent_item); 3378 3379 /* disko == 0 means it's a hole */ 3380 if (!disko) 3381 datao = 0; 3382 3383 btrfs_set_file_extent_offset(leaf, extent, 3384 datao); 3385 btrfs_set_file_extent_num_bytes(leaf, extent, 3386 datal); 3387 3388 /* 3389 * We need to look up the roots that point at 3390 * this bytenr and see if the new root does. If 3391 * it does not we need to make sure we update 3392 * quotas appropriately. 
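* check_ref() returns 1 ("no quota accounting needed") when quotas
* are disabled or when this root already holds a reference to the
* extent at disko, so the new reference added below is not charged
* to the destination qgroup a second time.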
3393 */ 3394 if (disko && root != BTRFS_I(src)->root && 3395 disko != last_disko) { 3396 no_quota = check_ref(trans, root, 3397 disko); 3398 if (no_quota < 0) { 3399 btrfs_abort_transaction(trans, 3400 root, 3401 ret); 3402 btrfs_end_transaction(trans, 3403 root); 3404 ret = no_quota; 3405 goto out; 3406 } 3407 } 3408 3409 if (disko) { 3410 inode_add_bytes(inode, datal); 3411 ret = btrfs_inc_extent_ref(trans, root, 3412 disko, diskl, 0, 3413 root->root_key.objectid, 3414 btrfs_ino(inode), 3415 new_key.offset - datao, 3416 no_quota); 3417 if (ret) { 3418 btrfs_abort_transaction(trans, 3419 root, 3420 ret); 3421 btrfs_end_transaction(trans, 3422 root); 3423 goto out; 3424 3425 } 3426 } 3427 } else if (type == BTRFS_FILE_EXTENT_INLINE) { 3428 u64 skip = 0; 3429 u64 trim = 0; 3430 u64 aligned_end = 0; 3431 3432 if (off > key.offset) { 3433 skip = off - key.offset; 3434 new_key.offset += skip; 3435 } 3436 3437 if (key.offset + datal > off + len) 3438 trim = key.offset + datal - (off + len); 3439 3440 if (comp && (skip || trim)) { 3441 ret = -EINVAL; 3442 btrfs_end_transaction(trans, root); 3443 goto out; 3444 } 3445 size -= skip + trim; 3446 datal -= skip + trim; 3447 3448 aligned_end = ALIGN(new_key.offset + datal, 3449 root->sectorsize); 3450 ret = btrfs_drop_extents(trans, root, inode, 3451 drop_start, 3452 aligned_end, 3453 1); 3454 if (ret) { 3455 if (ret != -EOPNOTSUPP) 3456 btrfs_abort_transaction(trans, 3457 root, ret); 3458 btrfs_end_transaction(trans, root); 3459 goto out; 3460 } 3461 3462 ret = btrfs_insert_empty_item(trans, root, path, 3463 &new_key, size); 3464 if (ret) { 3465 btrfs_abort_transaction(trans, root, 3466 ret); 3467 btrfs_end_transaction(trans, root); 3468 goto out; 3469 } 3470 3471 if (skip) { 3472 u32 start = 3473 btrfs_file_extent_calc_inline_size(0); 3474 memmove(buf+start, buf+start+skip, 3475 datal); 3476 } 3477 3478 leaf = path->nodes[0]; 3479 slot = path->slots[0]; 3480 write_extent_buffer(leaf, buf, 3481 btrfs_item_ptr_offset(leaf, slot), 3482 size); 3483 inode_add_bytes(inode, datal); 3484 } 3485 3486 /* If we have an implicit hole (NO_HOLES feature). */ 3487 if (drop_start < new_key.offset) 3488 clone_update_extent_map(inode, trans, 3489 NULL, drop_start, 3490 new_key.offset - drop_start); 3491 3492 clone_update_extent_map(inode, trans, path, 0, 0); 3493 3494 btrfs_mark_buffer_dirty(leaf); 3495 btrfs_release_path(path); 3496 3497 last_dest_end = ALIGN(new_key.offset + datal, 3498 root->sectorsize); 3499 ret = clone_finish_inode_update(trans, inode, 3500 last_dest_end, 3501 destoff, olen); 3502 if (ret) 3503 goto out; 3504 if (new_key.offset + datal >= destoff + len) 3505 break; 3506 } 3507 btrfs_release_path(path); 3508 key.offset++; 3509 } 3510 ret = 0; 3511 3512 if (last_dest_end < destoff + len) { 3513 /* 3514 * We have an implicit hole (NO_HOLES feature is enabled) that 3515 * fully or partially overlaps our cloning range at its end. 
3516 */ 3517 btrfs_release_path(path); 3518 3519 /* 3520 * 1 - remove extent(s) 3521 * 1 - inode update 3522 */ 3523 trans = btrfs_start_transaction(root, 2); 3524 if (IS_ERR(trans)) { 3525 ret = PTR_ERR(trans); 3526 goto out; 3527 } 3528 ret = btrfs_drop_extents(trans, root, inode, 3529 last_dest_end, destoff + len, 1); 3530 if (ret) { 3531 if (ret != -EOPNOTSUPP) 3532 btrfs_abort_transaction(trans, root, ret); 3533 btrfs_end_transaction(trans, root); 3534 goto out; 3535 } 3536 clone_update_extent_map(inode, trans, NULL, last_dest_end, 3537 destoff + len - last_dest_end); 3538 ret = clone_finish_inode_update(trans, inode, destoff + len, 3539 destoff, olen); 3540 } 3541 3542 out: 3543 btrfs_free_path(path); 3544 vfree(buf); 3545 return ret; 3546 } 3547 3548 static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, 3549 u64 off, u64 olen, u64 destoff) 3550 { 3551 struct inode *inode = file_inode(file); 3552 struct btrfs_root *root = BTRFS_I(inode)->root; 3553 struct fd src_file; 3554 struct inode *src; 3555 int ret; 3556 u64 len = olen; 3557 u64 bs = root->fs_info->sb->s_blocksize; 3558 int same_inode = 0; 3559 3560 /* 3561 * TODO: 3562 * - split compressed inline extents. annoying: we need to 3563 * decompress into destination's address_space (the file offset 3564 * may change, so source mapping won't do), then recompress (or 3565 * otherwise reinsert) a subrange. 3566 * 3567 * - split destination inode's inline extents. The inline extents can 3568 * be either compressed or non-compressed. 3569 */ 3570 3571 /* the destination must be opened for writing */ 3572 if (!(file->f_mode & FMODE_WRITE) || (file->f_flags & O_APPEND)) 3573 return -EINVAL; 3574 3575 if (btrfs_root_readonly(root)) 3576 return -EROFS; 3577 3578 ret = mnt_want_write_file(file); 3579 if (ret) 3580 return ret; 3581 3582 src_file = fdget(srcfd); 3583 if (!src_file.file) { 3584 ret = -EBADF; 3585 goto out_drop_write; 3586 } 3587 3588 ret = -EXDEV; 3589 if (src_file.file->f_path.mnt != file->f_path.mnt) 3590 goto out_fput; 3591 3592 src = file_inode(src_file.file); 3593 3594 ret = -EINVAL; 3595 if (src == inode) 3596 same_inode = 1; 3597 3598 /* the src must be open for reading */ 3599 if (!(src_file.file->f_mode & FMODE_READ)) 3600 goto out_fput; 3601 3602 /* don't make the dst file partly checksummed */ 3603 if ((BTRFS_I(src)->flags & BTRFS_INODE_NODATASUM) != 3604 (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) 3605 goto out_fput; 3606 3607 ret = -EISDIR; 3608 if (S_ISDIR(src->i_mode) || S_ISDIR(inode->i_mode)) 3609 goto out_fput; 3610 3611 ret = -EXDEV; 3612 if (src->i_sb != inode->i_sb) 3613 goto out_fput; 3614 3615 if (!same_inode) { 3616 if (inode < src) { 3617 mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT); 3618 mutex_lock_nested(&src->i_mutex, I_MUTEX_CHILD); 3619 } else { 3620 mutex_lock_nested(&src->i_mutex, I_MUTEX_PARENT); 3621 mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD); 3622 } 3623 } else { 3624 mutex_lock(&src->i_mutex); 3625 } 3626 3627 /* determine range to clone */ 3628 ret = -EINVAL; 3629 if (off + len > src->i_size || off + len < off) 3630 goto out_unlock; 3631 if (len == 0) 3632 olen = len = src->i_size - off; 3633 /* if we extend to eof, continue to block boundary */ 3634 if (off + len == src->i_size) 3635 len = ALIGN(src->i_size, bs) - off; 3636 3637 /* verify the end result is block aligned */ 3638 if (!IS_ALIGNED(off, bs) || !IS_ALIGNED(off + len, bs) || 3639 !IS_ALIGNED(destoff, bs)) 3640 goto out_unlock; 3641 3642 /* verify if ranges are overlapped within the same 
file */ 3643 if (same_inode) { 3644 if (destoff + len > off && destoff < off + len) 3645 goto out_unlock; 3646 } 3647 3648 if (destoff > inode->i_size) { 3649 ret = btrfs_cont_expand(inode, inode->i_size, destoff); 3650 if (ret) 3651 goto out_unlock; 3652 } 3653 3654 /* 3655 * Lock the target range too. Right after we replace the file extent 3656 * items in the fs tree (which now point to the cloned data), we might 3657 * have a worker replace them with extent items relative to a write 3658 * operation that was issued before this clone operation (i.e. confront 3659 * with inode.c:btrfs_finish_ordered_io). 3660 */ 3661 if (same_inode) { 3662 u64 lock_start = min_t(u64, off, destoff); 3663 u64 lock_len = max_t(u64, off, destoff) + len - lock_start; 3664 3665 lock_extent_range(src, lock_start, lock_len); 3666 } else { 3667 lock_extent_range(src, off, len); 3668 lock_extent_range(inode, destoff, len); 3669 } 3670 3671 ret = btrfs_clone(src, inode, off, olen, len, destoff); 3672 3673 if (same_inode) { 3674 u64 lock_start = min_t(u64, off, destoff); 3675 u64 lock_end = max_t(u64, off, destoff) + len - 1; 3676 3677 unlock_extent(&BTRFS_I(src)->io_tree, lock_start, lock_end); 3678 } else { 3679 unlock_extent(&BTRFS_I(src)->io_tree, off, off + len - 1); 3680 unlock_extent(&BTRFS_I(inode)->io_tree, destoff, 3681 destoff + len - 1); 3682 } 3683 /* 3684 * Truncate page cache pages so that future reads will see the cloned 3685 * data immediately and not the previous data. 3686 */ 3687 truncate_inode_pages_range(&inode->i_data, destoff, 3688 PAGE_CACHE_ALIGN(destoff + len) - 1); 3689 out_unlock: 3690 if (!same_inode) { 3691 if (inode < src) { 3692 mutex_unlock(&src->i_mutex); 3693 mutex_unlock(&inode->i_mutex); 3694 } else { 3695 mutex_unlock(&inode->i_mutex); 3696 mutex_unlock(&src->i_mutex); 3697 } 3698 } else { 3699 mutex_unlock(&src->i_mutex); 3700 } 3701 out_fput: 3702 fdput(src_file); 3703 out_drop_write: 3704 mnt_drop_write_file(file); 3705 return ret; 3706 } 3707 3708 static long btrfs_ioctl_clone_range(struct file *file, void __user *argp) 3709 { 3710 struct btrfs_ioctl_clone_range_args args; 3711 3712 if (copy_from_user(&args, argp, sizeof(args))) 3713 return -EFAULT; 3714 return btrfs_ioctl_clone(file, args.src_fd, args.src_offset, 3715 args.src_length, args.dest_offset); 3716 } 3717 3718 /* 3719 * there are many ways the trans_start and trans_end ioctls can lead 3720 * to deadlocks. They should only be used by applications that 3721 * basically own the machine, and have a very in depth understanding 3722 * of all the possible deadlocks and enospc problems. 
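* Holding a transaction open from user space pins the running
* transaction and can stall every other writer on the filesystem,
* which is why CAP_SYS_ADMIN is required below.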
3723 */ 3724 static long btrfs_ioctl_trans_start(struct file *file) 3725 { 3726 struct inode *inode = file_inode(file); 3727 struct btrfs_root *root = BTRFS_I(inode)->root; 3728 struct btrfs_trans_handle *trans; 3729 int ret; 3730 3731 ret = -EPERM; 3732 if (!capable(CAP_SYS_ADMIN)) 3733 goto out; 3734 3735 ret = -EINPROGRESS; 3736 if (file->private_data) 3737 goto out; 3738 3739 ret = -EROFS; 3740 if (btrfs_root_readonly(root)) 3741 goto out; 3742 3743 ret = mnt_want_write_file(file); 3744 if (ret) 3745 goto out; 3746 3747 atomic_inc(&root->fs_info->open_ioctl_trans); 3748 3749 ret = -ENOMEM; 3750 trans = btrfs_start_ioctl_transaction(root); 3751 if (IS_ERR(trans)) 3752 goto out_drop; 3753 3754 file->private_data = trans; 3755 return 0; 3756 3757 out_drop: 3758 atomic_dec(&root->fs_info->open_ioctl_trans); 3759 mnt_drop_write_file(file); 3760 out: 3761 return ret; 3762 } 3763 3764 static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp) 3765 { 3766 struct inode *inode = file_inode(file); 3767 struct btrfs_root *root = BTRFS_I(inode)->root; 3768 struct btrfs_root *new_root; 3769 struct btrfs_dir_item *di; 3770 struct btrfs_trans_handle *trans; 3771 struct btrfs_path *path; 3772 struct btrfs_key location; 3773 struct btrfs_disk_key disk_key; 3774 u64 objectid = 0; 3775 u64 dir_id; 3776 int ret; 3777 3778 if (!capable(CAP_SYS_ADMIN)) 3779 return -EPERM; 3780 3781 ret = mnt_want_write_file(file); 3782 if (ret) 3783 return ret; 3784 3785 if (copy_from_user(&objectid, argp, sizeof(objectid))) { 3786 ret = -EFAULT; 3787 goto out; 3788 } 3789 3790 if (!objectid) 3791 objectid = BTRFS_FS_TREE_OBJECTID; 3792 3793 location.objectid = objectid; 3794 location.type = BTRFS_ROOT_ITEM_KEY; 3795 location.offset = (u64)-1; 3796 3797 new_root = btrfs_read_fs_root_no_name(root->fs_info, &location); 3798 if (IS_ERR(new_root)) { 3799 ret = PTR_ERR(new_root); 3800 goto out; 3801 } 3802 3803 path = btrfs_alloc_path(); 3804 if (!path) { 3805 ret = -ENOMEM; 3806 goto out; 3807 } 3808 path->leave_spinning = 1; 3809 3810 trans = btrfs_start_transaction(root, 1); 3811 if (IS_ERR(trans)) { 3812 btrfs_free_path(path); 3813 ret = PTR_ERR(trans); 3814 goto out; 3815 } 3816 3817 dir_id = btrfs_super_root_dir(root->fs_info->super_copy); 3818 di = btrfs_lookup_dir_item(trans, root->fs_info->tree_root, path, 3819 dir_id, "default", 7, 1); 3820 if (IS_ERR_OR_NULL(di)) { 3821 btrfs_free_path(path); 3822 btrfs_end_transaction(trans, root); 3823 btrfs_err(new_root->fs_info, "Umm, you don't have the default dir" 3824 "item, this isn't going to work"); 3825 ret = -ENOENT; 3826 goto out; 3827 } 3828 3829 btrfs_cpu_key_to_disk(&disk_key, &new_root->root_key); 3830 btrfs_set_dir_item_key(path->nodes[0], di, &disk_key); 3831 btrfs_mark_buffer_dirty(path->nodes[0]); 3832 btrfs_free_path(path); 3833 3834 btrfs_set_fs_incompat(root->fs_info, DEFAULT_SUBVOL); 3835 btrfs_end_transaction(trans, root); 3836 out: 3837 mnt_drop_write_file(file); 3838 return ret; 3839 } 3840 3841 void btrfs_get_block_group_info(struct list_head *groups_list, 3842 struct btrfs_ioctl_space_info *space) 3843 { 3844 struct btrfs_block_group_cache *block_group; 3845 3846 space->total_bytes = 0; 3847 space->used_bytes = 0; 3848 space->flags = 0; 3849 list_for_each_entry(block_group, groups_list, list) { 3850 space->flags = block_group->flags; 3851 space->total_bytes += block_group->key.offset; 3852 space->used_bytes += 3853 btrfs_block_group_used(&block_group->item); 3854 } 3855 } 3856 3857 static long btrfs_ioctl_space_info(struct btrfs_root 
*root, void __user *arg) 3858 { 3859 struct btrfs_ioctl_space_args space_args; 3860 struct btrfs_ioctl_space_info space; 3861 struct btrfs_ioctl_space_info *dest; 3862 struct btrfs_ioctl_space_info *dest_orig; 3863 struct btrfs_ioctl_space_info __user *user_dest; 3864 struct btrfs_space_info *info; 3865 u64 types[] = {BTRFS_BLOCK_GROUP_DATA, 3866 BTRFS_BLOCK_GROUP_SYSTEM, 3867 BTRFS_BLOCK_GROUP_METADATA, 3868 BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA}; 3869 int num_types = 4; 3870 int alloc_size; 3871 int ret = 0; 3872 u64 slot_count = 0; 3873 int i, c; 3874 3875 if (copy_from_user(&space_args, 3876 (struct btrfs_ioctl_space_args __user *)arg, 3877 sizeof(space_args))) 3878 return -EFAULT; 3879 3880 for (i = 0; i < num_types; i++) { 3881 struct btrfs_space_info *tmp; 3882 3883 info = NULL; 3884 rcu_read_lock(); 3885 list_for_each_entry_rcu(tmp, &root->fs_info->space_info, 3886 list) { 3887 if (tmp->flags == types[i]) { 3888 info = tmp; 3889 break; 3890 } 3891 } 3892 rcu_read_unlock(); 3893 3894 if (!info) 3895 continue; 3896 3897 down_read(&info->groups_sem); 3898 for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) { 3899 if (!list_empty(&info->block_groups[c])) 3900 slot_count++; 3901 } 3902 up_read(&info->groups_sem); 3903 } 3904 3905 /* 3906 * Global block reserve, exported as a space_info 3907 */ 3908 slot_count++; 3909 3910 /* space_slots == 0 means they are asking for a count */ 3911 if (space_args.space_slots == 0) { 3912 space_args.total_spaces = slot_count; 3913 goto out; 3914 } 3915 3916 slot_count = min_t(u64, space_args.space_slots, slot_count); 3917 3918 alloc_size = sizeof(*dest) * slot_count; 3919 3920 /* we generally have at most 6 or so space infos, one for each raid 3921 * level. So, a whole page should be more than enough for everyone 3922 */ 3923 if (alloc_size > PAGE_CACHE_SIZE) 3924 return -ENOMEM; 3925 3926 space_args.total_spaces = 0; 3927 dest = kmalloc(alloc_size, GFP_NOFS); 3928 if (!dest) 3929 return -ENOMEM; 3930 dest_orig = dest; 3931 3932 /* now we have a buffer to copy into */ 3933 for (i = 0; i < num_types; i++) { 3934 struct btrfs_space_info *tmp; 3935 3936 if (!slot_count) 3937 break; 3938 3939 info = NULL; 3940 rcu_read_lock(); 3941 list_for_each_entry_rcu(tmp, &root->fs_info->space_info, 3942 list) { 3943 if (tmp->flags == types[i]) { 3944 info = tmp; 3945 break; 3946 } 3947 } 3948 rcu_read_unlock(); 3949 3950 if (!info) 3951 continue; 3952 down_read(&info->groups_sem); 3953 for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) { 3954 if (!list_empty(&info->block_groups[c])) { 3955 btrfs_get_block_group_info( 3956 &info->block_groups[c], &space); 3957 memcpy(dest, &space, sizeof(space)); 3958 dest++; 3959 space_args.total_spaces++; 3960 slot_count--; 3961 } 3962 if (!slot_count) 3963 break; 3964 } 3965 up_read(&info->groups_sem); 3966 } 3967 3968 /* 3969 * Add global block reserve 3970 */ 3971 if (slot_count) { 3972 struct btrfs_block_rsv *block_rsv = &root->fs_info->global_block_rsv; 3973 3974 spin_lock(&block_rsv->lock); 3975 space.total_bytes = block_rsv->size; 3976 space.used_bytes = block_rsv->size - block_rsv->reserved; 3977 spin_unlock(&block_rsv->lock); 3978 space.flags = BTRFS_SPACE_INFO_GLOBAL_RSV; 3979 memcpy(dest, &space, sizeof(space)); 3980 space_args.total_spaces++; 3981 } 3982 3983 user_dest = (struct btrfs_ioctl_space_info __user *) 3984 (arg + sizeof(struct btrfs_ioctl_space_args)); 3985 3986 if (copy_to_user(user_dest, dest_orig, alloc_size)) 3987 ret = -EFAULT; 3988 3989 kfree(dest_orig); 3990 out: 3991 if (ret == 0 && copy_to_user(arg, 
&space_args, sizeof(space_args))) 3992 ret = -EFAULT; 3993 3994 return ret; 3995 } 3996 3997 /* 3998 * there are many ways the trans_start and trans_end ioctls can lead 3999 * to deadlocks. They should only be used by applications that 4000 * basically own the machine, and have a very in depth understanding 4001 * of all the possible deadlocks and enospc problems. 4002 */ 4003 long btrfs_ioctl_trans_end(struct file *file) 4004 { 4005 struct inode *inode = file_inode(file); 4006 struct btrfs_root *root = BTRFS_I(inode)->root; 4007 struct btrfs_trans_handle *trans; 4008 4009 trans = file->private_data; 4010 if (!trans) 4011 return -EINVAL; 4012 file->private_data = NULL; 4013 4014 btrfs_end_transaction(trans, root); 4015 4016 atomic_dec(&root->fs_info->open_ioctl_trans); 4017 4018 mnt_drop_write_file(file); 4019 return 0; 4020 } 4021 4022 static noinline long btrfs_ioctl_start_sync(struct btrfs_root *root, 4023 void __user *argp) 4024 { 4025 struct btrfs_trans_handle *trans; 4026 u64 transid; 4027 int ret; 4028 4029 trans = btrfs_attach_transaction_barrier(root); 4030 if (IS_ERR(trans)) { 4031 if (PTR_ERR(trans) != -ENOENT) 4032 return PTR_ERR(trans); 4033 4034 /* No running transaction, don't bother */ 4035 transid = root->fs_info->last_trans_committed; 4036 goto out; 4037 } 4038 transid = trans->transid; 4039 ret = btrfs_commit_transaction_async(trans, root, 0); 4040 if (ret) { 4041 btrfs_end_transaction(trans, root); 4042 return ret; 4043 } 4044 out: 4045 if (argp) 4046 if (copy_to_user(argp, &transid, sizeof(transid))) 4047 return -EFAULT; 4048 return 0; 4049 } 4050 4051 static noinline long btrfs_ioctl_wait_sync(struct btrfs_root *root, 4052 void __user *argp) 4053 { 4054 u64 transid; 4055 4056 if (argp) { 4057 if (copy_from_user(&transid, argp, sizeof(transid))) 4058 return -EFAULT; 4059 } else { 4060 transid = 0; /* current trans */ 4061 } 4062 return btrfs_wait_for_commit(root, transid); 4063 } 4064 4065 static long btrfs_ioctl_scrub(struct file *file, void __user *arg) 4066 { 4067 struct btrfs_root *root = BTRFS_I(file_inode(file))->root; 4068 struct btrfs_ioctl_scrub_args *sa; 4069 int ret; 4070 4071 if (!capable(CAP_SYS_ADMIN)) 4072 return -EPERM; 4073 4074 sa = memdup_user(arg, sizeof(*sa)); 4075 if (IS_ERR(sa)) 4076 return PTR_ERR(sa); 4077 4078 if (!(sa->flags & BTRFS_SCRUB_READONLY)) { 4079 ret = mnt_want_write_file(file); 4080 if (ret) 4081 goto out; 4082 } 4083 4084 ret = btrfs_scrub_dev(root->fs_info, sa->devid, sa->start, sa->end, 4085 &sa->progress, sa->flags & BTRFS_SCRUB_READONLY, 4086 0); 4087 4088 if (copy_to_user(arg, sa, sizeof(*sa))) 4089 ret = -EFAULT; 4090 4091 if (!(sa->flags & BTRFS_SCRUB_READONLY)) 4092 mnt_drop_write_file(file); 4093 out: 4094 kfree(sa); 4095 return ret; 4096 } 4097 4098 static long btrfs_ioctl_scrub_cancel(struct btrfs_root *root, void __user *arg) 4099 { 4100 if (!capable(CAP_SYS_ADMIN)) 4101 return -EPERM; 4102 4103 return btrfs_scrub_cancel(root->fs_info); 4104 } 4105 4106 static long btrfs_ioctl_scrub_progress(struct btrfs_root *root, 4107 void __user *arg) 4108 { 4109 struct btrfs_ioctl_scrub_args *sa; 4110 int ret; 4111 4112 if (!capable(CAP_SYS_ADMIN)) 4113 return -EPERM; 4114 4115 sa = memdup_user(arg, sizeof(*sa)); 4116 if (IS_ERR(sa)) 4117 return PTR_ERR(sa); 4118 4119 ret = btrfs_scrub_progress(root, sa->devid, &sa->progress); 4120 4121 if (copy_to_user(arg, sa, sizeof(*sa))) 4122 ret = -EFAULT; 4123 4124 kfree(sa); 4125 return ret; 4126 } 4127 4128 static long btrfs_ioctl_get_dev_stats(struct btrfs_root *root, 4129 void __user 
*arg) 4130 { 4131 struct btrfs_ioctl_get_dev_stats *sa; 4132 int ret; 4133 4134 sa = memdup_user(arg, sizeof(*sa)); 4135 if (IS_ERR(sa)) 4136 return PTR_ERR(sa); 4137 4138 if ((sa->flags & BTRFS_DEV_STATS_RESET) && !capable(CAP_SYS_ADMIN)) { 4139 kfree(sa); 4140 return -EPERM; 4141 } 4142 4143 ret = btrfs_get_dev_stats(root, sa); 4144 4145 if (copy_to_user(arg, sa, sizeof(*sa))) 4146 ret = -EFAULT; 4147 4148 kfree(sa); 4149 return ret; 4150 } 4151 4152 static long btrfs_ioctl_dev_replace(struct btrfs_root *root, void __user *arg) 4153 { 4154 struct btrfs_ioctl_dev_replace_args *p; 4155 int ret; 4156 4157 if (!capable(CAP_SYS_ADMIN)) 4158 return -EPERM; 4159 4160 p = memdup_user(arg, sizeof(*p)); 4161 if (IS_ERR(p)) 4162 return PTR_ERR(p); 4163 4164 switch (p->cmd) { 4165 case BTRFS_IOCTL_DEV_REPLACE_CMD_START: 4166 if (root->fs_info->sb->s_flags & MS_RDONLY) { 4167 ret = -EROFS; 4168 goto out; 4169 } 4170 if (atomic_xchg( 4171 &root->fs_info->mutually_exclusive_operation_running, 4172 1)) { 4173 ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; 4174 } else { 4175 ret = btrfs_dev_replace_start(root, p); 4176 atomic_set( 4177 &root->fs_info->mutually_exclusive_operation_running, 4178 0); 4179 } 4180 break; 4181 case BTRFS_IOCTL_DEV_REPLACE_CMD_STATUS: 4182 btrfs_dev_replace_status(root->fs_info, p); 4183 ret = 0; 4184 break; 4185 case BTRFS_IOCTL_DEV_REPLACE_CMD_CANCEL: 4186 ret = btrfs_dev_replace_cancel(root->fs_info, p); 4187 break; 4188 default: 4189 ret = -EINVAL; 4190 break; 4191 } 4192 4193 if (copy_to_user(arg, p, sizeof(*p))) 4194 ret = -EFAULT; 4195 out: 4196 kfree(p); 4197 return ret; 4198 } 4199 4200 static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg) 4201 { 4202 int ret = 0; 4203 int i; 4204 u64 rel_ptr; 4205 int size; 4206 struct btrfs_ioctl_ino_path_args *ipa = NULL; 4207 struct inode_fs_paths *ipath = NULL; 4208 struct btrfs_path *path; 4209 4210 if (!capable(CAP_DAC_READ_SEARCH)) 4211 return -EPERM; 4212 4213 path = btrfs_alloc_path(); 4214 if (!path) { 4215 ret = -ENOMEM; 4216 goto out; 4217 } 4218 4219 ipa = memdup_user(arg, sizeof(*ipa)); 4220 if (IS_ERR(ipa)) { 4221 ret = PTR_ERR(ipa); 4222 ipa = NULL; 4223 goto out; 4224 } 4225 4226 size = min_t(u32, ipa->size, 4096); 4227 ipath = init_ipath(size, root, path); 4228 if (IS_ERR(ipath)) { 4229 ret = PTR_ERR(ipath); 4230 ipath = NULL; 4231 goto out; 4232 } 4233 4234 ret = paths_from_inode(ipa->inum, ipath); 4235 if (ret < 0) 4236 goto out; 4237 4238 for (i = 0; i < ipath->fspath->elem_cnt; ++i) { 4239 rel_ptr = ipath->fspath->val[i] - 4240 (u64)(unsigned long)ipath->fspath->val; 4241 ipath->fspath->val[i] = rel_ptr; 4242 } 4243 4244 ret = copy_to_user((void *)(unsigned long)ipa->fspath, 4245 (void *)(unsigned long)ipath->fspath, size); 4246 if (ret) { 4247 ret = -EFAULT; 4248 goto out; 4249 } 4250 4251 out: 4252 btrfs_free_path(path); 4253 free_ipath(ipath); 4254 kfree(ipa); 4255 4256 return ret; 4257 } 4258 4259 static int build_ino_list(u64 inum, u64 offset, u64 root, void *ctx) 4260 { 4261 struct btrfs_data_container *inodes = ctx; 4262 const size_t c = 3 * sizeof(u64); 4263 4264 if (inodes->bytes_left >= c) { 4265 inodes->bytes_left -= c; 4266 inodes->val[inodes->elem_cnt] = inum; 4267 inodes->val[inodes->elem_cnt + 1] = offset; 4268 inodes->val[inodes->elem_cnt + 2] = root; 4269 inodes->elem_cnt += 3; 4270 } else { 4271 inodes->bytes_missing += c - inodes->bytes_left; 4272 inodes->bytes_left = 0; 4273 inodes->elem_missed += 3; 4274 } 4275 4276 return 0; 4277 } 4278 4279 static long 
static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root,
				       void __user *arg)
{
	int ret = 0;
	int size;
	struct btrfs_ioctl_logical_ino_args *loi;
	struct btrfs_data_container *inodes = NULL;
	struct btrfs_path *path = NULL;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	loi = memdup_user(arg, sizeof(*loi));
	if (IS_ERR(loi)) {
		ret = PTR_ERR(loi);
		loi = NULL;
		goto out;
	}

	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		goto out;
	}

	size = min_t(u32, loi->size, 64 * 1024);
	inodes = init_data_container(size);
	if (IS_ERR(inodes)) {
		ret = PTR_ERR(inodes);
		inodes = NULL;
		goto out;
	}

	ret = iterate_inodes_from_logical(loi->logical, root->fs_info, path,
					  build_ino_list, inodes);
	if (ret == -EINVAL)
		ret = -ENOENT;
	if (ret < 0)
		goto out;

	ret = copy_to_user((void __user *)(unsigned long)loi->inodes,
			   (void *)(unsigned long)inodes, size);
	if (ret)
		ret = -EFAULT;

out:
	btrfs_free_path(path);
	vfree(inodes);
	kfree(loi);

	return ret;
}

void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock,
			       struct btrfs_ioctl_balance_args *bargs)
{
	struct btrfs_balance_control *bctl = fs_info->balance_ctl;

	bargs->flags = bctl->flags;

	if (atomic_read(&fs_info->balance_running))
		bargs->state |= BTRFS_BALANCE_STATE_RUNNING;
	if (atomic_read(&fs_info->balance_pause_req))
		bargs->state |= BTRFS_BALANCE_STATE_PAUSE_REQ;
	if (atomic_read(&fs_info->balance_cancel_req))
		bargs->state |= BTRFS_BALANCE_STATE_CANCEL_REQ;

	memcpy(&bargs->data, &bctl->data, sizeof(bargs->data));
	memcpy(&bargs->meta, &bctl->meta, sizeof(bargs->meta));
	memcpy(&bargs->sys, &bctl->sys, sizeof(bargs->sys));

	if (lock) {
		spin_lock(&fs_info->balance_lock);
		memcpy(&bargs->stat, &bctl->stat, sizeof(bargs->stat));
		spin_unlock(&fs_info->balance_lock);
	} else {
		memcpy(&bargs->stat, &bctl->stat, sizeof(bargs->stat));
	}
}
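
/*
 * Illustrative userspace sketch (not part of the kernel build): the state
 * and stat fields filled in above are what a BTRFS_IOC_BALANCE_PROGRESS
 * caller sees (see btrfs_ioctl_balance_progress() below).
 *
 *	struct btrfs_ioctl_balance_args ba;
 *
 *	if (ioctl(fd, BTRFS_IOC_BALANCE_PROGRESS, &ba) == 0 &&
 *	    (ba.state & BTRFS_BALANCE_STATE_RUNNING))
 *		printf("%llu out of about %llu chunks balanced\n",
 *		       ba.stat.completed, ba.stat.expected);
 */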
static long btrfs_ioctl_balance(struct file *file, void __user *arg)
{
	struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_ioctl_balance_args *bargs;
	struct btrfs_balance_control *bctl;
	bool need_unlock; /* for mut. excl. ops lock */
	int ret;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	ret = mnt_want_write_file(file);
	if (ret)
		return ret;

again:
	if (!atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1)) {
		mutex_lock(&fs_info->volume_mutex);
		mutex_lock(&fs_info->balance_mutex);
		need_unlock = true;
		goto locked;
	}

	/*
	 * mut. excl. ops lock is locked.  Three possibilities:
	 * (1) some other op is running
	 * (2) balance is running
	 * (3) balance is paused -- special case (think resume)
	 */
	mutex_lock(&fs_info->balance_mutex);
	if (fs_info->balance_ctl) {
		/* this is either (2) or (3) */
		if (!atomic_read(&fs_info->balance_running)) {
			mutex_unlock(&fs_info->balance_mutex);
			if (!mutex_trylock(&fs_info->volume_mutex))
				goto again;
			mutex_lock(&fs_info->balance_mutex);

			if (fs_info->balance_ctl &&
			    !atomic_read(&fs_info->balance_running)) {
				/* this is (3) */
				need_unlock = false;
				goto locked;
			}

			mutex_unlock(&fs_info->balance_mutex);
			mutex_unlock(&fs_info->volume_mutex);
			goto again;
		} else {
			/* this is (2) */
			mutex_unlock(&fs_info->balance_mutex);
			ret = -EINPROGRESS;
			goto out;
		}
	} else {
		/* this is (1) */
		mutex_unlock(&fs_info->balance_mutex);
		ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
		goto out;
	}

locked:
	BUG_ON(!atomic_read(&fs_info->mutually_exclusive_operation_running));

	if (arg) {
		bargs = memdup_user(arg, sizeof(*bargs));
		if (IS_ERR(bargs)) {
			ret = PTR_ERR(bargs);
			goto out_unlock;
		}

		if (bargs->flags & BTRFS_BALANCE_RESUME) {
			if (!fs_info->balance_ctl) {
				ret = -ENOTCONN;
				goto out_bargs;
			}

			bctl = fs_info->balance_ctl;
			spin_lock(&fs_info->balance_lock);
			bctl->flags |= BTRFS_BALANCE_RESUME;
			spin_unlock(&fs_info->balance_lock);

			goto do_balance;
		}
	} else {
		bargs = NULL;
	}

	if (fs_info->balance_ctl) {
		ret = -EINPROGRESS;
		goto out_bargs;
	}

	bctl = kzalloc(sizeof(*bctl), GFP_NOFS);
	if (!bctl) {
		ret = -ENOMEM;
		goto out_bargs;
	}

	bctl->fs_info = fs_info;
	if (arg) {
		memcpy(&bctl->data, &bargs->data, sizeof(bctl->data));
		memcpy(&bctl->meta, &bargs->meta, sizeof(bctl->meta));
		memcpy(&bctl->sys, &bargs->sys, sizeof(bctl->sys));

		bctl->flags = bargs->flags;
	} else {
		/* balance everything - no filters */
		bctl->flags |= BTRFS_BALANCE_TYPE_MASK;
	}

do_balance:
	/*
	 * Ownership of bctl and mutually_exclusive_operation_running
	 * goes to btrfs_balance.  bctl is freed in __cancel_balance,
	 * or, if restriper was paused all the way until unmount, in
	 * free_fs_info.  mutually_exclusive_operation_running is
	 * cleared in __cancel_balance.
	 */
	need_unlock = false;

	ret = btrfs_balance(bctl, bargs);

	if (arg) {
		if (copy_to_user(arg, bargs, sizeof(*bargs)))
			ret = -EFAULT;
	}

out_bargs:
	kfree(bargs);
out_unlock:
	mutex_unlock(&fs_info->balance_mutex);
	mutex_unlock(&fs_info->volume_mutex);
	if (need_unlock)
		atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
out:
	mnt_drop_write_file(file);
	return ret;
}

static long btrfs_ioctl_balance_ctl(struct btrfs_root *root, int cmd)
{
	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	switch (cmd) {
	case BTRFS_BALANCE_CTL_PAUSE:
		return btrfs_pause_balance(root->fs_info);
	case BTRFS_BALANCE_CTL_CANCEL:
		return btrfs_cancel_balance(root->fs_info);
	}

	return -EINVAL;
}

static long btrfs_ioctl_balance_progress(struct btrfs_root *root,
					 void __user *arg)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_ioctl_balance_args *bargs;
	int ret = 0;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	mutex_lock(&fs_info->balance_mutex);
	if (!fs_info->balance_ctl) {
		ret = -ENOTCONN;
		goto out;
	}

	bargs = kzalloc(sizeof(*bargs), GFP_NOFS);
	if (!bargs) {
		ret = -ENOMEM;
		goto out;
	}

	update_ioctl_balance_args(fs_info, 1, bargs);

	if (copy_to_user(arg, bargs, sizeof(*bargs)))
		ret = -EFAULT;

	kfree(bargs);
out:
	mutex_unlock(&fs_info->balance_mutex);
	return ret;
}

static long btrfs_ioctl_quota_ctl(struct file *file, void __user *arg)
{
	struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
	struct btrfs_ioctl_quota_ctl_args *sa;
	struct btrfs_trans_handle *trans = NULL;
	int ret;
	int err;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	ret = mnt_want_write_file(file);
	if (ret)
		return ret;

	sa = memdup_user(arg, sizeof(*sa));
	if (IS_ERR(sa)) {
		ret = PTR_ERR(sa);
		goto drop_write;
	}

	down_write(&root->fs_info->subvol_sem);
	trans = btrfs_start_transaction(root->fs_info->tree_root, 2);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		goto out;
	}

	switch (sa->cmd) {
	case BTRFS_QUOTA_CTL_ENABLE:
		ret = btrfs_quota_enable(trans, root->fs_info);
		break;
	case BTRFS_QUOTA_CTL_DISABLE:
		ret = btrfs_quota_disable(trans, root->fs_info);
		break;
	default:
		ret = -EINVAL;
		break;
	}

	err = btrfs_commit_transaction(trans, root->fs_info->tree_root);
	if (err && !ret)
		ret = err;
out:
	kfree(sa);
	up_write(&root->fs_info->subvol_sem);
drop_write:
	mnt_drop_write_file(file);
	return ret;
}
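
/*
 * Illustrative userspace sketch (not part of the kernel build): enabling
 * quotas is a single BTRFS_IOC_QUOTA_CTL call; only the cmd field of the
 * args matters on the way in.
 *
 *	struct btrfs_ioctl_quota_ctl_args qa = {
 *		.cmd = BTRFS_QUOTA_CTL_ENABLE,
 *	};
 *
 *	if (ioctl(fd, BTRFS_IOC_QUOTA_CTL, &qa))
 *		perror("quota enable");
 */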
static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg)
{
	struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
	struct btrfs_ioctl_qgroup_assign_args *sa;
	struct btrfs_trans_handle *trans;
	int ret;
	int err;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	ret = mnt_want_write_file(file);
	if (ret)
		return ret;

	sa = memdup_user(arg, sizeof(*sa));
	if (IS_ERR(sa)) {
		ret = PTR_ERR(sa);
		goto drop_write;
	}

	trans = btrfs_join_transaction(root);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		goto out;
	}

	/* FIXME: check if the IDs really exist */
	if (sa->assign) {
		ret = btrfs_add_qgroup_relation(trans, root->fs_info,
						sa->src, sa->dst);
	} else {
		ret = btrfs_del_qgroup_relation(trans, root->fs_info,
						sa->src, sa->dst);
	}

	err = btrfs_end_transaction(trans, root);
	if (err && !ret)
		ret = err;

out:
	kfree(sa);
drop_write:
	mnt_drop_write_file(file);
	return ret;
}

static long btrfs_ioctl_qgroup_create(struct file *file, void __user *arg)
{
	struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
	struct btrfs_ioctl_qgroup_create_args *sa;
	struct btrfs_trans_handle *trans;
	int ret;
	int err;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	ret = mnt_want_write_file(file);
	if (ret)
		return ret;

	sa = memdup_user(arg, sizeof(*sa));
	if (IS_ERR(sa)) {
		ret = PTR_ERR(sa);
		goto drop_write;
	}

	if (!sa->qgroupid) {
		ret = -EINVAL;
		goto out;
	}

	trans = btrfs_join_transaction(root);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		goto out;
	}

	/* FIXME: check if the IDs really exist */
	if (sa->create) {
		ret = btrfs_create_qgroup(trans, root->fs_info, sa->qgroupid,
					  NULL);
	} else {
		ret = btrfs_remove_qgroup(trans, root->fs_info, sa->qgroupid);
	}

	err = btrfs_end_transaction(trans, root);
	if (err && !ret)
		ret = err;

out:
	kfree(sa);
drop_write:
	mnt_drop_write_file(file);
	return ret;
}

static long btrfs_ioctl_qgroup_limit(struct file *file, void __user *arg)
{
	struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
	struct btrfs_ioctl_qgroup_limit_args *sa;
	struct btrfs_trans_handle *trans;
	int ret;
	int err;
	u64 qgroupid;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	ret = mnt_want_write_file(file);
	if (ret)
		return ret;

	sa = memdup_user(arg, sizeof(*sa));
	if (IS_ERR(sa)) {
		ret = PTR_ERR(sa);
		goto drop_write;
	}

	trans = btrfs_join_transaction(root);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		goto out;
	}

	qgroupid = sa->qgroupid;
	if (!qgroupid) {
		/* take the current subvol as qgroup */
		qgroupid = root->root_key.objectid;
	}

	/* FIXME: check if the IDs really exist */
	ret = btrfs_limit_qgroup(trans, root->fs_info, qgroupid, &sa->lim);

	err = btrfs_end_transaction(trans, root);
	if (err && !ret)
		ret = err;

out:
	kfree(sa);
drop_write:
	mnt_drop_write_file(file);
	return ret;
}

static long btrfs_ioctl_quota_rescan(struct file *file, void __user *arg)
{
	struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
	struct btrfs_ioctl_quota_rescan_args *qsa;
	int ret;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	ret = mnt_want_write_file(file);
	if (ret)
		return ret;

	qsa = memdup_user(arg, sizeof(*qsa));
	if (IS_ERR(qsa)) {
		ret = PTR_ERR(qsa);
		goto drop_write;
	}

	if (qsa->flags) {
		ret = -EINVAL;
		goto out;
	}

	ret = btrfs_qgroup_rescan(root->fs_info);

out:
	kfree(qsa);
drop_write:
	mnt_drop_write_file(file);
	return ret;
}
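
/*
 * Illustrative userspace sketch (not part of the kernel build): a rescan
 * is kicked off with zeroed args (any set flag is rejected above) and can
 * then be waited on with the companion ioctl defined below.
 *
 *	struct btrfs_ioctl_quota_rescan_args qra = {0};
 *
 *	if (ioctl(fd, BTRFS_IOC_QUOTA_RESCAN, &qra) == 0)
 *		ioctl(fd, BTRFS_IOC_QUOTA_RESCAN_WAIT, NULL);
 */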
static long btrfs_ioctl_quota_rescan_status(struct file *file,
					    void __user *arg)
{
	struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
	struct btrfs_ioctl_quota_rescan_args *qsa;
	int ret = 0;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	qsa = kzalloc(sizeof(*qsa), GFP_NOFS);
	if (!qsa)
		return -ENOMEM;

	if (root->fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
		qsa->flags = 1;
		qsa->progress = root->fs_info->qgroup_rescan_progress.objectid;
	}

	if (copy_to_user(arg, qsa, sizeof(*qsa)))
		ret = -EFAULT;

	kfree(qsa);
	return ret;
}

static long btrfs_ioctl_quota_rescan_wait(struct file *file, void __user *arg)
{
	struct btrfs_root *root = BTRFS_I(file_inode(file))->root;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	return btrfs_qgroup_wait_for_completion(root->fs_info);
}

static long _btrfs_ioctl_set_received_subvol(struct file *file,
			struct btrfs_ioctl_received_subvol_args *sa)
{
	struct inode *inode = file_inode(file);
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_root_item *root_item = &root->root_item;
	struct btrfs_trans_handle *trans;
	struct timespec ct = CURRENT_TIME;
	int ret = 0;
	int received_uuid_changed;

	if (!inode_owner_or_capable(inode))
		return -EPERM;

	ret = mnt_want_write_file(file);
	if (ret < 0)
		return ret;

	down_write(&root->fs_info->subvol_sem);

	if (btrfs_ino(inode) != BTRFS_FIRST_FREE_OBJECTID) {
		ret = -EINVAL;
		goto out;
	}

	if (btrfs_root_readonly(root)) {
		ret = -EROFS;
		goto out;
	}

	/*
	 * 1 - root item
	 * 2 - uuid items (received uuid + subvol uuid)
	 */
	trans = btrfs_start_transaction(root, 3);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		trans = NULL;
		goto out;
	}

	sa->rtransid = trans->transid;
	sa->rtime.sec = ct.tv_sec;
	sa->rtime.nsec = ct.tv_nsec;

	received_uuid_changed = memcmp(root_item->received_uuid, sa->uuid,
				       BTRFS_UUID_SIZE);
	if (received_uuid_changed &&
	    !btrfs_is_empty_uuid(root_item->received_uuid))
		btrfs_uuid_tree_rem(trans, root->fs_info->uuid_root,
				    root_item->received_uuid,
				    BTRFS_UUID_KEY_RECEIVED_SUBVOL,
				    root->root_key.objectid);
	memcpy(root_item->received_uuid, sa->uuid, BTRFS_UUID_SIZE);
	btrfs_set_root_stransid(root_item, sa->stransid);
	btrfs_set_root_rtransid(root_item, sa->rtransid);
	btrfs_set_stack_timespec_sec(&root_item->stime, sa->stime.sec);
	btrfs_set_stack_timespec_nsec(&root_item->stime, sa->stime.nsec);
	btrfs_set_stack_timespec_sec(&root_item->rtime, sa->rtime.sec);
	btrfs_set_stack_timespec_nsec(&root_item->rtime, sa->rtime.nsec);

	ret = btrfs_update_root(trans, root->fs_info->tree_root,
				&root->root_key, &root->root_item);
	if (ret < 0) {
		btrfs_end_transaction(trans, root);
		goto out;
	}
	if (received_uuid_changed && !btrfs_is_empty_uuid(sa->uuid)) {
		ret = btrfs_uuid_tree_add(trans, root->fs_info->uuid_root,
					  sa->uuid,
					  BTRFS_UUID_KEY_RECEIVED_SUBVOL,
					  root->root_key.objectid);
		if (ret < 0 && ret != -EEXIST) {
			btrfs_abort_transaction(trans, root, ret);
			goto out;
		}
	}
	ret = btrfs_commit_transaction(trans, root);
	if (ret < 0) {
		btrfs_abort_transaction(trans, root, ret);
		goto out;
	}

out:
	up_write(&root->fs_info->subvol_sem);
	mnt_drop_write_file(file);
	return ret;
}
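
/*
 * Illustrative userspace sketch (not part of the kernel build): a receiver
 * uses this ioctl to stamp a subvolume with the UUID and transid of the
 * send source once the stream has been applied.  stream_uuid,
 * stream_ctransid and subvol_fd are assumptions of the example.
 *
 *	struct btrfs_ioctl_received_subvol_args rs = {0};
 *
 *	memcpy(rs.uuid, stream_uuid, BTRFS_UUID_SIZE);
 *	rs.stransid = stream_ctransid;
 *	if (ioctl(subvol_fd, BTRFS_IOC_SET_RECEIVED_SUBVOL, &rs) == 0)
 *		// rtransid/rtime were filled in by the kernel
 *		printf("received transid %llu\n", rs.rtransid);
 */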

#ifdef CONFIG_64BIT
static long btrfs_ioctl_set_received_subvol_32(struct file *file,
					       void __user *arg)
{
	struct btrfs_ioctl_received_subvol_args_32 *args32 = NULL;
	struct btrfs_ioctl_received_subvol_args *args64 = NULL;
	int ret = 0;

	args32 = memdup_user(arg, sizeof(*args32));
	if (IS_ERR(args32)) {
		ret = PTR_ERR(args32);
		args32 = NULL;
		goto out;
	}

	args64 = kmalloc(sizeof(*args64), GFP_NOFS);
	if (!args64) {
		ret = -ENOMEM;
		goto out;
	}

	memcpy(args64->uuid, args32->uuid, BTRFS_UUID_SIZE);
	args64->stransid = args32->stransid;
	args64->rtransid = args32->rtransid;
	args64->stime.sec = args32->stime.sec;
	args64->stime.nsec = args32->stime.nsec;
	args64->rtime.sec = args32->rtime.sec;
	args64->rtime.nsec = args32->rtime.nsec;
	args64->flags = args32->flags;

	ret = _btrfs_ioctl_set_received_subvol(file, args64);
	if (ret)
		goto out;

	memcpy(args32->uuid, args64->uuid, BTRFS_UUID_SIZE);
	args32->stransid = args64->stransid;
	args32->rtransid = args64->rtransid;
	args32->stime.sec = args64->stime.sec;
	args32->stime.nsec = args64->stime.nsec;
	args32->rtime.sec = args64->rtime.sec;
	args32->rtime.nsec = args64->rtime.nsec;
	args32->flags = args64->flags;

	ret = copy_to_user(arg, args32, sizeof(*args32));
	if (ret)
		ret = -EFAULT;

out:
	kfree(args32);
	kfree(args64);
	return ret;
}
#endif

static long btrfs_ioctl_set_received_subvol(struct file *file,
					    void __user *arg)
{
	struct btrfs_ioctl_received_subvol_args *sa = NULL;
	int ret = 0;

	sa = memdup_user(arg, sizeof(*sa));
	if (IS_ERR(sa)) {
		ret = PTR_ERR(sa);
		sa = NULL;
		goto out;
	}

	ret = _btrfs_ioctl_set_received_subvol(file, sa);

	if (ret)
		goto out;

	ret = copy_to_user(arg, sa, sizeof(*sa));
	if (ret)
		ret = -EFAULT;

out:
	kfree(sa);
	return ret;
}
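
/*
 * Illustrative userspace sketch (not part of the kernel build): reading and
 * setting the label (handled by the two functions below) is a pair of
 * fixed-size buffer ioctls.
 *
 *	char label[BTRFS_LABEL_SIZE] = {0};
 *
 *	if (ioctl(fd, BTRFS_IOC_GET_FSLABEL, label) == 0)
 *		printf("label: %s\n", label);
 *	strncpy(label, "backup", BTRFS_LABEL_SIZE - 1);
 *	ioctl(fd, BTRFS_IOC_SET_FSLABEL, label);
 */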
static int btrfs_ioctl_get_fslabel(struct file *file, void __user *arg)
{
	struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
	size_t len;
	int ret;
	char label[BTRFS_LABEL_SIZE];

	spin_lock(&root->fs_info->super_lock);
	memcpy(label, root->fs_info->super_copy->label, BTRFS_LABEL_SIZE);
	spin_unlock(&root->fs_info->super_lock);

	len = strnlen(label, BTRFS_LABEL_SIZE);

	if (len == BTRFS_LABEL_SIZE) {
		btrfs_warn(root->fs_info,
			"label is too long, returning the first %zu bytes",
			--len);
	}

	ret = copy_to_user(arg, label, len);

	return ret ? -EFAULT : 0;
}

static int btrfs_ioctl_set_fslabel(struct file *file, void __user *arg)
{
	struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
	struct btrfs_super_block *super_block = root->fs_info->super_copy;
	struct btrfs_trans_handle *trans;
	char label[BTRFS_LABEL_SIZE];
	int ret;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	if (copy_from_user(label, arg, sizeof(label)))
		return -EFAULT;

	if (strnlen(label, BTRFS_LABEL_SIZE) == BTRFS_LABEL_SIZE) {
		btrfs_err(root->fs_info,
			  "unable to set label with more than %d bytes",
			  BTRFS_LABEL_SIZE - 1);
		return -EINVAL;
	}

	ret = mnt_want_write_file(file);
	if (ret)
		return ret;

	trans = btrfs_start_transaction(root, 0);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		goto out_unlock;
	}

	spin_lock(&root->fs_info->super_lock);
	strcpy(super_block->label, label);
	spin_unlock(&root->fs_info->super_lock);
	ret = btrfs_commit_transaction(trans, root);

out_unlock:
	mnt_drop_write_file(file);
	return ret;
}

#define INIT_FEATURE_FLAGS(suffix) \
	{ .compat_flags = BTRFS_FEATURE_COMPAT_##suffix, \
	  .compat_ro_flags = BTRFS_FEATURE_COMPAT_RO_##suffix, \
	  .incompat_flags = BTRFS_FEATURE_INCOMPAT_##suffix }

static int btrfs_ioctl_get_supported_features(struct file *file,
					      void __user *arg)
{
	static struct btrfs_ioctl_feature_flags features[3] = {
		INIT_FEATURE_FLAGS(SUPP),
		INIT_FEATURE_FLAGS(SAFE_SET),
		INIT_FEATURE_FLAGS(SAFE_CLEAR)
	};

	if (copy_to_user(arg, &features, sizeof(features)))
		return -EFAULT;

	return 0;
}

static int btrfs_ioctl_get_features(struct file *file, void __user *arg)
{
	struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
	struct btrfs_super_block *super_block = root->fs_info->super_copy;
	struct btrfs_ioctl_feature_flags features;

	features.compat_flags = btrfs_super_compat_flags(super_block);
	features.compat_ro_flags = btrfs_super_compat_ro_flags(super_block);
	features.incompat_flags = btrfs_super_incompat_flags(super_block);

	if (copy_to_user(arg, &features, sizeof(features)))
		return -EFAULT;

	return 0;
}

static int check_feature_bits(struct btrfs_root *root,
			      enum btrfs_feature_set set,
			      u64 change_mask, u64 flags, u64 supported_flags,
			      u64 safe_set, u64 safe_clear)
{
	const char *type = btrfs_feature_set_names[set];
	char *names;
	u64 disallowed, unsupported;
	u64 set_mask = flags & change_mask;
	u64 clear_mask = ~flags & change_mask;

	unsupported = set_mask & ~supported_flags;
	if (unsupported) {
		names = btrfs_printable_features(set, unsupported);
		if (names) {
			btrfs_warn(root->fs_info,
			   "this kernel does not support the %s feature bit%s",
			   names, strchr(names, ',') ? "s" : "");
			kfree(names);
		} else
			btrfs_warn(root->fs_info,
			   "this kernel does not support %s bits 0x%llx",
			   type, unsupported);
		return -EOPNOTSUPP;
	}

	disallowed = set_mask & ~safe_set;
	if (disallowed) {
		names = btrfs_printable_features(set, disallowed);
		if (names) {
			btrfs_warn(root->fs_info,
			   "can't set the %s feature bit%s while mounted",
			   names, strchr(names, ',') ? "s" : "");
			kfree(names);
		} else
			btrfs_warn(root->fs_info,
			   "can't set %s bits 0x%llx while mounted",
			   type, disallowed);
		return -EPERM;
	}

	disallowed = clear_mask & ~safe_clear;
	if (disallowed) {
		names = btrfs_printable_features(set, disallowed);
		if (names) {
			btrfs_warn(root->fs_info,
			   "can't clear the %s feature bit%s while mounted",
			   names, strchr(names, ',') ? "s" : "");
			kfree(names);
		} else
			btrfs_warn(root->fs_info,
			   "can't clear %s bits 0x%llx while mounted",
			   type, disallowed);
		return -EPERM;
	}

	return 0;
}

#define check_feature(root, change_mask, flags, mask_base)	\
check_feature_bits(root, FEAT_##mask_base, change_mask, flags,	\
		   BTRFS_FEATURE_ ## mask_base ## _SUPP,	\
		   BTRFS_FEATURE_ ## mask_base ## _SAFE_SET,	\
		   BTRFS_FEATURE_ ## mask_base ## _SAFE_CLEAR)

static int btrfs_ioctl_set_features(struct file *file, void __user *arg)
{
	struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
	struct btrfs_super_block *super_block = root->fs_info->super_copy;
	struct btrfs_ioctl_feature_flags flags[2];
	struct btrfs_trans_handle *trans;
	u64 newflags;
	int ret;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	if (copy_from_user(flags, arg, sizeof(flags)))
		return -EFAULT;

	/* Nothing to do */
	if (!flags[0].compat_flags && !flags[0].compat_ro_flags &&
	    !flags[0].incompat_flags)
		return 0;

	ret = check_feature(root, flags[0].compat_flags,
			    flags[1].compat_flags, COMPAT);
	if (ret)
		return ret;

	ret = check_feature(root, flags[0].compat_ro_flags,
			    flags[1].compat_ro_flags, COMPAT_RO);
	if (ret)
		return ret;

	ret = check_feature(root, flags[0].incompat_flags,
			    flags[1].incompat_flags, INCOMPAT);
	if (ret)
		return ret;

	trans = btrfs_start_transaction(root, 0);
	if (IS_ERR(trans))
		return PTR_ERR(trans);

	spin_lock(&root->fs_info->super_lock);
	newflags = btrfs_super_compat_flags(super_block);
	newflags |= flags[0].compat_flags & flags[1].compat_flags;
	newflags &= ~(flags[0].compat_flags & ~flags[1].compat_flags);
	btrfs_set_super_compat_flags(super_block, newflags);

	newflags = btrfs_super_compat_ro_flags(super_block);
	newflags |= flags[0].compat_ro_flags & flags[1].compat_ro_flags;
	newflags &= ~(flags[0].compat_ro_flags & ~flags[1].compat_ro_flags);
	btrfs_set_super_compat_ro_flags(super_block, newflags);

	newflags = btrfs_super_incompat_flags(super_block);
	newflags |= flags[0].incompat_flags & flags[1].incompat_flags;
	newflags &= ~(flags[0].incompat_flags & ~flags[1].incompat_flags);
	btrfs_set_super_incompat_flags(super_block, newflags);
	spin_unlock(&root->fs_info->super_lock);

	return btrfs_commit_transaction(trans, root);
}
long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
	struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
	void __user *argp = (void __user *)arg;

	switch (cmd) {
	case FS_IOC_GETFLAGS:
		return btrfs_ioctl_getflags(file, argp);
	case FS_IOC_SETFLAGS:
		return btrfs_ioctl_setflags(file, argp);
	case FS_IOC_GETVERSION:
		return btrfs_ioctl_getversion(file, argp);
	case FITRIM:
		return btrfs_ioctl_fitrim(file, argp);
	case BTRFS_IOC_SNAP_CREATE:
		return btrfs_ioctl_snap_create(file, argp, 0);
	case BTRFS_IOC_SNAP_CREATE_V2:
		return btrfs_ioctl_snap_create_v2(file, argp, 0);
	case BTRFS_IOC_SUBVOL_CREATE:
		return btrfs_ioctl_snap_create(file, argp, 1);
	case BTRFS_IOC_SUBVOL_CREATE_V2:
		return btrfs_ioctl_snap_create_v2(file, argp, 1);
	case BTRFS_IOC_SNAP_DESTROY:
		return btrfs_ioctl_snap_destroy(file, argp);
	case BTRFS_IOC_SUBVOL_GETFLAGS:
		return btrfs_ioctl_subvol_getflags(file, argp);
	case BTRFS_IOC_SUBVOL_SETFLAGS:
		return btrfs_ioctl_subvol_setflags(file, argp);
	case BTRFS_IOC_DEFAULT_SUBVOL:
		return btrfs_ioctl_default_subvol(file, argp);
	case BTRFS_IOC_DEFRAG:
		return btrfs_ioctl_defrag(file, NULL);
	case BTRFS_IOC_DEFRAG_RANGE:
		return btrfs_ioctl_defrag(file, argp);
	case BTRFS_IOC_RESIZE:
		return btrfs_ioctl_resize(file, argp);
	case BTRFS_IOC_ADD_DEV:
		return btrfs_ioctl_add_dev(root, argp);
	case BTRFS_IOC_RM_DEV:
		return btrfs_ioctl_rm_dev(file, argp);
	case BTRFS_IOC_FS_INFO:
		return btrfs_ioctl_fs_info(root, argp);
	case BTRFS_IOC_DEV_INFO:
		return btrfs_ioctl_dev_info(root, argp);
	case BTRFS_IOC_BALANCE:
		return btrfs_ioctl_balance(file, NULL);
	case BTRFS_IOC_CLONE:
		return btrfs_ioctl_clone(file, arg, 0, 0, 0);
	case BTRFS_IOC_CLONE_RANGE:
		return btrfs_ioctl_clone_range(file, argp);
	case BTRFS_IOC_TRANS_START:
		return btrfs_ioctl_trans_start(file);
	case BTRFS_IOC_TRANS_END:
		return btrfs_ioctl_trans_end(file);
	case BTRFS_IOC_TREE_SEARCH:
		return btrfs_ioctl_tree_search(file, argp);
	case BTRFS_IOC_TREE_SEARCH_V2:
		return btrfs_ioctl_tree_search_v2(file, argp);
	case BTRFS_IOC_INO_LOOKUP:
		return btrfs_ioctl_ino_lookup(file, argp);
	case BTRFS_IOC_INO_PATHS:
		return btrfs_ioctl_ino_to_path(root, argp);
	case BTRFS_IOC_LOGICAL_INO:
		return btrfs_ioctl_logical_to_ino(root, argp);
	case BTRFS_IOC_SPACE_INFO:
		return btrfs_ioctl_space_info(root, argp);
	case BTRFS_IOC_SYNC: {
		int ret;

		ret = btrfs_start_delalloc_roots(root->fs_info, 0, -1);
		if (ret)
			return ret;
		ret = btrfs_sync_fs(file->f_dentry->d_sb, 1);
		return ret;
	}
	case BTRFS_IOC_START_SYNC:
		return btrfs_ioctl_start_sync(root, argp);
	case BTRFS_IOC_WAIT_SYNC:
		return btrfs_ioctl_wait_sync(root, argp);
	case BTRFS_IOC_SCRUB:
		return btrfs_ioctl_scrub(file, argp);
	case BTRFS_IOC_SCRUB_CANCEL:
		return btrfs_ioctl_scrub_cancel(root, argp);
	case BTRFS_IOC_SCRUB_PROGRESS:
		return btrfs_ioctl_scrub_progress(root, argp);
	case BTRFS_IOC_BALANCE_V2:
		return btrfs_ioctl_balance(file, argp);
	case BTRFS_IOC_BALANCE_CTL:
		return btrfs_ioctl_balance_ctl(root, arg);
	case BTRFS_IOC_BALANCE_PROGRESS:
		return btrfs_ioctl_balance_progress(root, argp);
	case BTRFS_IOC_SET_RECEIVED_SUBVOL:
		return btrfs_ioctl_set_received_subvol(file, argp);
#ifdef CONFIG_64BIT
	case BTRFS_IOC_SET_RECEIVED_SUBVOL_32:
		return btrfs_ioctl_set_received_subvol_32(file, argp);
#endif
	case BTRFS_IOC_SEND:
		return btrfs_ioctl_send(file, argp);
	case BTRFS_IOC_GET_DEV_STATS:
		return btrfs_ioctl_get_dev_stats(root, argp);
	case BTRFS_IOC_QUOTA_CTL:
		return btrfs_ioctl_quota_ctl(file, argp);
	case BTRFS_IOC_QGROUP_ASSIGN:
		return btrfs_ioctl_qgroup_assign(file, argp);
	case BTRFS_IOC_QGROUP_CREATE:
		return btrfs_ioctl_qgroup_create(file, argp);
	case BTRFS_IOC_QGROUP_LIMIT:
		return btrfs_ioctl_qgroup_limit(file, argp);
	case BTRFS_IOC_QUOTA_RESCAN:
		return btrfs_ioctl_quota_rescan(file, argp);
	case BTRFS_IOC_QUOTA_RESCAN_STATUS:
		return btrfs_ioctl_quota_rescan_status(file, argp);
	case BTRFS_IOC_QUOTA_RESCAN_WAIT:
		return btrfs_ioctl_quota_rescan_wait(file, argp);
	case BTRFS_IOC_DEV_REPLACE:
		return btrfs_ioctl_dev_replace(root, argp);
	case BTRFS_IOC_GET_FSLABEL:
		return btrfs_ioctl_get_fslabel(file, argp);
	case BTRFS_IOC_SET_FSLABEL:
		return btrfs_ioctl_set_fslabel(file, argp);
	case BTRFS_IOC_FILE_EXTENT_SAME:
		return btrfs_ioctl_file_extent_same(file, argp);
	case BTRFS_IOC_GET_SUPPORTED_FEATURES:
		return btrfs_ioctl_get_supported_features(file, argp);
	case BTRFS_IOC_GET_FEATURES:
		return btrfs_ioctl_get_features(file, argp);
	case BTRFS_IOC_SET_FEATURES:
		return btrfs_ioctl_set_features(file, argp);
	}

	return -ENOTTY;
}
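
/*
 * Illustrative userspace sketch (not part of the kernel build):
 * BTRFS_IOC_SET_FEATURES, the last case above, takes a pair of flag
 * structs, flags[0] selecting which bits to touch and flags[1] giving
 * their new values; see btrfs_ioctl_set_features().  Setting the
 * extended-iref incompat bit (assuming this kernel reports it in the
 * SAFE_SET element of BTRFS_IOC_GET_SUPPORTED_FEATURES) would look like:
 *
 *	struct btrfs_ioctl_feature_flags f[2] = {{0}, {0}};
 *
 *	f[0].incompat_flags = BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF;
 *	f[1].incompat_flags = BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF;
 *	if (ioctl(fd, BTRFS_IOC_SET_FEATURES, &f))
 *		perror("set features");
 */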