1e2a58ee5SAlex Elder 2602adf40SYehuda Sadeh /* 3602adf40SYehuda Sadeh rbd.c -- Export ceph rados objects as a Linux block device 4602adf40SYehuda Sadeh 5602adf40SYehuda Sadeh 6602adf40SYehuda Sadeh based on drivers/block/osdblk.c: 7602adf40SYehuda Sadeh 8602adf40SYehuda Sadeh Copyright 2009 Red Hat, Inc. 9602adf40SYehuda Sadeh 10602adf40SYehuda Sadeh This program is free software; you can redistribute it and/or modify 11602adf40SYehuda Sadeh it under the terms of the GNU General Public License as published by 12602adf40SYehuda Sadeh the Free Software Foundation. 13602adf40SYehuda Sadeh 14602adf40SYehuda Sadeh This program is distributed in the hope that it will be useful, 15602adf40SYehuda Sadeh but WITHOUT ANY WARRANTY; without even the implied warranty of 16602adf40SYehuda Sadeh MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17602adf40SYehuda Sadeh GNU General Public License for more details. 18602adf40SYehuda Sadeh 19602adf40SYehuda Sadeh You should have received a copy of the GNU General Public License 20602adf40SYehuda Sadeh along with this program; see the file COPYING. If not, write to 21602adf40SYehuda Sadeh the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. 
22602adf40SYehuda Sadeh 23602adf40SYehuda Sadeh 24602adf40SYehuda Sadeh 25dfc5606dSYehuda Sadeh For usage instructions, please refer to: 26602adf40SYehuda Sadeh 27dfc5606dSYehuda Sadeh Documentation/ABI/testing/sysfs-bus-rbd 28602adf40SYehuda Sadeh 29602adf40SYehuda Sadeh */ 30602adf40SYehuda Sadeh 31602adf40SYehuda Sadeh #include <linux/ceph/libceph.h> 32602adf40SYehuda Sadeh #include <linux/ceph/osd_client.h> 33602adf40SYehuda Sadeh #include <linux/ceph/mon_client.h> 34ed95b21aSIlya Dryomov #include <linux/ceph/cls_lock_client.h> 3543df3d35SIlya Dryomov #include <linux/ceph/striper.h> 36602adf40SYehuda Sadeh #include <linux/ceph/decode.h> 3759c2be1eSYehuda Sadeh #include <linux/parser.h> 3830d1cff8SAlex Elder #include <linux/bsearch.h> 39602adf40SYehuda Sadeh 40602adf40SYehuda Sadeh #include <linux/kernel.h> 41602adf40SYehuda Sadeh #include <linux/device.h> 42602adf40SYehuda Sadeh #include <linux/module.h> 437ad18afaSChristoph Hellwig #include <linux/blk-mq.h> 44602adf40SYehuda Sadeh #include <linux/fs.h> 45602adf40SYehuda Sadeh #include <linux/blkdev.h> 461c2a9dfeSAlex Elder #include <linux/slab.h> 47f8a22fc2SIlya Dryomov #include <linux/idr.h> 48bc1ecc65SIlya Dryomov #include <linux/workqueue.h> 49602adf40SYehuda Sadeh 50602adf40SYehuda Sadeh #include "rbd_types.h" 51602adf40SYehuda Sadeh 52aafb230eSAlex Elder #define RBD_DEBUG /* Activate rbd_assert() calls */ 53aafb230eSAlex Elder 54593a9e7bSAlex Elder /* 55a2acd00eSAlex Elder * Increment the given counter and return its updated value. 56a2acd00eSAlex Elder * If the counter is already 0 it will not be incremented. 57a2acd00eSAlex Elder * If the counter is already at its maximum value returns 58a2acd00eSAlex Elder * -EINVAL without updating it. 
59a2acd00eSAlex Elder */ 60a2acd00eSAlex Elder static int atomic_inc_return_safe(atomic_t *v) 61a2acd00eSAlex Elder { 62a2acd00eSAlex Elder unsigned int counter; 63a2acd00eSAlex Elder 64bfc18e38SMark Rutland counter = (unsigned int)atomic_fetch_add_unless(v, 1, 0); 65a2acd00eSAlex Elder if (counter <= (unsigned int)INT_MAX) 66a2acd00eSAlex Elder return (int)counter; 67a2acd00eSAlex Elder 68a2acd00eSAlex Elder atomic_dec(v); 69a2acd00eSAlex Elder 70a2acd00eSAlex Elder return -EINVAL; 71a2acd00eSAlex Elder } 72a2acd00eSAlex Elder 73a2acd00eSAlex Elder /* Decrement the counter. Return the resulting value, or -EINVAL */ 74a2acd00eSAlex Elder static int atomic_dec_return_safe(atomic_t *v) 75a2acd00eSAlex Elder { 76a2acd00eSAlex Elder int counter; 77a2acd00eSAlex Elder 78a2acd00eSAlex Elder counter = atomic_dec_return(v); 79a2acd00eSAlex Elder if (counter >= 0) 80a2acd00eSAlex Elder return counter; 81a2acd00eSAlex Elder 82a2acd00eSAlex Elder atomic_inc(v); 83a2acd00eSAlex Elder 84a2acd00eSAlex Elder return -EINVAL; 85a2acd00eSAlex Elder } 86a2acd00eSAlex Elder 87f0f8cef5SAlex Elder #define RBD_DRV_NAME "rbd" 88602adf40SYehuda Sadeh 897e513d43SIlya Dryomov #define RBD_MINORS_PER_MAJOR 256 907e513d43SIlya Dryomov #define RBD_SINGLE_MAJOR_PART_SHIFT 4 91602adf40SYehuda Sadeh 926d69bb53SIlya Dryomov #define RBD_MAX_PARENT_CHAIN_LEN 16 936d69bb53SIlya Dryomov 94d4b125e9SAlex Elder #define RBD_SNAP_DEV_NAME_PREFIX "snap_" 95d4b125e9SAlex Elder #define RBD_MAX_SNAP_NAME_LEN \ 96d4b125e9SAlex Elder (NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1)) 97d4b125e9SAlex Elder 9835d489f9SAlex Elder #define RBD_MAX_SNAP_COUNT 510 /* allows max snapc to fit in 4KB */ 99602adf40SYehuda Sadeh 100602adf40SYehuda Sadeh #define RBD_SNAP_HEAD_NAME "-" 101602adf40SYehuda Sadeh 1029682fc6dSAlex Elder #define BAD_SNAP_INDEX U32_MAX /* invalid index into snap array */ 1039682fc6dSAlex Elder 1049e15b77dSAlex Elder /* This allows a single page to hold an image name sent by OSD */ 1059e15b77dSAlex 
Elder #define RBD_IMAGE_NAME_LEN_MAX (PAGE_SIZE - sizeof (__le32) - 1) 106589d30e0SAlex Elder #define RBD_IMAGE_ID_LEN_MAX 64 1079e15b77dSAlex Elder 1081e130199SAlex Elder #define RBD_OBJ_PREFIX_LEN_MAX 64 109589d30e0SAlex Elder 110ed95b21aSIlya Dryomov #define RBD_NOTIFY_TIMEOUT 5 /* seconds */ 11199d16943SIlya Dryomov #define RBD_RETRY_DELAY msecs_to_jiffies(1000) 11299d16943SIlya Dryomov 113d889140cSAlex Elder /* Feature bits */ 114d889140cSAlex Elder 1158767b293SIlya Dryomov #define RBD_FEATURE_LAYERING (1ULL<<0) 1168767b293SIlya Dryomov #define RBD_FEATURE_STRIPINGV2 (1ULL<<1) 1178767b293SIlya Dryomov #define RBD_FEATURE_EXCLUSIVE_LOCK (1ULL<<2) 118b9f6d447SIlya Dryomov #define RBD_FEATURE_DEEP_FLATTEN (1ULL<<5) 1198767b293SIlya Dryomov #define RBD_FEATURE_DATA_POOL (1ULL<<7) 120e573427aSIlya Dryomov #define RBD_FEATURE_OPERATIONS (1ULL<<8) 1218767b293SIlya Dryomov 122ed95b21aSIlya Dryomov #define RBD_FEATURES_ALL (RBD_FEATURE_LAYERING | \ 123ed95b21aSIlya Dryomov RBD_FEATURE_STRIPINGV2 | \ 1247e97332eSIlya Dryomov RBD_FEATURE_EXCLUSIVE_LOCK | \ 125b9f6d447SIlya Dryomov RBD_FEATURE_DEEP_FLATTEN | \ 126e573427aSIlya Dryomov RBD_FEATURE_DATA_POOL | \ 127e573427aSIlya Dryomov RBD_FEATURE_OPERATIONS) 128d889140cSAlex Elder 129d889140cSAlex Elder /* Features supported by this (client software) implementation. */ 130d889140cSAlex Elder 131770eba6eSAlex Elder #define RBD_FEATURES_SUPPORTED (RBD_FEATURES_ALL) 132d889140cSAlex Elder 13381a89793SAlex Elder /* 13481a89793SAlex Elder * An RBD device name will be "rbd#", where the "rbd" comes from 13581a89793SAlex Elder * RBD_DRV_NAME above, and # is a unique integer identifier. 
13681a89793SAlex Elder */ 137602adf40SYehuda Sadeh #define DEV_NAME_LEN 32 138602adf40SYehuda Sadeh 139602adf40SYehuda Sadeh /* 140602adf40SYehuda Sadeh * block device image metadata (in-memory version) 141602adf40SYehuda Sadeh */ 142602adf40SYehuda Sadeh struct rbd_image_header { 143f35a4deeSAlex Elder /* These six fields never change for a given rbd image */ 144849b4260SAlex Elder char *object_prefix; 145602adf40SYehuda Sadeh __u8 obj_order; 146f35a4deeSAlex Elder u64 stripe_unit; 147f35a4deeSAlex Elder u64 stripe_count; 1487e97332eSIlya Dryomov s64 data_pool_id; 149f35a4deeSAlex Elder u64 features; /* Might be changeable someday? */ 150602adf40SYehuda Sadeh 151f84344f3SAlex Elder /* The remaining fields need to be updated occasionally */ 152f84344f3SAlex Elder u64 image_size; 153f84344f3SAlex Elder struct ceph_snap_context *snapc; 154f35a4deeSAlex Elder char *snap_names; /* format 1 only */ 155f35a4deeSAlex Elder u64 *snap_sizes; /* format 1 only */ 15659c2be1eSYehuda Sadeh }; 15759c2be1eSYehuda Sadeh 1580d7dbfceSAlex Elder /* 1590d7dbfceSAlex Elder * An rbd image specification. 1600d7dbfceSAlex Elder * 1610d7dbfceSAlex Elder * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely 162c66c6e0cSAlex Elder * identify an image. Each rbd_dev structure includes a pointer to 163c66c6e0cSAlex Elder * an rbd_spec structure that encapsulates this identity. 164c66c6e0cSAlex Elder * 165c66c6e0cSAlex Elder * Each of the id's in an rbd_spec has an associated name. For a 166c66c6e0cSAlex Elder * user-mapped image, the names are supplied and the id's associated 167c66c6e0cSAlex Elder * with them are looked up. For a layered image, a parent image is 168c66c6e0cSAlex Elder * defined by the tuple, and the names are looked up. 169c66c6e0cSAlex Elder * 170c66c6e0cSAlex Elder * An rbd_dev structure contains a parent_spec pointer which is 171c66c6e0cSAlex Elder * non-null if the image it represents is a child in a layered 172c66c6e0cSAlex Elder * image. 
This pointer will refer to the rbd_spec structure used 173c66c6e0cSAlex Elder * by the parent rbd_dev for its own identity (i.e., the structure 174c66c6e0cSAlex Elder * is shared between the parent and child). 175c66c6e0cSAlex Elder * 176c66c6e0cSAlex Elder * Since these structures are populated once, during the discovery 177c66c6e0cSAlex Elder * phase of image construction, they are effectively immutable so 178c66c6e0cSAlex Elder * we make no effort to synchronize access to them. 179c66c6e0cSAlex Elder * 180c66c6e0cSAlex Elder * Note that code herein does not assume the image name is known (it 181c66c6e0cSAlex Elder * could be a null pointer). 1820d7dbfceSAlex Elder */ 1830d7dbfceSAlex Elder struct rbd_spec { 1840d7dbfceSAlex Elder u64 pool_id; 185ecb4dc22SAlex Elder const char *pool_name; 186b26c047bSIlya Dryomov const char *pool_ns; /* NULL if default, never "" */ 1870d7dbfceSAlex Elder 188ecb4dc22SAlex Elder const char *image_id; 189ecb4dc22SAlex Elder const char *image_name; 1900d7dbfceSAlex Elder 1910d7dbfceSAlex Elder u64 snap_id; 192ecb4dc22SAlex Elder const char *snap_name; 1930d7dbfceSAlex Elder 1940d7dbfceSAlex Elder struct kref kref; 1950d7dbfceSAlex Elder }; 1960d7dbfceSAlex Elder 197602adf40SYehuda Sadeh /* 198f0f8cef5SAlex Elder * an instance of the client. multiple devices may share an rbd client. 
199602adf40SYehuda Sadeh */ 200602adf40SYehuda Sadeh struct rbd_client { 201602adf40SYehuda Sadeh struct ceph_client *client; 202602adf40SYehuda Sadeh struct kref kref; 203602adf40SYehuda Sadeh struct list_head node; 204602adf40SYehuda Sadeh }; 205602adf40SYehuda Sadeh 206bf0d5f50SAlex Elder struct rbd_img_request; 207bf0d5f50SAlex Elder 2089969ebc5SAlex Elder enum obj_request_type { 209a1fbb5e7SIlya Dryomov OBJ_REQUEST_NODATA = 1, 2105359a17dSIlya Dryomov OBJ_REQUEST_BIO, /* pointer into provided bio (list) */ 2117e07efb1SIlya Dryomov OBJ_REQUEST_BVECS, /* pointer into provided bio_vec array */ 212afb97888SIlya Dryomov OBJ_REQUEST_OWN_BVECS, /* private bio_vec array, doesn't own pages */ 2139969ebc5SAlex Elder }; 214bf0d5f50SAlex Elder 2156d2940c8SGuangliang Zhao enum obj_operation_type { 216a1fbb5e7SIlya Dryomov OBJ_OP_READ = 1, 2176d2940c8SGuangliang Zhao OBJ_OP_WRITE, 21890e98c52SGuangliang Zhao OBJ_OP_DISCARD, 2196484cbe9SIlya Dryomov OBJ_OP_ZEROOUT, 2206d2940c8SGuangliang Zhao }; 2216d2940c8SGuangliang Zhao 2223da691bfSIlya Dryomov /* 2233da691bfSIlya Dryomov * Writes go through the following state machine to deal with 2243da691bfSIlya Dryomov * layering: 2253da691bfSIlya Dryomov * 22689a59c1cSIlya Dryomov * . . . . . RBD_OBJ_WRITE_GUARD. . . . . . . . . . . . . . 22789a59c1cSIlya Dryomov * . | . 22889a59c1cSIlya Dryomov * . v . 22989a59c1cSIlya Dryomov * . RBD_OBJ_WRITE_READ_FROM_PARENT. . . . 23089a59c1cSIlya Dryomov * . | . . 23189a59c1cSIlya Dryomov * . v v (deep-copyup . 23289a59c1cSIlya Dryomov * (image . RBD_OBJ_WRITE_COPYUP_EMPTY_SNAPC . not needed) . 23389a59c1cSIlya Dryomov * flattened) v | . . 23489a59c1cSIlya Dryomov * . v . . 23589a59c1cSIlya Dryomov * . . . .RBD_OBJ_WRITE_COPYUP_OPS. . . . . (copyup . 23689a59c1cSIlya Dryomov * | not needed) v 23789a59c1cSIlya Dryomov * v . 23889a59c1cSIlya Dryomov * done . . . . . . . . . . . . . . . . . . 
2393da691bfSIlya Dryomov * ^ 2403da691bfSIlya Dryomov * | 2413da691bfSIlya Dryomov * RBD_OBJ_WRITE_FLAT 2423da691bfSIlya Dryomov * 2433da691bfSIlya Dryomov * Writes start in RBD_OBJ_WRITE_GUARD or _FLAT, depending on whether 24489a59c1cSIlya Dryomov * assert_exists guard is needed or not (in some cases it's not needed 24589a59c1cSIlya Dryomov * even if there is a parent). 2463da691bfSIlya Dryomov */ 2473da691bfSIlya Dryomov enum rbd_obj_write_state { 2483da691bfSIlya Dryomov RBD_OBJ_WRITE_FLAT = 1, 2493da691bfSIlya Dryomov RBD_OBJ_WRITE_GUARD, 2503a482501SIlya Dryomov RBD_OBJ_WRITE_READ_FROM_PARENT, 25189a59c1cSIlya Dryomov RBD_OBJ_WRITE_COPYUP_EMPTY_SNAPC, 2523a482501SIlya Dryomov RBD_OBJ_WRITE_COPYUP_OPS, 253926f9b3fSAlex Elder }; 254926f9b3fSAlex Elder 255bf0d5f50SAlex Elder struct rbd_obj_request { 25643df3d35SIlya Dryomov struct ceph_object_extent ex; 257c5b5ef6cSAlex Elder union { 2583da691bfSIlya Dryomov bool tried_parent; /* for reads */ 2593da691bfSIlya Dryomov enum rbd_obj_write_state write_state; /* for writes */ 2603da691bfSIlya Dryomov }; 261bf0d5f50SAlex Elder 262bf0d5f50SAlex Elder struct rbd_img_request *img_request; 26386bd7998SIlya Dryomov struct ceph_file_extent *img_extents; 26486bd7998SIlya Dryomov u32 num_img_extents; 265bf0d5f50SAlex Elder 266788e2df3SAlex Elder union { 2675359a17dSIlya Dryomov struct ceph_bio_iter bio_pos; 268788e2df3SAlex Elder struct { 2697e07efb1SIlya Dryomov struct ceph_bvec_iter bvec_pos; 2707e07efb1SIlya Dryomov u32 bvec_count; 271afb97888SIlya Dryomov u32 bvec_idx; 272788e2df3SAlex Elder }; 273788e2df3SAlex Elder }; 2747e07efb1SIlya Dryomov struct bio_vec *copyup_bvecs; 2757e07efb1SIlya Dryomov u32 copyup_bvec_count; 276bf0d5f50SAlex Elder 277bf0d5f50SAlex Elder struct ceph_osd_request *osd_req; 278bf0d5f50SAlex Elder 279bf0d5f50SAlex Elder u64 xferred; /* bytes transferred */ 2801b83bef2SSage Weil int result; 281bf0d5f50SAlex Elder 282bf0d5f50SAlex Elder struct kref kref; 283bf0d5f50SAlex Elder }; 284bf0d5f50SAlex 
Elder 2850c425248SAlex Elder enum img_req_flags { 2869849e986SAlex Elder IMG_REQ_CHILD, /* initiator: block = 0, child image = 1 */ 287d0b2e944SAlex Elder IMG_REQ_LAYERED, /* ENOENT handling: normal = 0, layered = 1 */ 2880c425248SAlex Elder }; 2890c425248SAlex Elder 290bf0d5f50SAlex Elder struct rbd_img_request { 291bf0d5f50SAlex Elder struct rbd_device *rbd_dev; 2929bb0248dSIlya Dryomov enum obj_operation_type op_type; 293ecc633caSIlya Dryomov enum obj_request_type data_type; 2940c425248SAlex Elder unsigned long flags; 295bf0d5f50SAlex Elder union { 296bf0d5f50SAlex Elder u64 snap_id; /* for reads */ 2979849e986SAlex Elder struct ceph_snap_context *snapc; /* for writes */ 2989849e986SAlex Elder }; 2999849e986SAlex Elder union { 3009849e986SAlex Elder struct request *rq; /* block request */ 3019849e986SAlex Elder struct rbd_obj_request *obj_request; /* obj req initiator */ 302bf0d5f50SAlex Elder }; 30315961b44SIlya Dryomov spinlock_t completion_lock; 30455f27e09SAlex Elder u64 xferred;/* aggregate bytes transferred */ 305a5a337d4SAlex Elder int result; /* first nonzero obj_request result */ 306bf0d5f50SAlex Elder 30743df3d35SIlya Dryomov struct list_head object_extents; /* obj_req.ex structs */ 3087114edacSIlya Dryomov u32 pending_count; 309bf0d5f50SAlex Elder 310bf0d5f50SAlex Elder struct kref kref; 311bf0d5f50SAlex Elder }; 312bf0d5f50SAlex Elder 313bf0d5f50SAlex Elder #define for_each_obj_request(ireq, oreq) \ 31443df3d35SIlya Dryomov list_for_each_entry(oreq, &(ireq)->object_extents, ex.oe_item) 315bf0d5f50SAlex Elder #define for_each_obj_request_safe(ireq, oreq, n) \ 31643df3d35SIlya Dryomov list_for_each_entry_safe(oreq, n, &(ireq)->object_extents, ex.oe_item) 317bf0d5f50SAlex Elder 31899d16943SIlya Dryomov enum rbd_watch_state { 31999d16943SIlya Dryomov RBD_WATCH_STATE_UNREGISTERED, 32099d16943SIlya Dryomov RBD_WATCH_STATE_REGISTERED, 32199d16943SIlya Dryomov RBD_WATCH_STATE_ERROR, 32299d16943SIlya Dryomov }; 32399d16943SIlya Dryomov 324ed95b21aSIlya 
Dryomov enum rbd_lock_state { 325ed95b21aSIlya Dryomov RBD_LOCK_STATE_UNLOCKED, 326ed95b21aSIlya Dryomov RBD_LOCK_STATE_LOCKED, 327ed95b21aSIlya Dryomov RBD_LOCK_STATE_RELEASING, 328ed95b21aSIlya Dryomov }; 329ed95b21aSIlya Dryomov 330ed95b21aSIlya Dryomov /* WatchNotify::ClientId */ 331ed95b21aSIlya Dryomov struct rbd_client_id { 332ed95b21aSIlya Dryomov u64 gid; 333ed95b21aSIlya Dryomov u64 handle; 334ed95b21aSIlya Dryomov }; 335ed95b21aSIlya Dryomov 336f84344f3SAlex Elder struct rbd_mapping { 33799c1f08fSAlex Elder u64 size; 33834b13184SAlex Elder u64 features; 339f84344f3SAlex Elder }; 340f84344f3SAlex Elder 341602adf40SYehuda Sadeh /* 342602adf40SYehuda Sadeh * a single device 343602adf40SYehuda Sadeh */ 344602adf40SYehuda Sadeh struct rbd_device { 345de71a297SAlex Elder int dev_id; /* blkdev unique id */ 346602adf40SYehuda Sadeh 347602adf40SYehuda Sadeh int major; /* blkdev assigned major */ 348dd82fff1SIlya Dryomov int minor; 349602adf40SYehuda Sadeh struct gendisk *disk; /* blkdev's gendisk and rq */ 350602adf40SYehuda Sadeh 351a30b71b9SAlex Elder u32 image_format; /* Either 1 or 2 */ 352602adf40SYehuda Sadeh struct rbd_client *rbd_client; 353602adf40SYehuda Sadeh 354602adf40SYehuda Sadeh char name[DEV_NAME_LEN]; /* blkdev name, e.g. 
rbd3 */ 355602adf40SYehuda Sadeh 356b82d167bSAlex Elder spinlock_t lock; /* queue, flags, open_count */ 357602adf40SYehuda Sadeh 358602adf40SYehuda Sadeh struct rbd_image_header header; 359b82d167bSAlex Elder unsigned long flags; /* possibly lock protected */ 3600d7dbfceSAlex Elder struct rbd_spec *spec; 361d147543dSIlya Dryomov struct rbd_options *opts; 3620d6d1e9cSMike Christie char *config_info; /* add{,_single_major} string */ 363602adf40SYehuda Sadeh 364c41d13a3SIlya Dryomov struct ceph_object_id header_oid; 365922dab61SIlya Dryomov struct ceph_object_locator header_oloc; 366971f839aSAlex Elder 3671643dfa4SIlya Dryomov struct ceph_file_layout layout; /* used for all rbd requests */ 3680903e875SAlex Elder 36999d16943SIlya Dryomov struct mutex watch_mutex; 37099d16943SIlya Dryomov enum rbd_watch_state watch_state; 371922dab61SIlya Dryomov struct ceph_osd_linger_request *watch_handle; 37299d16943SIlya Dryomov u64 watch_cookie; 37399d16943SIlya Dryomov struct delayed_work watch_dwork; 37459c2be1eSYehuda Sadeh 375ed95b21aSIlya Dryomov struct rw_semaphore lock_rwsem; 376ed95b21aSIlya Dryomov enum rbd_lock_state lock_state; 377cbbfb0ffSIlya Dryomov char lock_cookie[32]; 378ed95b21aSIlya Dryomov struct rbd_client_id owner_cid; 379ed95b21aSIlya Dryomov struct work_struct acquired_lock_work; 380ed95b21aSIlya Dryomov struct work_struct released_lock_work; 381ed95b21aSIlya Dryomov struct delayed_work lock_dwork; 382ed95b21aSIlya Dryomov struct work_struct unlock_work; 383ed95b21aSIlya Dryomov wait_queue_head_t lock_waitq; 384ed95b21aSIlya Dryomov 3851643dfa4SIlya Dryomov struct workqueue_struct *task_wq; 386602adf40SYehuda Sadeh 38786b00e0dSAlex Elder struct rbd_spec *parent_spec; 38886b00e0dSAlex Elder u64 parent_overlap; 389a2acd00eSAlex Elder atomic_t parent_ref; 3902f82ee54SAlex Elder struct rbd_device *parent; 39186b00e0dSAlex Elder 3927ad18afaSChristoph Hellwig /* Block layer tags. 
*/ 3937ad18afaSChristoph Hellwig struct blk_mq_tag_set tag_set; 3947ad18afaSChristoph Hellwig 395c666601aSJosh Durgin /* protects updating the header */ 396c666601aSJosh Durgin struct rw_semaphore header_rwsem; 397f84344f3SAlex Elder 398f84344f3SAlex Elder struct rbd_mapping mapping; 399602adf40SYehuda Sadeh 400602adf40SYehuda Sadeh struct list_head node; 401dfc5606dSYehuda Sadeh 402dfc5606dSYehuda Sadeh /* sysfs related */ 403dfc5606dSYehuda Sadeh struct device dev; 404b82d167bSAlex Elder unsigned long open_count; /* protected by lock */ 405dfc5606dSYehuda Sadeh }; 406dfc5606dSYehuda Sadeh 407b82d167bSAlex Elder /* 40887c0fdedSIlya Dryomov * Flag bits for rbd_dev->flags: 40987c0fdedSIlya Dryomov * - REMOVING (which is coupled with rbd_dev->open_count) is protected 41087c0fdedSIlya Dryomov * by rbd_dev->lock 41187c0fdedSIlya Dryomov * - BLACKLISTED is protected by rbd_dev->lock_rwsem 412b82d167bSAlex Elder */ 4136d292906SAlex Elder enum rbd_dev_flags { 4146d292906SAlex Elder RBD_DEV_FLAG_EXISTS, /* mapped snapshot has not been deleted */ 415b82d167bSAlex Elder RBD_DEV_FLAG_REMOVING, /* this mapping is being removed */ 41687c0fdedSIlya Dryomov RBD_DEV_FLAG_BLACKLISTED, /* our ceph_client is blacklisted */ 4176d292906SAlex Elder }; 4186d292906SAlex Elder 419cfbf6377SAlex Elder static DEFINE_MUTEX(client_mutex); /* Serialize client creation */ 420e124a82fSAlex Elder 421602adf40SYehuda Sadeh static LIST_HEAD(rbd_dev_list); /* devices */ 422e124a82fSAlex Elder static DEFINE_SPINLOCK(rbd_dev_list_lock); 423e124a82fSAlex Elder 424602adf40SYehuda Sadeh static LIST_HEAD(rbd_client_list); /* clients */ 425432b8587SAlex Elder static DEFINE_SPINLOCK(rbd_client_list_lock); 426602adf40SYehuda Sadeh 42778c2a44aSAlex Elder /* Slab caches for frequently-allocated structures */ 42878c2a44aSAlex Elder 4291c2a9dfeSAlex Elder static struct kmem_cache *rbd_img_request_cache; 430868311b1SAlex Elder static struct kmem_cache *rbd_obj_request_cache; 4311c2a9dfeSAlex Elder 4329b60e70bSIlya 
Dryomov static int rbd_major; 433f8a22fc2SIlya Dryomov static DEFINE_IDA(rbd_dev_id_ida); 434f8a22fc2SIlya Dryomov 435f5ee37bdSIlya Dryomov static struct workqueue_struct *rbd_wq; 436f5ee37bdSIlya Dryomov 43789a59c1cSIlya Dryomov static struct ceph_snap_context rbd_empty_snapc = { 43889a59c1cSIlya Dryomov .nref = REFCOUNT_INIT(1), 43989a59c1cSIlya Dryomov }; 44089a59c1cSIlya Dryomov 4419b60e70bSIlya Dryomov /* 4423cfa3b16SIlya Dryomov * single-major requires >= 0.75 version of userspace rbd utility. 4439b60e70bSIlya Dryomov */ 4443cfa3b16SIlya Dryomov static bool single_major = true; 4455657a819SJoe Perches module_param(single_major, bool, 0444); 4463cfa3b16SIlya Dryomov MODULE_PARM_DESC(single_major, "Use a single major number for all rbd devices (default: true)"); 4479b60e70bSIlya Dryomov 4487e9586baSGreg Kroah-Hartman static ssize_t add_store(struct bus_type *bus, const char *buf, size_t count); 4497e9586baSGreg Kroah-Hartman static ssize_t remove_store(struct bus_type *bus, const char *buf, 450f0f8cef5SAlex Elder size_t count); 4517e9586baSGreg Kroah-Hartman static ssize_t add_single_major_store(struct bus_type *bus, const char *buf, 452f0f8cef5SAlex Elder size_t count); 4537e9586baSGreg Kroah-Hartman static ssize_t remove_single_major_store(struct bus_type *bus, const char *buf, 4549b60e70bSIlya Dryomov size_t count); 4556d69bb53SIlya Dryomov static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth); 456f0f8cef5SAlex Elder 4579b60e70bSIlya Dryomov static int rbd_dev_id_to_minor(int dev_id) 4589b60e70bSIlya Dryomov { 4597e513d43SIlya Dryomov return dev_id << RBD_SINGLE_MAJOR_PART_SHIFT; 4609b60e70bSIlya Dryomov } 4619b60e70bSIlya Dryomov 4629b60e70bSIlya Dryomov static int minor_to_rbd_dev_id(int minor) 4639b60e70bSIlya Dryomov { 4647e513d43SIlya Dryomov return minor >> RBD_SINGLE_MAJOR_PART_SHIFT; 4659b60e70bSIlya Dryomov } 4669b60e70bSIlya Dryomov 467ed95b21aSIlya Dryomov static bool __rbd_is_lock_owner(struct rbd_device *rbd_dev) 468ed95b21aSIlya 
Dryomov { 469ed95b21aSIlya Dryomov return rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED || 470ed95b21aSIlya Dryomov rbd_dev->lock_state == RBD_LOCK_STATE_RELEASING; 471ed95b21aSIlya Dryomov } 472ed95b21aSIlya Dryomov 473ed95b21aSIlya Dryomov static bool rbd_is_lock_owner(struct rbd_device *rbd_dev) 474ed95b21aSIlya Dryomov { 475ed95b21aSIlya Dryomov bool is_lock_owner; 476ed95b21aSIlya Dryomov 477ed95b21aSIlya Dryomov down_read(&rbd_dev->lock_rwsem); 478ed95b21aSIlya Dryomov is_lock_owner = __rbd_is_lock_owner(rbd_dev); 479ed95b21aSIlya Dryomov up_read(&rbd_dev->lock_rwsem); 480ed95b21aSIlya Dryomov return is_lock_owner; 481ed95b21aSIlya Dryomov } 482ed95b21aSIlya Dryomov 4837e9586baSGreg Kroah-Hartman static ssize_t supported_features_show(struct bus_type *bus, char *buf) 4848767b293SIlya Dryomov { 4858767b293SIlya Dryomov return sprintf(buf, "0x%llx\n", RBD_FEATURES_SUPPORTED); 4868767b293SIlya Dryomov } 4878767b293SIlya Dryomov 4887e9586baSGreg Kroah-Hartman static BUS_ATTR_WO(add); 4897e9586baSGreg Kroah-Hartman static BUS_ATTR_WO(remove); 4907e9586baSGreg Kroah-Hartman static BUS_ATTR_WO(add_single_major); 4917e9586baSGreg Kroah-Hartman static BUS_ATTR_WO(remove_single_major); 4927e9586baSGreg Kroah-Hartman static BUS_ATTR_RO(supported_features); 493b15a21ddSGreg Kroah-Hartman 494b15a21ddSGreg Kroah-Hartman static struct attribute *rbd_bus_attrs[] = { 495b15a21ddSGreg Kroah-Hartman &bus_attr_add.attr, 496b15a21ddSGreg Kroah-Hartman &bus_attr_remove.attr, 4979b60e70bSIlya Dryomov &bus_attr_add_single_major.attr, 4989b60e70bSIlya Dryomov &bus_attr_remove_single_major.attr, 4998767b293SIlya Dryomov &bus_attr_supported_features.attr, 500b15a21ddSGreg Kroah-Hartman NULL, 501f0f8cef5SAlex Elder }; 50292c76dc0SIlya Dryomov 50392c76dc0SIlya Dryomov static umode_t rbd_bus_is_visible(struct kobject *kobj, 50492c76dc0SIlya Dryomov struct attribute *attr, int index) 50592c76dc0SIlya Dryomov { 5069b60e70bSIlya Dryomov if (!single_major && 5079b60e70bSIlya Dryomov (attr == 
&bus_attr_add_single_major.attr || 5089b60e70bSIlya Dryomov attr == &bus_attr_remove_single_major.attr)) 5099b60e70bSIlya Dryomov return 0; 5109b60e70bSIlya Dryomov 51192c76dc0SIlya Dryomov return attr->mode; 51292c76dc0SIlya Dryomov } 51392c76dc0SIlya Dryomov 51492c76dc0SIlya Dryomov static const struct attribute_group rbd_bus_group = { 51592c76dc0SIlya Dryomov .attrs = rbd_bus_attrs, 51692c76dc0SIlya Dryomov .is_visible = rbd_bus_is_visible, 51792c76dc0SIlya Dryomov }; 51892c76dc0SIlya Dryomov __ATTRIBUTE_GROUPS(rbd_bus); 519f0f8cef5SAlex Elder 520f0f8cef5SAlex Elder static struct bus_type rbd_bus_type = { 521f0f8cef5SAlex Elder .name = "rbd", 522b15a21ddSGreg Kroah-Hartman .bus_groups = rbd_bus_groups, 523f0f8cef5SAlex Elder }; 524f0f8cef5SAlex Elder 525f0f8cef5SAlex Elder static void rbd_root_dev_release(struct device *dev) 526f0f8cef5SAlex Elder { 527f0f8cef5SAlex Elder } 528f0f8cef5SAlex Elder 529f0f8cef5SAlex Elder static struct device rbd_root_dev = { 530f0f8cef5SAlex Elder .init_name = "rbd", 531f0f8cef5SAlex Elder .release = rbd_root_dev_release, 532f0f8cef5SAlex Elder }; 533f0f8cef5SAlex Elder 53406ecc6cbSAlex Elder static __printf(2, 3) 53506ecc6cbSAlex Elder void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...) 
53606ecc6cbSAlex Elder { 53706ecc6cbSAlex Elder struct va_format vaf; 53806ecc6cbSAlex Elder va_list args; 53906ecc6cbSAlex Elder 54006ecc6cbSAlex Elder va_start(args, fmt); 54106ecc6cbSAlex Elder vaf.fmt = fmt; 54206ecc6cbSAlex Elder vaf.va = &args; 54306ecc6cbSAlex Elder 54406ecc6cbSAlex Elder if (!rbd_dev) 54506ecc6cbSAlex Elder printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf); 54606ecc6cbSAlex Elder else if (rbd_dev->disk) 54706ecc6cbSAlex Elder printk(KERN_WARNING "%s: %s: %pV\n", 54806ecc6cbSAlex Elder RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf); 54906ecc6cbSAlex Elder else if (rbd_dev->spec && rbd_dev->spec->image_name) 55006ecc6cbSAlex Elder printk(KERN_WARNING "%s: image %s: %pV\n", 55106ecc6cbSAlex Elder RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf); 55206ecc6cbSAlex Elder else if (rbd_dev->spec && rbd_dev->spec->image_id) 55306ecc6cbSAlex Elder printk(KERN_WARNING "%s: id %s: %pV\n", 55406ecc6cbSAlex Elder RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf); 55506ecc6cbSAlex Elder else /* punt */ 55606ecc6cbSAlex Elder printk(KERN_WARNING "%s: rbd_dev %p: %pV\n", 55706ecc6cbSAlex Elder RBD_DRV_NAME, rbd_dev, &vaf); 55806ecc6cbSAlex Elder va_end(args); 55906ecc6cbSAlex Elder } 56006ecc6cbSAlex Elder 561aafb230eSAlex Elder #ifdef RBD_DEBUG 562aafb230eSAlex Elder #define rbd_assert(expr) \ 563aafb230eSAlex Elder if (unlikely(!(expr))) { \ 564aafb230eSAlex Elder printk(KERN_ERR "\nAssertion failure in %s() " \ 565aafb230eSAlex Elder "at line %d:\n\n" \ 566aafb230eSAlex Elder "\trbd_assert(%s);\n\n", \ 567aafb230eSAlex Elder __func__, __LINE__, #expr); \ 568aafb230eSAlex Elder BUG(); \ 569aafb230eSAlex Elder } 570aafb230eSAlex Elder #else /* !RBD_DEBUG */ 571aafb230eSAlex Elder # define rbd_assert(expr) ((void) 0) 572aafb230eSAlex Elder #endif /* !RBD_DEBUG */ 573dfc5606dSYehuda Sadeh 57405a46afdSAlex Elder static void rbd_dev_remove_parent(struct rbd_device *rbd_dev); 5758b3e1a56SAlex Elder 576cc4a38bdSAlex Elder static int rbd_dev_refresh(struct rbd_device 
*rbd_dev); 5772df3fac7SAlex Elder static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev); 578a720ae09SIlya Dryomov static int rbd_dev_header_info(struct rbd_device *rbd_dev); 579e8f59b59SIlya Dryomov static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev); 58054cac61fSAlex Elder static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, 58154cac61fSAlex Elder u64 snap_id); 5822ad3d716SAlex Elder static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id, 5832ad3d716SAlex Elder u8 *order, u64 *snap_size); 5842ad3d716SAlex Elder static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id, 5852ad3d716SAlex Elder u64 *snap_features); 58659c2be1eSYehuda Sadeh 587602adf40SYehuda Sadeh static int rbd_open(struct block_device *bdev, fmode_t mode) 588602adf40SYehuda Sadeh { 589f0f8cef5SAlex Elder struct rbd_device *rbd_dev = bdev->bd_disk->private_data; 590b82d167bSAlex Elder bool removing = false; 591602adf40SYehuda Sadeh 592a14ea269SAlex Elder spin_lock_irq(&rbd_dev->lock); 593b82d167bSAlex Elder if (test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags)) 594b82d167bSAlex Elder removing = true; 595b82d167bSAlex Elder else 596b82d167bSAlex Elder rbd_dev->open_count++; 597a14ea269SAlex Elder spin_unlock_irq(&rbd_dev->lock); 598b82d167bSAlex Elder if (removing) 599b82d167bSAlex Elder return -ENOENT; 600b82d167bSAlex Elder 601c3e946ceSAlex Elder (void) get_device(&rbd_dev->dev); 602340c7a2bSAlex Elder 603602adf40SYehuda Sadeh return 0; 604602adf40SYehuda Sadeh } 605602adf40SYehuda Sadeh 606db2a144bSAl Viro static void rbd_release(struct gendisk *disk, fmode_t mode) 607dfc5606dSYehuda Sadeh { 608dfc5606dSYehuda Sadeh struct rbd_device *rbd_dev = disk->private_data; 609b82d167bSAlex Elder unsigned long open_count_before; 610b82d167bSAlex Elder 611a14ea269SAlex Elder spin_lock_irq(&rbd_dev->lock); 612b82d167bSAlex Elder open_count_before = rbd_dev->open_count--; 613a14ea269SAlex Elder spin_unlock_irq(&rbd_dev->lock); 
614b82d167bSAlex Elder rbd_assert(open_count_before > 0); 615dfc5606dSYehuda Sadeh 616c3e946ceSAlex Elder put_device(&rbd_dev->dev); 617dfc5606dSYehuda Sadeh } 618dfc5606dSYehuda Sadeh 619131fd9f6SGuangliang Zhao static int rbd_ioctl_set_ro(struct rbd_device *rbd_dev, unsigned long arg) 620131fd9f6SGuangliang Zhao { 6211de797bbSIlya Dryomov int ro; 622131fd9f6SGuangliang Zhao 6231de797bbSIlya Dryomov if (get_user(ro, (int __user *)arg)) 624131fd9f6SGuangliang Zhao return -EFAULT; 625131fd9f6SGuangliang Zhao 6261de797bbSIlya Dryomov /* Snapshots can't be marked read-write */ 627131fd9f6SGuangliang Zhao if (rbd_dev->spec->snap_id != CEPH_NOSNAP && !ro) 628131fd9f6SGuangliang Zhao return -EROFS; 629131fd9f6SGuangliang Zhao 6301de797bbSIlya Dryomov /* Let blkdev_roset() handle it */ 6311de797bbSIlya Dryomov return -ENOTTY; 632131fd9f6SGuangliang Zhao } 633131fd9f6SGuangliang Zhao 634131fd9f6SGuangliang Zhao static int rbd_ioctl(struct block_device *bdev, fmode_t mode, 635131fd9f6SGuangliang Zhao unsigned int cmd, unsigned long arg) 636131fd9f6SGuangliang Zhao { 637131fd9f6SGuangliang Zhao struct rbd_device *rbd_dev = bdev->bd_disk->private_data; 6381de797bbSIlya Dryomov int ret; 639131fd9f6SGuangliang Zhao 640131fd9f6SGuangliang Zhao switch (cmd) { 641131fd9f6SGuangliang Zhao case BLKROSET: 642131fd9f6SGuangliang Zhao ret = rbd_ioctl_set_ro(rbd_dev, arg); 643131fd9f6SGuangliang Zhao break; 644131fd9f6SGuangliang Zhao default: 645131fd9f6SGuangliang Zhao ret = -ENOTTY; 646131fd9f6SGuangliang Zhao } 647131fd9f6SGuangliang Zhao 648131fd9f6SGuangliang Zhao return ret; 649131fd9f6SGuangliang Zhao } 650131fd9f6SGuangliang Zhao 651131fd9f6SGuangliang Zhao #ifdef CONFIG_COMPAT 652131fd9f6SGuangliang Zhao static int rbd_compat_ioctl(struct block_device *bdev, fmode_t mode, 653131fd9f6SGuangliang Zhao unsigned int cmd, unsigned long arg) 654131fd9f6SGuangliang Zhao { 655131fd9f6SGuangliang Zhao return rbd_ioctl(bdev, mode, cmd, arg); 656131fd9f6SGuangliang Zhao } 
657131fd9f6SGuangliang Zhao #endif /* CONFIG_COMPAT */ 658131fd9f6SGuangliang Zhao 659602adf40SYehuda Sadeh static const struct block_device_operations rbd_bd_ops = { 660602adf40SYehuda Sadeh .owner = THIS_MODULE, 661602adf40SYehuda Sadeh .open = rbd_open, 662dfc5606dSYehuda Sadeh .release = rbd_release, 663131fd9f6SGuangliang Zhao .ioctl = rbd_ioctl, 664131fd9f6SGuangliang Zhao #ifdef CONFIG_COMPAT 665131fd9f6SGuangliang Zhao .compat_ioctl = rbd_compat_ioctl, 666131fd9f6SGuangliang Zhao #endif 667602adf40SYehuda Sadeh }; 668602adf40SYehuda Sadeh 669602adf40SYehuda Sadeh /* 6707262cfcaSAlex Elder * Initialize an rbd client instance. Success or not, this function 671cfbf6377SAlex Elder * consumes ceph_opts. Caller holds client_mutex. 672602adf40SYehuda Sadeh */ 673f8c38929SAlex Elder static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts) 674602adf40SYehuda Sadeh { 675602adf40SYehuda Sadeh struct rbd_client *rbdc; 676602adf40SYehuda Sadeh int ret = -ENOMEM; 677602adf40SYehuda Sadeh 67837206ee5SAlex Elder dout("%s:\n", __func__); 679602adf40SYehuda Sadeh rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL); 680602adf40SYehuda Sadeh if (!rbdc) 681602adf40SYehuda Sadeh goto out_opt; 682602adf40SYehuda Sadeh 683602adf40SYehuda Sadeh kref_init(&rbdc->kref); 684602adf40SYehuda Sadeh INIT_LIST_HEAD(&rbdc->node); 685602adf40SYehuda Sadeh 68674da4a0fSIlya Dryomov rbdc->client = ceph_create_client(ceph_opts, rbdc); 687602adf40SYehuda Sadeh if (IS_ERR(rbdc->client)) 68808f75463SAlex Elder goto out_rbdc; 68943ae4701SAlex Elder ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */ 690602adf40SYehuda Sadeh 691602adf40SYehuda Sadeh ret = ceph_open_session(rbdc->client); 692602adf40SYehuda Sadeh if (ret < 0) 69308f75463SAlex Elder goto out_client; 694602adf40SYehuda Sadeh 695432b8587SAlex Elder spin_lock(&rbd_client_list_lock); 696602adf40SYehuda Sadeh list_add_tail(&rbdc->node, &rbd_client_list); 697432b8587SAlex Elder 
spin_unlock(&rbd_client_list_lock); 698602adf40SYehuda Sadeh 69937206ee5SAlex Elder dout("%s: rbdc %p\n", __func__, rbdc); 700bc534d86SAlex Elder 701602adf40SYehuda Sadeh return rbdc; 70208f75463SAlex Elder out_client: 703602adf40SYehuda Sadeh ceph_destroy_client(rbdc->client); 70408f75463SAlex Elder out_rbdc: 705602adf40SYehuda Sadeh kfree(rbdc); 706602adf40SYehuda Sadeh out_opt: 70743ae4701SAlex Elder if (ceph_opts) 70843ae4701SAlex Elder ceph_destroy_options(ceph_opts); 70937206ee5SAlex Elder dout("%s: error %d\n", __func__, ret); 71037206ee5SAlex Elder 71128f259b7SVasiliy Kulikov return ERR_PTR(ret); 712602adf40SYehuda Sadeh } 713602adf40SYehuda Sadeh 7142f82ee54SAlex Elder static struct rbd_client *__rbd_get_client(struct rbd_client *rbdc) 7152f82ee54SAlex Elder { 7162f82ee54SAlex Elder kref_get(&rbdc->kref); 7172f82ee54SAlex Elder 7182f82ee54SAlex Elder return rbdc; 7192f82ee54SAlex Elder } 7202f82ee54SAlex Elder 721602adf40SYehuda Sadeh /* 7221f7ba331SAlex Elder * Find a ceph client with specific addr and configuration. If 7231f7ba331SAlex Elder * found, bump its reference count. 
724602adf40SYehuda Sadeh */ 7251f7ba331SAlex Elder static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts) 726602adf40SYehuda Sadeh { 727602adf40SYehuda Sadeh struct rbd_client *client_node; 7281f7ba331SAlex Elder bool found = false; 729602adf40SYehuda Sadeh 73043ae4701SAlex Elder if (ceph_opts->flags & CEPH_OPT_NOSHARE) 731602adf40SYehuda Sadeh return NULL; 732602adf40SYehuda Sadeh 7331f7ba331SAlex Elder spin_lock(&rbd_client_list_lock); 7341f7ba331SAlex Elder list_for_each_entry(client_node, &rbd_client_list, node) { 7351f7ba331SAlex Elder if (!ceph_compare_options(ceph_opts, client_node->client)) { 7362f82ee54SAlex Elder __rbd_get_client(client_node); 7372f82ee54SAlex Elder 7381f7ba331SAlex Elder found = true; 7391f7ba331SAlex Elder break; 7401f7ba331SAlex Elder } 7411f7ba331SAlex Elder } 7421f7ba331SAlex Elder spin_unlock(&rbd_client_list_lock); 7431f7ba331SAlex Elder 7441f7ba331SAlex Elder return found ? client_node : NULL; 745602adf40SYehuda Sadeh } 746602adf40SYehuda Sadeh 747602adf40SYehuda Sadeh /* 748210c104cSIlya Dryomov * (Per device) rbd map options 74959c2be1eSYehuda Sadeh */ 75059c2be1eSYehuda Sadeh enum { 751b5584180SIlya Dryomov Opt_queue_depth, 7520c93e1b7SIlya Dryomov Opt_alloc_size, 75334f55d0bSDongsheng Yang Opt_lock_timeout, 75459c2be1eSYehuda Sadeh Opt_last_int, 75559c2be1eSYehuda Sadeh /* int args above */ 756b26c047bSIlya Dryomov Opt_pool_ns, 75759c2be1eSYehuda Sadeh Opt_last_string, 75859c2be1eSYehuda Sadeh /* string args above */ 759cc0538b6SAlex Elder Opt_read_only, 760cc0538b6SAlex Elder Opt_read_write, 76180de1912SIlya Dryomov Opt_lock_on_read, 762e010dd0aSIlya Dryomov Opt_exclusive, 763d9360540SIlya Dryomov Opt_notrim, 764210c104cSIlya Dryomov Opt_err 76559c2be1eSYehuda Sadeh }; 76659c2be1eSYehuda Sadeh 76743ae4701SAlex Elder static match_table_t rbd_opts_tokens = { 768b5584180SIlya Dryomov {Opt_queue_depth, "queue_depth=%d"}, 7690c93e1b7SIlya Dryomov {Opt_alloc_size, "alloc_size=%d"}, 77034f55d0bSDongsheng Yang 
{Opt_lock_timeout, "lock_timeout=%d"}, 77159c2be1eSYehuda Sadeh /* int args above */ 772b26c047bSIlya Dryomov {Opt_pool_ns, "_pool_ns=%s"}, 77359c2be1eSYehuda Sadeh /* string args above */ 774be466c1cSAlex Elder {Opt_read_only, "read_only"}, 775cc0538b6SAlex Elder {Opt_read_only, "ro"}, /* Alternate spelling */ 776cc0538b6SAlex Elder {Opt_read_write, "read_write"}, 777cc0538b6SAlex Elder {Opt_read_write, "rw"}, /* Alternate spelling */ 77880de1912SIlya Dryomov {Opt_lock_on_read, "lock_on_read"}, 779e010dd0aSIlya Dryomov {Opt_exclusive, "exclusive"}, 780d9360540SIlya Dryomov {Opt_notrim, "notrim"}, 781210c104cSIlya Dryomov {Opt_err, NULL} 78259c2be1eSYehuda Sadeh }; 78359c2be1eSYehuda Sadeh 78498571b5aSAlex Elder struct rbd_options { 785b5584180SIlya Dryomov int queue_depth; 7860c93e1b7SIlya Dryomov int alloc_size; 78734f55d0bSDongsheng Yang unsigned long lock_timeout; 78898571b5aSAlex Elder bool read_only; 78980de1912SIlya Dryomov bool lock_on_read; 790e010dd0aSIlya Dryomov bool exclusive; 791d9360540SIlya Dryomov bool trim; 79298571b5aSAlex Elder }; 79398571b5aSAlex Elder 794b5584180SIlya Dryomov #define RBD_QUEUE_DEPTH_DEFAULT BLKDEV_MAX_RQ 7950c93e1b7SIlya Dryomov #define RBD_ALLOC_SIZE_DEFAULT (64 * 1024) 79634f55d0bSDongsheng Yang #define RBD_LOCK_TIMEOUT_DEFAULT 0 /* no timeout */ 79798571b5aSAlex Elder #define RBD_READ_ONLY_DEFAULT false 79880de1912SIlya Dryomov #define RBD_LOCK_ON_READ_DEFAULT false 799e010dd0aSIlya Dryomov #define RBD_EXCLUSIVE_DEFAULT false 800d9360540SIlya Dryomov #define RBD_TRIM_DEFAULT true 80198571b5aSAlex Elder 802c300156bSIlya Dryomov struct parse_rbd_opts_ctx { 803c300156bSIlya Dryomov struct rbd_spec *spec; 804c300156bSIlya Dryomov struct rbd_options *opts; 805c300156bSIlya Dryomov }; 806c300156bSIlya Dryomov 80759c2be1eSYehuda Sadeh static int parse_rbd_opts_token(char *c, void *private) 80859c2be1eSYehuda Sadeh { 809c300156bSIlya Dryomov struct parse_rbd_opts_ctx *pctx = private; 81059c2be1eSYehuda Sadeh substring_t 
argstr[MAX_OPT_ARGS]; 81159c2be1eSYehuda Sadeh int token, intval, ret; 81259c2be1eSYehuda Sadeh 81343ae4701SAlex Elder token = match_token(c, rbd_opts_tokens, argstr); 81459c2be1eSYehuda Sadeh if (token < Opt_last_int) { 81559c2be1eSYehuda Sadeh ret = match_int(&argstr[0], &intval); 81659c2be1eSYehuda Sadeh if (ret < 0) { 8172f56b6baSIlya Dryomov pr_err("bad option arg (not int) at '%s'\n", c); 81859c2be1eSYehuda Sadeh return ret; 81959c2be1eSYehuda Sadeh } 82059c2be1eSYehuda Sadeh dout("got int token %d val %d\n", token, intval); 82159c2be1eSYehuda Sadeh } else if (token > Opt_last_int && token < Opt_last_string) { 822210c104cSIlya Dryomov dout("got string token %d val %s\n", token, argstr[0].from); 82359c2be1eSYehuda Sadeh } else { 82459c2be1eSYehuda Sadeh dout("got token %d\n", token); 82559c2be1eSYehuda Sadeh } 82659c2be1eSYehuda Sadeh 82759c2be1eSYehuda Sadeh switch (token) { 828b5584180SIlya Dryomov case Opt_queue_depth: 829b5584180SIlya Dryomov if (intval < 1) { 830b5584180SIlya Dryomov pr_err("queue_depth out of range\n"); 831b5584180SIlya Dryomov return -EINVAL; 832b5584180SIlya Dryomov } 833c300156bSIlya Dryomov pctx->opts->queue_depth = intval; 834b5584180SIlya Dryomov break; 8350c93e1b7SIlya Dryomov case Opt_alloc_size: 83616d80c54SIlya Dryomov if (intval < SECTOR_SIZE) { 8370c93e1b7SIlya Dryomov pr_err("alloc_size out of range\n"); 8380c93e1b7SIlya Dryomov return -EINVAL; 8390c93e1b7SIlya Dryomov } 8400c93e1b7SIlya Dryomov if (!is_power_of_2(intval)) { 8410c93e1b7SIlya Dryomov pr_err("alloc_size must be a power of 2\n"); 8420c93e1b7SIlya Dryomov return -EINVAL; 8430c93e1b7SIlya Dryomov } 8440c93e1b7SIlya Dryomov pctx->opts->alloc_size = intval; 8450c93e1b7SIlya Dryomov break; 84634f55d0bSDongsheng Yang case Opt_lock_timeout: 84734f55d0bSDongsheng Yang /* 0 is "wait forever" (i.e. 
infinite timeout) */ 84834f55d0bSDongsheng Yang if (intval < 0 || intval > INT_MAX / 1000) { 84934f55d0bSDongsheng Yang pr_err("lock_timeout out of range\n"); 85034f55d0bSDongsheng Yang return -EINVAL; 85134f55d0bSDongsheng Yang } 852c300156bSIlya Dryomov pctx->opts->lock_timeout = msecs_to_jiffies(intval * 1000); 85334f55d0bSDongsheng Yang break; 854b26c047bSIlya Dryomov case Opt_pool_ns: 855b26c047bSIlya Dryomov kfree(pctx->spec->pool_ns); 856b26c047bSIlya Dryomov pctx->spec->pool_ns = match_strdup(argstr); 857b26c047bSIlya Dryomov if (!pctx->spec->pool_ns) 858b26c047bSIlya Dryomov return -ENOMEM; 85959c2be1eSYehuda Sadeh break; 860cc0538b6SAlex Elder case Opt_read_only: 861c300156bSIlya Dryomov pctx->opts->read_only = true; 862cc0538b6SAlex Elder break; 863cc0538b6SAlex Elder case Opt_read_write: 864c300156bSIlya Dryomov pctx->opts->read_only = false; 865cc0538b6SAlex Elder break; 86680de1912SIlya Dryomov case Opt_lock_on_read: 867c300156bSIlya Dryomov pctx->opts->lock_on_read = true; 86880de1912SIlya Dryomov break; 869e010dd0aSIlya Dryomov case Opt_exclusive: 870c300156bSIlya Dryomov pctx->opts->exclusive = true; 871e010dd0aSIlya Dryomov break; 872d9360540SIlya Dryomov case Opt_notrim: 873c300156bSIlya Dryomov pctx->opts->trim = false; 874d9360540SIlya Dryomov break; 87559c2be1eSYehuda Sadeh default: 876210c104cSIlya Dryomov /* libceph prints "bad option" msg */ 877210c104cSIlya Dryomov return -EINVAL; 87859c2be1eSYehuda Sadeh } 879210c104cSIlya Dryomov 88059c2be1eSYehuda Sadeh return 0; 88159c2be1eSYehuda Sadeh } 88259c2be1eSYehuda Sadeh 8836d2940c8SGuangliang Zhao static char* obj_op_name(enum obj_operation_type op_type) 8846d2940c8SGuangliang Zhao { 8856d2940c8SGuangliang Zhao switch (op_type) { 8866d2940c8SGuangliang Zhao case OBJ_OP_READ: 8876d2940c8SGuangliang Zhao return "read"; 8886d2940c8SGuangliang Zhao case OBJ_OP_WRITE: 8896d2940c8SGuangliang Zhao return "write"; 89090e98c52SGuangliang Zhao case OBJ_OP_DISCARD: 89190e98c52SGuangliang Zhao return 
"discard"; 8926484cbe9SIlya Dryomov case OBJ_OP_ZEROOUT: 8936484cbe9SIlya Dryomov return "zeroout"; 8946d2940c8SGuangliang Zhao default: 8956d2940c8SGuangliang Zhao return "???"; 8966d2940c8SGuangliang Zhao } 8976d2940c8SGuangliang Zhao } 8986d2940c8SGuangliang Zhao 89959c2be1eSYehuda Sadeh /* 900602adf40SYehuda Sadeh * Destroy ceph client 901d23a4b3fSAlex Elder * 902432b8587SAlex Elder * Caller must hold rbd_client_list_lock. 903602adf40SYehuda Sadeh */ 904602adf40SYehuda Sadeh static void rbd_client_release(struct kref *kref) 905602adf40SYehuda Sadeh { 906602adf40SYehuda Sadeh struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref); 907602adf40SYehuda Sadeh 90837206ee5SAlex Elder dout("%s: rbdc %p\n", __func__, rbdc); 909cd9d9f5dSAlex Elder spin_lock(&rbd_client_list_lock); 910602adf40SYehuda Sadeh list_del(&rbdc->node); 911cd9d9f5dSAlex Elder spin_unlock(&rbd_client_list_lock); 912602adf40SYehuda Sadeh 913602adf40SYehuda Sadeh ceph_destroy_client(rbdc->client); 914602adf40SYehuda Sadeh kfree(rbdc); 915602adf40SYehuda Sadeh } 916602adf40SYehuda Sadeh 917602adf40SYehuda Sadeh /* 918602adf40SYehuda Sadeh * Drop reference to ceph client node. If it's not referenced anymore, release 919602adf40SYehuda Sadeh * it. 920602adf40SYehuda Sadeh */ 9219d3997fdSAlex Elder static void rbd_put_client(struct rbd_client *rbdc) 922602adf40SYehuda Sadeh { 923c53d5893SAlex Elder if (rbdc) 9249d3997fdSAlex Elder kref_put(&rbdc->kref, rbd_client_release); 925602adf40SYehuda Sadeh } 926602adf40SYehuda Sadeh 9275feb0d8dSIlya Dryomov /* 9285feb0d8dSIlya Dryomov * Get a ceph client with specific addr and configuration, if one does 9295feb0d8dSIlya Dryomov * not exist create it. Either way, ceph_opts is consumed by this 9305feb0d8dSIlya Dryomov * function. 
9315feb0d8dSIlya Dryomov */ 9325feb0d8dSIlya Dryomov static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts) 9335feb0d8dSIlya Dryomov { 9345feb0d8dSIlya Dryomov struct rbd_client *rbdc; 935dd435855SIlya Dryomov int ret; 9365feb0d8dSIlya Dryomov 9375feb0d8dSIlya Dryomov mutex_lock_nested(&client_mutex, SINGLE_DEPTH_NESTING); 9385feb0d8dSIlya Dryomov rbdc = rbd_client_find(ceph_opts); 939dd435855SIlya Dryomov if (rbdc) { 9405feb0d8dSIlya Dryomov ceph_destroy_options(ceph_opts); 941dd435855SIlya Dryomov 942dd435855SIlya Dryomov /* 943dd435855SIlya Dryomov * Using an existing client. Make sure ->pg_pools is up to 944dd435855SIlya Dryomov * date before we look up the pool id in do_rbd_add(). 945dd435855SIlya Dryomov */ 9469d4a227fSIlya Dryomov ret = ceph_wait_for_latest_osdmap(rbdc->client, 9479d4a227fSIlya Dryomov rbdc->client->options->mount_timeout); 948dd435855SIlya Dryomov if (ret) { 949dd435855SIlya Dryomov rbd_warn(NULL, "failed to get latest osdmap: %d", ret); 950dd435855SIlya Dryomov rbd_put_client(rbdc); 951dd435855SIlya Dryomov rbdc = ERR_PTR(ret); 952dd435855SIlya Dryomov } 953dd435855SIlya Dryomov } else { 9545feb0d8dSIlya Dryomov rbdc = rbd_client_create(ceph_opts); 955dd435855SIlya Dryomov } 9565feb0d8dSIlya Dryomov mutex_unlock(&client_mutex); 9575feb0d8dSIlya Dryomov 9585feb0d8dSIlya Dryomov return rbdc; 9595feb0d8dSIlya Dryomov } 9605feb0d8dSIlya Dryomov 961a30b71b9SAlex Elder static bool rbd_image_format_valid(u32 image_format) 962a30b71b9SAlex Elder { 963a30b71b9SAlex Elder return image_format == 1 || image_format == 2; 964a30b71b9SAlex Elder } 965a30b71b9SAlex Elder 9668e94af8eSAlex Elder static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk) 9678e94af8eSAlex Elder { 968103a150fSAlex Elder size_t size; 969103a150fSAlex Elder u32 snap_count; 970103a150fSAlex Elder 971103a150fSAlex Elder /* The header has to start with the magic rbd header text */ 972103a150fSAlex Elder if (memcmp(&ondisk->text, RBD_HEADER_TEXT, 
sizeof (RBD_HEADER_TEXT))) 973103a150fSAlex Elder return false; 974103a150fSAlex Elder 975db2388b6SAlex Elder /* The bio layer requires at least sector-sized I/O */ 976db2388b6SAlex Elder 977db2388b6SAlex Elder if (ondisk->options.order < SECTOR_SHIFT) 978db2388b6SAlex Elder return false; 979db2388b6SAlex Elder 980db2388b6SAlex Elder /* If we use u64 in a few spots we may be able to loosen this */ 981db2388b6SAlex Elder 982db2388b6SAlex Elder if (ondisk->options.order > 8 * sizeof (int) - 1) 983db2388b6SAlex Elder return false; 984db2388b6SAlex Elder 985103a150fSAlex Elder /* 986103a150fSAlex Elder * The size of a snapshot header has to fit in a size_t, and 987103a150fSAlex Elder * that limits the number of snapshots. 988103a150fSAlex Elder */ 989103a150fSAlex Elder snap_count = le32_to_cpu(ondisk->snap_count); 990103a150fSAlex Elder size = SIZE_MAX - sizeof (struct ceph_snap_context); 991103a150fSAlex Elder if (snap_count > size / sizeof (__le64)) 992103a150fSAlex Elder return false; 993103a150fSAlex Elder 994103a150fSAlex Elder /* 995103a150fSAlex Elder * Not only that, but the size of the entire the snapshot 996103a150fSAlex Elder * header must also be representable in a size_t. 
997103a150fSAlex Elder */ 998103a150fSAlex Elder size -= snap_count * sizeof (__le64); 999103a150fSAlex Elder if ((u64) size < le64_to_cpu(ondisk->snap_names_len)) 1000103a150fSAlex Elder return false; 1001103a150fSAlex Elder 1002103a150fSAlex Elder return true; 10038e94af8eSAlex Elder } 10048e94af8eSAlex Elder 1005602adf40SYehuda Sadeh /* 10065bc3fb17SIlya Dryomov * returns the size of an object in the image 10075bc3fb17SIlya Dryomov */ 10085bc3fb17SIlya Dryomov static u32 rbd_obj_bytes(struct rbd_image_header *header) 10095bc3fb17SIlya Dryomov { 10105bc3fb17SIlya Dryomov return 1U << header->obj_order; 10115bc3fb17SIlya Dryomov } 10125bc3fb17SIlya Dryomov 1013263423f8SIlya Dryomov static void rbd_init_layout(struct rbd_device *rbd_dev) 1014263423f8SIlya Dryomov { 1015263423f8SIlya Dryomov if (rbd_dev->header.stripe_unit == 0 || 1016263423f8SIlya Dryomov rbd_dev->header.stripe_count == 0) { 1017263423f8SIlya Dryomov rbd_dev->header.stripe_unit = rbd_obj_bytes(&rbd_dev->header); 1018263423f8SIlya Dryomov rbd_dev->header.stripe_count = 1; 1019263423f8SIlya Dryomov } 1020263423f8SIlya Dryomov 1021263423f8SIlya Dryomov rbd_dev->layout.stripe_unit = rbd_dev->header.stripe_unit; 1022263423f8SIlya Dryomov rbd_dev->layout.stripe_count = rbd_dev->header.stripe_count; 1023263423f8SIlya Dryomov rbd_dev->layout.object_size = rbd_obj_bytes(&rbd_dev->header); 10247e97332eSIlya Dryomov rbd_dev->layout.pool_id = rbd_dev->header.data_pool_id == CEPH_NOPOOL ? 10257e97332eSIlya Dryomov rbd_dev->spec->pool_id : rbd_dev->header.data_pool_id; 1026263423f8SIlya Dryomov RCU_INIT_POINTER(rbd_dev->layout.pool_ns, NULL); 1027263423f8SIlya Dryomov } 1028263423f8SIlya Dryomov 10295bc3fb17SIlya Dryomov /* 1030bb23e37aSAlex Elder * Fill an rbd image header with information from the given format 1 1031bb23e37aSAlex Elder * on-disk header. 
1032602adf40SYehuda Sadeh */ 1033662518b1SAlex Elder static int rbd_header_from_disk(struct rbd_device *rbd_dev, 10344156d998SAlex Elder struct rbd_image_header_ondisk *ondisk) 1035602adf40SYehuda Sadeh { 1036662518b1SAlex Elder struct rbd_image_header *header = &rbd_dev->header; 1037bb23e37aSAlex Elder bool first_time = header->object_prefix == NULL; 1038bb23e37aSAlex Elder struct ceph_snap_context *snapc; 1039bb23e37aSAlex Elder char *object_prefix = NULL; 1040bb23e37aSAlex Elder char *snap_names = NULL; 1041bb23e37aSAlex Elder u64 *snap_sizes = NULL; 1042ccece235SAlex Elder u32 snap_count; 1043bb23e37aSAlex Elder int ret = -ENOMEM; 1044621901d6SAlex Elder u32 i; 1045602adf40SYehuda Sadeh 1046bb23e37aSAlex Elder /* Allocate this now to avoid having to handle failure below */ 1047103a150fSAlex Elder 1048bb23e37aSAlex Elder if (first_time) { 1049848d796cSIlya Dryomov object_prefix = kstrndup(ondisk->object_prefix, 1050848d796cSIlya Dryomov sizeof(ondisk->object_prefix), 1051848d796cSIlya Dryomov GFP_KERNEL); 1052bb23e37aSAlex Elder if (!object_prefix) 1053602adf40SYehuda Sadeh return -ENOMEM; 1054bb23e37aSAlex Elder } 105500f1f36fSAlex Elder 1056bb23e37aSAlex Elder /* Allocate the snapshot context and fill it in */ 1057d2bb24e5SAlex Elder 1058602adf40SYehuda Sadeh snap_count = le32_to_cpu(ondisk->snap_count); 1059bb23e37aSAlex Elder snapc = ceph_create_snap_context(snap_count, GFP_KERNEL); 1060bb23e37aSAlex Elder if (!snapc) 1061bb23e37aSAlex Elder goto out_err; 1062bb23e37aSAlex Elder snapc->seq = le64_to_cpu(ondisk->snap_seq); 1063602adf40SYehuda Sadeh if (snap_count) { 1064bb23e37aSAlex Elder struct rbd_image_snap_ondisk *snaps; 1065f785cc1dSAlex Elder u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len); 1066f785cc1dSAlex Elder 1067bb23e37aSAlex Elder /* We'll keep a copy of the snapshot names... 
*/ 1068621901d6SAlex Elder 1069f785cc1dSAlex Elder if (snap_names_len > (u64)SIZE_MAX) 1070bb23e37aSAlex Elder goto out_2big; 1071bb23e37aSAlex Elder snap_names = kmalloc(snap_names_len, GFP_KERNEL); 1072bb23e37aSAlex Elder if (!snap_names) 1073602adf40SYehuda Sadeh goto out_err; 1074bb23e37aSAlex Elder 1075bb23e37aSAlex Elder /* ...as well as the array of their sizes. */ 107688a25a5fSMarkus Elfring snap_sizes = kmalloc_array(snap_count, 107788a25a5fSMarkus Elfring sizeof(*header->snap_sizes), 107888a25a5fSMarkus Elfring GFP_KERNEL); 1079bb23e37aSAlex Elder if (!snap_sizes) 1080bb23e37aSAlex Elder goto out_err; 1081bb23e37aSAlex Elder 1082f785cc1dSAlex Elder /* 1083bb23e37aSAlex Elder * Copy the names, and fill in each snapshot's id 1084bb23e37aSAlex Elder * and size. 1085bb23e37aSAlex Elder * 108699a41ebcSAlex Elder * Note that rbd_dev_v1_header_info() guarantees the 1087bb23e37aSAlex Elder * ondisk buffer we're working with has 1088f785cc1dSAlex Elder * snap_names_len bytes beyond the end of the 1089f785cc1dSAlex Elder * snapshot id array, this memcpy() is safe. 
1090f785cc1dSAlex Elder */ 1091bb23e37aSAlex Elder memcpy(snap_names, &ondisk->snaps[snap_count], snap_names_len); 1092bb23e37aSAlex Elder snaps = ondisk->snaps; 1093bb23e37aSAlex Elder for (i = 0; i < snap_count; i++) { 1094bb23e37aSAlex Elder snapc->snaps[i] = le64_to_cpu(snaps[i].id); 1095bb23e37aSAlex Elder snap_sizes[i] = le64_to_cpu(snaps[i].image_size); 1096bb23e37aSAlex Elder } 1097602adf40SYehuda Sadeh } 1098849b4260SAlex Elder 1099bb23e37aSAlex Elder /* We won't fail any more, fill in the header */ 1100bb23e37aSAlex Elder 1101bb23e37aSAlex Elder if (first_time) { 1102bb23e37aSAlex Elder header->object_prefix = object_prefix; 1103602adf40SYehuda Sadeh header->obj_order = ondisk->options.order; 1104263423f8SIlya Dryomov rbd_init_layout(rbd_dev); 1105662518b1SAlex Elder } else { 1106662518b1SAlex Elder ceph_put_snap_context(header->snapc); 1107662518b1SAlex Elder kfree(header->snap_names); 1108662518b1SAlex Elder kfree(header->snap_sizes); 1109bb23e37aSAlex Elder } 11106a52325fSAlex Elder 1111bb23e37aSAlex Elder /* The remaining fields always get updated (when we refresh) */ 1112621901d6SAlex Elder 1113f84344f3SAlex Elder header->image_size = le64_to_cpu(ondisk->image_size); 1114bb23e37aSAlex Elder header->snapc = snapc; 1115bb23e37aSAlex Elder header->snap_names = snap_names; 1116bb23e37aSAlex Elder header->snap_sizes = snap_sizes; 1117468521c1SAlex Elder 1118602adf40SYehuda Sadeh return 0; 1119bb23e37aSAlex Elder out_2big: 1120bb23e37aSAlex Elder ret = -EIO; 11216a52325fSAlex Elder out_err: 1122bb23e37aSAlex Elder kfree(snap_sizes); 1123bb23e37aSAlex Elder kfree(snap_names); 1124bb23e37aSAlex Elder ceph_put_snap_context(snapc); 1125bb23e37aSAlex Elder kfree(object_prefix); 1126ccece235SAlex Elder 1127bb23e37aSAlex Elder return ret; 1128602adf40SYehuda Sadeh } 1129602adf40SYehuda Sadeh 11309682fc6dSAlex Elder static const char *_rbd_dev_v1_snap_name(struct rbd_device *rbd_dev, u32 which) 11319682fc6dSAlex Elder { 11329682fc6dSAlex Elder const char 
*snap_name; 11339682fc6dSAlex Elder 11349682fc6dSAlex Elder rbd_assert(which < rbd_dev->header.snapc->num_snaps); 11359682fc6dSAlex Elder 11369682fc6dSAlex Elder /* Skip over names until we find the one we are looking for */ 11379682fc6dSAlex Elder 11389682fc6dSAlex Elder snap_name = rbd_dev->header.snap_names; 11399682fc6dSAlex Elder while (which--) 11409682fc6dSAlex Elder snap_name += strlen(snap_name) + 1; 11419682fc6dSAlex Elder 11429682fc6dSAlex Elder return kstrdup(snap_name, GFP_KERNEL); 11439682fc6dSAlex Elder } 11449682fc6dSAlex Elder 114530d1cff8SAlex Elder /* 114630d1cff8SAlex Elder * Snapshot id comparison function for use with qsort()/bsearch(). 114730d1cff8SAlex Elder * Note that result is for snapshots in *descending* order. 114830d1cff8SAlex Elder */ 114930d1cff8SAlex Elder static int snapid_compare_reverse(const void *s1, const void *s2) 115030d1cff8SAlex Elder { 115130d1cff8SAlex Elder u64 snap_id1 = *(u64 *)s1; 115230d1cff8SAlex Elder u64 snap_id2 = *(u64 *)s2; 115330d1cff8SAlex Elder 115430d1cff8SAlex Elder if (snap_id1 < snap_id2) 115530d1cff8SAlex Elder return 1; 115630d1cff8SAlex Elder return snap_id1 == snap_id2 ? 0 : -1; 115730d1cff8SAlex Elder } 115830d1cff8SAlex Elder 115930d1cff8SAlex Elder /* 116030d1cff8SAlex Elder * Search a snapshot context to see if the given snapshot id is 116130d1cff8SAlex Elder * present. 116230d1cff8SAlex Elder * 116330d1cff8SAlex Elder * Returns the position of the snapshot id in the array if it's found, 116430d1cff8SAlex Elder * or BAD_SNAP_INDEX otherwise. 116530d1cff8SAlex Elder * 116630d1cff8SAlex Elder * Note: The snapshot array is in kept sorted (by the osd) in 116730d1cff8SAlex Elder * reverse order, highest snapshot id first. 
116830d1cff8SAlex Elder */ 11699682fc6dSAlex Elder static u32 rbd_dev_snap_index(struct rbd_device *rbd_dev, u64 snap_id) 11709682fc6dSAlex Elder { 11719682fc6dSAlex Elder struct ceph_snap_context *snapc = rbd_dev->header.snapc; 117230d1cff8SAlex Elder u64 *found; 11739682fc6dSAlex Elder 117430d1cff8SAlex Elder found = bsearch(&snap_id, &snapc->snaps, snapc->num_snaps, 117530d1cff8SAlex Elder sizeof (snap_id), snapid_compare_reverse); 11769682fc6dSAlex Elder 117730d1cff8SAlex Elder return found ? (u32)(found - &snapc->snaps[0]) : BAD_SNAP_INDEX; 11789682fc6dSAlex Elder } 11799682fc6dSAlex Elder 11802ad3d716SAlex Elder static const char *rbd_dev_v1_snap_name(struct rbd_device *rbd_dev, 11812ad3d716SAlex Elder u64 snap_id) 118254cac61fSAlex Elder { 118354cac61fSAlex Elder u32 which; 1184da6a6b63SJosh Durgin const char *snap_name; 118554cac61fSAlex Elder 118654cac61fSAlex Elder which = rbd_dev_snap_index(rbd_dev, snap_id); 118754cac61fSAlex Elder if (which == BAD_SNAP_INDEX) 1188da6a6b63SJosh Durgin return ERR_PTR(-ENOENT); 118954cac61fSAlex Elder 1190da6a6b63SJosh Durgin snap_name = _rbd_dev_v1_snap_name(rbd_dev, which); 1191da6a6b63SJosh Durgin return snap_name ? 
snap_name : ERR_PTR(-ENOMEM); 119254cac61fSAlex Elder } 119354cac61fSAlex Elder 11949e15b77dSAlex Elder static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id) 11959e15b77dSAlex Elder { 11969e15b77dSAlex Elder if (snap_id == CEPH_NOSNAP) 11979e15b77dSAlex Elder return RBD_SNAP_HEAD_NAME; 11989e15b77dSAlex Elder 119954cac61fSAlex Elder rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); 120054cac61fSAlex Elder if (rbd_dev->image_format == 1) 120154cac61fSAlex Elder return rbd_dev_v1_snap_name(rbd_dev, snap_id); 12029e15b77dSAlex Elder 120354cac61fSAlex Elder return rbd_dev_v2_snap_name(rbd_dev, snap_id); 12049e15b77dSAlex Elder } 12059e15b77dSAlex Elder 12062ad3d716SAlex Elder static int rbd_snap_size(struct rbd_device *rbd_dev, u64 snap_id, 12072ad3d716SAlex Elder u64 *snap_size) 1208602adf40SYehuda Sadeh { 12092ad3d716SAlex Elder rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); 12102ad3d716SAlex Elder if (snap_id == CEPH_NOSNAP) { 12112ad3d716SAlex Elder *snap_size = rbd_dev->header.image_size; 12122ad3d716SAlex Elder } else if (rbd_dev->image_format == 1) { 12132ad3d716SAlex Elder u32 which; 121400f1f36fSAlex Elder 12152ad3d716SAlex Elder which = rbd_dev_snap_index(rbd_dev, snap_id); 12162ad3d716SAlex Elder if (which == BAD_SNAP_INDEX) 12172ad3d716SAlex Elder return -ENOENT; 121800f1f36fSAlex Elder 12192ad3d716SAlex Elder *snap_size = rbd_dev->header.snap_sizes[which]; 12202ad3d716SAlex Elder } else { 12212ad3d716SAlex Elder u64 size = 0; 12222ad3d716SAlex Elder int ret; 12232ad3d716SAlex Elder 12242ad3d716SAlex Elder ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, NULL, &size); 12252ad3d716SAlex Elder if (ret) 12262ad3d716SAlex Elder return ret; 12272ad3d716SAlex Elder 12282ad3d716SAlex Elder *snap_size = size; 12292ad3d716SAlex Elder } 12302ad3d716SAlex Elder return 0; 12312ad3d716SAlex Elder } 12322ad3d716SAlex Elder 12332ad3d716SAlex Elder static int rbd_snap_features(struct rbd_device *rbd_dev, u64 snap_id, 12342ad3d716SAlex 
Elder u64 *snap_features) 12352ad3d716SAlex Elder { 12362ad3d716SAlex Elder rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); 12372ad3d716SAlex Elder if (snap_id == CEPH_NOSNAP) { 12382ad3d716SAlex Elder *snap_features = rbd_dev->header.features; 12392ad3d716SAlex Elder } else if (rbd_dev->image_format == 1) { 12402ad3d716SAlex Elder *snap_features = 0; /* No features for format 1 */ 12412ad3d716SAlex Elder } else { 12422ad3d716SAlex Elder u64 features = 0; 12432ad3d716SAlex Elder int ret; 12442ad3d716SAlex Elder 12452ad3d716SAlex Elder ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, &features); 12462ad3d716SAlex Elder if (ret) 12472ad3d716SAlex Elder return ret; 12482ad3d716SAlex Elder 12492ad3d716SAlex Elder *snap_features = features; 12502ad3d716SAlex Elder } 12512ad3d716SAlex Elder return 0; 125200f1f36fSAlex Elder } 1253602adf40SYehuda Sadeh 1254d1cf5788SAlex Elder static int rbd_dev_mapping_set(struct rbd_device *rbd_dev) 1255602adf40SYehuda Sadeh { 12568f4b7d98SAlex Elder u64 snap_id = rbd_dev->spec->snap_id; 12572ad3d716SAlex Elder u64 size = 0; 12582ad3d716SAlex Elder u64 features = 0; 12592ad3d716SAlex Elder int ret; 12608b0241f8SAlex Elder 12612ad3d716SAlex Elder ret = rbd_snap_size(rbd_dev, snap_id, &size); 12622ad3d716SAlex Elder if (ret) 12632ad3d716SAlex Elder return ret; 12642ad3d716SAlex Elder ret = rbd_snap_features(rbd_dev, snap_id, &features); 12652ad3d716SAlex Elder if (ret) 12662ad3d716SAlex Elder return ret; 12672ad3d716SAlex Elder 12682ad3d716SAlex Elder rbd_dev->mapping.size = size; 12692ad3d716SAlex Elder rbd_dev->mapping.features = features; 12702ad3d716SAlex Elder 12718b0241f8SAlex Elder return 0; 1272602adf40SYehuda Sadeh } 1273602adf40SYehuda Sadeh 1274d1cf5788SAlex Elder static void rbd_dev_mapping_clear(struct rbd_device *rbd_dev) 1275d1cf5788SAlex Elder { 1276d1cf5788SAlex Elder rbd_dev->mapping.size = 0; 1277d1cf5788SAlex Elder rbd_dev->mapping.features = 0; 1278200a6a8bSAlex Elder } 1279200a6a8bSAlex Elder 
12805359a17dSIlya Dryomov static void zero_bvec(struct bio_vec *bv) 128165ccfe21SAlex Elder { 1282602adf40SYehuda Sadeh void *buf; 12835359a17dSIlya Dryomov unsigned long flags; 1284602adf40SYehuda Sadeh 12855359a17dSIlya Dryomov buf = bvec_kmap_irq(bv, &flags); 12865359a17dSIlya Dryomov memset(buf, 0, bv->bv_len); 12875359a17dSIlya Dryomov flush_dcache_page(bv->bv_page); 128885b5aaa6SDan Carpenter bvec_kunmap_irq(buf, &flags); 1289602adf40SYehuda Sadeh } 1290602adf40SYehuda Sadeh 12915359a17dSIlya Dryomov static void zero_bios(struct ceph_bio_iter *bio_pos, u32 off, u32 bytes) 1292b9434c5bSAlex Elder { 12935359a17dSIlya Dryomov struct ceph_bio_iter it = *bio_pos; 1294b9434c5bSAlex Elder 12955359a17dSIlya Dryomov ceph_bio_iter_advance(&it, off); 12965359a17dSIlya Dryomov ceph_bio_iter_advance_step(&it, bytes, ({ 12975359a17dSIlya Dryomov zero_bvec(&bv); 12985359a17dSIlya Dryomov })); 1299b9434c5bSAlex Elder } 1300b9434c5bSAlex Elder 13017e07efb1SIlya Dryomov static void zero_bvecs(struct ceph_bvec_iter *bvec_pos, u32 off, u32 bytes) 1302602adf40SYehuda Sadeh { 13037e07efb1SIlya Dryomov struct ceph_bvec_iter it = *bvec_pos; 1304602adf40SYehuda Sadeh 13057e07efb1SIlya Dryomov ceph_bvec_iter_advance(&it, off); 13067e07efb1SIlya Dryomov ceph_bvec_iter_advance_step(&it, bytes, ({ 13077e07efb1SIlya Dryomov zero_bvec(&bv); 13087e07efb1SIlya Dryomov })); 1309602adf40SYehuda Sadeh } 1310602adf40SYehuda Sadeh 1311f7760dadSAlex Elder /* 13123da691bfSIlya Dryomov * Zero a range in @obj_req data buffer defined by a bio (list) or 1313afb97888SIlya Dryomov * (private) bio_vec array. 1314f7760dadSAlex Elder * 13153da691bfSIlya Dryomov * @off is relative to the start of the data buffer. 
1316f7760dadSAlex Elder */ 13173da691bfSIlya Dryomov static void rbd_obj_zero_range(struct rbd_obj_request *obj_req, u32 off, 13183da691bfSIlya Dryomov u32 bytes) 1319f7760dadSAlex Elder { 1320ecc633caSIlya Dryomov switch (obj_req->img_request->data_type) { 13213da691bfSIlya Dryomov case OBJ_REQUEST_BIO: 13223da691bfSIlya Dryomov zero_bios(&obj_req->bio_pos, off, bytes); 13233da691bfSIlya Dryomov break; 13243da691bfSIlya Dryomov case OBJ_REQUEST_BVECS: 1325afb97888SIlya Dryomov case OBJ_REQUEST_OWN_BVECS: 13263da691bfSIlya Dryomov zero_bvecs(&obj_req->bvec_pos, off, bytes); 13273da691bfSIlya Dryomov break; 13283da691bfSIlya Dryomov default: 13293da691bfSIlya Dryomov rbd_assert(0); 1330f5400b7aSAlex Elder } 1331bf0d5f50SAlex Elder } 1332bf0d5f50SAlex Elder 1333bf0d5f50SAlex Elder static void rbd_obj_request_destroy(struct kref *kref); 1334bf0d5f50SAlex Elder static void rbd_obj_request_put(struct rbd_obj_request *obj_request) 1335bf0d5f50SAlex Elder { 1336bf0d5f50SAlex Elder rbd_assert(obj_request != NULL); 133737206ee5SAlex Elder dout("%s: obj %p (was %d)\n", __func__, obj_request, 13382c935bc5SPeter Zijlstra kref_read(&obj_request->kref)); 1339bf0d5f50SAlex Elder kref_put(&obj_request->kref, rbd_obj_request_destroy); 1340bf0d5f50SAlex Elder } 1341bf0d5f50SAlex Elder 13420f2d5be7SAlex Elder static void rbd_img_request_get(struct rbd_img_request *img_request) 13430f2d5be7SAlex Elder { 13440f2d5be7SAlex Elder dout("%s: img %p (was %d)\n", __func__, img_request, 13452c935bc5SPeter Zijlstra kref_read(&img_request->kref)); 13460f2d5be7SAlex Elder kref_get(&img_request->kref); 13470f2d5be7SAlex Elder } 13480f2d5be7SAlex Elder 1349bf0d5f50SAlex Elder static void rbd_img_request_destroy(struct kref *kref); 1350bf0d5f50SAlex Elder static void rbd_img_request_put(struct rbd_img_request *img_request) 1351bf0d5f50SAlex Elder { 1352bf0d5f50SAlex Elder rbd_assert(img_request != NULL); 135337206ee5SAlex Elder dout("%s: img %p (was %d)\n", __func__, img_request, 13542c935bc5SPeter 
Zijlstra kref_read(&img_request->kref)); 1355bf0d5f50SAlex Elder kref_put(&img_request->kref, rbd_img_request_destroy); 1356bf0d5f50SAlex Elder } 1357bf0d5f50SAlex Elder 1358bf0d5f50SAlex Elder static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request, 1359bf0d5f50SAlex Elder struct rbd_obj_request *obj_request) 1360bf0d5f50SAlex Elder { 136125dcf954SAlex Elder rbd_assert(obj_request->img_request == NULL); 136225dcf954SAlex Elder 1363b155e86cSAlex Elder /* Image request now owns object's original reference */ 1364bf0d5f50SAlex Elder obj_request->img_request = img_request; 13657114edacSIlya Dryomov img_request->pending_count++; 136615961b44SIlya Dryomov dout("%s: img %p obj %p\n", __func__, img_request, obj_request); 1367bf0d5f50SAlex Elder } 1368bf0d5f50SAlex Elder 1369bf0d5f50SAlex Elder static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request, 1370bf0d5f50SAlex Elder struct rbd_obj_request *obj_request) 1371bf0d5f50SAlex Elder { 137215961b44SIlya Dryomov dout("%s: img %p obj %p\n", __func__, img_request, obj_request); 137343df3d35SIlya Dryomov list_del(&obj_request->ex.oe_item); 1374bf0d5f50SAlex Elder rbd_assert(obj_request->img_request == img_request); 1375bf0d5f50SAlex Elder rbd_obj_request_put(obj_request); 1376bf0d5f50SAlex Elder } 1377bf0d5f50SAlex Elder 1378980917fcSIlya Dryomov static void rbd_obj_request_submit(struct rbd_obj_request *obj_request) 1379bf0d5f50SAlex Elder { 1380980917fcSIlya Dryomov struct ceph_osd_request *osd_req = obj_request->osd_req; 1381980917fcSIlya Dryomov 1382a90bb0c1SIlya Dryomov dout("%s %p object_no %016llx %llu~%llu osd_req %p\n", __func__, 138343df3d35SIlya Dryomov obj_request, obj_request->ex.oe_objno, obj_request->ex.oe_off, 138443df3d35SIlya Dryomov obj_request->ex.oe_len, osd_req); 1385980917fcSIlya Dryomov ceph_osdc_start_request(osd_req->r_osdc, osd_req, false); 1386bf0d5f50SAlex Elder } 1387bf0d5f50SAlex Elder 13880c425248SAlex Elder /* 13890c425248SAlex Elder * The 
default/initial value for all image request flags is 0. Each 13900c425248SAlex Elder * is conditionally set to 1 at image request initialization time 13910c425248SAlex Elder * and currently never change thereafter. 13920c425248SAlex Elder */ 1393d0b2e944SAlex Elder static void img_request_layered_set(struct rbd_img_request *img_request) 1394d0b2e944SAlex Elder { 1395d0b2e944SAlex Elder set_bit(IMG_REQ_LAYERED, &img_request->flags); 1396d0b2e944SAlex Elder smp_mb(); 1397d0b2e944SAlex Elder } 1398d0b2e944SAlex Elder 1399a2acd00eSAlex Elder static void img_request_layered_clear(struct rbd_img_request *img_request) 1400a2acd00eSAlex Elder { 1401a2acd00eSAlex Elder clear_bit(IMG_REQ_LAYERED, &img_request->flags); 1402a2acd00eSAlex Elder smp_mb(); 1403a2acd00eSAlex Elder } 1404a2acd00eSAlex Elder 1405d0b2e944SAlex Elder static bool img_request_layered_test(struct rbd_img_request *img_request) 1406d0b2e944SAlex Elder { 1407d0b2e944SAlex Elder smp_mb(); 1408d0b2e944SAlex Elder return test_bit(IMG_REQ_LAYERED, &img_request->flags) != 0; 1409d0b2e944SAlex Elder } 1410d0b2e944SAlex Elder 14113da691bfSIlya Dryomov static bool rbd_obj_is_entire(struct rbd_obj_request *obj_req) 14123b434a2aSJosh Durgin { 14133da691bfSIlya Dryomov struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev; 14143da691bfSIlya Dryomov 141543df3d35SIlya Dryomov return !obj_req->ex.oe_off && 141643df3d35SIlya Dryomov obj_req->ex.oe_len == rbd_dev->layout.object_size; 14173b434a2aSJosh Durgin } 14183b434a2aSJosh Durgin 14193da691bfSIlya Dryomov static bool rbd_obj_is_tail(struct rbd_obj_request *obj_req) 14206e2a4505SAlex Elder { 14213da691bfSIlya Dryomov struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev; 1422b9434c5bSAlex Elder 142343df3d35SIlya Dryomov return obj_req->ex.oe_off + obj_req->ex.oe_len == 14243da691bfSIlya Dryomov rbd_dev->layout.object_size; 14256e2a4505SAlex Elder } 14266e2a4505SAlex Elder 142713488d53SIlya Dryomov /* 142813488d53SIlya Dryomov * Must be called after 
rbd_obj_calc_img_extents(). 142913488d53SIlya Dryomov */ 143013488d53SIlya Dryomov static bool rbd_obj_copyup_enabled(struct rbd_obj_request *obj_req) 143113488d53SIlya Dryomov { 143213488d53SIlya Dryomov if (!obj_req->num_img_extents || 14339b17eb2cSIlya Dryomov (rbd_obj_is_entire(obj_req) && 14349b17eb2cSIlya Dryomov !obj_req->img_request->snapc->num_snaps)) 143513488d53SIlya Dryomov return false; 143613488d53SIlya Dryomov 143713488d53SIlya Dryomov return true; 143813488d53SIlya Dryomov } 143913488d53SIlya Dryomov 144086bd7998SIlya Dryomov static u64 rbd_obj_img_extents_bytes(struct rbd_obj_request *obj_req) 1441bf0d5f50SAlex Elder { 144286bd7998SIlya Dryomov return ceph_file_extents_bytes(obj_req->img_extents, 144386bd7998SIlya Dryomov obj_req->num_img_extents); 1444bf0d5f50SAlex Elder } 1445bf0d5f50SAlex Elder 14463da691bfSIlya Dryomov static bool rbd_img_is_write(struct rbd_img_request *img_req) 14470dcc685eSIlya Dryomov { 14489bb0248dSIlya Dryomov switch (img_req->op_type) { 14493da691bfSIlya Dryomov case OBJ_OP_READ: 14503da691bfSIlya Dryomov return false; 14513da691bfSIlya Dryomov case OBJ_OP_WRITE: 14523da691bfSIlya Dryomov case OBJ_OP_DISCARD: 14536484cbe9SIlya Dryomov case OBJ_OP_ZEROOUT: 14543da691bfSIlya Dryomov return true; 14553da691bfSIlya Dryomov default: 1456c6244b3bSArnd Bergmann BUG(); 14570dcc685eSIlya Dryomov } 14580dcc685eSIlya Dryomov } 14590dcc685eSIlya Dryomov 14603da691bfSIlya Dryomov static void rbd_obj_handle_request(struct rbd_obj_request *obj_req); 14612761713dSIlya Dryomov 146285e084feSIlya Dryomov static void rbd_osd_req_callback(struct ceph_osd_request *osd_req) 1463bf0d5f50SAlex Elder { 14643da691bfSIlya Dryomov struct rbd_obj_request *obj_req = osd_req->r_priv; 1465bf0d5f50SAlex Elder 14663da691bfSIlya Dryomov dout("%s osd_req %p result %d for obj_req %p\n", __func__, osd_req, 14673da691bfSIlya Dryomov osd_req->r_result, obj_req); 14683da691bfSIlya Dryomov rbd_assert(osd_req == obj_req->osd_req); 1469bf0d5f50SAlex Elder 
14703da691bfSIlya Dryomov obj_req->result = osd_req->r_result < 0 ? osd_req->r_result : 0; 14713da691bfSIlya Dryomov if (!obj_req->result && !rbd_img_is_write(obj_req->img_request)) 14723da691bfSIlya Dryomov obj_req->xferred = osd_req->r_result; 14733da691bfSIlya Dryomov else 1474c47f9371SAlex Elder /* 14753da691bfSIlya Dryomov * Writes aren't allowed to return a data payload. In some 14763da691bfSIlya Dryomov * guarded write cases (e.g. stat + zero on an empty object) 14773da691bfSIlya Dryomov * a stat response makes it through, but we don't care. 1478c47f9371SAlex Elder */ 14793da691bfSIlya Dryomov obj_req->xferred = 0; 14800ccd5926SIlya Dryomov 14813da691bfSIlya Dryomov rbd_obj_handle_request(obj_req); 1482bf0d5f50SAlex Elder } 1483bf0d5f50SAlex Elder 14849d4df01fSAlex Elder static void rbd_osd_req_format_read(struct rbd_obj_request *obj_request) 1485430c28c3SAlex Elder { 14868c042b0dSAlex Elder struct ceph_osd_request *osd_req = obj_request->osd_req; 1487430c28c3SAlex Elder 1488a162b308SIlya Dryomov osd_req->r_flags = CEPH_OSD_FLAG_READ; 14897c84883aSIlya Dryomov osd_req->r_snapid = obj_request->img_request->snap_id; 14909d4df01fSAlex Elder } 14919d4df01fSAlex Elder 14929d4df01fSAlex Elder static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request) 14939d4df01fSAlex Elder { 14949d4df01fSAlex Elder struct ceph_osd_request *osd_req = obj_request->osd_req; 14959d4df01fSAlex Elder 1496a162b308SIlya Dryomov osd_req->r_flags = CEPH_OSD_FLAG_WRITE; 1497fac02ddfSArnd Bergmann ktime_get_real_ts64(&osd_req->r_mtime); 149843df3d35SIlya Dryomov osd_req->r_data_offset = obj_request->ex.oe_off; 1499430c28c3SAlex Elder } 1500430c28c3SAlex Elder 1501bc81207eSIlya Dryomov static struct ceph_osd_request * 1502e28eded5SIlya Dryomov __rbd_osd_req_create(struct rbd_obj_request *obj_req, 1503e28eded5SIlya Dryomov struct ceph_snap_context *snapc, unsigned int num_ops) 1504bc81207eSIlya Dryomov { 1505e28eded5SIlya Dryomov struct rbd_device *rbd_dev = 
obj_req->img_request->rbd_dev; 1506bc81207eSIlya Dryomov struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 1507bc81207eSIlya Dryomov struct ceph_osd_request *req; 1508a90bb0c1SIlya Dryomov const char *name_format = rbd_dev->image_format == 1 ? 1509a90bb0c1SIlya Dryomov RBD_V1_DATA_FORMAT : RBD_V2_DATA_FORMAT; 1510bc81207eSIlya Dryomov 1511e28eded5SIlya Dryomov req = ceph_osdc_alloc_request(osdc, snapc, num_ops, false, GFP_NOIO); 1512bc81207eSIlya Dryomov if (!req) 1513bc81207eSIlya Dryomov return NULL; 1514bc81207eSIlya Dryomov 1515bc81207eSIlya Dryomov req->r_callback = rbd_osd_req_callback; 1516a162b308SIlya Dryomov req->r_priv = obj_req; 1517bc81207eSIlya Dryomov 1518b26c047bSIlya Dryomov /* 1519b26c047bSIlya Dryomov * Data objects may be stored in a separate pool, but always in 1520b26c047bSIlya Dryomov * the same namespace in that pool as the header in its pool. 1521b26c047bSIlya Dryomov */ 1522b26c047bSIlya Dryomov ceph_oloc_copy(&req->r_base_oloc, &rbd_dev->header_oloc); 1523bc81207eSIlya Dryomov req->r_base_oloc.pool = rbd_dev->layout.pool_id; 1524b26c047bSIlya Dryomov 1525a90bb0c1SIlya Dryomov if (ceph_oid_aprintf(&req->r_base_oid, GFP_NOIO, name_format, 152643df3d35SIlya Dryomov rbd_dev->header.object_prefix, obj_req->ex.oe_objno)) 1527bc81207eSIlya Dryomov goto err_req; 1528bc81207eSIlya Dryomov 1529bc81207eSIlya Dryomov return req; 1530bc81207eSIlya Dryomov 1531bc81207eSIlya Dryomov err_req: 1532bc81207eSIlya Dryomov ceph_osdc_put_request(req); 1533bc81207eSIlya Dryomov return NULL; 1534bc81207eSIlya Dryomov } 1535bc81207eSIlya Dryomov 1536e28eded5SIlya Dryomov static struct ceph_osd_request * 1537e28eded5SIlya Dryomov rbd_osd_req_create(struct rbd_obj_request *obj_req, unsigned int num_ops) 1538e28eded5SIlya Dryomov { 1539e28eded5SIlya Dryomov return __rbd_osd_req_create(obj_req, obj_req->img_request->snapc, 1540e28eded5SIlya Dryomov num_ops); 1541e28eded5SIlya Dryomov } 1542e28eded5SIlya Dryomov 1543bf0d5f50SAlex Elder static void 
rbd_osd_req_destroy(struct ceph_osd_request *osd_req) 1544bf0d5f50SAlex Elder { 1545bf0d5f50SAlex Elder ceph_osdc_put_request(osd_req); 1546bf0d5f50SAlex Elder } 1547bf0d5f50SAlex Elder 1548ecc633caSIlya Dryomov static struct rbd_obj_request *rbd_obj_request_create(void) 1549bf0d5f50SAlex Elder { 1550bf0d5f50SAlex Elder struct rbd_obj_request *obj_request; 1551bf0d5f50SAlex Elder 15525a60e876SIlya Dryomov obj_request = kmem_cache_zalloc(rbd_obj_request_cache, GFP_NOIO); 15536c696d85SIlya Dryomov if (!obj_request) 1554f907ad55SAlex Elder return NULL; 1555f907ad55SAlex Elder 155643df3d35SIlya Dryomov ceph_object_extent_init(&obj_request->ex); 1557bf0d5f50SAlex Elder kref_init(&obj_request->kref); 1558bf0d5f50SAlex Elder 155967e2b652SIlya Dryomov dout("%s %p\n", __func__, obj_request); 1560bf0d5f50SAlex Elder return obj_request; 1561bf0d5f50SAlex Elder } 1562bf0d5f50SAlex Elder 1563bf0d5f50SAlex Elder static void rbd_obj_request_destroy(struct kref *kref) 1564bf0d5f50SAlex Elder { 1565bf0d5f50SAlex Elder struct rbd_obj_request *obj_request; 15667e07efb1SIlya Dryomov u32 i; 1567bf0d5f50SAlex Elder 1568bf0d5f50SAlex Elder obj_request = container_of(kref, struct rbd_obj_request, kref); 1569bf0d5f50SAlex Elder 157037206ee5SAlex Elder dout("%s: obj %p\n", __func__, obj_request); 157137206ee5SAlex Elder 1572bf0d5f50SAlex Elder if (obj_request->osd_req) 1573bf0d5f50SAlex Elder rbd_osd_req_destroy(obj_request->osd_req); 1574bf0d5f50SAlex Elder 1575ecc633caSIlya Dryomov switch (obj_request->img_request->data_type) { 15769969ebc5SAlex Elder case OBJ_REQUEST_NODATA: 1577bf0d5f50SAlex Elder case OBJ_REQUEST_BIO: 15787e07efb1SIlya Dryomov case OBJ_REQUEST_BVECS: 15795359a17dSIlya Dryomov break; /* Nothing to do */ 1580afb97888SIlya Dryomov case OBJ_REQUEST_OWN_BVECS: 1581afb97888SIlya Dryomov kfree(obj_request->bvec_pos.bvecs); 1582bf0d5f50SAlex Elder break; 15837e07efb1SIlya Dryomov default: 15847e07efb1SIlya Dryomov rbd_assert(0); 1585bf0d5f50SAlex Elder } 1586bf0d5f50SAlex 
Elder 158786bd7998SIlya Dryomov kfree(obj_request->img_extents); 15887e07efb1SIlya Dryomov if (obj_request->copyup_bvecs) { 15897e07efb1SIlya Dryomov for (i = 0; i < obj_request->copyup_bvec_count; i++) { 15907e07efb1SIlya Dryomov if (obj_request->copyup_bvecs[i].bv_page) 15917e07efb1SIlya Dryomov __free_page(obj_request->copyup_bvecs[i].bv_page); 15927e07efb1SIlya Dryomov } 15937e07efb1SIlya Dryomov kfree(obj_request->copyup_bvecs); 1594bf0d5f50SAlex Elder } 1595bf0d5f50SAlex Elder 1596868311b1SAlex Elder kmem_cache_free(rbd_obj_request_cache, obj_request); 1597bf0d5f50SAlex Elder } 1598bf0d5f50SAlex Elder 1599fb65d228SAlex Elder /* It's OK to call this for a device with no parent */ 1600fb65d228SAlex Elder 1601fb65d228SAlex Elder static void rbd_spec_put(struct rbd_spec *spec); 1602fb65d228SAlex Elder static void rbd_dev_unparent(struct rbd_device *rbd_dev) 1603fb65d228SAlex Elder { 1604fb65d228SAlex Elder rbd_dev_remove_parent(rbd_dev); 1605fb65d228SAlex Elder rbd_spec_put(rbd_dev->parent_spec); 1606fb65d228SAlex Elder rbd_dev->parent_spec = NULL; 1607fb65d228SAlex Elder rbd_dev->parent_overlap = 0; 1608fb65d228SAlex Elder } 1609fb65d228SAlex Elder 1610bf0d5f50SAlex Elder /* 1611a2acd00eSAlex Elder * Parent image reference counting is used to determine when an 1612a2acd00eSAlex Elder * image's parent fields can be safely torn down--after there are no 1613a2acd00eSAlex Elder * more in-flight requests to the parent image. When the last 1614a2acd00eSAlex Elder * reference is dropped, cleaning them up is safe. 
1615a2acd00eSAlex Elder */ 1616a2acd00eSAlex Elder static void rbd_dev_parent_put(struct rbd_device *rbd_dev) 1617a2acd00eSAlex Elder { 1618a2acd00eSAlex Elder int counter; 1619a2acd00eSAlex Elder 1620a2acd00eSAlex Elder if (!rbd_dev->parent_spec) 1621a2acd00eSAlex Elder return; 1622a2acd00eSAlex Elder 1623a2acd00eSAlex Elder counter = atomic_dec_return_safe(&rbd_dev->parent_ref); 1624a2acd00eSAlex Elder if (counter > 0) 1625a2acd00eSAlex Elder return; 1626a2acd00eSAlex Elder 1627a2acd00eSAlex Elder /* Last reference; clean up parent data structures */ 1628a2acd00eSAlex Elder 1629a2acd00eSAlex Elder if (!counter) 1630a2acd00eSAlex Elder rbd_dev_unparent(rbd_dev); 1631a2acd00eSAlex Elder else 16329584d508SIlya Dryomov rbd_warn(rbd_dev, "parent reference underflow"); 1633a2acd00eSAlex Elder } 1634a2acd00eSAlex Elder 1635a2acd00eSAlex Elder /* 1636a2acd00eSAlex Elder * If an image has a non-zero parent overlap, get a reference to its 1637a2acd00eSAlex Elder * parent. 1638a2acd00eSAlex Elder * 1639a2acd00eSAlex Elder * Returns true if the rbd device has a parent with a non-zero 1640a2acd00eSAlex Elder * overlap and a reference for it was successfully taken, or 1641a2acd00eSAlex Elder * false otherwise. 
1642a2acd00eSAlex Elder */ 1643a2acd00eSAlex Elder static bool rbd_dev_parent_get(struct rbd_device *rbd_dev) 1644a2acd00eSAlex Elder { 1645ae43e9d0SIlya Dryomov int counter = 0; 1646a2acd00eSAlex Elder 1647a2acd00eSAlex Elder if (!rbd_dev->parent_spec) 1648a2acd00eSAlex Elder return false; 1649a2acd00eSAlex Elder 1650ae43e9d0SIlya Dryomov down_read(&rbd_dev->header_rwsem); 1651ae43e9d0SIlya Dryomov if (rbd_dev->parent_overlap) 1652a2acd00eSAlex Elder counter = atomic_inc_return_safe(&rbd_dev->parent_ref); 1653ae43e9d0SIlya Dryomov up_read(&rbd_dev->header_rwsem); 1654a2acd00eSAlex Elder 1655a2acd00eSAlex Elder if (counter < 0) 16569584d508SIlya Dryomov rbd_warn(rbd_dev, "parent reference overflow"); 1657a2acd00eSAlex Elder 1658ae43e9d0SIlya Dryomov return counter > 0; 1659a2acd00eSAlex Elder } 1660a2acd00eSAlex Elder 1661bf0d5f50SAlex Elder /* 1662bf0d5f50SAlex Elder * Caller is responsible for filling in the list of object requests 1663bf0d5f50SAlex Elder * that comprises the image request, and the Linux request pointer 1664bf0d5f50SAlex Elder * (if there is one). 
1665bf0d5f50SAlex Elder */ 1666cc344fa1SAlex Elder static struct rbd_img_request *rbd_img_request_create( 1667cc344fa1SAlex Elder struct rbd_device *rbd_dev, 16686d2940c8SGuangliang Zhao enum obj_operation_type op_type, 16694e752f0aSJosh Durgin struct ceph_snap_context *snapc) 1670bf0d5f50SAlex Elder { 1671bf0d5f50SAlex Elder struct rbd_img_request *img_request; 1672bf0d5f50SAlex Elder 1673a0c5895bSIlya Dryomov img_request = kmem_cache_zalloc(rbd_img_request_cache, GFP_NOIO); 1674bf0d5f50SAlex Elder if (!img_request) 1675bf0d5f50SAlex Elder return NULL; 1676bf0d5f50SAlex Elder 1677bf0d5f50SAlex Elder img_request->rbd_dev = rbd_dev; 16789bb0248dSIlya Dryomov img_request->op_type = op_type; 16799bb0248dSIlya Dryomov if (!rbd_img_is_write(img_request)) 1680bf0d5f50SAlex Elder img_request->snap_id = rbd_dev->spec->snap_id; 16819bb0248dSIlya Dryomov else 16829bb0248dSIlya Dryomov img_request->snapc = snapc; 16839bb0248dSIlya Dryomov 1684a2acd00eSAlex Elder if (rbd_dev_parent_get(rbd_dev)) 1685d0b2e944SAlex Elder img_request_layered_set(img_request); 1686a0c5895bSIlya Dryomov 1687bf0d5f50SAlex Elder spin_lock_init(&img_request->completion_lock); 168843df3d35SIlya Dryomov INIT_LIST_HEAD(&img_request->object_extents); 1689bf0d5f50SAlex Elder kref_init(&img_request->kref); 1690bf0d5f50SAlex Elder 1691dfd9875fSIlya Dryomov dout("%s: rbd_dev %p %s -> img %p\n", __func__, rbd_dev, 1692dfd9875fSIlya Dryomov obj_op_name(op_type), img_request); 1693bf0d5f50SAlex Elder return img_request; 1694bf0d5f50SAlex Elder } 1695bf0d5f50SAlex Elder 1696bf0d5f50SAlex Elder static void rbd_img_request_destroy(struct kref *kref) 1697bf0d5f50SAlex Elder { 1698bf0d5f50SAlex Elder struct rbd_img_request *img_request; 1699bf0d5f50SAlex Elder struct rbd_obj_request *obj_request; 1700bf0d5f50SAlex Elder struct rbd_obj_request *next_obj_request; 1701bf0d5f50SAlex Elder 1702bf0d5f50SAlex Elder img_request = container_of(kref, struct rbd_img_request, kref); 1703bf0d5f50SAlex Elder 170437206ee5SAlex 
Elder dout("%s: img %p\n", __func__, img_request); 170537206ee5SAlex Elder 1706bf0d5f50SAlex Elder for_each_obj_request_safe(img_request, obj_request, next_obj_request) 1707bf0d5f50SAlex Elder rbd_img_obj_request_del(img_request, obj_request); 1708bf0d5f50SAlex Elder 1709a2acd00eSAlex Elder if (img_request_layered_test(img_request)) { 1710a2acd00eSAlex Elder img_request_layered_clear(img_request); 1711a2acd00eSAlex Elder rbd_dev_parent_put(img_request->rbd_dev); 1712a2acd00eSAlex Elder } 1713a2acd00eSAlex Elder 17149bb0248dSIlya Dryomov if (rbd_img_is_write(img_request)) 1715812164f8SAlex Elder ceph_put_snap_context(img_request->snapc); 1716bf0d5f50SAlex Elder 17171c2a9dfeSAlex Elder kmem_cache_free(rbd_img_request_cache, img_request); 1718bf0d5f50SAlex Elder } 1719bf0d5f50SAlex Elder 172086bd7998SIlya Dryomov static void prune_extents(struct ceph_file_extent *img_extents, 172186bd7998SIlya Dryomov u32 *num_img_extents, u64 overlap) 1722e93f3152SAlex Elder { 172386bd7998SIlya Dryomov u32 cnt = *num_img_extents; 1724e93f3152SAlex Elder 172586bd7998SIlya Dryomov /* drop extents completely beyond the overlap */ 172686bd7998SIlya Dryomov while (cnt && img_extents[cnt - 1].fe_off >= overlap) 172786bd7998SIlya Dryomov cnt--; 1728e93f3152SAlex Elder 172986bd7998SIlya Dryomov if (cnt) { 173086bd7998SIlya Dryomov struct ceph_file_extent *ex = &img_extents[cnt - 1]; 1731e93f3152SAlex Elder 173286bd7998SIlya Dryomov /* trim final overlapping extent */ 173386bd7998SIlya Dryomov if (ex->fe_off + ex->fe_len > overlap) 173486bd7998SIlya Dryomov ex->fe_len = overlap - ex->fe_off; 1735e93f3152SAlex Elder } 1736e93f3152SAlex Elder 173786bd7998SIlya Dryomov *num_img_extents = cnt; 173886bd7998SIlya Dryomov } 173986bd7998SIlya Dryomov 174086bd7998SIlya Dryomov /* 174186bd7998SIlya Dryomov * Determine the byte range(s) covered by either just the object extent 174286bd7998SIlya Dryomov * or the entire object in the parent image. 
174386bd7998SIlya Dryomov */ 174486bd7998SIlya Dryomov static int rbd_obj_calc_img_extents(struct rbd_obj_request *obj_req, 174586bd7998SIlya Dryomov bool entire) 1746e93f3152SAlex Elder { 174786bd7998SIlya Dryomov struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev; 1748c5b5ef6cSAlex Elder int ret; 1749c5b5ef6cSAlex Elder 175086bd7998SIlya Dryomov if (!rbd_dev->parent_overlap) 175186bd7998SIlya Dryomov return 0; 175286bd7998SIlya Dryomov 175386bd7998SIlya Dryomov ret = ceph_extent_to_file(&rbd_dev->layout, obj_req->ex.oe_objno, 175486bd7998SIlya Dryomov entire ? 0 : obj_req->ex.oe_off, 175586bd7998SIlya Dryomov entire ? rbd_dev->layout.object_size : 175686bd7998SIlya Dryomov obj_req->ex.oe_len, 175786bd7998SIlya Dryomov &obj_req->img_extents, 175886bd7998SIlya Dryomov &obj_req->num_img_extents); 175986bd7998SIlya Dryomov if (ret) 176086bd7998SIlya Dryomov return ret; 176186bd7998SIlya Dryomov 176286bd7998SIlya Dryomov prune_extents(obj_req->img_extents, &obj_req->num_img_extents, 176386bd7998SIlya Dryomov rbd_dev->parent_overlap); 176486bd7998SIlya Dryomov return 0; 176586bd7998SIlya Dryomov } 176686bd7998SIlya Dryomov 17673da691bfSIlya Dryomov static void rbd_osd_req_setup_data(struct rbd_obj_request *obj_req, u32 which) 17683da691bfSIlya Dryomov { 1769ecc633caSIlya Dryomov switch (obj_req->img_request->data_type) { 17703da691bfSIlya Dryomov case OBJ_REQUEST_BIO: 17713da691bfSIlya Dryomov osd_req_op_extent_osd_data_bio(obj_req->osd_req, which, 17723da691bfSIlya Dryomov &obj_req->bio_pos, 177343df3d35SIlya Dryomov obj_req->ex.oe_len); 17743da691bfSIlya Dryomov break; 17753da691bfSIlya Dryomov case OBJ_REQUEST_BVECS: 1776afb97888SIlya Dryomov case OBJ_REQUEST_OWN_BVECS: 17773da691bfSIlya Dryomov rbd_assert(obj_req->bvec_pos.iter.bi_size == 177843df3d35SIlya Dryomov obj_req->ex.oe_len); 1779afb97888SIlya Dryomov rbd_assert(obj_req->bvec_idx == obj_req->bvec_count); 17803da691bfSIlya Dryomov osd_req_op_extent_osd_data_bvec_pos(obj_req->osd_req, which, 
17813da691bfSIlya Dryomov &obj_req->bvec_pos); 17823da691bfSIlya Dryomov break; 17833da691bfSIlya Dryomov default: 17843da691bfSIlya Dryomov rbd_assert(0); 17853da691bfSIlya Dryomov } 17863da691bfSIlya Dryomov } 17873da691bfSIlya Dryomov 17883da691bfSIlya Dryomov static int rbd_obj_setup_read(struct rbd_obj_request *obj_req) 17893da691bfSIlya Dryomov { 1790e28eded5SIlya Dryomov obj_req->osd_req = __rbd_osd_req_create(obj_req, NULL, 1); 17913da691bfSIlya Dryomov if (!obj_req->osd_req) 1792710214e3SIlya Dryomov return -ENOMEM; 1793710214e3SIlya Dryomov 17943da691bfSIlya Dryomov osd_req_op_extent_init(obj_req->osd_req, 0, CEPH_OSD_OP_READ, 179543df3d35SIlya Dryomov obj_req->ex.oe_off, obj_req->ex.oe_len, 0, 0); 17963da691bfSIlya Dryomov rbd_osd_req_setup_data(obj_req, 0); 1797a90bb0c1SIlya Dryomov 17983da691bfSIlya Dryomov rbd_osd_req_format_read(obj_req); 17993da691bfSIlya Dryomov return 0; 1800710214e3SIlya Dryomov } 1801710214e3SIlya Dryomov 18023da691bfSIlya Dryomov static int __rbd_obj_setup_stat(struct rbd_obj_request *obj_req, 18033da691bfSIlya Dryomov unsigned int which) 18043da691bfSIlya Dryomov { 18053da691bfSIlya Dryomov struct page **pages; 18063da691bfSIlya Dryomov 1807c5b5ef6cSAlex Elder /* 1808c5b5ef6cSAlex Elder * The response data for a STAT call consists of: 1809c5b5ef6cSAlex Elder * le64 length; 1810c5b5ef6cSAlex Elder * struct { 1811c5b5ef6cSAlex Elder * le32 tv_sec; 1812c5b5ef6cSAlex Elder * le32 tv_nsec; 1813c5b5ef6cSAlex Elder * } mtime; 1814c5b5ef6cSAlex Elder */ 18153da691bfSIlya Dryomov pages = ceph_alloc_page_vector(1, GFP_NOIO); 18163da691bfSIlya Dryomov if (IS_ERR(pages)) 18173da691bfSIlya Dryomov return PTR_ERR(pages); 18183da691bfSIlya Dryomov 18193da691bfSIlya Dryomov osd_req_op_init(obj_req->osd_req, which, CEPH_OSD_OP_STAT, 0); 18203da691bfSIlya Dryomov osd_req_op_raw_data_in_pages(obj_req->osd_req, which, pages, 18213da691bfSIlya Dryomov 8 + sizeof(struct ceph_timespec), 18223da691bfSIlya Dryomov 0, false, true); 18233da691bfSIlya 
Dryomov return 0; 1824710214e3SIlya Dryomov } 1825c5b5ef6cSAlex Elder 182613488d53SIlya Dryomov static int count_write_ops(struct rbd_obj_request *obj_req) 182713488d53SIlya Dryomov { 182813488d53SIlya Dryomov return 2; /* setallochint + write/writefull */ 182913488d53SIlya Dryomov } 183013488d53SIlya Dryomov 18313da691bfSIlya Dryomov static void __rbd_obj_setup_write(struct rbd_obj_request *obj_req, 18323da691bfSIlya Dryomov unsigned int which) 18333da691bfSIlya Dryomov { 18343da691bfSIlya Dryomov struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev; 18353da691bfSIlya Dryomov u16 opcode; 1836c5b5ef6cSAlex Elder 18373da691bfSIlya Dryomov osd_req_op_alloc_hint_init(obj_req->osd_req, which++, 18383da691bfSIlya Dryomov rbd_dev->layout.object_size, 18393da691bfSIlya Dryomov rbd_dev->layout.object_size); 1840c5b5ef6cSAlex Elder 18413da691bfSIlya Dryomov if (rbd_obj_is_entire(obj_req)) 18423da691bfSIlya Dryomov opcode = CEPH_OSD_OP_WRITEFULL; 18433da691bfSIlya Dryomov else 18443da691bfSIlya Dryomov opcode = CEPH_OSD_OP_WRITE; 1845c5b5ef6cSAlex Elder 18463da691bfSIlya Dryomov osd_req_op_extent_init(obj_req->osd_req, which, opcode, 184743df3d35SIlya Dryomov obj_req->ex.oe_off, obj_req->ex.oe_len, 0, 0); 18483da691bfSIlya Dryomov rbd_osd_req_setup_data(obj_req, which++); 18493da691bfSIlya Dryomov 18503da691bfSIlya Dryomov rbd_assert(which == obj_req->osd_req->r_num_ops); 18513da691bfSIlya Dryomov rbd_osd_req_format_write(obj_req); 18523da691bfSIlya Dryomov } 18533da691bfSIlya Dryomov 18543da691bfSIlya Dryomov static int rbd_obj_setup_write(struct rbd_obj_request *obj_req) 18553da691bfSIlya Dryomov { 18563da691bfSIlya Dryomov unsigned int num_osd_ops, which = 0; 185713488d53SIlya Dryomov bool need_guard; 18583da691bfSIlya Dryomov int ret; 18593da691bfSIlya Dryomov 186086bd7998SIlya Dryomov /* reverse map the entire object onto the parent */ 186186bd7998SIlya Dryomov ret = rbd_obj_calc_img_extents(obj_req, true); 186286bd7998SIlya Dryomov if (ret) 186386bd7998SIlya 
Dryomov return ret; 186486bd7998SIlya Dryomov 186513488d53SIlya Dryomov need_guard = rbd_obj_copyup_enabled(obj_req); 186613488d53SIlya Dryomov num_osd_ops = need_guard + count_write_ops(obj_req); 18673da691bfSIlya Dryomov 1868a162b308SIlya Dryomov obj_req->osd_req = rbd_osd_req_create(obj_req, num_osd_ops); 18693da691bfSIlya Dryomov if (!obj_req->osd_req) 18703da691bfSIlya Dryomov return -ENOMEM; 18713da691bfSIlya Dryomov 187213488d53SIlya Dryomov if (need_guard) { 18733da691bfSIlya Dryomov ret = __rbd_obj_setup_stat(obj_req, which++); 18743da691bfSIlya Dryomov if (ret) 1875c5b5ef6cSAlex Elder return ret; 187613488d53SIlya Dryomov 187713488d53SIlya Dryomov obj_req->write_state = RBD_OBJ_WRITE_GUARD; 187813488d53SIlya Dryomov } else { 187913488d53SIlya Dryomov obj_req->write_state = RBD_OBJ_WRITE_FLAT; 1880c5b5ef6cSAlex Elder } 1881c5b5ef6cSAlex Elder 18823da691bfSIlya Dryomov __rbd_obj_setup_write(obj_req, which); 18833da691bfSIlya Dryomov return 0; 188470d045f6SIlya Dryomov } 188570d045f6SIlya Dryomov 18866484cbe9SIlya Dryomov static u16 truncate_or_zero_opcode(struct rbd_obj_request *obj_req) 18876484cbe9SIlya Dryomov { 18886484cbe9SIlya Dryomov return rbd_obj_is_tail(obj_req) ? CEPH_OSD_OP_TRUNCATE : 18896484cbe9SIlya Dryomov CEPH_OSD_OP_ZERO; 18906484cbe9SIlya Dryomov } 18916484cbe9SIlya Dryomov 18926484cbe9SIlya Dryomov static int rbd_obj_setup_discard(struct rbd_obj_request *obj_req) 18936484cbe9SIlya Dryomov { 18940c93e1b7SIlya Dryomov struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev; 18950c93e1b7SIlya Dryomov u64 off = obj_req->ex.oe_off; 18960c93e1b7SIlya Dryomov u64 next_off = obj_req->ex.oe_off + obj_req->ex.oe_len; 18976484cbe9SIlya Dryomov int ret; 18986484cbe9SIlya Dryomov 18990c93e1b7SIlya Dryomov /* 19000c93e1b7SIlya Dryomov * Align the range to alloc_size boundary and punt on discards 19010c93e1b7SIlya Dryomov * that are too small to free up any space. 
19020c93e1b7SIlya Dryomov * 19030c93e1b7SIlya Dryomov * alloc_size == object_size && is_tail() is a special case for 19040c93e1b7SIlya Dryomov * filestore with filestore_punch_hole = false, needed to allow 19050c93e1b7SIlya Dryomov * truncate (in addition to delete). 19060c93e1b7SIlya Dryomov */ 19070c93e1b7SIlya Dryomov if (rbd_dev->opts->alloc_size != rbd_dev->layout.object_size || 19080c93e1b7SIlya Dryomov !rbd_obj_is_tail(obj_req)) { 19090c93e1b7SIlya Dryomov off = round_up(off, rbd_dev->opts->alloc_size); 19100c93e1b7SIlya Dryomov next_off = round_down(next_off, rbd_dev->opts->alloc_size); 19110c93e1b7SIlya Dryomov if (off >= next_off) 19120c93e1b7SIlya Dryomov return 1; 19130c93e1b7SIlya Dryomov } 19140c93e1b7SIlya Dryomov 19156484cbe9SIlya Dryomov /* reverse map the entire object onto the parent */ 19166484cbe9SIlya Dryomov ret = rbd_obj_calc_img_extents(obj_req, true); 19176484cbe9SIlya Dryomov if (ret) 19186484cbe9SIlya Dryomov return ret; 19196484cbe9SIlya Dryomov 19206484cbe9SIlya Dryomov obj_req->osd_req = rbd_osd_req_create(obj_req, 1); 19216484cbe9SIlya Dryomov if (!obj_req->osd_req) 19226484cbe9SIlya Dryomov return -ENOMEM; 19236484cbe9SIlya Dryomov 19246484cbe9SIlya Dryomov if (rbd_obj_is_entire(obj_req) && !obj_req->num_img_extents) { 19256484cbe9SIlya Dryomov osd_req_op_init(obj_req->osd_req, 0, CEPH_OSD_OP_DELETE, 0); 19266484cbe9SIlya Dryomov } else { 19270c93e1b7SIlya Dryomov dout("%s %p %llu~%llu -> %llu~%llu\n", __func__, 19280c93e1b7SIlya Dryomov obj_req, obj_req->ex.oe_off, obj_req->ex.oe_len, 19290c93e1b7SIlya Dryomov off, next_off - off); 19306484cbe9SIlya Dryomov osd_req_op_extent_init(obj_req->osd_req, 0, 19316484cbe9SIlya Dryomov truncate_or_zero_opcode(obj_req), 19320c93e1b7SIlya Dryomov off, next_off - off, 0, 0); 19336484cbe9SIlya Dryomov } 19346484cbe9SIlya Dryomov 19356484cbe9SIlya Dryomov obj_req->write_state = RBD_OBJ_WRITE_FLAT; 19366484cbe9SIlya Dryomov rbd_osd_req_format_write(obj_req); 19376484cbe9SIlya Dryomov return 0; 
19386484cbe9SIlya Dryomov } 19396484cbe9SIlya Dryomov 194013488d53SIlya Dryomov static int count_zeroout_ops(struct rbd_obj_request *obj_req) 194113488d53SIlya Dryomov { 194213488d53SIlya Dryomov int num_osd_ops; 194313488d53SIlya Dryomov 19449b17eb2cSIlya Dryomov if (rbd_obj_is_entire(obj_req) && obj_req->num_img_extents && 19459b17eb2cSIlya Dryomov !rbd_obj_copyup_enabled(obj_req)) 194613488d53SIlya Dryomov num_osd_ops = 2; /* create + truncate */ 194713488d53SIlya Dryomov else 194813488d53SIlya Dryomov num_osd_ops = 1; /* delete/truncate/zero */ 194913488d53SIlya Dryomov 195013488d53SIlya Dryomov return num_osd_ops; 195113488d53SIlya Dryomov } 195213488d53SIlya Dryomov 19536484cbe9SIlya Dryomov static void __rbd_obj_setup_zeroout(struct rbd_obj_request *obj_req, 19543da691bfSIlya Dryomov unsigned int which) 195570d045f6SIlya Dryomov { 19563da691bfSIlya Dryomov u16 opcode; 1957058aa991SIlya Dryomov 19583da691bfSIlya Dryomov if (rbd_obj_is_entire(obj_req)) { 195986bd7998SIlya Dryomov if (obj_req->num_img_extents) { 19609b17eb2cSIlya Dryomov if (!rbd_obj_copyup_enabled(obj_req)) 19612bb1e56eSIlya Dryomov osd_req_op_init(obj_req->osd_req, which++, 19622bb1e56eSIlya Dryomov CEPH_OSD_OP_CREATE, 0); 19633da691bfSIlya Dryomov opcode = CEPH_OSD_OP_TRUNCATE; 19643da691bfSIlya Dryomov } else { 19653da691bfSIlya Dryomov osd_req_op_init(obj_req->osd_req, which++, 19663da691bfSIlya Dryomov CEPH_OSD_OP_DELETE, 0); 19673da691bfSIlya Dryomov opcode = 0; 19683da691bfSIlya Dryomov } 19693da691bfSIlya Dryomov } else { 19706484cbe9SIlya Dryomov opcode = truncate_or_zero_opcode(obj_req); 19713da691bfSIlya Dryomov } 19723da691bfSIlya Dryomov 19733da691bfSIlya Dryomov if (opcode) 19743da691bfSIlya Dryomov osd_req_op_extent_init(obj_req->osd_req, which++, opcode, 197543df3d35SIlya Dryomov obj_req->ex.oe_off, obj_req->ex.oe_len, 19763da691bfSIlya Dryomov 0, 0); 19773da691bfSIlya Dryomov 19783da691bfSIlya Dryomov rbd_assert(which == obj_req->osd_req->r_num_ops); 19793da691bfSIlya Dryomov 
rbd_osd_req_format_write(obj_req); 19803da691bfSIlya Dryomov } 19813da691bfSIlya Dryomov 19826484cbe9SIlya Dryomov static int rbd_obj_setup_zeroout(struct rbd_obj_request *obj_req) 19833da691bfSIlya Dryomov { 19843da691bfSIlya Dryomov unsigned int num_osd_ops, which = 0; 198513488d53SIlya Dryomov bool need_guard; 19863da691bfSIlya Dryomov int ret; 19873da691bfSIlya Dryomov 198886bd7998SIlya Dryomov /* reverse map the entire object onto the parent */ 198986bd7998SIlya Dryomov ret = rbd_obj_calc_img_extents(obj_req, true); 199086bd7998SIlya Dryomov if (ret) 199186bd7998SIlya Dryomov return ret; 199286bd7998SIlya Dryomov 199313488d53SIlya Dryomov need_guard = rbd_obj_copyup_enabled(obj_req); 199413488d53SIlya Dryomov num_osd_ops = need_guard + count_zeroout_ops(obj_req); 19953da691bfSIlya Dryomov 1996a162b308SIlya Dryomov obj_req->osd_req = rbd_osd_req_create(obj_req, num_osd_ops); 19973da691bfSIlya Dryomov if (!obj_req->osd_req) 19983da691bfSIlya Dryomov return -ENOMEM; 19993da691bfSIlya Dryomov 200013488d53SIlya Dryomov if (need_guard) { 20013da691bfSIlya Dryomov ret = __rbd_obj_setup_stat(obj_req, which++); 20023da691bfSIlya Dryomov if (ret) 20033da691bfSIlya Dryomov return ret; 200413488d53SIlya Dryomov 200513488d53SIlya Dryomov obj_req->write_state = RBD_OBJ_WRITE_GUARD; 200613488d53SIlya Dryomov } else { 200713488d53SIlya Dryomov obj_req->write_state = RBD_OBJ_WRITE_FLAT; 20083da691bfSIlya Dryomov } 20093da691bfSIlya Dryomov 20106484cbe9SIlya Dryomov __rbd_obj_setup_zeroout(obj_req, which); 2011980917fcSIlya Dryomov return 0; 2012b454e36dSAlex Elder } 2013b454e36dSAlex Elder 2014b454e36dSAlex Elder /* 20153da691bfSIlya Dryomov * For each object request in @img_req, allocate an OSD request, add 20163da691bfSIlya Dryomov * individual OSD ops and prepare them for submission. The number of 20173da691bfSIlya Dryomov * OSD ops depends on op_type and the overlap point (if any). 
2018b454e36dSAlex Elder */ 20193da691bfSIlya Dryomov static int __rbd_img_fill_request(struct rbd_img_request *img_req) 20203da691bfSIlya Dryomov { 20210c93e1b7SIlya Dryomov struct rbd_obj_request *obj_req, *next_obj_req; 20223da691bfSIlya Dryomov int ret; 20233d7efd18SAlex Elder 20240c93e1b7SIlya Dryomov for_each_obj_request_safe(img_req, obj_req, next_obj_req) { 20259bb0248dSIlya Dryomov switch (img_req->op_type) { 20263da691bfSIlya Dryomov case OBJ_OP_READ: 20273da691bfSIlya Dryomov ret = rbd_obj_setup_read(obj_req); 20283da691bfSIlya Dryomov break; 20293da691bfSIlya Dryomov case OBJ_OP_WRITE: 20303da691bfSIlya Dryomov ret = rbd_obj_setup_write(obj_req); 20313da691bfSIlya Dryomov break; 20323da691bfSIlya Dryomov case OBJ_OP_DISCARD: 20333da691bfSIlya Dryomov ret = rbd_obj_setup_discard(obj_req); 20343da691bfSIlya Dryomov break; 20356484cbe9SIlya Dryomov case OBJ_OP_ZEROOUT: 20366484cbe9SIlya Dryomov ret = rbd_obj_setup_zeroout(obj_req); 20376484cbe9SIlya Dryomov break; 20383da691bfSIlya Dryomov default: 20393da691bfSIlya Dryomov rbd_assert(0); 20403da691bfSIlya Dryomov } 20410c93e1b7SIlya Dryomov if (ret < 0) 20423da691bfSIlya Dryomov return ret; 20430c93e1b7SIlya Dryomov if (ret > 0) { 20440c93e1b7SIlya Dryomov img_req->xferred += obj_req->ex.oe_len; 20450c93e1b7SIlya Dryomov img_req->pending_count--; 20460c93e1b7SIlya Dryomov rbd_img_obj_request_del(img_req, obj_req); 20470c93e1b7SIlya Dryomov continue; 20480c93e1b7SIlya Dryomov } 204926f887e0SIlya Dryomov 205026f887e0SIlya Dryomov ret = ceph_osdc_alloc_messages(obj_req->osd_req, GFP_NOIO); 205126f887e0SIlya Dryomov if (ret) 205226f887e0SIlya Dryomov return ret; 2053b454e36dSAlex Elder } 2054b454e36dSAlex Elder 20553da691bfSIlya Dryomov return 0; 20563da691bfSIlya Dryomov } 20573da691bfSIlya Dryomov 20585a237819SIlya Dryomov union rbd_img_fill_iter { 20595a237819SIlya Dryomov struct ceph_bio_iter bio_iter; 20605a237819SIlya Dryomov struct ceph_bvec_iter bvec_iter; 20615a237819SIlya Dryomov }; 20625a237819SIlya 
Dryomov 20635a237819SIlya Dryomov struct rbd_img_fill_ctx { 20645a237819SIlya Dryomov enum obj_request_type pos_type; 20655a237819SIlya Dryomov union rbd_img_fill_iter *pos; 20665a237819SIlya Dryomov union rbd_img_fill_iter iter; 20675a237819SIlya Dryomov ceph_object_extent_fn_t set_pos_fn; 2068afb97888SIlya Dryomov ceph_object_extent_fn_t count_fn; 2069afb97888SIlya Dryomov ceph_object_extent_fn_t copy_fn; 20705a237819SIlya Dryomov }; 20715a237819SIlya Dryomov 20725a237819SIlya Dryomov static struct ceph_object_extent *alloc_object_extent(void *arg) 20735a237819SIlya Dryomov { 20745a237819SIlya Dryomov struct rbd_img_request *img_req = arg; 20755a237819SIlya Dryomov struct rbd_obj_request *obj_req; 20765a237819SIlya Dryomov 20775a237819SIlya Dryomov obj_req = rbd_obj_request_create(); 20785a237819SIlya Dryomov if (!obj_req) 20795a237819SIlya Dryomov return NULL; 20805a237819SIlya Dryomov 20815a237819SIlya Dryomov rbd_img_obj_request_add(img_req, obj_req); 20825a237819SIlya Dryomov return &obj_req->ex; 20835a237819SIlya Dryomov } 20845a237819SIlya Dryomov 20855a237819SIlya Dryomov /* 2086afb97888SIlya Dryomov * While su != os && sc == 1 is technically not fancy (it's the same 2087afb97888SIlya Dryomov * layout as su == os && sc == 1), we can't use the nocopy path for it 2088afb97888SIlya Dryomov * because ->set_pos_fn() should be called only once per object. 2089afb97888SIlya Dryomov * ceph_file_to_extents() invokes action_fn once per stripe unit, so 2090afb97888SIlya Dryomov * treat su != os && sc == 1 as fancy. 
20915a237819SIlya Dryomov */ 2092afb97888SIlya Dryomov static bool rbd_layout_is_fancy(struct ceph_file_layout *l) 2093afb97888SIlya Dryomov { 2094afb97888SIlya Dryomov return l->stripe_unit != l->object_size; 2095afb97888SIlya Dryomov } 2096afb97888SIlya Dryomov 2097afb97888SIlya Dryomov static int rbd_img_fill_request_nocopy(struct rbd_img_request *img_req, 20985a237819SIlya Dryomov struct ceph_file_extent *img_extents, 20995a237819SIlya Dryomov u32 num_img_extents, 21005a237819SIlya Dryomov struct rbd_img_fill_ctx *fctx) 21015a237819SIlya Dryomov { 21025a237819SIlya Dryomov u32 i; 21035a237819SIlya Dryomov int ret; 21045a237819SIlya Dryomov 21055a237819SIlya Dryomov img_req->data_type = fctx->pos_type; 21065a237819SIlya Dryomov 21075a237819SIlya Dryomov /* 21085a237819SIlya Dryomov * Create object requests and set each object request's starting 21095a237819SIlya Dryomov * position in the provided bio (list) or bio_vec array. 21105a237819SIlya Dryomov */ 21115a237819SIlya Dryomov fctx->iter = *fctx->pos; 21125a237819SIlya Dryomov for (i = 0; i < num_img_extents; i++) { 21135a237819SIlya Dryomov ret = ceph_file_to_extents(&img_req->rbd_dev->layout, 21145a237819SIlya Dryomov img_extents[i].fe_off, 21155a237819SIlya Dryomov img_extents[i].fe_len, 21165a237819SIlya Dryomov &img_req->object_extents, 21175a237819SIlya Dryomov alloc_object_extent, img_req, 21185a237819SIlya Dryomov fctx->set_pos_fn, &fctx->iter); 21195a237819SIlya Dryomov if (ret) 21205a237819SIlya Dryomov return ret; 21215a237819SIlya Dryomov } 21225a237819SIlya Dryomov 21235a237819SIlya Dryomov return __rbd_img_fill_request(img_req); 21245a237819SIlya Dryomov } 21255a237819SIlya Dryomov 2126afb97888SIlya Dryomov /* 2127afb97888SIlya Dryomov * Map a list of image extents to a list of object extents, create the 2128afb97888SIlya Dryomov * corresponding object requests (normally each to a different object, 2129afb97888SIlya Dryomov * but not always) and add them to @img_req. 
For each object request, 2130afb97888SIlya Dryomov * set up its data descriptor to point to the corresponding chunk(s) of 2131afb97888SIlya Dryomov * @fctx->pos data buffer. 2132afb97888SIlya Dryomov * 2133afb97888SIlya Dryomov * Because ceph_file_to_extents() will merge adjacent object extents 2134afb97888SIlya Dryomov * together, each object request's data descriptor may point to multiple 2135afb97888SIlya Dryomov * different chunks of @fctx->pos data buffer. 2136afb97888SIlya Dryomov * 2137afb97888SIlya Dryomov * @fctx->pos data buffer is assumed to be large enough. 2138afb97888SIlya Dryomov */ 2139afb97888SIlya Dryomov static int rbd_img_fill_request(struct rbd_img_request *img_req, 2140afb97888SIlya Dryomov struct ceph_file_extent *img_extents, 2141afb97888SIlya Dryomov u32 num_img_extents, 2142afb97888SIlya Dryomov struct rbd_img_fill_ctx *fctx) 2143afb97888SIlya Dryomov { 2144afb97888SIlya Dryomov struct rbd_device *rbd_dev = img_req->rbd_dev; 2145afb97888SIlya Dryomov struct rbd_obj_request *obj_req; 2146afb97888SIlya Dryomov u32 i; 2147afb97888SIlya Dryomov int ret; 2148afb97888SIlya Dryomov 2149afb97888SIlya Dryomov if (fctx->pos_type == OBJ_REQUEST_NODATA || 2150afb97888SIlya Dryomov !rbd_layout_is_fancy(&rbd_dev->layout)) 2151afb97888SIlya Dryomov return rbd_img_fill_request_nocopy(img_req, img_extents, 2152afb97888SIlya Dryomov num_img_extents, fctx); 2153afb97888SIlya Dryomov 2154afb97888SIlya Dryomov img_req->data_type = OBJ_REQUEST_OWN_BVECS; 2155afb97888SIlya Dryomov 2156afb97888SIlya Dryomov /* 2157afb97888SIlya Dryomov * Create object requests and determine ->bvec_count for each object 2158afb97888SIlya Dryomov * request. Note that ->bvec_count sum over all object requests may 2159afb97888SIlya Dryomov * be greater than the number of bio_vecs in the provided bio (list) 2160afb97888SIlya Dryomov * or bio_vec array because when mapped, those bio_vecs can straddle 2161afb97888SIlya Dryomov * stripe unit boundaries. 
2162afb97888SIlya Dryomov */ 2163afb97888SIlya Dryomov fctx->iter = *fctx->pos; 2164afb97888SIlya Dryomov for (i = 0; i < num_img_extents; i++) { 2165afb97888SIlya Dryomov ret = ceph_file_to_extents(&rbd_dev->layout, 2166afb97888SIlya Dryomov img_extents[i].fe_off, 2167afb97888SIlya Dryomov img_extents[i].fe_len, 2168afb97888SIlya Dryomov &img_req->object_extents, 2169afb97888SIlya Dryomov alloc_object_extent, img_req, 2170afb97888SIlya Dryomov fctx->count_fn, &fctx->iter); 2171afb97888SIlya Dryomov if (ret) 2172afb97888SIlya Dryomov return ret; 2173afb97888SIlya Dryomov } 2174afb97888SIlya Dryomov 2175afb97888SIlya Dryomov for_each_obj_request(img_req, obj_req) { 2176afb97888SIlya Dryomov obj_req->bvec_pos.bvecs = kmalloc_array(obj_req->bvec_count, 2177afb97888SIlya Dryomov sizeof(*obj_req->bvec_pos.bvecs), 2178afb97888SIlya Dryomov GFP_NOIO); 2179afb97888SIlya Dryomov if (!obj_req->bvec_pos.bvecs) 2180afb97888SIlya Dryomov return -ENOMEM; 2181afb97888SIlya Dryomov } 2182afb97888SIlya Dryomov 2183afb97888SIlya Dryomov /* 2184afb97888SIlya Dryomov * Fill in each object request's private bio_vec array, splitting and 2185afb97888SIlya Dryomov * rearranging the provided bio_vecs in stripe unit chunks as needed. 
2186afb97888SIlya Dryomov */ 2187afb97888SIlya Dryomov fctx->iter = *fctx->pos; 2188afb97888SIlya Dryomov for (i = 0; i < num_img_extents; i++) { 2189afb97888SIlya Dryomov ret = ceph_iterate_extents(&rbd_dev->layout, 2190afb97888SIlya Dryomov img_extents[i].fe_off, 2191afb97888SIlya Dryomov img_extents[i].fe_len, 2192afb97888SIlya Dryomov &img_req->object_extents, 2193afb97888SIlya Dryomov fctx->copy_fn, &fctx->iter); 2194afb97888SIlya Dryomov if (ret) 2195afb97888SIlya Dryomov return ret; 2196afb97888SIlya Dryomov } 2197afb97888SIlya Dryomov 2198afb97888SIlya Dryomov return __rbd_img_fill_request(img_req); 2199afb97888SIlya Dryomov } 2200afb97888SIlya Dryomov 22015a237819SIlya Dryomov static int rbd_img_fill_nodata(struct rbd_img_request *img_req, 22025a237819SIlya Dryomov u64 off, u64 len) 22035a237819SIlya Dryomov { 22045a237819SIlya Dryomov struct ceph_file_extent ex = { off, len }; 22055a237819SIlya Dryomov union rbd_img_fill_iter dummy; 22065a237819SIlya Dryomov struct rbd_img_fill_ctx fctx = { 22075a237819SIlya Dryomov .pos_type = OBJ_REQUEST_NODATA, 22085a237819SIlya Dryomov .pos = &dummy, 22095a237819SIlya Dryomov }; 22105a237819SIlya Dryomov 22115a237819SIlya Dryomov return rbd_img_fill_request(img_req, &ex, 1, &fctx); 22125a237819SIlya Dryomov } 22135a237819SIlya Dryomov 22145a237819SIlya Dryomov static void set_bio_pos(struct ceph_object_extent *ex, u32 bytes, void *arg) 22155a237819SIlya Dryomov { 22165a237819SIlya Dryomov struct rbd_obj_request *obj_req = 22175a237819SIlya Dryomov container_of(ex, struct rbd_obj_request, ex); 22185a237819SIlya Dryomov struct ceph_bio_iter *it = arg; 22195a237819SIlya Dryomov 22205a237819SIlya Dryomov dout("%s objno %llu bytes %u\n", __func__, ex->oe_objno, bytes); 22215a237819SIlya Dryomov obj_req->bio_pos = *it; 22225a237819SIlya Dryomov ceph_bio_iter_advance(it, bytes); 22235a237819SIlya Dryomov } 22245a237819SIlya Dryomov 2225afb97888SIlya Dryomov static void count_bio_bvecs(struct ceph_object_extent *ex, u32 
bytes, void *arg) 2226afb97888SIlya Dryomov { 2227afb97888SIlya Dryomov struct rbd_obj_request *obj_req = 2228afb97888SIlya Dryomov container_of(ex, struct rbd_obj_request, ex); 2229afb97888SIlya Dryomov struct ceph_bio_iter *it = arg; 2230afb97888SIlya Dryomov 2231afb97888SIlya Dryomov dout("%s objno %llu bytes %u\n", __func__, ex->oe_objno, bytes); 2232afb97888SIlya Dryomov ceph_bio_iter_advance_step(it, bytes, ({ 2233afb97888SIlya Dryomov obj_req->bvec_count++; 2234afb97888SIlya Dryomov })); 2235afb97888SIlya Dryomov 2236afb97888SIlya Dryomov } 2237afb97888SIlya Dryomov 2238afb97888SIlya Dryomov static void copy_bio_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg) 2239afb97888SIlya Dryomov { 2240afb97888SIlya Dryomov struct rbd_obj_request *obj_req = 2241afb97888SIlya Dryomov container_of(ex, struct rbd_obj_request, ex); 2242afb97888SIlya Dryomov struct ceph_bio_iter *it = arg; 2243afb97888SIlya Dryomov 2244afb97888SIlya Dryomov dout("%s objno %llu bytes %u\n", __func__, ex->oe_objno, bytes); 2245afb97888SIlya Dryomov ceph_bio_iter_advance_step(it, bytes, ({ 2246afb97888SIlya Dryomov obj_req->bvec_pos.bvecs[obj_req->bvec_idx++] = bv; 2247afb97888SIlya Dryomov obj_req->bvec_pos.iter.bi_size += bv.bv_len; 2248afb97888SIlya Dryomov })); 2249afb97888SIlya Dryomov } 2250afb97888SIlya Dryomov 22515a237819SIlya Dryomov static int __rbd_img_fill_from_bio(struct rbd_img_request *img_req, 22525a237819SIlya Dryomov struct ceph_file_extent *img_extents, 22535a237819SIlya Dryomov u32 num_img_extents, 22545a237819SIlya Dryomov struct ceph_bio_iter *bio_pos) 22555a237819SIlya Dryomov { 22565a237819SIlya Dryomov struct rbd_img_fill_ctx fctx = { 22575a237819SIlya Dryomov .pos_type = OBJ_REQUEST_BIO, 22585a237819SIlya Dryomov .pos = (union rbd_img_fill_iter *)bio_pos, 22595a237819SIlya Dryomov .set_pos_fn = set_bio_pos, 2260afb97888SIlya Dryomov .count_fn = count_bio_bvecs, 2261afb97888SIlya Dryomov .copy_fn = copy_bio_bvecs, 22625a237819SIlya Dryomov }; 
22635a237819SIlya Dryomov 22645a237819SIlya Dryomov return rbd_img_fill_request(img_req, img_extents, num_img_extents, 22655a237819SIlya Dryomov &fctx); 22665a237819SIlya Dryomov } 22675a237819SIlya Dryomov 22685a237819SIlya Dryomov static int rbd_img_fill_from_bio(struct rbd_img_request *img_req, 22695a237819SIlya Dryomov u64 off, u64 len, struct bio *bio) 22705a237819SIlya Dryomov { 22715a237819SIlya Dryomov struct ceph_file_extent ex = { off, len }; 22725a237819SIlya Dryomov struct ceph_bio_iter it = { .bio = bio, .iter = bio->bi_iter }; 22735a237819SIlya Dryomov 22745a237819SIlya Dryomov return __rbd_img_fill_from_bio(img_req, &ex, 1, &it); 22755a237819SIlya Dryomov } 22765a237819SIlya Dryomov 22775a237819SIlya Dryomov static void set_bvec_pos(struct ceph_object_extent *ex, u32 bytes, void *arg) 22785a237819SIlya Dryomov { 22795a237819SIlya Dryomov struct rbd_obj_request *obj_req = 22805a237819SIlya Dryomov container_of(ex, struct rbd_obj_request, ex); 22815a237819SIlya Dryomov struct ceph_bvec_iter *it = arg; 22825a237819SIlya Dryomov 22835a237819SIlya Dryomov obj_req->bvec_pos = *it; 22845a237819SIlya Dryomov ceph_bvec_iter_shorten(&obj_req->bvec_pos, bytes); 22855a237819SIlya Dryomov ceph_bvec_iter_advance(it, bytes); 22865a237819SIlya Dryomov } 22875a237819SIlya Dryomov 2288afb97888SIlya Dryomov static void count_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg) 2289afb97888SIlya Dryomov { 2290afb97888SIlya Dryomov struct rbd_obj_request *obj_req = 2291afb97888SIlya Dryomov container_of(ex, struct rbd_obj_request, ex); 2292afb97888SIlya Dryomov struct ceph_bvec_iter *it = arg; 2293afb97888SIlya Dryomov 2294afb97888SIlya Dryomov ceph_bvec_iter_advance_step(it, bytes, ({ 2295afb97888SIlya Dryomov obj_req->bvec_count++; 2296afb97888SIlya Dryomov })); 2297afb97888SIlya Dryomov } 2298afb97888SIlya Dryomov 2299afb97888SIlya Dryomov static void copy_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg) 2300afb97888SIlya Dryomov { 2301afb97888SIlya 
Dryomov struct rbd_obj_request *obj_req = 2302afb97888SIlya Dryomov container_of(ex, struct rbd_obj_request, ex); 2303afb97888SIlya Dryomov struct ceph_bvec_iter *it = arg; 2304afb97888SIlya Dryomov 2305afb97888SIlya Dryomov ceph_bvec_iter_advance_step(it, bytes, ({ 2306afb97888SIlya Dryomov obj_req->bvec_pos.bvecs[obj_req->bvec_idx++] = bv; 2307afb97888SIlya Dryomov obj_req->bvec_pos.iter.bi_size += bv.bv_len; 2308afb97888SIlya Dryomov })); 2309afb97888SIlya Dryomov } 2310afb97888SIlya Dryomov 23115a237819SIlya Dryomov static int __rbd_img_fill_from_bvecs(struct rbd_img_request *img_req, 23125a237819SIlya Dryomov struct ceph_file_extent *img_extents, 23135a237819SIlya Dryomov u32 num_img_extents, 23145a237819SIlya Dryomov struct ceph_bvec_iter *bvec_pos) 23155a237819SIlya Dryomov { 23165a237819SIlya Dryomov struct rbd_img_fill_ctx fctx = { 23175a237819SIlya Dryomov .pos_type = OBJ_REQUEST_BVECS, 23185a237819SIlya Dryomov .pos = (union rbd_img_fill_iter *)bvec_pos, 23195a237819SIlya Dryomov .set_pos_fn = set_bvec_pos, 2320afb97888SIlya Dryomov .count_fn = count_bvecs, 2321afb97888SIlya Dryomov .copy_fn = copy_bvecs, 23225a237819SIlya Dryomov }; 23235a237819SIlya Dryomov 23245a237819SIlya Dryomov return rbd_img_fill_request(img_req, img_extents, num_img_extents, 23255a237819SIlya Dryomov &fctx); 23265a237819SIlya Dryomov } 23275a237819SIlya Dryomov 23285a237819SIlya Dryomov static int rbd_img_fill_from_bvecs(struct rbd_img_request *img_req, 23295a237819SIlya Dryomov struct ceph_file_extent *img_extents, 23305a237819SIlya Dryomov u32 num_img_extents, 23315a237819SIlya Dryomov struct bio_vec *bvecs) 23325a237819SIlya Dryomov { 23335a237819SIlya Dryomov struct ceph_bvec_iter it = { 23345a237819SIlya Dryomov .bvecs = bvecs, 23355a237819SIlya Dryomov .iter = { .bi_size = ceph_file_extents_bytes(img_extents, 23365a237819SIlya Dryomov num_img_extents) }, 23375a237819SIlya Dryomov }; 23385a237819SIlya Dryomov 23395a237819SIlya Dryomov return 
__rbd_img_fill_from_bvecs(img_req, img_extents, num_img_extents, 23405a237819SIlya Dryomov &it); 23415a237819SIlya Dryomov } 23425a237819SIlya Dryomov 2343efbd1a11SIlya Dryomov static void rbd_img_request_submit(struct rbd_img_request *img_request) 2344bf0d5f50SAlex Elder { 2345bf0d5f50SAlex Elder struct rbd_obj_request *obj_request; 2346bf0d5f50SAlex Elder 234737206ee5SAlex Elder dout("%s: img %p\n", __func__, img_request); 2348bf0d5f50SAlex Elder 2349663ae2ccSIlya Dryomov rbd_img_request_get(img_request); 2350efbd1a11SIlya Dryomov for_each_obj_request(img_request, obj_request) 23513da691bfSIlya Dryomov rbd_obj_request_submit(obj_request); 2352bf0d5f50SAlex Elder 2353663ae2ccSIlya Dryomov rbd_img_request_put(img_request); 2354bf0d5f50SAlex Elder } 2355bf0d5f50SAlex Elder 235686bd7998SIlya Dryomov static int rbd_obj_read_from_parent(struct rbd_obj_request *obj_req) 23573da691bfSIlya Dryomov { 23583da691bfSIlya Dryomov struct rbd_img_request *img_req = obj_req->img_request; 23593da691bfSIlya Dryomov struct rbd_img_request *child_img_req; 23603da691bfSIlya Dryomov int ret; 23613da691bfSIlya Dryomov 2362e93aca0aSIlya Dryomov child_img_req = rbd_img_request_create(img_req->rbd_dev->parent, 2363e93aca0aSIlya Dryomov OBJ_OP_READ, NULL); 23643da691bfSIlya Dryomov if (!child_img_req) 23653da691bfSIlya Dryomov return -ENOMEM; 23663da691bfSIlya Dryomov 2367e93aca0aSIlya Dryomov __set_bit(IMG_REQ_CHILD, &child_img_req->flags); 2368e93aca0aSIlya Dryomov child_img_req->obj_request = obj_req; 2369e93aca0aSIlya Dryomov 23703da691bfSIlya Dryomov if (!rbd_img_is_write(img_req)) { 2371ecc633caSIlya Dryomov switch (img_req->data_type) { 23723da691bfSIlya Dryomov case OBJ_REQUEST_BIO: 23735a237819SIlya Dryomov ret = __rbd_img_fill_from_bio(child_img_req, 23745a237819SIlya Dryomov obj_req->img_extents, 23755a237819SIlya Dryomov obj_req->num_img_extents, 23763da691bfSIlya Dryomov &obj_req->bio_pos); 23773da691bfSIlya Dryomov break; 23783da691bfSIlya Dryomov case OBJ_REQUEST_BVECS: 
2379afb97888SIlya Dryomov case OBJ_REQUEST_OWN_BVECS: 23805a237819SIlya Dryomov ret = __rbd_img_fill_from_bvecs(child_img_req, 23815a237819SIlya Dryomov obj_req->img_extents, 23825a237819SIlya Dryomov obj_req->num_img_extents, 23833da691bfSIlya Dryomov &obj_req->bvec_pos); 23843da691bfSIlya Dryomov break; 23853da691bfSIlya Dryomov default: 23863da691bfSIlya Dryomov rbd_assert(0); 23873da691bfSIlya Dryomov } 23883da691bfSIlya Dryomov } else { 23895a237819SIlya Dryomov ret = rbd_img_fill_from_bvecs(child_img_req, 23905a237819SIlya Dryomov obj_req->img_extents, 23915a237819SIlya Dryomov obj_req->num_img_extents, 23925a237819SIlya Dryomov obj_req->copyup_bvecs); 23933da691bfSIlya Dryomov } 23943da691bfSIlya Dryomov if (ret) { 23953da691bfSIlya Dryomov rbd_img_request_put(child_img_req); 2396663ae2ccSIlya Dryomov return ret; 2397bf0d5f50SAlex Elder } 2398bf0d5f50SAlex Elder 23993da691bfSIlya Dryomov rbd_img_request_submit(child_img_req); 24003da691bfSIlya Dryomov return 0; 24013da691bfSIlya Dryomov } 24023da691bfSIlya Dryomov 24033da691bfSIlya Dryomov static bool rbd_obj_handle_read(struct rbd_obj_request *obj_req) 24048b3e1a56SAlex Elder { 24053da691bfSIlya Dryomov struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev; 24063da691bfSIlya Dryomov int ret; 24078b3e1a56SAlex Elder 24083da691bfSIlya Dryomov if (obj_req->result == -ENOENT && 240986bd7998SIlya Dryomov rbd_dev->parent_overlap && !obj_req->tried_parent) { 241086bd7998SIlya Dryomov /* reverse map this object extent onto the parent */ 241186bd7998SIlya Dryomov ret = rbd_obj_calc_img_extents(obj_req, false); 241286bd7998SIlya Dryomov if (ret) { 241386bd7998SIlya Dryomov obj_req->result = ret; 241486bd7998SIlya Dryomov return true; 241586bd7998SIlya Dryomov } 24168b3e1a56SAlex Elder 241786bd7998SIlya Dryomov if (obj_req->num_img_extents) { 24183da691bfSIlya Dryomov obj_req->tried_parent = true; 241986bd7998SIlya Dryomov ret = rbd_obj_read_from_parent(obj_req); 24203da691bfSIlya Dryomov if (ret) { 
24213da691bfSIlya Dryomov obj_req->result = ret; 24223da691bfSIlya Dryomov return true; 24233da691bfSIlya Dryomov } 24243da691bfSIlya Dryomov return false; 24253da691bfSIlya Dryomov } 242686bd7998SIlya Dryomov } 242702c74fbaSAlex Elder 242802c74fbaSAlex Elder /* 24293da691bfSIlya Dryomov * -ENOENT means a hole in the image -- zero-fill the entire 24303da691bfSIlya Dryomov * length of the request. A short read also implies zero-fill 24313da691bfSIlya Dryomov * to the end of the request. In both cases we update xferred 24323da691bfSIlya Dryomov * count to indicate the whole request was satisfied. 243302c74fbaSAlex Elder */ 24343da691bfSIlya Dryomov if (obj_req->result == -ENOENT || 243543df3d35SIlya Dryomov (!obj_req->result && obj_req->xferred < obj_req->ex.oe_len)) { 24363da691bfSIlya Dryomov rbd_assert(!obj_req->xferred || !obj_req->result); 24373da691bfSIlya Dryomov rbd_obj_zero_range(obj_req, obj_req->xferred, 243843df3d35SIlya Dryomov obj_req->ex.oe_len - obj_req->xferred); 24393da691bfSIlya Dryomov obj_req->result = 0; 244043df3d35SIlya Dryomov obj_req->xferred = obj_req->ex.oe_len; 24413da691bfSIlya Dryomov } 24423da691bfSIlya Dryomov 24433da691bfSIlya Dryomov return true; 24443da691bfSIlya Dryomov } 24453da691bfSIlya Dryomov 24463da691bfSIlya Dryomov /* 24473da691bfSIlya Dryomov * copyup_bvecs pages are never highmem pages 24483da691bfSIlya Dryomov */ 24493da691bfSIlya Dryomov static bool is_zero_bvecs(struct bio_vec *bvecs, u32 bytes) 24503da691bfSIlya Dryomov { 24513da691bfSIlya Dryomov struct ceph_bvec_iter it = { 24523da691bfSIlya Dryomov .bvecs = bvecs, 24533da691bfSIlya Dryomov .iter = { .bi_size = bytes }, 24543da691bfSIlya Dryomov }; 24553da691bfSIlya Dryomov 24563da691bfSIlya Dryomov ceph_bvec_iter_advance_step(&it, bytes, ({ 24573da691bfSIlya Dryomov if (memchr_inv(page_address(bv.bv_page) + bv.bv_offset, 0, 24583da691bfSIlya Dryomov bv.bv_len)) 24593da691bfSIlya Dryomov return false; 24603da691bfSIlya Dryomov })); 24613da691bfSIlya Dryomov return 
true; 24623da691bfSIlya Dryomov } 24633da691bfSIlya Dryomov 24643a482501SIlya Dryomov #define MODS_ONLY U32_MAX 24653a482501SIlya Dryomov 246689a59c1cSIlya Dryomov static int rbd_obj_issue_copyup_empty_snapc(struct rbd_obj_request *obj_req, 246789a59c1cSIlya Dryomov u32 bytes) 24683da691bfSIlya Dryomov { 2469fe943d50SChengguang Xu int ret; 24703da691bfSIlya Dryomov 24713da691bfSIlya Dryomov dout("%s obj_req %p bytes %u\n", __func__, obj_req, bytes); 24723da691bfSIlya Dryomov rbd_assert(obj_req->osd_req->r_ops[0].op == CEPH_OSD_OP_STAT); 247389a59c1cSIlya Dryomov rbd_assert(bytes > 0 && bytes != MODS_ONLY); 24743da691bfSIlya Dryomov rbd_osd_req_destroy(obj_req->osd_req); 24753da691bfSIlya Dryomov 247689a59c1cSIlya Dryomov obj_req->osd_req = __rbd_osd_req_create(obj_req, &rbd_empty_snapc, 1); 24773da691bfSIlya Dryomov if (!obj_req->osd_req) 24783da691bfSIlya Dryomov return -ENOMEM; 24793da691bfSIlya Dryomov 248024639ce5SIlya Dryomov ret = osd_req_op_cls_init(obj_req->osd_req, 0, "rbd", "copyup"); 2481fe943d50SChengguang Xu if (ret) 2482fe943d50SChengguang Xu return ret; 2483fe943d50SChengguang Xu 24843da691bfSIlya Dryomov osd_req_op_cls_request_data_bvecs(obj_req->osd_req, 0, 24850010f705SIlya Dryomov obj_req->copyup_bvecs, 24860010f705SIlya Dryomov obj_req->copyup_bvec_count, 24870010f705SIlya Dryomov bytes); 248889a59c1cSIlya Dryomov rbd_osd_req_format_write(obj_req); 24893da691bfSIlya Dryomov 249089a59c1cSIlya Dryomov ret = ceph_osdc_alloc_messages(obj_req->osd_req, GFP_NOIO); 249189a59c1cSIlya Dryomov if (ret) 249289a59c1cSIlya Dryomov return ret; 249389a59c1cSIlya Dryomov 249489a59c1cSIlya Dryomov rbd_obj_request_submit(obj_req); 249589a59c1cSIlya Dryomov return 0; 249689a59c1cSIlya Dryomov } 249789a59c1cSIlya Dryomov 24983a482501SIlya Dryomov static int rbd_obj_issue_copyup_ops(struct rbd_obj_request *obj_req, u32 bytes) 24993da691bfSIlya Dryomov { 250013488d53SIlya Dryomov struct rbd_img_request *img_req = obj_req->img_request; 25013a482501SIlya Dryomov 
unsigned int num_osd_ops = (bytes != MODS_ONLY); 25023a482501SIlya Dryomov unsigned int which = 0; 25033da691bfSIlya Dryomov int ret; 25043da691bfSIlya Dryomov 25053da691bfSIlya Dryomov dout("%s obj_req %p bytes %u\n", __func__, obj_req, bytes); 250689a59c1cSIlya Dryomov rbd_assert(obj_req->osd_req->r_ops[0].op == CEPH_OSD_OP_STAT || 250789a59c1cSIlya Dryomov obj_req->osd_req->r_ops[0].op == CEPH_OSD_OP_CALL); 25083da691bfSIlya Dryomov rbd_osd_req_destroy(obj_req->osd_req); 25093da691bfSIlya Dryomov 251013488d53SIlya Dryomov switch (img_req->op_type) { 25113da691bfSIlya Dryomov case OBJ_OP_WRITE: 251213488d53SIlya Dryomov num_osd_ops += count_write_ops(obj_req); 25133da691bfSIlya Dryomov break; 251413488d53SIlya Dryomov case OBJ_OP_ZEROOUT: 251513488d53SIlya Dryomov num_osd_ops += count_zeroout_ops(obj_req); 251613488d53SIlya Dryomov break; 251713488d53SIlya Dryomov default: 251813488d53SIlya Dryomov rbd_assert(0); 251913488d53SIlya Dryomov } 252013488d53SIlya Dryomov 25213da691bfSIlya Dryomov obj_req->osd_req = rbd_osd_req_create(obj_req, num_osd_ops); 25223da691bfSIlya Dryomov if (!obj_req->osd_req) 25233da691bfSIlya Dryomov return -ENOMEM; 25243da691bfSIlya Dryomov 25253a482501SIlya Dryomov if (bytes != MODS_ONLY) { 25263a482501SIlya Dryomov ret = osd_req_op_cls_init(obj_req->osd_req, which, "rbd", 25273a482501SIlya Dryomov "copyup"); 25283da691bfSIlya Dryomov if (ret) 25293da691bfSIlya Dryomov return ret; 25303da691bfSIlya Dryomov 25313a482501SIlya Dryomov osd_req_op_cls_request_data_bvecs(obj_req->osd_req, which++, 25323da691bfSIlya Dryomov obj_req->copyup_bvecs, 25333da691bfSIlya Dryomov obj_req->copyup_bvec_count, 25343da691bfSIlya Dryomov bytes); 25353a482501SIlya Dryomov } 25363da691bfSIlya Dryomov 253713488d53SIlya Dryomov switch (img_req->op_type) { 25383da691bfSIlya Dryomov case OBJ_OP_WRITE: 25393a482501SIlya Dryomov __rbd_obj_setup_write(obj_req, which); 25403da691bfSIlya Dryomov break; 25416484cbe9SIlya Dryomov case OBJ_OP_ZEROOUT: 25423a482501SIlya 
Dryomov __rbd_obj_setup_zeroout(obj_req, which); 25433da691bfSIlya Dryomov break; 25443da691bfSIlya Dryomov default: 25453da691bfSIlya Dryomov rbd_assert(0); 25463da691bfSIlya Dryomov } 25473da691bfSIlya Dryomov 254826f887e0SIlya Dryomov ret = ceph_osdc_alloc_messages(obj_req->osd_req, GFP_NOIO); 254926f887e0SIlya Dryomov if (ret) 255026f887e0SIlya Dryomov return ret; 255126f887e0SIlya Dryomov 25523da691bfSIlya Dryomov rbd_obj_request_submit(obj_req); 25533da691bfSIlya Dryomov return 0; 25543da691bfSIlya Dryomov } 25553da691bfSIlya Dryomov 25563a482501SIlya Dryomov static int rbd_obj_issue_copyup(struct rbd_obj_request *obj_req, u32 bytes) 25573a482501SIlya Dryomov { 25583a482501SIlya Dryomov /* 25593a482501SIlya Dryomov * Only send non-zero copyup data to save some I/O and network 25603a482501SIlya Dryomov * bandwidth -- zero copyup data is equivalent to the object not 25613a482501SIlya Dryomov * existing. 25623a482501SIlya Dryomov */ 25633a482501SIlya Dryomov if (is_zero_bvecs(obj_req->copyup_bvecs, bytes)) { 25643a482501SIlya Dryomov dout("%s obj_req %p detected zeroes\n", __func__, obj_req); 25653a482501SIlya Dryomov bytes = 0; 25663a482501SIlya Dryomov } 25673a482501SIlya Dryomov 256889a59c1cSIlya Dryomov if (obj_req->img_request->snapc->num_snaps && bytes > 0) { 256989a59c1cSIlya Dryomov /* 257089a59c1cSIlya Dryomov * Send a copyup request with an empty snapshot context to 257189a59c1cSIlya Dryomov * deep-copyup the object through all existing snapshots. 257289a59c1cSIlya Dryomov * A second request with the current snapshot context will be 257389a59c1cSIlya Dryomov * sent for the actual modification. 
257489a59c1cSIlya Dryomov */ 257589a59c1cSIlya Dryomov obj_req->write_state = RBD_OBJ_WRITE_COPYUP_EMPTY_SNAPC; 257689a59c1cSIlya Dryomov return rbd_obj_issue_copyup_empty_snapc(obj_req, bytes); 257789a59c1cSIlya Dryomov } 257889a59c1cSIlya Dryomov 25793a482501SIlya Dryomov obj_req->write_state = RBD_OBJ_WRITE_COPYUP_OPS; 25803a482501SIlya Dryomov return rbd_obj_issue_copyup_ops(obj_req, bytes); 25813a482501SIlya Dryomov } 25823a482501SIlya Dryomov 25837e07efb1SIlya Dryomov static int setup_copyup_bvecs(struct rbd_obj_request *obj_req, u64 obj_overlap) 25847e07efb1SIlya Dryomov { 25857e07efb1SIlya Dryomov u32 i; 25867e07efb1SIlya Dryomov 25877e07efb1SIlya Dryomov rbd_assert(!obj_req->copyup_bvecs); 25887e07efb1SIlya Dryomov obj_req->copyup_bvec_count = calc_pages_for(0, obj_overlap); 25897e07efb1SIlya Dryomov obj_req->copyup_bvecs = kcalloc(obj_req->copyup_bvec_count, 25907e07efb1SIlya Dryomov sizeof(*obj_req->copyup_bvecs), 25917e07efb1SIlya Dryomov GFP_NOIO); 25927e07efb1SIlya Dryomov if (!obj_req->copyup_bvecs) 25937e07efb1SIlya Dryomov return -ENOMEM; 25947e07efb1SIlya Dryomov 25957e07efb1SIlya Dryomov for (i = 0; i < obj_req->copyup_bvec_count; i++) { 25967e07efb1SIlya Dryomov unsigned int len = min(obj_overlap, (u64)PAGE_SIZE); 25977e07efb1SIlya Dryomov 25987e07efb1SIlya Dryomov obj_req->copyup_bvecs[i].bv_page = alloc_page(GFP_NOIO); 25997e07efb1SIlya Dryomov if (!obj_req->copyup_bvecs[i].bv_page) 26007e07efb1SIlya Dryomov return -ENOMEM; 26017e07efb1SIlya Dryomov 26027e07efb1SIlya Dryomov obj_req->copyup_bvecs[i].bv_offset = 0; 26037e07efb1SIlya Dryomov obj_req->copyup_bvecs[i].bv_len = len; 26047e07efb1SIlya Dryomov obj_overlap -= len; 26057e07efb1SIlya Dryomov } 26067e07efb1SIlya Dryomov 26077e07efb1SIlya Dryomov rbd_assert(!obj_overlap); 26087e07efb1SIlya Dryomov return 0; 26097e07efb1SIlya Dryomov } 26107e07efb1SIlya Dryomov 26113da691bfSIlya Dryomov static int rbd_obj_handle_write_guard(struct rbd_obj_request *obj_req) 26123da691bfSIlya Dryomov { 
26133da691bfSIlya Dryomov struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev; 26143da691bfSIlya Dryomov int ret; 26153da691bfSIlya Dryomov 261686bd7998SIlya Dryomov rbd_assert(obj_req->num_img_extents); 261786bd7998SIlya Dryomov prune_extents(obj_req->img_extents, &obj_req->num_img_extents, 261886bd7998SIlya Dryomov rbd_dev->parent_overlap); 261986bd7998SIlya Dryomov if (!obj_req->num_img_extents) { 26203da691bfSIlya Dryomov /* 26213da691bfSIlya Dryomov * The overlap has become 0 (most likely because the 26223a482501SIlya Dryomov * image has been flattened). Re-submit the original write 26233a482501SIlya Dryomov * request -- pass MODS_ONLY since the copyup isn't needed 26243a482501SIlya Dryomov * anymore. 26253da691bfSIlya Dryomov */ 26263a482501SIlya Dryomov obj_req->write_state = RBD_OBJ_WRITE_COPYUP_OPS; 26273a482501SIlya Dryomov return rbd_obj_issue_copyup_ops(obj_req, MODS_ONLY); 26283da691bfSIlya Dryomov } 26293da691bfSIlya Dryomov 263086bd7998SIlya Dryomov ret = setup_copyup_bvecs(obj_req, rbd_obj_img_extents_bytes(obj_req)); 26313da691bfSIlya Dryomov if (ret) 26323da691bfSIlya Dryomov return ret; 26333da691bfSIlya Dryomov 26343a482501SIlya Dryomov obj_req->write_state = RBD_OBJ_WRITE_READ_FROM_PARENT; 263586bd7998SIlya Dryomov return rbd_obj_read_from_parent(obj_req); 26363da691bfSIlya Dryomov } 26373da691bfSIlya Dryomov 26383da691bfSIlya Dryomov static bool rbd_obj_handle_write(struct rbd_obj_request *obj_req) 26393da691bfSIlya Dryomov { 26403da691bfSIlya Dryomov int ret; 26413da691bfSIlya Dryomov 26423da691bfSIlya Dryomov switch (obj_req->write_state) { 26433da691bfSIlya Dryomov case RBD_OBJ_WRITE_GUARD: 26443da691bfSIlya Dryomov rbd_assert(!obj_req->xferred); 26453da691bfSIlya Dryomov if (obj_req->result == -ENOENT) { 26463da691bfSIlya Dryomov /* 26473da691bfSIlya Dryomov * The target object doesn't exist. 
Read the data for 26483da691bfSIlya Dryomov * the entire target object up to the overlap point (if 26493da691bfSIlya Dryomov * any) from the parent, so we can use it for a copyup. 26503da691bfSIlya Dryomov */ 26513da691bfSIlya Dryomov ret = rbd_obj_handle_write_guard(obj_req); 26523da691bfSIlya Dryomov if (ret) { 26533da691bfSIlya Dryomov obj_req->result = ret; 26543da691bfSIlya Dryomov return true; 26553da691bfSIlya Dryomov } 26563da691bfSIlya Dryomov return false; 26573da691bfSIlya Dryomov } 26583da691bfSIlya Dryomov /* fall through */ 26593da691bfSIlya Dryomov case RBD_OBJ_WRITE_FLAT: 26603a482501SIlya Dryomov case RBD_OBJ_WRITE_COPYUP_OPS: 26613da691bfSIlya Dryomov if (!obj_req->result) 26623da691bfSIlya Dryomov /* 26633da691bfSIlya Dryomov * There is no such thing as a successful short 26643da691bfSIlya Dryomov * write -- indicate the whole request was satisfied. 26653da691bfSIlya Dryomov */ 266643df3d35SIlya Dryomov obj_req->xferred = obj_req->ex.oe_len; 26673da691bfSIlya Dryomov return true; 26683a482501SIlya Dryomov case RBD_OBJ_WRITE_READ_FROM_PARENT: 26693da691bfSIlya Dryomov if (obj_req->result) 26703a482501SIlya Dryomov return true; 26713da691bfSIlya Dryomov 26723da691bfSIlya Dryomov rbd_assert(obj_req->xferred); 26733da691bfSIlya Dryomov ret = rbd_obj_issue_copyup(obj_req, obj_req->xferred); 26743da691bfSIlya Dryomov if (ret) { 26753da691bfSIlya Dryomov obj_req->result = ret; 2676356889c4SIlya Dryomov obj_req->xferred = 0; 26773da691bfSIlya Dryomov return true; 26783da691bfSIlya Dryomov } 26793da691bfSIlya Dryomov return false; 268089a59c1cSIlya Dryomov case RBD_OBJ_WRITE_COPYUP_EMPTY_SNAPC: 268189a59c1cSIlya Dryomov if (obj_req->result) 268289a59c1cSIlya Dryomov return true; 268389a59c1cSIlya Dryomov 268489a59c1cSIlya Dryomov obj_req->write_state = RBD_OBJ_WRITE_COPYUP_OPS; 268589a59c1cSIlya Dryomov ret = rbd_obj_issue_copyup_ops(obj_req, MODS_ONLY); 268689a59c1cSIlya Dryomov if (ret) { 268789a59c1cSIlya Dryomov obj_req->result = ret; 
26883da691bfSIlya Dryomov return true; 26893da691bfSIlya Dryomov } 26903da691bfSIlya Dryomov return false; 26913da691bfSIlya Dryomov default: 2692c6244b3bSArnd Bergmann BUG(); 26933da691bfSIlya Dryomov } 26943da691bfSIlya Dryomov } 26953da691bfSIlya Dryomov 26963da691bfSIlya Dryomov /* 26973da691bfSIlya Dryomov * Returns true if @obj_req is completed, or false otherwise. 26983da691bfSIlya Dryomov */ 26993da691bfSIlya Dryomov static bool __rbd_obj_handle_request(struct rbd_obj_request *obj_req) 27003da691bfSIlya Dryomov { 27019bb0248dSIlya Dryomov switch (obj_req->img_request->op_type) { 27023da691bfSIlya Dryomov case OBJ_OP_READ: 27033da691bfSIlya Dryomov return rbd_obj_handle_read(obj_req); 27043da691bfSIlya Dryomov case OBJ_OP_WRITE: 27053da691bfSIlya Dryomov return rbd_obj_handle_write(obj_req); 27063da691bfSIlya Dryomov case OBJ_OP_DISCARD: 27076484cbe9SIlya Dryomov case OBJ_OP_ZEROOUT: 27083da691bfSIlya Dryomov if (rbd_obj_handle_write(obj_req)) { 27093da691bfSIlya Dryomov /* 27103da691bfSIlya Dryomov * Hide -ENOENT from delete/truncate/zero -- discarding 27113da691bfSIlya Dryomov * a non-existent object is not a problem. 
27123da691bfSIlya Dryomov */ 27133da691bfSIlya Dryomov if (obj_req->result == -ENOENT) { 27143da691bfSIlya Dryomov obj_req->result = 0; 271543df3d35SIlya Dryomov obj_req->xferred = obj_req->ex.oe_len; 27163da691bfSIlya Dryomov } 27173da691bfSIlya Dryomov return true; 27183da691bfSIlya Dryomov } 27193da691bfSIlya Dryomov return false; 27203da691bfSIlya Dryomov default: 2721c6244b3bSArnd Bergmann BUG(); 27223da691bfSIlya Dryomov } 27233da691bfSIlya Dryomov } 27243da691bfSIlya Dryomov 27257114edacSIlya Dryomov static void rbd_obj_end_request(struct rbd_obj_request *obj_req) 27267114edacSIlya Dryomov { 27277114edacSIlya Dryomov struct rbd_img_request *img_req = obj_req->img_request; 27287114edacSIlya Dryomov 27297114edacSIlya Dryomov rbd_assert((!obj_req->result && 273043df3d35SIlya Dryomov obj_req->xferred == obj_req->ex.oe_len) || 27317114edacSIlya Dryomov (obj_req->result < 0 && !obj_req->xferred)); 27327114edacSIlya Dryomov if (!obj_req->result) { 27337114edacSIlya Dryomov img_req->xferred += obj_req->xferred; 273402c74fbaSAlex Elder return; 273502c74fbaSAlex Elder } 273602c74fbaSAlex Elder 27377114edacSIlya Dryomov rbd_warn(img_req->rbd_dev, 27387114edacSIlya Dryomov "%s at objno %llu %llu~%llu result %d xferred %llu", 273943df3d35SIlya Dryomov obj_op_name(img_req->op_type), obj_req->ex.oe_objno, 274043df3d35SIlya Dryomov obj_req->ex.oe_off, obj_req->ex.oe_len, obj_req->result, 27417114edacSIlya Dryomov obj_req->xferred); 27427114edacSIlya Dryomov if (!img_req->result) { 27437114edacSIlya Dryomov img_req->result = obj_req->result; 27447114edacSIlya Dryomov img_req->xferred = 0; 2745a9e8ba2cSAlex Elder } 27468b3e1a56SAlex Elder } 27478b3e1a56SAlex Elder 27483da691bfSIlya Dryomov static void rbd_img_end_child_request(struct rbd_img_request *img_req) 27498b3e1a56SAlex Elder { 27503da691bfSIlya Dryomov struct rbd_obj_request *obj_req = img_req->obj_request; 27518b3e1a56SAlex Elder 27523da691bfSIlya Dryomov rbd_assert(test_bit(IMG_REQ_CHILD, &img_req->flags)); 
275386bd7998SIlya Dryomov rbd_assert((!img_req->result && 275486bd7998SIlya Dryomov img_req->xferred == rbd_obj_img_extents_bytes(obj_req)) || 275586bd7998SIlya Dryomov (img_req->result < 0 && !img_req->xferred)); 27568b3e1a56SAlex Elder 27573da691bfSIlya Dryomov obj_req->result = img_req->result; 27583da691bfSIlya Dryomov obj_req->xferred = img_req->xferred; 27593da691bfSIlya Dryomov rbd_img_request_put(img_req); 27607114edacSIlya Dryomov } 27618b3e1a56SAlex Elder 27627114edacSIlya Dryomov static void rbd_img_end_request(struct rbd_img_request *img_req) 27637114edacSIlya Dryomov { 27647114edacSIlya Dryomov rbd_assert(!test_bit(IMG_REQ_CHILD, &img_req->flags)); 27657114edacSIlya Dryomov rbd_assert((!img_req->result && 27667114edacSIlya Dryomov img_req->xferred == blk_rq_bytes(img_req->rq)) || 27677114edacSIlya Dryomov (img_req->result < 0 && !img_req->xferred)); 27688b3e1a56SAlex Elder 27697114edacSIlya Dryomov blk_mq_end_request(img_req->rq, 27707114edacSIlya Dryomov errno_to_blk_status(img_req->result)); 27717114edacSIlya Dryomov rbd_img_request_put(img_req); 27723da691bfSIlya Dryomov } 27738b3e1a56SAlex Elder 27743da691bfSIlya Dryomov static void rbd_obj_handle_request(struct rbd_obj_request *obj_req) 27753da691bfSIlya Dryomov { 27767114edacSIlya Dryomov struct rbd_img_request *img_req; 27777114edacSIlya Dryomov 27787114edacSIlya Dryomov again: 27793da691bfSIlya Dryomov if (!__rbd_obj_handle_request(obj_req)) 27808b3e1a56SAlex Elder return; 27813da691bfSIlya Dryomov 27827114edacSIlya Dryomov img_req = obj_req->img_request; 27837114edacSIlya Dryomov spin_lock(&img_req->completion_lock); 27847114edacSIlya Dryomov rbd_obj_end_request(obj_req); 27857114edacSIlya Dryomov rbd_assert(img_req->pending_count); 27867114edacSIlya Dryomov if (--img_req->pending_count) { 27877114edacSIlya Dryomov spin_unlock(&img_req->completion_lock); 27887114edacSIlya Dryomov return; 27897114edacSIlya Dryomov } 27907114edacSIlya Dryomov 27917114edacSIlya Dryomov 
spin_unlock(&img_req->completion_lock); 27927114edacSIlya Dryomov if (test_bit(IMG_REQ_CHILD, &img_req->flags)) { 27937114edacSIlya Dryomov obj_req = img_req->obj_request; 27947114edacSIlya Dryomov rbd_img_end_child_request(img_req); 27957114edacSIlya Dryomov goto again; 27967114edacSIlya Dryomov } 27977114edacSIlya Dryomov rbd_img_end_request(img_req); 27988b3e1a56SAlex Elder } 27998b3e1a56SAlex Elder 2800ed95b21aSIlya Dryomov static const struct rbd_client_id rbd_empty_cid; 2801ed95b21aSIlya Dryomov 2802ed95b21aSIlya Dryomov static bool rbd_cid_equal(const struct rbd_client_id *lhs, 2803ed95b21aSIlya Dryomov const struct rbd_client_id *rhs) 2804ed95b21aSIlya Dryomov { 2805ed95b21aSIlya Dryomov return lhs->gid == rhs->gid && lhs->handle == rhs->handle; 2806ed95b21aSIlya Dryomov } 2807ed95b21aSIlya Dryomov 2808ed95b21aSIlya Dryomov static struct rbd_client_id rbd_get_cid(struct rbd_device *rbd_dev) 2809ed95b21aSIlya Dryomov { 2810ed95b21aSIlya Dryomov struct rbd_client_id cid; 2811ed95b21aSIlya Dryomov 2812ed95b21aSIlya Dryomov mutex_lock(&rbd_dev->watch_mutex); 2813ed95b21aSIlya Dryomov cid.gid = ceph_client_gid(rbd_dev->rbd_client->client); 2814ed95b21aSIlya Dryomov cid.handle = rbd_dev->watch_cookie; 2815ed95b21aSIlya Dryomov mutex_unlock(&rbd_dev->watch_mutex); 2816ed95b21aSIlya Dryomov return cid; 2817ed95b21aSIlya Dryomov } 2818ed95b21aSIlya Dryomov 2819ed95b21aSIlya Dryomov /* 2820ed95b21aSIlya Dryomov * lock_rwsem must be held for write 2821ed95b21aSIlya Dryomov */ 2822ed95b21aSIlya Dryomov static void rbd_set_owner_cid(struct rbd_device *rbd_dev, 2823ed95b21aSIlya Dryomov const struct rbd_client_id *cid) 2824ed95b21aSIlya Dryomov { 2825ed95b21aSIlya Dryomov dout("%s rbd_dev %p %llu-%llu -> %llu-%llu\n", __func__, rbd_dev, 2826ed95b21aSIlya Dryomov rbd_dev->owner_cid.gid, rbd_dev->owner_cid.handle, 2827ed95b21aSIlya Dryomov cid->gid, cid->handle); 2828ed95b21aSIlya Dryomov rbd_dev->owner_cid = *cid; /* struct */ 2829ed95b21aSIlya Dryomov } 2830ed95b21aSIlya 
Dryomov 2831ed95b21aSIlya Dryomov static void format_lock_cookie(struct rbd_device *rbd_dev, char *buf) 2832ed95b21aSIlya Dryomov { 2833ed95b21aSIlya Dryomov mutex_lock(&rbd_dev->watch_mutex); 2834ed95b21aSIlya Dryomov sprintf(buf, "%s %llu", RBD_LOCK_COOKIE_PREFIX, rbd_dev->watch_cookie); 2835ed95b21aSIlya Dryomov mutex_unlock(&rbd_dev->watch_mutex); 2836ed95b21aSIlya Dryomov } 2837ed95b21aSIlya Dryomov 2838edd8ca80SFlorian Margaine static void __rbd_lock(struct rbd_device *rbd_dev, const char *cookie) 2839edd8ca80SFlorian Margaine { 2840edd8ca80SFlorian Margaine struct rbd_client_id cid = rbd_get_cid(rbd_dev); 2841edd8ca80SFlorian Margaine 2842edd8ca80SFlorian Margaine strcpy(rbd_dev->lock_cookie, cookie); 2843edd8ca80SFlorian Margaine rbd_set_owner_cid(rbd_dev, &cid); 2844edd8ca80SFlorian Margaine queue_work(rbd_dev->task_wq, &rbd_dev->acquired_lock_work); 2845edd8ca80SFlorian Margaine } 2846edd8ca80SFlorian Margaine 2847ed95b21aSIlya Dryomov /* 2848ed95b21aSIlya Dryomov * lock_rwsem must be held for write 2849ed95b21aSIlya Dryomov */ 2850ed95b21aSIlya Dryomov static int rbd_lock(struct rbd_device *rbd_dev) 2851ed95b21aSIlya Dryomov { 2852ed95b21aSIlya Dryomov struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 2853ed95b21aSIlya Dryomov char cookie[32]; 2854ed95b21aSIlya Dryomov int ret; 2855ed95b21aSIlya Dryomov 2856cbbfb0ffSIlya Dryomov WARN_ON(__rbd_is_lock_owner(rbd_dev) || 2857cbbfb0ffSIlya Dryomov rbd_dev->lock_cookie[0] != '\0'); 2858ed95b21aSIlya Dryomov 2859ed95b21aSIlya Dryomov format_lock_cookie(rbd_dev, cookie); 2860ed95b21aSIlya Dryomov ret = ceph_cls_lock(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc, 2861ed95b21aSIlya Dryomov RBD_LOCK_NAME, CEPH_CLS_LOCK_EXCLUSIVE, cookie, 2862ed95b21aSIlya Dryomov RBD_LOCK_TAG, "", 0); 2863ed95b21aSIlya Dryomov if (ret) 2864ed95b21aSIlya Dryomov return ret; 2865ed95b21aSIlya Dryomov 2866ed95b21aSIlya Dryomov rbd_dev->lock_state = RBD_LOCK_STATE_LOCKED; 2867edd8ca80SFlorian Margaine 
__rbd_lock(rbd_dev, cookie); 2868ed95b21aSIlya Dryomov return 0; 2869ed95b21aSIlya Dryomov } 2870ed95b21aSIlya Dryomov 2871ed95b21aSIlya Dryomov /* 2872ed95b21aSIlya Dryomov * lock_rwsem must be held for write 2873ed95b21aSIlya Dryomov */ 2874bbead745SIlya Dryomov static void rbd_unlock(struct rbd_device *rbd_dev) 2875ed95b21aSIlya Dryomov { 2876ed95b21aSIlya Dryomov struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 2877ed95b21aSIlya Dryomov int ret; 2878ed95b21aSIlya Dryomov 2879cbbfb0ffSIlya Dryomov WARN_ON(!__rbd_is_lock_owner(rbd_dev) || 2880cbbfb0ffSIlya Dryomov rbd_dev->lock_cookie[0] == '\0'); 2881ed95b21aSIlya Dryomov 2882ed95b21aSIlya Dryomov ret = ceph_cls_unlock(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc, 2883cbbfb0ffSIlya Dryomov RBD_LOCK_NAME, rbd_dev->lock_cookie); 2884bbead745SIlya Dryomov if (ret && ret != -ENOENT) 2885bbead745SIlya Dryomov rbd_warn(rbd_dev, "failed to unlock: %d", ret); 2886ed95b21aSIlya Dryomov 2887bbead745SIlya Dryomov /* treat errors as the image is unlocked */ 2888bbead745SIlya Dryomov rbd_dev->lock_state = RBD_LOCK_STATE_UNLOCKED; 2889cbbfb0ffSIlya Dryomov rbd_dev->lock_cookie[0] = '\0'; 2890ed95b21aSIlya Dryomov rbd_set_owner_cid(rbd_dev, &rbd_empty_cid); 2891ed95b21aSIlya Dryomov queue_work(rbd_dev->task_wq, &rbd_dev->released_lock_work); 2892ed95b21aSIlya Dryomov } 2893ed95b21aSIlya Dryomov 2894ed95b21aSIlya Dryomov static int __rbd_notify_op_lock(struct rbd_device *rbd_dev, 2895ed95b21aSIlya Dryomov enum rbd_notify_op notify_op, 2896ed95b21aSIlya Dryomov struct page ***preply_pages, 2897ed95b21aSIlya Dryomov size_t *preply_len) 2898ed95b21aSIlya Dryomov { 2899ed95b21aSIlya Dryomov struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 2900ed95b21aSIlya Dryomov struct rbd_client_id cid = rbd_get_cid(rbd_dev); 290108a79102SKyle Spiers char buf[4 + 8 + 8 + CEPH_ENCODING_START_BLK_LEN]; 290208a79102SKyle Spiers int buf_size = sizeof(buf); 2903ed95b21aSIlya Dryomov void *p = buf; 
2904ed95b21aSIlya Dryomov 2905ed95b21aSIlya Dryomov dout("%s rbd_dev %p notify_op %d\n", __func__, rbd_dev, notify_op); 2906ed95b21aSIlya Dryomov 2907ed95b21aSIlya Dryomov /* encode *LockPayload NotifyMessage (op + ClientId) */ 2908ed95b21aSIlya Dryomov ceph_start_encoding(&p, 2, 1, buf_size - CEPH_ENCODING_START_BLK_LEN); 2909ed95b21aSIlya Dryomov ceph_encode_32(&p, notify_op); 2910ed95b21aSIlya Dryomov ceph_encode_64(&p, cid.gid); 2911ed95b21aSIlya Dryomov ceph_encode_64(&p, cid.handle); 2912ed95b21aSIlya Dryomov 2913ed95b21aSIlya Dryomov return ceph_osdc_notify(osdc, &rbd_dev->header_oid, 2914ed95b21aSIlya Dryomov &rbd_dev->header_oloc, buf, buf_size, 2915ed95b21aSIlya Dryomov RBD_NOTIFY_TIMEOUT, preply_pages, preply_len); 2916ed95b21aSIlya Dryomov } 2917ed95b21aSIlya Dryomov 2918ed95b21aSIlya Dryomov static void rbd_notify_op_lock(struct rbd_device *rbd_dev, 2919ed95b21aSIlya Dryomov enum rbd_notify_op notify_op) 2920ed95b21aSIlya Dryomov { 2921ed95b21aSIlya Dryomov struct page **reply_pages; 2922ed95b21aSIlya Dryomov size_t reply_len; 2923ed95b21aSIlya Dryomov 2924ed95b21aSIlya Dryomov __rbd_notify_op_lock(rbd_dev, notify_op, &reply_pages, &reply_len); 2925ed95b21aSIlya Dryomov ceph_release_page_vector(reply_pages, calc_pages_for(0, reply_len)); 2926ed95b21aSIlya Dryomov } 2927ed95b21aSIlya Dryomov 2928ed95b21aSIlya Dryomov static void rbd_notify_acquired_lock(struct work_struct *work) 2929ed95b21aSIlya Dryomov { 2930ed95b21aSIlya Dryomov struct rbd_device *rbd_dev = container_of(work, struct rbd_device, 2931ed95b21aSIlya Dryomov acquired_lock_work); 2932ed95b21aSIlya Dryomov 2933ed95b21aSIlya Dryomov rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_ACQUIRED_LOCK); 2934ed95b21aSIlya Dryomov } 2935ed95b21aSIlya Dryomov 2936ed95b21aSIlya Dryomov static void rbd_notify_released_lock(struct work_struct *work) 2937ed95b21aSIlya Dryomov { 2938ed95b21aSIlya Dryomov struct rbd_device *rbd_dev = container_of(work, struct rbd_device, 2939ed95b21aSIlya Dryomov 
released_lock_work); 2940ed95b21aSIlya Dryomov 2941ed95b21aSIlya Dryomov rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_RELEASED_LOCK); 2942ed95b21aSIlya Dryomov } 2943ed95b21aSIlya Dryomov 2944ed95b21aSIlya Dryomov static int rbd_request_lock(struct rbd_device *rbd_dev) 2945ed95b21aSIlya Dryomov { 2946ed95b21aSIlya Dryomov struct page **reply_pages; 2947ed95b21aSIlya Dryomov size_t reply_len; 2948ed95b21aSIlya Dryomov bool lock_owner_responded = false; 2949ed95b21aSIlya Dryomov int ret; 2950ed95b21aSIlya Dryomov 2951ed95b21aSIlya Dryomov dout("%s rbd_dev %p\n", __func__, rbd_dev); 2952ed95b21aSIlya Dryomov 2953ed95b21aSIlya Dryomov ret = __rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_REQUEST_LOCK, 2954ed95b21aSIlya Dryomov &reply_pages, &reply_len); 2955ed95b21aSIlya Dryomov if (ret && ret != -ETIMEDOUT) { 2956ed95b21aSIlya Dryomov rbd_warn(rbd_dev, "failed to request lock: %d", ret); 2957ed95b21aSIlya Dryomov goto out; 2958ed95b21aSIlya Dryomov } 2959ed95b21aSIlya Dryomov 2960ed95b21aSIlya Dryomov if (reply_len > 0 && reply_len <= PAGE_SIZE) { 2961ed95b21aSIlya Dryomov void *p = page_address(reply_pages[0]); 2962ed95b21aSIlya Dryomov void *const end = p + reply_len; 2963ed95b21aSIlya Dryomov u32 n; 2964ed95b21aSIlya Dryomov 2965ed95b21aSIlya Dryomov ceph_decode_32_safe(&p, end, n, e_inval); /* num_acks */ 2966ed95b21aSIlya Dryomov while (n--) { 2967ed95b21aSIlya Dryomov u8 struct_v; 2968ed95b21aSIlya Dryomov u32 len; 2969ed95b21aSIlya Dryomov 2970ed95b21aSIlya Dryomov ceph_decode_need(&p, end, 8 + 8, e_inval); 2971ed95b21aSIlya Dryomov p += 8 + 8; /* skip gid and cookie */ 2972ed95b21aSIlya Dryomov 2973ed95b21aSIlya Dryomov ceph_decode_32_safe(&p, end, len, e_inval); 2974ed95b21aSIlya Dryomov if (!len) 2975ed95b21aSIlya Dryomov continue; 2976ed95b21aSIlya Dryomov 2977ed95b21aSIlya Dryomov if (lock_owner_responded) { 2978ed95b21aSIlya Dryomov rbd_warn(rbd_dev, 2979ed95b21aSIlya Dryomov "duplicate lock owners detected"); 2980ed95b21aSIlya Dryomov ret = -EIO; 
2981ed95b21aSIlya Dryomov goto out; 2982ed95b21aSIlya Dryomov } 2983ed95b21aSIlya Dryomov 2984ed95b21aSIlya Dryomov lock_owner_responded = true; 2985ed95b21aSIlya Dryomov ret = ceph_start_decoding(&p, end, 1, "ResponseMessage", 2986ed95b21aSIlya Dryomov &struct_v, &len); 2987ed95b21aSIlya Dryomov if (ret) { 2988ed95b21aSIlya Dryomov rbd_warn(rbd_dev, 2989ed95b21aSIlya Dryomov "failed to decode ResponseMessage: %d", 2990ed95b21aSIlya Dryomov ret); 2991ed95b21aSIlya Dryomov goto e_inval; 2992ed95b21aSIlya Dryomov } 2993ed95b21aSIlya Dryomov 2994ed95b21aSIlya Dryomov ret = ceph_decode_32(&p); 2995ed95b21aSIlya Dryomov } 2996ed95b21aSIlya Dryomov } 2997ed95b21aSIlya Dryomov 2998ed95b21aSIlya Dryomov if (!lock_owner_responded) { 2999ed95b21aSIlya Dryomov rbd_warn(rbd_dev, "no lock owners detected"); 3000ed95b21aSIlya Dryomov ret = -ETIMEDOUT; 3001ed95b21aSIlya Dryomov } 3002ed95b21aSIlya Dryomov 3003ed95b21aSIlya Dryomov out: 3004ed95b21aSIlya Dryomov ceph_release_page_vector(reply_pages, calc_pages_for(0, reply_len)); 3005ed95b21aSIlya Dryomov return ret; 3006ed95b21aSIlya Dryomov 3007ed95b21aSIlya Dryomov e_inval: 3008ed95b21aSIlya Dryomov ret = -EINVAL; 3009ed95b21aSIlya Dryomov goto out; 3010ed95b21aSIlya Dryomov } 3011ed95b21aSIlya Dryomov 3012ed95b21aSIlya Dryomov static void wake_requests(struct rbd_device *rbd_dev, bool wake_all) 3013ed95b21aSIlya Dryomov { 3014ed95b21aSIlya Dryomov dout("%s rbd_dev %p wake_all %d\n", __func__, rbd_dev, wake_all); 3015ed95b21aSIlya Dryomov 3016ed95b21aSIlya Dryomov cancel_delayed_work(&rbd_dev->lock_dwork); 3017ed95b21aSIlya Dryomov if (wake_all) 3018ed95b21aSIlya Dryomov wake_up_all(&rbd_dev->lock_waitq); 3019ed95b21aSIlya Dryomov else 3020ed95b21aSIlya Dryomov wake_up(&rbd_dev->lock_waitq); 3021ed95b21aSIlya Dryomov } 3022ed95b21aSIlya Dryomov 3023ed95b21aSIlya Dryomov static int get_lock_owner_info(struct rbd_device *rbd_dev, 3024ed95b21aSIlya Dryomov struct ceph_locker **lockers, u32 *num_lockers) 3025ed95b21aSIlya Dryomov { 
3026ed95b21aSIlya Dryomov struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 3027ed95b21aSIlya Dryomov u8 lock_type; 3028ed95b21aSIlya Dryomov char *lock_tag; 3029ed95b21aSIlya Dryomov int ret; 3030ed95b21aSIlya Dryomov 3031ed95b21aSIlya Dryomov dout("%s rbd_dev %p\n", __func__, rbd_dev); 3032ed95b21aSIlya Dryomov 3033ed95b21aSIlya Dryomov ret = ceph_cls_lock_info(osdc, &rbd_dev->header_oid, 3034ed95b21aSIlya Dryomov &rbd_dev->header_oloc, RBD_LOCK_NAME, 3035ed95b21aSIlya Dryomov &lock_type, &lock_tag, lockers, num_lockers); 3036ed95b21aSIlya Dryomov if (ret) 3037ed95b21aSIlya Dryomov return ret; 3038ed95b21aSIlya Dryomov 3039ed95b21aSIlya Dryomov if (*num_lockers == 0) { 3040ed95b21aSIlya Dryomov dout("%s rbd_dev %p no lockers detected\n", __func__, rbd_dev); 3041ed95b21aSIlya Dryomov goto out; 3042ed95b21aSIlya Dryomov } 3043ed95b21aSIlya Dryomov 3044ed95b21aSIlya Dryomov if (strcmp(lock_tag, RBD_LOCK_TAG)) { 3045ed95b21aSIlya Dryomov rbd_warn(rbd_dev, "locked by external mechanism, tag %s", 3046ed95b21aSIlya Dryomov lock_tag); 3047ed95b21aSIlya Dryomov ret = -EBUSY; 3048ed95b21aSIlya Dryomov goto out; 3049ed95b21aSIlya Dryomov } 3050ed95b21aSIlya Dryomov 3051ed95b21aSIlya Dryomov if (lock_type == CEPH_CLS_LOCK_SHARED) { 3052ed95b21aSIlya Dryomov rbd_warn(rbd_dev, "shared lock type detected"); 3053ed95b21aSIlya Dryomov ret = -EBUSY; 3054ed95b21aSIlya Dryomov goto out; 3055ed95b21aSIlya Dryomov } 3056ed95b21aSIlya Dryomov 3057ed95b21aSIlya Dryomov if (strncmp((*lockers)[0].id.cookie, RBD_LOCK_COOKIE_PREFIX, 3058ed95b21aSIlya Dryomov strlen(RBD_LOCK_COOKIE_PREFIX))) { 3059ed95b21aSIlya Dryomov rbd_warn(rbd_dev, "locked by external mechanism, cookie %s", 3060ed95b21aSIlya Dryomov (*lockers)[0].id.cookie); 3061ed95b21aSIlya Dryomov ret = -EBUSY; 3062ed95b21aSIlya Dryomov goto out; 3063ed95b21aSIlya Dryomov } 3064ed95b21aSIlya Dryomov 3065ed95b21aSIlya Dryomov out: 3066ed95b21aSIlya Dryomov kfree(lock_tag); 3067ed95b21aSIlya Dryomov return ret; 
3068ed95b21aSIlya Dryomov } 3069ed95b21aSIlya Dryomov 3070ed95b21aSIlya Dryomov static int find_watcher(struct rbd_device *rbd_dev, 3071ed95b21aSIlya Dryomov const struct ceph_locker *locker) 3072ed95b21aSIlya Dryomov { 3073ed95b21aSIlya Dryomov struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 3074ed95b21aSIlya Dryomov struct ceph_watch_item *watchers; 3075ed95b21aSIlya Dryomov u32 num_watchers; 3076ed95b21aSIlya Dryomov u64 cookie; 3077ed95b21aSIlya Dryomov int i; 3078ed95b21aSIlya Dryomov int ret; 3079ed95b21aSIlya Dryomov 3080ed95b21aSIlya Dryomov ret = ceph_osdc_list_watchers(osdc, &rbd_dev->header_oid, 3081ed95b21aSIlya Dryomov &rbd_dev->header_oloc, &watchers, 3082ed95b21aSIlya Dryomov &num_watchers); 3083ed95b21aSIlya Dryomov if (ret) 3084ed95b21aSIlya Dryomov return ret; 3085ed95b21aSIlya Dryomov 3086ed95b21aSIlya Dryomov sscanf(locker->id.cookie, RBD_LOCK_COOKIE_PREFIX " %llu", &cookie); 3087ed95b21aSIlya Dryomov for (i = 0; i < num_watchers; i++) { 3088ed95b21aSIlya Dryomov if (!memcmp(&watchers[i].addr, &locker->info.addr, 3089ed95b21aSIlya Dryomov sizeof(locker->info.addr)) && 3090ed95b21aSIlya Dryomov watchers[i].cookie == cookie) { 3091ed95b21aSIlya Dryomov struct rbd_client_id cid = { 3092ed95b21aSIlya Dryomov .gid = le64_to_cpu(watchers[i].name.num), 3093ed95b21aSIlya Dryomov .handle = cookie, 3094ed95b21aSIlya Dryomov }; 3095ed95b21aSIlya Dryomov 3096ed95b21aSIlya Dryomov dout("%s rbd_dev %p found cid %llu-%llu\n", __func__, 3097ed95b21aSIlya Dryomov rbd_dev, cid.gid, cid.handle); 3098ed95b21aSIlya Dryomov rbd_set_owner_cid(rbd_dev, &cid); 3099ed95b21aSIlya Dryomov ret = 1; 3100ed95b21aSIlya Dryomov goto out; 3101ed95b21aSIlya Dryomov } 3102ed95b21aSIlya Dryomov } 3103ed95b21aSIlya Dryomov 3104ed95b21aSIlya Dryomov dout("%s rbd_dev %p no watchers\n", __func__, rbd_dev); 3105ed95b21aSIlya Dryomov ret = 0; 3106ed95b21aSIlya Dryomov out: 3107ed95b21aSIlya Dryomov kfree(watchers); 3108ed95b21aSIlya Dryomov return ret; 
3109ed95b21aSIlya Dryomov } 3110ed95b21aSIlya Dryomov 3111ed95b21aSIlya Dryomov /* 3112ed95b21aSIlya Dryomov * lock_rwsem must be held for write 3113ed95b21aSIlya Dryomov */ 3114ed95b21aSIlya Dryomov static int rbd_try_lock(struct rbd_device *rbd_dev) 3115ed95b21aSIlya Dryomov { 3116ed95b21aSIlya Dryomov struct ceph_client *client = rbd_dev->rbd_client->client; 3117ed95b21aSIlya Dryomov struct ceph_locker *lockers; 3118ed95b21aSIlya Dryomov u32 num_lockers; 3119ed95b21aSIlya Dryomov int ret; 3120ed95b21aSIlya Dryomov 3121ed95b21aSIlya Dryomov for (;;) { 3122ed95b21aSIlya Dryomov ret = rbd_lock(rbd_dev); 3123ed95b21aSIlya Dryomov if (ret != -EBUSY) 3124ed95b21aSIlya Dryomov return ret; 3125ed95b21aSIlya Dryomov 3126ed95b21aSIlya Dryomov /* determine if the current lock holder is still alive */ 3127ed95b21aSIlya Dryomov ret = get_lock_owner_info(rbd_dev, &lockers, &num_lockers); 3128ed95b21aSIlya Dryomov if (ret) 3129ed95b21aSIlya Dryomov return ret; 3130ed95b21aSIlya Dryomov 3131ed95b21aSIlya Dryomov if (num_lockers == 0) 3132ed95b21aSIlya Dryomov goto again; 3133ed95b21aSIlya Dryomov 3134ed95b21aSIlya Dryomov ret = find_watcher(rbd_dev, lockers); 3135ed95b21aSIlya Dryomov if (ret) { 3136ed95b21aSIlya Dryomov if (ret > 0) 3137ed95b21aSIlya Dryomov ret = 0; /* have to request lock */ 3138ed95b21aSIlya Dryomov goto out; 3139ed95b21aSIlya Dryomov } 3140ed95b21aSIlya Dryomov 3141ed95b21aSIlya Dryomov rbd_warn(rbd_dev, "%s%llu seems dead, breaking lock", 3142ed95b21aSIlya Dryomov ENTITY_NAME(lockers[0].id.name)); 3143ed95b21aSIlya Dryomov 3144ed95b21aSIlya Dryomov ret = ceph_monc_blacklist_add(&client->monc, 3145ed95b21aSIlya Dryomov &lockers[0].info.addr); 3146ed95b21aSIlya Dryomov if (ret) { 3147ed95b21aSIlya Dryomov rbd_warn(rbd_dev, "blacklist of %s%llu failed: %d", 3148ed95b21aSIlya Dryomov ENTITY_NAME(lockers[0].id.name), ret); 3149ed95b21aSIlya Dryomov goto out; 3150ed95b21aSIlya Dryomov } 3151ed95b21aSIlya Dryomov 3152ed95b21aSIlya Dryomov ret = 
ceph_cls_break_lock(&client->osdc, &rbd_dev->header_oid, 3153ed95b21aSIlya Dryomov &rbd_dev->header_oloc, RBD_LOCK_NAME, 3154ed95b21aSIlya Dryomov lockers[0].id.cookie, 3155ed95b21aSIlya Dryomov &lockers[0].id.name); 3156ed95b21aSIlya Dryomov if (ret && ret != -ENOENT) 3157ed95b21aSIlya Dryomov goto out; 3158ed95b21aSIlya Dryomov 3159ed95b21aSIlya Dryomov again: 3160ed95b21aSIlya Dryomov ceph_free_lockers(lockers, num_lockers); 3161ed95b21aSIlya Dryomov } 3162ed95b21aSIlya Dryomov 3163ed95b21aSIlya Dryomov out: 3164ed95b21aSIlya Dryomov ceph_free_lockers(lockers, num_lockers); 3165ed95b21aSIlya Dryomov return ret; 3166ed95b21aSIlya Dryomov } 3167ed95b21aSIlya Dryomov 3168ed95b21aSIlya Dryomov /* 3169ed95b21aSIlya Dryomov * ret is set only if lock_state is RBD_LOCK_STATE_UNLOCKED 3170ed95b21aSIlya Dryomov */ 3171ed95b21aSIlya Dryomov static enum rbd_lock_state rbd_try_acquire_lock(struct rbd_device *rbd_dev, 3172ed95b21aSIlya Dryomov int *pret) 3173ed95b21aSIlya Dryomov { 3174ed95b21aSIlya Dryomov enum rbd_lock_state lock_state; 3175ed95b21aSIlya Dryomov 3176ed95b21aSIlya Dryomov down_read(&rbd_dev->lock_rwsem); 3177ed95b21aSIlya Dryomov dout("%s rbd_dev %p read lock_state %d\n", __func__, rbd_dev, 3178ed95b21aSIlya Dryomov rbd_dev->lock_state); 3179ed95b21aSIlya Dryomov if (__rbd_is_lock_owner(rbd_dev)) { 3180ed95b21aSIlya Dryomov lock_state = rbd_dev->lock_state; 3181ed95b21aSIlya Dryomov up_read(&rbd_dev->lock_rwsem); 3182ed95b21aSIlya Dryomov return lock_state; 3183ed95b21aSIlya Dryomov } 3184ed95b21aSIlya Dryomov 3185ed95b21aSIlya Dryomov up_read(&rbd_dev->lock_rwsem); 3186ed95b21aSIlya Dryomov down_write(&rbd_dev->lock_rwsem); 3187ed95b21aSIlya Dryomov dout("%s rbd_dev %p write lock_state %d\n", __func__, rbd_dev, 3188ed95b21aSIlya Dryomov rbd_dev->lock_state); 3189ed95b21aSIlya Dryomov if (!__rbd_is_lock_owner(rbd_dev)) { 3190ed95b21aSIlya Dryomov *pret = rbd_try_lock(rbd_dev); 3191ed95b21aSIlya Dryomov if (*pret) 3192ed95b21aSIlya Dryomov rbd_warn(rbd_dev, 
"failed to acquire lock: %d", *pret); 3193ed95b21aSIlya Dryomov } 3194ed95b21aSIlya Dryomov 3195ed95b21aSIlya Dryomov lock_state = rbd_dev->lock_state; 3196ed95b21aSIlya Dryomov up_write(&rbd_dev->lock_rwsem); 3197ed95b21aSIlya Dryomov return lock_state; 3198ed95b21aSIlya Dryomov } 3199ed95b21aSIlya Dryomov 3200ed95b21aSIlya Dryomov static void rbd_acquire_lock(struct work_struct *work) 3201ed95b21aSIlya Dryomov { 3202ed95b21aSIlya Dryomov struct rbd_device *rbd_dev = container_of(to_delayed_work(work), 3203ed95b21aSIlya Dryomov struct rbd_device, lock_dwork); 3204ed95b21aSIlya Dryomov enum rbd_lock_state lock_state; 320537f13252SKefeng Wang int ret = 0; 3206ed95b21aSIlya Dryomov 3207ed95b21aSIlya Dryomov dout("%s rbd_dev %p\n", __func__, rbd_dev); 3208ed95b21aSIlya Dryomov again: 3209ed95b21aSIlya Dryomov lock_state = rbd_try_acquire_lock(rbd_dev, &ret); 3210ed95b21aSIlya Dryomov if (lock_state != RBD_LOCK_STATE_UNLOCKED || ret == -EBLACKLISTED) { 3211ed95b21aSIlya Dryomov if (lock_state == RBD_LOCK_STATE_LOCKED) 3212ed95b21aSIlya Dryomov wake_requests(rbd_dev, true); 3213ed95b21aSIlya Dryomov dout("%s rbd_dev %p lock_state %d ret %d - done\n", __func__, 3214ed95b21aSIlya Dryomov rbd_dev, lock_state, ret); 3215ed95b21aSIlya Dryomov return; 3216ed95b21aSIlya Dryomov } 3217ed95b21aSIlya Dryomov 3218ed95b21aSIlya Dryomov ret = rbd_request_lock(rbd_dev); 3219ed95b21aSIlya Dryomov if (ret == -ETIMEDOUT) { 3220ed95b21aSIlya Dryomov goto again; /* treat this as a dead client */ 3221e010dd0aSIlya Dryomov } else if (ret == -EROFS) { 3222e010dd0aSIlya Dryomov rbd_warn(rbd_dev, "peer will not release lock"); 3223e010dd0aSIlya Dryomov /* 3224e010dd0aSIlya Dryomov * If this is rbd_add_acquire_lock(), we want to fail 3225e010dd0aSIlya Dryomov * immediately -- reuse BLACKLISTED flag. Otherwise we 3226e010dd0aSIlya Dryomov * want to block. 
3227e010dd0aSIlya Dryomov */ 3228e010dd0aSIlya Dryomov if (!(rbd_dev->disk->flags & GENHD_FL_UP)) { 3229e010dd0aSIlya Dryomov set_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags); 3230e010dd0aSIlya Dryomov /* wake "rbd map --exclusive" process */ 3231e010dd0aSIlya Dryomov wake_requests(rbd_dev, false); 3232e010dd0aSIlya Dryomov } 3233ed95b21aSIlya Dryomov } else if (ret < 0) { 3234ed95b21aSIlya Dryomov rbd_warn(rbd_dev, "error requesting lock: %d", ret); 3235ed95b21aSIlya Dryomov mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 3236ed95b21aSIlya Dryomov RBD_RETRY_DELAY); 3237ed95b21aSIlya Dryomov } else { 3238ed95b21aSIlya Dryomov /* 3239ed95b21aSIlya Dryomov * lock owner acked, but resend if we don't see them 3240ed95b21aSIlya Dryomov * release the lock 3241ed95b21aSIlya Dryomov */ 3242ed95b21aSIlya Dryomov dout("%s rbd_dev %p requeueing lock_dwork\n", __func__, 3243ed95b21aSIlya Dryomov rbd_dev); 3244ed95b21aSIlya Dryomov mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 3245ed95b21aSIlya Dryomov msecs_to_jiffies(2 * RBD_NOTIFY_TIMEOUT * MSEC_PER_SEC)); 3246ed95b21aSIlya Dryomov } 3247ed95b21aSIlya Dryomov } 3248ed95b21aSIlya Dryomov 3249ed95b21aSIlya Dryomov /* 3250ed95b21aSIlya Dryomov * lock_rwsem must be held for write 3251ed95b21aSIlya Dryomov */ 3252ed95b21aSIlya Dryomov static bool rbd_release_lock(struct rbd_device *rbd_dev) 3253ed95b21aSIlya Dryomov { 3254ed95b21aSIlya Dryomov dout("%s rbd_dev %p read lock_state %d\n", __func__, rbd_dev, 3255ed95b21aSIlya Dryomov rbd_dev->lock_state); 3256ed95b21aSIlya Dryomov if (rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED) 3257ed95b21aSIlya Dryomov return false; 3258ed95b21aSIlya Dryomov 3259ed95b21aSIlya Dryomov rbd_dev->lock_state = RBD_LOCK_STATE_RELEASING; 3260ed95b21aSIlya Dryomov downgrade_write(&rbd_dev->lock_rwsem); 3261ed95b21aSIlya Dryomov /* 3262ed95b21aSIlya Dryomov * Ensure that all in-flight IO is flushed. 
3263ed95b21aSIlya Dryomov * 3264ed95b21aSIlya Dryomov * FIXME: ceph_osdc_sync() flushes the entire OSD client, which 3265ed95b21aSIlya Dryomov * may be shared with other devices. 3266ed95b21aSIlya Dryomov */ 3267ed95b21aSIlya Dryomov ceph_osdc_sync(&rbd_dev->rbd_client->client->osdc); 3268ed95b21aSIlya Dryomov up_read(&rbd_dev->lock_rwsem); 3269ed95b21aSIlya Dryomov 3270ed95b21aSIlya Dryomov down_write(&rbd_dev->lock_rwsem); 3271ed95b21aSIlya Dryomov dout("%s rbd_dev %p write lock_state %d\n", __func__, rbd_dev, 3272ed95b21aSIlya Dryomov rbd_dev->lock_state); 3273ed95b21aSIlya Dryomov if (rbd_dev->lock_state != RBD_LOCK_STATE_RELEASING) 3274ed95b21aSIlya Dryomov return false; 3275ed95b21aSIlya Dryomov 3276bbead745SIlya Dryomov rbd_unlock(rbd_dev); 3277ed95b21aSIlya Dryomov /* 3278ed95b21aSIlya Dryomov * Give others a chance to grab the lock - we would re-acquire 3279ed95b21aSIlya Dryomov * almost immediately if we got new IO during ceph_osdc_sync() 3280ed95b21aSIlya Dryomov * otherwise. We need to ack our own notifications, so this 3281ed95b21aSIlya Dryomov * lock_dwork will be requeued from rbd_wait_state_locked() 3282ed95b21aSIlya Dryomov * after wake_requests() in rbd_handle_released_lock(). 
	 */
	cancel_delayed_work(&rbd_dev->lock_dwork);
	return true;
}

/*
 * Work handler for rbd_dev->unlock_work.
 *
 * Takes lock_rwsem for write around a call to rbd_release_lock().  The
 * boolean returned by rbd_release_lock() is not looked at here.
 */
static void rbd_release_lock_work(struct work_struct *work)
{
	struct rbd_device *rbd_dev = container_of(work, struct rbd_device,
						  unlock_work);

	down_write(&rbd_dev->lock_rwsem);
	rbd_release_lock(rbd_dev);
	up_write(&rbd_dev->lock_rwsem);
}

/*
 * Handle an RBD_NOTIFY_OP_ACQUIRED_LOCK notification.
 *
 * struct_v is the NotifyMessage encoding version; the sender's client
 * id (gid, handle) is decoded from *p only when struct_v >= 2, otherwise
 * cid stays all-zero.  *p is advanced past whatever is decoded.  No
 * bounds check is made on *p here.
 * NOTE(review): presumably the caller guarantees enough payload - confirm.
 *
 * A cid different from rbd_empty_cid is recorded as the owner, unless it
 * already matches owner_cid, in which case nothing else is done.
 * Afterwards, if this client is not the lock owner, wake_requests() is
 * called with lock_rwsem held for read.
 */
static void rbd_handle_acquired_lock(struct rbd_device *rbd_dev, u8 struct_v,
				     void **p)
{
	struct rbd_client_id cid = { 0 };

	if (struct_v >= 2) {
		cid.gid = ceph_decode_64(p);
		cid.handle = ceph_decode_64(p);
	}

	dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid,
	     cid.handle);
	if (!rbd_cid_equal(&cid, &rbd_empty_cid)) {
		/* owner_cid is compared and updated under the write lock */
		down_write(&rbd_dev->lock_rwsem);
		if (rbd_cid_equal(&cid, &rbd_dev->owner_cid)) {
			/*
			 * we already know that the remote client is
			 * the owner
			 */
			up_write(&rbd_dev->lock_rwsem);
			return;
		}

		rbd_set_owner_cid(rbd_dev, &cid);
		/* keep the rwsem, but only for read, for the check below */
		downgrade_write(&rbd_dev->lock_rwsem);
	} else {
		down_read(&rbd_dev->lock_rwsem);
	}

	if (!__rbd_is_lock_owner(rbd_dev))
		wake_requests(rbd_dev, false);
	up_read(&rbd_dev->lock_rwsem);
}

/*
 * Handle an RBD_NOTIFY_OP_RELEASED_LOCK notification.
 *
 * The sender's client id is decoded exactly as in
 * rbd_handle_acquired_lock() (only when struct_v >= 2, no bounds check).
 *
 * A non-empty cid must match the currently recorded owner_cid; if it
 * does not, the mismatch is logged via dout() and the notification is
 * otherwise ignored.  On a match the recorded owner is reset to
 * rbd_empty_cid.  Afterwards, if this client is not the lock owner,
 * wake_requests() is called with lock_rwsem held for read.
 */
static void rbd_handle_released_lock(struct rbd_device *rbd_dev, u8 struct_v,
				     void **p)
{
	struct rbd_client_id cid = { 0 };

	if (struct_v >= 2) {
		cid.gid = ceph_decode_64(p);
		cid.handle = ceph_decode_64(p);
	}

	dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid,
	     cid.handle);
	if (!rbd_cid_equal(&cid, &rbd_empty_cid)) {
		down_write(&rbd_dev->lock_rwsem);
		if (!rbd_cid_equal(&cid, &rbd_dev->owner_cid)) {
			dout("%s rbd_dev %p unexpected owner, cid %llu-%llu != owner_cid %llu-%llu\n",
			     __func__, rbd_dev, cid.gid, cid.handle,
			     rbd_dev->owner_cid.gid, rbd_dev->owner_cid.handle);
			up_write(&rbd_dev->lock_rwsem);
			return;
		}

		rbd_set_owner_cid(rbd_dev, &rbd_empty_cid);
		/* keep the rwsem, but only for read, for the check below */
		downgrade_write(&rbd_dev->lock_rwsem);
	} else {
		down_read(&rbd_dev->lock_rwsem);
	}

	if (!__rbd_is_lock_owner(rbd_dev))
		wake_requests(rbd_dev, false);
	up_read(&rbd_dev->lock_rwsem);
}

/*
 * Returns result for ResponseMessage to be encoded (<= 0), or 1 if no
 * ResponseMessage is needed.
 *
 * Handles an RBD_NOTIFY_OP_REQUEST_LOCK notification.  The requester's
 * client id is decoded from *p when struct_v >= 2 (no bounds check).
 * A request carrying our own client id returns 1 straight away, without
 * taking lock_rwsem.
 *
 * When we are the lock owner:
 *   - lock_state is LOCKED but owner_cid is still empty: return 1;
 *   - otherwise the result is 0, and if lock_state is LOCKED then either
 *     unlock_work is queued on task_wq (opts->exclusive not set) or the
 *     result becomes -EROFS (opts->exclusive set).
 * When we are not the lock owner the result stays 1.
 */
static int rbd_handle_request_lock(struct rbd_device *rbd_dev, u8 struct_v,
				   void **p)
{
	struct rbd_client_id my_cid = rbd_get_cid(rbd_dev);
	struct rbd_client_id cid = { 0 };
	int result = 1;

	if (struct_v >= 2) {
		cid.gid = ceph_decode_64(p);
		cid.handle = ceph_decode_64(p);
	}

	dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid,
	     cid.handle);
	/* our own request: nothing to respond with */
	if (rbd_cid_equal(&cid, &my_cid))
		return result;

	down_read(&rbd_dev->lock_rwsem);
	if (__rbd_is_lock_owner(rbd_dev)) {
		if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED &&
		    rbd_cid_equal(&rbd_dev->owner_cid, &rbd_empty_cid))
			goto out_unlock;

		/*
		 * encode ResponseMessage(0) so the peer can detect
		 * a missing owner
		 */
		result = 0;

		if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED) {
			if (!rbd_dev->opts->exclusive) {
				dout("%s rbd_dev %p queueing unlock_work\n",
				     __func__, rbd_dev);
				queue_work(rbd_dev->task_wq,
					   &rbd_dev->unlock_work);
			} else {
				/* refuse to release the lock */
				result = -EROFS;
			}
		}
	}

out_unlock:
	up_read(&rbd_dev->lock_rwsem);
	return result;
}

/*
 * Acknowledge a notification on the header object.
 *
 * When result is non-NULL a ResponseMessage carrying *result is encoded
 * into an on-stack buffer (an encoding start block followed by a 32-bit
 * value) and sent with the ack; when result is NULL the ack is sent
 * with a zero-length payload.
 *
 * A failure of ceph_osdc_notify_ack() is only reported via rbd_warn();
 * nothing is returned to the caller.
 */
static void __rbd_acknowledge_notify(struct rbd_device *rbd_dev,
				     u64 notify_id, u64 cookie, s32 *result)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	char buf[4 + CEPH_ENCODING_START_BLK_LEN];
	int buf_size = sizeof(buf);
	int ret;

	if (result) {
		void *p = buf;

		/* encode ResponseMessage */
		ceph_start_encoding(&p, 1, 1,
				    buf_size - CEPH_ENCODING_START_BLK_LEN);
		ceph_encode_32(&p, *result);
	} else {
		/* buf is still passed below, but with length 0 */
		buf_size = 0;
	}

	ret = ceph_osdc_notify_ack(osdc, &rbd_dev->header_oid,
				   &rbd_dev->header_oloc, notify_id, cookie,
				   buf, buf_size);
	if (ret)
		rbd_warn(rbd_dev, "acknowledge_notify failed: %d", ret);
}

/*
 * Acknowledge a notification without a ResponseMessage payload.
 */
static void rbd_acknowledge_notify(struct rbd_device *rbd_dev, u64 notify_id,
				   u64 cookie)
{
	dout("%s rbd_dev %p\n", __func__, rbd_dev);
	__rbd_acknowledge_notify(rbd_dev, notify_id, cookie, NULL);
}

/*
 * Acknowledge a notification with a ResponseMessage carrying result.
 */
static void rbd_acknowledge_notify_result(struct rbd_device *rbd_dev,
					  u64 notify_id, u64 cookie, s32 result)
{
	dout("%s rbd_dev %p result %d\n", __func__, rbd_dev, result);
	__rbd_acknowledge_notify(rbd_dev, notify_id, cookie, &result);
}

/*
 * Watch callback, registered through ceph_osdc_watch() in
 * __rbd_register_watch().
 *
 * arg is the rbd_device.  data/data_len is the notification payload; an
 * empty payload is treated as a legacy header-update notification.
 * notifier_id is not used.
 *
 * The payload is decoded as a NotifyMessage and dispatched on its
 * notify_op.  Every dispatched operation ends with an acknowledgement;
 * the only path that returns without acknowledging is a
 * ceph_start_decoding() failure.
 */
static void rbd_watch_cb(void *arg, u64 notify_id, u64 cookie,
			 u64 notifier_id, void *data, size_t data_len)
{
	struct rbd_device *rbd_dev = arg;
	void *p = data;
	void *const end = p + data_len;
	u8 struct_v = 0;
	u32 len;	/* set below, but not consulted in this function */
	u32 notify_op;
	int ret;

	dout("%s rbd_dev %p cookie %llu notify_id %llu data_len %zu\n",
	     __func__, rbd_dev, cookie, notify_id, data_len);
	if (data_len) {
		ret = ceph_start_decoding(&p, end, 1, "NotifyMessage",
					  &struct_v, &len);
		if (ret) {
			rbd_warn(rbd_dev, "failed to decode NotifyMessage: %d",
				 ret);
			return;
		}

		notify_op = ceph_decode_32(&p);
	} else {
		/* legacy notification for header updates */
		notify_op = RBD_NOTIFY_OP_HEADER_UPDATE;
		len = 0;
	}

	dout("%s rbd_dev %p notify_op %u\n", __func__, rbd_dev, notify_op);
	switch (notify_op) {
	case RBD_NOTIFY_OP_ACQUIRED_LOCK:
		rbd_handle_acquired_lock(rbd_dev, struct_v, &p);
		rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
		break;
	case RBD_NOTIFY_OP_RELEASED_LOCK:
		rbd_handle_released_lock(rbd_dev, struct_v, &p);
		rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
		break;
	case RBD_NOTIFY_OP_REQUEST_LOCK:
		/* <= 0: send it back in a ResponseMessage; 1: plain ack */
		ret = rbd_handle_request_lock(rbd_dev, struct_v, &p);
		if (ret <= 0)
			rbd_acknowledge_notify_result(rbd_dev, notify_id,
						      cookie, ret);
		else
			rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
		break;
	case RBD_NOTIFY_OP_HEADER_UPDATE:
		/* a failed refresh is logged, the notify is acked anyway */
		ret = rbd_dev_refresh(rbd_dev);
		if (ret)
			rbd_warn(rbd_dev, "refresh failed: %d", ret);

		rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
		break;
	default:
		/* unknown op: only the lock owner answers -EOPNOTSUPP */
		if (rbd_is_lock_owner(rbd_dev))
			rbd_acknowledge_notify_result(rbd_dev, notify_id,
						      cookie, -EOPNOTSUPP);
		else
			rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
		break;
	}
}

/* defined further down; needed by rbd_watch_errcb() */
static void __rbd_unregister_watch(struct rbd_device *rbd_dev);

/*
 * Watch error callback, registered through ceph_osdc_watch() in
 * __rbd_register_watch().
 *
 * arg is the rbd_device; cookie is not used.  The error is logged and
 * the recorded lock owner is reset to rbd_empty_cid.  If the watch is
 * currently REGISTERED it is torn down, the state becomes ERROR and
 * watch_dwork is queued with no delay.
 */
static void rbd_watch_errcb(void *arg, u64 cookie, int err)
{
	struct rbd_device *rbd_dev = arg;

	rbd_warn(rbd_dev, "encountered watch error: %d", err);

	down_write(&rbd_dev->lock_rwsem);
	rbd_set_owner_cid(rbd_dev, &rbd_empty_cid);
	up_write(&rbd_dev->lock_rwsem);

	mutex_lock(&rbd_dev->watch_mutex);
	if (rbd_dev->watch_state == RBD_WATCH_STATE_REGISTERED) {
		__rbd_unregister_watch(rbd_dev);
		rbd_dev->watch_state = RBD_WATCH_STATE_ERROR;

		queue_delayed_work(rbd_dev->task_wq, &rbd_dev->watch_dwork, 0);
	}
	mutex_unlock(&rbd_dev->watch_mutex);
}

/*
 * watch_mutex must be locked
 *
 * Establishes a watch on the header object with rbd_watch_cb() and
 * rbd_watch_errcb() as callbacks and stores the handle in
 * rbd_dev->watch_handle.  Asserts that no handle is currently set.
 * Returns 0 on success or the error from ceph_osdc_watch(); watch_state
 * is not touched here.
 */
static int __rbd_register_watch(struct rbd_device *rbd_dev)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	struct ceph_osd_linger_request *handle;

	rbd_assert(!rbd_dev->watch_handle);
	dout("%s rbd_dev %p\n", __func__, rbd_dev);

	handle = ceph_osdc_watch(osdc, &rbd_dev->header_oid,
				 &rbd_dev->header_oloc, rbd_watch_cb,
				 rbd_watch_errcb, rbd_dev);
	if (IS_ERR(handle))
		return PTR_ERR(handle);

	rbd_dev->watch_handle = handle;
	return 0;
}

/*
 * watch_mutex must be locked
 *
 * Tears down the watch behind rbd_dev->watch_handle (asserted to be
 * set).  A ceph_osdc_unwatch() failure is only logged; the handle is
 * cleared either way.  watch_state is not touched here.
 */
static void __rbd_unregister_watch(struct rbd_device *rbd_dev)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	int ret;

	rbd_assert(rbd_dev->watch_handle);
	dout("%s rbd_dev %p\n", __func__, rbd_dev);

	ret = ceph_osdc_unwatch(osdc, rbd_dev->watch_handle);
	if (ret)
		rbd_warn(rbd_dev, "failed to unwatch: %d", ret);

	rbd_dev->watch_handle = NULL;
}

/*
 * Register the header watch for the first time.
 *
 * Asserts that watch_state is UNREGISTERED.  On success watch_state
 * becomes REGISTERED and watch_cookie is set to the new handle's
 * linger_id.  Returns 0 or the error from __rbd_register_watch(), in
 * which case the state is left unchanged.
 */
static int rbd_register_watch(struct rbd_device *rbd_dev)
{
	int ret;

	mutex_lock(&rbd_dev->watch_mutex);
	rbd_assert(rbd_dev->watch_state == RBD_WATCH_STATE_UNREGISTERED);
	ret = __rbd_register_watch(rbd_dev);
	if (ret)
		goto out;

	rbd_dev->watch_state = RBD_WATCH_STATE_REGISTERED;
	rbd_dev->watch_cookie = rbd_dev->watch_handle->linger_id;

out:
	mutex_unlock(&rbd_dev->watch_mutex);
	return ret;
}

/*
 * Synchronously cancel the lock-related work items: acquired_lock_work,
 * released_lock_work, lock_dwork and unlock_work.  watch_dwork is not
 * cancelled here.
 */
static void cancel_tasks_sync(struct rbd_device *rbd_dev)
{
	dout("%s rbd_dev %p\n", __func__, rbd_dev);

	cancel_work_sync(&rbd_dev->acquired_lock_work);
	cancel_work_sync(&rbd_dev->released_lock_work);
	cancel_delayed_work_sync(&rbd_dev->lock_dwork);
	cancel_work_sync(&rbd_dev->unlock_work);
}

/*
 * Tear down the header watch.
 *
 * Warns if anything is still waiting on lock_waitq, cancels the
 * lock-related work, unregisters the watch if it is REGISTERED and sets
 * watch_state to UNREGISTERED regardless of the previous state.
 * watch_dwork is cancelled only after watch_mutex has been dropped, and
 * ceph_osdc_flush_notifies() is called last.
 */
static void rbd_unregister_watch(struct rbd_device *rbd_dev)
{
	WARN_ON(waitqueue_active(&rbd_dev->lock_waitq));
	cancel_tasks_sync(rbd_dev);

	mutex_lock(&rbd_dev->watch_mutex);
	if (rbd_dev->watch_state == RBD_WATCH_STATE_REGISTERED)
		__rbd_unregister_watch(rbd_dev);
	rbd_dev->watch_state = RBD_WATCH_STATE_UNREGISTERED;
	mutex_unlock(&rbd_dev->watch_mutex);

	/* rbd_reregister_watch() takes watch_mutex itself */
	cancel_delayed_work_sync(&rbd_dev->watch_dwork);
	ceph_osdc_flush_notifies(&rbd_dev->rbd_client->client->osdc);
}

/*
 * lock_rwsem must be held for write
 *
 * Called from rbd_reregister_watch() while lock_state is LOCKED.  A new
 * cookie is formatted and ceph_cls_set_cookie() is asked to replace
 * rbd_dev->lock_cookie with it.  On success __rbd_lock() is called with
 * the new cookie.  On failure (logged unless it is -EOPNOTSUPP) the
 * lock is released and, if rbd_release_lock() returns true, lock_dwork
 * is queued with no delay.
 */
static void rbd_reacquire_lock(struct rbd_device *rbd_dev)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	char cookie[32];
	int ret;

	WARN_ON(rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED);

	format_lock_cookie(rbd_dev, cookie);
	ret = ceph_cls_set_cookie(osdc, &rbd_dev->header_oid,
				  &rbd_dev->header_oloc, RBD_LOCK_NAME,
				  CEPH_CLS_LOCK_EXCLUSIVE, rbd_dev->lock_cookie,
				  RBD_LOCK_TAG, cookie);
	if (ret) {
		if (ret != -EOPNOTSUPP)
			rbd_warn(rbd_dev, "failed to update lock cookie: %d",
				 ret);

		/*
		 * Lock cookie cannot be updated on older OSDs, so do
		 * a manual release and queue an acquire.
		 */
		if (rbd_release_lock(rbd_dev))
			queue_delayed_work(rbd_dev->task_wq,
					   &rbd_dev->lock_dwork, 0);
	} else {
		__rbd_lock(rbd_dev, cookie);
	}
}

/*
 * Work handler for rbd_dev->watch_dwork: re-establish the header watch
 * after rbd_watch_errcb() has put it into the ERROR state.
 *
 * Does nothing unless watch_state is ERROR.  If registering fails with
 * -EBLACKLISTED or -ENOENT, RBD_DEV_FLAG_BLACKLISTED is set and
 * wake_requests(rbd_dev, true) is called; any other failure requeues
 * this work after RBD_RETRY_DELAY.  On success the watch state and
 * cookie are updated, the exclusive lock (if held) is re-acquired under
 * lock_rwsem, and the image is refreshed; a refresh failure is only
 * logged.
 */
static void rbd_reregister_watch(struct work_struct *work)
{
	struct rbd_device *rbd_dev = container_of(to_delayed_work(work),
					    struct rbd_device, watch_dwork);
	int ret;

	dout("%s rbd_dev %p\n", __func__, rbd_dev);

	mutex_lock(&rbd_dev->watch_mutex);
	if (rbd_dev->watch_state != RBD_WATCH_STATE_ERROR) {
		mutex_unlock(&rbd_dev->watch_mutex);
		return;
	}

	ret = __rbd_register_watch(rbd_dev);
	if (ret) {
		rbd_warn(rbd_dev, "failed to reregister watch: %d", ret);
		if (ret == -EBLACKLISTED || ret == -ENOENT) {
			/* no retry is queued on this path */
			set_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags);
			wake_requests(rbd_dev, true);
		} else {
			queue_delayed_work(rbd_dev->task_wq,
					   &rbd_dev->watch_dwork,
					   RBD_RETRY_DELAY);
		}
		mutex_unlock(&rbd_dev->watch_mutex);
		return;
	}

	rbd_dev->watch_state = RBD_WATCH_STATE_REGISTERED;
	rbd_dev->watch_cookie = rbd_dev->watch_handle->linger_id;
	mutex_unlock(&rbd_dev->watch_mutex);

	/* watch_mutex is dropped before lock_rwsem is taken */
	down_write(&rbd_dev->lock_rwsem);
	if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED)
		rbd_reacquire_lock(rbd_dev);
	up_write(&rbd_dev->lock_rwsem);

	ret = rbd_dev_refresh(rbd_dev);
	if (ret)
		rbd_warn(rbd_dev, "reregistration refresh failed: %d", ret);
}

/*
 * Synchronous osd object method call.  Returns the number of bytes
 * returned in the inbound buffer, or a negative error code.
370336be9a76SAlex Elder */ 370436be9a76SAlex Elder static int rbd_obj_method_sync(struct rbd_device *rbd_dev, 3705ecd4a68aSIlya Dryomov struct ceph_object_id *oid, 3706ecd4a68aSIlya Dryomov struct ceph_object_locator *oloc, 370736be9a76SAlex Elder const char *method_name, 37084157976bSAlex Elder const void *outbound, 370936be9a76SAlex Elder size_t outbound_size, 37104157976bSAlex Elder void *inbound, 3711e2a58ee5SAlex Elder size_t inbound_size) 371236be9a76SAlex Elder { 3713ecd4a68aSIlya Dryomov struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 3714ecd4a68aSIlya Dryomov struct page *req_page = NULL; 3715ecd4a68aSIlya Dryomov struct page *reply_page; 371636be9a76SAlex Elder int ret; 371736be9a76SAlex Elder 371836be9a76SAlex Elder /* 37196010a451SAlex Elder * Method calls are ultimately read operations. The result 37206010a451SAlex Elder * should placed into the inbound buffer provided. They 37216010a451SAlex Elder * also supply outbound data--parameters for the object 37226010a451SAlex Elder * method. Currently if this is present it will be a 37236010a451SAlex Elder * snapshot id. 
372436be9a76SAlex Elder */ 3725ecd4a68aSIlya Dryomov if (outbound) { 3726ecd4a68aSIlya Dryomov if (outbound_size > PAGE_SIZE) 3727ecd4a68aSIlya Dryomov return -E2BIG; 372836be9a76SAlex Elder 3729ecd4a68aSIlya Dryomov req_page = alloc_page(GFP_KERNEL); 3730ecd4a68aSIlya Dryomov if (!req_page) 3731ecd4a68aSIlya Dryomov return -ENOMEM; 373236be9a76SAlex Elder 3733ecd4a68aSIlya Dryomov memcpy(page_address(req_page), outbound, outbound_size); 373404017e29SAlex Elder } 3735430c28c3SAlex Elder 3736ecd4a68aSIlya Dryomov reply_page = alloc_page(GFP_KERNEL); 3737ecd4a68aSIlya Dryomov if (!reply_page) { 3738ecd4a68aSIlya Dryomov if (req_page) 3739ecd4a68aSIlya Dryomov __free_page(req_page); 3740ecd4a68aSIlya Dryomov return -ENOMEM; 3741ecd4a68aSIlya Dryomov } 374236be9a76SAlex Elder 3743ecd4a68aSIlya Dryomov ret = ceph_osdc_call(osdc, oid, oloc, RBD_DRV_NAME, method_name, 3744ecd4a68aSIlya Dryomov CEPH_OSD_FLAG_READ, req_page, outbound_size, 3745ecd4a68aSIlya Dryomov reply_page, &inbound_size); 3746ecd4a68aSIlya Dryomov if (!ret) { 3747ecd4a68aSIlya Dryomov memcpy(inbound, page_address(reply_page), inbound_size); 3748ecd4a68aSIlya Dryomov ret = inbound_size; 3749ecd4a68aSIlya Dryomov } 375057385b51SAlex Elder 3751ecd4a68aSIlya Dryomov if (req_page) 3752ecd4a68aSIlya Dryomov __free_page(req_page); 3753ecd4a68aSIlya Dryomov __free_page(reply_page); 375436be9a76SAlex Elder return ret; 375536be9a76SAlex Elder } 375636be9a76SAlex Elder 3757ed95b21aSIlya Dryomov /* 3758ed95b21aSIlya Dryomov * lock_rwsem must be held for read 3759ed95b21aSIlya Dryomov */ 37602f18d466SIlya Dryomov static int rbd_wait_state_locked(struct rbd_device *rbd_dev, bool may_acquire) 3761ed95b21aSIlya Dryomov { 3762ed95b21aSIlya Dryomov DEFINE_WAIT(wait); 376334f55d0bSDongsheng Yang unsigned long timeout; 37642f18d466SIlya Dryomov int ret = 0; 37652f18d466SIlya Dryomov 37662f18d466SIlya Dryomov if (test_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags)) 37672f18d466SIlya Dryomov return -EBLACKLISTED; 
37682f18d466SIlya Dryomov 37692f18d466SIlya Dryomov if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED) 37702f18d466SIlya Dryomov return 0; 37712f18d466SIlya Dryomov 37722f18d466SIlya Dryomov if (!may_acquire) { 37732f18d466SIlya Dryomov rbd_warn(rbd_dev, "exclusive lock required"); 37742f18d466SIlya Dryomov return -EROFS; 37752f18d466SIlya Dryomov } 3776ed95b21aSIlya Dryomov 3777ed95b21aSIlya Dryomov do { 3778ed95b21aSIlya Dryomov /* 3779ed95b21aSIlya Dryomov * Note the use of mod_delayed_work() in rbd_acquire_lock() 3780ed95b21aSIlya Dryomov * and cancel_delayed_work() in wake_requests(). 3781ed95b21aSIlya Dryomov */ 3782ed95b21aSIlya Dryomov dout("%s rbd_dev %p queueing lock_dwork\n", __func__, rbd_dev); 3783ed95b21aSIlya Dryomov queue_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0); 3784ed95b21aSIlya Dryomov prepare_to_wait_exclusive(&rbd_dev->lock_waitq, &wait, 3785ed95b21aSIlya Dryomov TASK_UNINTERRUPTIBLE); 3786ed95b21aSIlya Dryomov up_read(&rbd_dev->lock_rwsem); 378734f55d0bSDongsheng Yang timeout = schedule_timeout(ceph_timeout_jiffies( 378834f55d0bSDongsheng Yang rbd_dev->opts->lock_timeout)); 3789ed95b21aSIlya Dryomov down_read(&rbd_dev->lock_rwsem); 37902f18d466SIlya Dryomov if (test_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags)) { 37912f18d466SIlya Dryomov ret = -EBLACKLISTED; 37922f18d466SIlya Dryomov break; 37932f18d466SIlya Dryomov } 379434f55d0bSDongsheng Yang if (!timeout) { 379534f55d0bSDongsheng Yang rbd_warn(rbd_dev, "timed out waiting for lock"); 379634f55d0bSDongsheng Yang ret = -ETIMEDOUT; 379734f55d0bSDongsheng Yang break; 379834f55d0bSDongsheng Yang } 37992f18d466SIlya Dryomov } while (rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED); 380087c0fdedSIlya Dryomov 3801ed95b21aSIlya Dryomov finish_wait(&rbd_dev->lock_waitq, &wait); 38022f18d466SIlya Dryomov return ret; 3803ed95b21aSIlya Dryomov } 3804ed95b21aSIlya Dryomov 38057ad18afaSChristoph Hellwig static void rbd_queue_workfn(struct work_struct *work) 3806bc1ecc65SIlya Dryomov { 
38077ad18afaSChristoph Hellwig struct request *rq = blk_mq_rq_from_pdu(work); 38087ad18afaSChristoph Hellwig struct rbd_device *rbd_dev = rq->q->queuedata; 3809bc1ecc65SIlya Dryomov struct rbd_img_request *img_request; 38104e752f0aSJosh Durgin struct ceph_snap_context *snapc = NULL; 3811bc1ecc65SIlya Dryomov u64 offset = (u64)blk_rq_pos(rq) << SECTOR_SHIFT; 3812bc1ecc65SIlya Dryomov u64 length = blk_rq_bytes(rq); 38136d2940c8SGuangliang Zhao enum obj_operation_type op_type; 38144e752f0aSJosh Durgin u64 mapping_size; 381580de1912SIlya Dryomov bool must_be_locked; 3816bc1ecc65SIlya Dryomov int result; 3817bc1ecc65SIlya Dryomov 3818aebf526bSChristoph Hellwig switch (req_op(rq)) { 3819aebf526bSChristoph Hellwig case REQ_OP_DISCARD: 3820aebf526bSChristoph Hellwig op_type = OBJ_OP_DISCARD; 3821aebf526bSChristoph Hellwig break; 38226484cbe9SIlya Dryomov case REQ_OP_WRITE_ZEROES: 38236484cbe9SIlya Dryomov op_type = OBJ_OP_ZEROOUT; 38246484cbe9SIlya Dryomov break; 3825aebf526bSChristoph Hellwig case REQ_OP_WRITE: 3826aebf526bSChristoph Hellwig op_type = OBJ_OP_WRITE; 3827aebf526bSChristoph Hellwig break; 3828aebf526bSChristoph Hellwig case REQ_OP_READ: 3829aebf526bSChristoph Hellwig op_type = OBJ_OP_READ; 3830aebf526bSChristoph Hellwig break; 3831aebf526bSChristoph Hellwig default: 3832aebf526bSChristoph Hellwig dout("%s: non-fs request type %d\n", __func__, req_op(rq)); 38337ad18afaSChristoph Hellwig result = -EIO; 38347ad18afaSChristoph Hellwig goto err; 38357ad18afaSChristoph Hellwig } 38367ad18afaSChristoph Hellwig 3837bc1ecc65SIlya Dryomov /* Ignore/skip any zero-length requests */ 3838bc1ecc65SIlya Dryomov 3839bc1ecc65SIlya Dryomov if (!length) { 3840bc1ecc65SIlya Dryomov dout("%s: zero-length request\n", __func__); 3841bc1ecc65SIlya Dryomov result = 0; 3842bc1ecc65SIlya Dryomov goto err_rq; 3843bc1ecc65SIlya Dryomov } 3844bc1ecc65SIlya Dryomov 38459568c93eSIlya Dryomov rbd_assert(op_type == OBJ_OP_READ || 38469568c93eSIlya Dryomov rbd_dev->spec->snap_id == 
CEPH_NOSNAP); 3847bc1ecc65SIlya Dryomov 3848bc1ecc65SIlya Dryomov /* 3849bc1ecc65SIlya Dryomov * Quit early if the mapped snapshot no longer exists. It's 3850bc1ecc65SIlya Dryomov * still possible the snapshot will have disappeared by the 3851bc1ecc65SIlya Dryomov * time our request arrives at the osd, but there's no sense in 3852bc1ecc65SIlya Dryomov * sending it if we already know. 3853bc1ecc65SIlya Dryomov */ 3854bc1ecc65SIlya Dryomov if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) { 3855bc1ecc65SIlya Dryomov dout("request for non-existent snapshot"); 3856bc1ecc65SIlya Dryomov rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP); 3857bc1ecc65SIlya Dryomov result = -ENXIO; 3858bc1ecc65SIlya Dryomov goto err_rq; 3859bc1ecc65SIlya Dryomov } 3860bc1ecc65SIlya Dryomov 3861bc1ecc65SIlya Dryomov if (offset && length > U64_MAX - offset + 1) { 3862bc1ecc65SIlya Dryomov rbd_warn(rbd_dev, "bad request range (%llu~%llu)", offset, 3863bc1ecc65SIlya Dryomov length); 3864bc1ecc65SIlya Dryomov result = -EINVAL; 3865bc1ecc65SIlya Dryomov goto err_rq; /* Shouldn't happen */ 3866bc1ecc65SIlya Dryomov } 3867bc1ecc65SIlya Dryomov 38687ad18afaSChristoph Hellwig blk_mq_start_request(rq); 38697ad18afaSChristoph Hellwig 38704e752f0aSJosh Durgin down_read(&rbd_dev->header_rwsem); 38714e752f0aSJosh Durgin mapping_size = rbd_dev->mapping.size; 38726d2940c8SGuangliang Zhao if (op_type != OBJ_OP_READ) { 38734e752f0aSJosh Durgin snapc = rbd_dev->header.snapc; 38744e752f0aSJosh Durgin ceph_get_snap_context(snapc); 38754e752f0aSJosh Durgin } 38764e752f0aSJosh Durgin up_read(&rbd_dev->header_rwsem); 38774e752f0aSJosh Durgin 38784e752f0aSJosh Durgin if (offset + length > mapping_size) { 3879bc1ecc65SIlya Dryomov rbd_warn(rbd_dev, "beyond EOD (%llu~%llu > %llu)", offset, 38804e752f0aSJosh Durgin length, mapping_size); 3881bc1ecc65SIlya Dryomov result = -EIO; 3882bc1ecc65SIlya Dryomov goto err_rq; 3883bc1ecc65SIlya Dryomov } 3884bc1ecc65SIlya Dryomov 3885f9bebd58SIlya Dryomov must_be_locked = 
		(rbd_dev->header.features & RBD_FEATURE_EXCLUSIVE_LOCK) &&
		(op_type != OBJ_OP_READ || rbd_dev->opts->lock_on_read);
	if (must_be_locked) {
		/* lock_rwsem stays read-held until submit or the error path */
		down_read(&rbd_dev->lock_rwsem);
		result = rbd_wait_state_locked(rbd_dev,
					       !rbd_dev->opts->exclusive);
		if (result)
			goto err_unlock;
	}

	img_request = rbd_img_request_create(rbd_dev, op_type, snapc);
	if (!img_request) {
		result = -ENOMEM;
		goto err_unlock;
	}
	img_request->rq = rq;
	snapc = NULL; /* img_request consumes a ref */

	if (op_type == OBJ_OP_DISCARD || op_type == OBJ_OP_ZEROOUT)
		result = rbd_img_fill_nodata(img_request, offset, length);
	else
		result = rbd_img_fill_from_bio(img_request, offset, length,
					       rq->bio);
	/*
	 * A zero pending_count with result == 0 also takes the exit path
	 * below; the block request is then ended with a success status.
	 */
	if (result || !img_request->pending_count)
		goto err_img_request;

	rbd_img_request_submit(img_request);
	if (must_be_locked)
		up_read(&rbd_dev->lock_rwsem);
	return;

err_img_request:
	rbd_img_request_put(img_request);
err_unlock:
	if (must_be_locked)
		up_read(&rbd_dev->lock_rwsem);
err_rq:
	if (result)
		rbd_warn(rbd_dev, "%s %llx at %llx result %d",
			 obj_op_name(op_type), length, offset, result);
	/* snapc was set to NULL above once img_request took over the ref */
	ceph_put_snap_context(snapc);
err:
	blk_mq_end_request(rq, errno_to_blk_status(result));
}

/*
 * blk-mq ->queue_rq handler.  The per-request payload is a work_struct
 * (see rbd_init_request()); the request is handed to rbd_wq by queueing
 * that work item, and BLK_STS_OK is always returned.
 */
static blk_status_t rbd_queue_rq(struct blk_mq_hw_ctx *hctx,
		const struct blk_mq_queue_data *bd)
{
	struct request *rq = bd->rq;
	struct work_struct *work = blk_mq_rq_to_pdu(rq);

	queue_work(rbd_wq, work);
	return BLK_STS_OK;
}

/*
 * Tear down what rbd_init_disk() set up: clean up the request queue,
 * free the blk-mq tag set, drop the gendisk reference and clear
 * rbd_dev->disk.
 */
static void rbd_free_disk(struct rbd_device *rbd_dev)
{
	blk_cleanup_queue(rbd_dev->disk->queue);
	blk_mq_free_tag_set(&rbd_dev->tag_set);
	put_disk(rbd_dev->disk);
	rbd_dev->disk = NULL;
}

/*
 * Synchronously read from the object identified by @oid / @oloc into
 * @buf, requesting @buf_len bytes.
 *
 * Returns the value of ceph_osdc_wait_request() -- when non-negative it
 * is used as the number of bytes copied into @buf -- or a negative
 * errno if allocating the request, its pages or its messages fails.
 */
static int rbd_obj_read_sync(struct rbd_device *rbd_dev,
			     struct ceph_object_id *oid,
			     struct ceph_object_locator *oloc,
			     void *buf, int buf_len)

{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	struct ceph_osd_request *req;
	struct page **pages;
	int num_pages = calc_pages_for(0, buf_len);
	int ret;

	req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_KERNEL);
	if (!req)
		return -ENOMEM;

	ceph_oid_copy(&req->r_base_oid, oid);
	ceph_oloc_copy(&req->r_base_oloc, oloc);
	req->r_flags = CEPH_OSD_FLAG_READ;

	pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
	if (IS_ERR(pages)) {
		ret = PTR_ERR(pages);
		goto out_req;
	}

	osd_req_op_extent_init(req, 0, CEPH_OSD_OP_READ, 0, buf_len, 0, 0);
	/*
	 * NOTE(review): the trailing "true" presumably marks the page
	 * vector as owned by the request, which would explain why no
	 * explicit free of @pages appears below -- confirm against
	 * osd_req_op_extent_osd_data_pages().
	 */
	osd_req_op_extent_osd_data_pages(req, 0, pages, buf_len, 0, false,
					 true);

	ret = ceph_osdc_alloc_messages(req, GFP_KERNEL);
	if (ret)
		goto out_req;

	ceph_osdc_start_request(osdc, req, false);
	ret = ceph_osdc_wait_request(osdc, req);
	if (ret >= 0)
		ceph_copy_from_page_vector(pages, buf, 0, ret);

out_req:
	ceph_osdc_put_request(req);
	return ret;
}

/*
 * Read the complete header for the given rbd device.  On successful
 * return, the rbd_dev->header field will contain up-to-date
 * information about the image.
39974156d998SAlex Elder */ 399899a41ebcSAlex Elder static int rbd_dev_v1_header_info(struct rbd_device *rbd_dev) 39994156d998SAlex Elder { 40004156d998SAlex Elder struct rbd_image_header_ondisk *ondisk = NULL; 40014156d998SAlex Elder u32 snap_count = 0; 40024156d998SAlex Elder u64 names_size = 0; 40034156d998SAlex Elder u32 want_count; 40044156d998SAlex Elder int ret; 40054156d998SAlex Elder 40064156d998SAlex Elder /* 40074156d998SAlex Elder * The complete header will include an array of its 64-bit 40084156d998SAlex Elder * snapshot ids, followed by the names of those snapshots as 40094156d998SAlex Elder * a contiguous block of NUL-terminated strings. Note that 40104156d998SAlex Elder * the number of snapshots could change by the time we read 40114156d998SAlex Elder * it in, in which case we re-read it. 40124156d998SAlex Elder */ 40134156d998SAlex Elder do { 40144156d998SAlex Elder size_t size; 40154156d998SAlex Elder 40164156d998SAlex Elder kfree(ondisk); 40174156d998SAlex Elder 40184156d998SAlex Elder size = sizeof (*ondisk); 40194156d998SAlex Elder size += snap_count * sizeof (struct rbd_image_snap_ondisk); 40204156d998SAlex Elder size += names_size; 40214156d998SAlex Elder ondisk = kmalloc(size, GFP_KERNEL); 40224156d998SAlex Elder if (!ondisk) 4023662518b1SAlex Elder return -ENOMEM; 40244156d998SAlex Elder 4025fe5478e0SIlya Dryomov ret = rbd_obj_read_sync(rbd_dev, &rbd_dev->header_oid, 4026fe5478e0SIlya Dryomov &rbd_dev->header_oloc, ondisk, size); 40274156d998SAlex Elder if (ret < 0) 4028662518b1SAlex Elder goto out; 4029c0cd10dbSAlex Elder if ((size_t)ret < size) { 40304156d998SAlex Elder ret = -ENXIO; 403106ecc6cbSAlex Elder rbd_warn(rbd_dev, "short header read (want %zd got %d)", 403206ecc6cbSAlex Elder size, ret); 4033662518b1SAlex Elder goto out; 40344156d998SAlex Elder } 40354156d998SAlex Elder if (!rbd_dev_ondisk_valid(ondisk)) { 40364156d998SAlex Elder ret = -ENXIO; 403706ecc6cbSAlex Elder rbd_warn(rbd_dev, "invalid header"); 4038662518b1SAlex Elder 
goto out; 40394156d998SAlex Elder } 40404156d998SAlex Elder 40414156d998SAlex Elder names_size = le64_to_cpu(ondisk->snap_names_len); 40424156d998SAlex Elder want_count = snap_count; 40434156d998SAlex Elder snap_count = le32_to_cpu(ondisk->snap_count); 40444156d998SAlex Elder } while (snap_count != want_count); 40454156d998SAlex Elder 4046662518b1SAlex Elder ret = rbd_header_from_disk(rbd_dev, ondisk); 4047662518b1SAlex Elder out: 40484156d998SAlex Elder kfree(ondisk); 40494156d998SAlex Elder 4050dfc5606dSYehuda Sadeh return ret; 4051602adf40SYehuda Sadeh } 4052602adf40SYehuda Sadeh 405315228edeSAlex Elder /* 405415228edeSAlex Elder * Clear the rbd device's EXISTS flag if the snapshot it's mapped to 405515228edeSAlex Elder * has disappeared from the (just updated) snapshot context. 405615228edeSAlex Elder */ 405715228edeSAlex Elder static void rbd_exists_validate(struct rbd_device *rbd_dev) 405815228edeSAlex Elder { 405915228edeSAlex Elder u64 snap_id; 406015228edeSAlex Elder 406115228edeSAlex Elder if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) 406215228edeSAlex Elder return; 406315228edeSAlex Elder 406415228edeSAlex Elder snap_id = rbd_dev->spec->snap_id; 406515228edeSAlex Elder if (snap_id == CEPH_NOSNAP) 406615228edeSAlex Elder return; 406715228edeSAlex Elder 406815228edeSAlex Elder if (rbd_dev_snap_index(rbd_dev, snap_id) == BAD_SNAP_INDEX) 406915228edeSAlex Elder clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags); 407015228edeSAlex Elder } 407115228edeSAlex Elder 40729875201eSJosh Durgin static void rbd_dev_update_size(struct rbd_device *rbd_dev) 40739875201eSJosh Durgin { 40749875201eSJosh Durgin sector_t size; 40759875201eSJosh Durgin 40769875201eSJosh Durgin /* 4077811c6688SIlya Dryomov * If EXISTS is not set, rbd_dev->disk may be NULL, so don't 4078811c6688SIlya Dryomov * try to update its size. If REMOVING is set, updating size 4079811c6688SIlya Dryomov * is just useless work since the device can't be opened. 
40809875201eSJosh Durgin */ 4081811c6688SIlya Dryomov if (test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags) && 4082811c6688SIlya Dryomov !test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags)) { 40839875201eSJosh Durgin size = (sector_t)rbd_dev->mapping.size / SECTOR_SIZE; 40849875201eSJosh Durgin dout("setting size to %llu sectors", (unsigned long long)size); 40859875201eSJosh Durgin set_capacity(rbd_dev->disk, size); 40869875201eSJosh Durgin revalidate_disk(rbd_dev->disk); 40879875201eSJosh Durgin } 40889875201eSJosh Durgin } 40899875201eSJosh Durgin 4090cc4a38bdSAlex Elder static int rbd_dev_refresh(struct rbd_device *rbd_dev) 40911fe5e993SAlex Elder { 4092e627db08SAlex Elder u64 mapping_size; 40931fe5e993SAlex Elder int ret; 40941fe5e993SAlex Elder 4095cfbf6377SAlex Elder down_write(&rbd_dev->header_rwsem); 40963b5cf2a2SAlex Elder mapping_size = rbd_dev->mapping.size; 4097a720ae09SIlya Dryomov 4098a720ae09SIlya Dryomov ret = rbd_dev_header_info(rbd_dev); 409952bb1f9bSIlya Dryomov if (ret) 410073e39e4dSIlya Dryomov goto out; 410115228edeSAlex Elder 4102e8f59b59SIlya Dryomov /* 4103e8f59b59SIlya Dryomov * If there is a parent, see if it has disappeared due to the 4104e8f59b59SIlya Dryomov * mapped image getting flattened. 
4105e8f59b59SIlya Dryomov */ 4106e8f59b59SIlya Dryomov if (rbd_dev->parent) { 4107e8f59b59SIlya Dryomov ret = rbd_dev_v2_parent_info(rbd_dev); 4108e8f59b59SIlya Dryomov if (ret) 410973e39e4dSIlya Dryomov goto out; 4110e8f59b59SIlya Dryomov } 4111e8f59b59SIlya Dryomov 41125ff1108cSIlya Dryomov if (rbd_dev->spec->snap_id == CEPH_NOSNAP) { 41135ff1108cSIlya Dryomov rbd_dev->mapping.size = rbd_dev->header.image_size; 41145ff1108cSIlya Dryomov } else { 41155ff1108cSIlya Dryomov /* validate mapped snapshot's EXISTS flag */ 411615228edeSAlex Elder rbd_exists_validate(rbd_dev); 41175ff1108cSIlya Dryomov } 41185ff1108cSIlya Dryomov 411973e39e4dSIlya Dryomov out: 4120cfbf6377SAlex Elder up_write(&rbd_dev->header_rwsem); 412173e39e4dSIlya Dryomov if (!ret && mapping_size != rbd_dev->mapping.size) 41229875201eSJosh Durgin rbd_dev_update_size(rbd_dev); 41231fe5e993SAlex Elder 412473e39e4dSIlya Dryomov return ret; 41251fe5e993SAlex Elder } 41261fe5e993SAlex Elder 4127d6296d39SChristoph Hellwig static int rbd_init_request(struct blk_mq_tag_set *set, struct request *rq, 4128d6296d39SChristoph Hellwig unsigned int hctx_idx, unsigned int numa_node) 41297ad18afaSChristoph Hellwig { 41307ad18afaSChristoph Hellwig struct work_struct *work = blk_mq_rq_to_pdu(rq); 41317ad18afaSChristoph Hellwig 41327ad18afaSChristoph Hellwig INIT_WORK(work, rbd_queue_workfn); 41337ad18afaSChristoph Hellwig return 0; 41347ad18afaSChristoph Hellwig } 41357ad18afaSChristoph Hellwig 4136f363b089SEric Biggers static const struct blk_mq_ops rbd_mq_ops = { 41377ad18afaSChristoph Hellwig .queue_rq = rbd_queue_rq, 41387ad18afaSChristoph Hellwig .init_request = rbd_init_request, 41397ad18afaSChristoph Hellwig }; 41407ad18afaSChristoph Hellwig 4141602adf40SYehuda Sadeh static int rbd_init_disk(struct rbd_device *rbd_dev) 4142602adf40SYehuda Sadeh { 4143602adf40SYehuda Sadeh struct gendisk *disk; 4144602adf40SYehuda Sadeh struct request_queue *q; 4145420efbdfSIlya Dryomov unsigned int objset_bytes = 
4146420efbdfSIlya Dryomov rbd_dev->layout.object_size * rbd_dev->layout.stripe_count; 41477ad18afaSChristoph Hellwig int err; 4148602adf40SYehuda Sadeh 4149602adf40SYehuda Sadeh /* create gendisk info */ 41507e513d43SIlya Dryomov disk = alloc_disk(single_major ? 41517e513d43SIlya Dryomov (1 << RBD_SINGLE_MAJOR_PART_SHIFT) : 41527e513d43SIlya Dryomov RBD_MINORS_PER_MAJOR); 4153602adf40SYehuda Sadeh if (!disk) 41541fcdb8aaSAlex Elder return -ENOMEM; 4155602adf40SYehuda Sadeh 4156f0f8cef5SAlex Elder snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d", 4157de71a297SAlex Elder rbd_dev->dev_id); 4158602adf40SYehuda Sadeh disk->major = rbd_dev->major; 4159dd82fff1SIlya Dryomov disk->first_minor = rbd_dev->minor; 41607e513d43SIlya Dryomov if (single_major) 41617e513d43SIlya Dryomov disk->flags |= GENHD_FL_EXT_DEVT; 4162602adf40SYehuda Sadeh disk->fops = &rbd_bd_ops; 4163602adf40SYehuda Sadeh disk->private_data = rbd_dev; 4164602adf40SYehuda Sadeh 41657ad18afaSChristoph Hellwig memset(&rbd_dev->tag_set, 0, sizeof(rbd_dev->tag_set)); 41667ad18afaSChristoph Hellwig rbd_dev->tag_set.ops = &rbd_mq_ops; 4167b5584180SIlya Dryomov rbd_dev->tag_set.queue_depth = rbd_dev->opts->queue_depth; 41687ad18afaSChristoph Hellwig rbd_dev->tag_set.numa_node = NUMA_NO_NODE; 416956d18f62SMing Lei rbd_dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE; 41707ad18afaSChristoph Hellwig rbd_dev->tag_set.nr_hw_queues = 1; 41717ad18afaSChristoph Hellwig rbd_dev->tag_set.cmd_size = sizeof(struct work_struct); 41727ad18afaSChristoph Hellwig 41737ad18afaSChristoph Hellwig err = blk_mq_alloc_tag_set(&rbd_dev->tag_set); 41747ad18afaSChristoph Hellwig if (err) 4175602adf40SYehuda Sadeh goto out_disk; 4176029bcbd8SJosh Durgin 41777ad18afaSChristoph Hellwig q = blk_mq_init_queue(&rbd_dev->tag_set); 41787ad18afaSChristoph Hellwig if (IS_ERR(q)) { 41797ad18afaSChristoph Hellwig err = PTR_ERR(q); 41807ad18afaSChristoph Hellwig goto out_tag_set; 41817ad18afaSChristoph Hellwig } 41827ad18afaSChristoph 
Hellwig 41838b904b5bSBart Van Assche blk_queue_flag_set(QUEUE_FLAG_NONROT, q); 4184d8a2c89cSIlya Dryomov /* QUEUE_FLAG_ADD_RANDOM is off by default for blk-mq */ 4185593a9e7bSAlex Elder 4186420efbdfSIlya Dryomov blk_queue_max_hw_sectors(q, objset_bytes >> SECTOR_SHIFT); 41870d9fde4fSIlya Dryomov q->limits.max_sectors = queue_max_hw_sectors(q); 418821acdf45SIlya Dryomov blk_queue_max_segments(q, USHRT_MAX); 418924f1df60SIlya Dryomov blk_queue_max_segment_size(q, UINT_MAX); 419016d80c54SIlya Dryomov blk_queue_io_min(q, rbd_dev->opts->alloc_size); 419116d80c54SIlya Dryomov blk_queue_io_opt(q, rbd_dev->opts->alloc_size); 4192029bcbd8SJosh Durgin 4193d9360540SIlya Dryomov if (rbd_dev->opts->trim) { 41948b904b5bSBart Van Assche blk_queue_flag_set(QUEUE_FLAG_DISCARD, q); 419516d80c54SIlya Dryomov q->limits.discard_granularity = rbd_dev->opts->alloc_size; 4196420efbdfSIlya Dryomov blk_queue_max_discard_sectors(q, objset_bytes >> SECTOR_SHIFT); 4197420efbdfSIlya Dryomov blk_queue_max_write_zeroes_sectors(q, objset_bytes >> SECTOR_SHIFT); 4198d9360540SIlya Dryomov } 419990e98c52SGuangliang Zhao 4200bae818eeSRonny Hegewald if (!ceph_test_opt(rbd_dev->rbd_client->client, NOCRC)) 4201dc3b17ccSJan Kara q->backing_dev_info->capabilities |= BDI_CAP_STABLE_WRITES; 4202bae818eeSRonny Hegewald 42035769ed0cSIlya Dryomov /* 42045769ed0cSIlya Dryomov * disk_release() expects a queue ref from add_disk() and will 42055769ed0cSIlya Dryomov * put it. Hold an extra ref until add_disk() is called. 
42065769ed0cSIlya Dryomov */ 42075769ed0cSIlya Dryomov WARN_ON(!blk_get_queue(q)); 4208602adf40SYehuda Sadeh disk->queue = q; 4209602adf40SYehuda Sadeh q->queuedata = rbd_dev; 4210602adf40SYehuda Sadeh 4211602adf40SYehuda Sadeh rbd_dev->disk = disk; 4212602adf40SYehuda Sadeh 4213602adf40SYehuda Sadeh return 0; 42147ad18afaSChristoph Hellwig out_tag_set: 42157ad18afaSChristoph Hellwig blk_mq_free_tag_set(&rbd_dev->tag_set); 4216602adf40SYehuda Sadeh out_disk: 4217602adf40SYehuda Sadeh put_disk(disk); 42187ad18afaSChristoph Hellwig return err; 4219602adf40SYehuda Sadeh } 4220602adf40SYehuda Sadeh 4221dfc5606dSYehuda Sadeh /* 4222dfc5606dSYehuda Sadeh sysfs 4223dfc5606dSYehuda Sadeh */ 4224602adf40SYehuda Sadeh 4225593a9e7bSAlex Elder static struct rbd_device *dev_to_rbd_dev(struct device *dev) 4226593a9e7bSAlex Elder { 4227593a9e7bSAlex Elder return container_of(dev, struct rbd_device, dev); 4228593a9e7bSAlex Elder } 4229593a9e7bSAlex Elder 4230dfc5606dSYehuda Sadeh static ssize_t rbd_size_show(struct device *dev, 4231dfc5606dSYehuda Sadeh struct device_attribute *attr, char *buf) 4232602adf40SYehuda Sadeh { 4233593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 4234dfc5606dSYehuda Sadeh 4235fc71d833SAlex Elder return sprintf(buf, "%llu\n", 4236fc71d833SAlex Elder (unsigned long long)rbd_dev->mapping.size); 4237602adf40SYehuda Sadeh } 4238602adf40SYehuda Sadeh 423934b13184SAlex Elder /* 424034b13184SAlex Elder * Note this shows the features for whatever's mapped, which is not 424134b13184SAlex Elder * necessarily the base image. 
424234b13184SAlex Elder */ 424334b13184SAlex Elder static ssize_t rbd_features_show(struct device *dev, 424434b13184SAlex Elder struct device_attribute *attr, char *buf) 424534b13184SAlex Elder { 424634b13184SAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 424734b13184SAlex Elder 424834b13184SAlex Elder return sprintf(buf, "0x%016llx\n", 424934b13184SAlex Elder (unsigned long long)rbd_dev->mapping.features); 425034b13184SAlex Elder } 425134b13184SAlex Elder 4252dfc5606dSYehuda Sadeh static ssize_t rbd_major_show(struct device *dev, 4253dfc5606dSYehuda Sadeh struct device_attribute *attr, char *buf) 4254602adf40SYehuda Sadeh { 4255593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 4256dfc5606dSYehuda Sadeh 4257fc71d833SAlex Elder if (rbd_dev->major) 4258dfc5606dSYehuda Sadeh return sprintf(buf, "%d\n", rbd_dev->major); 4259fc71d833SAlex Elder 4260fc71d833SAlex Elder return sprintf(buf, "(none)\n"); 4261dd82fff1SIlya Dryomov } 4262fc71d833SAlex Elder 4263dd82fff1SIlya Dryomov static ssize_t rbd_minor_show(struct device *dev, 4264dd82fff1SIlya Dryomov struct device_attribute *attr, char *buf) 4265dd82fff1SIlya Dryomov { 4266dd82fff1SIlya Dryomov struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 4267dd82fff1SIlya Dryomov 4268dd82fff1SIlya Dryomov return sprintf(buf, "%d\n", rbd_dev->minor); 4269dfc5606dSYehuda Sadeh } 4270dfc5606dSYehuda Sadeh 4271005a07bfSIlya Dryomov static ssize_t rbd_client_addr_show(struct device *dev, 4272005a07bfSIlya Dryomov struct device_attribute *attr, char *buf) 4273005a07bfSIlya Dryomov { 4274005a07bfSIlya Dryomov struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 4275005a07bfSIlya Dryomov struct ceph_entity_addr *client_addr = 4276005a07bfSIlya Dryomov ceph_client_addr(rbd_dev->rbd_client->client); 4277005a07bfSIlya Dryomov 4278005a07bfSIlya Dryomov return sprintf(buf, "%pISpc/%u\n", &client_addr->in_addr, 4279005a07bfSIlya Dryomov le32_to_cpu(client_addr->nonce)); 4280005a07bfSIlya Dryomov } 
4281005a07bfSIlya Dryomov 4282dfc5606dSYehuda Sadeh static ssize_t rbd_client_id_show(struct device *dev, 4283dfc5606dSYehuda Sadeh struct device_attribute *attr, char *buf) 4284dfc5606dSYehuda Sadeh { 4285593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 4286dfc5606dSYehuda Sadeh 42871dbb4399SAlex Elder return sprintf(buf, "client%lld\n", 4288033268a5SIlya Dryomov ceph_client_gid(rbd_dev->rbd_client->client)); 4289dfc5606dSYehuda Sadeh } 4290dfc5606dSYehuda Sadeh 4291267fb90bSMike Christie static ssize_t rbd_cluster_fsid_show(struct device *dev, 4292267fb90bSMike Christie struct device_attribute *attr, char *buf) 4293267fb90bSMike Christie { 4294267fb90bSMike Christie struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 4295267fb90bSMike Christie 4296267fb90bSMike Christie return sprintf(buf, "%pU\n", &rbd_dev->rbd_client->client->fsid); 4297267fb90bSMike Christie } 4298267fb90bSMike Christie 42990d6d1e9cSMike Christie static ssize_t rbd_config_info_show(struct device *dev, 43000d6d1e9cSMike Christie struct device_attribute *attr, char *buf) 43010d6d1e9cSMike Christie { 43020d6d1e9cSMike Christie struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 43030d6d1e9cSMike Christie 43040d6d1e9cSMike Christie return sprintf(buf, "%s\n", rbd_dev->config_info); 4305dfc5606dSYehuda Sadeh } 4306dfc5606dSYehuda Sadeh 4307dfc5606dSYehuda Sadeh static ssize_t rbd_pool_show(struct device *dev, 4308dfc5606dSYehuda Sadeh struct device_attribute *attr, char *buf) 4309dfc5606dSYehuda Sadeh { 4310593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 4311dfc5606dSYehuda Sadeh 43120d7dbfceSAlex Elder return sprintf(buf, "%s\n", rbd_dev->spec->pool_name); 4313dfc5606dSYehuda Sadeh } 4314dfc5606dSYehuda Sadeh 43159bb2f334SAlex Elder static ssize_t rbd_pool_id_show(struct device *dev, 43169bb2f334SAlex Elder struct device_attribute *attr, char *buf) 43179bb2f334SAlex Elder { 43189bb2f334SAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 
43199bb2f334SAlex Elder 43200d7dbfceSAlex Elder return sprintf(buf, "%llu\n", 43210d7dbfceSAlex Elder (unsigned long long) rbd_dev->spec->pool_id); 43229bb2f334SAlex Elder } 43239bb2f334SAlex Elder 4324b26c047bSIlya Dryomov static ssize_t rbd_pool_ns_show(struct device *dev, 4325b26c047bSIlya Dryomov struct device_attribute *attr, char *buf) 4326b26c047bSIlya Dryomov { 4327b26c047bSIlya Dryomov struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 4328b26c047bSIlya Dryomov 4329b26c047bSIlya Dryomov return sprintf(buf, "%s\n", rbd_dev->spec->pool_ns ?: ""); 4330b26c047bSIlya Dryomov } 4331b26c047bSIlya Dryomov 4332dfc5606dSYehuda Sadeh static ssize_t rbd_name_show(struct device *dev, 4333dfc5606dSYehuda Sadeh struct device_attribute *attr, char *buf) 4334dfc5606dSYehuda Sadeh { 4335593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 4336dfc5606dSYehuda Sadeh 4337a92ffdf8SAlex Elder if (rbd_dev->spec->image_name) 43380d7dbfceSAlex Elder return sprintf(buf, "%s\n", rbd_dev->spec->image_name); 4339a92ffdf8SAlex Elder 4340a92ffdf8SAlex Elder return sprintf(buf, "(unknown)\n"); 4341dfc5606dSYehuda Sadeh } 4342dfc5606dSYehuda Sadeh 4343589d30e0SAlex Elder static ssize_t rbd_image_id_show(struct device *dev, 4344589d30e0SAlex Elder struct device_attribute *attr, char *buf) 4345589d30e0SAlex Elder { 4346589d30e0SAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 4347589d30e0SAlex Elder 43480d7dbfceSAlex Elder return sprintf(buf, "%s\n", rbd_dev->spec->image_id); 4349589d30e0SAlex Elder } 4350589d30e0SAlex Elder 435134b13184SAlex Elder /* 435234b13184SAlex Elder * Shows the name of the currently-mapped snapshot (or 435334b13184SAlex Elder * RBD_SNAP_HEAD_NAME for the base image). 
435434b13184SAlex Elder */ 4355dfc5606dSYehuda Sadeh static ssize_t rbd_snap_show(struct device *dev, 4356dfc5606dSYehuda Sadeh struct device_attribute *attr, 4357dfc5606dSYehuda Sadeh char *buf) 4358dfc5606dSYehuda Sadeh { 4359593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 4360dfc5606dSYehuda Sadeh 43610d7dbfceSAlex Elder return sprintf(buf, "%s\n", rbd_dev->spec->snap_name); 4362dfc5606dSYehuda Sadeh } 4363dfc5606dSYehuda Sadeh 436492a58671SMike Christie static ssize_t rbd_snap_id_show(struct device *dev, 436592a58671SMike Christie struct device_attribute *attr, char *buf) 436692a58671SMike Christie { 436792a58671SMike Christie struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 436892a58671SMike Christie 436992a58671SMike Christie return sprintf(buf, "%llu\n", rbd_dev->spec->snap_id); 437092a58671SMike Christie } 437192a58671SMike Christie 437286b00e0dSAlex Elder /* 4373ff96128fSIlya Dryomov * For a v2 image, shows the chain of parent images, separated by empty 4374ff96128fSIlya Dryomov * lines. For v1 images or if there is no parent, shows "(no parent 4375ff96128fSIlya Dryomov * image)". 
437686b00e0dSAlex Elder */ 437786b00e0dSAlex Elder static ssize_t rbd_parent_show(struct device *dev, 437886b00e0dSAlex Elder struct device_attribute *attr, 437986b00e0dSAlex Elder char *buf) 438086b00e0dSAlex Elder { 438186b00e0dSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 4382ff96128fSIlya Dryomov ssize_t count = 0; 438386b00e0dSAlex Elder 4384ff96128fSIlya Dryomov if (!rbd_dev->parent) 438586b00e0dSAlex Elder return sprintf(buf, "(no parent image)\n"); 438686b00e0dSAlex Elder 4387ff96128fSIlya Dryomov for ( ; rbd_dev->parent; rbd_dev = rbd_dev->parent) { 4388ff96128fSIlya Dryomov struct rbd_spec *spec = rbd_dev->parent_spec; 438986b00e0dSAlex Elder 4390ff96128fSIlya Dryomov count += sprintf(&buf[count], "%s" 4391ff96128fSIlya Dryomov "pool_id %llu\npool_name %s\n" 4392e92c0eafSIlya Dryomov "pool_ns %s\n" 4393ff96128fSIlya Dryomov "image_id %s\nimage_name %s\n" 4394ff96128fSIlya Dryomov "snap_id %llu\nsnap_name %s\n" 4395ff96128fSIlya Dryomov "overlap %llu\n", 4396ff96128fSIlya Dryomov !count ? "" : "\n", /* first? 
*/ 4397ff96128fSIlya Dryomov spec->pool_id, spec->pool_name, 4398e92c0eafSIlya Dryomov spec->pool_ns ?: "", 4399ff96128fSIlya Dryomov spec->image_id, spec->image_name ?: "(unknown)", 4400ff96128fSIlya Dryomov spec->snap_id, spec->snap_name, 4401ff96128fSIlya Dryomov rbd_dev->parent_overlap); 4402ff96128fSIlya Dryomov } 440386b00e0dSAlex Elder 440486b00e0dSAlex Elder return count; 440586b00e0dSAlex Elder } 440686b00e0dSAlex Elder 4407dfc5606dSYehuda Sadeh static ssize_t rbd_image_refresh(struct device *dev, 4408dfc5606dSYehuda Sadeh struct device_attribute *attr, 4409dfc5606dSYehuda Sadeh const char *buf, 4410dfc5606dSYehuda Sadeh size_t size) 4411dfc5606dSYehuda Sadeh { 4412593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 4413b813623aSAlex Elder int ret; 4414602adf40SYehuda Sadeh 4415cc4a38bdSAlex Elder ret = rbd_dev_refresh(rbd_dev); 4416e627db08SAlex Elder if (ret) 441752bb1f9bSIlya Dryomov return ret; 4418b813623aSAlex Elder 441952bb1f9bSIlya Dryomov return size; 4420dfc5606dSYehuda Sadeh } 4421602adf40SYehuda Sadeh 44225657a819SJoe Perches static DEVICE_ATTR(size, 0444, rbd_size_show, NULL); 44235657a819SJoe Perches static DEVICE_ATTR(features, 0444, rbd_features_show, NULL); 44245657a819SJoe Perches static DEVICE_ATTR(major, 0444, rbd_major_show, NULL); 44255657a819SJoe Perches static DEVICE_ATTR(minor, 0444, rbd_minor_show, NULL); 44265657a819SJoe Perches static DEVICE_ATTR(client_addr, 0444, rbd_client_addr_show, NULL); 44275657a819SJoe Perches static DEVICE_ATTR(client_id, 0444, rbd_client_id_show, NULL); 44285657a819SJoe Perches static DEVICE_ATTR(cluster_fsid, 0444, rbd_cluster_fsid_show, NULL); 44295657a819SJoe Perches static DEVICE_ATTR(config_info, 0400, rbd_config_info_show, NULL); 44305657a819SJoe Perches static DEVICE_ATTR(pool, 0444, rbd_pool_show, NULL); 44315657a819SJoe Perches static DEVICE_ATTR(pool_id, 0444, rbd_pool_id_show, NULL); 4432b26c047bSIlya Dryomov static DEVICE_ATTR(pool_ns, 0444, rbd_pool_ns_show, NULL); 
static DEVICE_ATTR(name, 0444, rbd_name_show, NULL);
static DEVICE_ATTR(image_id, 0444, rbd_image_id_show, NULL);
/* "refresh" is the only write-only attribute: it has a store handler only */
static DEVICE_ATTR(refresh, 0200, NULL, rbd_image_refresh);
static DEVICE_ATTR(current_snap, 0444, rbd_snap_show, NULL);
static DEVICE_ATTR(snap_id, 0444, rbd_snap_id_show, NULL);
static DEVICE_ATTR(parent, 0444, rbd_parent_show, NULL);

/* NULL-terminated list of every per-device sysfs attribute defined above. */
static struct attribute *rbd_attrs[] = {
	&dev_attr_size.attr,
	&dev_attr_features.attr,
	&dev_attr_major.attr,
	&dev_attr_minor.attr,
	&dev_attr_client_addr.attr,
	&dev_attr_client_id.attr,
	&dev_attr_cluster_fsid.attr,
	&dev_attr_config_info.attr,
	&dev_attr_pool.attr,
	&dev_attr_pool_id.attr,
	&dev_attr_pool_ns.attr,
	&dev_attr_name.attr,
	&dev_attr_image_id.attr,
	&dev_attr_current_snap.attr,
	&dev_attr_snap_id.attr,
	&dev_attr_parent.attr,
	&dev_attr_refresh.attr,
	NULL
};

/* Unnamed group wrapping rbd_attrs. */
static struct attribute_group rbd_attr_group = {
	.attrs = rbd_attrs,
};

/* NULL-terminated group list referenced by rbd_device_type.groups. */
static const struct attribute_group *rbd_attr_groups[] = {
	&rbd_attr_group,
	NULL
};

/* Defined further down; needed here for rbd_device_type.release. */
static void rbd_dev_release(struct device *dev);
4471dfc5606dSYehuda Sadeh 4472b9942bc9SBhumika Goyal static const struct device_type rbd_device_type = { 4473dfc5606dSYehuda Sadeh .name = "rbd", 4474dfc5606dSYehuda Sadeh .groups = rbd_attr_groups, 44756cac4695SIlya Dryomov .release = rbd_dev_release, 4476dfc5606dSYehuda Sadeh }; 4477dfc5606dSYehuda Sadeh 44788b8fb99cSAlex Elder static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec) 44798b8fb99cSAlex Elder { 44808b8fb99cSAlex Elder kref_get(&spec->kref); 44818b8fb99cSAlex Elder 44828b8fb99cSAlex Elder return spec; 44838b8fb99cSAlex Elder } 44848b8fb99cSAlex Elder 44858b8fb99cSAlex Elder static void rbd_spec_free(struct kref *kref); 44868b8fb99cSAlex Elder static void rbd_spec_put(struct rbd_spec *spec) 44878b8fb99cSAlex Elder { 44888b8fb99cSAlex Elder if (spec) 44898b8fb99cSAlex Elder kref_put(&spec->kref, rbd_spec_free); 44908b8fb99cSAlex Elder } 44918b8fb99cSAlex Elder 44928b8fb99cSAlex Elder static struct rbd_spec *rbd_spec_alloc(void) 44938b8fb99cSAlex Elder { 44948b8fb99cSAlex Elder struct rbd_spec *spec; 44958b8fb99cSAlex Elder 44968b8fb99cSAlex Elder spec = kzalloc(sizeof (*spec), GFP_KERNEL); 44978b8fb99cSAlex Elder if (!spec) 44988b8fb99cSAlex Elder return NULL; 449904077599SIlya Dryomov 450004077599SIlya Dryomov spec->pool_id = CEPH_NOPOOL; 450104077599SIlya Dryomov spec->snap_id = CEPH_NOSNAP; 45028b8fb99cSAlex Elder kref_init(&spec->kref); 45038b8fb99cSAlex Elder 45048b8fb99cSAlex Elder return spec; 45058b8fb99cSAlex Elder } 45068b8fb99cSAlex Elder 45078b8fb99cSAlex Elder static void rbd_spec_free(struct kref *kref) 45088b8fb99cSAlex Elder { 45098b8fb99cSAlex Elder struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref); 45108b8fb99cSAlex Elder 45118b8fb99cSAlex Elder kfree(spec->pool_name); 4512b26c047bSIlya Dryomov kfree(spec->pool_ns); 45138b8fb99cSAlex Elder kfree(spec->image_id); 45148b8fb99cSAlex Elder kfree(spec->image_name); 45158b8fb99cSAlex Elder kfree(spec->snap_name); 45168b8fb99cSAlex Elder kfree(spec); 45178b8fb99cSAlex 
Elder } 45188b8fb99cSAlex Elder 45191643dfa4SIlya Dryomov static void rbd_dev_free(struct rbd_device *rbd_dev) 4520dd5ac32dSIlya Dryomov { 452199d16943SIlya Dryomov WARN_ON(rbd_dev->watch_state != RBD_WATCH_STATE_UNREGISTERED); 4522ed95b21aSIlya Dryomov WARN_ON(rbd_dev->lock_state != RBD_LOCK_STATE_UNLOCKED); 4523dd5ac32dSIlya Dryomov 4524c41d13a3SIlya Dryomov ceph_oid_destroy(&rbd_dev->header_oid); 45256b6dddbeSIlya Dryomov ceph_oloc_destroy(&rbd_dev->header_oloc); 45260d6d1e9cSMike Christie kfree(rbd_dev->config_info); 4527c41d13a3SIlya Dryomov 4528dd5ac32dSIlya Dryomov rbd_put_client(rbd_dev->rbd_client); 4529dd5ac32dSIlya Dryomov rbd_spec_put(rbd_dev->spec); 4530dd5ac32dSIlya Dryomov kfree(rbd_dev->opts); 4531dd5ac32dSIlya Dryomov kfree(rbd_dev); 45321643dfa4SIlya Dryomov } 45331643dfa4SIlya Dryomov 45341643dfa4SIlya Dryomov static void rbd_dev_release(struct device *dev) 45351643dfa4SIlya Dryomov { 45361643dfa4SIlya Dryomov struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 45371643dfa4SIlya Dryomov bool need_put = !!rbd_dev->opts; 45381643dfa4SIlya Dryomov 45391643dfa4SIlya Dryomov if (need_put) { 45401643dfa4SIlya Dryomov destroy_workqueue(rbd_dev->task_wq); 45411643dfa4SIlya Dryomov ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id); 45421643dfa4SIlya Dryomov } 45431643dfa4SIlya Dryomov 45441643dfa4SIlya Dryomov rbd_dev_free(rbd_dev); 4545dd5ac32dSIlya Dryomov 4546dd5ac32dSIlya Dryomov /* 4547dd5ac32dSIlya Dryomov * This is racy, but way better than putting module outside of 4548dd5ac32dSIlya Dryomov * the release callback. The race window is pretty small, so 4549dd5ac32dSIlya Dryomov * doing something similar to dm (dm-builtin.c) is overkill. 
4550dd5ac32dSIlya Dryomov */ 4551dd5ac32dSIlya Dryomov if (need_put) 4552dd5ac32dSIlya Dryomov module_put(THIS_MODULE); 4553dd5ac32dSIlya Dryomov } 4554dd5ac32dSIlya Dryomov 45551643dfa4SIlya Dryomov static struct rbd_device *__rbd_dev_create(struct rbd_client *rbdc, 45561643dfa4SIlya Dryomov struct rbd_spec *spec) 4557c53d5893SAlex Elder { 4558c53d5893SAlex Elder struct rbd_device *rbd_dev; 4559c53d5893SAlex Elder 4560c53d5893SAlex Elder rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL); 4561c53d5893SAlex Elder if (!rbd_dev) 4562c53d5893SAlex Elder return NULL; 4563c53d5893SAlex Elder 4564c53d5893SAlex Elder spin_lock_init(&rbd_dev->lock); 4565c53d5893SAlex Elder INIT_LIST_HEAD(&rbd_dev->node); 4566c53d5893SAlex Elder init_rwsem(&rbd_dev->header_rwsem); 4567c53d5893SAlex Elder 45687e97332eSIlya Dryomov rbd_dev->header.data_pool_id = CEPH_NOPOOL; 4569c41d13a3SIlya Dryomov ceph_oid_init(&rbd_dev->header_oid); 4570431a02cdSIlya Dryomov rbd_dev->header_oloc.pool = spec->pool_id; 4571b26c047bSIlya Dryomov if (spec->pool_ns) { 4572b26c047bSIlya Dryomov WARN_ON(!*spec->pool_ns); 4573b26c047bSIlya Dryomov rbd_dev->header_oloc.pool_ns = 4574b26c047bSIlya Dryomov ceph_find_or_create_string(spec->pool_ns, 4575b26c047bSIlya Dryomov strlen(spec->pool_ns)); 4576b26c047bSIlya Dryomov } 4577c41d13a3SIlya Dryomov 457899d16943SIlya Dryomov mutex_init(&rbd_dev->watch_mutex); 457999d16943SIlya Dryomov rbd_dev->watch_state = RBD_WATCH_STATE_UNREGISTERED; 458099d16943SIlya Dryomov INIT_DELAYED_WORK(&rbd_dev->watch_dwork, rbd_reregister_watch); 458199d16943SIlya Dryomov 4582ed95b21aSIlya Dryomov init_rwsem(&rbd_dev->lock_rwsem); 4583ed95b21aSIlya Dryomov rbd_dev->lock_state = RBD_LOCK_STATE_UNLOCKED; 4584ed95b21aSIlya Dryomov INIT_WORK(&rbd_dev->acquired_lock_work, rbd_notify_acquired_lock); 4585ed95b21aSIlya Dryomov INIT_WORK(&rbd_dev->released_lock_work, rbd_notify_released_lock); 4586ed95b21aSIlya Dryomov INIT_DELAYED_WORK(&rbd_dev->lock_dwork, rbd_acquire_lock); 4587ed95b21aSIlya 
Dryomov INIT_WORK(&rbd_dev->unlock_work, rbd_release_lock_work); 4588ed95b21aSIlya Dryomov init_waitqueue_head(&rbd_dev->lock_waitq); 4589ed95b21aSIlya Dryomov 4590dd5ac32dSIlya Dryomov rbd_dev->dev.bus = &rbd_bus_type; 4591dd5ac32dSIlya Dryomov rbd_dev->dev.type = &rbd_device_type; 4592dd5ac32dSIlya Dryomov rbd_dev->dev.parent = &rbd_root_dev; 4593dd5ac32dSIlya Dryomov device_initialize(&rbd_dev->dev); 4594dd5ac32dSIlya Dryomov 4595c53d5893SAlex Elder rbd_dev->rbd_client = rbdc; 4596d147543dSIlya Dryomov rbd_dev->spec = spec; 45970903e875SAlex Elder 45981643dfa4SIlya Dryomov return rbd_dev; 45991643dfa4SIlya Dryomov } 46001643dfa4SIlya Dryomov 4601dd5ac32dSIlya Dryomov /* 46021643dfa4SIlya Dryomov * Create a mapping rbd_dev. 4603dd5ac32dSIlya Dryomov */ 46041643dfa4SIlya Dryomov static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc, 46051643dfa4SIlya Dryomov struct rbd_spec *spec, 46061643dfa4SIlya Dryomov struct rbd_options *opts) 46071643dfa4SIlya Dryomov { 46081643dfa4SIlya Dryomov struct rbd_device *rbd_dev; 46091643dfa4SIlya Dryomov 46101643dfa4SIlya Dryomov rbd_dev = __rbd_dev_create(rbdc, spec); 46111643dfa4SIlya Dryomov if (!rbd_dev) 46121643dfa4SIlya Dryomov return NULL; 46131643dfa4SIlya Dryomov 46141643dfa4SIlya Dryomov rbd_dev->opts = opts; 46151643dfa4SIlya Dryomov 46161643dfa4SIlya Dryomov /* get an id and fill in device name */ 46171643dfa4SIlya Dryomov rbd_dev->dev_id = ida_simple_get(&rbd_dev_id_ida, 0, 46181643dfa4SIlya Dryomov minor_to_rbd_dev_id(1 << MINORBITS), 46191643dfa4SIlya Dryomov GFP_KERNEL); 46201643dfa4SIlya Dryomov if (rbd_dev->dev_id < 0) 46211643dfa4SIlya Dryomov goto fail_rbd_dev; 46221643dfa4SIlya Dryomov 46231643dfa4SIlya Dryomov sprintf(rbd_dev->name, RBD_DRV_NAME "%d", rbd_dev->dev_id); 46241643dfa4SIlya Dryomov rbd_dev->task_wq = alloc_ordered_workqueue("%s-tasks", WQ_MEM_RECLAIM, 46251643dfa4SIlya Dryomov rbd_dev->name); 46261643dfa4SIlya Dryomov if (!rbd_dev->task_wq) 46271643dfa4SIlya Dryomov goto fail_dev_id; 
46281643dfa4SIlya Dryomov 46291643dfa4SIlya Dryomov /* we have a ref from do_rbd_add() */ 4630dd5ac32dSIlya Dryomov __module_get(THIS_MODULE); 4631dd5ac32dSIlya Dryomov 46321643dfa4SIlya Dryomov dout("%s rbd_dev %p dev_id %d\n", __func__, rbd_dev, rbd_dev->dev_id); 4633c53d5893SAlex Elder return rbd_dev; 46341643dfa4SIlya Dryomov 46351643dfa4SIlya Dryomov fail_dev_id: 46361643dfa4SIlya Dryomov ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id); 46371643dfa4SIlya Dryomov fail_rbd_dev: 46381643dfa4SIlya Dryomov rbd_dev_free(rbd_dev); 46391643dfa4SIlya Dryomov return NULL; 4640c53d5893SAlex Elder } 4641c53d5893SAlex Elder 4642c53d5893SAlex Elder static void rbd_dev_destroy(struct rbd_device *rbd_dev) 4643c53d5893SAlex Elder { 4644dd5ac32dSIlya Dryomov if (rbd_dev) 4645dd5ac32dSIlya Dryomov put_device(&rbd_dev->dev); 4646c53d5893SAlex Elder } 4647c53d5893SAlex Elder 4648dfc5606dSYehuda Sadeh /* 46499d475de5SAlex Elder * Get the size and object order for an image snapshot, or if 46509d475de5SAlex Elder * snap_id is CEPH_NOSNAP, gets this information for the base 46519d475de5SAlex Elder * image. 
46529d475de5SAlex Elder */ 46539d475de5SAlex Elder static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id, 46549d475de5SAlex Elder u8 *order, u64 *snap_size) 46559d475de5SAlex Elder { 46569d475de5SAlex Elder __le64 snapid = cpu_to_le64(snap_id); 46579d475de5SAlex Elder int ret; 46589d475de5SAlex Elder struct { 46599d475de5SAlex Elder u8 order; 46609d475de5SAlex Elder __le64 size; 46619d475de5SAlex Elder } __attribute__ ((packed)) size_buf = { 0 }; 46629d475de5SAlex Elder 4663ecd4a68aSIlya Dryomov ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid, 4664ecd4a68aSIlya Dryomov &rbd_dev->header_oloc, "get_size", 46654157976bSAlex Elder &snapid, sizeof(snapid), 4666e2a58ee5SAlex Elder &size_buf, sizeof(size_buf)); 466736be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 46689d475de5SAlex Elder if (ret < 0) 46699d475de5SAlex Elder return ret; 467057385b51SAlex Elder if (ret < sizeof (size_buf)) 467157385b51SAlex Elder return -ERANGE; 46729d475de5SAlex Elder 4673c3545579SJosh Durgin if (order) { 46749d475de5SAlex Elder *order = size_buf.order; 4675c3545579SJosh Durgin dout(" order %u", (unsigned int)*order); 4676c3545579SJosh Durgin } 46779d475de5SAlex Elder *snap_size = le64_to_cpu(size_buf.size); 46789d475de5SAlex Elder 4679c3545579SJosh Durgin dout(" snap_id 0x%016llx snap_size = %llu\n", 4680c3545579SJosh Durgin (unsigned long long)snap_id, 46819d475de5SAlex Elder (unsigned long long)*snap_size); 46829d475de5SAlex Elder 46839d475de5SAlex Elder return 0; 46849d475de5SAlex Elder } 46859d475de5SAlex Elder 46869d475de5SAlex Elder static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev) 46879d475de5SAlex Elder { 46889d475de5SAlex Elder return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP, 46899d475de5SAlex Elder &rbd_dev->header.obj_order, 46909d475de5SAlex Elder &rbd_dev->header.image_size); 46919d475de5SAlex Elder } 46929d475de5SAlex Elder 46931e130199SAlex Elder static int rbd_dev_v2_object_prefix(struct rbd_device 
*rbd_dev) 46941e130199SAlex Elder { 46951e130199SAlex Elder void *reply_buf; 46961e130199SAlex Elder int ret; 46971e130199SAlex Elder void *p; 46981e130199SAlex Elder 46991e130199SAlex Elder reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL); 47001e130199SAlex Elder if (!reply_buf) 47011e130199SAlex Elder return -ENOMEM; 47021e130199SAlex Elder 4703ecd4a68aSIlya Dryomov ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid, 4704ecd4a68aSIlya Dryomov &rbd_dev->header_oloc, "get_object_prefix", 4705ecd4a68aSIlya Dryomov NULL, 0, reply_buf, RBD_OBJ_PREFIX_LEN_MAX); 470636be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 47071e130199SAlex Elder if (ret < 0) 47081e130199SAlex Elder goto out; 47091e130199SAlex Elder 47101e130199SAlex Elder p = reply_buf; 47111e130199SAlex Elder rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p, 471257385b51SAlex Elder p + ret, NULL, GFP_NOIO); 471357385b51SAlex Elder ret = 0; 47141e130199SAlex Elder 47151e130199SAlex Elder if (IS_ERR(rbd_dev->header.object_prefix)) { 47161e130199SAlex Elder ret = PTR_ERR(rbd_dev->header.object_prefix); 47171e130199SAlex Elder rbd_dev->header.object_prefix = NULL; 47181e130199SAlex Elder } else { 47191e130199SAlex Elder dout(" object_prefix = %s\n", rbd_dev->header.object_prefix); 47201e130199SAlex Elder } 47211e130199SAlex Elder out: 47221e130199SAlex Elder kfree(reply_buf); 47231e130199SAlex Elder 47241e130199SAlex Elder return ret; 47251e130199SAlex Elder } 47261e130199SAlex Elder 4727b1b5402aSAlex Elder static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id, 4728b1b5402aSAlex Elder u64 *snap_features) 4729b1b5402aSAlex Elder { 4730b1b5402aSAlex Elder __le64 snapid = cpu_to_le64(snap_id); 4731b1b5402aSAlex Elder struct { 4732b1b5402aSAlex Elder __le64 features; 4733b1b5402aSAlex Elder __le64 incompat; 47344157976bSAlex Elder } __attribute__ ((packed)) features_buf = { 0 }; 4735d3767f0fSIlya Dryomov u64 unsup; 4736b1b5402aSAlex 
Elder int ret; 4737b1b5402aSAlex Elder 4738ecd4a68aSIlya Dryomov ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid, 4739ecd4a68aSIlya Dryomov &rbd_dev->header_oloc, "get_features", 47404157976bSAlex Elder &snapid, sizeof(snapid), 4741e2a58ee5SAlex Elder &features_buf, sizeof(features_buf)); 474236be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 4743b1b5402aSAlex Elder if (ret < 0) 4744b1b5402aSAlex Elder return ret; 474557385b51SAlex Elder if (ret < sizeof (features_buf)) 474657385b51SAlex Elder return -ERANGE; 4747d889140cSAlex Elder 4748d3767f0fSIlya Dryomov unsup = le64_to_cpu(features_buf.incompat) & ~RBD_FEATURES_SUPPORTED; 4749d3767f0fSIlya Dryomov if (unsup) { 4750d3767f0fSIlya Dryomov rbd_warn(rbd_dev, "image uses unsupported features: 0x%llx", 4751d3767f0fSIlya Dryomov unsup); 4752b8f5c6edSAlex Elder return -ENXIO; 4753d3767f0fSIlya Dryomov } 4754d889140cSAlex Elder 4755b1b5402aSAlex Elder *snap_features = le64_to_cpu(features_buf.features); 4756b1b5402aSAlex Elder 4757b1b5402aSAlex Elder dout(" snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n", 4758b1b5402aSAlex Elder (unsigned long long)snap_id, 4759b1b5402aSAlex Elder (unsigned long long)*snap_features, 4760b1b5402aSAlex Elder (unsigned long long)le64_to_cpu(features_buf.incompat)); 4761b1b5402aSAlex Elder 4762b1b5402aSAlex Elder return 0; 4763b1b5402aSAlex Elder } 4764b1b5402aSAlex Elder 4765b1b5402aSAlex Elder static int rbd_dev_v2_features(struct rbd_device *rbd_dev) 4766b1b5402aSAlex Elder { 4767b1b5402aSAlex Elder return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP, 4768b1b5402aSAlex Elder &rbd_dev->header.features); 4769b1b5402aSAlex Elder } 4770b1b5402aSAlex Elder 4771eb3b2d6bSIlya Dryomov struct parent_image_info { 4772eb3b2d6bSIlya Dryomov u64 pool_id; 4773e92c0eafSIlya Dryomov const char *pool_ns; 4774eb3b2d6bSIlya Dryomov const char *image_id; 4775eb3b2d6bSIlya Dryomov u64 snap_id; 4776eb3b2d6bSIlya Dryomov 4777e92c0eafSIlya Dryomov bool 
has_overlap; 4778eb3b2d6bSIlya Dryomov u64 overlap; 4779eb3b2d6bSIlya Dryomov }; 4780eb3b2d6bSIlya Dryomov 4781eb3b2d6bSIlya Dryomov /* 4782eb3b2d6bSIlya Dryomov * The caller is responsible for @pii. 4783eb3b2d6bSIlya Dryomov */ 4784e92c0eafSIlya Dryomov static int decode_parent_image_spec(void **p, void *end, 4785e92c0eafSIlya Dryomov struct parent_image_info *pii) 4786e92c0eafSIlya Dryomov { 4787e92c0eafSIlya Dryomov u8 struct_v; 4788e92c0eafSIlya Dryomov u32 struct_len; 4789e92c0eafSIlya Dryomov int ret; 4790e92c0eafSIlya Dryomov 4791e92c0eafSIlya Dryomov ret = ceph_start_decoding(p, end, 1, "ParentImageSpec", 4792e92c0eafSIlya Dryomov &struct_v, &struct_len); 4793e92c0eafSIlya Dryomov if (ret) 4794e92c0eafSIlya Dryomov return ret; 4795e92c0eafSIlya Dryomov 4796e92c0eafSIlya Dryomov ceph_decode_64_safe(p, end, pii->pool_id, e_inval); 4797e92c0eafSIlya Dryomov pii->pool_ns = ceph_extract_encoded_string(p, end, NULL, GFP_KERNEL); 4798e92c0eafSIlya Dryomov if (IS_ERR(pii->pool_ns)) { 4799e92c0eafSIlya Dryomov ret = PTR_ERR(pii->pool_ns); 4800e92c0eafSIlya Dryomov pii->pool_ns = NULL; 4801e92c0eafSIlya Dryomov return ret; 4802e92c0eafSIlya Dryomov } 4803e92c0eafSIlya Dryomov pii->image_id = ceph_extract_encoded_string(p, end, NULL, GFP_KERNEL); 4804e92c0eafSIlya Dryomov if (IS_ERR(pii->image_id)) { 4805e92c0eafSIlya Dryomov ret = PTR_ERR(pii->image_id); 4806e92c0eafSIlya Dryomov pii->image_id = NULL; 4807e92c0eafSIlya Dryomov return ret; 4808e92c0eafSIlya Dryomov } 4809e92c0eafSIlya Dryomov ceph_decode_64_safe(p, end, pii->snap_id, e_inval); 4810e92c0eafSIlya Dryomov return 0; 4811e92c0eafSIlya Dryomov 4812e92c0eafSIlya Dryomov e_inval: 4813e92c0eafSIlya Dryomov return -EINVAL; 4814e92c0eafSIlya Dryomov } 4815e92c0eafSIlya Dryomov 4816e92c0eafSIlya Dryomov static int __get_parent_info(struct rbd_device *rbd_dev, 4817e92c0eafSIlya Dryomov struct page *req_page, 4818e92c0eafSIlya Dryomov struct page *reply_page, 4819e92c0eafSIlya Dryomov struct parent_image_info *pii) 
4820e92c0eafSIlya Dryomov { 4821e92c0eafSIlya Dryomov struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 4822e92c0eafSIlya Dryomov size_t reply_len = PAGE_SIZE; 4823e92c0eafSIlya Dryomov void *p, *end; 4824e92c0eafSIlya Dryomov int ret; 4825e92c0eafSIlya Dryomov 4826e92c0eafSIlya Dryomov ret = ceph_osdc_call(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc, 4827e92c0eafSIlya Dryomov "rbd", "parent_get", CEPH_OSD_FLAG_READ, 4828e92c0eafSIlya Dryomov req_page, sizeof(u64), reply_page, &reply_len); 4829e92c0eafSIlya Dryomov if (ret) 4830e92c0eafSIlya Dryomov return ret == -EOPNOTSUPP ? 1 : ret; 4831e92c0eafSIlya Dryomov 4832e92c0eafSIlya Dryomov p = page_address(reply_page); 4833e92c0eafSIlya Dryomov end = p + reply_len; 4834e92c0eafSIlya Dryomov ret = decode_parent_image_spec(&p, end, pii); 4835e92c0eafSIlya Dryomov if (ret) 4836e92c0eafSIlya Dryomov return ret; 4837e92c0eafSIlya Dryomov 4838e92c0eafSIlya Dryomov ret = ceph_osdc_call(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc, 4839e92c0eafSIlya Dryomov "rbd", "parent_overlap_get", CEPH_OSD_FLAG_READ, 4840e92c0eafSIlya Dryomov req_page, sizeof(u64), reply_page, &reply_len); 4841e92c0eafSIlya Dryomov if (ret) 4842e92c0eafSIlya Dryomov return ret; 4843e92c0eafSIlya Dryomov 4844e92c0eafSIlya Dryomov p = page_address(reply_page); 4845e92c0eafSIlya Dryomov end = p + reply_len; 4846e92c0eafSIlya Dryomov ceph_decode_8_safe(&p, end, pii->has_overlap, e_inval); 4847e92c0eafSIlya Dryomov if (pii->has_overlap) 4848e92c0eafSIlya Dryomov ceph_decode_64_safe(&p, end, pii->overlap, e_inval); 4849e92c0eafSIlya Dryomov 4850e92c0eafSIlya Dryomov return 0; 4851e92c0eafSIlya Dryomov 4852e92c0eafSIlya Dryomov e_inval: 4853e92c0eafSIlya Dryomov return -EINVAL; 4854e92c0eafSIlya Dryomov } 4855e92c0eafSIlya Dryomov 4856e92c0eafSIlya Dryomov /* 4857e92c0eafSIlya Dryomov * The caller is responsible for @pii. 
4858e92c0eafSIlya Dryomov */ 4859eb3b2d6bSIlya Dryomov static int __get_parent_info_legacy(struct rbd_device *rbd_dev, 4860eb3b2d6bSIlya Dryomov struct page *req_page, 4861eb3b2d6bSIlya Dryomov struct page *reply_page, 4862eb3b2d6bSIlya Dryomov struct parent_image_info *pii) 4863eb3b2d6bSIlya Dryomov { 4864eb3b2d6bSIlya Dryomov struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 4865eb3b2d6bSIlya Dryomov size_t reply_len = PAGE_SIZE; 4866eb3b2d6bSIlya Dryomov void *p, *end; 4867eb3b2d6bSIlya Dryomov int ret; 4868eb3b2d6bSIlya Dryomov 4869eb3b2d6bSIlya Dryomov ret = ceph_osdc_call(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc, 4870eb3b2d6bSIlya Dryomov "rbd", "get_parent", CEPH_OSD_FLAG_READ, 4871eb3b2d6bSIlya Dryomov req_page, sizeof(u64), reply_page, &reply_len); 4872eb3b2d6bSIlya Dryomov if (ret) 4873eb3b2d6bSIlya Dryomov return ret; 4874eb3b2d6bSIlya Dryomov 4875eb3b2d6bSIlya Dryomov p = page_address(reply_page); 4876eb3b2d6bSIlya Dryomov end = p + reply_len; 4877eb3b2d6bSIlya Dryomov ceph_decode_64_safe(&p, end, pii->pool_id, e_inval); 4878eb3b2d6bSIlya Dryomov pii->image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL); 4879eb3b2d6bSIlya Dryomov if (IS_ERR(pii->image_id)) { 4880eb3b2d6bSIlya Dryomov ret = PTR_ERR(pii->image_id); 4881eb3b2d6bSIlya Dryomov pii->image_id = NULL; 4882eb3b2d6bSIlya Dryomov return ret; 4883eb3b2d6bSIlya Dryomov } 4884eb3b2d6bSIlya Dryomov ceph_decode_64_safe(&p, end, pii->snap_id, e_inval); 4885e92c0eafSIlya Dryomov pii->has_overlap = true; 4886eb3b2d6bSIlya Dryomov ceph_decode_64_safe(&p, end, pii->overlap, e_inval); 4887eb3b2d6bSIlya Dryomov 4888eb3b2d6bSIlya Dryomov return 0; 4889eb3b2d6bSIlya Dryomov 4890eb3b2d6bSIlya Dryomov e_inval: 4891eb3b2d6bSIlya Dryomov return -EINVAL; 4892eb3b2d6bSIlya Dryomov } 4893eb3b2d6bSIlya Dryomov 4894eb3b2d6bSIlya Dryomov static int get_parent_info(struct rbd_device *rbd_dev, 4895eb3b2d6bSIlya Dryomov struct parent_image_info *pii) 4896eb3b2d6bSIlya Dryomov { 
4897eb3b2d6bSIlya Dryomov struct page *req_page, *reply_page; 4898eb3b2d6bSIlya Dryomov void *p; 4899eb3b2d6bSIlya Dryomov int ret; 4900eb3b2d6bSIlya Dryomov 4901eb3b2d6bSIlya Dryomov req_page = alloc_page(GFP_KERNEL); 4902eb3b2d6bSIlya Dryomov if (!req_page) 4903eb3b2d6bSIlya Dryomov return -ENOMEM; 4904eb3b2d6bSIlya Dryomov 4905eb3b2d6bSIlya Dryomov reply_page = alloc_page(GFP_KERNEL); 4906eb3b2d6bSIlya Dryomov if (!reply_page) { 4907eb3b2d6bSIlya Dryomov __free_page(req_page); 4908eb3b2d6bSIlya Dryomov return -ENOMEM; 4909eb3b2d6bSIlya Dryomov } 4910eb3b2d6bSIlya Dryomov 4911eb3b2d6bSIlya Dryomov p = page_address(req_page); 4912eb3b2d6bSIlya Dryomov ceph_encode_64(&p, rbd_dev->spec->snap_id); 4913e92c0eafSIlya Dryomov ret = __get_parent_info(rbd_dev, req_page, reply_page, pii); 4914e92c0eafSIlya Dryomov if (ret > 0) 4915e92c0eafSIlya Dryomov ret = __get_parent_info_legacy(rbd_dev, req_page, reply_page, 4916e92c0eafSIlya Dryomov pii); 4917eb3b2d6bSIlya Dryomov 4918eb3b2d6bSIlya Dryomov __free_page(req_page); 4919eb3b2d6bSIlya Dryomov __free_page(reply_page); 4920eb3b2d6bSIlya Dryomov return ret; 4921eb3b2d6bSIlya Dryomov } 4922eb3b2d6bSIlya Dryomov 492386b00e0dSAlex Elder static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev) 492486b00e0dSAlex Elder { 492586b00e0dSAlex Elder struct rbd_spec *parent_spec; 4926eb3b2d6bSIlya Dryomov struct parent_image_info pii = { 0 }; 492786b00e0dSAlex Elder int ret; 492886b00e0dSAlex Elder 492986b00e0dSAlex Elder parent_spec = rbd_spec_alloc(); 493086b00e0dSAlex Elder if (!parent_spec) 493186b00e0dSAlex Elder return -ENOMEM; 493286b00e0dSAlex Elder 4933eb3b2d6bSIlya Dryomov ret = get_parent_info(rbd_dev, &pii); 4934eb3b2d6bSIlya Dryomov if (ret) 493586b00e0dSAlex Elder goto out_err; 493686b00e0dSAlex Elder 4937e92c0eafSIlya Dryomov dout("%s pool_id %llu pool_ns %s image_id %s snap_id %llu has_overlap %d overlap %llu\n", 4938e92c0eafSIlya Dryomov __func__, pii.pool_id, pii.pool_ns, pii.image_id, pii.snap_id, 
4939e92c0eafSIlya Dryomov pii.has_overlap, pii.overlap); 4940eb3b2d6bSIlya Dryomov 4941e92c0eafSIlya Dryomov if (pii.pool_id == CEPH_NOPOOL || !pii.has_overlap) { 4942392a9dadSAlex Elder /* 4943392a9dadSAlex Elder * Either the parent never existed, or we have 4944392a9dadSAlex Elder * record of it but the image got flattened so it no 4945392a9dadSAlex Elder * longer has a parent. When the parent of a 4946392a9dadSAlex Elder * layered image disappears we immediately set the 4947392a9dadSAlex Elder * overlap to 0. The effect of this is that all new 4948392a9dadSAlex Elder * requests will be treated as if the image had no 4949392a9dadSAlex Elder * parent. 4950e92c0eafSIlya Dryomov * 4951e92c0eafSIlya Dryomov * If !pii.has_overlap, the parent image spec is not 4952e92c0eafSIlya Dryomov * applicable. It's there to avoid duplication in each 4953e92c0eafSIlya Dryomov * snapshot record. 4954392a9dadSAlex Elder */ 4955392a9dadSAlex Elder if (rbd_dev->parent_overlap) { 4956392a9dadSAlex Elder rbd_dev->parent_overlap = 0; 4957392a9dadSAlex Elder rbd_dev_parent_put(rbd_dev); 4958392a9dadSAlex Elder pr_info("%s: clone image has been flattened\n", 4959392a9dadSAlex Elder rbd_dev->disk->disk_name); 4960392a9dadSAlex Elder } 4961392a9dadSAlex Elder 496286b00e0dSAlex Elder goto out; /* No parent? No problem. */ 4963392a9dadSAlex Elder } 496486b00e0dSAlex Elder 49650903e875SAlex Elder /* The ceph file layout needs to fit pool id in 32 bits */ 49660903e875SAlex Elder 49670903e875SAlex Elder ret = -EIO; 4968eb3b2d6bSIlya Dryomov if (pii.pool_id > (u64)U32_MAX) { 49699584d508SIlya Dryomov rbd_warn(NULL, "parent pool id too large (%llu > %u)", 4970eb3b2d6bSIlya Dryomov (unsigned long long)pii.pool_id, U32_MAX); 497157385b51SAlex Elder goto out_err; 4972c0cd10dbSAlex Elder } 49730903e875SAlex Elder 49743b5cf2a2SAlex Elder /* 49753b5cf2a2SAlex Elder * The parent won't change (except when the clone is 49763b5cf2a2SAlex Elder * flattened, already handled that). 
So we only need to 49773b5cf2a2SAlex Elder * record the parent spec we have not already done so. 49783b5cf2a2SAlex Elder */ 49793b5cf2a2SAlex Elder if (!rbd_dev->parent_spec) { 4980eb3b2d6bSIlya Dryomov parent_spec->pool_id = pii.pool_id; 4981e92c0eafSIlya Dryomov if (pii.pool_ns && *pii.pool_ns) { 4982e92c0eafSIlya Dryomov parent_spec->pool_ns = pii.pool_ns; 4983e92c0eafSIlya Dryomov pii.pool_ns = NULL; 4984e92c0eafSIlya Dryomov } 4985eb3b2d6bSIlya Dryomov parent_spec->image_id = pii.image_id; 4986eb3b2d6bSIlya Dryomov pii.image_id = NULL; 4987eb3b2d6bSIlya Dryomov parent_spec->snap_id = pii.snap_id; 4988b26c047bSIlya Dryomov 498986b00e0dSAlex Elder rbd_dev->parent_spec = parent_spec; 499086b00e0dSAlex Elder parent_spec = NULL; /* rbd_dev now owns this */ 49913b5cf2a2SAlex Elder } 49923b5cf2a2SAlex Elder 49933b5cf2a2SAlex Elder /* 4994cf32bd9cSIlya Dryomov * We always update the parent overlap. If it's zero we issue 4995cf32bd9cSIlya Dryomov * a warning, as we will proceed as if there was no parent. 
49963b5cf2a2SAlex Elder */ 4997eb3b2d6bSIlya Dryomov if (!pii.overlap) { 49983b5cf2a2SAlex Elder if (parent_spec) { 4999cf32bd9cSIlya Dryomov /* refresh, careful to warn just once */ 5000cf32bd9cSIlya Dryomov if (rbd_dev->parent_overlap) 5001cf32bd9cSIlya Dryomov rbd_warn(rbd_dev, 5002cf32bd9cSIlya Dryomov "clone now standalone (overlap became 0)"); 500370cf49cfSAlex Elder } else { 5004cf32bd9cSIlya Dryomov /* initial probe */ 5005cf32bd9cSIlya Dryomov rbd_warn(rbd_dev, "clone is standalone (overlap 0)"); 50063b5cf2a2SAlex Elder } 500770cf49cfSAlex Elder } 5008eb3b2d6bSIlya Dryomov rbd_dev->parent_overlap = pii.overlap; 5009cf32bd9cSIlya Dryomov 501086b00e0dSAlex Elder out: 501186b00e0dSAlex Elder ret = 0; 501286b00e0dSAlex Elder out_err: 5013e92c0eafSIlya Dryomov kfree(pii.pool_ns); 5014eb3b2d6bSIlya Dryomov kfree(pii.image_id); 501586b00e0dSAlex Elder rbd_spec_put(parent_spec); 501686b00e0dSAlex Elder return ret; 501786b00e0dSAlex Elder } 501886b00e0dSAlex Elder 5019cc070d59SAlex Elder static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev) 5020cc070d59SAlex Elder { 5021cc070d59SAlex Elder struct { 5022cc070d59SAlex Elder __le64 stripe_unit; 5023cc070d59SAlex Elder __le64 stripe_count; 5024cc070d59SAlex Elder } __attribute__ ((packed)) striping_info_buf = { 0 }; 5025cc070d59SAlex Elder size_t size = sizeof (striping_info_buf); 5026cc070d59SAlex Elder void *p; 5027cc070d59SAlex Elder int ret; 5028cc070d59SAlex Elder 5029ecd4a68aSIlya Dryomov ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid, 5030ecd4a68aSIlya Dryomov &rbd_dev->header_oloc, "get_stripe_unit_count", 5031ecd4a68aSIlya Dryomov NULL, 0, &striping_info_buf, size); 5032cc070d59SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 5033cc070d59SAlex Elder if (ret < 0) 5034cc070d59SAlex Elder return ret; 5035cc070d59SAlex Elder if (ret < size) 5036cc070d59SAlex Elder return -ERANGE; 5037cc070d59SAlex Elder 5038cc070d59SAlex Elder p = &striping_info_buf; 5039b1331852SIlya 
Dryomov rbd_dev->header.stripe_unit = ceph_decode_64(&p); 5040b1331852SIlya Dryomov rbd_dev->header.stripe_count = ceph_decode_64(&p); 5041cc070d59SAlex Elder return 0; 5042cc070d59SAlex Elder } 5043cc070d59SAlex Elder 50447e97332eSIlya Dryomov static int rbd_dev_v2_data_pool(struct rbd_device *rbd_dev) 50457e97332eSIlya Dryomov { 50467e97332eSIlya Dryomov __le64 data_pool_id; 50477e97332eSIlya Dryomov int ret; 50487e97332eSIlya Dryomov 50497e97332eSIlya Dryomov ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid, 50507e97332eSIlya Dryomov &rbd_dev->header_oloc, "get_data_pool", 50517e97332eSIlya Dryomov NULL, 0, &data_pool_id, sizeof(data_pool_id)); 50527e97332eSIlya Dryomov if (ret < 0) 50537e97332eSIlya Dryomov return ret; 50547e97332eSIlya Dryomov if (ret < sizeof(data_pool_id)) 50557e97332eSIlya Dryomov return -EBADMSG; 50567e97332eSIlya Dryomov 50577e97332eSIlya Dryomov rbd_dev->header.data_pool_id = le64_to_cpu(data_pool_id); 50587e97332eSIlya Dryomov WARN_ON(rbd_dev->header.data_pool_id == CEPH_NOPOOL); 50597e97332eSIlya Dryomov return 0; 50607e97332eSIlya Dryomov } 50617e97332eSIlya Dryomov 50629e15b77dSAlex Elder static char *rbd_dev_image_name(struct rbd_device *rbd_dev) 50639e15b77dSAlex Elder { 5064ecd4a68aSIlya Dryomov CEPH_DEFINE_OID_ONSTACK(oid); 50659e15b77dSAlex Elder size_t image_id_size; 50669e15b77dSAlex Elder char *image_id; 50679e15b77dSAlex Elder void *p; 50689e15b77dSAlex Elder void *end; 50699e15b77dSAlex Elder size_t size; 50709e15b77dSAlex Elder void *reply_buf = NULL; 50719e15b77dSAlex Elder size_t len = 0; 50729e15b77dSAlex Elder char *image_name = NULL; 50739e15b77dSAlex Elder int ret; 50749e15b77dSAlex Elder 50759e15b77dSAlex Elder rbd_assert(!rbd_dev->spec->image_name); 50769e15b77dSAlex Elder 507769e7a02fSAlex Elder len = strlen(rbd_dev->spec->image_id); 507869e7a02fSAlex Elder image_id_size = sizeof (__le32) + len; 50799e15b77dSAlex Elder image_id = kmalloc(image_id_size, GFP_KERNEL); 50809e15b77dSAlex Elder if (!image_id) 
50819e15b77dSAlex Elder return NULL; 50829e15b77dSAlex Elder 50839e15b77dSAlex Elder p = image_id; 50844157976bSAlex Elder end = image_id + image_id_size; 508569e7a02fSAlex Elder ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32)len); 50869e15b77dSAlex Elder 50879e15b77dSAlex Elder size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX; 50889e15b77dSAlex Elder reply_buf = kmalloc(size, GFP_KERNEL); 50899e15b77dSAlex Elder if (!reply_buf) 50909e15b77dSAlex Elder goto out; 50919e15b77dSAlex Elder 5092ecd4a68aSIlya Dryomov ceph_oid_printf(&oid, "%s", RBD_DIRECTORY); 5093ecd4a68aSIlya Dryomov ret = rbd_obj_method_sync(rbd_dev, &oid, &rbd_dev->header_oloc, 5094ecd4a68aSIlya Dryomov "dir_get_name", image_id, image_id_size, 5095e2a58ee5SAlex Elder reply_buf, size); 50969e15b77dSAlex Elder if (ret < 0) 50979e15b77dSAlex Elder goto out; 50989e15b77dSAlex Elder p = reply_buf; 5099f40eb349SAlex Elder end = reply_buf + ret; 5100f40eb349SAlex Elder 51019e15b77dSAlex Elder image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL); 51029e15b77dSAlex Elder if (IS_ERR(image_name)) 51039e15b77dSAlex Elder image_name = NULL; 51049e15b77dSAlex Elder else 51059e15b77dSAlex Elder dout("%s: name is %s len is %zd\n", __func__, image_name, len); 51069e15b77dSAlex Elder out: 51079e15b77dSAlex Elder kfree(reply_buf); 51089e15b77dSAlex Elder kfree(image_id); 51099e15b77dSAlex Elder 51109e15b77dSAlex Elder return image_name; 51119e15b77dSAlex Elder } 51129e15b77dSAlex Elder 51132ad3d716SAlex Elder static u64 rbd_v1_snap_id_by_name(struct rbd_device *rbd_dev, const char *name) 51142ad3d716SAlex Elder { 51152ad3d716SAlex Elder struct ceph_snap_context *snapc = rbd_dev->header.snapc; 51162ad3d716SAlex Elder const char *snap_name; 51172ad3d716SAlex Elder u32 which = 0; 51182ad3d716SAlex Elder 51192ad3d716SAlex Elder /* Skip over names until we find the one we are looking for */ 51202ad3d716SAlex Elder 51212ad3d716SAlex Elder snap_name = rbd_dev->header.snap_names; 51222ad3d716SAlex 
Elder while (which < snapc->num_snaps) { 51232ad3d716SAlex Elder if (!strcmp(name, snap_name)) 51242ad3d716SAlex Elder return snapc->snaps[which]; 51252ad3d716SAlex Elder snap_name += strlen(snap_name) + 1; 51262ad3d716SAlex Elder which++; 51272ad3d716SAlex Elder } 51282ad3d716SAlex Elder return CEPH_NOSNAP; 51292ad3d716SAlex Elder } 51302ad3d716SAlex Elder 51312ad3d716SAlex Elder static u64 rbd_v2_snap_id_by_name(struct rbd_device *rbd_dev, const char *name) 51322ad3d716SAlex Elder { 51332ad3d716SAlex Elder struct ceph_snap_context *snapc = rbd_dev->header.snapc; 51342ad3d716SAlex Elder u32 which; 51352ad3d716SAlex Elder bool found = false; 51362ad3d716SAlex Elder u64 snap_id; 51372ad3d716SAlex Elder 51382ad3d716SAlex Elder for (which = 0; !found && which < snapc->num_snaps; which++) { 51392ad3d716SAlex Elder const char *snap_name; 51402ad3d716SAlex Elder 51412ad3d716SAlex Elder snap_id = snapc->snaps[which]; 51422ad3d716SAlex Elder snap_name = rbd_dev_v2_snap_name(rbd_dev, snap_id); 5143efadc98aSJosh Durgin if (IS_ERR(snap_name)) { 5144efadc98aSJosh Durgin /* ignore no-longer existing snapshots */ 5145efadc98aSJosh Durgin if (PTR_ERR(snap_name) == -ENOENT) 5146efadc98aSJosh Durgin continue; 5147efadc98aSJosh Durgin else 51482ad3d716SAlex Elder break; 5149efadc98aSJosh Durgin } 51502ad3d716SAlex Elder found = !strcmp(name, snap_name); 51512ad3d716SAlex Elder kfree(snap_name); 51522ad3d716SAlex Elder } 51532ad3d716SAlex Elder return found ? snap_id : CEPH_NOSNAP; 51542ad3d716SAlex Elder } 51552ad3d716SAlex Elder 51562ad3d716SAlex Elder /* 51572ad3d716SAlex Elder * Assumes name is never RBD_SNAP_HEAD_NAME; returns CEPH_NOSNAP if 51582ad3d716SAlex Elder * no snapshot by that name is found, or if an error occurs. 
51592ad3d716SAlex Elder */ 51602ad3d716SAlex Elder static u64 rbd_snap_id_by_name(struct rbd_device *rbd_dev, const char *name) 51612ad3d716SAlex Elder { 51622ad3d716SAlex Elder if (rbd_dev->image_format == 1) 51632ad3d716SAlex Elder return rbd_v1_snap_id_by_name(rbd_dev, name); 51642ad3d716SAlex Elder 51652ad3d716SAlex Elder return rbd_v2_snap_id_by_name(rbd_dev, name); 51662ad3d716SAlex Elder } 51672ad3d716SAlex Elder 51689e15b77dSAlex Elder /* 516904077599SIlya Dryomov * An image being mapped will have everything but the snap id. 51709e15b77dSAlex Elder */ 517104077599SIlya Dryomov static int rbd_spec_fill_snap_id(struct rbd_device *rbd_dev) 517204077599SIlya Dryomov { 517304077599SIlya Dryomov struct rbd_spec *spec = rbd_dev->spec; 517404077599SIlya Dryomov 517504077599SIlya Dryomov rbd_assert(spec->pool_id != CEPH_NOPOOL && spec->pool_name); 517604077599SIlya Dryomov rbd_assert(spec->image_id && spec->image_name); 517704077599SIlya Dryomov rbd_assert(spec->snap_name); 517804077599SIlya Dryomov 517904077599SIlya Dryomov if (strcmp(spec->snap_name, RBD_SNAP_HEAD_NAME)) { 518004077599SIlya Dryomov u64 snap_id; 518104077599SIlya Dryomov 518204077599SIlya Dryomov snap_id = rbd_snap_id_by_name(rbd_dev, spec->snap_name); 518304077599SIlya Dryomov if (snap_id == CEPH_NOSNAP) 518404077599SIlya Dryomov return -ENOENT; 518504077599SIlya Dryomov 518604077599SIlya Dryomov spec->snap_id = snap_id; 518704077599SIlya Dryomov } else { 518804077599SIlya Dryomov spec->snap_id = CEPH_NOSNAP; 518904077599SIlya Dryomov } 519004077599SIlya Dryomov 519104077599SIlya Dryomov return 0; 519204077599SIlya Dryomov } 519304077599SIlya Dryomov 519404077599SIlya Dryomov /* 519504077599SIlya Dryomov * A parent image will have all ids but none of the names. 519604077599SIlya Dryomov * 519704077599SIlya Dryomov * All names in an rbd spec are dynamically allocated. It's OK if we 519804077599SIlya Dryomov * can't figure out the name for an image id. 
519904077599SIlya Dryomov */ 520004077599SIlya Dryomov static int rbd_spec_fill_names(struct rbd_device *rbd_dev) 52019e15b77dSAlex Elder { 52022e9f7f1cSAlex Elder struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 52032e9f7f1cSAlex Elder struct rbd_spec *spec = rbd_dev->spec; 52042e9f7f1cSAlex Elder const char *pool_name; 52052e9f7f1cSAlex Elder const char *image_name; 52062e9f7f1cSAlex Elder const char *snap_name; 52079e15b77dSAlex Elder int ret; 52089e15b77dSAlex Elder 520904077599SIlya Dryomov rbd_assert(spec->pool_id != CEPH_NOPOOL); 521004077599SIlya Dryomov rbd_assert(spec->image_id); 521104077599SIlya Dryomov rbd_assert(spec->snap_id != CEPH_NOSNAP); 52129e15b77dSAlex Elder 52132e9f7f1cSAlex Elder /* Get the pool name; we have to make our own copy of this */ 52149e15b77dSAlex Elder 52152e9f7f1cSAlex Elder pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, spec->pool_id); 52162e9f7f1cSAlex Elder if (!pool_name) { 52172e9f7f1cSAlex Elder rbd_warn(rbd_dev, "no pool with id %llu", spec->pool_id); 5218935dc89fSAlex Elder return -EIO; 5219935dc89fSAlex Elder } 52202e9f7f1cSAlex Elder pool_name = kstrdup(pool_name, GFP_KERNEL); 52212e9f7f1cSAlex Elder if (!pool_name) 52229e15b77dSAlex Elder return -ENOMEM; 52239e15b77dSAlex Elder 52249e15b77dSAlex Elder /* Fetch the image name; tolerate failure here */ 52259e15b77dSAlex Elder 52262e9f7f1cSAlex Elder image_name = rbd_dev_image_name(rbd_dev); 52272e9f7f1cSAlex Elder if (!image_name) 522806ecc6cbSAlex Elder rbd_warn(rbd_dev, "unable to get image name"); 52299e15b77dSAlex Elder 523004077599SIlya Dryomov /* Fetch the snapshot name */ 52319e15b77dSAlex Elder 52322e9f7f1cSAlex Elder snap_name = rbd_snap_name(rbd_dev, spec->snap_id); 5233da6a6b63SJosh Durgin if (IS_ERR(snap_name)) { 5234da6a6b63SJosh Durgin ret = PTR_ERR(snap_name); 52359e15b77dSAlex Elder goto out_err; 52362e9f7f1cSAlex Elder } 52372e9f7f1cSAlex Elder 52382e9f7f1cSAlex Elder spec->pool_name = pool_name; 52392e9f7f1cSAlex Elder 
spec->image_name = image_name; 52402e9f7f1cSAlex Elder spec->snap_name = snap_name; 52419e15b77dSAlex Elder 52429e15b77dSAlex Elder return 0; 524304077599SIlya Dryomov 52449e15b77dSAlex Elder out_err: 52452e9f7f1cSAlex Elder kfree(image_name); 52462e9f7f1cSAlex Elder kfree(pool_name); 52479e15b77dSAlex Elder return ret; 52489e15b77dSAlex Elder } 52499e15b77dSAlex Elder 5250cc4a38bdSAlex Elder static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev) 525135d489f9SAlex Elder { 525235d489f9SAlex Elder size_t size; 525335d489f9SAlex Elder int ret; 525435d489f9SAlex Elder void *reply_buf; 525535d489f9SAlex Elder void *p; 525635d489f9SAlex Elder void *end; 525735d489f9SAlex Elder u64 seq; 525835d489f9SAlex Elder u32 snap_count; 525935d489f9SAlex Elder struct ceph_snap_context *snapc; 526035d489f9SAlex Elder u32 i; 526135d489f9SAlex Elder 526235d489f9SAlex Elder /* 526335d489f9SAlex Elder * We'll need room for the seq value (maximum snapshot id), 526435d489f9SAlex Elder * snapshot count, and array of that many snapshot ids. 526535d489f9SAlex Elder * For now we have a fixed upper limit on the number we're 526635d489f9SAlex Elder * prepared to receive. 
526735d489f9SAlex Elder */ 526835d489f9SAlex Elder size = sizeof (__le64) + sizeof (__le32) + 526935d489f9SAlex Elder RBD_MAX_SNAP_COUNT * sizeof (__le64); 527035d489f9SAlex Elder reply_buf = kzalloc(size, GFP_KERNEL); 527135d489f9SAlex Elder if (!reply_buf) 527235d489f9SAlex Elder return -ENOMEM; 527335d489f9SAlex Elder 5274ecd4a68aSIlya Dryomov ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid, 5275ecd4a68aSIlya Dryomov &rbd_dev->header_oloc, "get_snapcontext", 5276ecd4a68aSIlya Dryomov NULL, 0, reply_buf, size); 527736be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 527835d489f9SAlex Elder if (ret < 0) 527935d489f9SAlex Elder goto out; 528035d489f9SAlex Elder 528135d489f9SAlex Elder p = reply_buf; 528257385b51SAlex Elder end = reply_buf + ret; 528357385b51SAlex Elder ret = -ERANGE; 528435d489f9SAlex Elder ceph_decode_64_safe(&p, end, seq, out); 528535d489f9SAlex Elder ceph_decode_32_safe(&p, end, snap_count, out); 528635d489f9SAlex Elder 528735d489f9SAlex Elder /* 528835d489f9SAlex Elder * Make sure the reported number of snapshot ids wouldn't go 528935d489f9SAlex Elder * beyond the end of our buffer. But before checking that, 529035d489f9SAlex Elder * make sure the computed size of the snapshot context we 529135d489f9SAlex Elder * allocate is representable in a size_t. 
529235d489f9SAlex Elder */ 529335d489f9SAlex Elder if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context)) 529435d489f9SAlex Elder / sizeof (u64)) { 529535d489f9SAlex Elder ret = -EINVAL; 529635d489f9SAlex Elder goto out; 529735d489f9SAlex Elder } 529835d489f9SAlex Elder if (!ceph_has_room(&p, end, snap_count * sizeof (__le64))) 529935d489f9SAlex Elder goto out; 5300468521c1SAlex Elder ret = 0; 530135d489f9SAlex Elder 5302812164f8SAlex Elder snapc = ceph_create_snap_context(snap_count, GFP_KERNEL); 530335d489f9SAlex Elder if (!snapc) { 530435d489f9SAlex Elder ret = -ENOMEM; 530535d489f9SAlex Elder goto out; 530635d489f9SAlex Elder } 530735d489f9SAlex Elder snapc->seq = seq; 530835d489f9SAlex Elder for (i = 0; i < snap_count; i++) 530935d489f9SAlex Elder snapc->snaps[i] = ceph_decode_64(&p); 531035d489f9SAlex Elder 531149ece554SAlex Elder ceph_put_snap_context(rbd_dev->header.snapc); 531235d489f9SAlex Elder rbd_dev->header.snapc = snapc; 531335d489f9SAlex Elder 531435d489f9SAlex Elder dout(" snap context seq = %llu, snap_count = %u\n", 531535d489f9SAlex Elder (unsigned long long)seq, (unsigned int)snap_count); 531635d489f9SAlex Elder out: 531735d489f9SAlex Elder kfree(reply_buf); 531835d489f9SAlex Elder 531957385b51SAlex Elder return ret; 532035d489f9SAlex Elder } 532135d489f9SAlex Elder 532254cac61fSAlex Elder static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, 532354cac61fSAlex Elder u64 snap_id) 5324b8b1e2dbSAlex Elder { 5325b8b1e2dbSAlex Elder size_t size; 5326b8b1e2dbSAlex Elder void *reply_buf; 532754cac61fSAlex Elder __le64 snapid; 5328b8b1e2dbSAlex Elder int ret; 5329b8b1e2dbSAlex Elder void *p; 5330b8b1e2dbSAlex Elder void *end; 5331b8b1e2dbSAlex Elder char *snap_name; 5332b8b1e2dbSAlex Elder 5333b8b1e2dbSAlex Elder size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN; 5334b8b1e2dbSAlex Elder reply_buf = kmalloc(size, GFP_KERNEL); 5335b8b1e2dbSAlex Elder if (!reply_buf) 5336b8b1e2dbSAlex Elder return ERR_PTR(-ENOMEM); 5337b8b1e2dbSAlex 
Elder 533854cac61fSAlex Elder snapid = cpu_to_le64(snap_id); 5339ecd4a68aSIlya Dryomov ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid, 5340ecd4a68aSIlya Dryomov &rbd_dev->header_oloc, "get_snapshot_name", 5341ecd4a68aSIlya Dryomov &snapid, sizeof(snapid), reply_buf, size); 534236be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 5343f40eb349SAlex Elder if (ret < 0) { 5344f40eb349SAlex Elder snap_name = ERR_PTR(ret); 5345b8b1e2dbSAlex Elder goto out; 5346f40eb349SAlex Elder } 5347b8b1e2dbSAlex Elder 5348b8b1e2dbSAlex Elder p = reply_buf; 5349f40eb349SAlex Elder end = reply_buf + ret; 5350e5c35534SAlex Elder snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL); 5351f40eb349SAlex Elder if (IS_ERR(snap_name)) 5352b8b1e2dbSAlex Elder goto out; 5353f40eb349SAlex Elder 5354b8b1e2dbSAlex Elder dout(" snap_id 0x%016llx snap_name = %s\n", 535554cac61fSAlex Elder (unsigned long long)snap_id, snap_name); 5356b8b1e2dbSAlex Elder out: 5357b8b1e2dbSAlex Elder kfree(reply_buf); 5358b8b1e2dbSAlex Elder 5359f40eb349SAlex Elder return snap_name; 5360b8b1e2dbSAlex Elder } 5361b8b1e2dbSAlex Elder 53622df3fac7SAlex Elder static int rbd_dev_v2_header_info(struct rbd_device *rbd_dev) 5363117973fbSAlex Elder { 53642df3fac7SAlex Elder bool first_time = rbd_dev->header.object_prefix == NULL; 5365117973fbSAlex Elder int ret; 5366117973fbSAlex Elder 53671617e40cSJosh Durgin ret = rbd_dev_v2_image_size(rbd_dev); 53681617e40cSJosh Durgin if (ret) 5369cfbf6377SAlex Elder return ret; 53701617e40cSJosh Durgin 53712df3fac7SAlex Elder if (first_time) { 53722df3fac7SAlex Elder ret = rbd_dev_v2_header_onetime(rbd_dev); 53732df3fac7SAlex Elder if (ret) 5374cfbf6377SAlex Elder return ret; 53752df3fac7SAlex Elder } 53762df3fac7SAlex Elder 5377cc4a38bdSAlex Elder ret = rbd_dev_v2_snap_context(rbd_dev); 5378d194cd1dSIlya Dryomov if (ret && first_time) { 5379d194cd1dSIlya Dryomov kfree(rbd_dev->header.object_prefix); 5380d194cd1dSIlya Dryomov 
rbd_dev->header.object_prefix = NULL; 5381d194cd1dSIlya Dryomov } 5382117973fbSAlex Elder 5383117973fbSAlex Elder return ret; 5384117973fbSAlex Elder } 5385117973fbSAlex Elder 5386a720ae09SIlya Dryomov static int rbd_dev_header_info(struct rbd_device *rbd_dev) 5387a720ae09SIlya Dryomov { 5388a720ae09SIlya Dryomov rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); 5389a720ae09SIlya Dryomov 5390a720ae09SIlya Dryomov if (rbd_dev->image_format == 1) 5391a720ae09SIlya Dryomov return rbd_dev_v1_header_info(rbd_dev); 5392a720ae09SIlya Dryomov 5393a720ae09SIlya Dryomov return rbd_dev_v2_header_info(rbd_dev); 5394a720ae09SIlya Dryomov } 5395a720ae09SIlya Dryomov 53961ddbe94eSAlex Elder /* 5397e28fff26SAlex Elder * Skips over white space at *buf, and updates *buf to point to the 5398e28fff26SAlex Elder * first found non-space character (if any). Returns the length of 5399593a9e7bSAlex Elder * the token (string of non-white space characters) found. Note 5400593a9e7bSAlex Elder * that *buf must be terminated with '\0'. 5401e28fff26SAlex Elder */ 5402e28fff26SAlex Elder static inline size_t next_token(const char **buf) 5403e28fff26SAlex Elder { 5404e28fff26SAlex Elder /* 5405e28fff26SAlex Elder * These are the characters that produce nonzero for 5406e28fff26SAlex Elder * isspace() in the "C" and "POSIX" locales. 5407e28fff26SAlex Elder */ 5408e28fff26SAlex Elder const char *spaces = " \f\n\r\t\v"; 5409e28fff26SAlex Elder 5410e28fff26SAlex Elder *buf += strspn(*buf, spaces); /* Find start of token */ 5411e28fff26SAlex Elder 5412e28fff26SAlex Elder return strcspn(*buf, spaces); /* Return token length */ 5413e28fff26SAlex Elder } 5414e28fff26SAlex Elder 5415e28fff26SAlex Elder /* 5416ea3352f4SAlex Elder * Finds the next token in *buf, dynamically allocates a buffer big 5417ea3352f4SAlex Elder * enough to hold a copy of it, and copies the token into the new 5418ea3352f4SAlex Elder * buffer. The copy is guaranteed to be terminated with '\0'. 
Note 5419ea3352f4SAlex Elder * that a duplicate buffer is created even for a zero-length token. 5420ea3352f4SAlex Elder * 5421ea3352f4SAlex Elder * Returns a pointer to the newly-allocated duplicate, or a null 5422ea3352f4SAlex Elder * pointer if memory for the duplicate was not available. If 5423ea3352f4SAlex Elder * the lenp argument is a non-null pointer, the length of the token 5424ea3352f4SAlex Elder * (not including the '\0') is returned in *lenp. 5425ea3352f4SAlex Elder * 5426ea3352f4SAlex Elder * If successful, the *buf pointer will be updated to point beyond 5427ea3352f4SAlex Elder * the end of the found token. 5428ea3352f4SAlex Elder * 5429ea3352f4SAlex Elder * Note: uses GFP_KERNEL for allocation. 5430ea3352f4SAlex Elder */ 5431ea3352f4SAlex Elder static inline char *dup_token(const char **buf, size_t *lenp) 5432ea3352f4SAlex Elder { 5433ea3352f4SAlex Elder char *dup; 5434ea3352f4SAlex Elder size_t len; 5435ea3352f4SAlex Elder 5436ea3352f4SAlex Elder len = next_token(buf); 54374caf35f9SAlex Elder dup = kmemdup(*buf, len + 1, GFP_KERNEL); 5438ea3352f4SAlex Elder if (!dup) 5439ea3352f4SAlex Elder return NULL; 5440ea3352f4SAlex Elder *(dup + len) = '\0'; 5441ea3352f4SAlex Elder *buf += len; 5442ea3352f4SAlex Elder 5443ea3352f4SAlex Elder if (lenp) 5444ea3352f4SAlex Elder *lenp = len; 5445ea3352f4SAlex Elder 5446ea3352f4SAlex Elder return dup; 5447ea3352f4SAlex Elder } 5448ea3352f4SAlex Elder 5449ea3352f4SAlex Elder /* 5450859c31dfSAlex Elder * Parse the options provided for an "rbd add" (i.e., rbd image 5451859c31dfSAlex Elder * mapping) request. These arrive via a write to /sys/bus/rbd/add, 5452859c31dfSAlex Elder * and the data written is passed here via a NUL-terminated buffer. 5453859c31dfSAlex Elder * Returns 0 if successful or an error code otherwise. 
5454d22f76e7SAlex Elder * 5455859c31dfSAlex Elder * The information extracted from these options is recorded in 5456859c31dfSAlex Elder * the other parameters which return dynamically-allocated 5457859c31dfSAlex Elder * structures: 5458859c31dfSAlex Elder * ceph_opts 5459859c31dfSAlex Elder * The address of a pointer that will refer to a ceph options 5460859c31dfSAlex Elder * structure. Caller must release the returned pointer using 5461859c31dfSAlex Elder * ceph_destroy_options() when it is no longer needed. 5462859c31dfSAlex Elder * rbd_opts 5463859c31dfSAlex Elder * Address of an rbd options pointer. Fully initialized by 5464859c31dfSAlex Elder * this function; caller must release with kfree(). 5465859c31dfSAlex Elder * spec 5466859c31dfSAlex Elder * Address of an rbd image specification pointer. Fully 5467859c31dfSAlex Elder * initialized by this function based on parsed options. 5468859c31dfSAlex Elder * Caller must release with rbd_spec_put(). 5469859c31dfSAlex Elder * 5470859c31dfSAlex Elder * The options passed take this form: 5471859c31dfSAlex Elder * <mon_addrs> <options> <pool_name> <image_name> [<snap_id>] 5472859c31dfSAlex Elder * where: 5473859c31dfSAlex Elder * <mon_addrs> 5474859c31dfSAlex Elder * A comma-separated list of one or more monitor addresses. 5475859c31dfSAlex Elder * A monitor address is an ip address, optionally followed 5476859c31dfSAlex Elder * by a port number (separated by a colon). 5477859c31dfSAlex Elder * I.e.: ip1[:port1][,ip2[:port2]...] 5478859c31dfSAlex Elder * <options> 5479859c31dfSAlex Elder * A comma-separated list of ceph and/or rbd options. 5480859c31dfSAlex Elder * <pool_name> 5481859c31dfSAlex Elder * The name of the rados pool containing the rbd image. 5482859c31dfSAlex Elder * <image_name> 5483859c31dfSAlex Elder * The name of the image in that pool to map. 5484859c31dfSAlex Elder * <snap_id> 5485859c31dfSAlex Elder * An optional snapshot id. 
If provided, the mapping will 5486859c31dfSAlex Elder * present data from the image at the time that snapshot was 5487859c31dfSAlex Elder * created. The image head is used if no snapshot id is 5488859c31dfSAlex Elder * provided. Snapshot mappings are always read-only. 5489a725f65eSAlex Elder */ 5490859c31dfSAlex Elder static int rbd_add_parse_args(const char *buf, 5491dc79b113SAlex Elder struct ceph_options **ceph_opts, 5492859c31dfSAlex Elder struct rbd_options **opts, 5493859c31dfSAlex Elder struct rbd_spec **rbd_spec) 5494a725f65eSAlex Elder { 5495e28fff26SAlex Elder size_t len; 5496859c31dfSAlex Elder char *options; 54970ddebc0cSAlex Elder const char *mon_addrs; 5498ecb4dc22SAlex Elder char *snap_name; 54990ddebc0cSAlex Elder size_t mon_addrs_size; 5500c300156bSIlya Dryomov struct parse_rbd_opts_ctx pctx = { 0 }; 5501859c31dfSAlex Elder struct ceph_options *copts; 5502dc79b113SAlex Elder int ret; 5503e28fff26SAlex Elder 5504e28fff26SAlex Elder /* The first four tokens are required */ 5505e28fff26SAlex Elder 55067ef3214aSAlex Elder len = next_token(&buf); 55074fb5d671SAlex Elder if (!len) { 55084fb5d671SAlex Elder rbd_warn(NULL, "no monitor address(es) provided"); 55094fb5d671SAlex Elder return -EINVAL; 55104fb5d671SAlex Elder } 55110ddebc0cSAlex Elder mon_addrs = buf; 5512f28e565aSAlex Elder mon_addrs_size = len + 1; 55137ef3214aSAlex Elder buf += len; 5514a725f65eSAlex Elder 5515dc79b113SAlex Elder ret = -EINVAL; 5516f28e565aSAlex Elder options = dup_token(&buf, NULL); 5517f28e565aSAlex Elder if (!options) 5518dc79b113SAlex Elder return -ENOMEM; 55194fb5d671SAlex Elder if (!*options) { 55204fb5d671SAlex Elder rbd_warn(NULL, "no options provided"); 55214fb5d671SAlex Elder goto out_err; 55224fb5d671SAlex Elder } 5523a725f65eSAlex Elder 5524c300156bSIlya Dryomov pctx.spec = rbd_spec_alloc(); 5525c300156bSIlya Dryomov if (!pctx.spec) 5526f28e565aSAlex Elder goto out_mem; 5527859c31dfSAlex Elder 5528c300156bSIlya Dryomov pctx.spec->pool_name = dup_token(&buf, 
NULL); 5529c300156bSIlya Dryomov if (!pctx.spec->pool_name) 5530859c31dfSAlex Elder goto out_mem; 5531c300156bSIlya Dryomov if (!*pctx.spec->pool_name) { 55324fb5d671SAlex Elder rbd_warn(NULL, "no pool name provided"); 55334fb5d671SAlex Elder goto out_err; 55344fb5d671SAlex Elder } 5535e28fff26SAlex Elder 5536c300156bSIlya Dryomov pctx.spec->image_name = dup_token(&buf, NULL); 5537c300156bSIlya Dryomov if (!pctx.spec->image_name) 5538f28e565aSAlex Elder goto out_mem; 5539c300156bSIlya Dryomov if (!*pctx.spec->image_name) { 55404fb5d671SAlex Elder rbd_warn(NULL, "no image name provided"); 55414fb5d671SAlex Elder goto out_err; 55424fb5d671SAlex Elder } 5543e28fff26SAlex Elder 5544f28e565aSAlex Elder /* 5545f28e565aSAlex Elder * Snapshot name is optional; default is to use "-" 5546f28e565aSAlex Elder * (indicating the head/no snapshot). 5547f28e565aSAlex Elder */ 55483feeb894SAlex Elder len = next_token(&buf); 5549820a5f3eSAlex Elder if (!len) { 55503feeb894SAlex Elder buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */ 55513feeb894SAlex Elder len = sizeof (RBD_SNAP_HEAD_NAME) - 1; 5552f28e565aSAlex Elder } else if (len > RBD_MAX_SNAP_NAME_LEN) { 5553dc79b113SAlex Elder ret = -ENAMETOOLONG; 5554f28e565aSAlex Elder goto out_err; 5555849b4260SAlex Elder } 5556ecb4dc22SAlex Elder snap_name = kmemdup(buf, len + 1, GFP_KERNEL); 5557ecb4dc22SAlex Elder if (!snap_name) 5558f28e565aSAlex Elder goto out_mem; 5559ecb4dc22SAlex Elder *(snap_name + len) = '\0'; 5560c300156bSIlya Dryomov pctx.spec->snap_name = snap_name; 5561e5c35534SAlex Elder 55620ddebc0cSAlex Elder /* Initialize all rbd options to the defaults */ 5563e28fff26SAlex Elder 5564c300156bSIlya Dryomov pctx.opts = kzalloc(sizeof(*pctx.opts), GFP_KERNEL); 5565c300156bSIlya Dryomov if (!pctx.opts) 55664e9afebaSAlex Elder goto out_mem; 55674e9afebaSAlex Elder 5568c300156bSIlya Dryomov pctx.opts->read_only = RBD_READ_ONLY_DEFAULT; 5569c300156bSIlya Dryomov pctx.opts->queue_depth = RBD_QUEUE_DEPTH_DEFAULT; 
55700c93e1b7SIlya Dryomov pctx.opts->alloc_size = RBD_ALLOC_SIZE_DEFAULT; 5571c300156bSIlya Dryomov pctx.opts->lock_timeout = RBD_LOCK_TIMEOUT_DEFAULT; 5572c300156bSIlya Dryomov pctx.opts->lock_on_read = RBD_LOCK_ON_READ_DEFAULT; 5573c300156bSIlya Dryomov pctx.opts->exclusive = RBD_EXCLUSIVE_DEFAULT; 5574c300156bSIlya Dryomov pctx.opts->trim = RBD_TRIM_DEFAULT; 5575d22f76e7SAlex Elder 5576859c31dfSAlex Elder copts = ceph_parse_options(options, mon_addrs, 55770ddebc0cSAlex Elder mon_addrs + mon_addrs_size - 1, 5578c300156bSIlya Dryomov parse_rbd_opts_token, &pctx); 5579859c31dfSAlex Elder if (IS_ERR(copts)) { 5580859c31dfSAlex Elder ret = PTR_ERR(copts); 5581dc79b113SAlex Elder goto out_err; 5582dc79b113SAlex Elder } 5583859c31dfSAlex Elder kfree(options); 5584859c31dfSAlex Elder 5585859c31dfSAlex Elder *ceph_opts = copts; 5586c300156bSIlya Dryomov *opts = pctx.opts; 5587c300156bSIlya Dryomov *rbd_spec = pctx.spec; 55880ddebc0cSAlex Elder 5589dc79b113SAlex Elder return 0; 5590f28e565aSAlex Elder out_mem: 5591dc79b113SAlex Elder ret = -ENOMEM; 5592d22f76e7SAlex Elder out_err: 5593c300156bSIlya Dryomov kfree(pctx.opts); 5594c300156bSIlya Dryomov rbd_spec_put(pctx.spec); 5595f28e565aSAlex Elder kfree(options); 5596d22f76e7SAlex Elder 5597dc79b113SAlex Elder return ret; 5598a725f65eSAlex Elder } 5599a725f65eSAlex Elder 5600e010dd0aSIlya Dryomov static void rbd_dev_image_unlock(struct rbd_device *rbd_dev) 5601e010dd0aSIlya Dryomov { 5602e010dd0aSIlya Dryomov down_write(&rbd_dev->lock_rwsem); 5603e010dd0aSIlya Dryomov if (__rbd_is_lock_owner(rbd_dev)) 5604e010dd0aSIlya Dryomov rbd_unlock(rbd_dev); 5605e010dd0aSIlya Dryomov up_write(&rbd_dev->lock_rwsem); 5606e010dd0aSIlya Dryomov } 5607e010dd0aSIlya Dryomov 5608e010dd0aSIlya Dryomov static int rbd_add_acquire_lock(struct rbd_device *rbd_dev) 5609e010dd0aSIlya Dryomov { 56102f18d466SIlya Dryomov int ret; 56112f18d466SIlya Dryomov 5612e010dd0aSIlya Dryomov if (!(rbd_dev->header.features & RBD_FEATURE_EXCLUSIVE_LOCK)) { 
5613e010dd0aSIlya Dryomov rbd_warn(rbd_dev, "exclusive-lock feature is not enabled"); 5614e010dd0aSIlya Dryomov return -EINVAL; 5615e010dd0aSIlya Dryomov } 5616e010dd0aSIlya Dryomov 5617e010dd0aSIlya Dryomov /* FIXME: "rbd map --exclusive" should be in interruptible */ 5618e010dd0aSIlya Dryomov down_read(&rbd_dev->lock_rwsem); 56192f18d466SIlya Dryomov ret = rbd_wait_state_locked(rbd_dev, true); 5620e010dd0aSIlya Dryomov up_read(&rbd_dev->lock_rwsem); 56212f18d466SIlya Dryomov if (ret) { 5622e010dd0aSIlya Dryomov rbd_warn(rbd_dev, "failed to acquire exclusive lock"); 5623e010dd0aSIlya Dryomov return -EROFS; 5624e010dd0aSIlya Dryomov } 5625e010dd0aSIlya Dryomov 5626e010dd0aSIlya Dryomov return 0; 5627e010dd0aSIlya Dryomov } 5628e010dd0aSIlya Dryomov 562930ba1f02SIlya Dryomov /* 5630589d30e0SAlex Elder * An rbd format 2 image has a unique identifier, distinct from the 5631589d30e0SAlex Elder * name given to it by the user. Internally, that identifier is 5632589d30e0SAlex Elder * what's used to specify the names of objects related to the image. 5633589d30e0SAlex Elder * 5634589d30e0SAlex Elder * A special "rbd id" object is used to map an rbd image name to its 5635589d30e0SAlex Elder * id. If that object doesn't exist, then there is no v2 rbd image 5636589d30e0SAlex Elder * with the supplied name. 5637589d30e0SAlex Elder * 5638589d30e0SAlex Elder * This function will record the given rbd_dev's image_id field if 5639589d30e0SAlex Elder * it can be determined, and in that case will return 0. If any 5640589d30e0SAlex Elder * errors occur a negative errno will be returned and the rbd_dev's 5641589d30e0SAlex Elder * image_id field will be unchanged (and should be NULL). 
5642589d30e0SAlex Elder */ 5643589d30e0SAlex Elder static int rbd_dev_image_id(struct rbd_device *rbd_dev) 5644589d30e0SAlex Elder { 5645589d30e0SAlex Elder int ret; 5646589d30e0SAlex Elder size_t size; 5647ecd4a68aSIlya Dryomov CEPH_DEFINE_OID_ONSTACK(oid); 5648589d30e0SAlex Elder void *response; 5649c0fba368SAlex Elder char *image_id; 56502f82ee54SAlex Elder 5651589d30e0SAlex Elder /* 56522c0d0a10SAlex Elder * When probing a parent image, the image id is already 56532c0d0a10SAlex Elder * known (and the image name likely is not). There's no 5654c0fba368SAlex Elder * need to fetch the image id again in this case. We 5655c0fba368SAlex Elder * do still need to set the image format though. 56562c0d0a10SAlex Elder */ 5657c0fba368SAlex Elder if (rbd_dev->spec->image_id) { 5658c0fba368SAlex Elder rbd_dev->image_format = *rbd_dev->spec->image_id ? 2 : 1; 5659c0fba368SAlex Elder 56602c0d0a10SAlex Elder return 0; 5661c0fba368SAlex Elder } 56622c0d0a10SAlex Elder 56632c0d0a10SAlex Elder /* 5664589d30e0SAlex Elder * First, see if the format 2 image id file exists, and if 5665589d30e0SAlex Elder * so, get the image's persistent id from it. 
5666589d30e0SAlex Elder */ 5667ecd4a68aSIlya Dryomov ret = ceph_oid_aprintf(&oid, GFP_KERNEL, "%s%s", RBD_ID_PREFIX, 5668ecd4a68aSIlya Dryomov rbd_dev->spec->image_name); 5669ecd4a68aSIlya Dryomov if (ret) 5670ecd4a68aSIlya Dryomov return ret; 5671ecd4a68aSIlya Dryomov 5672ecd4a68aSIlya Dryomov dout("rbd id object name is %s\n", oid.name); 5673589d30e0SAlex Elder 5674589d30e0SAlex Elder /* Response will be an encoded string, which includes a length */ 5675589d30e0SAlex Elder 5676589d30e0SAlex Elder size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX; 5677589d30e0SAlex Elder response = kzalloc(size, GFP_NOIO); 5678589d30e0SAlex Elder if (!response) { 5679589d30e0SAlex Elder ret = -ENOMEM; 5680589d30e0SAlex Elder goto out; 5681589d30e0SAlex Elder } 5682589d30e0SAlex Elder 5683c0fba368SAlex Elder /* If it doesn't exist we'll assume it's a format 1 image */ 5684c0fba368SAlex Elder 5685ecd4a68aSIlya Dryomov ret = rbd_obj_method_sync(rbd_dev, &oid, &rbd_dev->header_oloc, 5686ecd4a68aSIlya Dryomov "get_id", NULL, 0, 5687e2a58ee5SAlex Elder response, RBD_IMAGE_ID_LEN_MAX); 568836be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 5689c0fba368SAlex Elder if (ret == -ENOENT) { 5690c0fba368SAlex Elder image_id = kstrdup("", GFP_KERNEL); 5691c0fba368SAlex Elder ret = image_id ? 
0 : -ENOMEM; 5692c0fba368SAlex Elder if (!ret) 5693c0fba368SAlex Elder rbd_dev->image_format = 1; 56947dd440c9SIlya Dryomov } else if (ret >= 0) { 5695c0fba368SAlex Elder void *p = response; 5696589d30e0SAlex Elder 5697c0fba368SAlex Elder image_id = ceph_extract_encoded_string(&p, p + ret, 5698979ed480SAlex Elder NULL, GFP_NOIO); 5699461f758aSDuan Jiong ret = PTR_ERR_OR_ZERO(image_id); 5700c0fba368SAlex Elder if (!ret) 5701c0fba368SAlex Elder rbd_dev->image_format = 2; 5702c0fba368SAlex Elder } 5703c0fba368SAlex Elder 5704c0fba368SAlex Elder if (!ret) { 5705c0fba368SAlex Elder rbd_dev->spec->image_id = image_id; 5706c0fba368SAlex Elder dout("image_id is %s\n", image_id); 5707589d30e0SAlex Elder } 5708589d30e0SAlex Elder out: 5709589d30e0SAlex Elder kfree(response); 5710ecd4a68aSIlya Dryomov ceph_oid_destroy(&oid); 5711589d30e0SAlex Elder return ret; 5712589d30e0SAlex Elder } 5713589d30e0SAlex Elder 57143abef3b3SAlex Elder /* 57153abef3b3SAlex Elder * Undo whatever state changes are made by v1 or v2 header info 57163abef3b3SAlex Elder * call. 
57173abef3b3SAlex Elder */ 57186fd48b3bSAlex Elder static void rbd_dev_unprobe(struct rbd_device *rbd_dev) 57196fd48b3bSAlex Elder { 57206fd48b3bSAlex Elder struct rbd_image_header *header; 57216fd48b3bSAlex Elder 5722a2acd00eSAlex Elder rbd_dev_parent_put(rbd_dev); 57236fd48b3bSAlex Elder 57246fd48b3bSAlex Elder /* Free dynamic fields from the header, then zero it out */ 57256fd48b3bSAlex Elder 57266fd48b3bSAlex Elder header = &rbd_dev->header; 5727812164f8SAlex Elder ceph_put_snap_context(header->snapc); 57286fd48b3bSAlex Elder kfree(header->snap_sizes); 57296fd48b3bSAlex Elder kfree(header->snap_names); 57306fd48b3bSAlex Elder kfree(header->object_prefix); 57316fd48b3bSAlex Elder memset(header, 0, sizeof (*header)); 57326fd48b3bSAlex Elder } 57336fd48b3bSAlex Elder 57342df3fac7SAlex Elder static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev) 5735a30b71b9SAlex Elder { 5736a30b71b9SAlex Elder int ret; 5737a30b71b9SAlex Elder 57381e130199SAlex Elder ret = rbd_dev_v2_object_prefix(rbd_dev); 573957385b51SAlex Elder if (ret) 57401e130199SAlex Elder goto out_err; 5741b1b5402aSAlex Elder 57422df3fac7SAlex Elder /* 57432df3fac7SAlex Elder * Get the and check features for the image. Currently the 57442df3fac7SAlex Elder * features are assumed to never change. 
57452df3fac7SAlex Elder */ 5746b1b5402aSAlex Elder ret = rbd_dev_v2_features(rbd_dev); 574757385b51SAlex Elder if (ret) 5748b1b5402aSAlex Elder goto out_err; 574935d489f9SAlex Elder 5750cc070d59SAlex Elder /* If the image supports fancy striping, get its parameters */ 5751cc070d59SAlex Elder 5752cc070d59SAlex Elder if (rbd_dev->header.features & RBD_FEATURE_STRIPINGV2) { 5753cc070d59SAlex Elder ret = rbd_dev_v2_striping_info(rbd_dev); 5754cc070d59SAlex Elder if (ret < 0) 5755cc070d59SAlex Elder goto out_err; 5756cc070d59SAlex Elder } 5757a30b71b9SAlex Elder 57587e97332eSIlya Dryomov if (rbd_dev->header.features & RBD_FEATURE_DATA_POOL) { 57597e97332eSIlya Dryomov ret = rbd_dev_v2_data_pool(rbd_dev); 57607e97332eSIlya Dryomov if (ret) 57617e97332eSIlya Dryomov goto out_err; 57627e97332eSIlya Dryomov } 57637e97332eSIlya Dryomov 5764263423f8SIlya Dryomov rbd_init_layout(rbd_dev); 576535152979SAlex Elder return 0; 5766263423f8SIlya Dryomov 57679d475de5SAlex Elder out_err: 5768642a2537SAlex Elder rbd_dev->header.features = 0; 57691e130199SAlex Elder kfree(rbd_dev->header.object_prefix); 57701e130199SAlex Elder rbd_dev->header.object_prefix = NULL; 57719d475de5SAlex Elder return ret; 5772a30b71b9SAlex Elder } 5773a30b71b9SAlex Elder 57746d69bb53SIlya Dryomov /* 57756d69bb53SIlya Dryomov * @depth is rbd_dev_image_probe() -> rbd_dev_probe_parent() -> 57766d69bb53SIlya Dryomov * rbd_dev_image_probe() recursion depth, which means it's also the 57776d69bb53SIlya Dryomov * length of the already discovered part of the parent chain. 
57786d69bb53SIlya Dryomov */ 57796d69bb53SIlya Dryomov static int rbd_dev_probe_parent(struct rbd_device *rbd_dev, int depth) 578083a06263SAlex Elder { 57812f82ee54SAlex Elder struct rbd_device *parent = NULL; 5782124afba2SAlex Elder int ret; 5783124afba2SAlex Elder 5784124afba2SAlex Elder if (!rbd_dev->parent_spec) 5785124afba2SAlex Elder return 0; 5786124afba2SAlex Elder 57876d69bb53SIlya Dryomov if (++depth > RBD_MAX_PARENT_CHAIN_LEN) { 57886d69bb53SIlya Dryomov pr_info("parent chain is too long (%d)\n", depth); 57896d69bb53SIlya Dryomov ret = -EINVAL; 57906d69bb53SIlya Dryomov goto out_err; 57916d69bb53SIlya Dryomov } 57926d69bb53SIlya Dryomov 57931643dfa4SIlya Dryomov parent = __rbd_dev_create(rbd_dev->rbd_client, rbd_dev->parent_spec); 57941f2c6651SIlya Dryomov if (!parent) { 5795124afba2SAlex Elder ret = -ENOMEM; 5796124afba2SAlex Elder goto out_err; 57971f2c6651SIlya Dryomov } 57981f2c6651SIlya Dryomov 57991f2c6651SIlya Dryomov /* 58001f2c6651SIlya Dryomov * Images related by parent/child relationships always share 58011f2c6651SIlya Dryomov * rbd_client and spec/parent_spec, so bump their refcounts. 
58021f2c6651SIlya Dryomov */ 58031f2c6651SIlya Dryomov __rbd_get_client(rbd_dev->rbd_client); 58041f2c6651SIlya Dryomov rbd_spec_get(rbd_dev->parent_spec); 5805124afba2SAlex Elder 58066d69bb53SIlya Dryomov ret = rbd_dev_image_probe(parent, depth); 5807124afba2SAlex Elder if (ret < 0) 5808124afba2SAlex Elder goto out_err; 58091f2c6651SIlya Dryomov 5810124afba2SAlex Elder rbd_dev->parent = parent; 5811a2acd00eSAlex Elder atomic_set(&rbd_dev->parent_ref, 1); 5812124afba2SAlex Elder return 0; 5813124afba2SAlex Elder 58141f2c6651SIlya Dryomov out_err: 58151f2c6651SIlya Dryomov rbd_dev_unparent(rbd_dev); 58161f2c6651SIlya Dryomov rbd_dev_destroy(parent); 5817124afba2SAlex Elder return ret; 5818124afba2SAlex Elder } 5819124afba2SAlex Elder 58205769ed0cSIlya Dryomov static void rbd_dev_device_release(struct rbd_device *rbd_dev) 58215769ed0cSIlya Dryomov { 58225769ed0cSIlya Dryomov clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags); 58235769ed0cSIlya Dryomov rbd_dev_mapping_clear(rbd_dev); 58245769ed0cSIlya Dryomov rbd_free_disk(rbd_dev); 58255769ed0cSIlya Dryomov if (!single_major) 58265769ed0cSIlya Dryomov unregister_blkdev(rbd_dev->major, rbd_dev->name); 58275769ed0cSIlya Dryomov } 58285769ed0cSIlya Dryomov 5829811c6688SIlya Dryomov /* 5830811c6688SIlya Dryomov * rbd_dev->header_rwsem must be locked for write and will be unlocked 5831811c6688SIlya Dryomov * upon return. 5832811c6688SIlya Dryomov */ 5833200a6a8bSAlex Elder static int rbd_dev_device_setup(struct rbd_device *rbd_dev) 5834124afba2SAlex Elder { 583583a06263SAlex Elder int ret; 583683a06263SAlex Elder 58379b60e70bSIlya Dryomov /* Record our major and minor device numbers. 
*/ 583883a06263SAlex Elder 58399b60e70bSIlya Dryomov if (!single_major) { 584083a06263SAlex Elder ret = register_blkdev(0, rbd_dev->name); 584183a06263SAlex Elder if (ret < 0) 58421643dfa4SIlya Dryomov goto err_out_unlock; 58439b60e70bSIlya Dryomov 584483a06263SAlex Elder rbd_dev->major = ret; 5845dd82fff1SIlya Dryomov rbd_dev->minor = 0; 58469b60e70bSIlya Dryomov } else { 58479b60e70bSIlya Dryomov rbd_dev->major = rbd_major; 58489b60e70bSIlya Dryomov rbd_dev->minor = rbd_dev_id_to_minor(rbd_dev->dev_id); 58499b60e70bSIlya Dryomov } 585083a06263SAlex Elder 585183a06263SAlex Elder /* Set up the blkdev mapping. */ 585283a06263SAlex Elder 585383a06263SAlex Elder ret = rbd_init_disk(rbd_dev); 585483a06263SAlex Elder if (ret) 585583a06263SAlex Elder goto err_out_blkdev; 585683a06263SAlex Elder 5857f35a4deeSAlex Elder ret = rbd_dev_mapping_set(rbd_dev); 585883a06263SAlex Elder if (ret) 585983a06263SAlex Elder goto err_out_disk; 5860bc1ecc65SIlya Dryomov 5861f35a4deeSAlex Elder set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE); 58629568c93eSIlya Dryomov set_disk_ro(rbd_dev->disk, rbd_dev->opts->read_only); 5863f35a4deeSAlex Elder 58645769ed0cSIlya Dryomov ret = dev_set_name(&rbd_dev->dev, "%d", rbd_dev->dev_id); 5865f35a4deeSAlex Elder if (ret) 5866f5ee37bdSIlya Dryomov goto err_out_mapping; 586783a06263SAlex Elder 5868129b79d4SAlex Elder set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags); 5869811c6688SIlya Dryomov up_write(&rbd_dev->header_rwsem); 58705769ed0cSIlya Dryomov return 0; 58712f82ee54SAlex Elder 5872f35a4deeSAlex Elder err_out_mapping: 5873f35a4deeSAlex Elder rbd_dev_mapping_clear(rbd_dev); 587483a06263SAlex Elder err_out_disk: 587583a06263SAlex Elder rbd_free_disk(rbd_dev); 587683a06263SAlex Elder err_out_blkdev: 58779b60e70bSIlya Dryomov if (!single_major) 587883a06263SAlex Elder unregister_blkdev(rbd_dev->major, rbd_dev->name); 5879811c6688SIlya Dryomov err_out_unlock: 5880811c6688SIlya Dryomov up_write(&rbd_dev->header_rwsem); 588183a06263SAlex 
Elder return ret; 588283a06263SAlex Elder } 588383a06263SAlex Elder 5884332bb12dSAlex Elder static int rbd_dev_header_name(struct rbd_device *rbd_dev) 5885332bb12dSAlex Elder { 5886332bb12dSAlex Elder struct rbd_spec *spec = rbd_dev->spec; 5887c41d13a3SIlya Dryomov int ret; 5888332bb12dSAlex Elder 5889332bb12dSAlex Elder /* Record the header object name for this rbd image. */ 5890332bb12dSAlex Elder 5891332bb12dSAlex Elder rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); 5892332bb12dSAlex Elder if (rbd_dev->image_format == 1) 5893c41d13a3SIlya Dryomov ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s", 5894332bb12dSAlex Elder spec->image_name, RBD_SUFFIX); 5895332bb12dSAlex Elder else 5896c41d13a3SIlya Dryomov ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s", 5897332bb12dSAlex Elder RBD_HEADER_PREFIX, spec->image_id); 5898c41d13a3SIlya Dryomov 5899c41d13a3SIlya Dryomov return ret; 5900332bb12dSAlex Elder } 5901332bb12dSAlex Elder 5902200a6a8bSAlex Elder static void rbd_dev_image_release(struct rbd_device *rbd_dev) 5903200a6a8bSAlex Elder { 59046fd48b3bSAlex Elder rbd_dev_unprobe(rbd_dev); 5905fd22aef8SIlya Dryomov if (rbd_dev->opts) 5906fd22aef8SIlya Dryomov rbd_unregister_watch(rbd_dev); 59076fd48b3bSAlex Elder rbd_dev->image_format = 0; 59086fd48b3bSAlex Elder kfree(rbd_dev->spec->image_id); 59096fd48b3bSAlex Elder rbd_dev->spec->image_id = NULL; 5910200a6a8bSAlex Elder } 5911200a6a8bSAlex Elder 5912a30b71b9SAlex Elder /* 5913a30b71b9SAlex Elder * Probe for the existence of the header object for the given rbd 59141f3ef788SAlex Elder * device. If this image is the one being mapped (i.e., not a 59151f3ef788SAlex Elder * parent), initiate a watch on its header object before using that 59161f3ef788SAlex Elder * object to get detailed information about the rbd image. 
5917a30b71b9SAlex Elder */ 59186d69bb53SIlya Dryomov static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth) 5919a30b71b9SAlex Elder { 5920a30b71b9SAlex Elder int ret; 5921a30b71b9SAlex Elder 5922a30b71b9SAlex Elder /* 59233abef3b3SAlex Elder * Get the id from the image id object. Unless there's an 59243abef3b3SAlex Elder * error, rbd_dev->spec->image_id will be filled in with 59253abef3b3SAlex Elder * a dynamically-allocated string, and rbd_dev->image_format 59263abef3b3SAlex Elder * will be set to either 1 or 2. 5927a30b71b9SAlex Elder */ 5928a30b71b9SAlex Elder ret = rbd_dev_image_id(rbd_dev); 5929a30b71b9SAlex Elder if (ret) 5930c0fba368SAlex Elder return ret; 5931c0fba368SAlex Elder 5932332bb12dSAlex Elder ret = rbd_dev_header_name(rbd_dev); 5933332bb12dSAlex Elder if (ret) 5934332bb12dSAlex Elder goto err_out_format; 5935332bb12dSAlex Elder 59366d69bb53SIlya Dryomov if (!depth) { 593799d16943SIlya Dryomov ret = rbd_register_watch(rbd_dev); 59381fe48023SIlya Dryomov if (ret) { 59391fe48023SIlya Dryomov if (ret == -ENOENT) 5940b26c047bSIlya Dryomov pr_info("image %s/%s%s%s does not exist\n", 59411fe48023SIlya Dryomov rbd_dev->spec->pool_name, 5942b26c047bSIlya Dryomov rbd_dev->spec->pool_ns ?: "", 5943b26c047bSIlya Dryomov rbd_dev->spec->pool_ns ? "/" : "", 59441fe48023SIlya Dryomov rbd_dev->spec->image_name); 5945c41d13a3SIlya Dryomov goto err_out_format; 59461f3ef788SAlex Elder } 59471fe48023SIlya Dryomov } 5948b644de2bSAlex Elder 5949a720ae09SIlya Dryomov ret = rbd_dev_header_info(rbd_dev); 59505655c4d9SAlex Elder if (ret) 5951b644de2bSAlex Elder goto err_out_watch; 5952a30b71b9SAlex Elder 595304077599SIlya Dryomov /* 595404077599SIlya Dryomov * If this image is the one being mapped, we have pool name and 595504077599SIlya Dryomov * id, image name and id, and snap name - need to fill snap id. 
595604077599SIlya Dryomov * Otherwise this is a parent image, identified by pool, image 595704077599SIlya Dryomov * and snap ids - need to fill in names for those ids. 595804077599SIlya Dryomov */ 59596d69bb53SIlya Dryomov if (!depth) 596004077599SIlya Dryomov ret = rbd_spec_fill_snap_id(rbd_dev); 596104077599SIlya Dryomov else 596204077599SIlya Dryomov ret = rbd_spec_fill_names(rbd_dev); 59631fe48023SIlya Dryomov if (ret) { 59641fe48023SIlya Dryomov if (ret == -ENOENT) 5965b26c047bSIlya Dryomov pr_info("snap %s/%s%s%s@%s does not exist\n", 59661fe48023SIlya Dryomov rbd_dev->spec->pool_name, 5967b26c047bSIlya Dryomov rbd_dev->spec->pool_ns ?: "", 5968b26c047bSIlya Dryomov rbd_dev->spec->pool_ns ? "/" : "", 59691fe48023SIlya Dryomov rbd_dev->spec->image_name, 59701fe48023SIlya Dryomov rbd_dev->spec->snap_name); 597133dca39fSAlex Elder goto err_out_probe; 59721fe48023SIlya Dryomov } 59739bb81c9bSAlex Elder 5974e8f59b59SIlya Dryomov if (rbd_dev->header.features & RBD_FEATURE_LAYERING) { 5975e8f59b59SIlya Dryomov ret = rbd_dev_v2_parent_info(rbd_dev); 5976e8f59b59SIlya Dryomov if (ret) 5977e8f59b59SIlya Dryomov goto err_out_probe; 5978e8f59b59SIlya Dryomov } 5979e8f59b59SIlya Dryomov 59806d69bb53SIlya Dryomov ret = rbd_dev_probe_parent(rbd_dev, depth); 598130d60ba2SAlex Elder if (ret) 598230d60ba2SAlex Elder goto err_out_probe; 598383a06263SAlex Elder 598430d60ba2SAlex Elder dout("discovered format %u image, header name is %s\n", 5985c41d13a3SIlya Dryomov rbd_dev->image_format, rbd_dev->header_oid.name); 598630d60ba2SAlex Elder return 0; 5987e8f59b59SIlya Dryomov 59886fd48b3bSAlex Elder err_out_probe: 59896fd48b3bSAlex Elder rbd_dev_unprobe(rbd_dev); 5990b644de2bSAlex Elder err_out_watch: 59916d69bb53SIlya Dryomov if (!depth) 599299d16943SIlya Dryomov rbd_unregister_watch(rbd_dev); 5993332bb12dSAlex Elder err_out_format: 5994332bb12dSAlex Elder rbd_dev->image_format = 0; 59955655c4d9SAlex Elder kfree(rbd_dev->spec->image_id); 59965655c4d9SAlex Elder 
rbd_dev->spec->image_id = NULL; 59975655c4d9SAlex Elder return ret; 599883a06263SAlex Elder } 599983a06263SAlex Elder 60009b60e70bSIlya Dryomov static ssize_t do_rbd_add(struct bus_type *bus, 600159c2be1eSYehuda Sadeh const char *buf, 600259c2be1eSYehuda Sadeh size_t count) 6003602adf40SYehuda Sadeh { 6004cb8627c7SAlex Elder struct rbd_device *rbd_dev = NULL; 6005dc79b113SAlex Elder struct ceph_options *ceph_opts = NULL; 60064e9afebaSAlex Elder struct rbd_options *rbd_opts = NULL; 6007859c31dfSAlex Elder struct rbd_spec *spec = NULL; 60089d3997fdSAlex Elder struct rbd_client *rbdc; 6009b51c83c2SIlya Dryomov int rc; 6010602adf40SYehuda Sadeh 6011602adf40SYehuda Sadeh if (!try_module_get(THIS_MODULE)) 6012602adf40SYehuda Sadeh return -ENODEV; 6013602adf40SYehuda Sadeh 6014a725f65eSAlex Elder /* parse add command */ 6015859c31dfSAlex Elder rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec); 6016dc79b113SAlex Elder if (rc < 0) 6017dd5ac32dSIlya Dryomov goto out; 6018a725f65eSAlex Elder 60199d3997fdSAlex Elder rbdc = rbd_get_client(ceph_opts); 60209d3997fdSAlex Elder if (IS_ERR(rbdc)) { 60219d3997fdSAlex Elder rc = PTR_ERR(rbdc); 60220ddebc0cSAlex Elder goto err_out_args; 60239d3997fdSAlex Elder } 6024602adf40SYehuda Sadeh 6025602adf40SYehuda Sadeh /* pick the pool */ 6026dd435855SIlya Dryomov rc = ceph_pg_poolid_by_name(rbdc->client->osdc.osdmap, spec->pool_name); 60271fe48023SIlya Dryomov if (rc < 0) { 60281fe48023SIlya Dryomov if (rc == -ENOENT) 60291fe48023SIlya Dryomov pr_info("pool %s does not exist\n", spec->pool_name); 6030602adf40SYehuda Sadeh goto err_out_client; 60311fe48023SIlya Dryomov } 6032859c31dfSAlex Elder spec->pool_id = (u64)rc; 6033859c31dfSAlex Elder 6034d147543dSIlya Dryomov rbd_dev = rbd_dev_create(rbdc, spec, rbd_opts); 6035b51c83c2SIlya Dryomov if (!rbd_dev) { 6036b51c83c2SIlya Dryomov rc = -ENOMEM; 6037bd4ba655SAlex Elder goto err_out_client; 6038b51c83c2SIlya Dryomov } 6039c53d5893SAlex Elder rbdc = NULL; /* rbd_dev now owns this */ 
6040c53d5893SAlex Elder spec = NULL; /* rbd_dev now owns this */ 6041d147543dSIlya Dryomov rbd_opts = NULL; /* rbd_dev now owns this */ 6042602adf40SYehuda Sadeh 60430d6d1e9cSMike Christie rbd_dev->config_info = kstrdup(buf, GFP_KERNEL); 60440d6d1e9cSMike Christie if (!rbd_dev->config_info) { 60450d6d1e9cSMike Christie rc = -ENOMEM; 60460d6d1e9cSMike Christie goto err_out_rbd_dev; 60470d6d1e9cSMike Christie } 60480d6d1e9cSMike Christie 6049811c6688SIlya Dryomov down_write(&rbd_dev->header_rwsem); 60506d69bb53SIlya Dryomov rc = rbd_dev_image_probe(rbd_dev, 0); 60510d6d1e9cSMike Christie if (rc < 0) { 60520d6d1e9cSMike Christie up_write(&rbd_dev->header_rwsem); 6053c53d5893SAlex Elder goto err_out_rbd_dev; 60540d6d1e9cSMike Christie } 605505fd6f6fSAlex Elder 60567ce4eef7SAlex Elder /* If we are mapping a snapshot it must be marked read-only */ 60577ce4eef7SAlex Elder if (rbd_dev->spec->snap_id != CEPH_NOSNAP) 60589568c93eSIlya Dryomov rbd_dev->opts->read_only = true; 60597ce4eef7SAlex Elder 60600c93e1b7SIlya Dryomov if (rbd_dev->opts->alloc_size > rbd_dev->layout.object_size) { 60610c93e1b7SIlya Dryomov rbd_warn(rbd_dev, "alloc_size adjusted to %u", 60620c93e1b7SIlya Dryomov rbd_dev->layout.object_size); 60630c93e1b7SIlya Dryomov rbd_dev->opts->alloc_size = rbd_dev->layout.object_size; 60640c93e1b7SIlya Dryomov } 60650c93e1b7SIlya Dryomov 6066b536f69aSAlex Elder rc = rbd_dev_device_setup(rbd_dev); 6067fd22aef8SIlya Dryomov if (rc) 60688b679ec5SIlya Dryomov goto err_out_image_probe; 60693abef3b3SAlex Elder 6070e010dd0aSIlya Dryomov if (rbd_dev->opts->exclusive) { 6071e010dd0aSIlya Dryomov rc = rbd_add_acquire_lock(rbd_dev); 6072e010dd0aSIlya Dryomov if (rc) 6073e010dd0aSIlya Dryomov goto err_out_device_setup; 6074b536f69aSAlex Elder } 6075b536f69aSAlex Elder 60765769ed0cSIlya Dryomov /* Everything's ready. Announce the disk to the world. 
*/ 60775769ed0cSIlya Dryomov 60785769ed0cSIlya Dryomov rc = device_add(&rbd_dev->dev); 60795769ed0cSIlya Dryomov if (rc) 6080e010dd0aSIlya Dryomov goto err_out_image_lock; 60815769ed0cSIlya Dryomov 60825769ed0cSIlya Dryomov add_disk(rbd_dev->disk); 60835769ed0cSIlya Dryomov /* see rbd_init_disk() */ 60845769ed0cSIlya Dryomov blk_put_queue(rbd_dev->disk->queue); 60855769ed0cSIlya Dryomov 60865769ed0cSIlya Dryomov spin_lock(&rbd_dev_list_lock); 60875769ed0cSIlya Dryomov list_add_tail(&rbd_dev->node, &rbd_dev_list); 60885769ed0cSIlya Dryomov spin_unlock(&rbd_dev_list_lock); 60895769ed0cSIlya Dryomov 60905769ed0cSIlya Dryomov pr_info("%s: capacity %llu features 0x%llx\n", rbd_dev->disk->disk_name, 60915769ed0cSIlya Dryomov (unsigned long long)get_capacity(rbd_dev->disk) << SECTOR_SHIFT, 60925769ed0cSIlya Dryomov rbd_dev->header.features); 6093dd5ac32dSIlya Dryomov rc = count; 6094dd5ac32dSIlya Dryomov out: 6095dd5ac32dSIlya Dryomov module_put(THIS_MODULE); 6096dd5ac32dSIlya Dryomov return rc; 6097b536f69aSAlex Elder 6098e010dd0aSIlya Dryomov err_out_image_lock: 6099e010dd0aSIlya Dryomov rbd_dev_image_unlock(rbd_dev); 61005769ed0cSIlya Dryomov err_out_device_setup: 61015769ed0cSIlya Dryomov rbd_dev_device_release(rbd_dev); 61028b679ec5SIlya Dryomov err_out_image_probe: 61038b679ec5SIlya Dryomov rbd_dev_image_release(rbd_dev); 6104c53d5893SAlex Elder err_out_rbd_dev: 6105c53d5893SAlex Elder rbd_dev_destroy(rbd_dev); 6106bd4ba655SAlex Elder err_out_client: 61079d3997fdSAlex Elder rbd_put_client(rbdc); 61080ddebc0cSAlex Elder err_out_args: 6109859c31dfSAlex Elder rbd_spec_put(spec); 6110d147543dSIlya Dryomov kfree(rbd_opts); 6111dd5ac32dSIlya Dryomov goto out; 6112602adf40SYehuda Sadeh } 6113602adf40SYehuda Sadeh 61147e9586baSGreg Kroah-Hartman static ssize_t add_store(struct bus_type *bus, const char *buf, size_t count) 61159b60e70bSIlya Dryomov { 61169b60e70bSIlya Dryomov if (single_major) 61179b60e70bSIlya Dryomov return -EINVAL; 61189b60e70bSIlya Dryomov 
61199b60e70bSIlya Dryomov return do_rbd_add(bus, buf, count); 61209b60e70bSIlya Dryomov } 61219b60e70bSIlya Dryomov 61227e9586baSGreg Kroah-Hartman static ssize_t add_single_major_store(struct bus_type *bus, const char *buf, 61239b60e70bSIlya Dryomov size_t count) 61249b60e70bSIlya Dryomov { 61259b60e70bSIlya Dryomov return do_rbd_add(bus, buf, count); 61269b60e70bSIlya Dryomov } 61279b60e70bSIlya Dryomov 612805a46afdSAlex Elder static void rbd_dev_remove_parent(struct rbd_device *rbd_dev) 612905a46afdSAlex Elder { 6130ad945fc1SAlex Elder while (rbd_dev->parent) { 613105a46afdSAlex Elder struct rbd_device *first = rbd_dev; 613205a46afdSAlex Elder struct rbd_device *second = first->parent; 613305a46afdSAlex Elder struct rbd_device *third; 613405a46afdSAlex Elder 613505a46afdSAlex Elder /* 613605a46afdSAlex Elder * Follow to the parent with no grandparent and 613705a46afdSAlex Elder * remove it. 613805a46afdSAlex Elder */ 613905a46afdSAlex Elder while (second && (third = second->parent)) { 614005a46afdSAlex Elder first = second; 614105a46afdSAlex Elder second = third; 614205a46afdSAlex Elder } 6143ad945fc1SAlex Elder rbd_assert(second); 61448ad42cd0SAlex Elder rbd_dev_image_release(second); 61458b679ec5SIlya Dryomov rbd_dev_destroy(second); 6146ad945fc1SAlex Elder first->parent = NULL; 6147ad945fc1SAlex Elder first->parent_overlap = 0; 6148ad945fc1SAlex Elder 6149ad945fc1SAlex Elder rbd_assert(first->parent_spec); 615005a46afdSAlex Elder rbd_spec_put(first->parent_spec); 615105a46afdSAlex Elder first->parent_spec = NULL; 615205a46afdSAlex Elder } 615305a46afdSAlex Elder } 615405a46afdSAlex Elder 61559b60e70bSIlya Dryomov static ssize_t do_rbd_remove(struct bus_type *bus, 6156602adf40SYehuda Sadeh const char *buf, 6157602adf40SYehuda Sadeh size_t count) 6158602adf40SYehuda Sadeh { 6159602adf40SYehuda Sadeh struct rbd_device *rbd_dev = NULL; 6160751cc0e3SAlex Elder struct list_head *tmp; 6161751cc0e3SAlex Elder int dev_id; 61620276dca6SMike Christie char opt_buf[6]; 
61630276dca6SMike Christie bool force = false; 61640d8189e1SAlex Elder int ret; 6165602adf40SYehuda Sadeh 61660276dca6SMike Christie dev_id = -1; 61670276dca6SMike Christie opt_buf[0] = '\0'; 61680276dca6SMike Christie sscanf(buf, "%d %5s", &dev_id, opt_buf); 61690276dca6SMike Christie if (dev_id < 0) { 61700276dca6SMike Christie pr_err("dev_id out of range\n"); 6171602adf40SYehuda Sadeh return -EINVAL; 61720276dca6SMike Christie } 61730276dca6SMike Christie if (opt_buf[0] != '\0') { 61740276dca6SMike Christie if (!strcmp(opt_buf, "force")) { 61750276dca6SMike Christie force = true; 61760276dca6SMike Christie } else { 61770276dca6SMike Christie pr_err("bad remove option at '%s'\n", opt_buf); 61780276dca6SMike Christie return -EINVAL; 61790276dca6SMike Christie } 61800276dca6SMike Christie } 6181602adf40SYehuda Sadeh 6182602adf40SYehuda Sadeh ret = -ENOENT; 6183751cc0e3SAlex Elder spin_lock(&rbd_dev_list_lock); 6184751cc0e3SAlex Elder list_for_each(tmp, &rbd_dev_list) { 6185751cc0e3SAlex Elder rbd_dev = list_entry(tmp, struct rbd_device, node); 6186751cc0e3SAlex Elder if (rbd_dev->dev_id == dev_id) { 6187751cc0e3SAlex Elder ret = 0; 6188751cc0e3SAlex Elder break; 6189602adf40SYehuda Sadeh } 6190751cc0e3SAlex Elder } 6191751cc0e3SAlex Elder if (!ret) { 6192a14ea269SAlex Elder spin_lock_irq(&rbd_dev->lock); 61930276dca6SMike Christie if (rbd_dev->open_count && !force) 619442382b70SAlex Elder ret = -EBUSY; 619585f5a4d6SIlya Dryomov else if (test_and_set_bit(RBD_DEV_FLAG_REMOVING, 619685f5a4d6SIlya Dryomov &rbd_dev->flags)) 619785f5a4d6SIlya Dryomov ret = -EINPROGRESS; 6198a14ea269SAlex Elder spin_unlock_irq(&rbd_dev->lock); 6199751cc0e3SAlex Elder } 6200751cc0e3SAlex Elder spin_unlock(&rbd_dev_list_lock); 620185f5a4d6SIlya Dryomov if (ret) 62021ba0f1e7SAlex Elder return ret; 6203751cc0e3SAlex Elder 62040276dca6SMike Christie if (force) { 62050276dca6SMike Christie /* 62060276dca6SMike Christie * Prevent new IO from being queued and wait for existing 62070276dca6SMike 
Christie * IO to complete/fail. 62080276dca6SMike Christie */ 62090276dca6SMike Christie blk_mq_freeze_queue(rbd_dev->disk->queue); 62100276dca6SMike Christie blk_set_queue_dying(rbd_dev->disk->queue); 62110276dca6SMike Christie } 62120276dca6SMike Christie 62135769ed0cSIlya Dryomov del_gendisk(rbd_dev->disk); 62145769ed0cSIlya Dryomov spin_lock(&rbd_dev_list_lock); 62155769ed0cSIlya Dryomov list_del_init(&rbd_dev->node); 62165769ed0cSIlya Dryomov spin_unlock(&rbd_dev_list_lock); 62175769ed0cSIlya Dryomov device_del(&rbd_dev->dev); 6218fca27065SIlya Dryomov 6219e010dd0aSIlya Dryomov rbd_dev_image_unlock(rbd_dev); 6220dd5ac32dSIlya Dryomov rbd_dev_device_release(rbd_dev); 62218ad42cd0SAlex Elder rbd_dev_image_release(rbd_dev); 62228b679ec5SIlya Dryomov rbd_dev_destroy(rbd_dev); 62231ba0f1e7SAlex Elder return count; 6224602adf40SYehuda Sadeh } 6225602adf40SYehuda Sadeh 62267e9586baSGreg Kroah-Hartman static ssize_t remove_store(struct bus_type *bus, const char *buf, size_t count) 62279b60e70bSIlya Dryomov { 62289b60e70bSIlya Dryomov if (single_major) 62299b60e70bSIlya Dryomov return -EINVAL; 62309b60e70bSIlya Dryomov 62319b60e70bSIlya Dryomov return do_rbd_remove(bus, buf, count); 62329b60e70bSIlya Dryomov } 62339b60e70bSIlya Dryomov 62347e9586baSGreg Kroah-Hartman static ssize_t remove_single_major_store(struct bus_type *bus, const char *buf, 62359b60e70bSIlya Dryomov size_t count) 62369b60e70bSIlya Dryomov { 62379b60e70bSIlya Dryomov return do_rbd_remove(bus, buf, count); 62389b60e70bSIlya Dryomov } 62399b60e70bSIlya Dryomov 6240602adf40SYehuda Sadeh /* 6241602adf40SYehuda Sadeh * create control files in sysfs 6242dfc5606dSYehuda Sadeh * /sys/bus/rbd/... 
6243602adf40SYehuda Sadeh */ 62447d8dc534SChengguang Xu static int __init rbd_sysfs_init(void) 6245602adf40SYehuda Sadeh { 6246dfc5606dSYehuda Sadeh int ret; 6247602adf40SYehuda Sadeh 6248fed4c143SAlex Elder ret = device_register(&rbd_root_dev); 6249dfc5606dSYehuda Sadeh if (ret < 0) 6250dfc5606dSYehuda Sadeh return ret; 6251602adf40SYehuda Sadeh 6252fed4c143SAlex Elder ret = bus_register(&rbd_bus_type); 6253fed4c143SAlex Elder if (ret < 0) 6254fed4c143SAlex Elder device_unregister(&rbd_root_dev); 6255602adf40SYehuda Sadeh 6256602adf40SYehuda Sadeh return ret; 6257602adf40SYehuda Sadeh } 6258602adf40SYehuda Sadeh 62597d8dc534SChengguang Xu static void __exit rbd_sysfs_cleanup(void) 6260602adf40SYehuda Sadeh { 6261dfc5606dSYehuda Sadeh bus_unregister(&rbd_bus_type); 6262fed4c143SAlex Elder device_unregister(&rbd_root_dev); 6263602adf40SYehuda Sadeh } 6264602adf40SYehuda Sadeh 62657d8dc534SChengguang Xu static int __init rbd_slab_init(void) 62661c2a9dfeSAlex Elder { 62671c2a9dfeSAlex Elder rbd_assert(!rbd_img_request_cache); 626803d94406SGeliang Tang rbd_img_request_cache = KMEM_CACHE(rbd_img_request, 0); 6269868311b1SAlex Elder if (!rbd_img_request_cache) 6270868311b1SAlex Elder return -ENOMEM; 6271868311b1SAlex Elder 6272868311b1SAlex Elder rbd_assert(!rbd_obj_request_cache); 627303d94406SGeliang Tang rbd_obj_request_cache = KMEM_CACHE(rbd_obj_request, 0); 627478c2a44aSAlex Elder if (!rbd_obj_request_cache) 627578c2a44aSAlex Elder goto out_err; 627678c2a44aSAlex Elder 62771c2a9dfeSAlex Elder return 0; 62781c2a9dfeSAlex Elder 62796c696d85SIlya Dryomov out_err: 6280868311b1SAlex Elder kmem_cache_destroy(rbd_img_request_cache); 6281868311b1SAlex Elder rbd_img_request_cache = NULL; 62821c2a9dfeSAlex Elder return -ENOMEM; 62831c2a9dfeSAlex Elder } 62841c2a9dfeSAlex Elder 62851c2a9dfeSAlex Elder static void rbd_slab_exit(void) 62861c2a9dfeSAlex Elder { 6287868311b1SAlex Elder rbd_assert(rbd_obj_request_cache); 6288868311b1SAlex Elder 
kmem_cache_destroy(rbd_obj_request_cache); 6289868311b1SAlex Elder rbd_obj_request_cache = NULL; 6290868311b1SAlex Elder 62911c2a9dfeSAlex Elder rbd_assert(rbd_img_request_cache); 62921c2a9dfeSAlex Elder kmem_cache_destroy(rbd_img_request_cache); 62931c2a9dfeSAlex Elder rbd_img_request_cache = NULL; 62941c2a9dfeSAlex Elder } 62951c2a9dfeSAlex Elder 6296cc344fa1SAlex Elder static int __init rbd_init(void) 6297602adf40SYehuda Sadeh { 6298602adf40SYehuda Sadeh int rc; 6299602adf40SYehuda Sadeh 63001e32d34cSAlex Elder if (!libceph_compatible(NULL)) { 63011e32d34cSAlex Elder rbd_warn(NULL, "libceph incompatibility (quitting)"); 63021e32d34cSAlex Elder return -EINVAL; 63031e32d34cSAlex Elder } 6304e1b4d96dSIlya Dryomov 63051c2a9dfeSAlex Elder rc = rbd_slab_init(); 6306602adf40SYehuda Sadeh if (rc) 6307602adf40SYehuda Sadeh return rc; 6308e1b4d96dSIlya Dryomov 6309f5ee37bdSIlya Dryomov /* 6310f5ee37bdSIlya Dryomov * The number of active work items is limited by the number of 6311f77303bdSIlya Dryomov * rbd devices * queue depth, so leave @max_active at default. 
6312f5ee37bdSIlya Dryomov */ 6313f5ee37bdSIlya Dryomov rbd_wq = alloc_workqueue(RBD_DRV_NAME, WQ_MEM_RECLAIM, 0); 6314f5ee37bdSIlya Dryomov if (!rbd_wq) { 6315f5ee37bdSIlya Dryomov rc = -ENOMEM; 6316f5ee37bdSIlya Dryomov goto err_out_slab; 6317f5ee37bdSIlya Dryomov } 6318f5ee37bdSIlya Dryomov 63199b60e70bSIlya Dryomov if (single_major) { 63209b60e70bSIlya Dryomov rbd_major = register_blkdev(0, RBD_DRV_NAME); 63219b60e70bSIlya Dryomov if (rbd_major < 0) { 63229b60e70bSIlya Dryomov rc = rbd_major; 6323f5ee37bdSIlya Dryomov goto err_out_wq; 63249b60e70bSIlya Dryomov } 63259b60e70bSIlya Dryomov } 63269b60e70bSIlya Dryomov 63271c2a9dfeSAlex Elder rc = rbd_sysfs_init(); 63281c2a9dfeSAlex Elder if (rc) 63299b60e70bSIlya Dryomov goto err_out_blkdev; 63301c2a9dfeSAlex Elder 63319b60e70bSIlya Dryomov if (single_major) 63329b60e70bSIlya Dryomov pr_info("loaded (major %d)\n", rbd_major); 63339b60e70bSIlya Dryomov else 6334e1b4d96dSIlya Dryomov pr_info("loaded\n"); 63359b60e70bSIlya Dryomov 6336e1b4d96dSIlya Dryomov return 0; 6337e1b4d96dSIlya Dryomov 63389b60e70bSIlya Dryomov err_out_blkdev: 63399b60e70bSIlya Dryomov if (single_major) 63409b60e70bSIlya Dryomov unregister_blkdev(rbd_major, RBD_DRV_NAME); 6341f5ee37bdSIlya Dryomov err_out_wq: 6342f5ee37bdSIlya Dryomov destroy_workqueue(rbd_wq); 6343e1b4d96dSIlya Dryomov err_out_slab: 6344e1b4d96dSIlya Dryomov rbd_slab_exit(); 63451c2a9dfeSAlex Elder return rc; 6346602adf40SYehuda Sadeh } 6347602adf40SYehuda Sadeh 6348cc344fa1SAlex Elder static void __exit rbd_exit(void) 6349602adf40SYehuda Sadeh { 6350ffe312cfSIlya Dryomov ida_destroy(&rbd_dev_id_ida); 6351602adf40SYehuda Sadeh rbd_sysfs_cleanup(); 63529b60e70bSIlya Dryomov if (single_major) 63539b60e70bSIlya Dryomov unregister_blkdev(rbd_major, RBD_DRV_NAME); 6354f5ee37bdSIlya Dryomov destroy_workqueue(rbd_wq); 63551c2a9dfeSAlex Elder rbd_slab_exit(); 6356602adf40SYehuda Sadeh } 6357602adf40SYehuda Sadeh 6358602adf40SYehuda Sadeh module_init(rbd_init); 6359602adf40SYehuda Sadeh 
module_exit(rbd_exit); 6360602adf40SYehuda Sadeh 6361d552c619SAlex Elder MODULE_AUTHOR("Alex Elder <elder@inktank.com>"); 6362602adf40SYehuda Sadeh MODULE_AUTHOR("Sage Weil <sage@newdream.net>"); 6363602adf40SYehuda Sadeh MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>"); 6364602adf40SYehuda Sadeh /* following authorship retained from original osdblk.c */ 6365602adf40SYehuda Sadeh MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>"); 6366602adf40SYehuda Sadeh 636790da258bSIlya Dryomov MODULE_DESCRIPTION("RADOS Block Device (RBD) driver"); 6368602adf40SYehuda Sadeh MODULE_LICENSE("GPL"); 6369