1e2a58ee5SAlex Elder 2602adf40SYehuda Sadeh /* 3602adf40SYehuda Sadeh rbd.c -- Export ceph rados objects as a Linux block device 4602adf40SYehuda Sadeh 5602adf40SYehuda Sadeh 6602adf40SYehuda Sadeh based on drivers/block/osdblk.c: 7602adf40SYehuda Sadeh 8602adf40SYehuda Sadeh Copyright 2009 Red Hat, Inc. 9602adf40SYehuda Sadeh 10602adf40SYehuda Sadeh This program is free software; you can redistribute it and/or modify 11602adf40SYehuda Sadeh it under the terms of the GNU General Public License as published by 12602adf40SYehuda Sadeh the Free Software Foundation. 13602adf40SYehuda Sadeh 14602adf40SYehuda Sadeh This program is distributed in the hope that it will be useful, 15602adf40SYehuda Sadeh but WITHOUT ANY WARRANTY; without even the implied warranty of 16602adf40SYehuda Sadeh MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17602adf40SYehuda Sadeh GNU General Public License for more details. 18602adf40SYehuda Sadeh 19602adf40SYehuda Sadeh You should have received a copy of the GNU General Public License 20602adf40SYehuda Sadeh along with this program; see the file COPYING. If not, write to 21602adf40SYehuda Sadeh the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. 22602adf40SYehuda Sadeh 23602adf40SYehuda Sadeh 24602adf40SYehuda Sadeh 25dfc5606dSYehuda Sadeh For usage instructions, please refer to: 26602adf40SYehuda Sadeh 27dfc5606dSYehuda Sadeh Documentation/ABI/testing/sysfs-bus-rbd 28602adf40SYehuda Sadeh 29602adf40SYehuda Sadeh */ 30602adf40SYehuda Sadeh 31602adf40SYehuda Sadeh #include <linux/ceph/libceph.h> 32602adf40SYehuda Sadeh #include <linux/ceph/osd_client.h> 33602adf40SYehuda Sadeh #include <linux/ceph/mon_client.h> 34ed95b21aSIlya Dryomov #include <linux/ceph/cls_lock_client.h> 3543df3d35SIlya Dryomov #include <linux/ceph/striper.h> 36602adf40SYehuda Sadeh #include <linux/ceph/decode.h> 3759c2be1eSYehuda Sadeh #include <linux/parser.h> 3830d1cff8SAlex Elder #include <linux/bsearch.h> 39602adf40SYehuda Sadeh 40602adf40SYehuda Sadeh #include <linux/kernel.h> 41602adf40SYehuda Sadeh #include <linux/device.h> 42602adf40SYehuda Sadeh #include <linux/module.h> 437ad18afaSChristoph Hellwig #include <linux/blk-mq.h> 44602adf40SYehuda Sadeh #include <linux/fs.h> 45602adf40SYehuda Sadeh #include <linux/blkdev.h> 461c2a9dfeSAlex Elder #include <linux/slab.h> 47f8a22fc2SIlya Dryomov #include <linux/idr.h> 48bc1ecc65SIlya Dryomov #include <linux/workqueue.h> 49602adf40SYehuda Sadeh 50602adf40SYehuda Sadeh #include "rbd_types.h" 51602adf40SYehuda Sadeh 52aafb230eSAlex Elder #define RBD_DEBUG /* Activate rbd_assert() calls */ 53aafb230eSAlex Elder 54593a9e7bSAlex Elder /* 55a2acd00eSAlex Elder * Increment the given counter and return its updated value. 56a2acd00eSAlex Elder * If the counter is already 0 it will not be incremented. 57a2acd00eSAlex Elder * If the counter is already at its maximum value returns 58a2acd00eSAlex Elder * -EINVAL without updating it. 59a2acd00eSAlex Elder */ 60a2acd00eSAlex Elder static int atomic_inc_return_safe(atomic_t *v) 61a2acd00eSAlex Elder { 62a2acd00eSAlex Elder unsigned int counter; 63a2acd00eSAlex Elder 64bfc18e38SMark Rutland counter = (unsigned int)atomic_fetch_add_unless(v, 1, 0); 65a2acd00eSAlex Elder if (counter <= (unsigned int)INT_MAX) 66a2acd00eSAlex Elder return (int)counter; 67a2acd00eSAlex Elder 68a2acd00eSAlex Elder atomic_dec(v); 69a2acd00eSAlex Elder 70a2acd00eSAlex Elder return -EINVAL; 71a2acd00eSAlex Elder } 72a2acd00eSAlex Elder 73a2acd00eSAlex Elder /* Decrement the counter. Return the resulting value, or -EINVAL */ 74a2acd00eSAlex Elder static int atomic_dec_return_safe(atomic_t *v) 75a2acd00eSAlex Elder { 76a2acd00eSAlex Elder int counter; 77a2acd00eSAlex Elder 78a2acd00eSAlex Elder counter = atomic_dec_return(v); 79a2acd00eSAlex Elder if (counter >= 0) 80a2acd00eSAlex Elder return counter; 81a2acd00eSAlex Elder 82a2acd00eSAlex Elder atomic_inc(v); 83a2acd00eSAlex Elder 84a2acd00eSAlex Elder return -EINVAL; 85a2acd00eSAlex Elder } 86a2acd00eSAlex Elder 87f0f8cef5SAlex Elder #define RBD_DRV_NAME "rbd" 88602adf40SYehuda Sadeh 897e513d43SIlya Dryomov #define RBD_MINORS_PER_MAJOR 256 907e513d43SIlya Dryomov #define RBD_SINGLE_MAJOR_PART_SHIFT 4 91602adf40SYehuda Sadeh 926d69bb53SIlya Dryomov #define RBD_MAX_PARENT_CHAIN_LEN 16 936d69bb53SIlya Dryomov 94d4b125e9SAlex Elder #define RBD_SNAP_DEV_NAME_PREFIX "snap_" 95d4b125e9SAlex Elder #define RBD_MAX_SNAP_NAME_LEN \ 96d4b125e9SAlex Elder (NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1)) 97d4b125e9SAlex Elder 9835d489f9SAlex Elder #define RBD_MAX_SNAP_COUNT 510 /* allows max snapc to fit in 4KB */ 99602adf40SYehuda Sadeh 100602adf40SYehuda Sadeh #define RBD_SNAP_HEAD_NAME "-" 101602adf40SYehuda Sadeh 1029682fc6dSAlex Elder #define BAD_SNAP_INDEX U32_MAX /* invalid index into snap array */ 1039682fc6dSAlex Elder 1049e15b77dSAlex Elder /* This allows a single page to hold an image name sent by OSD */ 1059e15b77dSAlex Elder #define RBD_IMAGE_NAME_LEN_MAX (PAGE_SIZE - sizeof (__le32) - 1) 106589d30e0SAlex Elder #define RBD_IMAGE_ID_LEN_MAX 64 1079e15b77dSAlex Elder 1081e130199SAlex Elder #define RBD_OBJ_PREFIX_LEN_MAX 64 109589d30e0SAlex Elder 110ed95b21aSIlya Dryomov #define RBD_NOTIFY_TIMEOUT 5 /* seconds */ 11199d16943SIlya Dryomov #define RBD_RETRY_DELAY msecs_to_jiffies(1000) 11299d16943SIlya Dryomov 113d889140cSAlex Elder /* Feature bits */ 114d889140cSAlex Elder 1158767b293SIlya Dryomov #define RBD_FEATURE_LAYERING (1ULL<<0) 1168767b293SIlya Dryomov #define RBD_FEATURE_STRIPINGV2 (1ULL<<1) 1178767b293SIlya Dryomov #define RBD_FEATURE_EXCLUSIVE_LOCK (1ULL<<2) 118b9f6d447SIlya Dryomov #define RBD_FEATURE_DEEP_FLATTEN (1ULL<<5) 1198767b293SIlya Dryomov #define RBD_FEATURE_DATA_POOL (1ULL<<7) 120e573427aSIlya Dryomov #define RBD_FEATURE_OPERATIONS (1ULL<<8) 1218767b293SIlya Dryomov 122ed95b21aSIlya Dryomov #define RBD_FEATURES_ALL (RBD_FEATURE_LAYERING | \ 123ed95b21aSIlya Dryomov RBD_FEATURE_STRIPINGV2 | \ 1247e97332eSIlya Dryomov RBD_FEATURE_EXCLUSIVE_LOCK | \ 125b9f6d447SIlya Dryomov RBD_FEATURE_DEEP_FLATTEN | \ 126e573427aSIlya Dryomov RBD_FEATURE_DATA_POOL | \ 127e573427aSIlya Dryomov RBD_FEATURE_OPERATIONS) 128d889140cSAlex Elder 129d889140cSAlex Elder /* Features supported by this (client software) implementation. */ 130d889140cSAlex Elder 131770eba6eSAlex Elder #define RBD_FEATURES_SUPPORTED (RBD_FEATURES_ALL) 132d889140cSAlex Elder 13381a89793SAlex Elder /* 13481a89793SAlex Elder * An RBD device name will be "rbd#", where the "rbd" comes from 13581a89793SAlex Elder * RBD_DRV_NAME above, and # is a unique integer identifier. 13681a89793SAlex Elder */ 137602adf40SYehuda Sadeh #define DEV_NAME_LEN 32 138602adf40SYehuda Sadeh 139602adf40SYehuda Sadeh /* 140602adf40SYehuda Sadeh * block device image metadata (in-memory version) 141602adf40SYehuda Sadeh */ 142602adf40SYehuda Sadeh struct rbd_image_header { 143f35a4deeSAlex Elder /* These six fields never change for a given rbd image */ 144849b4260SAlex Elder char *object_prefix; 145602adf40SYehuda Sadeh __u8 obj_order; 146f35a4deeSAlex Elder u64 stripe_unit; 147f35a4deeSAlex Elder u64 stripe_count; 1487e97332eSIlya Dryomov s64 data_pool_id; 149f35a4deeSAlex Elder u64 features; /* Might be changeable someday? */ 150602adf40SYehuda Sadeh 151f84344f3SAlex Elder /* The remaining fields need to be updated occasionally */ 152f84344f3SAlex Elder u64 image_size; 153f84344f3SAlex Elder struct ceph_snap_context *snapc; 154f35a4deeSAlex Elder char *snap_names; /* format 1 only */ 155f35a4deeSAlex Elder u64 *snap_sizes; /* format 1 only */ 15659c2be1eSYehuda Sadeh }; 15759c2be1eSYehuda Sadeh 1580d7dbfceSAlex Elder /* 1590d7dbfceSAlex Elder * An rbd image specification. 1600d7dbfceSAlex Elder * 1610d7dbfceSAlex Elder * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely 162c66c6e0cSAlex Elder * identify an image. Each rbd_dev structure includes a pointer to 163c66c6e0cSAlex Elder * an rbd_spec structure that encapsulates this identity. 164c66c6e0cSAlex Elder * 165c66c6e0cSAlex Elder * Each of the id's in an rbd_spec has an associated name. For a 166c66c6e0cSAlex Elder * user-mapped image, the names are supplied and the id's associated 167c66c6e0cSAlex Elder * with them are looked up. For a layered image, a parent image is 168c66c6e0cSAlex Elder * defined by the tuple, and the names are looked up. 169c66c6e0cSAlex Elder * 170c66c6e0cSAlex Elder * An rbd_dev structure contains a parent_spec pointer which is 171c66c6e0cSAlex Elder * non-null if the image it represents is a child in a layered 172c66c6e0cSAlex Elder * image. This pointer will refer to the rbd_spec structure used 173c66c6e0cSAlex Elder * by the parent rbd_dev for its own identity (i.e., the structure 174c66c6e0cSAlex Elder * is shared between the parent and child). 175c66c6e0cSAlex Elder * 176c66c6e0cSAlex Elder * Since these structures are populated once, during the discovery 177c66c6e0cSAlex Elder * phase of image construction, they are effectively immutable so 178c66c6e0cSAlex Elder * we make no effort to synchronize access to them. 179c66c6e0cSAlex Elder * 180c66c6e0cSAlex Elder * Note that code herein does not assume the image name is known (it 181c66c6e0cSAlex Elder * could be a null pointer). 1820d7dbfceSAlex Elder */ 1830d7dbfceSAlex Elder struct rbd_spec { 1840d7dbfceSAlex Elder u64 pool_id; 185ecb4dc22SAlex Elder const char *pool_name; 186b26c047bSIlya Dryomov const char *pool_ns; /* NULL if default, never "" */ 1870d7dbfceSAlex Elder 188ecb4dc22SAlex Elder const char *image_id; 189ecb4dc22SAlex Elder const char *image_name; 1900d7dbfceSAlex Elder 1910d7dbfceSAlex Elder u64 snap_id; 192ecb4dc22SAlex Elder const char *snap_name; 1930d7dbfceSAlex Elder 1940d7dbfceSAlex Elder struct kref kref; 1950d7dbfceSAlex Elder }; 1960d7dbfceSAlex Elder 197602adf40SYehuda Sadeh /* 198f0f8cef5SAlex Elder * an instance of the client. multiple devices may share an rbd client. 199602adf40SYehuda Sadeh */ 200602adf40SYehuda Sadeh struct rbd_client { 201602adf40SYehuda Sadeh struct ceph_client *client; 202602adf40SYehuda Sadeh struct kref kref; 203602adf40SYehuda Sadeh struct list_head node; 204602adf40SYehuda Sadeh }; 205602adf40SYehuda Sadeh 206bf0d5f50SAlex Elder struct rbd_img_request; 207bf0d5f50SAlex Elder 2089969ebc5SAlex Elder enum obj_request_type { 209a1fbb5e7SIlya Dryomov OBJ_REQUEST_NODATA = 1, 2105359a17dSIlya Dryomov OBJ_REQUEST_BIO, /* pointer into provided bio (list) */ 2117e07efb1SIlya Dryomov OBJ_REQUEST_BVECS, /* pointer into provided bio_vec array */ 212afb97888SIlya Dryomov OBJ_REQUEST_OWN_BVECS, /* private bio_vec array, doesn't own pages */ 2139969ebc5SAlex Elder }; 214bf0d5f50SAlex Elder 2156d2940c8SGuangliang Zhao enum obj_operation_type { 216a1fbb5e7SIlya Dryomov OBJ_OP_READ = 1, 2176d2940c8SGuangliang Zhao OBJ_OP_WRITE, 21890e98c52SGuangliang Zhao OBJ_OP_DISCARD, 2196484cbe9SIlya Dryomov OBJ_OP_ZEROOUT, 2206d2940c8SGuangliang Zhao }; 2216d2940c8SGuangliang Zhao 2223da691bfSIlya Dryomov /* 2233da691bfSIlya Dryomov * Writes go through the following state machine to deal with 2243da691bfSIlya Dryomov * layering: 2253da691bfSIlya Dryomov * 22689a59c1cSIlya Dryomov * . . . . . RBD_OBJ_WRITE_GUARD. . . . . . . . . . . . . . 22789a59c1cSIlya Dryomov * . | . 22889a59c1cSIlya Dryomov * . v . 22989a59c1cSIlya Dryomov * . RBD_OBJ_WRITE_READ_FROM_PARENT. . . . 23089a59c1cSIlya Dryomov * . | . . 23189a59c1cSIlya Dryomov * . v v (deep-copyup . 23289a59c1cSIlya Dryomov * (image . RBD_OBJ_WRITE_COPYUP_EMPTY_SNAPC . not needed) . 23389a59c1cSIlya Dryomov * flattened) v | . . 23489a59c1cSIlya Dryomov * . v . . 23589a59c1cSIlya Dryomov * . . . .RBD_OBJ_WRITE_COPYUP_OPS. . . . . (copyup . 23689a59c1cSIlya Dryomov * | not needed) v 23789a59c1cSIlya Dryomov * v . 23889a59c1cSIlya Dryomov * done . . . . . . . . . . . . . . . . . . 2393da691bfSIlya Dryomov * ^ 2403da691bfSIlya Dryomov * | 2413da691bfSIlya Dryomov * RBD_OBJ_WRITE_FLAT 2423da691bfSIlya Dryomov * 2433da691bfSIlya Dryomov * Writes start in RBD_OBJ_WRITE_GUARD or _FLAT, depending on whether 24489a59c1cSIlya Dryomov * assert_exists guard is needed or not (in some cases it's not needed 24589a59c1cSIlya Dryomov * even if there is a parent). 2463da691bfSIlya Dryomov */ 2473da691bfSIlya Dryomov enum rbd_obj_write_state { 2483da691bfSIlya Dryomov RBD_OBJ_WRITE_FLAT = 1, 2493da691bfSIlya Dryomov RBD_OBJ_WRITE_GUARD, 2503a482501SIlya Dryomov RBD_OBJ_WRITE_READ_FROM_PARENT, 25189a59c1cSIlya Dryomov RBD_OBJ_WRITE_COPYUP_EMPTY_SNAPC, 2523a482501SIlya Dryomov RBD_OBJ_WRITE_COPYUP_OPS, 253926f9b3fSAlex Elder }; 254926f9b3fSAlex Elder 255bf0d5f50SAlex Elder struct rbd_obj_request { 25643df3d35SIlya Dryomov struct ceph_object_extent ex; 257c5b5ef6cSAlex Elder union { 2583da691bfSIlya Dryomov bool tried_parent; /* for reads */ 2593da691bfSIlya Dryomov enum rbd_obj_write_state write_state; /* for writes */ 2603da691bfSIlya Dryomov }; 261bf0d5f50SAlex Elder 262bf0d5f50SAlex Elder struct rbd_img_request *img_request; 26386bd7998SIlya Dryomov struct ceph_file_extent *img_extents; 26486bd7998SIlya Dryomov u32 num_img_extents; 265bf0d5f50SAlex Elder 266788e2df3SAlex Elder union { 2675359a17dSIlya Dryomov struct ceph_bio_iter bio_pos; 268788e2df3SAlex Elder struct { 2697e07efb1SIlya Dryomov struct ceph_bvec_iter bvec_pos; 2707e07efb1SIlya Dryomov u32 bvec_count; 271afb97888SIlya Dryomov u32 bvec_idx; 272788e2df3SAlex Elder }; 273788e2df3SAlex Elder }; 2747e07efb1SIlya Dryomov struct bio_vec *copyup_bvecs; 2757e07efb1SIlya Dryomov u32 copyup_bvec_count; 276bf0d5f50SAlex Elder 277bf0d5f50SAlex Elder struct ceph_osd_request *osd_req; 278bf0d5f50SAlex Elder 279bf0d5f50SAlex Elder u64 xferred; /* bytes transferred */ 2801b83bef2SSage Weil int result; 281bf0d5f50SAlex Elder 282bf0d5f50SAlex Elder struct kref kref; 283bf0d5f50SAlex Elder }; 284bf0d5f50SAlex Elder 2850c425248SAlex Elder enum img_req_flags { 2869849e986SAlex Elder IMG_REQ_CHILD, /* initiator: block = 0, child image = 1 */ 287d0b2e944SAlex Elder IMG_REQ_LAYERED, /* ENOENT handling: normal = 0, layered = 1 */ 2880c425248SAlex Elder }; 2890c425248SAlex Elder 290bf0d5f50SAlex Elder struct rbd_img_request { 291bf0d5f50SAlex Elder struct rbd_device *rbd_dev; 2929bb0248dSIlya Dryomov enum obj_operation_type op_type; 293ecc633caSIlya Dryomov enum obj_request_type data_type; 2940c425248SAlex Elder unsigned long flags; 295bf0d5f50SAlex Elder union { 296bf0d5f50SAlex Elder u64 snap_id; /* for reads */ 2979849e986SAlex Elder struct ceph_snap_context *snapc; /* for writes */ 2989849e986SAlex Elder }; 2999849e986SAlex Elder union { 3009849e986SAlex Elder struct request *rq; /* block request */ 3019849e986SAlex Elder struct rbd_obj_request *obj_request; /* obj req initiator */ 302bf0d5f50SAlex Elder }; 30315961b44SIlya Dryomov spinlock_t completion_lock; 30455f27e09SAlex Elder u64 xferred;/* aggregate bytes transferred */ 305a5a337d4SAlex Elder int result; /* first nonzero obj_request result */ 306bf0d5f50SAlex Elder 30743df3d35SIlya Dryomov struct list_head object_extents; /* obj_req.ex structs */ 3087114edacSIlya Dryomov u32 pending_count; 309bf0d5f50SAlex Elder 310bf0d5f50SAlex Elder struct kref kref; 311bf0d5f50SAlex Elder }; 312bf0d5f50SAlex Elder 313bf0d5f50SAlex Elder #define for_each_obj_request(ireq, oreq) \ 31443df3d35SIlya Dryomov list_for_each_entry(oreq, &(ireq)->object_extents, ex.oe_item) 315bf0d5f50SAlex Elder #define for_each_obj_request_safe(ireq, oreq, n) \ 31643df3d35SIlya Dryomov list_for_each_entry_safe(oreq, n, &(ireq)->object_extents, ex.oe_item) 317bf0d5f50SAlex Elder 31899d16943SIlya Dryomov enum rbd_watch_state { 31999d16943SIlya Dryomov RBD_WATCH_STATE_UNREGISTERED, 32099d16943SIlya Dryomov RBD_WATCH_STATE_REGISTERED, 32199d16943SIlya Dryomov RBD_WATCH_STATE_ERROR, 32299d16943SIlya Dryomov }; 32399d16943SIlya Dryomov 324ed95b21aSIlya Dryomov enum rbd_lock_state { 325ed95b21aSIlya Dryomov RBD_LOCK_STATE_UNLOCKED, 326ed95b21aSIlya Dryomov RBD_LOCK_STATE_LOCKED, 327ed95b21aSIlya Dryomov RBD_LOCK_STATE_RELEASING, 328ed95b21aSIlya Dryomov }; 329ed95b21aSIlya Dryomov 330ed95b21aSIlya Dryomov /* WatchNotify::ClientId */ 331ed95b21aSIlya Dryomov struct rbd_client_id { 332ed95b21aSIlya Dryomov u64 gid; 333ed95b21aSIlya Dryomov u64 handle; 334ed95b21aSIlya Dryomov }; 335ed95b21aSIlya Dryomov 336f84344f3SAlex Elder struct rbd_mapping { 33799c1f08fSAlex Elder u64 size; 33834b13184SAlex Elder u64 features; 339f84344f3SAlex Elder }; 340f84344f3SAlex Elder 341602adf40SYehuda Sadeh /* 342602adf40SYehuda Sadeh * a single device 343602adf40SYehuda Sadeh */ 344602adf40SYehuda Sadeh struct rbd_device { 345de71a297SAlex Elder int dev_id; /* blkdev unique id */ 346602adf40SYehuda Sadeh 347602adf40SYehuda Sadeh int major; /* blkdev assigned major */ 348dd82fff1SIlya Dryomov int minor; 349602adf40SYehuda Sadeh struct gendisk *disk; /* blkdev's gendisk and rq */ 350602adf40SYehuda Sadeh 351a30b71b9SAlex Elder u32 image_format; /* Either 1 or 2 */ 352602adf40SYehuda Sadeh struct rbd_client *rbd_client; 353602adf40SYehuda Sadeh 354602adf40SYehuda Sadeh char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */ 355602adf40SYehuda Sadeh 356b82d167bSAlex Elder spinlock_t lock; /* queue, flags, open_count */ 357602adf40SYehuda Sadeh 358602adf40SYehuda Sadeh struct rbd_image_header header; 359b82d167bSAlex Elder unsigned long flags; /* possibly lock protected */ 3600d7dbfceSAlex Elder struct rbd_spec *spec; 361d147543dSIlya Dryomov struct rbd_options *opts; 3620d6d1e9cSMike Christie char *config_info; /* add{,_single_major} string */ 363602adf40SYehuda Sadeh 364c41d13a3SIlya Dryomov struct ceph_object_id header_oid; 365922dab61SIlya Dryomov struct ceph_object_locator header_oloc; 366971f839aSAlex Elder 3671643dfa4SIlya Dryomov struct ceph_file_layout layout; /* used for all rbd requests */ 3680903e875SAlex Elder 36999d16943SIlya Dryomov struct mutex watch_mutex; 37099d16943SIlya Dryomov enum rbd_watch_state watch_state; 371922dab61SIlya Dryomov struct ceph_osd_linger_request *watch_handle; 37299d16943SIlya Dryomov u64 watch_cookie; 37399d16943SIlya Dryomov struct delayed_work watch_dwork; 37459c2be1eSYehuda Sadeh 375ed95b21aSIlya Dryomov struct rw_semaphore lock_rwsem; 376ed95b21aSIlya Dryomov enum rbd_lock_state lock_state; 377cbbfb0ffSIlya Dryomov char lock_cookie[32]; 378ed95b21aSIlya Dryomov struct rbd_client_id owner_cid; 379ed95b21aSIlya Dryomov struct work_struct acquired_lock_work; 380ed95b21aSIlya Dryomov struct work_struct released_lock_work; 381ed95b21aSIlya Dryomov struct delayed_work lock_dwork; 382ed95b21aSIlya Dryomov struct work_struct unlock_work; 383ed95b21aSIlya Dryomov wait_queue_head_t lock_waitq; 384ed95b21aSIlya Dryomov 3851643dfa4SIlya Dryomov struct workqueue_struct *task_wq; 386602adf40SYehuda Sadeh 38786b00e0dSAlex Elder struct rbd_spec *parent_spec; 38886b00e0dSAlex Elder u64 parent_overlap; 389a2acd00eSAlex Elder atomic_t parent_ref; 3902f82ee54SAlex Elder struct rbd_device *parent; 39186b00e0dSAlex Elder 3927ad18afaSChristoph Hellwig /* Block layer tags. */ 3937ad18afaSChristoph Hellwig struct blk_mq_tag_set tag_set; 3947ad18afaSChristoph Hellwig 395c666601aSJosh Durgin /* protects updating the header */ 396c666601aSJosh Durgin struct rw_semaphore header_rwsem; 397f84344f3SAlex Elder 398f84344f3SAlex Elder struct rbd_mapping mapping; 399602adf40SYehuda Sadeh 400602adf40SYehuda Sadeh struct list_head node; 401dfc5606dSYehuda Sadeh 402dfc5606dSYehuda Sadeh /* sysfs related */ 403dfc5606dSYehuda Sadeh struct device dev; 404b82d167bSAlex Elder unsigned long open_count; /* protected by lock */ 405dfc5606dSYehuda Sadeh }; 406dfc5606dSYehuda Sadeh 407b82d167bSAlex Elder /* 40887c0fdedSIlya Dryomov * Flag bits for rbd_dev->flags: 40987c0fdedSIlya Dryomov * - REMOVING (which is coupled with rbd_dev->open_count) is protected 41087c0fdedSIlya Dryomov * by rbd_dev->lock 41187c0fdedSIlya Dryomov * - BLACKLISTED is protected by rbd_dev->lock_rwsem 412b82d167bSAlex Elder */ 4136d292906SAlex Elder enum rbd_dev_flags { 4146d292906SAlex Elder RBD_DEV_FLAG_EXISTS, /* mapped snapshot has not been deleted */ 415b82d167bSAlex Elder RBD_DEV_FLAG_REMOVING, /* this mapping is being removed */ 41687c0fdedSIlya Dryomov RBD_DEV_FLAG_BLACKLISTED, /* our ceph_client is blacklisted */ 4176d292906SAlex Elder }; 4186d292906SAlex Elder 419cfbf6377SAlex Elder static DEFINE_MUTEX(client_mutex); /* Serialize client creation */ 420e124a82fSAlex Elder 421602adf40SYehuda Sadeh static LIST_HEAD(rbd_dev_list); /* devices */ 422e124a82fSAlex Elder static DEFINE_SPINLOCK(rbd_dev_list_lock); 423e124a82fSAlex Elder 424602adf40SYehuda Sadeh static LIST_HEAD(rbd_client_list); /* clients */ 425432b8587SAlex Elder static DEFINE_SPINLOCK(rbd_client_list_lock); 426602adf40SYehuda Sadeh 42778c2a44aSAlex Elder /* Slab caches for frequently-allocated structures */ 42878c2a44aSAlex Elder 4291c2a9dfeSAlex Elder static struct kmem_cache *rbd_img_request_cache; 430868311b1SAlex Elder static struct kmem_cache *rbd_obj_request_cache; 4311c2a9dfeSAlex Elder 4329b60e70bSIlya Dryomov static int rbd_major; 433f8a22fc2SIlya Dryomov static DEFINE_IDA(rbd_dev_id_ida); 434f8a22fc2SIlya Dryomov 435f5ee37bdSIlya Dryomov static struct workqueue_struct *rbd_wq; 436f5ee37bdSIlya Dryomov 43789a59c1cSIlya Dryomov static struct ceph_snap_context rbd_empty_snapc = { 43889a59c1cSIlya Dryomov .nref = REFCOUNT_INIT(1), 43989a59c1cSIlya Dryomov }; 44089a59c1cSIlya Dryomov 4419b60e70bSIlya Dryomov /* 4423cfa3b16SIlya Dryomov * single-major requires >= 0.75 version of userspace rbd utility. 4439b60e70bSIlya Dryomov */ 4443cfa3b16SIlya Dryomov static bool single_major = true; 4455657a819SJoe Perches module_param(single_major, bool, 0444); 4463cfa3b16SIlya Dryomov MODULE_PARM_DESC(single_major, "Use a single major number for all rbd devices (default: true)"); 4479b60e70bSIlya Dryomov 448f0f8cef5SAlex Elder static ssize_t rbd_add(struct bus_type *bus, const char *buf, 449f0f8cef5SAlex Elder size_t count); 450f0f8cef5SAlex Elder static ssize_t rbd_remove(struct bus_type *bus, const char *buf, 451f0f8cef5SAlex Elder size_t count); 4529b60e70bSIlya Dryomov static ssize_t rbd_add_single_major(struct bus_type *bus, const char *buf, 4539b60e70bSIlya Dryomov size_t count); 4549b60e70bSIlya Dryomov static ssize_t rbd_remove_single_major(struct bus_type *bus, const char *buf, 4559b60e70bSIlya Dryomov size_t count); 4566d69bb53SIlya Dryomov static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth); 457f0f8cef5SAlex Elder 4589b60e70bSIlya Dryomov static int rbd_dev_id_to_minor(int dev_id) 4599b60e70bSIlya Dryomov { 4607e513d43SIlya Dryomov return dev_id << RBD_SINGLE_MAJOR_PART_SHIFT; 4619b60e70bSIlya Dryomov } 4629b60e70bSIlya Dryomov 4639b60e70bSIlya Dryomov static int minor_to_rbd_dev_id(int minor) 4649b60e70bSIlya Dryomov { 4657e513d43SIlya Dryomov return minor >> RBD_SINGLE_MAJOR_PART_SHIFT; 4669b60e70bSIlya Dryomov } 4679b60e70bSIlya Dryomov 468ed95b21aSIlya Dryomov static bool __rbd_is_lock_owner(struct rbd_device *rbd_dev) 469ed95b21aSIlya Dryomov { 470ed95b21aSIlya Dryomov return rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED || 471ed95b21aSIlya Dryomov rbd_dev->lock_state == RBD_LOCK_STATE_RELEASING; 472ed95b21aSIlya Dryomov } 473ed95b21aSIlya Dryomov 474ed95b21aSIlya Dryomov static bool rbd_is_lock_owner(struct rbd_device *rbd_dev) 475ed95b21aSIlya Dryomov { 476ed95b21aSIlya Dryomov bool is_lock_owner; 477ed95b21aSIlya Dryomov 478ed95b21aSIlya Dryomov down_read(&rbd_dev->lock_rwsem); 479ed95b21aSIlya Dryomov is_lock_owner = __rbd_is_lock_owner(rbd_dev); 480ed95b21aSIlya Dryomov up_read(&rbd_dev->lock_rwsem); 481ed95b21aSIlya Dryomov return is_lock_owner; 482ed95b21aSIlya Dryomov } 483ed95b21aSIlya Dryomov 4848767b293SIlya Dryomov static ssize_t rbd_supported_features_show(struct bus_type *bus, char *buf) 4858767b293SIlya Dryomov { 4868767b293SIlya Dryomov return sprintf(buf, "0x%llx\n", RBD_FEATURES_SUPPORTED); 4878767b293SIlya Dryomov } 4888767b293SIlya Dryomov 4895657a819SJoe Perches static BUS_ATTR(add, 0200, NULL, rbd_add); 4905657a819SJoe Perches static BUS_ATTR(remove, 0200, NULL, rbd_remove); 4915657a819SJoe Perches static BUS_ATTR(add_single_major, 0200, NULL, rbd_add_single_major); 4925657a819SJoe Perches static BUS_ATTR(remove_single_major, 0200, NULL, rbd_remove_single_major); 4935657a819SJoe Perches static BUS_ATTR(supported_features, 0444, rbd_supported_features_show, NULL); 494b15a21ddSGreg Kroah-Hartman 495b15a21ddSGreg Kroah-Hartman static struct attribute *rbd_bus_attrs[] = { 496b15a21ddSGreg Kroah-Hartman &bus_attr_add.attr, 497b15a21ddSGreg Kroah-Hartman &bus_attr_remove.attr, 4989b60e70bSIlya Dryomov &bus_attr_add_single_major.attr, 4999b60e70bSIlya Dryomov &bus_attr_remove_single_major.attr, 5008767b293SIlya Dryomov &bus_attr_supported_features.attr, 501b15a21ddSGreg Kroah-Hartman NULL, 502f0f8cef5SAlex Elder }; 50392c76dc0SIlya Dryomov 50492c76dc0SIlya Dryomov static umode_t rbd_bus_is_visible(struct kobject *kobj, 50592c76dc0SIlya Dryomov struct attribute *attr, int index) 50692c76dc0SIlya Dryomov { 5079b60e70bSIlya Dryomov if (!single_major && 5089b60e70bSIlya Dryomov (attr == &bus_attr_add_single_major.attr || 5099b60e70bSIlya Dryomov attr == &bus_attr_remove_single_major.attr)) 5109b60e70bSIlya Dryomov return 0; 5119b60e70bSIlya Dryomov 51292c76dc0SIlya Dryomov return attr->mode; 51392c76dc0SIlya Dryomov } 51492c76dc0SIlya Dryomov 51592c76dc0SIlya Dryomov static const struct attribute_group rbd_bus_group = { 51692c76dc0SIlya Dryomov .attrs = rbd_bus_attrs, 51792c76dc0SIlya Dryomov .is_visible = rbd_bus_is_visible, 51892c76dc0SIlya Dryomov }; 51992c76dc0SIlya Dryomov __ATTRIBUTE_GROUPS(rbd_bus); 520f0f8cef5SAlex Elder 521f0f8cef5SAlex Elder static struct bus_type rbd_bus_type = { 522f0f8cef5SAlex Elder .name = "rbd", 523b15a21ddSGreg Kroah-Hartman .bus_groups = rbd_bus_groups, 524f0f8cef5SAlex Elder }; 525f0f8cef5SAlex Elder 526f0f8cef5SAlex Elder static void rbd_root_dev_release(struct device *dev) 527f0f8cef5SAlex Elder { 528f0f8cef5SAlex Elder } 529f0f8cef5SAlex Elder 530f0f8cef5SAlex Elder static struct device rbd_root_dev = { 531f0f8cef5SAlex Elder .init_name = "rbd", 532f0f8cef5SAlex Elder .release = rbd_root_dev_release, 533f0f8cef5SAlex Elder }; 534f0f8cef5SAlex Elder 53506ecc6cbSAlex Elder static __printf(2, 3) 53606ecc6cbSAlex Elder void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...) 53706ecc6cbSAlex Elder { 53806ecc6cbSAlex Elder struct va_format vaf; 53906ecc6cbSAlex Elder va_list args; 54006ecc6cbSAlex Elder 54106ecc6cbSAlex Elder va_start(args, fmt); 54206ecc6cbSAlex Elder vaf.fmt = fmt; 54306ecc6cbSAlex Elder vaf.va = &args; 54406ecc6cbSAlex Elder 54506ecc6cbSAlex Elder if (!rbd_dev) 54606ecc6cbSAlex Elder printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf); 54706ecc6cbSAlex Elder else if (rbd_dev->disk) 54806ecc6cbSAlex Elder printk(KERN_WARNING "%s: %s: %pV\n", 54906ecc6cbSAlex Elder RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf); 55006ecc6cbSAlex Elder else if (rbd_dev->spec && rbd_dev->spec->image_name) 55106ecc6cbSAlex Elder printk(KERN_WARNING "%s: image %s: %pV\n", 55206ecc6cbSAlex Elder RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf); 55306ecc6cbSAlex Elder else if (rbd_dev->spec && rbd_dev->spec->image_id) 55406ecc6cbSAlex Elder printk(KERN_WARNING "%s: id %s: %pV\n", 55506ecc6cbSAlex Elder RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf); 55606ecc6cbSAlex Elder else /* punt */ 55706ecc6cbSAlex Elder printk(KERN_WARNING "%s: rbd_dev %p: %pV\n", 55806ecc6cbSAlex Elder RBD_DRV_NAME, rbd_dev, &vaf); 55906ecc6cbSAlex Elder va_end(args); 56006ecc6cbSAlex Elder } 56106ecc6cbSAlex Elder 562aafb230eSAlex Elder #ifdef RBD_DEBUG 563aafb230eSAlex Elder #define rbd_assert(expr) \ 564aafb230eSAlex Elder if (unlikely(!(expr))) { \ 565aafb230eSAlex Elder printk(KERN_ERR "\nAssertion failure in %s() " \ 566aafb230eSAlex Elder "at line %d:\n\n" \ 567aafb230eSAlex Elder "\trbd_assert(%s);\n\n", \ 568aafb230eSAlex Elder __func__, __LINE__, #expr); \ 569aafb230eSAlex Elder BUG(); \ 570aafb230eSAlex Elder } 571aafb230eSAlex Elder #else /* !RBD_DEBUG */ 572aafb230eSAlex Elder # define rbd_assert(expr) ((void) 0) 573aafb230eSAlex Elder #endif /* !RBD_DEBUG */ 574dfc5606dSYehuda Sadeh 57505a46afdSAlex Elder static void rbd_dev_remove_parent(struct rbd_device *rbd_dev); 5768b3e1a56SAlex Elder 577cc4a38bdSAlex Elder static int rbd_dev_refresh(struct rbd_device *rbd_dev); 5782df3fac7SAlex Elder static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev); 579a720ae09SIlya Dryomov static int rbd_dev_header_info(struct rbd_device *rbd_dev); 580e8f59b59SIlya Dryomov static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev); 58154cac61fSAlex Elder static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, 58254cac61fSAlex Elder u64 snap_id); 5832ad3d716SAlex Elder static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id, 5842ad3d716SAlex Elder u8 *order, u64 *snap_size); 5852ad3d716SAlex Elder static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id, 5862ad3d716SAlex Elder u64 *snap_features); 58759c2be1eSYehuda Sadeh 588602adf40SYehuda Sadeh static int rbd_open(struct block_device *bdev, fmode_t mode) 589602adf40SYehuda Sadeh { 590f0f8cef5SAlex Elder struct rbd_device *rbd_dev = bdev->bd_disk->private_data; 591b82d167bSAlex Elder bool removing = false; 592602adf40SYehuda Sadeh 593a14ea269SAlex Elder spin_lock_irq(&rbd_dev->lock); 594b82d167bSAlex Elder if (test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags)) 595b82d167bSAlex Elder removing = true; 596b82d167bSAlex Elder else 597b82d167bSAlex Elder rbd_dev->open_count++; 598a14ea269SAlex Elder spin_unlock_irq(&rbd_dev->lock); 599b82d167bSAlex Elder if (removing) 600b82d167bSAlex Elder return -ENOENT; 601b82d167bSAlex Elder 602c3e946ceSAlex Elder (void) get_device(&rbd_dev->dev); 603340c7a2bSAlex Elder 604602adf40SYehuda Sadeh return 0; 605602adf40SYehuda Sadeh } 606602adf40SYehuda Sadeh 607db2a144bSAl Viro static void rbd_release(struct gendisk *disk, fmode_t mode) 608dfc5606dSYehuda Sadeh { 609dfc5606dSYehuda Sadeh struct rbd_device *rbd_dev = disk->private_data; 610b82d167bSAlex Elder unsigned long open_count_before; 611b82d167bSAlex Elder 612a14ea269SAlex Elder spin_lock_irq(&rbd_dev->lock); 613b82d167bSAlex Elder open_count_before = rbd_dev->open_count--; 614a14ea269SAlex Elder spin_unlock_irq(&rbd_dev->lock); 615b82d167bSAlex Elder rbd_assert(open_count_before > 0); 616dfc5606dSYehuda Sadeh 617c3e946ceSAlex Elder put_device(&rbd_dev->dev); 618dfc5606dSYehuda Sadeh } 619dfc5606dSYehuda Sadeh 620131fd9f6SGuangliang Zhao static int rbd_ioctl_set_ro(struct rbd_device *rbd_dev, unsigned long arg) 621131fd9f6SGuangliang Zhao { 6221de797bbSIlya Dryomov int ro; 623131fd9f6SGuangliang Zhao 6241de797bbSIlya Dryomov if (get_user(ro, (int __user *)arg)) 625131fd9f6SGuangliang Zhao return -EFAULT; 626131fd9f6SGuangliang Zhao 6271de797bbSIlya Dryomov /* Snapshots can't be marked read-write */ 628131fd9f6SGuangliang Zhao if (rbd_dev->spec->snap_id != CEPH_NOSNAP && !ro) 629131fd9f6SGuangliang Zhao return -EROFS; 630131fd9f6SGuangliang Zhao 6311de797bbSIlya Dryomov /* Let blkdev_roset() handle it */ 6321de797bbSIlya Dryomov return -ENOTTY; 633131fd9f6SGuangliang Zhao } 634131fd9f6SGuangliang Zhao 635131fd9f6SGuangliang Zhao static int rbd_ioctl(struct block_device *bdev, fmode_t mode, 636131fd9f6SGuangliang Zhao unsigned int cmd, unsigned long arg) 637131fd9f6SGuangliang Zhao { 638131fd9f6SGuangliang Zhao struct rbd_device *rbd_dev = bdev->bd_disk->private_data; 6391de797bbSIlya Dryomov int ret; 640131fd9f6SGuangliang Zhao 641131fd9f6SGuangliang Zhao switch (cmd) { 642131fd9f6SGuangliang Zhao case BLKROSET: 643131fd9f6SGuangliang Zhao ret = rbd_ioctl_set_ro(rbd_dev, arg); 644131fd9f6SGuangliang Zhao break; 645131fd9f6SGuangliang Zhao default: 646131fd9f6SGuangliang Zhao ret = -ENOTTY; 647131fd9f6SGuangliang Zhao } 648131fd9f6SGuangliang Zhao 649131fd9f6SGuangliang Zhao return ret; 650131fd9f6SGuangliang Zhao } 651131fd9f6SGuangliang Zhao 652131fd9f6SGuangliang Zhao #ifdef CONFIG_COMPAT 653131fd9f6SGuangliang Zhao static int rbd_compat_ioctl(struct block_device *bdev, fmode_t mode, 654131fd9f6SGuangliang Zhao unsigned int cmd, unsigned long arg) 655131fd9f6SGuangliang Zhao { 656131fd9f6SGuangliang Zhao return rbd_ioctl(bdev, mode, cmd, arg); 657131fd9f6SGuangliang Zhao } 658131fd9f6SGuangliang Zhao #endif /* CONFIG_COMPAT */ 659131fd9f6SGuangliang Zhao 660602adf40SYehuda Sadeh static const struct block_device_operations rbd_bd_ops = { 661602adf40SYehuda Sadeh .owner = THIS_MODULE, 662602adf40SYehuda Sadeh .open = rbd_open, 663dfc5606dSYehuda Sadeh .release = rbd_release, 664131fd9f6SGuangliang Zhao .ioctl = rbd_ioctl, 665131fd9f6SGuangliang Zhao #ifdef CONFIG_COMPAT 666131fd9f6SGuangliang Zhao .compat_ioctl = rbd_compat_ioctl, 667131fd9f6SGuangliang Zhao #endif 668602adf40SYehuda Sadeh }; 669602adf40SYehuda Sadeh 670602adf40SYehuda Sadeh /* 6717262cfcaSAlex Elder * Initialize an rbd client instance. Success or not, this function 672cfbf6377SAlex Elder * consumes ceph_opts. Caller holds client_mutex. 673602adf40SYehuda Sadeh */ 674f8c38929SAlex Elder static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts) 675602adf40SYehuda Sadeh { 676602adf40SYehuda Sadeh struct rbd_client *rbdc; 677602adf40SYehuda Sadeh int ret = -ENOMEM; 678602adf40SYehuda Sadeh 67937206ee5SAlex Elder dout("%s:\n", __func__); 680602adf40SYehuda Sadeh rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL); 681602adf40SYehuda Sadeh if (!rbdc) 682602adf40SYehuda Sadeh goto out_opt; 683602adf40SYehuda Sadeh 684602adf40SYehuda Sadeh kref_init(&rbdc->kref); 685602adf40SYehuda Sadeh INIT_LIST_HEAD(&rbdc->node); 686602adf40SYehuda Sadeh 68774da4a0fSIlya Dryomov rbdc->client = ceph_create_client(ceph_opts, rbdc); 688602adf40SYehuda Sadeh if (IS_ERR(rbdc->client)) 68908f75463SAlex Elder goto out_rbdc; 69043ae4701SAlex Elder ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */ 691602adf40SYehuda Sadeh 692602adf40SYehuda Sadeh ret = ceph_open_session(rbdc->client); 693602adf40SYehuda Sadeh if (ret < 0) 69408f75463SAlex Elder goto out_client; 695602adf40SYehuda Sadeh 696432b8587SAlex Elder spin_lock(&rbd_client_list_lock); 697602adf40SYehuda Sadeh list_add_tail(&rbdc->node, &rbd_client_list); 698432b8587SAlex Elder spin_unlock(&rbd_client_list_lock); 699602adf40SYehuda Sadeh 70037206ee5SAlex Elder dout("%s: rbdc %p\n", __func__, rbdc); 701bc534d86SAlex Elder 702602adf40SYehuda Sadeh return rbdc; 70308f75463SAlex Elder out_client: 704602adf40SYehuda Sadeh ceph_destroy_client(rbdc->client); 70508f75463SAlex Elder out_rbdc: 706602adf40SYehuda Sadeh kfree(rbdc); 707602adf40SYehuda Sadeh out_opt: 70843ae4701SAlex Elder if (ceph_opts) 70943ae4701SAlex Elder ceph_destroy_options(ceph_opts); 71037206ee5SAlex Elder dout("%s: error %d\n", __func__, ret); 71137206ee5SAlex Elder 71228f259b7SVasiliy Kulikov return ERR_PTR(ret); 713602adf40SYehuda Sadeh } 714602adf40SYehuda Sadeh 7152f82ee54SAlex Elder static struct rbd_client *__rbd_get_client(struct rbd_client *rbdc) 7162f82ee54SAlex Elder { 7172f82ee54SAlex Elder kref_get(&rbdc->kref); 7182f82ee54SAlex Elder 7192f82ee54SAlex Elder return rbdc; 7202f82ee54SAlex Elder } 7212f82ee54SAlex Elder 722602adf40SYehuda Sadeh /* 7231f7ba331SAlex Elder * Find a ceph client with specific addr and configuration. If 7241f7ba331SAlex Elder * found, bump its reference count. 725602adf40SYehuda Sadeh */ 7261f7ba331SAlex Elder static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts) 727602adf40SYehuda Sadeh { 728602adf40SYehuda Sadeh struct rbd_client *client_node; 7291f7ba331SAlex Elder bool found = false; 730602adf40SYehuda Sadeh 73143ae4701SAlex Elder if (ceph_opts->flags & CEPH_OPT_NOSHARE) 732602adf40SYehuda Sadeh return NULL; 733602adf40SYehuda Sadeh 7341f7ba331SAlex Elder spin_lock(&rbd_client_list_lock); 7351f7ba331SAlex Elder list_for_each_entry(client_node, &rbd_client_list, node) { 7361f7ba331SAlex Elder if (!ceph_compare_options(ceph_opts, client_node->client)) { 7372f82ee54SAlex Elder __rbd_get_client(client_node); 7382f82ee54SAlex Elder 7391f7ba331SAlex Elder found = true; 7401f7ba331SAlex Elder break; 7411f7ba331SAlex Elder } 7421f7ba331SAlex Elder } 7431f7ba331SAlex Elder spin_unlock(&rbd_client_list_lock); 7441f7ba331SAlex Elder 7451f7ba331SAlex Elder return found ? client_node : NULL; 746602adf40SYehuda Sadeh } 747602adf40SYehuda Sadeh 748602adf40SYehuda Sadeh /* 749210c104cSIlya Dryomov * (Per device) rbd map options 75059c2be1eSYehuda Sadeh */ 75159c2be1eSYehuda Sadeh enum { 752b5584180SIlya Dryomov Opt_queue_depth, 7530c93e1b7SIlya Dryomov Opt_alloc_size, 75434f55d0bSDongsheng Yang Opt_lock_timeout, 75559c2be1eSYehuda Sadeh Opt_last_int, 75659c2be1eSYehuda Sadeh /* int args above */ 757b26c047bSIlya Dryomov Opt_pool_ns, 75859c2be1eSYehuda Sadeh Opt_last_string, 75959c2be1eSYehuda Sadeh /* string args above */ 760cc0538b6SAlex Elder Opt_read_only, 761cc0538b6SAlex Elder Opt_read_write, 76280de1912SIlya Dryomov Opt_lock_on_read, 763e010dd0aSIlya Dryomov Opt_exclusive, 764d9360540SIlya Dryomov Opt_notrim, 765210c104cSIlya Dryomov Opt_err 76659c2be1eSYehuda Sadeh }; 76759c2be1eSYehuda Sadeh 76843ae4701SAlex Elder static match_table_t rbd_opts_tokens = { 769b5584180SIlya Dryomov {Opt_queue_depth, "queue_depth=%d"}, 7700c93e1b7SIlya Dryomov {Opt_alloc_size, "alloc_size=%d"}, 77134f55d0bSDongsheng Yang {Opt_lock_timeout, "lock_timeout=%d"}, 77259c2be1eSYehuda Sadeh /* int args above */ 773b26c047bSIlya Dryomov {Opt_pool_ns, "_pool_ns=%s"}, 77459c2be1eSYehuda Sadeh /* string args above */ 775be466c1cSAlex Elder {Opt_read_only, "read_only"}, 776cc0538b6SAlex Elder {Opt_read_only, "ro"}, /* Alternate spelling */ 777cc0538b6SAlex Elder {Opt_read_write, "read_write"}, 778cc0538b6SAlex Elder {Opt_read_write, "rw"}, /* Alternate spelling */ 77980de1912SIlya Dryomov {Opt_lock_on_read, "lock_on_read"}, 780e010dd0aSIlya Dryomov {Opt_exclusive, "exclusive"}, 781d9360540SIlya Dryomov {Opt_notrim, "notrim"}, 782210c104cSIlya Dryomov {Opt_err, NULL} 78359c2be1eSYehuda Sadeh }; 78459c2be1eSYehuda Sadeh 78598571b5aSAlex Elder struct rbd_options { 786b5584180SIlya Dryomov int queue_depth; 7870c93e1b7SIlya Dryomov int alloc_size; 78834f55d0bSDongsheng Yang unsigned long lock_timeout; 78998571b5aSAlex Elder bool read_only; 79080de1912SIlya Dryomov bool lock_on_read; 791e010dd0aSIlya Dryomov bool exclusive; 792d9360540SIlya Dryomov bool trim; 79398571b5aSAlex Elder }; 79498571b5aSAlex Elder 795b5584180SIlya Dryomov #define RBD_QUEUE_DEPTH_DEFAULT BLKDEV_MAX_RQ 7960c93e1b7SIlya Dryomov #define RBD_ALLOC_SIZE_DEFAULT (64 * 1024) 79734f55d0bSDongsheng Yang #define RBD_LOCK_TIMEOUT_DEFAULT 0 /* no timeout */ 79898571b5aSAlex Elder #define RBD_READ_ONLY_DEFAULT false 79980de1912SIlya Dryomov #define RBD_LOCK_ON_READ_DEFAULT false 800e010dd0aSIlya Dryomov #define RBD_EXCLUSIVE_DEFAULT false 801d9360540SIlya Dryomov #define RBD_TRIM_DEFAULT true 80298571b5aSAlex Elder 803c300156bSIlya Dryomov struct parse_rbd_opts_ctx { 804c300156bSIlya Dryomov struct rbd_spec *spec; 805c300156bSIlya Dryomov struct rbd_options *opts; 806c300156bSIlya Dryomov }; 807c300156bSIlya Dryomov 80859c2be1eSYehuda Sadeh static int parse_rbd_opts_token(char *c, void *private) 80959c2be1eSYehuda Sadeh { 810c300156bSIlya Dryomov struct parse_rbd_opts_ctx *pctx = private; 81159c2be1eSYehuda Sadeh substring_t argstr[MAX_OPT_ARGS]; 81259c2be1eSYehuda Sadeh int token, intval, ret; 81359c2be1eSYehuda Sadeh 81443ae4701SAlex Elder token = match_token(c, rbd_opts_tokens, argstr); 81559c2be1eSYehuda Sadeh if (token < Opt_last_int) { 81659c2be1eSYehuda Sadeh ret = match_int(&argstr[0], &intval); 81759c2be1eSYehuda Sadeh if (ret < 0) { 8182f56b6baSIlya Dryomov pr_err("bad option arg (not int) at '%s'\n", c); 81959c2be1eSYehuda Sadeh return ret; 82059c2be1eSYehuda Sadeh } 82159c2be1eSYehuda Sadeh dout("got int token %d val %d\n", token, intval); 82259c2be1eSYehuda Sadeh } else if (token > Opt_last_int && token < Opt_last_string) { 823210c104cSIlya Dryomov dout("got string token %d val %s\n", token, argstr[0].from); 82459c2be1eSYehuda Sadeh } else { 82559c2be1eSYehuda Sadeh dout("got token %d\n", token); 82659c2be1eSYehuda Sadeh } 82759c2be1eSYehuda Sadeh 82859c2be1eSYehuda Sadeh switch (token) { 829b5584180SIlya Dryomov case Opt_queue_depth: 830b5584180SIlya Dryomov if (intval < 1) { 831b5584180SIlya Dryomov pr_err("queue_depth out of range\n"); 832b5584180SIlya Dryomov return -EINVAL; 833b5584180SIlya Dryomov } 834c300156bSIlya Dryomov pctx->opts->queue_depth = intval; 835b5584180SIlya Dryomov break; 8360c93e1b7SIlya Dryomov case Opt_alloc_size: 8370c93e1b7SIlya Dryomov if (intval < 1) { 8380c93e1b7SIlya Dryomov pr_err("alloc_size out of range\n"); 8390c93e1b7SIlya Dryomov return -EINVAL; 8400c93e1b7SIlya Dryomov } 8410c93e1b7SIlya Dryomov if (!is_power_of_2(intval)) { 8420c93e1b7SIlya Dryomov pr_err("alloc_size must be a power of 2\n"); 8430c93e1b7SIlya Dryomov return -EINVAL; 8440c93e1b7SIlya Dryomov } 8450c93e1b7SIlya Dryomov pctx->opts->alloc_size = intval; 8460c93e1b7SIlya Dryomov break; 84734f55d0bSDongsheng Yang case Opt_lock_timeout: 84834f55d0bSDongsheng Yang /* 0 is "wait forever" (i.e. infinite timeout) */ 84934f55d0bSDongsheng Yang if (intval < 0 || intval > INT_MAX / 1000) { 85034f55d0bSDongsheng Yang pr_err("lock_timeout out of range\n"); 85134f55d0bSDongsheng Yang return -EINVAL; 85234f55d0bSDongsheng Yang } 853c300156bSIlya Dryomov pctx->opts->lock_timeout = msecs_to_jiffies(intval * 1000); 85434f55d0bSDongsheng Yang break; 855b26c047bSIlya Dryomov case Opt_pool_ns: 856b26c047bSIlya Dryomov kfree(pctx->spec->pool_ns); 857b26c047bSIlya Dryomov pctx->spec->pool_ns = match_strdup(argstr); 858b26c047bSIlya Dryomov if (!pctx->spec->pool_ns) 859b26c047bSIlya Dryomov return -ENOMEM; 86059c2be1eSYehuda Sadeh break; 861cc0538b6SAlex Elder case Opt_read_only: 862c300156bSIlya Dryomov pctx->opts->read_only = true; 863cc0538b6SAlex Elder break; 864cc0538b6SAlex Elder case Opt_read_write: 865c300156bSIlya Dryomov pctx->opts->read_only = false; 866cc0538b6SAlex Elder break; 86780de1912SIlya Dryomov case Opt_lock_on_read: 868c300156bSIlya Dryomov pctx->opts->lock_on_read = true; 86980de1912SIlya Dryomov break; 870e010dd0aSIlya Dryomov case Opt_exclusive: 871c300156bSIlya Dryomov pctx->opts->exclusive = true; 872e010dd0aSIlya Dryomov break; 873d9360540SIlya Dryomov case Opt_notrim: 874c300156bSIlya Dryomov pctx->opts->trim = false; 875d9360540SIlya Dryomov break; 87659c2be1eSYehuda Sadeh default: 877210c104cSIlya Dryomov /* libceph prints "bad option" msg */ 878210c104cSIlya Dryomov return -EINVAL; 87959c2be1eSYehuda Sadeh } 880210c104cSIlya Dryomov 88159c2be1eSYehuda Sadeh return 0; 88259c2be1eSYehuda Sadeh } 88359c2be1eSYehuda Sadeh 8846d2940c8SGuangliang Zhao static char* obj_op_name(enum obj_operation_type op_type) 8856d2940c8SGuangliang Zhao { 8866d2940c8SGuangliang Zhao switch (op_type) { 8876d2940c8SGuangliang Zhao case OBJ_OP_READ: 8886d2940c8SGuangliang Zhao return "read"; 8896d2940c8SGuangliang Zhao case OBJ_OP_WRITE: 8906d2940c8SGuangliang Zhao return "write"; 89190e98c52SGuangliang Zhao case OBJ_OP_DISCARD: 89290e98c52SGuangliang Zhao return "discard"; 8936484cbe9SIlya Dryomov case OBJ_OP_ZEROOUT: 8946484cbe9SIlya Dryomov return "zeroout"; 8956d2940c8SGuangliang Zhao default: 8966d2940c8SGuangliang Zhao return "???"; 8976d2940c8SGuangliang Zhao } 8986d2940c8SGuangliang Zhao } 8996d2940c8SGuangliang Zhao 90059c2be1eSYehuda Sadeh /* 901602adf40SYehuda Sadeh * Destroy ceph client 902d23a4b3fSAlex Elder * 903432b8587SAlex Elder * Caller must hold rbd_client_list_lock. 904602adf40SYehuda Sadeh */ 905602adf40SYehuda Sadeh static void rbd_client_release(struct kref *kref) 906602adf40SYehuda Sadeh { 907602adf40SYehuda Sadeh struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref); 908602adf40SYehuda Sadeh 90937206ee5SAlex Elder dout("%s: rbdc %p\n", __func__, rbdc); 910cd9d9f5dSAlex Elder spin_lock(&rbd_client_list_lock); 911602adf40SYehuda Sadeh list_del(&rbdc->node); 912cd9d9f5dSAlex Elder spin_unlock(&rbd_client_list_lock); 913602adf40SYehuda Sadeh 914602adf40SYehuda Sadeh ceph_destroy_client(rbdc->client); 915602adf40SYehuda Sadeh kfree(rbdc); 916602adf40SYehuda Sadeh } 917602adf40SYehuda Sadeh 918602adf40SYehuda Sadeh /* 919602adf40SYehuda Sadeh * Drop reference to ceph client node. If it's not referenced anymore, release 920602adf40SYehuda Sadeh * it. 921602adf40SYehuda Sadeh */ 9229d3997fdSAlex Elder static void rbd_put_client(struct rbd_client *rbdc) 923602adf40SYehuda Sadeh { 924c53d5893SAlex Elder if (rbdc) 9259d3997fdSAlex Elder kref_put(&rbdc->kref, rbd_client_release); 926602adf40SYehuda Sadeh } 927602adf40SYehuda Sadeh 928dd435855SIlya Dryomov static int wait_for_latest_osdmap(struct ceph_client *client) 929dd435855SIlya Dryomov { 930dd435855SIlya Dryomov u64 newest_epoch; 931dd435855SIlya Dryomov int ret; 932dd435855SIlya Dryomov 933dd435855SIlya Dryomov ret = ceph_monc_get_version(&client->monc, "osdmap", &newest_epoch); 934dd435855SIlya Dryomov if (ret) 935dd435855SIlya Dryomov return ret; 936dd435855SIlya Dryomov 937dd435855SIlya Dryomov if (client->osdc.osdmap->epoch >= newest_epoch) 938dd435855SIlya Dryomov return 0; 939dd435855SIlya Dryomov 940dd435855SIlya Dryomov ceph_osdc_maybe_request_map(&client->osdc); 941dd435855SIlya Dryomov return ceph_monc_wait_osdmap(&client->monc, newest_epoch, 942dd435855SIlya Dryomov client->options->mount_timeout); 943dd435855SIlya Dryomov } 944dd435855SIlya Dryomov 9455feb0d8dSIlya Dryomov /* 9465feb0d8dSIlya Dryomov * Get a ceph client with specific addr and configuration, if one does 9475feb0d8dSIlya Dryomov * not exist create it. Either way, ceph_opts is consumed by this 9485feb0d8dSIlya Dryomov * function. 9495feb0d8dSIlya Dryomov */ 9505feb0d8dSIlya Dryomov static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts) 9515feb0d8dSIlya Dryomov { 9525feb0d8dSIlya Dryomov struct rbd_client *rbdc; 953dd435855SIlya Dryomov int ret; 9545feb0d8dSIlya Dryomov 9555feb0d8dSIlya Dryomov mutex_lock_nested(&client_mutex, SINGLE_DEPTH_NESTING); 9565feb0d8dSIlya Dryomov rbdc = rbd_client_find(ceph_opts); 957dd435855SIlya Dryomov if (rbdc) { 9585feb0d8dSIlya Dryomov ceph_destroy_options(ceph_opts); 959dd435855SIlya Dryomov 960dd435855SIlya Dryomov /* 961dd435855SIlya Dryomov * Using an existing client. Make sure ->pg_pools is up to 962dd435855SIlya Dryomov * date before we look up the pool id in do_rbd_add(). 963dd435855SIlya Dryomov */ 964dd435855SIlya Dryomov ret = wait_for_latest_osdmap(rbdc->client); 965dd435855SIlya Dryomov if (ret) { 966dd435855SIlya Dryomov rbd_warn(NULL, "failed to get latest osdmap: %d", ret); 967dd435855SIlya Dryomov rbd_put_client(rbdc); 968dd435855SIlya Dryomov rbdc = ERR_PTR(ret); 969dd435855SIlya Dryomov } 970dd435855SIlya Dryomov } else { 9715feb0d8dSIlya Dryomov rbdc = rbd_client_create(ceph_opts); 972dd435855SIlya Dryomov } 9735feb0d8dSIlya Dryomov mutex_unlock(&client_mutex); 9745feb0d8dSIlya Dryomov 9755feb0d8dSIlya Dryomov return rbdc; 9765feb0d8dSIlya Dryomov } 9775feb0d8dSIlya Dryomov 978a30b71b9SAlex Elder static bool rbd_image_format_valid(u32 image_format) 979a30b71b9SAlex Elder { 980a30b71b9SAlex Elder return image_format == 1 || image_format == 2; 981a30b71b9SAlex Elder } 982a30b71b9SAlex Elder 9838e94af8eSAlex Elder static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk) 9848e94af8eSAlex Elder { 985103a150fSAlex Elder size_t size; 986103a150fSAlex Elder u32 snap_count; 987103a150fSAlex Elder 988103a150fSAlex Elder /* The header has to start with the magic rbd header text */ 989103a150fSAlex Elder if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT))) 990103a150fSAlex Elder return false; 991103a150fSAlex Elder 992db2388b6SAlex Elder /* The bio layer requires at least sector-sized I/O */ 993db2388b6SAlex Elder 994db2388b6SAlex Elder if (ondisk->options.order < SECTOR_SHIFT) 995db2388b6SAlex Elder return false; 996db2388b6SAlex Elder 997db2388b6SAlex Elder /* If we use u64 in a few spots we may be able to loosen this */ 998db2388b6SAlex Elder 999db2388b6SAlex Elder if (ondisk->options.order > 8 * sizeof (int) - 1) 1000db2388b6SAlex Elder return false; 1001db2388b6SAlex Elder 1002103a150fSAlex Elder /* 1003103a150fSAlex Elder * The size of a snapshot header has to fit in a size_t, and 1004103a150fSAlex Elder * that limits the number of snapshots. 1005103a150fSAlex Elder */ 1006103a150fSAlex Elder snap_count = le32_to_cpu(ondisk->snap_count); 1007103a150fSAlex Elder size = SIZE_MAX - sizeof (struct ceph_snap_context); 1008103a150fSAlex Elder if (snap_count > size / sizeof (__le64)) 1009103a150fSAlex Elder return false; 1010103a150fSAlex Elder 1011103a150fSAlex Elder /* 1012103a150fSAlex Elder * Not only that, but the size of the entire the snapshot 1013103a150fSAlex Elder * header must also be representable in a size_t. 1014103a150fSAlex Elder */ 1015103a150fSAlex Elder size -= snap_count * sizeof (__le64); 1016103a150fSAlex Elder if ((u64) size < le64_to_cpu(ondisk->snap_names_len)) 1017103a150fSAlex Elder return false; 1018103a150fSAlex Elder 1019103a150fSAlex Elder return true; 10208e94af8eSAlex Elder } 10218e94af8eSAlex Elder 1022602adf40SYehuda Sadeh /* 10235bc3fb17SIlya Dryomov * returns the size of an object in the image 10245bc3fb17SIlya Dryomov */ 10255bc3fb17SIlya Dryomov static u32 rbd_obj_bytes(struct rbd_image_header *header) 10265bc3fb17SIlya Dryomov { 10275bc3fb17SIlya Dryomov return 1U << header->obj_order; 10285bc3fb17SIlya Dryomov } 10295bc3fb17SIlya Dryomov 1030263423f8SIlya Dryomov static void rbd_init_layout(struct rbd_device *rbd_dev) 1031263423f8SIlya Dryomov { 1032263423f8SIlya Dryomov if (rbd_dev->header.stripe_unit == 0 || 1033263423f8SIlya Dryomov rbd_dev->header.stripe_count == 0) { 1034263423f8SIlya Dryomov rbd_dev->header.stripe_unit = rbd_obj_bytes(&rbd_dev->header); 1035263423f8SIlya Dryomov rbd_dev->header.stripe_count = 1; 1036263423f8SIlya Dryomov } 1037263423f8SIlya Dryomov 1038263423f8SIlya Dryomov rbd_dev->layout.stripe_unit = rbd_dev->header.stripe_unit; 1039263423f8SIlya Dryomov rbd_dev->layout.stripe_count = rbd_dev->header.stripe_count; 1040263423f8SIlya Dryomov rbd_dev->layout.object_size = rbd_obj_bytes(&rbd_dev->header); 10417e97332eSIlya Dryomov rbd_dev->layout.pool_id = rbd_dev->header.data_pool_id == CEPH_NOPOOL ? 10427e97332eSIlya Dryomov rbd_dev->spec->pool_id : rbd_dev->header.data_pool_id; 1043263423f8SIlya Dryomov RCU_INIT_POINTER(rbd_dev->layout.pool_ns, NULL); 1044263423f8SIlya Dryomov } 1045263423f8SIlya Dryomov 10465bc3fb17SIlya Dryomov /* 1047bb23e37aSAlex Elder * Fill an rbd image header with information from the given format 1 1048bb23e37aSAlex Elder * on-disk header. 1049602adf40SYehuda Sadeh */ 1050662518b1SAlex Elder static int rbd_header_from_disk(struct rbd_device *rbd_dev, 10514156d998SAlex Elder struct rbd_image_header_ondisk *ondisk) 1052602adf40SYehuda Sadeh { 1053662518b1SAlex Elder struct rbd_image_header *header = &rbd_dev->header; 1054bb23e37aSAlex Elder bool first_time = header->object_prefix == NULL; 1055bb23e37aSAlex Elder struct ceph_snap_context *snapc; 1056bb23e37aSAlex Elder char *object_prefix = NULL; 1057bb23e37aSAlex Elder char *snap_names = NULL; 1058bb23e37aSAlex Elder u64 *snap_sizes = NULL; 1059ccece235SAlex Elder u32 snap_count; 1060bb23e37aSAlex Elder int ret = -ENOMEM; 1061621901d6SAlex Elder u32 i; 1062602adf40SYehuda Sadeh 1063bb23e37aSAlex Elder /* Allocate this now to avoid having to handle failure below */ 1064103a150fSAlex Elder 1065bb23e37aSAlex Elder if (first_time) { 1066848d796cSIlya Dryomov object_prefix = kstrndup(ondisk->object_prefix, 1067848d796cSIlya Dryomov sizeof(ondisk->object_prefix), 1068848d796cSIlya Dryomov GFP_KERNEL); 1069bb23e37aSAlex Elder if (!object_prefix) 1070602adf40SYehuda Sadeh return -ENOMEM; 1071bb23e37aSAlex Elder } 107200f1f36fSAlex Elder 1073bb23e37aSAlex Elder /* Allocate the snapshot context and fill it in */ 1074d2bb24e5SAlex Elder 1075602adf40SYehuda Sadeh snap_count = le32_to_cpu(ondisk->snap_count); 1076bb23e37aSAlex Elder snapc = ceph_create_snap_context(snap_count, GFP_KERNEL); 1077bb23e37aSAlex Elder if (!snapc) 1078bb23e37aSAlex Elder goto out_err; 1079bb23e37aSAlex Elder snapc->seq = le64_to_cpu(ondisk->snap_seq); 1080602adf40SYehuda Sadeh if (snap_count) { 1081bb23e37aSAlex Elder struct rbd_image_snap_ondisk *snaps; 1082f785cc1dSAlex Elder u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len); 1083f785cc1dSAlex Elder 1084bb23e37aSAlex Elder /* We'll keep a copy of the snapshot names... */ 1085621901d6SAlex Elder 1086f785cc1dSAlex Elder if (snap_names_len > (u64)SIZE_MAX) 1087bb23e37aSAlex Elder goto out_2big; 1088bb23e37aSAlex Elder snap_names = kmalloc(snap_names_len, GFP_KERNEL); 1089bb23e37aSAlex Elder if (!snap_names) 1090602adf40SYehuda Sadeh goto out_err; 1091bb23e37aSAlex Elder 1092bb23e37aSAlex Elder /* ...as well as the array of their sizes. */ 109388a25a5fSMarkus Elfring snap_sizes = kmalloc_array(snap_count, 109488a25a5fSMarkus Elfring sizeof(*header->snap_sizes), 109588a25a5fSMarkus Elfring GFP_KERNEL); 1096bb23e37aSAlex Elder if (!snap_sizes) 1097bb23e37aSAlex Elder goto out_err; 1098bb23e37aSAlex Elder 1099f785cc1dSAlex Elder /* 1100bb23e37aSAlex Elder * Copy the names, and fill in each snapshot's id 1101bb23e37aSAlex Elder * and size. 1102bb23e37aSAlex Elder * 110399a41ebcSAlex Elder * Note that rbd_dev_v1_header_info() guarantees the 1104bb23e37aSAlex Elder * ondisk buffer we're working with has 1105f785cc1dSAlex Elder * snap_names_len bytes beyond the end of the 1106f785cc1dSAlex Elder * snapshot id array, this memcpy() is safe. 1107f785cc1dSAlex Elder */ 1108bb23e37aSAlex Elder memcpy(snap_names, &ondisk->snaps[snap_count], snap_names_len); 1109bb23e37aSAlex Elder snaps = ondisk->snaps; 1110bb23e37aSAlex Elder for (i = 0; i < snap_count; i++) { 1111bb23e37aSAlex Elder snapc->snaps[i] = le64_to_cpu(snaps[i].id); 1112bb23e37aSAlex Elder snap_sizes[i] = le64_to_cpu(snaps[i].image_size); 1113bb23e37aSAlex Elder } 1114602adf40SYehuda Sadeh } 1115849b4260SAlex Elder 1116bb23e37aSAlex Elder /* We won't fail any more, fill in the header */ 1117bb23e37aSAlex Elder 1118bb23e37aSAlex Elder if (first_time) { 1119bb23e37aSAlex Elder header->object_prefix = object_prefix; 1120602adf40SYehuda Sadeh header->obj_order = ondisk->options.order; 1121263423f8SIlya Dryomov rbd_init_layout(rbd_dev); 1122662518b1SAlex Elder } else { 1123662518b1SAlex Elder ceph_put_snap_context(header->snapc); 1124662518b1SAlex Elder kfree(header->snap_names); 1125662518b1SAlex Elder kfree(header->snap_sizes); 1126bb23e37aSAlex Elder } 11276a52325fSAlex Elder 1128bb23e37aSAlex Elder /* The remaining fields always get updated (when we refresh) */ 1129621901d6SAlex Elder 1130f84344f3SAlex Elder header->image_size = le64_to_cpu(ondisk->image_size); 1131bb23e37aSAlex Elder header->snapc = snapc; 1132bb23e37aSAlex Elder header->snap_names = snap_names; 1133bb23e37aSAlex Elder header->snap_sizes = snap_sizes; 1134468521c1SAlex Elder 1135602adf40SYehuda Sadeh return 0; 1136bb23e37aSAlex Elder out_2big: 1137bb23e37aSAlex Elder ret = -EIO; 11386a52325fSAlex Elder out_err: 1139bb23e37aSAlex Elder kfree(snap_sizes); 1140bb23e37aSAlex Elder kfree(snap_names); 1141bb23e37aSAlex Elder ceph_put_snap_context(snapc); 1142bb23e37aSAlex Elder kfree(object_prefix); 1143ccece235SAlex Elder 1144bb23e37aSAlex Elder return ret; 1145602adf40SYehuda Sadeh } 1146602adf40SYehuda Sadeh 11479682fc6dSAlex Elder static const char *_rbd_dev_v1_snap_name(struct rbd_device *rbd_dev, u32 which) 11489682fc6dSAlex Elder { 11499682fc6dSAlex Elder const char *snap_name; 11509682fc6dSAlex Elder 11519682fc6dSAlex Elder rbd_assert(which < rbd_dev->header.snapc->num_snaps); 11529682fc6dSAlex Elder 11539682fc6dSAlex Elder /* Skip over names until we find the one we are looking for */ 11549682fc6dSAlex Elder 11559682fc6dSAlex Elder snap_name = rbd_dev->header.snap_names; 11569682fc6dSAlex Elder while (which--) 11579682fc6dSAlex Elder snap_name += strlen(snap_name) + 1; 11589682fc6dSAlex Elder 11599682fc6dSAlex Elder return kstrdup(snap_name, GFP_KERNEL); 11609682fc6dSAlex Elder } 11619682fc6dSAlex Elder 116230d1cff8SAlex Elder /* 116330d1cff8SAlex Elder * Snapshot id comparison function for use with qsort()/bsearch(). 116430d1cff8SAlex Elder * Note that result is for snapshots in *descending* order. 116530d1cff8SAlex Elder */ 116630d1cff8SAlex Elder static int snapid_compare_reverse(const void *s1, const void *s2) 116730d1cff8SAlex Elder { 116830d1cff8SAlex Elder u64 snap_id1 = *(u64 *)s1; 116930d1cff8SAlex Elder u64 snap_id2 = *(u64 *)s2; 117030d1cff8SAlex Elder 117130d1cff8SAlex Elder if (snap_id1 < snap_id2) 117230d1cff8SAlex Elder return 1; 117330d1cff8SAlex Elder return snap_id1 == snap_id2 ? 0 : -1; 117430d1cff8SAlex Elder } 117530d1cff8SAlex Elder 117630d1cff8SAlex Elder /* 117730d1cff8SAlex Elder * Search a snapshot context to see if the given snapshot id is 117830d1cff8SAlex Elder * present. 117930d1cff8SAlex Elder * 118030d1cff8SAlex Elder * Returns the position of the snapshot id in the array if it's found, 118130d1cff8SAlex Elder * or BAD_SNAP_INDEX otherwise. 118230d1cff8SAlex Elder * 118330d1cff8SAlex Elder * Note: The snapshot array is in kept sorted (by the osd) in 118430d1cff8SAlex Elder * reverse order, highest snapshot id first. 118530d1cff8SAlex Elder */ 11869682fc6dSAlex Elder static u32 rbd_dev_snap_index(struct rbd_device *rbd_dev, u64 snap_id) 11879682fc6dSAlex Elder { 11889682fc6dSAlex Elder struct ceph_snap_context *snapc = rbd_dev->header.snapc; 118930d1cff8SAlex Elder u64 *found; 11909682fc6dSAlex Elder 119130d1cff8SAlex Elder found = bsearch(&snap_id, &snapc->snaps, snapc->num_snaps, 119230d1cff8SAlex Elder sizeof (snap_id), snapid_compare_reverse); 11939682fc6dSAlex Elder 119430d1cff8SAlex Elder return found ? (u32)(found - &snapc->snaps[0]) : BAD_SNAP_INDEX; 11959682fc6dSAlex Elder } 11969682fc6dSAlex Elder 11972ad3d716SAlex Elder static const char *rbd_dev_v1_snap_name(struct rbd_device *rbd_dev, 11982ad3d716SAlex Elder u64 snap_id) 119954cac61fSAlex Elder { 120054cac61fSAlex Elder u32 which; 1201da6a6b63SJosh Durgin const char *snap_name; 120254cac61fSAlex Elder 120354cac61fSAlex Elder which = rbd_dev_snap_index(rbd_dev, snap_id); 120454cac61fSAlex Elder if (which == BAD_SNAP_INDEX) 1205da6a6b63SJosh Durgin return ERR_PTR(-ENOENT); 120654cac61fSAlex Elder 1207da6a6b63SJosh Durgin snap_name = _rbd_dev_v1_snap_name(rbd_dev, which); 1208da6a6b63SJosh Durgin return snap_name ? snap_name : ERR_PTR(-ENOMEM); 120954cac61fSAlex Elder } 121054cac61fSAlex Elder 12119e15b77dSAlex Elder static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id) 12129e15b77dSAlex Elder { 12139e15b77dSAlex Elder if (snap_id == CEPH_NOSNAP) 12149e15b77dSAlex Elder return RBD_SNAP_HEAD_NAME; 12159e15b77dSAlex Elder 121654cac61fSAlex Elder rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); 121754cac61fSAlex Elder if (rbd_dev->image_format == 1) 121854cac61fSAlex Elder return rbd_dev_v1_snap_name(rbd_dev, snap_id); 12199e15b77dSAlex Elder 122054cac61fSAlex Elder return rbd_dev_v2_snap_name(rbd_dev, snap_id); 12219e15b77dSAlex Elder } 12229e15b77dSAlex Elder 12232ad3d716SAlex Elder static int rbd_snap_size(struct rbd_device *rbd_dev, u64 snap_id, 12242ad3d716SAlex Elder u64 *snap_size) 1225602adf40SYehuda Sadeh { 12262ad3d716SAlex Elder rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); 12272ad3d716SAlex Elder if (snap_id == CEPH_NOSNAP) { 12282ad3d716SAlex Elder *snap_size = rbd_dev->header.image_size; 12292ad3d716SAlex Elder } else if (rbd_dev->image_format == 1) { 12302ad3d716SAlex Elder u32 which; 123100f1f36fSAlex Elder 12322ad3d716SAlex Elder which = rbd_dev_snap_index(rbd_dev, snap_id); 12332ad3d716SAlex Elder if (which == BAD_SNAP_INDEX) 12342ad3d716SAlex Elder return -ENOENT; 123500f1f36fSAlex Elder 12362ad3d716SAlex Elder *snap_size = rbd_dev->header.snap_sizes[which]; 12372ad3d716SAlex Elder } else { 12382ad3d716SAlex Elder u64 size = 0; 12392ad3d716SAlex Elder int ret; 12402ad3d716SAlex Elder 12412ad3d716SAlex Elder ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, NULL, &size); 12422ad3d716SAlex Elder if (ret) 12432ad3d716SAlex Elder return ret; 12442ad3d716SAlex Elder 12452ad3d716SAlex Elder *snap_size = size; 12462ad3d716SAlex Elder } 12472ad3d716SAlex Elder return 0; 12482ad3d716SAlex Elder } 12492ad3d716SAlex Elder 12502ad3d716SAlex Elder static int rbd_snap_features(struct rbd_device *rbd_dev, u64 snap_id, 12512ad3d716SAlex Elder u64 *snap_features) 12522ad3d716SAlex Elder { 12532ad3d716SAlex Elder rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); 12542ad3d716SAlex Elder if (snap_id == CEPH_NOSNAP) { 12552ad3d716SAlex Elder *snap_features = rbd_dev->header.features; 12562ad3d716SAlex Elder } else if (rbd_dev->image_format == 1) { 12572ad3d716SAlex Elder *snap_features = 0; /* No features for format 1 */ 12582ad3d716SAlex Elder } else { 12592ad3d716SAlex Elder u64 features = 0; 12602ad3d716SAlex Elder int ret; 12612ad3d716SAlex Elder 12622ad3d716SAlex Elder ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, &features); 12632ad3d716SAlex Elder if (ret) 12642ad3d716SAlex Elder return ret; 12652ad3d716SAlex Elder 12662ad3d716SAlex Elder *snap_features = features; 12672ad3d716SAlex Elder } 12682ad3d716SAlex Elder return 0; 126900f1f36fSAlex Elder } 1270602adf40SYehuda Sadeh 1271d1cf5788SAlex Elder static int rbd_dev_mapping_set(struct rbd_device *rbd_dev) 1272602adf40SYehuda Sadeh { 12738f4b7d98SAlex Elder u64 snap_id = rbd_dev->spec->snap_id; 12742ad3d716SAlex Elder u64 size = 0; 12752ad3d716SAlex Elder u64 features = 0; 12762ad3d716SAlex Elder int ret; 12778b0241f8SAlex Elder 12782ad3d716SAlex Elder ret = rbd_snap_size(rbd_dev, snap_id, &size); 12792ad3d716SAlex Elder if (ret) 12802ad3d716SAlex Elder return ret; 12812ad3d716SAlex Elder ret = rbd_snap_features(rbd_dev, snap_id, &features); 12822ad3d716SAlex Elder if (ret) 12832ad3d716SAlex Elder return ret; 12842ad3d716SAlex Elder 12852ad3d716SAlex Elder rbd_dev->mapping.size = size; 12862ad3d716SAlex Elder rbd_dev->mapping.features = features; 12872ad3d716SAlex Elder 12888b0241f8SAlex Elder return 0; 1289602adf40SYehuda Sadeh } 1290602adf40SYehuda Sadeh 1291d1cf5788SAlex Elder static void rbd_dev_mapping_clear(struct rbd_device *rbd_dev) 1292d1cf5788SAlex Elder { 1293d1cf5788SAlex Elder rbd_dev->mapping.size = 0; 1294d1cf5788SAlex Elder rbd_dev->mapping.features = 0; 1295200a6a8bSAlex Elder } 1296200a6a8bSAlex Elder 12975359a17dSIlya Dryomov static void zero_bvec(struct bio_vec *bv) 129865ccfe21SAlex Elder { 1299602adf40SYehuda Sadeh void *buf; 13005359a17dSIlya Dryomov unsigned long flags; 1301602adf40SYehuda Sadeh 13025359a17dSIlya Dryomov buf = bvec_kmap_irq(bv, &flags); 13035359a17dSIlya Dryomov memset(buf, 0, bv->bv_len); 13045359a17dSIlya Dryomov flush_dcache_page(bv->bv_page); 130585b5aaa6SDan Carpenter bvec_kunmap_irq(buf, &flags); 1306602adf40SYehuda Sadeh } 1307602adf40SYehuda Sadeh 13085359a17dSIlya Dryomov static void zero_bios(struct ceph_bio_iter *bio_pos, u32 off, u32 bytes) 1309b9434c5bSAlex Elder { 13105359a17dSIlya Dryomov struct ceph_bio_iter it = *bio_pos; 1311b9434c5bSAlex Elder 13125359a17dSIlya Dryomov ceph_bio_iter_advance(&it, off); 13135359a17dSIlya Dryomov ceph_bio_iter_advance_step(&it, bytes, ({ 13145359a17dSIlya Dryomov zero_bvec(&bv); 13155359a17dSIlya Dryomov })); 1316b9434c5bSAlex Elder } 1317b9434c5bSAlex Elder 13187e07efb1SIlya Dryomov static void zero_bvecs(struct ceph_bvec_iter *bvec_pos, u32 off, u32 bytes) 1319602adf40SYehuda Sadeh { 13207e07efb1SIlya Dryomov struct ceph_bvec_iter it = *bvec_pos; 1321602adf40SYehuda Sadeh 13227e07efb1SIlya Dryomov ceph_bvec_iter_advance(&it, off); 13237e07efb1SIlya Dryomov ceph_bvec_iter_advance_step(&it, bytes, ({ 13247e07efb1SIlya Dryomov zero_bvec(&bv); 13257e07efb1SIlya Dryomov })); 1326602adf40SYehuda Sadeh } 1327602adf40SYehuda Sadeh 1328f7760dadSAlex Elder /* 13293da691bfSIlya Dryomov * Zero a range in @obj_req data buffer defined by a bio (list) or 1330afb97888SIlya Dryomov * (private) bio_vec array. 1331f7760dadSAlex Elder * 13323da691bfSIlya Dryomov * @off is relative to the start of the data buffer. 1333f7760dadSAlex Elder */ 13343da691bfSIlya Dryomov static void rbd_obj_zero_range(struct rbd_obj_request *obj_req, u32 off, 13353da691bfSIlya Dryomov u32 bytes) 1336f7760dadSAlex Elder { 1337ecc633caSIlya Dryomov switch (obj_req->img_request->data_type) { 13383da691bfSIlya Dryomov case OBJ_REQUEST_BIO: 13393da691bfSIlya Dryomov zero_bios(&obj_req->bio_pos, off, bytes); 13403da691bfSIlya Dryomov break; 13413da691bfSIlya Dryomov case OBJ_REQUEST_BVECS: 1342afb97888SIlya Dryomov case OBJ_REQUEST_OWN_BVECS: 13433da691bfSIlya Dryomov zero_bvecs(&obj_req->bvec_pos, off, bytes); 13443da691bfSIlya Dryomov break; 13453da691bfSIlya Dryomov default: 13463da691bfSIlya Dryomov rbd_assert(0); 1347f5400b7aSAlex Elder } 1348bf0d5f50SAlex Elder } 1349bf0d5f50SAlex Elder 1350bf0d5f50SAlex Elder static void rbd_obj_request_destroy(struct kref *kref); 1351bf0d5f50SAlex Elder static void rbd_obj_request_put(struct rbd_obj_request *obj_request) 1352bf0d5f50SAlex Elder { 1353bf0d5f50SAlex Elder rbd_assert(obj_request != NULL); 135437206ee5SAlex Elder dout("%s: obj %p (was %d)\n", __func__, obj_request, 13552c935bc5SPeter Zijlstra kref_read(&obj_request->kref)); 1356bf0d5f50SAlex Elder kref_put(&obj_request->kref, rbd_obj_request_destroy); 1357bf0d5f50SAlex Elder } 1358bf0d5f50SAlex Elder 13590f2d5be7SAlex Elder static void rbd_img_request_get(struct rbd_img_request *img_request) 13600f2d5be7SAlex Elder { 13610f2d5be7SAlex Elder dout("%s: img %p (was %d)\n", __func__, img_request, 13622c935bc5SPeter Zijlstra kref_read(&img_request->kref)); 13630f2d5be7SAlex Elder kref_get(&img_request->kref); 13640f2d5be7SAlex Elder } 13650f2d5be7SAlex Elder 1366bf0d5f50SAlex Elder static void rbd_img_request_destroy(struct kref *kref); 1367bf0d5f50SAlex Elder static void rbd_img_request_put(struct rbd_img_request *img_request) 1368bf0d5f50SAlex Elder { 1369bf0d5f50SAlex Elder rbd_assert(img_request != NULL); 137037206ee5SAlex Elder dout("%s: img %p (was %d)\n", __func__, img_request, 13712c935bc5SPeter Zijlstra kref_read(&img_request->kref)); 1372bf0d5f50SAlex Elder kref_put(&img_request->kref, rbd_img_request_destroy); 1373bf0d5f50SAlex Elder } 1374bf0d5f50SAlex Elder 1375bf0d5f50SAlex Elder static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request, 1376bf0d5f50SAlex Elder struct rbd_obj_request *obj_request) 1377bf0d5f50SAlex Elder { 137825dcf954SAlex Elder rbd_assert(obj_request->img_request == NULL); 137925dcf954SAlex Elder 1380b155e86cSAlex Elder /* Image request now owns object's original reference */ 1381bf0d5f50SAlex Elder obj_request->img_request = img_request; 13827114edacSIlya Dryomov img_request->pending_count++; 138315961b44SIlya Dryomov dout("%s: img %p obj %p\n", __func__, img_request, obj_request); 1384bf0d5f50SAlex Elder } 1385bf0d5f50SAlex Elder 1386bf0d5f50SAlex Elder static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request, 1387bf0d5f50SAlex Elder struct rbd_obj_request *obj_request) 1388bf0d5f50SAlex Elder { 138915961b44SIlya Dryomov dout("%s: img %p obj %p\n", __func__, img_request, obj_request); 139043df3d35SIlya Dryomov list_del(&obj_request->ex.oe_item); 1391bf0d5f50SAlex Elder rbd_assert(obj_request->img_request == img_request); 1392bf0d5f50SAlex Elder rbd_obj_request_put(obj_request); 1393bf0d5f50SAlex Elder } 1394bf0d5f50SAlex Elder 1395980917fcSIlya Dryomov static void rbd_obj_request_submit(struct rbd_obj_request *obj_request) 1396bf0d5f50SAlex Elder { 1397980917fcSIlya Dryomov struct ceph_osd_request *osd_req = obj_request->osd_req; 1398980917fcSIlya Dryomov 1399a90bb0c1SIlya Dryomov dout("%s %p object_no %016llx %llu~%llu osd_req %p\n", __func__, 140043df3d35SIlya Dryomov obj_request, obj_request->ex.oe_objno, obj_request->ex.oe_off, 140143df3d35SIlya Dryomov obj_request->ex.oe_len, osd_req); 1402980917fcSIlya Dryomov ceph_osdc_start_request(osd_req->r_osdc, osd_req, false); 1403bf0d5f50SAlex Elder } 1404bf0d5f50SAlex Elder 14050c425248SAlex Elder /* 14060c425248SAlex Elder * The default/initial value for all image request flags is 0. Each 14070c425248SAlex Elder * is conditionally set to 1 at image request initialization time 14080c425248SAlex Elder * and currently never change thereafter. 14090c425248SAlex Elder */ 1410d0b2e944SAlex Elder static void img_request_layered_set(struct rbd_img_request *img_request) 1411d0b2e944SAlex Elder { 1412d0b2e944SAlex Elder set_bit(IMG_REQ_LAYERED, &img_request->flags); 1413d0b2e944SAlex Elder smp_mb(); 1414d0b2e944SAlex Elder } 1415d0b2e944SAlex Elder 1416a2acd00eSAlex Elder static void img_request_layered_clear(struct rbd_img_request *img_request) 1417a2acd00eSAlex Elder { 1418a2acd00eSAlex Elder clear_bit(IMG_REQ_LAYERED, &img_request->flags); 1419a2acd00eSAlex Elder smp_mb(); 1420a2acd00eSAlex Elder } 1421a2acd00eSAlex Elder 1422d0b2e944SAlex Elder static bool img_request_layered_test(struct rbd_img_request *img_request) 1423d0b2e944SAlex Elder { 1424d0b2e944SAlex Elder smp_mb(); 1425d0b2e944SAlex Elder return test_bit(IMG_REQ_LAYERED, &img_request->flags) != 0; 1426d0b2e944SAlex Elder } 1427d0b2e944SAlex Elder 14283da691bfSIlya Dryomov static bool rbd_obj_is_entire(struct rbd_obj_request *obj_req) 14293b434a2aSJosh Durgin { 14303da691bfSIlya Dryomov struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev; 14313da691bfSIlya Dryomov 143243df3d35SIlya Dryomov return !obj_req->ex.oe_off && 143343df3d35SIlya Dryomov obj_req->ex.oe_len == rbd_dev->layout.object_size; 14343b434a2aSJosh Durgin } 14353b434a2aSJosh Durgin 14363da691bfSIlya Dryomov static bool rbd_obj_is_tail(struct rbd_obj_request *obj_req) 14376e2a4505SAlex Elder { 14383da691bfSIlya Dryomov struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev; 1439b9434c5bSAlex Elder 144043df3d35SIlya Dryomov return obj_req->ex.oe_off + obj_req->ex.oe_len == 14413da691bfSIlya Dryomov rbd_dev->layout.object_size; 14426e2a4505SAlex Elder } 14436e2a4505SAlex Elder 144413488d53SIlya Dryomov /* 144513488d53SIlya Dryomov * Must be called after rbd_obj_calc_img_extents(). 144613488d53SIlya Dryomov */ 144713488d53SIlya Dryomov static bool rbd_obj_copyup_enabled(struct rbd_obj_request *obj_req) 144813488d53SIlya Dryomov { 144913488d53SIlya Dryomov if (!obj_req->num_img_extents || 14509b17eb2cSIlya Dryomov (rbd_obj_is_entire(obj_req) && 14519b17eb2cSIlya Dryomov !obj_req->img_request->snapc->num_snaps)) 145213488d53SIlya Dryomov return false; 145313488d53SIlya Dryomov 145413488d53SIlya Dryomov return true; 145513488d53SIlya Dryomov } 145613488d53SIlya Dryomov 145786bd7998SIlya Dryomov static u64 rbd_obj_img_extents_bytes(struct rbd_obj_request *obj_req) 1458bf0d5f50SAlex Elder { 145986bd7998SIlya Dryomov return ceph_file_extents_bytes(obj_req->img_extents, 146086bd7998SIlya Dryomov obj_req->num_img_extents); 1461bf0d5f50SAlex Elder } 1462bf0d5f50SAlex Elder 14633da691bfSIlya Dryomov static bool rbd_img_is_write(struct rbd_img_request *img_req) 14640dcc685eSIlya Dryomov { 14659bb0248dSIlya Dryomov switch (img_req->op_type) { 14663da691bfSIlya Dryomov case OBJ_OP_READ: 14673da691bfSIlya Dryomov return false; 14683da691bfSIlya Dryomov case OBJ_OP_WRITE: 14693da691bfSIlya Dryomov case OBJ_OP_DISCARD: 14706484cbe9SIlya Dryomov case OBJ_OP_ZEROOUT: 14713da691bfSIlya Dryomov return true; 14723da691bfSIlya Dryomov default: 1473c6244b3bSArnd Bergmann BUG(); 14740dcc685eSIlya Dryomov } 14750dcc685eSIlya Dryomov } 14760dcc685eSIlya Dryomov 14773da691bfSIlya Dryomov static void rbd_obj_handle_request(struct rbd_obj_request *obj_req); 14782761713dSIlya Dryomov 147985e084feSIlya Dryomov static void rbd_osd_req_callback(struct ceph_osd_request *osd_req) 1480bf0d5f50SAlex Elder { 14813da691bfSIlya Dryomov struct rbd_obj_request *obj_req = osd_req->r_priv; 1482bf0d5f50SAlex Elder 14833da691bfSIlya Dryomov dout("%s osd_req %p result %d for obj_req %p\n", __func__, osd_req, 14843da691bfSIlya Dryomov osd_req->r_result, obj_req); 14853da691bfSIlya Dryomov rbd_assert(osd_req == obj_req->osd_req); 1486bf0d5f50SAlex Elder 14873da691bfSIlya Dryomov obj_req->result = osd_req->r_result < 0 ? osd_req->r_result : 0; 14883da691bfSIlya Dryomov if (!obj_req->result && !rbd_img_is_write(obj_req->img_request)) 14893da691bfSIlya Dryomov obj_req->xferred = osd_req->r_result; 14903da691bfSIlya Dryomov else 1491c47f9371SAlex Elder /* 14923da691bfSIlya Dryomov * Writes aren't allowed to return a data payload. In some 14933da691bfSIlya Dryomov * guarded write cases (e.g. stat + zero on an empty object) 14943da691bfSIlya Dryomov * a stat response makes it through, but we don't care. 1495c47f9371SAlex Elder */ 14963da691bfSIlya Dryomov obj_req->xferred = 0; 14970ccd5926SIlya Dryomov 14983da691bfSIlya Dryomov rbd_obj_handle_request(obj_req); 1499bf0d5f50SAlex Elder } 1500bf0d5f50SAlex Elder 15019d4df01fSAlex Elder static void rbd_osd_req_format_read(struct rbd_obj_request *obj_request) 1502430c28c3SAlex Elder { 15038c042b0dSAlex Elder struct ceph_osd_request *osd_req = obj_request->osd_req; 1504430c28c3SAlex Elder 1505a162b308SIlya Dryomov osd_req->r_flags = CEPH_OSD_FLAG_READ; 15067c84883aSIlya Dryomov osd_req->r_snapid = obj_request->img_request->snap_id; 15079d4df01fSAlex Elder } 15089d4df01fSAlex Elder 15099d4df01fSAlex Elder static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request) 15109d4df01fSAlex Elder { 15119d4df01fSAlex Elder struct ceph_osd_request *osd_req = obj_request->osd_req; 15129d4df01fSAlex Elder 1513a162b308SIlya Dryomov osd_req->r_flags = CEPH_OSD_FLAG_WRITE; 1514fac02ddfSArnd Bergmann ktime_get_real_ts64(&osd_req->r_mtime); 151543df3d35SIlya Dryomov osd_req->r_data_offset = obj_request->ex.oe_off; 1516430c28c3SAlex Elder } 1517430c28c3SAlex Elder 1518bc81207eSIlya Dryomov static struct ceph_osd_request * 1519e28eded5SIlya Dryomov __rbd_osd_req_create(struct rbd_obj_request *obj_req, 1520e28eded5SIlya Dryomov struct ceph_snap_context *snapc, unsigned int num_ops) 1521bc81207eSIlya Dryomov { 1522e28eded5SIlya Dryomov struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev; 1523bc81207eSIlya Dryomov struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 1524bc81207eSIlya Dryomov struct ceph_osd_request *req; 1525a90bb0c1SIlya Dryomov const char *name_format = rbd_dev->image_format == 1 ? 1526a90bb0c1SIlya Dryomov RBD_V1_DATA_FORMAT : RBD_V2_DATA_FORMAT; 1527bc81207eSIlya Dryomov 1528e28eded5SIlya Dryomov req = ceph_osdc_alloc_request(osdc, snapc, num_ops, false, GFP_NOIO); 1529bc81207eSIlya Dryomov if (!req) 1530bc81207eSIlya Dryomov return NULL; 1531bc81207eSIlya Dryomov 1532bc81207eSIlya Dryomov req->r_callback = rbd_osd_req_callback; 1533a162b308SIlya Dryomov req->r_priv = obj_req; 1534bc81207eSIlya Dryomov 1535b26c047bSIlya Dryomov /* 1536b26c047bSIlya Dryomov * Data objects may be stored in a separate pool, but always in 1537b26c047bSIlya Dryomov * the same namespace in that pool as the header in its pool. 1538b26c047bSIlya Dryomov */ 1539b26c047bSIlya Dryomov ceph_oloc_copy(&req->r_base_oloc, &rbd_dev->header_oloc); 1540bc81207eSIlya Dryomov req->r_base_oloc.pool = rbd_dev->layout.pool_id; 1541b26c047bSIlya Dryomov 1542a90bb0c1SIlya Dryomov if (ceph_oid_aprintf(&req->r_base_oid, GFP_NOIO, name_format, 154343df3d35SIlya Dryomov rbd_dev->header.object_prefix, obj_req->ex.oe_objno)) 1544bc81207eSIlya Dryomov goto err_req; 1545bc81207eSIlya Dryomov 1546bc81207eSIlya Dryomov return req; 1547bc81207eSIlya Dryomov 1548bc81207eSIlya Dryomov err_req: 1549bc81207eSIlya Dryomov ceph_osdc_put_request(req); 1550bc81207eSIlya Dryomov return NULL; 1551bc81207eSIlya Dryomov } 1552bc81207eSIlya Dryomov 1553e28eded5SIlya Dryomov static struct ceph_osd_request * 1554e28eded5SIlya Dryomov rbd_osd_req_create(struct rbd_obj_request *obj_req, unsigned int num_ops) 1555e28eded5SIlya Dryomov { 1556e28eded5SIlya Dryomov return __rbd_osd_req_create(obj_req, obj_req->img_request->snapc, 1557e28eded5SIlya Dryomov num_ops); 1558e28eded5SIlya Dryomov } 1559e28eded5SIlya Dryomov 1560bf0d5f50SAlex Elder static void rbd_osd_req_destroy(struct ceph_osd_request *osd_req) 1561bf0d5f50SAlex Elder { 1562bf0d5f50SAlex Elder ceph_osdc_put_request(osd_req); 1563bf0d5f50SAlex Elder } 1564bf0d5f50SAlex Elder 1565ecc633caSIlya Dryomov static struct rbd_obj_request *rbd_obj_request_create(void) 1566bf0d5f50SAlex Elder { 1567bf0d5f50SAlex Elder struct rbd_obj_request *obj_request; 1568bf0d5f50SAlex Elder 15695a60e876SIlya Dryomov obj_request = kmem_cache_zalloc(rbd_obj_request_cache, GFP_NOIO); 15706c696d85SIlya Dryomov if (!obj_request) 1571f907ad55SAlex Elder return NULL; 1572f907ad55SAlex Elder 157343df3d35SIlya Dryomov ceph_object_extent_init(&obj_request->ex); 1574bf0d5f50SAlex Elder kref_init(&obj_request->kref); 1575bf0d5f50SAlex Elder 157667e2b652SIlya Dryomov dout("%s %p\n", __func__, obj_request); 1577bf0d5f50SAlex Elder return obj_request; 1578bf0d5f50SAlex Elder } 1579bf0d5f50SAlex Elder 1580bf0d5f50SAlex Elder static void rbd_obj_request_destroy(struct kref *kref) 1581bf0d5f50SAlex Elder { 1582bf0d5f50SAlex Elder struct rbd_obj_request *obj_request; 15837e07efb1SIlya Dryomov u32 i; 1584bf0d5f50SAlex Elder 1585bf0d5f50SAlex Elder obj_request = container_of(kref, struct rbd_obj_request, kref); 1586bf0d5f50SAlex Elder 158737206ee5SAlex Elder dout("%s: obj %p\n", __func__, obj_request); 158837206ee5SAlex Elder 1589bf0d5f50SAlex Elder if (obj_request->osd_req) 1590bf0d5f50SAlex Elder rbd_osd_req_destroy(obj_request->osd_req); 1591bf0d5f50SAlex Elder 1592ecc633caSIlya Dryomov switch (obj_request->img_request->data_type) { 15939969ebc5SAlex Elder case OBJ_REQUEST_NODATA: 1594bf0d5f50SAlex Elder case OBJ_REQUEST_BIO: 15957e07efb1SIlya Dryomov case OBJ_REQUEST_BVECS: 15965359a17dSIlya Dryomov break; /* Nothing to do */ 1597afb97888SIlya Dryomov case OBJ_REQUEST_OWN_BVECS: 1598afb97888SIlya Dryomov kfree(obj_request->bvec_pos.bvecs); 1599bf0d5f50SAlex Elder break; 16007e07efb1SIlya Dryomov default: 16017e07efb1SIlya Dryomov rbd_assert(0); 1602bf0d5f50SAlex Elder } 1603bf0d5f50SAlex Elder 160486bd7998SIlya Dryomov kfree(obj_request->img_extents); 16057e07efb1SIlya Dryomov if (obj_request->copyup_bvecs) { 16067e07efb1SIlya Dryomov for (i = 0; i < obj_request->copyup_bvec_count; i++) { 16077e07efb1SIlya Dryomov if (obj_request->copyup_bvecs[i].bv_page) 16087e07efb1SIlya Dryomov __free_page(obj_request->copyup_bvecs[i].bv_page); 16097e07efb1SIlya Dryomov } 16107e07efb1SIlya Dryomov kfree(obj_request->copyup_bvecs); 1611bf0d5f50SAlex Elder } 1612bf0d5f50SAlex Elder 1613868311b1SAlex Elder kmem_cache_free(rbd_obj_request_cache, obj_request); 1614bf0d5f50SAlex Elder } 1615bf0d5f50SAlex Elder 1616fb65d228SAlex Elder /* It's OK to call this for a device with no parent */ 1617fb65d228SAlex Elder 1618fb65d228SAlex Elder static void rbd_spec_put(struct rbd_spec *spec); 1619fb65d228SAlex Elder static void rbd_dev_unparent(struct rbd_device *rbd_dev) 1620fb65d228SAlex Elder { 1621fb65d228SAlex Elder rbd_dev_remove_parent(rbd_dev); 1622fb65d228SAlex Elder rbd_spec_put(rbd_dev->parent_spec); 1623fb65d228SAlex Elder rbd_dev->parent_spec = NULL; 1624fb65d228SAlex Elder rbd_dev->parent_overlap = 0; 1625fb65d228SAlex Elder } 1626fb65d228SAlex Elder 1627bf0d5f50SAlex Elder /* 1628a2acd00eSAlex Elder * Parent image reference counting is used to determine when an 1629a2acd00eSAlex Elder * image's parent fields can be safely torn down--after there are no 1630a2acd00eSAlex Elder * more in-flight requests to the parent image. When the last 1631a2acd00eSAlex Elder * reference is dropped, cleaning them up is safe. 1632a2acd00eSAlex Elder */ 1633a2acd00eSAlex Elder static void rbd_dev_parent_put(struct rbd_device *rbd_dev) 1634a2acd00eSAlex Elder { 1635a2acd00eSAlex Elder int counter; 1636a2acd00eSAlex Elder 1637a2acd00eSAlex Elder if (!rbd_dev->parent_spec) 1638a2acd00eSAlex Elder return; 1639a2acd00eSAlex Elder 1640a2acd00eSAlex Elder counter = atomic_dec_return_safe(&rbd_dev->parent_ref); 1641a2acd00eSAlex Elder if (counter > 0) 1642a2acd00eSAlex Elder return; 1643a2acd00eSAlex Elder 1644a2acd00eSAlex Elder /* Last reference; clean up parent data structures */ 1645a2acd00eSAlex Elder 1646a2acd00eSAlex Elder if (!counter) 1647a2acd00eSAlex Elder rbd_dev_unparent(rbd_dev); 1648a2acd00eSAlex Elder else 16499584d508SIlya Dryomov rbd_warn(rbd_dev, "parent reference underflow"); 1650a2acd00eSAlex Elder } 1651a2acd00eSAlex Elder 1652a2acd00eSAlex Elder /* 1653a2acd00eSAlex Elder * If an image has a non-zero parent overlap, get a reference to its 1654a2acd00eSAlex Elder * parent. 1655a2acd00eSAlex Elder * 1656a2acd00eSAlex Elder * Returns true if the rbd device has a parent with a non-zero 1657a2acd00eSAlex Elder * overlap and a reference for it was successfully taken, or 1658a2acd00eSAlex Elder * false otherwise. 1659a2acd00eSAlex Elder */ 1660a2acd00eSAlex Elder static bool rbd_dev_parent_get(struct rbd_device *rbd_dev) 1661a2acd00eSAlex Elder { 1662ae43e9d0SIlya Dryomov int counter = 0; 1663a2acd00eSAlex Elder 1664a2acd00eSAlex Elder if (!rbd_dev->parent_spec) 1665a2acd00eSAlex Elder return false; 1666a2acd00eSAlex Elder 1667ae43e9d0SIlya Dryomov down_read(&rbd_dev->header_rwsem); 1668ae43e9d0SIlya Dryomov if (rbd_dev->parent_overlap) 1669a2acd00eSAlex Elder counter = atomic_inc_return_safe(&rbd_dev->parent_ref); 1670ae43e9d0SIlya Dryomov up_read(&rbd_dev->header_rwsem); 1671a2acd00eSAlex Elder 1672a2acd00eSAlex Elder if (counter < 0) 16739584d508SIlya Dryomov rbd_warn(rbd_dev, "parent reference overflow"); 1674a2acd00eSAlex Elder 1675ae43e9d0SIlya Dryomov return counter > 0; 1676a2acd00eSAlex Elder } 1677a2acd00eSAlex Elder 1678bf0d5f50SAlex Elder /* 1679bf0d5f50SAlex Elder * Caller is responsible for filling in the list of object requests 1680bf0d5f50SAlex Elder * that comprises the image request, and the Linux request pointer 1681bf0d5f50SAlex Elder * (if there is one). 1682bf0d5f50SAlex Elder */ 1683cc344fa1SAlex Elder static struct rbd_img_request *rbd_img_request_create( 1684cc344fa1SAlex Elder struct rbd_device *rbd_dev, 16856d2940c8SGuangliang Zhao enum obj_operation_type op_type, 16864e752f0aSJosh Durgin struct ceph_snap_context *snapc) 1687bf0d5f50SAlex Elder { 1688bf0d5f50SAlex Elder struct rbd_img_request *img_request; 1689bf0d5f50SAlex Elder 1690a0c5895bSIlya Dryomov img_request = kmem_cache_zalloc(rbd_img_request_cache, GFP_NOIO); 1691bf0d5f50SAlex Elder if (!img_request) 1692bf0d5f50SAlex Elder return NULL; 1693bf0d5f50SAlex Elder 1694bf0d5f50SAlex Elder img_request->rbd_dev = rbd_dev; 16959bb0248dSIlya Dryomov img_request->op_type = op_type; 16969bb0248dSIlya Dryomov if (!rbd_img_is_write(img_request)) 1697bf0d5f50SAlex Elder img_request->snap_id = rbd_dev->spec->snap_id; 16989bb0248dSIlya Dryomov else 16999bb0248dSIlya Dryomov img_request->snapc = snapc; 17009bb0248dSIlya Dryomov 1701a2acd00eSAlex Elder if (rbd_dev_parent_get(rbd_dev)) 1702d0b2e944SAlex Elder img_request_layered_set(img_request); 1703a0c5895bSIlya Dryomov 1704bf0d5f50SAlex Elder spin_lock_init(&img_request->completion_lock); 170543df3d35SIlya Dryomov INIT_LIST_HEAD(&img_request->object_extents); 1706bf0d5f50SAlex Elder kref_init(&img_request->kref); 1707bf0d5f50SAlex Elder 1708dfd9875fSIlya Dryomov dout("%s: rbd_dev %p %s -> img %p\n", __func__, rbd_dev, 1709dfd9875fSIlya Dryomov obj_op_name(op_type), img_request); 1710bf0d5f50SAlex Elder return img_request; 1711bf0d5f50SAlex Elder } 1712bf0d5f50SAlex Elder 1713bf0d5f50SAlex Elder static void rbd_img_request_destroy(struct kref *kref) 1714bf0d5f50SAlex Elder { 1715bf0d5f50SAlex Elder struct rbd_img_request *img_request; 1716bf0d5f50SAlex Elder struct rbd_obj_request *obj_request; 1717bf0d5f50SAlex Elder struct rbd_obj_request *next_obj_request; 1718bf0d5f50SAlex Elder 1719bf0d5f50SAlex Elder img_request = container_of(kref, struct rbd_img_request, kref); 1720bf0d5f50SAlex Elder 172137206ee5SAlex Elder dout("%s: img %p\n", __func__, img_request); 172237206ee5SAlex Elder 1723bf0d5f50SAlex Elder for_each_obj_request_safe(img_request, obj_request, next_obj_request) 1724bf0d5f50SAlex Elder rbd_img_obj_request_del(img_request, obj_request); 1725bf0d5f50SAlex Elder 1726a2acd00eSAlex Elder if (img_request_layered_test(img_request)) { 1727a2acd00eSAlex Elder img_request_layered_clear(img_request); 1728a2acd00eSAlex Elder rbd_dev_parent_put(img_request->rbd_dev); 1729a2acd00eSAlex Elder } 1730a2acd00eSAlex Elder 17319bb0248dSIlya Dryomov if (rbd_img_is_write(img_request)) 1732812164f8SAlex Elder ceph_put_snap_context(img_request->snapc); 1733bf0d5f50SAlex Elder 17341c2a9dfeSAlex Elder kmem_cache_free(rbd_img_request_cache, img_request); 1735bf0d5f50SAlex Elder } 1736bf0d5f50SAlex Elder 173786bd7998SIlya Dryomov static void prune_extents(struct ceph_file_extent *img_extents, 173886bd7998SIlya Dryomov u32 *num_img_extents, u64 overlap) 1739e93f3152SAlex Elder { 174086bd7998SIlya Dryomov u32 cnt = *num_img_extents; 1741e93f3152SAlex Elder 174286bd7998SIlya Dryomov /* drop extents completely beyond the overlap */ 174386bd7998SIlya Dryomov while (cnt && img_extents[cnt - 1].fe_off >= overlap) 174486bd7998SIlya Dryomov cnt--; 1745e93f3152SAlex Elder 174686bd7998SIlya Dryomov if (cnt) { 174786bd7998SIlya Dryomov struct ceph_file_extent *ex = &img_extents[cnt - 1]; 1748e93f3152SAlex Elder 174986bd7998SIlya Dryomov /* trim final overlapping extent */ 175086bd7998SIlya Dryomov if (ex->fe_off + ex->fe_len > overlap) 175186bd7998SIlya Dryomov ex->fe_len = overlap - ex->fe_off; 1752e93f3152SAlex Elder } 1753e93f3152SAlex Elder 175486bd7998SIlya Dryomov *num_img_extents = cnt; 175586bd7998SIlya Dryomov } 175686bd7998SIlya Dryomov 175786bd7998SIlya Dryomov /* 175886bd7998SIlya Dryomov * Determine the byte range(s) covered by either just the object extent 175986bd7998SIlya Dryomov * or the entire object in the parent image. 176086bd7998SIlya Dryomov */ 176186bd7998SIlya Dryomov static int rbd_obj_calc_img_extents(struct rbd_obj_request *obj_req, 176286bd7998SIlya Dryomov bool entire) 1763e93f3152SAlex Elder { 176486bd7998SIlya Dryomov struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev; 1765c5b5ef6cSAlex Elder int ret; 1766c5b5ef6cSAlex Elder 176786bd7998SIlya Dryomov if (!rbd_dev->parent_overlap) 176886bd7998SIlya Dryomov return 0; 176986bd7998SIlya Dryomov 177086bd7998SIlya Dryomov ret = ceph_extent_to_file(&rbd_dev->layout, obj_req->ex.oe_objno, 177186bd7998SIlya Dryomov entire ? 0 : obj_req->ex.oe_off, 177286bd7998SIlya Dryomov entire ? rbd_dev->layout.object_size : 177386bd7998SIlya Dryomov obj_req->ex.oe_len, 177486bd7998SIlya Dryomov &obj_req->img_extents, 177586bd7998SIlya Dryomov &obj_req->num_img_extents); 177686bd7998SIlya Dryomov if (ret) 177786bd7998SIlya Dryomov return ret; 177886bd7998SIlya Dryomov 177986bd7998SIlya Dryomov prune_extents(obj_req->img_extents, &obj_req->num_img_extents, 178086bd7998SIlya Dryomov rbd_dev->parent_overlap); 178186bd7998SIlya Dryomov return 0; 178286bd7998SIlya Dryomov } 178386bd7998SIlya Dryomov 17843da691bfSIlya Dryomov static void rbd_osd_req_setup_data(struct rbd_obj_request *obj_req, u32 which) 17853da691bfSIlya Dryomov { 1786ecc633caSIlya Dryomov switch (obj_req->img_request->data_type) { 17873da691bfSIlya Dryomov case OBJ_REQUEST_BIO: 17883da691bfSIlya Dryomov osd_req_op_extent_osd_data_bio(obj_req->osd_req, which, 17893da691bfSIlya Dryomov &obj_req->bio_pos, 179043df3d35SIlya Dryomov obj_req->ex.oe_len); 17913da691bfSIlya Dryomov break; 17923da691bfSIlya Dryomov case OBJ_REQUEST_BVECS: 1793afb97888SIlya Dryomov case OBJ_REQUEST_OWN_BVECS: 17943da691bfSIlya Dryomov rbd_assert(obj_req->bvec_pos.iter.bi_size == 179543df3d35SIlya Dryomov obj_req->ex.oe_len); 1796afb97888SIlya Dryomov rbd_assert(obj_req->bvec_idx == obj_req->bvec_count); 17973da691bfSIlya Dryomov osd_req_op_extent_osd_data_bvec_pos(obj_req->osd_req, which, 17983da691bfSIlya Dryomov &obj_req->bvec_pos); 17993da691bfSIlya Dryomov break; 18003da691bfSIlya Dryomov default: 18013da691bfSIlya Dryomov rbd_assert(0); 18023da691bfSIlya Dryomov } 18033da691bfSIlya Dryomov } 18043da691bfSIlya Dryomov 18053da691bfSIlya Dryomov static int rbd_obj_setup_read(struct rbd_obj_request *obj_req) 18063da691bfSIlya Dryomov { 1807e28eded5SIlya Dryomov obj_req->osd_req = __rbd_osd_req_create(obj_req, NULL, 1); 18083da691bfSIlya Dryomov if (!obj_req->osd_req) 1809710214e3SIlya Dryomov return -ENOMEM; 1810710214e3SIlya Dryomov 18113da691bfSIlya Dryomov osd_req_op_extent_init(obj_req->osd_req, 0, CEPH_OSD_OP_READ, 181243df3d35SIlya Dryomov obj_req->ex.oe_off, obj_req->ex.oe_len, 0, 0); 18133da691bfSIlya Dryomov rbd_osd_req_setup_data(obj_req, 0); 1814a90bb0c1SIlya Dryomov 18153da691bfSIlya Dryomov rbd_osd_req_format_read(obj_req); 18163da691bfSIlya Dryomov return 0; 1817710214e3SIlya Dryomov } 1818710214e3SIlya Dryomov 18193da691bfSIlya Dryomov static int __rbd_obj_setup_stat(struct rbd_obj_request *obj_req, 18203da691bfSIlya Dryomov unsigned int which) 18213da691bfSIlya Dryomov { 18223da691bfSIlya Dryomov struct page **pages; 18233da691bfSIlya Dryomov 1824c5b5ef6cSAlex Elder /* 1825c5b5ef6cSAlex Elder * The response data for a STAT call consists of: 1826c5b5ef6cSAlex Elder * le64 length; 1827c5b5ef6cSAlex Elder * struct { 1828c5b5ef6cSAlex Elder * le32 tv_sec; 1829c5b5ef6cSAlex Elder * le32 tv_nsec; 1830c5b5ef6cSAlex Elder * } mtime; 1831c5b5ef6cSAlex Elder */ 18323da691bfSIlya Dryomov pages = ceph_alloc_page_vector(1, GFP_NOIO); 18333da691bfSIlya Dryomov if (IS_ERR(pages)) 18343da691bfSIlya Dryomov return PTR_ERR(pages); 18353da691bfSIlya Dryomov 18363da691bfSIlya Dryomov osd_req_op_init(obj_req->osd_req, which, CEPH_OSD_OP_STAT, 0); 18373da691bfSIlya Dryomov osd_req_op_raw_data_in_pages(obj_req->osd_req, which, pages, 18383da691bfSIlya Dryomov 8 + sizeof(struct ceph_timespec), 18393da691bfSIlya Dryomov 0, false, true); 18403da691bfSIlya Dryomov return 0; 1841710214e3SIlya Dryomov } 1842c5b5ef6cSAlex Elder 184313488d53SIlya Dryomov static int count_write_ops(struct rbd_obj_request *obj_req) 184413488d53SIlya Dryomov { 184513488d53SIlya Dryomov return 2; /* setallochint + write/writefull */ 184613488d53SIlya Dryomov } 184713488d53SIlya Dryomov 18483da691bfSIlya Dryomov static void __rbd_obj_setup_write(struct rbd_obj_request *obj_req, 18493da691bfSIlya Dryomov unsigned int which) 18503da691bfSIlya Dryomov { 18513da691bfSIlya Dryomov struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev; 18523da691bfSIlya Dryomov u16 opcode; 1853c5b5ef6cSAlex Elder 18543da691bfSIlya Dryomov osd_req_op_alloc_hint_init(obj_req->osd_req, which++, 18553da691bfSIlya Dryomov rbd_dev->layout.object_size, 18563da691bfSIlya Dryomov rbd_dev->layout.object_size); 1857c5b5ef6cSAlex Elder 18583da691bfSIlya Dryomov if (rbd_obj_is_entire(obj_req)) 18593da691bfSIlya Dryomov opcode = CEPH_OSD_OP_WRITEFULL; 18603da691bfSIlya Dryomov else 18613da691bfSIlya Dryomov opcode = CEPH_OSD_OP_WRITE; 1862c5b5ef6cSAlex Elder 18633da691bfSIlya Dryomov osd_req_op_extent_init(obj_req->osd_req, which, opcode, 186443df3d35SIlya Dryomov obj_req->ex.oe_off, obj_req->ex.oe_len, 0, 0); 18653da691bfSIlya Dryomov rbd_osd_req_setup_data(obj_req, which++); 18663da691bfSIlya Dryomov 18673da691bfSIlya Dryomov rbd_assert(which == obj_req->osd_req->r_num_ops); 18683da691bfSIlya Dryomov rbd_osd_req_format_write(obj_req); 18693da691bfSIlya Dryomov } 18703da691bfSIlya Dryomov 18713da691bfSIlya Dryomov static int rbd_obj_setup_write(struct rbd_obj_request *obj_req) 18723da691bfSIlya Dryomov { 18733da691bfSIlya Dryomov unsigned int num_osd_ops, which = 0; 187413488d53SIlya Dryomov bool need_guard; 18753da691bfSIlya Dryomov int ret; 18763da691bfSIlya Dryomov 187786bd7998SIlya Dryomov /* reverse map the entire object onto the parent */ 187886bd7998SIlya Dryomov ret = rbd_obj_calc_img_extents(obj_req, true); 187986bd7998SIlya Dryomov if (ret) 188086bd7998SIlya Dryomov return ret; 188186bd7998SIlya Dryomov 188213488d53SIlya Dryomov need_guard = rbd_obj_copyup_enabled(obj_req); 188313488d53SIlya Dryomov num_osd_ops = need_guard + count_write_ops(obj_req); 18843da691bfSIlya Dryomov 1885a162b308SIlya Dryomov obj_req->osd_req = rbd_osd_req_create(obj_req, num_osd_ops); 18863da691bfSIlya Dryomov if (!obj_req->osd_req) 18873da691bfSIlya Dryomov return -ENOMEM; 18883da691bfSIlya Dryomov 188913488d53SIlya Dryomov if (need_guard) { 18903da691bfSIlya Dryomov ret = __rbd_obj_setup_stat(obj_req, which++); 18913da691bfSIlya Dryomov if (ret) 1892c5b5ef6cSAlex Elder return ret; 189313488d53SIlya Dryomov 189413488d53SIlya Dryomov obj_req->write_state = RBD_OBJ_WRITE_GUARD; 189513488d53SIlya Dryomov } else { 189613488d53SIlya Dryomov obj_req->write_state = RBD_OBJ_WRITE_FLAT; 1897c5b5ef6cSAlex Elder } 1898c5b5ef6cSAlex Elder 18993da691bfSIlya Dryomov __rbd_obj_setup_write(obj_req, which); 19003da691bfSIlya Dryomov return 0; 190170d045f6SIlya Dryomov } 190270d045f6SIlya Dryomov 19036484cbe9SIlya Dryomov static u16 truncate_or_zero_opcode(struct rbd_obj_request *obj_req) 19046484cbe9SIlya Dryomov { 19056484cbe9SIlya Dryomov return rbd_obj_is_tail(obj_req) ? CEPH_OSD_OP_TRUNCATE : 19066484cbe9SIlya Dryomov CEPH_OSD_OP_ZERO; 19076484cbe9SIlya Dryomov } 19086484cbe9SIlya Dryomov 19096484cbe9SIlya Dryomov static int rbd_obj_setup_discard(struct rbd_obj_request *obj_req) 19106484cbe9SIlya Dryomov { 19110c93e1b7SIlya Dryomov struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev; 19120c93e1b7SIlya Dryomov u64 off = obj_req->ex.oe_off; 19130c93e1b7SIlya Dryomov u64 next_off = obj_req->ex.oe_off + obj_req->ex.oe_len; 19146484cbe9SIlya Dryomov int ret; 19156484cbe9SIlya Dryomov 19160c93e1b7SIlya Dryomov /* 19170c93e1b7SIlya Dryomov * Align the range to alloc_size boundary and punt on discards 19180c93e1b7SIlya Dryomov * that are too small to free up any space. 19190c93e1b7SIlya Dryomov * 19200c93e1b7SIlya Dryomov * alloc_size == object_size && is_tail() is a special case for 19210c93e1b7SIlya Dryomov * filestore with filestore_punch_hole = false, needed to allow 19220c93e1b7SIlya Dryomov * truncate (in addition to delete). 19230c93e1b7SIlya Dryomov */ 19240c93e1b7SIlya Dryomov if (rbd_dev->opts->alloc_size != rbd_dev->layout.object_size || 19250c93e1b7SIlya Dryomov !rbd_obj_is_tail(obj_req)) { 19260c93e1b7SIlya Dryomov off = round_up(off, rbd_dev->opts->alloc_size); 19270c93e1b7SIlya Dryomov next_off = round_down(next_off, rbd_dev->opts->alloc_size); 19280c93e1b7SIlya Dryomov if (off >= next_off) 19290c93e1b7SIlya Dryomov return 1; 19300c93e1b7SIlya Dryomov } 19310c93e1b7SIlya Dryomov 19326484cbe9SIlya Dryomov /* reverse map the entire object onto the parent */ 19336484cbe9SIlya Dryomov ret = rbd_obj_calc_img_extents(obj_req, true); 19346484cbe9SIlya Dryomov if (ret) 19356484cbe9SIlya Dryomov return ret; 19366484cbe9SIlya Dryomov 19376484cbe9SIlya Dryomov obj_req->osd_req = rbd_osd_req_create(obj_req, 1); 19386484cbe9SIlya Dryomov if (!obj_req->osd_req) 19396484cbe9SIlya Dryomov return -ENOMEM; 19406484cbe9SIlya Dryomov 19416484cbe9SIlya Dryomov if (rbd_obj_is_entire(obj_req) && !obj_req->num_img_extents) { 19426484cbe9SIlya Dryomov osd_req_op_init(obj_req->osd_req, 0, CEPH_OSD_OP_DELETE, 0); 19436484cbe9SIlya Dryomov } else { 19440c93e1b7SIlya Dryomov dout("%s %p %llu~%llu -> %llu~%llu\n", __func__, 19450c93e1b7SIlya Dryomov obj_req, obj_req->ex.oe_off, obj_req->ex.oe_len, 19460c93e1b7SIlya Dryomov off, next_off - off); 19476484cbe9SIlya Dryomov osd_req_op_extent_init(obj_req->osd_req, 0, 19486484cbe9SIlya Dryomov truncate_or_zero_opcode(obj_req), 19490c93e1b7SIlya Dryomov off, next_off - off, 0, 0); 19506484cbe9SIlya Dryomov } 19516484cbe9SIlya Dryomov 19526484cbe9SIlya Dryomov obj_req->write_state = RBD_OBJ_WRITE_FLAT; 19536484cbe9SIlya Dryomov rbd_osd_req_format_write(obj_req); 19546484cbe9SIlya Dryomov return 0; 19556484cbe9SIlya Dryomov } 19566484cbe9SIlya Dryomov 195713488d53SIlya Dryomov static int count_zeroout_ops(struct rbd_obj_request *obj_req) 195813488d53SIlya Dryomov { 195913488d53SIlya Dryomov int num_osd_ops; 196013488d53SIlya Dryomov 19619b17eb2cSIlya Dryomov if (rbd_obj_is_entire(obj_req) && obj_req->num_img_extents && 19629b17eb2cSIlya Dryomov !rbd_obj_copyup_enabled(obj_req)) 196313488d53SIlya Dryomov num_osd_ops = 2; /* create + truncate */ 196413488d53SIlya Dryomov else 196513488d53SIlya Dryomov num_osd_ops = 1; /* delete/truncate/zero */ 196613488d53SIlya Dryomov 196713488d53SIlya Dryomov return num_osd_ops; 196813488d53SIlya Dryomov } 196913488d53SIlya Dryomov 19706484cbe9SIlya Dryomov static void __rbd_obj_setup_zeroout(struct rbd_obj_request *obj_req, 19713da691bfSIlya Dryomov unsigned int which) 197270d045f6SIlya Dryomov { 19733da691bfSIlya Dryomov u16 opcode; 1974058aa991SIlya Dryomov 19753da691bfSIlya Dryomov if (rbd_obj_is_entire(obj_req)) { 197686bd7998SIlya Dryomov if (obj_req->num_img_extents) { 19779b17eb2cSIlya Dryomov if (!rbd_obj_copyup_enabled(obj_req)) 19782bb1e56eSIlya Dryomov osd_req_op_init(obj_req->osd_req, which++, 19792bb1e56eSIlya Dryomov CEPH_OSD_OP_CREATE, 0); 19803da691bfSIlya Dryomov opcode = CEPH_OSD_OP_TRUNCATE; 19813da691bfSIlya Dryomov } else { 19823da691bfSIlya Dryomov osd_req_op_init(obj_req->osd_req, which++, 19833da691bfSIlya Dryomov CEPH_OSD_OP_DELETE, 0); 19843da691bfSIlya Dryomov opcode = 0; 19853da691bfSIlya Dryomov } 19863da691bfSIlya Dryomov } else { 19876484cbe9SIlya Dryomov opcode = truncate_or_zero_opcode(obj_req); 19883da691bfSIlya Dryomov } 19893da691bfSIlya Dryomov 19903da691bfSIlya Dryomov if (opcode) 19913da691bfSIlya Dryomov osd_req_op_extent_init(obj_req->osd_req, which++, opcode, 199243df3d35SIlya Dryomov obj_req->ex.oe_off, obj_req->ex.oe_len, 19933da691bfSIlya Dryomov 0, 0); 19943da691bfSIlya Dryomov 19953da691bfSIlya Dryomov rbd_assert(which == obj_req->osd_req->r_num_ops); 19963da691bfSIlya Dryomov rbd_osd_req_format_write(obj_req); 19973da691bfSIlya Dryomov } 19983da691bfSIlya Dryomov 19996484cbe9SIlya Dryomov static int rbd_obj_setup_zeroout(struct rbd_obj_request *obj_req) 20003da691bfSIlya Dryomov { 20013da691bfSIlya Dryomov unsigned int num_osd_ops, which = 0; 200213488d53SIlya Dryomov bool need_guard; 20033da691bfSIlya Dryomov int ret; 20043da691bfSIlya Dryomov 200586bd7998SIlya Dryomov /* reverse map the entire object onto the parent */ 200686bd7998SIlya Dryomov ret = rbd_obj_calc_img_extents(obj_req, true); 200786bd7998SIlya Dryomov if (ret) 200886bd7998SIlya Dryomov return ret; 200986bd7998SIlya Dryomov 201013488d53SIlya Dryomov need_guard = rbd_obj_copyup_enabled(obj_req); 201113488d53SIlya Dryomov num_osd_ops = need_guard + count_zeroout_ops(obj_req); 20123da691bfSIlya Dryomov 2013a162b308SIlya Dryomov obj_req->osd_req = rbd_osd_req_create(obj_req, num_osd_ops); 20143da691bfSIlya Dryomov if (!obj_req->osd_req) 20153da691bfSIlya Dryomov return -ENOMEM; 20163da691bfSIlya Dryomov 201713488d53SIlya Dryomov if (need_guard) { 20183da691bfSIlya Dryomov ret = __rbd_obj_setup_stat(obj_req, which++); 20193da691bfSIlya Dryomov if (ret) 20203da691bfSIlya Dryomov return ret; 202113488d53SIlya Dryomov 202213488d53SIlya Dryomov obj_req->write_state = RBD_OBJ_WRITE_GUARD; 202313488d53SIlya Dryomov } else { 202413488d53SIlya Dryomov obj_req->write_state = RBD_OBJ_WRITE_FLAT; 20253da691bfSIlya Dryomov } 20263da691bfSIlya Dryomov 20276484cbe9SIlya Dryomov __rbd_obj_setup_zeroout(obj_req, which); 2028980917fcSIlya Dryomov return 0; 2029b454e36dSAlex Elder } 2030b454e36dSAlex Elder 2031b454e36dSAlex Elder /* 20323da691bfSIlya Dryomov * For each object request in @img_req, allocate an OSD request, add 20333da691bfSIlya Dryomov * individual OSD ops and prepare them for submission. The number of 20343da691bfSIlya Dryomov * OSD ops depends on op_type and the overlap point (if any). 2035b454e36dSAlex Elder */ 20363da691bfSIlya Dryomov static int __rbd_img_fill_request(struct rbd_img_request *img_req) 20373da691bfSIlya Dryomov { 20380c93e1b7SIlya Dryomov struct rbd_obj_request *obj_req, *next_obj_req; 20393da691bfSIlya Dryomov int ret; 20403d7efd18SAlex Elder 20410c93e1b7SIlya Dryomov for_each_obj_request_safe(img_req, obj_req, next_obj_req) { 20429bb0248dSIlya Dryomov switch (img_req->op_type) { 20433da691bfSIlya Dryomov case OBJ_OP_READ: 20443da691bfSIlya Dryomov ret = rbd_obj_setup_read(obj_req); 20453da691bfSIlya Dryomov break; 20463da691bfSIlya Dryomov case OBJ_OP_WRITE: 20473da691bfSIlya Dryomov ret = rbd_obj_setup_write(obj_req); 20483da691bfSIlya Dryomov break; 20493da691bfSIlya Dryomov case OBJ_OP_DISCARD: 20503da691bfSIlya Dryomov ret = rbd_obj_setup_discard(obj_req); 20513da691bfSIlya Dryomov break; 20526484cbe9SIlya Dryomov case OBJ_OP_ZEROOUT: 20536484cbe9SIlya Dryomov ret = rbd_obj_setup_zeroout(obj_req); 20546484cbe9SIlya Dryomov break; 20553da691bfSIlya Dryomov default: 20563da691bfSIlya Dryomov rbd_assert(0); 20573da691bfSIlya Dryomov } 20580c93e1b7SIlya Dryomov if (ret < 0) 20593da691bfSIlya Dryomov return ret; 20600c93e1b7SIlya Dryomov if (ret > 0) { 20610c93e1b7SIlya Dryomov img_req->xferred += obj_req->ex.oe_len; 20620c93e1b7SIlya Dryomov img_req->pending_count--; 20630c93e1b7SIlya Dryomov rbd_img_obj_request_del(img_req, obj_req); 20640c93e1b7SIlya Dryomov continue; 20650c93e1b7SIlya Dryomov } 206626f887e0SIlya Dryomov 206726f887e0SIlya Dryomov ret = ceph_osdc_alloc_messages(obj_req->osd_req, GFP_NOIO); 206826f887e0SIlya Dryomov if (ret) 206926f887e0SIlya Dryomov return ret; 2070b454e36dSAlex Elder } 2071b454e36dSAlex Elder 20723da691bfSIlya Dryomov return 0; 20733da691bfSIlya Dryomov } 20743da691bfSIlya Dryomov 20755a237819SIlya Dryomov union rbd_img_fill_iter { 20765a237819SIlya Dryomov struct ceph_bio_iter bio_iter; 20775a237819SIlya Dryomov struct ceph_bvec_iter bvec_iter; 20785a237819SIlya Dryomov }; 20795a237819SIlya Dryomov 20805a237819SIlya Dryomov struct rbd_img_fill_ctx { 20815a237819SIlya Dryomov enum obj_request_type pos_type; 20825a237819SIlya Dryomov union rbd_img_fill_iter *pos; 20835a237819SIlya Dryomov union rbd_img_fill_iter iter; 20845a237819SIlya Dryomov ceph_object_extent_fn_t set_pos_fn; 2085afb97888SIlya Dryomov ceph_object_extent_fn_t count_fn; 2086afb97888SIlya Dryomov ceph_object_extent_fn_t copy_fn; 20875a237819SIlya Dryomov }; 20885a237819SIlya Dryomov 20895a237819SIlya Dryomov static struct ceph_object_extent *alloc_object_extent(void *arg) 20905a237819SIlya Dryomov { 20915a237819SIlya Dryomov struct rbd_img_request *img_req = arg; 20925a237819SIlya Dryomov struct rbd_obj_request *obj_req; 20935a237819SIlya Dryomov 20945a237819SIlya Dryomov obj_req = rbd_obj_request_create(); 20955a237819SIlya Dryomov if (!obj_req) 20965a237819SIlya Dryomov return NULL; 20975a237819SIlya Dryomov 20985a237819SIlya Dryomov rbd_img_obj_request_add(img_req, obj_req); 20995a237819SIlya Dryomov return &obj_req->ex; 21005a237819SIlya Dryomov } 21015a237819SIlya Dryomov 21025a237819SIlya Dryomov /* 2103afb97888SIlya Dryomov * While su != os && sc == 1 is technically not fancy (it's the same 2104afb97888SIlya Dryomov * layout as su == os && sc == 1), we can't use the nocopy path for it 2105afb97888SIlya Dryomov * because ->set_pos_fn() should be called only once per object. 2106afb97888SIlya Dryomov * ceph_file_to_extents() invokes action_fn once per stripe unit, so 2107afb97888SIlya Dryomov * treat su != os && sc == 1 as fancy. 21085a237819SIlya Dryomov */ 2109afb97888SIlya Dryomov static bool rbd_layout_is_fancy(struct ceph_file_layout *l) 2110afb97888SIlya Dryomov { 2111afb97888SIlya Dryomov return l->stripe_unit != l->object_size; 2112afb97888SIlya Dryomov } 2113afb97888SIlya Dryomov 2114afb97888SIlya Dryomov static int rbd_img_fill_request_nocopy(struct rbd_img_request *img_req, 21155a237819SIlya Dryomov struct ceph_file_extent *img_extents, 21165a237819SIlya Dryomov u32 num_img_extents, 21175a237819SIlya Dryomov struct rbd_img_fill_ctx *fctx) 21185a237819SIlya Dryomov { 21195a237819SIlya Dryomov u32 i; 21205a237819SIlya Dryomov int ret; 21215a237819SIlya Dryomov 21225a237819SIlya Dryomov img_req->data_type = fctx->pos_type; 21235a237819SIlya Dryomov 21245a237819SIlya Dryomov /* 21255a237819SIlya Dryomov * Create object requests and set each object request's starting 21265a237819SIlya Dryomov * position in the provided bio (list) or bio_vec array. 21275a237819SIlya Dryomov */ 21285a237819SIlya Dryomov fctx->iter = *fctx->pos; 21295a237819SIlya Dryomov for (i = 0; i < num_img_extents; i++) { 21305a237819SIlya Dryomov ret = ceph_file_to_extents(&img_req->rbd_dev->layout, 21315a237819SIlya Dryomov img_extents[i].fe_off, 21325a237819SIlya Dryomov img_extents[i].fe_len, 21335a237819SIlya Dryomov &img_req->object_extents, 21345a237819SIlya Dryomov alloc_object_extent, img_req, 21355a237819SIlya Dryomov fctx->set_pos_fn, &fctx->iter); 21365a237819SIlya Dryomov if (ret) 21375a237819SIlya Dryomov return ret; 21385a237819SIlya Dryomov } 21395a237819SIlya Dryomov 21405a237819SIlya Dryomov return __rbd_img_fill_request(img_req); 21415a237819SIlya Dryomov } 21425a237819SIlya Dryomov 2143afb97888SIlya Dryomov /* 2144afb97888SIlya Dryomov * Map a list of image extents to a list of object extents, create the 2145afb97888SIlya Dryomov * corresponding object requests (normally each to a different object, 2146afb97888SIlya Dryomov * but not always) and add them to @img_req. For each object request, 2147afb97888SIlya Dryomov * set up its data descriptor to point to the corresponding chunk(s) of 2148afb97888SIlya Dryomov * @fctx->pos data buffer. 2149afb97888SIlya Dryomov * 2150afb97888SIlya Dryomov * Because ceph_file_to_extents() will merge adjacent object extents 2151afb97888SIlya Dryomov * together, each object request's data descriptor may point to multiple 2152afb97888SIlya Dryomov * different chunks of @fctx->pos data buffer. 2153afb97888SIlya Dryomov * 2154afb97888SIlya Dryomov * @fctx->pos data buffer is assumed to be large enough. 2155afb97888SIlya Dryomov */ 2156afb97888SIlya Dryomov static int rbd_img_fill_request(struct rbd_img_request *img_req, 2157afb97888SIlya Dryomov struct ceph_file_extent *img_extents, 2158afb97888SIlya Dryomov u32 num_img_extents, 2159afb97888SIlya Dryomov struct rbd_img_fill_ctx *fctx) 2160afb97888SIlya Dryomov { 2161afb97888SIlya Dryomov struct rbd_device *rbd_dev = img_req->rbd_dev; 2162afb97888SIlya Dryomov struct rbd_obj_request *obj_req; 2163afb97888SIlya Dryomov u32 i; 2164afb97888SIlya Dryomov int ret; 2165afb97888SIlya Dryomov 2166afb97888SIlya Dryomov if (fctx->pos_type == OBJ_REQUEST_NODATA || 2167afb97888SIlya Dryomov !rbd_layout_is_fancy(&rbd_dev->layout)) 2168afb97888SIlya Dryomov return rbd_img_fill_request_nocopy(img_req, img_extents, 2169afb97888SIlya Dryomov num_img_extents, fctx); 2170afb97888SIlya Dryomov 2171afb97888SIlya Dryomov img_req->data_type = OBJ_REQUEST_OWN_BVECS; 2172afb97888SIlya Dryomov 2173afb97888SIlya Dryomov /* 2174afb97888SIlya Dryomov * Create object requests and determine ->bvec_count for each object 2175afb97888SIlya Dryomov * request. Note that ->bvec_count sum over all object requests may 2176afb97888SIlya Dryomov * be greater than the number of bio_vecs in the provided bio (list) 2177afb97888SIlya Dryomov * or bio_vec array because when mapped, those bio_vecs can straddle 2178afb97888SIlya Dryomov * stripe unit boundaries. 2179afb97888SIlya Dryomov */ 2180afb97888SIlya Dryomov fctx->iter = *fctx->pos; 2181afb97888SIlya Dryomov for (i = 0; i < num_img_extents; i++) { 2182afb97888SIlya Dryomov ret = ceph_file_to_extents(&rbd_dev->layout, 2183afb97888SIlya Dryomov img_extents[i].fe_off, 2184afb97888SIlya Dryomov img_extents[i].fe_len, 2185afb97888SIlya Dryomov &img_req->object_extents, 2186afb97888SIlya Dryomov alloc_object_extent, img_req, 2187afb97888SIlya Dryomov fctx->count_fn, &fctx->iter); 2188afb97888SIlya Dryomov if (ret) 2189afb97888SIlya Dryomov return ret; 2190afb97888SIlya Dryomov } 2191afb97888SIlya Dryomov 2192afb97888SIlya Dryomov for_each_obj_request(img_req, obj_req) { 2193afb97888SIlya Dryomov obj_req->bvec_pos.bvecs = kmalloc_array(obj_req->bvec_count, 2194afb97888SIlya Dryomov sizeof(*obj_req->bvec_pos.bvecs), 2195afb97888SIlya Dryomov GFP_NOIO); 2196afb97888SIlya Dryomov if (!obj_req->bvec_pos.bvecs) 2197afb97888SIlya Dryomov return -ENOMEM; 2198afb97888SIlya Dryomov } 2199afb97888SIlya Dryomov 2200afb97888SIlya Dryomov /* 2201afb97888SIlya Dryomov * Fill in each object request's private bio_vec array, splitting and 2202afb97888SIlya Dryomov * rearranging the provided bio_vecs in stripe unit chunks as needed. 2203afb97888SIlya Dryomov */ 2204afb97888SIlya Dryomov fctx->iter = *fctx->pos; 2205afb97888SIlya Dryomov for (i = 0; i < num_img_extents; i++) { 2206afb97888SIlya Dryomov ret = ceph_iterate_extents(&rbd_dev->layout, 2207afb97888SIlya Dryomov img_extents[i].fe_off, 2208afb97888SIlya Dryomov img_extents[i].fe_len, 2209afb97888SIlya Dryomov &img_req->object_extents, 2210afb97888SIlya Dryomov fctx->copy_fn, &fctx->iter); 2211afb97888SIlya Dryomov if (ret) 2212afb97888SIlya Dryomov return ret; 2213afb97888SIlya Dryomov } 2214afb97888SIlya Dryomov 2215afb97888SIlya Dryomov return __rbd_img_fill_request(img_req); 2216afb97888SIlya Dryomov } 2217afb97888SIlya Dryomov 22185a237819SIlya Dryomov static int rbd_img_fill_nodata(struct rbd_img_request *img_req, 22195a237819SIlya Dryomov u64 off, u64 len) 22205a237819SIlya Dryomov { 22215a237819SIlya Dryomov struct ceph_file_extent ex = { off, len }; 22225a237819SIlya Dryomov union rbd_img_fill_iter dummy; 22235a237819SIlya Dryomov struct rbd_img_fill_ctx fctx = { 22245a237819SIlya Dryomov .pos_type = OBJ_REQUEST_NODATA, 22255a237819SIlya Dryomov .pos = &dummy, 22265a237819SIlya Dryomov }; 22275a237819SIlya Dryomov 22285a237819SIlya Dryomov return rbd_img_fill_request(img_req, &ex, 1, &fctx); 22295a237819SIlya Dryomov } 22305a237819SIlya Dryomov 22315a237819SIlya Dryomov static void set_bio_pos(struct ceph_object_extent *ex, u32 bytes, void *arg) 22325a237819SIlya Dryomov { 22335a237819SIlya Dryomov struct rbd_obj_request *obj_req = 22345a237819SIlya Dryomov container_of(ex, struct rbd_obj_request, ex); 22355a237819SIlya Dryomov struct ceph_bio_iter *it = arg; 22365a237819SIlya Dryomov 22375a237819SIlya Dryomov dout("%s objno %llu bytes %u\n", __func__, ex->oe_objno, bytes); 22385a237819SIlya Dryomov obj_req->bio_pos = *it; 22395a237819SIlya Dryomov ceph_bio_iter_advance(it, bytes); 22405a237819SIlya Dryomov } 22415a237819SIlya Dryomov 2242afb97888SIlya Dryomov static void count_bio_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg) 2243afb97888SIlya Dryomov { 2244afb97888SIlya Dryomov struct rbd_obj_request *obj_req = 2245afb97888SIlya Dryomov container_of(ex, struct rbd_obj_request, ex); 2246afb97888SIlya Dryomov struct ceph_bio_iter *it = arg; 2247afb97888SIlya Dryomov 2248afb97888SIlya Dryomov dout("%s objno %llu bytes %u\n", __func__, ex->oe_objno, bytes); 2249afb97888SIlya Dryomov ceph_bio_iter_advance_step(it, bytes, ({ 2250afb97888SIlya Dryomov obj_req->bvec_count++; 2251afb97888SIlya Dryomov })); 2252afb97888SIlya Dryomov 2253afb97888SIlya Dryomov } 2254afb97888SIlya Dryomov 2255afb97888SIlya Dryomov static void copy_bio_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg) 2256afb97888SIlya Dryomov { 2257afb97888SIlya Dryomov struct rbd_obj_request *obj_req = 2258afb97888SIlya Dryomov container_of(ex, struct rbd_obj_request, ex); 2259afb97888SIlya Dryomov struct ceph_bio_iter *it = arg; 2260afb97888SIlya Dryomov 2261afb97888SIlya Dryomov dout("%s objno %llu bytes %u\n", __func__, ex->oe_objno, bytes); 2262afb97888SIlya Dryomov ceph_bio_iter_advance_step(it, bytes, ({ 2263afb97888SIlya Dryomov obj_req->bvec_pos.bvecs[obj_req->bvec_idx++] = bv; 2264afb97888SIlya Dryomov obj_req->bvec_pos.iter.bi_size += bv.bv_len; 2265afb97888SIlya Dryomov })); 2266afb97888SIlya Dryomov } 2267afb97888SIlya Dryomov 22685a237819SIlya Dryomov static int __rbd_img_fill_from_bio(struct rbd_img_request *img_req, 22695a237819SIlya Dryomov struct ceph_file_extent *img_extents, 22705a237819SIlya Dryomov u32 num_img_extents, 22715a237819SIlya Dryomov struct ceph_bio_iter *bio_pos) 22725a237819SIlya Dryomov { 22735a237819SIlya Dryomov struct rbd_img_fill_ctx fctx = { 22745a237819SIlya Dryomov .pos_type = OBJ_REQUEST_BIO, 22755a237819SIlya Dryomov .pos = (union rbd_img_fill_iter *)bio_pos, 22765a237819SIlya Dryomov .set_pos_fn = set_bio_pos, 2277afb97888SIlya Dryomov .count_fn = count_bio_bvecs, 2278afb97888SIlya Dryomov .copy_fn = copy_bio_bvecs, 22795a237819SIlya Dryomov }; 22805a237819SIlya Dryomov 22815a237819SIlya Dryomov return rbd_img_fill_request(img_req, img_extents, num_img_extents, 22825a237819SIlya Dryomov &fctx); 22835a237819SIlya Dryomov } 22845a237819SIlya Dryomov 22855a237819SIlya Dryomov static int rbd_img_fill_from_bio(struct rbd_img_request *img_req, 22865a237819SIlya Dryomov u64 off, u64 len, struct bio *bio) 22875a237819SIlya Dryomov { 22885a237819SIlya Dryomov struct ceph_file_extent ex = { off, len }; 22895a237819SIlya Dryomov struct ceph_bio_iter it = { .bio = bio, .iter = bio->bi_iter }; 22905a237819SIlya Dryomov 22915a237819SIlya Dryomov return __rbd_img_fill_from_bio(img_req, &ex, 1, &it); 22925a237819SIlya Dryomov } 22935a237819SIlya Dryomov 22945a237819SIlya Dryomov static void set_bvec_pos(struct ceph_object_extent *ex, u32 bytes, void *arg) 22955a237819SIlya Dryomov { 22965a237819SIlya Dryomov struct rbd_obj_request *obj_req = 22975a237819SIlya Dryomov container_of(ex, struct rbd_obj_request, ex); 22985a237819SIlya Dryomov struct ceph_bvec_iter *it = arg; 22995a237819SIlya Dryomov 23005a237819SIlya Dryomov obj_req->bvec_pos = *it; 23015a237819SIlya Dryomov ceph_bvec_iter_shorten(&obj_req->bvec_pos, bytes); 23025a237819SIlya Dryomov ceph_bvec_iter_advance(it, bytes); 23035a237819SIlya Dryomov } 23045a237819SIlya Dryomov 2305afb97888SIlya Dryomov static void count_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg) 2306afb97888SIlya Dryomov { 2307afb97888SIlya Dryomov struct rbd_obj_request *obj_req = 2308afb97888SIlya Dryomov container_of(ex, struct rbd_obj_request, ex); 2309afb97888SIlya Dryomov struct ceph_bvec_iter *it = arg; 2310afb97888SIlya Dryomov 2311afb97888SIlya Dryomov ceph_bvec_iter_advance_step(it, bytes, ({ 2312afb97888SIlya Dryomov obj_req->bvec_count++; 2313afb97888SIlya Dryomov })); 2314afb97888SIlya Dryomov } 2315afb97888SIlya Dryomov 2316afb97888SIlya Dryomov static void copy_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg) 2317afb97888SIlya Dryomov { 2318afb97888SIlya Dryomov struct rbd_obj_request *obj_req = 2319afb97888SIlya Dryomov container_of(ex, struct rbd_obj_request, ex); 2320afb97888SIlya Dryomov struct ceph_bvec_iter *it = arg; 2321afb97888SIlya Dryomov 2322afb97888SIlya Dryomov ceph_bvec_iter_advance_step(it, bytes, ({ 2323afb97888SIlya Dryomov obj_req->bvec_pos.bvecs[obj_req->bvec_idx++] = bv; 2324afb97888SIlya Dryomov obj_req->bvec_pos.iter.bi_size += bv.bv_len; 2325afb97888SIlya Dryomov })); 2326afb97888SIlya Dryomov } 2327afb97888SIlya Dryomov 23285a237819SIlya Dryomov static int __rbd_img_fill_from_bvecs(struct rbd_img_request *img_req, 23295a237819SIlya Dryomov struct ceph_file_extent *img_extents, 23305a237819SIlya Dryomov u32 num_img_extents, 23315a237819SIlya Dryomov struct ceph_bvec_iter *bvec_pos) 23325a237819SIlya Dryomov { 23335a237819SIlya Dryomov struct rbd_img_fill_ctx fctx = { 23345a237819SIlya Dryomov .pos_type = OBJ_REQUEST_BVECS, 23355a237819SIlya Dryomov .pos = (union rbd_img_fill_iter *)bvec_pos, 23365a237819SIlya Dryomov .set_pos_fn = set_bvec_pos, 2337afb97888SIlya Dryomov .count_fn = count_bvecs, 2338afb97888SIlya Dryomov .copy_fn = copy_bvecs, 23395a237819SIlya Dryomov }; 23405a237819SIlya Dryomov 23415a237819SIlya Dryomov return rbd_img_fill_request(img_req, img_extents, num_img_extents, 23425a237819SIlya Dryomov &fctx); 23435a237819SIlya Dryomov } 23445a237819SIlya Dryomov 23455a237819SIlya Dryomov static int rbd_img_fill_from_bvecs(struct rbd_img_request *img_req, 23465a237819SIlya Dryomov struct ceph_file_extent *img_extents, 23475a237819SIlya Dryomov u32 num_img_extents, 23485a237819SIlya Dryomov struct bio_vec *bvecs) 23495a237819SIlya Dryomov { 23505a237819SIlya Dryomov struct ceph_bvec_iter it = { 23515a237819SIlya Dryomov .bvecs = bvecs, 23525a237819SIlya Dryomov .iter = { .bi_size = ceph_file_extents_bytes(img_extents, 23535a237819SIlya Dryomov num_img_extents) }, 23545a237819SIlya Dryomov }; 23555a237819SIlya Dryomov 23565a237819SIlya Dryomov return __rbd_img_fill_from_bvecs(img_req, img_extents, num_img_extents, 23575a237819SIlya Dryomov &it); 23585a237819SIlya Dryomov } 23595a237819SIlya Dryomov 2360efbd1a11SIlya Dryomov static void rbd_img_request_submit(struct rbd_img_request *img_request) 2361bf0d5f50SAlex Elder { 2362bf0d5f50SAlex Elder struct rbd_obj_request *obj_request; 2363bf0d5f50SAlex Elder 236437206ee5SAlex Elder dout("%s: img %p\n", __func__, img_request); 2365bf0d5f50SAlex Elder 2366663ae2ccSIlya Dryomov rbd_img_request_get(img_request); 2367efbd1a11SIlya Dryomov for_each_obj_request(img_request, obj_request) 23683da691bfSIlya Dryomov rbd_obj_request_submit(obj_request); 2369bf0d5f50SAlex Elder 2370663ae2ccSIlya Dryomov rbd_img_request_put(img_request); 2371bf0d5f50SAlex Elder } 2372bf0d5f50SAlex Elder 237386bd7998SIlya Dryomov static int rbd_obj_read_from_parent(struct rbd_obj_request *obj_req) 23743da691bfSIlya Dryomov { 23753da691bfSIlya Dryomov struct rbd_img_request *img_req = obj_req->img_request; 23763da691bfSIlya Dryomov struct rbd_img_request *child_img_req; 23773da691bfSIlya Dryomov int ret; 23783da691bfSIlya Dryomov 2379e93aca0aSIlya Dryomov child_img_req = rbd_img_request_create(img_req->rbd_dev->parent, 2380e93aca0aSIlya Dryomov OBJ_OP_READ, NULL); 23813da691bfSIlya Dryomov if (!child_img_req) 23823da691bfSIlya Dryomov return -ENOMEM; 23833da691bfSIlya Dryomov 2384e93aca0aSIlya Dryomov __set_bit(IMG_REQ_CHILD, &child_img_req->flags); 2385e93aca0aSIlya Dryomov child_img_req->obj_request = obj_req; 2386e93aca0aSIlya Dryomov 23873da691bfSIlya Dryomov if (!rbd_img_is_write(img_req)) { 2388ecc633caSIlya Dryomov switch (img_req->data_type) { 23893da691bfSIlya Dryomov case OBJ_REQUEST_BIO: 23905a237819SIlya Dryomov ret = __rbd_img_fill_from_bio(child_img_req, 23915a237819SIlya Dryomov obj_req->img_extents, 23925a237819SIlya Dryomov obj_req->num_img_extents, 23933da691bfSIlya Dryomov &obj_req->bio_pos); 23943da691bfSIlya Dryomov break; 23953da691bfSIlya Dryomov case OBJ_REQUEST_BVECS: 2396afb97888SIlya Dryomov case OBJ_REQUEST_OWN_BVECS: 23975a237819SIlya Dryomov ret = __rbd_img_fill_from_bvecs(child_img_req, 23985a237819SIlya Dryomov obj_req->img_extents, 23995a237819SIlya Dryomov obj_req->num_img_extents, 24003da691bfSIlya Dryomov &obj_req->bvec_pos); 24013da691bfSIlya Dryomov break; 24023da691bfSIlya Dryomov default: 24033da691bfSIlya Dryomov rbd_assert(0); 24043da691bfSIlya Dryomov } 24053da691bfSIlya Dryomov } else { 24065a237819SIlya Dryomov ret = rbd_img_fill_from_bvecs(child_img_req, 24075a237819SIlya Dryomov obj_req->img_extents, 24085a237819SIlya Dryomov obj_req->num_img_extents, 24095a237819SIlya Dryomov obj_req->copyup_bvecs); 24103da691bfSIlya Dryomov } 24113da691bfSIlya Dryomov if (ret) { 24123da691bfSIlya Dryomov rbd_img_request_put(child_img_req); 2413663ae2ccSIlya Dryomov return ret; 2414bf0d5f50SAlex Elder } 2415bf0d5f50SAlex Elder 24163da691bfSIlya Dryomov rbd_img_request_submit(child_img_req); 24173da691bfSIlya Dryomov return 0; 24183da691bfSIlya Dryomov } 24193da691bfSIlya Dryomov 24203da691bfSIlya Dryomov static bool rbd_obj_handle_read(struct rbd_obj_request *obj_req) 24218b3e1a56SAlex Elder { 24223da691bfSIlya Dryomov struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev; 24233da691bfSIlya Dryomov int ret; 24248b3e1a56SAlex Elder 24253da691bfSIlya Dryomov if (obj_req->result == -ENOENT && 242686bd7998SIlya Dryomov rbd_dev->parent_overlap && !obj_req->tried_parent) { 242786bd7998SIlya Dryomov /* reverse map this object extent onto the parent */ 242886bd7998SIlya Dryomov ret = rbd_obj_calc_img_extents(obj_req, false); 242986bd7998SIlya Dryomov if (ret) { 243086bd7998SIlya Dryomov obj_req->result = ret; 243186bd7998SIlya Dryomov return true; 243286bd7998SIlya Dryomov } 24338b3e1a56SAlex Elder 243486bd7998SIlya Dryomov if (obj_req->num_img_extents) { 24353da691bfSIlya Dryomov obj_req->tried_parent = true; 243686bd7998SIlya Dryomov ret = rbd_obj_read_from_parent(obj_req); 24373da691bfSIlya Dryomov if (ret) { 24383da691bfSIlya Dryomov obj_req->result = ret; 24393da691bfSIlya Dryomov return true; 24403da691bfSIlya Dryomov } 24413da691bfSIlya Dryomov return false; 24423da691bfSIlya Dryomov } 244386bd7998SIlya Dryomov } 244402c74fbaSAlex Elder 244502c74fbaSAlex Elder /* 24463da691bfSIlya Dryomov * -ENOENT means a hole in the image -- zero-fill the entire 24473da691bfSIlya Dryomov * length of the request. A short read also implies zero-fill 24483da691bfSIlya Dryomov * to the end of the request. In both cases we update xferred 24493da691bfSIlya Dryomov * count to indicate the whole request was satisfied. 245002c74fbaSAlex Elder */ 24513da691bfSIlya Dryomov if (obj_req->result == -ENOENT || 245243df3d35SIlya Dryomov (!obj_req->result && obj_req->xferred < obj_req->ex.oe_len)) { 24533da691bfSIlya Dryomov rbd_assert(!obj_req->xferred || !obj_req->result); 24543da691bfSIlya Dryomov rbd_obj_zero_range(obj_req, obj_req->xferred, 245543df3d35SIlya Dryomov obj_req->ex.oe_len - obj_req->xferred); 24563da691bfSIlya Dryomov obj_req->result = 0; 245743df3d35SIlya Dryomov obj_req->xferred = obj_req->ex.oe_len; 24583da691bfSIlya Dryomov } 24593da691bfSIlya Dryomov 24603da691bfSIlya Dryomov return true; 24613da691bfSIlya Dryomov } 24623da691bfSIlya Dryomov 24633da691bfSIlya Dryomov /* 24643da691bfSIlya Dryomov * copyup_bvecs pages are never highmem pages 24653da691bfSIlya Dryomov */ 24663da691bfSIlya Dryomov static bool is_zero_bvecs(struct bio_vec *bvecs, u32 bytes) 24673da691bfSIlya Dryomov { 24683da691bfSIlya Dryomov struct ceph_bvec_iter it = { 24693da691bfSIlya Dryomov .bvecs = bvecs, 24703da691bfSIlya Dryomov .iter = { .bi_size = bytes }, 24713da691bfSIlya Dryomov }; 24723da691bfSIlya Dryomov 24733da691bfSIlya Dryomov ceph_bvec_iter_advance_step(&it, bytes, ({ 24743da691bfSIlya Dryomov if (memchr_inv(page_address(bv.bv_page) + bv.bv_offset, 0, 24753da691bfSIlya Dryomov bv.bv_len)) 24763da691bfSIlya Dryomov return false; 24773da691bfSIlya Dryomov })); 24783da691bfSIlya Dryomov return true; 24793da691bfSIlya Dryomov } 24803da691bfSIlya Dryomov 24813a482501SIlya Dryomov #define MODS_ONLY U32_MAX 24823a482501SIlya Dryomov 248389a59c1cSIlya Dryomov static int rbd_obj_issue_copyup_empty_snapc(struct rbd_obj_request *obj_req, 248489a59c1cSIlya Dryomov u32 bytes) 248589a59c1cSIlya Dryomov { 248689a59c1cSIlya Dryomov int ret; 248789a59c1cSIlya Dryomov 248889a59c1cSIlya Dryomov dout("%s obj_req %p bytes %u\n", __func__, obj_req, bytes); 248989a59c1cSIlya Dryomov rbd_assert(obj_req->osd_req->r_ops[0].op == CEPH_OSD_OP_STAT); 249089a59c1cSIlya Dryomov rbd_assert(bytes > 0 && bytes != MODS_ONLY); 249189a59c1cSIlya Dryomov rbd_osd_req_destroy(obj_req->osd_req); 249289a59c1cSIlya Dryomov 249389a59c1cSIlya Dryomov obj_req->osd_req = __rbd_osd_req_create(obj_req, &rbd_empty_snapc, 1); 249489a59c1cSIlya Dryomov if (!obj_req->osd_req) 249589a59c1cSIlya Dryomov return -ENOMEM; 249689a59c1cSIlya Dryomov 249789a59c1cSIlya Dryomov ret = osd_req_op_cls_init(obj_req->osd_req, 0, "rbd", "copyup"); 249889a59c1cSIlya Dryomov if (ret) 249989a59c1cSIlya Dryomov return ret; 250089a59c1cSIlya Dryomov 250189a59c1cSIlya Dryomov osd_req_op_cls_request_data_bvecs(obj_req->osd_req, 0, 250289a59c1cSIlya Dryomov obj_req->copyup_bvecs, 250389a59c1cSIlya Dryomov obj_req->copyup_bvec_count, 250489a59c1cSIlya Dryomov bytes); 250589a59c1cSIlya Dryomov rbd_osd_req_format_write(obj_req); 250689a59c1cSIlya Dryomov 250789a59c1cSIlya Dryomov ret = ceph_osdc_alloc_messages(obj_req->osd_req, GFP_NOIO); 250889a59c1cSIlya Dryomov if (ret) 250989a59c1cSIlya Dryomov return ret; 251089a59c1cSIlya Dryomov 251189a59c1cSIlya Dryomov rbd_obj_request_submit(obj_req); 251289a59c1cSIlya Dryomov return 0; 251389a59c1cSIlya Dryomov } 251489a59c1cSIlya Dryomov 25153a482501SIlya Dryomov static int rbd_obj_issue_copyup_ops(struct rbd_obj_request *obj_req, u32 bytes) 25163da691bfSIlya Dryomov { 251713488d53SIlya Dryomov struct rbd_img_request *img_req = obj_req->img_request; 25183a482501SIlya Dryomov unsigned int num_osd_ops = (bytes != MODS_ONLY); 25193a482501SIlya Dryomov unsigned int which = 0; 2520fe943d50SChengguang Xu int ret; 25213da691bfSIlya Dryomov 25223da691bfSIlya Dryomov dout("%s obj_req %p bytes %u\n", __func__, obj_req, bytes); 252389a59c1cSIlya Dryomov rbd_assert(obj_req->osd_req->r_ops[0].op == CEPH_OSD_OP_STAT || 252489a59c1cSIlya Dryomov obj_req->osd_req->r_ops[0].op == CEPH_OSD_OP_CALL); 25253da691bfSIlya Dryomov rbd_osd_req_destroy(obj_req->osd_req); 25263da691bfSIlya Dryomov 252713488d53SIlya Dryomov switch (img_req->op_type) { 252813488d53SIlya Dryomov case OBJ_OP_WRITE: 252913488d53SIlya Dryomov num_osd_ops += count_write_ops(obj_req); 253013488d53SIlya Dryomov break; 253113488d53SIlya Dryomov case OBJ_OP_ZEROOUT: 253213488d53SIlya Dryomov num_osd_ops += count_zeroout_ops(obj_req); 253313488d53SIlya Dryomov break; 253413488d53SIlya Dryomov default: 253513488d53SIlya Dryomov rbd_assert(0); 253613488d53SIlya Dryomov } 253713488d53SIlya Dryomov 2538a162b308SIlya Dryomov obj_req->osd_req = rbd_osd_req_create(obj_req, num_osd_ops); 25393da691bfSIlya Dryomov if (!obj_req->osd_req) 25403da691bfSIlya Dryomov return -ENOMEM; 25413da691bfSIlya Dryomov 25423a482501SIlya Dryomov if (bytes != MODS_ONLY) { 25433a482501SIlya Dryomov ret = osd_req_op_cls_init(obj_req->osd_req, which, "rbd", 25443a482501SIlya Dryomov "copyup"); 2545fe943d50SChengguang Xu if (ret) 2546fe943d50SChengguang Xu return ret; 2547fe943d50SChengguang Xu 25483a482501SIlya Dryomov osd_req_op_cls_request_data_bvecs(obj_req->osd_req, which++, 25490010f705SIlya Dryomov obj_req->copyup_bvecs, 25500010f705SIlya Dryomov obj_req->copyup_bvec_count, 25510010f705SIlya Dryomov bytes); 25523a482501SIlya Dryomov } 25533da691bfSIlya Dryomov 255413488d53SIlya Dryomov switch (img_req->op_type) { 25553da691bfSIlya Dryomov case OBJ_OP_WRITE: 25563a482501SIlya Dryomov __rbd_obj_setup_write(obj_req, which); 25573da691bfSIlya Dryomov break; 25586484cbe9SIlya Dryomov case OBJ_OP_ZEROOUT: 25593a482501SIlya Dryomov __rbd_obj_setup_zeroout(obj_req, which); 25603da691bfSIlya Dryomov break; 25613da691bfSIlya Dryomov default: 25623da691bfSIlya Dryomov rbd_assert(0); 25633da691bfSIlya Dryomov } 25643da691bfSIlya Dryomov 256526f887e0SIlya Dryomov ret = ceph_osdc_alloc_messages(obj_req->osd_req, GFP_NOIO); 256626f887e0SIlya Dryomov if (ret) 256726f887e0SIlya Dryomov return ret; 256826f887e0SIlya Dryomov 25693da691bfSIlya Dryomov rbd_obj_request_submit(obj_req); 25703da691bfSIlya Dryomov return 0; 25713da691bfSIlya Dryomov } 25723da691bfSIlya Dryomov 25733a482501SIlya Dryomov static int rbd_obj_issue_copyup(struct rbd_obj_request *obj_req, u32 bytes) 25743a482501SIlya Dryomov { 25753a482501SIlya Dryomov /* 25763a482501SIlya Dryomov * Only send non-zero copyup data to save some I/O and network 25773a482501SIlya Dryomov * bandwidth -- zero copyup data is equivalent to the object not 25783a482501SIlya Dryomov * existing. 25793a482501SIlya Dryomov */ 25803a482501SIlya Dryomov if (is_zero_bvecs(obj_req->copyup_bvecs, bytes)) { 25813a482501SIlya Dryomov dout("%s obj_req %p detected zeroes\n", __func__, obj_req); 25823a482501SIlya Dryomov bytes = 0; 25833a482501SIlya Dryomov } 25843a482501SIlya Dryomov 258589a59c1cSIlya Dryomov if (obj_req->img_request->snapc->num_snaps && bytes > 0) { 258689a59c1cSIlya Dryomov /* 258789a59c1cSIlya Dryomov * Send a copyup request with an empty snapshot context to 258889a59c1cSIlya Dryomov * deep-copyup the object through all existing snapshots. 258989a59c1cSIlya Dryomov * A second request with the current snapshot context will be 259089a59c1cSIlya Dryomov * sent for the actual modification. 259189a59c1cSIlya Dryomov */ 259289a59c1cSIlya Dryomov obj_req->write_state = RBD_OBJ_WRITE_COPYUP_EMPTY_SNAPC; 259389a59c1cSIlya Dryomov return rbd_obj_issue_copyup_empty_snapc(obj_req, bytes); 259489a59c1cSIlya Dryomov } 259589a59c1cSIlya Dryomov 25963a482501SIlya Dryomov obj_req->write_state = RBD_OBJ_WRITE_COPYUP_OPS; 25973a482501SIlya Dryomov return rbd_obj_issue_copyup_ops(obj_req, bytes); 25983a482501SIlya Dryomov } 25993a482501SIlya Dryomov 26007e07efb1SIlya Dryomov static int setup_copyup_bvecs(struct rbd_obj_request *obj_req, u64 obj_overlap) 26017e07efb1SIlya Dryomov { 26027e07efb1SIlya Dryomov u32 i; 26037e07efb1SIlya Dryomov 26047e07efb1SIlya Dryomov rbd_assert(!obj_req->copyup_bvecs); 26057e07efb1SIlya Dryomov obj_req->copyup_bvec_count = calc_pages_for(0, obj_overlap); 26067e07efb1SIlya Dryomov obj_req->copyup_bvecs = kcalloc(obj_req->copyup_bvec_count, 26077e07efb1SIlya Dryomov sizeof(*obj_req->copyup_bvecs), 26087e07efb1SIlya Dryomov GFP_NOIO); 26097e07efb1SIlya Dryomov if (!obj_req->copyup_bvecs) 26107e07efb1SIlya Dryomov return -ENOMEM; 26117e07efb1SIlya Dryomov 26127e07efb1SIlya Dryomov for (i = 0; i < obj_req->copyup_bvec_count; i++) { 26137e07efb1SIlya Dryomov unsigned int len = min(obj_overlap, (u64)PAGE_SIZE); 26147e07efb1SIlya Dryomov 26157e07efb1SIlya Dryomov obj_req->copyup_bvecs[i].bv_page = alloc_page(GFP_NOIO); 26167e07efb1SIlya Dryomov if (!obj_req->copyup_bvecs[i].bv_page) 26177e07efb1SIlya Dryomov return -ENOMEM; 26187e07efb1SIlya Dryomov 26197e07efb1SIlya Dryomov obj_req->copyup_bvecs[i].bv_offset = 0; 26207e07efb1SIlya Dryomov obj_req->copyup_bvecs[i].bv_len = len; 26217e07efb1SIlya Dryomov obj_overlap -= len; 26227e07efb1SIlya Dryomov } 26237e07efb1SIlya Dryomov 26247e07efb1SIlya Dryomov rbd_assert(!obj_overlap); 26257e07efb1SIlya Dryomov return 0; 26267e07efb1SIlya Dryomov } 26277e07efb1SIlya Dryomov 26283da691bfSIlya Dryomov static int rbd_obj_handle_write_guard(struct rbd_obj_request *obj_req) 26293da691bfSIlya Dryomov { 26303da691bfSIlya Dryomov struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev; 26313da691bfSIlya Dryomov int ret; 26323da691bfSIlya Dryomov 263386bd7998SIlya Dryomov rbd_assert(obj_req->num_img_extents); 263486bd7998SIlya Dryomov prune_extents(obj_req->img_extents, &obj_req->num_img_extents, 263586bd7998SIlya Dryomov rbd_dev->parent_overlap); 263686bd7998SIlya Dryomov if (!obj_req->num_img_extents) { 26373da691bfSIlya Dryomov /* 26383da691bfSIlya Dryomov * The overlap has become 0 (most likely because the 26393a482501SIlya Dryomov * image has been flattened). Re-submit the original write 26403a482501SIlya Dryomov * request -- pass MODS_ONLY since the copyup isn't needed 26413a482501SIlya Dryomov * anymore. 26423da691bfSIlya Dryomov */ 26433a482501SIlya Dryomov obj_req->write_state = RBD_OBJ_WRITE_COPYUP_OPS; 26443a482501SIlya Dryomov return rbd_obj_issue_copyup_ops(obj_req, MODS_ONLY); 26453da691bfSIlya Dryomov } 26463da691bfSIlya Dryomov 264786bd7998SIlya Dryomov ret = setup_copyup_bvecs(obj_req, rbd_obj_img_extents_bytes(obj_req)); 26483da691bfSIlya Dryomov if (ret) 26493da691bfSIlya Dryomov return ret; 26503da691bfSIlya Dryomov 26513a482501SIlya Dryomov obj_req->write_state = RBD_OBJ_WRITE_READ_FROM_PARENT; 265286bd7998SIlya Dryomov return rbd_obj_read_from_parent(obj_req); 26533da691bfSIlya Dryomov } 26543da691bfSIlya Dryomov 26553da691bfSIlya Dryomov static bool rbd_obj_handle_write(struct rbd_obj_request *obj_req) 26563da691bfSIlya Dryomov { 26573da691bfSIlya Dryomov int ret; 26583da691bfSIlya Dryomov 26593da691bfSIlya Dryomov switch (obj_req->write_state) { 26603da691bfSIlya Dryomov case RBD_OBJ_WRITE_GUARD: 26613da691bfSIlya Dryomov rbd_assert(!obj_req->xferred); 26623da691bfSIlya Dryomov if (obj_req->result == -ENOENT) { 26633da691bfSIlya Dryomov /* 26643da691bfSIlya Dryomov * The target object doesn't exist. Read the data for 26653da691bfSIlya Dryomov * the entire target object up to the overlap point (if 26663da691bfSIlya Dryomov * any) from the parent, so we can use it for a copyup. 26673da691bfSIlya Dryomov */ 26683da691bfSIlya Dryomov ret = rbd_obj_handle_write_guard(obj_req); 26693da691bfSIlya Dryomov if (ret) { 26703da691bfSIlya Dryomov obj_req->result = ret; 26713da691bfSIlya Dryomov return true; 26723da691bfSIlya Dryomov } 26733da691bfSIlya Dryomov return false; 26743da691bfSIlya Dryomov } 26753da691bfSIlya Dryomov /* fall through */ 26763da691bfSIlya Dryomov case RBD_OBJ_WRITE_FLAT: 26773a482501SIlya Dryomov case RBD_OBJ_WRITE_COPYUP_OPS: 26783da691bfSIlya Dryomov if (!obj_req->result) 26793da691bfSIlya Dryomov /* 26803da691bfSIlya Dryomov * There is no such thing as a successful short 26813da691bfSIlya Dryomov * write -- indicate the whole request was satisfied. 26823da691bfSIlya Dryomov */ 268343df3d35SIlya Dryomov obj_req->xferred = obj_req->ex.oe_len; 26843da691bfSIlya Dryomov return true; 26853a482501SIlya Dryomov case RBD_OBJ_WRITE_READ_FROM_PARENT: 26863da691bfSIlya Dryomov if (obj_req->result) 26873a482501SIlya Dryomov return true; 26883da691bfSIlya Dryomov 26893da691bfSIlya Dryomov rbd_assert(obj_req->xferred); 26903da691bfSIlya Dryomov ret = rbd_obj_issue_copyup(obj_req, obj_req->xferred); 26913da691bfSIlya Dryomov if (ret) { 26923da691bfSIlya Dryomov obj_req->result = ret; 2693356889c4SIlya Dryomov obj_req->xferred = 0; 26943da691bfSIlya Dryomov return true; 26953da691bfSIlya Dryomov } 26963da691bfSIlya Dryomov return false; 269789a59c1cSIlya Dryomov case RBD_OBJ_WRITE_COPYUP_EMPTY_SNAPC: 269889a59c1cSIlya Dryomov if (obj_req->result) 269989a59c1cSIlya Dryomov return true; 270089a59c1cSIlya Dryomov 270189a59c1cSIlya Dryomov obj_req->write_state = RBD_OBJ_WRITE_COPYUP_OPS; 270289a59c1cSIlya Dryomov ret = rbd_obj_issue_copyup_ops(obj_req, MODS_ONLY); 270389a59c1cSIlya Dryomov if (ret) { 270489a59c1cSIlya Dryomov obj_req->result = ret; 270589a59c1cSIlya Dryomov return true; 270689a59c1cSIlya Dryomov } 270789a59c1cSIlya Dryomov return false; 27083da691bfSIlya Dryomov default: 2709c6244b3bSArnd Bergmann BUG(); 27103da691bfSIlya Dryomov } 27113da691bfSIlya Dryomov } 27123da691bfSIlya Dryomov 27133da691bfSIlya Dryomov /* 27143da691bfSIlya Dryomov * Returns true if @obj_req is completed, or false otherwise. 27153da691bfSIlya Dryomov */ 27163da691bfSIlya Dryomov static bool __rbd_obj_handle_request(struct rbd_obj_request *obj_req) 27173da691bfSIlya Dryomov { 27189bb0248dSIlya Dryomov switch (obj_req->img_request->op_type) { 27193da691bfSIlya Dryomov case OBJ_OP_READ: 27203da691bfSIlya Dryomov return rbd_obj_handle_read(obj_req); 27213da691bfSIlya Dryomov case OBJ_OP_WRITE: 27223da691bfSIlya Dryomov return rbd_obj_handle_write(obj_req); 27233da691bfSIlya Dryomov case OBJ_OP_DISCARD: 27246484cbe9SIlya Dryomov case OBJ_OP_ZEROOUT: 27253da691bfSIlya Dryomov if (rbd_obj_handle_write(obj_req)) { 27263da691bfSIlya Dryomov /* 27273da691bfSIlya Dryomov * Hide -ENOENT from delete/truncate/zero -- discarding 27283da691bfSIlya Dryomov * a non-existent object is not a problem. 27293da691bfSIlya Dryomov */ 27303da691bfSIlya Dryomov if (obj_req->result == -ENOENT) { 27313da691bfSIlya Dryomov obj_req->result = 0; 273243df3d35SIlya Dryomov obj_req->xferred = obj_req->ex.oe_len; 27333da691bfSIlya Dryomov } 27343da691bfSIlya Dryomov return true; 27353da691bfSIlya Dryomov } 27363da691bfSIlya Dryomov return false; 27373da691bfSIlya Dryomov default: 2738c6244b3bSArnd Bergmann BUG(); 27393da691bfSIlya Dryomov } 27403da691bfSIlya Dryomov } 27413da691bfSIlya Dryomov 27427114edacSIlya Dryomov static void rbd_obj_end_request(struct rbd_obj_request *obj_req) 27437114edacSIlya Dryomov { 27447114edacSIlya Dryomov struct rbd_img_request *img_req = obj_req->img_request; 27457114edacSIlya Dryomov 27467114edacSIlya Dryomov rbd_assert((!obj_req->result && 274743df3d35SIlya Dryomov obj_req->xferred == obj_req->ex.oe_len) || 27487114edacSIlya Dryomov (obj_req->result < 0 && !obj_req->xferred)); 27497114edacSIlya Dryomov if (!obj_req->result) { 27507114edacSIlya Dryomov img_req->xferred += obj_req->xferred; 275102c74fbaSAlex Elder return; 275202c74fbaSAlex Elder } 275302c74fbaSAlex Elder 27547114edacSIlya Dryomov rbd_warn(img_req->rbd_dev, 27557114edacSIlya Dryomov "%s at objno %llu %llu~%llu result %d xferred %llu", 275643df3d35SIlya Dryomov obj_op_name(img_req->op_type), obj_req->ex.oe_objno, 275743df3d35SIlya Dryomov obj_req->ex.oe_off, obj_req->ex.oe_len, obj_req->result, 27587114edacSIlya Dryomov obj_req->xferred); 27597114edacSIlya Dryomov if (!img_req->result) { 27607114edacSIlya Dryomov img_req->result = obj_req->result; 27617114edacSIlya Dryomov img_req->xferred = 0; 2762a9e8ba2cSAlex Elder } 27638b3e1a56SAlex Elder } 27648b3e1a56SAlex Elder 27653da691bfSIlya Dryomov static void rbd_img_end_child_request(struct rbd_img_request *img_req) 27668b3e1a56SAlex Elder { 27673da691bfSIlya Dryomov struct rbd_obj_request *obj_req = img_req->obj_request; 27688b3e1a56SAlex Elder 27693da691bfSIlya Dryomov rbd_assert(test_bit(IMG_REQ_CHILD, &img_req->flags)); 277086bd7998SIlya Dryomov rbd_assert((!img_req->result && 277186bd7998SIlya Dryomov img_req->xferred == rbd_obj_img_extents_bytes(obj_req)) || 277286bd7998SIlya Dryomov (img_req->result < 0 && !img_req->xferred)); 27738b3e1a56SAlex Elder 27743da691bfSIlya Dryomov obj_req->result = img_req->result; 27753da691bfSIlya Dryomov obj_req->xferred = img_req->xferred; 27763da691bfSIlya Dryomov rbd_img_request_put(img_req); 27777114edacSIlya Dryomov } 27788b3e1a56SAlex Elder 27797114edacSIlya Dryomov static void rbd_img_end_request(struct rbd_img_request *img_req) 27807114edacSIlya Dryomov { 27817114edacSIlya Dryomov rbd_assert(!test_bit(IMG_REQ_CHILD, &img_req->flags)); 27827114edacSIlya Dryomov rbd_assert((!img_req->result && 27837114edacSIlya Dryomov img_req->xferred == blk_rq_bytes(img_req->rq)) || 27847114edacSIlya Dryomov (img_req->result < 0 && !img_req->xferred)); 27858b3e1a56SAlex Elder 27867114edacSIlya Dryomov blk_mq_end_request(img_req->rq, 27877114edacSIlya Dryomov errno_to_blk_status(img_req->result)); 27887114edacSIlya Dryomov rbd_img_request_put(img_req); 27893da691bfSIlya Dryomov } 27908b3e1a56SAlex Elder 27913da691bfSIlya Dryomov static void rbd_obj_handle_request(struct rbd_obj_request *obj_req) 27923da691bfSIlya Dryomov { 27937114edacSIlya Dryomov struct rbd_img_request *img_req; 27947114edacSIlya Dryomov 27957114edacSIlya Dryomov again: 27963da691bfSIlya Dryomov if (!__rbd_obj_handle_request(obj_req)) 27978b3e1a56SAlex Elder return; 27983da691bfSIlya Dryomov 27997114edacSIlya Dryomov img_req = obj_req->img_request; 28007114edacSIlya Dryomov spin_lock(&img_req->completion_lock); 28017114edacSIlya Dryomov rbd_obj_end_request(obj_req); 28027114edacSIlya Dryomov rbd_assert(img_req->pending_count); 28037114edacSIlya Dryomov if (--img_req->pending_count) { 28047114edacSIlya Dryomov spin_unlock(&img_req->completion_lock); 28057114edacSIlya Dryomov return; 28067114edacSIlya Dryomov } 28077114edacSIlya Dryomov 28087114edacSIlya Dryomov spin_unlock(&img_req->completion_lock); 28097114edacSIlya Dryomov if (test_bit(IMG_REQ_CHILD, &img_req->flags)) { 28107114edacSIlya Dryomov obj_req = img_req->obj_request; 28117114edacSIlya Dryomov rbd_img_end_child_request(img_req); 28127114edacSIlya Dryomov goto again; 28137114edacSIlya Dryomov } 28147114edacSIlya Dryomov rbd_img_end_request(img_req); 28158b3e1a56SAlex Elder } 28168b3e1a56SAlex Elder 2817ed95b21aSIlya Dryomov static const struct rbd_client_id rbd_empty_cid; 2818ed95b21aSIlya Dryomov 2819ed95b21aSIlya Dryomov static bool rbd_cid_equal(const struct rbd_client_id *lhs, 2820ed95b21aSIlya Dryomov const struct rbd_client_id *rhs) 2821ed95b21aSIlya Dryomov { 2822ed95b21aSIlya Dryomov return lhs->gid == rhs->gid && lhs->handle == rhs->handle; 2823ed95b21aSIlya Dryomov } 2824ed95b21aSIlya Dryomov 2825ed95b21aSIlya Dryomov static struct rbd_client_id rbd_get_cid(struct rbd_device *rbd_dev) 2826ed95b21aSIlya Dryomov { 2827ed95b21aSIlya Dryomov struct rbd_client_id cid; 2828ed95b21aSIlya Dryomov 2829ed95b21aSIlya Dryomov mutex_lock(&rbd_dev->watch_mutex); 2830ed95b21aSIlya Dryomov cid.gid = ceph_client_gid(rbd_dev->rbd_client->client); 2831ed95b21aSIlya Dryomov cid.handle = rbd_dev->watch_cookie; 2832ed95b21aSIlya Dryomov mutex_unlock(&rbd_dev->watch_mutex); 2833ed95b21aSIlya Dryomov return cid; 2834ed95b21aSIlya Dryomov } 2835ed95b21aSIlya Dryomov 2836ed95b21aSIlya Dryomov /* 2837ed95b21aSIlya Dryomov * lock_rwsem must be held for write 2838ed95b21aSIlya Dryomov */ 2839ed95b21aSIlya Dryomov static void rbd_set_owner_cid(struct rbd_device *rbd_dev, 2840ed95b21aSIlya Dryomov const struct rbd_client_id *cid) 2841ed95b21aSIlya Dryomov { 2842ed95b21aSIlya Dryomov dout("%s rbd_dev %p %llu-%llu -> %llu-%llu\n", __func__, rbd_dev, 2843ed95b21aSIlya Dryomov rbd_dev->owner_cid.gid, rbd_dev->owner_cid.handle, 2844ed95b21aSIlya Dryomov cid->gid, cid->handle); 2845ed95b21aSIlya Dryomov rbd_dev->owner_cid = *cid; /* struct */ 2846ed95b21aSIlya Dryomov } 2847ed95b21aSIlya Dryomov 2848ed95b21aSIlya Dryomov static void format_lock_cookie(struct rbd_device *rbd_dev, char *buf) 2849ed95b21aSIlya Dryomov { 2850ed95b21aSIlya Dryomov mutex_lock(&rbd_dev->watch_mutex); 2851ed95b21aSIlya Dryomov sprintf(buf, "%s %llu", RBD_LOCK_COOKIE_PREFIX, rbd_dev->watch_cookie); 2852ed95b21aSIlya Dryomov mutex_unlock(&rbd_dev->watch_mutex); 2853ed95b21aSIlya Dryomov } 2854ed95b21aSIlya Dryomov 2855edd8ca80SFlorian Margaine static void __rbd_lock(struct rbd_device *rbd_dev, const char *cookie) 2856edd8ca80SFlorian Margaine { 2857edd8ca80SFlorian Margaine struct rbd_client_id cid = rbd_get_cid(rbd_dev); 2858edd8ca80SFlorian Margaine 2859edd8ca80SFlorian Margaine strcpy(rbd_dev->lock_cookie, cookie); 2860edd8ca80SFlorian Margaine rbd_set_owner_cid(rbd_dev, &cid); 2861edd8ca80SFlorian Margaine queue_work(rbd_dev->task_wq, &rbd_dev->acquired_lock_work); 2862edd8ca80SFlorian Margaine } 2863edd8ca80SFlorian Margaine 2864ed95b21aSIlya Dryomov /* 2865ed95b21aSIlya Dryomov * lock_rwsem must be held for write 2866ed95b21aSIlya Dryomov */ 2867ed95b21aSIlya Dryomov static int rbd_lock(struct rbd_device *rbd_dev) 2868ed95b21aSIlya Dryomov { 2869ed95b21aSIlya Dryomov struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 2870ed95b21aSIlya Dryomov char cookie[32]; 2871ed95b21aSIlya Dryomov int ret; 2872ed95b21aSIlya Dryomov 2873cbbfb0ffSIlya Dryomov WARN_ON(__rbd_is_lock_owner(rbd_dev) || 2874cbbfb0ffSIlya Dryomov rbd_dev->lock_cookie[0] != '\0'); 2875ed95b21aSIlya Dryomov 2876ed95b21aSIlya Dryomov format_lock_cookie(rbd_dev, cookie); 2877ed95b21aSIlya Dryomov ret = ceph_cls_lock(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc, 2878ed95b21aSIlya Dryomov RBD_LOCK_NAME, CEPH_CLS_LOCK_EXCLUSIVE, cookie, 2879ed95b21aSIlya Dryomov RBD_LOCK_TAG, "", 0); 2880ed95b21aSIlya Dryomov if (ret) 2881ed95b21aSIlya Dryomov return ret; 2882ed95b21aSIlya Dryomov 2883ed95b21aSIlya Dryomov rbd_dev->lock_state = RBD_LOCK_STATE_LOCKED; 2884edd8ca80SFlorian Margaine __rbd_lock(rbd_dev, cookie); 2885ed95b21aSIlya Dryomov return 0; 2886ed95b21aSIlya Dryomov } 2887ed95b21aSIlya Dryomov 2888ed95b21aSIlya Dryomov /* 2889ed95b21aSIlya Dryomov * lock_rwsem must be held for write 2890ed95b21aSIlya Dryomov */ 2891bbead745SIlya Dryomov static void rbd_unlock(struct rbd_device *rbd_dev) 2892ed95b21aSIlya Dryomov { 2893ed95b21aSIlya Dryomov struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 2894ed95b21aSIlya Dryomov int ret; 2895ed95b21aSIlya Dryomov 2896cbbfb0ffSIlya Dryomov WARN_ON(!__rbd_is_lock_owner(rbd_dev) || 2897cbbfb0ffSIlya Dryomov rbd_dev->lock_cookie[0] == '\0'); 2898ed95b21aSIlya Dryomov 2899ed95b21aSIlya Dryomov ret = ceph_cls_unlock(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc, 2900cbbfb0ffSIlya Dryomov RBD_LOCK_NAME, rbd_dev->lock_cookie); 2901bbead745SIlya Dryomov if (ret && ret != -ENOENT) 2902bbead745SIlya Dryomov rbd_warn(rbd_dev, "failed to unlock: %d", ret); 2903ed95b21aSIlya Dryomov 2904bbead745SIlya Dryomov /* treat errors as the image is unlocked */ 2905bbead745SIlya Dryomov rbd_dev->lock_state = RBD_LOCK_STATE_UNLOCKED; 2906cbbfb0ffSIlya Dryomov rbd_dev->lock_cookie[0] = '\0'; 2907ed95b21aSIlya Dryomov rbd_set_owner_cid(rbd_dev, &rbd_empty_cid); 2908ed95b21aSIlya Dryomov queue_work(rbd_dev->task_wq, &rbd_dev->released_lock_work); 2909ed95b21aSIlya Dryomov } 2910ed95b21aSIlya Dryomov 2911ed95b21aSIlya Dryomov static int __rbd_notify_op_lock(struct rbd_device *rbd_dev, 2912ed95b21aSIlya Dryomov enum rbd_notify_op notify_op, 2913ed95b21aSIlya Dryomov struct page ***preply_pages, 2914ed95b21aSIlya Dryomov size_t *preply_len) 2915ed95b21aSIlya Dryomov { 2916ed95b21aSIlya Dryomov struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 2917ed95b21aSIlya Dryomov struct rbd_client_id cid = rbd_get_cid(rbd_dev); 291808a79102SKyle Spiers char buf[4 + 8 + 8 + CEPH_ENCODING_START_BLK_LEN]; 291908a79102SKyle Spiers int buf_size = sizeof(buf); 2920ed95b21aSIlya Dryomov void *p = buf; 2921ed95b21aSIlya Dryomov 2922ed95b21aSIlya Dryomov dout("%s rbd_dev %p notify_op %d\n", __func__, rbd_dev, notify_op); 2923ed95b21aSIlya Dryomov 2924ed95b21aSIlya Dryomov /* encode *LockPayload NotifyMessage (op + ClientId) */ 2925ed95b21aSIlya Dryomov ceph_start_encoding(&p, 2, 1, buf_size - CEPH_ENCODING_START_BLK_LEN); 2926ed95b21aSIlya Dryomov ceph_encode_32(&p, notify_op); 2927ed95b21aSIlya Dryomov ceph_encode_64(&p, cid.gid); 2928ed95b21aSIlya Dryomov ceph_encode_64(&p, cid.handle); 2929ed95b21aSIlya Dryomov 2930ed95b21aSIlya Dryomov return ceph_osdc_notify(osdc, &rbd_dev->header_oid, 2931ed95b21aSIlya Dryomov &rbd_dev->header_oloc, buf, buf_size, 2932ed95b21aSIlya Dryomov RBD_NOTIFY_TIMEOUT, preply_pages, preply_len); 2933ed95b21aSIlya Dryomov } 2934ed95b21aSIlya Dryomov 2935ed95b21aSIlya Dryomov static void rbd_notify_op_lock(struct rbd_device *rbd_dev, 2936ed95b21aSIlya Dryomov enum rbd_notify_op notify_op) 2937ed95b21aSIlya Dryomov { 2938ed95b21aSIlya Dryomov struct page **reply_pages; 2939ed95b21aSIlya Dryomov size_t reply_len; 2940ed95b21aSIlya Dryomov 2941ed95b21aSIlya Dryomov __rbd_notify_op_lock(rbd_dev, notify_op, &reply_pages, &reply_len); 2942ed95b21aSIlya Dryomov ceph_release_page_vector(reply_pages, calc_pages_for(0, reply_len)); 2943ed95b21aSIlya Dryomov } 2944ed95b21aSIlya Dryomov 2945ed95b21aSIlya Dryomov static void rbd_notify_acquired_lock(struct work_struct *work) 2946ed95b21aSIlya Dryomov { 2947ed95b21aSIlya Dryomov struct rbd_device *rbd_dev = container_of(work, struct rbd_device, 2948ed95b21aSIlya Dryomov acquired_lock_work); 2949ed95b21aSIlya Dryomov 2950ed95b21aSIlya Dryomov rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_ACQUIRED_LOCK); 2951ed95b21aSIlya Dryomov } 2952ed95b21aSIlya Dryomov 2953ed95b21aSIlya Dryomov static void rbd_notify_released_lock(struct work_struct *work) 2954ed95b21aSIlya Dryomov { 2955ed95b21aSIlya Dryomov struct rbd_device *rbd_dev = container_of(work, struct rbd_device, 2956ed95b21aSIlya Dryomov released_lock_work); 2957ed95b21aSIlya Dryomov 2958ed95b21aSIlya Dryomov rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_RELEASED_LOCK); 2959ed95b21aSIlya Dryomov } 2960ed95b21aSIlya Dryomov 2961ed95b21aSIlya Dryomov static int rbd_request_lock(struct rbd_device *rbd_dev) 2962ed95b21aSIlya Dryomov { 2963ed95b21aSIlya Dryomov struct page **reply_pages; 2964ed95b21aSIlya Dryomov size_t reply_len; 2965ed95b21aSIlya Dryomov bool lock_owner_responded = false; 2966ed95b21aSIlya Dryomov int ret; 2967ed95b21aSIlya Dryomov 2968ed95b21aSIlya Dryomov dout("%s rbd_dev %p\n", __func__, rbd_dev); 2969ed95b21aSIlya Dryomov 2970ed95b21aSIlya Dryomov ret = __rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_REQUEST_LOCK, 2971ed95b21aSIlya Dryomov &reply_pages, &reply_len); 2972ed95b21aSIlya Dryomov if (ret && ret != -ETIMEDOUT) { 2973ed95b21aSIlya Dryomov rbd_warn(rbd_dev, "failed to request lock: %d", ret); 2974ed95b21aSIlya Dryomov goto out; 2975ed95b21aSIlya Dryomov } 2976ed95b21aSIlya Dryomov 2977ed95b21aSIlya Dryomov if (reply_len > 0 && reply_len <= PAGE_SIZE) { 2978ed95b21aSIlya Dryomov void *p = page_address(reply_pages[0]); 2979ed95b21aSIlya Dryomov void *const end = p + reply_len; 2980ed95b21aSIlya Dryomov u32 n; 2981ed95b21aSIlya Dryomov 2982ed95b21aSIlya Dryomov ceph_decode_32_safe(&p, end, n, e_inval); /* num_acks */ 2983ed95b21aSIlya Dryomov while (n--) { 2984ed95b21aSIlya Dryomov u8 struct_v; 2985ed95b21aSIlya Dryomov u32 len; 2986ed95b21aSIlya Dryomov 2987ed95b21aSIlya Dryomov ceph_decode_need(&p, end, 8 + 8, e_inval); 2988ed95b21aSIlya Dryomov p += 8 + 8; /* skip gid and cookie */ 2989ed95b21aSIlya Dryomov 2990ed95b21aSIlya Dryomov ceph_decode_32_safe(&p, end, len, e_inval); 2991ed95b21aSIlya Dryomov if (!len) 2992ed95b21aSIlya Dryomov continue; 2993ed95b21aSIlya Dryomov 2994ed95b21aSIlya Dryomov if (lock_owner_responded) { 2995ed95b21aSIlya Dryomov rbd_warn(rbd_dev, 2996ed95b21aSIlya Dryomov "duplicate lock owners detected"); 2997ed95b21aSIlya Dryomov ret = -EIO; 2998ed95b21aSIlya Dryomov goto out; 2999ed95b21aSIlya Dryomov } 3000ed95b21aSIlya Dryomov 3001ed95b21aSIlya Dryomov lock_owner_responded = true; 3002ed95b21aSIlya Dryomov ret = ceph_start_decoding(&p, end, 1, "ResponseMessage", 3003ed95b21aSIlya Dryomov &struct_v, &len); 3004ed95b21aSIlya Dryomov if (ret) { 3005ed95b21aSIlya Dryomov rbd_warn(rbd_dev, 3006ed95b21aSIlya Dryomov "failed to decode ResponseMessage: %d", 3007ed95b21aSIlya Dryomov ret); 3008ed95b21aSIlya Dryomov goto e_inval; 3009ed95b21aSIlya Dryomov } 3010ed95b21aSIlya Dryomov 3011ed95b21aSIlya Dryomov ret = ceph_decode_32(&p); 3012ed95b21aSIlya Dryomov } 3013ed95b21aSIlya Dryomov } 3014ed95b21aSIlya Dryomov 3015ed95b21aSIlya Dryomov if (!lock_owner_responded) { 3016ed95b21aSIlya Dryomov rbd_warn(rbd_dev, "no lock owners detected"); 3017ed95b21aSIlya Dryomov ret = -ETIMEDOUT; 3018ed95b21aSIlya Dryomov } 3019ed95b21aSIlya Dryomov 3020ed95b21aSIlya Dryomov out: 3021ed95b21aSIlya Dryomov ceph_release_page_vector(reply_pages, calc_pages_for(0, reply_len)); 3022ed95b21aSIlya Dryomov return ret; 3023ed95b21aSIlya Dryomov 3024ed95b21aSIlya Dryomov e_inval: 3025ed95b21aSIlya Dryomov ret = -EINVAL; 3026ed95b21aSIlya Dryomov goto out; 3027ed95b21aSIlya Dryomov } 3028ed95b21aSIlya Dryomov 3029ed95b21aSIlya Dryomov static void wake_requests(struct rbd_device *rbd_dev, bool wake_all) 3030ed95b21aSIlya Dryomov { 3031ed95b21aSIlya Dryomov dout("%s rbd_dev %p wake_all %d\n", __func__, rbd_dev, wake_all); 3032ed95b21aSIlya Dryomov 3033ed95b21aSIlya Dryomov cancel_delayed_work(&rbd_dev->lock_dwork); 3034ed95b21aSIlya Dryomov if (wake_all) 3035ed95b21aSIlya Dryomov wake_up_all(&rbd_dev->lock_waitq); 3036ed95b21aSIlya Dryomov else 3037ed95b21aSIlya Dryomov wake_up(&rbd_dev->lock_waitq); 3038ed95b21aSIlya Dryomov } 3039ed95b21aSIlya Dryomov 3040ed95b21aSIlya Dryomov static int get_lock_owner_info(struct rbd_device *rbd_dev, 3041ed95b21aSIlya Dryomov struct ceph_locker **lockers, u32 *num_lockers) 3042ed95b21aSIlya Dryomov { 3043ed95b21aSIlya Dryomov struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 3044ed95b21aSIlya Dryomov u8 lock_type; 3045ed95b21aSIlya Dryomov char *lock_tag; 3046ed95b21aSIlya Dryomov int ret; 3047ed95b21aSIlya Dryomov 3048ed95b21aSIlya Dryomov dout("%s rbd_dev %p\n", __func__, rbd_dev); 3049ed95b21aSIlya Dryomov 3050ed95b21aSIlya Dryomov ret = ceph_cls_lock_info(osdc, &rbd_dev->header_oid, 3051ed95b21aSIlya Dryomov &rbd_dev->header_oloc, RBD_LOCK_NAME, 3052ed95b21aSIlya Dryomov &lock_type, &lock_tag, lockers, num_lockers); 3053ed95b21aSIlya Dryomov if (ret) 3054ed95b21aSIlya Dryomov return ret; 3055ed95b21aSIlya Dryomov 3056ed95b21aSIlya Dryomov if (*num_lockers == 0) { 3057ed95b21aSIlya Dryomov dout("%s rbd_dev %p no lockers detected\n", __func__, rbd_dev); 3058ed95b21aSIlya Dryomov goto out; 3059ed95b21aSIlya Dryomov } 3060ed95b21aSIlya Dryomov 3061ed95b21aSIlya Dryomov if (strcmp(lock_tag, RBD_LOCK_TAG)) { 3062ed95b21aSIlya Dryomov rbd_warn(rbd_dev, "locked by external mechanism, tag %s", 3063ed95b21aSIlya Dryomov lock_tag); 3064ed95b21aSIlya Dryomov ret = -EBUSY; 3065ed95b21aSIlya Dryomov goto out; 3066ed95b21aSIlya Dryomov } 3067ed95b21aSIlya Dryomov 3068ed95b21aSIlya Dryomov if (lock_type == CEPH_CLS_LOCK_SHARED) { 3069ed95b21aSIlya Dryomov rbd_warn(rbd_dev, "shared lock type detected"); 3070ed95b21aSIlya Dryomov ret = -EBUSY; 3071ed95b21aSIlya Dryomov goto out; 3072ed95b21aSIlya Dryomov } 3073ed95b21aSIlya Dryomov 3074ed95b21aSIlya Dryomov if (strncmp((*lockers)[0].id.cookie, RBD_LOCK_COOKIE_PREFIX, 3075ed95b21aSIlya Dryomov strlen(RBD_LOCK_COOKIE_PREFIX))) { 3076ed95b21aSIlya Dryomov rbd_warn(rbd_dev, "locked by external mechanism, cookie %s", 3077ed95b21aSIlya Dryomov (*lockers)[0].id.cookie); 3078ed95b21aSIlya Dryomov ret = -EBUSY; 3079ed95b21aSIlya Dryomov goto out; 3080ed95b21aSIlya Dryomov } 3081ed95b21aSIlya Dryomov 3082ed95b21aSIlya Dryomov out: 3083ed95b21aSIlya Dryomov kfree(lock_tag); 3084ed95b21aSIlya Dryomov return ret; 3085ed95b21aSIlya Dryomov } 3086ed95b21aSIlya Dryomov 3087ed95b21aSIlya Dryomov static int find_watcher(struct rbd_device *rbd_dev, 3088ed95b21aSIlya Dryomov const struct ceph_locker *locker) 3089ed95b21aSIlya Dryomov { 3090ed95b21aSIlya Dryomov struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 3091ed95b21aSIlya Dryomov struct ceph_watch_item *watchers; 3092ed95b21aSIlya Dryomov u32 num_watchers; 3093ed95b21aSIlya Dryomov u64 cookie; 3094ed95b21aSIlya Dryomov int i; 3095ed95b21aSIlya Dryomov int ret; 3096ed95b21aSIlya Dryomov 3097ed95b21aSIlya Dryomov ret = ceph_osdc_list_watchers(osdc, &rbd_dev->header_oid, 3098ed95b21aSIlya Dryomov &rbd_dev->header_oloc, &watchers, 3099ed95b21aSIlya Dryomov &num_watchers); 3100ed95b21aSIlya Dryomov if (ret) 3101ed95b21aSIlya Dryomov return ret; 3102ed95b21aSIlya Dryomov 3103ed95b21aSIlya Dryomov sscanf(locker->id.cookie, RBD_LOCK_COOKIE_PREFIX " %llu", &cookie); 3104ed95b21aSIlya Dryomov for (i = 0; i < num_watchers; i++) { 3105ed95b21aSIlya Dryomov if (!memcmp(&watchers[i].addr, &locker->info.addr, 3106ed95b21aSIlya Dryomov sizeof(locker->info.addr)) && 3107ed95b21aSIlya Dryomov watchers[i].cookie == cookie) { 3108ed95b21aSIlya Dryomov struct rbd_client_id cid = { 3109ed95b21aSIlya Dryomov .gid = le64_to_cpu(watchers[i].name.num), 3110ed95b21aSIlya Dryomov .handle = cookie, 3111ed95b21aSIlya Dryomov }; 3112ed95b21aSIlya Dryomov 3113ed95b21aSIlya Dryomov dout("%s rbd_dev %p found cid %llu-%llu\n", __func__, 3114ed95b21aSIlya Dryomov rbd_dev, cid.gid, cid.handle); 3115ed95b21aSIlya Dryomov rbd_set_owner_cid(rbd_dev, &cid); 3116ed95b21aSIlya Dryomov ret = 1; 3117ed95b21aSIlya Dryomov goto out; 3118ed95b21aSIlya Dryomov } 3119ed95b21aSIlya Dryomov } 3120ed95b21aSIlya Dryomov 3121ed95b21aSIlya Dryomov dout("%s rbd_dev %p no watchers\n", __func__, rbd_dev); 3122ed95b21aSIlya Dryomov ret = 0; 3123ed95b21aSIlya Dryomov out: 3124ed95b21aSIlya Dryomov kfree(watchers); 3125ed95b21aSIlya Dryomov return ret; 3126ed95b21aSIlya Dryomov } 3127ed95b21aSIlya Dryomov 3128ed95b21aSIlya Dryomov /* 3129ed95b21aSIlya Dryomov * lock_rwsem must be held for write 3130ed95b21aSIlya Dryomov */ 3131ed95b21aSIlya Dryomov static int rbd_try_lock(struct rbd_device *rbd_dev) 3132ed95b21aSIlya Dryomov { 3133ed95b21aSIlya Dryomov struct ceph_client *client = rbd_dev->rbd_client->client; 3134ed95b21aSIlya Dryomov struct ceph_locker *lockers; 3135ed95b21aSIlya Dryomov u32 num_lockers; 3136ed95b21aSIlya Dryomov int ret; 3137ed95b21aSIlya Dryomov 3138ed95b21aSIlya Dryomov for (;;) { 3139ed95b21aSIlya Dryomov ret = rbd_lock(rbd_dev); 3140ed95b21aSIlya Dryomov if (ret != -EBUSY) 3141ed95b21aSIlya Dryomov return ret; 3142ed95b21aSIlya Dryomov 3143ed95b21aSIlya Dryomov /* determine if the current lock holder is still alive */ 3144ed95b21aSIlya Dryomov ret = get_lock_owner_info(rbd_dev, &lockers, &num_lockers); 3145ed95b21aSIlya Dryomov if (ret) 3146ed95b21aSIlya Dryomov return ret; 3147ed95b21aSIlya Dryomov 3148ed95b21aSIlya Dryomov if (num_lockers == 0) 3149ed95b21aSIlya Dryomov goto again; 3150ed95b21aSIlya Dryomov 3151ed95b21aSIlya Dryomov ret = find_watcher(rbd_dev, lockers); 3152ed95b21aSIlya Dryomov if (ret) { 3153ed95b21aSIlya Dryomov if (ret > 0) 3154ed95b21aSIlya Dryomov ret = 0; /* have to request lock */ 3155ed95b21aSIlya Dryomov goto out; 3156ed95b21aSIlya Dryomov } 3157ed95b21aSIlya Dryomov 3158ed95b21aSIlya Dryomov rbd_warn(rbd_dev, "%s%llu seems dead, breaking lock", 3159ed95b21aSIlya Dryomov ENTITY_NAME(lockers[0].id.name)); 3160ed95b21aSIlya Dryomov 3161ed95b21aSIlya Dryomov ret = ceph_monc_blacklist_add(&client->monc, 3162ed95b21aSIlya Dryomov &lockers[0].info.addr); 3163ed95b21aSIlya Dryomov if (ret) { 3164ed95b21aSIlya Dryomov rbd_warn(rbd_dev, "blacklist of %s%llu failed: %d", 3165ed95b21aSIlya Dryomov ENTITY_NAME(lockers[0].id.name), ret); 3166ed95b21aSIlya Dryomov goto out; 3167ed95b21aSIlya Dryomov } 3168ed95b21aSIlya Dryomov 3169ed95b21aSIlya Dryomov ret = ceph_cls_break_lock(&client->osdc, &rbd_dev->header_oid, 3170ed95b21aSIlya Dryomov &rbd_dev->header_oloc, RBD_LOCK_NAME, 3171ed95b21aSIlya Dryomov lockers[0].id.cookie, 3172ed95b21aSIlya Dryomov &lockers[0].id.name); 3173ed95b21aSIlya Dryomov if (ret && ret != -ENOENT) 3174ed95b21aSIlya Dryomov goto out; 3175ed95b21aSIlya Dryomov 3176ed95b21aSIlya Dryomov again: 3177ed95b21aSIlya Dryomov ceph_free_lockers(lockers, num_lockers); 3178ed95b21aSIlya Dryomov } 3179ed95b21aSIlya Dryomov 3180ed95b21aSIlya Dryomov out: 3181ed95b21aSIlya Dryomov ceph_free_lockers(lockers, num_lockers); 3182ed95b21aSIlya Dryomov return ret; 3183ed95b21aSIlya Dryomov } 3184ed95b21aSIlya Dryomov 3185ed95b21aSIlya Dryomov /* 3186ed95b21aSIlya Dryomov * ret is set only if lock_state is RBD_LOCK_STATE_UNLOCKED 3187ed95b21aSIlya Dryomov */ 3188ed95b21aSIlya Dryomov static enum rbd_lock_state rbd_try_acquire_lock(struct rbd_device *rbd_dev, 3189ed95b21aSIlya Dryomov int *pret) 3190ed95b21aSIlya Dryomov { 3191ed95b21aSIlya Dryomov enum rbd_lock_state lock_state; 3192ed95b21aSIlya Dryomov 3193ed95b21aSIlya Dryomov down_read(&rbd_dev->lock_rwsem); 3194ed95b21aSIlya Dryomov dout("%s rbd_dev %p read lock_state %d\n", __func__, rbd_dev, 3195ed95b21aSIlya Dryomov rbd_dev->lock_state); 3196ed95b21aSIlya Dryomov if (__rbd_is_lock_owner(rbd_dev)) { 3197ed95b21aSIlya Dryomov lock_state = rbd_dev->lock_state; 3198ed95b21aSIlya Dryomov up_read(&rbd_dev->lock_rwsem); 3199ed95b21aSIlya Dryomov return lock_state; 3200ed95b21aSIlya Dryomov } 3201ed95b21aSIlya Dryomov 3202ed95b21aSIlya Dryomov up_read(&rbd_dev->lock_rwsem); 3203ed95b21aSIlya Dryomov down_write(&rbd_dev->lock_rwsem); 3204ed95b21aSIlya Dryomov dout("%s rbd_dev %p write lock_state %d\n", __func__, rbd_dev, 3205ed95b21aSIlya Dryomov rbd_dev->lock_state); 3206ed95b21aSIlya Dryomov if (!__rbd_is_lock_owner(rbd_dev)) { 3207ed95b21aSIlya Dryomov *pret = rbd_try_lock(rbd_dev); 3208ed95b21aSIlya Dryomov if (*pret) 3209ed95b21aSIlya Dryomov rbd_warn(rbd_dev, "failed to acquire lock: %d", *pret); 3210ed95b21aSIlya Dryomov } 3211ed95b21aSIlya Dryomov 3212ed95b21aSIlya Dryomov lock_state = rbd_dev->lock_state; 3213ed95b21aSIlya Dryomov up_write(&rbd_dev->lock_rwsem); 3214ed95b21aSIlya Dryomov return lock_state; 3215ed95b21aSIlya Dryomov } 3216ed95b21aSIlya Dryomov 3217ed95b21aSIlya Dryomov static void rbd_acquire_lock(struct work_struct *work) 3218ed95b21aSIlya Dryomov { 3219ed95b21aSIlya Dryomov struct rbd_device *rbd_dev = container_of(to_delayed_work(work), 3220ed95b21aSIlya Dryomov struct rbd_device, lock_dwork); 3221ed95b21aSIlya Dryomov enum rbd_lock_state lock_state; 322237f13252SKefeng Wang int ret = 0; 3223ed95b21aSIlya Dryomov 3224ed95b21aSIlya Dryomov dout("%s rbd_dev %p\n", __func__, rbd_dev); 3225ed95b21aSIlya Dryomov again: 3226ed95b21aSIlya Dryomov lock_state = rbd_try_acquire_lock(rbd_dev, &ret); 3227ed95b21aSIlya Dryomov if (lock_state != RBD_LOCK_STATE_UNLOCKED || ret == -EBLACKLISTED) { 3228ed95b21aSIlya Dryomov if (lock_state == RBD_LOCK_STATE_LOCKED) 3229ed95b21aSIlya Dryomov wake_requests(rbd_dev, true); 3230ed95b21aSIlya Dryomov dout("%s rbd_dev %p lock_state %d ret %d - done\n", __func__, 3231ed95b21aSIlya Dryomov rbd_dev, lock_state, ret); 3232ed95b21aSIlya Dryomov return; 3233ed95b21aSIlya Dryomov } 3234ed95b21aSIlya Dryomov 3235ed95b21aSIlya Dryomov ret = rbd_request_lock(rbd_dev); 3236ed95b21aSIlya Dryomov if (ret == -ETIMEDOUT) { 3237ed95b21aSIlya Dryomov goto again; /* treat this as a dead client */ 3238e010dd0aSIlya Dryomov } else if (ret == -EROFS) { 3239e010dd0aSIlya Dryomov rbd_warn(rbd_dev, "peer will not release lock"); 3240e010dd0aSIlya Dryomov /* 3241e010dd0aSIlya Dryomov * If this is rbd_add_acquire_lock(), we want to fail 3242e010dd0aSIlya Dryomov * immediately -- reuse BLACKLISTED flag. Otherwise we 3243e010dd0aSIlya Dryomov * want to block. 3244e010dd0aSIlya Dryomov */ 3245e010dd0aSIlya Dryomov if (!(rbd_dev->disk->flags & GENHD_FL_UP)) { 3246e010dd0aSIlya Dryomov set_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags); 3247e010dd0aSIlya Dryomov /* wake "rbd map --exclusive" process */ 3248e010dd0aSIlya Dryomov wake_requests(rbd_dev, false); 3249e010dd0aSIlya Dryomov } 3250ed95b21aSIlya Dryomov } else if (ret < 0) { 3251ed95b21aSIlya Dryomov rbd_warn(rbd_dev, "error requesting lock: %d", ret); 3252ed95b21aSIlya Dryomov mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 3253ed95b21aSIlya Dryomov RBD_RETRY_DELAY); 3254ed95b21aSIlya Dryomov } else { 3255ed95b21aSIlya Dryomov /* 3256ed95b21aSIlya Dryomov * lock owner acked, but resend if we don't see them 3257ed95b21aSIlya Dryomov * release the lock 3258ed95b21aSIlya Dryomov */ 3259ed95b21aSIlya Dryomov dout("%s rbd_dev %p requeueing lock_dwork\n", __func__, 3260ed95b21aSIlya Dryomov rbd_dev); 3261ed95b21aSIlya Dryomov mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 3262ed95b21aSIlya Dryomov msecs_to_jiffies(2 * RBD_NOTIFY_TIMEOUT * MSEC_PER_SEC)); 3263ed95b21aSIlya Dryomov } 3264ed95b21aSIlya Dryomov } 3265ed95b21aSIlya Dryomov 3266ed95b21aSIlya Dryomov /* 3267ed95b21aSIlya Dryomov * lock_rwsem must be held for write 3268ed95b21aSIlya Dryomov */ 3269ed95b21aSIlya Dryomov static bool rbd_release_lock(struct rbd_device *rbd_dev) 3270ed95b21aSIlya Dryomov { 3271ed95b21aSIlya Dryomov dout("%s rbd_dev %p read lock_state %d\n", __func__, rbd_dev, 3272ed95b21aSIlya Dryomov rbd_dev->lock_state); 3273ed95b21aSIlya Dryomov if (rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED) 3274ed95b21aSIlya Dryomov return false; 3275ed95b21aSIlya Dryomov 3276ed95b21aSIlya Dryomov rbd_dev->lock_state = RBD_LOCK_STATE_RELEASING; 3277ed95b21aSIlya Dryomov downgrade_write(&rbd_dev->lock_rwsem); 3278ed95b21aSIlya Dryomov /* 3279ed95b21aSIlya Dryomov * Ensure that all in-flight IO is flushed. 3280ed95b21aSIlya Dryomov * 3281ed95b21aSIlya Dryomov * FIXME: ceph_osdc_sync() flushes the entire OSD client, which 3282ed95b21aSIlya Dryomov * may be shared with other devices. 3283ed95b21aSIlya Dryomov */ 3284ed95b21aSIlya Dryomov ceph_osdc_sync(&rbd_dev->rbd_client->client->osdc); 3285ed95b21aSIlya Dryomov up_read(&rbd_dev->lock_rwsem); 3286ed95b21aSIlya Dryomov 3287ed95b21aSIlya Dryomov down_write(&rbd_dev->lock_rwsem); 3288ed95b21aSIlya Dryomov dout("%s rbd_dev %p write lock_state %d\n", __func__, rbd_dev, 3289ed95b21aSIlya Dryomov rbd_dev->lock_state); 3290ed95b21aSIlya Dryomov if (rbd_dev->lock_state != RBD_LOCK_STATE_RELEASING) 3291ed95b21aSIlya Dryomov return false; 3292ed95b21aSIlya Dryomov 3293bbead745SIlya Dryomov rbd_unlock(rbd_dev); 3294ed95b21aSIlya Dryomov /* 3295ed95b21aSIlya Dryomov * Give others a chance to grab the lock - we would re-acquire 3296ed95b21aSIlya Dryomov * almost immediately if we got new IO during ceph_osdc_sync() 3297ed95b21aSIlya Dryomov * otherwise. We need to ack our own notifications, so this 3298ed95b21aSIlya Dryomov * lock_dwork will be requeued from rbd_wait_state_locked() 3299ed95b21aSIlya Dryomov * after wake_requests() in rbd_handle_released_lock(). 3300ed95b21aSIlya Dryomov */ 3301ed95b21aSIlya Dryomov cancel_delayed_work(&rbd_dev->lock_dwork); 3302ed95b21aSIlya Dryomov return true; 3303ed95b21aSIlya Dryomov } 3304ed95b21aSIlya Dryomov 3305ed95b21aSIlya Dryomov static void rbd_release_lock_work(struct work_struct *work) 3306ed95b21aSIlya Dryomov { 3307ed95b21aSIlya Dryomov struct rbd_device *rbd_dev = container_of(work, struct rbd_device, 3308ed95b21aSIlya Dryomov unlock_work); 3309ed95b21aSIlya Dryomov 3310ed95b21aSIlya Dryomov down_write(&rbd_dev->lock_rwsem); 3311ed95b21aSIlya Dryomov rbd_release_lock(rbd_dev); 3312ed95b21aSIlya Dryomov up_write(&rbd_dev->lock_rwsem); 3313ed95b21aSIlya Dryomov } 3314ed95b21aSIlya Dryomov 3315ed95b21aSIlya Dryomov static void rbd_handle_acquired_lock(struct rbd_device *rbd_dev, u8 struct_v, 3316ed95b21aSIlya Dryomov void **p) 3317ed95b21aSIlya Dryomov { 3318ed95b21aSIlya Dryomov struct rbd_client_id cid = { 0 }; 3319ed95b21aSIlya Dryomov 3320ed95b21aSIlya Dryomov if (struct_v >= 2) { 3321ed95b21aSIlya Dryomov cid.gid = ceph_decode_64(p); 3322ed95b21aSIlya Dryomov cid.handle = ceph_decode_64(p); 3323ed95b21aSIlya Dryomov } 3324ed95b21aSIlya Dryomov 3325ed95b21aSIlya Dryomov dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid, 3326ed95b21aSIlya Dryomov cid.handle); 3327ed95b21aSIlya Dryomov if (!rbd_cid_equal(&cid, &rbd_empty_cid)) { 3328ed95b21aSIlya Dryomov down_write(&rbd_dev->lock_rwsem); 3329ed95b21aSIlya Dryomov if (rbd_cid_equal(&cid, &rbd_dev->owner_cid)) { 3330ed95b21aSIlya Dryomov /* 3331ed95b21aSIlya Dryomov * we already know that the remote client is 3332ed95b21aSIlya Dryomov * the owner 3333ed95b21aSIlya Dryomov */ 3334ed95b21aSIlya Dryomov up_write(&rbd_dev->lock_rwsem); 3335ed95b21aSIlya Dryomov return; 3336ed95b21aSIlya Dryomov } 3337ed95b21aSIlya Dryomov 3338ed95b21aSIlya Dryomov rbd_set_owner_cid(rbd_dev, &cid); 3339ed95b21aSIlya Dryomov downgrade_write(&rbd_dev->lock_rwsem); 3340ed95b21aSIlya Dryomov } else { 3341ed95b21aSIlya Dryomov down_read(&rbd_dev->lock_rwsem); 3342ed95b21aSIlya Dryomov } 3343ed95b21aSIlya Dryomov 3344ed95b21aSIlya Dryomov if (!__rbd_is_lock_owner(rbd_dev)) 3345ed95b21aSIlya Dryomov wake_requests(rbd_dev, false); 3346ed95b21aSIlya Dryomov up_read(&rbd_dev->lock_rwsem); 3347ed95b21aSIlya Dryomov } 3348ed95b21aSIlya Dryomov 3349ed95b21aSIlya Dryomov static void rbd_handle_released_lock(struct rbd_device *rbd_dev, u8 struct_v, 3350ed95b21aSIlya Dryomov void **p) 3351ed95b21aSIlya Dryomov { 3352ed95b21aSIlya Dryomov struct rbd_client_id cid = { 0 }; 3353ed95b21aSIlya Dryomov 3354ed95b21aSIlya Dryomov if (struct_v >= 2) { 3355ed95b21aSIlya Dryomov cid.gid = ceph_decode_64(p); 3356ed95b21aSIlya Dryomov cid.handle = ceph_decode_64(p); 3357ed95b21aSIlya Dryomov } 3358ed95b21aSIlya Dryomov 3359ed95b21aSIlya Dryomov dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid, 3360ed95b21aSIlya Dryomov cid.handle); 3361ed95b21aSIlya Dryomov if (!rbd_cid_equal(&cid, &rbd_empty_cid)) { 3362ed95b21aSIlya Dryomov down_write(&rbd_dev->lock_rwsem); 3363ed95b21aSIlya Dryomov if (!rbd_cid_equal(&cid, &rbd_dev->owner_cid)) { 3364ed95b21aSIlya Dryomov dout("%s rbd_dev %p unexpected owner, cid %llu-%llu != owner_cid %llu-%llu\n", 3365ed95b21aSIlya Dryomov __func__, rbd_dev, cid.gid, cid.handle, 3366ed95b21aSIlya Dryomov rbd_dev->owner_cid.gid, rbd_dev->owner_cid.handle); 3367ed95b21aSIlya Dryomov up_write(&rbd_dev->lock_rwsem); 3368ed95b21aSIlya Dryomov return; 3369ed95b21aSIlya Dryomov } 3370ed95b21aSIlya Dryomov 3371ed95b21aSIlya Dryomov rbd_set_owner_cid(rbd_dev, &rbd_empty_cid); 3372ed95b21aSIlya Dryomov downgrade_write(&rbd_dev->lock_rwsem); 3373ed95b21aSIlya Dryomov } else { 3374ed95b21aSIlya Dryomov down_read(&rbd_dev->lock_rwsem); 3375ed95b21aSIlya Dryomov } 3376ed95b21aSIlya Dryomov 3377ed95b21aSIlya Dryomov if (!__rbd_is_lock_owner(rbd_dev)) 3378ed95b21aSIlya Dryomov wake_requests(rbd_dev, false); 3379ed95b21aSIlya Dryomov up_read(&rbd_dev->lock_rwsem); 3380ed95b21aSIlya Dryomov } 3381ed95b21aSIlya Dryomov 33823b77faa0SIlya Dryomov /* 33833b77faa0SIlya Dryomov * Returns result for ResponseMessage to be encoded (<= 0), or 1 if no 33843b77faa0SIlya Dryomov * ResponseMessage is needed. 33853b77faa0SIlya Dryomov */ 33863b77faa0SIlya Dryomov static int rbd_handle_request_lock(struct rbd_device *rbd_dev, u8 struct_v, 3387ed95b21aSIlya Dryomov void **p) 3388ed95b21aSIlya Dryomov { 3389ed95b21aSIlya Dryomov struct rbd_client_id my_cid = rbd_get_cid(rbd_dev); 3390ed95b21aSIlya Dryomov struct rbd_client_id cid = { 0 }; 33913b77faa0SIlya Dryomov int result = 1; 3392ed95b21aSIlya Dryomov 3393ed95b21aSIlya Dryomov if (struct_v >= 2) { 3394ed95b21aSIlya Dryomov cid.gid = ceph_decode_64(p); 3395ed95b21aSIlya Dryomov cid.handle = ceph_decode_64(p); 3396ed95b21aSIlya Dryomov } 3397ed95b21aSIlya Dryomov 3398ed95b21aSIlya Dryomov dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid, 3399ed95b21aSIlya Dryomov cid.handle); 3400ed95b21aSIlya Dryomov if (rbd_cid_equal(&cid, &my_cid)) 34013b77faa0SIlya Dryomov return result; 3402ed95b21aSIlya Dryomov 3403ed95b21aSIlya Dryomov down_read(&rbd_dev->lock_rwsem); 34043b77faa0SIlya Dryomov if (__rbd_is_lock_owner(rbd_dev)) { 34053b77faa0SIlya Dryomov if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED && 34063b77faa0SIlya Dryomov rbd_cid_equal(&rbd_dev->owner_cid, &rbd_empty_cid)) 34073b77faa0SIlya Dryomov goto out_unlock; 34083b77faa0SIlya Dryomov 34093b77faa0SIlya Dryomov /* 34103b77faa0SIlya Dryomov * encode ResponseMessage(0) so the peer can detect 34113b77faa0SIlya Dryomov * a missing owner 34123b77faa0SIlya Dryomov */ 34133b77faa0SIlya Dryomov result = 0; 34143b77faa0SIlya Dryomov 3415ed95b21aSIlya Dryomov if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED) { 3416e010dd0aSIlya Dryomov if (!rbd_dev->opts->exclusive) { 3417e010dd0aSIlya Dryomov dout("%s rbd_dev %p queueing unlock_work\n", 3418e010dd0aSIlya Dryomov __func__, rbd_dev); 3419e010dd0aSIlya Dryomov queue_work(rbd_dev->task_wq, 3420e010dd0aSIlya Dryomov &rbd_dev->unlock_work); 3421e010dd0aSIlya Dryomov } else { 3422e010dd0aSIlya Dryomov /* refuse to release the lock */ 3423e010dd0aSIlya Dryomov result = -EROFS; 3424ed95b21aSIlya Dryomov } 3425ed95b21aSIlya Dryomov } 3426ed95b21aSIlya Dryomov } 34273b77faa0SIlya Dryomov 34283b77faa0SIlya Dryomov out_unlock: 3429ed95b21aSIlya Dryomov up_read(&rbd_dev->lock_rwsem); 34303b77faa0SIlya Dryomov return result; 3431ed95b21aSIlya Dryomov } 3432ed95b21aSIlya Dryomov 3433ed95b21aSIlya Dryomov static void __rbd_acknowledge_notify(struct rbd_device *rbd_dev, 3434ed95b21aSIlya Dryomov u64 notify_id, u64 cookie, s32 *result) 3435ed95b21aSIlya Dryomov { 3436ed95b21aSIlya Dryomov struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 343708a79102SKyle Spiers char buf[4 + CEPH_ENCODING_START_BLK_LEN]; 343808a79102SKyle Spiers int buf_size = sizeof(buf); 3439ed95b21aSIlya Dryomov int ret; 3440ed95b21aSIlya Dryomov 3441ed95b21aSIlya Dryomov if (result) { 3442ed95b21aSIlya Dryomov void *p = buf; 3443ed95b21aSIlya Dryomov 3444ed95b21aSIlya Dryomov /* encode ResponseMessage */ 3445ed95b21aSIlya Dryomov ceph_start_encoding(&p, 1, 1, 3446ed95b21aSIlya Dryomov buf_size - CEPH_ENCODING_START_BLK_LEN); 3447ed95b21aSIlya Dryomov ceph_encode_32(&p, *result); 3448ed95b21aSIlya Dryomov } else { 3449ed95b21aSIlya Dryomov buf_size = 0; 3450ed95b21aSIlya Dryomov } 3451ed95b21aSIlya Dryomov 3452ed95b21aSIlya Dryomov ret = ceph_osdc_notify_ack(osdc, &rbd_dev->header_oid, 3453ed95b21aSIlya Dryomov &rbd_dev->header_oloc, notify_id, cookie, 3454ed95b21aSIlya Dryomov buf, buf_size); 3455ed95b21aSIlya Dryomov if (ret) 3456ed95b21aSIlya Dryomov rbd_warn(rbd_dev, "acknowledge_notify failed: %d", ret); 3457ed95b21aSIlya Dryomov } 3458ed95b21aSIlya Dryomov 3459ed95b21aSIlya Dryomov static void rbd_acknowledge_notify(struct rbd_device *rbd_dev, u64 notify_id, 3460ed95b21aSIlya Dryomov u64 cookie) 3461ed95b21aSIlya Dryomov { 3462ed95b21aSIlya Dryomov dout("%s rbd_dev %p\n", __func__, rbd_dev); 3463ed95b21aSIlya Dryomov __rbd_acknowledge_notify(rbd_dev, notify_id, cookie, NULL); 3464ed95b21aSIlya Dryomov } 3465ed95b21aSIlya Dryomov 3466ed95b21aSIlya Dryomov static void rbd_acknowledge_notify_result(struct rbd_device *rbd_dev, 3467ed95b21aSIlya Dryomov u64 notify_id, u64 cookie, s32 result) 3468ed95b21aSIlya Dryomov { 3469ed95b21aSIlya Dryomov dout("%s rbd_dev %p result %d\n", __func__, rbd_dev, result); 3470ed95b21aSIlya Dryomov __rbd_acknowledge_notify(rbd_dev, notify_id, cookie, &result); 3471ed95b21aSIlya Dryomov } 3472922dab61SIlya Dryomov 3473922dab61SIlya Dryomov static void rbd_watch_cb(void *arg, u64 notify_id, u64 cookie, 3474922dab61SIlya Dryomov u64 notifier_id, void *data, size_t data_len) 3475b8d70035SAlex Elder { 3476922dab61SIlya Dryomov struct rbd_device *rbd_dev = arg; 3477ed95b21aSIlya Dryomov void *p = data; 3478ed95b21aSIlya Dryomov void *const end = p + data_len; 3479d4c2269bSIlya Dryomov u8 struct_v = 0; 3480ed95b21aSIlya Dryomov u32 len; 3481ed95b21aSIlya Dryomov u32 notify_op; 3482b8d70035SAlex Elder int ret; 3483b8d70035SAlex Elder 3484ed95b21aSIlya Dryomov dout("%s rbd_dev %p cookie %llu notify_id %llu data_len %zu\n", 3485ed95b21aSIlya Dryomov __func__, rbd_dev, cookie, notify_id, data_len); 3486ed95b21aSIlya Dryomov if (data_len) { 3487ed95b21aSIlya Dryomov ret = ceph_start_decoding(&p, end, 1, "NotifyMessage", 3488ed95b21aSIlya Dryomov &struct_v, &len); 3489ed95b21aSIlya Dryomov if (ret) { 3490ed95b21aSIlya Dryomov rbd_warn(rbd_dev, "failed to decode NotifyMessage: %d", 3491ed95b21aSIlya Dryomov ret); 3492ed95b21aSIlya Dryomov return; 3493ed95b21aSIlya Dryomov } 349452bb1f9bSIlya Dryomov 3495ed95b21aSIlya Dryomov notify_op = ceph_decode_32(&p); 3496ed95b21aSIlya Dryomov } else { 3497ed95b21aSIlya Dryomov /* legacy notification for header updates */ 3498ed95b21aSIlya Dryomov notify_op = RBD_NOTIFY_OP_HEADER_UPDATE; 3499ed95b21aSIlya Dryomov len = 0; 3500ed95b21aSIlya Dryomov } 3501ed95b21aSIlya Dryomov 3502ed95b21aSIlya Dryomov dout("%s rbd_dev %p notify_op %u\n", __func__, rbd_dev, notify_op); 3503ed95b21aSIlya Dryomov switch (notify_op) { 3504ed95b21aSIlya Dryomov case RBD_NOTIFY_OP_ACQUIRED_LOCK: 3505ed95b21aSIlya Dryomov rbd_handle_acquired_lock(rbd_dev, struct_v, &p); 3506ed95b21aSIlya Dryomov rbd_acknowledge_notify(rbd_dev, notify_id, cookie); 3507ed95b21aSIlya Dryomov break; 3508ed95b21aSIlya Dryomov case RBD_NOTIFY_OP_RELEASED_LOCK: 3509ed95b21aSIlya Dryomov rbd_handle_released_lock(rbd_dev, struct_v, &p); 3510ed95b21aSIlya Dryomov rbd_acknowledge_notify(rbd_dev, notify_id, cookie); 3511ed95b21aSIlya Dryomov break; 3512ed95b21aSIlya Dryomov case RBD_NOTIFY_OP_REQUEST_LOCK: 35133b77faa0SIlya Dryomov ret = rbd_handle_request_lock(rbd_dev, struct_v, &p); 35143b77faa0SIlya Dryomov if (ret <= 0) 3515ed95b21aSIlya Dryomov rbd_acknowledge_notify_result(rbd_dev, notify_id, 35163b77faa0SIlya Dryomov cookie, ret); 3517ed95b21aSIlya Dryomov else 3518ed95b21aSIlya Dryomov rbd_acknowledge_notify(rbd_dev, notify_id, cookie); 3519ed95b21aSIlya Dryomov break; 3520ed95b21aSIlya Dryomov case RBD_NOTIFY_OP_HEADER_UPDATE: 3521e627db08SAlex Elder ret = rbd_dev_refresh(rbd_dev); 3522e627db08SAlex Elder if (ret) 35239584d508SIlya Dryomov rbd_warn(rbd_dev, "refresh failed: %d", ret); 3524b8d70035SAlex Elder 3525ed95b21aSIlya Dryomov rbd_acknowledge_notify(rbd_dev, notify_id, cookie); 3526ed95b21aSIlya Dryomov break; 3527ed95b21aSIlya Dryomov default: 3528ed95b21aSIlya Dryomov if (rbd_is_lock_owner(rbd_dev)) 3529ed95b21aSIlya Dryomov rbd_acknowledge_notify_result(rbd_dev, notify_id, 3530ed95b21aSIlya Dryomov cookie, -EOPNOTSUPP); 3531ed95b21aSIlya Dryomov else 3532ed95b21aSIlya Dryomov rbd_acknowledge_notify(rbd_dev, notify_id, cookie); 3533ed95b21aSIlya Dryomov break; 3534b8d70035SAlex Elder } 3535b8d70035SAlex Elder } 3536b8d70035SAlex Elder 353799d16943SIlya Dryomov static void __rbd_unregister_watch(struct rbd_device *rbd_dev); 35389969ebc5SAlex Elder 3539922dab61SIlya Dryomov static void rbd_watch_errcb(void *arg, u64 cookie, int err) 3540bb040aa0SIlya Dryomov { 3541922dab61SIlya Dryomov struct rbd_device *rbd_dev = arg; 3542bb040aa0SIlya Dryomov 3543922dab61SIlya Dryomov rbd_warn(rbd_dev, "encountered watch error: %d", err); 3544bb040aa0SIlya Dryomov 3545ed95b21aSIlya Dryomov down_write(&rbd_dev->lock_rwsem); 3546ed95b21aSIlya Dryomov rbd_set_owner_cid(rbd_dev, &rbd_empty_cid); 3547ed95b21aSIlya Dryomov up_write(&rbd_dev->lock_rwsem); 3548bb040aa0SIlya Dryomov 354999d16943SIlya Dryomov mutex_lock(&rbd_dev->watch_mutex); 355099d16943SIlya Dryomov if (rbd_dev->watch_state == RBD_WATCH_STATE_REGISTERED) { 355199d16943SIlya Dryomov __rbd_unregister_watch(rbd_dev); 355299d16943SIlya Dryomov rbd_dev->watch_state = RBD_WATCH_STATE_ERROR; 3553bb040aa0SIlya Dryomov 355499d16943SIlya Dryomov queue_delayed_work(rbd_dev->task_wq, &rbd_dev->watch_dwork, 0); 3555bb040aa0SIlya Dryomov } 355699d16943SIlya Dryomov mutex_unlock(&rbd_dev->watch_mutex); 3557bb040aa0SIlya Dryomov } 3558bb040aa0SIlya Dryomov 3559bb040aa0SIlya Dryomov /* 356099d16943SIlya Dryomov * watch_mutex must be locked 35619969ebc5SAlex Elder */ 356299d16943SIlya Dryomov static int __rbd_register_watch(struct rbd_device *rbd_dev) 35639969ebc5SAlex Elder { 35649969ebc5SAlex Elder struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 3565922dab61SIlya Dryomov struct ceph_osd_linger_request *handle; 35669969ebc5SAlex Elder 3567922dab61SIlya Dryomov rbd_assert(!rbd_dev->watch_handle); 356899d16943SIlya Dryomov dout("%s rbd_dev %p\n", __func__, rbd_dev); 35699969ebc5SAlex Elder 3570922dab61SIlya Dryomov handle = ceph_osdc_watch(osdc, &rbd_dev->header_oid, 3571922dab61SIlya Dryomov &rbd_dev->header_oloc, rbd_watch_cb, 3572922dab61SIlya Dryomov rbd_watch_errcb, rbd_dev); 3573922dab61SIlya Dryomov if (IS_ERR(handle)) 3574922dab61SIlya Dryomov return PTR_ERR(handle); 35759969ebc5SAlex Elder 3576922dab61SIlya Dryomov rbd_dev->watch_handle = handle; 35778eb87565SAlex Elder return 0; 35789969ebc5SAlex Elder } 35799969ebc5SAlex Elder 358099d16943SIlya Dryomov /* 358199d16943SIlya Dryomov * watch_mutex must be locked 358299d16943SIlya Dryomov */ 358399d16943SIlya Dryomov static void __rbd_unregister_watch(struct rbd_device *rbd_dev) 3584fca27065SIlya Dryomov { 3585922dab61SIlya Dryomov struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 3586922dab61SIlya Dryomov int ret; 3587b30a01f2SIlya Dryomov 358899d16943SIlya Dryomov rbd_assert(rbd_dev->watch_handle); 358999d16943SIlya Dryomov dout("%s rbd_dev %p\n", __func__, rbd_dev); 3590b30a01f2SIlya Dryomov 3591922dab61SIlya Dryomov ret = ceph_osdc_unwatch(osdc, rbd_dev->watch_handle); 3592922dab61SIlya Dryomov if (ret) 3593922dab61SIlya Dryomov rbd_warn(rbd_dev, "failed to unwatch: %d", ret); 3594b30a01f2SIlya Dryomov 3595922dab61SIlya Dryomov rbd_dev->watch_handle = NULL; 3596c525f036SIlya Dryomov } 3597c525f036SIlya Dryomov 359899d16943SIlya Dryomov static int rbd_register_watch(struct rbd_device *rbd_dev) 3599c525f036SIlya Dryomov { 360099d16943SIlya Dryomov int ret; 3601811c6688SIlya Dryomov 360299d16943SIlya Dryomov mutex_lock(&rbd_dev->watch_mutex); 360399d16943SIlya Dryomov rbd_assert(rbd_dev->watch_state == RBD_WATCH_STATE_UNREGISTERED); 360499d16943SIlya Dryomov ret = __rbd_register_watch(rbd_dev); 360599d16943SIlya Dryomov if (ret) 360699d16943SIlya Dryomov goto out; 360799d16943SIlya Dryomov 360899d16943SIlya Dryomov rbd_dev->watch_state = RBD_WATCH_STATE_REGISTERED; 360999d16943SIlya Dryomov rbd_dev->watch_cookie = rbd_dev->watch_handle->linger_id; 361099d16943SIlya Dryomov 361199d16943SIlya Dryomov out: 361299d16943SIlya Dryomov mutex_unlock(&rbd_dev->watch_mutex); 361399d16943SIlya Dryomov return ret; 361499d16943SIlya Dryomov } 361599d16943SIlya Dryomov 361699d16943SIlya Dryomov static void cancel_tasks_sync(struct rbd_device *rbd_dev) 361799d16943SIlya Dryomov { 361899d16943SIlya Dryomov dout("%s rbd_dev %p\n", __func__, rbd_dev); 361999d16943SIlya Dryomov 3620ed95b21aSIlya Dryomov cancel_work_sync(&rbd_dev->acquired_lock_work); 3621ed95b21aSIlya Dryomov cancel_work_sync(&rbd_dev->released_lock_work); 3622ed95b21aSIlya Dryomov cancel_delayed_work_sync(&rbd_dev->lock_dwork); 3623ed95b21aSIlya Dryomov cancel_work_sync(&rbd_dev->unlock_work); 362499d16943SIlya Dryomov } 362599d16943SIlya Dryomov 362699d16943SIlya Dryomov static void rbd_unregister_watch(struct rbd_device *rbd_dev) 362799d16943SIlya Dryomov { 3628ed95b21aSIlya Dryomov WARN_ON(waitqueue_active(&rbd_dev->lock_waitq)); 362999d16943SIlya Dryomov cancel_tasks_sync(rbd_dev); 363099d16943SIlya Dryomov 363199d16943SIlya Dryomov mutex_lock(&rbd_dev->watch_mutex); 363299d16943SIlya Dryomov if (rbd_dev->watch_state == RBD_WATCH_STATE_REGISTERED) 363399d16943SIlya Dryomov __rbd_unregister_watch(rbd_dev); 363499d16943SIlya Dryomov rbd_dev->watch_state = RBD_WATCH_STATE_UNREGISTERED; 363599d16943SIlya Dryomov mutex_unlock(&rbd_dev->watch_mutex); 363699d16943SIlya Dryomov 363723edca86SDongsheng Yang cancel_delayed_work_sync(&rbd_dev->watch_dwork); 3638811c6688SIlya Dryomov ceph_osdc_flush_notifies(&rbd_dev->rbd_client->client->osdc); 3639fca27065SIlya Dryomov } 3640fca27065SIlya Dryomov 364114bb211dSIlya Dryomov /* 364214bb211dSIlya Dryomov * lock_rwsem must be held for write 364314bb211dSIlya Dryomov */ 364414bb211dSIlya Dryomov static void rbd_reacquire_lock(struct rbd_device *rbd_dev) 364514bb211dSIlya Dryomov { 364614bb211dSIlya Dryomov struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 364714bb211dSIlya Dryomov char cookie[32]; 364814bb211dSIlya Dryomov int ret; 364914bb211dSIlya Dryomov 365014bb211dSIlya Dryomov WARN_ON(rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED); 365114bb211dSIlya Dryomov 365214bb211dSIlya Dryomov format_lock_cookie(rbd_dev, cookie); 365314bb211dSIlya Dryomov ret = ceph_cls_set_cookie(osdc, &rbd_dev->header_oid, 365414bb211dSIlya Dryomov &rbd_dev->header_oloc, RBD_LOCK_NAME, 365514bb211dSIlya Dryomov CEPH_CLS_LOCK_EXCLUSIVE, rbd_dev->lock_cookie, 365614bb211dSIlya Dryomov RBD_LOCK_TAG, cookie); 365714bb211dSIlya Dryomov if (ret) { 365814bb211dSIlya Dryomov if (ret != -EOPNOTSUPP) 365914bb211dSIlya Dryomov rbd_warn(rbd_dev, "failed to update lock cookie: %d", 366014bb211dSIlya Dryomov ret); 366114bb211dSIlya Dryomov 366214bb211dSIlya Dryomov /* 366314bb211dSIlya Dryomov * Lock cookie cannot be updated on older OSDs, so do 366414bb211dSIlya Dryomov * a manual release and queue an acquire. 366514bb211dSIlya Dryomov */ 366614bb211dSIlya Dryomov if (rbd_release_lock(rbd_dev)) 366714bb211dSIlya Dryomov queue_delayed_work(rbd_dev->task_wq, 366814bb211dSIlya Dryomov &rbd_dev->lock_dwork, 0); 366914bb211dSIlya Dryomov } else { 3670edd8ca80SFlorian Margaine __rbd_lock(rbd_dev, cookie); 367114bb211dSIlya Dryomov } 367214bb211dSIlya Dryomov } 367314bb211dSIlya Dryomov 367499d16943SIlya Dryomov static void rbd_reregister_watch(struct work_struct *work) 367599d16943SIlya Dryomov { 367699d16943SIlya Dryomov struct rbd_device *rbd_dev = container_of(to_delayed_work(work), 367799d16943SIlya Dryomov struct rbd_device, watch_dwork); 367899d16943SIlya Dryomov int ret; 367999d16943SIlya Dryomov 368099d16943SIlya Dryomov dout("%s rbd_dev %p\n", __func__, rbd_dev); 368199d16943SIlya Dryomov 368299d16943SIlya Dryomov mutex_lock(&rbd_dev->watch_mutex); 368387c0fdedSIlya Dryomov if (rbd_dev->watch_state != RBD_WATCH_STATE_ERROR) { 368487c0fdedSIlya Dryomov mutex_unlock(&rbd_dev->watch_mutex); 368514bb211dSIlya Dryomov return; 368687c0fdedSIlya Dryomov } 368799d16943SIlya Dryomov 368899d16943SIlya Dryomov ret = __rbd_register_watch(rbd_dev); 368999d16943SIlya Dryomov if (ret) { 369099d16943SIlya Dryomov rbd_warn(rbd_dev, "failed to reregister watch: %d", ret); 36914d73644bSIlya Dryomov if (ret == -EBLACKLISTED || ret == -ENOENT) { 369287c0fdedSIlya Dryomov set_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags); 369314bb211dSIlya Dryomov wake_requests(rbd_dev, true); 369487c0fdedSIlya Dryomov } else { 369599d16943SIlya Dryomov queue_delayed_work(rbd_dev->task_wq, 369699d16943SIlya Dryomov &rbd_dev->watch_dwork, 369799d16943SIlya Dryomov RBD_RETRY_DELAY); 369887c0fdedSIlya Dryomov } 369987c0fdedSIlya Dryomov mutex_unlock(&rbd_dev->watch_mutex); 370014bb211dSIlya Dryomov return; 370199d16943SIlya Dryomov } 370299d16943SIlya Dryomov 370399d16943SIlya Dryomov rbd_dev->watch_state = RBD_WATCH_STATE_REGISTERED; 370499d16943SIlya Dryomov rbd_dev->watch_cookie = rbd_dev->watch_handle->linger_id; 370599d16943SIlya Dryomov mutex_unlock(&rbd_dev->watch_mutex); 370699d16943SIlya Dryomov 370714bb211dSIlya Dryomov down_write(&rbd_dev->lock_rwsem); 370814bb211dSIlya Dryomov if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED) 370914bb211dSIlya Dryomov rbd_reacquire_lock(rbd_dev); 371014bb211dSIlya Dryomov up_write(&rbd_dev->lock_rwsem); 371114bb211dSIlya Dryomov 371299d16943SIlya Dryomov ret = rbd_dev_refresh(rbd_dev); 371399d16943SIlya Dryomov if (ret) 3714f6870cc9SColin Ian King rbd_warn(rbd_dev, "reregistration refresh failed: %d", ret); 371599d16943SIlya Dryomov } 371699d16943SIlya Dryomov 371736be9a76SAlex Elder /* 3718f40eb349SAlex Elder * Synchronous osd object method call. Returns the number of bytes 3719f40eb349SAlex Elder * returned in the outbound buffer, or a negative error code. 372036be9a76SAlex Elder */ 372136be9a76SAlex Elder static int rbd_obj_method_sync(struct rbd_device *rbd_dev, 3722ecd4a68aSIlya Dryomov struct ceph_object_id *oid, 3723ecd4a68aSIlya Dryomov struct ceph_object_locator *oloc, 372436be9a76SAlex Elder const char *method_name, 37254157976bSAlex Elder const void *outbound, 372636be9a76SAlex Elder size_t outbound_size, 37274157976bSAlex Elder void *inbound, 3728e2a58ee5SAlex Elder size_t inbound_size) 372936be9a76SAlex Elder { 3730ecd4a68aSIlya Dryomov struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 3731ecd4a68aSIlya Dryomov struct page *req_page = NULL; 3732ecd4a68aSIlya Dryomov struct page *reply_page; 373336be9a76SAlex Elder int ret; 373436be9a76SAlex Elder 373536be9a76SAlex Elder /* 37366010a451SAlex Elder * Method calls are ultimately read operations. The result 37376010a451SAlex Elder * should placed into the inbound buffer provided. They 37386010a451SAlex Elder * also supply outbound data--parameters for the object 37396010a451SAlex Elder * method. Currently if this is present it will be a 37406010a451SAlex Elder * snapshot id. 374136be9a76SAlex Elder */ 3742ecd4a68aSIlya Dryomov if (outbound) { 3743ecd4a68aSIlya Dryomov if (outbound_size > PAGE_SIZE) 3744ecd4a68aSIlya Dryomov return -E2BIG; 374536be9a76SAlex Elder 3746ecd4a68aSIlya Dryomov req_page = alloc_page(GFP_KERNEL); 3747ecd4a68aSIlya Dryomov if (!req_page) 3748ecd4a68aSIlya Dryomov return -ENOMEM; 374936be9a76SAlex Elder 3750ecd4a68aSIlya Dryomov memcpy(page_address(req_page), outbound, outbound_size); 375104017e29SAlex Elder } 3752430c28c3SAlex Elder 3753ecd4a68aSIlya Dryomov reply_page = alloc_page(GFP_KERNEL); 3754ecd4a68aSIlya Dryomov if (!reply_page) { 3755ecd4a68aSIlya Dryomov if (req_page) 3756ecd4a68aSIlya Dryomov __free_page(req_page); 3757ecd4a68aSIlya Dryomov return -ENOMEM; 3758ecd4a68aSIlya Dryomov } 375936be9a76SAlex Elder 3760ecd4a68aSIlya Dryomov ret = ceph_osdc_call(osdc, oid, oloc, RBD_DRV_NAME, method_name, 3761ecd4a68aSIlya Dryomov CEPH_OSD_FLAG_READ, req_page, outbound_size, 3762ecd4a68aSIlya Dryomov reply_page, &inbound_size); 3763ecd4a68aSIlya Dryomov if (!ret) { 3764ecd4a68aSIlya Dryomov memcpy(inbound, page_address(reply_page), inbound_size); 3765ecd4a68aSIlya Dryomov ret = inbound_size; 3766ecd4a68aSIlya Dryomov } 376757385b51SAlex Elder 3768ecd4a68aSIlya Dryomov if (req_page) 3769ecd4a68aSIlya Dryomov __free_page(req_page); 3770ecd4a68aSIlya Dryomov __free_page(reply_page); 377136be9a76SAlex Elder return ret; 377236be9a76SAlex Elder } 377336be9a76SAlex Elder 3774ed95b21aSIlya Dryomov /* 3775ed95b21aSIlya Dryomov * lock_rwsem must be held for read 3776ed95b21aSIlya Dryomov */ 37772f18d466SIlya Dryomov static int rbd_wait_state_locked(struct rbd_device *rbd_dev, bool may_acquire) 3778ed95b21aSIlya Dryomov { 3779ed95b21aSIlya Dryomov DEFINE_WAIT(wait); 378034f55d0bSDongsheng Yang unsigned long timeout; 37812f18d466SIlya Dryomov int ret = 0; 37822f18d466SIlya Dryomov 37832f18d466SIlya Dryomov if (test_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags)) 37842f18d466SIlya Dryomov return -EBLACKLISTED; 37852f18d466SIlya Dryomov 37862f18d466SIlya Dryomov if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED) 37872f18d466SIlya Dryomov return 0; 37882f18d466SIlya Dryomov 37892f18d466SIlya Dryomov if (!may_acquire) { 37902f18d466SIlya Dryomov rbd_warn(rbd_dev, "exclusive lock required"); 37912f18d466SIlya Dryomov return -EROFS; 37922f18d466SIlya Dryomov } 3793ed95b21aSIlya Dryomov 3794ed95b21aSIlya Dryomov do { 3795ed95b21aSIlya Dryomov /* 3796ed95b21aSIlya Dryomov * Note the use of mod_delayed_work() in rbd_acquire_lock() 3797ed95b21aSIlya Dryomov * and cancel_delayed_work() in wake_requests(). 3798ed95b21aSIlya Dryomov */ 3799ed95b21aSIlya Dryomov dout("%s rbd_dev %p queueing lock_dwork\n", __func__, rbd_dev); 3800ed95b21aSIlya Dryomov queue_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0); 3801ed95b21aSIlya Dryomov prepare_to_wait_exclusive(&rbd_dev->lock_waitq, &wait, 3802ed95b21aSIlya Dryomov TASK_UNINTERRUPTIBLE); 3803ed95b21aSIlya Dryomov up_read(&rbd_dev->lock_rwsem); 380434f55d0bSDongsheng Yang timeout = schedule_timeout(ceph_timeout_jiffies( 380534f55d0bSDongsheng Yang rbd_dev->opts->lock_timeout)); 3806ed95b21aSIlya Dryomov down_read(&rbd_dev->lock_rwsem); 38072f18d466SIlya Dryomov if (test_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags)) { 38082f18d466SIlya Dryomov ret = -EBLACKLISTED; 38092f18d466SIlya Dryomov break; 38102f18d466SIlya Dryomov } 381134f55d0bSDongsheng Yang if (!timeout) { 381234f55d0bSDongsheng Yang rbd_warn(rbd_dev, "timed out waiting for lock"); 381334f55d0bSDongsheng Yang ret = -ETIMEDOUT; 381434f55d0bSDongsheng Yang break; 381534f55d0bSDongsheng Yang } 38162f18d466SIlya Dryomov } while (rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED); 381787c0fdedSIlya Dryomov 3818ed95b21aSIlya Dryomov finish_wait(&rbd_dev->lock_waitq, &wait); 38192f18d466SIlya Dryomov return ret; 3820ed95b21aSIlya Dryomov } 3821ed95b21aSIlya Dryomov 38227ad18afaSChristoph Hellwig static void rbd_queue_workfn(struct work_struct *work) 3823bc1ecc65SIlya Dryomov { 38247ad18afaSChristoph Hellwig struct request *rq = blk_mq_rq_from_pdu(work); 38257ad18afaSChristoph Hellwig struct rbd_device *rbd_dev = rq->q->queuedata; 3826bc1ecc65SIlya Dryomov struct rbd_img_request *img_request; 38274e752f0aSJosh Durgin struct ceph_snap_context *snapc = NULL; 3828bc1ecc65SIlya Dryomov u64 offset = (u64)blk_rq_pos(rq) << SECTOR_SHIFT; 3829bc1ecc65SIlya Dryomov u64 length = blk_rq_bytes(rq); 38306d2940c8SGuangliang Zhao enum obj_operation_type op_type; 38314e752f0aSJosh Durgin u64 mapping_size; 383280de1912SIlya Dryomov bool must_be_locked; 3833bc1ecc65SIlya Dryomov int result; 3834bc1ecc65SIlya Dryomov 3835aebf526bSChristoph Hellwig switch (req_op(rq)) { 3836aebf526bSChristoph Hellwig case REQ_OP_DISCARD: 3837aebf526bSChristoph Hellwig op_type = OBJ_OP_DISCARD; 3838aebf526bSChristoph Hellwig break; 38396484cbe9SIlya Dryomov case REQ_OP_WRITE_ZEROES: 38406484cbe9SIlya Dryomov op_type = OBJ_OP_ZEROOUT; 38416484cbe9SIlya Dryomov break; 3842aebf526bSChristoph Hellwig case REQ_OP_WRITE: 3843aebf526bSChristoph Hellwig op_type = OBJ_OP_WRITE; 3844aebf526bSChristoph Hellwig break; 3845aebf526bSChristoph Hellwig case REQ_OP_READ: 3846aebf526bSChristoph Hellwig op_type = OBJ_OP_READ; 3847aebf526bSChristoph Hellwig break; 3848aebf526bSChristoph Hellwig default: 3849aebf526bSChristoph Hellwig dout("%s: non-fs request type %d\n", __func__, req_op(rq)); 38507ad18afaSChristoph Hellwig result = -EIO; 38517ad18afaSChristoph Hellwig goto err; 38527ad18afaSChristoph Hellwig } 38537ad18afaSChristoph Hellwig 3854bc1ecc65SIlya Dryomov /* Ignore/skip any zero-length requests */ 3855bc1ecc65SIlya Dryomov 3856bc1ecc65SIlya Dryomov if (!length) { 3857bc1ecc65SIlya Dryomov dout("%s: zero-length request\n", __func__); 3858bc1ecc65SIlya Dryomov result = 0; 3859bc1ecc65SIlya Dryomov goto err_rq; 3860bc1ecc65SIlya Dryomov } 3861bc1ecc65SIlya Dryomov 38629568c93eSIlya Dryomov rbd_assert(op_type == OBJ_OP_READ || 38639568c93eSIlya Dryomov rbd_dev->spec->snap_id == CEPH_NOSNAP); 3864bc1ecc65SIlya Dryomov 3865bc1ecc65SIlya Dryomov /* 3866bc1ecc65SIlya Dryomov * Quit early if the mapped snapshot no longer exists. It's 3867bc1ecc65SIlya Dryomov * still possible the snapshot will have disappeared by the 3868bc1ecc65SIlya Dryomov * time our request arrives at the osd, but there's no sense in 3869bc1ecc65SIlya Dryomov * sending it if we already know. 3870bc1ecc65SIlya Dryomov */ 3871bc1ecc65SIlya Dryomov if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) { 3872bc1ecc65SIlya Dryomov dout("request for non-existent snapshot"); 3873bc1ecc65SIlya Dryomov rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP); 3874bc1ecc65SIlya Dryomov result = -ENXIO; 3875bc1ecc65SIlya Dryomov goto err_rq; 3876bc1ecc65SIlya Dryomov } 3877bc1ecc65SIlya Dryomov 3878bc1ecc65SIlya Dryomov if (offset && length > U64_MAX - offset + 1) { 3879bc1ecc65SIlya Dryomov rbd_warn(rbd_dev, "bad request range (%llu~%llu)", offset, 3880bc1ecc65SIlya Dryomov length); 3881bc1ecc65SIlya Dryomov result = -EINVAL; 3882bc1ecc65SIlya Dryomov goto err_rq; /* Shouldn't happen */ 3883bc1ecc65SIlya Dryomov } 3884bc1ecc65SIlya Dryomov 38857ad18afaSChristoph Hellwig blk_mq_start_request(rq); 38867ad18afaSChristoph Hellwig 38874e752f0aSJosh Durgin down_read(&rbd_dev->header_rwsem); 38884e752f0aSJosh Durgin mapping_size = rbd_dev->mapping.size; 38896d2940c8SGuangliang Zhao if (op_type != OBJ_OP_READ) { 38904e752f0aSJosh Durgin snapc = rbd_dev->header.snapc; 38914e752f0aSJosh Durgin ceph_get_snap_context(snapc); 38924e752f0aSJosh Durgin } 38934e752f0aSJosh Durgin up_read(&rbd_dev->header_rwsem); 38944e752f0aSJosh Durgin 38954e752f0aSJosh Durgin if (offset + length > mapping_size) { 3896bc1ecc65SIlya Dryomov rbd_warn(rbd_dev, "beyond EOD (%llu~%llu > %llu)", offset, 38974e752f0aSJosh Durgin length, mapping_size); 3898bc1ecc65SIlya Dryomov result = -EIO; 3899bc1ecc65SIlya Dryomov goto err_rq; 3900bc1ecc65SIlya Dryomov } 3901bc1ecc65SIlya Dryomov 3902f9bebd58SIlya Dryomov must_be_locked = 3903f9bebd58SIlya Dryomov (rbd_dev->header.features & RBD_FEATURE_EXCLUSIVE_LOCK) && 3904f9bebd58SIlya Dryomov (op_type != OBJ_OP_READ || rbd_dev->opts->lock_on_read); 3905ed95b21aSIlya Dryomov if (must_be_locked) { 3906ed95b21aSIlya Dryomov down_read(&rbd_dev->lock_rwsem); 39072f18d466SIlya Dryomov result = rbd_wait_state_locked(rbd_dev, 39082f18d466SIlya Dryomov !rbd_dev->opts->exclusive); 39092f18d466SIlya Dryomov if (result) 3910e010dd0aSIlya Dryomov goto err_unlock; 3911e010dd0aSIlya Dryomov } 3912ed95b21aSIlya Dryomov 3913dfd9875fSIlya Dryomov img_request = rbd_img_request_create(rbd_dev, op_type, snapc); 3914bc1ecc65SIlya Dryomov if (!img_request) { 3915bc1ecc65SIlya Dryomov result = -ENOMEM; 3916ed95b21aSIlya Dryomov goto err_unlock; 3917bc1ecc65SIlya Dryomov } 3918bc1ecc65SIlya Dryomov img_request->rq = rq; 391970b16db8SIlya Dryomov snapc = NULL; /* img_request consumes a ref */ 3920bc1ecc65SIlya Dryomov 39216484cbe9SIlya Dryomov if (op_type == OBJ_OP_DISCARD || op_type == OBJ_OP_ZEROOUT) 39225a237819SIlya Dryomov result = rbd_img_fill_nodata(img_request, offset, length); 392390e98c52SGuangliang Zhao else 39245a237819SIlya Dryomov result = rbd_img_fill_from_bio(img_request, offset, length, 392590e98c52SGuangliang Zhao rq->bio); 39260c93e1b7SIlya Dryomov if (result || !img_request->pending_count) 3927bc1ecc65SIlya Dryomov goto err_img_request; 3928bc1ecc65SIlya Dryomov 3929efbd1a11SIlya Dryomov rbd_img_request_submit(img_request); 3930ed95b21aSIlya Dryomov if (must_be_locked) 3931ed95b21aSIlya Dryomov up_read(&rbd_dev->lock_rwsem); 3932bc1ecc65SIlya Dryomov return; 3933bc1ecc65SIlya Dryomov 3934bc1ecc65SIlya Dryomov err_img_request: 3935bc1ecc65SIlya Dryomov rbd_img_request_put(img_request); 3936ed95b21aSIlya Dryomov err_unlock: 3937ed95b21aSIlya Dryomov if (must_be_locked) 3938ed95b21aSIlya Dryomov up_read(&rbd_dev->lock_rwsem); 3939bc1ecc65SIlya Dryomov err_rq: 3940bc1ecc65SIlya Dryomov if (result) 3941bc1ecc65SIlya Dryomov rbd_warn(rbd_dev, "%s %llx at %llx result %d", 39426d2940c8SGuangliang Zhao obj_op_name(op_type), length, offset, result); 39434e752f0aSJosh Durgin ceph_put_snap_context(snapc); 39447ad18afaSChristoph Hellwig err: 39452a842acaSChristoph Hellwig blk_mq_end_request(rq, errno_to_blk_status(result)); 3946bc1ecc65SIlya Dryomov } 3947bc1ecc65SIlya Dryomov 3948fc17b653SChristoph Hellwig static blk_status_t rbd_queue_rq(struct blk_mq_hw_ctx *hctx, 39497ad18afaSChristoph Hellwig const struct blk_mq_queue_data *bd) 3950bc1ecc65SIlya Dryomov { 39517ad18afaSChristoph Hellwig struct request *rq = bd->rq; 39527ad18afaSChristoph Hellwig struct work_struct *work = blk_mq_rq_to_pdu(rq); 3953bc1ecc65SIlya Dryomov 39547ad18afaSChristoph Hellwig queue_work(rbd_wq, work); 3955fc17b653SChristoph Hellwig return BLK_STS_OK; 3956bf0d5f50SAlex Elder } 3957bf0d5f50SAlex Elder 3958602adf40SYehuda Sadeh static void rbd_free_disk(struct rbd_device *rbd_dev) 3959602adf40SYehuda Sadeh { 39605769ed0cSIlya Dryomov blk_cleanup_queue(rbd_dev->disk->queue); 39617ad18afaSChristoph Hellwig blk_mq_free_tag_set(&rbd_dev->tag_set); 39625769ed0cSIlya Dryomov put_disk(rbd_dev->disk); 39635769ed0cSIlya Dryomov rbd_dev->disk = NULL; 3964602adf40SYehuda Sadeh } 3965602adf40SYehuda Sadeh 3966788e2df3SAlex Elder static int rbd_obj_read_sync(struct rbd_device *rbd_dev, 3967fe5478e0SIlya Dryomov struct ceph_object_id *oid, 3968fe5478e0SIlya Dryomov struct ceph_object_locator *oloc, 3969fe5478e0SIlya Dryomov void *buf, int buf_len) 3970788e2df3SAlex Elder 3971788e2df3SAlex Elder { 3972fe5478e0SIlya Dryomov struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 3973fe5478e0SIlya Dryomov struct ceph_osd_request *req; 3974fe5478e0SIlya Dryomov struct page **pages; 3975fe5478e0SIlya Dryomov int num_pages = calc_pages_for(0, buf_len); 3976788e2df3SAlex Elder int ret; 3977788e2df3SAlex Elder 3978fe5478e0SIlya Dryomov req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_KERNEL); 3979fe5478e0SIlya Dryomov if (!req) 3980fe5478e0SIlya Dryomov return -ENOMEM; 3981788e2df3SAlex Elder 3982fe5478e0SIlya Dryomov ceph_oid_copy(&req->r_base_oid, oid); 3983fe5478e0SIlya Dryomov ceph_oloc_copy(&req->r_base_oloc, oloc); 3984fe5478e0SIlya Dryomov req->r_flags = CEPH_OSD_FLAG_READ; 3985788e2df3SAlex Elder 3986fe5478e0SIlya Dryomov pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL); 3987fe5478e0SIlya Dryomov if (IS_ERR(pages)) { 3988fe5478e0SIlya Dryomov ret = PTR_ERR(pages); 3989fe5478e0SIlya Dryomov goto out_req; 3990fe5478e0SIlya Dryomov } 39911ceae7efSAlex Elder 3992fe5478e0SIlya Dryomov osd_req_op_extent_init(req, 0, CEPH_OSD_OP_READ, 0, buf_len, 0, 0); 3993fe5478e0SIlya Dryomov osd_req_op_extent_osd_data_pages(req, 0, pages, buf_len, 0, false, 3994fe5478e0SIlya Dryomov true); 3995788e2df3SAlex Elder 399626f887e0SIlya Dryomov ret = ceph_osdc_alloc_messages(req, GFP_KERNEL); 399726f887e0SIlya Dryomov if (ret) 399826f887e0SIlya Dryomov goto out_req; 399926f887e0SIlya Dryomov 4000fe5478e0SIlya Dryomov ceph_osdc_start_request(osdc, req, false); 4001fe5478e0SIlya Dryomov ret = ceph_osdc_wait_request(osdc, req); 4002fe5478e0SIlya Dryomov if (ret >= 0) 4003fe5478e0SIlya Dryomov ceph_copy_from_page_vector(pages, buf, 0, ret); 4004fe5478e0SIlya Dryomov 4005fe5478e0SIlya Dryomov out_req: 4006fe5478e0SIlya Dryomov ceph_osdc_put_request(req); 4007788e2df3SAlex Elder return ret; 4008788e2df3SAlex Elder } 4009788e2df3SAlex Elder 4010602adf40SYehuda Sadeh /* 4011662518b1SAlex Elder * Read the complete header for the given rbd device. On successful 4012662518b1SAlex Elder * return, the rbd_dev->header field will contain up-to-date 4013662518b1SAlex Elder * information about the image. 40144156d998SAlex Elder */ 401599a41ebcSAlex Elder static int rbd_dev_v1_header_info(struct rbd_device *rbd_dev) 40164156d998SAlex Elder { 40174156d998SAlex Elder struct rbd_image_header_ondisk *ondisk = NULL; 40184156d998SAlex Elder u32 snap_count = 0; 40194156d998SAlex Elder u64 names_size = 0; 40204156d998SAlex Elder u32 want_count; 40214156d998SAlex Elder int ret; 40224156d998SAlex Elder 40234156d998SAlex Elder /* 40244156d998SAlex Elder * The complete header will include an array of its 64-bit 40254156d998SAlex Elder * snapshot ids, followed by the names of those snapshots as 40264156d998SAlex Elder * a contiguous block of NUL-terminated strings. Note that 40274156d998SAlex Elder * the number of snapshots could change by the time we read 40284156d998SAlex Elder * it in, in which case we re-read it. 40294156d998SAlex Elder */ 40304156d998SAlex Elder do { 40314156d998SAlex Elder size_t size; 40324156d998SAlex Elder 40334156d998SAlex Elder kfree(ondisk); 40344156d998SAlex Elder 40354156d998SAlex Elder size = sizeof (*ondisk); 40364156d998SAlex Elder size += snap_count * sizeof (struct rbd_image_snap_ondisk); 40374156d998SAlex Elder size += names_size; 40384156d998SAlex Elder ondisk = kmalloc(size, GFP_KERNEL); 40394156d998SAlex Elder if (!ondisk) 4040662518b1SAlex Elder return -ENOMEM; 40414156d998SAlex Elder 4042fe5478e0SIlya Dryomov ret = rbd_obj_read_sync(rbd_dev, &rbd_dev->header_oid, 4043fe5478e0SIlya Dryomov &rbd_dev->header_oloc, ondisk, size); 40444156d998SAlex Elder if (ret < 0) 4045662518b1SAlex Elder goto out; 4046c0cd10dbSAlex Elder if ((size_t)ret < size) { 40474156d998SAlex Elder ret = -ENXIO; 404806ecc6cbSAlex Elder rbd_warn(rbd_dev, "short header read (want %zd got %d)", 404906ecc6cbSAlex Elder size, ret); 4050662518b1SAlex Elder goto out; 40514156d998SAlex Elder } 40524156d998SAlex Elder if (!rbd_dev_ondisk_valid(ondisk)) { 40534156d998SAlex Elder ret = -ENXIO; 405406ecc6cbSAlex Elder rbd_warn(rbd_dev, "invalid header"); 4055662518b1SAlex Elder goto out; 40564156d998SAlex Elder } 40574156d998SAlex Elder 40584156d998SAlex Elder names_size = le64_to_cpu(ondisk->snap_names_len); 40594156d998SAlex Elder want_count = snap_count; 40604156d998SAlex Elder snap_count = le32_to_cpu(ondisk->snap_count); 40614156d998SAlex Elder } while (snap_count != want_count); 40624156d998SAlex Elder 4063662518b1SAlex Elder ret = rbd_header_from_disk(rbd_dev, ondisk); 4064662518b1SAlex Elder out: 40654156d998SAlex Elder kfree(ondisk); 40664156d998SAlex Elder 4067dfc5606dSYehuda Sadeh return ret; 4068602adf40SYehuda Sadeh } 4069602adf40SYehuda Sadeh 407015228edeSAlex Elder /* 407115228edeSAlex Elder * Clear the rbd device's EXISTS flag if the snapshot it's mapped to 407215228edeSAlex Elder * has disappeared from the (just updated) snapshot context. 407315228edeSAlex Elder */ 407415228edeSAlex Elder static void rbd_exists_validate(struct rbd_device *rbd_dev) 407515228edeSAlex Elder { 407615228edeSAlex Elder u64 snap_id; 407715228edeSAlex Elder 407815228edeSAlex Elder if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) 407915228edeSAlex Elder return; 408015228edeSAlex Elder 408115228edeSAlex Elder snap_id = rbd_dev->spec->snap_id; 408215228edeSAlex Elder if (snap_id == CEPH_NOSNAP) 408315228edeSAlex Elder return; 408415228edeSAlex Elder 408515228edeSAlex Elder if (rbd_dev_snap_index(rbd_dev, snap_id) == BAD_SNAP_INDEX) 408615228edeSAlex Elder clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags); 408715228edeSAlex Elder } 408815228edeSAlex Elder 40899875201eSJosh Durgin static void rbd_dev_update_size(struct rbd_device *rbd_dev) 40909875201eSJosh Durgin { 40919875201eSJosh Durgin sector_t size; 40929875201eSJosh Durgin 40939875201eSJosh Durgin /* 4094811c6688SIlya Dryomov * If EXISTS is not set, rbd_dev->disk may be NULL, so don't 4095811c6688SIlya Dryomov * try to update its size. If REMOVING is set, updating size 4096811c6688SIlya Dryomov * is just useless work since the device can't be opened. 40979875201eSJosh Durgin */ 4098811c6688SIlya Dryomov if (test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags) && 4099811c6688SIlya Dryomov !test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags)) { 41009875201eSJosh Durgin size = (sector_t)rbd_dev->mapping.size / SECTOR_SIZE; 41019875201eSJosh Durgin dout("setting size to %llu sectors", (unsigned long long)size); 41029875201eSJosh Durgin set_capacity(rbd_dev->disk, size); 41039875201eSJosh Durgin revalidate_disk(rbd_dev->disk); 41049875201eSJosh Durgin } 41059875201eSJosh Durgin } 41069875201eSJosh Durgin 4107cc4a38bdSAlex Elder static int rbd_dev_refresh(struct rbd_device *rbd_dev) 41081fe5e993SAlex Elder { 4109e627db08SAlex Elder u64 mapping_size; 41101fe5e993SAlex Elder int ret; 41111fe5e993SAlex Elder 4112cfbf6377SAlex Elder down_write(&rbd_dev->header_rwsem); 41133b5cf2a2SAlex Elder mapping_size = rbd_dev->mapping.size; 4114a720ae09SIlya Dryomov 4115a720ae09SIlya Dryomov ret = rbd_dev_header_info(rbd_dev); 411652bb1f9bSIlya Dryomov if (ret) 411773e39e4dSIlya Dryomov goto out; 411815228edeSAlex Elder 4119e8f59b59SIlya Dryomov /* 4120e8f59b59SIlya Dryomov * If there is a parent, see if it has disappeared due to the 4121e8f59b59SIlya Dryomov * mapped image getting flattened. 4122e8f59b59SIlya Dryomov */ 4123e8f59b59SIlya Dryomov if (rbd_dev->parent) { 4124e8f59b59SIlya Dryomov ret = rbd_dev_v2_parent_info(rbd_dev); 4125e8f59b59SIlya Dryomov if (ret) 412673e39e4dSIlya Dryomov goto out; 4127e8f59b59SIlya Dryomov } 4128e8f59b59SIlya Dryomov 41295ff1108cSIlya Dryomov if (rbd_dev->spec->snap_id == CEPH_NOSNAP) { 41305ff1108cSIlya Dryomov rbd_dev->mapping.size = rbd_dev->header.image_size; 41315ff1108cSIlya Dryomov } else { 41325ff1108cSIlya Dryomov /* validate mapped snapshot's EXISTS flag */ 413315228edeSAlex Elder rbd_exists_validate(rbd_dev); 41345ff1108cSIlya Dryomov } 41355ff1108cSIlya Dryomov 413673e39e4dSIlya Dryomov out: 4137cfbf6377SAlex Elder up_write(&rbd_dev->header_rwsem); 413873e39e4dSIlya Dryomov if (!ret && mapping_size != rbd_dev->mapping.size) 41399875201eSJosh Durgin rbd_dev_update_size(rbd_dev); 41401fe5e993SAlex Elder 414173e39e4dSIlya Dryomov return ret; 41421fe5e993SAlex Elder } 41431fe5e993SAlex Elder 4144d6296d39SChristoph Hellwig static int rbd_init_request(struct blk_mq_tag_set *set, struct request *rq, 4145d6296d39SChristoph Hellwig unsigned int hctx_idx, unsigned int numa_node) 41467ad18afaSChristoph Hellwig { 41477ad18afaSChristoph Hellwig struct work_struct *work = blk_mq_rq_to_pdu(rq); 41487ad18afaSChristoph Hellwig 41497ad18afaSChristoph Hellwig INIT_WORK(work, rbd_queue_workfn); 41507ad18afaSChristoph Hellwig return 0; 41517ad18afaSChristoph Hellwig } 41527ad18afaSChristoph Hellwig 4153f363b089SEric Biggers static const struct blk_mq_ops rbd_mq_ops = { 41547ad18afaSChristoph Hellwig .queue_rq = rbd_queue_rq, 41557ad18afaSChristoph Hellwig .init_request = rbd_init_request, 41567ad18afaSChristoph Hellwig }; 41577ad18afaSChristoph Hellwig 4158602adf40SYehuda Sadeh static int rbd_init_disk(struct rbd_device *rbd_dev) 4159602adf40SYehuda Sadeh { 4160602adf40SYehuda Sadeh struct gendisk *disk; 4161602adf40SYehuda Sadeh struct request_queue *q; 4162420efbdfSIlya Dryomov unsigned int objset_bytes = 4163420efbdfSIlya Dryomov rbd_dev->layout.object_size * rbd_dev->layout.stripe_count; 41647ad18afaSChristoph Hellwig int err; 4165602adf40SYehuda Sadeh 4166602adf40SYehuda Sadeh /* create gendisk info */ 41677e513d43SIlya Dryomov disk = alloc_disk(single_major ? 41687e513d43SIlya Dryomov (1 << RBD_SINGLE_MAJOR_PART_SHIFT) : 41697e513d43SIlya Dryomov RBD_MINORS_PER_MAJOR); 4170602adf40SYehuda Sadeh if (!disk) 41711fcdb8aaSAlex Elder return -ENOMEM; 4172602adf40SYehuda Sadeh 4173f0f8cef5SAlex Elder snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d", 4174de71a297SAlex Elder rbd_dev->dev_id); 4175602adf40SYehuda Sadeh disk->major = rbd_dev->major; 4176dd82fff1SIlya Dryomov disk->first_minor = rbd_dev->minor; 41777e513d43SIlya Dryomov if (single_major) 41787e513d43SIlya Dryomov disk->flags |= GENHD_FL_EXT_DEVT; 4179602adf40SYehuda Sadeh disk->fops = &rbd_bd_ops; 4180602adf40SYehuda Sadeh disk->private_data = rbd_dev; 4181602adf40SYehuda Sadeh 41827ad18afaSChristoph Hellwig memset(&rbd_dev->tag_set, 0, sizeof(rbd_dev->tag_set)); 41837ad18afaSChristoph Hellwig rbd_dev->tag_set.ops = &rbd_mq_ops; 4184b5584180SIlya Dryomov rbd_dev->tag_set.queue_depth = rbd_dev->opts->queue_depth; 41857ad18afaSChristoph Hellwig rbd_dev->tag_set.numa_node = NUMA_NO_NODE; 4186b5584180SIlya Dryomov rbd_dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE; 41877ad18afaSChristoph Hellwig rbd_dev->tag_set.nr_hw_queues = 1; 41887ad18afaSChristoph Hellwig rbd_dev->tag_set.cmd_size = sizeof(struct work_struct); 41897ad18afaSChristoph Hellwig 41907ad18afaSChristoph Hellwig err = blk_mq_alloc_tag_set(&rbd_dev->tag_set); 41917ad18afaSChristoph Hellwig if (err) 4192602adf40SYehuda Sadeh goto out_disk; 4193029bcbd8SJosh Durgin 41947ad18afaSChristoph Hellwig q = blk_mq_init_queue(&rbd_dev->tag_set); 41957ad18afaSChristoph Hellwig if (IS_ERR(q)) { 41967ad18afaSChristoph Hellwig err = PTR_ERR(q); 41977ad18afaSChristoph Hellwig goto out_tag_set; 41987ad18afaSChristoph Hellwig } 41997ad18afaSChristoph Hellwig 42008b904b5bSBart Van Assche blk_queue_flag_set(QUEUE_FLAG_NONROT, q); 4201d8a2c89cSIlya Dryomov /* QUEUE_FLAG_ADD_RANDOM is off by default for blk-mq */ 4202593a9e7bSAlex Elder 4203420efbdfSIlya Dryomov blk_queue_max_hw_sectors(q, objset_bytes >> SECTOR_SHIFT); 42040d9fde4fSIlya Dryomov q->limits.max_sectors = queue_max_hw_sectors(q); 420521acdf45SIlya Dryomov blk_queue_max_segments(q, USHRT_MAX); 420624f1df60SIlya Dryomov blk_queue_max_segment_size(q, UINT_MAX); 4207420efbdfSIlya Dryomov blk_queue_io_min(q, objset_bytes); 4208420efbdfSIlya Dryomov blk_queue_io_opt(q, objset_bytes); 4209029bcbd8SJosh Durgin 4210d9360540SIlya Dryomov if (rbd_dev->opts->trim) { 42118b904b5bSBart Van Assche blk_queue_flag_set(QUEUE_FLAG_DISCARD, q); 4212420efbdfSIlya Dryomov q->limits.discard_granularity = objset_bytes; 4213420efbdfSIlya Dryomov blk_queue_max_discard_sectors(q, objset_bytes >> SECTOR_SHIFT); 4214420efbdfSIlya Dryomov blk_queue_max_write_zeroes_sectors(q, objset_bytes >> SECTOR_SHIFT); 4215d9360540SIlya Dryomov } 421690e98c52SGuangliang Zhao 4217bae818eeSRonny Hegewald if (!ceph_test_opt(rbd_dev->rbd_client->client, NOCRC)) 4218dc3b17ccSJan Kara q->backing_dev_info->capabilities |= BDI_CAP_STABLE_WRITES; 4219bae818eeSRonny Hegewald 42205769ed0cSIlya Dryomov /* 42215769ed0cSIlya Dryomov * disk_release() expects a queue ref from add_disk() and will 42225769ed0cSIlya Dryomov * put it. Hold an extra ref until add_disk() is called. 42235769ed0cSIlya Dryomov */ 42245769ed0cSIlya Dryomov WARN_ON(!blk_get_queue(q)); 4225602adf40SYehuda Sadeh disk->queue = q; 4226602adf40SYehuda Sadeh q->queuedata = rbd_dev; 4227602adf40SYehuda Sadeh 4228602adf40SYehuda Sadeh rbd_dev->disk = disk; 4229602adf40SYehuda Sadeh 4230602adf40SYehuda Sadeh return 0; 42317ad18afaSChristoph Hellwig out_tag_set: 42327ad18afaSChristoph Hellwig blk_mq_free_tag_set(&rbd_dev->tag_set); 4233602adf40SYehuda Sadeh out_disk: 4234602adf40SYehuda Sadeh put_disk(disk); 42357ad18afaSChristoph Hellwig return err; 4236602adf40SYehuda Sadeh } 4237602adf40SYehuda Sadeh 4238dfc5606dSYehuda Sadeh /* 4239dfc5606dSYehuda Sadeh sysfs 4240dfc5606dSYehuda Sadeh */ 4241602adf40SYehuda Sadeh 4242593a9e7bSAlex Elder static struct rbd_device *dev_to_rbd_dev(struct device *dev) 4243593a9e7bSAlex Elder { 4244593a9e7bSAlex Elder return container_of(dev, struct rbd_device, dev); 4245593a9e7bSAlex Elder } 4246593a9e7bSAlex Elder 4247dfc5606dSYehuda Sadeh static ssize_t rbd_size_show(struct device *dev, 4248dfc5606dSYehuda Sadeh struct device_attribute *attr, char *buf) 4249602adf40SYehuda Sadeh { 4250593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 4251dfc5606dSYehuda Sadeh 4252fc71d833SAlex Elder return sprintf(buf, "%llu\n", 4253fc71d833SAlex Elder (unsigned long long)rbd_dev->mapping.size); 4254602adf40SYehuda Sadeh } 4255602adf40SYehuda Sadeh 425634b13184SAlex Elder /* 425734b13184SAlex Elder * Note this shows the features for whatever's mapped, which is not 425834b13184SAlex Elder * necessarily the base image. 425934b13184SAlex Elder */ 426034b13184SAlex Elder static ssize_t rbd_features_show(struct device *dev, 426134b13184SAlex Elder struct device_attribute *attr, char *buf) 426234b13184SAlex Elder { 426334b13184SAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 426434b13184SAlex Elder 426534b13184SAlex Elder return sprintf(buf, "0x%016llx\n", 426634b13184SAlex Elder (unsigned long long)rbd_dev->mapping.features); 426734b13184SAlex Elder } 426834b13184SAlex Elder 4269dfc5606dSYehuda Sadeh static ssize_t rbd_major_show(struct device *dev, 4270dfc5606dSYehuda Sadeh struct device_attribute *attr, char *buf) 4271602adf40SYehuda Sadeh { 4272593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 4273dfc5606dSYehuda Sadeh 4274fc71d833SAlex Elder if (rbd_dev->major) 4275dfc5606dSYehuda Sadeh return sprintf(buf, "%d\n", rbd_dev->major); 4276fc71d833SAlex Elder 4277fc71d833SAlex Elder return sprintf(buf, "(none)\n"); 4278dd82fff1SIlya Dryomov } 4279fc71d833SAlex Elder 4280dd82fff1SIlya Dryomov static ssize_t rbd_minor_show(struct device *dev, 4281dd82fff1SIlya Dryomov struct device_attribute *attr, char *buf) 4282dd82fff1SIlya Dryomov { 4283dd82fff1SIlya Dryomov struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 4284dd82fff1SIlya Dryomov 4285dd82fff1SIlya Dryomov return sprintf(buf, "%d\n", rbd_dev->minor); 4286dfc5606dSYehuda Sadeh } 4287dfc5606dSYehuda Sadeh 4288005a07bfSIlya Dryomov static ssize_t rbd_client_addr_show(struct device *dev, 4289005a07bfSIlya Dryomov struct device_attribute *attr, char *buf) 4290005a07bfSIlya Dryomov { 4291005a07bfSIlya Dryomov struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 4292005a07bfSIlya Dryomov struct ceph_entity_addr *client_addr = 4293005a07bfSIlya Dryomov ceph_client_addr(rbd_dev->rbd_client->client); 4294005a07bfSIlya Dryomov 4295005a07bfSIlya Dryomov return sprintf(buf, "%pISpc/%u\n", &client_addr->in_addr, 4296005a07bfSIlya Dryomov le32_to_cpu(client_addr->nonce)); 4297005a07bfSIlya Dryomov } 4298005a07bfSIlya Dryomov 4299dfc5606dSYehuda Sadeh static ssize_t rbd_client_id_show(struct device *dev, 4300dfc5606dSYehuda Sadeh struct device_attribute *attr, char *buf) 4301dfc5606dSYehuda Sadeh { 4302593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 4303dfc5606dSYehuda Sadeh 43041dbb4399SAlex Elder return sprintf(buf, "client%lld\n", 4305033268a5SIlya Dryomov ceph_client_gid(rbd_dev->rbd_client->client)); 4306dfc5606dSYehuda Sadeh } 4307dfc5606dSYehuda Sadeh 4308267fb90bSMike Christie static ssize_t rbd_cluster_fsid_show(struct device *dev, 4309267fb90bSMike Christie struct device_attribute *attr, char *buf) 4310267fb90bSMike Christie { 4311267fb90bSMike Christie struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 4312267fb90bSMike Christie 4313267fb90bSMike Christie return sprintf(buf, "%pU\n", &rbd_dev->rbd_client->client->fsid); 4314267fb90bSMike Christie } 4315267fb90bSMike Christie 43160d6d1e9cSMike Christie static ssize_t rbd_config_info_show(struct device *dev, 43170d6d1e9cSMike Christie struct device_attribute *attr, char *buf) 43180d6d1e9cSMike Christie { 43190d6d1e9cSMike Christie struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 43200d6d1e9cSMike Christie 43210d6d1e9cSMike Christie return sprintf(buf, "%s\n", rbd_dev->config_info); 4322dfc5606dSYehuda Sadeh } 4323dfc5606dSYehuda Sadeh 4324dfc5606dSYehuda Sadeh static ssize_t rbd_pool_show(struct device *dev, 4325dfc5606dSYehuda Sadeh struct device_attribute *attr, char *buf) 4326dfc5606dSYehuda Sadeh { 4327593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 4328dfc5606dSYehuda Sadeh 43290d7dbfceSAlex Elder return sprintf(buf, "%s\n", rbd_dev->spec->pool_name); 4330dfc5606dSYehuda Sadeh } 4331dfc5606dSYehuda Sadeh 43329bb2f334SAlex Elder static ssize_t rbd_pool_id_show(struct device *dev, 43339bb2f334SAlex Elder struct device_attribute *attr, char *buf) 43349bb2f334SAlex Elder { 43359bb2f334SAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 43369bb2f334SAlex Elder 43370d7dbfceSAlex Elder return sprintf(buf, "%llu\n", 43380d7dbfceSAlex Elder (unsigned long long) rbd_dev->spec->pool_id); 43399bb2f334SAlex Elder } 43409bb2f334SAlex Elder 4341b26c047bSIlya Dryomov static ssize_t rbd_pool_ns_show(struct device *dev, 4342b26c047bSIlya Dryomov struct device_attribute *attr, char *buf) 4343b26c047bSIlya Dryomov { 4344b26c047bSIlya Dryomov struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 4345b26c047bSIlya Dryomov 4346b26c047bSIlya Dryomov return sprintf(buf, "%s\n", rbd_dev->spec->pool_ns ?: ""); 4347b26c047bSIlya Dryomov } 4348b26c047bSIlya Dryomov 4349dfc5606dSYehuda Sadeh static ssize_t rbd_name_show(struct device *dev, 4350dfc5606dSYehuda Sadeh struct device_attribute *attr, char *buf) 4351dfc5606dSYehuda Sadeh { 4352593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 4353dfc5606dSYehuda Sadeh 4354a92ffdf8SAlex Elder if (rbd_dev->spec->image_name) 43550d7dbfceSAlex Elder return sprintf(buf, "%s\n", rbd_dev->spec->image_name); 4356a92ffdf8SAlex Elder 4357a92ffdf8SAlex Elder return sprintf(buf, "(unknown)\n"); 4358dfc5606dSYehuda Sadeh } 4359dfc5606dSYehuda Sadeh 4360589d30e0SAlex Elder static ssize_t rbd_image_id_show(struct device *dev, 4361589d30e0SAlex Elder struct device_attribute *attr, char *buf) 4362589d30e0SAlex Elder { 4363589d30e0SAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 4364589d30e0SAlex Elder 43650d7dbfceSAlex Elder return sprintf(buf, "%s\n", rbd_dev->spec->image_id); 4366589d30e0SAlex Elder } 4367589d30e0SAlex Elder 436834b13184SAlex Elder /* 436934b13184SAlex Elder * Shows the name of the currently-mapped snapshot (or 437034b13184SAlex Elder * RBD_SNAP_HEAD_NAME for the base image). 437134b13184SAlex Elder */ 4372dfc5606dSYehuda Sadeh static ssize_t rbd_snap_show(struct device *dev, 4373dfc5606dSYehuda Sadeh struct device_attribute *attr, 4374dfc5606dSYehuda Sadeh char *buf) 4375dfc5606dSYehuda Sadeh { 4376593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 4377dfc5606dSYehuda Sadeh 43780d7dbfceSAlex Elder return sprintf(buf, "%s\n", rbd_dev->spec->snap_name); 4379dfc5606dSYehuda Sadeh } 4380dfc5606dSYehuda Sadeh 438192a58671SMike Christie static ssize_t rbd_snap_id_show(struct device *dev, 438292a58671SMike Christie struct device_attribute *attr, char *buf) 438392a58671SMike Christie { 438492a58671SMike Christie struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 438592a58671SMike Christie 438692a58671SMike Christie return sprintf(buf, "%llu\n", rbd_dev->spec->snap_id); 438792a58671SMike Christie } 438892a58671SMike Christie 438986b00e0dSAlex Elder /* 4390ff96128fSIlya Dryomov * For a v2 image, shows the chain of parent images, separated by empty 4391ff96128fSIlya Dryomov * lines. For v1 images or if there is no parent, shows "(no parent 4392ff96128fSIlya Dryomov * image)". 439386b00e0dSAlex Elder */ 439486b00e0dSAlex Elder static ssize_t rbd_parent_show(struct device *dev, 439586b00e0dSAlex Elder struct device_attribute *attr, 439686b00e0dSAlex Elder char *buf) 439786b00e0dSAlex Elder { 439886b00e0dSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 4399ff96128fSIlya Dryomov ssize_t count = 0; 440086b00e0dSAlex Elder 4401ff96128fSIlya Dryomov if (!rbd_dev->parent) 440286b00e0dSAlex Elder return sprintf(buf, "(no parent image)\n"); 440386b00e0dSAlex Elder 4404ff96128fSIlya Dryomov for ( ; rbd_dev->parent; rbd_dev = rbd_dev->parent) { 4405ff96128fSIlya Dryomov struct rbd_spec *spec = rbd_dev->parent_spec; 440686b00e0dSAlex Elder 4407ff96128fSIlya Dryomov count += sprintf(&buf[count], "%s" 4408ff96128fSIlya Dryomov "pool_id %llu\npool_name %s\n" 4409e92c0eafSIlya Dryomov "pool_ns %s\n" 4410ff96128fSIlya Dryomov "image_id %s\nimage_name %s\n" 4411ff96128fSIlya Dryomov "snap_id %llu\nsnap_name %s\n" 4412ff96128fSIlya Dryomov "overlap %llu\n", 4413ff96128fSIlya Dryomov !count ? "" : "\n", /* first? */ 4414ff96128fSIlya Dryomov spec->pool_id, spec->pool_name, 4415e92c0eafSIlya Dryomov spec->pool_ns ?: "", 4416ff96128fSIlya Dryomov spec->image_id, spec->image_name ?: "(unknown)", 4417ff96128fSIlya Dryomov spec->snap_id, spec->snap_name, 4418ff96128fSIlya Dryomov rbd_dev->parent_overlap); 4419ff96128fSIlya Dryomov } 442086b00e0dSAlex Elder 442186b00e0dSAlex Elder return count; 442286b00e0dSAlex Elder } 442386b00e0dSAlex Elder 4424dfc5606dSYehuda Sadeh static ssize_t rbd_image_refresh(struct device *dev, 4425dfc5606dSYehuda Sadeh struct device_attribute *attr, 4426dfc5606dSYehuda Sadeh const char *buf, 4427dfc5606dSYehuda Sadeh size_t size) 4428dfc5606dSYehuda Sadeh { 4429593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 4430b813623aSAlex Elder int ret; 4431602adf40SYehuda Sadeh 4432cc4a38bdSAlex Elder ret = rbd_dev_refresh(rbd_dev); 4433e627db08SAlex Elder if (ret) 443452bb1f9bSIlya Dryomov return ret; 4435b813623aSAlex Elder 443652bb1f9bSIlya Dryomov return size; 4437dfc5606dSYehuda Sadeh } 4438602adf40SYehuda Sadeh 44395657a819SJoe Perches static DEVICE_ATTR(size, 0444, rbd_size_show, NULL); 44405657a819SJoe Perches static DEVICE_ATTR(features, 0444, rbd_features_show, NULL); 44415657a819SJoe Perches static DEVICE_ATTR(major, 0444, rbd_major_show, NULL); 44425657a819SJoe Perches static DEVICE_ATTR(minor, 0444, rbd_minor_show, NULL); 44435657a819SJoe Perches static DEVICE_ATTR(client_addr, 0444, rbd_client_addr_show, NULL); 44445657a819SJoe Perches static DEVICE_ATTR(client_id, 0444, rbd_client_id_show, NULL); 44455657a819SJoe Perches static DEVICE_ATTR(cluster_fsid, 0444, rbd_cluster_fsid_show, NULL); 44465657a819SJoe Perches static DEVICE_ATTR(config_info, 0400, rbd_config_info_show, NULL); 44475657a819SJoe Perches static DEVICE_ATTR(pool, 0444, rbd_pool_show, NULL); 44485657a819SJoe Perches static DEVICE_ATTR(pool_id, 0444, rbd_pool_id_show, NULL); 4449b26c047bSIlya Dryomov static DEVICE_ATTR(pool_ns, 0444, rbd_pool_ns_show, NULL); 44505657a819SJoe Perches static DEVICE_ATTR(name, 0444, rbd_name_show, NULL); 44515657a819SJoe Perches static DEVICE_ATTR(image_id, 0444, rbd_image_id_show, NULL); 44525657a819SJoe Perches static DEVICE_ATTR(refresh, 0200, NULL, rbd_image_refresh); 44535657a819SJoe Perches static DEVICE_ATTR(current_snap, 0444, rbd_snap_show, NULL); 44545657a819SJoe Perches static DEVICE_ATTR(snap_id, 0444, rbd_snap_id_show, NULL); 44555657a819SJoe Perches static DEVICE_ATTR(parent, 0444, rbd_parent_show, NULL); 4456dfc5606dSYehuda Sadeh 4457dfc5606dSYehuda Sadeh static struct attribute *rbd_attrs[] = { 4458dfc5606dSYehuda Sadeh &dev_attr_size.attr, 445934b13184SAlex Elder &dev_attr_features.attr, 4460dfc5606dSYehuda Sadeh &dev_attr_major.attr, 4461dd82fff1SIlya Dryomov &dev_attr_minor.attr, 4462005a07bfSIlya Dryomov &dev_attr_client_addr.attr, 4463dfc5606dSYehuda Sadeh &dev_attr_client_id.attr, 4464267fb90bSMike Christie &dev_attr_cluster_fsid.attr, 44650d6d1e9cSMike Christie &dev_attr_config_info.attr, 4466dfc5606dSYehuda Sadeh &dev_attr_pool.attr, 44679bb2f334SAlex Elder &dev_attr_pool_id.attr, 4468b26c047bSIlya Dryomov &dev_attr_pool_ns.attr, 4469dfc5606dSYehuda Sadeh &dev_attr_name.attr, 4470589d30e0SAlex Elder &dev_attr_image_id.attr, 4471dfc5606dSYehuda Sadeh &dev_attr_current_snap.attr, 447292a58671SMike Christie &dev_attr_snap_id.attr, 447386b00e0dSAlex Elder &dev_attr_parent.attr, 4474dfc5606dSYehuda Sadeh &dev_attr_refresh.attr, 4475dfc5606dSYehuda Sadeh NULL 4476dfc5606dSYehuda Sadeh }; 4477dfc5606dSYehuda Sadeh 4478dfc5606dSYehuda Sadeh static struct attribute_group rbd_attr_group = { 4479dfc5606dSYehuda Sadeh .attrs = rbd_attrs, 4480dfc5606dSYehuda Sadeh }; 4481dfc5606dSYehuda Sadeh 4482dfc5606dSYehuda Sadeh static const struct attribute_group *rbd_attr_groups[] = { 4483dfc5606dSYehuda Sadeh &rbd_attr_group, 4484dfc5606dSYehuda Sadeh NULL 4485dfc5606dSYehuda Sadeh }; 4486dfc5606dSYehuda Sadeh 44876cac4695SIlya Dryomov static void rbd_dev_release(struct device *dev); 4488dfc5606dSYehuda Sadeh 4489b9942bc9SBhumika Goyal static const struct device_type rbd_device_type = { 4490dfc5606dSYehuda Sadeh .name = "rbd", 4491dfc5606dSYehuda Sadeh .groups = rbd_attr_groups, 44926cac4695SIlya Dryomov .release = rbd_dev_release, 4493dfc5606dSYehuda Sadeh }; 4494dfc5606dSYehuda Sadeh 44958b8fb99cSAlex Elder static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec) 44968b8fb99cSAlex Elder { 44978b8fb99cSAlex Elder kref_get(&spec->kref); 44988b8fb99cSAlex Elder 44998b8fb99cSAlex Elder return spec; 45008b8fb99cSAlex Elder } 45018b8fb99cSAlex Elder 45028b8fb99cSAlex Elder static void rbd_spec_free(struct kref *kref); 45038b8fb99cSAlex Elder static void rbd_spec_put(struct rbd_spec *spec) 45048b8fb99cSAlex Elder { 45058b8fb99cSAlex Elder if (spec) 45068b8fb99cSAlex Elder kref_put(&spec->kref, rbd_spec_free); 45078b8fb99cSAlex Elder } 45088b8fb99cSAlex Elder 45098b8fb99cSAlex Elder static struct rbd_spec *rbd_spec_alloc(void) 45108b8fb99cSAlex Elder { 45118b8fb99cSAlex Elder struct rbd_spec *spec; 45128b8fb99cSAlex Elder 45138b8fb99cSAlex Elder spec = kzalloc(sizeof (*spec), GFP_KERNEL); 45148b8fb99cSAlex Elder if (!spec) 45158b8fb99cSAlex Elder return NULL; 451604077599SIlya Dryomov 451704077599SIlya Dryomov spec->pool_id = CEPH_NOPOOL; 451804077599SIlya Dryomov spec->snap_id = CEPH_NOSNAP; 45198b8fb99cSAlex Elder kref_init(&spec->kref); 45208b8fb99cSAlex Elder 45218b8fb99cSAlex Elder return spec; 45228b8fb99cSAlex Elder } 45238b8fb99cSAlex Elder 45248b8fb99cSAlex Elder static void rbd_spec_free(struct kref *kref) 45258b8fb99cSAlex Elder { 45268b8fb99cSAlex Elder struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref); 45278b8fb99cSAlex Elder 45288b8fb99cSAlex Elder kfree(spec->pool_name); 4529b26c047bSIlya Dryomov kfree(spec->pool_ns); 45308b8fb99cSAlex Elder kfree(spec->image_id); 45318b8fb99cSAlex Elder kfree(spec->image_name); 45328b8fb99cSAlex Elder kfree(spec->snap_name); 45338b8fb99cSAlex Elder kfree(spec); 45348b8fb99cSAlex Elder } 45358b8fb99cSAlex Elder 45361643dfa4SIlya Dryomov static void rbd_dev_free(struct rbd_device *rbd_dev) 4537dd5ac32dSIlya Dryomov { 453899d16943SIlya Dryomov WARN_ON(rbd_dev->watch_state != RBD_WATCH_STATE_UNREGISTERED); 4539ed95b21aSIlya Dryomov WARN_ON(rbd_dev->lock_state != RBD_LOCK_STATE_UNLOCKED); 4540dd5ac32dSIlya Dryomov 4541c41d13a3SIlya Dryomov ceph_oid_destroy(&rbd_dev->header_oid); 45426b6dddbeSIlya Dryomov ceph_oloc_destroy(&rbd_dev->header_oloc); 45430d6d1e9cSMike Christie kfree(rbd_dev->config_info); 4544c41d13a3SIlya Dryomov 4545dd5ac32dSIlya Dryomov rbd_put_client(rbd_dev->rbd_client); 4546dd5ac32dSIlya Dryomov rbd_spec_put(rbd_dev->spec); 4547dd5ac32dSIlya Dryomov kfree(rbd_dev->opts); 4548dd5ac32dSIlya Dryomov kfree(rbd_dev); 45491643dfa4SIlya Dryomov } 45501643dfa4SIlya Dryomov 45511643dfa4SIlya Dryomov static void rbd_dev_release(struct device *dev) 45521643dfa4SIlya Dryomov { 45531643dfa4SIlya Dryomov struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 45541643dfa4SIlya Dryomov bool need_put = !!rbd_dev->opts; 45551643dfa4SIlya Dryomov 45561643dfa4SIlya Dryomov if (need_put) { 45571643dfa4SIlya Dryomov destroy_workqueue(rbd_dev->task_wq); 45581643dfa4SIlya Dryomov ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id); 45591643dfa4SIlya Dryomov } 45601643dfa4SIlya Dryomov 45611643dfa4SIlya Dryomov rbd_dev_free(rbd_dev); 4562dd5ac32dSIlya Dryomov 4563dd5ac32dSIlya Dryomov /* 4564dd5ac32dSIlya Dryomov * This is racy, but way better than putting module outside of 4565dd5ac32dSIlya Dryomov * the release callback. The race window is pretty small, so 4566dd5ac32dSIlya Dryomov * doing something similar to dm (dm-builtin.c) is overkill. 4567dd5ac32dSIlya Dryomov */ 4568dd5ac32dSIlya Dryomov if (need_put) 4569dd5ac32dSIlya Dryomov module_put(THIS_MODULE); 4570dd5ac32dSIlya Dryomov } 4571dd5ac32dSIlya Dryomov 45721643dfa4SIlya Dryomov static struct rbd_device *__rbd_dev_create(struct rbd_client *rbdc, 45731643dfa4SIlya Dryomov struct rbd_spec *spec) 4574c53d5893SAlex Elder { 4575c53d5893SAlex Elder struct rbd_device *rbd_dev; 4576c53d5893SAlex Elder 4577c53d5893SAlex Elder rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL); 4578c53d5893SAlex Elder if (!rbd_dev) 4579c53d5893SAlex Elder return NULL; 4580c53d5893SAlex Elder 4581c53d5893SAlex Elder spin_lock_init(&rbd_dev->lock); 4582c53d5893SAlex Elder INIT_LIST_HEAD(&rbd_dev->node); 4583c53d5893SAlex Elder init_rwsem(&rbd_dev->header_rwsem); 4584c53d5893SAlex Elder 45857e97332eSIlya Dryomov rbd_dev->header.data_pool_id = CEPH_NOPOOL; 4586c41d13a3SIlya Dryomov ceph_oid_init(&rbd_dev->header_oid); 4587431a02cdSIlya Dryomov rbd_dev->header_oloc.pool = spec->pool_id; 4588b26c047bSIlya Dryomov if (spec->pool_ns) { 4589b26c047bSIlya Dryomov WARN_ON(!*spec->pool_ns); 4590b26c047bSIlya Dryomov rbd_dev->header_oloc.pool_ns = 4591b26c047bSIlya Dryomov ceph_find_or_create_string(spec->pool_ns, 4592b26c047bSIlya Dryomov strlen(spec->pool_ns)); 4593b26c047bSIlya Dryomov } 4594c41d13a3SIlya Dryomov 459599d16943SIlya Dryomov mutex_init(&rbd_dev->watch_mutex); 459699d16943SIlya Dryomov rbd_dev->watch_state = RBD_WATCH_STATE_UNREGISTERED; 459799d16943SIlya Dryomov INIT_DELAYED_WORK(&rbd_dev->watch_dwork, rbd_reregister_watch); 459899d16943SIlya Dryomov 4599ed95b21aSIlya Dryomov init_rwsem(&rbd_dev->lock_rwsem); 4600ed95b21aSIlya Dryomov rbd_dev->lock_state = RBD_LOCK_STATE_UNLOCKED; 4601ed95b21aSIlya Dryomov INIT_WORK(&rbd_dev->acquired_lock_work, rbd_notify_acquired_lock); 4602ed95b21aSIlya Dryomov INIT_WORK(&rbd_dev->released_lock_work, rbd_notify_released_lock); 4603ed95b21aSIlya Dryomov INIT_DELAYED_WORK(&rbd_dev->lock_dwork, rbd_acquire_lock); 4604ed95b21aSIlya Dryomov INIT_WORK(&rbd_dev->unlock_work, rbd_release_lock_work); 4605ed95b21aSIlya Dryomov init_waitqueue_head(&rbd_dev->lock_waitq); 4606ed95b21aSIlya Dryomov 4607dd5ac32dSIlya Dryomov rbd_dev->dev.bus = &rbd_bus_type; 4608dd5ac32dSIlya Dryomov rbd_dev->dev.type = &rbd_device_type; 4609dd5ac32dSIlya Dryomov rbd_dev->dev.parent = &rbd_root_dev; 4610dd5ac32dSIlya Dryomov device_initialize(&rbd_dev->dev); 4611dd5ac32dSIlya Dryomov 4612c53d5893SAlex Elder rbd_dev->rbd_client = rbdc; 4613d147543dSIlya Dryomov rbd_dev->spec = spec; 46140903e875SAlex Elder 46151643dfa4SIlya Dryomov return rbd_dev; 46161643dfa4SIlya Dryomov } 46171643dfa4SIlya Dryomov 4618dd5ac32dSIlya Dryomov /* 46191643dfa4SIlya Dryomov * Create a mapping rbd_dev. 4620dd5ac32dSIlya Dryomov */ 46211643dfa4SIlya Dryomov static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc, 46221643dfa4SIlya Dryomov struct rbd_spec *spec, 46231643dfa4SIlya Dryomov struct rbd_options *opts) 46241643dfa4SIlya Dryomov { 46251643dfa4SIlya Dryomov struct rbd_device *rbd_dev; 46261643dfa4SIlya Dryomov 46271643dfa4SIlya Dryomov rbd_dev = __rbd_dev_create(rbdc, spec); 46281643dfa4SIlya Dryomov if (!rbd_dev) 46291643dfa4SIlya Dryomov return NULL; 46301643dfa4SIlya Dryomov 46311643dfa4SIlya Dryomov rbd_dev->opts = opts; 46321643dfa4SIlya Dryomov 46331643dfa4SIlya Dryomov /* get an id and fill in device name */ 46341643dfa4SIlya Dryomov rbd_dev->dev_id = ida_simple_get(&rbd_dev_id_ida, 0, 46351643dfa4SIlya Dryomov minor_to_rbd_dev_id(1 << MINORBITS), 46361643dfa4SIlya Dryomov GFP_KERNEL); 46371643dfa4SIlya Dryomov if (rbd_dev->dev_id < 0) 46381643dfa4SIlya Dryomov goto fail_rbd_dev; 46391643dfa4SIlya Dryomov 46401643dfa4SIlya Dryomov sprintf(rbd_dev->name, RBD_DRV_NAME "%d", rbd_dev->dev_id); 46411643dfa4SIlya Dryomov rbd_dev->task_wq = alloc_ordered_workqueue("%s-tasks", WQ_MEM_RECLAIM, 46421643dfa4SIlya Dryomov rbd_dev->name); 46431643dfa4SIlya Dryomov if (!rbd_dev->task_wq) 46441643dfa4SIlya Dryomov goto fail_dev_id; 46451643dfa4SIlya Dryomov 46461643dfa4SIlya Dryomov /* we have a ref from do_rbd_add() */ 4647dd5ac32dSIlya Dryomov __module_get(THIS_MODULE); 4648dd5ac32dSIlya Dryomov 46491643dfa4SIlya Dryomov dout("%s rbd_dev %p dev_id %d\n", __func__, rbd_dev, rbd_dev->dev_id); 4650c53d5893SAlex Elder return rbd_dev; 46511643dfa4SIlya Dryomov 46521643dfa4SIlya Dryomov fail_dev_id: 46531643dfa4SIlya Dryomov ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id); 46541643dfa4SIlya Dryomov fail_rbd_dev: 46551643dfa4SIlya Dryomov rbd_dev_free(rbd_dev); 46561643dfa4SIlya Dryomov return NULL; 4657c53d5893SAlex Elder } 4658c53d5893SAlex Elder 4659c53d5893SAlex Elder static void rbd_dev_destroy(struct rbd_device *rbd_dev) 4660c53d5893SAlex Elder { 4661dd5ac32dSIlya Dryomov if (rbd_dev) 4662dd5ac32dSIlya Dryomov put_device(&rbd_dev->dev); 4663c53d5893SAlex Elder } 4664c53d5893SAlex Elder 4665dfc5606dSYehuda Sadeh /* 46669d475de5SAlex Elder * Get the size and object order for an image snapshot, or if 46679d475de5SAlex Elder * snap_id is CEPH_NOSNAP, gets this information for the base 46689d475de5SAlex Elder * image. 46699d475de5SAlex Elder */ 46709d475de5SAlex Elder static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id, 46719d475de5SAlex Elder u8 *order, u64 *snap_size) 46729d475de5SAlex Elder { 46739d475de5SAlex Elder __le64 snapid = cpu_to_le64(snap_id); 46749d475de5SAlex Elder int ret; 46759d475de5SAlex Elder struct { 46769d475de5SAlex Elder u8 order; 46779d475de5SAlex Elder __le64 size; 46789d475de5SAlex Elder } __attribute__ ((packed)) size_buf = { 0 }; 46799d475de5SAlex Elder 4680ecd4a68aSIlya Dryomov ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid, 4681ecd4a68aSIlya Dryomov &rbd_dev->header_oloc, "get_size", 46824157976bSAlex Elder &snapid, sizeof(snapid), 4683e2a58ee5SAlex Elder &size_buf, sizeof(size_buf)); 468436be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 46859d475de5SAlex Elder if (ret < 0) 46869d475de5SAlex Elder return ret; 468757385b51SAlex Elder if (ret < sizeof (size_buf)) 468857385b51SAlex Elder return -ERANGE; 46899d475de5SAlex Elder 4690c3545579SJosh Durgin if (order) { 46919d475de5SAlex Elder *order = size_buf.order; 4692c3545579SJosh Durgin dout(" order %u", (unsigned int)*order); 4693c3545579SJosh Durgin } 46949d475de5SAlex Elder *snap_size = le64_to_cpu(size_buf.size); 46959d475de5SAlex Elder 4696c3545579SJosh Durgin dout(" snap_id 0x%016llx snap_size = %llu\n", 4697c3545579SJosh Durgin (unsigned long long)snap_id, 46989d475de5SAlex Elder (unsigned long long)*snap_size); 46999d475de5SAlex Elder 47009d475de5SAlex Elder return 0; 47019d475de5SAlex Elder } 47029d475de5SAlex Elder 47039d475de5SAlex Elder static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev) 47049d475de5SAlex Elder { 47059d475de5SAlex Elder return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP, 47069d475de5SAlex Elder &rbd_dev->header.obj_order, 47079d475de5SAlex Elder &rbd_dev->header.image_size); 47089d475de5SAlex Elder } 47099d475de5SAlex Elder 47101e130199SAlex Elder static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev) 47111e130199SAlex Elder { 47121e130199SAlex Elder void *reply_buf; 47131e130199SAlex Elder int ret; 47141e130199SAlex Elder void *p; 47151e130199SAlex Elder 47161e130199SAlex Elder reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL); 47171e130199SAlex Elder if (!reply_buf) 47181e130199SAlex Elder return -ENOMEM; 47191e130199SAlex Elder 4720ecd4a68aSIlya Dryomov ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid, 4721ecd4a68aSIlya Dryomov &rbd_dev->header_oloc, "get_object_prefix", 4722ecd4a68aSIlya Dryomov NULL, 0, reply_buf, RBD_OBJ_PREFIX_LEN_MAX); 472336be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 47241e130199SAlex Elder if (ret < 0) 47251e130199SAlex Elder goto out; 47261e130199SAlex Elder 47271e130199SAlex Elder p = reply_buf; 47281e130199SAlex Elder rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p, 472957385b51SAlex Elder p + ret, NULL, GFP_NOIO); 473057385b51SAlex Elder ret = 0; 47311e130199SAlex Elder 47321e130199SAlex Elder if (IS_ERR(rbd_dev->header.object_prefix)) { 47331e130199SAlex Elder ret = PTR_ERR(rbd_dev->header.object_prefix); 47341e130199SAlex Elder rbd_dev->header.object_prefix = NULL; 47351e130199SAlex Elder } else { 47361e130199SAlex Elder dout(" object_prefix = %s\n", rbd_dev->header.object_prefix); 47371e130199SAlex Elder } 47381e130199SAlex Elder out: 47391e130199SAlex Elder kfree(reply_buf); 47401e130199SAlex Elder 47411e130199SAlex Elder return ret; 47421e130199SAlex Elder } 47431e130199SAlex Elder 4744b1b5402aSAlex Elder static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id, 4745b1b5402aSAlex Elder u64 *snap_features) 4746b1b5402aSAlex Elder { 4747b1b5402aSAlex Elder __le64 snapid = cpu_to_le64(snap_id); 4748b1b5402aSAlex Elder struct { 4749b1b5402aSAlex Elder __le64 features; 4750b1b5402aSAlex Elder __le64 incompat; 47514157976bSAlex Elder } __attribute__ ((packed)) features_buf = { 0 }; 4752d3767f0fSIlya Dryomov u64 unsup; 4753b1b5402aSAlex Elder int ret; 4754b1b5402aSAlex Elder 4755ecd4a68aSIlya Dryomov ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid, 4756ecd4a68aSIlya Dryomov &rbd_dev->header_oloc, "get_features", 47574157976bSAlex Elder &snapid, sizeof(snapid), 4758e2a58ee5SAlex Elder &features_buf, sizeof(features_buf)); 475936be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 4760b1b5402aSAlex Elder if (ret < 0) 4761b1b5402aSAlex Elder return ret; 476257385b51SAlex Elder if (ret < sizeof (features_buf)) 476357385b51SAlex Elder return -ERANGE; 4764d889140cSAlex Elder 4765d3767f0fSIlya Dryomov unsup = le64_to_cpu(features_buf.incompat) & ~RBD_FEATURES_SUPPORTED; 4766d3767f0fSIlya Dryomov if (unsup) { 4767d3767f0fSIlya Dryomov rbd_warn(rbd_dev, "image uses unsupported features: 0x%llx", 4768d3767f0fSIlya Dryomov unsup); 4769b8f5c6edSAlex Elder return -ENXIO; 4770d3767f0fSIlya Dryomov } 4771d889140cSAlex Elder 4772b1b5402aSAlex Elder *snap_features = le64_to_cpu(features_buf.features); 4773b1b5402aSAlex Elder 4774b1b5402aSAlex Elder dout(" snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n", 4775b1b5402aSAlex Elder (unsigned long long)snap_id, 4776b1b5402aSAlex Elder (unsigned long long)*snap_features, 4777b1b5402aSAlex Elder (unsigned long long)le64_to_cpu(features_buf.incompat)); 4778b1b5402aSAlex Elder 4779b1b5402aSAlex Elder return 0; 4780b1b5402aSAlex Elder } 4781b1b5402aSAlex Elder 4782b1b5402aSAlex Elder static int rbd_dev_v2_features(struct rbd_device *rbd_dev) 4783b1b5402aSAlex Elder { 4784b1b5402aSAlex Elder return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP, 4785b1b5402aSAlex Elder &rbd_dev->header.features); 4786b1b5402aSAlex Elder } 4787b1b5402aSAlex Elder 4788eb3b2d6bSIlya Dryomov struct parent_image_info { 4789eb3b2d6bSIlya Dryomov u64 pool_id; 4790e92c0eafSIlya Dryomov const char *pool_ns; 4791eb3b2d6bSIlya Dryomov const char *image_id; 4792eb3b2d6bSIlya Dryomov u64 snap_id; 4793eb3b2d6bSIlya Dryomov 4794e92c0eafSIlya Dryomov bool has_overlap; 4795eb3b2d6bSIlya Dryomov u64 overlap; 4796eb3b2d6bSIlya Dryomov }; 4797eb3b2d6bSIlya Dryomov 4798eb3b2d6bSIlya Dryomov /* 4799eb3b2d6bSIlya Dryomov * The caller is responsible for @pii. 4800eb3b2d6bSIlya Dryomov */ 4801e92c0eafSIlya Dryomov static int decode_parent_image_spec(void **p, void *end, 4802e92c0eafSIlya Dryomov struct parent_image_info *pii) 4803e92c0eafSIlya Dryomov { 4804e92c0eafSIlya Dryomov u8 struct_v; 4805e92c0eafSIlya Dryomov u32 struct_len; 4806e92c0eafSIlya Dryomov int ret; 4807e92c0eafSIlya Dryomov 4808e92c0eafSIlya Dryomov ret = ceph_start_decoding(p, end, 1, "ParentImageSpec", 4809e92c0eafSIlya Dryomov &struct_v, &struct_len); 4810e92c0eafSIlya Dryomov if (ret) 4811e92c0eafSIlya Dryomov return ret; 4812e92c0eafSIlya Dryomov 4813e92c0eafSIlya Dryomov ceph_decode_64_safe(p, end, pii->pool_id, e_inval); 4814e92c0eafSIlya Dryomov pii->pool_ns = ceph_extract_encoded_string(p, end, NULL, GFP_KERNEL); 4815e92c0eafSIlya Dryomov if (IS_ERR(pii->pool_ns)) { 4816e92c0eafSIlya Dryomov ret = PTR_ERR(pii->pool_ns); 4817e92c0eafSIlya Dryomov pii->pool_ns = NULL; 4818e92c0eafSIlya Dryomov return ret; 4819e92c0eafSIlya Dryomov } 4820e92c0eafSIlya Dryomov pii->image_id = ceph_extract_encoded_string(p, end, NULL, GFP_KERNEL); 4821e92c0eafSIlya Dryomov if (IS_ERR(pii->image_id)) { 4822e92c0eafSIlya Dryomov ret = PTR_ERR(pii->image_id); 4823e92c0eafSIlya Dryomov pii->image_id = NULL; 4824e92c0eafSIlya Dryomov return ret; 4825e92c0eafSIlya Dryomov } 4826e92c0eafSIlya Dryomov ceph_decode_64_safe(p, end, pii->snap_id, e_inval); 4827e92c0eafSIlya Dryomov return 0; 4828e92c0eafSIlya Dryomov 4829e92c0eafSIlya Dryomov e_inval: 4830e92c0eafSIlya Dryomov return -EINVAL; 4831e92c0eafSIlya Dryomov } 4832e92c0eafSIlya Dryomov 4833e92c0eafSIlya Dryomov static int __get_parent_info(struct rbd_device *rbd_dev, 4834e92c0eafSIlya Dryomov struct page *req_page, 4835e92c0eafSIlya Dryomov struct page *reply_page, 4836e92c0eafSIlya Dryomov struct parent_image_info *pii) 4837e92c0eafSIlya Dryomov { 4838e92c0eafSIlya Dryomov struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 4839e92c0eafSIlya Dryomov size_t reply_len = PAGE_SIZE; 4840e92c0eafSIlya Dryomov void *p, *end; 4841e92c0eafSIlya Dryomov int ret; 4842e92c0eafSIlya Dryomov 4843e92c0eafSIlya Dryomov ret = ceph_osdc_call(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc, 4844e92c0eafSIlya Dryomov "rbd", "parent_get", CEPH_OSD_FLAG_READ, 4845e92c0eafSIlya Dryomov req_page, sizeof(u64), reply_page, &reply_len); 4846e92c0eafSIlya Dryomov if (ret) 4847e92c0eafSIlya Dryomov return ret == -EOPNOTSUPP ? 1 : ret; 4848e92c0eafSIlya Dryomov 4849e92c0eafSIlya Dryomov p = page_address(reply_page); 4850e92c0eafSIlya Dryomov end = p + reply_len; 4851e92c0eafSIlya Dryomov ret = decode_parent_image_spec(&p, end, pii); 4852e92c0eafSIlya Dryomov if (ret) 4853e92c0eafSIlya Dryomov return ret; 4854e92c0eafSIlya Dryomov 4855e92c0eafSIlya Dryomov ret = ceph_osdc_call(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc, 4856e92c0eafSIlya Dryomov "rbd", "parent_overlap_get", CEPH_OSD_FLAG_READ, 4857e92c0eafSIlya Dryomov req_page, sizeof(u64), reply_page, &reply_len); 4858e92c0eafSIlya Dryomov if (ret) 4859e92c0eafSIlya Dryomov return ret; 4860e92c0eafSIlya Dryomov 4861e92c0eafSIlya Dryomov p = page_address(reply_page); 4862e92c0eafSIlya Dryomov end = p + reply_len; 4863e92c0eafSIlya Dryomov ceph_decode_8_safe(&p, end, pii->has_overlap, e_inval); 4864e92c0eafSIlya Dryomov if (pii->has_overlap) 4865e92c0eafSIlya Dryomov ceph_decode_64_safe(&p, end, pii->overlap, e_inval); 4866e92c0eafSIlya Dryomov 4867e92c0eafSIlya Dryomov return 0; 4868e92c0eafSIlya Dryomov 4869e92c0eafSIlya Dryomov e_inval: 4870e92c0eafSIlya Dryomov return -EINVAL; 4871e92c0eafSIlya Dryomov } 4872e92c0eafSIlya Dryomov 4873e92c0eafSIlya Dryomov /* 4874e92c0eafSIlya Dryomov * The caller is responsible for @pii. 4875e92c0eafSIlya Dryomov */ 4876eb3b2d6bSIlya Dryomov static int __get_parent_info_legacy(struct rbd_device *rbd_dev, 4877eb3b2d6bSIlya Dryomov struct page *req_page, 4878eb3b2d6bSIlya Dryomov struct page *reply_page, 4879eb3b2d6bSIlya Dryomov struct parent_image_info *pii) 4880eb3b2d6bSIlya Dryomov { 4881eb3b2d6bSIlya Dryomov struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 4882eb3b2d6bSIlya Dryomov size_t reply_len = PAGE_SIZE; 4883eb3b2d6bSIlya Dryomov void *p, *end; 4884eb3b2d6bSIlya Dryomov int ret; 4885eb3b2d6bSIlya Dryomov 4886eb3b2d6bSIlya Dryomov ret = ceph_osdc_call(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc, 4887eb3b2d6bSIlya Dryomov "rbd", "get_parent", CEPH_OSD_FLAG_READ, 4888eb3b2d6bSIlya Dryomov req_page, sizeof(u64), reply_page, &reply_len); 4889eb3b2d6bSIlya Dryomov if (ret) 4890eb3b2d6bSIlya Dryomov return ret; 4891eb3b2d6bSIlya Dryomov 4892eb3b2d6bSIlya Dryomov p = page_address(reply_page); 4893eb3b2d6bSIlya Dryomov end = p + reply_len; 4894eb3b2d6bSIlya Dryomov ceph_decode_64_safe(&p, end, pii->pool_id, e_inval); 4895eb3b2d6bSIlya Dryomov pii->image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL); 4896eb3b2d6bSIlya Dryomov if (IS_ERR(pii->image_id)) { 4897eb3b2d6bSIlya Dryomov ret = PTR_ERR(pii->image_id); 4898eb3b2d6bSIlya Dryomov pii->image_id = NULL; 4899eb3b2d6bSIlya Dryomov return ret; 4900eb3b2d6bSIlya Dryomov } 4901eb3b2d6bSIlya Dryomov ceph_decode_64_safe(&p, end, pii->snap_id, e_inval); 4902e92c0eafSIlya Dryomov pii->has_overlap = true; 4903eb3b2d6bSIlya Dryomov ceph_decode_64_safe(&p, end, pii->overlap, e_inval); 4904eb3b2d6bSIlya Dryomov 4905eb3b2d6bSIlya Dryomov return 0; 4906eb3b2d6bSIlya Dryomov 4907eb3b2d6bSIlya Dryomov e_inval: 4908eb3b2d6bSIlya Dryomov return -EINVAL; 4909eb3b2d6bSIlya Dryomov } 4910eb3b2d6bSIlya Dryomov 4911eb3b2d6bSIlya Dryomov static int get_parent_info(struct rbd_device *rbd_dev, 4912eb3b2d6bSIlya Dryomov struct parent_image_info *pii) 4913eb3b2d6bSIlya Dryomov { 4914eb3b2d6bSIlya Dryomov struct page *req_page, *reply_page; 4915eb3b2d6bSIlya Dryomov void *p; 4916eb3b2d6bSIlya Dryomov int ret; 4917eb3b2d6bSIlya Dryomov 4918eb3b2d6bSIlya Dryomov req_page = alloc_page(GFP_KERNEL); 4919eb3b2d6bSIlya Dryomov if (!req_page) 4920eb3b2d6bSIlya Dryomov return -ENOMEM; 4921eb3b2d6bSIlya Dryomov 4922eb3b2d6bSIlya Dryomov reply_page = alloc_page(GFP_KERNEL); 4923eb3b2d6bSIlya Dryomov if (!reply_page) { 4924eb3b2d6bSIlya Dryomov __free_page(req_page); 4925eb3b2d6bSIlya Dryomov return -ENOMEM; 4926eb3b2d6bSIlya Dryomov } 4927eb3b2d6bSIlya Dryomov 4928eb3b2d6bSIlya Dryomov p = page_address(req_page); 4929eb3b2d6bSIlya Dryomov ceph_encode_64(&p, rbd_dev->spec->snap_id); 4930e92c0eafSIlya Dryomov ret = __get_parent_info(rbd_dev, req_page, reply_page, pii); 4931e92c0eafSIlya Dryomov if (ret > 0) 4932e92c0eafSIlya Dryomov ret = __get_parent_info_legacy(rbd_dev, req_page, reply_page, 4933e92c0eafSIlya Dryomov pii); 4934eb3b2d6bSIlya Dryomov 4935eb3b2d6bSIlya Dryomov __free_page(req_page); 4936eb3b2d6bSIlya Dryomov __free_page(reply_page); 4937eb3b2d6bSIlya Dryomov return ret; 4938eb3b2d6bSIlya Dryomov } 4939eb3b2d6bSIlya Dryomov 494086b00e0dSAlex Elder static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev) 494186b00e0dSAlex Elder { 494286b00e0dSAlex Elder struct rbd_spec *parent_spec; 4943eb3b2d6bSIlya Dryomov struct parent_image_info pii = { 0 }; 494486b00e0dSAlex Elder int ret; 494586b00e0dSAlex Elder 494686b00e0dSAlex Elder parent_spec = rbd_spec_alloc(); 494786b00e0dSAlex Elder if (!parent_spec) 494886b00e0dSAlex Elder return -ENOMEM; 494986b00e0dSAlex Elder 4950eb3b2d6bSIlya Dryomov ret = get_parent_info(rbd_dev, &pii); 4951eb3b2d6bSIlya Dryomov if (ret) 495286b00e0dSAlex Elder goto out_err; 495386b00e0dSAlex Elder 4954e92c0eafSIlya Dryomov dout("%s pool_id %llu pool_ns %s image_id %s snap_id %llu has_overlap %d overlap %llu\n", 4955e92c0eafSIlya Dryomov __func__, pii.pool_id, pii.pool_ns, pii.image_id, pii.snap_id, 4956e92c0eafSIlya Dryomov pii.has_overlap, pii.overlap); 4957eb3b2d6bSIlya Dryomov 4958e92c0eafSIlya Dryomov if (pii.pool_id == CEPH_NOPOOL || !pii.has_overlap) { 4959392a9dadSAlex Elder /* 4960392a9dadSAlex Elder * Either the parent never existed, or we have 4961392a9dadSAlex Elder * record of it but the image got flattened so it no 4962392a9dadSAlex Elder * longer has a parent. When the parent of a 4963392a9dadSAlex Elder * layered image disappears we immediately set the 4964392a9dadSAlex Elder * overlap to 0. The effect of this is that all new 4965392a9dadSAlex Elder * requests will be treated as if the image had no 4966392a9dadSAlex Elder * parent. 4967e92c0eafSIlya Dryomov * 4968e92c0eafSIlya Dryomov * If !pii.has_overlap, the parent image spec is not 4969e92c0eafSIlya Dryomov * applicable. It's there to avoid duplication in each 4970e92c0eafSIlya Dryomov * snapshot record. 4971392a9dadSAlex Elder */ 4972392a9dadSAlex Elder if (rbd_dev->parent_overlap) { 4973392a9dadSAlex Elder rbd_dev->parent_overlap = 0; 4974392a9dadSAlex Elder rbd_dev_parent_put(rbd_dev); 4975392a9dadSAlex Elder pr_info("%s: clone image has been flattened\n", 4976392a9dadSAlex Elder rbd_dev->disk->disk_name); 4977392a9dadSAlex Elder } 4978392a9dadSAlex Elder 497986b00e0dSAlex Elder goto out; /* No parent? No problem. */ 4980392a9dadSAlex Elder } 498186b00e0dSAlex Elder 49820903e875SAlex Elder /* The ceph file layout needs to fit pool id in 32 bits */ 49830903e875SAlex Elder 49840903e875SAlex Elder ret = -EIO; 4985eb3b2d6bSIlya Dryomov if (pii.pool_id > (u64)U32_MAX) { 49869584d508SIlya Dryomov rbd_warn(NULL, "parent pool id too large (%llu > %u)", 4987eb3b2d6bSIlya Dryomov (unsigned long long)pii.pool_id, U32_MAX); 498857385b51SAlex Elder goto out_err; 4989c0cd10dbSAlex Elder } 49900903e875SAlex Elder 49913b5cf2a2SAlex Elder /* 49923b5cf2a2SAlex Elder * The parent won't change (except when the clone is 49933b5cf2a2SAlex Elder * flattened, already handled that). So we only need to 49943b5cf2a2SAlex Elder * record the parent spec we have not already done so. 49953b5cf2a2SAlex Elder */ 49963b5cf2a2SAlex Elder if (!rbd_dev->parent_spec) { 4997eb3b2d6bSIlya Dryomov parent_spec->pool_id = pii.pool_id; 4998e92c0eafSIlya Dryomov if (pii.pool_ns && *pii.pool_ns) { 4999e92c0eafSIlya Dryomov parent_spec->pool_ns = pii.pool_ns; 5000e92c0eafSIlya Dryomov pii.pool_ns = NULL; 5001e92c0eafSIlya Dryomov } 5002eb3b2d6bSIlya Dryomov parent_spec->image_id = pii.image_id; 5003eb3b2d6bSIlya Dryomov pii.image_id = NULL; 5004eb3b2d6bSIlya Dryomov parent_spec->snap_id = pii.snap_id; 5005b26c047bSIlya Dryomov 500686b00e0dSAlex Elder rbd_dev->parent_spec = parent_spec; 500786b00e0dSAlex Elder parent_spec = NULL; /* rbd_dev now owns this */ 50083b5cf2a2SAlex Elder } 50093b5cf2a2SAlex Elder 50103b5cf2a2SAlex Elder /* 5011cf32bd9cSIlya Dryomov * We always update the parent overlap. If it's zero we issue 5012cf32bd9cSIlya Dryomov * a warning, as we will proceed as if there was no parent. 50133b5cf2a2SAlex Elder */ 5014eb3b2d6bSIlya Dryomov if (!pii.overlap) { 50153b5cf2a2SAlex Elder if (parent_spec) { 5016cf32bd9cSIlya Dryomov /* refresh, careful to warn just once */ 5017cf32bd9cSIlya Dryomov if (rbd_dev->parent_overlap) 5018cf32bd9cSIlya Dryomov rbd_warn(rbd_dev, 5019cf32bd9cSIlya Dryomov "clone now standalone (overlap became 0)"); 502070cf49cfSAlex Elder } else { 5021cf32bd9cSIlya Dryomov /* initial probe */ 5022cf32bd9cSIlya Dryomov rbd_warn(rbd_dev, "clone is standalone (overlap 0)"); 50233b5cf2a2SAlex Elder } 502470cf49cfSAlex Elder } 5025eb3b2d6bSIlya Dryomov rbd_dev->parent_overlap = pii.overlap; 5026cf32bd9cSIlya Dryomov 502786b00e0dSAlex Elder out: 502886b00e0dSAlex Elder ret = 0; 502986b00e0dSAlex Elder out_err: 5030e92c0eafSIlya Dryomov kfree(pii.pool_ns); 5031eb3b2d6bSIlya Dryomov kfree(pii.image_id); 503286b00e0dSAlex Elder rbd_spec_put(parent_spec); 503386b00e0dSAlex Elder return ret; 503486b00e0dSAlex Elder } 503586b00e0dSAlex Elder 5036cc070d59SAlex Elder static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev) 5037cc070d59SAlex Elder { 5038cc070d59SAlex Elder struct { 5039cc070d59SAlex Elder __le64 stripe_unit; 5040cc070d59SAlex Elder __le64 stripe_count; 5041cc070d59SAlex Elder } __attribute__ ((packed)) striping_info_buf = { 0 }; 5042cc070d59SAlex Elder size_t size = sizeof (striping_info_buf); 5043cc070d59SAlex Elder void *p; 5044cc070d59SAlex Elder int ret; 5045cc070d59SAlex Elder 5046ecd4a68aSIlya Dryomov ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid, 5047ecd4a68aSIlya Dryomov &rbd_dev->header_oloc, "get_stripe_unit_count", 5048ecd4a68aSIlya Dryomov NULL, 0, &striping_info_buf, size); 5049cc070d59SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 5050cc070d59SAlex Elder if (ret < 0) 5051cc070d59SAlex Elder return ret; 5052cc070d59SAlex Elder if (ret < size) 5053cc070d59SAlex Elder return -ERANGE; 5054cc070d59SAlex Elder 5055cc070d59SAlex Elder p = &striping_info_buf; 5056b1331852SIlya Dryomov rbd_dev->header.stripe_unit = ceph_decode_64(&p); 5057b1331852SIlya Dryomov rbd_dev->header.stripe_count = ceph_decode_64(&p); 5058cc070d59SAlex Elder return 0; 5059cc070d59SAlex Elder } 5060cc070d59SAlex Elder 50617e97332eSIlya Dryomov static int rbd_dev_v2_data_pool(struct rbd_device *rbd_dev) 50627e97332eSIlya Dryomov { 50637e97332eSIlya Dryomov __le64 data_pool_id; 50647e97332eSIlya Dryomov int ret; 50657e97332eSIlya Dryomov 50667e97332eSIlya Dryomov ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid, 50677e97332eSIlya Dryomov &rbd_dev->header_oloc, "get_data_pool", 50687e97332eSIlya Dryomov NULL, 0, &data_pool_id, sizeof(data_pool_id)); 50697e97332eSIlya Dryomov if (ret < 0) 50707e97332eSIlya Dryomov return ret; 50717e97332eSIlya Dryomov if (ret < sizeof(data_pool_id)) 50727e97332eSIlya Dryomov return -EBADMSG; 50737e97332eSIlya Dryomov 50747e97332eSIlya Dryomov rbd_dev->header.data_pool_id = le64_to_cpu(data_pool_id); 50757e97332eSIlya Dryomov WARN_ON(rbd_dev->header.data_pool_id == CEPH_NOPOOL); 50767e97332eSIlya Dryomov return 0; 50777e97332eSIlya Dryomov } 50787e97332eSIlya Dryomov 50799e15b77dSAlex Elder static char *rbd_dev_image_name(struct rbd_device *rbd_dev) 50809e15b77dSAlex Elder { 5081ecd4a68aSIlya Dryomov CEPH_DEFINE_OID_ONSTACK(oid); 50829e15b77dSAlex Elder size_t image_id_size; 50839e15b77dSAlex Elder char *image_id; 50849e15b77dSAlex Elder void *p; 50859e15b77dSAlex Elder void *end; 50869e15b77dSAlex Elder size_t size; 50879e15b77dSAlex Elder void *reply_buf = NULL; 50889e15b77dSAlex Elder size_t len = 0; 50899e15b77dSAlex Elder char *image_name = NULL; 50909e15b77dSAlex Elder int ret; 50919e15b77dSAlex Elder 50929e15b77dSAlex Elder rbd_assert(!rbd_dev->spec->image_name); 50939e15b77dSAlex Elder 509469e7a02fSAlex Elder len = strlen(rbd_dev->spec->image_id); 509569e7a02fSAlex Elder image_id_size = sizeof (__le32) + len; 50969e15b77dSAlex Elder image_id = kmalloc(image_id_size, GFP_KERNEL); 50979e15b77dSAlex Elder if (!image_id) 50989e15b77dSAlex Elder return NULL; 50999e15b77dSAlex Elder 51009e15b77dSAlex Elder p = image_id; 51014157976bSAlex Elder end = image_id + image_id_size; 510269e7a02fSAlex Elder ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32)len); 51039e15b77dSAlex Elder 51049e15b77dSAlex Elder size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX; 51059e15b77dSAlex Elder reply_buf = kmalloc(size, GFP_KERNEL); 51069e15b77dSAlex Elder if (!reply_buf) 51079e15b77dSAlex Elder goto out; 51089e15b77dSAlex Elder 5109ecd4a68aSIlya Dryomov ceph_oid_printf(&oid, "%s", RBD_DIRECTORY); 5110ecd4a68aSIlya Dryomov ret = rbd_obj_method_sync(rbd_dev, &oid, &rbd_dev->header_oloc, 5111ecd4a68aSIlya Dryomov "dir_get_name", image_id, image_id_size, 5112e2a58ee5SAlex Elder reply_buf, size); 51139e15b77dSAlex Elder if (ret < 0) 51149e15b77dSAlex Elder goto out; 51159e15b77dSAlex Elder p = reply_buf; 5116f40eb349SAlex Elder end = reply_buf + ret; 5117f40eb349SAlex Elder 51189e15b77dSAlex Elder image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL); 51199e15b77dSAlex Elder if (IS_ERR(image_name)) 51209e15b77dSAlex Elder image_name = NULL; 51219e15b77dSAlex Elder else 51229e15b77dSAlex Elder dout("%s: name is %s len is %zd\n", __func__, image_name, len); 51239e15b77dSAlex Elder out: 51249e15b77dSAlex Elder kfree(reply_buf); 51259e15b77dSAlex Elder kfree(image_id); 51269e15b77dSAlex Elder 51279e15b77dSAlex Elder return image_name; 51289e15b77dSAlex Elder } 51299e15b77dSAlex Elder 51302ad3d716SAlex Elder static u64 rbd_v1_snap_id_by_name(struct rbd_device *rbd_dev, const char *name) 51312ad3d716SAlex Elder { 51322ad3d716SAlex Elder struct ceph_snap_context *snapc = rbd_dev->header.snapc; 51332ad3d716SAlex Elder const char *snap_name; 51342ad3d716SAlex Elder u32 which = 0; 51352ad3d716SAlex Elder 51362ad3d716SAlex Elder /* Skip over names until we find the one we are looking for */ 51372ad3d716SAlex Elder 51382ad3d716SAlex Elder snap_name = rbd_dev->header.snap_names; 51392ad3d716SAlex Elder while (which < snapc->num_snaps) { 51402ad3d716SAlex Elder if (!strcmp(name, snap_name)) 51412ad3d716SAlex Elder return snapc->snaps[which]; 51422ad3d716SAlex Elder snap_name += strlen(snap_name) + 1; 51432ad3d716SAlex Elder which++; 51442ad3d716SAlex Elder } 51452ad3d716SAlex Elder return CEPH_NOSNAP; 51462ad3d716SAlex Elder } 51472ad3d716SAlex Elder 51482ad3d716SAlex Elder static u64 rbd_v2_snap_id_by_name(struct rbd_device *rbd_dev, const char *name) 51492ad3d716SAlex Elder { 51502ad3d716SAlex Elder struct ceph_snap_context *snapc = rbd_dev->header.snapc; 51512ad3d716SAlex Elder u32 which; 51522ad3d716SAlex Elder bool found = false; 51532ad3d716SAlex Elder u64 snap_id; 51542ad3d716SAlex Elder 51552ad3d716SAlex Elder for (which = 0; !found && which < snapc->num_snaps; which++) { 51562ad3d716SAlex Elder const char *snap_name; 51572ad3d716SAlex Elder 51582ad3d716SAlex Elder snap_id = snapc->snaps[which]; 51592ad3d716SAlex Elder snap_name = rbd_dev_v2_snap_name(rbd_dev, snap_id); 5160efadc98aSJosh Durgin if (IS_ERR(snap_name)) { 5161efadc98aSJosh Durgin /* ignore no-longer existing snapshots */ 5162efadc98aSJosh Durgin if (PTR_ERR(snap_name) == -ENOENT) 5163efadc98aSJosh Durgin continue; 5164efadc98aSJosh Durgin else 51652ad3d716SAlex Elder break; 5166efadc98aSJosh Durgin } 51672ad3d716SAlex Elder found = !strcmp(name, snap_name); 51682ad3d716SAlex Elder kfree(snap_name); 51692ad3d716SAlex Elder } 51702ad3d716SAlex Elder return found ? snap_id : CEPH_NOSNAP; 51712ad3d716SAlex Elder } 51722ad3d716SAlex Elder 51732ad3d716SAlex Elder /* 51742ad3d716SAlex Elder * Assumes name is never RBD_SNAP_HEAD_NAME; returns CEPH_NOSNAP if 51752ad3d716SAlex Elder * no snapshot by that name is found, or if an error occurs. 51762ad3d716SAlex Elder */ 51772ad3d716SAlex Elder static u64 rbd_snap_id_by_name(struct rbd_device *rbd_dev, const char *name) 51782ad3d716SAlex Elder { 51792ad3d716SAlex Elder if (rbd_dev->image_format == 1) 51802ad3d716SAlex Elder return rbd_v1_snap_id_by_name(rbd_dev, name); 51812ad3d716SAlex Elder 51822ad3d716SAlex Elder return rbd_v2_snap_id_by_name(rbd_dev, name); 51832ad3d716SAlex Elder } 51842ad3d716SAlex Elder 51859e15b77dSAlex Elder /* 518604077599SIlya Dryomov * An image being mapped will have everything but the snap id. 51879e15b77dSAlex Elder */ 518804077599SIlya Dryomov static int rbd_spec_fill_snap_id(struct rbd_device *rbd_dev) 518904077599SIlya Dryomov { 519004077599SIlya Dryomov struct rbd_spec *spec = rbd_dev->spec; 519104077599SIlya Dryomov 519204077599SIlya Dryomov rbd_assert(spec->pool_id != CEPH_NOPOOL && spec->pool_name); 519304077599SIlya Dryomov rbd_assert(spec->image_id && spec->image_name); 519404077599SIlya Dryomov rbd_assert(spec->snap_name); 519504077599SIlya Dryomov 519604077599SIlya Dryomov if (strcmp(spec->snap_name, RBD_SNAP_HEAD_NAME)) { 519704077599SIlya Dryomov u64 snap_id; 519804077599SIlya Dryomov 519904077599SIlya Dryomov snap_id = rbd_snap_id_by_name(rbd_dev, spec->snap_name); 520004077599SIlya Dryomov if (snap_id == CEPH_NOSNAP) 520104077599SIlya Dryomov return -ENOENT; 520204077599SIlya Dryomov 520304077599SIlya Dryomov spec->snap_id = snap_id; 520404077599SIlya Dryomov } else { 520504077599SIlya Dryomov spec->snap_id = CEPH_NOSNAP; 520604077599SIlya Dryomov } 520704077599SIlya Dryomov 520804077599SIlya Dryomov return 0; 520904077599SIlya Dryomov } 521004077599SIlya Dryomov 521104077599SIlya Dryomov /* 521204077599SIlya Dryomov * A parent image will have all ids but none of the names. 521304077599SIlya Dryomov * 521404077599SIlya Dryomov * All names in an rbd spec are dynamically allocated. It's OK if we 521504077599SIlya Dryomov * can't figure out the name for an image id. 521604077599SIlya Dryomov */ 521704077599SIlya Dryomov static int rbd_spec_fill_names(struct rbd_device *rbd_dev) 52189e15b77dSAlex Elder { 52192e9f7f1cSAlex Elder struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 52202e9f7f1cSAlex Elder struct rbd_spec *spec = rbd_dev->spec; 52212e9f7f1cSAlex Elder const char *pool_name; 52222e9f7f1cSAlex Elder const char *image_name; 52232e9f7f1cSAlex Elder const char *snap_name; 52249e15b77dSAlex Elder int ret; 52259e15b77dSAlex Elder 522604077599SIlya Dryomov rbd_assert(spec->pool_id != CEPH_NOPOOL); 522704077599SIlya Dryomov rbd_assert(spec->image_id); 522804077599SIlya Dryomov rbd_assert(spec->snap_id != CEPH_NOSNAP); 52299e15b77dSAlex Elder 52302e9f7f1cSAlex Elder /* Get the pool name; we have to make our own copy of this */ 52319e15b77dSAlex Elder 52322e9f7f1cSAlex Elder pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, spec->pool_id); 52332e9f7f1cSAlex Elder if (!pool_name) { 52342e9f7f1cSAlex Elder rbd_warn(rbd_dev, "no pool with id %llu", spec->pool_id); 5235935dc89fSAlex Elder return -EIO; 5236935dc89fSAlex Elder } 52372e9f7f1cSAlex Elder pool_name = kstrdup(pool_name, GFP_KERNEL); 52382e9f7f1cSAlex Elder if (!pool_name) 52399e15b77dSAlex Elder return -ENOMEM; 52409e15b77dSAlex Elder 52419e15b77dSAlex Elder /* Fetch the image name; tolerate failure here */ 52429e15b77dSAlex Elder 52432e9f7f1cSAlex Elder image_name = rbd_dev_image_name(rbd_dev); 52442e9f7f1cSAlex Elder if (!image_name) 524506ecc6cbSAlex Elder rbd_warn(rbd_dev, "unable to get image name"); 52469e15b77dSAlex Elder 524704077599SIlya Dryomov /* Fetch the snapshot name */ 52489e15b77dSAlex Elder 52492e9f7f1cSAlex Elder snap_name = rbd_snap_name(rbd_dev, spec->snap_id); 5250da6a6b63SJosh Durgin if (IS_ERR(snap_name)) { 5251da6a6b63SJosh Durgin ret = PTR_ERR(snap_name); 52529e15b77dSAlex Elder goto out_err; 52532e9f7f1cSAlex Elder } 52542e9f7f1cSAlex Elder 52552e9f7f1cSAlex Elder spec->pool_name = pool_name; 52562e9f7f1cSAlex Elder spec->image_name = image_name; 52572e9f7f1cSAlex Elder spec->snap_name = snap_name; 52589e15b77dSAlex Elder 52599e15b77dSAlex Elder return 0; 526004077599SIlya Dryomov 52619e15b77dSAlex Elder out_err: 52622e9f7f1cSAlex Elder kfree(image_name); 52632e9f7f1cSAlex Elder kfree(pool_name); 52649e15b77dSAlex Elder return ret; 52659e15b77dSAlex Elder } 52669e15b77dSAlex Elder 5267cc4a38bdSAlex Elder static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev) 526835d489f9SAlex Elder { 526935d489f9SAlex Elder size_t size; 527035d489f9SAlex Elder int ret; 527135d489f9SAlex Elder void *reply_buf; 527235d489f9SAlex Elder void *p; 527335d489f9SAlex Elder void *end; 527435d489f9SAlex Elder u64 seq; 527535d489f9SAlex Elder u32 snap_count; 527635d489f9SAlex Elder struct ceph_snap_context *snapc; 527735d489f9SAlex Elder u32 i; 527835d489f9SAlex Elder 527935d489f9SAlex Elder /* 528035d489f9SAlex Elder * We'll need room for the seq value (maximum snapshot id), 528135d489f9SAlex Elder * snapshot count, and array of that many snapshot ids. 528235d489f9SAlex Elder * For now we have a fixed upper limit on the number we're 528335d489f9SAlex Elder * prepared to receive. 528435d489f9SAlex Elder */ 528535d489f9SAlex Elder size = sizeof (__le64) + sizeof (__le32) + 528635d489f9SAlex Elder RBD_MAX_SNAP_COUNT * sizeof (__le64); 528735d489f9SAlex Elder reply_buf = kzalloc(size, GFP_KERNEL); 528835d489f9SAlex Elder if (!reply_buf) 528935d489f9SAlex Elder return -ENOMEM; 529035d489f9SAlex Elder 5291ecd4a68aSIlya Dryomov ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid, 5292ecd4a68aSIlya Dryomov &rbd_dev->header_oloc, "get_snapcontext", 5293ecd4a68aSIlya Dryomov NULL, 0, reply_buf, size); 529436be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 529535d489f9SAlex Elder if (ret < 0) 529635d489f9SAlex Elder goto out; 529735d489f9SAlex Elder 529835d489f9SAlex Elder p = reply_buf; 529957385b51SAlex Elder end = reply_buf + ret; 530057385b51SAlex Elder ret = -ERANGE; 530135d489f9SAlex Elder ceph_decode_64_safe(&p, end, seq, out); 530235d489f9SAlex Elder ceph_decode_32_safe(&p, end, snap_count, out); 530335d489f9SAlex Elder 530435d489f9SAlex Elder /* 530535d489f9SAlex Elder * Make sure the reported number of snapshot ids wouldn't go 530635d489f9SAlex Elder * beyond the end of our buffer. But before checking that, 530735d489f9SAlex Elder * make sure the computed size of the snapshot context we 530835d489f9SAlex Elder * allocate is representable in a size_t. 530935d489f9SAlex Elder */ 531035d489f9SAlex Elder if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context)) 531135d489f9SAlex Elder / sizeof (u64)) { 531235d489f9SAlex Elder ret = -EINVAL; 531335d489f9SAlex Elder goto out; 531435d489f9SAlex Elder } 531535d489f9SAlex Elder if (!ceph_has_room(&p, end, snap_count * sizeof (__le64))) 531635d489f9SAlex Elder goto out; 5317468521c1SAlex Elder ret = 0; 531835d489f9SAlex Elder 5319812164f8SAlex Elder snapc = ceph_create_snap_context(snap_count, GFP_KERNEL); 532035d489f9SAlex Elder if (!snapc) { 532135d489f9SAlex Elder ret = -ENOMEM; 532235d489f9SAlex Elder goto out; 532335d489f9SAlex Elder } 532435d489f9SAlex Elder snapc->seq = seq; 532535d489f9SAlex Elder for (i = 0; i < snap_count; i++) 532635d489f9SAlex Elder snapc->snaps[i] = ceph_decode_64(&p); 532735d489f9SAlex Elder 532849ece554SAlex Elder ceph_put_snap_context(rbd_dev->header.snapc); 532935d489f9SAlex Elder rbd_dev->header.snapc = snapc; 533035d489f9SAlex Elder 533135d489f9SAlex Elder dout(" snap context seq = %llu, snap_count = %u\n", 533235d489f9SAlex Elder (unsigned long long)seq, (unsigned int)snap_count); 533335d489f9SAlex Elder out: 533435d489f9SAlex Elder kfree(reply_buf); 533535d489f9SAlex Elder 533657385b51SAlex Elder return ret; 533735d489f9SAlex Elder } 533835d489f9SAlex Elder 533954cac61fSAlex Elder static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, 534054cac61fSAlex Elder u64 snap_id) 5341b8b1e2dbSAlex Elder { 5342b8b1e2dbSAlex Elder size_t size; 5343b8b1e2dbSAlex Elder void *reply_buf; 534454cac61fSAlex Elder __le64 snapid; 5345b8b1e2dbSAlex Elder int ret; 5346b8b1e2dbSAlex Elder void *p; 5347b8b1e2dbSAlex Elder void *end; 5348b8b1e2dbSAlex Elder char *snap_name; 5349b8b1e2dbSAlex Elder 5350b8b1e2dbSAlex Elder size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN; 5351b8b1e2dbSAlex Elder reply_buf = kmalloc(size, GFP_KERNEL); 5352b8b1e2dbSAlex Elder if (!reply_buf) 5353b8b1e2dbSAlex Elder return ERR_PTR(-ENOMEM); 5354b8b1e2dbSAlex Elder 535554cac61fSAlex Elder snapid = cpu_to_le64(snap_id); 5356ecd4a68aSIlya Dryomov ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid, 5357ecd4a68aSIlya Dryomov &rbd_dev->header_oloc, "get_snapshot_name", 5358ecd4a68aSIlya Dryomov &snapid, sizeof(snapid), reply_buf, size); 535936be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 5360f40eb349SAlex Elder if (ret < 0) { 5361f40eb349SAlex Elder snap_name = ERR_PTR(ret); 5362b8b1e2dbSAlex Elder goto out; 5363f40eb349SAlex Elder } 5364b8b1e2dbSAlex Elder 5365b8b1e2dbSAlex Elder p = reply_buf; 5366f40eb349SAlex Elder end = reply_buf + ret; 5367e5c35534SAlex Elder snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL); 5368f40eb349SAlex Elder if (IS_ERR(snap_name)) 5369b8b1e2dbSAlex Elder goto out; 5370f40eb349SAlex Elder 5371b8b1e2dbSAlex Elder dout(" snap_id 0x%016llx snap_name = %s\n", 537254cac61fSAlex Elder (unsigned long long)snap_id, snap_name); 5373b8b1e2dbSAlex Elder out: 5374b8b1e2dbSAlex Elder kfree(reply_buf); 5375b8b1e2dbSAlex Elder 5376f40eb349SAlex Elder return snap_name; 5377b8b1e2dbSAlex Elder } 5378b8b1e2dbSAlex Elder 53792df3fac7SAlex Elder static int rbd_dev_v2_header_info(struct rbd_device *rbd_dev) 5380117973fbSAlex Elder { 53812df3fac7SAlex Elder bool first_time = rbd_dev->header.object_prefix == NULL; 5382117973fbSAlex Elder int ret; 5383117973fbSAlex Elder 53841617e40cSJosh Durgin ret = rbd_dev_v2_image_size(rbd_dev); 53851617e40cSJosh Durgin if (ret) 5386cfbf6377SAlex Elder return ret; 53871617e40cSJosh Durgin 53882df3fac7SAlex Elder if (first_time) { 53892df3fac7SAlex Elder ret = rbd_dev_v2_header_onetime(rbd_dev); 53902df3fac7SAlex Elder if (ret) 5391cfbf6377SAlex Elder return ret; 53922df3fac7SAlex Elder } 53932df3fac7SAlex Elder 5394cc4a38bdSAlex Elder ret = rbd_dev_v2_snap_context(rbd_dev); 5395d194cd1dSIlya Dryomov if (ret && first_time) { 5396d194cd1dSIlya Dryomov kfree(rbd_dev->header.object_prefix); 5397d194cd1dSIlya Dryomov rbd_dev->header.object_prefix = NULL; 5398d194cd1dSIlya Dryomov } 5399117973fbSAlex Elder 5400117973fbSAlex Elder return ret; 5401117973fbSAlex Elder } 5402117973fbSAlex Elder 5403a720ae09SIlya Dryomov static int rbd_dev_header_info(struct rbd_device *rbd_dev) 5404a720ae09SIlya Dryomov { 5405a720ae09SIlya Dryomov rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); 5406a720ae09SIlya Dryomov 5407a720ae09SIlya Dryomov if (rbd_dev->image_format == 1) 5408a720ae09SIlya Dryomov return rbd_dev_v1_header_info(rbd_dev); 5409a720ae09SIlya Dryomov 5410a720ae09SIlya Dryomov return rbd_dev_v2_header_info(rbd_dev); 5411a720ae09SIlya Dryomov } 5412a720ae09SIlya Dryomov 54131ddbe94eSAlex Elder /* 5414e28fff26SAlex Elder * Skips over white space at *buf, and updates *buf to point to the 5415e28fff26SAlex Elder * first found non-space character (if any). Returns the length of 5416593a9e7bSAlex Elder * the token (string of non-white space characters) found. Note 5417593a9e7bSAlex Elder * that *buf must be terminated with '\0'. 5418e28fff26SAlex Elder */ 5419e28fff26SAlex Elder static inline size_t next_token(const char **buf) 5420e28fff26SAlex Elder { 5421e28fff26SAlex Elder /* 5422e28fff26SAlex Elder * These are the characters that produce nonzero for 5423e28fff26SAlex Elder * isspace() in the "C" and "POSIX" locales. 5424e28fff26SAlex Elder */ 5425e28fff26SAlex Elder const char *spaces = " \f\n\r\t\v"; 5426e28fff26SAlex Elder 5427e28fff26SAlex Elder *buf += strspn(*buf, spaces); /* Find start of token */ 5428e28fff26SAlex Elder 5429e28fff26SAlex Elder return strcspn(*buf, spaces); /* Return token length */ 5430e28fff26SAlex Elder } 5431e28fff26SAlex Elder 5432e28fff26SAlex Elder /* 5433ea3352f4SAlex Elder * Finds the next token in *buf, dynamically allocates a buffer big 5434ea3352f4SAlex Elder * enough to hold a copy of it, and copies the token into the new 5435ea3352f4SAlex Elder * buffer. The copy is guaranteed to be terminated with '\0'. Note 5436ea3352f4SAlex Elder * that a duplicate buffer is created even for a zero-length token. 5437ea3352f4SAlex Elder * 5438ea3352f4SAlex Elder * Returns a pointer to the newly-allocated duplicate, or a null 5439ea3352f4SAlex Elder * pointer if memory for the duplicate was not available. If 5440ea3352f4SAlex Elder * the lenp argument is a non-null pointer, the length of the token 5441ea3352f4SAlex Elder * (not including the '\0') is returned in *lenp. 5442ea3352f4SAlex Elder * 5443ea3352f4SAlex Elder * If successful, the *buf pointer will be updated to point beyond 5444ea3352f4SAlex Elder * the end of the found token. 5445ea3352f4SAlex Elder * 5446ea3352f4SAlex Elder * Note: uses GFP_KERNEL for allocation. 5447ea3352f4SAlex Elder */ 5448ea3352f4SAlex Elder static inline char *dup_token(const char **buf, size_t *lenp) 5449ea3352f4SAlex Elder { 5450ea3352f4SAlex Elder char *dup; 5451ea3352f4SAlex Elder size_t len; 5452ea3352f4SAlex Elder 5453ea3352f4SAlex Elder len = next_token(buf); 54544caf35f9SAlex Elder dup = kmemdup(*buf, len + 1, GFP_KERNEL); 5455ea3352f4SAlex Elder if (!dup) 5456ea3352f4SAlex Elder return NULL; 5457ea3352f4SAlex Elder *(dup + len) = '\0'; 5458ea3352f4SAlex Elder *buf += len; 5459ea3352f4SAlex Elder 5460ea3352f4SAlex Elder if (lenp) 5461ea3352f4SAlex Elder *lenp = len; 5462ea3352f4SAlex Elder 5463ea3352f4SAlex Elder return dup; 5464ea3352f4SAlex Elder } 5465ea3352f4SAlex Elder 5466ea3352f4SAlex Elder /* 5467859c31dfSAlex Elder * Parse the options provided for an "rbd add" (i.e., rbd image 5468859c31dfSAlex Elder * mapping) request. These arrive via a write to /sys/bus/rbd/add, 5469859c31dfSAlex Elder * and the data written is passed here via a NUL-terminated buffer. 5470859c31dfSAlex Elder * Returns 0 if successful or an error code otherwise. 5471d22f76e7SAlex Elder * 5472859c31dfSAlex Elder * The information extracted from these options is recorded in 5473859c31dfSAlex Elder * the other parameters which return dynamically-allocated 5474859c31dfSAlex Elder * structures: 5475859c31dfSAlex Elder * ceph_opts 5476859c31dfSAlex Elder * The address of a pointer that will refer to a ceph options 5477859c31dfSAlex Elder * structure. Caller must release the returned pointer using 5478859c31dfSAlex Elder * ceph_destroy_options() when it is no longer needed. 5479859c31dfSAlex Elder * rbd_opts 5480859c31dfSAlex Elder * Address of an rbd options pointer. Fully initialized by 5481859c31dfSAlex Elder * this function; caller must release with kfree(). 5482859c31dfSAlex Elder * spec 5483859c31dfSAlex Elder * Address of an rbd image specification pointer. Fully 5484859c31dfSAlex Elder * initialized by this function based on parsed options. 5485859c31dfSAlex Elder * Caller must release with rbd_spec_put(). 5486859c31dfSAlex Elder * 5487859c31dfSAlex Elder * The options passed take this form: 5488859c31dfSAlex Elder * <mon_addrs> <options> <pool_name> <image_name> [<snap_id>] 5489859c31dfSAlex Elder * where: 5490859c31dfSAlex Elder * <mon_addrs> 5491859c31dfSAlex Elder * A comma-separated list of one or more monitor addresses. 5492859c31dfSAlex Elder * A monitor address is an ip address, optionally followed 5493859c31dfSAlex Elder * by a port number (separated by a colon). 5494859c31dfSAlex Elder * I.e.: ip1[:port1][,ip2[:port2]...] 5495859c31dfSAlex Elder * <options> 5496859c31dfSAlex Elder * A comma-separated list of ceph and/or rbd options. 5497859c31dfSAlex Elder * <pool_name> 5498859c31dfSAlex Elder * The name of the rados pool containing the rbd image. 5499859c31dfSAlex Elder * <image_name> 5500859c31dfSAlex Elder * The name of the image in that pool to map. 5501859c31dfSAlex Elder * <snap_id> 5502859c31dfSAlex Elder * An optional snapshot id. If provided, the mapping will 5503859c31dfSAlex Elder * present data from the image at the time that snapshot was 5504859c31dfSAlex Elder * created. The image head is used if no snapshot id is 5505859c31dfSAlex Elder * provided. Snapshot mappings are always read-only. 5506a725f65eSAlex Elder */ 5507859c31dfSAlex Elder static int rbd_add_parse_args(const char *buf, 5508dc79b113SAlex Elder struct ceph_options **ceph_opts, 5509859c31dfSAlex Elder struct rbd_options **opts, 5510859c31dfSAlex Elder struct rbd_spec **rbd_spec) 5511a725f65eSAlex Elder { 5512e28fff26SAlex Elder size_t len; 5513859c31dfSAlex Elder char *options; 55140ddebc0cSAlex Elder const char *mon_addrs; 5515ecb4dc22SAlex Elder char *snap_name; 55160ddebc0cSAlex Elder size_t mon_addrs_size; 5517c300156bSIlya Dryomov struct parse_rbd_opts_ctx pctx = { 0 }; 5518859c31dfSAlex Elder struct ceph_options *copts; 5519dc79b113SAlex Elder int ret; 5520e28fff26SAlex Elder 5521e28fff26SAlex Elder /* The first four tokens are required */ 5522e28fff26SAlex Elder 55237ef3214aSAlex Elder len = next_token(&buf); 55244fb5d671SAlex Elder if (!len) { 55254fb5d671SAlex Elder rbd_warn(NULL, "no monitor address(es) provided"); 55264fb5d671SAlex Elder return -EINVAL; 55274fb5d671SAlex Elder } 55280ddebc0cSAlex Elder mon_addrs = buf; 5529f28e565aSAlex Elder mon_addrs_size = len + 1; 55307ef3214aSAlex Elder buf += len; 5531a725f65eSAlex Elder 5532dc79b113SAlex Elder ret = -EINVAL; 5533f28e565aSAlex Elder options = dup_token(&buf, NULL); 5534f28e565aSAlex Elder if (!options) 5535dc79b113SAlex Elder return -ENOMEM; 55364fb5d671SAlex Elder if (!*options) { 55374fb5d671SAlex Elder rbd_warn(NULL, "no options provided"); 55384fb5d671SAlex Elder goto out_err; 55394fb5d671SAlex Elder } 5540a725f65eSAlex Elder 5541c300156bSIlya Dryomov pctx.spec = rbd_spec_alloc(); 5542c300156bSIlya Dryomov if (!pctx.spec) 5543f28e565aSAlex Elder goto out_mem; 5544859c31dfSAlex Elder 5545c300156bSIlya Dryomov pctx.spec->pool_name = dup_token(&buf, NULL); 5546c300156bSIlya Dryomov if (!pctx.spec->pool_name) 5547859c31dfSAlex Elder goto out_mem; 5548c300156bSIlya Dryomov if (!*pctx.spec->pool_name) { 55494fb5d671SAlex Elder rbd_warn(NULL, "no pool name provided"); 55504fb5d671SAlex Elder goto out_err; 55514fb5d671SAlex Elder } 5552e28fff26SAlex Elder 5553c300156bSIlya Dryomov pctx.spec->image_name = dup_token(&buf, NULL); 5554c300156bSIlya Dryomov if (!pctx.spec->image_name) 5555f28e565aSAlex Elder goto out_mem; 5556c300156bSIlya Dryomov if (!*pctx.spec->image_name) { 55574fb5d671SAlex Elder rbd_warn(NULL, "no image name provided"); 55584fb5d671SAlex Elder goto out_err; 55594fb5d671SAlex Elder } 5560e28fff26SAlex Elder 5561f28e565aSAlex Elder /* 5562f28e565aSAlex Elder * Snapshot name is optional; default is to use "-" 5563f28e565aSAlex Elder * (indicating the head/no snapshot). 5564f28e565aSAlex Elder */ 55653feeb894SAlex Elder len = next_token(&buf); 5566820a5f3eSAlex Elder if (!len) { 55673feeb894SAlex Elder buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */ 55683feeb894SAlex Elder len = sizeof (RBD_SNAP_HEAD_NAME) - 1; 5569f28e565aSAlex Elder } else if (len > RBD_MAX_SNAP_NAME_LEN) { 5570dc79b113SAlex Elder ret = -ENAMETOOLONG; 5571f28e565aSAlex Elder goto out_err; 5572849b4260SAlex Elder } 5573ecb4dc22SAlex Elder snap_name = kmemdup(buf, len + 1, GFP_KERNEL); 5574ecb4dc22SAlex Elder if (!snap_name) 5575f28e565aSAlex Elder goto out_mem; 5576ecb4dc22SAlex Elder *(snap_name + len) = '\0'; 5577c300156bSIlya Dryomov pctx.spec->snap_name = snap_name; 5578e5c35534SAlex Elder 55790ddebc0cSAlex Elder /* Initialize all rbd options to the defaults */ 5580e28fff26SAlex Elder 5581c300156bSIlya Dryomov pctx.opts = kzalloc(sizeof(*pctx.opts), GFP_KERNEL); 5582c300156bSIlya Dryomov if (!pctx.opts) 55834e9afebaSAlex Elder goto out_mem; 55844e9afebaSAlex Elder 5585c300156bSIlya Dryomov pctx.opts->read_only = RBD_READ_ONLY_DEFAULT; 5586c300156bSIlya Dryomov pctx.opts->queue_depth = RBD_QUEUE_DEPTH_DEFAULT; 55870c93e1b7SIlya Dryomov pctx.opts->alloc_size = RBD_ALLOC_SIZE_DEFAULT; 5588c300156bSIlya Dryomov pctx.opts->lock_timeout = RBD_LOCK_TIMEOUT_DEFAULT; 5589c300156bSIlya Dryomov pctx.opts->lock_on_read = RBD_LOCK_ON_READ_DEFAULT; 5590c300156bSIlya Dryomov pctx.opts->exclusive = RBD_EXCLUSIVE_DEFAULT; 5591c300156bSIlya Dryomov pctx.opts->trim = RBD_TRIM_DEFAULT; 5592d22f76e7SAlex Elder 5593859c31dfSAlex Elder copts = ceph_parse_options(options, mon_addrs, 55940ddebc0cSAlex Elder mon_addrs + mon_addrs_size - 1, 5595c300156bSIlya Dryomov parse_rbd_opts_token, &pctx); 5596859c31dfSAlex Elder if (IS_ERR(copts)) { 5597859c31dfSAlex Elder ret = PTR_ERR(copts); 5598dc79b113SAlex Elder goto out_err; 5599dc79b113SAlex Elder } 5600859c31dfSAlex Elder kfree(options); 5601859c31dfSAlex Elder 5602859c31dfSAlex Elder *ceph_opts = copts; 5603c300156bSIlya Dryomov *opts = pctx.opts; 5604c300156bSIlya Dryomov *rbd_spec = pctx.spec; 56050ddebc0cSAlex Elder 5606dc79b113SAlex Elder return 0; 5607f28e565aSAlex Elder out_mem: 5608dc79b113SAlex Elder ret = -ENOMEM; 5609d22f76e7SAlex Elder out_err: 5610c300156bSIlya Dryomov kfree(pctx.opts); 5611c300156bSIlya Dryomov rbd_spec_put(pctx.spec); 5612f28e565aSAlex Elder kfree(options); 5613d22f76e7SAlex Elder 5614dc79b113SAlex Elder return ret; 5615a725f65eSAlex Elder } 5616a725f65eSAlex Elder 5617e010dd0aSIlya Dryomov static void rbd_dev_image_unlock(struct rbd_device *rbd_dev) 5618e010dd0aSIlya Dryomov { 5619e010dd0aSIlya Dryomov down_write(&rbd_dev->lock_rwsem); 5620e010dd0aSIlya Dryomov if (__rbd_is_lock_owner(rbd_dev)) 5621e010dd0aSIlya Dryomov rbd_unlock(rbd_dev); 5622e010dd0aSIlya Dryomov up_write(&rbd_dev->lock_rwsem); 5623e010dd0aSIlya Dryomov } 5624e010dd0aSIlya Dryomov 5625e010dd0aSIlya Dryomov static int rbd_add_acquire_lock(struct rbd_device *rbd_dev) 5626e010dd0aSIlya Dryomov { 56272f18d466SIlya Dryomov int ret; 56282f18d466SIlya Dryomov 5629e010dd0aSIlya Dryomov if (!(rbd_dev->header.features & RBD_FEATURE_EXCLUSIVE_LOCK)) { 5630e010dd0aSIlya Dryomov rbd_warn(rbd_dev, "exclusive-lock feature is not enabled"); 5631e010dd0aSIlya Dryomov return -EINVAL; 5632e010dd0aSIlya Dryomov } 5633e010dd0aSIlya Dryomov 5634e010dd0aSIlya Dryomov /* FIXME: "rbd map --exclusive" should be in interruptible */ 5635e010dd0aSIlya Dryomov down_read(&rbd_dev->lock_rwsem); 56362f18d466SIlya Dryomov ret = rbd_wait_state_locked(rbd_dev, true); 5637e010dd0aSIlya Dryomov up_read(&rbd_dev->lock_rwsem); 56382f18d466SIlya Dryomov if (ret) { 5639e010dd0aSIlya Dryomov rbd_warn(rbd_dev, "failed to acquire exclusive lock"); 5640e010dd0aSIlya Dryomov return -EROFS; 5641e010dd0aSIlya Dryomov } 5642e010dd0aSIlya Dryomov 5643e010dd0aSIlya Dryomov return 0; 5644e010dd0aSIlya Dryomov } 5645e010dd0aSIlya Dryomov 564630ba1f02SIlya Dryomov /* 5647589d30e0SAlex Elder * An rbd format 2 image has a unique identifier, distinct from the 5648589d30e0SAlex Elder * name given to it by the user. Internally, that identifier is 5649589d30e0SAlex Elder * what's used to specify the names of objects related to the image. 5650589d30e0SAlex Elder * 5651589d30e0SAlex Elder * A special "rbd id" object is used to map an rbd image name to its 5652589d30e0SAlex Elder * id. If that object doesn't exist, then there is no v2 rbd image 5653589d30e0SAlex Elder * with the supplied name. 5654589d30e0SAlex Elder * 5655589d30e0SAlex Elder * This function will record the given rbd_dev's image_id field if 5656589d30e0SAlex Elder * it can be determined, and in that case will return 0. If any 5657589d30e0SAlex Elder * errors occur a negative errno will be returned and the rbd_dev's 5658589d30e0SAlex Elder * image_id field will be unchanged (and should be NULL). 5659589d30e0SAlex Elder */ 5660589d30e0SAlex Elder static int rbd_dev_image_id(struct rbd_device *rbd_dev) 5661589d30e0SAlex Elder { 5662589d30e0SAlex Elder int ret; 5663589d30e0SAlex Elder size_t size; 5664ecd4a68aSIlya Dryomov CEPH_DEFINE_OID_ONSTACK(oid); 5665589d30e0SAlex Elder void *response; 5666c0fba368SAlex Elder char *image_id; 56672f82ee54SAlex Elder 5668589d30e0SAlex Elder /* 56692c0d0a10SAlex Elder * When probing a parent image, the image id is already 56702c0d0a10SAlex Elder * known (and the image name likely is not). There's no 5671c0fba368SAlex Elder * need to fetch the image id again in this case. We 5672c0fba368SAlex Elder * do still need to set the image format though. 56732c0d0a10SAlex Elder */ 5674c0fba368SAlex Elder if (rbd_dev->spec->image_id) { 5675c0fba368SAlex Elder rbd_dev->image_format = *rbd_dev->spec->image_id ? 2 : 1; 5676c0fba368SAlex Elder 56772c0d0a10SAlex Elder return 0; 5678c0fba368SAlex Elder } 56792c0d0a10SAlex Elder 56802c0d0a10SAlex Elder /* 5681589d30e0SAlex Elder * First, see if the format 2 image id file exists, and if 5682589d30e0SAlex Elder * so, get the image's persistent id from it. 5683589d30e0SAlex Elder */ 5684ecd4a68aSIlya Dryomov ret = ceph_oid_aprintf(&oid, GFP_KERNEL, "%s%s", RBD_ID_PREFIX, 5685ecd4a68aSIlya Dryomov rbd_dev->spec->image_name); 5686ecd4a68aSIlya Dryomov if (ret) 5687ecd4a68aSIlya Dryomov return ret; 5688ecd4a68aSIlya Dryomov 5689ecd4a68aSIlya Dryomov dout("rbd id object name is %s\n", oid.name); 5690589d30e0SAlex Elder 5691589d30e0SAlex Elder /* Response will be an encoded string, which includes a length */ 5692589d30e0SAlex Elder 5693589d30e0SAlex Elder size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX; 5694589d30e0SAlex Elder response = kzalloc(size, GFP_NOIO); 5695589d30e0SAlex Elder if (!response) { 5696589d30e0SAlex Elder ret = -ENOMEM; 5697589d30e0SAlex Elder goto out; 5698589d30e0SAlex Elder } 5699589d30e0SAlex Elder 5700c0fba368SAlex Elder /* If it doesn't exist we'll assume it's a format 1 image */ 5701c0fba368SAlex Elder 5702ecd4a68aSIlya Dryomov ret = rbd_obj_method_sync(rbd_dev, &oid, &rbd_dev->header_oloc, 5703ecd4a68aSIlya Dryomov "get_id", NULL, 0, 5704e2a58ee5SAlex Elder response, RBD_IMAGE_ID_LEN_MAX); 570536be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 5706c0fba368SAlex Elder if (ret == -ENOENT) { 5707c0fba368SAlex Elder image_id = kstrdup("", GFP_KERNEL); 5708c0fba368SAlex Elder ret = image_id ? 0 : -ENOMEM; 5709c0fba368SAlex Elder if (!ret) 5710c0fba368SAlex Elder rbd_dev->image_format = 1; 57117dd440c9SIlya Dryomov } else if (ret >= 0) { 5712c0fba368SAlex Elder void *p = response; 5713589d30e0SAlex Elder 5714c0fba368SAlex Elder image_id = ceph_extract_encoded_string(&p, p + ret, 5715979ed480SAlex Elder NULL, GFP_NOIO); 5716461f758aSDuan Jiong ret = PTR_ERR_OR_ZERO(image_id); 5717c0fba368SAlex Elder if (!ret) 5718c0fba368SAlex Elder rbd_dev->image_format = 2; 5719c0fba368SAlex Elder } 5720c0fba368SAlex Elder 5721c0fba368SAlex Elder if (!ret) { 5722c0fba368SAlex Elder rbd_dev->spec->image_id = image_id; 5723c0fba368SAlex Elder dout("image_id is %s\n", image_id); 5724589d30e0SAlex Elder } 5725589d30e0SAlex Elder out: 5726589d30e0SAlex Elder kfree(response); 5727ecd4a68aSIlya Dryomov ceph_oid_destroy(&oid); 5728589d30e0SAlex Elder return ret; 5729589d30e0SAlex Elder } 5730589d30e0SAlex Elder 57313abef3b3SAlex Elder /* 57323abef3b3SAlex Elder * Undo whatever state changes are made by v1 or v2 header info 57333abef3b3SAlex Elder * call. 57343abef3b3SAlex Elder */ 57356fd48b3bSAlex Elder static void rbd_dev_unprobe(struct rbd_device *rbd_dev) 57366fd48b3bSAlex Elder { 57376fd48b3bSAlex Elder struct rbd_image_header *header; 57386fd48b3bSAlex Elder 5739a2acd00eSAlex Elder rbd_dev_parent_put(rbd_dev); 57406fd48b3bSAlex Elder 57416fd48b3bSAlex Elder /* Free dynamic fields from the header, then zero it out */ 57426fd48b3bSAlex Elder 57436fd48b3bSAlex Elder header = &rbd_dev->header; 5744812164f8SAlex Elder ceph_put_snap_context(header->snapc); 57456fd48b3bSAlex Elder kfree(header->snap_sizes); 57466fd48b3bSAlex Elder kfree(header->snap_names); 57476fd48b3bSAlex Elder kfree(header->object_prefix); 57486fd48b3bSAlex Elder memset(header, 0, sizeof (*header)); 57496fd48b3bSAlex Elder } 57506fd48b3bSAlex Elder 57512df3fac7SAlex Elder static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev) 5752a30b71b9SAlex Elder { 5753a30b71b9SAlex Elder int ret; 5754a30b71b9SAlex Elder 57551e130199SAlex Elder ret = rbd_dev_v2_object_prefix(rbd_dev); 575657385b51SAlex Elder if (ret) 57571e130199SAlex Elder goto out_err; 5758b1b5402aSAlex Elder 57592df3fac7SAlex Elder /* 57602df3fac7SAlex Elder * Get the and check features for the image. Currently the 57612df3fac7SAlex Elder * features are assumed to never change. 57622df3fac7SAlex Elder */ 5763b1b5402aSAlex Elder ret = rbd_dev_v2_features(rbd_dev); 576457385b51SAlex Elder if (ret) 5765b1b5402aSAlex Elder goto out_err; 576635d489f9SAlex Elder 5767cc070d59SAlex Elder /* If the image supports fancy striping, get its parameters */ 5768cc070d59SAlex Elder 5769cc070d59SAlex Elder if (rbd_dev->header.features & RBD_FEATURE_STRIPINGV2) { 5770cc070d59SAlex Elder ret = rbd_dev_v2_striping_info(rbd_dev); 5771cc070d59SAlex Elder if (ret < 0) 5772cc070d59SAlex Elder goto out_err; 5773cc070d59SAlex Elder } 5774a30b71b9SAlex Elder 57757e97332eSIlya Dryomov if (rbd_dev->header.features & RBD_FEATURE_DATA_POOL) { 57767e97332eSIlya Dryomov ret = rbd_dev_v2_data_pool(rbd_dev); 57777e97332eSIlya Dryomov if (ret) 57787e97332eSIlya Dryomov goto out_err; 57797e97332eSIlya Dryomov } 57807e97332eSIlya Dryomov 5781263423f8SIlya Dryomov rbd_init_layout(rbd_dev); 578235152979SAlex Elder return 0; 5783263423f8SIlya Dryomov 57849d475de5SAlex Elder out_err: 5785642a2537SAlex Elder rbd_dev->header.features = 0; 57861e130199SAlex Elder kfree(rbd_dev->header.object_prefix); 57871e130199SAlex Elder rbd_dev->header.object_prefix = NULL; 57889d475de5SAlex Elder return ret; 5789a30b71b9SAlex Elder } 5790a30b71b9SAlex Elder 57916d69bb53SIlya Dryomov /* 57926d69bb53SIlya Dryomov * @depth is rbd_dev_image_probe() -> rbd_dev_probe_parent() -> 57936d69bb53SIlya Dryomov * rbd_dev_image_probe() recursion depth, which means it's also the 57946d69bb53SIlya Dryomov * length of the already discovered part of the parent chain. 57956d69bb53SIlya Dryomov */ 57966d69bb53SIlya Dryomov static int rbd_dev_probe_parent(struct rbd_device *rbd_dev, int depth) 579783a06263SAlex Elder { 57982f82ee54SAlex Elder struct rbd_device *parent = NULL; 5799124afba2SAlex Elder int ret; 5800124afba2SAlex Elder 5801124afba2SAlex Elder if (!rbd_dev->parent_spec) 5802124afba2SAlex Elder return 0; 5803124afba2SAlex Elder 58046d69bb53SIlya Dryomov if (++depth > RBD_MAX_PARENT_CHAIN_LEN) { 58056d69bb53SIlya Dryomov pr_info("parent chain is too long (%d)\n", depth); 58066d69bb53SIlya Dryomov ret = -EINVAL; 58076d69bb53SIlya Dryomov goto out_err; 58086d69bb53SIlya Dryomov } 58096d69bb53SIlya Dryomov 58101643dfa4SIlya Dryomov parent = __rbd_dev_create(rbd_dev->rbd_client, rbd_dev->parent_spec); 58111f2c6651SIlya Dryomov if (!parent) { 5812124afba2SAlex Elder ret = -ENOMEM; 5813124afba2SAlex Elder goto out_err; 58141f2c6651SIlya Dryomov } 58151f2c6651SIlya Dryomov 58161f2c6651SIlya Dryomov /* 58171f2c6651SIlya Dryomov * Images related by parent/child relationships always share 58181f2c6651SIlya Dryomov * rbd_client and spec/parent_spec, so bump their refcounts. 58191f2c6651SIlya Dryomov */ 58201f2c6651SIlya Dryomov __rbd_get_client(rbd_dev->rbd_client); 58211f2c6651SIlya Dryomov rbd_spec_get(rbd_dev->parent_spec); 5822124afba2SAlex Elder 58236d69bb53SIlya Dryomov ret = rbd_dev_image_probe(parent, depth); 5824124afba2SAlex Elder if (ret < 0) 5825124afba2SAlex Elder goto out_err; 58261f2c6651SIlya Dryomov 5827124afba2SAlex Elder rbd_dev->parent = parent; 5828a2acd00eSAlex Elder atomic_set(&rbd_dev->parent_ref, 1); 5829124afba2SAlex Elder return 0; 5830124afba2SAlex Elder 58311f2c6651SIlya Dryomov out_err: 58321f2c6651SIlya Dryomov rbd_dev_unparent(rbd_dev); 58331f2c6651SIlya Dryomov rbd_dev_destroy(parent); 5834124afba2SAlex Elder return ret; 5835124afba2SAlex Elder } 5836124afba2SAlex Elder 58375769ed0cSIlya Dryomov static void rbd_dev_device_release(struct rbd_device *rbd_dev) 58385769ed0cSIlya Dryomov { 58395769ed0cSIlya Dryomov clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags); 58405769ed0cSIlya Dryomov rbd_dev_mapping_clear(rbd_dev); 58415769ed0cSIlya Dryomov rbd_free_disk(rbd_dev); 58425769ed0cSIlya Dryomov if (!single_major) 58435769ed0cSIlya Dryomov unregister_blkdev(rbd_dev->major, rbd_dev->name); 58445769ed0cSIlya Dryomov } 58455769ed0cSIlya Dryomov 5846811c6688SIlya Dryomov /* 5847811c6688SIlya Dryomov * rbd_dev->header_rwsem must be locked for write and will be unlocked 5848811c6688SIlya Dryomov * upon return. 5849811c6688SIlya Dryomov */ 5850200a6a8bSAlex Elder static int rbd_dev_device_setup(struct rbd_device *rbd_dev) 5851124afba2SAlex Elder { 585283a06263SAlex Elder int ret; 585383a06263SAlex Elder 58549b60e70bSIlya Dryomov /* Record our major and minor device numbers. */ 585583a06263SAlex Elder 58569b60e70bSIlya Dryomov if (!single_major) { 585783a06263SAlex Elder ret = register_blkdev(0, rbd_dev->name); 585883a06263SAlex Elder if (ret < 0) 58591643dfa4SIlya Dryomov goto err_out_unlock; 58609b60e70bSIlya Dryomov 586183a06263SAlex Elder rbd_dev->major = ret; 5862dd82fff1SIlya Dryomov rbd_dev->minor = 0; 58639b60e70bSIlya Dryomov } else { 58649b60e70bSIlya Dryomov rbd_dev->major = rbd_major; 58659b60e70bSIlya Dryomov rbd_dev->minor = rbd_dev_id_to_minor(rbd_dev->dev_id); 58669b60e70bSIlya Dryomov } 586783a06263SAlex Elder 586883a06263SAlex Elder /* Set up the blkdev mapping. */ 586983a06263SAlex Elder 587083a06263SAlex Elder ret = rbd_init_disk(rbd_dev); 587183a06263SAlex Elder if (ret) 587283a06263SAlex Elder goto err_out_blkdev; 587383a06263SAlex Elder 5874f35a4deeSAlex Elder ret = rbd_dev_mapping_set(rbd_dev); 587583a06263SAlex Elder if (ret) 587683a06263SAlex Elder goto err_out_disk; 5877bc1ecc65SIlya Dryomov 5878f35a4deeSAlex Elder set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE); 58799568c93eSIlya Dryomov set_disk_ro(rbd_dev->disk, rbd_dev->opts->read_only); 5880f35a4deeSAlex Elder 58815769ed0cSIlya Dryomov ret = dev_set_name(&rbd_dev->dev, "%d", rbd_dev->dev_id); 5882f35a4deeSAlex Elder if (ret) 5883f5ee37bdSIlya Dryomov goto err_out_mapping; 588483a06263SAlex Elder 5885129b79d4SAlex Elder set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags); 5886811c6688SIlya Dryomov up_write(&rbd_dev->header_rwsem); 58875769ed0cSIlya Dryomov return 0; 58882f82ee54SAlex Elder 5889f35a4deeSAlex Elder err_out_mapping: 5890f35a4deeSAlex Elder rbd_dev_mapping_clear(rbd_dev); 589183a06263SAlex Elder err_out_disk: 589283a06263SAlex Elder rbd_free_disk(rbd_dev); 589383a06263SAlex Elder err_out_blkdev: 58949b60e70bSIlya Dryomov if (!single_major) 589583a06263SAlex Elder unregister_blkdev(rbd_dev->major, rbd_dev->name); 5896811c6688SIlya Dryomov err_out_unlock: 5897811c6688SIlya Dryomov up_write(&rbd_dev->header_rwsem); 589883a06263SAlex Elder return ret; 589983a06263SAlex Elder } 590083a06263SAlex Elder 5901332bb12dSAlex Elder static int rbd_dev_header_name(struct rbd_device *rbd_dev) 5902332bb12dSAlex Elder { 5903332bb12dSAlex Elder struct rbd_spec *spec = rbd_dev->spec; 5904c41d13a3SIlya Dryomov int ret; 5905332bb12dSAlex Elder 5906332bb12dSAlex Elder /* Record the header object name for this rbd image. */ 5907332bb12dSAlex Elder 5908332bb12dSAlex Elder rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); 5909332bb12dSAlex Elder if (rbd_dev->image_format == 1) 5910c41d13a3SIlya Dryomov ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s", 5911332bb12dSAlex Elder spec->image_name, RBD_SUFFIX); 5912332bb12dSAlex Elder else 5913c41d13a3SIlya Dryomov ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s", 5914332bb12dSAlex Elder RBD_HEADER_PREFIX, spec->image_id); 5915c41d13a3SIlya Dryomov 5916c41d13a3SIlya Dryomov return ret; 5917332bb12dSAlex Elder } 5918332bb12dSAlex Elder 5919200a6a8bSAlex Elder static void rbd_dev_image_release(struct rbd_device *rbd_dev) 5920200a6a8bSAlex Elder { 59216fd48b3bSAlex Elder rbd_dev_unprobe(rbd_dev); 5922fd22aef8SIlya Dryomov if (rbd_dev->opts) 5923fd22aef8SIlya Dryomov rbd_unregister_watch(rbd_dev); 59246fd48b3bSAlex Elder rbd_dev->image_format = 0; 59256fd48b3bSAlex Elder kfree(rbd_dev->spec->image_id); 59266fd48b3bSAlex Elder rbd_dev->spec->image_id = NULL; 5927200a6a8bSAlex Elder } 5928200a6a8bSAlex Elder 5929a30b71b9SAlex Elder /* 5930a30b71b9SAlex Elder * Probe for the existence of the header object for the given rbd 59311f3ef788SAlex Elder * device. If this image is the one being mapped (i.e., not a 59321f3ef788SAlex Elder * parent), initiate a watch on its header object before using that 59331f3ef788SAlex Elder * object to get detailed information about the rbd image. 5934a30b71b9SAlex Elder */ 59356d69bb53SIlya Dryomov static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth) 5936a30b71b9SAlex Elder { 5937a30b71b9SAlex Elder int ret; 5938a30b71b9SAlex Elder 5939a30b71b9SAlex Elder /* 59403abef3b3SAlex Elder * Get the id from the image id object. Unless there's an 59413abef3b3SAlex Elder * error, rbd_dev->spec->image_id will be filled in with 59423abef3b3SAlex Elder * a dynamically-allocated string, and rbd_dev->image_format 59433abef3b3SAlex Elder * will be set to either 1 or 2. 5944a30b71b9SAlex Elder */ 5945a30b71b9SAlex Elder ret = rbd_dev_image_id(rbd_dev); 5946a30b71b9SAlex Elder if (ret) 5947c0fba368SAlex Elder return ret; 5948c0fba368SAlex Elder 5949332bb12dSAlex Elder ret = rbd_dev_header_name(rbd_dev); 5950332bb12dSAlex Elder if (ret) 5951332bb12dSAlex Elder goto err_out_format; 5952332bb12dSAlex Elder 59536d69bb53SIlya Dryomov if (!depth) { 595499d16943SIlya Dryomov ret = rbd_register_watch(rbd_dev); 59551fe48023SIlya Dryomov if (ret) { 59561fe48023SIlya Dryomov if (ret == -ENOENT) 5957b26c047bSIlya Dryomov pr_info("image %s/%s%s%s does not exist\n", 59581fe48023SIlya Dryomov rbd_dev->spec->pool_name, 5959b26c047bSIlya Dryomov rbd_dev->spec->pool_ns ?: "", 5960b26c047bSIlya Dryomov rbd_dev->spec->pool_ns ? "/" : "", 59611fe48023SIlya Dryomov rbd_dev->spec->image_name); 5962c41d13a3SIlya Dryomov goto err_out_format; 59631f3ef788SAlex Elder } 59641fe48023SIlya Dryomov } 5965b644de2bSAlex Elder 5966a720ae09SIlya Dryomov ret = rbd_dev_header_info(rbd_dev); 59675655c4d9SAlex Elder if (ret) 5968b644de2bSAlex Elder goto err_out_watch; 5969a30b71b9SAlex Elder 597004077599SIlya Dryomov /* 597104077599SIlya Dryomov * If this image is the one being mapped, we have pool name and 597204077599SIlya Dryomov * id, image name and id, and snap name - need to fill snap id. 597304077599SIlya Dryomov * Otherwise this is a parent image, identified by pool, image 597404077599SIlya Dryomov * and snap ids - need to fill in names for those ids. 597504077599SIlya Dryomov */ 59766d69bb53SIlya Dryomov if (!depth) 597704077599SIlya Dryomov ret = rbd_spec_fill_snap_id(rbd_dev); 597804077599SIlya Dryomov else 597904077599SIlya Dryomov ret = rbd_spec_fill_names(rbd_dev); 59801fe48023SIlya Dryomov if (ret) { 59811fe48023SIlya Dryomov if (ret == -ENOENT) 5982b26c047bSIlya Dryomov pr_info("snap %s/%s%s%s@%s does not exist\n", 59831fe48023SIlya Dryomov rbd_dev->spec->pool_name, 5984b26c047bSIlya Dryomov rbd_dev->spec->pool_ns ?: "", 5985b26c047bSIlya Dryomov rbd_dev->spec->pool_ns ? "/" : "", 59861fe48023SIlya Dryomov rbd_dev->spec->image_name, 59871fe48023SIlya Dryomov rbd_dev->spec->snap_name); 598833dca39fSAlex Elder goto err_out_probe; 59891fe48023SIlya Dryomov } 59909bb81c9bSAlex Elder 5991e8f59b59SIlya Dryomov if (rbd_dev->header.features & RBD_FEATURE_LAYERING) { 5992e8f59b59SIlya Dryomov ret = rbd_dev_v2_parent_info(rbd_dev); 5993e8f59b59SIlya Dryomov if (ret) 5994e8f59b59SIlya Dryomov goto err_out_probe; 5995e8f59b59SIlya Dryomov } 5996e8f59b59SIlya Dryomov 59976d69bb53SIlya Dryomov ret = rbd_dev_probe_parent(rbd_dev, depth); 599830d60ba2SAlex Elder if (ret) 599930d60ba2SAlex Elder goto err_out_probe; 600083a06263SAlex Elder 600130d60ba2SAlex Elder dout("discovered format %u image, header name is %s\n", 6002c41d13a3SIlya Dryomov rbd_dev->image_format, rbd_dev->header_oid.name); 600330d60ba2SAlex Elder return 0; 6004e8f59b59SIlya Dryomov 60056fd48b3bSAlex Elder err_out_probe: 60066fd48b3bSAlex Elder rbd_dev_unprobe(rbd_dev); 6007b644de2bSAlex Elder err_out_watch: 60086d69bb53SIlya Dryomov if (!depth) 600999d16943SIlya Dryomov rbd_unregister_watch(rbd_dev); 6010332bb12dSAlex Elder err_out_format: 6011332bb12dSAlex Elder rbd_dev->image_format = 0; 60125655c4d9SAlex Elder kfree(rbd_dev->spec->image_id); 60135655c4d9SAlex Elder rbd_dev->spec->image_id = NULL; 60145655c4d9SAlex Elder return ret; 601583a06263SAlex Elder } 601683a06263SAlex Elder 60179b60e70bSIlya Dryomov static ssize_t do_rbd_add(struct bus_type *bus, 601859c2be1eSYehuda Sadeh const char *buf, 601959c2be1eSYehuda Sadeh size_t count) 6020602adf40SYehuda Sadeh { 6021cb8627c7SAlex Elder struct rbd_device *rbd_dev = NULL; 6022dc79b113SAlex Elder struct ceph_options *ceph_opts = NULL; 60234e9afebaSAlex Elder struct rbd_options *rbd_opts = NULL; 6024859c31dfSAlex Elder struct rbd_spec *spec = NULL; 60259d3997fdSAlex Elder struct rbd_client *rbdc; 6026b51c83c2SIlya Dryomov int rc; 6027602adf40SYehuda Sadeh 6028602adf40SYehuda Sadeh if (!try_module_get(THIS_MODULE)) 6029602adf40SYehuda Sadeh return -ENODEV; 6030602adf40SYehuda Sadeh 6031a725f65eSAlex Elder /* parse add command */ 6032859c31dfSAlex Elder rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec); 6033dc79b113SAlex Elder if (rc < 0) 6034dd5ac32dSIlya Dryomov goto out; 6035a725f65eSAlex Elder 60369d3997fdSAlex Elder rbdc = rbd_get_client(ceph_opts); 60379d3997fdSAlex Elder if (IS_ERR(rbdc)) { 60389d3997fdSAlex Elder rc = PTR_ERR(rbdc); 60390ddebc0cSAlex Elder goto err_out_args; 60409d3997fdSAlex Elder } 6041602adf40SYehuda Sadeh 6042602adf40SYehuda Sadeh /* pick the pool */ 6043dd435855SIlya Dryomov rc = ceph_pg_poolid_by_name(rbdc->client->osdc.osdmap, spec->pool_name); 60441fe48023SIlya Dryomov if (rc < 0) { 60451fe48023SIlya Dryomov if (rc == -ENOENT) 60461fe48023SIlya Dryomov pr_info("pool %s does not exist\n", spec->pool_name); 6047602adf40SYehuda Sadeh goto err_out_client; 60481fe48023SIlya Dryomov } 6049859c31dfSAlex Elder spec->pool_id = (u64)rc; 6050859c31dfSAlex Elder 6051d147543dSIlya Dryomov rbd_dev = rbd_dev_create(rbdc, spec, rbd_opts); 6052b51c83c2SIlya Dryomov if (!rbd_dev) { 6053b51c83c2SIlya Dryomov rc = -ENOMEM; 6054bd4ba655SAlex Elder goto err_out_client; 6055b51c83c2SIlya Dryomov } 6056c53d5893SAlex Elder rbdc = NULL; /* rbd_dev now owns this */ 6057c53d5893SAlex Elder spec = NULL; /* rbd_dev now owns this */ 6058d147543dSIlya Dryomov rbd_opts = NULL; /* rbd_dev now owns this */ 6059602adf40SYehuda Sadeh 60600d6d1e9cSMike Christie rbd_dev->config_info = kstrdup(buf, GFP_KERNEL); 60610d6d1e9cSMike Christie if (!rbd_dev->config_info) { 60620d6d1e9cSMike Christie rc = -ENOMEM; 60630d6d1e9cSMike Christie goto err_out_rbd_dev; 60640d6d1e9cSMike Christie } 60650d6d1e9cSMike Christie 6066811c6688SIlya Dryomov down_write(&rbd_dev->header_rwsem); 60676d69bb53SIlya Dryomov rc = rbd_dev_image_probe(rbd_dev, 0); 60680d6d1e9cSMike Christie if (rc < 0) { 60690d6d1e9cSMike Christie up_write(&rbd_dev->header_rwsem); 6070c53d5893SAlex Elder goto err_out_rbd_dev; 60710d6d1e9cSMike Christie } 607205fd6f6fSAlex Elder 60737ce4eef7SAlex Elder /* If we are mapping a snapshot it must be marked read-only */ 60747ce4eef7SAlex Elder if (rbd_dev->spec->snap_id != CEPH_NOSNAP) 60759568c93eSIlya Dryomov rbd_dev->opts->read_only = true; 60767ce4eef7SAlex Elder 60770c93e1b7SIlya Dryomov if (rbd_dev->opts->alloc_size > rbd_dev->layout.object_size) { 60780c93e1b7SIlya Dryomov rbd_warn(rbd_dev, "alloc_size adjusted to %u", 60790c93e1b7SIlya Dryomov rbd_dev->layout.object_size); 60800c93e1b7SIlya Dryomov rbd_dev->opts->alloc_size = rbd_dev->layout.object_size; 60810c93e1b7SIlya Dryomov } 60820c93e1b7SIlya Dryomov 6083b536f69aSAlex Elder rc = rbd_dev_device_setup(rbd_dev); 6084fd22aef8SIlya Dryomov if (rc) 60858b679ec5SIlya Dryomov goto err_out_image_probe; 60863abef3b3SAlex Elder 6087e010dd0aSIlya Dryomov if (rbd_dev->opts->exclusive) { 6088e010dd0aSIlya Dryomov rc = rbd_add_acquire_lock(rbd_dev); 6089e010dd0aSIlya Dryomov if (rc) 6090e010dd0aSIlya Dryomov goto err_out_device_setup; 6091b536f69aSAlex Elder } 6092b536f69aSAlex Elder 60935769ed0cSIlya Dryomov /* Everything's ready. Announce the disk to the world. */ 60945769ed0cSIlya Dryomov 60955769ed0cSIlya Dryomov rc = device_add(&rbd_dev->dev); 60965769ed0cSIlya Dryomov if (rc) 6097e010dd0aSIlya Dryomov goto err_out_image_lock; 60985769ed0cSIlya Dryomov 60995769ed0cSIlya Dryomov add_disk(rbd_dev->disk); 61005769ed0cSIlya Dryomov /* see rbd_init_disk() */ 61015769ed0cSIlya Dryomov blk_put_queue(rbd_dev->disk->queue); 61025769ed0cSIlya Dryomov 61035769ed0cSIlya Dryomov spin_lock(&rbd_dev_list_lock); 61045769ed0cSIlya Dryomov list_add_tail(&rbd_dev->node, &rbd_dev_list); 61055769ed0cSIlya Dryomov spin_unlock(&rbd_dev_list_lock); 61065769ed0cSIlya Dryomov 61075769ed0cSIlya Dryomov pr_info("%s: capacity %llu features 0x%llx\n", rbd_dev->disk->disk_name, 61085769ed0cSIlya Dryomov (unsigned long long)get_capacity(rbd_dev->disk) << SECTOR_SHIFT, 61095769ed0cSIlya Dryomov rbd_dev->header.features); 6110dd5ac32dSIlya Dryomov rc = count; 6111dd5ac32dSIlya Dryomov out: 6112dd5ac32dSIlya Dryomov module_put(THIS_MODULE); 6113dd5ac32dSIlya Dryomov return rc; 6114b536f69aSAlex Elder 6115e010dd0aSIlya Dryomov err_out_image_lock: 6116e010dd0aSIlya Dryomov rbd_dev_image_unlock(rbd_dev); 61175769ed0cSIlya Dryomov err_out_device_setup: 61185769ed0cSIlya Dryomov rbd_dev_device_release(rbd_dev); 61198b679ec5SIlya Dryomov err_out_image_probe: 61208b679ec5SIlya Dryomov rbd_dev_image_release(rbd_dev); 6121c53d5893SAlex Elder err_out_rbd_dev: 6122c53d5893SAlex Elder rbd_dev_destroy(rbd_dev); 6123bd4ba655SAlex Elder err_out_client: 61249d3997fdSAlex Elder rbd_put_client(rbdc); 61250ddebc0cSAlex Elder err_out_args: 6126859c31dfSAlex Elder rbd_spec_put(spec); 6127d147543dSIlya Dryomov kfree(rbd_opts); 6128dd5ac32dSIlya Dryomov goto out; 6129602adf40SYehuda Sadeh } 6130602adf40SYehuda Sadeh 61319b60e70bSIlya Dryomov static ssize_t rbd_add(struct bus_type *bus, 61329b60e70bSIlya Dryomov const char *buf, 61339b60e70bSIlya Dryomov size_t count) 61349b60e70bSIlya Dryomov { 61359b60e70bSIlya Dryomov if (single_major) 61369b60e70bSIlya Dryomov return -EINVAL; 61379b60e70bSIlya Dryomov 61389b60e70bSIlya Dryomov return do_rbd_add(bus, buf, count); 61399b60e70bSIlya Dryomov } 61409b60e70bSIlya Dryomov 61419b60e70bSIlya Dryomov static ssize_t rbd_add_single_major(struct bus_type *bus, 61429b60e70bSIlya Dryomov const char *buf, 61439b60e70bSIlya Dryomov size_t count) 61449b60e70bSIlya Dryomov { 61459b60e70bSIlya Dryomov return do_rbd_add(bus, buf, count); 61469b60e70bSIlya Dryomov } 61479b60e70bSIlya Dryomov 614805a46afdSAlex Elder static void rbd_dev_remove_parent(struct rbd_device *rbd_dev) 614905a46afdSAlex Elder { 6150ad945fc1SAlex Elder while (rbd_dev->parent) { 615105a46afdSAlex Elder struct rbd_device *first = rbd_dev; 615205a46afdSAlex Elder struct rbd_device *second = first->parent; 615305a46afdSAlex Elder struct rbd_device *third; 615405a46afdSAlex Elder 615505a46afdSAlex Elder /* 615605a46afdSAlex Elder * Follow to the parent with no grandparent and 615705a46afdSAlex Elder * remove it. 615805a46afdSAlex Elder */ 615905a46afdSAlex Elder while (second && (third = second->parent)) { 616005a46afdSAlex Elder first = second; 616105a46afdSAlex Elder second = third; 616205a46afdSAlex Elder } 6163ad945fc1SAlex Elder rbd_assert(second); 61648ad42cd0SAlex Elder rbd_dev_image_release(second); 61658b679ec5SIlya Dryomov rbd_dev_destroy(second); 6166ad945fc1SAlex Elder first->parent = NULL; 6167ad945fc1SAlex Elder first->parent_overlap = 0; 6168ad945fc1SAlex Elder 6169ad945fc1SAlex Elder rbd_assert(first->parent_spec); 617005a46afdSAlex Elder rbd_spec_put(first->parent_spec); 617105a46afdSAlex Elder first->parent_spec = NULL; 617205a46afdSAlex Elder } 617305a46afdSAlex Elder } 617405a46afdSAlex Elder 61759b60e70bSIlya Dryomov static ssize_t do_rbd_remove(struct bus_type *bus, 6176602adf40SYehuda Sadeh const char *buf, 6177602adf40SYehuda Sadeh size_t count) 6178602adf40SYehuda Sadeh { 6179602adf40SYehuda Sadeh struct rbd_device *rbd_dev = NULL; 6180751cc0e3SAlex Elder struct list_head *tmp; 6181751cc0e3SAlex Elder int dev_id; 61820276dca6SMike Christie char opt_buf[6]; 61830276dca6SMike Christie bool force = false; 61840d8189e1SAlex Elder int ret; 6185602adf40SYehuda Sadeh 61860276dca6SMike Christie dev_id = -1; 61870276dca6SMike Christie opt_buf[0] = '\0'; 61880276dca6SMike Christie sscanf(buf, "%d %5s", &dev_id, opt_buf); 61890276dca6SMike Christie if (dev_id < 0) { 61900276dca6SMike Christie pr_err("dev_id out of range\n"); 6191602adf40SYehuda Sadeh return -EINVAL; 61920276dca6SMike Christie } 61930276dca6SMike Christie if (opt_buf[0] != '\0') { 61940276dca6SMike Christie if (!strcmp(opt_buf, "force")) { 61950276dca6SMike Christie force = true; 61960276dca6SMike Christie } else { 61970276dca6SMike Christie pr_err("bad remove option at '%s'\n", opt_buf); 61980276dca6SMike Christie return -EINVAL; 61990276dca6SMike Christie } 62000276dca6SMike Christie } 6201602adf40SYehuda Sadeh 6202602adf40SYehuda Sadeh ret = -ENOENT; 6203751cc0e3SAlex Elder spin_lock(&rbd_dev_list_lock); 6204751cc0e3SAlex Elder list_for_each(tmp, &rbd_dev_list) { 6205751cc0e3SAlex Elder rbd_dev = list_entry(tmp, struct rbd_device, node); 6206751cc0e3SAlex Elder if (rbd_dev->dev_id == dev_id) { 6207751cc0e3SAlex Elder ret = 0; 6208751cc0e3SAlex Elder break; 6209602adf40SYehuda Sadeh } 6210751cc0e3SAlex Elder } 6211751cc0e3SAlex Elder if (!ret) { 6212a14ea269SAlex Elder spin_lock_irq(&rbd_dev->lock); 62130276dca6SMike Christie if (rbd_dev->open_count && !force) 621442382b70SAlex Elder ret = -EBUSY; 621585f5a4d6SIlya Dryomov else if (test_and_set_bit(RBD_DEV_FLAG_REMOVING, 621685f5a4d6SIlya Dryomov &rbd_dev->flags)) 621785f5a4d6SIlya Dryomov ret = -EINPROGRESS; 6218a14ea269SAlex Elder spin_unlock_irq(&rbd_dev->lock); 6219751cc0e3SAlex Elder } 6220751cc0e3SAlex Elder spin_unlock(&rbd_dev_list_lock); 622185f5a4d6SIlya Dryomov if (ret) 62221ba0f1e7SAlex Elder return ret; 6223751cc0e3SAlex Elder 62240276dca6SMike Christie if (force) { 62250276dca6SMike Christie /* 62260276dca6SMike Christie * Prevent new IO from being queued and wait for existing 62270276dca6SMike Christie * IO to complete/fail. 62280276dca6SMike Christie */ 62290276dca6SMike Christie blk_mq_freeze_queue(rbd_dev->disk->queue); 62300276dca6SMike Christie blk_set_queue_dying(rbd_dev->disk->queue); 62310276dca6SMike Christie } 62320276dca6SMike Christie 62335769ed0cSIlya Dryomov del_gendisk(rbd_dev->disk); 62345769ed0cSIlya Dryomov spin_lock(&rbd_dev_list_lock); 62355769ed0cSIlya Dryomov list_del_init(&rbd_dev->node); 62365769ed0cSIlya Dryomov spin_unlock(&rbd_dev_list_lock); 62375769ed0cSIlya Dryomov device_del(&rbd_dev->dev); 6238fca27065SIlya Dryomov 6239e010dd0aSIlya Dryomov rbd_dev_image_unlock(rbd_dev); 6240dd5ac32dSIlya Dryomov rbd_dev_device_release(rbd_dev); 62418ad42cd0SAlex Elder rbd_dev_image_release(rbd_dev); 62428b679ec5SIlya Dryomov rbd_dev_destroy(rbd_dev); 62431ba0f1e7SAlex Elder return count; 6244602adf40SYehuda Sadeh } 6245602adf40SYehuda Sadeh 62469b60e70bSIlya Dryomov static ssize_t rbd_remove(struct bus_type *bus, 62479b60e70bSIlya Dryomov const char *buf, 62489b60e70bSIlya Dryomov size_t count) 62499b60e70bSIlya Dryomov { 62509b60e70bSIlya Dryomov if (single_major) 62519b60e70bSIlya Dryomov return -EINVAL; 62529b60e70bSIlya Dryomov 62539b60e70bSIlya Dryomov return do_rbd_remove(bus, buf, count); 62549b60e70bSIlya Dryomov } 62559b60e70bSIlya Dryomov 62569b60e70bSIlya Dryomov static ssize_t rbd_remove_single_major(struct bus_type *bus, 62579b60e70bSIlya Dryomov const char *buf, 62589b60e70bSIlya Dryomov size_t count) 62599b60e70bSIlya Dryomov { 62609b60e70bSIlya Dryomov return do_rbd_remove(bus, buf, count); 62619b60e70bSIlya Dryomov } 62629b60e70bSIlya Dryomov 6263602adf40SYehuda Sadeh /* 6264602adf40SYehuda Sadeh * create control files in sysfs 6265dfc5606dSYehuda Sadeh * /sys/bus/rbd/... 6266602adf40SYehuda Sadeh */ 62677d8dc534SChengguang Xu static int __init rbd_sysfs_init(void) 6268602adf40SYehuda Sadeh { 6269dfc5606dSYehuda Sadeh int ret; 6270602adf40SYehuda Sadeh 6271fed4c143SAlex Elder ret = device_register(&rbd_root_dev); 6272dfc5606dSYehuda Sadeh if (ret < 0) 6273dfc5606dSYehuda Sadeh return ret; 6274602adf40SYehuda Sadeh 6275fed4c143SAlex Elder ret = bus_register(&rbd_bus_type); 6276fed4c143SAlex Elder if (ret < 0) 6277fed4c143SAlex Elder device_unregister(&rbd_root_dev); 6278602adf40SYehuda Sadeh 6279602adf40SYehuda Sadeh return ret; 6280602adf40SYehuda Sadeh } 6281602adf40SYehuda Sadeh 62827d8dc534SChengguang Xu static void __exit rbd_sysfs_cleanup(void) 6283602adf40SYehuda Sadeh { 6284dfc5606dSYehuda Sadeh bus_unregister(&rbd_bus_type); 6285fed4c143SAlex Elder device_unregister(&rbd_root_dev); 6286602adf40SYehuda Sadeh } 6287602adf40SYehuda Sadeh 62887d8dc534SChengguang Xu static int __init rbd_slab_init(void) 62891c2a9dfeSAlex Elder { 62901c2a9dfeSAlex Elder rbd_assert(!rbd_img_request_cache); 629103d94406SGeliang Tang rbd_img_request_cache = KMEM_CACHE(rbd_img_request, 0); 6292868311b1SAlex Elder if (!rbd_img_request_cache) 6293868311b1SAlex Elder return -ENOMEM; 6294868311b1SAlex Elder 6295868311b1SAlex Elder rbd_assert(!rbd_obj_request_cache); 629603d94406SGeliang Tang rbd_obj_request_cache = KMEM_CACHE(rbd_obj_request, 0); 629778c2a44aSAlex Elder if (!rbd_obj_request_cache) 629878c2a44aSAlex Elder goto out_err; 629978c2a44aSAlex Elder 63001c2a9dfeSAlex Elder return 0; 63011c2a9dfeSAlex Elder 63026c696d85SIlya Dryomov out_err: 6303868311b1SAlex Elder kmem_cache_destroy(rbd_img_request_cache); 6304868311b1SAlex Elder rbd_img_request_cache = NULL; 63051c2a9dfeSAlex Elder return -ENOMEM; 63061c2a9dfeSAlex Elder } 63071c2a9dfeSAlex Elder 63081c2a9dfeSAlex Elder static void rbd_slab_exit(void) 63091c2a9dfeSAlex Elder { 6310868311b1SAlex Elder rbd_assert(rbd_obj_request_cache); 6311868311b1SAlex Elder kmem_cache_destroy(rbd_obj_request_cache); 6312868311b1SAlex Elder rbd_obj_request_cache = NULL; 6313868311b1SAlex Elder 63141c2a9dfeSAlex Elder rbd_assert(rbd_img_request_cache); 63151c2a9dfeSAlex Elder kmem_cache_destroy(rbd_img_request_cache); 63161c2a9dfeSAlex Elder rbd_img_request_cache = NULL; 63171c2a9dfeSAlex Elder } 63181c2a9dfeSAlex Elder 6319cc344fa1SAlex Elder static int __init rbd_init(void) 6320602adf40SYehuda Sadeh { 6321602adf40SYehuda Sadeh int rc; 6322602adf40SYehuda Sadeh 63231e32d34cSAlex Elder if (!libceph_compatible(NULL)) { 63241e32d34cSAlex Elder rbd_warn(NULL, "libceph incompatibility (quitting)"); 63251e32d34cSAlex Elder return -EINVAL; 63261e32d34cSAlex Elder } 6327e1b4d96dSIlya Dryomov 63281c2a9dfeSAlex Elder rc = rbd_slab_init(); 6329602adf40SYehuda Sadeh if (rc) 6330602adf40SYehuda Sadeh return rc; 6331e1b4d96dSIlya Dryomov 6332f5ee37bdSIlya Dryomov /* 6333f5ee37bdSIlya Dryomov * The number of active work items is limited by the number of 6334f77303bdSIlya Dryomov * rbd devices * queue depth, so leave @max_active at default. 6335f5ee37bdSIlya Dryomov */ 6336f5ee37bdSIlya Dryomov rbd_wq = alloc_workqueue(RBD_DRV_NAME, WQ_MEM_RECLAIM, 0); 6337f5ee37bdSIlya Dryomov if (!rbd_wq) { 6338f5ee37bdSIlya Dryomov rc = -ENOMEM; 6339f5ee37bdSIlya Dryomov goto err_out_slab; 6340f5ee37bdSIlya Dryomov } 6341f5ee37bdSIlya Dryomov 63429b60e70bSIlya Dryomov if (single_major) { 63439b60e70bSIlya Dryomov rbd_major = register_blkdev(0, RBD_DRV_NAME); 63449b60e70bSIlya Dryomov if (rbd_major < 0) { 63459b60e70bSIlya Dryomov rc = rbd_major; 6346f5ee37bdSIlya Dryomov goto err_out_wq; 63479b60e70bSIlya Dryomov } 63489b60e70bSIlya Dryomov } 63499b60e70bSIlya Dryomov 63501c2a9dfeSAlex Elder rc = rbd_sysfs_init(); 63511c2a9dfeSAlex Elder if (rc) 63529b60e70bSIlya Dryomov goto err_out_blkdev; 63531c2a9dfeSAlex Elder 63549b60e70bSIlya Dryomov if (single_major) 63559b60e70bSIlya Dryomov pr_info("loaded (major %d)\n", rbd_major); 63569b60e70bSIlya Dryomov else 6357e1b4d96dSIlya Dryomov pr_info("loaded\n"); 63589b60e70bSIlya Dryomov 6359e1b4d96dSIlya Dryomov return 0; 6360e1b4d96dSIlya Dryomov 63619b60e70bSIlya Dryomov err_out_blkdev: 63629b60e70bSIlya Dryomov if (single_major) 63639b60e70bSIlya Dryomov unregister_blkdev(rbd_major, RBD_DRV_NAME); 6364f5ee37bdSIlya Dryomov err_out_wq: 6365f5ee37bdSIlya Dryomov destroy_workqueue(rbd_wq); 6366e1b4d96dSIlya Dryomov err_out_slab: 6367e1b4d96dSIlya Dryomov rbd_slab_exit(); 63681c2a9dfeSAlex Elder return rc; 6369602adf40SYehuda Sadeh } 6370602adf40SYehuda Sadeh 6371cc344fa1SAlex Elder static void __exit rbd_exit(void) 6372602adf40SYehuda Sadeh { 6373ffe312cfSIlya Dryomov ida_destroy(&rbd_dev_id_ida); 6374602adf40SYehuda Sadeh rbd_sysfs_cleanup(); 63759b60e70bSIlya Dryomov if (single_major) 63769b60e70bSIlya Dryomov unregister_blkdev(rbd_major, RBD_DRV_NAME); 6377f5ee37bdSIlya Dryomov destroy_workqueue(rbd_wq); 63781c2a9dfeSAlex Elder rbd_slab_exit(); 6379602adf40SYehuda Sadeh } 6380602adf40SYehuda Sadeh 6381602adf40SYehuda Sadeh module_init(rbd_init); 6382602adf40SYehuda Sadeh module_exit(rbd_exit); 6383602adf40SYehuda Sadeh 6384d552c619SAlex Elder MODULE_AUTHOR("Alex Elder <elder@inktank.com>"); 6385602adf40SYehuda Sadeh MODULE_AUTHOR("Sage Weil <sage@newdream.net>"); 6386602adf40SYehuda Sadeh MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>"); 6387602adf40SYehuda Sadeh /* following authorship retained from original osdblk.c */ 6388602adf40SYehuda Sadeh MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>"); 6389602adf40SYehuda Sadeh 639090da258bSIlya Dryomov MODULE_DESCRIPTION("RADOS Block Device (RBD) driver"); 6391602adf40SYehuda Sadeh MODULE_LICENSE("GPL"); 6392