1e2a58ee5SAlex Elder 2602adf40SYehuda Sadeh /* 3602adf40SYehuda Sadeh rbd.c -- Export ceph rados objects as a Linux block device 4602adf40SYehuda Sadeh 5602adf40SYehuda Sadeh 6602adf40SYehuda Sadeh based on drivers/block/osdblk.c: 7602adf40SYehuda Sadeh 8602adf40SYehuda Sadeh Copyright 2009 Red Hat, Inc. 9602adf40SYehuda Sadeh 10602adf40SYehuda Sadeh This program is free software; you can redistribute it and/or modify 11602adf40SYehuda Sadeh it under the terms of the GNU General Public License as published by 12602adf40SYehuda Sadeh the Free Software Foundation. 13602adf40SYehuda Sadeh 14602adf40SYehuda Sadeh This program is distributed in the hope that it will be useful, 15602adf40SYehuda Sadeh but WITHOUT ANY WARRANTY; without even the implied warranty of 16602adf40SYehuda Sadeh MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17602adf40SYehuda Sadeh GNU General Public License for more details. 18602adf40SYehuda Sadeh 19602adf40SYehuda Sadeh You should have received a copy of the GNU General Public License 20602adf40SYehuda Sadeh along with this program; see the file COPYING. If not, write to 21602adf40SYehuda Sadeh the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. 
22602adf40SYehuda Sadeh 23602adf40SYehuda Sadeh 24602adf40SYehuda Sadeh 25dfc5606dSYehuda Sadeh For usage instructions, please refer to: 26602adf40SYehuda Sadeh 27dfc5606dSYehuda Sadeh Documentation/ABI/testing/sysfs-bus-rbd 28602adf40SYehuda Sadeh 29602adf40SYehuda Sadeh */ 30602adf40SYehuda Sadeh 31602adf40SYehuda Sadeh #include <linux/ceph/libceph.h> 32602adf40SYehuda Sadeh #include <linux/ceph/osd_client.h> 33602adf40SYehuda Sadeh #include <linux/ceph/mon_client.h> 34ed95b21aSIlya Dryomov #include <linux/ceph/cls_lock_client.h> 35602adf40SYehuda Sadeh #include <linux/ceph/decode.h> 3659c2be1eSYehuda Sadeh #include <linux/parser.h> 3730d1cff8SAlex Elder #include <linux/bsearch.h> 38602adf40SYehuda Sadeh 39602adf40SYehuda Sadeh #include <linux/kernel.h> 40602adf40SYehuda Sadeh #include <linux/device.h> 41602adf40SYehuda Sadeh #include <linux/module.h> 427ad18afaSChristoph Hellwig #include <linux/blk-mq.h> 43602adf40SYehuda Sadeh #include <linux/fs.h> 44602adf40SYehuda Sadeh #include <linux/blkdev.h> 451c2a9dfeSAlex Elder #include <linux/slab.h> 46f8a22fc2SIlya Dryomov #include <linux/idr.h> 47bc1ecc65SIlya Dryomov #include <linux/workqueue.h> 48602adf40SYehuda Sadeh 49602adf40SYehuda Sadeh #include "rbd_types.h" 50602adf40SYehuda Sadeh 51aafb230eSAlex Elder #define RBD_DEBUG /* Activate rbd_assert() calls */ 52aafb230eSAlex Elder 53593a9e7bSAlex Elder /* 54593a9e7bSAlex Elder * The basic unit of block I/O is a sector. It is interpreted in a 55593a9e7bSAlex Elder * number of contexts in Linux (blk, bio, genhd), but the default is 56593a9e7bSAlex Elder * universally 512 bytes. These symbols are just slightly more 57593a9e7bSAlex Elder * meaningful than the bare numbers they represent. 58593a9e7bSAlex Elder */ 59593a9e7bSAlex Elder #define SECTOR_SHIFT 9 60593a9e7bSAlex Elder #define SECTOR_SIZE (1ULL << SECTOR_SHIFT) 61593a9e7bSAlex Elder 62a2acd00eSAlex Elder /* 63a2acd00eSAlex Elder * Increment the given counter and return its updated value. 
64a2acd00eSAlex Elder * If the counter is already 0 it will not be incremented. 65a2acd00eSAlex Elder * If the counter is already at its maximum value returns 66a2acd00eSAlex Elder * -EINVAL without updating it. 67a2acd00eSAlex Elder */ 68a2acd00eSAlex Elder static int atomic_inc_return_safe(atomic_t *v) 69a2acd00eSAlex Elder { 70a2acd00eSAlex Elder unsigned int counter; 71a2acd00eSAlex Elder 72a2acd00eSAlex Elder counter = (unsigned int)__atomic_add_unless(v, 1, 0); 73a2acd00eSAlex Elder if (counter <= (unsigned int)INT_MAX) 74a2acd00eSAlex Elder return (int)counter; 75a2acd00eSAlex Elder 76a2acd00eSAlex Elder atomic_dec(v); 77a2acd00eSAlex Elder 78a2acd00eSAlex Elder return -EINVAL; 79a2acd00eSAlex Elder } 80a2acd00eSAlex Elder 81a2acd00eSAlex Elder /* Decrement the counter. Return the resulting value, or -EINVAL */ 82a2acd00eSAlex Elder static int atomic_dec_return_safe(atomic_t *v) 83a2acd00eSAlex Elder { 84a2acd00eSAlex Elder int counter; 85a2acd00eSAlex Elder 86a2acd00eSAlex Elder counter = atomic_dec_return(v); 87a2acd00eSAlex Elder if (counter >= 0) 88a2acd00eSAlex Elder return counter; 89a2acd00eSAlex Elder 90a2acd00eSAlex Elder atomic_inc(v); 91a2acd00eSAlex Elder 92a2acd00eSAlex Elder return -EINVAL; 93a2acd00eSAlex Elder } 94a2acd00eSAlex Elder 95f0f8cef5SAlex Elder #define RBD_DRV_NAME "rbd" 96602adf40SYehuda Sadeh 977e513d43SIlya Dryomov #define RBD_MINORS_PER_MAJOR 256 987e513d43SIlya Dryomov #define RBD_SINGLE_MAJOR_PART_SHIFT 4 99602adf40SYehuda Sadeh 1006d69bb53SIlya Dryomov #define RBD_MAX_PARENT_CHAIN_LEN 16 1016d69bb53SIlya Dryomov 102d4b125e9SAlex Elder #define RBD_SNAP_DEV_NAME_PREFIX "snap_" 103d4b125e9SAlex Elder #define RBD_MAX_SNAP_NAME_LEN \ 104d4b125e9SAlex Elder (NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1)) 105d4b125e9SAlex Elder 10635d489f9SAlex Elder #define RBD_MAX_SNAP_COUNT 510 /* allows max snapc to fit in 4KB */ 107602adf40SYehuda Sadeh 108602adf40SYehuda Sadeh #define RBD_SNAP_HEAD_NAME "-" 109602adf40SYehuda Sadeh 
1109682fc6dSAlex Elder #define BAD_SNAP_INDEX U32_MAX /* invalid index into snap array */ 1119682fc6dSAlex Elder 1129e15b77dSAlex Elder /* This allows a single page to hold an image name sent by OSD */ 1139e15b77dSAlex Elder #define RBD_IMAGE_NAME_LEN_MAX (PAGE_SIZE - sizeof (__le32) - 1) 114589d30e0SAlex Elder #define RBD_IMAGE_ID_LEN_MAX 64 1159e15b77dSAlex Elder 1161e130199SAlex Elder #define RBD_OBJ_PREFIX_LEN_MAX 64 117589d30e0SAlex Elder 118ed95b21aSIlya Dryomov #define RBD_NOTIFY_TIMEOUT 5 /* seconds */ 11999d16943SIlya Dryomov #define RBD_RETRY_DELAY msecs_to_jiffies(1000) 12099d16943SIlya Dryomov 121d889140cSAlex Elder /* Feature bits */ 122d889140cSAlex Elder 1238767b293SIlya Dryomov #define RBD_FEATURE_LAYERING (1ULL<<0) 1248767b293SIlya Dryomov #define RBD_FEATURE_STRIPINGV2 (1ULL<<1) 1258767b293SIlya Dryomov #define RBD_FEATURE_EXCLUSIVE_LOCK (1ULL<<2) 1268767b293SIlya Dryomov #define RBD_FEATURE_DATA_POOL (1ULL<<7) 1278767b293SIlya Dryomov 128ed95b21aSIlya Dryomov #define RBD_FEATURES_ALL (RBD_FEATURE_LAYERING | \ 129ed95b21aSIlya Dryomov RBD_FEATURE_STRIPINGV2 | \ 1307e97332eSIlya Dryomov RBD_FEATURE_EXCLUSIVE_LOCK | \ 1317e97332eSIlya Dryomov RBD_FEATURE_DATA_POOL) 132d889140cSAlex Elder 133d889140cSAlex Elder /* Features supported by this (client software) implementation. */ 134d889140cSAlex Elder 135770eba6eSAlex Elder #define RBD_FEATURES_SUPPORTED (RBD_FEATURES_ALL) 136d889140cSAlex Elder 13781a89793SAlex Elder /* 13881a89793SAlex Elder * An RBD device name will be "rbd#", where the "rbd" comes from 13981a89793SAlex Elder * RBD_DRV_NAME above, and # is a unique integer identifier. 
14081a89793SAlex Elder */ 141602adf40SYehuda Sadeh #define DEV_NAME_LEN 32 142602adf40SYehuda Sadeh 143602adf40SYehuda Sadeh /* 144602adf40SYehuda Sadeh * block device image metadata (in-memory version) 145602adf40SYehuda Sadeh */ 146602adf40SYehuda Sadeh struct rbd_image_header { 147f35a4deeSAlex Elder /* These six fields never change for a given rbd image */ 148849b4260SAlex Elder char *object_prefix; 149602adf40SYehuda Sadeh __u8 obj_order; 150f35a4deeSAlex Elder u64 stripe_unit; 151f35a4deeSAlex Elder u64 stripe_count; 1527e97332eSIlya Dryomov s64 data_pool_id; 153f35a4deeSAlex Elder u64 features; /* Might be changeable someday? */ 154602adf40SYehuda Sadeh 155f84344f3SAlex Elder /* The remaining fields need to be updated occasionally */ 156f84344f3SAlex Elder u64 image_size; 157f84344f3SAlex Elder struct ceph_snap_context *snapc; 158f35a4deeSAlex Elder char *snap_names; /* format 1 only */ 159f35a4deeSAlex Elder u64 *snap_sizes; /* format 1 only */ 16059c2be1eSYehuda Sadeh }; 16159c2be1eSYehuda Sadeh 1620d7dbfceSAlex Elder /* 1630d7dbfceSAlex Elder * An rbd image specification. 1640d7dbfceSAlex Elder * 1650d7dbfceSAlex Elder * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely 166c66c6e0cSAlex Elder * identify an image. Each rbd_dev structure includes a pointer to 167c66c6e0cSAlex Elder * an rbd_spec structure that encapsulates this identity. 168c66c6e0cSAlex Elder * 169c66c6e0cSAlex Elder * Each of the id's in an rbd_spec has an associated name. For a 170c66c6e0cSAlex Elder * user-mapped image, the names are supplied and the id's associated 171c66c6e0cSAlex Elder * with them are looked up. For a layered image, a parent image is 172c66c6e0cSAlex Elder * defined by the tuple, and the names are looked up. 173c66c6e0cSAlex Elder * 174c66c6e0cSAlex Elder * An rbd_dev structure contains a parent_spec pointer which is 175c66c6e0cSAlex Elder * non-null if the image it represents is a child in a layered 176c66c6e0cSAlex Elder * image. 
This pointer will refer to the rbd_spec structure used 177c66c6e0cSAlex Elder * by the parent rbd_dev for its own identity (i.e., the structure 178c66c6e0cSAlex Elder * is shared between the parent and child). 179c66c6e0cSAlex Elder * 180c66c6e0cSAlex Elder * Since these structures are populated once, during the discovery 181c66c6e0cSAlex Elder * phase of image construction, they are effectively immutable so 182c66c6e0cSAlex Elder * we make no effort to synchronize access to them. 183c66c6e0cSAlex Elder * 184c66c6e0cSAlex Elder * Note that code herein does not assume the image name is known (it 185c66c6e0cSAlex Elder * could be a null pointer). 1860d7dbfceSAlex Elder */ 1870d7dbfceSAlex Elder struct rbd_spec { 1880d7dbfceSAlex Elder u64 pool_id; 189ecb4dc22SAlex Elder const char *pool_name; 1900d7dbfceSAlex Elder 191ecb4dc22SAlex Elder const char *image_id; 192ecb4dc22SAlex Elder const char *image_name; 1930d7dbfceSAlex Elder 1940d7dbfceSAlex Elder u64 snap_id; 195ecb4dc22SAlex Elder const char *snap_name; 1960d7dbfceSAlex Elder 1970d7dbfceSAlex Elder struct kref kref; 1980d7dbfceSAlex Elder }; 1990d7dbfceSAlex Elder 200602adf40SYehuda Sadeh /* 201f0f8cef5SAlex Elder * an instance of the client. multiple devices may share an rbd client. 202602adf40SYehuda Sadeh */ 203602adf40SYehuda Sadeh struct rbd_client { 204602adf40SYehuda Sadeh struct ceph_client *client; 205602adf40SYehuda Sadeh struct kref kref; 206602adf40SYehuda Sadeh struct list_head node; 207602adf40SYehuda Sadeh }; 208602adf40SYehuda Sadeh 209bf0d5f50SAlex Elder struct rbd_img_request; 210bf0d5f50SAlex Elder typedef void (*rbd_img_callback_t)(struct rbd_img_request *); 211bf0d5f50SAlex Elder 212bf0d5f50SAlex Elder #define BAD_WHICH U32_MAX /* Good which or bad which, which? 
*/ 213bf0d5f50SAlex Elder 214bf0d5f50SAlex Elder struct rbd_obj_request; 215bf0d5f50SAlex Elder typedef void (*rbd_obj_callback_t)(struct rbd_obj_request *); 216bf0d5f50SAlex Elder 2179969ebc5SAlex Elder enum obj_request_type { 2189969ebc5SAlex Elder OBJ_REQUEST_NODATA, OBJ_REQUEST_BIO, OBJ_REQUEST_PAGES 2199969ebc5SAlex Elder }; 220bf0d5f50SAlex Elder 2216d2940c8SGuangliang Zhao enum obj_operation_type { 2226d2940c8SGuangliang Zhao OBJ_OP_WRITE, 2236d2940c8SGuangliang Zhao OBJ_OP_READ, 22490e98c52SGuangliang Zhao OBJ_OP_DISCARD, 2256d2940c8SGuangliang Zhao }; 2266d2940c8SGuangliang Zhao 227926f9b3fSAlex Elder enum obj_req_flags { 228926f9b3fSAlex Elder OBJ_REQ_DONE, /* completion flag: not done = 0, done = 1 */ 2296365d33aSAlex Elder OBJ_REQ_IMG_DATA, /* object usage: standalone = 0, image = 1 */ 2305679c59fSAlex Elder OBJ_REQ_KNOWN, /* EXISTS flag valid: no = 0, yes = 1 */ 2315679c59fSAlex Elder OBJ_REQ_EXISTS, /* target exists: no = 0, yes = 1 */ 232926f9b3fSAlex Elder }; 233926f9b3fSAlex Elder 234bf0d5f50SAlex Elder struct rbd_obj_request { 235a90bb0c1SIlya Dryomov u64 object_no; 236bf0d5f50SAlex Elder u64 offset; /* object start byte */ 237bf0d5f50SAlex Elder u64 length; /* bytes from offset */ 238926f9b3fSAlex Elder unsigned long flags; 239bf0d5f50SAlex Elder 240c5b5ef6cSAlex Elder /* 241c5b5ef6cSAlex Elder * An object request associated with an image will have its 242c5b5ef6cSAlex Elder * img_data flag set; a standalone object request will not. 243c5b5ef6cSAlex Elder * 244c5b5ef6cSAlex Elder * A standalone object request will have which == BAD_WHICH 245c5b5ef6cSAlex Elder * and a null obj_request pointer. 246c5b5ef6cSAlex Elder * 247c5b5ef6cSAlex Elder * An object request initiated in support of a layered image 248c5b5ef6cSAlex Elder * object (to check for its existence before a write) will 249c5b5ef6cSAlex Elder * have which == BAD_WHICH and a non-null obj_request pointer. 
250c5b5ef6cSAlex Elder * 251c5b5ef6cSAlex Elder * Finally, an object request for rbd image data will have 252c5b5ef6cSAlex Elder * which != BAD_WHICH, and will have a non-null img_request 253c5b5ef6cSAlex Elder * pointer. The value of which will be in the range 254c5b5ef6cSAlex Elder * 0..(img_request->obj_request_count-1). 255c5b5ef6cSAlex Elder */ 256c5b5ef6cSAlex Elder union { 257c5b5ef6cSAlex Elder struct rbd_obj_request *obj_request; /* STAT op */ 258c5b5ef6cSAlex Elder struct { 259bf0d5f50SAlex Elder struct rbd_img_request *img_request; 260c5b5ef6cSAlex Elder u64 img_offset; 261c5b5ef6cSAlex Elder /* links for img_request->obj_requests list */ 262c5b5ef6cSAlex Elder struct list_head links; 263c5b5ef6cSAlex Elder }; 264c5b5ef6cSAlex Elder }; 265bf0d5f50SAlex Elder u32 which; /* posn image request list */ 266bf0d5f50SAlex Elder 267bf0d5f50SAlex Elder enum obj_request_type type; 268788e2df3SAlex Elder union { 269bf0d5f50SAlex Elder struct bio *bio_list; 270788e2df3SAlex Elder struct { 271788e2df3SAlex Elder struct page **pages; 272788e2df3SAlex Elder u32 page_count; 273788e2df3SAlex Elder }; 274788e2df3SAlex Elder }; 2750eefd470SAlex Elder struct page **copyup_pages; 276ebda6408SAlex Elder u32 copyup_page_count; 277bf0d5f50SAlex Elder 278bf0d5f50SAlex Elder struct ceph_osd_request *osd_req; 279bf0d5f50SAlex Elder 280bf0d5f50SAlex Elder u64 xferred; /* bytes transferred */ 2811b83bef2SSage Weil int result; 282bf0d5f50SAlex Elder 283bf0d5f50SAlex Elder rbd_obj_callback_t callback; 284788e2df3SAlex Elder struct completion completion; 285bf0d5f50SAlex Elder 286bf0d5f50SAlex Elder struct kref kref; 287bf0d5f50SAlex Elder }; 288bf0d5f50SAlex Elder 2890c425248SAlex Elder enum img_req_flags { 2909849e986SAlex Elder IMG_REQ_WRITE, /* I/O direction: read = 0, write = 1 */ 2919849e986SAlex Elder IMG_REQ_CHILD, /* initiator: block = 0, child image = 1 */ 292d0b2e944SAlex Elder IMG_REQ_LAYERED, /* ENOENT handling: normal = 0, layered = 1 */ 29390e98c52SGuangliang Zhao 
IMG_REQ_DISCARD, /* discard: normal = 0, discard request = 1 */ 2940c425248SAlex Elder }; 2950c425248SAlex Elder 296bf0d5f50SAlex Elder struct rbd_img_request { 297bf0d5f50SAlex Elder struct rbd_device *rbd_dev; 298bf0d5f50SAlex Elder u64 offset; /* starting image byte offset */ 299bf0d5f50SAlex Elder u64 length; /* byte count from offset */ 3000c425248SAlex Elder unsigned long flags; 301bf0d5f50SAlex Elder union { 302bf0d5f50SAlex Elder u64 snap_id; /* for reads */ 3039849e986SAlex Elder struct ceph_snap_context *snapc; /* for writes */ 3049849e986SAlex Elder }; 3059849e986SAlex Elder union { 3069849e986SAlex Elder struct request *rq; /* block request */ 3079849e986SAlex Elder struct rbd_obj_request *obj_request; /* obj req initiator */ 308bf0d5f50SAlex Elder }; 3093d7efd18SAlex Elder struct page **copyup_pages; 310ebda6408SAlex Elder u32 copyup_page_count; 311bf0d5f50SAlex Elder spinlock_t completion_lock;/* protects next_completion */ 312bf0d5f50SAlex Elder u32 next_completion; 313bf0d5f50SAlex Elder rbd_img_callback_t callback; 31455f27e09SAlex Elder u64 xferred;/* aggregate bytes transferred */ 315a5a337d4SAlex Elder int result; /* first nonzero obj_request result */ 316bf0d5f50SAlex Elder 317bf0d5f50SAlex Elder u32 obj_request_count; 318bf0d5f50SAlex Elder struct list_head obj_requests; /* rbd_obj_request structs */ 319bf0d5f50SAlex Elder 320bf0d5f50SAlex Elder struct kref kref; 321bf0d5f50SAlex Elder }; 322bf0d5f50SAlex Elder 323bf0d5f50SAlex Elder #define for_each_obj_request(ireq, oreq) \ 324ef06f4d3SAlex Elder list_for_each_entry(oreq, &(ireq)->obj_requests, links) 325bf0d5f50SAlex Elder #define for_each_obj_request_from(ireq, oreq) \ 326ef06f4d3SAlex Elder list_for_each_entry_from(oreq, &(ireq)->obj_requests, links) 327bf0d5f50SAlex Elder #define for_each_obj_request_safe(ireq, oreq, n) \ 328ef06f4d3SAlex Elder list_for_each_entry_safe_reverse(oreq, n, &(ireq)->obj_requests, links) 329bf0d5f50SAlex Elder 33099d16943SIlya Dryomov enum rbd_watch_state { 
33199d16943SIlya Dryomov RBD_WATCH_STATE_UNREGISTERED, 33299d16943SIlya Dryomov RBD_WATCH_STATE_REGISTERED, 33399d16943SIlya Dryomov RBD_WATCH_STATE_ERROR, 33499d16943SIlya Dryomov }; 33599d16943SIlya Dryomov 336ed95b21aSIlya Dryomov enum rbd_lock_state { 337ed95b21aSIlya Dryomov RBD_LOCK_STATE_UNLOCKED, 338ed95b21aSIlya Dryomov RBD_LOCK_STATE_LOCKED, 339ed95b21aSIlya Dryomov RBD_LOCK_STATE_RELEASING, 340ed95b21aSIlya Dryomov }; 341ed95b21aSIlya Dryomov 342ed95b21aSIlya Dryomov /* WatchNotify::ClientId */ 343ed95b21aSIlya Dryomov struct rbd_client_id { 344ed95b21aSIlya Dryomov u64 gid; 345ed95b21aSIlya Dryomov u64 handle; 346ed95b21aSIlya Dryomov }; 347ed95b21aSIlya Dryomov 348f84344f3SAlex Elder struct rbd_mapping { 34999c1f08fSAlex Elder u64 size; 35034b13184SAlex Elder u64 features; 351f84344f3SAlex Elder bool read_only; 352f84344f3SAlex Elder }; 353f84344f3SAlex Elder 354602adf40SYehuda Sadeh /* 355602adf40SYehuda Sadeh * a single device 356602adf40SYehuda Sadeh */ 357602adf40SYehuda Sadeh struct rbd_device { 358de71a297SAlex Elder int dev_id; /* blkdev unique id */ 359602adf40SYehuda Sadeh 360602adf40SYehuda Sadeh int major; /* blkdev assigned major */ 361dd82fff1SIlya Dryomov int minor; 362602adf40SYehuda Sadeh struct gendisk *disk; /* blkdev's gendisk and rq */ 363602adf40SYehuda Sadeh 364a30b71b9SAlex Elder u32 image_format; /* Either 1 or 2 */ 365602adf40SYehuda Sadeh struct rbd_client *rbd_client; 366602adf40SYehuda Sadeh 367602adf40SYehuda Sadeh char name[DEV_NAME_LEN]; /* blkdev name, e.g. 
rbd3 */ 368602adf40SYehuda Sadeh 369b82d167bSAlex Elder spinlock_t lock; /* queue, flags, open_count */ 370602adf40SYehuda Sadeh 371602adf40SYehuda Sadeh struct rbd_image_header header; 372b82d167bSAlex Elder unsigned long flags; /* possibly lock protected */ 3730d7dbfceSAlex Elder struct rbd_spec *spec; 374d147543dSIlya Dryomov struct rbd_options *opts; 3750d6d1e9cSMike Christie char *config_info; /* add{,_single_major} string */ 376602adf40SYehuda Sadeh 377c41d13a3SIlya Dryomov struct ceph_object_id header_oid; 378922dab61SIlya Dryomov struct ceph_object_locator header_oloc; 379971f839aSAlex Elder 3801643dfa4SIlya Dryomov struct ceph_file_layout layout; /* used for all rbd requests */ 3810903e875SAlex Elder 38299d16943SIlya Dryomov struct mutex watch_mutex; 38399d16943SIlya Dryomov enum rbd_watch_state watch_state; 384922dab61SIlya Dryomov struct ceph_osd_linger_request *watch_handle; 38599d16943SIlya Dryomov u64 watch_cookie; 38699d16943SIlya Dryomov struct delayed_work watch_dwork; 38759c2be1eSYehuda Sadeh 388ed95b21aSIlya Dryomov struct rw_semaphore lock_rwsem; 389ed95b21aSIlya Dryomov enum rbd_lock_state lock_state; 390cbbfb0ffSIlya Dryomov char lock_cookie[32]; 391ed95b21aSIlya Dryomov struct rbd_client_id owner_cid; 392ed95b21aSIlya Dryomov struct work_struct acquired_lock_work; 393ed95b21aSIlya Dryomov struct work_struct released_lock_work; 394ed95b21aSIlya Dryomov struct delayed_work lock_dwork; 395ed95b21aSIlya Dryomov struct work_struct unlock_work; 396ed95b21aSIlya Dryomov wait_queue_head_t lock_waitq; 397ed95b21aSIlya Dryomov 3981643dfa4SIlya Dryomov struct workqueue_struct *task_wq; 399602adf40SYehuda Sadeh 40086b00e0dSAlex Elder struct rbd_spec *parent_spec; 40186b00e0dSAlex Elder u64 parent_overlap; 402a2acd00eSAlex Elder atomic_t parent_ref; 4032f82ee54SAlex Elder struct rbd_device *parent; 40486b00e0dSAlex Elder 4057ad18afaSChristoph Hellwig /* Block layer tags. 
*/ 4067ad18afaSChristoph Hellwig struct blk_mq_tag_set tag_set; 4077ad18afaSChristoph Hellwig 408c666601aSJosh Durgin /* protects updating the header */ 409c666601aSJosh Durgin struct rw_semaphore header_rwsem; 410f84344f3SAlex Elder 411f84344f3SAlex Elder struct rbd_mapping mapping; 412602adf40SYehuda Sadeh 413602adf40SYehuda Sadeh struct list_head node; 414dfc5606dSYehuda Sadeh 415dfc5606dSYehuda Sadeh /* sysfs related */ 416dfc5606dSYehuda Sadeh struct device dev; 417b82d167bSAlex Elder unsigned long open_count; /* protected by lock */ 418dfc5606dSYehuda Sadeh }; 419dfc5606dSYehuda Sadeh 420b82d167bSAlex Elder /* 42187c0fdedSIlya Dryomov * Flag bits for rbd_dev->flags: 42287c0fdedSIlya Dryomov * - REMOVING (which is coupled with rbd_dev->open_count) is protected 42387c0fdedSIlya Dryomov * by rbd_dev->lock 42487c0fdedSIlya Dryomov * - BLACKLISTED is protected by rbd_dev->lock_rwsem 425b82d167bSAlex Elder */ 4266d292906SAlex Elder enum rbd_dev_flags { 4276d292906SAlex Elder RBD_DEV_FLAG_EXISTS, /* mapped snapshot has not been deleted */ 428b82d167bSAlex Elder RBD_DEV_FLAG_REMOVING, /* this mapping is being removed */ 42987c0fdedSIlya Dryomov RBD_DEV_FLAG_BLACKLISTED, /* our ceph_client is blacklisted */ 4306d292906SAlex Elder }; 4316d292906SAlex Elder 432cfbf6377SAlex Elder static DEFINE_MUTEX(client_mutex); /* Serialize client creation */ 433e124a82fSAlex Elder 434602adf40SYehuda Sadeh static LIST_HEAD(rbd_dev_list); /* devices */ 435e124a82fSAlex Elder static DEFINE_SPINLOCK(rbd_dev_list_lock); 436e124a82fSAlex Elder 437602adf40SYehuda Sadeh static LIST_HEAD(rbd_client_list); /* clients */ 438432b8587SAlex Elder static DEFINE_SPINLOCK(rbd_client_list_lock); 439602adf40SYehuda Sadeh 44078c2a44aSAlex Elder /* Slab caches for frequently-allocated structures */ 44178c2a44aSAlex Elder 4421c2a9dfeSAlex Elder static struct kmem_cache *rbd_img_request_cache; 443868311b1SAlex Elder static struct kmem_cache *rbd_obj_request_cache; 4441c2a9dfeSAlex Elder 4459b60e70bSIlya 
Dryomov static int rbd_major; 446f8a22fc2SIlya Dryomov static DEFINE_IDA(rbd_dev_id_ida); 447f8a22fc2SIlya Dryomov 448f5ee37bdSIlya Dryomov static struct workqueue_struct *rbd_wq; 449f5ee37bdSIlya Dryomov 4509b60e70bSIlya Dryomov /* 4519b60e70bSIlya Dryomov * Default to false for now, as single-major requires >= 0.75 version of 4529b60e70bSIlya Dryomov * userspace rbd utility. 4539b60e70bSIlya Dryomov */ 4549b60e70bSIlya Dryomov static bool single_major = false; 4559b60e70bSIlya Dryomov module_param(single_major, bool, S_IRUGO); 4569b60e70bSIlya Dryomov MODULE_PARM_DESC(single_major, "Use a single major number for all rbd devices (default: false)"); 4579b60e70bSIlya Dryomov 4583d7efd18SAlex Elder static int rbd_img_request_submit(struct rbd_img_request *img_request); 4593d7efd18SAlex Elder 460f0f8cef5SAlex Elder static ssize_t rbd_add(struct bus_type *bus, const char *buf, 461f0f8cef5SAlex Elder size_t count); 462f0f8cef5SAlex Elder static ssize_t rbd_remove(struct bus_type *bus, const char *buf, 463f0f8cef5SAlex Elder size_t count); 4649b60e70bSIlya Dryomov static ssize_t rbd_add_single_major(struct bus_type *bus, const char *buf, 4659b60e70bSIlya Dryomov size_t count); 4669b60e70bSIlya Dryomov static ssize_t rbd_remove_single_major(struct bus_type *bus, const char *buf, 4679b60e70bSIlya Dryomov size_t count); 4686d69bb53SIlya Dryomov static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth); 469a2acd00eSAlex Elder static void rbd_spec_put(struct rbd_spec *spec); 470f0f8cef5SAlex Elder 4719b60e70bSIlya Dryomov static int rbd_dev_id_to_minor(int dev_id) 4729b60e70bSIlya Dryomov { 4737e513d43SIlya Dryomov return dev_id << RBD_SINGLE_MAJOR_PART_SHIFT; 4749b60e70bSIlya Dryomov } 4759b60e70bSIlya Dryomov 4769b60e70bSIlya Dryomov static int minor_to_rbd_dev_id(int minor) 4779b60e70bSIlya Dryomov { 4787e513d43SIlya Dryomov return minor >> RBD_SINGLE_MAJOR_PART_SHIFT; 4799b60e70bSIlya Dryomov } 4809b60e70bSIlya Dryomov 481ed95b21aSIlya Dryomov static bool 
rbd_is_lock_supported(struct rbd_device *rbd_dev) 482ed95b21aSIlya Dryomov { 483ed95b21aSIlya Dryomov return (rbd_dev->header.features & RBD_FEATURE_EXCLUSIVE_LOCK) && 484ed95b21aSIlya Dryomov rbd_dev->spec->snap_id == CEPH_NOSNAP && 485ed95b21aSIlya Dryomov !rbd_dev->mapping.read_only; 486ed95b21aSIlya Dryomov } 487ed95b21aSIlya Dryomov 488ed95b21aSIlya Dryomov static bool __rbd_is_lock_owner(struct rbd_device *rbd_dev) 489ed95b21aSIlya Dryomov { 490ed95b21aSIlya Dryomov return rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED || 491ed95b21aSIlya Dryomov rbd_dev->lock_state == RBD_LOCK_STATE_RELEASING; 492ed95b21aSIlya Dryomov } 493ed95b21aSIlya Dryomov 494ed95b21aSIlya Dryomov static bool rbd_is_lock_owner(struct rbd_device *rbd_dev) 495ed95b21aSIlya Dryomov { 496ed95b21aSIlya Dryomov bool is_lock_owner; 497ed95b21aSIlya Dryomov 498ed95b21aSIlya Dryomov down_read(&rbd_dev->lock_rwsem); 499ed95b21aSIlya Dryomov is_lock_owner = __rbd_is_lock_owner(rbd_dev); 500ed95b21aSIlya Dryomov up_read(&rbd_dev->lock_rwsem); 501ed95b21aSIlya Dryomov return is_lock_owner; 502ed95b21aSIlya Dryomov } 503ed95b21aSIlya Dryomov 5048767b293SIlya Dryomov static ssize_t rbd_supported_features_show(struct bus_type *bus, char *buf) 5058767b293SIlya Dryomov { 5068767b293SIlya Dryomov return sprintf(buf, "0x%llx\n", RBD_FEATURES_SUPPORTED); 5078767b293SIlya Dryomov } 5088767b293SIlya Dryomov 509b15a21ddSGreg Kroah-Hartman static BUS_ATTR(add, S_IWUSR, NULL, rbd_add); 510b15a21ddSGreg Kroah-Hartman static BUS_ATTR(remove, S_IWUSR, NULL, rbd_remove); 5119b60e70bSIlya Dryomov static BUS_ATTR(add_single_major, S_IWUSR, NULL, rbd_add_single_major); 5129b60e70bSIlya Dryomov static BUS_ATTR(remove_single_major, S_IWUSR, NULL, rbd_remove_single_major); 5138767b293SIlya Dryomov static BUS_ATTR(supported_features, S_IRUGO, rbd_supported_features_show, NULL); 514b15a21ddSGreg Kroah-Hartman 515b15a21ddSGreg Kroah-Hartman static struct attribute *rbd_bus_attrs[] = { 516b15a21ddSGreg Kroah-Hartman 
&bus_attr_add.attr, 517b15a21ddSGreg Kroah-Hartman &bus_attr_remove.attr, 5189b60e70bSIlya Dryomov &bus_attr_add_single_major.attr, 5199b60e70bSIlya Dryomov &bus_attr_remove_single_major.attr, 5208767b293SIlya Dryomov &bus_attr_supported_features.attr, 521b15a21ddSGreg Kroah-Hartman NULL, 522f0f8cef5SAlex Elder }; 52392c76dc0SIlya Dryomov 52492c76dc0SIlya Dryomov static umode_t rbd_bus_is_visible(struct kobject *kobj, 52592c76dc0SIlya Dryomov struct attribute *attr, int index) 52692c76dc0SIlya Dryomov { 5279b60e70bSIlya Dryomov if (!single_major && 5289b60e70bSIlya Dryomov (attr == &bus_attr_add_single_major.attr || 5299b60e70bSIlya Dryomov attr == &bus_attr_remove_single_major.attr)) 5309b60e70bSIlya Dryomov return 0; 5319b60e70bSIlya Dryomov 53292c76dc0SIlya Dryomov return attr->mode; 53392c76dc0SIlya Dryomov } 53492c76dc0SIlya Dryomov 53592c76dc0SIlya Dryomov static const struct attribute_group rbd_bus_group = { 53692c76dc0SIlya Dryomov .attrs = rbd_bus_attrs, 53792c76dc0SIlya Dryomov .is_visible = rbd_bus_is_visible, 53892c76dc0SIlya Dryomov }; 53992c76dc0SIlya Dryomov __ATTRIBUTE_GROUPS(rbd_bus); 540f0f8cef5SAlex Elder 541f0f8cef5SAlex Elder static struct bus_type rbd_bus_type = { 542f0f8cef5SAlex Elder .name = "rbd", 543b15a21ddSGreg Kroah-Hartman .bus_groups = rbd_bus_groups, 544f0f8cef5SAlex Elder }; 545f0f8cef5SAlex Elder 546f0f8cef5SAlex Elder static void rbd_root_dev_release(struct device *dev) 547f0f8cef5SAlex Elder { 548f0f8cef5SAlex Elder } 549f0f8cef5SAlex Elder 550f0f8cef5SAlex Elder static struct device rbd_root_dev = { 551f0f8cef5SAlex Elder .init_name = "rbd", 552f0f8cef5SAlex Elder .release = rbd_root_dev_release, 553f0f8cef5SAlex Elder }; 554f0f8cef5SAlex Elder 55506ecc6cbSAlex Elder static __printf(2, 3) 55606ecc6cbSAlex Elder void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...) 
55706ecc6cbSAlex Elder { 55806ecc6cbSAlex Elder struct va_format vaf; 55906ecc6cbSAlex Elder va_list args; 56006ecc6cbSAlex Elder 56106ecc6cbSAlex Elder va_start(args, fmt); 56206ecc6cbSAlex Elder vaf.fmt = fmt; 56306ecc6cbSAlex Elder vaf.va = &args; 56406ecc6cbSAlex Elder 56506ecc6cbSAlex Elder if (!rbd_dev) 56606ecc6cbSAlex Elder printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf); 56706ecc6cbSAlex Elder else if (rbd_dev->disk) 56806ecc6cbSAlex Elder printk(KERN_WARNING "%s: %s: %pV\n", 56906ecc6cbSAlex Elder RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf); 57006ecc6cbSAlex Elder else if (rbd_dev->spec && rbd_dev->spec->image_name) 57106ecc6cbSAlex Elder printk(KERN_WARNING "%s: image %s: %pV\n", 57206ecc6cbSAlex Elder RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf); 57306ecc6cbSAlex Elder else if (rbd_dev->spec && rbd_dev->spec->image_id) 57406ecc6cbSAlex Elder printk(KERN_WARNING "%s: id %s: %pV\n", 57506ecc6cbSAlex Elder RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf); 57606ecc6cbSAlex Elder else /* punt */ 57706ecc6cbSAlex Elder printk(KERN_WARNING "%s: rbd_dev %p: %pV\n", 57806ecc6cbSAlex Elder RBD_DRV_NAME, rbd_dev, &vaf); 57906ecc6cbSAlex Elder va_end(args); 58006ecc6cbSAlex Elder } 58106ecc6cbSAlex Elder 582aafb230eSAlex Elder #ifdef RBD_DEBUG 583aafb230eSAlex Elder #define rbd_assert(expr) \ 584aafb230eSAlex Elder if (unlikely(!(expr))) { \ 585aafb230eSAlex Elder printk(KERN_ERR "\nAssertion failure in %s() " \ 586aafb230eSAlex Elder "at line %d:\n\n" \ 587aafb230eSAlex Elder "\trbd_assert(%s);\n\n", \ 588aafb230eSAlex Elder __func__, __LINE__, #expr); \ 589aafb230eSAlex Elder BUG(); \ 590aafb230eSAlex Elder } 591aafb230eSAlex Elder #else /* !RBD_DEBUG */ 592aafb230eSAlex Elder # define rbd_assert(expr) ((void) 0) 593aafb230eSAlex Elder #endif /* !RBD_DEBUG */ 594dfc5606dSYehuda Sadeh 5952761713dSIlya Dryomov static void rbd_osd_copyup_callback(struct rbd_obj_request *obj_request); 596b454e36dSAlex Elder static int rbd_img_obj_request_submit(struct 
rbd_obj_request *obj_request); 59705a46afdSAlex Elder static void rbd_img_parent_read(struct rbd_obj_request *obj_request); 59805a46afdSAlex Elder static void rbd_dev_remove_parent(struct rbd_device *rbd_dev); 5998b3e1a56SAlex Elder 600cc4a38bdSAlex Elder static int rbd_dev_refresh(struct rbd_device *rbd_dev); 6012df3fac7SAlex Elder static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev); 602a720ae09SIlya Dryomov static int rbd_dev_header_info(struct rbd_device *rbd_dev); 603e8f59b59SIlya Dryomov static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev); 60454cac61fSAlex Elder static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, 60554cac61fSAlex Elder u64 snap_id); 6062ad3d716SAlex Elder static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id, 6072ad3d716SAlex Elder u8 *order, u64 *snap_size); 6082ad3d716SAlex Elder static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id, 6092ad3d716SAlex Elder u64 *snap_features); 61059c2be1eSYehuda Sadeh 611602adf40SYehuda Sadeh static int rbd_open(struct block_device *bdev, fmode_t mode) 612602adf40SYehuda Sadeh { 613f0f8cef5SAlex Elder struct rbd_device *rbd_dev = bdev->bd_disk->private_data; 614b82d167bSAlex Elder bool removing = false; 615602adf40SYehuda Sadeh 616f84344f3SAlex Elder if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only) 617602adf40SYehuda Sadeh return -EROFS; 618602adf40SYehuda Sadeh 619a14ea269SAlex Elder spin_lock_irq(&rbd_dev->lock); 620b82d167bSAlex Elder if (test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags)) 621b82d167bSAlex Elder removing = true; 622b82d167bSAlex Elder else 623b82d167bSAlex Elder rbd_dev->open_count++; 624a14ea269SAlex Elder spin_unlock_irq(&rbd_dev->lock); 625b82d167bSAlex Elder if (removing) 626b82d167bSAlex Elder return -ENOENT; 627b82d167bSAlex Elder 628c3e946ceSAlex Elder (void) get_device(&rbd_dev->dev); 629340c7a2bSAlex Elder 630602adf40SYehuda Sadeh return 0; 631602adf40SYehuda Sadeh } 632602adf40SYehuda Sadeh 
633db2a144bSAl Viro static void rbd_release(struct gendisk *disk, fmode_t mode) 634dfc5606dSYehuda Sadeh { 635dfc5606dSYehuda Sadeh struct rbd_device *rbd_dev = disk->private_data; 636b82d167bSAlex Elder unsigned long open_count_before; 637b82d167bSAlex Elder 638a14ea269SAlex Elder spin_lock_irq(&rbd_dev->lock); 639b82d167bSAlex Elder open_count_before = rbd_dev->open_count--; 640a14ea269SAlex Elder spin_unlock_irq(&rbd_dev->lock); 641b82d167bSAlex Elder rbd_assert(open_count_before > 0); 642dfc5606dSYehuda Sadeh 643c3e946ceSAlex Elder put_device(&rbd_dev->dev); 644dfc5606dSYehuda Sadeh } 645dfc5606dSYehuda Sadeh 646131fd9f6SGuangliang Zhao static int rbd_ioctl_set_ro(struct rbd_device *rbd_dev, unsigned long arg) 647131fd9f6SGuangliang Zhao { 64877f33c03SJosh Durgin int ret = 0; 649131fd9f6SGuangliang Zhao int val; 650131fd9f6SGuangliang Zhao bool ro; 65177f33c03SJosh Durgin bool ro_changed = false; 652131fd9f6SGuangliang Zhao 65377f33c03SJosh Durgin /* get_user() may sleep, so call it before taking rbd_dev->lock */ 654131fd9f6SGuangliang Zhao if (get_user(val, (int __user *)(arg))) 655131fd9f6SGuangliang Zhao return -EFAULT; 656131fd9f6SGuangliang Zhao 657131fd9f6SGuangliang Zhao ro = val ? 
true : false; 658131fd9f6SGuangliang Zhao /* Snapshot doesn't allow to write*/ 659131fd9f6SGuangliang Zhao if (rbd_dev->spec->snap_id != CEPH_NOSNAP && !ro) 660131fd9f6SGuangliang Zhao return -EROFS; 661131fd9f6SGuangliang Zhao 66277f33c03SJosh Durgin spin_lock_irq(&rbd_dev->lock); 66377f33c03SJosh Durgin /* prevent others open this device */ 66477f33c03SJosh Durgin if (rbd_dev->open_count > 1) { 66577f33c03SJosh Durgin ret = -EBUSY; 66677f33c03SJosh Durgin goto out; 667131fd9f6SGuangliang Zhao } 668131fd9f6SGuangliang Zhao 66977f33c03SJosh Durgin if (rbd_dev->mapping.read_only != ro) { 67077f33c03SJosh Durgin rbd_dev->mapping.read_only = ro; 67177f33c03SJosh Durgin ro_changed = true; 67277f33c03SJosh Durgin } 67377f33c03SJosh Durgin 67477f33c03SJosh Durgin out: 67577f33c03SJosh Durgin spin_unlock_irq(&rbd_dev->lock); 67677f33c03SJosh Durgin /* set_disk_ro() may sleep, so call it after releasing rbd_dev->lock */ 67777f33c03SJosh Durgin if (ret == 0 && ro_changed) 67877f33c03SJosh Durgin set_disk_ro(rbd_dev->disk, ro ? 
1 : 0); 67977f33c03SJosh Durgin 68077f33c03SJosh Durgin return ret; 681131fd9f6SGuangliang Zhao } 682131fd9f6SGuangliang Zhao 683131fd9f6SGuangliang Zhao static int rbd_ioctl(struct block_device *bdev, fmode_t mode, 684131fd9f6SGuangliang Zhao unsigned int cmd, unsigned long arg) 685131fd9f6SGuangliang Zhao { 686131fd9f6SGuangliang Zhao struct rbd_device *rbd_dev = bdev->bd_disk->private_data; 687131fd9f6SGuangliang Zhao int ret = 0; 688131fd9f6SGuangliang Zhao 689131fd9f6SGuangliang Zhao switch (cmd) { 690131fd9f6SGuangliang Zhao case BLKROSET: 691131fd9f6SGuangliang Zhao ret = rbd_ioctl_set_ro(rbd_dev, arg); 692131fd9f6SGuangliang Zhao break; 693131fd9f6SGuangliang Zhao default: 694131fd9f6SGuangliang Zhao ret = -ENOTTY; 695131fd9f6SGuangliang Zhao } 696131fd9f6SGuangliang Zhao 697131fd9f6SGuangliang Zhao return ret; 698131fd9f6SGuangliang Zhao } 699131fd9f6SGuangliang Zhao 700131fd9f6SGuangliang Zhao #ifdef CONFIG_COMPAT 701131fd9f6SGuangliang Zhao static int rbd_compat_ioctl(struct block_device *bdev, fmode_t mode, 702131fd9f6SGuangliang Zhao unsigned int cmd, unsigned long arg) 703131fd9f6SGuangliang Zhao { 704131fd9f6SGuangliang Zhao return rbd_ioctl(bdev, mode, cmd, arg); 705131fd9f6SGuangliang Zhao } 706131fd9f6SGuangliang Zhao #endif /* CONFIG_COMPAT */ 707131fd9f6SGuangliang Zhao 708602adf40SYehuda Sadeh static const struct block_device_operations rbd_bd_ops = { 709602adf40SYehuda Sadeh .owner = THIS_MODULE, 710602adf40SYehuda Sadeh .open = rbd_open, 711dfc5606dSYehuda Sadeh .release = rbd_release, 712131fd9f6SGuangliang Zhao .ioctl = rbd_ioctl, 713131fd9f6SGuangliang Zhao #ifdef CONFIG_COMPAT 714131fd9f6SGuangliang Zhao .compat_ioctl = rbd_compat_ioctl, 715131fd9f6SGuangliang Zhao #endif 716602adf40SYehuda Sadeh }; 717602adf40SYehuda Sadeh 718602adf40SYehuda Sadeh /* 7197262cfcaSAlex Elder * Initialize an rbd client instance. Success or not, this function 720cfbf6377SAlex Elder * consumes ceph_opts. Caller holds client_mutex. 
721602adf40SYehuda Sadeh */ 722f8c38929SAlex Elder static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts) 723602adf40SYehuda Sadeh { 724602adf40SYehuda Sadeh struct rbd_client *rbdc; 725602adf40SYehuda Sadeh int ret = -ENOMEM; 726602adf40SYehuda Sadeh 72737206ee5SAlex Elder dout("%s:\n", __func__); 728602adf40SYehuda Sadeh rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL); 729602adf40SYehuda Sadeh if (!rbdc) 730602adf40SYehuda Sadeh goto out_opt; 731602adf40SYehuda Sadeh 732602adf40SYehuda Sadeh kref_init(&rbdc->kref); 733602adf40SYehuda Sadeh INIT_LIST_HEAD(&rbdc->node); 734602adf40SYehuda Sadeh 73574da4a0fSIlya Dryomov rbdc->client = ceph_create_client(ceph_opts, rbdc); 736602adf40SYehuda Sadeh if (IS_ERR(rbdc->client)) 73708f75463SAlex Elder goto out_rbdc; 73843ae4701SAlex Elder ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */ 739602adf40SYehuda Sadeh 740602adf40SYehuda Sadeh ret = ceph_open_session(rbdc->client); 741602adf40SYehuda Sadeh if (ret < 0) 74208f75463SAlex Elder goto out_client; 743602adf40SYehuda Sadeh 744432b8587SAlex Elder spin_lock(&rbd_client_list_lock); 745602adf40SYehuda Sadeh list_add_tail(&rbdc->node, &rbd_client_list); 746432b8587SAlex Elder spin_unlock(&rbd_client_list_lock); 747602adf40SYehuda Sadeh 74837206ee5SAlex Elder dout("%s: rbdc %p\n", __func__, rbdc); 749bc534d86SAlex Elder 750602adf40SYehuda Sadeh return rbdc; 75108f75463SAlex Elder out_client: 752602adf40SYehuda Sadeh ceph_destroy_client(rbdc->client); 75308f75463SAlex Elder out_rbdc: 754602adf40SYehuda Sadeh kfree(rbdc); 755602adf40SYehuda Sadeh out_opt: 75643ae4701SAlex Elder if (ceph_opts) 75743ae4701SAlex Elder ceph_destroy_options(ceph_opts); 75837206ee5SAlex Elder dout("%s: error %d\n", __func__, ret); 75937206ee5SAlex Elder 76028f259b7SVasiliy Kulikov return ERR_PTR(ret); 761602adf40SYehuda Sadeh } 762602adf40SYehuda Sadeh 7632f82ee54SAlex Elder static struct rbd_client *__rbd_get_client(struct rbd_client *rbdc) 
7642f82ee54SAlex Elder { 7652f82ee54SAlex Elder kref_get(&rbdc->kref); 7662f82ee54SAlex Elder 7672f82ee54SAlex Elder return rbdc; 7682f82ee54SAlex Elder } 7692f82ee54SAlex Elder 770602adf40SYehuda Sadeh /* 7711f7ba331SAlex Elder * Find a ceph client with specific addr and configuration. If 7721f7ba331SAlex Elder * found, bump its reference count. 773602adf40SYehuda Sadeh */ 7741f7ba331SAlex Elder static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts) 775602adf40SYehuda Sadeh { 776602adf40SYehuda Sadeh struct rbd_client *client_node; 7771f7ba331SAlex Elder bool found = false; 778602adf40SYehuda Sadeh 77943ae4701SAlex Elder if (ceph_opts->flags & CEPH_OPT_NOSHARE) 780602adf40SYehuda Sadeh return NULL; 781602adf40SYehuda Sadeh 7821f7ba331SAlex Elder spin_lock(&rbd_client_list_lock); 7831f7ba331SAlex Elder list_for_each_entry(client_node, &rbd_client_list, node) { 7841f7ba331SAlex Elder if (!ceph_compare_options(ceph_opts, client_node->client)) { 7852f82ee54SAlex Elder __rbd_get_client(client_node); 7862f82ee54SAlex Elder 7871f7ba331SAlex Elder found = true; 7881f7ba331SAlex Elder break; 7891f7ba331SAlex Elder } 7901f7ba331SAlex Elder } 7911f7ba331SAlex Elder spin_unlock(&rbd_client_list_lock); 7921f7ba331SAlex Elder 7931f7ba331SAlex Elder return found ? 
client_node : NULL; 794602adf40SYehuda Sadeh } 795602adf40SYehuda Sadeh 796602adf40SYehuda Sadeh /* 797210c104cSIlya Dryomov * (Per device) rbd map options 79859c2be1eSYehuda Sadeh */ 79959c2be1eSYehuda Sadeh enum { 800b5584180SIlya Dryomov Opt_queue_depth, 80159c2be1eSYehuda Sadeh Opt_last_int, 80259c2be1eSYehuda Sadeh /* int args above */ 80359c2be1eSYehuda Sadeh Opt_last_string, 80459c2be1eSYehuda Sadeh /* string args above */ 805cc0538b6SAlex Elder Opt_read_only, 806cc0538b6SAlex Elder Opt_read_write, 80780de1912SIlya Dryomov Opt_lock_on_read, 808210c104cSIlya Dryomov Opt_err 80959c2be1eSYehuda Sadeh }; 81059c2be1eSYehuda Sadeh 81143ae4701SAlex Elder static match_table_t rbd_opts_tokens = { 812b5584180SIlya Dryomov {Opt_queue_depth, "queue_depth=%d"}, 81359c2be1eSYehuda Sadeh /* int args above */ 81459c2be1eSYehuda Sadeh /* string args above */ 815be466c1cSAlex Elder {Opt_read_only, "read_only"}, 816cc0538b6SAlex Elder {Opt_read_only, "ro"}, /* Alternate spelling */ 817cc0538b6SAlex Elder {Opt_read_write, "read_write"}, 818cc0538b6SAlex Elder {Opt_read_write, "rw"}, /* Alternate spelling */ 81980de1912SIlya Dryomov {Opt_lock_on_read, "lock_on_read"}, 820210c104cSIlya Dryomov {Opt_err, NULL} 82159c2be1eSYehuda Sadeh }; 82259c2be1eSYehuda Sadeh 82398571b5aSAlex Elder struct rbd_options { 824b5584180SIlya Dryomov int queue_depth; 82598571b5aSAlex Elder bool read_only; 82680de1912SIlya Dryomov bool lock_on_read; 82798571b5aSAlex Elder }; 82898571b5aSAlex Elder 829b5584180SIlya Dryomov #define RBD_QUEUE_DEPTH_DEFAULT BLKDEV_MAX_RQ 83098571b5aSAlex Elder #define RBD_READ_ONLY_DEFAULT false 83180de1912SIlya Dryomov #define RBD_LOCK_ON_READ_DEFAULT false 83298571b5aSAlex Elder 83359c2be1eSYehuda Sadeh static int parse_rbd_opts_token(char *c, void *private) 83459c2be1eSYehuda Sadeh { 83543ae4701SAlex Elder struct rbd_options *rbd_opts = private; 83659c2be1eSYehuda Sadeh substring_t argstr[MAX_OPT_ARGS]; 83759c2be1eSYehuda Sadeh int token, intval, ret; 83859c2be1eSYehuda 
Sadeh 83943ae4701SAlex Elder token = match_token(c, rbd_opts_tokens, argstr); 84059c2be1eSYehuda Sadeh if (token < Opt_last_int) { 84159c2be1eSYehuda Sadeh ret = match_int(&argstr[0], &intval); 84259c2be1eSYehuda Sadeh if (ret < 0) { 843210c104cSIlya Dryomov pr_err("bad mount option arg (not int) at '%s'\n", c); 84459c2be1eSYehuda Sadeh return ret; 84559c2be1eSYehuda Sadeh } 84659c2be1eSYehuda Sadeh dout("got int token %d val %d\n", token, intval); 84759c2be1eSYehuda Sadeh } else if (token > Opt_last_int && token < Opt_last_string) { 848210c104cSIlya Dryomov dout("got string token %d val %s\n", token, argstr[0].from); 84959c2be1eSYehuda Sadeh } else { 85059c2be1eSYehuda Sadeh dout("got token %d\n", token); 85159c2be1eSYehuda Sadeh } 85259c2be1eSYehuda Sadeh 85359c2be1eSYehuda Sadeh switch (token) { 854b5584180SIlya Dryomov case Opt_queue_depth: 855b5584180SIlya Dryomov if (intval < 1) { 856b5584180SIlya Dryomov pr_err("queue_depth out of range\n"); 857b5584180SIlya Dryomov return -EINVAL; 858b5584180SIlya Dryomov } 859b5584180SIlya Dryomov rbd_opts->queue_depth = intval; 860b5584180SIlya Dryomov break; 861cc0538b6SAlex Elder case Opt_read_only: 862cc0538b6SAlex Elder rbd_opts->read_only = true; 863cc0538b6SAlex Elder break; 864cc0538b6SAlex Elder case Opt_read_write: 865cc0538b6SAlex Elder rbd_opts->read_only = false; 866cc0538b6SAlex Elder break; 86780de1912SIlya Dryomov case Opt_lock_on_read: 86880de1912SIlya Dryomov rbd_opts->lock_on_read = true; 86980de1912SIlya Dryomov break; 87059c2be1eSYehuda Sadeh default: 871210c104cSIlya Dryomov /* libceph prints "bad option" msg */ 872210c104cSIlya Dryomov return -EINVAL; 87359c2be1eSYehuda Sadeh } 874210c104cSIlya Dryomov 87559c2be1eSYehuda Sadeh return 0; 87659c2be1eSYehuda Sadeh } 87759c2be1eSYehuda Sadeh 8786d2940c8SGuangliang Zhao static char* obj_op_name(enum obj_operation_type op_type) 8796d2940c8SGuangliang Zhao { 8806d2940c8SGuangliang Zhao switch (op_type) { 8816d2940c8SGuangliang Zhao case OBJ_OP_READ: 
8826d2940c8SGuangliang Zhao return "read"; 8836d2940c8SGuangliang Zhao case OBJ_OP_WRITE: 8846d2940c8SGuangliang Zhao return "write"; 88590e98c52SGuangliang Zhao case OBJ_OP_DISCARD: 88690e98c52SGuangliang Zhao return "discard"; 8876d2940c8SGuangliang Zhao default: 8886d2940c8SGuangliang Zhao return "???"; 8896d2940c8SGuangliang Zhao } 8906d2940c8SGuangliang Zhao } 8916d2940c8SGuangliang Zhao 89259c2be1eSYehuda Sadeh /* 893602adf40SYehuda Sadeh * Get a ceph client with specific addr and configuration, if one does 8947262cfcaSAlex Elder * not exist create it. Either way, ceph_opts is consumed by this 8957262cfcaSAlex Elder * function. 896602adf40SYehuda Sadeh */ 8979d3997fdSAlex Elder static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts) 898602adf40SYehuda Sadeh { 899f8c38929SAlex Elder struct rbd_client *rbdc; 90059c2be1eSYehuda Sadeh 901cfbf6377SAlex Elder mutex_lock_nested(&client_mutex, SINGLE_DEPTH_NESTING); 9021f7ba331SAlex Elder rbdc = rbd_client_find(ceph_opts); 9039d3997fdSAlex Elder if (rbdc) /* using an existing client */ 90443ae4701SAlex Elder ceph_destroy_options(ceph_opts); 9059d3997fdSAlex Elder else 906f8c38929SAlex Elder rbdc = rbd_client_create(ceph_opts); 907cfbf6377SAlex Elder mutex_unlock(&client_mutex); 908d720bcb0SAlex Elder 9099d3997fdSAlex Elder return rbdc; 910602adf40SYehuda Sadeh } 911602adf40SYehuda Sadeh 912602adf40SYehuda Sadeh /* 913602adf40SYehuda Sadeh * Destroy ceph client 914d23a4b3fSAlex Elder * 915432b8587SAlex Elder * Caller must hold rbd_client_list_lock. 
916602adf40SYehuda Sadeh */ 917602adf40SYehuda Sadeh static void rbd_client_release(struct kref *kref) 918602adf40SYehuda Sadeh { 919602adf40SYehuda Sadeh struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref); 920602adf40SYehuda Sadeh 92137206ee5SAlex Elder dout("%s: rbdc %p\n", __func__, rbdc); 922cd9d9f5dSAlex Elder spin_lock(&rbd_client_list_lock); 923602adf40SYehuda Sadeh list_del(&rbdc->node); 924cd9d9f5dSAlex Elder spin_unlock(&rbd_client_list_lock); 925602adf40SYehuda Sadeh 926602adf40SYehuda Sadeh ceph_destroy_client(rbdc->client); 927602adf40SYehuda Sadeh kfree(rbdc); 928602adf40SYehuda Sadeh } 929602adf40SYehuda Sadeh 930602adf40SYehuda Sadeh /* 931602adf40SYehuda Sadeh * Drop reference to ceph client node. If it's not referenced anymore, release 932602adf40SYehuda Sadeh * it. 933602adf40SYehuda Sadeh */ 9349d3997fdSAlex Elder static void rbd_put_client(struct rbd_client *rbdc) 935602adf40SYehuda Sadeh { 936c53d5893SAlex Elder if (rbdc) 9379d3997fdSAlex Elder kref_put(&rbdc->kref, rbd_client_release); 938602adf40SYehuda Sadeh } 939602adf40SYehuda Sadeh 940a30b71b9SAlex Elder static bool rbd_image_format_valid(u32 image_format) 941a30b71b9SAlex Elder { 942a30b71b9SAlex Elder return image_format == 1 || image_format == 2; 943a30b71b9SAlex Elder } 944a30b71b9SAlex Elder 9458e94af8eSAlex Elder static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk) 9468e94af8eSAlex Elder { 947103a150fSAlex Elder size_t size; 948103a150fSAlex Elder u32 snap_count; 949103a150fSAlex Elder 950103a150fSAlex Elder /* The header has to start with the magic rbd header text */ 951103a150fSAlex Elder if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT))) 952103a150fSAlex Elder return false; 953103a150fSAlex Elder 954db2388b6SAlex Elder /* The bio layer requires at least sector-sized I/O */ 955db2388b6SAlex Elder 956db2388b6SAlex Elder if (ondisk->options.order < SECTOR_SHIFT) 957db2388b6SAlex Elder return false; 958db2388b6SAlex Elder 
959db2388b6SAlex Elder /* If we use u64 in a few spots we may be able to loosen this */ 960db2388b6SAlex Elder 961db2388b6SAlex Elder if (ondisk->options.order > 8 * sizeof (int) - 1) 962db2388b6SAlex Elder return false; 963db2388b6SAlex Elder 964103a150fSAlex Elder /* 965103a150fSAlex Elder * The size of a snapshot header has to fit in a size_t, and 966103a150fSAlex Elder * that limits the number of snapshots. 967103a150fSAlex Elder */ 968103a150fSAlex Elder snap_count = le32_to_cpu(ondisk->snap_count); 969103a150fSAlex Elder size = SIZE_MAX - sizeof (struct ceph_snap_context); 970103a150fSAlex Elder if (snap_count > size / sizeof (__le64)) 971103a150fSAlex Elder return false; 972103a150fSAlex Elder 973103a150fSAlex Elder /* 974103a150fSAlex Elder * Not only that, but the size of the entire the snapshot 975103a150fSAlex Elder * header must also be representable in a size_t. 976103a150fSAlex Elder */ 977103a150fSAlex Elder size -= snap_count * sizeof (__le64); 978103a150fSAlex Elder if ((u64) size < le64_to_cpu(ondisk->snap_names_len)) 979103a150fSAlex Elder return false; 980103a150fSAlex Elder 981103a150fSAlex Elder return true; 9828e94af8eSAlex Elder } 9838e94af8eSAlex Elder 984602adf40SYehuda Sadeh /* 9855bc3fb17SIlya Dryomov * returns the size of an object in the image 9865bc3fb17SIlya Dryomov */ 9875bc3fb17SIlya Dryomov static u32 rbd_obj_bytes(struct rbd_image_header *header) 9885bc3fb17SIlya Dryomov { 9895bc3fb17SIlya Dryomov return 1U << header->obj_order; 9905bc3fb17SIlya Dryomov } 9915bc3fb17SIlya Dryomov 992263423f8SIlya Dryomov static void rbd_init_layout(struct rbd_device *rbd_dev) 993263423f8SIlya Dryomov { 994263423f8SIlya Dryomov if (rbd_dev->header.stripe_unit == 0 || 995263423f8SIlya Dryomov rbd_dev->header.stripe_count == 0) { 996263423f8SIlya Dryomov rbd_dev->header.stripe_unit = rbd_obj_bytes(&rbd_dev->header); 997263423f8SIlya Dryomov rbd_dev->header.stripe_count = 1; 998263423f8SIlya Dryomov } 999263423f8SIlya Dryomov 1000263423f8SIlya 
Dryomov rbd_dev->layout.stripe_unit = rbd_dev->header.stripe_unit; 1001263423f8SIlya Dryomov rbd_dev->layout.stripe_count = rbd_dev->header.stripe_count; 1002263423f8SIlya Dryomov rbd_dev->layout.object_size = rbd_obj_bytes(&rbd_dev->header); 10037e97332eSIlya Dryomov rbd_dev->layout.pool_id = rbd_dev->header.data_pool_id == CEPH_NOPOOL ? 10047e97332eSIlya Dryomov rbd_dev->spec->pool_id : rbd_dev->header.data_pool_id; 1005263423f8SIlya Dryomov RCU_INIT_POINTER(rbd_dev->layout.pool_ns, NULL); 1006263423f8SIlya Dryomov } 1007263423f8SIlya Dryomov 10085bc3fb17SIlya Dryomov /* 1009bb23e37aSAlex Elder * Fill an rbd image header with information from the given format 1 1010bb23e37aSAlex Elder * on-disk header. 1011602adf40SYehuda Sadeh */ 1012662518b1SAlex Elder static int rbd_header_from_disk(struct rbd_device *rbd_dev, 10134156d998SAlex Elder struct rbd_image_header_ondisk *ondisk) 1014602adf40SYehuda Sadeh { 1015662518b1SAlex Elder struct rbd_image_header *header = &rbd_dev->header; 1016bb23e37aSAlex Elder bool first_time = header->object_prefix == NULL; 1017bb23e37aSAlex Elder struct ceph_snap_context *snapc; 1018bb23e37aSAlex Elder char *object_prefix = NULL; 1019bb23e37aSAlex Elder char *snap_names = NULL; 1020bb23e37aSAlex Elder u64 *snap_sizes = NULL; 1021ccece235SAlex Elder u32 snap_count; 1022bb23e37aSAlex Elder int ret = -ENOMEM; 1023621901d6SAlex Elder u32 i; 1024602adf40SYehuda Sadeh 1025bb23e37aSAlex Elder /* Allocate this now to avoid having to handle failure below */ 1026103a150fSAlex Elder 1027bb23e37aSAlex Elder if (first_time) { 1028848d796cSIlya Dryomov object_prefix = kstrndup(ondisk->object_prefix, 1029848d796cSIlya Dryomov sizeof(ondisk->object_prefix), 1030848d796cSIlya Dryomov GFP_KERNEL); 1031bb23e37aSAlex Elder if (!object_prefix) 1032602adf40SYehuda Sadeh return -ENOMEM; 1033bb23e37aSAlex Elder } 103400f1f36fSAlex Elder 1035bb23e37aSAlex Elder /* Allocate the snapshot context and fill it in */ 1036d2bb24e5SAlex Elder 1037602adf40SYehuda Sadeh 
snap_count = le32_to_cpu(ondisk->snap_count); 1038bb23e37aSAlex Elder snapc = ceph_create_snap_context(snap_count, GFP_KERNEL); 1039bb23e37aSAlex Elder if (!snapc) 1040bb23e37aSAlex Elder goto out_err; 1041bb23e37aSAlex Elder snapc->seq = le64_to_cpu(ondisk->snap_seq); 1042602adf40SYehuda Sadeh if (snap_count) { 1043bb23e37aSAlex Elder struct rbd_image_snap_ondisk *snaps; 1044f785cc1dSAlex Elder u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len); 1045f785cc1dSAlex Elder 1046bb23e37aSAlex Elder /* We'll keep a copy of the snapshot names... */ 1047621901d6SAlex Elder 1048f785cc1dSAlex Elder if (snap_names_len > (u64)SIZE_MAX) 1049bb23e37aSAlex Elder goto out_2big; 1050bb23e37aSAlex Elder snap_names = kmalloc(snap_names_len, GFP_KERNEL); 1051bb23e37aSAlex Elder if (!snap_names) 1052602adf40SYehuda Sadeh goto out_err; 1053bb23e37aSAlex Elder 1054bb23e37aSAlex Elder /* ...as well as the array of their sizes. */ 105588a25a5fSMarkus Elfring snap_sizes = kmalloc_array(snap_count, 105688a25a5fSMarkus Elfring sizeof(*header->snap_sizes), 105788a25a5fSMarkus Elfring GFP_KERNEL); 1058bb23e37aSAlex Elder if (!snap_sizes) 1059bb23e37aSAlex Elder goto out_err; 1060bb23e37aSAlex Elder 1061f785cc1dSAlex Elder /* 1062bb23e37aSAlex Elder * Copy the names, and fill in each snapshot's id 1063bb23e37aSAlex Elder * and size. 1064bb23e37aSAlex Elder * 106599a41ebcSAlex Elder * Note that rbd_dev_v1_header_info() guarantees the 1066bb23e37aSAlex Elder * ondisk buffer we're working with has 1067f785cc1dSAlex Elder * snap_names_len bytes beyond the end of the 1068f785cc1dSAlex Elder * snapshot id array, this memcpy() is safe. 
1069f785cc1dSAlex Elder */ 1070bb23e37aSAlex Elder memcpy(snap_names, &ondisk->snaps[snap_count], snap_names_len); 1071bb23e37aSAlex Elder snaps = ondisk->snaps; 1072bb23e37aSAlex Elder for (i = 0; i < snap_count; i++) { 1073bb23e37aSAlex Elder snapc->snaps[i] = le64_to_cpu(snaps[i].id); 1074bb23e37aSAlex Elder snap_sizes[i] = le64_to_cpu(snaps[i].image_size); 1075bb23e37aSAlex Elder } 1076602adf40SYehuda Sadeh } 1077849b4260SAlex Elder 1078bb23e37aSAlex Elder /* We won't fail any more, fill in the header */ 1079bb23e37aSAlex Elder 1080bb23e37aSAlex Elder if (first_time) { 1081bb23e37aSAlex Elder header->object_prefix = object_prefix; 1082602adf40SYehuda Sadeh header->obj_order = ondisk->options.order; 1083263423f8SIlya Dryomov rbd_init_layout(rbd_dev); 1084662518b1SAlex Elder } else { 1085662518b1SAlex Elder ceph_put_snap_context(header->snapc); 1086662518b1SAlex Elder kfree(header->snap_names); 1087662518b1SAlex Elder kfree(header->snap_sizes); 1088bb23e37aSAlex Elder } 10896a52325fSAlex Elder 1090bb23e37aSAlex Elder /* The remaining fields always get updated (when we refresh) */ 1091621901d6SAlex Elder 1092f84344f3SAlex Elder header->image_size = le64_to_cpu(ondisk->image_size); 1093bb23e37aSAlex Elder header->snapc = snapc; 1094bb23e37aSAlex Elder header->snap_names = snap_names; 1095bb23e37aSAlex Elder header->snap_sizes = snap_sizes; 1096468521c1SAlex Elder 1097602adf40SYehuda Sadeh return 0; 1098bb23e37aSAlex Elder out_2big: 1099bb23e37aSAlex Elder ret = -EIO; 11006a52325fSAlex Elder out_err: 1101bb23e37aSAlex Elder kfree(snap_sizes); 1102bb23e37aSAlex Elder kfree(snap_names); 1103bb23e37aSAlex Elder ceph_put_snap_context(snapc); 1104bb23e37aSAlex Elder kfree(object_prefix); 1105ccece235SAlex Elder 1106bb23e37aSAlex Elder return ret; 1107602adf40SYehuda Sadeh } 1108602adf40SYehuda Sadeh 11099682fc6dSAlex Elder static const char *_rbd_dev_v1_snap_name(struct rbd_device *rbd_dev, u32 which) 11109682fc6dSAlex Elder { 11119682fc6dSAlex Elder const char 
*snap_name; 11129682fc6dSAlex Elder 11139682fc6dSAlex Elder rbd_assert(which < rbd_dev->header.snapc->num_snaps); 11149682fc6dSAlex Elder 11159682fc6dSAlex Elder /* Skip over names until we find the one we are looking for */ 11169682fc6dSAlex Elder 11179682fc6dSAlex Elder snap_name = rbd_dev->header.snap_names; 11189682fc6dSAlex Elder while (which--) 11199682fc6dSAlex Elder snap_name += strlen(snap_name) + 1; 11209682fc6dSAlex Elder 11219682fc6dSAlex Elder return kstrdup(snap_name, GFP_KERNEL); 11229682fc6dSAlex Elder } 11239682fc6dSAlex Elder 112430d1cff8SAlex Elder /* 112530d1cff8SAlex Elder * Snapshot id comparison function for use with qsort()/bsearch(). 112630d1cff8SAlex Elder * Note that result is for snapshots in *descending* order. 112730d1cff8SAlex Elder */ 112830d1cff8SAlex Elder static int snapid_compare_reverse(const void *s1, const void *s2) 112930d1cff8SAlex Elder { 113030d1cff8SAlex Elder u64 snap_id1 = *(u64 *)s1; 113130d1cff8SAlex Elder u64 snap_id2 = *(u64 *)s2; 113230d1cff8SAlex Elder 113330d1cff8SAlex Elder if (snap_id1 < snap_id2) 113430d1cff8SAlex Elder return 1; 113530d1cff8SAlex Elder return snap_id1 == snap_id2 ? 0 : -1; 113630d1cff8SAlex Elder } 113730d1cff8SAlex Elder 113830d1cff8SAlex Elder /* 113930d1cff8SAlex Elder * Search a snapshot context to see if the given snapshot id is 114030d1cff8SAlex Elder * present. 114130d1cff8SAlex Elder * 114230d1cff8SAlex Elder * Returns the position of the snapshot id in the array if it's found, 114330d1cff8SAlex Elder * or BAD_SNAP_INDEX otherwise. 114430d1cff8SAlex Elder * 114530d1cff8SAlex Elder * Note: The snapshot array is in kept sorted (by the osd) in 114630d1cff8SAlex Elder * reverse order, highest snapshot id first. 
114730d1cff8SAlex Elder */ 11489682fc6dSAlex Elder static u32 rbd_dev_snap_index(struct rbd_device *rbd_dev, u64 snap_id) 11499682fc6dSAlex Elder { 11509682fc6dSAlex Elder struct ceph_snap_context *snapc = rbd_dev->header.snapc; 115130d1cff8SAlex Elder u64 *found; 11529682fc6dSAlex Elder 115330d1cff8SAlex Elder found = bsearch(&snap_id, &snapc->snaps, snapc->num_snaps, 115430d1cff8SAlex Elder sizeof (snap_id), snapid_compare_reverse); 11559682fc6dSAlex Elder 115630d1cff8SAlex Elder return found ? (u32)(found - &snapc->snaps[0]) : BAD_SNAP_INDEX; 11579682fc6dSAlex Elder } 11589682fc6dSAlex Elder 11592ad3d716SAlex Elder static const char *rbd_dev_v1_snap_name(struct rbd_device *rbd_dev, 11602ad3d716SAlex Elder u64 snap_id) 116154cac61fSAlex Elder { 116254cac61fSAlex Elder u32 which; 1163da6a6b63SJosh Durgin const char *snap_name; 116454cac61fSAlex Elder 116554cac61fSAlex Elder which = rbd_dev_snap_index(rbd_dev, snap_id); 116654cac61fSAlex Elder if (which == BAD_SNAP_INDEX) 1167da6a6b63SJosh Durgin return ERR_PTR(-ENOENT); 116854cac61fSAlex Elder 1169da6a6b63SJosh Durgin snap_name = _rbd_dev_v1_snap_name(rbd_dev, which); 1170da6a6b63SJosh Durgin return snap_name ? 
snap_name : ERR_PTR(-ENOMEM); 117154cac61fSAlex Elder } 117254cac61fSAlex Elder 11739e15b77dSAlex Elder static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id) 11749e15b77dSAlex Elder { 11759e15b77dSAlex Elder if (snap_id == CEPH_NOSNAP) 11769e15b77dSAlex Elder return RBD_SNAP_HEAD_NAME; 11779e15b77dSAlex Elder 117854cac61fSAlex Elder rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); 117954cac61fSAlex Elder if (rbd_dev->image_format == 1) 118054cac61fSAlex Elder return rbd_dev_v1_snap_name(rbd_dev, snap_id); 11819e15b77dSAlex Elder 118254cac61fSAlex Elder return rbd_dev_v2_snap_name(rbd_dev, snap_id); 11839e15b77dSAlex Elder } 11849e15b77dSAlex Elder 11852ad3d716SAlex Elder static int rbd_snap_size(struct rbd_device *rbd_dev, u64 snap_id, 11862ad3d716SAlex Elder u64 *snap_size) 1187602adf40SYehuda Sadeh { 11882ad3d716SAlex Elder rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); 11892ad3d716SAlex Elder if (snap_id == CEPH_NOSNAP) { 11902ad3d716SAlex Elder *snap_size = rbd_dev->header.image_size; 11912ad3d716SAlex Elder } else if (rbd_dev->image_format == 1) { 11922ad3d716SAlex Elder u32 which; 119300f1f36fSAlex Elder 11942ad3d716SAlex Elder which = rbd_dev_snap_index(rbd_dev, snap_id); 11952ad3d716SAlex Elder if (which == BAD_SNAP_INDEX) 11962ad3d716SAlex Elder return -ENOENT; 119700f1f36fSAlex Elder 11982ad3d716SAlex Elder *snap_size = rbd_dev->header.snap_sizes[which]; 11992ad3d716SAlex Elder } else { 12002ad3d716SAlex Elder u64 size = 0; 12012ad3d716SAlex Elder int ret; 12022ad3d716SAlex Elder 12032ad3d716SAlex Elder ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, NULL, &size); 12042ad3d716SAlex Elder if (ret) 12052ad3d716SAlex Elder return ret; 12062ad3d716SAlex Elder 12072ad3d716SAlex Elder *snap_size = size; 12082ad3d716SAlex Elder } 12092ad3d716SAlex Elder return 0; 12102ad3d716SAlex Elder } 12112ad3d716SAlex Elder 12122ad3d716SAlex Elder static int rbd_snap_features(struct rbd_device *rbd_dev, u64 snap_id, 12132ad3d716SAlex 
Elder u64 *snap_features) 12142ad3d716SAlex Elder { 12152ad3d716SAlex Elder rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); 12162ad3d716SAlex Elder if (snap_id == CEPH_NOSNAP) { 12172ad3d716SAlex Elder *snap_features = rbd_dev->header.features; 12182ad3d716SAlex Elder } else if (rbd_dev->image_format == 1) { 12192ad3d716SAlex Elder *snap_features = 0; /* No features for format 1 */ 12202ad3d716SAlex Elder } else { 12212ad3d716SAlex Elder u64 features = 0; 12222ad3d716SAlex Elder int ret; 12232ad3d716SAlex Elder 12242ad3d716SAlex Elder ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, &features); 12252ad3d716SAlex Elder if (ret) 12262ad3d716SAlex Elder return ret; 12272ad3d716SAlex Elder 12282ad3d716SAlex Elder *snap_features = features; 12292ad3d716SAlex Elder } 12302ad3d716SAlex Elder return 0; 123100f1f36fSAlex Elder } 1232602adf40SYehuda Sadeh 1233d1cf5788SAlex Elder static int rbd_dev_mapping_set(struct rbd_device *rbd_dev) 1234602adf40SYehuda Sadeh { 12358f4b7d98SAlex Elder u64 snap_id = rbd_dev->spec->snap_id; 12362ad3d716SAlex Elder u64 size = 0; 12372ad3d716SAlex Elder u64 features = 0; 12382ad3d716SAlex Elder int ret; 12398b0241f8SAlex Elder 12402ad3d716SAlex Elder ret = rbd_snap_size(rbd_dev, snap_id, &size); 12412ad3d716SAlex Elder if (ret) 12422ad3d716SAlex Elder return ret; 12432ad3d716SAlex Elder ret = rbd_snap_features(rbd_dev, snap_id, &features); 12442ad3d716SAlex Elder if (ret) 12452ad3d716SAlex Elder return ret; 12462ad3d716SAlex Elder 12472ad3d716SAlex Elder rbd_dev->mapping.size = size; 12482ad3d716SAlex Elder rbd_dev->mapping.features = features; 12492ad3d716SAlex Elder 12508b0241f8SAlex Elder return 0; 1251602adf40SYehuda Sadeh } 1252602adf40SYehuda Sadeh 1253d1cf5788SAlex Elder static void rbd_dev_mapping_clear(struct rbd_device *rbd_dev) 1254d1cf5788SAlex Elder { 1255d1cf5788SAlex Elder rbd_dev->mapping.size = 0; 1256d1cf5788SAlex Elder rbd_dev->mapping.features = 0; 1257200a6a8bSAlex Elder } 1258200a6a8bSAlex Elder 
125965ccfe21SAlex Elder static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset) 126065ccfe21SAlex Elder { 12615bc3fb17SIlya Dryomov u64 segment_size = rbd_obj_bytes(&rbd_dev->header); 1262602adf40SYehuda Sadeh 126365ccfe21SAlex Elder return offset & (segment_size - 1); 126465ccfe21SAlex Elder } 126565ccfe21SAlex Elder 126665ccfe21SAlex Elder static u64 rbd_segment_length(struct rbd_device *rbd_dev, 126765ccfe21SAlex Elder u64 offset, u64 length) 126865ccfe21SAlex Elder { 12695bc3fb17SIlya Dryomov u64 segment_size = rbd_obj_bytes(&rbd_dev->header); 127065ccfe21SAlex Elder 127165ccfe21SAlex Elder offset &= segment_size - 1; 127265ccfe21SAlex Elder 1273aafb230eSAlex Elder rbd_assert(length <= U64_MAX - offset); 127465ccfe21SAlex Elder if (offset + length > segment_size) 127565ccfe21SAlex Elder length = segment_size - offset; 127665ccfe21SAlex Elder 127765ccfe21SAlex Elder return length; 1278602adf40SYehuda Sadeh } 1279602adf40SYehuda Sadeh 1280602adf40SYehuda Sadeh /* 1281602adf40SYehuda Sadeh * bio helpers 1282602adf40SYehuda Sadeh */ 1283602adf40SYehuda Sadeh 1284602adf40SYehuda Sadeh static void bio_chain_put(struct bio *chain) 1285602adf40SYehuda Sadeh { 1286602adf40SYehuda Sadeh struct bio *tmp; 1287602adf40SYehuda Sadeh 1288602adf40SYehuda Sadeh while (chain) { 1289602adf40SYehuda Sadeh tmp = chain; 1290602adf40SYehuda Sadeh chain = chain->bi_next; 1291602adf40SYehuda Sadeh bio_put(tmp); 1292602adf40SYehuda Sadeh } 1293602adf40SYehuda Sadeh } 1294602adf40SYehuda Sadeh 1295602adf40SYehuda Sadeh /* 1296602adf40SYehuda Sadeh * zeros a bio chain, starting at specific offset 1297602adf40SYehuda Sadeh */ 1298602adf40SYehuda Sadeh static void zero_bio_chain(struct bio *chain, int start_ofs) 1299602adf40SYehuda Sadeh { 13007988613bSKent Overstreet struct bio_vec bv; 13017988613bSKent Overstreet struct bvec_iter iter; 1302602adf40SYehuda Sadeh unsigned long flags; 1303602adf40SYehuda Sadeh void *buf; 1304602adf40SYehuda Sadeh int pos = 0; 
1305602adf40SYehuda Sadeh 1306602adf40SYehuda Sadeh while (chain) { 13077988613bSKent Overstreet bio_for_each_segment(bv, chain, iter) { 13087988613bSKent Overstreet if (pos + bv.bv_len > start_ofs) { 1309602adf40SYehuda Sadeh int remainder = max(start_ofs - pos, 0); 13107988613bSKent Overstreet buf = bvec_kmap_irq(&bv, &flags); 1311602adf40SYehuda Sadeh memset(buf + remainder, 0, 13127988613bSKent Overstreet bv.bv_len - remainder); 13137988613bSKent Overstreet flush_dcache_page(bv.bv_page); 131485b5aaa6SDan Carpenter bvec_kunmap_irq(buf, &flags); 1315602adf40SYehuda Sadeh } 13167988613bSKent Overstreet pos += bv.bv_len; 1317602adf40SYehuda Sadeh } 1318602adf40SYehuda Sadeh 1319602adf40SYehuda Sadeh chain = chain->bi_next; 1320602adf40SYehuda Sadeh } 1321602adf40SYehuda Sadeh } 1322602adf40SYehuda Sadeh 1323602adf40SYehuda Sadeh /* 1324b9434c5bSAlex Elder * similar to zero_bio_chain(), zeros data defined by a page array, 1325b9434c5bSAlex Elder * starting at the given byte offset from the start of the array and 1326b9434c5bSAlex Elder * continuing up to the given end offset. The pages array is 1327b9434c5bSAlex Elder * assumed to be big enough to hold all bytes up to the end. 
1328b9434c5bSAlex Elder */ 1329b9434c5bSAlex Elder static void zero_pages(struct page **pages, u64 offset, u64 end) 1330b9434c5bSAlex Elder { 1331b9434c5bSAlex Elder struct page **page = &pages[offset >> PAGE_SHIFT]; 1332b9434c5bSAlex Elder 1333b9434c5bSAlex Elder rbd_assert(end > offset); 1334b9434c5bSAlex Elder rbd_assert(end - offset <= (u64)SIZE_MAX); 1335b9434c5bSAlex Elder while (offset < end) { 1336b9434c5bSAlex Elder size_t page_offset; 1337b9434c5bSAlex Elder size_t length; 1338b9434c5bSAlex Elder unsigned long flags; 1339b9434c5bSAlex Elder void *kaddr; 1340b9434c5bSAlex Elder 1341491205a8SGeert Uytterhoeven page_offset = offset & ~PAGE_MASK; 1342491205a8SGeert Uytterhoeven length = min_t(size_t, PAGE_SIZE - page_offset, end - offset); 1343b9434c5bSAlex Elder local_irq_save(flags); 1344b9434c5bSAlex Elder kaddr = kmap_atomic(*page); 1345b9434c5bSAlex Elder memset(kaddr + page_offset, 0, length); 1346e2156054SAlex Elder flush_dcache_page(*page); 1347b9434c5bSAlex Elder kunmap_atomic(kaddr); 1348b9434c5bSAlex Elder local_irq_restore(flags); 1349b9434c5bSAlex Elder 1350b9434c5bSAlex Elder offset += length; 1351b9434c5bSAlex Elder page++; 1352b9434c5bSAlex Elder } 1353b9434c5bSAlex Elder } 1354b9434c5bSAlex Elder 1355b9434c5bSAlex Elder /* 1356f7760dadSAlex Elder * Clone a portion of a bio, starting at the given byte offset 1357f7760dadSAlex Elder * and continuing for the number of bytes indicated. 
1358602adf40SYehuda Sadeh */ 1359f7760dadSAlex Elder static struct bio *bio_clone_range(struct bio *bio_src, 1360f7760dadSAlex Elder unsigned int offset, 1361f7760dadSAlex Elder unsigned int len, 1362f7760dadSAlex Elder gfp_t gfpmask) 1363602adf40SYehuda Sadeh { 1364f7760dadSAlex Elder struct bio *bio; 1365602adf40SYehuda Sadeh 13665341a627SKent Overstreet bio = bio_clone(bio_src, gfpmask); 1367f7760dadSAlex Elder if (!bio) 1368f7760dadSAlex Elder return NULL; /* ENOMEM */ 1369f7760dadSAlex Elder 13705341a627SKent Overstreet bio_advance(bio, offset); 13714f024f37SKent Overstreet bio->bi_iter.bi_size = len; 1372602adf40SYehuda Sadeh 1373f7760dadSAlex Elder return bio; 1374602adf40SYehuda Sadeh } 1375602adf40SYehuda Sadeh 1376f7760dadSAlex Elder /* 1377f7760dadSAlex Elder * Clone a portion of a bio chain, starting at the given byte offset 1378f7760dadSAlex Elder * into the first bio in the source chain and continuing for the 1379f7760dadSAlex Elder * number of bytes indicated. The result is another bio chain of 1380f7760dadSAlex Elder * exactly the given length, or a null pointer on error. 1381f7760dadSAlex Elder * 1382f7760dadSAlex Elder * The bio_src and offset parameters are both in-out. On entry they 1383f7760dadSAlex Elder * refer to the first source bio and the offset into that bio where 1384f7760dadSAlex Elder * the start of data to be cloned is located. 1385f7760dadSAlex Elder * 1386f7760dadSAlex Elder * On return, bio_src is updated to refer to the bio in the source 1387f7760dadSAlex Elder * chain that contains first un-cloned byte, and *offset will 1388f7760dadSAlex Elder * contain the offset of that byte within that bio. 
1389f7760dadSAlex Elder */ 1390f7760dadSAlex Elder static struct bio *bio_chain_clone_range(struct bio **bio_src, 1391f7760dadSAlex Elder unsigned int *offset, 1392f7760dadSAlex Elder unsigned int len, 1393f7760dadSAlex Elder gfp_t gfpmask) 1394f7760dadSAlex Elder { 1395f7760dadSAlex Elder struct bio *bi = *bio_src; 1396f7760dadSAlex Elder unsigned int off = *offset; 1397f7760dadSAlex Elder struct bio *chain = NULL; 1398f7760dadSAlex Elder struct bio **end; 1399602adf40SYehuda Sadeh 1400f7760dadSAlex Elder /* Build up a chain of clone bios up to the limit */ 1401602adf40SYehuda Sadeh 14024f024f37SKent Overstreet if (!bi || off >= bi->bi_iter.bi_size || !len) 1403f7760dadSAlex Elder return NULL; /* Nothing to clone */ 1404602adf40SYehuda Sadeh 1405f7760dadSAlex Elder end = &chain; 1406f7760dadSAlex Elder while (len) { 1407f7760dadSAlex Elder unsigned int bi_size; 1408f7760dadSAlex Elder struct bio *bio; 1409f7760dadSAlex Elder 1410f5400b7aSAlex Elder if (!bi) { 1411f5400b7aSAlex Elder rbd_warn(NULL, "bio_chain exhausted with %u left", len); 1412f7760dadSAlex Elder goto out_err; /* EINVAL; ran out of bio's */ 1413f5400b7aSAlex Elder } 14144f024f37SKent Overstreet bi_size = min_t(unsigned int, bi->bi_iter.bi_size - off, len); 1415f7760dadSAlex Elder bio = bio_clone_range(bi, off, bi_size, gfpmask); 1416f7760dadSAlex Elder if (!bio) 1417f7760dadSAlex Elder goto out_err; /* ENOMEM */ 1418f7760dadSAlex Elder 1419f7760dadSAlex Elder *end = bio; 1420f7760dadSAlex Elder end = &bio->bi_next; 1421f7760dadSAlex Elder 1422f7760dadSAlex Elder off += bi_size; 14234f024f37SKent Overstreet if (off == bi->bi_iter.bi_size) { 1424f7760dadSAlex Elder bi = bi->bi_next; 1425f7760dadSAlex Elder off = 0; 1426f7760dadSAlex Elder } 1427f7760dadSAlex Elder len -= bi_size; 1428f7760dadSAlex Elder } 1429f7760dadSAlex Elder *bio_src = bi; 1430f7760dadSAlex Elder *offset = off; 1431f7760dadSAlex Elder 1432f7760dadSAlex Elder return chain; 1433f7760dadSAlex Elder out_err: 1434f7760dadSAlex Elder 
bio_chain_put(chain); 1435f7760dadSAlex Elder 1436602adf40SYehuda Sadeh return NULL; 1437602adf40SYehuda Sadeh } 1438602adf40SYehuda Sadeh 1439926f9b3fSAlex Elder /* 1440926f9b3fSAlex Elder * The default/initial value for all object request flags is 0. For 1441926f9b3fSAlex Elder * each flag, once its value is set to 1 it is never reset to 0 1442926f9b3fSAlex Elder * again. 1443926f9b3fSAlex Elder */ 14446365d33aSAlex Elder static void obj_request_img_data_set(struct rbd_obj_request *obj_request) 14456365d33aSAlex Elder { 14466365d33aSAlex Elder if (test_and_set_bit(OBJ_REQ_IMG_DATA, &obj_request->flags)) { 14476365d33aSAlex Elder struct rbd_device *rbd_dev; 14486365d33aSAlex Elder 144957acbaa7SAlex Elder rbd_dev = obj_request->img_request->rbd_dev; 14509584d508SIlya Dryomov rbd_warn(rbd_dev, "obj_request %p already marked img_data", 14516365d33aSAlex Elder obj_request); 14526365d33aSAlex Elder } 14536365d33aSAlex Elder } 14546365d33aSAlex Elder 14556365d33aSAlex Elder static bool obj_request_img_data_test(struct rbd_obj_request *obj_request) 14566365d33aSAlex Elder { 14576365d33aSAlex Elder smp_mb(); 14586365d33aSAlex Elder return test_bit(OBJ_REQ_IMG_DATA, &obj_request->flags) != 0; 14596365d33aSAlex Elder } 14606365d33aSAlex Elder 146157acbaa7SAlex Elder static void obj_request_done_set(struct rbd_obj_request *obj_request) 146257acbaa7SAlex Elder { 146357acbaa7SAlex Elder if (test_and_set_bit(OBJ_REQ_DONE, &obj_request->flags)) { 146457acbaa7SAlex Elder struct rbd_device *rbd_dev = NULL; 146557acbaa7SAlex Elder 146657acbaa7SAlex Elder if (obj_request_img_data_test(obj_request)) 146757acbaa7SAlex Elder rbd_dev = obj_request->img_request->rbd_dev; 14689584d508SIlya Dryomov rbd_warn(rbd_dev, "obj_request %p already marked done", 146957acbaa7SAlex Elder obj_request); 147057acbaa7SAlex Elder } 147157acbaa7SAlex Elder } 147257acbaa7SAlex Elder 147357acbaa7SAlex Elder static bool obj_request_done_test(struct rbd_obj_request *obj_request) 147457acbaa7SAlex Elder { 
147557acbaa7SAlex Elder smp_mb(); 147657acbaa7SAlex Elder return test_bit(OBJ_REQ_DONE, &obj_request->flags) != 0; 147757acbaa7SAlex Elder } 147857acbaa7SAlex Elder 14795679c59fSAlex Elder /* 14805679c59fSAlex Elder * This sets the KNOWN flag after (possibly) setting the EXISTS 14815679c59fSAlex Elder * flag. The latter is set based on the "exists" value provided. 14825679c59fSAlex Elder * 14835679c59fSAlex Elder * Note that for our purposes once an object exists it never goes 14845679c59fSAlex Elder * away again. It's possible that the response from two existence 14855679c59fSAlex Elder * checks are separated by the creation of the target object, and 14865679c59fSAlex Elder * the first ("doesn't exist") response arrives *after* the second 14875679c59fSAlex Elder * ("does exist"). In that case we ignore the second one. 14885679c59fSAlex Elder */ 14895679c59fSAlex Elder static void obj_request_existence_set(struct rbd_obj_request *obj_request, 14905679c59fSAlex Elder bool exists) 14915679c59fSAlex Elder { 14925679c59fSAlex Elder if (exists) 14935679c59fSAlex Elder set_bit(OBJ_REQ_EXISTS, &obj_request->flags); 14945679c59fSAlex Elder set_bit(OBJ_REQ_KNOWN, &obj_request->flags); 14955679c59fSAlex Elder smp_mb(); 14965679c59fSAlex Elder } 14975679c59fSAlex Elder 14985679c59fSAlex Elder static bool obj_request_known_test(struct rbd_obj_request *obj_request) 14995679c59fSAlex Elder { 15005679c59fSAlex Elder smp_mb(); 15015679c59fSAlex Elder return test_bit(OBJ_REQ_KNOWN, &obj_request->flags) != 0; 15025679c59fSAlex Elder } 15035679c59fSAlex Elder 15045679c59fSAlex Elder static bool obj_request_exists_test(struct rbd_obj_request *obj_request) 15055679c59fSAlex Elder { 15065679c59fSAlex Elder smp_mb(); 15075679c59fSAlex Elder return test_bit(OBJ_REQ_EXISTS, &obj_request->flags) != 0; 15085679c59fSAlex Elder } 15095679c59fSAlex Elder 15109638556aSIlya Dryomov static bool obj_request_overlaps_parent(struct rbd_obj_request *obj_request) 15119638556aSIlya Dryomov { 
15129638556aSIlya Dryomov struct rbd_device *rbd_dev = obj_request->img_request->rbd_dev; 15139638556aSIlya Dryomov 15149638556aSIlya Dryomov return obj_request->img_offset < 15159638556aSIlya Dryomov round_up(rbd_dev->parent_overlap, rbd_obj_bytes(&rbd_dev->header)); 15169638556aSIlya Dryomov } 15179638556aSIlya Dryomov 1518bf0d5f50SAlex Elder static void rbd_obj_request_get(struct rbd_obj_request *obj_request) 1519bf0d5f50SAlex Elder { 152037206ee5SAlex Elder dout("%s: obj %p (was %d)\n", __func__, obj_request, 15212c935bc5SPeter Zijlstra kref_read(&obj_request->kref)); 1522bf0d5f50SAlex Elder kref_get(&obj_request->kref); 1523bf0d5f50SAlex Elder } 1524bf0d5f50SAlex Elder 1525bf0d5f50SAlex Elder static void rbd_obj_request_destroy(struct kref *kref); 1526bf0d5f50SAlex Elder static void rbd_obj_request_put(struct rbd_obj_request *obj_request) 1527bf0d5f50SAlex Elder { 1528bf0d5f50SAlex Elder rbd_assert(obj_request != NULL); 152937206ee5SAlex Elder dout("%s: obj %p (was %d)\n", __func__, obj_request, 15302c935bc5SPeter Zijlstra kref_read(&obj_request->kref)); 1531bf0d5f50SAlex Elder kref_put(&obj_request->kref, rbd_obj_request_destroy); 1532bf0d5f50SAlex Elder } 1533bf0d5f50SAlex Elder 15340f2d5be7SAlex Elder static void rbd_img_request_get(struct rbd_img_request *img_request) 15350f2d5be7SAlex Elder { 15360f2d5be7SAlex Elder dout("%s: img %p (was %d)\n", __func__, img_request, 15372c935bc5SPeter Zijlstra kref_read(&img_request->kref)); 15380f2d5be7SAlex Elder kref_get(&img_request->kref); 15390f2d5be7SAlex Elder } 15400f2d5be7SAlex Elder 1541e93f3152SAlex Elder static bool img_request_child_test(struct rbd_img_request *img_request); 1542e93f3152SAlex Elder static void rbd_parent_request_destroy(struct kref *kref); 1543bf0d5f50SAlex Elder static void rbd_img_request_destroy(struct kref *kref); 1544bf0d5f50SAlex Elder static void rbd_img_request_put(struct rbd_img_request *img_request) 1545bf0d5f50SAlex Elder { 1546bf0d5f50SAlex Elder rbd_assert(img_request != 
NULL); 154737206ee5SAlex Elder dout("%s: img %p (was %d)\n", __func__, img_request, 15482c935bc5SPeter Zijlstra kref_read(&img_request->kref)); 1549e93f3152SAlex Elder if (img_request_child_test(img_request)) 1550e93f3152SAlex Elder kref_put(&img_request->kref, rbd_parent_request_destroy); 1551e93f3152SAlex Elder else 1552bf0d5f50SAlex Elder kref_put(&img_request->kref, rbd_img_request_destroy); 1553bf0d5f50SAlex Elder } 1554bf0d5f50SAlex Elder 1555bf0d5f50SAlex Elder static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request, 1556bf0d5f50SAlex Elder struct rbd_obj_request *obj_request) 1557bf0d5f50SAlex Elder { 155825dcf954SAlex Elder rbd_assert(obj_request->img_request == NULL); 155925dcf954SAlex Elder 1560b155e86cSAlex Elder /* Image request now owns object's original reference */ 1561bf0d5f50SAlex Elder obj_request->img_request = img_request; 156225dcf954SAlex Elder obj_request->which = img_request->obj_request_count; 15636365d33aSAlex Elder rbd_assert(!obj_request_img_data_test(obj_request)); 15646365d33aSAlex Elder obj_request_img_data_set(obj_request); 1565bf0d5f50SAlex Elder rbd_assert(obj_request->which != BAD_WHICH); 156625dcf954SAlex Elder img_request->obj_request_count++; 156725dcf954SAlex Elder list_add_tail(&obj_request->links, &img_request->obj_requests); 156837206ee5SAlex Elder dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request, 156937206ee5SAlex Elder obj_request->which); 1570bf0d5f50SAlex Elder } 1571bf0d5f50SAlex Elder 1572bf0d5f50SAlex Elder static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request, 1573bf0d5f50SAlex Elder struct rbd_obj_request *obj_request) 1574bf0d5f50SAlex Elder { 1575bf0d5f50SAlex Elder rbd_assert(obj_request->which != BAD_WHICH); 157625dcf954SAlex Elder 157737206ee5SAlex Elder dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request, 157837206ee5SAlex Elder obj_request->which); 1579bf0d5f50SAlex Elder list_del(&obj_request->links); 158025dcf954SAlex Elder 
rbd_assert(img_request->obj_request_count > 0); 158125dcf954SAlex Elder img_request->obj_request_count--; 158225dcf954SAlex Elder rbd_assert(obj_request->which == img_request->obj_request_count); 158325dcf954SAlex Elder obj_request->which = BAD_WHICH; 15846365d33aSAlex Elder rbd_assert(obj_request_img_data_test(obj_request)); 1585bf0d5f50SAlex Elder rbd_assert(obj_request->img_request == img_request); 1586bf0d5f50SAlex Elder obj_request->img_request = NULL; 158725dcf954SAlex Elder obj_request->callback = NULL; 1588bf0d5f50SAlex Elder rbd_obj_request_put(obj_request); 1589bf0d5f50SAlex Elder } 1590bf0d5f50SAlex Elder 1591bf0d5f50SAlex Elder static bool obj_request_type_valid(enum obj_request_type type) 1592bf0d5f50SAlex Elder { 1593bf0d5f50SAlex Elder switch (type) { 15949969ebc5SAlex Elder case OBJ_REQUEST_NODATA: 1595bf0d5f50SAlex Elder case OBJ_REQUEST_BIO: 1596788e2df3SAlex Elder case OBJ_REQUEST_PAGES: 1597bf0d5f50SAlex Elder return true; 1598bf0d5f50SAlex Elder default: 1599bf0d5f50SAlex Elder return false; 1600bf0d5f50SAlex Elder } 1601bf0d5f50SAlex Elder } 1602bf0d5f50SAlex Elder 16034a17dadcSIlya Dryomov static void rbd_img_obj_callback(struct rbd_obj_request *obj_request); 16044a17dadcSIlya Dryomov 1605980917fcSIlya Dryomov static void rbd_obj_request_submit(struct rbd_obj_request *obj_request) 1606bf0d5f50SAlex Elder { 1607980917fcSIlya Dryomov struct ceph_osd_request *osd_req = obj_request->osd_req; 1608980917fcSIlya Dryomov 1609a90bb0c1SIlya Dryomov dout("%s %p object_no %016llx %llu~%llu osd_req %p\n", __func__, 1610a90bb0c1SIlya Dryomov obj_request, obj_request->object_no, obj_request->offset, 161167e2b652SIlya Dryomov obj_request->length, osd_req); 16124a17dadcSIlya Dryomov if (obj_request_img_data_test(obj_request)) { 16134a17dadcSIlya Dryomov WARN_ON(obj_request->callback != rbd_img_obj_callback); 16144a17dadcSIlya Dryomov rbd_img_request_get(obj_request->img_request); 16154a17dadcSIlya Dryomov } 1616980917fcSIlya Dryomov 
ceph_osdc_start_request(osd_req->r_osdc, osd_req, false); 1617bf0d5f50SAlex Elder } 1618bf0d5f50SAlex Elder 1619bf0d5f50SAlex Elder static void rbd_img_request_complete(struct rbd_img_request *img_request) 1620bf0d5f50SAlex Elder { 162155f27e09SAlex Elder 162237206ee5SAlex Elder dout("%s: img %p\n", __func__, img_request); 162355f27e09SAlex Elder 162455f27e09SAlex Elder /* 162555f27e09SAlex Elder * If no error occurred, compute the aggregate transfer 162655f27e09SAlex Elder * count for the image request. We could instead use 162755f27e09SAlex Elder * atomic64_cmpxchg() to update it as each object request 162855f27e09SAlex Elder * completes; not clear which way is better off hand. 162955f27e09SAlex Elder */ 163055f27e09SAlex Elder if (!img_request->result) { 163155f27e09SAlex Elder struct rbd_obj_request *obj_request; 163255f27e09SAlex Elder u64 xferred = 0; 163355f27e09SAlex Elder 163455f27e09SAlex Elder for_each_obj_request(img_request, obj_request) 163555f27e09SAlex Elder xferred += obj_request->xferred; 163655f27e09SAlex Elder img_request->xferred = xferred; 163755f27e09SAlex Elder } 163855f27e09SAlex Elder 1639bf0d5f50SAlex Elder if (img_request->callback) 1640bf0d5f50SAlex Elder img_request->callback(img_request); 1641bf0d5f50SAlex Elder else 1642bf0d5f50SAlex Elder rbd_img_request_put(img_request); 1643bf0d5f50SAlex Elder } 1644bf0d5f50SAlex Elder 16450c425248SAlex Elder /* 16460c425248SAlex Elder * The default/initial value for all image request flags is 0. Each 16470c425248SAlex Elder * is conditionally set to 1 at image request initialization time 16480c425248SAlex Elder * and currently never change thereafter. 
16490c425248SAlex Elder */ 16500c425248SAlex Elder static void img_request_write_set(struct rbd_img_request *img_request) 16510c425248SAlex Elder { 16520c425248SAlex Elder set_bit(IMG_REQ_WRITE, &img_request->flags); 16530c425248SAlex Elder smp_mb(); 16540c425248SAlex Elder } 16550c425248SAlex Elder 16560c425248SAlex Elder static bool img_request_write_test(struct rbd_img_request *img_request) 16570c425248SAlex Elder { 16580c425248SAlex Elder smp_mb(); 16590c425248SAlex Elder return test_bit(IMG_REQ_WRITE, &img_request->flags) != 0; 16600c425248SAlex Elder } 16610c425248SAlex Elder 166290e98c52SGuangliang Zhao /* 166390e98c52SGuangliang Zhao * Set the discard flag when the img_request is an discard request 166490e98c52SGuangliang Zhao */ 166590e98c52SGuangliang Zhao static void img_request_discard_set(struct rbd_img_request *img_request) 166690e98c52SGuangliang Zhao { 166790e98c52SGuangliang Zhao set_bit(IMG_REQ_DISCARD, &img_request->flags); 166890e98c52SGuangliang Zhao smp_mb(); 166990e98c52SGuangliang Zhao } 167090e98c52SGuangliang Zhao 167190e98c52SGuangliang Zhao static bool img_request_discard_test(struct rbd_img_request *img_request) 167290e98c52SGuangliang Zhao { 167390e98c52SGuangliang Zhao smp_mb(); 167490e98c52SGuangliang Zhao return test_bit(IMG_REQ_DISCARD, &img_request->flags) != 0; 167590e98c52SGuangliang Zhao } 167690e98c52SGuangliang Zhao 16779849e986SAlex Elder static void img_request_child_set(struct rbd_img_request *img_request) 16789849e986SAlex Elder { 16799849e986SAlex Elder set_bit(IMG_REQ_CHILD, &img_request->flags); 16809849e986SAlex Elder smp_mb(); 16819849e986SAlex Elder } 16829849e986SAlex Elder 1683e93f3152SAlex Elder static void img_request_child_clear(struct rbd_img_request *img_request) 1684e93f3152SAlex Elder { 1685e93f3152SAlex Elder clear_bit(IMG_REQ_CHILD, &img_request->flags); 1686e93f3152SAlex Elder smp_mb(); 1687e93f3152SAlex Elder } 1688e93f3152SAlex Elder 16899849e986SAlex Elder static bool img_request_child_test(struct 
rbd_img_request *img_request) 16909849e986SAlex Elder { 16919849e986SAlex Elder smp_mb(); 16929849e986SAlex Elder return test_bit(IMG_REQ_CHILD, &img_request->flags) != 0; 16939849e986SAlex Elder } 16949849e986SAlex Elder 1695d0b2e944SAlex Elder static void img_request_layered_set(struct rbd_img_request *img_request) 1696d0b2e944SAlex Elder { 1697d0b2e944SAlex Elder set_bit(IMG_REQ_LAYERED, &img_request->flags); 1698d0b2e944SAlex Elder smp_mb(); 1699d0b2e944SAlex Elder } 1700d0b2e944SAlex Elder 1701a2acd00eSAlex Elder static void img_request_layered_clear(struct rbd_img_request *img_request) 1702a2acd00eSAlex Elder { 1703a2acd00eSAlex Elder clear_bit(IMG_REQ_LAYERED, &img_request->flags); 1704a2acd00eSAlex Elder smp_mb(); 1705a2acd00eSAlex Elder } 1706a2acd00eSAlex Elder 1707d0b2e944SAlex Elder static bool img_request_layered_test(struct rbd_img_request *img_request) 1708d0b2e944SAlex Elder { 1709d0b2e944SAlex Elder smp_mb(); 1710d0b2e944SAlex Elder return test_bit(IMG_REQ_LAYERED, &img_request->flags) != 0; 1711d0b2e944SAlex Elder } 1712d0b2e944SAlex Elder 17133b434a2aSJosh Durgin static enum obj_operation_type 17143b434a2aSJosh Durgin rbd_img_request_op_type(struct rbd_img_request *img_request) 17153b434a2aSJosh Durgin { 17163b434a2aSJosh Durgin if (img_request_write_test(img_request)) 17173b434a2aSJosh Durgin return OBJ_OP_WRITE; 17183b434a2aSJosh Durgin else if (img_request_discard_test(img_request)) 17193b434a2aSJosh Durgin return OBJ_OP_DISCARD; 17203b434a2aSJosh Durgin else 17213b434a2aSJosh Durgin return OBJ_OP_READ; 17223b434a2aSJosh Durgin } 17233b434a2aSJosh Durgin 17246e2a4505SAlex Elder static void 17256e2a4505SAlex Elder rbd_img_obj_request_read_callback(struct rbd_obj_request *obj_request) 17266e2a4505SAlex Elder { 1727b9434c5bSAlex Elder u64 xferred = obj_request->xferred; 1728b9434c5bSAlex Elder u64 length = obj_request->length; 1729b9434c5bSAlex Elder 17306e2a4505SAlex Elder dout("%s: obj %p img %p result %d %llu/%llu\n", __func__, 
17316e2a4505SAlex Elder obj_request, obj_request->img_request, obj_request->result, 1732b9434c5bSAlex Elder xferred, length); 17336e2a4505SAlex Elder /* 173417c1cc1dSJosh Durgin * ENOENT means a hole in the image. We zero-fill the entire 173517c1cc1dSJosh Durgin * length of the request. A short read also implies zero-fill 173617c1cc1dSJosh Durgin * to the end of the request. An error requires the whole 173717c1cc1dSJosh Durgin * length of the request to be reported finished with an error 173817c1cc1dSJosh Durgin * to the block layer. In each case we update the xferred 173917c1cc1dSJosh Durgin * count to indicate the whole request was satisfied. 17406e2a4505SAlex Elder */ 1741b9434c5bSAlex Elder rbd_assert(obj_request->type != OBJ_REQUEST_NODATA); 17426e2a4505SAlex Elder if (obj_request->result == -ENOENT) { 1743b9434c5bSAlex Elder if (obj_request->type == OBJ_REQUEST_BIO) 17446e2a4505SAlex Elder zero_bio_chain(obj_request->bio_list, 0); 1745b9434c5bSAlex Elder else 1746b9434c5bSAlex Elder zero_pages(obj_request->pages, 0, length); 17476e2a4505SAlex Elder obj_request->result = 0; 1748b9434c5bSAlex Elder } else if (xferred < length && !obj_request->result) { 1749b9434c5bSAlex Elder if (obj_request->type == OBJ_REQUEST_BIO) 1750b9434c5bSAlex Elder zero_bio_chain(obj_request->bio_list, xferred); 1751b9434c5bSAlex Elder else 1752b9434c5bSAlex Elder zero_pages(obj_request->pages, xferred, length); 17536e2a4505SAlex Elder } 175417c1cc1dSJosh Durgin obj_request->xferred = length; 17556e2a4505SAlex Elder obj_request_done_set(obj_request); 17566e2a4505SAlex Elder } 17576e2a4505SAlex Elder 1758bf0d5f50SAlex Elder static void rbd_obj_request_complete(struct rbd_obj_request *obj_request) 1759bf0d5f50SAlex Elder { 176037206ee5SAlex Elder dout("%s: obj %p cb %p\n", __func__, obj_request, 176137206ee5SAlex Elder obj_request->callback); 1762bf0d5f50SAlex Elder if (obj_request->callback) 1763bf0d5f50SAlex Elder obj_request->callback(obj_request); 1764788e2df3SAlex Elder else 
1765788e2df3SAlex Elder complete_all(&obj_request->completion); 1766bf0d5f50SAlex Elder } 1767bf0d5f50SAlex Elder 17680dcc685eSIlya Dryomov static void rbd_obj_request_error(struct rbd_obj_request *obj_request, int err) 17690dcc685eSIlya Dryomov { 17700dcc685eSIlya Dryomov obj_request->result = err; 17710dcc685eSIlya Dryomov obj_request->xferred = 0; 17720dcc685eSIlya Dryomov /* 17730dcc685eSIlya Dryomov * kludge - mirror rbd_obj_request_submit() to match a put in 17740dcc685eSIlya Dryomov * rbd_img_obj_callback() 17750dcc685eSIlya Dryomov */ 17760dcc685eSIlya Dryomov if (obj_request_img_data_test(obj_request)) { 17770dcc685eSIlya Dryomov WARN_ON(obj_request->callback != rbd_img_obj_callback); 17780dcc685eSIlya Dryomov rbd_img_request_get(obj_request->img_request); 17790dcc685eSIlya Dryomov } 17800dcc685eSIlya Dryomov obj_request_done_set(obj_request); 17810dcc685eSIlya Dryomov rbd_obj_request_complete(obj_request); 17820dcc685eSIlya Dryomov } 17830dcc685eSIlya Dryomov 1784c47f9371SAlex Elder static void rbd_osd_read_callback(struct rbd_obj_request *obj_request) 1785bf0d5f50SAlex Elder { 178657acbaa7SAlex Elder struct rbd_img_request *img_request = NULL; 1787a9e8ba2cSAlex Elder struct rbd_device *rbd_dev = NULL; 178857acbaa7SAlex Elder bool layered = false; 178957acbaa7SAlex Elder 179057acbaa7SAlex Elder if (obj_request_img_data_test(obj_request)) { 179157acbaa7SAlex Elder img_request = obj_request->img_request; 179257acbaa7SAlex Elder layered = img_request && img_request_layered_test(img_request); 1793a9e8ba2cSAlex Elder rbd_dev = img_request->rbd_dev; 179457acbaa7SAlex Elder } 17958b3e1a56SAlex Elder 17968b3e1a56SAlex Elder dout("%s: obj %p img %p result %d %llu/%llu\n", __func__, 17978b3e1a56SAlex Elder obj_request, img_request, obj_request->result, 17988b3e1a56SAlex Elder obj_request->xferred, obj_request->length); 1799a9e8ba2cSAlex Elder if (layered && obj_request->result == -ENOENT && 1800a9e8ba2cSAlex Elder obj_request->img_offset < rbd_dev->parent_overlap) 
18018b3e1a56SAlex Elder rbd_img_parent_read(obj_request); 18028b3e1a56SAlex Elder else if (img_request) 18036e2a4505SAlex Elder rbd_img_obj_request_read_callback(obj_request); 18046e2a4505SAlex Elder else 180507741308SAlex Elder obj_request_done_set(obj_request); 1806bf0d5f50SAlex Elder } 1807bf0d5f50SAlex Elder 1808c47f9371SAlex Elder static void rbd_osd_write_callback(struct rbd_obj_request *obj_request) 1809bf0d5f50SAlex Elder { 18101b83bef2SSage Weil dout("%s: obj %p result %d %llu\n", __func__, obj_request, 18111b83bef2SSage Weil obj_request->result, obj_request->length); 18121b83bef2SSage Weil /* 18138b3e1a56SAlex Elder * There is no such thing as a successful short write. Set 18148b3e1a56SAlex Elder * it to our originally-requested length. 18151b83bef2SSage Weil */ 18161b83bef2SSage Weil obj_request->xferred = obj_request->length; 181707741308SAlex Elder obj_request_done_set(obj_request); 1818bf0d5f50SAlex Elder } 1819bf0d5f50SAlex Elder 182090e98c52SGuangliang Zhao static void rbd_osd_discard_callback(struct rbd_obj_request *obj_request) 182190e98c52SGuangliang Zhao { 182290e98c52SGuangliang Zhao dout("%s: obj %p result %d %llu\n", __func__, obj_request, 182390e98c52SGuangliang Zhao obj_request->result, obj_request->length); 182490e98c52SGuangliang Zhao /* 182590e98c52SGuangliang Zhao * There is no such thing as a successful short discard. Set 182690e98c52SGuangliang Zhao * it to our originally-requested length. 182790e98c52SGuangliang Zhao */ 182890e98c52SGuangliang Zhao obj_request->xferred = obj_request->length; 1829d0265de7SJosh Durgin /* discarding a non-existent object is not a problem */ 1830d0265de7SJosh Durgin if (obj_request->result == -ENOENT) 1831d0265de7SJosh Durgin obj_request->result = 0; 183290e98c52SGuangliang Zhao obj_request_done_set(obj_request); 183390e98c52SGuangliang Zhao } 183490e98c52SGuangliang Zhao 1835fbfab539SAlex Elder /* 1836fbfab539SAlex Elder * For a simple stat call there's nothing to do. 
We'll do more if 1837fbfab539SAlex Elder * this is part of a write sequence for a layered image. 1838fbfab539SAlex Elder */ 1839c47f9371SAlex Elder static void rbd_osd_stat_callback(struct rbd_obj_request *obj_request) 1840fbfab539SAlex Elder { 184137206ee5SAlex Elder dout("%s: obj %p\n", __func__, obj_request); 1842fbfab539SAlex Elder obj_request_done_set(obj_request); 1843fbfab539SAlex Elder } 1844fbfab539SAlex Elder 18452761713dSIlya Dryomov static void rbd_osd_call_callback(struct rbd_obj_request *obj_request) 18462761713dSIlya Dryomov { 18472761713dSIlya Dryomov dout("%s: obj %p\n", __func__, obj_request); 18482761713dSIlya Dryomov 18492761713dSIlya Dryomov if (obj_request_img_data_test(obj_request)) 18502761713dSIlya Dryomov rbd_osd_copyup_callback(obj_request); 18512761713dSIlya Dryomov else 18522761713dSIlya Dryomov obj_request_done_set(obj_request); 18532761713dSIlya Dryomov } 18542761713dSIlya Dryomov 185585e084feSIlya Dryomov static void rbd_osd_req_callback(struct ceph_osd_request *osd_req) 1856bf0d5f50SAlex Elder { 1857bf0d5f50SAlex Elder struct rbd_obj_request *obj_request = osd_req->r_priv; 1858bf0d5f50SAlex Elder u16 opcode; 1859bf0d5f50SAlex Elder 186085e084feSIlya Dryomov dout("%s: osd_req %p\n", __func__, osd_req); 1861bf0d5f50SAlex Elder rbd_assert(osd_req == obj_request->osd_req); 186257acbaa7SAlex Elder if (obj_request_img_data_test(obj_request)) { 186357acbaa7SAlex Elder rbd_assert(obj_request->img_request); 186457acbaa7SAlex Elder rbd_assert(obj_request->which != BAD_WHICH); 186557acbaa7SAlex Elder } else { 186657acbaa7SAlex Elder rbd_assert(obj_request->which == BAD_WHICH); 186757acbaa7SAlex Elder } 1868bf0d5f50SAlex Elder 18691b83bef2SSage Weil if (osd_req->r_result < 0) 18701b83bef2SSage Weil obj_request->result = osd_req->r_result; 1871bf0d5f50SAlex Elder 1872c47f9371SAlex Elder /* 1873c47f9371SAlex Elder * We support a 64-bit length, but ultimately it has to be 18747ad18afaSChristoph Hellwig * passed to the block layer, which just 
supports a 32-bit 18757ad18afaSChristoph Hellwig * length field. 1876c47f9371SAlex Elder */ 18777665d85bSYan, Zheng obj_request->xferred = osd_req->r_ops[0].outdata_len; 1878c47f9371SAlex Elder rbd_assert(obj_request->xferred < (u64)UINT_MAX); 18790ccd5926SIlya Dryomov 188079528734SAlex Elder opcode = osd_req->r_ops[0].op; 1881bf0d5f50SAlex Elder switch (opcode) { 1882bf0d5f50SAlex Elder case CEPH_OSD_OP_READ: 1883c47f9371SAlex Elder rbd_osd_read_callback(obj_request); 1884bf0d5f50SAlex Elder break; 18850ccd5926SIlya Dryomov case CEPH_OSD_OP_SETALLOCHINT: 1886e30b7577SIlya Dryomov rbd_assert(osd_req->r_ops[1].op == CEPH_OSD_OP_WRITE || 1887e30b7577SIlya Dryomov osd_req->r_ops[1].op == CEPH_OSD_OP_WRITEFULL); 18880ccd5926SIlya Dryomov /* fall through */ 1889bf0d5f50SAlex Elder case CEPH_OSD_OP_WRITE: 1890e30b7577SIlya Dryomov case CEPH_OSD_OP_WRITEFULL: 1891c47f9371SAlex Elder rbd_osd_write_callback(obj_request); 1892bf0d5f50SAlex Elder break; 1893fbfab539SAlex Elder case CEPH_OSD_OP_STAT: 1894c47f9371SAlex Elder rbd_osd_stat_callback(obj_request); 1895fbfab539SAlex Elder break; 189690e98c52SGuangliang Zhao case CEPH_OSD_OP_DELETE: 189790e98c52SGuangliang Zhao case CEPH_OSD_OP_TRUNCATE: 189890e98c52SGuangliang Zhao case CEPH_OSD_OP_ZERO: 189990e98c52SGuangliang Zhao rbd_osd_discard_callback(obj_request); 190090e98c52SGuangliang Zhao break; 190136be9a76SAlex Elder case CEPH_OSD_OP_CALL: 19022761713dSIlya Dryomov rbd_osd_call_callback(obj_request); 19032761713dSIlya Dryomov break; 1904bf0d5f50SAlex Elder default: 1905a90bb0c1SIlya Dryomov rbd_warn(NULL, "unexpected OSD op: object_no %016llx opcode %d", 1906a90bb0c1SIlya Dryomov obj_request->object_no, opcode); 1907bf0d5f50SAlex Elder break; 1908bf0d5f50SAlex Elder } 1909bf0d5f50SAlex Elder 191007741308SAlex Elder if (obj_request_done_test(obj_request)) 1911bf0d5f50SAlex Elder rbd_obj_request_complete(obj_request); 1912bf0d5f50SAlex Elder } 1913bf0d5f50SAlex Elder 19149d4df01fSAlex Elder static void 
rbd_osd_req_format_read(struct rbd_obj_request *obj_request) 1915430c28c3SAlex Elder { 19168c042b0dSAlex Elder struct ceph_osd_request *osd_req = obj_request->osd_req; 1917430c28c3SAlex Elder 19187c84883aSIlya Dryomov rbd_assert(obj_request_img_data_test(obj_request)); 19197c84883aSIlya Dryomov osd_req->r_snapid = obj_request->img_request->snap_id; 19209d4df01fSAlex Elder } 19219d4df01fSAlex Elder 19229d4df01fSAlex Elder static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request) 19239d4df01fSAlex Elder { 19249d4df01fSAlex Elder struct ceph_osd_request *osd_req = obj_request->osd_req; 19259d4df01fSAlex Elder 1926bb873b53SIlya Dryomov osd_req->r_mtime = CURRENT_TIME; 1927bb873b53SIlya Dryomov osd_req->r_data_offset = obj_request->offset; 1928430c28c3SAlex Elder } 1929430c28c3SAlex Elder 1930bc81207eSIlya Dryomov static struct ceph_osd_request * 1931bc81207eSIlya Dryomov __rbd_osd_req_create(struct rbd_device *rbd_dev, 1932bc81207eSIlya Dryomov struct ceph_snap_context *snapc, 1933bc81207eSIlya Dryomov int num_ops, unsigned int flags, 1934bc81207eSIlya Dryomov struct rbd_obj_request *obj_request) 1935bc81207eSIlya Dryomov { 1936bc81207eSIlya Dryomov struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 1937bc81207eSIlya Dryomov struct ceph_osd_request *req; 1938a90bb0c1SIlya Dryomov const char *name_format = rbd_dev->image_format == 1 ? 
1939a90bb0c1SIlya Dryomov RBD_V1_DATA_FORMAT : RBD_V2_DATA_FORMAT; 1940bc81207eSIlya Dryomov 1941bc81207eSIlya Dryomov req = ceph_osdc_alloc_request(osdc, snapc, num_ops, false, GFP_NOIO); 1942bc81207eSIlya Dryomov if (!req) 1943bc81207eSIlya Dryomov return NULL; 1944bc81207eSIlya Dryomov 1945bc81207eSIlya Dryomov req->r_flags = flags; 1946bc81207eSIlya Dryomov req->r_callback = rbd_osd_req_callback; 1947bc81207eSIlya Dryomov req->r_priv = obj_request; 1948bc81207eSIlya Dryomov 1949bc81207eSIlya Dryomov req->r_base_oloc.pool = rbd_dev->layout.pool_id; 1950a90bb0c1SIlya Dryomov if (ceph_oid_aprintf(&req->r_base_oid, GFP_NOIO, name_format, 1951a90bb0c1SIlya Dryomov rbd_dev->header.object_prefix, obj_request->object_no)) 1952bc81207eSIlya Dryomov goto err_req; 1953bc81207eSIlya Dryomov 1954bc81207eSIlya Dryomov if (ceph_osdc_alloc_messages(req, GFP_NOIO)) 1955bc81207eSIlya Dryomov goto err_req; 1956bc81207eSIlya Dryomov 1957bc81207eSIlya Dryomov return req; 1958bc81207eSIlya Dryomov 1959bc81207eSIlya Dryomov err_req: 1960bc81207eSIlya Dryomov ceph_osdc_put_request(req); 1961bc81207eSIlya Dryomov return NULL; 1962bc81207eSIlya Dryomov } 1963bc81207eSIlya Dryomov 19640ccd5926SIlya Dryomov /* 19650ccd5926SIlya Dryomov * Create an osd request. A read request has one osd op (read). 19660ccd5926SIlya Dryomov * A write request has either one (watch) or two (hint+write) osd ops. 19670ccd5926SIlya Dryomov * (All rbd data writes are prefixed with an allocation hint op, but 19680ccd5926SIlya Dryomov * technically osd watch is a write request, hence this distinction.) 
19690ccd5926SIlya Dryomov */ 1970bf0d5f50SAlex Elder static struct ceph_osd_request *rbd_osd_req_create( 1971bf0d5f50SAlex Elder struct rbd_device *rbd_dev, 19726d2940c8SGuangliang Zhao enum obj_operation_type op_type, 1973deb236b3SIlya Dryomov unsigned int num_ops, 1974430c28c3SAlex Elder struct rbd_obj_request *obj_request) 1975bf0d5f50SAlex Elder { 1976bf0d5f50SAlex Elder struct ceph_snap_context *snapc = NULL; 1977bf0d5f50SAlex Elder 197890e98c52SGuangliang Zhao if (obj_request_img_data_test(obj_request) && 197990e98c52SGuangliang Zhao (op_type == OBJ_OP_DISCARD || op_type == OBJ_OP_WRITE)) { 19806365d33aSAlex Elder struct rbd_img_request *img_request = obj_request->img_request; 198190e98c52SGuangliang Zhao if (op_type == OBJ_OP_WRITE) { 19826d2940c8SGuangliang Zhao rbd_assert(img_request_write_test(img_request)); 198390e98c52SGuangliang Zhao } else { 198490e98c52SGuangliang Zhao rbd_assert(img_request_discard_test(img_request)); 198590e98c52SGuangliang Zhao } 1986bf0d5f50SAlex Elder snapc = img_request->snapc; 1987bf0d5f50SAlex Elder } 1988bf0d5f50SAlex Elder 19896d2940c8SGuangliang Zhao rbd_assert(num_ops == 1 || ((op_type == OBJ_OP_WRITE) && num_ops == 2)); 1990deb236b3SIlya Dryomov 1991bc81207eSIlya Dryomov return __rbd_osd_req_create(rbd_dev, snapc, num_ops, 1992bc81207eSIlya Dryomov (op_type == OBJ_OP_WRITE || op_type == OBJ_OP_DISCARD) ? 199354ea0046SIlya Dryomov CEPH_OSD_FLAG_WRITE : CEPH_OSD_FLAG_READ, obj_request); 1994bf0d5f50SAlex Elder } 1995bf0d5f50SAlex Elder 19960eefd470SAlex Elder /* 1997d3246fb0SJosh Durgin * Create a copyup osd request based on the information in the object 1998d3246fb0SJosh Durgin * request supplied. A copyup request has two or three osd ops, a 1999d3246fb0SJosh Durgin * copyup method call, potentially a hint op, and a write or truncate 2000d3246fb0SJosh Durgin * or zero op. 
20010eefd470SAlex Elder */ 20020eefd470SAlex Elder static struct ceph_osd_request * 20030eefd470SAlex Elder rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request) 20040eefd470SAlex Elder { 20050eefd470SAlex Elder struct rbd_img_request *img_request; 2006d3246fb0SJosh Durgin int num_osd_ops = 3; 20070eefd470SAlex Elder 20080eefd470SAlex Elder rbd_assert(obj_request_img_data_test(obj_request)); 20090eefd470SAlex Elder img_request = obj_request->img_request; 20100eefd470SAlex Elder rbd_assert(img_request); 2011d3246fb0SJosh Durgin rbd_assert(img_request_write_test(img_request) || 2012d3246fb0SJosh Durgin img_request_discard_test(img_request)); 20130eefd470SAlex Elder 2014d3246fb0SJosh Durgin if (img_request_discard_test(img_request)) 2015d3246fb0SJosh Durgin num_osd_ops = 2; 2016d3246fb0SJosh Durgin 2017bc81207eSIlya Dryomov return __rbd_osd_req_create(img_request->rbd_dev, 2018bc81207eSIlya Dryomov img_request->snapc, num_osd_ops, 201954ea0046SIlya Dryomov CEPH_OSD_FLAG_WRITE, obj_request); 20200eefd470SAlex Elder } 20210eefd470SAlex Elder 2022bf0d5f50SAlex Elder static void rbd_osd_req_destroy(struct ceph_osd_request *osd_req) 2023bf0d5f50SAlex Elder { 2024bf0d5f50SAlex Elder ceph_osdc_put_request(osd_req); 2025bf0d5f50SAlex Elder } 2026bf0d5f50SAlex Elder 20276c696d85SIlya Dryomov static struct rbd_obj_request * 20286c696d85SIlya Dryomov rbd_obj_request_create(enum obj_request_type type) 2029bf0d5f50SAlex Elder { 2030bf0d5f50SAlex Elder struct rbd_obj_request *obj_request; 2031bf0d5f50SAlex Elder 2032bf0d5f50SAlex Elder rbd_assert(obj_request_type_valid(type)); 2033bf0d5f50SAlex Elder 20345a60e876SIlya Dryomov obj_request = kmem_cache_zalloc(rbd_obj_request_cache, GFP_NOIO); 20356c696d85SIlya Dryomov if (!obj_request) 2036f907ad55SAlex Elder return NULL; 2037f907ad55SAlex Elder 2038bf0d5f50SAlex Elder obj_request->which = BAD_WHICH; 2039bf0d5f50SAlex Elder obj_request->type = type; 2040bf0d5f50SAlex Elder INIT_LIST_HEAD(&obj_request->links); 
2041788e2df3SAlex Elder init_completion(&obj_request->completion); 2042bf0d5f50SAlex Elder kref_init(&obj_request->kref); 2043bf0d5f50SAlex Elder 204467e2b652SIlya Dryomov dout("%s %p\n", __func__, obj_request); 2045bf0d5f50SAlex Elder return obj_request; 2046bf0d5f50SAlex Elder } 2047bf0d5f50SAlex Elder 2048bf0d5f50SAlex Elder static void rbd_obj_request_destroy(struct kref *kref) 2049bf0d5f50SAlex Elder { 2050bf0d5f50SAlex Elder struct rbd_obj_request *obj_request; 2051bf0d5f50SAlex Elder 2052bf0d5f50SAlex Elder obj_request = container_of(kref, struct rbd_obj_request, kref); 2053bf0d5f50SAlex Elder 205437206ee5SAlex Elder dout("%s: obj %p\n", __func__, obj_request); 205537206ee5SAlex Elder 2056bf0d5f50SAlex Elder rbd_assert(obj_request->img_request == NULL); 2057bf0d5f50SAlex Elder rbd_assert(obj_request->which == BAD_WHICH); 2058bf0d5f50SAlex Elder 2059bf0d5f50SAlex Elder if (obj_request->osd_req) 2060bf0d5f50SAlex Elder rbd_osd_req_destroy(obj_request->osd_req); 2061bf0d5f50SAlex Elder 2062bf0d5f50SAlex Elder rbd_assert(obj_request_type_valid(obj_request->type)); 2063bf0d5f50SAlex Elder switch (obj_request->type) { 20649969ebc5SAlex Elder case OBJ_REQUEST_NODATA: 20659969ebc5SAlex Elder break; /* Nothing to do */ 2066bf0d5f50SAlex Elder case OBJ_REQUEST_BIO: 2067bf0d5f50SAlex Elder if (obj_request->bio_list) 2068bf0d5f50SAlex Elder bio_chain_put(obj_request->bio_list); 2069bf0d5f50SAlex Elder break; 2070788e2df3SAlex Elder case OBJ_REQUEST_PAGES: 207104dc923cSIlya Dryomov /* img_data requests don't own their page array */ 207204dc923cSIlya Dryomov if (obj_request->pages && 207304dc923cSIlya Dryomov !obj_request_img_data_test(obj_request)) 2074788e2df3SAlex Elder ceph_release_page_vector(obj_request->pages, 2075788e2df3SAlex Elder obj_request->page_count); 2076788e2df3SAlex Elder break; 2077bf0d5f50SAlex Elder } 2078bf0d5f50SAlex Elder 2079868311b1SAlex Elder kmem_cache_free(rbd_obj_request_cache, obj_request); 2080bf0d5f50SAlex Elder } 2081bf0d5f50SAlex Elder 
2082fb65d228SAlex Elder /* It's OK to call this for a device with no parent */ 2083fb65d228SAlex Elder 2084fb65d228SAlex Elder static void rbd_spec_put(struct rbd_spec *spec); 2085fb65d228SAlex Elder static void rbd_dev_unparent(struct rbd_device *rbd_dev) 2086fb65d228SAlex Elder { 2087fb65d228SAlex Elder rbd_dev_remove_parent(rbd_dev); 2088fb65d228SAlex Elder rbd_spec_put(rbd_dev->parent_spec); 2089fb65d228SAlex Elder rbd_dev->parent_spec = NULL; 2090fb65d228SAlex Elder rbd_dev->parent_overlap = 0; 2091fb65d228SAlex Elder } 2092fb65d228SAlex Elder 2093bf0d5f50SAlex Elder /* 2094a2acd00eSAlex Elder * Parent image reference counting is used to determine when an 2095a2acd00eSAlex Elder * image's parent fields can be safely torn down--after there are no 2096a2acd00eSAlex Elder * more in-flight requests to the parent image. When the last 2097a2acd00eSAlex Elder * reference is dropped, cleaning them up is safe. 2098a2acd00eSAlex Elder */ 2099a2acd00eSAlex Elder static void rbd_dev_parent_put(struct rbd_device *rbd_dev) 2100a2acd00eSAlex Elder { 2101a2acd00eSAlex Elder int counter; 2102a2acd00eSAlex Elder 2103a2acd00eSAlex Elder if (!rbd_dev->parent_spec) 2104a2acd00eSAlex Elder return; 2105a2acd00eSAlex Elder 2106a2acd00eSAlex Elder counter = atomic_dec_return_safe(&rbd_dev->parent_ref); 2107a2acd00eSAlex Elder if (counter > 0) 2108a2acd00eSAlex Elder return; 2109a2acd00eSAlex Elder 2110a2acd00eSAlex Elder /* Last reference; clean up parent data structures */ 2111a2acd00eSAlex Elder 2112a2acd00eSAlex Elder if (!counter) 2113a2acd00eSAlex Elder rbd_dev_unparent(rbd_dev); 2114a2acd00eSAlex Elder else 21159584d508SIlya Dryomov rbd_warn(rbd_dev, "parent reference underflow"); 2116a2acd00eSAlex Elder } 2117a2acd00eSAlex Elder 2118a2acd00eSAlex Elder /* 2119a2acd00eSAlex Elder * If an image has a non-zero parent overlap, get a reference to its 2120a2acd00eSAlex Elder * parent. 
2121a2acd00eSAlex Elder * 2122a2acd00eSAlex Elder * Returns true if the rbd device has a parent with a non-zero 2123a2acd00eSAlex Elder * overlap and a reference for it was successfully taken, or 2124a2acd00eSAlex Elder * false otherwise. 2125a2acd00eSAlex Elder */ 2126a2acd00eSAlex Elder static bool rbd_dev_parent_get(struct rbd_device *rbd_dev) 2127a2acd00eSAlex Elder { 2128ae43e9d0SIlya Dryomov int counter = 0; 2129a2acd00eSAlex Elder 2130a2acd00eSAlex Elder if (!rbd_dev->parent_spec) 2131a2acd00eSAlex Elder return false; 2132a2acd00eSAlex Elder 2133ae43e9d0SIlya Dryomov down_read(&rbd_dev->header_rwsem); 2134ae43e9d0SIlya Dryomov if (rbd_dev->parent_overlap) 2135a2acd00eSAlex Elder counter = atomic_inc_return_safe(&rbd_dev->parent_ref); 2136ae43e9d0SIlya Dryomov up_read(&rbd_dev->header_rwsem); 2137a2acd00eSAlex Elder 2138a2acd00eSAlex Elder if (counter < 0) 21399584d508SIlya Dryomov rbd_warn(rbd_dev, "parent reference overflow"); 2140a2acd00eSAlex Elder 2141ae43e9d0SIlya Dryomov return counter > 0; 2142a2acd00eSAlex Elder } 2143a2acd00eSAlex Elder 2144bf0d5f50SAlex Elder /* 2145bf0d5f50SAlex Elder * Caller is responsible for filling in the list of object requests 2146bf0d5f50SAlex Elder * that comprises the image request, and the Linux request pointer 2147bf0d5f50SAlex Elder * (if there is one). 
2148bf0d5f50SAlex Elder */ 2149cc344fa1SAlex Elder static struct rbd_img_request *rbd_img_request_create( 2150cc344fa1SAlex Elder struct rbd_device *rbd_dev, 2151bf0d5f50SAlex Elder u64 offset, u64 length, 21526d2940c8SGuangliang Zhao enum obj_operation_type op_type, 21534e752f0aSJosh Durgin struct ceph_snap_context *snapc) 2154bf0d5f50SAlex Elder { 2155bf0d5f50SAlex Elder struct rbd_img_request *img_request; 2156bf0d5f50SAlex Elder 21577a716aacSIlya Dryomov img_request = kmem_cache_alloc(rbd_img_request_cache, GFP_NOIO); 2158bf0d5f50SAlex Elder if (!img_request) 2159bf0d5f50SAlex Elder return NULL; 2160bf0d5f50SAlex Elder 2161bf0d5f50SAlex Elder img_request->rq = NULL; 2162bf0d5f50SAlex Elder img_request->rbd_dev = rbd_dev; 2163bf0d5f50SAlex Elder img_request->offset = offset; 2164bf0d5f50SAlex Elder img_request->length = length; 21650c425248SAlex Elder img_request->flags = 0; 216690e98c52SGuangliang Zhao if (op_type == OBJ_OP_DISCARD) { 216790e98c52SGuangliang Zhao img_request_discard_set(img_request); 216890e98c52SGuangliang Zhao img_request->snapc = snapc; 216990e98c52SGuangliang Zhao } else if (op_type == OBJ_OP_WRITE) { 21700c425248SAlex Elder img_request_write_set(img_request); 21714e752f0aSJosh Durgin img_request->snapc = snapc; 21720c425248SAlex Elder } else { 2173bf0d5f50SAlex Elder img_request->snap_id = rbd_dev->spec->snap_id; 21740c425248SAlex Elder } 2175a2acd00eSAlex Elder if (rbd_dev_parent_get(rbd_dev)) 2176d0b2e944SAlex Elder img_request_layered_set(img_request); 2177bf0d5f50SAlex Elder spin_lock_init(&img_request->completion_lock); 2178bf0d5f50SAlex Elder img_request->next_completion = 0; 2179bf0d5f50SAlex Elder img_request->callback = NULL; 2180a5a337d4SAlex Elder img_request->result = 0; 2181bf0d5f50SAlex Elder img_request->obj_request_count = 0; 2182bf0d5f50SAlex Elder INIT_LIST_HEAD(&img_request->obj_requests); 2183bf0d5f50SAlex Elder kref_init(&img_request->kref); 2184bf0d5f50SAlex Elder 218537206ee5SAlex Elder dout("%s: rbd_dev %p %s 
%llu/%llu -> img %p\n", __func__, rbd_dev, 21866d2940c8SGuangliang Zhao obj_op_name(op_type), offset, length, img_request); 218737206ee5SAlex Elder 2188bf0d5f50SAlex Elder return img_request; 2189bf0d5f50SAlex Elder } 2190bf0d5f50SAlex Elder 2191bf0d5f50SAlex Elder static void rbd_img_request_destroy(struct kref *kref) 2192bf0d5f50SAlex Elder { 2193bf0d5f50SAlex Elder struct rbd_img_request *img_request; 2194bf0d5f50SAlex Elder struct rbd_obj_request *obj_request; 2195bf0d5f50SAlex Elder struct rbd_obj_request *next_obj_request; 2196bf0d5f50SAlex Elder 2197bf0d5f50SAlex Elder img_request = container_of(kref, struct rbd_img_request, kref); 2198bf0d5f50SAlex Elder 219937206ee5SAlex Elder dout("%s: img %p\n", __func__, img_request); 220037206ee5SAlex Elder 2201bf0d5f50SAlex Elder for_each_obj_request_safe(img_request, obj_request, next_obj_request) 2202bf0d5f50SAlex Elder rbd_img_obj_request_del(img_request, obj_request); 220325dcf954SAlex Elder rbd_assert(img_request->obj_request_count == 0); 2204bf0d5f50SAlex Elder 2205a2acd00eSAlex Elder if (img_request_layered_test(img_request)) { 2206a2acd00eSAlex Elder img_request_layered_clear(img_request); 2207a2acd00eSAlex Elder rbd_dev_parent_put(img_request->rbd_dev); 2208a2acd00eSAlex Elder } 2209a2acd00eSAlex Elder 2210bef95455SJosh Durgin if (img_request_write_test(img_request) || 2211bef95455SJosh Durgin img_request_discard_test(img_request)) 2212812164f8SAlex Elder ceph_put_snap_context(img_request->snapc); 2213bf0d5f50SAlex Elder 22141c2a9dfeSAlex Elder kmem_cache_free(rbd_img_request_cache, img_request); 2215bf0d5f50SAlex Elder } 2216bf0d5f50SAlex Elder 2217e93f3152SAlex Elder static struct rbd_img_request *rbd_parent_request_create( 2218e93f3152SAlex Elder struct rbd_obj_request *obj_request, 2219e93f3152SAlex Elder u64 img_offset, u64 length) 2220e93f3152SAlex Elder { 2221e93f3152SAlex Elder struct rbd_img_request *parent_request; 2222e93f3152SAlex Elder struct rbd_device *rbd_dev; 2223e93f3152SAlex Elder 
2224e93f3152SAlex Elder rbd_assert(obj_request->img_request); 2225e93f3152SAlex Elder rbd_dev = obj_request->img_request->rbd_dev; 2226e93f3152SAlex Elder 22274e752f0aSJosh Durgin parent_request = rbd_img_request_create(rbd_dev->parent, img_offset, 22286d2940c8SGuangliang Zhao length, OBJ_OP_READ, NULL); 2229e93f3152SAlex Elder if (!parent_request) 2230e93f3152SAlex Elder return NULL; 2231e93f3152SAlex Elder 2232e93f3152SAlex Elder img_request_child_set(parent_request); 2233e93f3152SAlex Elder rbd_obj_request_get(obj_request); 2234e93f3152SAlex Elder parent_request->obj_request = obj_request; 2235e93f3152SAlex Elder 2236e93f3152SAlex Elder return parent_request; 2237e93f3152SAlex Elder } 2238e93f3152SAlex Elder 2239e93f3152SAlex Elder static void rbd_parent_request_destroy(struct kref *kref) 2240e93f3152SAlex Elder { 2241e93f3152SAlex Elder struct rbd_img_request *parent_request; 2242e93f3152SAlex Elder struct rbd_obj_request *orig_request; 2243e93f3152SAlex Elder 2244e93f3152SAlex Elder parent_request = container_of(kref, struct rbd_img_request, kref); 2245e93f3152SAlex Elder orig_request = parent_request->obj_request; 2246e93f3152SAlex Elder 2247e93f3152SAlex Elder parent_request->obj_request = NULL; 2248e93f3152SAlex Elder rbd_obj_request_put(orig_request); 2249e93f3152SAlex Elder img_request_child_clear(parent_request); 2250e93f3152SAlex Elder 2251e93f3152SAlex Elder rbd_img_request_destroy(kref); 2252e93f3152SAlex Elder } 2253e93f3152SAlex Elder 22541217857fSAlex Elder static bool rbd_img_obj_end_request(struct rbd_obj_request *obj_request) 22551217857fSAlex Elder { 22566365d33aSAlex Elder struct rbd_img_request *img_request; 22571217857fSAlex Elder unsigned int xferred; 22581217857fSAlex Elder int result; 22598b3e1a56SAlex Elder bool more; 22601217857fSAlex Elder 22616365d33aSAlex Elder rbd_assert(obj_request_img_data_test(obj_request)); 22626365d33aSAlex Elder img_request = obj_request->img_request; 22636365d33aSAlex Elder 22641217857fSAlex Elder 
rbd_assert(obj_request->xferred <= (u64)UINT_MAX); 22651217857fSAlex Elder xferred = (unsigned int)obj_request->xferred; 22661217857fSAlex Elder result = obj_request->result; 22671217857fSAlex Elder if (result) { 22681217857fSAlex Elder struct rbd_device *rbd_dev = img_request->rbd_dev; 22696d2940c8SGuangliang Zhao enum obj_operation_type op_type; 22706d2940c8SGuangliang Zhao 227190e98c52SGuangliang Zhao if (img_request_discard_test(img_request)) 227290e98c52SGuangliang Zhao op_type = OBJ_OP_DISCARD; 227390e98c52SGuangliang Zhao else if (img_request_write_test(img_request)) 227490e98c52SGuangliang Zhao op_type = OBJ_OP_WRITE; 227590e98c52SGuangliang Zhao else 227690e98c52SGuangliang Zhao op_type = OBJ_OP_READ; 22771217857fSAlex Elder 22789584d508SIlya Dryomov rbd_warn(rbd_dev, "%s %llx at %llx (%llx)", 22796d2940c8SGuangliang Zhao obj_op_name(op_type), obj_request->length, 22806d2940c8SGuangliang Zhao obj_request->img_offset, obj_request->offset); 22819584d508SIlya Dryomov rbd_warn(rbd_dev, " result %d xferred %x", 22821217857fSAlex Elder result, xferred); 22831217857fSAlex Elder if (!img_request->result) 22841217857fSAlex Elder img_request->result = result; 2285082a75daSIlya Dryomov /* 2286082a75daSIlya Dryomov * Need to end I/O on the entire obj_request worth of 2287082a75daSIlya Dryomov * bytes in case of error. 
2288082a75daSIlya Dryomov */ 2289082a75daSIlya Dryomov xferred = obj_request->length; 22901217857fSAlex Elder } 22911217857fSAlex Elder 22928b3e1a56SAlex Elder if (img_request_child_test(img_request)) { 22938b3e1a56SAlex Elder rbd_assert(img_request->obj_request != NULL); 22948b3e1a56SAlex Elder more = obj_request->which < img_request->obj_request_count - 1; 22958b3e1a56SAlex Elder } else { 22968b3e1a56SAlex Elder rbd_assert(img_request->rq != NULL); 22977ad18afaSChristoph Hellwig 22987ad18afaSChristoph Hellwig more = blk_update_request(img_request->rq, result, xferred); 22997ad18afaSChristoph Hellwig if (!more) 23007ad18afaSChristoph Hellwig __blk_mq_end_request(img_request->rq, result); 23018b3e1a56SAlex Elder } 23028b3e1a56SAlex Elder 23038b3e1a56SAlex Elder return more; 23041217857fSAlex Elder } 23051217857fSAlex Elder 23062169238dSAlex Elder static void rbd_img_obj_callback(struct rbd_obj_request *obj_request) 23072169238dSAlex Elder { 23082169238dSAlex Elder struct rbd_img_request *img_request; 23092169238dSAlex Elder u32 which = obj_request->which; 23102169238dSAlex Elder bool more = true; 23112169238dSAlex Elder 23126365d33aSAlex Elder rbd_assert(obj_request_img_data_test(obj_request)); 23132169238dSAlex Elder img_request = obj_request->img_request; 23142169238dSAlex Elder 23152169238dSAlex Elder dout("%s: img %p obj %p\n", __func__, img_request, obj_request); 23162169238dSAlex Elder rbd_assert(img_request != NULL); 23172169238dSAlex Elder rbd_assert(img_request->obj_request_count > 0); 23182169238dSAlex Elder rbd_assert(which != BAD_WHICH); 23192169238dSAlex Elder rbd_assert(which < img_request->obj_request_count); 23202169238dSAlex Elder 23212169238dSAlex Elder spin_lock_irq(&img_request->completion_lock); 23222169238dSAlex Elder if (which != img_request->next_completion) 23232169238dSAlex Elder goto out; 23242169238dSAlex Elder 23252169238dSAlex Elder for_each_obj_request_from(img_request, obj_request) { 23262169238dSAlex Elder rbd_assert(more); 
23272169238dSAlex Elder rbd_assert(which < img_request->obj_request_count); 23282169238dSAlex Elder 23292169238dSAlex Elder if (!obj_request_done_test(obj_request)) 23302169238dSAlex Elder break; 23311217857fSAlex Elder more = rbd_img_obj_end_request(obj_request); 23322169238dSAlex Elder which++; 23332169238dSAlex Elder } 23342169238dSAlex Elder 23352169238dSAlex Elder rbd_assert(more ^ (which == img_request->obj_request_count)); 23362169238dSAlex Elder img_request->next_completion = which; 23372169238dSAlex Elder out: 23382169238dSAlex Elder spin_unlock_irq(&img_request->completion_lock); 23390f2d5be7SAlex Elder rbd_img_request_put(img_request); 23402169238dSAlex Elder 23412169238dSAlex Elder if (!more) 23422169238dSAlex Elder rbd_img_request_complete(img_request); 23432169238dSAlex Elder } 23442169238dSAlex Elder 2345f1a4739fSAlex Elder /* 23463b434a2aSJosh Durgin * Add individual osd ops to the given ceph_osd_request and prepare 23473b434a2aSJosh Durgin * them for submission. num_ops is the current number of 23483b434a2aSJosh Durgin * osd operations already to the object request. 
23493b434a2aSJosh Durgin */ 23503b434a2aSJosh Durgin static void rbd_img_obj_request_fill(struct rbd_obj_request *obj_request, 23513b434a2aSJosh Durgin struct ceph_osd_request *osd_request, 23523b434a2aSJosh Durgin enum obj_operation_type op_type, 23533b434a2aSJosh Durgin unsigned int num_ops) 23543b434a2aSJosh Durgin { 23553b434a2aSJosh Durgin struct rbd_img_request *img_request = obj_request->img_request; 23563b434a2aSJosh Durgin struct rbd_device *rbd_dev = img_request->rbd_dev; 23573b434a2aSJosh Durgin u64 object_size = rbd_obj_bytes(&rbd_dev->header); 23583b434a2aSJosh Durgin u64 offset = obj_request->offset; 23593b434a2aSJosh Durgin u64 length = obj_request->length; 23603b434a2aSJosh Durgin u64 img_end; 23613b434a2aSJosh Durgin u16 opcode; 23623b434a2aSJosh Durgin 23633b434a2aSJosh Durgin if (op_type == OBJ_OP_DISCARD) { 2364d3246fb0SJosh Durgin if (!offset && length == object_size && 2365d3246fb0SJosh Durgin (!img_request_layered_test(img_request) || 2366d3246fb0SJosh Durgin !obj_request_overlaps_parent(obj_request))) { 23673b434a2aSJosh Durgin opcode = CEPH_OSD_OP_DELETE; 23683b434a2aSJosh Durgin } else if ((offset + length == object_size)) { 23693b434a2aSJosh Durgin opcode = CEPH_OSD_OP_TRUNCATE; 23703b434a2aSJosh Durgin } else { 23713b434a2aSJosh Durgin down_read(&rbd_dev->header_rwsem); 23723b434a2aSJosh Durgin img_end = rbd_dev->header.image_size; 23733b434a2aSJosh Durgin up_read(&rbd_dev->header_rwsem); 23743b434a2aSJosh Durgin 23753b434a2aSJosh Durgin if (obj_request->img_offset + length == img_end) 23763b434a2aSJosh Durgin opcode = CEPH_OSD_OP_TRUNCATE; 23773b434a2aSJosh Durgin else 23783b434a2aSJosh Durgin opcode = CEPH_OSD_OP_ZERO; 23793b434a2aSJosh Durgin } 23803b434a2aSJosh Durgin } else if (op_type == OBJ_OP_WRITE) { 2381e30b7577SIlya Dryomov if (!offset && length == object_size) 2382e30b7577SIlya Dryomov opcode = CEPH_OSD_OP_WRITEFULL; 2383e30b7577SIlya Dryomov else 23843b434a2aSJosh Durgin opcode = CEPH_OSD_OP_WRITE; 23853b434a2aSJosh Durgin 
osd_req_op_alloc_hint_init(osd_request, num_ops, 23863b434a2aSJosh Durgin object_size, object_size); 23873b434a2aSJosh Durgin num_ops++; 23883b434a2aSJosh Durgin } else { 23893b434a2aSJosh Durgin opcode = CEPH_OSD_OP_READ; 23903b434a2aSJosh Durgin } 23913b434a2aSJosh Durgin 23927e868b6eSIlya Dryomov if (opcode == CEPH_OSD_OP_DELETE) 2393144cba14SYan, Zheng osd_req_op_init(osd_request, num_ops, opcode, 0); 23947e868b6eSIlya Dryomov else 23957e868b6eSIlya Dryomov osd_req_op_extent_init(osd_request, num_ops, opcode, 23967e868b6eSIlya Dryomov offset, length, 0, 0); 23977e868b6eSIlya Dryomov 23983b434a2aSJosh Durgin if (obj_request->type == OBJ_REQUEST_BIO) 23993b434a2aSJosh Durgin osd_req_op_extent_osd_data_bio(osd_request, num_ops, 24003b434a2aSJosh Durgin obj_request->bio_list, length); 24013b434a2aSJosh Durgin else if (obj_request->type == OBJ_REQUEST_PAGES) 24023b434a2aSJosh Durgin osd_req_op_extent_osd_data_pages(osd_request, num_ops, 24033b434a2aSJosh Durgin obj_request->pages, length, 24043b434a2aSJosh Durgin offset & ~PAGE_MASK, false, false); 24053b434a2aSJosh Durgin 24063b434a2aSJosh Durgin /* Discards are also writes */ 24073b434a2aSJosh Durgin if (op_type == OBJ_OP_WRITE || op_type == OBJ_OP_DISCARD) 24083b434a2aSJosh Durgin rbd_osd_req_format_write(obj_request); 24093b434a2aSJosh Durgin else 24103b434a2aSJosh Durgin rbd_osd_req_format_read(obj_request); 24113b434a2aSJosh Durgin } 24123b434a2aSJosh Durgin 24133b434a2aSJosh Durgin /* 2414f1a4739fSAlex Elder * Split up an image request into one or more object requests, each 2415f1a4739fSAlex Elder * to a different object. The "type" parameter indicates whether 2416f1a4739fSAlex Elder * "data_desc" is the pointer to the head of a list of bio 2417f1a4739fSAlex Elder * structures, or the base of a page array. In either case this 2418f1a4739fSAlex Elder * function assumes data_desc describes memory sufficient to hold 2419f1a4739fSAlex Elder * all data described by the image request. 
2420f1a4739fSAlex Elder */ 2421f1a4739fSAlex Elder static int rbd_img_request_fill(struct rbd_img_request *img_request, 2422f1a4739fSAlex Elder enum obj_request_type type, 2423f1a4739fSAlex Elder void *data_desc) 2424bf0d5f50SAlex Elder { 2425bf0d5f50SAlex Elder struct rbd_device *rbd_dev = img_request->rbd_dev; 2426bf0d5f50SAlex Elder struct rbd_obj_request *obj_request = NULL; 2427bf0d5f50SAlex Elder struct rbd_obj_request *next_obj_request; 2428a158073cSJingoo Han struct bio *bio_list = NULL; 2429f1a4739fSAlex Elder unsigned int bio_offset = 0; 2430a158073cSJingoo Han struct page **pages = NULL; 24316d2940c8SGuangliang Zhao enum obj_operation_type op_type; 24327da22d29SAlex Elder u64 img_offset; 2433bf0d5f50SAlex Elder u64 resid; 2434bf0d5f50SAlex Elder 2435f1a4739fSAlex Elder dout("%s: img %p type %d data_desc %p\n", __func__, img_request, 2436f1a4739fSAlex Elder (int)type, data_desc); 243737206ee5SAlex Elder 24387da22d29SAlex Elder img_offset = img_request->offset; 2439bf0d5f50SAlex Elder resid = img_request->length; 24404dda41d3SAlex Elder rbd_assert(resid > 0); 24413b434a2aSJosh Durgin op_type = rbd_img_request_op_type(img_request); 2442f1a4739fSAlex Elder 2443f1a4739fSAlex Elder if (type == OBJ_REQUEST_BIO) { 2444f1a4739fSAlex Elder bio_list = data_desc; 24454f024f37SKent Overstreet rbd_assert(img_offset == 24464f024f37SKent Overstreet bio_list->bi_iter.bi_sector << SECTOR_SHIFT); 244790e98c52SGuangliang Zhao } else if (type == OBJ_REQUEST_PAGES) { 2448f1a4739fSAlex Elder pages = data_desc; 2449f1a4739fSAlex Elder } 2450f1a4739fSAlex Elder 2451bf0d5f50SAlex Elder while (resid) { 24522fa12320SAlex Elder struct ceph_osd_request *osd_req; 2453a90bb0c1SIlya Dryomov u64 object_no = img_offset >> rbd_dev->header.obj_order; 245467e2b652SIlya Dryomov u64 offset = rbd_segment_offset(rbd_dev, img_offset); 245567e2b652SIlya Dryomov u64 length = rbd_segment_length(rbd_dev, img_offset, resid); 2456bf0d5f50SAlex Elder 24576c696d85SIlya Dryomov obj_request = 
rbd_obj_request_create(type); 2458bf0d5f50SAlex Elder if (!obj_request) 2459bf0d5f50SAlex Elder goto out_unwind; 246062054da6SIlya Dryomov 2461a90bb0c1SIlya Dryomov obj_request->object_no = object_no; 246267e2b652SIlya Dryomov obj_request->offset = offset; 246367e2b652SIlya Dryomov obj_request->length = length; 246467e2b652SIlya Dryomov 246503507db6SJosh Durgin /* 246603507db6SJosh Durgin * set obj_request->img_request before creating the 246703507db6SJosh Durgin * osd_request so that it gets the right snapc 246803507db6SJosh Durgin */ 246903507db6SJosh Durgin rbd_img_obj_request_add(img_request, obj_request); 2470bf0d5f50SAlex Elder 2471f1a4739fSAlex Elder if (type == OBJ_REQUEST_BIO) { 2472f1a4739fSAlex Elder unsigned int clone_size; 2473f1a4739fSAlex Elder 2474bf0d5f50SAlex Elder rbd_assert(length <= (u64)UINT_MAX); 2475bf0d5f50SAlex Elder clone_size = (unsigned int)length; 2476f1a4739fSAlex Elder obj_request->bio_list = 2477f1a4739fSAlex Elder bio_chain_clone_range(&bio_list, 2478f1a4739fSAlex Elder &bio_offset, 2479f1a4739fSAlex Elder clone_size, 24802224d879SDavid Disseldorp GFP_NOIO); 2481bf0d5f50SAlex Elder if (!obj_request->bio_list) 248262054da6SIlya Dryomov goto out_unwind; 248390e98c52SGuangliang Zhao } else if (type == OBJ_REQUEST_PAGES) { 2484f1a4739fSAlex Elder unsigned int page_count; 2485f1a4739fSAlex Elder 2486f1a4739fSAlex Elder obj_request->pages = pages; 2487f1a4739fSAlex Elder page_count = (u32)calc_pages_for(offset, length); 2488f1a4739fSAlex Elder obj_request->page_count = page_count; 2489f1a4739fSAlex Elder if ((offset + length) & ~PAGE_MASK) 2490f1a4739fSAlex Elder page_count--; /* more on last page */ 2491f1a4739fSAlex Elder pages += page_count; 2492f1a4739fSAlex Elder } 2493bf0d5f50SAlex Elder 24946d2940c8SGuangliang Zhao osd_req = rbd_osd_req_create(rbd_dev, op_type, 24956d2940c8SGuangliang Zhao (op_type == OBJ_OP_WRITE) ? 
2 : 1, 24962fa12320SAlex Elder obj_request); 24972fa12320SAlex Elder if (!osd_req) 249862054da6SIlya Dryomov goto out_unwind; 24993b434a2aSJosh Durgin 25002fa12320SAlex Elder obj_request->osd_req = osd_req; 25012169238dSAlex Elder obj_request->callback = rbd_img_obj_callback; 25027da22d29SAlex Elder obj_request->img_offset = img_offset; 2503bf0d5f50SAlex Elder 25043b434a2aSJosh Durgin rbd_img_obj_request_fill(obj_request, osd_req, op_type, 0); 25053b434a2aSJosh Durgin 25067da22d29SAlex Elder img_offset += length; 2507bf0d5f50SAlex Elder resid -= length; 2508bf0d5f50SAlex Elder } 2509bf0d5f50SAlex Elder 2510bf0d5f50SAlex Elder return 0; 2511bf0d5f50SAlex Elder 2512bf0d5f50SAlex Elder out_unwind: 2513bf0d5f50SAlex Elder for_each_obj_request_safe(img_request, obj_request, next_obj_request) 251442dd037cSIlya Dryomov rbd_img_obj_request_del(img_request, obj_request); 2515bf0d5f50SAlex Elder 2516bf0d5f50SAlex Elder return -ENOMEM; 2517bf0d5f50SAlex Elder } 2518bf0d5f50SAlex Elder 25193d7efd18SAlex Elder static void 25202761713dSIlya Dryomov rbd_osd_copyup_callback(struct rbd_obj_request *obj_request) 25210eefd470SAlex Elder { 25220eefd470SAlex Elder struct rbd_img_request *img_request; 25230eefd470SAlex Elder struct rbd_device *rbd_dev; 2524ebda6408SAlex Elder struct page **pages; 25250eefd470SAlex Elder u32 page_count; 25260eefd470SAlex Elder 25272761713dSIlya Dryomov dout("%s: obj %p\n", __func__, obj_request); 25282761713dSIlya Dryomov 2529d3246fb0SJosh Durgin rbd_assert(obj_request->type == OBJ_REQUEST_BIO || 2530d3246fb0SJosh Durgin obj_request->type == OBJ_REQUEST_NODATA); 25310eefd470SAlex Elder rbd_assert(obj_request_img_data_test(obj_request)); 25320eefd470SAlex Elder img_request = obj_request->img_request; 25330eefd470SAlex Elder rbd_assert(img_request); 25340eefd470SAlex Elder 25350eefd470SAlex Elder rbd_dev = img_request->rbd_dev; 25360eefd470SAlex Elder rbd_assert(rbd_dev); 25370eefd470SAlex Elder 2538ebda6408SAlex Elder pages = obj_request->copyup_pages; 
	rbd_assert(pages != NULL);
	obj_request->copyup_pages = NULL;
	page_count = obj_request->copyup_page_count;
	rbd_assert(page_count);
	obj_request->copyup_page_count = 0;
	ceph_release_page_vector(pages, page_count);

	/*
	 * We want the transfer count to reflect the size of the
	 * original write request. There is no such thing as a
	 * successful short write, so if the request was successful
	 * we can just set it to the originally-requested length.
	 */
	if (!obj_request->result)
		obj_request->xferred = obj_request->length;

	obj_request_done_set(obj_request);
}

/*
 * Completion callback for the parent read started by
 * rbd_img_obj_parent_read_full().
 *
 * Takes the copyup page vector away from the (child) image request
 * and drops that request.  Then one of three things happens to the
 * original object request:
 *  - the parent overlap is now 0: the pages are freed and the
 *    original request is re-submitted as is;
 *  - the parent read failed, or the copyup osd request could not be
 *    allocated: the pages are freed and the original request is
 *    failed with that error;
 *  - otherwise its osd request is replaced by a copyup request that
 *    carries the parent data, and that is submitted.
 */
static void
rbd_img_obj_parent_read_full_callback(struct rbd_img_request *img_request)
{
	struct rbd_obj_request *orig_request;
	struct ceph_osd_request *osd_req;
	struct rbd_device *rbd_dev;
	struct page **pages;
	enum obj_operation_type op_type;
	u32 page_count;
	int img_result;
	u64 parent_length;

	rbd_assert(img_request_child_test(img_request));

	/* First get what we need from the image request */

	pages = img_request->copyup_pages;
	rbd_assert(pages != NULL);
	img_request->copyup_pages = NULL;
	page_count = img_request->copyup_page_count;
	rbd_assert(page_count);
	img_request->copyup_page_count = 0;

	orig_request = img_request->obj_request;
	rbd_assert(orig_request != NULL);
	rbd_assert(obj_request_type_valid(orig_request->type));
	img_result = img_request->result;
	parent_length = img_request->length;
	rbd_assert(img_result || parent_length == img_request->xferred);
	/* img_request must not be touched past this point */
	rbd_img_request_put(img_request);

	rbd_assert(orig_request->img_request);
	rbd_dev = orig_request->img_request->rbd_dev;
	rbd_assert(rbd_dev);

	/*
	 * If the overlap has become 0 (most likely because the
	 * image has been flattened) we need to free the pages
	 * and re-submit the original write request.
	 */
	if (!rbd_dev->parent_overlap) {
		ceph_release_page_vector(pages, page_count);
		rbd_obj_request_submit(orig_request);
		return;
	}

	if (img_result)
		goto out_err;

	/*
	 * The original osd request is of no use to us any more.
	 * We need a new one that can hold the three ops in a copyup
	 * request. Allocate the new copyup osd request for the
	 * original request, and release the old one.
	 */
	img_result = -ENOMEM;	/* reported via out_err if the allocation fails */
	osd_req = rbd_osd_req_create_copyup(orig_request);
	if (!osd_req)
		goto out_err;
	rbd_osd_req_destroy(orig_request->osd_req);
	orig_request->osd_req = osd_req;
	/* the original request now owns the page vector */
	orig_request->copyup_pages = pages;
	orig_request->copyup_page_count = page_count;

	/* Initialize the copyup op */

	osd_req_op_cls_init(osd_req, 0, CEPH_OSD_OP_CALL, "rbd", "copyup");
	osd_req_op_cls_request_data_pages(osd_req, 0, pages, parent_length, 0,
						false, false);

	/* Add the other op(s) */

	op_type = rbd_img_request_op_type(orig_request->img_request);
	rbd_img_obj_request_fill(orig_request, osd_req, op_type, 1);

	/* All set, send it off. */

	rbd_obj_request_submit(orig_request);
	return;

out_err:
	ceph_release_page_vector(pages, page_count);
	rbd_obj_request_error(orig_request, img_result);
}

/*
 * Read from the parent image the range of data that covers the
 * entire target of the given object request. This is used for
 * satisfying a layered image write request when the target of an
 * object request from the image request does not exist.
 *
 * A page array big enough to hold the returned data is allocated
 * and supplied to rbd_img_request_fill() as the "data descriptor."
 * When the read completes, this page array will be transferred to
 * the original object request for the copyup operation.
 *
 * If an error occurs, it is recorded as the result of the original
 * object request in rbd_img_obj_exists_callback().
 *
 * Returns 0 if the parent read was submitted, otherwise a negative
 * errno with the page array and the parent request released.
 */
static int rbd_img_obj_parent_read_full(struct rbd_obj_request *obj_request)
{
	struct rbd_device *rbd_dev = obj_request->img_request->rbd_dev;
	struct rbd_img_request *parent_request = NULL;
	u64 img_offset;
	u64 length;
	struct page **pages = NULL;
	u32 page_count;
	int result;

	rbd_assert(rbd_dev->parent != NULL);

	/*
	 * Determine the byte range covered by the object in the
	 * child image to which the original request was to be sent.
	 */
	img_offset = obj_request->img_offset - obj_request->offset;
	length = rbd_obj_bytes(&rbd_dev->header);

	/*
	 * There is no defined parent data beyond the parent
	 * overlap, so limit what we read at that boundary if
	 * necessary.
	 */
	if (img_offset + length > rbd_dev->parent_overlap) {
		rbd_assert(img_offset < rbd_dev->parent_overlap);
		length = rbd_dev->parent_overlap - img_offset;
	}

	/*
	 * Allocate a page array big enough to receive the data read
	 * from the parent.
	 */
	page_count = (u32)calc_pages_for(0, length);
	pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
	if (IS_ERR(pages)) {
		result = PTR_ERR(pages);
		pages = NULL;	/* nothing for out_err to release */
		goto out_err;
	}

	result = -ENOMEM;
	parent_request = rbd_parent_request_create(obj_request,
						img_offset, length);
	if (!parent_request)
		goto out_err;

	result = rbd_img_request_fill(parent_request, OBJ_REQUEST_PAGES, pages);
	if (result)
		goto out_err;

	parent_request->copyup_pages = pages;
	parent_request->copyup_page_count = page_count;
	parent_request->callback = rbd_img_obj_parent_read_full_callback;

	result = rbd_img_request_submit(parent_request);
	if (!result)
		return 0;

	/*
	 * Submission failed: detach the pages and the original request
	 * from the parent request before it is dropped below; the pages
	 * are released separately at out_err.
	 */
	parent_request->copyup_pages = NULL;
	parent_request->copyup_page_count = 0;
	parent_request->obj_request = NULL;
	rbd_obj_request_put(obj_request);
out_err:
	if (pages)
		ceph_release_page_vector(pages, page_count);
	if (parent_request)
		rbd_img_request_put(parent_request);
	return result;
}

/*
 * Completion callback for the STAT request issued by
 * rbd_img_obj_exists_submit().
 *
 * Records on the original object request whether its target object
 * exists (STAT result 0) or not (-ENOENT) and resubmits it.  Any
 * other STAT result, or a failure to resubmit, fails the original
 * request.  If the parent overlap has become 0 the original request
 * is simply re-submitted.
 */
static void rbd_img_obj_exists_callback(struct rbd_obj_request *obj_request)
{
	struct rbd_obj_request *orig_request;
	struct rbd_device *rbd_dev;
	int result;

	rbd_assert(!obj_request_img_data_test(obj_request));

	/*
	 * All we need from the object request is the original
	 * request and the result of the STAT op. Grab those, then
	 * we're done with the request.
	 */
	orig_request = obj_request->obj_request;
	obj_request->obj_request = NULL;
	/* pairs with rbd_obj_request_get() in rbd_img_obj_exists_submit() */
	rbd_obj_request_put(orig_request);
	rbd_assert(orig_request);
	rbd_assert(orig_request->img_request);

	result = obj_request->result;
	obj_request->result = 0;

	dout("%s: obj %p for obj %p result %d %llu/%llu\n", __func__,
		obj_request, orig_request, result,
		obj_request->xferred, obj_request->length);
	rbd_obj_request_put(obj_request);

	/*
	 * If the overlap has become 0 (most likely because the
	 * image has been flattened) we need to re-submit the
	 * original request.
	 */
	rbd_dev = orig_request->img_request->rbd_dev;
	if (!rbd_dev->parent_overlap) {
		rbd_obj_request_submit(orig_request);
		return;
	}

	/*
	 * Our only purpose here is to determine whether the object
	 * exists, and we don't want to treat the non-existence as
	 * an error. If something else comes back, transfer the
	 * error to the original request and complete it now.
	 */
	if (!result) {
		obj_request_existence_set(orig_request, true);
	} else if (result == -ENOENT) {
		obj_request_existence_set(orig_request, false);
	} else {
		goto fail_orig_request;
	}

	/*
	 * Resubmit the original request now that we have recorded
	 * whether the target object exists.
	 */
	result = rbd_img_obj_request_submit(orig_request);
	if (result)
		goto fail_orig_request;

	return;

fail_orig_request:
	rbd_obj_request_error(orig_request, result);
}

/*
 * Issue a STAT op against the object targeted by @obj_request.  The
 * answer is handled in rbd_img_obj_exists_callback(); a reference on
 * @obj_request is taken for the stat request to hold until then.
 *
 * Returns 0 once the stat request has been submitted, or a negative
 * errno if it could not be set up (the stat request is dropped).
 */
static int rbd_img_obj_exists_submit(struct rbd_obj_request *obj_request)
{
	struct rbd_device *rbd_dev = obj_request->img_request->rbd_dev;
	struct rbd_obj_request *stat_request;
	struct page **pages;
	u32 page_count;
	size_t size;
	int ret;

	stat_request = rbd_obj_request_create(OBJ_REQUEST_PAGES);
	if (!stat_request)
		return -ENOMEM;

	stat_request->object_no = obj_request->object_no;

	stat_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_READ, 1,
						   stat_request);
	if (!stat_request->osd_req) {
		ret = -ENOMEM;
		goto fail_stat_request;
	}

	/*
	 * The response data for a STAT call consists of:
	 *     le64 length;
	 *     struct {
	 *         le32 tv_sec;
	 *         le32 tv_nsec;
	 *     } mtime;
	 */
	size = sizeof (__le64) + sizeof (__le32) + sizeof (__le32);
	page_count = (u32)calc_pages_for(0, size);
	pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
	if (IS_ERR(pages)) {
		ret = PTR_ERR(pages);
		goto fail_stat_request;
	}

	osd_req_op_init(stat_request->osd_req, 0, CEPH_OSD_OP_STAT, 0);
	osd_req_op_raw_data_in_pages(stat_request->osd_req, 0, pages, size, 0,
				     false, false);

	rbd_obj_request_get(obj_request);
	stat_request->obj_request = obj_request;
	stat_request->pages = pages;
	stat_request->page_count = page_count;
	stat_request->callback = rbd_img_obj_exists_callback;

	rbd_obj_request_submit(stat_request);
	return 0;

fail_stat_request:
	rbd_obj_request_put(stat_request);
	return ret;
}

/*
 * Return true if @obj_request can be submitted to its target object
 * directly, i.e. none of the parent-image handling (existence check,
 * parent read for copyup) is needed for it.
 */
static bool img_obj_request_simple(struct rbd_obj_request *obj_request)
{
	struct rbd_img_request *img_request = obj_request->img_request;
	struct rbd_device *rbd_dev = img_request->rbd_dev;

	/* Reads */
	if (!img_request_write_test(img_request) &&
	    !img_request_discard_test(img_request))
		return true;

	/* Non-layered writes */
	if (!img_request_layered_test(img_request))
		return true;

	/*
	 * Layered writes outside of the parent overlap range don't
	 * share any data with the parent.
	 */
	if (!obj_request_overlaps_parent(obj_request))
		return true;

	/*
	 * Entire-object layered writes - we will overwrite whatever
	 * parent data there is anyway.
	 */
	if (!obj_request->offset &&
	    obj_request->length == rbd_obj_bytes(&rbd_dev->header))
		return true;

	/*
	 * If the object is known to already exist, its parent data has
	 * already been copied.
	 */
	if (obj_request_known_test(obj_request) &&
	    obj_request_exists_test(obj_request))
		return true;

	return false;
}

/*
 * Submit one object request that belongs to an image request.
 *
 * Simple requests (see img_obj_request_simple()) go straight out.
 * Otherwise, if the existence of the target object is already known,
 * the parent data is read for a copyup; if it is not known, a STAT
 * is issued first.
 *
 * Returns 0 on success or a negative errno from the helper used.
 */
static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request)
{
	rbd_assert(obj_request_img_data_test(obj_request));
	rbd_assert(obj_request_type_valid(obj_request->type));
	rbd_assert(obj_request->img_request);

	if (img_obj_request_simple(obj_request)) {
		rbd_obj_request_submit(obj_request);
		return 0;
	}

	/*
	 * It's a layered write. The target object might exist but
	 * we may not know that yet. If we know it doesn't exist,
	 * start by reading the data for the full target object from
	 * the parent so we can use it for a copyup to the target.
	 */
	if (obj_request_known_test(obj_request))
		return rbd_img_obj_parent_read_full(obj_request);

	/* We don't know whether the target exists. Go find out. */

	return rbd_img_obj_exists_submit(obj_request);
}

/*
 * Submit the object requests of @img_request in order, stopping at
 * the first one that fails.  A reference on the image request is
 * held for the duration of the loop.
 *
 * Returns 0 if every object request was submitted, otherwise the
 * error of the one that failed.
 */
static int rbd_img_request_submit(struct rbd_img_request *img_request)
{
	struct rbd_obj_request *obj_request;
	struct rbd_obj_request *next_obj_request;
	int ret = 0;

	dout("%s: img %p\n", __func__, img_request);

	rbd_img_request_get(img_request);
	for_each_obj_request_safe(img_request, obj_request, next_obj_request) {
		ret = rbd_img_obj_request_submit(obj_request);
		if (ret)
			goto out_put_ireq;
	}

out_put_ireq:
	rbd_img_request_put(img_request);
	return ret;
}

/*
 * Completion callback for the parent image read started by
 * rbd_img_parent_read().  Transfers the result and (overlap-limited)
 * transfer count to the original object request and completes it,
 * or re-submits the original request if the overlap has become 0.
 */
static void rbd_img_parent_read_callback(struct rbd_img_request *img_request)
{
	struct rbd_obj_request *obj_request;
	struct rbd_device *rbd_dev;
	u64 obj_end;
	u64 img_xferred;
	int img_result;

	rbd_assert(img_request_child_test(img_request));

	/* First get what we need from the image request and release it */

	obj_request = img_request->obj_request;
	img_xferred = img_request->xferred;
	img_result = img_request->result;
	rbd_img_request_put(img_request);

	/*
	 * If the overlap has become 0 (most likely because the
	 * image has been flattened) we need to re-submit the
	 * original request.
	 */
	rbd_assert(obj_request);
	rbd_assert(obj_request->img_request);
	rbd_dev = obj_request->img_request->rbd_dev;
	if (!rbd_dev->parent_overlap) {
		rbd_obj_request_submit(obj_request);
		return;
	}

	obj_request->result = img_result;
	if (obj_request->result)
		goto out;

	/*
	 * We need to zero anything beyond the parent overlap
	 * boundary. Since rbd_img_obj_request_read_callback()
	 * will zero anything beyond the end of a short read, an
	 * easy way to do this is to pretend the data from the
	 * parent came up short--ending at the overlap boundary.
	 */
	rbd_assert(obj_request->img_offset < U64_MAX - obj_request->length);
	obj_end = obj_request->img_offset + obj_request->length;
	if (obj_end > rbd_dev->parent_overlap) {
		u64 xferred = 0;

		if (obj_request->img_offset < rbd_dev->parent_overlap)
			xferred = rbd_dev->parent_overlap -
					obj_request->img_offset;

		obj_request->xferred = min(img_xferred, xferred);
	} else {
		obj_request->xferred = img_xferred;
	}
out:
	rbd_img_obj_request_read_callback(obj_request);
	rbd_obj_request_complete(obj_request);
}

/*
 * Satisfy a read whose target object does not exist (the request's
 * result is asserted to be -ENOENT) by reading the same image range
 * from the parent image into the request's own bio list or pages.
 *
 * On failure the error is stored in @obj_request, its transfer count
 * is zeroed and it is marked done.
 */
static void rbd_img_parent_read(struct rbd_obj_request *obj_request)
{
	struct rbd_img_request *img_request;
	int result;

	rbd_assert(obj_request_img_data_test(obj_request));
	rbd_assert(obj_request->img_request != NULL);
	rbd_assert(obj_request->result == (s32) -ENOENT);
	rbd_assert(obj_request_type_valid(obj_request->type));

	/* rbd_read_finish(obj_request, obj_request->length); */
	img_request = rbd_parent_request_create(obj_request,
						obj_request->img_offset,
						obj_request->length);
	result = -ENOMEM;
	if (!img_request)
		goto out_err;

	if (obj_request->type == OBJ_REQUEST_BIO)
		result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO,
						obj_request->bio_list);
	else
		result = rbd_img_request_fill(img_request, OBJ_REQUEST_PAGES,
						obj_request->pages);
	if (result)
		goto out_err;

	img_request->callback = rbd_img_parent_read_callback;
	result = rbd_img_request_submit(img_request);
	if (result)
		goto out_err;

	return;
out_err:
	if (img_request)
		rbd_img_request_put(img_request);
	obj_request->result = result;
	obj_request->xferred = 0;
	obj_request_done_set(obj_request);
}

/* Zero-initialized client id; rbd_unlock() installs it as the owner. */
static const struct rbd_client_id rbd_empty_cid;

/* Two client ids are equal when both gid and handle match. */
static bool rbd_cid_equal(const struct rbd_client_id *lhs,
			  const struct rbd_client_id *rhs)
{
	return lhs->gid == rhs->gid && lhs->handle == rhs->handle;
}

/*
 * Build this client's id from the ceph client gid and the current
 * watch cookie, sampled under watch_mutex.
 */
static struct rbd_client_id rbd_get_cid(struct rbd_device *rbd_dev)
{
	struct rbd_client_id cid;

	mutex_lock(&rbd_dev->watch_mutex);
	cid.gid = ceph_client_gid(rbd_dev->rbd_client->client);
	cid.handle = rbd_dev->watch_cookie;
	mutex_unlock(&rbd_dev->watch_mutex);
	return cid;
}

/*
3055ed95b21aSIlya Dryomov * lock_rwsem must be held for write 3056ed95b21aSIlya Dryomov */ 3057ed95b21aSIlya Dryomov static void rbd_set_owner_cid(struct rbd_device *rbd_dev, 3058ed95b21aSIlya Dryomov const struct rbd_client_id *cid) 3059ed95b21aSIlya Dryomov { 3060ed95b21aSIlya Dryomov dout("%s rbd_dev %p %llu-%llu -> %llu-%llu\n", __func__, rbd_dev, 3061ed95b21aSIlya Dryomov rbd_dev->owner_cid.gid, rbd_dev->owner_cid.handle, 3062ed95b21aSIlya Dryomov cid->gid, cid->handle); 3063ed95b21aSIlya Dryomov rbd_dev->owner_cid = *cid; /* struct */ 3064ed95b21aSIlya Dryomov } 3065ed95b21aSIlya Dryomov 3066ed95b21aSIlya Dryomov static void format_lock_cookie(struct rbd_device *rbd_dev, char *buf) 3067ed95b21aSIlya Dryomov { 3068ed95b21aSIlya Dryomov mutex_lock(&rbd_dev->watch_mutex); 3069ed95b21aSIlya Dryomov sprintf(buf, "%s %llu", RBD_LOCK_COOKIE_PREFIX, rbd_dev->watch_cookie); 3070ed95b21aSIlya Dryomov mutex_unlock(&rbd_dev->watch_mutex); 3071ed95b21aSIlya Dryomov } 3072ed95b21aSIlya Dryomov 3073ed95b21aSIlya Dryomov /* 3074ed95b21aSIlya Dryomov * lock_rwsem must be held for write 3075ed95b21aSIlya Dryomov */ 3076ed95b21aSIlya Dryomov static int rbd_lock(struct rbd_device *rbd_dev) 3077ed95b21aSIlya Dryomov { 3078ed95b21aSIlya Dryomov struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 3079ed95b21aSIlya Dryomov struct rbd_client_id cid = rbd_get_cid(rbd_dev); 3080ed95b21aSIlya Dryomov char cookie[32]; 3081ed95b21aSIlya Dryomov int ret; 3082ed95b21aSIlya Dryomov 3083cbbfb0ffSIlya Dryomov WARN_ON(__rbd_is_lock_owner(rbd_dev) || 3084cbbfb0ffSIlya Dryomov rbd_dev->lock_cookie[0] != '\0'); 3085ed95b21aSIlya Dryomov 3086ed95b21aSIlya Dryomov format_lock_cookie(rbd_dev, cookie); 3087ed95b21aSIlya Dryomov ret = ceph_cls_lock(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc, 3088ed95b21aSIlya Dryomov RBD_LOCK_NAME, CEPH_CLS_LOCK_EXCLUSIVE, cookie, 3089ed95b21aSIlya Dryomov RBD_LOCK_TAG, "", 0); 3090ed95b21aSIlya Dryomov if (ret) 3091ed95b21aSIlya Dryomov return 
ret; 3092ed95b21aSIlya Dryomov 3093ed95b21aSIlya Dryomov rbd_dev->lock_state = RBD_LOCK_STATE_LOCKED; 3094cbbfb0ffSIlya Dryomov strcpy(rbd_dev->lock_cookie, cookie); 3095ed95b21aSIlya Dryomov rbd_set_owner_cid(rbd_dev, &cid); 3096ed95b21aSIlya Dryomov queue_work(rbd_dev->task_wq, &rbd_dev->acquired_lock_work); 3097ed95b21aSIlya Dryomov return 0; 3098ed95b21aSIlya Dryomov } 3099ed95b21aSIlya Dryomov 3100ed95b21aSIlya Dryomov /* 3101ed95b21aSIlya Dryomov * lock_rwsem must be held for write 3102ed95b21aSIlya Dryomov */ 3103bbead745SIlya Dryomov static void rbd_unlock(struct rbd_device *rbd_dev) 3104ed95b21aSIlya Dryomov { 3105ed95b21aSIlya Dryomov struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 3106ed95b21aSIlya Dryomov int ret; 3107ed95b21aSIlya Dryomov 3108cbbfb0ffSIlya Dryomov WARN_ON(!__rbd_is_lock_owner(rbd_dev) || 3109cbbfb0ffSIlya Dryomov rbd_dev->lock_cookie[0] == '\0'); 3110ed95b21aSIlya Dryomov 3111ed95b21aSIlya Dryomov ret = ceph_cls_unlock(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc, 3112cbbfb0ffSIlya Dryomov RBD_LOCK_NAME, rbd_dev->lock_cookie); 3113bbead745SIlya Dryomov if (ret && ret != -ENOENT) 3114bbead745SIlya Dryomov rbd_warn(rbd_dev, "failed to unlock: %d", ret); 3115ed95b21aSIlya Dryomov 3116bbead745SIlya Dryomov /* treat errors as the image is unlocked */ 3117bbead745SIlya Dryomov rbd_dev->lock_state = RBD_LOCK_STATE_UNLOCKED; 3118cbbfb0ffSIlya Dryomov rbd_dev->lock_cookie[0] = '\0'; 3119ed95b21aSIlya Dryomov rbd_set_owner_cid(rbd_dev, &rbd_empty_cid); 3120ed95b21aSIlya Dryomov queue_work(rbd_dev->task_wq, &rbd_dev->released_lock_work); 3121ed95b21aSIlya Dryomov } 3122ed95b21aSIlya Dryomov 3123ed95b21aSIlya Dryomov static int __rbd_notify_op_lock(struct rbd_device *rbd_dev, 3124ed95b21aSIlya Dryomov enum rbd_notify_op notify_op, 3125ed95b21aSIlya Dryomov struct page ***preply_pages, 3126ed95b21aSIlya Dryomov size_t *preply_len) 3127ed95b21aSIlya Dryomov { 3128ed95b21aSIlya Dryomov struct ceph_osd_client *osdc = 
&rbd_dev->rbd_client->client->osdc; 3129ed95b21aSIlya Dryomov struct rbd_client_id cid = rbd_get_cid(rbd_dev); 3130ed95b21aSIlya Dryomov int buf_size = 4 + 8 + 8 + CEPH_ENCODING_START_BLK_LEN; 3131ed95b21aSIlya Dryomov char buf[buf_size]; 3132ed95b21aSIlya Dryomov void *p = buf; 3133ed95b21aSIlya Dryomov 3134ed95b21aSIlya Dryomov dout("%s rbd_dev %p notify_op %d\n", __func__, rbd_dev, notify_op); 3135ed95b21aSIlya Dryomov 3136ed95b21aSIlya Dryomov /* encode *LockPayload NotifyMessage (op + ClientId) */ 3137ed95b21aSIlya Dryomov ceph_start_encoding(&p, 2, 1, buf_size - CEPH_ENCODING_START_BLK_LEN); 3138ed95b21aSIlya Dryomov ceph_encode_32(&p, notify_op); 3139ed95b21aSIlya Dryomov ceph_encode_64(&p, cid.gid); 3140ed95b21aSIlya Dryomov ceph_encode_64(&p, cid.handle); 3141ed95b21aSIlya Dryomov 3142ed95b21aSIlya Dryomov return ceph_osdc_notify(osdc, &rbd_dev->header_oid, 3143ed95b21aSIlya Dryomov &rbd_dev->header_oloc, buf, buf_size, 3144ed95b21aSIlya Dryomov RBD_NOTIFY_TIMEOUT, preply_pages, preply_len); 3145ed95b21aSIlya Dryomov } 3146ed95b21aSIlya Dryomov 3147ed95b21aSIlya Dryomov static void rbd_notify_op_lock(struct rbd_device *rbd_dev, 3148ed95b21aSIlya Dryomov enum rbd_notify_op notify_op) 3149ed95b21aSIlya Dryomov { 3150ed95b21aSIlya Dryomov struct page **reply_pages; 3151ed95b21aSIlya Dryomov size_t reply_len; 3152ed95b21aSIlya Dryomov 3153ed95b21aSIlya Dryomov __rbd_notify_op_lock(rbd_dev, notify_op, &reply_pages, &reply_len); 3154ed95b21aSIlya Dryomov ceph_release_page_vector(reply_pages, calc_pages_for(0, reply_len)); 3155ed95b21aSIlya Dryomov } 3156ed95b21aSIlya Dryomov 3157ed95b21aSIlya Dryomov static void rbd_notify_acquired_lock(struct work_struct *work) 3158ed95b21aSIlya Dryomov { 3159ed95b21aSIlya Dryomov struct rbd_device *rbd_dev = container_of(work, struct rbd_device, 3160ed95b21aSIlya Dryomov acquired_lock_work); 3161ed95b21aSIlya Dryomov 3162ed95b21aSIlya Dryomov rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_ACQUIRED_LOCK); 3163ed95b21aSIlya Dryomov } 
3164ed95b21aSIlya Dryomov 3165ed95b21aSIlya Dryomov static void rbd_notify_released_lock(struct work_struct *work) 3166ed95b21aSIlya Dryomov { 3167ed95b21aSIlya Dryomov struct rbd_device *rbd_dev = container_of(work, struct rbd_device, 3168ed95b21aSIlya Dryomov released_lock_work); 3169ed95b21aSIlya Dryomov 3170ed95b21aSIlya Dryomov rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_RELEASED_LOCK); 3171ed95b21aSIlya Dryomov } 3172ed95b21aSIlya Dryomov 3173ed95b21aSIlya Dryomov static int rbd_request_lock(struct rbd_device *rbd_dev) 3174ed95b21aSIlya Dryomov { 3175ed95b21aSIlya Dryomov struct page **reply_pages; 3176ed95b21aSIlya Dryomov size_t reply_len; 3177ed95b21aSIlya Dryomov bool lock_owner_responded = false; 3178ed95b21aSIlya Dryomov int ret; 3179ed95b21aSIlya Dryomov 3180ed95b21aSIlya Dryomov dout("%s rbd_dev %p\n", __func__, rbd_dev); 3181ed95b21aSIlya Dryomov 3182ed95b21aSIlya Dryomov ret = __rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_REQUEST_LOCK, 3183ed95b21aSIlya Dryomov &reply_pages, &reply_len); 3184ed95b21aSIlya Dryomov if (ret && ret != -ETIMEDOUT) { 3185ed95b21aSIlya Dryomov rbd_warn(rbd_dev, "failed to request lock: %d", ret); 3186ed95b21aSIlya Dryomov goto out; 3187ed95b21aSIlya Dryomov } 3188ed95b21aSIlya Dryomov 3189ed95b21aSIlya Dryomov if (reply_len > 0 && reply_len <= PAGE_SIZE) { 3190ed95b21aSIlya Dryomov void *p = page_address(reply_pages[0]); 3191ed95b21aSIlya Dryomov void *const end = p + reply_len; 3192ed95b21aSIlya Dryomov u32 n; 3193ed95b21aSIlya Dryomov 3194ed95b21aSIlya Dryomov ceph_decode_32_safe(&p, end, n, e_inval); /* num_acks */ 3195ed95b21aSIlya Dryomov while (n--) { 3196ed95b21aSIlya Dryomov u8 struct_v; 3197ed95b21aSIlya Dryomov u32 len; 3198ed95b21aSIlya Dryomov 3199ed95b21aSIlya Dryomov ceph_decode_need(&p, end, 8 + 8, e_inval); 3200ed95b21aSIlya Dryomov p += 8 + 8; /* skip gid and cookie */ 3201ed95b21aSIlya Dryomov 3202ed95b21aSIlya Dryomov ceph_decode_32_safe(&p, end, len, e_inval); 3203ed95b21aSIlya Dryomov if (!len) 
3204ed95b21aSIlya Dryomov continue; 3205ed95b21aSIlya Dryomov 3206ed95b21aSIlya Dryomov if (lock_owner_responded) { 3207ed95b21aSIlya Dryomov rbd_warn(rbd_dev, 3208ed95b21aSIlya Dryomov "duplicate lock owners detected"); 3209ed95b21aSIlya Dryomov ret = -EIO; 3210ed95b21aSIlya Dryomov goto out; 3211ed95b21aSIlya Dryomov } 3212ed95b21aSIlya Dryomov 3213ed95b21aSIlya Dryomov lock_owner_responded = true; 3214ed95b21aSIlya Dryomov ret = ceph_start_decoding(&p, end, 1, "ResponseMessage", 3215ed95b21aSIlya Dryomov &struct_v, &len); 3216ed95b21aSIlya Dryomov if (ret) { 3217ed95b21aSIlya Dryomov rbd_warn(rbd_dev, 3218ed95b21aSIlya Dryomov "failed to decode ResponseMessage: %d", 3219ed95b21aSIlya Dryomov ret); 3220ed95b21aSIlya Dryomov goto e_inval; 3221ed95b21aSIlya Dryomov } 3222ed95b21aSIlya Dryomov 3223ed95b21aSIlya Dryomov ret = ceph_decode_32(&p); 3224ed95b21aSIlya Dryomov } 3225ed95b21aSIlya Dryomov } 3226ed95b21aSIlya Dryomov 3227ed95b21aSIlya Dryomov if (!lock_owner_responded) { 3228ed95b21aSIlya Dryomov rbd_warn(rbd_dev, "no lock owners detected"); 3229ed95b21aSIlya Dryomov ret = -ETIMEDOUT; 3230ed95b21aSIlya Dryomov } 3231ed95b21aSIlya Dryomov 3232ed95b21aSIlya Dryomov out: 3233ed95b21aSIlya Dryomov ceph_release_page_vector(reply_pages, calc_pages_for(0, reply_len)); 3234ed95b21aSIlya Dryomov return ret; 3235ed95b21aSIlya Dryomov 3236ed95b21aSIlya Dryomov e_inval: 3237ed95b21aSIlya Dryomov ret = -EINVAL; 3238ed95b21aSIlya Dryomov goto out; 3239ed95b21aSIlya Dryomov } 3240ed95b21aSIlya Dryomov 3241ed95b21aSIlya Dryomov static void wake_requests(struct rbd_device *rbd_dev, bool wake_all) 3242ed95b21aSIlya Dryomov { 3243ed95b21aSIlya Dryomov dout("%s rbd_dev %p wake_all %d\n", __func__, rbd_dev, wake_all); 3244ed95b21aSIlya Dryomov 3245ed95b21aSIlya Dryomov cancel_delayed_work(&rbd_dev->lock_dwork); 3246ed95b21aSIlya Dryomov if (wake_all) 3247ed95b21aSIlya Dryomov wake_up_all(&rbd_dev->lock_waitq); 3248ed95b21aSIlya Dryomov else 3249ed95b21aSIlya Dryomov 
wake_up(&rbd_dev->lock_waitq); 3250ed95b21aSIlya Dryomov } 3251ed95b21aSIlya Dryomov 3252ed95b21aSIlya Dryomov static int get_lock_owner_info(struct rbd_device *rbd_dev, 3253ed95b21aSIlya Dryomov struct ceph_locker **lockers, u32 *num_lockers) 3254ed95b21aSIlya Dryomov { 3255ed95b21aSIlya Dryomov struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 3256ed95b21aSIlya Dryomov u8 lock_type; 3257ed95b21aSIlya Dryomov char *lock_tag; 3258ed95b21aSIlya Dryomov int ret; 3259ed95b21aSIlya Dryomov 3260ed95b21aSIlya Dryomov dout("%s rbd_dev %p\n", __func__, rbd_dev); 3261ed95b21aSIlya Dryomov 3262ed95b21aSIlya Dryomov ret = ceph_cls_lock_info(osdc, &rbd_dev->header_oid, 3263ed95b21aSIlya Dryomov &rbd_dev->header_oloc, RBD_LOCK_NAME, 3264ed95b21aSIlya Dryomov &lock_type, &lock_tag, lockers, num_lockers); 3265ed95b21aSIlya Dryomov if (ret) 3266ed95b21aSIlya Dryomov return ret; 3267ed95b21aSIlya Dryomov 3268ed95b21aSIlya Dryomov if (*num_lockers == 0) { 3269ed95b21aSIlya Dryomov dout("%s rbd_dev %p no lockers detected\n", __func__, rbd_dev); 3270ed95b21aSIlya Dryomov goto out; 3271ed95b21aSIlya Dryomov } 3272ed95b21aSIlya Dryomov 3273ed95b21aSIlya Dryomov if (strcmp(lock_tag, RBD_LOCK_TAG)) { 3274ed95b21aSIlya Dryomov rbd_warn(rbd_dev, "locked by external mechanism, tag %s", 3275ed95b21aSIlya Dryomov lock_tag); 3276ed95b21aSIlya Dryomov ret = -EBUSY; 3277ed95b21aSIlya Dryomov goto out; 3278ed95b21aSIlya Dryomov } 3279ed95b21aSIlya Dryomov 3280ed95b21aSIlya Dryomov if (lock_type == CEPH_CLS_LOCK_SHARED) { 3281ed95b21aSIlya Dryomov rbd_warn(rbd_dev, "shared lock type detected"); 3282ed95b21aSIlya Dryomov ret = -EBUSY; 3283ed95b21aSIlya Dryomov goto out; 3284ed95b21aSIlya Dryomov } 3285ed95b21aSIlya Dryomov 3286ed95b21aSIlya Dryomov if (strncmp((*lockers)[0].id.cookie, RBD_LOCK_COOKIE_PREFIX, 3287ed95b21aSIlya Dryomov strlen(RBD_LOCK_COOKIE_PREFIX))) { 3288ed95b21aSIlya Dryomov rbd_warn(rbd_dev, "locked by external mechanism, cookie %s", 3289ed95b21aSIlya Dryomov 
(*lockers)[0].id.cookie); 3290ed95b21aSIlya Dryomov ret = -EBUSY; 3291ed95b21aSIlya Dryomov goto out; 3292ed95b21aSIlya Dryomov } 3293ed95b21aSIlya Dryomov 3294ed95b21aSIlya Dryomov out: 3295ed95b21aSIlya Dryomov kfree(lock_tag); 3296ed95b21aSIlya Dryomov return ret; 3297ed95b21aSIlya Dryomov } 3298ed95b21aSIlya Dryomov 3299ed95b21aSIlya Dryomov static int find_watcher(struct rbd_device *rbd_dev, 3300ed95b21aSIlya Dryomov const struct ceph_locker *locker) 3301ed95b21aSIlya Dryomov { 3302ed95b21aSIlya Dryomov struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 3303ed95b21aSIlya Dryomov struct ceph_watch_item *watchers; 3304ed95b21aSIlya Dryomov u32 num_watchers; 3305ed95b21aSIlya Dryomov u64 cookie; 3306ed95b21aSIlya Dryomov int i; 3307ed95b21aSIlya Dryomov int ret; 3308ed95b21aSIlya Dryomov 3309ed95b21aSIlya Dryomov ret = ceph_osdc_list_watchers(osdc, &rbd_dev->header_oid, 3310ed95b21aSIlya Dryomov &rbd_dev->header_oloc, &watchers, 3311ed95b21aSIlya Dryomov &num_watchers); 3312ed95b21aSIlya Dryomov if (ret) 3313ed95b21aSIlya Dryomov return ret; 3314ed95b21aSIlya Dryomov 3315ed95b21aSIlya Dryomov sscanf(locker->id.cookie, RBD_LOCK_COOKIE_PREFIX " %llu", &cookie); 3316ed95b21aSIlya Dryomov for (i = 0; i < num_watchers; i++) { 3317ed95b21aSIlya Dryomov if (!memcmp(&watchers[i].addr, &locker->info.addr, 3318ed95b21aSIlya Dryomov sizeof(locker->info.addr)) && 3319ed95b21aSIlya Dryomov watchers[i].cookie == cookie) { 3320ed95b21aSIlya Dryomov struct rbd_client_id cid = { 3321ed95b21aSIlya Dryomov .gid = le64_to_cpu(watchers[i].name.num), 3322ed95b21aSIlya Dryomov .handle = cookie, 3323ed95b21aSIlya Dryomov }; 3324ed95b21aSIlya Dryomov 3325ed95b21aSIlya Dryomov dout("%s rbd_dev %p found cid %llu-%llu\n", __func__, 3326ed95b21aSIlya Dryomov rbd_dev, cid.gid, cid.handle); 3327ed95b21aSIlya Dryomov rbd_set_owner_cid(rbd_dev, &cid); 3328ed95b21aSIlya Dryomov ret = 1; 3329ed95b21aSIlya Dryomov goto out; 3330ed95b21aSIlya Dryomov } 3331ed95b21aSIlya Dryomov } 
3332ed95b21aSIlya Dryomov 3333ed95b21aSIlya Dryomov dout("%s rbd_dev %p no watchers\n", __func__, rbd_dev); 3334ed95b21aSIlya Dryomov ret = 0; 3335ed95b21aSIlya Dryomov out: 3336ed95b21aSIlya Dryomov kfree(watchers); 3337ed95b21aSIlya Dryomov return ret; 3338ed95b21aSIlya Dryomov } 3339ed95b21aSIlya Dryomov 3340ed95b21aSIlya Dryomov /* 3341ed95b21aSIlya Dryomov * lock_rwsem must be held for write 3342ed95b21aSIlya Dryomov */ 3343ed95b21aSIlya Dryomov static int rbd_try_lock(struct rbd_device *rbd_dev) 3344ed95b21aSIlya Dryomov { 3345ed95b21aSIlya Dryomov struct ceph_client *client = rbd_dev->rbd_client->client; 3346ed95b21aSIlya Dryomov struct ceph_locker *lockers; 3347ed95b21aSIlya Dryomov u32 num_lockers; 3348ed95b21aSIlya Dryomov int ret; 3349ed95b21aSIlya Dryomov 3350ed95b21aSIlya Dryomov for (;;) { 3351ed95b21aSIlya Dryomov ret = rbd_lock(rbd_dev); 3352ed95b21aSIlya Dryomov if (ret != -EBUSY) 3353ed95b21aSIlya Dryomov return ret; 3354ed95b21aSIlya Dryomov 3355ed95b21aSIlya Dryomov /* determine if the current lock holder is still alive */ 3356ed95b21aSIlya Dryomov ret = get_lock_owner_info(rbd_dev, &lockers, &num_lockers); 3357ed95b21aSIlya Dryomov if (ret) 3358ed95b21aSIlya Dryomov return ret; 3359ed95b21aSIlya Dryomov 3360ed95b21aSIlya Dryomov if (num_lockers == 0) 3361ed95b21aSIlya Dryomov goto again; 3362ed95b21aSIlya Dryomov 3363ed95b21aSIlya Dryomov ret = find_watcher(rbd_dev, lockers); 3364ed95b21aSIlya Dryomov if (ret) { 3365ed95b21aSIlya Dryomov if (ret > 0) 3366ed95b21aSIlya Dryomov ret = 0; /* have to request lock */ 3367ed95b21aSIlya Dryomov goto out; 3368ed95b21aSIlya Dryomov } 3369ed95b21aSIlya Dryomov 3370ed95b21aSIlya Dryomov rbd_warn(rbd_dev, "%s%llu seems dead, breaking lock", 3371ed95b21aSIlya Dryomov ENTITY_NAME(lockers[0].id.name)); 3372ed95b21aSIlya Dryomov 3373ed95b21aSIlya Dryomov ret = ceph_monc_blacklist_add(&client->monc, 3374ed95b21aSIlya Dryomov &lockers[0].info.addr); 3375ed95b21aSIlya Dryomov if (ret) { 3376ed95b21aSIlya Dryomov 
rbd_warn(rbd_dev, "blacklist of %s%llu failed: %d", 3377ed95b21aSIlya Dryomov ENTITY_NAME(lockers[0].id.name), ret); 3378ed95b21aSIlya Dryomov goto out; 3379ed95b21aSIlya Dryomov } 3380ed95b21aSIlya Dryomov 3381ed95b21aSIlya Dryomov ret = ceph_cls_break_lock(&client->osdc, &rbd_dev->header_oid, 3382ed95b21aSIlya Dryomov &rbd_dev->header_oloc, RBD_LOCK_NAME, 3383ed95b21aSIlya Dryomov lockers[0].id.cookie, 3384ed95b21aSIlya Dryomov &lockers[0].id.name); 3385ed95b21aSIlya Dryomov if (ret && ret != -ENOENT) 3386ed95b21aSIlya Dryomov goto out; 3387ed95b21aSIlya Dryomov 3388ed95b21aSIlya Dryomov again: 3389ed95b21aSIlya Dryomov ceph_free_lockers(lockers, num_lockers); 3390ed95b21aSIlya Dryomov } 3391ed95b21aSIlya Dryomov 3392ed95b21aSIlya Dryomov out: 3393ed95b21aSIlya Dryomov ceph_free_lockers(lockers, num_lockers); 3394ed95b21aSIlya Dryomov return ret; 3395ed95b21aSIlya Dryomov } 3396ed95b21aSIlya Dryomov 3397ed95b21aSIlya Dryomov /* 3398ed95b21aSIlya Dryomov * ret is set only if lock_state is RBD_LOCK_STATE_UNLOCKED 3399ed95b21aSIlya Dryomov */ 3400ed95b21aSIlya Dryomov static enum rbd_lock_state rbd_try_acquire_lock(struct rbd_device *rbd_dev, 3401ed95b21aSIlya Dryomov int *pret) 3402ed95b21aSIlya Dryomov { 3403ed95b21aSIlya Dryomov enum rbd_lock_state lock_state; 3404ed95b21aSIlya Dryomov 3405ed95b21aSIlya Dryomov down_read(&rbd_dev->lock_rwsem); 3406ed95b21aSIlya Dryomov dout("%s rbd_dev %p read lock_state %d\n", __func__, rbd_dev, 3407ed95b21aSIlya Dryomov rbd_dev->lock_state); 3408ed95b21aSIlya Dryomov if (__rbd_is_lock_owner(rbd_dev)) { 3409ed95b21aSIlya Dryomov lock_state = rbd_dev->lock_state; 3410ed95b21aSIlya Dryomov up_read(&rbd_dev->lock_rwsem); 3411ed95b21aSIlya Dryomov return lock_state; 3412ed95b21aSIlya Dryomov } 3413ed95b21aSIlya Dryomov 3414ed95b21aSIlya Dryomov up_read(&rbd_dev->lock_rwsem); 3415ed95b21aSIlya Dryomov down_write(&rbd_dev->lock_rwsem); 3416ed95b21aSIlya Dryomov dout("%s rbd_dev %p write lock_state %d\n", __func__, rbd_dev, 
3417ed95b21aSIlya Dryomov rbd_dev->lock_state); 3418ed95b21aSIlya Dryomov if (!__rbd_is_lock_owner(rbd_dev)) { 3419ed95b21aSIlya Dryomov *pret = rbd_try_lock(rbd_dev); 3420ed95b21aSIlya Dryomov if (*pret) 3421ed95b21aSIlya Dryomov rbd_warn(rbd_dev, "failed to acquire lock: %d", *pret); 3422ed95b21aSIlya Dryomov } 3423ed95b21aSIlya Dryomov 3424ed95b21aSIlya Dryomov lock_state = rbd_dev->lock_state; 3425ed95b21aSIlya Dryomov up_write(&rbd_dev->lock_rwsem); 3426ed95b21aSIlya Dryomov return lock_state; 3427ed95b21aSIlya Dryomov } 3428ed95b21aSIlya Dryomov 3429ed95b21aSIlya Dryomov static void rbd_acquire_lock(struct work_struct *work) 3430ed95b21aSIlya Dryomov { 3431ed95b21aSIlya Dryomov struct rbd_device *rbd_dev = container_of(to_delayed_work(work), 3432ed95b21aSIlya Dryomov struct rbd_device, lock_dwork); 3433ed95b21aSIlya Dryomov enum rbd_lock_state lock_state; 3434ed95b21aSIlya Dryomov int ret; 3435ed95b21aSIlya Dryomov 3436ed95b21aSIlya Dryomov dout("%s rbd_dev %p\n", __func__, rbd_dev); 3437ed95b21aSIlya Dryomov again: 3438ed95b21aSIlya Dryomov lock_state = rbd_try_acquire_lock(rbd_dev, &ret); 3439ed95b21aSIlya Dryomov if (lock_state != RBD_LOCK_STATE_UNLOCKED || ret == -EBLACKLISTED) { 3440ed95b21aSIlya Dryomov if (lock_state == RBD_LOCK_STATE_LOCKED) 3441ed95b21aSIlya Dryomov wake_requests(rbd_dev, true); 3442ed95b21aSIlya Dryomov dout("%s rbd_dev %p lock_state %d ret %d - done\n", __func__, 3443ed95b21aSIlya Dryomov rbd_dev, lock_state, ret); 3444ed95b21aSIlya Dryomov return; 3445ed95b21aSIlya Dryomov } 3446ed95b21aSIlya Dryomov 3447ed95b21aSIlya Dryomov ret = rbd_request_lock(rbd_dev); 3448ed95b21aSIlya Dryomov if (ret == -ETIMEDOUT) { 3449ed95b21aSIlya Dryomov goto again; /* treat this as a dead client */ 3450ed95b21aSIlya Dryomov } else if (ret < 0) { 3451ed95b21aSIlya Dryomov rbd_warn(rbd_dev, "error requesting lock: %d", ret); 3452ed95b21aSIlya Dryomov mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 3453ed95b21aSIlya Dryomov RBD_RETRY_DELAY); 
3454ed95b21aSIlya Dryomov } else { 3455ed95b21aSIlya Dryomov /* 3456ed95b21aSIlya Dryomov * lock owner acked, but resend if we don't see them 3457ed95b21aSIlya Dryomov * release the lock 3458ed95b21aSIlya Dryomov */ 3459ed95b21aSIlya Dryomov dout("%s rbd_dev %p requeueing lock_dwork\n", __func__, 3460ed95b21aSIlya Dryomov rbd_dev); 3461ed95b21aSIlya Dryomov mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 3462ed95b21aSIlya Dryomov msecs_to_jiffies(2 * RBD_NOTIFY_TIMEOUT * MSEC_PER_SEC)); 3463ed95b21aSIlya Dryomov } 3464ed95b21aSIlya Dryomov } 3465ed95b21aSIlya Dryomov 3466ed95b21aSIlya Dryomov /* 3467ed95b21aSIlya Dryomov * lock_rwsem must be held for write 3468ed95b21aSIlya Dryomov */ 3469ed95b21aSIlya Dryomov static bool rbd_release_lock(struct rbd_device *rbd_dev) 3470ed95b21aSIlya Dryomov { 3471ed95b21aSIlya Dryomov dout("%s rbd_dev %p read lock_state %d\n", __func__, rbd_dev, 3472ed95b21aSIlya Dryomov rbd_dev->lock_state); 3473ed95b21aSIlya Dryomov if (rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED) 3474ed95b21aSIlya Dryomov return false; 3475ed95b21aSIlya Dryomov 3476ed95b21aSIlya Dryomov rbd_dev->lock_state = RBD_LOCK_STATE_RELEASING; 3477ed95b21aSIlya Dryomov downgrade_write(&rbd_dev->lock_rwsem); 3478ed95b21aSIlya Dryomov /* 3479ed95b21aSIlya Dryomov * Ensure that all in-flight IO is flushed. 3480ed95b21aSIlya Dryomov * 3481ed95b21aSIlya Dryomov * FIXME: ceph_osdc_sync() flushes the entire OSD client, which 3482ed95b21aSIlya Dryomov * may be shared with other devices. 
3483ed95b21aSIlya Dryomov */ 3484ed95b21aSIlya Dryomov ceph_osdc_sync(&rbd_dev->rbd_client->client->osdc); 3485ed95b21aSIlya Dryomov up_read(&rbd_dev->lock_rwsem); 3486ed95b21aSIlya Dryomov 3487ed95b21aSIlya Dryomov down_write(&rbd_dev->lock_rwsem); 3488ed95b21aSIlya Dryomov dout("%s rbd_dev %p write lock_state %d\n", __func__, rbd_dev, 3489ed95b21aSIlya Dryomov rbd_dev->lock_state); 3490ed95b21aSIlya Dryomov if (rbd_dev->lock_state != RBD_LOCK_STATE_RELEASING) 3491ed95b21aSIlya Dryomov return false; 3492ed95b21aSIlya Dryomov 3493bbead745SIlya Dryomov rbd_unlock(rbd_dev); 3494ed95b21aSIlya Dryomov /* 3495ed95b21aSIlya Dryomov * Give others a chance to grab the lock - we would re-acquire 3496ed95b21aSIlya Dryomov * almost immediately if we got new IO during ceph_osdc_sync() 3497ed95b21aSIlya Dryomov * otherwise. We need to ack our own notifications, so this 3498ed95b21aSIlya Dryomov * lock_dwork will be requeued from rbd_wait_state_locked() 3499ed95b21aSIlya Dryomov * after wake_requests() in rbd_handle_released_lock(). 
3500ed95b21aSIlya Dryomov */ 3501ed95b21aSIlya Dryomov cancel_delayed_work(&rbd_dev->lock_dwork); 3502ed95b21aSIlya Dryomov return true; 3503ed95b21aSIlya Dryomov } 3504ed95b21aSIlya Dryomov 3505ed95b21aSIlya Dryomov static void rbd_release_lock_work(struct work_struct *work) 3506ed95b21aSIlya Dryomov { 3507ed95b21aSIlya Dryomov struct rbd_device *rbd_dev = container_of(work, struct rbd_device, 3508ed95b21aSIlya Dryomov unlock_work); 3509ed95b21aSIlya Dryomov 3510ed95b21aSIlya Dryomov down_write(&rbd_dev->lock_rwsem); 3511ed95b21aSIlya Dryomov rbd_release_lock(rbd_dev); 3512ed95b21aSIlya Dryomov up_write(&rbd_dev->lock_rwsem); 3513ed95b21aSIlya Dryomov } 3514ed95b21aSIlya Dryomov 3515ed95b21aSIlya Dryomov static void rbd_handle_acquired_lock(struct rbd_device *rbd_dev, u8 struct_v, 3516ed95b21aSIlya Dryomov void **p) 3517ed95b21aSIlya Dryomov { 3518ed95b21aSIlya Dryomov struct rbd_client_id cid = { 0 }; 3519ed95b21aSIlya Dryomov 3520ed95b21aSIlya Dryomov if (struct_v >= 2) { 3521ed95b21aSIlya Dryomov cid.gid = ceph_decode_64(p); 3522ed95b21aSIlya Dryomov cid.handle = ceph_decode_64(p); 3523ed95b21aSIlya Dryomov } 3524ed95b21aSIlya Dryomov 3525ed95b21aSIlya Dryomov dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid, 3526ed95b21aSIlya Dryomov cid.handle); 3527ed95b21aSIlya Dryomov if (!rbd_cid_equal(&cid, &rbd_empty_cid)) { 3528ed95b21aSIlya Dryomov down_write(&rbd_dev->lock_rwsem); 3529ed95b21aSIlya Dryomov if (rbd_cid_equal(&cid, &rbd_dev->owner_cid)) { 3530ed95b21aSIlya Dryomov /* 3531ed95b21aSIlya Dryomov * we already know that the remote client is 3532ed95b21aSIlya Dryomov * the owner 3533ed95b21aSIlya Dryomov */ 3534ed95b21aSIlya Dryomov up_write(&rbd_dev->lock_rwsem); 3535ed95b21aSIlya Dryomov return; 3536ed95b21aSIlya Dryomov } 3537ed95b21aSIlya Dryomov 3538ed95b21aSIlya Dryomov rbd_set_owner_cid(rbd_dev, &cid); 3539ed95b21aSIlya Dryomov downgrade_write(&rbd_dev->lock_rwsem); 3540ed95b21aSIlya Dryomov } else { 3541ed95b21aSIlya Dryomov 
down_read(&rbd_dev->lock_rwsem); 3542ed95b21aSIlya Dryomov } 3543ed95b21aSIlya Dryomov 3544ed95b21aSIlya Dryomov if (!__rbd_is_lock_owner(rbd_dev)) 3545ed95b21aSIlya Dryomov wake_requests(rbd_dev, false); 3546ed95b21aSIlya Dryomov up_read(&rbd_dev->lock_rwsem); 3547ed95b21aSIlya Dryomov } 3548ed95b21aSIlya Dryomov 3549ed95b21aSIlya Dryomov static void rbd_handle_released_lock(struct rbd_device *rbd_dev, u8 struct_v, 3550ed95b21aSIlya Dryomov void **p) 3551ed95b21aSIlya Dryomov { 3552ed95b21aSIlya Dryomov struct rbd_client_id cid = { 0 }; 3553ed95b21aSIlya Dryomov 3554ed95b21aSIlya Dryomov if (struct_v >= 2) { 3555ed95b21aSIlya Dryomov cid.gid = ceph_decode_64(p); 3556ed95b21aSIlya Dryomov cid.handle = ceph_decode_64(p); 3557ed95b21aSIlya Dryomov } 3558ed95b21aSIlya Dryomov 3559ed95b21aSIlya Dryomov dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid, 3560ed95b21aSIlya Dryomov cid.handle); 3561ed95b21aSIlya Dryomov if (!rbd_cid_equal(&cid, &rbd_empty_cid)) { 3562ed95b21aSIlya Dryomov down_write(&rbd_dev->lock_rwsem); 3563ed95b21aSIlya Dryomov if (!rbd_cid_equal(&cid, &rbd_dev->owner_cid)) { 3564ed95b21aSIlya Dryomov dout("%s rbd_dev %p unexpected owner, cid %llu-%llu != owner_cid %llu-%llu\n", 3565ed95b21aSIlya Dryomov __func__, rbd_dev, cid.gid, cid.handle, 3566ed95b21aSIlya Dryomov rbd_dev->owner_cid.gid, rbd_dev->owner_cid.handle); 3567ed95b21aSIlya Dryomov up_write(&rbd_dev->lock_rwsem); 3568ed95b21aSIlya Dryomov return; 3569ed95b21aSIlya Dryomov } 3570ed95b21aSIlya Dryomov 3571ed95b21aSIlya Dryomov rbd_set_owner_cid(rbd_dev, &rbd_empty_cid); 3572ed95b21aSIlya Dryomov downgrade_write(&rbd_dev->lock_rwsem); 3573ed95b21aSIlya Dryomov } else { 3574ed95b21aSIlya Dryomov down_read(&rbd_dev->lock_rwsem); 3575ed95b21aSIlya Dryomov } 3576ed95b21aSIlya Dryomov 3577ed95b21aSIlya Dryomov if (!__rbd_is_lock_owner(rbd_dev)) 3578ed95b21aSIlya Dryomov wake_requests(rbd_dev, false); 3579ed95b21aSIlya Dryomov up_read(&rbd_dev->lock_rwsem); 3580ed95b21aSIlya Dryomov 
} 3581ed95b21aSIlya Dryomov 3582ed95b21aSIlya Dryomov static bool rbd_handle_request_lock(struct rbd_device *rbd_dev, u8 struct_v, 3583ed95b21aSIlya Dryomov void **p) 3584ed95b21aSIlya Dryomov { 3585ed95b21aSIlya Dryomov struct rbd_client_id my_cid = rbd_get_cid(rbd_dev); 3586ed95b21aSIlya Dryomov struct rbd_client_id cid = { 0 }; 3587ed95b21aSIlya Dryomov bool need_to_send; 3588ed95b21aSIlya Dryomov 3589ed95b21aSIlya Dryomov if (struct_v >= 2) { 3590ed95b21aSIlya Dryomov cid.gid = ceph_decode_64(p); 3591ed95b21aSIlya Dryomov cid.handle = ceph_decode_64(p); 3592ed95b21aSIlya Dryomov } 3593ed95b21aSIlya Dryomov 3594ed95b21aSIlya Dryomov dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid, 3595ed95b21aSIlya Dryomov cid.handle); 3596ed95b21aSIlya Dryomov if (rbd_cid_equal(&cid, &my_cid)) 3597ed95b21aSIlya Dryomov return false; 3598ed95b21aSIlya Dryomov 3599ed95b21aSIlya Dryomov down_read(&rbd_dev->lock_rwsem); 3600ed95b21aSIlya Dryomov need_to_send = __rbd_is_lock_owner(rbd_dev); 3601ed95b21aSIlya Dryomov if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED) { 3602ed95b21aSIlya Dryomov if (!rbd_cid_equal(&rbd_dev->owner_cid, &rbd_empty_cid)) { 3603ed95b21aSIlya Dryomov dout("%s rbd_dev %p queueing unlock_work\n", __func__, 3604ed95b21aSIlya Dryomov rbd_dev); 3605ed95b21aSIlya Dryomov queue_work(rbd_dev->task_wq, &rbd_dev->unlock_work); 3606ed95b21aSIlya Dryomov } 3607ed95b21aSIlya Dryomov } 3608ed95b21aSIlya Dryomov up_read(&rbd_dev->lock_rwsem); 3609ed95b21aSIlya Dryomov return need_to_send; 3610ed95b21aSIlya Dryomov } 3611ed95b21aSIlya Dryomov 3612ed95b21aSIlya Dryomov static void __rbd_acknowledge_notify(struct rbd_device *rbd_dev, 3613ed95b21aSIlya Dryomov u64 notify_id, u64 cookie, s32 *result) 3614ed95b21aSIlya Dryomov { 3615ed95b21aSIlya Dryomov struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 3616ed95b21aSIlya Dryomov int buf_size = 4 + CEPH_ENCODING_START_BLK_LEN; 3617ed95b21aSIlya Dryomov char buf[buf_size]; 3618ed95b21aSIlya 
Dryomov int ret; 3619ed95b21aSIlya Dryomov 3620ed95b21aSIlya Dryomov if (result) { 3621ed95b21aSIlya Dryomov void *p = buf; 3622ed95b21aSIlya Dryomov 3623ed95b21aSIlya Dryomov /* encode ResponseMessage */ 3624ed95b21aSIlya Dryomov ceph_start_encoding(&p, 1, 1, 3625ed95b21aSIlya Dryomov buf_size - CEPH_ENCODING_START_BLK_LEN); 3626ed95b21aSIlya Dryomov ceph_encode_32(&p, *result); 3627ed95b21aSIlya Dryomov } else { 3628ed95b21aSIlya Dryomov buf_size = 0; 3629ed95b21aSIlya Dryomov } 3630ed95b21aSIlya Dryomov 3631ed95b21aSIlya Dryomov ret = ceph_osdc_notify_ack(osdc, &rbd_dev->header_oid, 3632ed95b21aSIlya Dryomov &rbd_dev->header_oloc, notify_id, cookie, 3633ed95b21aSIlya Dryomov buf, buf_size); 3634ed95b21aSIlya Dryomov if (ret) 3635ed95b21aSIlya Dryomov rbd_warn(rbd_dev, "acknowledge_notify failed: %d", ret); 3636ed95b21aSIlya Dryomov } 3637ed95b21aSIlya Dryomov 3638ed95b21aSIlya Dryomov static void rbd_acknowledge_notify(struct rbd_device *rbd_dev, u64 notify_id, 3639ed95b21aSIlya Dryomov u64 cookie) 3640ed95b21aSIlya Dryomov { 3641ed95b21aSIlya Dryomov dout("%s rbd_dev %p\n", __func__, rbd_dev); 3642ed95b21aSIlya Dryomov __rbd_acknowledge_notify(rbd_dev, notify_id, cookie, NULL); 3643ed95b21aSIlya Dryomov } 3644ed95b21aSIlya Dryomov 3645ed95b21aSIlya Dryomov static void rbd_acknowledge_notify_result(struct rbd_device *rbd_dev, 3646ed95b21aSIlya Dryomov u64 notify_id, u64 cookie, s32 result) 3647ed95b21aSIlya Dryomov { 3648ed95b21aSIlya Dryomov dout("%s rbd_dev %p result %d\n", __func__, rbd_dev, result); 3649ed95b21aSIlya Dryomov __rbd_acknowledge_notify(rbd_dev, notify_id, cookie, &result); 3650ed95b21aSIlya Dryomov } 3651922dab61SIlya Dryomov 3652922dab61SIlya Dryomov static void rbd_watch_cb(void *arg, u64 notify_id, u64 cookie, 3653922dab61SIlya Dryomov u64 notifier_id, void *data, size_t data_len) 3654b8d70035SAlex Elder { 3655922dab61SIlya Dryomov struct rbd_device *rbd_dev = arg; 3656ed95b21aSIlya Dryomov void *p = data; 3657ed95b21aSIlya Dryomov void 
*const end = p + data_len; 3658d4c2269bSIlya Dryomov u8 struct_v = 0; 3659ed95b21aSIlya Dryomov u32 len; 3660ed95b21aSIlya Dryomov u32 notify_op; 3661b8d70035SAlex Elder int ret; 3662b8d70035SAlex Elder 3663ed95b21aSIlya Dryomov dout("%s rbd_dev %p cookie %llu notify_id %llu data_len %zu\n", 3664ed95b21aSIlya Dryomov __func__, rbd_dev, cookie, notify_id, data_len); 3665ed95b21aSIlya Dryomov if (data_len) { 3666ed95b21aSIlya Dryomov ret = ceph_start_decoding(&p, end, 1, "NotifyMessage", 3667ed95b21aSIlya Dryomov &struct_v, &len); 3668ed95b21aSIlya Dryomov if (ret) { 3669ed95b21aSIlya Dryomov rbd_warn(rbd_dev, "failed to decode NotifyMessage: %d", 3670ed95b21aSIlya Dryomov ret); 3671ed95b21aSIlya Dryomov return; 3672ed95b21aSIlya Dryomov } 367352bb1f9bSIlya Dryomov 3674ed95b21aSIlya Dryomov notify_op = ceph_decode_32(&p); 3675ed95b21aSIlya Dryomov } else { 3676ed95b21aSIlya Dryomov /* legacy notification for header updates */ 3677ed95b21aSIlya Dryomov notify_op = RBD_NOTIFY_OP_HEADER_UPDATE; 3678ed95b21aSIlya Dryomov len = 0; 3679ed95b21aSIlya Dryomov } 3680ed95b21aSIlya Dryomov 3681ed95b21aSIlya Dryomov dout("%s rbd_dev %p notify_op %u\n", __func__, rbd_dev, notify_op); 3682ed95b21aSIlya Dryomov switch (notify_op) { 3683ed95b21aSIlya Dryomov case RBD_NOTIFY_OP_ACQUIRED_LOCK: 3684ed95b21aSIlya Dryomov rbd_handle_acquired_lock(rbd_dev, struct_v, &p); 3685ed95b21aSIlya Dryomov rbd_acknowledge_notify(rbd_dev, notify_id, cookie); 3686ed95b21aSIlya Dryomov break; 3687ed95b21aSIlya Dryomov case RBD_NOTIFY_OP_RELEASED_LOCK: 3688ed95b21aSIlya Dryomov rbd_handle_released_lock(rbd_dev, struct_v, &p); 3689ed95b21aSIlya Dryomov rbd_acknowledge_notify(rbd_dev, notify_id, cookie); 3690ed95b21aSIlya Dryomov break; 3691ed95b21aSIlya Dryomov case RBD_NOTIFY_OP_REQUEST_LOCK: 3692ed95b21aSIlya Dryomov if (rbd_handle_request_lock(rbd_dev, struct_v, &p)) 369352bb1f9bSIlya Dryomov /* 3694ed95b21aSIlya Dryomov * send ResponseMessage(0) back so the client 3695ed95b21aSIlya Dryomov * can 
detect a missing owner 369652bb1f9bSIlya Dryomov */ 3697ed95b21aSIlya Dryomov rbd_acknowledge_notify_result(rbd_dev, notify_id, 3698ed95b21aSIlya Dryomov cookie, 0); 3699ed95b21aSIlya Dryomov else 3700ed95b21aSIlya Dryomov rbd_acknowledge_notify(rbd_dev, notify_id, cookie); 3701ed95b21aSIlya Dryomov break; 3702ed95b21aSIlya Dryomov case RBD_NOTIFY_OP_HEADER_UPDATE: 3703e627db08SAlex Elder ret = rbd_dev_refresh(rbd_dev); 3704e627db08SAlex Elder if (ret) 37059584d508SIlya Dryomov rbd_warn(rbd_dev, "refresh failed: %d", ret); 3706b8d70035SAlex Elder 3707ed95b21aSIlya Dryomov rbd_acknowledge_notify(rbd_dev, notify_id, cookie); 3708ed95b21aSIlya Dryomov break; 3709ed95b21aSIlya Dryomov default: 3710ed95b21aSIlya Dryomov if (rbd_is_lock_owner(rbd_dev)) 3711ed95b21aSIlya Dryomov rbd_acknowledge_notify_result(rbd_dev, notify_id, 3712ed95b21aSIlya Dryomov cookie, -EOPNOTSUPP); 3713ed95b21aSIlya Dryomov else 3714ed95b21aSIlya Dryomov rbd_acknowledge_notify(rbd_dev, notify_id, cookie); 3715ed95b21aSIlya Dryomov break; 3716b8d70035SAlex Elder } 3717b8d70035SAlex Elder } 3718b8d70035SAlex Elder 371999d16943SIlya Dryomov static void __rbd_unregister_watch(struct rbd_device *rbd_dev); 37209969ebc5SAlex Elder 3721922dab61SIlya Dryomov static void rbd_watch_errcb(void *arg, u64 cookie, int err) 3722bb040aa0SIlya Dryomov { 3723922dab61SIlya Dryomov struct rbd_device *rbd_dev = arg; 3724bb040aa0SIlya Dryomov 3725922dab61SIlya Dryomov rbd_warn(rbd_dev, "encountered watch error: %d", err); 3726bb040aa0SIlya Dryomov 3727ed95b21aSIlya Dryomov down_write(&rbd_dev->lock_rwsem); 3728ed95b21aSIlya Dryomov rbd_set_owner_cid(rbd_dev, &rbd_empty_cid); 3729ed95b21aSIlya Dryomov up_write(&rbd_dev->lock_rwsem); 3730bb040aa0SIlya Dryomov 373199d16943SIlya Dryomov mutex_lock(&rbd_dev->watch_mutex); 373299d16943SIlya Dryomov if (rbd_dev->watch_state == RBD_WATCH_STATE_REGISTERED) { 373399d16943SIlya Dryomov __rbd_unregister_watch(rbd_dev); 373499d16943SIlya Dryomov rbd_dev->watch_state = 
RBD_WATCH_STATE_ERROR; 3735bb040aa0SIlya Dryomov 373699d16943SIlya Dryomov queue_delayed_work(rbd_dev->task_wq, &rbd_dev->watch_dwork, 0); 3737bb040aa0SIlya Dryomov } 373899d16943SIlya Dryomov mutex_unlock(&rbd_dev->watch_mutex); 3739bb040aa0SIlya Dryomov } 3740bb040aa0SIlya Dryomov 3741bb040aa0SIlya Dryomov /* 374299d16943SIlya Dryomov * watch_mutex must be locked 37439969ebc5SAlex Elder */ 374499d16943SIlya Dryomov static int __rbd_register_watch(struct rbd_device *rbd_dev) 37459969ebc5SAlex Elder { 37469969ebc5SAlex Elder struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 3747922dab61SIlya Dryomov struct ceph_osd_linger_request *handle; 37489969ebc5SAlex Elder 3749922dab61SIlya Dryomov rbd_assert(!rbd_dev->watch_handle); 375099d16943SIlya Dryomov dout("%s rbd_dev %p\n", __func__, rbd_dev); 37519969ebc5SAlex Elder 3752922dab61SIlya Dryomov handle = ceph_osdc_watch(osdc, &rbd_dev->header_oid, 3753922dab61SIlya Dryomov &rbd_dev->header_oloc, rbd_watch_cb, 3754922dab61SIlya Dryomov rbd_watch_errcb, rbd_dev); 3755922dab61SIlya Dryomov if (IS_ERR(handle)) 3756922dab61SIlya Dryomov return PTR_ERR(handle); 37579969ebc5SAlex Elder 3758922dab61SIlya Dryomov rbd_dev->watch_handle = handle; 37598eb87565SAlex Elder return 0; 37609969ebc5SAlex Elder } 37619969ebc5SAlex Elder 376299d16943SIlya Dryomov /* 376399d16943SIlya Dryomov * watch_mutex must be locked 376499d16943SIlya Dryomov */ 376599d16943SIlya Dryomov static void __rbd_unregister_watch(struct rbd_device *rbd_dev) 3766fca27065SIlya Dryomov { 3767922dab61SIlya Dryomov struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 3768922dab61SIlya Dryomov int ret; 3769b30a01f2SIlya Dryomov 377099d16943SIlya Dryomov rbd_assert(rbd_dev->watch_handle); 377199d16943SIlya Dryomov dout("%s rbd_dev %p\n", __func__, rbd_dev); 3772b30a01f2SIlya Dryomov 3773922dab61SIlya Dryomov ret = ceph_osdc_unwatch(osdc, rbd_dev->watch_handle); 3774922dab61SIlya Dryomov if (ret) 3775922dab61SIlya Dryomov 
rbd_warn(rbd_dev, "failed to unwatch: %d", ret); 3776b30a01f2SIlya Dryomov 3777922dab61SIlya Dryomov rbd_dev->watch_handle = NULL; 3778c525f036SIlya Dryomov } 3779c525f036SIlya Dryomov 378099d16943SIlya Dryomov static int rbd_register_watch(struct rbd_device *rbd_dev) 3781c525f036SIlya Dryomov { 378299d16943SIlya Dryomov int ret; 3783811c6688SIlya Dryomov 378499d16943SIlya Dryomov mutex_lock(&rbd_dev->watch_mutex); 378599d16943SIlya Dryomov rbd_assert(rbd_dev->watch_state == RBD_WATCH_STATE_UNREGISTERED); 378699d16943SIlya Dryomov ret = __rbd_register_watch(rbd_dev); 378799d16943SIlya Dryomov if (ret) 378899d16943SIlya Dryomov goto out; 378999d16943SIlya Dryomov 379099d16943SIlya Dryomov rbd_dev->watch_state = RBD_WATCH_STATE_REGISTERED; 379199d16943SIlya Dryomov rbd_dev->watch_cookie = rbd_dev->watch_handle->linger_id; 379299d16943SIlya Dryomov 379399d16943SIlya Dryomov out: 379499d16943SIlya Dryomov mutex_unlock(&rbd_dev->watch_mutex); 379599d16943SIlya Dryomov return ret; 379699d16943SIlya Dryomov } 379799d16943SIlya Dryomov 379899d16943SIlya Dryomov static void cancel_tasks_sync(struct rbd_device *rbd_dev) 379999d16943SIlya Dryomov { 380099d16943SIlya Dryomov dout("%s rbd_dev %p\n", __func__, rbd_dev); 380199d16943SIlya Dryomov 380299d16943SIlya Dryomov cancel_delayed_work_sync(&rbd_dev->watch_dwork); 3803ed95b21aSIlya Dryomov cancel_work_sync(&rbd_dev->acquired_lock_work); 3804ed95b21aSIlya Dryomov cancel_work_sync(&rbd_dev->released_lock_work); 3805ed95b21aSIlya Dryomov cancel_delayed_work_sync(&rbd_dev->lock_dwork); 3806ed95b21aSIlya Dryomov cancel_work_sync(&rbd_dev->unlock_work); 380799d16943SIlya Dryomov } 380899d16943SIlya Dryomov 380999d16943SIlya Dryomov static void rbd_unregister_watch(struct rbd_device *rbd_dev) 381099d16943SIlya Dryomov { 3811ed95b21aSIlya Dryomov WARN_ON(waitqueue_active(&rbd_dev->lock_waitq)); 381299d16943SIlya Dryomov cancel_tasks_sync(rbd_dev); 381399d16943SIlya Dryomov 381499d16943SIlya Dryomov 
mutex_lock(&rbd_dev->watch_mutex); 381599d16943SIlya Dryomov if (rbd_dev->watch_state == RBD_WATCH_STATE_REGISTERED) 381699d16943SIlya Dryomov __rbd_unregister_watch(rbd_dev); 381799d16943SIlya Dryomov rbd_dev->watch_state = RBD_WATCH_STATE_UNREGISTERED; 381899d16943SIlya Dryomov mutex_unlock(&rbd_dev->watch_mutex); 381999d16943SIlya Dryomov 3820811c6688SIlya Dryomov ceph_osdc_flush_notifies(&rbd_dev->rbd_client->client->osdc); 3821fca27065SIlya Dryomov } 3822fca27065SIlya Dryomov 382399d16943SIlya Dryomov static void rbd_reregister_watch(struct work_struct *work) 382499d16943SIlya Dryomov { 382599d16943SIlya Dryomov struct rbd_device *rbd_dev = container_of(to_delayed_work(work), 382699d16943SIlya Dryomov struct rbd_device, watch_dwork); 3827ed95b21aSIlya Dryomov bool was_lock_owner = false; 382887c0fdedSIlya Dryomov bool need_to_wake = false; 382999d16943SIlya Dryomov int ret; 383099d16943SIlya Dryomov 383199d16943SIlya Dryomov dout("%s rbd_dev %p\n", __func__, rbd_dev); 383299d16943SIlya Dryomov 3833ed95b21aSIlya Dryomov down_write(&rbd_dev->lock_rwsem); 3834ed95b21aSIlya Dryomov if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED) 3835ed95b21aSIlya Dryomov was_lock_owner = rbd_release_lock(rbd_dev); 3836ed95b21aSIlya Dryomov 383799d16943SIlya Dryomov mutex_lock(&rbd_dev->watch_mutex); 383887c0fdedSIlya Dryomov if (rbd_dev->watch_state != RBD_WATCH_STATE_ERROR) { 383987c0fdedSIlya Dryomov mutex_unlock(&rbd_dev->watch_mutex); 384087c0fdedSIlya Dryomov goto out; 384187c0fdedSIlya Dryomov } 384299d16943SIlya Dryomov 384399d16943SIlya Dryomov ret = __rbd_register_watch(rbd_dev); 384499d16943SIlya Dryomov if (ret) { 384599d16943SIlya Dryomov rbd_warn(rbd_dev, "failed to reregister watch: %d", ret); 38464d73644bSIlya Dryomov if (ret == -EBLACKLISTED || ret == -ENOENT) { 384787c0fdedSIlya Dryomov set_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags); 384887c0fdedSIlya Dryomov need_to_wake = true; 384987c0fdedSIlya Dryomov } else { 385099d16943SIlya Dryomov 
queue_delayed_work(rbd_dev->task_wq, 385199d16943SIlya Dryomov &rbd_dev->watch_dwork, 385299d16943SIlya Dryomov RBD_RETRY_DELAY); 385387c0fdedSIlya Dryomov } 385487c0fdedSIlya Dryomov mutex_unlock(&rbd_dev->watch_mutex); 385587c0fdedSIlya Dryomov goto out; 385699d16943SIlya Dryomov } 385799d16943SIlya Dryomov 385887c0fdedSIlya Dryomov need_to_wake = true; 385999d16943SIlya Dryomov rbd_dev->watch_state = RBD_WATCH_STATE_REGISTERED; 386099d16943SIlya Dryomov rbd_dev->watch_cookie = rbd_dev->watch_handle->linger_id; 386199d16943SIlya Dryomov mutex_unlock(&rbd_dev->watch_mutex); 386299d16943SIlya Dryomov 386399d16943SIlya Dryomov ret = rbd_dev_refresh(rbd_dev); 386499d16943SIlya Dryomov if (ret) 386599d16943SIlya Dryomov rbd_warn(rbd_dev, "reregisteration refresh failed: %d", ret); 386699d16943SIlya Dryomov 3867ed95b21aSIlya Dryomov if (was_lock_owner) { 3868ed95b21aSIlya Dryomov ret = rbd_try_lock(rbd_dev); 3869ed95b21aSIlya Dryomov if (ret) 3870ed95b21aSIlya Dryomov rbd_warn(rbd_dev, "reregisteration lock failed: %d", 3871ed95b21aSIlya Dryomov ret); 3872ed95b21aSIlya Dryomov } 3873ed95b21aSIlya Dryomov 387487c0fdedSIlya Dryomov out: 3875ed95b21aSIlya Dryomov up_write(&rbd_dev->lock_rwsem); 387687c0fdedSIlya Dryomov if (need_to_wake) 3877ed95b21aSIlya Dryomov wake_requests(rbd_dev, true); 387899d16943SIlya Dryomov } 387999d16943SIlya Dryomov 388036be9a76SAlex Elder /* 3881f40eb349SAlex Elder * Synchronous osd object method call. Returns the number of bytes 3882f40eb349SAlex Elder * returned in the outbound buffer, or a negative error code. 
388336be9a76SAlex Elder */ 388436be9a76SAlex Elder static int rbd_obj_method_sync(struct rbd_device *rbd_dev, 3885ecd4a68aSIlya Dryomov struct ceph_object_id *oid, 3886ecd4a68aSIlya Dryomov struct ceph_object_locator *oloc, 388736be9a76SAlex Elder const char *method_name, 38884157976bSAlex Elder const void *outbound, 388936be9a76SAlex Elder size_t outbound_size, 38904157976bSAlex Elder void *inbound, 3891e2a58ee5SAlex Elder size_t inbound_size) 389236be9a76SAlex Elder { 3893ecd4a68aSIlya Dryomov struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 3894ecd4a68aSIlya Dryomov struct page *req_page = NULL; 3895ecd4a68aSIlya Dryomov struct page *reply_page; 389636be9a76SAlex Elder int ret; 389736be9a76SAlex Elder 389836be9a76SAlex Elder /* 38996010a451SAlex Elder * Method calls are ultimately read operations. The result 39006010a451SAlex Elder * should placed into the inbound buffer provided. They 39016010a451SAlex Elder * also supply outbound data--parameters for the object 39026010a451SAlex Elder * method. Currently if this is present it will be a 39036010a451SAlex Elder * snapshot id. 
390436be9a76SAlex Elder */ 3905ecd4a68aSIlya Dryomov if (outbound) { 3906ecd4a68aSIlya Dryomov if (outbound_size > PAGE_SIZE) 3907ecd4a68aSIlya Dryomov return -E2BIG; 390836be9a76SAlex Elder 3909ecd4a68aSIlya Dryomov req_page = alloc_page(GFP_KERNEL); 3910ecd4a68aSIlya Dryomov if (!req_page) 3911ecd4a68aSIlya Dryomov return -ENOMEM; 391236be9a76SAlex Elder 3913ecd4a68aSIlya Dryomov memcpy(page_address(req_page), outbound, outbound_size); 391404017e29SAlex Elder } 3915430c28c3SAlex Elder 3916ecd4a68aSIlya Dryomov reply_page = alloc_page(GFP_KERNEL); 3917ecd4a68aSIlya Dryomov if (!reply_page) { 3918ecd4a68aSIlya Dryomov if (req_page) 3919ecd4a68aSIlya Dryomov __free_page(req_page); 3920ecd4a68aSIlya Dryomov return -ENOMEM; 3921ecd4a68aSIlya Dryomov } 392236be9a76SAlex Elder 3923ecd4a68aSIlya Dryomov ret = ceph_osdc_call(osdc, oid, oloc, RBD_DRV_NAME, method_name, 3924ecd4a68aSIlya Dryomov CEPH_OSD_FLAG_READ, req_page, outbound_size, 3925ecd4a68aSIlya Dryomov reply_page, &inbound_size); 3926ecd4a68aSIlya Dryomov if (!ret) { 3927ecd4a68aSIlya Dryomov memcpy(inbound, page_address(reply_page), inbound_size); 3928ecd4a68aSIlya Dryomov ret = inbound_size; 3929ecd4a68aSIlya Dryomov } 393057385b51SAlex Elder 3931ecd4a68aSIlya Dryomov if (req_page) 3932ecd4a68aSIlya Dryomov __free_page(req_page); 3933ecd4a68aSIlya Dryomov __free_page(reply_page); 393436be9a76SAlex Elder return ret; 393536be9a76SAlex Elder } 393636be9a76SAlex Elder 3937ed95b21aSIlya Dryomov /* 3938ed95b21aSIlya Dryomov * lock_rwsem must be held for read 3939ed95b21aSIlya Dryomov */ 3940ed95b21aSIlya Dryomov static void rbd_wait_state_locked(struct rbd_device *rbd_dev) 3941ed95b21aSIlya Dryomov { 3942ed95b21aSIlya Dryomov DEFINE_WAIT(wait); 3943ed95b21aSIlya Dryomov 3944ed95b21aSIlya Dryomov do { 3945ed95b21aSIlya Dryomov /* 3946ed95b21aSIlya Dryomov * Note the use of mod_delayed_work() in rbd_acquire_lock() 3947ed95b21aSIlya Dryomov * and cancel_delayed_work() in wake_requests(). 
3948ed95b21aSIlya Dryomov */ 3949ed95b21aSIlya Dryomov dout("%s rbd_dev %p queueing lock_dwork\n", __func__, rbd_dev); 3950ed95b21aSIlya Dryomov queue_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0); 3951ed95b21aSIlya Dryomov prepare_to_wait_exclusive(&rbd_dev->lock_waitq, &wait, 3952ed95b21aSIlya Dryomov TASK_UNINTERRUPTIBLE); 3953ed95b21aSIlya Dryomov up_read(&rbd_dev->lock_rwsem); 3954ed95b21aSIlya Dryomov schedule(); 3955ed95b21aSIlya Dryomov down_read(&rbd_dev->lock_rwsem); 395687c0fdedSIlya Dryomov } while (rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED && 395787c0fdedSIlya Dryomov !test_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags)); 395887c0fdedSIlya Dryomov 3959ed95b21aSIlya Dryomov finish_wait(&rbd_dev->lock_waitq, &wait); 3960ed95b21aSIlya Dryomov } 3961ed95b21aSIlya Dryomov 39627ad18afaSChristoph Hellwig static void rbd_queue_workfn(struct work_struct *work) 3963bc1ecc65SIlya Dryomov { 39647ad18afaSChristoph Hellwig struct request *rq = blk_mq_rq_from_pdu(work); 39657ad18afaSChristoph Hellwig struct rbd_device *rbd_dev = rq->q->queuedata; 3966bc1ecc65SIlya Dryomov struct rbd_img_request *img_request; 39674e752f0aSJosh Durgin struct ceph_snap_context *snapc = NULL; 3968bc1ecc65SIlya Dryomov u64 offset = (u64)blk_rq_pos(rq) << SECTOR_SHIFT; 3969bc1ecc65SIlya Dryomov u64 length = blk_rq_bytes(rq); 39706d2940c8SGuangliang Zhao enum obj_operation_type op_type; 39714e752f0aSJosh Durgin u64 mapping_size; 397280de1912SIlya Dryomov bool must_be_locked; 3973bc1ecc65SIlya Dryomov int result; 3974bc1ecc65SIlya Dryomov 3975aebf526bSChristoph Hellwig switch (req_op(rq)) { 3976aebf526bSChristoph Hellwig case REQ_OP_DISCARD: 3977aebf526bSChristoph Hellwig op_type = OBJ_OP_DISCARD; 3978aebf526bSChristoph Hellwig break; 3979aebf526bSChristoph Hellwig case REQ_OP_WRITE: 3980aebf526bSChristoph Hellwig op_type = OBJ_OP_WRITE; 3981aebf526bSChristoph Hellwig break; 3982aebf526bSChristoph Hellwig case REQ_OP_READ: 3983aebf526bSChristoph Hellwig op_type = OBJ_OP_READ; 
3984aebf526bSChristoph Hellwig break; 3985aebf526bSChristoph Hellwig default: 3986aebf526bSChristoph Hellwig dout("%s: non-fs request type %d\n", __func__, req_op(rq)); 39877ad18afaSChristoph Hellwig result = -EIO; 39887ad18afaSChristoph Hellwig goto err; 39897ad18afaSChristoph Hellwig } 39907ad18afaSChristoph Hellwig 3991bc1ecc65SIlya Dryomov /* Ignore/skip any zero-length requests */ 3992bc1ecc65SIlya Dryomov 3993bc1ecc65SIlya Dryomov if (!length) { 3994bc1ecc65SIlya Dryomov dout("%s: zero-length request\n", __func__); 3995bc1ecc65SIlya Dryomov result = 0; 3996bc1ecc65SIlya Dryomov goto err_rq; 3997bc1ecc65SIlya Dryomov } 3998bc1ecc65SIlya Dryomov 39996d2940c8SGuangliang Zhao /* Only reads are allowed to a read-only device */ 4000bc1ecc65SIlya Dryomov 40016d2940c8SGuangliang Zhao if (op_type != OBJ_OP_READ) { 4002bc1ecc65SIlya Dryomov if (rbd_dev->mapping.read_only) { 4003bc1ecc65SIlya Dryomov result = -EROFS; 4004bc1ecc65SIlya Dryomov goto err_rq; 4005bc1ecc65SIlya Dryomov } 4006bc1ecc65SIlya Dryomov rbd_assert(rbd_dev->spec->snap_id == CEPH_NOSNAP); 4007bc1ecc65SIlya Dryomov } 4008bc1ecc65SIlya Dryomov 4009bc1ecc65SIlya Dryomov /* 4010bc1ecc65SIlya Dryomov * Quit early if the mapped snapshot no longer exists. It's 4011bc1ecc65SIlya Dryomov * still possible the snapshot will have disappeared by the 4012bc1ecc65SIlya Dryomov * time our request arrives at the osd, but there's no sense in 4013bc1ecc65SIlya Dryomov * sending it if we already know. 
4014bc1ecc65SIlya Dryomov */ 4015bc1ecc65SIlya Dryomov if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) { 4016bc1ecc65SIlya Dryomov dout("request for non-existent snapshot"); 4017bc1ecc65SIlya Dryomov rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP); 4018bc1ecc65SIlya Dryomov result = -ENXIO; 4019bc1ecc65SIlya Dryomov goto err_rq; 4020bc1ecc65SIlya Dryomov } 4021bc1ecc65SIlya Dryomov 4022bc1ecc65SIlya Dryomov if (offset && length > U64_MAX - offset + 1) { 4023bc1ecc65SIlya Dryomov rbd_warn(rbd_dev, "bad request range (%llu~%llu)", offset, 4024bc1ecc65SIlya Dryomov length); 4025bc1ecc65SIlya Dryomov result = -EINVAL; 4026bc1ecc65SIlya Dryomov goto err_rq; /* Shouldn't happen */ 4027bc1ecc65SIlya Dryomov } 4028bc1ecc65SIlya Dryomov 40297ad18afaSChristoph Hellwig blk_mq_start_request(rq); 40307ad18afaSChristoph Hellwig 40314e752f0aSJosh Durgin down_read(&rbd_dev->header_rwsem); 40324e752f0aSJosh Durgin mapping_size = rbd_dev->mapping.size; 40336d2940c8SGuangliang Zhao if (op_type != OBJ_OP_READ) { 40344e752f0aSJosh Durgin snapc = rbd_dev->header.snapc; 40354e752f0aSJosh Durgin ceph_get_snap_context(snapc); 4036ed95b21aSIlya Dryomov must_be_locked = rbd_is_lock_supported(rbd_dev); 403780de1912SIlya Dryomov } else { 403880de1912SIlya Dryomov must_be_locked = rbd_dev->opts->lock_on_read && 403980de1912SIlya Dryomov rbd_is_lock_supported(rbd_dev); 40404e752f0aSJosh Durgin } 40414e752f0aSJosh Durgin up_read(&rbd_dev->header_rwsem); 40424e752f0aSJosh Durgin 40434e752f0aSJosh Durgin if (offset + length > mapping_size) { 4044bc1ecc65SIlya Dryomov rbd_warn(rbd_dev, "beyond EOD (%llu~%llu > %llu)", offset, 40454e752f0aSJosh Durgin length, mapping_size); 4046bc1ecc65SIlya Dryomov result = -EIO; 4047bc1ecc65SIlya Dryomov goto err_rq; 4048bc1ecc65SIlya Dryomov } 4049bc1ecc65SIlya Dryomov 4050ed95b21aSIlya Dryomov if (must_be_locked) { 4051ed95b21aSIlya Dryomov down_read(&rbd_dev->lock_rwsem); 405287c0fdedSIlya Dryomov if (rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED && 
405387c0fdedSIlya Dryomov !test_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags)) 4054ed95b21aSIlya Dryomov rbd_wait_state_locked(rbd_dev); 405587c0fdedSIlya Dryomov 405687c0fdedSIlya Dryomov WARN_ON((rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED) ^ 405787c0fdedSIlya Dryomov !test_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags)); 405887c0fdedSIlya Dryomov if (test_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags)) { 405987c0fdedSIlya Dryomov result = -EBLACKLISTED; 406087c0fdedSIlya Dryomov goto err_unlock; 406187c0fdedSIlya Dryomov } 4062ed95b21aSIlya Dryomov } 4063ed95b21aSIlya Dryomov 40646d2940c8SGuangliang Zhao img_request = rbd_img_request_create(rbd_dev, offset, length, op_type, 40654e752f0aSJosh Durgin snapc); 4066bc1ecc65SIlya Dryomov if (!img_request) { 4067bc1ecc65SIlya Dryomov result = -ENOMEM; 4068ed95b21aSIlya Dryomov goto err_unlock; 4069bc1ecc65SIlya Dryomov } 4070bc1ecc65SIlya Dryomov img_request->rq = rq; 407170b16db8SIlya Dryomov snapc = NULL; /* img_request consumes a ref */ 4072bc1ecc65SIlya Dryomov 407390e98c52SGuangliang Zhao if (op_type == OBJ_OP_DISCARD) 407490e98c52SGuangliang Zhao result = rbd_img_request_fill(img_request, OBJ_REQUEST_NODATA, 407590e98c52SGuangliang Zhao NULL); 407690e98c52SGuangliang Zhao else 407790e98c52SGuangliang Zhao result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO, 407890e98c52SGuangliang Zhao rq->bio); 4079bc1ecc65SIlya Dryomov if (result) 4080bc1ecc65SIlya Dryomov goto err_img_request; 4081bc1ecc65SIlya Dryomov 4082bc1ecc65SIlya Dryomov result = rbd_img_request_submit(img_request); 4083bc1ecc65SIlya Dryomov if (result) 4084bc1ecc65SIlya Dryomov goto err_img_request; 4085bc1ecc65SIlya Dryomov 4086ed95b21aSIlya Dryomov if (must_be_locked) 4087ed95b21aSIlya Dryomov up_read(&rbd_dev->lock_rwsem); 4088bc1ecc65SIlya Dryomov return; 4089bc1ecc65SIlya Dryomov 4090bc1ecc65SIlya Dryomov err_img_request: 4091bc1ecc65SIlya Dryomov rbd_img_request_put(img_request); 4092ed95b21aSIlya Dryomov err_unlock: 4093ed95b21aSIlya 
Dryomov if (must_be_locked) 4094ed95b21aSIlya Dryomov up_read(&rbd_dev->lock_rwsem); 4095bc1ecc65SIlya Dryomov err_rq: 4096bc1ecc65SIlya Dryomov if (result) 4097bc1ecc65SIlya Dryomov rbd_warn(rbd_dev, "%s %llx at %llx result %d", 40986d2940c8SGuangliang Zhao obj_op_name(op_type), length, offset, result); 40994e752f0aSJosh Durgin ceph_put_snap_context(snapc); 41007ad18afaSChristoph Hellwig err: 41017ad18afaSChristoph Hellwig blk_mq_end_request(rq, result); 4102bc1ecc65SIlya Dryomov } 4103bc1ecc65SIlya Dryomov 41047ad18afaSChristoph Hellwig static int rbd_queue_rq(struct blk_mq_hw_ctx *hctx, 41057ad18afaSChristoph Hellwig const struct blk_mq_queue_data *bd) 4106bc1ecc65SIlya Dryomov { 41077ad18afaSChristoph Hellwig struct request *rq = bd->rq; 41087ad18afaSChristoph Hellwig struct work_struct *work = blk_mq_rq_to_pdu(rq); 4109bc1ecc65SIlya Dryomov 41107ad18afaSChristoph Hellwig queue_work(rbd_wq, work); 41117ad18afaSChristoph Hellwig return BLK_MQ_RQ_QUEUE_OK; 4112bf0d5f50SAlex Elder } 4113bf0d5f50SAlex Elder 4114602adf40SYehuda Sadeh static void rbd_free_disk(struct rbd_device *rbd_dev) 4115602adf40SYehuda Sadeh { 41165769ed0cSIlya Dryomov blk_cleanup_queue(rbd_dev->disk->queue); 41177ad18afaSChristoph Hellwig blk_mq_free_tag_set(&rbd_dev->tag_set); 41185769ed0cSIlya Dryomov put_disk(rbd_dev->disk); 41195769ed0cSIlya Dryomov rbd_dev->disk = NULL; 4120602adf40SYehuda Sadeh } 4121602adf40SYehuda Sadeh 4122788e2df3SAlex Elder static int rbd_obj_read_sync(struct rbd_device *rbd_dev, 4123fe5478e0SIlya Dryomov struct ceph_object_id *oid, 4124fe5478e0SIlya Dryomov struct ceph_object_locator *oloc, 4125fe5478e0SIlya Dryomov void *buf, int buf_len) 4126788e2df3SAlex Elder 4127788e2df3SAlex Elder { 4128fe5478e0SIlya Dryomov struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 4129fe5478e0SIlya Dryomov struct ceph_osd_request *req; 4130fe5478e0SIlya Dryomov struct page **pages; 4131fe5478e0SIlya Dryomov int num_pages = calc_pages_for(0, buf_len); 
4132788e2df3SAlex Elder int ret; 4133788e2df3SAlex Elder 4134fe5478e0SIlya Dryomov req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_KERNEL); 4135fe5478e0SIlya Dryomov if (!req) 4136fe5478e0SIlya Dryomov return -ENOMEM; 4137788e2df3SAlex Elder 4138fe5478e0SIlya Dryomov ceph_oid_copy(&req->r_base_oid, oid); 4139fe5478e0SIlya Dryomov ceph_oloc_copy(&req->r_base_oloc, oloc); 4140fe5478e0SIlya Dryomov req->r_flags = CEPH_OSD_FLAG_READ; 4141788e2df3SAlex Elder 4142fe5478e0SIlya Dryomov ret = ceph_osdc_alloc_messages(req, GFP_KERNEL); 4143788e2df3SAlex Elder if (ret) 4144fe5478e0SIlya Dryomov goto out_req; 4145788e2df3SAlex Elder 4146fe5478e0SIlya Dryomov pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL); 4147fe5478e0SIlya Dryomov if (IS_ERR(pages)) { 4148fe5478e0SIlya Dryomov ret = PTR_ERR(pages); 4149fe5478e0SIlya Dryomov goto out_req; 4150fe5478e0SIlya Dryomov } 41511ceae7efSAlex Elder 4152fe5478e0SIlya Dryomov osd_req_op_extent_init(req, 0, CEPH_OSD_OP_READ, 0, buf_len, 0, 0); 4153fe5478e0SIlya Dryomov osd_req_op_extent_osd_data_pages(req, 0, pages, buf_len, 0, false, 4154fe5478e0SIlya Dryomov true); 4155788e2df3SAlex Elder 4156fe5478e0SIlya Dryomov ceph_osdc_start_request(osdc, req, false); 4157fe5478e0SIlya Dryomov ret = ceph_osdc_wait_request(osdc, req); 4158fe5478e0SIlya Dryomov if (ret >= 0) 4159fe5478e0SIlya Dryomov ceph_copy_from_page_vector(pages, buf, 0, ret); 4160fe5478e0SIlya Dryomov 4161fe5478e0SIlya Dryomov out_req: 4162fe5478e0SIlya Dryomov ceph_osdc_put_request(req); 4163788e2df3SAlex Elder return ret; 4164788e2df3SAlex Elder } 4165788e2df3SAlex Elder 4166602adf40SYehuda Sadeh /* 4167662518b1SAlex Elder * Read the complete header for the given rbd device. On successful 4168662518b1SAlex Elder * return, the rbd_dev->header field will contain up-to-date 4169662518b1SAlex Elder * information about the image. 
41704156d998SAlex Elder */ 417199a41ebcSAlex Elder static int rbd_dev_v1_header_info(struct rbd_device *rbd_dev) 41724156d998SAlex Elder { 41734156d998SAlex Elder struct rbd_image_header_ondisk *ondisk = NULL; 41744156d998SAlex Elder u32 snap_count = 0; 41754156d998SAlex Elder u64 names_size = 0; 41764156d998SAlex Elder u32 want_count; 41774156d998SAlex Elder int ret; 41784156d998SAlex Elder 41794156d998SAlex Elder /* 41804156d998SAlex Elder * The complete header will include an array of its 64-bit 41814156d998SAlex Elder * snapshot ids, followed by the names of those snapshots as 41824156d998SAlex Elder * a contiguous block of NUL-terminated strings. Note that 41834156d998SAlex Elder * the number of snapshots could change by the time we read 41844156d998SAlex Elder * it in, in which case we re-read it. 41854156d998SAlex Elder */ 41864156d998SAlex Elder do { 41874156d998SAlex Elder size_t size; 41884156d998SAlex Elder 41894156d998SAlex Elder kfree(ondisk); 41904156d998SAlex Elder 41914156d998SAlex Elder size = sizeof (*ondisk); 41924156d998SAlex Elder size += snap_count * sizeof (struct rbd_image_snap_ondisk); 41934156d998SAlex Elder size += names_size; 41944156d998SAlex Elder ondisk = kmalloc(size, GFP_KERNEL); 41954156d998SAlex Elder if (!ondisk) 4196662518b1SAlex Elder return -ENOMEM; 41974156d998SAlex Elder 4198fe5478e0SIlya Dryomov ret = rbd_obj_read_sync(rbd_dev, &rbd_dev->header_oid, 4199fe5478e0SIlya Dryomov &rbd_dev->header_oloc, ondisk, size); 42004156d998SAlex Elder if (ret < 0) 4201662518b1SAlex Elder goto out; 4202c0cd10dbSAlex Elder if ((size_t)ret < size) { 42034156d998SAlex Elder ret = -ENXIO; 420406ecc6cbSAlex Elder rbd_warn(rbd_dev, "short header read (want %zd got %d)", 420506ecc6cbSAlex Elder size, ret); 4206662518b1SAlex Elder goto out; 42074156d998SAlex Elder } 42084156d998SAlex Elder if (!rbd_dev_ondisk_valid(ondisk)) { 42094156d998SAlex Elder ret = -ENXIO; 421006ecc6cbSAlex Elder rbd_warn(rbd_dev, "invalid header"); 4211662518b1SAlex Elder 
goto out; 42124156d998SAlex Elder } 42134156d998SAlex Elder 42144156d998SAlex Elder names_size = le64_to_cpu(ondisk->snap_names_len); 42154156d998SAlex Elder want_count = snap_count; 42164156d998SAlex Elder snap_count = le32_to_cpu(ondisk->snap_count); 42174156d998SAlex Elder } while (snap_count != want_count); 42184156d998SAlex Elder 4219662518b1SAlex Elder ret = rbd_header_from_disk(rbd_dev, ondisk); 4220662518b1SAlex Elder out: 42214156d998SAlex Elder kfree(ondisk); 42224156d998SAlex Elder 4223dfc5606dSYehuda Sadeh return ret; 4224602adf40SYehuda Sadeh } 4225602adf40SYehuda Sadeh 422615228edeSAlex Elder /* 422715228edeSAlex Elder * Clear the rbd device's EXISTS flag if the snapshot it's mapped to 422815228edeSAlex Elder * has disappeared from the (just updated) snapshot context. 422915228edeSAlex Elder */ 423015228edeSAlex Elder static void rbd_exists_validate(struct rbd_device *rbd_dev) 423115228edeSAlex Elder { 423215228edeSAlex Elder u64 snap_id; 423315228edeSAlex Elder 423415228edeSAlex Elder if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) 423515228edeSAlex Elder return; 423615228edeSAlex Elder 423715228edeSAlex Elder snap_id = rbd_dev->spec->snap_id; 423815228edeSAlex Elder if (snap_id == CEPH_NOSNAP) 423915228edeSAlex Elder return; 424015228edeSAlex Elder 424115228edeSAlex Elder if (rbd_dev_snap_index(rbd_dev, snap_id) == BAD_SNAP_INDEX) 424215228edeSAlex Elder clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags); 424315228edeSAlex Elder } 424415228edeSAlex Elder 42459875201eSJosh Durgin static void rbd_dev_update_size(struct rbd_device *rbd_dev) 42469875201eSJosh Durgin { 42479875201eSJosh Durgin sector_t size; 42489875201eSJosh Durgin 42499875201eSJosh Durgin /* 4250811c6688SIlya Dryomov * If EXISTS is not set, rbd_dev->disk may be NULL, so don't 4251811c6688SIlya Dryomov * try to update its size. If REMOVING is set, updating size 4252811c6688SIlya Dryomov * is just useless work since the device can't be opened. 
42539875201eSJosh Durgin */ 4254811c6688SIlya Dryomov if (test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags) && 4255811c6688SIlya Dryomov !test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags)) { 42569875201eSJosh Durgin size = (sector_t)rbd_dev->mapping.size / SECTOR_SIZE; 42579875201eSJosh Durgin dout("setting size to %llu sectors", (unsigned long long)size); 42589875201eSJosh Durgin set_capacity(rbd_dev->disk, size); 42599875201eSJosh Durgin revalidate_disk(rbd_dev->disk); 42609875201eSJosh Durgin } 42619875201eSJosh Durgin } 42629875201eSJosh Durgin 4263cc4a38bdSAlex Elder static int rbd_dev_refresh(struct rbd_device *rbd_dev) 42641fe5e993SAlex Elder { 4265e627db08SAlex Elder u64 mapping_size; 42661fe5e993SAlex Elder int ret; 42671fe5e993SAlex Elder 4268cfbf6377SAlex Elder down_write(&rbd_dev->header_rwsem); 42693b5cf2a2SAlex Elder mapping_size = rbd_dev->mapping.size; 4270a720ae09SIlya Dryomov 4271a720ae09SIlya Dryomov ret = rbd_dev_header_info(rbd_dev); 427252bb1f9bSIlya Dryomov if (ret) 427373e39e4dSIlya Dryomov goto out; 427415228edeSAlex Elder 4275e8f59b59SIlya Dryomov /* 4276e8f59b59SIlya Dryomov * If there is a parent, see if it has disappeared due to the 4277e8f59b59SIlya Dryomov * mapped image getting flattened. 
4278e8f59b59SIlya Dryomov */ 4279e8f59b59SIlya Dryomov if (rbd_dev->parent) { 4280e8f59b59SIlya Dryomov ret = rbd_dev_v2_parent_info(rbd_dev); 4281e8f59b59SIlya Dryomov if (ret) 428273e39e4dSIlya Dryomov goto out; 4283e8f59b59SIlya Dryomov } 4284e8f59b59SIlya Dryomov 42855ff1108cSIlya Dryomov if (rbd_dev->spec->snap_id == CEPH_NOSNAP) { 42865ff1108cSIlya Dryomov rbd_dev->mapping.size = rbd_dev->header.image_size; 42875ff1108cSIlya Dryomov } else { 42885ff1108cSIlya Dryomov /* validate mapped snapshot's EXISTS flag */ 428915228edeSAlex Elder rbd_exists_validate(rbd_dev); 42905ff1108cSIlya Dryomov } 42915ff1108cSIlya Dryomov 429273e39e4dSIlya Dryomov out: 4293cfbf6377SAlex Elder up_write(&rbd_dev->header_rwsem); 429473e39e4dSIlya Dryomov if (!ret && mapping_size != rbd_dev->mapping.size) 42959875201eSJosh Durgin rbd_dev_update_size(rbd_dev); 42961fe5e993SAlex Elder 429773e39e4dSIlya Dryomov return ret; 42981fe5e993SAlex Elder } 42991fe5e993SAlex Elder 43007ad18afaSChristoph Hellwig static int rbd_init_request(void *data, struct request *rq, 43017ad18afaSChristoph Hellwig unsigned int hctx_idx, unsigned int request_idx, 43027ad18afaSChristoph Hellwig unsigned int numa_node) 43037ad18afaSChristoph Hellwig { 43047ad18afaSChristoph Hellwig struct work_struct *work = blk_mq_rq_to_pdu(rq); 43057ad18afaSChristoph Hellwig 43067ad18afaSChristoph Hellwig INIT_WORK(work, rbd_queue_workfn); 43077ad18afaSChristoph Hellwig return 0; 43087ad18afaSChristoph Hellwig } 43097ad18afaSChristoph Hellwig 43107ad18afaSChristoph Hellwig static struct blk_mq_ops rbd_mq_ops = { 43117ad18afaSChristoph Hellwig .queue_rq = rbd_queue_rq, 43127ad18afaSChristoph Hellwig .init_request = rbd_init_request, 43137ad18afaSChristoph Hellwig }; 43147ad18afaSChristoph Hellwig 4315602adf40SYehuda Sadeh static int rbd_init_disk(struct rbd_device *rbd_dev) 4316602adf40SYehuda Sadeh { 4317602adf40SYehuda Sadeh struct gendisk *disk; 4318602adf40SYehuda Sadeh struct request_queue *q; 4319593a9e7bSAlex Elder u64 
segment_size; 43207ad18afaSChristoph Hellwig int err; 4321602adf40SYehuda Sadeh 4322602adf40SYehuda Sadeh /* create gendisk info */ 43237e513d43SIlya Dryomov disk = alloc_disk(single_major ? 43247e513d43SIlya Dryomov (1 << RBD_SINGLE_MAJOR_PART_SHIFT) : 43257e513d43SIlya Dryomov RBD_MINORS_PER_MAJOR); 4326602adf40SYehuda Sadeh if (!disk) 43271fcdb8aaSAlex Elder return -ENOMEM; 4328602adf40SYehuda Sadeh 4329f0f8cef5SAlex Elder snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d", 4330de71a297SAlex Elder rbd_dev->dev_id); 4331602adf40SYehuda Sadeh disk->major = rbd_dev->major; 4332dd82fff1SIlya Dryomov disk->first_minor = rbd_dev->minor; 43337e513d43SIlya Dryomov if (single_major) 43347e513d43SIlya Dryomov disk->flags |= GENHD_FL_EXT_DEVT; 4335602adf40SYehuda Sadeh disk->fops = &rbd_bd_ops; 4336602adf40SYehuda Sadeh disk->private_data = rbd_dev; 4337602adf40SYehuda Sadeh 43387ad18afaSChristoph Hellwig memset(&rbd_dev->tag_set, 0, sizeof(rbd_dev->tag_set)); 43397ad18afaSChristoph Hellwig rbd_dev->tag_set.ops = &rbd_mq_ops; 4340b5584180SIlya Dryomov rbd_dev->tag_set.queue_depth = rbd_dev->opts->queue_depth; 43417ad18afaSChristoph Hellwig rbd_dev->tag_set.numa_node = NUMA_NO_NODE; 4342b5584180SIlya Dryomov rbd_dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE; 43437ad18afaSChristoph Hellwig rbd_dev->tag_set.nr_hw_queues = 1; 43447ad18afaSChristoph Hellwig rbd_dev->tag_set.cmd_size = sizeof(struct work_struct); 43457ad18afaSChristoph Hellwig 43467ad18afaSChristoph Hellwig err = blk_mq_alloc_tag_set(&rbd_dev->tag_set); 43477ad18afaSChristoph Hellwig if (err) 4348602adf40SYehuda Sadeh goto out_disk; 4349029bcbd8SJosh Durgin 43507ad18afaSChristoph Hellwig q = blk_mq_init_queue(&rbd_dev->tag_set); 43517ad18afaSChristoph Hellwig if (IS_ERR(q)) { 43527ad18afaSChristoph Hellwig err = PTR_ERR(q); 43537ad18afaSChristoph Hellwig goto out_tag_set; 43547ad18afaSChristoph Hellwig } 43557ad18afaSChristoph Hellwig 4356d8a2c89cSIlya Dryomov 
queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q); 4357d8a2c89cSIlya Dryomov /* QUEUE_FLAG_ADD_RANDOM is off by default for blk-mq */ 4358593a9e7bSAlex Elder 4359029bcbd8SJosh Durgin /* set io sizes to object size */ 4360593a9e7bSAlex Elder segment_size = rbd_obj_bytes(&rbd_dev->header); 4361593a9e7bSAlex Elder blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE); 43620d9fde4fSIlya Dryomov q->limits.max_sectors = queue_max_hw_sectors(q); 4363d3834fefSIlya Dryomov blk_queue_max_segments(q, segment_size / SECTOR_SIZE); 4364593a9e7bSAlex Elder blk_queue_max_segment_size(q, segment_size); 4365593a9e7bSAlex Elder blk_queue_io_min(q, segment_size); 4366593a9e7bSAlex Elder blk_queue_io_opt(q, segment_size); 4367029bcbd8SJosh Durgin 436890e98c52SGuangliang Zhao /* enable the discard support */ 436990e98c52SGuangliang Zhao queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q); 437090e98c52SGuangliang Zhao q->limits.discard_granularity = segment_size; 437190e98c52SGuangliang Zhao q->limits.discard_alignment = segment_size; 43722bb4cd5cSJens Axboe blk_queue_max_discard_sectors(q, segment_size / SECTOR_SIZE); 4373b76f8239SJosh Durgin q->limits.discard_zeroes_data = 1; 437490e98c52SGuangliang Zhao 4375bae818eeSRonny Hegewald if (!ceph_test_opt(rbd_dev->rbd_client->client, NOCRC)) 4376dc3b17ccSJan Kara q->backing_dev_info->capabilities |= BDI_CAP_STABLE_WRITES; 4377bae818eeSRonny Hegewald 43785769ed0cSIlya Dryomov /* 43795769ed0cSIlya Dryomov * disk_release() expects a queue ref from add_disk() and will 43805769ed0cSIlya Dryomov * put it. Hold an extra ref until add_disk() is called. 
43815769ed0cSIlya Dryomov */ 43825769ed0cSIlya Dryomov WARN_ON(!blk_get_queue(q)); 4383602adf40SYehuda Sadeh disk->queue = q; 4384602adf40SYehuda Sadeh q->queuedata = rbd_dev; 4385602adf40SYehuda Sadeh 4386602adf40SYehuda Sadeh rbd_dev->disk = disk; 4387602adf40SYehuda Sadeh 4388602adf40SYehuda Sadeh return 0; 43897ad18afaSChristoph Hellwig out_tag_set: 43907ad18afaSChristoph Hellwig blk_mq_free_tag_set(&rbd_dev->tag_set); 4391602adf40SYehuda Sadeh out_disk: 4392602adf40SYehuda Sadeh put_disk(disk); 43937ad18afaSChristoph Hellwig return err; 4394602adf40SYehuda Sadeh } 4395602adf40SYehuda Sadeh 4396dfc5606dSYehuda Sadeh /* 4397dfc5606dSYehuda Sadeh sysfs 4398dfc5606dSYehuda Sadeh */ 4399602adf40SYehuda Sadeh 4400593a9e7bSAlex Elder static struct rbd_device *dev_to_rbd_dev(struct device *dev) 4401593a9e7bSAlex Elder { 4402593a9e7bSAlex Elder return container_of(dev, struct rbd_device, dev); 4403593a9e7bSAlex Elder } 4404593a9e7bSAlex Elder 4405dfc5606dSYehuda Sadeh static ssize_t rbd_size_show(struct device *dev, 4406dfc5606dSYehuda Sadeh struct device_attribute *attr, char *buf) 4407602adf40SYehuda Sadeh { 4408593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 4409dfc5606dSYehuda Sadeh 4410fc71d833SAlex Elder return sprintf(buf, "%llu\n", 4411fc71d833SAlex Elder (unsigned long long)rbd_dev->mapping.size); 4412602adf40SYehuda Sadeh } 4413602adf40SYehuda Sadeh 441434b13184SAlex Elder /* 441534b13184SAlex Elder * Note this shows the features for whatever's mapped, which is not 441634b13184SAlex Elder * necessarily the base image. 
441734b13184SAlex Elder */ 441834b13184SAlex Elder static ssize_t rbd_features_show(struct device *dev, 441934b13184SAlex Elder struct device_attribute *attr, char *buf) 442034b13184SAlex Elder { 442134b13184SAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 442234b13184SAlex Elder 442334b13184SAlex Elder return sprintf(buf, "0x%016llx\n", 442434b13184SAlex Elder (unsigned long long)rbd_dev->mapping.features); 442534b13184SAlex Elder } 442634b13184SAlex Elder 4427dfc5606dSYehuda Sadeh static ssize_t rbd_major_show(struct device *dev, 4428dfc5606dSYehuda Sadeh struct device_attribute *attr, char *buf) 4429602adf40SYehuda Sadeh { 4430593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 4431dfc5606dSYehuda Sadeh 4432fc71d833SAlex Elder if (rbd_dev->major) 4433dfc5606dSYehuda Sadeh return sprintf(buf, "%d\n", rbd_dev->major); 4434fc71d833SAlex Elder 4435fc71d833SAlex Elder return sprintf(buf, "(none)\n"); 4436dd82fff1SIlya Dryomov } 4437fc71d833SAlex Elder 4438dd82fff1SIlya Dryomov static ssize_t rbd_minor_show(struct device *dev, 4439dd82fff1SIlya Dryomov struct device_attribute *attr, char *buf) 4440dd82fff1SIlya Dryomov { 4441dd82fff1SIlya Dryomov struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 4442dd82fff1SIlya Dryomov 4443dd82fff1SIlya Dryomov return sprintf(buf, "%d\n", rbd_dev->minor); 4444dfc5606dSYehuda Sadeh } 4445dfc5606dSYehuda Sadeh 4446005a07bfSIlya Dryomov static ssize_t rbd_client_addr_show(struct device *dev, 4447005a07bfSIlya Dryomov struct device_attribute *attr, char *buf) 4448005a07bfSIlya Dryomov { 4449005a07bfSIlya Dryomov struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 4450005a07bfSIlya Dryomov struct ceph_entity_addr *client_addr = 4451005a07bfSIlya Dryomov ceph_client_addr(rbd_dev->rbd_client->client); 4452005a07bfSIlya Dryomov 4453005a07bfSIlya Dryomov return sprintf(buf, "%pISpc/%u\n", &client_addr->in_addr, 4454005a07bfSIlya Dryomov le32_to_cpu(client_addr->nonce)); 4455005a07bfSIlya Dryomov } 
4456005a07bfSIlya Dryomov 4457dfc5606dSYehuda Sadeh static ssize_t rbd_client_id_show(struct device *dev, 4458dfc5606dSYehuda Sadeh struct device_attribute *attr, char *buf) 4459dfc5606dSYehuda Sadeh { 4460593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 4461dfc5606dSYehuda Sadeh 44621dbb4399SAlex Elder return sprintf(buf, "client%lld\n", 4463033268a5SIlya Dryomov ceph_client_gid(rbd_dev->rbd_client->client)); 4464dfc5606dSYehuda Sadeh } 4465dfc5606dSYehuda Sadeh 4466267fb90bSMike Christie static ssize_t rbd_cluster_fsid_show(struct device *dev, 4467267fb90bSMike Christie struct device_attribute *attr, char *buf) 4468267fb90bSMike Christie { 4469267fb90bSMike Christie struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 4470267fb90bSMike Christie 4471267fb90bSMike Christie return sprintf(buf, "%pU\n", &rbd_dev->rbd_client->client->fsid); 4472267fb90bSMike Christie } 4473267fb90bSMike Christie 44740d6d1e9cSMike Christie static ssize_t rbd_config_info_show(struct device *dev, 44750d6d1e9cSMike Christie struct device_attribute *attr, char *buf) 44760d6d1e9cSMike Christie { 44770d6d1e9cSMike Christie struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 44780d6d1e9cSMike Christie 44790d6d1e9cSMike Christie return sprintf(buf, "%s\n", rbd_dev->config_info); 4480dfc5606dSYehuda Sadeh } 4481dfc5606dSYehuda Sadeh 4482dfc5606dSYehuda Sadeh static ssize_t rbd_pool_show(struct device *dev, 4483dfc5606dSYehuda Sadeh struct device_attribute *attr, char *buf) 4484dfc5606dSYehuda Sadeh { 4485593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 4486dfc5606dSYehuda Sadeh 44870d7dbfceSAlex Elder return sprintf(buf, "%s\n", rbd_dev->spec->pool_name); 4488dfc5606dSYehuda Sadeh } 4489dfc5606dSYehuda Sadeh 44909bb2f334SAlex Elder static ssize_t rbd_pool_id_show(struct device *dev, 44919bb2f334SAlex Elder struct device_attribute *attr, char *buf) 44929bb2f334SAlex Elder { 44939bb2f334SAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 
44949bb2f334SAlex Elder 44950d7dbfceSAlex Elder return sprintf(buf, "%llu\n", 44960d7dbfceSAlex Elder (unsigned long long) rbd_dev->spec->pool_id); 44979bb2f334SAlex Elder } 44989bb2f334SAlex Elder 4499dfc5606dSYehuda Sadeh static ssize_t rbd_name_show(struct device *dev, 4500dfc5606dSYehuda Sadeh struct device_attribute *attr, char *buf) 4501dfc5606dSYehuda Sadeh { 4502593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 4503dfc5606dSYehuda Sadeh 4504a92ffdf8SAlex Elder if (rbd_dev->spec->image_name) 45050d7dbfceSAlex Elder return sprintf(buf, "%s\n", rbd_dev->spec->image_name); 4506a92ffdf8SAlex Elder 4507a92ffdf8SAlex Elder return sprintf(buf, "(unknown)\n"); 4508dfc5606dSYehuda Sadeh } 4509dfc5606dSYehuda Sadeh 4510589d30e0SAlex Elder static ssize_t rbd_image_id_show(struct device *dev, 4511589d30e0SAlex Elder struct device_attribute *attr, char *buf) 4512589d30e0SAlex Elder { 4513589d30e0SAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 4514589d30e0SAlex Elder 45150d7dbfceSAlex Elder return sprintf(buf, "%s\n", rbd_dev->spec->image_id); 4516589d30e0SAlex Elder } 4517589d30e0SAlex Elder 451834b13184SAlex Elder /* 451934b13184SAlex Elder * Shows the name of the currently-mapped snapshot (or 452034b13184SAlex Elder * RBD_SNAP_HEAD_NAME for the base image). 
452134b13184SAlex Elder */ 4522dfc5606dSYehuda Sadeh static ssize_t rbd_snap_show(struct device *dev, 4523dfc5606dSYehuda Sadeh struct device_attribute *attr, 4524dfc5606dSYehuda Sadeh char *buf) 4525dfc5606dSYehuda Sadeh { 4526593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 4527dfc5606dSYehuda Sadeh 45280d7dbfceSAlex Elder return sprintf(buf, "%s\n", rbd_dev->spec->snap_name); 4529dfc5606dSYehuda Sadeh } 4530dfc5606dSYehuda Sadeh 453192a58671SMike Christie static ssize_t rbd_snap_id_show(struct device *dev, 453292a58671SMike Christie struct device_attribute *attr, char *buf) 453392a58671SMike Christie { 453492a58671SMike Christie struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 453592a58671SMike Christie 453692a58671SMike Christie return sprintf(buf, "%llu\n", rbd_dev->spec->snap_id); 453792a58671SMike Christie } 453892a58671SMike Christie 453986b00e0dSAlex Elder /* 4540ff96128fSIlya Dryomov * For a v2 image, shows the chain of parent images, separated by empty 4541ff96128fSIlya Dryomov * lines. For v1 images or if there is no parent, shows "(no parent 4542ff96128fSIlya Dryomov * image)". 
454386b00e0dSAlex Elder */ 454486b00e0dSAlex Elder static ssize_t rbd_parent_show(struct device *dev, 454586b00e0dSAlex Elder struct device_attribute *attr, 454686b00e0dSAlex Elder char *buf) 454786b00e0dSAlex Elder { 454886b00e0dSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 4549ff96128fSIlya Dryomov ssize_t count = 0; 455086b00e0dSAlex Elder 4551ff96128fSIlya Dryomov if (!rbd_dev->parent) 455286b00e0dSAlex Elder return sprintf(buf, "(no parent image)\n"); 455386b00e0dSAlex Elder 4554ff96128fSIlya Dryomov for ( ; rbd_dev->parent; rbd_dev = rbd_dev->parent) { 4555ff96128fSIlya Dryomov struct rbd_spec *spec = rbd_dev->parent_spec; 455686b00e0dSAlex Elder 4557ff96128fSIlya Dryomov count += sprintf(&buf[count], "%s" 4558ff96128fSIlya Dryomov "pool_id %llu\npool_name %s\n" 4559ff96128fSIlya Dryomov "image_id %s\nimage_name %s\n" 4560ff96128fSIlya Dryomov "snap_id %llu\nsnap_name %s\n" 4561ff96128fSIlya Dryomov "overlap %llu\n", 4562ff96128fSIlya Dryomov !count ? "" : "\n", /* first? 
*/ 4563ff96128fSIlya Dryomov spec->pool_id, spec->pool_name, 4564ff96128fSIlya Dryomov spec->image_id, spec->image_name ?: "(unknown)", 4565ff96128fSIlya Dryomov spec->snap_id, spec->snap_name, 4566ff96128fSIlya Dryomov rbd_dev->parent_overlap); 4567ff96128fSIlya Dryomov } 456886b00e0dSAlex Elder 456986b00e0dSAlex Elder return count; 457086b00e0dSAlex Elder } 457186b00e0dSAlex Elder 4572dfc5606dSYehuda Sadeh static ssize_t rbd_image_refresh(struct device *dev, 4573dfc5606dSYehuda Sadeh struct device_attribute *attr, 4574dfc5606dSYehuda Sadeh const char *buf, 4575dfc5606dSYehuda Sadeh size_t size) 4576dfc5606dSYehuda Sadeh { 4577593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 4578b813623aSAlex Elder int ret; 4579602adf40SYehuda Sadeh 4580cc4a38bdSAlex Elder ret = rbd_dev_refresh(rbd_dev); 4581e627db08SAlex Elder if (ret) 458252bb1f9bSIlya Dryomov return ret; 4583b813623aSAlex Elder 458452bb1f9bSIlya Dryomov return size; 4585dfc5606dSYehuda Sadeh } 4586602adf40SYehuda Sadeh 4587dfc5606dSYehuda Sadeh static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL); 458834b13184SAlex Elder static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL); 4589dfc5606dSYehuda Sadeh static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL); 4590dd82fff1SIlya Dryomov static DEVICE_ATTR(minor, S_IRUGO, rbd_minor_show, NULL); 4591005a07bfSIlya Dryomov static DEVICE_ATTR(client_addr, S_IRUGO, rbd_client_addr_show, NULL); 4592dfc5606dSYehuda Sadeh static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL); 4593267fb90bSMike Christie static DEVICE_ATTR(cluster_fsid, S_IRUGO, rbd_cluster_fsid_show, NULL); 45940d6d1e9cSMike Christie static DEVICE_ATTR(config_info, S_IRUSR, rbd_config_info_show, NULL); 4595dfc5606dSYehuda Sadeh static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL); 45969bb2f334SAlex Elder static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL); 4597dfc5606dSYehuda Sadeh static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL); 4598589d30e0SAlex 
Elder static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL); 4599dfc5606dSYehuda Sadeh static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh); 4600dfc5606dSYehuda Sadeh static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL); 460192a58671SMike Christie static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL); 460286b00e0dSAlex Elder static DEVICE_ATTR(parent, S_IRUGO, rbd_parent_show, NULL); 4603dfc5606dSYehuda Sadeh 4604dfc5606dSYehuda Sadeh static struct attribute *rbd_attrs[] = { 4605dfc5606dSYehuda Sadeh &dev_attr_size.attr, 460634b13184SAlex Elder &dev_attr_features.attr, 4607dfc5606dSYehuda Sadeh &dev_attr_major.attr, 4608dd82fff1SIlya Dryomov &dev_attr_minor.attr, 4609005a07bfSIlya Dryomov &dev_attr_client_addr.attr, 4610dfc5606dSYehuda Sadeh &dev_attr_client_id.attr, 4611267fb90bSMike Christie &dev_attr_cluster_fsid.attr, 46120d6d1e9cSMike Christie &dev_attr_config_info.attr, 4613dfc5606dSYehuda Sadeh &dev_attr_pool.attr, 46149bb2f334SAlex Elder &dev_attr_pool_id.attr, 4615dfc5606dSYehuda Sadeh &dev_attr_name.attr, 4616589d30e0SAlex Elder &dev_attr_image_id.attr, 4617dfc5606dSYehuda Sadeh &dev_attr_current_snap.attr, 461892a58671SMike Christie &dev_attr_snap_id.attr, 461986b00e0dSAlex Elder &dev_attr_parent.attr, 4620dfc5606dSYehuda Sadeh &dev_attr_refresh.attr, 4621dfc5606dSYehuda Sadeh NULL 4622dfc5606dSYehuda Sadeh }; 4623dfc5606dSYehuda Sadeh 4624dfc5606dSYehuda Sadeh static struct attribute_group rbd_attr_group = { 4625dfc5606dSYehuda Sadeh .attrs = rbd_attrs, 4626dfc5606dSYehuda Sadeh }; 4627dfc5606dSYehuda Sadeh 4628dfc5606dSYehuda Sadeh static const struct attribute_group *rbd_attr_groups[] = { 4629dfc5606dSYehuda Sadeh &rbd_attr_group, 4630dfc5606dSYehuda Sadeh NULL 4631dfc5606dSYehuda Sadeh }; 4632dfc5606dSYehuda Sadeh 46336cac4695SIlya Dryomov static void rbd_dev_release(struct device *dev); 4634dfc5606dSYehuda Sadeh 4635b9942bc9SBhumika Goyal static const struct device_type rbd_device_type = { 4636dfc5606dSYehuda Sadeh 
.name = "rbd", 4637dfc5606dSYehuda Sadeh .groups = rbd_attr_groups, 46386cac4695SIlya Dryomov .release = rbd_dev_release, 4639dfc5606dSYehuda Sadeh }; 4640dfc5606dSYehuda Sadeh 46418b8fb99cSAlex Elder static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec) 46428b8fb99cSAlex Elder { 46438b8fb99cSAlex Elder kref_get(&spec->kref); 46448b8fb99cSAlex Elder 46458b8fb99cSAlex Elder return spec; 46468b8fb99cSAlex Elder } 46478b8fb99cSAlex Elder 46488b8fb99cSAlex Elder static void rbd_spec_free(struct kref *kref); 46498b8fb99cSAlex Elder static void rbd_spec_put(struct rbd_spec *spec) 46508b8fb99cSAlex Elder { 46518b8fb99cSAlex Elder if (spec) 46528b8fb99cSAlex Elder kref_put(&spec->kref, rbd_spec_free); 46538b8fb99cSAlex Elder } 46548b8fb99cSAlex Elder 46558b8fb99cSAlex Elder static struct rbd_spec *rbd_spec_alloc(void) 46568b8fb99cSAlex Elder { 46578b8fb99cSAlex Elder struct rbd_spec *spec; 46588b8fb99cSAlex Elder 46598b8fb99cSAlex Elder spec = kzalloc(sizeof (*spec), GFP_KERNEL); 46608b8fb99cSAlex Elder if (!spec) 46618b8fb99cSAlex Elder return NULL; 466204077599SIlya Dryomov 466304077599SIlya Dryomov spec->pool_id = CEPH_NOPOOL; 466404077599SIlya Dryomov spec->snap_id = CEPH_NOSNAP; 46658b8fb99cSAlex Elder kref_init(&spec->kref); 46668b8fb99cSAlex Elder 46678b8fb99cSAlex Elder return spec; 46688b8fb99cSAlex Elder } 46698b8fb99cSAlex Elder 46708b8fb99cSAlex Elder static void rbd_spec_free(struct kref *kref) 46718b8fb99cSAlex Elder { 46728b8fb99cSAlex Elder struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref); 46738b8fb99cSAlex Elder 46748b8fb99cSAlex Elder kfree(spec->pool_name); 46758b8fb99cSAlex Elder kfree(spec->image_id); 46768b8fb99cSAlex Elder kfree(spec->image_name); 46778b8fb99cSAlex Elder kfree(spec->snap_name); 46788b8fb99cSAlex Elder kfree(spec); 46798b8fb99cSAlex Elder } 46808b8fb99cSAlex Elder 46811643dfa4SIlya Dryomov static void rbd_dev_free(struct rbd_device *rbd_dev) 4682dd5ac32dSIlya Dryomov { 468399d16943SIlya Dryomov 
WARN_ON(rbd_dev->watch_state != RBD_WATCH_STATE_UNREGISTERED); 4684ed95b21aSIlya Dryomov WARN_ON(rbd_dev->lock_state != RBD_LOCK_STATE_UNLOCKED); 4685dd5ac32dSIlya Dryomov 4686c41d13a3SIlya Dryomov ceph_oid_destroy(&rbd_dev->header_oid); 46876b6dddbeSIlya Dryomov ceph_oloc_destroy(&rbd_dev->header_oloc); 46880d6d1e9cSMike Christie kfree(rbd_dev->config_info); 4689c41d13a3SIlya Dryomov 4690dd5ac32dSIlya Dryomov rbd_put_client(rbd_dev->rbd_client); 4691dd5ac32dSIlya Dryomov rbd_spec_put(rbd_dev->spec); 4692dd5ac32dSIlya Dryomov kfree(rbd_dev->opts); 4693dd5ac32dSIlya Dryomov kfree(rbd_dev); 46941643dfa4SIlya Dryomov } 46951643dfa4SIlya Dryomov 46961643dfa4SIlya Dryomov static void rbd_dev_release(struct device *dev) 46971643dfa4SIlya Dryomov { 46981643dfa4SIlya Dryomov struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 46991643dfa4SIlya Dryomov bool need_put = !!rbd_dev->opts; 47001643dfa4SIlya Dryomov 47011643dfa4SIlya Dryomov if (need_put) { 47021643dfa4SIlya Dryomov destroy_workqueue(rbd_dev->task_wq); 47031643dfa4SIlya Dryomov ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id); 47041643dfa4SIlya Dryomov } 47051643dfa4SIlya Dryomov 47061643dfa4SIlya Dryomov rbd_dev_free(rbd_dev); 4707dd5ac32dSIlya Dryomov 4708dd5ac32dSIlya Dryomov /* 4709dd5ac32dSIlya Dryomov * This is racy, but way better than putting module outside of 4710dd5ac32dSIlya Dryomov * the release callback. The race window is pretty small, so 4711dd5ac32dSIlya Dryomov * doing something similar to dm (dm-builtin.c) is overkill. 
4712dd5ac32dSIlya Dryomov */ 4713dd5ac32dSIlya Dryomov if (need_put) 4714dd5ac32dSIlya Dryomov module_put(THIS_MODULE); 4715dd5ac32dSIlya Dryomov } 4716dd5ac32dSIlya Dryomov 47171643dfa4SIlya Dryomov static struct rbd_device *__rbd_dev_create(struct rbd_client *rbdc, 47181643dfa4SIlya Dryomov struct rbd_spec *spec) 4719c53d5893SAlex Elder { 4720c53d5893SAlex Elder struct rbd_device *rbd_dev; 4721c53d5893SAlex Elder 4722c53d5893SAlex Elder rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL); 4723c53d5893SAlex Elder if (!rbd_dev) 4724c53d5893SAlex Elder return NULL; 4725c53d5893SAlex Elder 4726c53d5893SAlex Elder spin_lock_init(&rbd_dev->lock); 4727c53d5893SAlex Elder INIT_LIST_HEAD(&rbd_dev->node); 4728c53d5893SAlex Elder init_rwsem(&rbd_dev->header_rwsem); 4729c53d5893SAlex Elder 47307e97332eSIlya Dryomov rbd_dev->header.data_pool_id = CEPH_NOPOOL; 4731c41d13a3SIlya Dryomov ceph_oid_init(&rbd_dev->header_oid); 4732431a02cdSIlya Dryomov rbd_dev->header_oloc.pool = spec->pool_id; 4733c41d13a3SIlya Dryomov 473499d16943SIlya Dryomov mutex_init(&rbd_dev->watch_mutex); 473599d16943SIlya Dryomov rbd_dev->watch_state = RBD_WATCH_STATE_UNREGISTERED; 473699d16943SIlya Dryomov INIT_DELAYED_WORK(&rbd_dev->watch_dwork, rbd_reregister_watch); 473799d16943SIlya Dryomov 4738ed95b21aSIlya Dryomov init_rwsem(&rbd_dev->lock_rwsem); 4739ed95b21aSIlya Dryomov rbd_dev->lock_state = RBD_LOCK_STATE_UNLOCKED; 4740ed95b21aSIlya Dryomov INIT_WORK(&rbd_dev->acquired_lock_work, rbd_notify_acquired_lock); 4741ed95b21aSIlya Dryomov INIT_WORK(&rbd_dev->released_lock_work, rbd_notify_released_lock); 4742ed95b21aSIlya Dryomov INIT_DELAYED_WORK(&rbd_dev->lock_dwork, rbd_acquire_lock); 4743ed95b21aSIlya Dryomov INIT_WORK(&rbd_dev->unlock_work, rbd_release_lock_work); 4744ed95b21aSIlya Dryomov init_waitqueue_head(&rbd_dev->lock_waitq); 4745ed95b21aSIlya Dryomov 4746dd5ac32dSIlya Dryomov rbd_dev->dev.bus = &rbd_bus_type; 4747dd5ac32dSIlya Dryomov rbd_dev->dev.type = &rbd_device_type; 4748dd5ac32dSIlya 
Dryomov rbd_dev->dev.parent = &rbd_root_dev; 4749dd5ac32dSIlya Dryomov device_initialize(&rbd_dev->dev); 4750dd5ac32dSIlya Dryomov 4751c53d5893SAlex Elder rbd_dev->rbd_client = rbdc; 4752d147543dSIlya Dryomov rbd_dev->spec = spec; 47530903e875SAlex Elder 47541643dfa4SIlya Dryomov return rbd_dev; 47551643dfa4SIlya Dryomov } 47561643dfa4SIlya Dryomov 4757dd5ac32dSIlya Dryomov /* 47581643dfa4SIlya Dryomov * Create a mapping rbd_dev. 4759dd5ac32dSIlya Dryomov */ 47601643dfa4SIlya Dryomov static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc, 47611643dfa4SIlya Dryomov struct rbd_spec *spec, 47621643dfa4SIlya Dryomov struct rbd_options *opts) 47631643dfa4SIlya Dryomov { 47641643dfa4SIlya Dryomov struct rbd_device *rbd_dev; 47651643dfa4SIlya Dryomov 47661643dfa4SIlya Dryomov rbd_dev = __rbd_dev_create(rbdc, spec); 47671643dfa4SIlya Dryomov if (!rbd_dev) 47681643dfa4SIlya Dryomov return NULL; 47691643dfa4SIlya Dryomov 47701643dfa4SIlya Dryomov rbd_dev->opts = opts; 47711643dfa4SIlya Dryomov 47721643dfa4SIlya Dryomov /* get an id and fill in device name */ 47731643dfa4SIlya Dryomov rbd_dev->dev_id = ida_simple_get(&rbd_dev_id_ida, 0, 47741643dfa4SIlya Dryomov minor_to_rbd_dev_id(1 << MINORBITS), 47751643dfa4SIlya Dryomov GFP_KERNEL); 47761643dfa4SIlya Dryomov if (rbd_dev->dev_id < 0) 47771643dfa4SIlya Dryomov goto fail_rbd_dev; 47781643dfa4SIlya Dryomov 47791643dfa4SIlya Dryomov sprintf(rbd_dev->name, RBD_DRV_NAME "%d", rbd_dev->dev_id); 47801643dfa4SIlya Dryomov rbd_dev->task_wq = alloc_ordered_workqueue("%s-tasks", WQ_MEM_RECLAIM, 47811643dfa4SIlya Dryomov rbd_dev->name); 47821643dfa4SIlya Dryomov if (!rbd_dev->task_wq) 47831643dfa4SIlya Dryomov goto fail_dev_id; 47841643dfa4SIlya Dryomov 47851643dfa4SIlya Dryomov /* we have a ref from do_rbd_add() */ 4786dd5ac32dSIlya Dryomov __module_get(THIS_MODULE); 4787dd5ac32dSIlya Dryomov 47881643dfa4SIlya Dryomov dout("%s rbd_dev %p dev_id %d\n", __func__, rbd_dev, rbd_dev->dev_id); 4789c53d5893SAlex Elder return 
rbd_dev; 47901643dfa4SIlya Dryomov 47911643dfa4SIlya Dryomov fail_dev_id: 47921643dfa4SIlya Dryomov ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id); 47931643dfa4SIlya Dryomov fail_rbd_dev: 47941643dfa4SIlya Dryomov rbd_dev_free(rbd_dev); 47951643dfa4SIlya Dryomov return NULL; 4796c53d5893SAlex Elder } 4797c53d5893SAlex Elder 4798c53d5893SAlex Elder static void rbd_dev_destroy(struct rbd_device *rbd_dev) 4799c53d5893SAlex Elder { 4800dd5ac32dSIlya Dryomov if (rbd_dev) 4801dd5ac32dSIlya Dryomov put_device(&rbd_dev->dev); 4802c53d5893SAlex Elder } 4803c53d5893SAlex Elder 4804dfc5606dSYehuda Sadeh /* 48059d475de5SAlex Elder * Get the size and object order for an image snapshot, or if 48069d475de5SAlex Elder * snap_id is CEPH_NOSNAP, gets this information for the base 48079d475de5SAlex Elder * image. 48089d475de5SAlex Elder */ 48099d475de5SAlex Elder static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id, 48109d475de5SAlex Elder u8 *order, u64 *snap_size) 48119d475de5SAlex Elder { 48129d475de5SAlex Elder __le64 snapid = cpu_to_le64(snap_id); 48139d475de5SAlex Elder int ret; 48149d475de5SAlex Elder struct { 48159d475de5SAlex Elder u8 order; 48169d475de5SAlex Elder __le64 size; 48179d475de5SAlex Elder } __attribute__ ((packed)) size_buf = { 0 }; 48189d475de5SAlex Elder 4819ecd4a68aSIlya Dryomov ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid, 4820ecd4a68aSIlya Dryomov &rbd_dev->header_oloc, "get_size", 48214157976bSAlex Elder &snapid, sizeof(snapid), 4822e2a58ee5SAlex Elder &size_buf, sizeof(size_buf)); 482336be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 48249d475de5SAlex Elder if (ret < 0) 48259d475de5SAlex Elder return ret; 482657385b51SAlex Elder if (ret < sizeof (size_buf)) 482757385b51SAlex Elder return -ERANGE; 48289d475de5SAlex Elder 4829c3545579SJosh Durgin if (order) { 48309d475de5SAlex Elder *order = size_buf.order; 4831c3545579SJosh Durgin dout(" order %u", (unsigned int)*order); 4832c3545579SJosh 
Durgin } 48339d475de5SAlex Elder *snap_size = le64_to_cpu(size_buf.size); 48349d475de5SAlex Elder 4835c3545579SJosh Durgin dout(" snap_id 0x%016llx snap_size = %llu\n", 4836c3545579SJosh Durgin (unsigned long long)snap_id, 48379d475de5SAlex Elder (unsigned long long)*snap_size); 48389d475de5SAlex Elder 48399d475de5SAlex Elder return 0; 48409d475de5SAlex Elder } 48419d475de5SAlex Elder 48429d475de5SAlex Elder static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev) 48439d475de5SAlex Elder { 48449d475de5SAlex Elder return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP, 48459d475de5SAlex Elder &rbd_dev->header.obj_order, 48469d475de5SAlex Elder &rbd_dev->header.image_size); 48479d475de5SAlex Elder } 48489d475de5SAlex Elder 48491e130199SAlex Elder static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev) 48501e130199SAlex Elder { 48511e130199SAlex Elder void *reply_buf; 48521e130199SAlex Elder int ret; 48531e130199SAlex Elder void *p; 48541e130199SAlex Elder 48551e130199SAlex Elder reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL); 48561e130199SAlex Elder if (!reply_buf) 48571e130199SAlex Elder return -ENOMEM; 48581e130199SAlex Elder 4859ecd4a68aSIlya Dryomov ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid, 4860ecd4a68aSIlya Dryomov &rbd_dev->header_oloc, "get_object_prefix", 4861ecd4a68aSIlya Dryomov NULL, 0, reply_buf, RBD_OBJ_PREFIX_LEN_MAX); 486236be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 48631e130199SAlex Elder if (ret < 0) 48641e130199SAlex Elder goto out; 48651e130199SAlex Elder 48661e130199SAlex Elder p = reply_buf; 48671e130199SAlex Elder rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p, 486857385b51SAlex Elder p + ret, NULL, GFP_NOIO); 486957385b51SAlex Elder ret = 0; 48701e130199SAlex Elder 48711e130199SAlex Elder if (IS_ERR(rbd_dev->header.object_prefix)) { 48721e130199SAlex Elder ret = PTR_ERR(rbd_dev->header.object_prefix); 48731e130199SAlex Elder rbd_dev->header.object_prefix = NULL; 
48741e130199SAlex Elder } else { 48751e130199SAlex Elder dout(" object_prefix = %s\n", rbd_dev->header.object_prefix); 48761e130199SAlex Elder } 48771e130199SAlex Elder out: 48781e130199SAlex Elder kfree(reply_buf); 48791e130199SAlex Elder 48801e130199SAlex Elder return ret; 48811e130199SAlex Elder } 48821e130199SAlex Elder 4883b1b5402aSAlex Elder static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id, 4884b1b5402aSAlex Elder u64 *snap_features) 4885b1b5402aSAlex Elder { 4886b1b5402aSAlex Elder __le64 snapid = cpu_to_le64(snap_id); 4887b1b5402aSAlex Elder struct { 4888b1b5402aSAlex Elder __le64 features; 4889b1b5402aSAlex Elder __le64 incompat; 48904157976bSAlex Elder } __attribute__ ((packed)) features_buf = { 0 }; 4891d3767f0fSIlya Dryomov u64 unsup; 4892b1b5402aSAlex Elder int ret; 4893b1b5402aSAlex Elder 4894ecd4a68aSIlya Dryomov ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid, 4895ecd4a68aSIlya Dryomov &rbd_dev->header_oloc, "get_features", 48964157976bSAlex Elder &snapid, sizeof(snapid), 4897e2a58ee5SAlex Elder &features_buf, sizeof(features_buf)); 489836be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 4899b1b5402aSAlex Elder if (ret < 0) 4900b1b5402aSAlex Elder return ret; 490157385b51SAlex Elder if (ret < sizeof (features_buf)) 490257385b51SAlex Elder return -ERANGE; 4903d889140cSAlex Elder 4904d3767f0fSIlya Dryomov unsup = le64_to_cpu(features_buf.incompat) & ~RBD_FEATURES_SUPPORTED; 4905d3767f0fSIlya Dryomov if (unsup) { 4906d3767f0fSIlya Dryomov rbd_warn(rbd_dev, "image uses unsupported features: 0x%llx", 4907d3767f0fSIlya Dryomov unsup); 4908b8f5c6edSAlex Elder return -ENXIO; 4909d3767f0fSIlya Dryomov } 4910d889140cSAlex Elder 4911b1b5402aSAlex Elder *snap_features = le64_to_cpu(features_buf.features); 4912b1b5402aSAlex Elder 4913b1b5402aSAlex Elder dout(" snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n", 4914b1b5402aSAlex Elder (unsigned long long)snap_id, 4915b1b5402aSAlex 
Elder (unsigned long long)*snap_features, 4916b1b5402aSAlex Elder (unsigned long long)le64_to_cpu(features_buf.incompat)); 4917b1b5402aSAlex Elder 4918b1b5402aSAlex Elder return 0; 4919b1b5402aSAlex Elder } 4920b1b5402aSAlex Elder 4921b1b5402aSAlex Elder static int rbd_dev_v2_features(struct rbd_device *rbd_dev) 4922b1b5402aSAlex Elder { 4923b1b5402aSAlex Elder return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP, 4924b1b5402aSAlex Elder &rbd_dev->header.features); 4925b1b5402aSAlex Elder } 4926b1b5402aSAlex Elder 492786b00e0dSAlex Elder static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev) 492886b00e0dSAlex Elder { 492986b00e0dSAlex Elder struct rbd_spec *parent_spec; 493086b00e0dSAlex Elder size_t size; 493186b00e0dSAlex Elder void *reply_buf = NULL; 493286b00e0dSAlex Elder __le64 snapid; 493386b00e0dSAlex Elder void *p; 493486b00e0dSAlex Elder void *end; 4935642a2537SAlex Elder u64 pool_id; 493686b00e0dSAlex Elder char *image_id; 49373b5cf2a2SAlex Elder u64 snap_id; 493886b00e0dSAlex Elder u64 overlap; 493986b00e0dSAlex Elder int ret; 494086b00e0dSAlex Elder 494186b00e0dSAlex Elder parent_spec = rbd_spec_alloc(); 494286b00e0dSAlex Elder if (!parent_spec) 494386b00e0dSAlex Elder return -ENOMEM; 494486b00e0dSAlex Elder 494586b00e0dSAlex Elder size = sizeof (__le64) + /* pool_id */ 494686b00e0dSAlex Elder sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX + /* image_id */ 494786b00e0dSAlex Elder sizeof (__le64) + /* snap_id */ 494886b00e0dSAlex Elder sizeof (__le64); /* overlap */ 494986b00e0dSAlex Elder reply_buf = kmalloc(size, GFP_KERNEL); 495086b00e0dSAlex Elder if (!reply_buf) { 495186b00e0dSAlex Elder ret = -ENOMEM; 495286b00e0dSAlex Elder goto out_err; 495386b00e0dSAlex Elder } 495486b00e0dSAlex Elder 49554d9b67cdSIlya Dryomov snapid = cpu_to_le64(rbd_dev->spec->snap_id); 4956ecd4a68aSIlya Dryomov ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid, 4957ecd4a68aSIlya Dryomov &rbd_dev->header_oloc, "get_parent", 4958ecd4a68aSIlya Dryomov &snapid, 
sizeof(snapid), reply_buf, size); 495936be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 496086b00e0dSAlex Elder if (ret < 0) 496186b00e0dSAlex Elder goto out_err; 496286b00e0dSAlex Elder 496386b00e0dSAlex Elder p = reply_buf; 496457385b51SAlex Elder end = reply_buf + ret; 496557385b51SAlex Elder ret = -ERANGE; 4966642a2537SAlex Elder ceph_decode_64_safe(&p, end, pool_id, out_err); 4967392a9dadSAlex Elder if (pool_id == CEPH_NOPOOL) { 4968392a9dadSAlex Elder /* 4969392a9dadSAlex Elder * Either the parent never existed, or we have 4970392a9dadSAlex Elder * record of it but the image got flattened so it no 4971392a9dadSAlex Elder * longer has a parent. When the parent of a 4972392a9dadSAlex Elder * layered image disappears we immediately set the 4973392a9dadSAlex Elder * overlap to 0. The effect of this is that all new 4974392a9dadSAlex Elder * requests will be treated as if the image had no 4975392a9dadSAlex Elder * parent. 4976392a9dadSAlex Elder */ 4977392a9dadSAlex Elder if (rbd_dev->parent_overlap) { 4978392a9dadSAlex Elder rbd_dev->parent_overlap = 0; 4979392a9dadSAlex Elder rbd_dev_parent_put(rbd_dev); 4980392a9dadSAlex Elder pr_info("%s: clone image has been flattened\n", 4981392a9dadSAlex Elder rbd_dev->disk->disk_name); 4982392a9dadSAlex Elder } 4983392a9dadSAlex Elder 498486b00e0dSAlex Elder goto out; /* No parent? No problem. 
*/ 4985392a9dadSAlex Elder } 498686b00e0dSAlex Elder 49870903e875SAlex Elder /* The ceph file layout needs to fit pool id in 32 bits */ 49880903e875SAlex Elder 49890903e875SAlex Elder ret = -EIO; 4990642a2537SAlex Elder if (pool_id > (u64)U32_MAX) { 49919584d508SIlya Dryomov rbd_warn(NULL, "parent pool id too large (%llu > %u)", 4992642a2537SAlex Elder (unsigned long long)pool_id, U32_MAX); 499357385b51SAlex Elder goto out_err; 4994c0cd10dbSAlex Elder } 49950903e875SAlex Elder 4996979ed480SAlex Elder image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL); 499786b00e0dSAlex Elder if (IS_ERR(image_id)) { 499886b00e0dSAlex Elder ret = PTR_ERR(image_id); 499986b00e0dSAlex Elder goto out_err; 500086b00e0dSAlex Elder } 50013b5cf2a2SAlex Elder ceph_decode_64_safe(&p, end, snap_id, out_err); 500286b00e0dSAlex Elder ceph_decode_64_safe(&p, end, overlap, out_err); 500386b00e0dSAlex Elder 50043b5cf2a2SAlex Elder /* 50053b5cf2a2SAlex Elder * The parent won't change (except when the clone is 50063b5cf2a2SAlex Elder * flattened, already handled that). So we only need to 50073b5cf2a2SAlex Elder * record the parent spec we have not already done so. 50083b5cf2a2SAlex Elder */ 50093b5cf2a2SAlex Elder if (!rbd_dev->parent_spec) { 50103b5cf2a2SAlex Elder parent_spec->pool_id = pool_id; 50113b5cf2a2SAlex Elder parent_spec->image_id = image_id; 50123b5cf2a2SAlex Elder parent_spec->snap_id = snap_id; 501386b00e0dSAlex Elder rbd_dev->parent_spec = parent_spec; 501486b00e0dSAlex Elder parent_spec = NULL; /* rbd_dev now owns this */ 5015fbba11b3SIlya Dryomov } else { 5016fbba11b3SIlya Dryomov kfree(image_id); 50173b5cf2a2SAlex Elder } 50183b5cf2a2SAlex Elder 50193b5cf2a2SAlex Elder /* 5020cf32bd9cSIlya Dryomov * We always update the parent overlap. If it's zero we issue 5021cf32bd9cSIlya Dryomov * a warning, as we will proceed as if there was no parent. 
50223b5cf2a2SAlex Elder */ 50233b5cf2a2SAlex Elder if (!overlap) { 50243b5cf2a2SAlex Elder if (parent_spec) { 5025cf32bd9cSIlya Dryomov /* refresh, careful to warn just once */ 5026cf32bd9cSIlya Dryomov if (rbd_dev->parent_overlap) 5027cf32bd9cSIlya Dryomov rbd_warn(rbd_dev, 5028cf32bd9cSIlya Dryomov "clone now standalone (overlap became 0)"); 502970cf49cfSAlex Elder } else { 5030cf32bd9cSIlya Dryomov /* initial probe */ 5031cf32bd9cSIlya Dryomov rbd_warn(rbd_dev, "clone is standalone (overlap 0)"); 50323b5cf2a2SAlex Elder } 503370cf49cfSAlex Elder } 5034cf32bd9cSIlya Dryomov rbd_dev->parent_overlap = overlap; 5035cf32bd9cSIlya Dryomov 503686b00e0dSAlex Elder out: 503786b00e0dSAlex Elder ret = 0; 503886b00e0dSAlex Elder out_err: 503986b00e0dSAlex Elder kfree(reply_buf); 504086b00e0dSAlex Elder rbd_spec_put(parent_spec); 504186b00e0dSAlex Elder 504286b00e0dSAlex Elder return ret; 504386b00e0dSAlex Elder } 504486b00e0dSAlex Elder 5045cc070d59SAlex Elder static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev) 5046cc070d59SAlex Elder { 5047cc070d59SAlex Elder struct { 5048cc070d59SAlex Elder __le64 stripe_unit; 5049cc070d59SAlex Elder __le64 stripe_count; 5050cc070d59SAlex Elder } __attribute__ ((packed)) striping_info_buf = { 0 }; 5051cc070d59SAlex Elder size_t size = sizeof (striping_info_buf); 5052cc070d59SAlex Elder void *p; 5053cc070d59SAlex Elder u64 obj_size; 5054cc070d59SAlex Elder u64 stripe_unit; 5055cc070d59SAlex Elder u64 stripe_count; 5056cc070d59SAlex Elder int ret; 5057cc070d59SAlex Elder 5058ecd4a68aSIlya Dryomov ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid, 5059ecd4a68aSIlya Dryomov &rbd_dev->header_oloc, "get_stripe_unit_count", 5060ecd4a68aSIlya Dryomov NULL, 0, &striping_info_buf, size); 5061cc070d59SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 5062cc070d59SAlex Elder if (ret < 0) 5063cc070d59SAlex Elder return ret; 5064cc070d59SAlex Elder if (ret < size) 5065cc070d59SAlex Elder return -ERANGE; 
5066cc070d59SAlex Elder 5067cc070d59SAlex Elder /* 5068cc070d59SAlex Elder * We don't actually support the "fancy striping" feature 5069cc070d59SAlex Elder * (STRIPINGV2) yet, but if the striping sizes are the 5070cc070d59SAlex Elder * defaults the behavior is the same as before. So find 5071cc070d59SAlex Elder * out, and only fail if the image has non-default values. 5072cc070d59SAlex Elder */ 5073cc070d59SAlex Elder ret = -EINVAL; 50745bc3fb17SIlya Dryomov obj_size = rbd_obj_bytes(&rbd_dev->header); 5075cc070d59SAlex Elder p = &striping_info_buf; 5076cc070d59SAlex Elder stripe_unit = ceph_decode_64(&p); 5077cc070d59SAlex Elder if (stripe_unit != obj_size) { 5078cc070d59SAlex Elder rbd_warn(rbd_dev, "unsupported stripe unit " 5079cc070d59SAlex Elder "(got %llu want %llu)", 5080cc070d59SAlex Elder stripe_unit, obj_size); 5081cc070d59SAlex Elder return -EINVAL; 5082cc070d59SAlex Elder } 5083cc070d59SAlex Elder stripe_count = ceph_decode_64(&p); 5084cc070d59SAlex Elder if (stripe_count != 1) { 5085cc070d59SAlex Elder rbd_warn(rbd_dev, "unsupported stripe count " 5086cc070d59SAlex Elder "(got %llu want 1)", stripe_count); 5087cc070d59SAlex Elder return -EINVAL; 5088cc070d59SAlex Elder } 5089500d0c0fSAlex Elder rbd_dev->header.stripe_unit = stripe_unit; 5090500d0c0fSAlex Elder rbd_dev->header.stripe_count = stripe_count; 5091cc070d59SAlex Elder 5092cc070d59SAlex Elder return 0; 5093cc070d59SAlex Elder } 5094cc070d59SAlex Elder 50957e97332eSIlya Dryomov static int rbd_dev_v2_data_pool(struct rbd_device *rbd_dev) 50967e97332eSIlya Dryomov { 50977e97332eSIlya Dryomov __le64 data_pool_id; 50987e97332eSIlya Dryomov int ret; 50997e97332eSIlya Dryomov 51007e97332eSIlya Dryomov ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid, 51017e97332eSIlya Dryomov &rbd_dev->header_oloc, "get_data_pool", 51027e97332eSIlya Dryomov NULL, 0, &data_pool_id, sizeof(data_pool_id)); 51037e97332eSIlya Dryomov if (ret < 0) 51047e97332eSIlya Dryomov return ret; 51057e97332eSIlya Dryomov if 
(ret < sizeof(data_pool_id)) 51067e97332eSIlya Dryomov return -EBADMSG; 51077e97332eSIlya Dryomov 51087e97332eSIlya Dryomov rbd_dev->header.data_pool_id = le64_to_cpu(data_pool_id); 51097e97332eSIlya Dryomov WARN_ON(rbd_dev->header.data_pool_id == CEPH_NOPOOL); 51107e97332eSIlya Dryomov return 0; 51117e97332eSIlya Dryomov } 51127e97332eSIlya Dryomov 51139e15b77dSAlex Elder static char *rbd_dev_image_name(struct rbd_device *rbd_dev) 51149e15b77dSAlex Elder { 5115ecd4a68aSIlya Dryomov CEPH_DEFINE_OID_ONSTACK(oid); 51169e15b77dSAlex Elder size_t image_id_size; 51179e15b77dSAlex Elder char *image_id; 51189e15b77dSAlex Elder void *p; 51199e15b77dSAlex Elder void *end; 51209e15b77dSAlex Elder size_t size; 51219e15b77dSAlex Elder void *reply_buf = NULL; 51229e15b77dSAlex Elder size_t len = 0; 51239e15b77dSAlex Elder char *image_name = NULL; 51249e15b77dSAlex Elder int ret; 51259e15b77dSAlex Elder 51269e15b77dSAlex Elder rbd_assert(!rbd_dev->spec->image_name); 51279e15b77dSAlex Elder 512869e7a02fSAlex Elder len = strlen(rbd_dev->spec->image_id); 512969e7a02fSAlex Elder image_id_size = sizeof (__le32) + len; 51309e15b77dSAlex Elder image_id = kmalloc(image_id_size, GFP_KERNEL); 51319e15b77dSAlex Elder if (!image_id) 51329e15b77dSAlex Elder return NULL; 51339e15b77dSAlex Elder 51349e15b77dSAlex Elder p = image_id; 51354157976bSAlex Elder end = image_id + image_id_size; 513669e7a02fSAlex Elder ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32)len); 51379e15b77dSAlex Elder 51389e15b77dSAlex Elder size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX; 51399e15b77dSAlex Elder reply_buf = kmalloc(size, GFP_KERNEL); 51409e15b77dSAlex Elder if (!reply_buf) 51419e15b77dSAlex Elder goto out; 51429e15b77dSAlex Elder 5143ecd4a68aSIlya Dryomov ceph_oid_printf(&oid, "%s", RBD_DIRECTORY); 5144ecd4a68aSIlya Dryomov ret = rbd_obj_method_sync(rbd_dev, &oid, &rbd_dev->header_oloc, 5145ecd4a68aSIlya Dryomov "dir_get_name", image_id, image_id_size, 5146e2a58ee5SAlex Elder reply_buf, size); 
51479e15b77dSAlex Elder if (ret < 0) 51489e15b77dSAlex Elder goto out; 51499e15b77dSAlex Elder p = reply_buf; 5150f40eb349SAlex Elder end = reply_buf + ret; 5151f40eb349SAlex Elder 51529e15b77dSAlex Elder image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL); 51539e15b77dSAlex Elder if (IS_ERR(image_name)) 51549e15b77dSAlex Elder image_name = NULL; 51559e15b77dSAlex Elder else 51569e15b77dSAlex Elder dout("%s: name is %s len is %zd\n", __func__, image_name, len); 51579e15b77dSAlex Elder out: 51589e15b77dSAlex Elder kfree(reply_buf); 51599e15b77dSAlex Elder kfree(image_id); 51609e15b77dSAlex Elder 51619e15b77dSAlex Elder return image_name; 51629e15b77dSAlex Elder } 51639e15b77dSAlex Elder 51642ad3d716SAlex Elder static u64 rbd_v1_snap_id_by_name(struct rbd_device *rbd_dev, const char *name) 51652ad3d716SAlex Elder { 51662ad3d716SAlex Elder struct ceph_snap_context *snapc = rbd_dev->header.snapc; 51672ad3d716SAlex Elder const char *snap_name; 51682ad3d716SAlex Elder u32 which = 0; 51692ad3d716SAlex Elder 51702ad3d716SAlex Elder /* Skip over names until we find the one we are looking for */ 51712ad3d716SAlex Elder 51722ad3d716SAlex Elder snap_name = rbd_dev->header.snap_names; 51732ad3d716SAlex Elder while (which < snapc->num_snaps) { 51742ad3d716SAlex Elder if (!strcmp(name, snap_name)) 51752ad3d716SAlex Elder return snapc->snaps[which]; 51762ad3d716SAlex Elder snap_name += strlen(snap_name) + 1; 51772ad3d716SAlex Elder which++; 51782ad3d716SAlex Elder } 51792ad3d716SAlex Elder return CEPH_NOSNAP; 51802ad3d716SAlex Elder } 51812ad3d716SAlex Elder 51822ad3d716SAlex Elder static u64 rbd_v2_snap_id_by_name(struct rbd_device *rbd_dev, const char *name) 51832ad3d716SAlex Elder { 51842ad3d716SAlex Elder struct ceph_snap_context *snapc = rbd_dev->header.snapc; 51852ad3d716SAlex Elder u32 which; 51862ad3d716SAlex Elder bool found = false; 51872ad3d716SAlex Elder u64 snap_id; 51882ad3d716SAlex Elder 51892ad3d716SAlex Elder for (which = 0; !found && which < 
snapc->num_snaps; which++) { 51902ad3d716SAlex Elder const char *snap_name; 51912ad3d716SAlex Elder 51922ad3d716SAlex Elder snap_id = snapc->snaps[which]; 51932ad3d716SAlex Elder snap_name = rbd_dev_v2_snap_name(rbd_dev, snap_id); 5194efadc98aSJosh Durgin if (IS_ERR(snap_name)) { 5195efadc98aSJosh Durgin /* ignore no-longer existing snapshots */ 5196efadc98aSJosh Durgin if (PTR_ERR(snap_name) == -ENOENT) 5197efadc98aSJosh Durgin continue; 5198efadc98aSJosh Durgin else 51992ad3d716SAlex Elder break; 5200efadc98aSJosh Durgin } 52012ad3d716SAlex Elder found = !strcmp(name, snap_name); 52022ad3d716SAlex Elder kfree(snap_name); 52032ad3d716SAlex Elder } 52042ad3d716SAlex Elder return found ? snap_id : CEPH_NOSNAP; 52052ad3d716SAlex Elder } 52062ad3d716SAlex Elder 52072ad3d716SAlex Elder /* 52082ad3d716SAlex Elder * Assumes name is never RBD_SNAP_HEAD_NAME; returns CEPH_NOSNAP if 52092ad3d716SAlex Elder * no snapshot by that name is found, or if an error occurs. 52102ad3d716SAlex Elder */ 52112ad3d716SAlex Elder static u64 rbd_snap_id_by_name(struct rbd_device *rbd_dev, const char *name) 52122ad3d716SAlex Elder { 52132ad3d716SAlex Elder if (rbd_dev->image_format == 1) 52142ad3d716SAlex Elder return rbd_v1_snap_id_by_name(rbd_dev, name); 52152ad3d716SAlex Elder 52162ad3d716SAlex Elder return rbd_v2_snap_id_by_name(rbd_dev, name); 52172ad3d716SAlex Elder } 52182ad3d716SAlex Elder 52199e15b77dSAlex Elder /* 522004077599SIlya Dryomov * An image being mapped will have everything but the snap id. 
52219e15b77dSAlex Elder */ 522204077599SIlya Dryomov static int rbd_spec_fill_snap_id(struct rbd_device *rbd_dev) 522304077599SIlya Dryomov { 522404077599SIlya Dryomov struct rbd_spec *spec = rbd_dev->spec; 522504077599SIlya Dryomov 522604077599SIlya Dryomov rbd_assert(spec->pool_id != CEPH_NOPOOL && spec->pool_name); 522704077599SIlya Dryomov rbd_assert(spec->image_id && spec->image_name); 522804077599SIlya Dryomov rbd_assert(spec->snap_name); 522904077599SIlya Dryomov 523004077599SIlya Dryomov if (strcmp(spec->snap_name, RBD_SNAP_HEAD_NAME)) { 523104077599SIlya Dryomov u64 snap_id; 523204077599SIlya Dryomov 523304077599SIlya Dryomov snap_id = rbd_snap_id_by_name(rbd_dev, spec->snap_name); 523404077599SIlya Dryomov if (snap_id == CEPH_NOSNAP) 523504077599SIlya Dryomov return -ENOENT; 523604077599SIlya Dryomov 523704077599SIlya Dryomov spec->snap_id = snap_id; 523804077599SIlya Dryomov } else { 523904077599SIlya Dryomov spec->snap_id = CEPH_NOSNAP; 524004077599SIlya Dryomov } 524104077599SIlya Dryomov 524204077599SIlya Dryomov return 0; 524304077599SIlya Dryomov } 524404077599SIlya Dryomov 524504077599SIlya Dryomov /* 524604077599SIlya Dryomov * A parent image will have all ids but none of the names. 524704077599SIlya Dryomov * 524804077599SIlya Dryomov * All names in an rbd spec are dynamically allocated. It's OK if we 524904077599SIlya Dryomov * can't figure out the name for an image id. 
525004077599SIlya Dryomov */ 525104077599SIlya Dryomov static int rbd_spec_fill_names(struct rbd_device *rbd_dev) 52529e15b77dSAlex Elder { 52532e9f7f1cSAlex Elder struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 52542e9f7f1cSAlex Elder struct rbd_spec *spec = rbd_dev->spec; 52552e9f7f1cSAlex Elder const char *pool_name; 52562e9f7f1cSAlex Elder const char *image_name; 52572e9f7f1cSAlex Elder const char *snap_name; 52589e15b77dSAlex Elder int ret; 52599e15b77dSAlex Elder 526004077599SIlya Dryomov rbd_assert(spec->pool_id != CEPH_NOPOOL); 526104077599SIlya Dryomov rbd_assert(spec->image_id); 526204077599SIlya Dryomov rbd_assert(spec->snap_id != CEPH_NOSNAP); 52639e15b77dSAlex Elder 52642e9f7f1cSAlex Elder /* Get the pool name; we have to make our own copy of this */ 52659e15b77dSAlex Elder 52662e9f7f1cSAlex Elder pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, spec->pool_id); 52672e9f7f1cSAlex Elder if (!pool_name) { 52682e9f7f1cSAlex Elder rbd_warn(rbd_dev, "no pool with id %llu", spec->pool_id); 5269935dc89fSAlex Elder return -EIO; 5270935dc89fSAlex Elder } 52712e9f7f1cSAlex Elder pool_name = kstrdup(pool_name, GFP_KERNEL); 52722e9f7f1cSAlex Elder if (!pool_name) 52739e15b77dSAlex Elder return -ENOMEM; 52749e15b77dSAlex Elder 52759e15b77dSAlex Elder /* Fetch the image name; tolerate failure here */ 52769e15b77dSAlex Elder 52772e9f7f1cSAlex Elder image_name = rbd_dev_image_name(rbd_dev); 52782e9f7f1cSAlex Elder if (!image_name) 527906ecc6cbSAlex Elder rbd_warn(rbd_dev, "unable to get image name"); 52809e15b77dSAlex Elder 528104077599SIlya Dryomov /* Fetch the snapshot name */ 52829e15b77dSAlex Elder 52832e9f7f1cSAlex Elder snap_name = rbd_snap_name(rbd_dev, spec->snap_id); 5284da6a6b63SJosh Durgin if (IS_ERR(snap_name)) { 5285da6a6b63SJosh Durgin ret = PTR_ERR(snap_name); 52869e15b77dSAlex Elder goto out_err; 52872e9f7f1cSAlex Elder } 52882e9f7f1cSAlex Elder 52892e9f7f1cSAlex Elder spec->pool_name = pool_name; 52902e9f7f1cSAlex Elder 
spec->image_name = image_name; 52912e9f7f1cSAlex Elder spec->snap_name = snap_name; 52929e15b77dSAlex Elder 52939e15b77dSAlex Elder return 0; 529404077599SIlya Dryomov 52959e15b77dSAlex Elder out_err: 52962e9f7f1cSAlex Elder kfree(image_name); 52972e9f7f1cSAlex Elder kfree(pool_name); 52989e15b77dSAlex Elder return ret; 52999e15b77dSAlex Elder } 53009e15b77dSAlex Elder 5301cc4a38bdSAlex Elder static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev) 530235d489f9SAlex Elder { 530335d489f9SAlex Elder size_t size; 530435d489f9SAlex Elder int ret; 530535d489f9SAlex Elder void *reply_buf; 530635d489f9SAlex Elder void *p; 530735d489f9SAlex Elder void *end; 530835d489f9SAlex Elder u64 seq; 530935d489f9SAlex Elder u32 snap_count; 531035d489f9SAlex Elder struct ceph_snap_context *snapc; 531135d489f9SAlex Elder u32 i; 531235d489f9SAlex Elder 531335d489f9SAlex Elder /* 531435d489f9SAlex Elder * We'll need room for the seq value (maximum snapshot id), 531535d489f9SAlex Elder * snapshot count, and array of that many snapshot ids. 531635d489f9SAlex Elder * For now we have a fixed upper limit on the number we're 531735d489f9SAlex Elder * prepared to receive. 
531835d489f9SAlex Elder */ 531935d489f9SAlex Elder size = sizeof (__le64) + sizeof (__le32) + 532035d489f9SAlex Elder RBD_MAX_SNAP_COUNT * sizeof (__le64); 532135d489f9SAlex Elder reply_buf = kzalloc(size, GFP_KERNEL); 532235d489f9SAlex Elder if (!reply_buf) 532335d489f9SAlex Elder return -ENOMEM; 532435d489f9SAlex Elder 5325ecd4a68aSIlya Dryomov ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid, 5326ecd4a68aSIlya Dryomov &rbd_dev->header_oloc, "get_snapcontext", 5327ecd4a68aSIlya Dryomov NULL, 0, reply_buf, size); 532836be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 532935d489f9SAlex Elder if (ret < 0) 533035d489f9SAlex Elder goto out; 533135d489f9SAlex Elder 533235d489f9SAlex Elder p = reply_buf; 533357385b51SAlex Elder end = reply_buf + ret; 533457385b51SAlex Elder ret = -ERANGE; 533535d489f9SAlex Elder ceph_decode_64_safe(&p, end, seq, out); 533635d489f9SAlex Elder ceph_decode_32_safe(&p, end, snap_count, out); 533735d489f9SAlex Elder 533835d489f9SAlex Elder /* 533935d489f9SAlex Elder * Make sure the reported number of snapshot ids wouldn't go 534035d489f9SAlex Elder * beyond the end of our buffer. But before checking that, 534135d489f9SAlex Elder * make sure the computed size of the snapshot context we 534235d489f9SAlex Elder * allocate is representable in a size_t. 
534335d489f9SAlex Elder */ 534435d489f9SAlex Elder if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context)) 534535d489f9SAlex Elder / sizeof (u64)) { 534635d489f9SAlex Elder ret = -EINVAL; 534735d489f9SAlex Elder goto out; 534835d489f9SAlex Elder } 534935d489f9SAlex Elder if (!ceph_has_room(&p, end, snap_count * sizeof (__le64))) 535035d489f9SAlex Elder goto out; 5351468521c1SAlex Elder ret = 0; 535235d489f9SAlex Elder 5353812164f8SAlex Elder snapc = ceph_create_snap_context(snap_count, GFP_KERNEL); 535435d489f9SAlex Elder if (!snapc) { 535535d489f9SAlex Elder ret = -ENOMEM; 535635d489f9SAlex Elder goto out; 535735d489f9SAlex Elder } 535835d489f9SAlex Elder snapc->seq = seq; 535935d489f9SAlex Elder for (i = 0; i < snap_count; i++) 536035d489f9SAlex Elder snapc->snaps[i] = ceph_decode_64(&p); 536135d489f9SAlex Elder 536249ece554SAlex Elder ceph_put_snap_context(rbd_dev->header.snapc); 536335d489f9SAlex Elder rbd_dev->header.snapc = snapc; 536435d489f9SAlex Elder 536535d489f9SAlex Elder dout(" snap context seq = %llu, snap_count = %u\n", 536635d489f9SAlex Elder (unsigned long long)seq, (unsigned int)snap_count); 536735d489f9SAlex Elder out: 536835d489f9SAlex Elder kfree(reply_buf); 536935d489f9SAlex Elder 537057385b51SAlex Elder return ret; 537135d489f9SAlex Elder } 537235d489f9SAlex Elder 537354cac61fSAlex Elder static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, 537454cac61fSAlex Elder u64 snap_id) 5375b8b1e2dbSAlex Elder { 5376b8b1e2dbSAlex Elder size_t size; 5377b8b1e2dbSAlex Elder void *reply_buf; 537854cac61fSAlex Elder __le64 snapid; 5379b8b1e2dbSAlex Elder int ret; 5380b8b1e2dbSAlex Elder void *p; 5381b8b1e2dbSAlex Elder void *end; 5382b8b1e2dbSAlex Elder char *snap_name; 5383b8b1e2dbSAlex Elder 5384b8b1e2dbSAlex Elder size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN; 5385b8b1e2dbSAlex Elder reply_buf = kmalloc(size, GFP_KERNEL); 5386b8b1e2dbSAlex Elder if (!reply_buf) 5387b8b1e2dbSAlex Elder return ERR_PTR(-ENOMEM); 5388b8b1e2dbSAlex 
Elder 538954cac61fSAlex Elder snapid = cpu_to_le64(snap_id); 5390ecd4a68aSIlya Dryomov ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid, 5391ecd4a68aSIlya Dryomov &rbd_dev->header_oloc, "get_snapshot_name", 5392ecd4a68aSIlya Dryomov &snapid, sizeof(snapid), reply_buf, size); 539336be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 5394f40eb349SAlex Elder if (ret < 0) { 5395f40eb349SAlex Elder snap_name = ERR_PTR(ret); 5396b8b1e2dbSAlex Elder goto out; 5397f40eb349SAlex Elder } 5398b8b1e2dbSAlex Elder 5399b8b1e2dbSAlex Elder p = reply_buf; 5400f40eb349SAlex Elder end = reply_buf + ret; 5401e5c35534SAlex Elder snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL); 5402f40eb349SAlex Elder if (IS_ERR(snap_name)) 5403b8b1e2dbSAlex Elder goto out; 5404f40eb349SAlex Elder 5405b8b1e2dbSAlex Elder dout(" snap_id 0x%016llx snap_name = %s\n", 540654cac61fSAlex Elder (unsigned long long)snap_id, snap_name); 5407b8b1e2dbSAlex Elder out: 5408b8b1e2dbSAlex Elder kfree(reply_buf); 5409b8b1e2dbSAlex Elder 5410f40eb349SAlex Elder return snap_name; 5411b8b1e2dbSAlex Elder } 5412b8b1e2dbSAlex Elder 54132df3fac7SAlex Elder static int rbd_dev_v2_header_info(struct rbd_device *rbd_dev) 5414117973fbSAlex Elder { 54152df3fac7SAlex Elder bool first_time = rbd_dev->header.object_prefix == NULL; 5416117973fbSAlex Elder int ret; 5417117973fbSAlex Elder 54181617e40cSJosh Durgin ret = rbd_dev_v2_image_size(rbd_dev); 54191617e40cSJosh Durgin if (ret) 5420cfbf6377SAlex Elder return ret; 54211617e40cSJosh Durgin 54222df3fac7SAlex Elder if (first_time) { 54232df3fac7SAlex Elder ret = rbd_dev_v2_header_onetime(rbd_dev); 54242df3fac7SAlex Elder if (ret) 5425cfbf6377SAlex Elder return ret; 54262df3fac7SAlex Elder } 54272df3fac7SAlex Elder 5428cc4a38bdSAlex Elder ret = rbd_dev_v2_snap_context(rbd_dev); 5429d194cd1dSIlya Dryomov if (ret && first_time) { 5430d194cd1dSIlya Dryomov kfree(rbd_dev->header.object_prefix); 5431d194cd1dSIlya Dryomov 
rbd_dev->header.object_prefix = NULL; 5432d194cd1dSIlya Dryomov } 5433117973fbSAlex Elder 5434117973fbSAlex Elder return ret; 5435117973fbSAlex Elder } 5436117973fbSAlex Elder 5437a720ae09SIlya Dryomov static int rbd_dev_header_info(struct rbd_device *rbd_dev) 5438a720ae09SIlya Dryomov { 5439a720ae09SIlya Dryomov rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); 5440a720ae09SIlya Dryomov 5441a720ae09SIlya Dryomov if (rbd_dev->image_format == 1) 5442a720ae09SIlya Dryomov return rbd_dev_v1_header_info(rbd_dev); 5443a720ae09SIlya Dryomov 5444a720ae09SIlya Dryomov return rbd_dev_v2_header_info(rbd_dev); 5445a720ae09SIlya Dryomov } 5446a720ae09SIlya Dryomov 54471ddbe94eSAlex Elder /* 5448e28fff26SAlex Elder * Skips over white space at *buf, and updates *buf to point to the 5449e28fff26SAlex Elder * first found non-space character (if any). Returns the length of 5450593a9e7bSAlex Elder * the token (string of non-white space characters) found. Note 5451593a9e7bSAlex Elder * that *buf must be terminated with '\0'. 5452e28fff26SAlex Elder */ 5453e28fff26SAlex Elder static inline size_t next_token(const char **buf) 5454e28fff26SAlex Elder { 5455e28fff26SAlex Elder /* 5456e28fff26SAlex Elder * These are the characters that produce nonzero for 5457e28fff26SAlex Elder * isspace() in the "C" and "POSIX" locales. 5458e28fff26SAlex Elder */ 5459e28fff26SAlex Elder const char *spaces = " \f\n\r\t\v"; 5460e28fff26SAlex Elder 5461e28fff26SAlex Elder *buf += strspn(*buf, spaces); /* Find start of token */ 5462e28fff26SAlex Elder 5463e28fff26SAlex Elder return strcspn(*buf, spaces); /* Return token length */ 5464e28fff26SAlex Elder } 5465e28fff26SAlex Elder 5466e28fff26SAlex Elder /* 5467ea3352f4SAlex Elder * Finds the next token in *buf, dynamically allocates a buffer big 5468ea3352f4SAlex Elder * enough to hold a copy of it, and copies the token into the new 5469ea3352f4SAlex Elder * buffer. The copy is guaranteed to be terminated with '\0'. 
Note 5470ea3352f4SAlex Elder * that a duplicate buffer is created even for a zero-length token. 5471ea3352f4SAlex Elder * 5472ea3352f4SAlex Elder * Returns a pointer to the newly-allocated duplicate, or a null 5473ea3352f4SAlex Elder * pointer if memory for the duplicate was not available. If 5474ea3352f4SAlex Elder * the lenp argument is a non-null pointer, the length of the token 5475ea3352f4SAlex Elder * (not including the '\0') is returned in *lenp. 5476ea3352f4SAlex Elder * 5477ea3352f4SAlex Elder * If successful, the *buf pointer will be updated to point beyond 5478ea3352f4SAlex Elder * the end of the found token. 5479ea3352f4SAlex Elder * 5480ea3352f4SAlex Elder * Note: uses GFP_KERNEL for allocation. 5481ea3352f4SAlex Elder */ 5482ea3352f4SAlex Elder static inline char *dup_token(const char **buf, size_t *lenp) 5483ea3352f4SAlex Elder { 5484ea3352f4SAlex Elder char *dup; 5485ea3352f4SAlex Elder size_t len; 5486ea3352f4SAlex Elder 5487ea3352f4SAlex Elder len = next_token(buf); 54884caf35f9SAlex Elder dup = kmemdup(*buf, len + 1, GFP_KERNEL); 5489ea3352f4SAlex Elder if (!dup) 5490ea3352f4SAlex Elder return NULL; 5491ea3352f4SAlex Elder *(dup + len) = '\0'; 5492ea3352f4SAlex Elder *buf += len; 5493ea3352f4SAlex Elder 5494ea3352f4SAlex Elder if (lenp) 5495ea3352f4SAlex Elder *lenp = len; 5496ea3352f4SAlex Elder 5497ea3352f4SAlex Elder return dup; 5498ea3352f4SAlex Elder } 5499ea3352f4SAlex Elder 5500ea3352f4SAlex Elder /* 5501859c31dfSAlex Elder * Parse the options provided for an "rbd add" (i.e., rbd image 5502859c31dfSAlex Elder * mapping) request. These arrive via a write to /sys/bus/rbd/add, 5503859c31dfSAlex Elder * and the data written is passed here via a NUL-terminated buffer. 5504859c31dfSAlex Elder * Returns 0 if successful or an error code otherwise. 
5505d22f76e7SAlex Elder * 5506859c31dfSAlex Elder * The information extracted from these options is recorded in 5507859c31dfSAlex Elder * the other parameters which return dynamically-allocated 5508859c31dfSAlex Elder * structures: 5509859c31dfSAlex Elder * ceph_opts 5510859c31dfSAlex Elder * The address of a pointer that will refer to a ceph options 5511859c31dfSAlex Elder * structure. Caller must release the returned pointer using 5512859c31dfSAlex Elder * ceph_destroy_options() when it is no longer needed. 5513859c31dfSAlex Elder * rbd_opts 5514859c31dfSAlex Elder * Address of an rbd options pointer. Fully initialized by 5515859c31dfSAlex Elder * this function; caller must release with kfree(). 5516859c31dfSAlex Elder * spec 5517859c31dfSAlex Elder * Address of an rbd image specification pointer. Fully 5518859c31dfSAlex Elder * initialized by this function based on parsed options. 5519859c31dfSAlex Elder * Caller must release with rbd_spec_put(). 5520859c31dfSAlex Elder * 5521859c31dfSAlex Elder * The options passed take this form: 5522859c31dfSAlex Elder * <mon_addrs> <options> <pool_name> <image_name> [<snap_id>] 5523859c31dfSAlex Elder * where: 5524859c31dfSAlex Elder * <mon_addrs> 5525859c31dfSAlex Elder * A comma-separated list of one or more monitor addresses. 5526859c31dfSAlex Elder * A monitor address is an ip address, optionally followed 5527859c31dfSAlex Elder * by a port number (separated by a colon). 5528859c31dfSAlex Elder * I.e.: ip1[:port1][,ip2[:port2]...] 5529859c31dfSAlex Elder * <options> 5530859c31dfSAlex Elder * A comma-separated list of ceph and/or rbd options. 5531859c31dfSAlex Elder * <pool_name> 5532859c31dfSAlex Elder * The name of the rados pool containing the rbd image. 5533859c31dfSAlex Elder * <image_name> 5534859c31dfSAlex Elder * The name of the image in that pool to map. 5535859c31dfSAlex Elder * <snap_id> 5536859c31dfSAlex Elder * An optional snapshot id. 
If provided, the mapping will 5537859c31dfSAlex Elder * present data from the image at the time that snapshot was 5538859c31dfSAlex Elder * created. The image head is used if no snapshot id is 5539859c31dfSAlex Elder * provided. Snapshot mappings are always read-only. 5540a725f65eSAlex Elder */ 5541859c31dfSAlex Elder static int rbd_add_parse_args(const char *buf, 5542dc79b113SAlex Elder struct ceph_options **ceph_opts, 5543859c31dfSAlex Elder struct rbd_options **opts, 5544859c31dfSAlex Elder struct rbd_spec **rbd_spec) 5545a725f65eSAlex Elder { 5546e28fff26SAlex Elder size_t len; 5547859c31dfSAlex Elder char *options; 55480ddebc0cSAlex Elder const char *mon_addrs; 5549ecb4dc22SAlex Elder char *snap_name; 55500ddebc0cSAlex Elder size_t mon_addrs_size; 5551859c31dfSAlex Elder struct rbd_spec *spec = NULL; 55524e9afebaSAlex Elder struct rbd_options *rbd_opts = NULL; 5553859c31dfSAlex Elder struct ceph_options *copts; 5554dc79b113SAlex Elder int ret; 5555e28fff26SAlex Elder 5556e28fff26SAlex Elder /* The first four tokens are required */ 5557e28fff26SAlex Elder 55587ef3214aSAlex Elder len = next_token(&buf); 55594fb5d671SAlex Elder if (!len) { 55604fb5d671SAlex Elder rbd_warn(NULL, "no monitor address(es) provided"); 55614fb5d671SAlex Elder return -EINVAL; 55624fb5d671SAlex Elder } 55630ddebc0cSAlex Elder mon_addrs = buf; 5564f28e565aSAlex Elder mon_addrs_size = len + 1; 55657ef3214aSAlex Elder buf += len; 5566a725f65eSAlex Elder 5567dc79b113SAlex Elder ret = -EINVAL; 5568f28e565aSAlex Elder options = dup_token(&buf, NULL); 5569f28e565aSAlex Elder if (!options) 5570dc79b113SAlex Elder return -ENOMEM; 55714fb5d671SAlex Elder if (!*options) { 55724fb5d671SAlex Elder rbd_warn(NULL, "no options provided"); 55734fb5d671SAlex Elder goto out_err; 55744fb5d671SAlex Elder } 5575a725f65eSAlex Elder 5576859c31dfSAlex Elder spec = rbd_spec_alloc(); 5577859c31dfSAlex Elder if (!spec) 5578f28e565aSAlex Elder goto out_mem; 5579859c31dfSAlex Elder 5580859c31dfSAlex Elder 
spec->pool_name = dup_token(&buf, NULL); 5581859c31dfSAlex Elder if (!spec->pool_name) 5582859c31dfSAlex Elder goto out_mem; 55834fb5d671SAlex Elder if (!*spec->pool_name) { 55844fb5d671SAlex Elder rbd_warn(NULL, "no pool name provided"); 55854fb5d671SAlex Elder goto out_err; 55864fb5d671SAlex Elder } 5587e28fff26SAlex Elder 558869e7a02fSAlex Elder spec->image_name = dup_token(&buf, NULL); 5589859c31dfSAlex Elder if (!spec->image_name) 5590f28e565aSAlex Elder goto out_mem; 55914fb5d671SAlex Elder if (!*spec->image_name) { 55924fb5d671SAlex Elder rbd_warn(NULL, "no image name provided"); 55934fb5d671SAlex Elder goto out_err; 55944fb5d671SAlex Elder } 5595e28fff26SAlex Elder 5596f28e565aSAlex Elder /* 5597f28e565aSAlex Elder * Snapshot name is optional; default is to use "-" 5598f28e565aSAlex Elder * (indicating the head/no snapshot). 5599f28e565aSAlex Elder */ 56003feeb894SAlex Elder len = next_token(&buf); 5601820a5f3eSAlex Elder if (!len) { 56023feeb894SAlex Elder buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */ 56033feeb894SAlex Elder len = sizeof (RBD_SNAP_HEAD_NAME) - 1; 5604f28e565aSAlex Elder } else if (len > RBD_MAX_SNAP_NAME_LEN) { 5605dc79b113SAlex Elder ret = -ENAMETOOLONG; 5606f28e565aSAlex Elder goto out_err; 5607849b4260SAlex Elder } 5608ecb4dc22SAlex Elder snap_name = kmemdup(buf, len + 1, GFP_KERNEL); 5609ecb4dc22SAlex Elder if (!snap_name) 5610f28e565aSAlex Elder goto out_mem; 5611ecb4dc22SAlex Elder *(snap_name + len) = '\0'; 5612ecb4dc22SAlex Elder spec->snap_name = snap_name; 5613e5c35534SAlex Elder 56140ddebc0cSAlex Elder /* Initialize all rbd options to the defaults */ 5615e28fff26SAlex Elder 56164e9afebaSAlex Elder rbd_opts = kzalloc(sizeof (*rbd_opts), GFP_KERNEL); 56174e9afebaSAlex Elder if (!rbd_opts) 56184e9afebaSAlex Elder goto out_mem; 56194e9afebaSAlex Elder 56204e9afebaSAlex Elder rbd_opts->read_only = RBD_READ_ONLY_DEFAULT; 5621b5584180SIlya Dryomov rbd_opts->queue_depth = RBD_QUEUE_DEPTH_DEFAULT; 562280de1912SIlya Dryomov 
rbd_opts->lock_on_read = RBD_LOCK_ON_READ_DEFAULT; 5623d22f76e7SAlex Elder 5624859c31dfSAlex Elder copts = ceph_parse_options(options, mon_addrs, 56250ddebc0cSAlex Elder mon_addrs + mon_addrs_size - 1, 56264e9afebaSAlex Elder parse_rbd_opts_token, rbd_opts); 5627859c31dfSAlex Elder if (IS_ERR(copts)) { 5628859c31dfSAlex Elder ret = PTR_ERR(copts); 5629dc79b113SAlex Elder goto out_err; 5630dc79b113SAlex Elder } 5631859c31dfSAlex Elder kfree(options); 5632859c31dfSAlex Elder 5633859c31dfSAlex Elder *ceph_opts = copts; 56344e9afebaSAlex Elder *opts = rbd_opts; 5635859c31dfSAlex Elder *rbd_spec = spec; 56360ddebc0cSAlex Elder 5637dc79b113SAlex Elder return 0; 5638f28e565aSAlex Elder out_mem: 5639dc79b113SAlex Elder ret = -ENOMEM; 5640d22f76e7SAlex Elder out_err: 5641859c31dfSAlex Elder kfree(rbd_opts); 5642859c31dfSAlex Elder rbd_spec_put(spec); 5643f28e565aSAlex Elder kfree(options); 5644d22f76e7SAlex Elder 5645dc79b113SAlex Elder return ret; 5646a725f65eSAlex Elder } 5647a725f65eSAlex Elder 5648589d30e0SAlex Elder /* 564930ba1f02SIlya Dryomov * Return pool id (>= 0) or a negative error code. 
565030ba1f02SIlya Dryomov */ 565130ba1f02SIlya Dryomov static int rbd_add_get_pool_id(struct rbd_client *rbdc, const char *pool_name) 565230ba1f02SIlya Dryomov { 5653a319bf56SIlya Dryomov struct ceph_options *opts = rbdc->client->options; 565430ba1f02SIlya Dryomov u64 newest_epoch; 565530ba1f02SIlya Dryomov int tries = 0; 565630ba1f02SIlya Dryomov int ret; 565730ba1f02SIlya Dryomov 565830ba1f02SIlya Dryomov again: 565930ba1f02SIlya Dryomov ret = ceph_pg_poolid_by_name(rbdc->client->osdc.osdmap, pool_name); 566030ba1f02SIlya Dryomov if (ret == -ENOENT && tries++ < 1) { 5661d0b19705SIlya Dryomov ret = ceph_monc_get_version(&rbdc->client->monc, "osdmap", 566230ba1f02SIlya Dryomov &newest_epoch); 566330ba1f02SIlya Dryomov if (ret < 0) 566430ba1f02SIlya Dryomov return ret; 566530ba1f02SIlya Dryomov 566630ba1f02SIlya Dryomov if (rbdc->client->osdc.osdmap->epoch < newest_epoch) { 56677cca78c9SIlya Dryomov ceph_osdc_maybe_request_map(&rbdc->client->osdc); 566830ba1f02SIlya Dryomov (void) ceph_monc_wait_osdmap(&rbdc->client->monc, 5669a319bf56SIlya Dryomov newest_epoch, 5670a319bf56SIlya Dryomov opts->mount_timeout); 567130ba1f02SIlya Dryomov goto again; 567230ba1f02SIlya Dryomov } else { 567330ba1f02SIlya Dryomov /* the osdmap we have is new enough */ 567430ba1f02SIlya Dryomov return -ENOENT; 567530ba1f02SIlya Dryomov } 567630ba1f02SIlya Dryomov } 567730ba1f02SIlya Dryomov 567830ba1f02SIlya Dryomov return ret; 567930ba1f02SIlya Dryomov } 568030ba1f02SIlya Dryomov 568130ba1f02SIlya Dryomov /* 5682589d30e0SAlex Elder * An rbd format 2 image has a unique identifier, distinct from the 5683589d30e0SAlex Elder * name given to it by the user. Internally, that identifier is 5684589d30e0SAlex Elder * what's used to specify the names of objects related to the image. 5685589d30e0SAlex Elder * 5686589d30e0SAlex Elder * A special "rbd id" object is used to map an rbd image name to its 5687589d30e0SAlex Elder * id. 
If that object doesn't exist, then there is no v2 rbd image 5688589d30e0SAlex Elder * with the supplied name. 5689589d30e0SAlex Elder * 5690589d30e0SAlex Elder * This function will record the given rbd_dev's image_id field if 5691589d30e0SAlex Elder * it can be determined, and in that case will return 0. If any 5692589d30e0SAlex Elder * errors occur a negative errno will be returned and the rbd_dev's 5693589d30e0SAlex Elder * image_id field will be unchanged (and should be NULL). 5694589d30e0SAlex Elder */ 5695589d30e0SAlex Elder static int rbd_dev_image_id(struct rbd_device *rbd_dev) 5696589d30e0SAlex Elder { 5697589d30e0SAlex Elder int ret; 5698589d30e0SAlex Elder size_t size; 5699ecd4a68aSIlya Dryomov CEPH_DEFINE_OID_ONSTACK(oid); 5700589d30e0SAlex Elder void *response; 5701c0fba368SAlex Elder char *image_id; 57022f82ee54SAlex Elder 5703589d30e0SAlex Elder /* 57042c0d0a10SAlex Elder * When probing a parent image, the image id is already 57052c0d0a10SAlex Elder * known (and the image name likely is not). There's no 5706c0fba368SAlex Elder * need to fetch the image id again in this case. We 5707c0fba368SAlex Elder * do still need to set the image format though. 57082c0d0a10SAlex Elder */ 5709c0fba368SAlex Elder if (rbd_dev->spec->image_id) { 5710c0fba368SAlex Elder rbd_dev->image_format = *rbd_dev->spec->image_id ? 2 : 1; 5711c0fba368SAlex Elder 57122c0d0a10SAlex Elder return 0; 5713c0fba368SAlex Elder } 57142c0d0a10SAlex Elder 57152c0d0a10SAlex Elder /* 5716589d30e0SAlex Elder * First, see if the format 2 image id file exists, and if 5717589d30e0SAlex Elder * so, get the image's persistent id from it. 
5718589d30e0SAlex Elder */ 5719ecd4a68aSIlya Dryomov ret = ceph_oid_aprintf(&oid, GFP_KERNEL, "%s%s", RBD_ID_PREFIX, 5720ecd4a68aSIlya Dryomov rbd_dev->spec->image_name); 5721ecd4a68aSIlya Dryomov if (ret) 5722ecd4a68aSIlya Dryomov return ret; 5723ecd4a68aSIlya Dryomov 5724ecd4a68aSIlya Dryomov dout("rbd id object name is %s\n", oid.name); 5725589d30e0SAlex Elder 5726589d30e0SAlex Elder /* Response will be an encoded string, which includes a length */ 5727589d30e0SAlex Elder 5728589d30e0SAlex Elder size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX; 5729589d30e0SAlex Elder response = kzalloc(size, GFP_NOIO); 5730589d30e0SAlex Elder if (!response) { 5731589d30e0SAlex Elder ret = -ENOMEM; 5732589d30e0SAlex Elder goto out; 5733589d30e0SAlex Elder } 5734589d30e0SAlex Elder 5735c0fba368SAlex Elder /* If it doesn't exist we'll assume it's a format 1 image */ 5736c0fba368SAlex Elder 5737ecd4a68aSIlya Dryomov ret = rbd_obj_method_sync(rbd_dev, &oid, &rbd_dev->header_oloc, 5738ecd4a68aSIlya Dryomov "get_id", NULL, 0, 5739e2a58ee5SAlex Elder response, RBD_IMAGE_ID_LEN_MAX); 574036be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 5741c0fba368SAlex Elder if (ret == -ENOENT) { 5742c0fba368SAlex Elder image_id = kstrdup("", GFP_KERNEL); 5743c0fba368SAlex Elder ret = image_id ? 
0 : -ENOMEM; 5744c0fba368SAlex Elder if (!ret) 5745c0fba368SAlex Elder rbd_dev->image_format = 1; 57467dd440c9SIlya Dryomov } else if (ret >= 0) { 5747c0fba368SAlex Elder void *p = response; 5748589d30e0SAlex Elder 5749c0fba368SAlex Elder image_id = ceph_extract_encoded_string(&p, p + ret, 5750979ed480SAlex Elder NULL, GFP_NOIO); 5751461f758aSDuan Jiong ret = PTR_ERR_OR_ZERO(image_id); 5752c0fba368SAlex Elder if (!ret) 5753c0fba368SAlex Elder rbd_dev->image_format = 2; 5754c0fba368SAlex Elder } 5755c0fba368SAlex Elder 5756c0fba368SAlex Elder if (!ret) { 5757c0fba368SAlex Elder rbd_dev->spec->image_id = image_id; 5758c0fba368SAlex Elder dout("image_id is %s\n", image_id); 5759589d30e0SAlex Elder } 5760589d30e0SAlex Elder out: 5761589d30e0SAlex Elder kfree(response); 5762ecd4a68aSIlya Dryomov ceph_oid_destroy(&oid); 5763589d30e0SAlex Elder return ret; 5764589d30e0SAlex Elder } 5765589d30e0SAlex Elder 57663abef3b3SAlex Elder /* 57673abef3b3SAlex Elder * Undo whatever state changes are made by v1 or v2 header info 57683abef3b3SAlex Elder * call. 
57693abef3b3SAlex Elder */ 57706fd48b3bSAlex Elder static void rbd_dev_unprobe(struct rbd_device *rbd_dev) 57716fd48b3bSAlex Elder { 57726fd48b3bSAlex Elder struct rbd_image_header *header; 57736fd48b3bSAlex Elder 5774a2acd00eSAlex Elder rbd_dev_parent_put(rbd_dev); 57756fd48b3bSAlex Elder 57766fd48b3bSAlex Elder /* Free dynamic fields from the header, then zero it out */ 57776fd48b3bSAlex Elder 57786fd48b3bSAlex Elder header = &rbd_dev->header; 5779812164f8SAlex Elder ceph_put_snap_context(header->snapc); 57806fd48b3bSAlex Elder kfree(header->snap_sizes); 57816fd48b3bSAlex Elder kfree(header->snap_names); 57826fd48b3bSAlex Elder kfree(header->object_prefix); 57836fd48b3bSAlex Elder memset(header, 0, sizeof (*header)); 57846fd48b3bSAlex Elder } 57856fd48b3bSAlex Elder 57862df3fac7SAlex Elder static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev) 5787a30b71b9SAlex Elder { 5788a30b71b9SAlex Elder int ret; 5789a30b71b9SAlex Elder 57901e130199SAlex Elder ret = rbd_dev_v2_object_prefix(rbd_dev); 579157385b51SAlex Elder if (ret) 57921e130199SAlex Elder goto out_err; 5793b1b5402aSAlex Elder 57942df3fac7SAlex Elder /* 57952df3fac7SAlex Elder * Get the and check features for the image. Currently the 57962df3fac7SAlex Elder * features are assumed to never change. 
57972df3fac7SAlex Elder */ 5798b1b5402aSAlex Elder ret = rbd_dev_v2_features(rbd_dev); 579957385b51SAlex Elder if (ret) 5800b1b5402aSAlex Elder goto out_err; 580135d489f9SAlex Elder 5802cc070d59SAlex Elder /* If the image supports fancy striping, get its parameters */ 5803cc070d59SAlex Elder 5804cc070d59SAlex Elder if (rbd_dev->header.features & RBD_FEATURE_STRIPINGV2) { 5805cc070d59SAlex Elder ret = rbd_dev_v2_striping_info(rbd_dev); 5806cc070d59SAlex Elder if (ret < 0) 5807cc070d59SAlex Elder goto out_err; 5808cc070d59SAlex Elder } 5809a30b71b9SAlex Elder 58107e97332eSIlya Dryomov if (rbd_dev->header.features & RBD_FEATURE_DATA_POOL) { 58117e97332eSIlya Dryomov ret = rbd_dev_v2_data_pool(rbd_dev); 58127e97332eSIlya Dryomov if (ret) 58137e97332eSIlya Dryomov goto out_err; 58147e97332eSIlya Dryomov } 58157e97332eSIlya Dryomov 5816263423f8SIlya Dryomov rbd_init_layout(rbd_dev); 581735152979SAlex Elder return 0; 5818263423f8SIlya Dryomov 58199d475de5SAlex Elder out_err: 5820642a2537SAlex Elder rbd_dev->header.features = 0; 58211e130199SAlex Elder kfree(rbd_dev->header.object_prefix); 58221e130199SAlex Elder rbd_dev->header.object_prefix = NULL; 58239d475de5SAlex Elder return ret; 5824a30b71b9SAlex Elder } 5825a30b71b9SAlex Elder 58266d69bb53SIlya Dryomov /* 58276d69bb53SIlya Dryomov * @depth is rbd_dev_image_probe() -> rbd_dev_probe_parent() -> 58286d69bb53SIlya Dryomov * rbd_dev_image_probe() recursion depth, which means it's also the 58296d69bb53SIlya Dryomov * length of the already discovered part of the parent chain. 
58306d69bb53SIlya Dryomov */ 58316d69bb53SIlya Dryomov static int rbd_dev_probe_parent(struct rbd_device *rbd_dev, int depth) 583283a06263SAlex Elder { 58332f82ee54SAlex Elder struct rbd_device *parent = NULL; 5834124afba2SAlex Elder int ret; 5835124afba2SAlex Elder 5836124afba2SAlex Elder if (!rbd_dev->parent_spec) 5837124afba2SAlex Elder return 0; 5838124afba2SAlex Elder 58396d69bb53SIlya Dryomov if (++depth > RBD_MAX_PARENT_CHAIN_LEN) { 58406d69bb53SIlya Dryomov pr_info("parent chain is too long (%d)\n", depth); 58416d69bb53SIlya Dryomov ret = -EINVAL; 58426d69bb53SIlya Dryomov goto out_err; 58436d69bb53SIlya Dryomov } 58446d69bb53SIlya Dryomov 58451643dfa4SIlya Dryomov parent = __rbd_dev_create(rbd_dev->rbd_client, rbd_dev->parent_spec); 58461f2c6651SIlya Dryomov if (!parent) { 5847124afba2SAlex Elder ret = -ENOMEM; 5848124afba2SAlex Elder goto out_err; 58491f2c6651SIlya Dryomov } 58501f2c6651SIlya Dryomov 58511f2c6651SIlya Dryomov /* 58521f2c6651SIlya Dryomov * Images related by parent/child relationships always share 58531f2c6651SIlya Dryomov * rbd_client and spec/parent_spec, so bump their refcounts. 
58541f2c6651SIlya Dryomov */ 58551f2c6651SIlya Dryomov __rbd_get_client(rbd_dev->rbd_client); 58561f2c6651SIlya Dryomov rbd_spec_get(rbd_dev->parent_spec); 5857124afba2SAlex Elder 58586d69bb53SIlya Dryomov ret = rbd_dev_image_probe(parent, depth); 5859124afba2SAlex Elder if (ret < 0) 5860124afba2SAlex Elder goto out_err; 58611f2c6651SIlya Dryomov 5862124afba2SAlex Elder rbd_dev->parent = parent; 5863a2acd00eSAlex Elder atomic_set(&rbd_dev->parent_ref, 1); 5864124afba2SAlex Elder return 0; 5865124afba2SAlex Elder 58661f2c6651SIlya Dryomov out_err: 58671f2c6651SIlya Dryomov rbd_dev_unparent(rbd_dev); 58681f2c6651SIlya Dryomov rbd_dev_destroy(parent); 5869124afba2SAlex Elder return ret; 5870124afba2SAlex Elder } 5871124afba2SAlex Elder 58725769ed0cSIlya Dryomov static void rbd_dev_device_release(struct rbd_device *rbd_dev) 58735769ed0cSIlya Dryomov { 58745769ed0cSIlya Dryomov clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags); 58755769ed0cSIlya Dryomov rbd_dev_mapping_clear(rbd_dev); 58765769ed0cSIlya Dryomov rbd_free_disk(rbd_dev); 58775769ed0cSIlya Dryomov if (!single_major) 58785769ed0cSIlya Dryomov unregister_blkdev(rbd_dev->major, rbd_dev->name); 58795769ed0cSIlya Dryomov } 58805769ed0cSIlya Dryomov 5881811c6688SIlya Dryomov /* 5882811c6688SIlya Dryomov * rbd_dev->header_rwsem must be locked for write and will be unlocked 5883811c6688SIlya Dryomov * upon return. 5884811c6688SIlya Dryomov */ 5885200a6a8bSAlex Elder static int rbd_dev_device_setup(struct rbd_device *rbd_dev) 5886124afba2SAlex Elder { 588783a06263SAlex Elder int ret; 588883a06263SAlex Elder 58899b60e70bSIlya Dryomov /* Record our major and minor device numbers. 
*/ 589083a06263SAlex Elder 58919b60e70bSIlya Dryomov if (!single_major) { 589283a06263SAlex Elder ret = register_blkdev(0, rbd_dev->name); 589383a06263SAlex Elder if (ret < 0) 58941643dfa4SIlya Dryomov goto err_out_unlock; 58959b60e70bSIlya Dryomov 589683a06263SAlex Elder rbd_dev->major = ret; 5897dd82fff1SIlya Dryomov rbd_dev->minor = 0; 58989b60e70bSIlya Dryomov } else { 58999b60e70bSIlya Dryomov rbd_dev->major = rbd_major; 59009b60e70bSIlya Dryomov rbd_dev->minor = rbd_dev_id_to_minor(rbd_dev->dev_id); 59019b60e70bSIlya Dryomov } 590283a06263SAlex Elder 590383a06263SAlex Elder /* Set up the blkdev mapping. */ 590483a06263SAlex Elder 590583a06263SAlex Elder ret = rbd_init_disk(rbd_dev); 590683a06263SAlex Elder if (ret) 590783a06263SAlex Elder goto err_out_blkdev; 590883a06263SAlex Elder 5909f35a4deeSAlex Elder ret = rbd_dev_mapping_set(rbd_dev); 591083a06263SAlex Elder if (ret) 591183a06263SAlex Elder goto err_out_disk; 5912bc1ecc65SIlya Dryomov 5913f35a4deeSAlex Elder set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE); 591422001f61SJosh Durgin set_disk_ro(rbd_dev->disk, rbd_dev->mapping.read_only); 5915f35a4deeSAlex Elder 59165769ed0cSIlya Dryomov ret = dev_set_name(&rbd_dev->dev, "%d", rbd_dev->dev_id); 5917f35a4deeSAlex Elder if (ret) 5918f5ee37bdSIlya Dryomov goto err_out_mapping; 591983a06263SAlex Elder 5920129b79d4SAlex Elder set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags); 5921811c6688SIlya Dryomov up_write(&rbd_dev->header_rwsem); 59225769ed0cSIlya Dryomov return 0; 59232f82ee54SAlex Elder 5924f35a4deeSAlex Elder err_out_mapping: 5925f35a4deeSAlex Elder rbd_dev_mapping_clear(rbd_dev); 592683a06263SAlex Elder err_out_disk: 592783a06263SAlex Elder rbd_free_disk(rbd_dev); 592883a06263SAlex Elder err_out_blkdev: 59299b60e70bSIlya Dryomov if (!single_major) 593083a06263SAlex Elder unregister_blkdev(rbd_dev->major, rbd_dev->name); 5931811c6688SIlya Dryomov err_out_unlock: 5932811c6688SIlya Dryomov up_write(&rbd_dev->header_rwsem); 593383a06263SAlex 
Elder return ret; 593483a06263SAlex Elder } 593583a06263SAlex Elder 5936332bb12dSAlex Elder static int rbd_dev_header_name(struct rbd_device *rbd_dev) 5937332bb12dSAlex Elder { 5938332bb12dSAlex Elder struct rbd_spec *spec = rbd_dev->spec; 5939c41d13a3SIlya Dryomov int ret; 5940332bb12dSAlex Elder 5941332bb12dSAlex Elder /* Record the header object name for this rbd image. */ 5942332bb12dSAlex Elder 5943332bb12dSAlex Elder rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); 5944332bb12dSAlex Elder if (rbd_dev->image_format == 1) 5945c41d13a3SIlya Dryomov ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s", 5946332bb12dSAlex Elder spec->image_name, RBD_SUFFIX); 5947332bb12dSAlex Elder else 5948c41d13a3SIlya Dryomov ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s", 5949332bb12dSAlex Elder RBD_HEADER_PREFIX, spec->image_id); 5950c41d13a3SIlya Dryomov 5951c41d13a3SIlya Dryomov return ret; 5952332bb12dSAlex Elder } 5953332bb12dSAlex Elder 5954200a6a8bSAlex Elder static void rbd_dev_image_release(struct rbd_device *rbd_dev) 5955200a6a8bSAlex Elder { 59566fd48b3bSAlex Elder rbd_dev_unprobe(rbd_dev); 5957fd22aef8SIlya Dryomov if (rbd_dev->opts) 5958fd22aef8SIlya Dryomov rbd_unregister_watch(rbd_dev); 59596fd48b3bSAlex Elder rbd_dev->image_format = 0; 59606fd48b3bSAlex Elder kfree(rbd_dev->spec->image_id); 59616fd48b3bSAlex Elder rbd_dev->spec->image_id = NULL; 5962200a6a8bSAlex Elder } 5963200a6a8bSAlex Elder 5964a30b71b9SAlex Elder /* 5965a30b71b9SAlex Elder * Probe for the existence of the header object for the given rbd 59661f3ef788SAlex Elder * device. If this image is the one being mapped (i.e., not a 59671f3ef788SAlex Elder * parent), initiate a watch on its header object before using that 59681f3ef788SAlex Elder * object to get detailed information about the rbd image. 
5969a30b71b9SAlex Elder */ 59706d69bb53SIlya Dryomov static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth) 5971a30b71b9SAlex Elder { 5972a30b71b9SAlex Elder int ret; 5973a30b71b9SAlex Elder 5974a30b71b9SAlex Elder /* 59753abef3b3SAlex Elder * Get the id from the image id object. Unless there's an 59763abef3b3SAlex Elder * error, rbd_dev->spec->image_id will be filled in with 59773abef3b3SAlex Elder * a dynamically-allocated string, and rbd_dev->image_format 59783abef3b3SAlex Elder * will be set to either 1 or 2. 5979a30b71b9SAlex Elder */ 5980a30b71b9SAlex Elder ret = rbd_dev_image_id(rbd_dev); 5981a30b71b9SAlex Elder if (ret) 5982c0fba368SAlex Elder return ret; 5983c0fba368SAlex Elder 5984332bb12dSAlex Elder ret = rbd_dev_header_name(rbd_dev); 5985332bb12dSAlex Elder if (ret) 5986332bb12dSAlex Elder goto err_out_format; 5987332bb12dSAlex Elder 59886d69bb53SIlya Dryomov if (!depth) { 598999d16943SIlya Dryomov ret = rbd_register_watch(rbd_dev); 59901fe48023SIlya Dryomov if (ret) { 59911fe48023SIlya Dryomov if (ret == -ENOENT) 59921fe48023SIlya Dryomov pr_info("image %s/%s does not exist\n", 59931fe48023SIlya Dryomov rbd_dev->spec->pool_name, 59941fe48023SIlya Dryomov rbd_dev->spec->image_name); 5995c41d13a3SIlya Dryomov goto err_out_format; 59961f3ef788SAlex Elder } 59971fe48023SIlya Dryomov } 5998b644de2bSAlex Elder 5999a720ae09SIlya Dryomov ret = rbd_dev_header_info(rbd_dev); 60005655c4d9SAlex Elder if (ret) 6001b644de2bSAlex Elder goto err_out_watch; 6002a30b71b9SAlex Elder 600304077599SIlya Dryomov /* 600404077599SIlya Dryomov * If this image is the one being mapped, we have pool name and 600504077599SIlya Dryomov * id, image name and id, and snap name - need to fill snap id. 600604077599SIlya Dryomov * Otherwise this is a parent image, identified by pool, image 600704077599SIlya Dryomov * and snap ids - need to fill in names for those ids. 
600804077599SIlya Dryomov */ 60096d69bb53SIlya Dryomov if (!depth) 601004077599SIlya Dryomov ret = rbd_spec_fill_snap_id(rbd_dev); 601104077599SIlya Dryomov else 601204077599SIlya Dryomov ret = rbd_spec_fill_names(rbd_dev); 60131fe48023SIlya Dryomov if (ret) { 60141fe48023SIlya Dryomov if (ret == -ENOENT) 60151fe48023SIlya Dryomov pr_info("snap %s/%s@%s does not exist\n", 60161fe48023SIlya Dryomov rbd_dev->spec->pool_name, 60171fe48023SIlya Dryomov rbd_dev->spec->image_name, 60181fe48023SIlya Dryomov rbd_dev->spec->snap_name); 601933dca39fSAlex Elder goto err_out_probe; 60201fe48023SIlya Dryomov } 60219bb81c9bSAlex Elder 6022e8f59b59SIlya Dryomov if (rbd_dev->header.features & RBD_FEATURE_LAYERING) { 6023e8f59b59SIlya Dryomov ret = rbd_dev_v2_parent_info(rbd_dev); 6024e8f59b59SIlya Dryomov if (ret) 6025e8f59b59SIlya Dryomov goto err_out_probe; 6026e8f59b59SIlya Dryomov 6027e8f59b59SIlya Dryomov /* 6028e8f59b59SIlya Dryomov * Need to warn users if this image is the one being 6029e8f59b59SIlya Dryomov * mapped and has a parent. 
6030e8f59b59SIlya Dryomov */ 60316d69bb53SIlya Dryomov if (!depth && rbd_dev->parent_spec) 6032e8f59b59SIlya Dryomov rbd_warn(rbd_dev, 6033e8f59b59SIlya Dryomov "WARNING: kernel layering is EXPERIMENTAL!"); 6034e8f59b59SIlya Dryomov } 6035e8f59b59SIlya Dryomov 60366d69bb53SIlya Dryomov ret = rbd_dev_probe_parent(rbd_dev, depth); 603730d60ba2SAlex Elder if (ret) 603830d60ba2SAlex Elder goto err_out_probe; 603983a06263SAlex Elder 604030d60ba2SAlex Elder dout("discovered format %u image, header name is %s\n", 6041c41d13a3SIlya Dryomov rbd_dev->image_format, rbd_dev->header_oid.name); 604230d60ba2SAlex Elder return 0; 6043e8f59b59SIlya Dryomov 60446fd48b3bSAlex Elder err_out_probe: 60456fd48b3bSAlex Elder rbd_dev_unprobe(rbd_dev); 6046b644de2bSAlex Elder err_out_watch: 60476d69bb53SIlya Dryomov if (!depth) 604899d16943SIlya Dryomov rbd_unregister_watch(rbd_dev); 6049332bb12dSAlex Elder err_out_format: 6050332bb12dSAlex Elder rbd_dev->image_format = 0; 60515655c4d9SAlex Elder kfree(rbd_dev->spec->image_id); 60525655c4d9SAlex Elder rbd_dev->spec->image_id = NULL; 60535655c4d9SAlex Elder return ret; 605483a06263SAlex Elder } 605583a06263SAlex Elder 60569b60e70bSIlya Dryomov static ssize_t do_rbd_add(struct bus_type *bus, 605759c2be1eSYehuda Sadeh const char *buf, 605859c2be1eSYehuda Sadeh size_t count) 6059602adf40SYehuda Sadeh { 6060cb8627c7SAlex Elder struct rbd_device *rbd_dev = NULL; 6061dc79b113SAlex Elder struct ceph_options *ceph_opts = NULL; 60624e9afebaSAlex Elder struct rbd_options *rbd_opts = NULL; 6063859c31dfSAlex Elder struct rbd_spec *spec = NULL; 60649d3997fdSAlex Elder struct rbd_client *rbdc; 606551344a38SAlex Elder bool read_only; 6066b51c83c2SIlya Dryomov int rc; 6067602adf40SYehuda Sadeh 6068602adf40SYehuda Sadeh if (!try_module_get(THIS_MODULE)) 6069602adf40SYehuda Sadeh return -ENODEV; 6070602adf40SYehuda Sadeh 6071a725f65eSAlex Elder /* parse add command */ 6072859c31dfSAlex Elder rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec); 
6073dc79b113SAlex Elder if (rc < 0) 6074dd5ac32dSIlya Dryomov goto out; 6075a725f65eSAlex Elder 60769d3997fdSAlex Elder rbdc = rbd_get_client(ceph_opts); 60779d3997fdSAlex Elder if (IS_ERR(rbdc)) { 60789d3997fdSAlex Elder rc = PTR_ERR(rbdc); 60790ddebc0cSAlex Elder goto err_out_args; 60809d3997fdSAlex Elder } 6081602adf40SYehuda Sadeh 6082602adf40SYehuda Sadeh /* pick the pool */ 608330ba1f02SIlya Dryomov rc = rbd_add_get_pool_id(rbdc, spec->pool_name); 60841fe48023SIlya Dryomov if (rc < 0) { 60851fe48023SIlya Dryomov if (rc == -ENOENT) 60861fe48023SIlya Dryomov pr_info("pool %s does not exist\n", spec->pool_name); 6087602adf40SYehuda Sadeh goto err_out_client; 60881fe48023SIlya Dryomov } 6089859c31dfSAlex Elder spec->pool_id = (u64)rc; 6090859c31dfSAlex Elder 6091d147543dSIlya Dryomov rbd_dev = rbd_dev_create(rbdc, spec, rbd_opts); 6092b51c83c2SIlya Dryomov if (!rbd_dev) { 6093b51c83c2SIlya Dryomov rc = -ENOMEM; 6094bd4ba655SAlex Elder goto err_out_client; 6095b51c83c2SIlya Dryomov } 6096c53d5893SAlex Elder rbdc = NULL; /* rbd_dev now owns this */ 6097c53d5893SAlex Elder spec = NULL; /* rbd_dev now owns this */ 6098d147543dSIlya Dryomov rbd_opts = NULL; /* rbd_dev now owns this */ 6099602adf40SYehuda Sadeh 61000d6d1e9cSMike Christie rbd_dev->config_info = kstrdup(buf, GFP_KERNEL); 61010d6d1e9cSMike Christie if (!rbd_dev->config_info) { 61020d6d1e9cSMike Christie rc = -ENOMEM; 61030d6d1e9cSMike Christie goto err_out_rbd_dev; 61040d6d1e9cSMike Christie } 61050d6d1e9cSMike Christie 6106811c6688SIlya Dryomov down_write(&rbd_dev->header_rwsem); 61076d69bb53SIlya Dryomov rc = rbd_dev_image_probe(rbd_dev, 0); 61080d6d1e9cSMike Christie if (rc < 0) { 61090d6d1e9cSMike Christie up_write(&rbd_dev->header_rwsem); 6110c53d5893SAlex Elder goto err_out_rbd_dev; 61110d6d1e9cSMike Christie } 611205fd6f6fSAlex Elder 61137ce4eef7SAlex Elder /* If we are mapping a snapshot it must be marked read-only */ 61147ce4eef7SAlex Elder 6115d147543dSIlya Dryomov read_only = 
rbd_dev->opts->read_only; 61167ce4eef7SAlex Elder if (rbd_dev->spec->snap_id != CEPH_NOSNAP) 61177ce4eef7SAlex Elder read_only = true; 61187ce4eef7SAlex Elder rbd_dev->mapping.read_only = read_only; 61197ce4eef7SAlex Elder 6120b536f69aSAlex Elder rc = rbd_dev_device_setup(rbd_dev); 6121fd22aef8SIlya Dryomov if (rc) 61228b679ec5SIlya Dryomov goto err_out_image_probe; 61233abef3b3SAlex Elder 61245769ed0cSIlya Dryomov /* Everything's ready. Announce the disk to the world. */ 61255769ed0cSIlya Dryomov 61265769ed0cSIlya Dryomov rc = device_add(&rbd_dev->dev); 61275769ed0cSIlya Dryomov if (rc) 61285769ed0cSIlya Dryomov goto err_out_device_setup; 61295769ed0cSIlya Dryomov 61305769ed0cSIlya Dryomov add_disk(rbd_dev->disk); 61315769ed0cSIlya Dryomov /* see rbd_init_disk() */ 61325769ed0cSIlya Dryomov blk_put_queue(rbd_dev->disk->queue); 61335769ed0cSIlya Dryomov 61345769ed0cSIlya Dryomov spin_lock(&rbd_dev_list_lock); 61355769ed0cSIlya Dryomov list_add_tail(&rbd_dev->node, &rbd_dev_list); 61365769ed0cSIlya Dryomov spin_unlock(&rbd_dev_list_lock); 61375769ed0cSIlya Dryomov 61385769ed0cSIlya Dryomov pr_info("%s: capacity %llu features 0x%llx\n", rbd_dev->disk->disk_name, 61395769ed0cSIlya Dryomov (unsigned long long)get_capacity(rbd_dev->disk) << SECTOR_SHIFT, 61405769ed0cSIlya Dryomov rbd_dev->header.features); 6141dd5ac32dSIlya Dryomov rc = count; 6142dd5ac32dSIlya Dryomov out: 6143dd5ac32dSIlya Dryomov module_put(THIS_MODULE); 6144dd5ac32dSIlya Dryomov return rc; 6145b536f69aSAlex Elder 61465769ed0cSIlya Dryomov err_out_device_setup: 61475769ed0cSIlya Dryomov rbd_dev_device_release(rbd_dev); 61488b679ec5SIlya Dryomov err_out_image_probe: 61498b679ec5SIlya Dryomov rbd_dev_image_release(rbd_dev); 6150c53d5893SAlex Elder err_out_rbd_dev: 6151c53d5893SAlex Elder rbd_dev_destroy(rbd_dev); 6152bd4ba655SAlex Elder err_out_client: 61539d3997fdSAlex Elder rbd_put_client(rbdc); 61540ddebc0cSAlex Elder err_out_args: 6155859c31dfSAlex Elder rbd_spec_put(spec); 6156d147543dSIlya 
Dryomov kfree(rbd_opts); 6157dd5ac32dSIlya Dryomov goto out; 6158602adf40SYehuda Sadeh } 6159602adf40SYehuda Sadeh 61609b60e70bSIlya Dryomov static ssize_t rbd_add(struct bus_type *bus, 61619b60e70bSIlya Dryomov const char *buf, 61629b60e70bSIlya Dryomov size_t count) 61639b60e70bSIlya Dryomov { 61649b60e70bSIlya Dryomov if (single_major) 61659b60e70bSIlya Dryomov return -EINVAL; 61669b60e70bSIlya Dryomov 61679b60e70bSIlya Dryomov return do_rbd_add(bus, buf, count); 61689b60e70bSIlya Dryomov } 61699b60e70bSIlya Dryomov 61709b60e70bSIlya Dryomov static ssize_t rbd_add_single_major(struct bus_type *bus, 61719b60e70bSIlya Dryomov const char *buf, 61729b60e70bSIlya Dryomov size_t count) 61739b60e70bSIlya Dryomov { 61749b60e70bSIlya Dryomov return do_rbd_add(bus, buf, count); 61759b60e70bSIlya Dryomov } 61769b60e70bSIlya Dryomov 617705a46afdSAlex Elder static void rbd_dev_remove_parent(struct rbd_device *rbd_dev) 617805a46afdSAlex Elder { 6179ad945fc1SAlex Elder while (rbd_dev->parent) { 618005a46afdSAlex Elder struct rbd_device *first = rbd_dev; 618105a46afdSAlex Elder struct rbd_device *second = first->parent; 618205a46afdSAlex Elder struct rbd_device *third; 618305a46afdSAlex Elder 618405a46afdSAlex Elder /* 618505a46afdSAlex Elder * Follow to the parent with no grandparent and 618605a46afdSAlex Elder * remove it. 
618705a46afdSAlex Elder */ 618805a46afdSAlex Elder while (second && (third = second->parent)) { 618905a46afdSAlex Elder first = second; 619005a46afdSAlex Elder second = third; 619105a46afdSAlex Elder } 6192ad945fc1SAlex Elder rbd_assert(second); 61938ad42cd0SAlex Elder rbd_dev_image_release(second); 61948b679ec5SIlya Dryomov rbd_dev_destroy(second); 6195ad945fc1SAlex Elder first->parent = NULL; 6196ad945fc1SAlex Elder first->parent_overlap = 0; 6197ad945fc1SAlex Elder 6198ad945fc1SAlex Elder rbd_assert(first->parent_spec); 619905a46afdSAlex Elder rbd_spec_put(first->parent_spec); 620005a46afdSAlex Elder first->parent_spec = NULL; 620105a46afdSAlex Elder } 620205a46afdSAlex Elder } 620305a46afdSAlex Elder 62049b60e70bSIlya Dryomov static ssize_t do_rbd_remove(struct bus_type *bus, 6205602adf40SYehuda Sadeh const char *buf, 6206602adf40SYehuda Sadeh size_t count) 6207602adf40SYehuda Sadeh { 6208602adf40SYehuda Sadeh struct rbd_device *rbd_dev = NULL; 6209751cc0e3SAlex Elder struct list_head *tmp; 6210751cc0e3SAlex Elder int dev_id; 62110276dca6SMike Christie char opt_buf[6]; 621282a442d2SAlex Elder bool already = false; 62130276dca6SMike Christie bool force = false; 62140d8189e1SAlex Elder int ret; 6215602adf40SYehuda Sadeh 62160276dca6SMike Christie dev_id = -1; 62170276dca6SMike Christie opt_buf[0] = '\0'; 62180276dca6SMike Christie sscanf(buf, "%d %5s", &dev_id, opt_buf); 62190276dca6SMike Christie if (dev_id < 0) { 62200276dca6SMike Christie pr_err("dev_id out of range\n"); 6221602adf40SYehuda Sadeh return -EINVAL; 62220276dca6SMike Christie } 62230276dca6SMike Christie if (opt_buf[0] != '\0') { 62240276dca6SMike Christie if (!strcmp(opt_buf, "force")) { 62250276dca6SMike Christie force = true; 62260276dca6SMike Christie } else { 62270276dca6SMike Christie pr_err("bad remove option at '%s'\n", opt_buf); 62280276dca6SMike Christie return -EINVAL; 62290276dca6SMike Christie } 62300276dca6SMike Christie } 6231602adf40SYehuda Sadeh 6232602adf40SYehuda Sadeh ret = 
-ENOENT; 6233751cc0e3SAlex Elder spin_lock(&rbd_dev_list_lock); 6234751cc0e3SAlex Elder list_for_each(tmp, &rbd_dev_list) { 6235751cc0e3SAlex Elder rbd_dev = list_entry(tmp, struct rbd_device, node); 6236751cc0e3SAlex Elder if (rbd_dev->dev_id == dev_id) { 6237751cc0e3SAlex Elder ret = 0; 6238751cc0e3SAlex Elder break; 6239602adf40SYehuda Sadeh } 6240751cc0e3SAlex Elder } 6241751cc0e3SAlex Elder if (!ret) { 6242a14ea269SAlex Elder spin_lock_irq(&rbd_dev->lock); 62430276dca6SMike Christie if (rbd_dev->open_count && !force) 624442382b70SAlex Elder ret = -EBUSY; 6245b82d167bSAlex Elder else 624682a442d2SAlex Elder already = test_and_set_bit(RBD_DEV_FLAG_REMOVING, 624782a442d2SAlex Elder &rbd_dev->flags); 6248a14ea269SAlex Elder spin_unlock_irq(&rbd_dev->lock); 6249751cc0e3SAlex Elder } 6250751cc0e3SAlex Elder spin_unlock(&rbd_dev_list_lock); 625182a442d2SAlex Elder if (ret < 0 || already) 62521ba0f1e7SAlex Elder return ret; 6253751cc0e3SAlex Elder 62540276dca6SMike Christie if (force) { 62550276dca6SMike Christie /* 62560276dca6SMike Christie * Prevent new IO from being queued and wait for existing 62570276dca6SMike Christie * IO to complete/fail. 
62580276dca6SMike Christie */ 62590276dca6SMike Christie blk_mq_freeze_queue(rbd_dev->disk->queue); 62600276dca6SMike Christie blk_set_queue_dying(rbd_dev->disk->queue); 62610276dca6SMike Christie } 62620276dca6SMike Christie 62635769ed0cSIlya Dryomov del_gendisk(rbd_dev->disk); 62645769ed0cSIlya Dryomov spin_lock(&rbd_dev_list_lock); 62655769ed0cSIlya Dryomov list_del_init(&rbd_dev->node); 62665769ed0cSIlya Dryomov spin_unlock(&rbd_dev_list_lock); 62675769ed0cSIlya Dryomov device_del(&rbd_dev->dev); 62685769ed0cSIlya Dryomov 6269ed95b21aSIlya Dryomov down_write(&rbd_dev->lock_rwsem); 6270ed95b21aSIlya Dryomov if (__rbd_is_lock_owner(rbd_dev)) 6271ed95b21aSIlya Dryomov rbd_unlock(rbd_dev); 6272ed95b21aSIlya Dryomov up_write(&rbd_dev->lock_rwsem); 6273fca27065SIlya Dryomov 6274dd5ac32dSIlya Dryomov rbd_dev_device_release(rbd_dev); 62758ad42cd0SAlex Elder rbd_dev_image_release(rbd_dev); 62768b679ec5SIlya Dryomov rbd_dev_destroy(rbd_dev); 62771ba0f1e7SAlex Elder return count; 6278602adf40SYehuda Sadeh } 6279602adf40SYehuda Sadeh 62809b60e70bSIlya Dryomov static ssize_t rbd_remove(struct bus_type *bus, 62819b60e70bSIlya Dryomov const char *buf, 62829b60e70bSIlya Dryomov size_t count) 62839b60e70bSIlya Dryomov { 62849b60e70bSIlya Dryomov if (single_major) 62859b60e70bSIlya Dryomov return -EINVAL; 62869b60e70bSIlya Dryomov 62879b60e70bSIlya Dryomov return do_rbd_remove(bus, buf, count); 62889b60e70bSIlya Dryomov } 62899b60e70bSIlya Dryomov 62909b60e70bSIlya Dryomov static ssize_t rbd_remove_single_major(struct bus_type *bus, 62919b60e70bSIlya Dryomov const char *buf, 62929b60e70bSIlya Dryomov size_t count) 62939b60e70bSIlya Dryomov { 62949b60e70bSIlya Dryomov return do_rbd_remove(bus, buf, count); 62959b60e70bSIlya Dryomov } 62969b60e70bSIlya Dryomov 6297602adf40SYehuda Sadeh /* 6298602adf40SYehuda Sadeh * create control files in sysfs 6299dfc5606dSYehuda Sadeh * /sys/bus/rbd/... 
6300602adf40SYehuda Sadeh */ 6301602adf40SYehuda Sadeh static int rbd_sysfs_init(void) 6302602adf40SYehuda Sadeh { 6303dfc5606dSYehuda Sadeh int ret; 6304602adf40SYehuda Sadeh 6305fed4c143SAlex Elder ret = device_register(&rbd_root_dev); 6306dfc5606dSYehuda Sadeh if (ret < 0) 6307dfc5606dSYehuda Sadeh return ret; 6308602adf40SYehuda Sadeh 6309fed4c143SAlex Elder ret = bus_register(&rbd_bus_type); 6310fed4c143SAlex Elder if (ret < 0) 6311fed4c143SAlex Elder device_unregister(&rbd_root_dev); 6312602adf40SYehuda Sadeh 6313602adf40SYehuda Sadeh return ret; 6314602adf40SYehuda Sadeh } 6315602adf40SYehuda Sadeh 6316602adf40SYehuda Sadeh static void rbd_sysfs_cleanup(void) 6317602adf40SYehuda Sadeh { 6318dfc5606dSYehuda Sadeh bus_unregister(&rbd_bus_type); 6319fed4c143SAlex Elder device_unregister(&rbd_root_dev); 6320602adf40SYehuda Sadeh } 6321602adf40SYehuda Sadeh 63221c2a9dfeSAlex Elder static int rbd_slab_init(void) 63231c2a9dfeSAlex Elder { 63241c2a9dfeSAlex Elder rbd_assert(!rbd_img_request_cache); 632503d94406SGeliang Tang rbd_img_request_cache = KMEM_CACHE(rbd_img_request, 0); 6326868311b1SAlex Elder if (!rbd_img_request_cache) 6327868311b1SAlex Elder return -ENOMEM; 6328868311b1SAlex Elder 6329868311b1SAlex Elder rbd_assert(!rbd_obj_request_cache); 633003d94406SGeliang Tang rbd_obj_request_cache = KMEM_CACHE(rbd_obj_request, 0); 633178c2a44aSAlex Elder if (!rbd_obj_request_cache) 633278c2a44aSAlex Elder goto out_err; 633378c2a44aSAlex Elder 63341c2a9dfeSAlex Elder return 0; 63351c2a9dfeSAlex Elder 63366c696d85SIlya Dryomov out_err: 6337868311b1SAlex Elder kmem_cache_destroy(rbd_img_request_cache); 6338868311b1SAlex Elder rbd_img_request_cache = NULL; 63391c2a9dfeSAlex Elder return -ENOMEM; 63401c2a9dfeSAlex Elder } 63411c2a9dfeSAlex Elder 63421c2a9dfeSAlex Elder static void rbd_slab_exit(void) 63431c2a9dfeSAlex Elder { 6344868311b1SAlex Elder rbd_assert(rbd_obj_request_cache); 6345868311b1SAlex Elder kmem_cache_destroy(rbd_obj_request_cache); 6346868311b1SAlex 
Elder rbd_obj_request_cache = NULL; 6347868311b1SAlex Elder 63481c2a9dfeSAlex Elder rbd_assert(rbd_img_request_cache); 63491c2a9dfeSAlex Elder kmem_cache_destroy(rbd_img_request_cache); 63501c2a9dfeSAlex Elder rbd_img_request_cache = NULL; 63511c2a9dfeSAlex Elder } 63521c2a9dfeSAlex Elder 6353cc344fa1SAlex Elder static int __init rbd_init(void) 6354602adf40SYehuda Sadeh { 6355602adf40SYehuda Sadeh int rc; 6356602adf40SYehuda Sadeh 63571e32d34cSAlex Elder if (!libceph_compatible(NULL)) { 63581e32d34cSAlex Elder rbd_warn(NULL, "libceph incompatibility (quitting)"); 63591e32d34cSAlex Elder return -EINVAL; 63601e32d34cSAlex Elder } 6361e1b4d96dSIlya Dryomov 63621c2a9dfeSAlex Elder rc = rbd_slab_init(); 6363602adf40SYehuda Sadeh if (rc) 6364602adf40SYehuda Sadeh return rc; 6365e1b4d96dSIlya Dryomov 6366f5ee37bdSIlya Dryomov /* 6367f5ee37bdSIlya Dryomov * The number of active work items is limited by the number of 6368f77303bdSIlya Dryomov * rbd devices * queue depth, so leave @max_active at default. 
6369f5ee37bdSIlya Dryomov */ 6370f5ee37bdSIlya Dryomov rbd_wq = alloc_workqueue(RBD_DRV_NAME, WQ_MEM_RECLAIM, 0); 6371f5ee37bdSIlya Dryomov if (!rbd_wq) { 6372f5ee37bdSIlya Dryomov rc = -ENOMEM; 6373f5ee37bdSIlya Dryomov goto err_out_slab; 6374f5ee37bdSIlya Dryomov } 6375f5ee37bdSIlya Dryomov 63769b60e70bSIlya Dryomov if (single_major) { 63779b60e70bSIlya Dryomov rbd_major = register_blkdev(0, RBD_DRV_NAME); 63789b60e70bSIlya Dryomov if (rbd_major < 0) { 63799b60e70bSIlya Dryomov rc = rbd_major; 6380f5ee37bdSIlya Dryomov goto err_out_wq; 63819b60e70bSIlya Dryomov } 63829b60e70bSIlya Dryomov } 63839b60e70bSIlya Dryomov 63841c2a9dfeSAlex Elder rc = rbd_sysfs_init(); 63851c2a9dfeSAlex Elder if (rc) 63869b60e70bSIlya Dryomov goto err_out_blkdev; 63871c2a9dfeSAlex Elder 63889b60e70bSIlya Dryomov if (single_major) 63899b60e70bSIlya Dryomov pr_info("loaded (major %d)\n", rbd_major); 63909b60e70bSIlya Dryomov else 6391e1b4d96dSIlya Dryomov pr_info("loaded\n"); 63929b60e70bSIlya Dryomov 6393e1b4d96dSIlya Dryomov return 0; 6394e1b4d96dSIlya Dryomov 63959b60e70bSIlya Dryomov err_out_blkdev: 63969b60e70bSIlya Dryomov if (single_major) 63979b60e70bSIlya Dryomov unregister_blkdev(rbd_major, RBD_DRV_NAME); 6398f5ee37bdSIlya Dryomov err_out_wq: 6399f5ee37bdSIlya Dryomov destroy_workqueue(rbd_wq); 6400e1b4d96dSIlya Dryomov err_out_slab: 6401e1b4d96dSIlya Dryomov rbd_slab_exit(); 64021c2a9dfeSAlex Elder return rc; 6403602adf40SYehuda Sadeh } 6404602adf40SYehuda Sadeh 6405cc344fa1SAlex Elder static void __exit rbd_exit(void) 6406602adf40SYehuda Sadeh { 6407ffe312cfSIlya Dryomov ida_destroy(&rbd_dev_id_ida); 6408602adf40SYehuda Sadeh rbd_sysfs_cleanup(); 64099b60e70bSIlya Dryomov if (single_major) 64109b60e70bSIlya Dryomov unregister_blkdev(rbd_major, RBD_DRV_NAME); 6411f5ee37bdSIlya Dryomov destroy_workqueue(rbd_wq); 64121c2a9dfeSAlex Elder rbd_slab_exit(); 6413602adf40SYehuda Sadeh } 6414602adf40SYehuda Sadeh 6415602adf40SYehuda Sadeh module_init(rbd_init); 6416602adf40SYehuda Sadeh 
module_exit(rbd_exit); 6417602adf40SYehuda Sadeh 6418d552c619SAlex Elder MODULE_AUTHOR("Alex Elder <elder@inktank.com>"); 6419602adf40SYehuda Sadeh MODULE_AUTHOR("Sage Weil <sage@newdream.net>"); 6420602adf40SYehuda Sadeh MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>"); 6421602adf40SYehuda Sadeh /* following authorship retained from original osdblk.c */ 6422602adf40SYehuda Sadeh MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>"); 6423602adf40SYehuda Sadeh 642490da258bSIlya Dryomov MODULE_DESCRIPTION("RADOS Block Device (RBD) driver"); 6425602adf40SYehuda Sadeh MODULE_LICENSE("GPL"); 6426