1e2a58ee5SAlex Elder 2602adf40SYehuda Sadeh /* 3602adf40SYehuda Sadeh rbd.c -- Export ceph rados objects as a Linux block device 4602adf40SYehuda Sadeh 5602adf40SYehuda Sadeh 6602adf40SYehuda Sadeh based on drivers/block/osdblk.c: 7602adf40SYehuda Sadeh 8602adf40SYehuda Sadeh Copyright 2009 Red Hat, Inc. 9602adf40SYehuda Sadeh 10602adf40SYehuda Sadeh This program is free software; you can redistribute it and/or modify 11602adf40SYehuda Sadeh it under the terms of the GNU General Public License as published by 12602adf40SYehuda Sadeh the Free Software Foundation. 13602adf40SYehuda Sadeh 14602adf40SYehuda Sadeh This program is distributed in the hope that it will be useful, 15602adf40SYehuda Sadeh but WITHOUT ANY WARRANTY; without even the implied warranty of 16602adf40SYehuda Sadeh MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17602adf40SYehuda Sadeh GNU General Public License for more details. 18602adf40SYehuda Sadeh 19602adf40SYehuda Sadeh You should have received a copy of the GNU General Public License 20602adf40SYehuda Sadeh along with this program; see the file COPYING. If not, write to 21602adf40SYehuda Sadeh the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. 22602adf40SYehuda Sadeh 23602adf40SYehuda Sadeh 24602adf40SYehuda Sadeh 25dfc5606dSYehuda Sadeh For usage instructions, please refer to: 26602adf40SYehuda Sadeh 27dfc5606dSYehuda Sadeh Documentation/ABI/testing/sysfs-bus-rbd 28602adf40SYehuda Sadeh 29602adf40SYehuda Sadeh */ 30602adf40SYehuda Sadeh 31602adf40SYehuda Sadeh #include <linux/ceph/libceph.h> 32602adf40SYehuda Sadeh #include <linux/ceph/osd_client.h> 33602adf40SYehuda Sadeh #include <linux/ceph/mon_client.h> 34ed95b21aSIlya Dryomov #include <linux/ceph/cls_lock_client.h> 35602adf40SYehuda Sadeh #include <linux/ceph/decode.h> 3659c2be1eSYehuda Sadeh #include <linux/parser.h> 3730d1cff8SAlex Elder #include <linux/bsearch.h> 38602adf40SYehuda Sadeh 39602adf40SYehuda Sadeh #include <linux/kernel.h> 40602adf40SYehuda Sadeh #include <linux/device.h> 41602adf40SYehuda Sadeh #include <linux/module.h> 427ad18afaSChristoph Hellwig #include <linux/blk-mq.h> 43602adf40SYehuda Sadeh #include <linux/fs.h> 44602adf40SYehuda Sadeh #include <linux/blkdev.h> 451c2a9dfeSAlex Elder #include <linux/slab.h> 46f8a22fc2SIlya Dryomov #include <linux/idr.h> 47bc1ecc65SIlya Dryomov #include <linux/workqueue.h> 48602adf40SYehuda Sadeh 49602adf40SYehuda Sadeh #include "rbd_types.h" 50602adf40SYehuda Sadeh 51aafb230eSAlex Elder #define RBD_DEBUG /* Activate rbd_assert() calls */ 52aafb230eSAlex Elder 53593a9e7bSAlex Elder /* 54593a9e7bSAlex Elder * The basic unit of block I/O is a sector. It is interpreted in a 55593a9e7bSAlex Elder * number of contexts in Linux (blk, bio, genhd), but the default is 56593a9e7bSAlex Elder * universally 512 bytes. These symbols are just slightly more 57593a9e7bSAlex Elder * meaningful than the bare numbers they represent. 58593a9e7bSAlex Elder */ 59593a9e7bSAlex Elder #define SECTOR_SHIFT 9 60593a9e7bSAlex Elder #define SECTOR_SIZE (1ULL << SECTOR_SHIFT) 61593a9e7bSAlex Elder 62a2acd00eSAlex Elder /* 63a2acd00eSAlex Elder * Increment the given counter and return its updated value. 64a2acd00eSAlex Elder * If the counter is already 0 it will not be incremented. 65a2acd00eSAlex Elder * If the counter is already at its maximum value returns 66a2acd00eSAlex Elder * -EINVAL without updating it. 67a2acd00eSAlex Elder */ 68a2acd00eSAlex Elder static int atomic_inc_return_safe(atomic_t *v) 69a2acd00eSAlex Elder { 70a2acd00eSAlex Elder unsigned int counter; 71a2acd00eSAlex Elder 72a2acd00eSAlex Elder counter = (unsigned int)__atomic_add_unless(v, 1, 0); 73a2acd00eSAlex Elder if (counter <= (unsigned int)INT_MAX) 74a2acd00eSAlex Elder return (int)counter; 75a2acd00eSAlex Elder 76a2acd00eSAlex Elder atomic_dec(v); 77a2acd00eSAlex Elder 78a2acd00eSAlex Elder return -EINVAL; 79a2acd00eSAlex Elder } 80a2acd00eSAlex Elder 81a2acd00eSAlex Elder /* Decrement the counter. Return the resulting value, or -EINVAL */ 82a2acd00eSAlex Elder static int atomic_dec_return_safe(atomic_t *v) 83a2acd00eSAlex Elder { 84a2acd00eSAlex Elder int counter; 85a2acd00eSAlex Elder 86a2acd00eSAlex Elder counter = atomic_dec_return(v); 87a2acd00eSAlex Elder if (counter >= 0) 88a2acd00eSAlex Elder return counter; 89a2acd00eSAlex Elder 90a2acd00eSAlex Elder atomic_inc(v); 91a2acd00eSAlex Elder 92a2acd00eSAlex Elder return -EINVAL; 93a2acd00eSAlex Elder } 94a2acd00eSAlex Elder 95f0f8cef5SAlex Elder #define RBD_DRV_NAME "rbd" 96602adf40SYehuda Sadeh 977e513d43SIlya Dryomov #define RBD_MINORS_PER_MAJOR 256 987e513d43SIlya Dryomov #define RBD_SINGLE_MAJOR_PART_SHIFT 4 99602adf40SYehuda Sadeh 1006d69bb53SIlya Dryomov #define RBD_MAX_PARENT_CHAIN_LEN 16 1016d69bb53SIlya Dryomov 102d4b125e9SAlex Elder #define RBD_SNAP_DEV_NAME_PREFIX "snap_" 103d4b125e9SAlex Elder #define RBD_MAX_SNAP_NAME_LEN \ 104d4b125e9SAlex Elder (NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1)) 105d4b125e9SAlex Elder 10635d489f9SAlex Elder #define RBD_MAX_SNAP_COUNT 510 /* allows max snapc to fit in 4KB */ 107602adf40SYehuda Sadeh 108602adf40SYehuda Sadeh #define RBD_SNAP_HEAD_NAME "-" 109602adf40SYehuda Sadeh 1109682fc6dSAlex Elder #define BAD_SNAP_INDEX U32_MAX /* invalid index into snap array */ 1119682fc6dSAlex Elder 1129e15b77dSAlex Elder /* This allows a single page to hold an image name sent by OSD */ 1139e15b77dSAlex Elder #define RBD_IMAGE_NAME_LEN_MAX (PAGE_SIZE - sizeof (__le32) - 1) 114589d30e0SAlex Elder #define RBD_IMAGE_ID_LEN_MAX 64 1159e15b77dSAlex Elder 1161e130199SAlex Elder #define RBD_OBJ_PREFIX_LEN_MAX 64 117589d30e0SAlex Elder 118ed95b21aSIlya Dryomov #define RBD_NOTIFY_TIMEOUT 5 /* seconds */ 11999d16943SIlya Dryomov #define RBD_RETRY_DELAY msecs_to_jiffies(1000) 12099d16943SIlya Dryomov 121d889140cSAlex Elder /* Feature bits */ 122d889140cSAlex Elder 1235cbf6f12SAlex Elder #define RBD_FEATURE_LAYERING (1<<0) 1245cbf6f12SAlex Elder #define RBD_FEATURE_STRIPINGV2 (1<<1) 125ed95b21aSIlya Dryomov #define RBD_FEATURE_EXCLUSIVE_LOCK (1<<2) 1267e97332eSIlya Dryomov #define RBD_FEATURE_DATA_POOL (1<<7) 127ed95b21aSIlya Dryomov #define RBD_FEATURES_ALL (RBD_FEATURE_LAYERING | \ 128ed95b21aSIlya Dryomov RBD_FEATURE_STRIPINGV2 | \ 1297e97332eSIlya Dryomov RBD_FEATURE_EXCLUSIVE_LOCK | \ 1307e97332eSIlya Dryomov RBD_FEATURE_DATA_POOL) 131d889140cSAlex Elder 132d889140cSAlex Elder /* Features supported by this (client software) implementation. */ 133d889140cSAlex Elder 134770eba6eSAlex Elder #define RBD_FEATURES_SUPPORTED (RBD_FEATURES_ALL) 135d889140cSAlex Elder 13681a89793SAlex Elder /* 13781a89793SAlex Elder * An RBD device name will be "rbd#", where the "rbd" comes from 13881a89793SAlex Elder * RBD_DRV_NAME above, and # is a unique integer identifier. 13981a89793SAlex Elder */ 140602adf40SYehuda Sadeh #define DEV_NAME_LEN 32 141602adf40SYehuda Sadeh 142602adf40SYehuda Sadeh /* 143602adf40SYehuda Sadeh * block device image metadata (in-memory version) 144602adf40SYehuda Sadeh */ 145602adf40SYehuda Sadeh struct rbd_image_header { 146f35a4deeSAlex Elder /* These six fields never change for a given rbd image */ 147849b4260SAlex Elder char *object_prefix; 148602adf40SYehuda Sadeh __u8 obj_order; 149f35a4deeSAlex Elder u64 stripe_unit; 150f35a4deeSAlex Elder u64 stripe_count; 1517e97332eSIlya Dryomov s64 data_pool_id; 152f35a4deeSAlex Elder u64 features; /* Might be changeable someday? */ 153602adf40SYehuda Sadeh 154f84344f3SAlex Elder /* The remaining fields need to be updated occasionally */ 155f84344f3SAlex Elder u64 image_size; 156f84344f3SAlex Elder struct ceph_snap_context *snapc; 157f35a4deeSAlex Elder char *snap_names; /* format 1 only */ 158f35a4deeSAlex Elder u64 *snap_sizes; /* format 1 only */ 15959c2be1eSYehuda Sadeh }; 16059c2be1eSYehuda Sadeh 1610d7dbfceSAlex Elder /* 1620d7dbfceSAlex Elder * An rbd image specification. 1630d7dbfceSAlex Elder * 1640d7dbfceSAlex Elder * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely 165c66c6e0cSAlex Elder * identify an image. Each rbd_dev structure includes a pointer to 166c66c6e0cSAlex Elder * an rbd_spec structure that encapsulates this identity. 167c66c6e0cSAlex Elder * 168c66c6e0cSAlex Elder * Each of the id's in an rbd_spec has an associated name. For a 169c66c6e0cSAlex Elder * user-mapped image, the names are supplied and the id's associated 170c66c6e0cSAlex Elder * with them are looked up. For a layered image, a parent image is 171c66c6e0cSAlex Elder * defined by the tuple, and the names are looked up. 172c66c6e0cSAlex Elder * 173c66c6e0cSAlex Elder * An rbd_dev structure contains a parent_spec pointer which is 174c66c6e0cSAlex Elder * non-null if the image it represents is a child in a layered 175c66c6e0cSAlex Elder * image. This pointer will refer to the rbd_spec structure used 176c66c6e0cSAlex Elder * by the parent rbd_dev for its own identity (i.e., the structure 177c66c6e0cSAlex Elder * is shared between the parent and child). 178c66c6e0cSAlex Elder * 179c66c6e0cSAlex Elder * Since these structures are populated once, during the discovery 180c66c6e0cSAlex Elder * phase of image construction, they are effectively immutable so 181c66c6e0cSAlex Elder * we make no effort to synchronize access to them. 182c66c6e0cSAlex Elder * 183c66c6e0cSAlex Elder * Note that code herein does not assume the image name is known (it 184c66c6e0cSAlex Elder * could be a null pointer). 1850d7dbfceSAlex Elder */ 1860d7dbfceSAlex Elder struct rbd_spec { 1870d7dbfceSAlex Elder u64 pool_id; 188ecb4dc22SAlex Elder const char *pool_name; 1890d7dbfceSAlex Elder 190ecb4dc22SAlex Elder const char *image_id; 191ecb4dc22SAlex Elder const char *image_name; 1920d7dbfceSAlex Elder 1930d7dbfceSAlex Elder u64 snap_id; 194ecb4dc22SAlex Elder const char *snap_name; 1950d7dbfceSAlex Elder 1960d7dbfceSAlex Elder struct kref kref; 1970d7dbfceSAlex Elder }; 1980d7dbfceSAlex Elder 199602adf40SYehuda Sadeh /* 200f0f8cef5SAlex Elder * an instance of the client. multiple devices may share an rbd client. 201602adf40SYehuda Sadeh */ 202602adf40SYehuda Sadeh struct rbd_client { 203602adf40SYehuda Sadeh struct ceph_client *client; 204602adf40SYehuda Sadeh struct kref kref; 205602adf40SYehuda Sadeh struct list_head node; 206602adf40SYehuda Sadeh }; 207602adf40SYehuda Sadeh 208bf0d5f50SAlex Elder struct rbd_img_request; 209bf0d5f50SAlex Elder typedef void (*rbd_img_callback_t)(struct rbd_img_request *); 210bf0d5f50SAlex Elder 211bf0d5f50SAlex Elder #define BAD_WHICH U32_MAX /* Good which or bad which, which? */ 212bf0d5f50SAlex Elder 213bf0d5f50SAlex Elder struct rbd_obj_request; 214bf0d5f50SAlex Elder typedef void (*rbd_obj_callback_t)(struct rbd_obj_request *); 215bf0d5f50SAlex Elder 2169969ebc5SAlex Elder enum obj_request_type { 2179969ebc5SAlex Elder OBJ_REQUEST_NODATA, OBJ_REQUEST_BIO, OBJ_REQUEST_PAGES 2189969ebc5SAlex Elder }; 219bf0d5f50SAlex Elder 2206d2940c8SGuangliang Zhao enum obj_operation_type { 2216d2940c8SGuangliang Zhao OBJ_OP_WRITE, 2226d2940c8SGuangliang Zhao OBJ_OP_READ, 22390e98c52SGuangliang Zhao OBJ_OP_DISCARD, 2246d2940c8SGuangliang Zhao }; 2256d2940c8SGuangliang Zhao 226926f9b3fSAlex Elder enum obj_req_flags { 227926f9b3fSAlex Elder OBJ_REQ_DONE, /* completion flag: not done = 0, done = 1 */ 2286365d33aSAlex Elder OBJ_REQ_IMG_DATA, /* object usage: standalone = 0, image = 1 */ 2295679c59fSAlex Elder OBJ_REQ_KNOWN, /* EXISTS flag valid: no = 0, yes = 1 */ 2305679c59fSAlex Elder OBJ_REQ_EXISTS, /* target exists: no = 0, yes = 1 */ 231926f9b3fSAlex Elder }; 232926f9b3fSAlex Elder 233bf0d5f50SAlex Elder struct rbd_obj_request { 234bf0d5f50SAlex Elder const char *object_name; 235bf0d5f50SAlex Elder u64 offset; /* object start byte */ 236bf0d5f50SAlex Elder u64 length; /* bytes from offset */ 237926f9b3fSAlex Elder unsigned long flags; 238bf0d5f50SAlex Elder 239c5b5ef6cSAlex Elder /* 240c5b5ef6cSAlex Elder * An object request associated with an image will have its 241c5b5ef6cSAlex Elder * img_data flag set; a standalone object request will not. 242c5b5ef6cSAlex Elder * 243c5b5ef6cSAlex Elder * A standalone object request will have which == BAD_WHICH 244c5b5ef6cSAlex Elder * and a null obj_request pointer. 245c5b5ef6cSAlex Elder * 246c5b5ef6cSAlex Elder * An object request initiated in support of a layered image 247c5b5ef6cSAlex Elder * object (to check for its existence before a write) will 248c5b5ef6cSAlex Elder * have which == BAD_WHICH and a non-null obj_request pointer. 249c5b5ef6cSAlex Elder * 250c5b5ef6cSAlex Elder * Finally, an object request for rbd image data will have 251c5b5ef6cSAlex Elder * which != BAD_WHICH, and will have a non-null img_request 252c5b5ef6cSAlex Elder * pointer. The value of which will be in the range 253c5b5ef6cSAlex Elder * 0..(img_request->obj_request_count-1). 254c5b5ef6cSAlex Elder */ 255c5b5ef6cSAlex Elder union { 256c5b5ef6cSAlex Elder struct rbd_obj_request *obj_request; /* STAT op */ 257c5b5ef6cSAlex Elder struct { 258bf0d5f50SAlex Elder struct rbd_img_request *img_request; 259c5b5ef6cSAlex Elder u64 img_offset; 260c5b5ef6cSAlex Elder /* links for img_request->obj_requests list */ 261c5b5ef6cSAlex Elder struct list_head links; 262c5b5ef6cSAlex Elder }; 263c5b5ef6cSAlex Elder }; 264bf0d5f50SAlex Elder u32 which; /* posn image request list */ 265bf0d5f50SAlex Elder 266bf0d5f50SAlex Elder enum obj_request_type type; 267788e2df3SAlex Elder union { 268bf0d5f50SAlex Elder struct bio *bio_list; 269788e2df3SAlex Elder struct { 270788e2df3SAlex Elder struct page **pages; 271788e2df3SAlex Elder u32 page_count; 272788e2df3SAlex Elder }; 273788e2df3SAlex Elder }; 2740eefd470SAlex Elder struct page **copyup_pages; 275ebda6408SAlex Elder u32 copyup_page_count; 276bf0d5f50SAlex Elder 277bf0d5f50SAlex Elder struct ceph_osd_request *osd_req; 278bf0d5f50SAlex Elder 279bf0d5f50SAlex Elder u64 xferred; /* bytes transferred */ 2801b83bef2SSage Weil int result; 281bf0d5f50SAlex Elder 282bf0d5f50SAlex Elder rbd_obj_callback_t callback; 283788e2df3SAlex Elder struct completion completion; 284bf0d5f50SAlex Elder 285bf0d5f50SAlex Elder struct kref kref; 286bf0d5f50SAlex Elder }; 287bf0d5f50SAlex Elder 2880c425248SAlex Elder enum img_req_flags { 2899849e986SAlex Elder IMG_REQ_WRITE, /* I/O direction: read = 0, write = 1 */ 2909849e986SAlex Elder IMG_REQ_CHILD, /* initiator: block = 0, child image = 1 */ 291d0b2e944SAlex Elder IMG_REQ_LAYERED, /* ENOENT handling: normal = 0, layered = 1 */ 29290e98c52SGuangliang Zhao IMG_REQ_DISCARD, /* discard: normal = 0, discard request = 1 */ 2930c425248SAlex Elder }; 2940c425248SAlex Elder 295bf0d5f50SAlex Elder struct rbd_img_request { 296bf0d5f50SAlex Elder struct rbd_device *rbd_dev; 297bf0d5f50SAlex Elder u64 offset; /* starting image byte offset */ 298bf0d5f50SAlex Elder u64 length; /* byte count from offset */ 2990c425248SAlex Elder unsigned long flags; 300bf0d5f50SAlex Elder union { 301bf0d5f50SAlex Elder u64 snap_id; /* for reads */ 3029849e986SAlex Elder struct ceph_snap_context *snapc; /* for writes */ 3039849e986SAlex Elder }; 3049849e986SAlex Elder union { 3059849e986SAlex Elder struct request *rq; /* block request */ 3069849e986SAlex Elder struct rbd_obj_request *obj_request; /* obj req initiator */ 307bf0d5f50SAlex Elder }; 3083d7efd18SAlex Elder struct page **copyup_pages; 309ebda6408SAlex Elder u32 copyup_page_count; 310bf0d5f50SAlex Elder spinlock_t completion_lock;/* protects next_completion */ 311bf0d5f50SAlex Elder u32 next_completion; 312bf0d5f50SAlex Elder rbd_img_callback_t callback; 31355f27e09SAlex Elder u64 xferred;/* aggregate bytes transferred */ 314a5a337d4SAlex Elder int result; /* first nonzero obj_request result */ 315bf0d5f50SAlex Elder 316bf0d5f50SAlex Elder u32 obj_request_count; 317bf0d5f50SAlex Elder struct list_head obj_requests; /* rbd_obj_request structs */ 318bf0d5f50SAlex Elder 319bf0d5f50SAlex Elder struct kref kref; 320bf0d5f50SAlex Elder }; 321bf0d5f50SAlex Elder 322bf0d5f50SAlex Elder #define for_each_obj_request(ireq, oreq) \ 323ef06f4d3SAlex Elder list_for_each_entry(oreq, &(ireq)->obj_requests, links) 324bf0d5f50SAlex Elder #define for_each_obj_request_from(ireq, oreq) \ 325ef06f4d3SAlex Elder list_for_each_entry_from(oreq, &(ireq)->obj_requests, links) 326bf0d5f50SAlex Elder #define for_each_obj_request_safe(ireq, oreq, n) \ 327ef06f4d3SAlex Elder list_for_each_entry_safe_reverse(oreq, n, &(ireq)->obj_requests, links) 328bf0d5f50SAlex Elder 32999d16943SIlya Dryomov enum rbd_watch_state { 33099d16943SIlya Dryomov RBD_WATCH_STATE_UNREGISTERED, 33199d16943SIlya Dryomov RBD_WATCH_STATE_REGISTERED, 33299d16943SIlya Dryomov RBD_WATCH_STATE_ERROR, 33399d16943SIlya Dryomov }; 33499d16943SIlya Dryomov 335ed95b21aSIlya Dryomov enum rbd_lock_state { 336ed95b21aSIlya Dryomov RBD_LOCK_STATE_UNLOCKED, 337ed95b21aSIlya Dryomov RBD_LOCK_STATE_LOCKED, 338ed95b21aSIlya Dryomov RBD_LOCK_STATE_RELEASING, 339ed95b21aSIlya Dryomov }; 340ed95b21aSIlya Dryomov 341ed95b21aSIlya Dryomov /* WatchNotify::ClientId */ 342ed95b21aSIlya Dryomov struct rbd_client_id { 343ed95b21aSIlya Dryomov u64 gid; 344ed95b21aSIlya Dryomov u64 handle; 345ed95b21aSIlya Dryomov }; 346ed95b21aSIlya Dryomov 347f84344f3SAlex Elder struct rbd_mapping { 34899c1f08fSAlex Elder u64 size; 34934b13184SAlex Elder u64 features; 350f84344f3SAlex Elder bool read_only; 351f84344f3SAlex Elder }; 352f84344f3SAlex Elder 353602adf40SYehuda Sadeh /* 354602adf40SYehuda Sadeh * a single device 355602adf40SYehuda Sadeh */ 356602adf40SYehuda Sadeh struct rbd_device { 357de71a297SAlex Elder int dev_id; /* blkdev unique id */ 358602adf40SYehuda Sadeh 359602adf40SYehuda Sadeh int major; /* blkdev assigned major */ 360dd82fff1SIlya Dryomov int minor; 361602adf40SYehuda Sadeh struct gendisk *disk; /* blkdev's gendisk and rq */ 362602adf40SYehuda Sadeh 363a30b71b9SAlex Elder u32 image_format; /* Either 1 or 2 */ 364602adf40SYehuda Sadeh struct rbd_client *rbd_client; 365602adf40SYehuda Sadeh 366602adf40SYehuda Sadeh char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */ 367602adf40SYehuda Sadeh 368b82d167bSAlex Elder spinlock_t lock; /* queue, flags, open_count */ 369602adf40SYehuda Sadeh 370602adf40SYehuda Sadeh struct rbd_image_header header; 371b82d167bSAlex Elder unsigned long flags; /* possibly lock protected */ 3720d7dbfceSAlex Elder struct rbd_spec *spec; 373d147543dSIlya Dryomov struct rbd_options *opts; 3740d6d1e9cSMike Christie char *config_info; /* add{,_single_major} string */ 375602adf40SYehuda Sadeh 376c41d13a3SIlya Dryomov struct ceph_object_id header_oid; 377922dab61SIlya Dryomov struct ceph_object_locator header_oloc; 378971f839aSAlex Elder 3791643dfa4SIlya Dryomov struct ceph_file_layout layout; /* used for all rbd requests */ 3800903e875SAlex Elder 38199d16943SIlya Dryomov struct mutex watch_mutex; 38299d16943SIlya Dryomov enum rbd_watch_state watch_state; 383922dab61SIlya Dryomov struct ceph_osd_linger_request *watch_handle; 38499d16943SIlya Dryomov u64 watch_cookie; 38599d16943SIlya Dryomov struct delayed_work watch_dwork; 38659c2be1eSYehuda Sadeh 387ed95b21aSIlya Dryomov struct rw_semaphore lock_rwsem; 388ed95b21aSIlya Dryomov enum rbd_lock_state lock_state; 389ed95b21aSIlya Dryomov struct rbd_client_id owner_cid; 390ed95b21aSIlya Dryomov struct work_struct acquired_lock_work; 391ed95b21aSIlya Dryomov struct work_struct released_lock_work; 392ed95b21aSIlya Dryomov struct delayed_work lock_dwork; 393ed95b21aSIlya Dryomov struct work_struct unlock_work; 394ed95b21aSIlya Dryomov wait_queue_head_t lock_waitq; 395ed95b21aSIlya Dryomov 3961643dfa4SIlya Dryomov struct workqueue_struct *task_wq; 397602adf40SYehuda Sadeh 39886b00e0dSAlex Elder struct rbd_spec *parent_spec; 39986b00e0dSAlex Elder u64 parent_overlap; 400a2acd00eSAlex Elder atomic_t parent_ref; 4012f82ee54SAlex Elder struct rbd_device *parent; 40286b00e0dSAlex Elder 4037ad18afaSChristoph Hellwig /* Block layer tags. */ 4047ad18afaSChristoph Hellwig struct blk_mq_tag_set tag_set; 4057ad18afaSChristoph Hellwig 406c666601aSJosh Durgin /* protects updating the header */ 407c666601aSJosh Durgin struct rw_semaphore header_rwsem; 408f84344f3SAlex Elder 409f84344f3SAlex Elder struct rbd_mapping mapping; 410602adf40SYehuda Sadeh 411602adf40SYehuda Sadeh struct list_head node; 412dfc5606dSYehuda Sadeh 413dfc5606dSYehuda Sadeh /* sysfs related */ 414dfc5606dSYehuda Sadeh struct device dev; 415b82d167bSAlex Elder unsigned long open_count; /* protected by lock */ 416dfc5606dSYehuda Sadeh }; 417dfc5606dSYehuda Sadeh 418b82d167bSAlex Elder /* 41987c0fdedSIlya Dryomov * Flag bits for rbd_dev->flags: 42087c0fdedSIlya Dryomov * - REMOVING (which is coupled with rbd_dev->open_count) is protected 42187c0fdedSIlya Dryomov * by rbd_dev->lock 42287c0fdedSIlya Dryomov * - BLACKLISTED is protected by rbd_dev->lock_rwsem 423b82d167bSAlex Elder */ 4246d292906SAlex Elder enum rbd_dev_flags { 4256d292906SAlex Elder RBD_DEV_FLAG_EXISTS, /* mapped snapshot has not been deleted */ 426b82d167bSAlex Elder RBD_DEV_FLAG_REMOVING, /* this mapping is being removed */ 42787c0fdedSIlya Dryomov RBD_DEV_FLAG_BLACKLISTED, /* our ceph_client is blacklisted */ 4286d292906SAlex Elder }; 4296d292906SAlex Elder 430cfbf6377SAlex Elder static DEFINE_MUTEX(client_mutex); /* Serialize client creation */ 431e124a82fSAlex Elder 432602adf40SYehuda Sadeh static LIST_HEAD(rbd_dev_list); /* devices */ 433e124a82fSAlex Elder static DEFINE_SPINLOCK(rbd_dev_list_lock); 434e124a82fSAlex Elder 435602adf40SYehuda Sadeh static LIST_HEAD(rbd_client_list); /* clients */ 436432b8587SAlex Elder static DEFINE_SPINLOCK(rbd_client_list_lock); 437602adf40SYehuda Sadeh 43878c2a44aSAlex Elder /* Slab caches for frequently-allocated structures */ 43978c2a44aSAlex Elder 4401c2a9dfeSAlex Elder static struct kmem_cache *rbd_img_request_cache; 441868311b1SAlex Elder static struct kmem_cache *rbd_obj_request_cache; 44278c2a44aSAlex Elder static struct kmem_cache *rbd_segment_name_cache; 4431c2a9dfeSAlex Elder 4449b60e70bSIlya Dryomov static int rbd_major; 445f8a22fc2SIlya Dryomov static DEFINE_IDA(rbd_dev_id_ida); 446f8a22fc2SIlya Dryomov 447f5ee37bdSIlya Dryomov static struct workqueue_struct *rbd_wq; 448f5ee37bdSIlya Dryomov 4499b60e70bSIlya Dryomov /* 4509b60e70bSIlya Dryomov * Default to false for now, as single-major requires >= 0.75 version of 4519b60e70bSIlya Dryomov * userspace rbd utility. 4529b60e70bSIlya Dryomov */ 4539b60e70bSIlya Dryomov static bool single_major = false; 4549b60e70bSIlya Dryomov module_param(single_major, bool, S_IRUGO); 4559b60e70bSIlya Dryomov MODULE_PARM_DESC(single_major, "Use a single major number for all rbd devices (default: false)"); 4569b60e70bSIlya Dryomov 4573d7efd18SAlex Elder static int rbd_img_request_submit(struct rbd_img_request *img_request); 4583d7efd18SAlex Elder 459f0f8cef5SAlex Elder static ssize_t rbd_add(struct bus_type *bus, const char *buf, 460f0f8cef5SAlex Elder size_t count); 461f0f8cef5SAlex Elder static ssize_t rbd_remove(struct bus_type *bus, const char *buf, 462f0f8cef5SAlex Elder size_t count); 4639b60e70bSIlya Dryomov static ssize_t rbd_add_single_major(struct bus_type *bus, const char *buf, 4649b60e70bSIlya Dryomov size_t count); 4659b60e70bSIlya Dryomov static ssize_t rbd_remove_single_major(struct bus_type *bus, const char *buf, 4669b60e70bSIlya Dryomov size_t count); 4676d69bb53SIlya Dryomov static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth); 468a2acd00eSAlex Elder static void rbd_spec_put(struct rbd_spec *spec); 469f0f8cef5SAlex Elder 4709b60e70bSIlya Dryomov static int rbd_dev_id_to_minor(int dev_id) 4719b60e70bSIlya Dryomov { 4727e513d43SIlya Dryomov return dev_id << RBD_SINGLE_MAJOR_PART_SHIFT; 4739b60e70bSIlya Dryomov } 4749b60e70bSIlya Dryomov 4759b60e70bSIlya Dryomov static int minor_to_rbd_dev_id(int minor) 4769b60e70bSIlya Dryomov { 4777e513d43SIlya Dryomov return minor >> RBD_SINGLE_MAJOR_PART_SHIFT; 4789b60e70bSIlya Dryomov } 4799b60e70bSIlya Dryomov 480ed95b21aSIlya Dryomov static bool rbd_is_lock_supported(struct rbd_device *rbd_dev) 481ed95b21aSIlya Dryomov { 482ed95b21aSIlya Dryomov return (rbd_dev->header.features & RBD_FEATURE_EXCLUSIVE_LOCK) && 483ed95b21aSIlya Dryomov rbd_dev->spec->snap_id == CEPH_NOSNAP && 484ed95b21aSIlya Dryomov !rbd_dev->mapping.read_only; 485ed95b21aSIlya Dryomov } 486ed95b21aSIlya Dryomov 487ed95b21aSIlya Dryomov static bool __rbd_is_lock_owner(struct rbd_device *rbd_dev) 488ed95b21aSIlya Dryomov { 489ed95b21aSIlya Dryomov return rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED || 490ed95b21aSIlya Dryomov rbd_dev->lock_state == RBD_LOCK_STATE_RELEASING; 491ed95b21aSIlya Dryomov } 492ed95b21aSIlya Dryomov 493ed95b21aSIlya Dryomov static bool rbd_is_lock_owner(struct rbd_device *rbd_dev) 494ed95b21aSIlya Dryomov { 495ed95b21aSIlya Dryomov bool is_lock_owner; 496ed95b21aSIlya Dryomov 497ed95b21aSIlya Dryomov down_read(&rbd_dev->lock_rwsem); 498ed95b21aSIlya Dryomov is_lock_owner = __rbd_is_lock_owner(rbd_dev); 499ed95b21aSIlya Dryomov up_read(&rbd_dev->lock_rwsem); 500ed95b21aSIlya Dryomov return is_lock_owner; 501ed95b21aSIlya Dryomov } 502ed95b21aSIlya Dryomov 503b15a21ddSGreg Kroah-Hartman static BUS_ATTR(add, S_IWUSR, NULL, rbd_add); 504b15a21ddSGreg Kroah-Hartman static BUS_ATTR(remove, S_IWUSR, NULL, rbd_remove); 5059b60e70bSIlya Dryomov static BUS_ATTR(add_single_major, S_IWUSR, NULL, rbd_add_single_major); 5069b60e70bSIlya Dryomov static BUS_ATTR(remove_single_major, S_IWUSR, NULL, rbd_remove_single_major); 507b15a21ddSGreg Kroah-Hartman 508b15a21ddSGreg Kroah-Hartman static struct attribute *rbd_bus_attrs[] = { 509b15a21ddSGreg Kroah-Hartman &bus_attr_add.attr, 510b15a21ddSGreg Kroah-Hartman &bus_attr_remove.attr, 5119b60e70bSIlya Dryomov &bus_attr_add_single_major.attr, 5129b60e70bSIlya Dryomov &bus_attr_remove_single_major.attr, 513b15a21ddSGreg Kroah-Hartman NULL, 514f0f8cef5SAlex Elder }; 51592c76dc0SIlya Dryomov 51692c76dc0SIlya Dryomov static umode_t rbd_bus_is_visible(struct kobject *kobj, 51792c76dc0SIlya Dryomov struct attribute *attr, int index) 51892c76dc0SIlya Dryomov { 5199b60e70bSIlya Dryomov if (!single_major && 5209b60e70bSIlya Dryomov (attr == &bus_attr_add_single_major.attr || 5219b60e70bSIlya Dryomov attr == &bus_attr_remove_single_major.attr)) 5229b60e70bSIlya Dryomov return 0; 5239b60e70bSIlya Dryomov 52492c76dc0SIlya Dryomov return attr->mode; 52592c76dc0SIlya Dryomov } 52692c76dc0SIlya Dryomov 52792c76dc0SIlya Dryomov static const struct attribute_group rbd_bus_group = { 52892c76dc0SIlya Dryomov .attrs = rbd_bus_attrs, 52992c76dc0SIlya Dryomov .is_visible = rbd_bus_is_visible, 53092c76dc0SIlya Dryomov }; 53192c76dc0SIlya Dryomov __ATTRIBUTE_GROUPS(rbd_bus); 532f0f8cef5SAlex Elder 533f0f8cef5SAlex Elder static struct bus_type rbd_bus_type = { 534f0f8cef5SAlex Elder .name = "rbd", 535b15a21ddSGreg Kroah-Hartman .bus_groups = rbd_bus_groups, 536f0f8cef5SAlex Elder }; 537f0f8cef5SAlex Elder 538f0f8cef5SAlex Elder static void rbd_root_dev_release(struct device *dev) 539f0f8cef5SAlex Elder { 540f0f8cef5SAlex Elder } 541f0f8cef5SAlex Elder 542f0f8cef5SAlex Elder static struct device rbd_root_dev = { 543f0f8cef5SAlex Elder .init_name = "rbd", 544f0f8cef5SAlex Elder .release = rbd_root_dev_release, 545f0f8cef5SAlex Elder }; 546f0f8cef5SAlex Elder 54706ecc6cbSAlex Elder static __printf(2, 3) 54806ecc6cbSAlex Elder void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...) 54906ecc6cbSAlex Elder { 55006ecc6cbSAlex Elder struct va_format vaf; 55106ecc6cbSAlex Elder va_list args; 55206ecc6cbSAlex Elder 55306ecc6cbSAlex Elder va_start(args, fmt); 55406ecc6cbSAlex Elder vaf.fmt = fmt; 55506ecc6cbSAlex Elder vaf.va = &args; 55606ecc6cbSAlex Elder 55706ecc6cbSAlex Elder if (!rbd_dev) 55806ecc6cbSAlex Elder printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf); 55906ecc6cbSAlex Elder else if (rbd_dev->disk) 56006ecc6cbSAlex Elder printk(KERN_WARNING "%s: %s: %pV\n", 56106ecc6cbSAlex Elder RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf); 56206ecc6cbSAlex Elder else if (rbd_dev->spec && rbd_dev->spec->image_name) 56306ecc6cbSAlex Elder printk(KERN_WARNING "%s: image %s: %pV\n", 56406ecc6cbSAlex Elder RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf); 56506ecc6cbSAlex Elder else if (rbd_dev->spec && rbd_dev->spec->image_id) 56606ecc6cbSAlex Elder printk(KERN_WARNING "%s: id %s: %pV\n", 56706ecc6cbSAlex Elder RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf); 56806ecc6cbSAlex Elder else /* punt */ 56906ecc6cbSAlex Elder printk(KERN_WARNING "%s: rbd_dev %p: %pV\n", 57006ecc6cbSAlex Elder RBD_DRV_NAME, rbd_dev, &vaf); 57106ecc6cbSAlex Elder va_end(args); 57206ecc6cbSAlex Elder } 57306ecc6cbSAlex Elder 574aafb230eSAlex Elder #ifdef RBD_DEBUG 575aafb230eSAlex Elder #define rbd_assert(expr) \ 576aafb230eSAlex Elder if (unlikely(!(expr))) { \ 577aafb230eSAlex Elder printk(KERN_ERR "\nAssertion failure in %s() " \ 578aafb230eSAlex Elder "at line %d:\n\n" \ 579aafb230eSAlex Elder "\trbd_assert(%s);\n\n", \ 580aafb230eSAlex Elder __func__, __LINE__, #expr); \ 581aafb230eSAlex Elder BUG(); \ 582aafb230eSAlex Elder } 583aafb230eSAlex Elder #else /* !RBD_DEBUG */ 584aafb230eSAlex Elder # define rbd_assert(expr) ((void) 0) 585aafb230eSAlex Elder #endif /* !RBD_DEBUG */ 586dfc5606dSYehuda Sadeh 5872761713dSIlya Dryomov static void rbd_osd_copyup_callback(struct rbd_obj_request *obj_request); 588b454e36dSAlex Elder static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request); 58905a46afdSAlex Elder static void rbd_img_parent_read(struct rbd_obj_request *obj_request); 59005a46afdSAlex Elder static void rbd_dev_remove_parent(struct rbd_device *rbd_dev); 5918b3e1a56SAlex Elder 592cc4a38bdSAlex Elder static int rbd_dev_refresh(struct rbd_device *rbd_dev); 5932df3fac7SAlex Elder static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev); 594a720ae09SIlya Dryomov static int rbd_dev_header_info(struct rbd_device *rbd_dev); 595e8f59b59SIlya Dryomov static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev); 59654cac61fSAlex Elder static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, 59754cac61fSAlex Elder u64 snap_id); 5982ad3d716SAlex Elder static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id, 5992ad3d716SAlex Elder u8 *order, u64 *snap_size); 6002ad3d716SAlex Elder static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id, 6012ad3d716SAlex Elder u64 *snap_features); 60259c2be1eSYehuda Sadeh 603602adf40SYehuda Sadeh static int rbd_open(struct block_device *bdev, fmode_t mode) 604602adf40SYehuda Sadeh { 605f0f8cef5SAlex Elder struct rbd_device *rbd_dev = bdev->bd_disk->private_data; 606b82d167bSAlex Elder bool removing = false; 607602adf40SYehuda Sadeh 608f84344f3SAlex Elder if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only) 609602adf40SYehuda Sadeh return -EROFS; 610602adf40SYehuda Sadeh 611a14ea269SAlex Elder spin_lock_irq(&rbd_dev->lock); 612b82d167bSAlex Elder if (test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags)) 613b82d167bSAlex Elder removing = true; 614b82d167bSAlex Elder else 615b82d167bSAlex Elder rbd_dev->open_count++; 616a14ea269SAlex Elder spin_unlock_irq(&rbd_dev->lock); 617b82d167bSAlex Elder if (removing) 618b82d167bSAlex Elder return -ENOENT; 619b82d167bSAlex Elder 620c3e946ceSAlex Elder (void) get_device(&rbd_dev->dev); 621340c7a2bSAlex Elder 622602adf40SYehuda Sadeh return 0; 623602adf40SYehuda Sadeh } 624602adf40SYehuda Sadeh 625db2a144bSAl Viro static void rbd_release(struct gendisk *disk, fmode_t mode) 626dfc5606dSYehuda Sadeh { 627dfc5606dSYehuda Sadeh struct rbd_device *rbd_dev = disk->private_data; 628b82d167bSAlex Elder unsigned long open_count_before; 629b82d167bSAlex Elder 630a14ea269SAlex Elder spin_lock_irq(&rbd_dev->lock); 631b82d167bSAlex Elder open_count_before = rbd_dev->open_count--; 632a14ea269SAlex Elder spin_unlock_irq(&rbd_dev->lock); 633b82d167bSAlex Elder rbd_assert(open_count_before > 0); 634dfc5606dSYehuda Sadeh 635c3e946ceSAlex Elder put_device(&rbd_dev->dev); 636dfc5606dSYehuda Sadeh } 637dfc5606dSYehuda Sadeh 638131fd9f6SGuangliang Zhao static int rbd_ioctl_set_ro(struct rbd_device *rbd_dev, unsigned long arg) 639131fd9f6SGuangliang Zhao { 64077f33c03SJosh Durgin int ret = 0; 641131fd9f6SGuangliang Zhao int val; 642131fd9f6SGuangliang Zhao bool ro; 64377f33c03SJosh Durgin bool ro_changed = false; 644131fd9f6SGuangliang Zhao 64577f33c03SJosh Durgin /* get_user() may sleep, so call it before taking rbd_dev->lock */ 646131fd9f6SGuangliang Zhao if (get_user(val, (int __user *)(arg))) 647131fd9f6SGuangliang Zhao return -EFAULT; 648131fd9f6SGuangliang Zhao 649131fd9f6SGuangliang Zhao ro = val ? true : false; 650131fd9f6SGuangliang Zhao /* Snapshot doesn't allow to write*/ 651131fd9f6SGuangliang Zhao if (rbd_dev->spec->snap_id != CEPH_NOSNAP && !ro) 652131fd9f6SGuangliang Zhao return -EROFS; 653131fd9f6SGuangliang Zhao 65477f33c03SJosh Durgin spin_lock_irq(&rbd_dev->lock); 65577f33c03SJosh Durgin /* prevent others open this device */ 65677f33c03SJosh Durgin if (rbd_dev->open_count > 1) { 65777f33c03SJosh Durgin ret = -EBUSY; 65877f33c03SJosh Durgin goto out; 659131fd9f6SGuangliang Zhao } 660131fd9f6SGuangliang Zhao 66177f33c03SJosh Durgin if (rbd_dev->mapping.read_only != ro) { 66277f33c03SJosh Durgin rbd_dev->mapping.read_only = ro; 66377f33c03SJosh Durgin ro_changed = true; 66477f33c03SJosh Durgin } 66577f33c03SJosh Durgin 66677f33c03SJosh Durgin out: 66777f33c03SJosh Durgin spin_unlock_irq(&rbd_dev->lock); 66877f33c03SJosh Durgin /* set_disk_ro() may sleep, so call it after releasing rbd_dev->lock */ 66977f33c03SJosh Durgin if (ret == 0 && ro_changed) 67077f33c03SJosh Durgin set_disk_ro(rbd_dev->disk, ro ? 1 : 0); 67177f33c03SJosh Durgin 67277f33c03SJosh Durgin return ret; 673131fd9f6SGuangliang Zhao } 674131fd9f6SGuangliang Zhao 675131fd9f6SGuangliang Zhao static int rbd_ioctl(struct block_device *bdev, fmode_t mode, 676131fd9f6SGuangliang Zhao unsigned int cmd, unsigned long arg) 677131fd9f6SGuangliang Zhao { 678131fd9f6SGuangliang Zhao struct rbd_device *rbd_dev = bdev->bd_disk->private_data; 679131fd9f6SGuangliang Zhao int ret = 0; 680131fd9f6SGuangliang Zhao 681131fd9f6SGuangliang Zhao switch (cmd) { 682131fd9f6SGuangliang Zhao case BLKROSET: 683131fd9f6SGuangliang Zhao ret = rbd_ioctl_set_ro(rbd_dev, arg); 684131fd9f6SGuangliang Zhao break; 685131fd9f6SGuangliang Zhao default: 686131fd9f6SGuangliang Zhao ret = -ENOTTY; 687131fd9f6SGuangliang Zhao } 688131fd9f6SGuangliang Zhao 689131fd9f6SGuangliang Zhao return ret; 690131fd9f6SGuangliang Zhao } 691131fd9f6SGuangliang Zhao 692131fd9f6SGuangliang Zhao #ifdef CONFIG_COMPAT 693131fd9f6SGuangliang Zhao static int rbd_compat_ioctl(struct block_device *bdev, fmode_t mode, 694131fd9f6SGuangliang Zhao unsigned int cmd, unsigned long arg) 695131fd9f6SGuangliang Zhao { 696131fd9f6SGuangliang Zhao return rbd_ioctl(bdev, mode, cmd, arg); 697131fd9f6SGuangliang Zhao } 698131fd9f6SGuangliang Zhao #endif /* CONFIG_COMPAT */ 699131fd9f6SGuangliang Zhao 700602adf40SYehuda Sadeh static const struct block_device_operations rbd_bd_ops = { 701602adf40SYehuda Sadeh .owner = THIS_MODULE, 702602adf40SYehuda Sadeh .open = rbd_open, 703dfc5606dSYehuda Sadeh .release = rbd_release, 704131fd9f6SGuangliang Zhao .ioctl = rbd_ioctl, 705131fd9f6SGuangliang Zhao #ifdef CONFIG_COMPAT 706131fd9f6SGuangliang Zhao .compat_ioctl = rbd_compat_ioctl, 707131fd9f6SGuangliang Zhao #endif 708602adf40SYehuda Sadeh }; 709602adf40SYehuda Sadeh 710602adf40SYehuda Sadeh /* 7117262cfcaSAlex Elder * Initialize an rbd client instance. Success or not, this function 712cfbf6377SAlex Elder * consumes ceph_opts. Caller holds client_mutex. 713602adf40SYehuda Sadeh */ 714f8c38929SAlex Elder static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts) 715602adf40SYehuda Sadeh { 716602adf40SYehuda Sadeh struct rbd_client *rbdc; 717602adf40SYehuda Sadeh int ret = -ENOMEM; 718602adf40SYehuda Sadeh 71937206ee5SAlex Elder dout("%s:\n", __func__); 720602adf40SYehuda Sadeh rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL); 721602adf40SYehuda Sadeh if (!rbdc) 722602adf40SYehuda Sadeh goto out_opt; 723602adf40SYehuda Sadeh 724602adf40SYehuda Sadeh kref_init(&rbdc->kref); 725602adf40SYehuda Sadeh INIT_LIST_HEAD(&rbdc->node); 726602adf40SYehuda Sadeh 72743ae4701SAlex Elder rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0); 728602adf40SYehuda Sadeh if (IS_ERR(rbdc->client)) 72908f75463SAlex Elder goto out_rbdc; 73043ae4701SAlex Elder ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */ 731602adf40SYehuda Sadeh 732602adf40SYehuda Sadeh ret = ceph_open_session(rbdc->client); 733602adf40SYehuda Sadeh if (ret < 0) 73408f75463SAlex Elder goto out_client; 735602adf40SYehuda Sadeh 736432b8587SAlex Elder spin_lock(&rbd_client_list_lock); 737602adf40SYehuda Sadeh list_add_tail(&rbdc->node, &rbd_client_list); 738432b8587SAlex Elder spin_unlock(&rbd_client_list_lock); 739602adf40SYehuda Sadeh 74037206ee5SAlex Elder dout("%s: rbdc %p\n", __func__, rbdc); 741bc534d86SAlex Elder 742602adf40SYehuda Sadeh return rbdc; 74308f75463SAlex Elder out_client: 744602adf40SYehuda Sadeh ceph_destroy_client(rbdc->client); 74508f75463SAlex Elder out_rbdc: 746602adf40SYehuda Sadeh kfree(rbdc); 747602adf40SYehuda Sadeh out_opt: 74843ae4701SAlex Elder if (ceph_opts) 74943ae4701SAlex Elder ceph_destroy_options(ceph_opts); 75037206ee5SAlex Elder dout("%s: error %d\n", __func__, ret); 75137206ee5SAlex Elder 75228f259b7SVasiliy Kulikov return ERR_PTR(ret); 753602adf40SYehuda Sadeh } 754602adf40SYehuda Sadeh 7552f82ee54SAlex Elder static struct rbd_client *__rbd_get_client(struct rbd_client *rbdc) 7562f82ee54SAlex Elder { 7572f82ee54SAlex Elder kref_get(&rbdc->kref); 7582f82ee54SAlex Elder 7592f82ee54SAlex Elder return rbdc; 7602f82ee54SAlex Elder } 7612f82ee54SAlex Elder 762602adf40SYehuda Sadeh /* 7631f7ba331SAlex Elder * Find a ceph client with specific addr and configuration. If 7641f7ba331SAlex Elder * found, bump its reference count. 765602adf40SYehuda Sadeh */ 7661f7ba331SAlex Elder static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts) 767602adf40SYehuda Sadeh { 768602adf40SYehuda Sadeh struct rbd_client *client_node; 7691f7ba331SAlex Elder bool found = false; 770602adf40SYehuda Sadeh 77143ae4701SAlex Elder if (ceph_opts->flags & CEPH_OPT_NOSHARE) 772602adf40SYehuda Sadeh return NULL; 773602adf40SYehuda Sadeh 7741f7ba331SAlex Elder spin_lock(&rbd_client_list_lock); 7751f7ba331SAlex Elder list_for_each_entry(client_node, &rbd_client_list, node) { 7761f7ba331SAlex Elder if (!ceph_compare_options(ceph_opts, client_node->client)) { 7772f82ee54SAlex Elder __rbd_get_client(client_node); 7782f82ee54SAlex Elder 7791f7ba331SAlex Elder found = true; 7801f7ba331SAlex Elder break; 7811f7ba331SAlex Elder } 7821f7ba331SAlex Elder } 7831f7ba331SAlex Elder spin_unlock(&rbd_client_list_lock); 7841f7ba331SAlex Elder 7851f7ba331SAlex Elder return found ? client_node : NULL; 786602adf40SYehuda Sadeh } 787602adf40SYehuda Sadeh 788602adf40SYehuda Sadeh /* 789210c104cSIlya Dryomov * (Per device) rbd map options 79059c2be1eSYehuda Sadeh */ 79159c2be1eSYehuda Sadeh enum { 792b5584180SIlya Dryomov Opt_queue_depth, 79359c2be1eSYehuda Sadeh Opt_last_int, 79459c2be1eSYehuda Sadeh /* int args above */ 79559c2be1eSYehuda Sadeh Opt_last_string, 79659c2be1eSYehuda Sadeh /* string args above */ 797cc0538b6SAlex Elder Opt_read_only, 798cc0538b6SAlex Elder Opt_read_write, 79980de1912SIlya Dryomov Opt_lock_on_read, 800210c104cSIlya Dryomov Opt_err 80159c2be1eSYehuda Sadeh }; 80259c2be1eSYehuda Sadeh 80343ae4701SAlex Elder static match_table_t rbd_opts_tokens = { 804b5584180SIlya Dryomov {Opt_queue_depth, "queue_depth=%d"}, 80559c2be1eSYehuda Sadeh /* int args above */ 80659c2be1eSYehuda Sadeh /* string args above */ 807be466c1cSAlex Elder {Opt_read_only, "read_only"}, 808cc0538b6SAlex Elder {Opt_read_only, "ro"}, /* Alternate spelling */ 809cc0538b6SAlex Elder {Opt_read_write, "read_write"}, 810cc0538b6SAlex Elder {Opt_read_write, "rw"}, /* Alternate spelling */ 81180de1912SIlya Dryomov {Opt_lock_on_read, "lock_on_read"}, 812210c104cSIlya Dryomov {Opt_err, NULL} 81359c2be1eSYehuda Sadeh }; 81459c2be1eSYehuda Sadeh 81598571b5aSAlex Elder struct rbd_options { 816b5584180SIlya Dryomov int queue_depth; 81798571b5aSAlex Elder bool read_only; 81880de1912SIlya Dryomov bool lock_on_read; 81998571b5aSAlex Elder }; 82098571b5aSAlex Elder 821b5584180SIlya Dryomov #define RBD_QUEUE_DEPTH_DEFAULT BLKDEV_MAX_RQ 82298571b5aSAlex Elder #define RBD_READ_ONLY_DEFAULT false 82380de1912SIlya Dryomov #define RBD_LOCK_ON_READ_DEFAULT false 82498571b5aSAlex Elder 82559c2be1eSYehuda Sadeh static int parse_rbd_opts_token(char *c, void *private) 82659c2be1eSYehuda Sadeh { 82743ae4701SAlex Elder struct rbd_options *rbd_opts = private; 82859c2be1eSYehuda Sadeh substring_t argstr[MAX_OPT_ARGS]; 82959c2be1eSYehuda Sadeh int token, intval, ret; 83059c2be1eSYehuda Sadeh 83143ae4701SAlex Elder token = match_token(c, rbd_opts_tokens, argstr); 83259c2be1eSYehuda Sadeh if (token < Opt_last_int) { 83359c2be1eSYehuda Sadeh ret = match_int(&argstr[0], &intval); 83459c2be1eSYehuda Sadeh if (ret < 0) { 835210c104cSIlya Dryomov pr_err("bad mount option arg (not int) at '%s'\n", c); 83659c2be1eSYehuda Sadeh return ret; 83759c2be1eSYehuda Sadeh } 83859c2be1eSYehuda Sadeh dout("got int token %d val %d\n", token, intval); 83959c2be1eSYehuda Sadeh } else if (token > Opt_last_int && token < Opt_last_string) { 840210c104cSIlya Dryomov dout("got string token %d val %s\n", token, argstr[0].from); 84159c2be1eSYehuda Sadeh } else { 84259c2be1eSYehuda Sadeh dout("got token %d\n", token); 84359c2be1eSYehuda Sadeh } 84459c2be1eSYehuda Sadeh 84559c2be1eSYehuda Sadeh switch (token) { 846b5584180SIlya Dryomov case Opt_queue_depth: 847b5584180SIlya Dryomov if (intval < 1) { 848b5584180SIlya Dryomov pr_err("queue_depth out of range\n"); 849b5584180SIlya Dryomov return -EINVAL; 850b5584180SIlya Dryomov } 851b5584180SIlya Dryomov rbd_opts->queue_depth = intval; 852b5584180SIlya Dryomov break; 853cc0538b6SAlex Elder case Opt_read_only: 854cc0538b6SAlex Elder rbd_opts->read_only = true; 855cc0538b6SAlex Elder break; 856cc0538b6SAlex Elder case Opt_read_write: 857cc0538b6SAlex Elder rbd_opts->read_only = false; 858cc0538b6SAlex Elder break; 85980de1912SIlya Dryomov case Opt_lock_on_read: 86080de1912SIlya Dryomov rbd_opts->lock_on_read = true; 86180de1912SIlya Dryomov break; 86259c2be1eSYehuda Sadeh default: 863210c104cSIlya Dryomov /* libceph prints "bad option" msg */ 864210c104cSIlya Dryomov return -EINVAL; 86559c2be1eSYehuda Sadeh } 866210c104cSIlya Dryomov 86759c2be1eSYehuda Sadeh return 0; 86859c2be1eSYehuda Sadeh } 86959c2be1eSYehuda Sadeh 8706d2940c8SGuangliang Zhao static char* obj_op_name(enum obj_operation_type op_type) 8716d2940c8SGuangliang Zhao { 8726d2940c8SGuangliang Zhao switch (op_type) { 8736d2940c8SGuangliang Zhao case OBJ_OP_READ: 8746d2940c8SGuangliang Zhao return "read"; 8756d2940c8SGuangliang Zhao case OBJ_OP_WRITE: 8766d2940c8SGuangliang Zhao return "write"; 87790e98c52SGuangliang Zhao case OBJ_OP_DISCARD: 87890e98c52SGuangliang Zhao return "discard"; 8796d2940c8SGuangliang Zhao default: 8806d2940c8SGuangliang Zhao return "???"; 8816d2940c8SGuangliang Zhao } 8826d2940c8SGuangliang Zhao } 8836d2940c8SGuangliang Zhao 88459c2be1eSYehuda Sadeh /* 885602adf40SYehuda Sadeh * Get a ceph client with specific addr and configuration, if one does 8867262cfcaSAlex Elder * not exist create it. Either way, ceph_opts is consumed by this 8877262cfcaSAlex Elder * function. 888602adf40SYehuda Sadeh */ 8899d3997fdSAlex Elder static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts) 890602adf40SYehuda Sadeh { 891f8c38929SAlex Elder struct rbd_client *rbdc; 89259c2be1eSYehuda Sadeh 893cfbf6377SAlex Elder mutex_lock_nested(&client_mutex, SINGLE_DEPTH_NESTING); 8941f7ba331SAlex Elder rbdc = rbd_client_find(ceph_opts); 8959d3997fdSAlex Elder if (rbdc) /* using an existing client */ 89643ae4701SAlex Elder ceph_destroy_options(ceph_opts); 8979d3997fdSAlex Elder else 898f8c38929SAlex Elder rbdc = rbd_client_create(ceph_opts); 899cfbf6377SAlex Elder mutex_unlock(&client_mutex); 900d720bcb0SAlex Elder 9019d3997fdSAlex Elder return rbdc; 902602adf40SYehuda Sadeh } 903602adf40SYehuda Sadeh 904602adf40SYehuda Sadeh /* 905602adf40SYehuda Sadeh * Destroy ceph client 906d23a4b3fSAlex Elder * 907432b8587SAlex Elder * Caller must hold rbd_client_list_lock. 908602adf40SYehuda Sadeh */ 909602adf40SYehuda Sadeh static void rbd_client_release(struct kref *kref) 910602adf40SYehuda Sadeh { 911602adf40SYehuda Sadeh struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref); 912602adf40SYehuda Sadeh 91337206ee5SAlex Elder dout("%s: rbdc %p\n", __func__, rbdc); 914cd9d9f5dSAlex Elder spin_lock(&rbd_client_list_lock); 915602adf40SYehuda Sadeh list_del(&rbdc->node); 916cd9d9f5dSAlex Elder spin_unlock(&rbd_client_list_lock); 917602adf40SYehuda Sadeh 918602adf40SYehuda Sadeh ceph_destroy_client(rbdc->client); 919602adf40SYehuda Sadeh kfree(rbdc); 920602adf40SYehuda Sadeh } 921602adf40SYehuda Sadeh 922602adf40SYehuda Sadeh /* 923602adf40SYehuda Sadeh * Drop reference to ceph client node. If it's not referenced anymore, release 924602adf40SYehuda Sadeh * it. 925602adf40SYehuda Sadeh */ 9269d3997fdSAlex Elder static void rbd_put_client(struct rbd_client *rbdc) 927602adf40SYehuda Sadeh { 928c53d5893SAlex Elder if (rbdc) 9299d3997fdSAlex Elder kref_put(&rbdc->kref, rbd_client_release); 930602adf40SYehuda Sadeh } 931602adf40SYehuda Sadeh 932a30b71b9SAlex Elder static bool rbd_image_format_valid(u32 image_format) 933a30b71b9SAlex Elder { 934a30b71b9SAlex Elder return image_format == 1 || image_format == 2; 935a30b71b9SAlex Elder } 936a30b71b9SAlex Elder 9378e94af8eSAlex Elder static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk) 9388e94af8eSAlex Elder { 939103a150fSAlex Elder size_t size; 940103a150fSAlex Elder u32 snap_count; 941103a150fSAlex Elder 942103a150fSAlex Elder /* The header has to start with the magic rbd header text */ 943103a150fSAlex Elder if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT))) 944103a150fSAlex Elder return false; 945103a150fSAlex Elder 946db2388b6SAlex Elder /* The bio layer requires at least sector-sized I/O */ 947db2388b6SAlex Elder 948db2388b6SAlex Elder if (ondisk->options.order < SECTOR_SHIFT) 949db2388b6SAlex Elder return false; 950db2388b6SAlex Elder 951db2388b6SAlex Elder /* If we use u64 in a few spots we may be able to loosen this */ 952db2388b6SAlex Elder 953db2388b6SAlex Elder if (ondisk->options.order > 8 * sizeof (int) - 1) 954db2388b6SAlex Elder return false; 955db2388b6SAlex Elder 956103a150fSAlex Elder /* 957103a150fSAlex Elder * The size of a snapshot header has to fit in a size_t, and 958103a150fSAlex Elder * that limits the number of snapshots. 959103a150fSAlex Elder */ 960103a150fSAlex Elder snap_count = le32_to_cpu(ondisk->snap_count); 961103a150fSAlex Elder size = SIZE_MAX - sizeof (struct ceph_snap_context); 962103a150fSAlex Elder if (snap_count > size / sizeof (__le64)) 963103a150fSAlex Elder return false; 964103a150fSAlex Elder 965103a150fSAlex Elder /* 966103a150fSAlex Elder * Not only that, but the size of the entire the snapshot 967103a150fSAlex Elder * header must also be representable in a size_t. 968103a150fSAlex Elder */ 969103a150fSAlex Elder size -= snap_count * sizeof (__le64); 970103a150fSAlex Elder if ((u64) size < le64_to_cpu(ondisk->snap_names_len)) 971103a150fSAlex Elder return false; 972103a150fSAlex Elder 973103a150fSAlex Elder return true; 9748e94af8eSAlex Elder } 9758e94af8eSAlex Elder 976602adf40SYehuda Sadeh /* 9775bc3fb17SIlya Dryomov * returns the size of an object in the image 9785bc3fb17SIlya Dryomov */ 9795bc3fb17SIlya Dryomov static u32 rbd_obj_bytes(struct rbd_image_header *header) 9805bc3fb17SIlya Dryomov { 9815bc3fb17SIlya Dryomov return 1U << header->obj_order; 9825bc3fb17SIlya Dryomov } 9835bc3fb17SIlya Dryomov 984263423f8SIlya Dryomov static void rbd_init_layout(struct rbd_device *rbd_dev) 985263423f8SIlya Dryomov { 986263423f8SIlya Dryomov if (rbd_dev->header.stripe_unit == 0 || 987263423f8SIlya Dryomov rbd_dev->header.stripe_count == 0) { 988263423f8SIlya Dryomov rbd_dev->header.stripe_unit = rbd_obj_bytes(&rbd_dev->header); 989263423f8SIlya Dryomov rbd_dev->header.stripe_count = 1; 990263423f8SIlya Dryomov } 991263423f8SIlya Dryomov 992263423f8SIlya Dryomov rbd_dev->layout.stripe_unit = rbd_dev->header.stripe_unit; 993263423f8SIlya Dryomov rbd_dev->layout.stripe_count = rbd_dev->header.stripe_count; 994263423f8SIlya Dryomov rbd_dev->layout.object_size = rbd_obj_bytes(&rbd_dev->header); 9957e97332eSIlya Dryomov rbd_dev->layout.pool_id = rbd_dev->header.data_pool_id == CEPH_NOPOOL ? 9967e97332eSIlya Dryomov rbd_dev->spec->pool_id : rbd_dev->header.data_pool_id; 997263423f8SIlya Dryomov RCU_INIT_POINTER(rbd_dev->layout.pool_ns, NULL); 998263423f8SIlya Dryomov } 999263423f8SIlya Dryomov 10005bc3fb17SIlya Dryomov /* 1001bb23e37aSAlex Elder * Fill an rbd image header with information from the given format 1 1002bb23e37aSAlex Elder * on-disk header. 1003602adf40SYehuda Sadeh */ 1004662518b1SAlex Elder static int rbd_header_from_disk(struct rbd_device *rbd_dev, 10054156d998SAlex Elder struct rbd_image_header_ondisk *ondisk) 1006602adf40SYehuda Sadeh { 1007662518b1SAlex Elder struct rbd_image_header *header = &rbd_dev->header; 1008bb23e37aSAlex Elder bool first_time = header->object_prefix == NULL; 1009bb23e37aSAlex Elder struct ceph_snap_context *snapc; 1010bb23e37aSAlex Elder char *object_prefix = NULL; 1011bb23e37aSAlex Elder char *snap_names = NULL; 1012bb23e37aSAlex Elder u64 *snap_sizes = NULL; 1013ccece235SAlex Elder u32 snap_count; 1014bb23e37aSAlex Elder int ret = -ENOMEM; 1015621901d6SAlex Elder u32 i; 1016602adf40SYehuda Sadeh 1017bb23e37aSAlex Elder /* Allocate this now to avoid having to handle failure below */ 1018103a150fSAlex Elder 1019bb23e37aSAlex Elder if (first_time) { 1020848d796cSIlya Dryomov object_prefix = kstrndup(ondisk->object_prefix, 1021848d796cSIlya Dryomov sizeof(ondisk->object_prefix), 1022848d796cSIlya Dryomov GFP_KERNEL); 1023bb23e37aSAlex Elder if (!object_prefix) 1024602adf40SYehuda Sadeh return -ENOMEM; 1025bb23e37aSAlex Elder } 102600f1f36fSAlex Elder 1027bb23e37aSAlex Elder /* Allocate the snapshot context and fill it in */ 1028d2bb24e5SAlex Elder 1029602adf40SYehuda Sadeh snap_count = le32_to_cpu(ondisk->snap_count); 1030bb23e37aSAlex Elder snapc = ceph_create_snap_context(snap_count, GFP_KERNEL); 1031bb23e37aSAlex Elder if (!snapc) 1032bb23e37aSAlex Elder goto out_err; 1033bb23e37aSAlex Elder snapc->seq = le64_to_cpu(ondisk->snap_seq); 1034602adf40SYehuda Sadeh if (snap_count) { 1035bb23e37aSAlex Elder struct rbd_image_snap_ondisk *snaps; 1036f785cc1dSAlex Elder u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len); 1037f785cc1dSAlex Elder 1038bb23e37aSAlex Elder /* We'll keep a copy of the snapshot names... */ 1039621901d6SAlex Elder 1040f785cc1dSAlex Elder if (snap_names_len > (u64)SIZE_MAX) 1041bb23e37aSAlex Elder goto out_2big; 1042bb23e37aSAlex Elder snap_names = kmalloc(snap_names_len, GFP_KERNEL); 1043bb23e37aSAlex Elder if (!snap_names) 1044602adf40SYehuda Sadeh goto out_err; 1045bb23e37aSAlex Elder 1046bb23e37aSAlex Elder /* ...as well as the array of their sizes. */ 104788a25a5fSMarkus Elfring snap_sizes = kmalloc_array(snap_count, 104888a25a5fSMarkus Elfring sizeof(*header->snap_sizes), 104988a25a5fSMarkus Elfring GFP_KERNEL); 1050bb23e37aSAlex Elder if (!snap_sizes) 1051bb23e37aSAlex Elder goto out_err; 1052bb23e37aSAlex Elder 1053f785cc1dSAlex Elder /* 1054bb23e37aSAlex Elder * Copy the names, and fill in each snapshot's id 1055bb23e37aSAlex Elder * and size. 1056bb23e37aSAlex Elder * 105799a41ebcSAlex Elder * Note that rbd_dev_v1_header_info() guarantees the 1058bb23e37aSAlex Elder * ondisk buffer we're working with has 1059f785cc1dSAlex Elder * snap_names_len bytes beyond the end of the 1060f785cc1dSAlex Elder * snapshot id array, this memcpy() is safe. 1061f785cc1dSAlex Elder */ 1062bb23e37aSAlex Elder memcpy(snap_names, &ondisk->snaps[snap_count], snap_names_len); 1063bb23e37aSAlex Elder snaps = ondisk->snaps; 1064bb23e37aSAlex Elder for (i = 0; i < snap_count; i++) { 1065bb23e37aSAlex Elder snapc->snaps[i] = le64_to_cpu(snaps[i].id); 1066bb23e37aSAlex Elder snap_sizes[i] = le64_to_cpu(snaps[i].image_size); 1067bb23e37aSAlex Elder } 1068602adf40SYehuda Sadeh } 1069849b4260SAlex Elder 1070bb23e37aSAlex Elder /* We won't fail any more, fill in the header */ 1071bb23e37aSAlex Elder 1072bb23e37aSAlex Elder if (first_time) { 1073bb23e37aSAlex Elder header->object_prefix = object_prefix; 1074602adf40SYehuda Sadeh header->obj_order = ondisk->options.order; 1075263423f8SIlya Dryomov rbd_init_layout(rbd_dev); 1076662518b1SAlex Elder } else { 1077662518b1SAlex Elder ceph_put_snap_context(header->snapc); 1078662518b1SAlex Elder kfree(header->snap_names); 1079662518b1SAlex Elder kfree(header->snap_sizes); 1080bb23e37aSAlex Elder } 10816a52325fSAlex Elder 1082bb23e37aSAlex Elder /* The remaining fields always get updated (when we refresh) */ 1083621901d6SAlex Elder 1084f84344f3SAlex Elder header->image_size = le64_to_cpu(ondisk->image_size); 1085bb23e37aSAlex Elder header->snapc = snapc; 1086bb23e37aSAlex Elder header->snap_names = snap_names; 1087bb23e37aSAlex Elder header->snap_sizes = snap_sizes; 1088468521c1SAlex Elder 1089602adf40SYehuda Sadeh return 0; 1090bb23e37aSAlex Elder out_2big: 1091bb23e37aSAlex Elder ret = -EIO; 10926a52325fSAlex Elder out_err: 1093bb23e37aSAlex Elder kfree(snap_sizes); 1094bb23e37aSAlex Elder kfree(snap_names); 1095bb23e37aSAlex Elder ceph_put_snap_context(snapc); 1096bb23e37aSAlex Elder kfree(object_prefix); 1097ccece235SAlex Elder 1098bb23e37aSAlex Elder return ret; 1099602adf40SYehuda Sadeh } 1100602adf40SYehuda Sadeh 11019682fc6dSAlex Elder static const char *_rbd_dev_v1_snap_name(struct rbd_device *rbd_dev, u32 which) 11029682fc6dSAlex Elder { 11039682fc6dSAlex Elder const char *snap_name; 11049682fc6dSAlex Elder 11059682fc6dSAlex Elder rbd_assert(which < rbd_dev->header.snapc->num_snaps); 11069682fc6dSAlex Elder 11079682fc6dSAlex Elder /* Skip over names until we find the one we are looking for */ 11089682fc6dSAlex Elder 11099682fc6dSAlex Elder snap_name = rbd_dev->header.snap_names; 11109682fc6dSAlex Elder while (which--) 11119682fc6dSAlex Elder snap_name += strlen(snap_name) + 1; 11129682fc6dSAlex Elder 11139682fc6dSAlex Elder return kstrdup(snap_name, GFP_KERNEL); 11149682fc6dSAlex Elder } 11159682fc6dSAlex Elder 111630d1cff8SAlex Elder /* 111730d1cff8SAlex Elder * Snapshot id comparison function for use with qsort()/bsearch(). 111830d1cff8SAlex Elder * Note that result is for snapshots in *descending* order. 111930d1cff8SAlex Elder */ 112030d1cff8SAlex Elder static int snapid_compare_reverse(const void *s1, const void *s2) 112130d1cff8SAlex Elder { 112230d1cff8SAlex Elder u64 snap_id1 = *(u64 *)s1; 112330d1cff8SAlex Elder u64 snap_id2 = *(u64 *)s2; 112430d1cff8SAlex Elder 112530d1cff8SAlex Elder if (snap_id1 < snap_id2) 112630d1cff8SAlex Elder return 1; 112730d1cff8SAlex Elder return snap_id1 == snap_id2 ? 0 : -1; 112830d1cff8SAlex Elder } 112930d1cff8SAlex Elder 113030d1cff8SAlex Elder /* 113130d1cff8SAlex Elder * Search a snapshot context to see if the given snapshot id is 113230d1cff8SAlex Elder * present. 113330d1cff8SAlex Elder * 113430d1cff8SAlex Elder * Returns the position of the snapshot id in the array if it's found, 113530d1cff8SAlex Elder * or BAD_SNAP_INDEX otherwise. 113630d1cff8SAlex Elder * 113730d1cff8SAlex Elder * Note: The snapshot array is in kept sorted (by the osd) in 113830d1cff8SAlex Elder * reverse order, highest snapshot id first. 113930d1cff8SAlex Elder */ 11409682fc6dSAlex Elder static u32 rbd_dev_snap_index(struct rbd_device *rbd_dev, u64 snap_id) 11419682fc6dSAlex Elder { 11429682fc6dSAlex Elder struct ceph_snap_context *snapc = rbd_dev->header.snapc; 114330d1cff8SAlex Elder u64 *found; 11449682fc6dSAlex Elder 114530d1cff8SAlex Elder found = bsearch(&snap_id, &snapc->snaps, snapc->num_snaps, 114630d1cff8SAlex Elder sizeof (snap_id), snapid_compare_reverse); 11479682fc6dSAlex Elder 114830d1cff8SAlex Elder return found ? (u32)(found - &snapc->snaps[0]) : BAD_SNAP_INDEX; 11499682fc6dSAlex Elder } 11509682fc6dSAlex Elder 11512ad3d716SAlex Elder static const char *rbd_dev_v1_snap_name(struct rbd_device *rbd_dev, 11522ad3d716SAlex Elder u64 snap_id) 115354cac61fSAlex Elder { 115454cac61fSAlex Elder u32 which; 1155da6a6b63SJosh Durgin const char *snap_name; 115654cac61fSAlex Elder 115754cac61fSAlex Elder which = rbd_dev_snap_index(rbd_dev, snap_id); 115854cac61fSAlex Elder if (which == BAD_SNAP_INDEX) 1159da6a6b63SJosh Durgin return ERR_PTR(-ENOENT); 116054cac61fSAlex Elder 1161da6a6b63SJosh Durgin snap_name = _rbd_dev_v1_snap_name(rbd_dev, which); 1162da6a6b63SJosh Durgin return snap_name ? snap_name : ERR_PTR(-ENOMEM); 116354cac61fSAlex Elder } 116454cac61fSAlex Elder 11659e15b77dSAlex Elder static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id) 11669e15b77dSAlex Elder { 11679e15b77dSAlex Elder if (snap_id == CEPH_NOSNAP) 11689e15b77dSAlex Elder return RBD_SNAP_HEAD_NAME; 11699e15b77dSAlex Elder 117054cac61fSAlex Elder rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); 117154cac61fSAlex Elder if (rbd_dev->image_format == 1) 117254cac61fSAlex Elder return rbd_dev_v1_snap_name(rbd_dev, snap_id); 11739e15b77dSAlex Elder 117454cac61fSAlex Elder return rbd_dev_v2_snap_name(rbd_dev, snap_id); 11759e15b77dSAlex Elder } 11769e15b77dSAlex Elder 11772ad3d716SAlex Elder static int rbd_snap_size(struct rbd_device *rbd_dev, u64 snap_id, 11782ad3d716SAlex Elder u64 *snap_size) 1179602adf40SYehuda Sadeh { 11802ad3d716SAlex Elder rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); 11812ad3d716SAlex Elder if (snap_id == CEPH_NOSNAP) { 11822ad3d716SAlex Elder *snap_size = rbd_dev->header.image_size; 11832ad3d716SAlex Elder } else if (rbd_dev->image_format == 1) { 11842ad3d716SAlex Elder u32 which; 118500f1f36fSAlex Elder 11862ad3d716SAlex Elder which = rbd_dev_snap_index(rbd_dev, snap_id); 11872ad3d716SAlex Elder if (which == BAD_SNAP_INDEX) 11882ad3d716SAlex Elder return -ENOENT; 118900f1f36fSAlex Elder 11902ad3d716SAlex Elder *snap_size = rbd_dev->header.snap_sizes[which]; 11912ad3d716SAlex Elder } else { 11922ad3d716SAlex Elder u64 size = 0; 11932ad3d716SAlex Elder int ret; 11942ad3d716SAlex Elder 11952ad3d716SAlex Elder ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, NULL, &size); 11962ad3d716SAlex Elder if (ret) 11972ad3d716SAlex Elder return ret; 11982ad3d716SAlex Elder 11992ad3d716SAlex Elder *snap_size = size; 12002ad3d716SAlex Elder } 12012ad3d716SAlex Elder return 0; 12022ad3d716SAlex Elder } 12032ad3d716SAlex Elder 12042ad3d716SAlex Elder static int rbd_snap_features(struct rbd_device *rbd_dev, u64 snap_id, 12052ad3d716SAlex Elder u64 *snap_features) 12062ad3d716SAlex Elder { 12072ad3d716SAlex Elder rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); 12082ad3d716SAlex Elder if (snap_id == CEPH_NOSNAP) { 12092ad3d716SAlex Elder *snap_features = rbd_dev->header.features; 12102ad3d716SAlex Elder } else if (rbd_dev->image_format == 1) { 12112ad3d716SAlex Elder *snap_features = 0; /* No features for format 1 */ 12122ad3d716SAlex Elder } else { 12132ad3d716SAlex Elder u64 features = 0; 12142ad3d716SAlex Elder int ret; 12152ad3d716SAlex Elder 12162ad3d716SAlex Elder ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, &features); 12172ad3d716SAlex Elder if (ret) 12182ad3d716SAlex Elder return ret; 12192ad3d716SAlex Elder 12202ad3d716SAlex Elder *snap_features = features; 12212ad3d716SAlex Elder } 12222ad3d716SAlex Elder return 0; 122300f1f36fSAlex Elder } 1224602adf40SYehuda Sadeh 1225d1cf5788SAlex Elder static int rbd_dev_mapping_set(struct rbd_device *rbd_dev) 1226602adf40SYehuda Sadeh { 12278f4b7d98SAlex Elder u64 snap_id = rbd_dev->spec->snap_id; 12282ad3d716SAlex Elder u64 size = 0; 12292ad3d716SAlex Elder u64 features = 0; 12302ad3d716SAlex Elder int ret; 12318b0241f8SAlex Elder 12322ad3d716SAlex Elder ret = rbd_snap_size(rbd_dev, snap_id, &size); 12332ad3d716SAlex Elder if (ret) 12342ad3d716SAlex Elder return ret; 12352ad3d716SAlex Elder ret = rbd_snap_features(rbd_dev, snap_id, &features); 12362ad3d716SAlex Elder if (ret) 12372ad3d716SAlex Elder return ret; 12382ad3d716SAlex Elder 12392ad3d716SAlex Elder rbd_dev->mapping.size = size; 12402ad3d716SAlex Elder rbd_dev->mapping.features = features; 12412ad3d716SAlex Elder 12428b0241f8SAlex Elder return 0; 1243602adf40SYehuda Sadeh } 1244602adf40SYehuda Sadeh 1245d1cf5788SAlex Elder static void rbd_dev_mapping_clear(struct rbd_device *rbd_dev) 1246d1cf5788SAlex Elder { 1247d1cf5788SAlex Elder rbd_dev->mapping.size = 0; 1248d1cf5788SAlex Elder rbd_dev->mapping.features = 0; 1249200a6a8bSAlex Elder } 1250200a6a8bSAlex Elder 12517d5079aaSHimangi Saraogi static void rbd_segment_name_free(const char *name) 12527d5079aaSHimangi Saraogi { 12537d5079aaSHimangi Saraogi /* The explicit cast here is needed to drop the const qualifier */ 12547d5079aaSHimangi Saraogi 12557d5079aaSHimangi Saraogi kmem_cache_free(rbd_segment_name_cache, (void *)name); 12567d5079aaSHimangi Saraogi } 12577d5079aaSHimangi Saraogi 125898571b5aSAlex Elder static const char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset) 1259602adf40SYehuda Sadeh { 126065ccfe21SAlex Elder char *name; 126165ccfe21SAlex Elder u64 segment; 126265ccfe21SAlex Elder int ret; 12633a96d5cdSJosh Durgin char *name_format; 1264602adf40SYehuda Sadeh 126578c2a44aSAlex Elder name = kmem_cache_alloc(rbd_segment_name_cache, GFP_NOIO); 126665ccfe21SAlex Elder if (!name) 126765ccfe21SAlex Elder return NULL; 126865ccfe21SAlex Elder segment = offset >> rbd_dev->header.obj_order; 12693a96d5cdSJosh Durgin name_format = "%s.%012llx"; 12703a96d5cdSJosh Durgin if (rbd_dev->image_format == 2) 12713a96d5cdSJosh Durgin name_format = "%s.%016llx"; 12722d0ebc5dSIlya Dryomov ret = snprintf(name, CEPH_MAX_OID_NAME_LEN + 1, name_format, 127365ccfe21SAlex Elder rbd_dev->header.object_prefix, segment); 12742d0ebc5dSIlya Dryomov if (ret < 0 || ret > CEPH_MAX_OID_NAME_LEN) { 127565ccfe21SAlex Elder pr_err("error formatting segment name for #%llu (%d)\n", 127665ccfe21SAlex Elder segment, ret); 12777d5079aaSHimangi Saraogi rbd_segment_name_free(name); 127865ccfe21SAlex Elder name = NULL; 127965ccfe21SAlex Elder } 1280602adf40SYehuda Sadeh 128165ccfe21SAlex Elder return name; 128265ccfe21SAlex Elder } 1283602adf40SYehuda Sadeh 128465ccfe21SAlex Elder static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset) 128565ccfe21SAlex Elder { 12865bc3fb17SIlya Dryomov u64 segment_size = rbd_obj_bytes(&rbd_dev->header); 1287602adf40SYehuda Sadeh 128865ccfe21SAlex Elder return offset & (segment_size - 1); 128965ccfe21SAlex Elder } 129065ccfe21SAlex Elder 129165ccfe21SAlex Elder static u64 rbd_segment_length(struct rbd_device *rbd_dev, 129265ccfe21SAlex Elder u64 offset, u64 length) 129365ccfe21SAlex Elder { 12945bc3fb17SIlya Dryomov u64 segment_size = rbd_obj_bytes(&rbd_dev->header); 129565ccfe21SAlex Elder 129665ccfe21SAlex Elder offset &= segment_size - 1; 129765ccfe21SAlex Elder 1298aafb230eSAlex Elder rbd_assert(length <= U64_MAX - offset); 129965ccfe21SAlex Elder if (offset + length > segment_size) 130065ccfe21SAlex Elder length = segment_size - offset; 130165ccfe21SAlex Elder 130265ccfe21SAlex Elder return length; 1303602adf40SYehuda Sadeh } 1304602adf40SYehuda Sadeh 1305602adf40SYehuda Sadeh /* 1306602adf40SYehuda Sadeh * bio helpers 1307602adf40SYehuda Sadeh */ 1308602adf40SYehuda Sadeh 1309602adf40SYehuda Sadeh static void bio_chain_put(struct bio *chain) 1310602adf40SYehuda Sadeh { 1311602adf40SYehuda Sadeh struct bio *tmp; 1312602adf40SYehuda Sadeh 1313602adf40SYehuda Sadeh while (chain) { 1314602adf40SYehuda Sadeh tmp = chain; 1315602adf40SYehuda Sadeh chain = chain->bi_next; 1316602adf40SYehuda Sadeh bio_put(tmp); 1317602adf40SYehuda Sadeh } 1318602adf40SYehuda Sadeh } 1319602adf40SYehuda Sadeh 1320602adf40SYehuda Sadeh /* 1321602adf40SYehuda Sadeh * zeros a bio chain, starting at specific offset 1322602adf40SYehuda Sadeh */ 1323602adf40SYehuda Sadeh static void zero_bio_chain(struct bio *chain, int start_ofs) 1324602adf40SYehuda Sadeh { 13257988613bSKent Overstreet struct bio_vec bv; 13267988613bSKent Overstreet struct bvec_iter iter; 1327602adf40SYehuda Sadeh unsigned long flags; 1328602adf40SYehuda Sadeh void *buf; 1329602adf40SYehuda Sadeh int pos = 0; 1330602adf40SYehuda Sadeh 1331602adf40SYehuda Sadeh while (chain) { 13327988613bSKent Overstreet bio_for_each_segment(bv, chain, iter) { 13337988613bSKent Overstreet if (pos + bv.bv_len > start_ofs) { 1334602adf40SYehuda Sadeh int remainder = max(start_ofs - pos, 0); 13357988613bSKent Overstreet buf = bvec_kmap_irq(&bv, &flags); 1336602adf40SYehuda Sadeh memset(buf + remainder, 0, 13377988613bSKent Overstreet bv.bv_len - remainder); 13387988613bSKent Overstreet flush_dcache_page(bv.bv_page); 133985b5aaa6SDan Carpenter bvec_kunmap_irq(buf, &flags); 1340602adf40SYehuda Sadeh } 13417988613bSKent Overstreet pos += bv.bv_len; 1342602adf40SYehuda Sadeh } 1343602adf40SYehuda Sadeh 1344602adf40SYehuda Sadeh chain = chain->bi_next; 1345602adf40SYehuda Sadeh } 1346602adf40SYehuda Sadeh } 1347602adf40SYehuda Sadeh 1348602adf40SYehuda Sadeh /* 1349b9434c5bSAlex Elder * similar to zero_bio_chain(), zeros data defined by a page array, 1350b9434c5bSAlex Elder * starting at the given byte offset from the start of the array and 1351b9434c5bSAlex Elder * continuing up to the given end offset. The pages array is 1352b9434c5bSAlex Elder * assumed to be big enough to hold all bytes up to the end. 1353b9434c5bSAlex Elder */ 1354b9434c5bSAlex Elder static void zero_pages(struct page **pages, u64 offset, u64 end) 1355b9434c5bSAlex Elder { 1356b9434c5bSAlex Elder struct page **page = &pages[offset >> PAGE_SHIFT]; 1357b9434c5bSAlex Elder 1358b9434c5bSAlex Elder rbd_assert(end > offset); 1359b9434c5bSAlex Elder rbd_assert(end - offset <= (u64)SIZE_MAX); 1360b9434c5bSAlex Elder while (offset < end) { 1361b9434c5bSAlex Elder size_t page_offset; 1362b9434c5bSAlex Elder size_t length; 1363b9434c5bSAlex Elder unsigned long flags; 1364b9434c5bSAlex Elder void *kaddr; 1365b9434c5bSAlex Elder 1366491205a8SGeert Uytterhoeven page_offset = offset & ~PAGE_MASK; 1367491205a8SGeert Uytterhoeven length = min_t(size_t, PAGE_SIZE - page_offset, end - offset); 1368b9434c5bSAlex Elder local_irq_save(flags); 1369b9434c5bSAlex Elder kaddr = kmap_atomic(*page); 1370b9434c5bSAlex Elder memset(kaddr + page_offset, 0, length); 1371e2156054SAlex Elder flush_dcache_page(*page); 1372b9434c5bSAlex Elder kunmap_atomic(kaddr); 1373b9434c5bSAlex Elder local_irq_restore(flags); 1374b9434c5bSAlex Elder 1375b9434c5bSAlex Elder offset += length; 1376b9434c5bSAlex Elder page++; 1377b9434c5bSAlex Elder } 1378b9434c5bSAlex Elder } 1379b9434c5bSAlex Elder 1380b9434c5bSAlex Elder /* 1381f7760dadSAlex Elder * Clone a portion of a bio, starting at the given byte offset 1382f7760dadSAlex Elder * and continuing for the number of bytes indicated. 1383602adf40SYehuda Sadeh */ 1384f7760dadSAlex Elder static struct bio *bio_clone_range(struct bio *bio_src, 1385f7760dadSAlex Elder unsigned int offset, 1386f7760dadSAlex Elder unsigned int len, 1387f7760dadSAlex Elder gfp_t gfpmask) 1388602adf40SYehuda Sadeh { 1389f7760dadSAlex Elder struct bio *bio; 1390602adf40SYehuda Sadeh 13915341a627SKent Overstreet bio = bio_clone(bio_src, gfpmask); 1392f7760dadSAlex Elder if (!bio) 1393f7760dadSAlex Elder return NULL; /* ENOMEM */ 1394f7760dadSAlex Elder 13955341a627SKent Overstreet bio_advance(bio, offset); 13964f024f37SKent Overstreet bio->bi_iter.bi_size = len; 1397602adf40SYehuda Sadeh 1398f7760dadSAlex Elder return bio; 1399602adf40SYehuda Sadeh } 1400602adf40SYehuda Sadeh 1401f7760dadSAlex Elder /* 1402f7760dadSAlex Elder * Clone a portion of a bio chain, starting at the given byte offset 1403f7760dadSAlex Elder * into the first bio in the source chain and continuing for the 1404f7760dadSAlex Elder * number of bytes indicated. The result is another bio chain of 1405f7760dadSAlex Elder * exactly the given length, or a null pointer on error. 1406f7760dadSAlex Elder * 1407f7760dadSAlex Elder * The bio_src and offset parameters are both in-out. On entry they 1408f7760dadSAlex Elder * refer to the first source bio and the offset into that bio where 1409f7760dadSAlex Elder * the start of data to be cloned is located. 1410f7760dadSAlex Elder * 1411f7760dadSAlex Elder * On return, bio_src is updated to refer to the bio in the source 1412f7760dadSAlex Elder * chain that contains first un-cloned byte, and *offset will 1413f7760dadSAlex Elder * contain the offset of that byte within that bio. 1414f7760dadSAlex Elder */ 1415f7760dadSAlex Elder static struct bio *bio_chain_clone_range(struct bio **bio_src, 1416f7760dadSAlex Elder unsigned int *offset, 1417f7760dadSAlex Elder unsigned int len, 1418f7760dadSAlex Elder gfp_t gfpmask) 1419f7760dadSAlex Elder { 1420f7760dadSAlex Elder struct bio *bi = *bio_src; 1421f7760dadSAlex Elder unsigned int off = *offset; 1422f7760dadSAlex Elder struct bio *chain = NULL; 1423f7760dadSAlex Elder struct bio **end; 1424602adf40SYehuda Sadeh 1425f7760dadSAlex Elder /* Build up a chain of clone bios up to the limit */ 1426602adf40SYehuda Sadeh 14274f024f37SKent Overstreet if (!bi || off >= bi->bi_iter.bi_size || !len) 1428f7760dadSAlex Elder return NULL; /* Nothing to clone */ 1429602adf40SYehuda Sadeh 1430f7760dadSAlex Elder end = &chain; 1431f7760dadSAlex Elder while (len) { 1432f7760dadSAlex Elder unsigned int bi_size; 1433f7760dadSAlex Elder struct bio *bio; 1434f7760dadSAlex Elder 1435f5400b7aSAlex Elder if (!bi) { 1436f5400b7aSAlex Elder rbd_warn(NULL, "bio_chain exhausted with %u left", len); 1437f7760dadSAlex Elder goto out_err; /* EINVAL; ran out of bio's */ 1438f5400b7aSAlex Elder } 14394f024f37SKent Overstreet bi_size = min_t(unsigned int, bi->bi_iter.bi_size - off, len); 1440f7760dadSAlex Elder bio = bio_clone_range(bi, off, bi_size, gfpmask); 1441f7760dadSAlex Elder if (!bio) 1442f7760dadSAlex Elder goto out_err; /* ENOMEM */ 1443f7760dadSAlex Elder 1444f7760dadSAlex Elder *end = bio; 1445f7760dadSAlex Elder end = &bio->bi_next; 1446f7760dadSAlex Elder 1447f7760dadSAlex Elder off += bi_size; 14484f024f37SKent Overstreet if (off == bi->bi_iter.bi_size) { 1449f7760dadSAlex Elder bi = bi->bi_next; 1450f7760dadSAlex Elder off = 0; 1451f7760dadSAlex Elder } 1452f7760dadSAlex Elder len -= bi_size; 1453f7760dadSAlex Elder } 1454f7760dadSAlex Elder *bio_src = bi; 1455f7760dadSAlex Elder *offset = off; 1456f7760dadSAlex Elder 1457f7760dadSAlex Elder return chain; 1458f7760dadSAlex Elder out_err: 1459f7760dadSAlex Elder bio_chain_put(chain); 1460f7760dadSAlex Elder 1461602adf40SYehuda Sadeh return NULL; 1462602adf40SYehuda Sadeh } 1463602adf40SYehuda Sadeh 1464926f9b3fSAlex Elder /* 1465926f9b3fSAlex Elder * The default/initial value for all object request flags is 0. For 1466926f9b3fSAlex Elder * each flag, once its value is set to 1 it is never reset to 0 1467926f9b3fSAlex Elder * again. 1468926f9b3fSAlex Elder */ 14696365d33aSAlex Elder static void obj_request_img_data_set(struct rbd_obj_request *obj_request) 14706365d33aSAlex Elder { 14716365d33aSAlex Elder if (test_and_set_bit(OBJ_REQ_IMG_DATA, &obj_request->flags)) { 14726365d33aSAlex Elder struct rbd_device *rbd_dev; 14736365d33aSAlex Elder 147457acbaa7SAlex Elder rbd_dev = obj_request->img_request->rbd_dev; 14759584d508SIlya Dryomov rbd_warn(rbd_dev, "obj_request %p already marked img_data", 14766365d33aSAlex Elder obj_request); 14776365d33aSAlex Elder } 14786365d33aSAlex Elder } 14796365d33aSAlex Elder 14806365d33aSAlex Elder static bool obj_request_img_data_test(struct rbd_obj_request *obj_request) 14816365d33aSAlex Elder { 14826365d33aSAlex Elder smp_mb(); 14836365d33aSAlex Elder return test_bit(OBJ_REQ_IMG_DATA, &obj_request->flags) != 0; 14846365d33aSAlex Elder } 14856365d33aSAlex Elder 148657acbaa7SAlex Elder static void obj_request_done_set(struct rbd_obj_request *obj_request) 148757acbaa7SAlex Elder { 148857acbaa7SAlex Elder if (test_and_set_bit(OBJ_REQ_DONE, &obj_request->flags)) { 148957acbaa7SAlex Elder struct rbd_device *rbd_dev = NULL; 149057acbaa7SAlex Elder 149157acbaa7SAlex Elder if (obj_request_img_data_test(obj_request)) 149257acbaa7SAlex Elder rbd_dev = obj_request->img_request->rbd_dev; 14939584d508SIlya Dryomov rbd_warn(rbd_dev, "obj_request %p already marked done", 149457acbaa7SAlex Elder obj_request); 149557acbaa7SAlex Elder } 149657acbaa7SAlex Elder } 149757acbaa7SAlex Elder 149857acbaa7SAlex Elder static bool obj_request_done_test(struct rbd_obj_request *obj_request) 149957acbaa7SAlex Elder { 150057acbaa7SAlex Elder smp_mb(); 150157acbaa7SAlex Elder return test_bit(OBJ_REQ_DONE, &obj_request->flags) != 0; 150257acbaa7SAlex Elder } 150357acbaa7SAlex Elder 15045679c59fSAlex Elder /* 15055679c59fSAlex Elder * This sets the KNOWN flag after (possibly) setting the EXISTS 15065679c59fSAlex Elder * flag. The latter is set based on the "exists" value provided. 15075679c59fSAlex Elder * 15085679c59fSAlex Elder * Note that for our purposes once an object exists it never goes 15095679c59fSAlex Elder * away again. It's possible that the response from two existence 15105679c59fSAlex Elder * checks are separated by the creation of the target object, and 15115679c59fSAlex Elder * the first ("doesn't exist") response arrives *after* the second 15125679c59fSAlex Elder * ("does exist"). In that case we ignore the second one. 15135679c59fSAlex Elder */ 15145679c59fSAlex Elder static void obj_request_existence_set(struct rbd_obj_request *obj_request, 15155679c59fSAlex Elder bool exists) 15165679c59fSAlex Elder { 15175679c59fSAlex Elder if (exists) 15185679c59fSAlex Elder set_bit(OBJ_REQ_EXISTS, &obj_request->flags); 15195679c59fSAlex Elder set_bit(OBJ_REQ_KNOWN, &obj_request->flags); 15205679c59fSAlex Elder smp_mb(); 15215679c59fSAlex Elder } 15225679c59fSAlex Elder 15235679c59fSAlex Elder static bool obj_request_known_test(struct rbd_obj_request *obj_request) 15245679c59fSAlex Elder { 15255679c59fSAlex Elder smp_mb(); 15265679c59fSAlex Elder return test_bit(OBJ_REQ_KNOWN, &obj_request->flags) != 0; 15275679c59fSAlex Elder } 15285679c59fSAlex Elder 15295679c59fSAlex Elder static bool obj_request_exists_test(struct rbd_obj_request *obj_request) 15305679c59fSAlex Elder { 15315679c59fSAlex Elder smp_mb(); 15325679c59fSAlex Elder return test_bit(OBJ_REQ_EXISTS, &obj_request->flags) != 0; 15335679c59fSAlex Elder } 15345679c59fSAlex Elder 15359638556aSIlya Dryomov static bool obj_request_overlaps_parent(struct rbd_obj_request *obj_request) 15369638556aSIlya Dryomov { 15379638556aSIlya Dryomov struct rbd_device *rbd_dev = obj_request->img_request->rbd_dev; 15389638556aSIlya Dryomov 15399638556aSIlya Dryomov return obj_request->img_offset < 15409638556aSIlya Dryomov round_up(rbd_dev->parent_overlap, rbd_obj_bytes(&rbd_dev->header)); 15419638556aSIlya Dryomov } 15429638556aSIlya Dryomov 1543bf0d5f50SAlex Elder static void rbd_obj_request_get(struct rbd_obj_request *obj_request) 1544bf0d5f50SAlex Elder { 154537206ee5SAlex Elder dout("%s: obj %p (was %d)\n", __func__, obj_request, 154637206ee5SAlex Elder atomic_read(&obj_request->kref.refcount)); 1547bf0d5f50SAlex Elder kref_get(&obj_request->kref); 1548bf0d5f50SAlex Elder } 1549bf0d5f50SAlex Elder 1550bf0d5f50SAlex Elder static void rbd_obj_request_destroy(struct kref *kref); 1551bf0d5f50SAlex Elder static void rbd_obj_request_put(struct rbd_obj_request *obj_request) 1552bf0d5f50SAlex Elder { 1553bf0d5f50SAlex Elder rbd_assert(obj_request != NULL); 155437206ee5SAlex Elder dout("%s: obj %p (was %d)\n", __func__, obj_request, 155537206ee5SAlex Elder atomic_read(&obj_request->kref.refcount)); 1556bf0d5f50SAlex Elder kref_put(&obj_request->kref, rbd_obj_request_destroy); 1557bf0d5f50SAlex Elder } 1558bf0d5f50SAlex Elder 15590f2d5be7SAlex Elder static void rbd_img_request_get(struct rbd_img_request *img_request) 15600f2d5be7SAlex Elder { 15610f2d5be7SAlex Elder dout("%s: img %p (was %d)\n", __func__, img_request, 15620f2d5be7SAlex Elder atomic_read(&img_request->kref.refcount)); 15630f2d5be7SAlex Elder kref_get(&img_request->kref); 15640f2d5be7SAlex Elder } 15650f2d5be7SAlex Elder 1566e93f3152SAlex Elder static bool img_request_child_test(struct rbd_img_request *img_request); 1567e93f3152SAlex Elder static void rbd_parent_request_destroy(struct kref *kref); 1568bf0d5f50SAlex Elder static void rbd_img_request_destroy(struct kref *kref); 1569bf0d5f50SAlex Elder static void rbd_img_request_put(struct rbd_img_request *img_request) 1570bf0d5f50SAlex Elder { 1571bf0d5f50SAlex Elder rbd_assert(img_request != NULL); 157237206ee5SAlex Elder dout("%s: img %p (was %d)\n", __func__, img_request, 157337206ee5SAlex Elder atomic_read(&img_request->kref.refcount)); 1574e93f3152SAlex Elder if (img_request_child_test(img_request)) 1575e93f3152SAlex Elder kref_put(&img_request->kref, rbd_parent_request_destroy); 1576e93f3152SAlex Elder else 1577bf0d5f50SAlex Elder kref_put(&img_request->kref, rbd_img_request_destroy); 1578bf0d5f50SAlex Elder } 1579bf0d5f50SAlex Elder 1580bf0d5f50SAlex Elder static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request, 1581bf0d5f50SAlex Elder struct rbd_obj_request *obj_request) 1582bf0d5f50SAlex Elder { 158325dcf954SAlex Elder rbd_assert(obj_request->img_request == NULL); 158425dcf954SAlex Elder 1585b155e86cSAlex Elder /* Image request now owns object's original reference */ 1586bf0d5f50SAlex Elder obj_request->img_request = img_request; 158725dcf954SAlex Elder obj_request->which = img_request->obj_request_count; 15886365d33aSAlex Elder rbd_assert(!obj_request_img_data_test(obj_request)); 15896365d33aSAlex Elder obj_request_img_data_set(obj_request); 1590bf0d5f50SAlex Elder rbd_assert(obj_request->which != BAD_WHICH); 159125dcf954SAlex Elder img_request->obj_request_count++; 159225dcf954SAlex Elder list_add_tail(&obj_request->links, &img_request->obj_requests); 159337206ee5SAlex Elder dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request, 159437206ee5SAlex Elder obj_request->which); 1595bf0d5f50SAlex Elder } 1596bf0d5f50SAlex Elder 1597bf0d5f50SAlex Elder static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request, 1598bf0d5f50SAlex Elder struct rbd_obj_request *obj_request) 1599bf0d5f50SAlex Elder { 1600bf0d5f50SAlex Elder rbd_assert(obj_request->which != BAD_WHICH); 160125dcf954SAlex Elder 160237206ee5SAlex Elder dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request, 160337206ee5SAlex Elder obj_request->which); 1604bf0d5f50SAlex Elder list_del(&obj_request->links); 160525dcf954SAlex Elder rbd_assert(img_request->obj_request_count > 0); 160625dcf954SAlex Elder img_request->obj_request_count--; 160725dcf954SAlex Elder rbd_assert(obj_request->which == img_request->obj_request_count); 160825dcf954SAlex Elder obj_request->which = BAD_WHICH; 16096365d33aSAlex Elder rbd_assert(obj_request_img_data_test(obj_request)); 1610bf0d5f50SAlex Elder rbd_assert(obj_request->img_request == img_request); 1611bf0d5f50SAlex Elder obj_request->img_request = NULL; 161225dcf954SAlex Elder obj_request->callback = NULL; 1613bf0d5f50SAlex Elder rbd_obj_request_put(obj_request); 1614bf0d5f50SAlex Elder } 1615bf0d5f50SAlex Elder 1616bf0d5f50SAlex Elder static bool obj_request_type_valid(enum obj_request_type type) 1617bf0d5f50SAlex Elder { 1618bf0d5f50SAlex Elder switch (type) { 16199969ebc5SAlex Elder case OBJ_REQUEST_NODATA: 1620bf0d5f50SAlex Elder case OBJ_REQUEST_BIO: 1621788e2df3SAlex Elder case OBJ_REQUEST_PAGES: 1622bf0d5f50SAlex Elder return true; 1623bf0d5f50SAlex Elder default: 1624bf0d5f50SAlex Elder return false; 1625bf0d5f50SAlex Elder } 1626bf0d5f50SAlex Elder } 1627bf0d5f50SAlex Elder 16284a17dadcSIlya Dryomov static void rbd_img_obj_callback(struct rbd_obj_request *obj_request); 16294a17dadcSIlya Dryomov 1630980917fcSIlya Dryomov static void rbd_obj_request_submit(struct rbd_obj_request *obj_request) 1631bf0d5f50SAlex Elder { 1632980917fcSIlya Dryomov struct ceph_osd_request *osd_req = obj_request->osd_req; 1633980917fcSIlya Dryomov 163467e2b652SIlya Dryomov dout("%s %p \"%s\" %llu~%llu osd_req %p\n", __func__, 163567e2b652SIlya Dryomov obj_request, obj_request->object_name, obj_request->offset, 163667e2b652SIlya Dryomov obj_request->length, osd_req); 16374a17dadcSIlya Dryomov if (obj_request_img_data_test(obj_request)) { 16384a17dadcSIlya Dryomov WARN_ON(obj_request->callback != rbd_img_obj_callback); 16394a17dadcSIlya Dryomov rbd_img_request_get(obj_request->img_request); 16404a17dadcSIlya Dryomov } 1641980917fcSIlya Dryomov ceph_osdc_start_request(osd_req->r_osdc, osd_req, false); 1642bf0d5f50SAlex Elder } 1643bf0d5f50SAlex Elder 1644bf0d5f50SAlex Elder static void rbd_img_request_complete(struct rbd_img_request *img_request) 1645bf0d5f50SAlex Elder { 164655f27e09SAlex Elder 164737206ee5SAlex Elder dout("%s: img %p\n", __func__, img_request); 164855f27e09SAlex Elder 164955f27e09SAlex Elder /* 165055f27e09SAlex Elder * If no error occurred, compute the aggregate transfer 165155f27e09SAlex Elder * count for the image request. We could instead use 165255f27e09SAlex Elder * atomic64_cmpxchg() to update it as each object request 165355f27e09SAlex Elder * completes; not clear which way is better off hand. 165455f27e09SAlex Elder */ 165555f27e09SAlex Elder if (!img_request->result) { 165655f27e09SAlex Elder struct rbd_obj_request *obj_request; 165755f27e09SAlex Elder u64 xferred = 0; 165855f27e09SAlex Elder 165955f27e09SAlex Elder for_each_obj_request(img_request, obj_request) 166055f27e09SAlex Elder xferred += obj_request->xferred; 166155f27e09SAlex Elder img_request->xferred = xferred; 166255f27e09SAlex Elder } 166355f27e09SAlex Elder 1664bf0d5f50SAlex Elder if (img_request->callback) 1665bf0d5f50SAlex Elder img_request->callback(img_request); 1666bf0d5f50SAlex Elder else 1667bf0d5f50SAlex Elder rbd_img_request_put(img_request); 1668bf0d5f50SAlex Elder } 1669bf0d5f50SAlex Elder 16700c425248SAlex Elder /* 16710c425248SAlex Elder * The default/initial value for all image request flags is 0. Each 16720c425248SAlex Elder * is conditionally set to 1 at image request initialization time 16730c425248SAlex Elder * and currently never change thereafter. 16740c425248SAlex Elder */ 16750c425248SAlex Elder static void img_request_write_set(struct rbd_img_request *img_request) 16760c425248SAlex Elder { 16770c425248SAlex Elder set_bit(IMG_REQ_WRITE, &img_request->flags); 16780c425248SAlex Elder smp_mb(); 16790c425248SAlex Elder } 16800c425248SAlex Elder 16810c425248SAlex Elder static bool img_request_write_test(struct rbd_img_request *img_request) 16820c425248SAlex Elder { 16830c425248SAlex Elder smp_mb(); 16840c425248SAlex Elder return test_bit(IMG_REQ_WRITE, &img_request->flags) != 0; 16850c425248SAlex Elder } 16860c425248SAlex Elder 168790e98c52SGuangliang Zhao /* 168890e98c52SGuangliang Zhao * Set the discard flag when the img_request is an discard request 168990e98c52SGuangliang Zhao */ 169090e98c52SGuangliang Zhao static void img_request_discard_set(struct rbd_img_request *img_request) 169190e98c52SGuangliang Zhao { 169290e98c52SGuangliang Zhao set_bit(IMG_REQ_DISCARD, &img_request->flags); 169390e98c52SGuangliang Zhao smp_mb(); 169490e98c52SGuangliang Zhao } 169590e98c52SGuangliang Zhao 169690e98c52SGuangliang Zhao static bool img_request_discard_test(struct rbd_img_request *img_request) 169790e98c52SGuangliang Zhao { 169890e98c52SGuangliang Zhao smp_mb(); 169990e98c52SGuangliang Zhao return test_bit(IMG_REQ_DISCARD, &img_request->flags) != 0; 170090e98c52SGuangliang Zhao } 170190e98c52SGuangliang Zhao 17029849e986SAlex Elder static void img_request_child_set(struct rbd_img_request *img_request) 17039849e986SAlex Elder { 17049849e986SAlex Elder set_bit(IMG_REQ_CHILD, &img_request->flags); 17059849e986SAlex Elder smp_mb(); 17069849e986SAlex Elder } 17079849e986SAlex Elder 1708e93f3152SAlex Elder static void img_request_child_clear(struct rbd_img_request *img_request) 1709e93f3152SAlex Elder { 1710e93f3152SAlex Elder clear_bit(IMG_REQ_CHILD, &img_request->flags); 1711e93f3152SAlex Elder smp_mb(); 1712e93f3152SAlex Elder } 1713e93f3152SAlex Elder 17149849e986SAlex Elder static bool img_request_child_test(struct rbd_img_request *img_request) 17159849e986SAlex Elder { 17169849e986SAlex Elder smp_mb(); 17179849e986SAlex Elder return test_bit(IMG_REQ_CHILD, &img_request->flags) != 0; 17189849e986SAlex Elder } 17199849e986SAlex Elder 1720d0b2e944SAlex Elder static void img_request_layered_set(struct rbd_img_request *img_request) 1721d0b2e944SAlex Elder { 1722d0b2e944SAlex Elder set_bit(IMG_REQ_LAYERED, &img_request->flags); 1723d0b2e944SAlex Elder smp_mb(); 1724d0b2e944SAlex Elder } 1725d0b2e944SAlex Elder 1726a2acd00eSAlex Elder static void img_request_layered_clear(struct rbd_img_request *img_request) 1727a2acd00eSAlex Elder { 1728a2acd00eSAlex Elder clear_bit(IMG_REQ_LAYERED, &img_request->flags); 1729a2acd00eSAlex Elder smp_mb(); 1730a2acd00eSAlex Elder } 1731a2acd00eSAlex Elder 1732d0b2e944SAlex Elder static bool img_request_layered_test(struct rbd_img_request *img_request) 1733d0b2e944SAlex Elder { 1734d0b2e944SAlex Elder smp_mb(); 1735d0b2e944SAlex Elder return test_bit(IMG_REQ_LAYERED, &img_request->flags) != 0; 1736d0b2e944SAlex Elder } 1737d0b2e944SAlex Elder 17383b434a2aSJosh Durgin static enum obj_operation_type 17393b434a2aSJosh Durgin rbd_img_request_op_type(struct rbd_img_request *img_request) 17403b434a2aSJosh Durgin { 17413b434a2aSJosh Durgin if (img_request_write_test(img_request)) 17423b434a2aSJosh Durgin return OBJ_OP_WRITE; 17433b434a2aSJosh Durgin else if (img_request_discard_test(img_request)) 17443b434a2aSJosh Durgin return OBJ_OP_DISCARD; 17453b434a2aSJosh Durgin else 17463b434a2aSJosh Durgin return OBJ_OP_READ; 17473b434a2aSJosh Durgin } 17483b434a2aSJosh Durgin 17496e2a4505SAlex Elder static void 17506e2a4505SAlex Elder rbd_img_obj_request_read_callback(struct rbd_obj_request *obj_request) 17516e2a4505SAlex Elder { 1752b9434c5bSAlex Elder u64 xferred = obj_request->xferred; 1753b9434c5bSAlex Elder u64 length = obj_request->length; 1754b9434c5bSAlex Elder 17556e2a4505SAlex Elder dout("%s: obj %p img %p result %d %llu/%llu\n", __func__, 17566e2a4505SAlex Elder obj_request, obj_request->img_request, obj_request->result, 1757b9434c5bSAlex Elder xferred, length); 17586e2a4505SAlex Elder /* 175917c1cc1dSJosh Durgin * ENOENT means a hole in the image. We zero-fill the entire 176017c1cc1dSJosh Durgin * length of the request. A short read also implies zero-fill 176117c1cc1dSJosh Durgin * to the end of the request. An error requires the whole 176217c1cc1dSJosh Durgin * length of the request to be reported finished with an error 176317c1cc1dSJosh Durgin * to the block layer. In each case we update the xferred 176417c1cc1dSJosh Durgin * count to indicate the whole request was satisfied. 17656e2a4505SAlex Elder */ 1766b9434c5bSAlex Elder rbd_assert(obj_request->type != OBJ_REQUEST_NODATA); 17676e2a4505SAlex Elder if (obj_request->result == -ENOENT) { 1768b9434c5bSAlex Elder if (obj_request->type == OBJ_REQUEST_BIO) 17696e2a4505SAlex Elder zero_bio_chain(obj_request->bio_list, 0); 1770b9434c5bSAlex Elder else 1771b9434c5bSAlex Elder zero_pages(obj_request->pages, 0, length); 17726e2a4505SAlex Elder obj_request->result = 0; 1773b9434c5bSAlex Elder } else if (xferred < length && !obj_request->result) { 1774b9434c5bSAlex Elder if (obj_request->type == OBJ_REQUEST_BIO) 1775b9434c5bSAlex Elder zero_bio_chain(obj_request->bio_list, xferred); 1776b9434c5bSAlex Elder else 1777b9434c5bSAlex Elder zero_pages(obj_request->pages, xferred, length); 17786e2a4505SAlex Elder } 177917c1cc1dSJosh Durgin obj_request->xferred = length; 17806e2a4505SAlex Elder obj_request_done_set(obj_request); 17816e2a4505SAlex Elder } 17826e2a4505SAlex Elder 1783bf0d5f50SAlex Elder static void rbd_obj_request_complete(struct rbd_obj_request *obj_request) 1784bf0d5f50SAlex Elder { 178537206ee5SAlex Elder dout("%s: obj %p cb %p\n", __func__, obj_request, 178637206ee5SAlex Elder obj_request->callback); 1787bf0d5f50SAlex Elder if (obj_request->callback) 1788bf0d5f50SAlex Elder obj_request->callback(obj_request); 1789788e2df3SAlex Elder else 1790788e2df3SAlex Elder complete_all(&obj_request->completion); 1791bf0d5f50SAlex Elder } 1792bf0d5f50SAlex Elder 17930dcc685eSIlya Dryomov static void rbd_obj_request_error(struct rbd_obj_request *obj_request, int err) 17940dcc685eSIlya Dryomov { 17950dcc685eSIlya Dryomov obj_request->result = err; 17960dcc685eSIlya Dryomov obj_request->xferred = 0; 17970dcc685eSIlya Dryomov /* 17980dcc685eSIlya Dryomov * kludge - mirror rbd_obj_request_submit() to match a put in 17990dcc685eSIlya Dryomov * rbd_img_obj_callback() 18000dcc685eSIlya Dryomov */ 18010dcc685eSIlya Dryomov if (obj_request_img_data_test(obj_request)) { 18020dcc685eSIlya Dryomov WARN_ON(obj_request->callback != rbd_img_obj_callback); 18030dcc685eSIlya Dryomov rbd_img_request_get(obj_request->img_request); 18040dcc685eSIlya Dryomov } 18050dcc685eSIlya Dryomov obj_request_done_set(obj_request); 18060dcc685eSIlya Dryomov rbd_obj_request_complete(obj_request); 18070dcc685eSIlya Dryomov } 18080dcc685eSIlya Dryomov 1809c47f9371SAlex Elder static void rbd_osd_read_callback(struct rbd_obj_request *obj_request) 1810bf0d5f50SAlex Elder { 181157acbaa7SAlex Elder struct rbd_img_request *img_request = NULL; 1812a9e8ba2cSAlex Elder struct rbd_device *rbd_dev = NULL; 181357acbaa7SAlex Elder bool layered = false; 181457acbaa7SAlex Elder 181557acbaa7SAlex Elder if (obj_request_img_data_test(obj_request)) { 181657acbaa7SAlex Elder img_request = obj_request->img_request; 181757acbaa7SAlex Elder layered = img_request && img_request_layered_test(img_request); 1818a9e8ba2cSAlex Elder rbd_dev = img_request->rbd_dev; 181957acbaa7SAlex Elder } 18208b3e1a56SAlex Elder 18218b3e1a56SAlex Elder dout("%s: obj %p img %p result %d %llu/%llu\n", __func__, 18228b3e1a56SAlex Elder obj_request, img_request, obj_request->result, 18238b3e1a56SAlex Elder obj_request->xferred, obj_request->length); 1824a9e8ba2cSAlex Elder if (layered && obj_request->result == -ENOENT && 1825a9e8ba2cSAlex Elder obj_request->img_offset < rbd_dev->parent_overlap) 18268b3e1a56SAlex Elder rbd_img_parent_read(obj_request); 18278b3e1a56SAlex Elder else if (img_request) 18286e2a4505SAlex Elder rbd_img_obj_request_read_callback(obj_request); 18296e2a4505SAlex Elder else 183007741308SAlex Elder obj_request_done_set(obj_request); 1831bf0d5f50SAlex Elder } 1832bf0d5f50SAlex Elder 1833c47f9371SAlex Elder static void rbd_osd_write_callback(struct rbd_obj_request *obj_request) 1834bf0d5f50SAlex Elder { 18351b83bef2SSage Weil dout("%s: obj %p result %d %llu\n", __func__, obj_request, 18361b83bef2SSage Weil obj_request->result, obj_request->length); 18371b83bef2SSage Weil /* 18388b3e1a56SAlex Elder * There is no such thing as a successful short write. Set 18398b3e1a56SAlex Elder * it to our originally-requested length. 18401b83bef2SSage Weil */ 18411b83bef2SSage Weil obj_request->xferred = obj_request->length; 184207741308SAlex Elder obj_request_done_set(obj_request); 1843bf0d5f50SAlex Elder } 1844bf0d5f50SAlex Elder 184590e98c52SGuangliang Zhao static void rbd_osd_discard_callback(struct rbd_obj_request *obj_request) 184690e98c52SGuangliang Zhao { 184790e98c52SGuangliang Zhao dout("%s: obj %p result %d %llu\n", __func__, obj_request, 184890e98c52SGuangliang Zhao obj_request->result, obj_request->length); 184990e98c52SGuangliang Zhao /* 185090e98c52SGuangliang Zhao * There is no such thing as a successful short discard. Set 185190e98c52SGuangliang Zhao * it to our originally-requested length. 185290e98c52SGuangliang Zhao */ 185390e98c52SGuangliang Zhao obj_request->xferred = obj_request->length; 1854d0265de7SJosh Durgin /* discarding a non-existent object is not a problem */ 1855d0265de7SJosh Durgin if (obj_request->result == -ENOENT) 1856d0265de7SJosh Durgin obj_request->result = 0; 185790e98c52SGuangliang Zhao obj_request_done_set(obj_request); 185890e98c52SGuangliang Zhao } 185990e98c52SGuangliang Zhao 1860fbfab539SAlex Elder /* 1861fbfab539SAlex Elder * For a simple stat call there's nothing to do. We'll do more if 1862fbfab539SAlex Elder * this is part of a write sequence for a layered image. 1863fbfab539SAlex Elder */ 1864c47f9371SAlex Elder static void rbd_osd_stat_callback(struct rbd_obj_request *obj_request) 1865fbfab539SAlex Elder { 186637206ee5SAlex Elder dout("%s: obj %p\n", __func__, obj_request); 1867fbfab539SAlex Elder obj_request_done_set(obj_request); 1868fbfab539SAlex Elder } 1869fbfab539SAlex Elder 18702761713dSIlya Dryomov static void rbd_osd_call_callback(struct rbd_obj_request *obj_request) 18712761713dSIlya Dryomov { 18722761713dSIlya Dryomov dout("%s: obj %p\n", __func__, obj_request); 18732761713dSIlya Dryomov 18742761713dSIlya Dryomov if (obj_request_img_data_test(obj_request)) 18752761713dSIlya Dryomov rbd_osd_copyup_callback(obj_request); 18762761713dSIlya Dryomov else 18772761713dSIlya Dryomov obj_request_done_set(obj_request); 18782761713dSIlya Dryomov } 18792761713dSIlya Dryomov 188085e084feSIlya Dryomov static void rbd_osd_req_callback(struct ceph_osd_request *osd_req) 1881bf0d5f50SAlex Elder { 1882bf0d5f50SAlex Elder struct rbd_obj_request *obj_request = osd_req->r_priv; 1883bf0d5f50SAlex Elder u16 opcode; 1884bf0d5f50SAlex Elder 188585e084feSIlya Dryomov dout("%s: osd_req %p\n", __func__, osd_req); 1886bf0d5f50SAlex Elder rbd_assert(osd_req == obj_request->osd_req); 188757acbaa7SAlex Elder if (obj_request_img_data_test(obj_request)) { 188857acbaa7SAlex Elder rbd_assert(obj_request->img_request); 188957acbaa7SAlex Elder rbd_assert(obj_request->which != BAD_WHICH); 189057acbaa7SAlex Elder } else { 189157acbaa7SAlex Elder rbd_assert(obj_request->which == BAD_WHICH); 189257acbaa7SAlex Elder } 1893bf0d5f50SAlex Elder 18941b83bef2SSage Weil if (osd_req->r_result < 0) 18951b83bef2SSage Weil obj_request->result = osd_req->r_result; 1896bf0d5f50SAlex Elder 1897c47f9371SAlex Elder /* 1898c47f9371SAlex Elder * We support a 64-bit length, but ultimately it has to be 18997ad18afaSChristoph Hellwig * passed to the block layer, which just supports a 32-bit 19007ad18afaSChristoph Hellwig * length field. 1901c47f9371SAlex Elder */ 19027665d85bSYan, Zheng obj_request->xferred = osd_req->r_ops[0].outdata_len; 1903c47f9371SAlex Elder rbd_assert(obj_request->xferred < (u64)UINT_MAX); 19040ccd5926SIlya Dryomov 190579528734SAlex Elder opcode = osd_req->r_ops[0].op; 1906bf0d5f50SAlex Elder switch (opcode) { 1907bf0d5f50SAlex Elder case CEPH_OSD_OP_READ: 1908c47f9371SAlex Elder rbd_osd_read_callback(obj_request); 1909bf0d5f50SAlex Elder break; 19100ccd5926SIlya Dryomov case CEPH_OSD_OP_SETALLOCHINT: 1911e30b7577SIlya Dryomov rbd_assert(osd_req->r_ops[1].op == CEPH_OSD_OP_WRITE || 1912e30b7577SIlya Dryomov osd_req->r_ops[1].op == CEPH_OSD_OP_WRITEFULL); 19130ccd5926SIlya Dryomov /* fall through */ 1914bf0d5f50SAlex Elder case CEPH_OSD_OP_WRITE: 1915e30b7577SIlya Dryomov case CEPH_OSD_OP_WRITEFULL: 1916c47f9371SAlex Elder rbd_osd_write_callback(obj_request); 1917bf0d5f50SAlex Elder break; 1918fbfab539SAlex Elder case CEPH_OSD_OP_STAT: 1919c47f9371SAlex Elder rbd_osd_stat_callback(obj_request); 1920fbfab539SAlex Elder break; 192190e98c52SGuangliang Zhao case CEPH_OSD_OP_DELETE: 192290e98c52SGuangliang Zhao case CEPH_OSD_OP_TRUNCATE: 192390e98c52SGuangliang Zhao case CEPH_OSD_OP_ZERO: 192490e98c52SGuangliang Zhao rbd_osd_discard_callback(obj_request); 192590e98c52SGuangliang Zhao break; 192636be9a76SAlex Elder case CEPH_OSD_OP_CALL: 19272761713dSIlya Dryomov rbd_osd_call_callback(obj_request); 19282761713dSIlya Dryomov break; 1929bf0d5f50SAlex Elder default: 19309584d508SIlya Dryomov rbd_warn(NULL, "%s: unsupported op %hu", 1931bf0d5f50SAlex Elder obj_request->object_name, (unsigned short) opcode); 1932bf0d5f50SAlex Elder break; 1933bf0d5f50SAlex Elder } 1934bf0d5f50SAlex Elder 193507741308SAlex Elder if (obj_request_done_test(obj_request)) 1936bf0d5f50SAlex Elder rbd_obj_request_complete(obj_request); 1937bf0d5f50SAlex Elder } 1938bf0d5f50SAlex Elder 19399d4df01fSAlex Elder static void rbd_osd_req_format_read(struct rbd_obj_request *obj_request) 1940430c28c3SAlex Elder { 19418c042b0dSAlex Elder struct ceph_osd_request *osd_req = obj_request->osd_req; 1942430c28c3SAlex Elder 19437c84883aSIlya Dryomov rbd_assert(obj_request_img_data_test(obj_request)); 19447c84883aSIlya Dryomov osd_req->r_snapid = obj_request->img_request->snap_id; 19459d4df01fSAlex Elder } 19469d4df01fSAlex Elder 19479d4df01fSAlex Elder static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request) 19489d4df01fSAlex Elder { 19499d4df01fSAlex Elder struct ceph_osd_request *osd_req = obj_request->osd_req; 19509d4df01fSAlex Elder 1951bb873b53SIlya Dryomov osd_req->r_mtime = CURRENT_TIME; 1952bb873b53SIlya Dryomov osd_req->r_data_offset = obj_request->offset; 1953430c28c3SAlex Elder } 1954430c28c3SAlex Elder 19550ccd5926SIlya Dryomov /* 19560ccd5926SIlya Dryomov * Create an osd request. A read request has one osd op (read). 19570ccd5926SIlya Dryomov * A write request has either one (watch) or two (hint+write) osd ops. 19580ccd5926SIlya Dryomov * (All rbd data writes are prefixed with an allocation hint op, but 19590ccd5926SIlya Dryomov * technically osd watch is a write request, hence this distinction.) 19600ccd5926SIlya Dryomov */ 1961bf0d5f50SAlex Elder static struct ceph_osd_request *rbd_osd_req_create( 1962bf0d5f50SAlex Elder struct rbd_device *rbd_dev, 19636d2940c8SGuangliang Zhao enum obj_operation_type op_type, 1964deb236b3SIlya Dryomov unsigned int num_ops, 1965430c28c3SAlex Elder struct rbd_obj_request *obj_request) 1966bf0d5f50SAlex Elder { 1967bf0d5f50SAlex Elder struct ceph_snap_context *snapc = NULL; 1968bf0d5f50SAlex Elder struct ceph_osd_client *osdc; 1969bf0d5f50SAlex Elder struct ceph_osd_request *osd_req; 1970bf0d5f50SAlex Elder 197190e98c52SGuangliang Zhao if (obj_request_img_data_test(obj_request) && 197290e98c52SGuangliang Zhao (op_type == OBJ_OP_DISCARD || op_type == OBJ_OP_WRITE)) { 19736365d33aSAlex Elder struct rbd_img_request *img_request = obj_request->img_request; 197490e98c52SGuangliang Zhao if (op_type == OBJ_OP_WRITE) { 19756d2940c8SGuangliang Zhao rbd_assert(img_request_write_test(img_request)); 197690e98c52SGuangliang Zhao } else { 197790e98c52SGuangliang Zhao rbd_assert(img_request_discard_test(img_request)); 197890e98c52SGuangliang Zhao } 1979bf0d5f50SAlex Elder snapc = img_request->snapc; 1980bf0d5f50SAlex Elder } 1981bf0d5f50SAlex Elder 19826d2940c8SGuangliang Zhao rbd_assert(num_ops == 1 || ((op_type == OBJ_OP_WRITE) && num_ops == 2)); 1983deb236b3SIlya Dryomov 1984deb236b3SIlya Dryomov /* Allocate and initialize the request, for the num_ops ops */ 1985bf0d5f50SAlex Elder 1986bf0d5f50SAlex Elder osdc = &rbd_dev->rbd_client->client->osdc; 1987deb236b3SIlya Dryomov osd_req = ceph_osdc_alloc_request(osdc, snapc, num_ops, false, 19882224d879SDavid Disseldorp GFP_NOIO); 1989bf0d5f50SAlex Elder if (!osd_req) 199013d1ad16SIlya Dryomov goto fail; 1991bf0d5f50SAlex Elder 199290e98c52SGuangliang Zhao if (op_type == OBJ_OP_WRITE || op_type == OBJ_OP_DISCARD) 1993bf0d5f50SAlex Elder osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK; 1994430c28c3SAlex Elder else 1995bf0d5f50SAlex Elder osd_req->r_flags = CEPH_OSD_FLAG_READ; 1996bf0d5f50SAlex Elder 1997bf0d5f50SAlex Elder osd_req->r_callback = rbd_osd_req_callback; 1998bf0d5f50SAlex Elder osd_req->r_priv = obj_request; 1999bf0d5f50SAlex Elder 20007627151eSYan, Zheng osd_req->r_base_oloc.pool = rbd_dev->layout.pool_id; 2001d30291b9SIlya Dryomov if (ceph_oid_aprintf(&osd_req->r_base_oid, GFP_NOIO, "%s", 2002d30291b9SIlya Dryomov obj_request->object_name)) 2003d30291b9SIlya Dryomov goto fail; 2004bf0d5f50SAlex Elder 200513d1ad16SIlya Dryomov if (ceph_osdc_alloc_messages(osd_req, GFP_NOIO)) 200613d1ad16SIlya Dryomov goto fail; 200713d1ad16SIlya Dryomov 2008bf0d5f50SAlex Elder return osd_req; 200913d1ad16SIlya Dryomov 201013d1ad16SIlya Dryomov fail: 201113d1ad16SIlya Dryomov ceph_osdc_put_request(osd_req); 201213d1ad16SIlya Dryomov return NULL; 2013bf0d5f50SAlex Elder } 2014bf0d5f50SAlex Elder 20150eefd470SAlex Elder /* 2016d3246fb0SJosh Durgin * Create a copyup osd request based on the information in the object 2017d3246fb0SJosh Durgin * request supplied. A copyup request has two or three osd ops, a 2018d3246fb0SJosh Durgin * copyup method call, potentially a hint op, and a write or truncate 2019d3246fb0SJosh Durgin * or zero op. 20200eefd470SAlex Elder */ 20210eefd470SAlex Elder static struct ceph_osd_request * 20220eefd470SAlex Elder rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request) 20230eefd470SAlex Elder { 20240eefd470SAlex Elder struct rbd_img_request *img_request; 20250eefd470SAlex Elder struct ceph_snap_context *snapc; 20260eefd470SAlex Elder struct rbd_device *rbd_dev; 20270eefd470SAlex Elder struct ceph_osd_client *osdc; 20280eefd470SAlex Elder struct ceph_osd_request *osd_req; 2029d3246fb0SJosh Durgin int num_osd_ops = 3; 20300eefd470SAlex Elder 20310eefd470SAlex Elder rbd_assert(obj_request_img_data_test(obj_request)); 20320eefd470SAlex Elder img_request = obj_request->img_request; 20330eefd470SAlex Elder rbd_assert(img_request); 2034d3246fb0SJosh Durgin rbd_assert(img_request_write_test(img_request) || 2035d3246fb0SJosh Durgin img_request_discard_test(img_request)); 20360eefd470SAlex Elder 2037d3246fb0SJosh Durgin if (img_request_discard_test(img_request)) 2038d3246fb0SJosh Durgin num_osd_ops = 2; 2039d3246fb0SJosh Durgin 2040d3246fb0SJosh Durgin /* Allocate and initialize the request, for all the ops */ 20410eefd470SAlex Elder 20420eefd470SAlex Elder snapc = img_request->snapc; 20430eefd470SAlex Elder rbd_dev = img_request->rbd_dev; 20440eefd470SAlex Elder osdc = &rbd_dev->rbd_client->client->osdc; 2045d3246fb0SJosh Durgin osd_req = ceph_osdc_alloc_request(osdc, snapc, num_osd_ops, 20462224d879SDavid Disseldorp false, GFP_NOIO); 20470eefd470SAlex Elder if (!osd_req) 204813d1ad16SIlya Dryomov goto fail; 20490eefd470SAlex Elder 20500eefd470SAlex Elder osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK; 20510eefd470SAlex Elder osd_req->r_callback = rbd_osd_req_callback; 20520eefd470SAlex Elder osd_req->r_priv = obj_request; 20530eefd470SAlex Elder 20547627151eSYan, Zheng osd_req->r_base_oloc.pool = rbd_dev->layout.pool_id; 2055d30291b9SIlya Dryomov if (ceph_oid_aprintf(&osd_req->r_base_oid, GFP_NOIO, "%s", 2056d30291b9SIlya Dryomov obj_request->object_name)) 2057d30291b9SIlya Dryomov goto fail; 20580eefd470SAlex Elder 205913d1ad16SIlya Dryomov if (ceph_osdc_alloc_messages(osd_req, GFP_NOIO)) 206013d1ad16SIlya Dryomov goto fail; 206113d1ad16SIlya Dryomov 20620eefd470SAlex Elder return osd_req; 206313d1ad16SIlya Dryomov 206413d1ad16SIlya Dryomov fail: 206513d1ad16SIlya Dryomov ceph_osdc_put_request(osd_req); 206613d1ad16SIlya Dryomov return NULL; 20670eefd470SAlex Elder } 20680eefd470SAlex Elder 20690eefd470SAlex Elder 2070bf0d5f50SAlex Elder static void rbd_osd_req_destroy(struct ceph_osd_request *osd_req) 2071bf0d5f50SAlex Elder { 2072bf0d5f50SAlex Elder ceph_osdc_put_request(osd_req); 2073bf0d5f50SAlex Elder } 2074bf0d5f50SAlex Elder 2075bf0d5f50SAlex Elder /* object_name is assumed to be a non-null pointer and NUL-terminated */ 2076bf0d5f50SAlex Elder 2077bf0d5f50SAlex Elder static struct rbd_obj_request *rbd_obj_request_create(const char *object_name, 2078bf0d5f50SAlex Elder enum obj_request_type type) 2079bf0d5f50SAlex Elder { 2080bf0d5f50SAlex Elder struct rbd_obj_request *obj_request; 2081bf0d5f50SAlex Elder size_t size; 2082bf0d5f50SAlex Elder char *name; 2083bf0d5f50SAlex Elder 2084bf0d5f50SAlex Elder rbd_assert(obj_request_type_valid(type)); 2085bf0d5f50SAlex Elder 2086bf0d5f50SAlex Elder size = strlen(object_name) + 1; 20875a60e876SIlya Dryomov name = kmalloc(size, GFP_NOIO); 2088f907ad55SAlex Elder if (!name) 2089bf0d5f50SAlex Elder return NULL; 2090bf0d5f50SAlex Elder 20915a60e876SIlya Dryomov obj_request = kmem_cache_zalloc(rbd_obj_request_cache, GFP_NOIO); 2092f907ad55SAlex Elder if (!obj_request) { 2093f907ad55SAlex Elder kfree(name); 2094f907ad55SAlex Elder return NULL; 2095f907ad55SAlex Elder } 2096f907ad55SAlex Elder 2097bf0d5f50SAlex Elder obj_request->object_name = memcpy(name, object_name, size); 2098bf0d5f50SAlex Elder obj_request->which = BAD_WHICH; 2099bf0d5f50SAlex Elder obj_request->type = type; 2100bf0d5f50SAlex Elder INIT_LIST_HEAD(&obj_request->links); 2101788e2df3SAlex Elder init_completion(&obj_request->completion); 2102bf0d5f50SAlex Elder kref_init(&obj_request->kref); 2103bf0d5f50SAlex Elder 210467e2b652SIlya Dryomov dout("%s %p\n", __func__, obj_request); 2105bf0d5f50SAlex Elder return obj_request; 2106bf0d5f50SAlex Elder } 2107bf0d5f50SAlex Elder 2108bf0d5f50SAlex Elder static void rbd_obj_request_destroy(struct kref *kref) 2109bf0d5f50SAlex Elder { 2110bf0d5f50SAlex Elder struct rbd_obj_request *obj_request; 2111bf0d5f50SAlex Elder 2112bf0d5f50SAlex Elder obj_request = container_of(kref, struct rbd_obj_request, kref); 2113bf0d5f50SAlex Elder 211437206ee5SAlex Elder dout("%s: obj %p\n", __func__, obj_request); 211537206ee5SAlex Elder 2116bf0d5f50SAlex Elder rbd_assert(obj_request->img_request == NULL); 2117bf0d5f50SAlex Elder rbd_assert(obj_request->which == BAD_WHICH); 2118bf0d5f50SAlex Elder 2119bf0d5f50SAlex Elder if (obj_request->osd_req) 2120bf0d5f50SAlex Elder rbd_osd_req_destroy(obj_request->osd_req); 2121bf0d5f50SAlex Elder 2122bf0d5f50SAlex Elder rbd_assert(obj_request_type_valid(obj_request->type)); 2123bf0d5f50SAlex Elder switch (obj_request->type) { 21249969ebc5SAlex Elder case OBJ_REQUEST_NODATA: 21259969ebc5SAlex Elder break; /* Nothing to do */ 2126bf0d5f50SAlex Elder case OBJ_REQUEST_BIO: 2127bf0d5f50SAlex Elder if (obj_request->bio_list) 2128bf0d5f50SAlex Elder bio_chain_put(obj_request->bio_list); 2129bf0d5f50SAlex Elder break; 2130788e2df3SAlex Elder case OBJ_REQUEST_PAGES: 213104dc923cSIlya Dryomov /* img_data requests don't own their page array */ 213204dc923cSIlya Dryomov if (obj_request->pages && 213304dc923cSIlya Dryomov !obj_request_img_data_test(obj_request)) 2134788e2df3SAlex Elder ceph_release_page_vector(obj_request->pages, 2135788e2df3SAlex Elder obj_request->page_count); 2136788e2df3SAlex Elder break; 2137bf0d5f50SAlex Elder } 2138bf0d5f50SAlex Elder 2139f907ad55SAlex Elder kfree(obj_request->object_name); 2140868311b1SAlex Elder obj_request->object_name = NULL; 2141868311b1SAlex Elder kmem_cache_free(rbd_obj_request_cache, obj_request); 2142bf0d5f50SAlex Elder } 2143bf0d5f50SAlex Elder 2144fb65d228SAlex Elder /* It's OK to call this for a device with no parent */ 2145fb65d228SAlex Elder 2146fb65d228SAlex Elder static void rbd_spec_put(struct rbd_spec *spec); 2147fb65d228SAlex Elder static void rbd_dev_unparent(struct rbd_device *rbd_dev) 2148fb65d228SAlex Elder { 2149fb65d228SAlex Elder rbd_dev_remove_parent(rbd_dev); 2150fb65d228SAlex Elder rbd_spec_put(rbd_dev->parent_spec); 2151fb65d228SAlex Elder rbd_dev->parent_spec = NULL; 2152fb65d228SAlex Elder rbd_dev->parent_overlap = 0; 2153fb65d228SAlex Elder } 2154fb65d228SAlex Elder 2155bf0d5f50SAlex Elder /* 2156a2acd00eSAlex Elder * Parent image reference counting is used to determine when an 2157a2acd00eSAlex Elder * image's parent fields can be safely torn down--after there are no 2158a2acd00eSAlex Elder * more in-flight requests to the parent image. When the last 2159a2acd00eSAlex Elder * reference is dropped, cleaning them up is safe. 2160a2acd00eSAlex Elder */ 2161a2acd00eSAlex Elder static void rbd_dev_parent_put(struct rbd_device *rbd_dev) 2162a2acd00eSAlex Elder { 2163a2acd00eSAlex Elder int counter; 2164a2acd00eSAlex Elder 2165a2acd00eSAlex Elder if (!rbd_dev->parent_spec) 2166a2acd00eSAlex Elder return; 2167a2acd00eSAlex Elder 2168a2acd00eSAlex Elder counter = atomic_dec_return_safe(&rbd_dev->parent_ref); 2169a2acd00eSAlex Elder if (counter > 0) 2170a2acd00eSAlex Elder return; 2171a2acd00eSAlex Elder 2172a2acd00eSAlex Elder /* Last reference; clean up parent data structures */ 2173a2acd00eSAlex Elder 2174a2acd00eSAlex Elder if (!counter) 2175a2acd00eSAlex Elder rbd_dev_unparent(rbd_dev); 2176a2acd00eSAlex Elder else 21779584d508SIlya Dryomov rbd_warn(rbd_dev, "parent reference underflow"); 2178a2acd00eSAlex Elder } 2179a2acd00eSAlex Elder 2180a2acd00eSAlex Elder /* 2181a2acd00eSAlex Elder * If an image has a non-zero parent overlap, get a reference to its 2182a2acd00eSAlex Elder * parent. 2183a2acd00eSAlex Elder * 2184a2acd00eSAlex Elder * Returns true if the rbd device has a parent with a non-zero 2185a2acd00eSAlex Elder * overlap and a reference for it was successfully taken, or 2186a2acd00eSAlex Elder * false otherwise. 2187a2acd00eSAlex Elder */ 2188a2acd00eSAlex Elder static bool rbd_dev_parent_get(struct rbd_device *rbd_dev) 2189a2acd00eSAlex Elder { 2190ae43e9d0SIlya Dryomov int counter = 0; 2191a2acd00eSAlex Elder 2192a2acd00eSAlex Elder if (!rbd_dev->parent_spec) 2193a2acd00eSAlex Elder return false; 2194a2acd00eSAlex Elder 2195ae43e9d0SIlya Dryomov down_read(&rbd_dev->header_rwsem); 2196ae43e9d0SIlya Dryomov if (rbd_dev->parent_overlap) 2197a2acd00eSAlex Elder counter = atomic_inc_return_safe(&rbd_dev->parent_ref); 2198ae43e9d0SIlya Dryomov up_read(&rbd_dev->header_rwsem); 2199a2acd00eSAlex Elder 2200a2acd00eSAlex Elder if (counter < 0) 22019584d508SIlya Dryomov rbd_warn(rbd_dev, "parent reference overflow"); 2202a2acd00eSAlex Elder 2203ae43e9d0SIlya Dryomov return counter > 0; 2204a2acd00eSAlex Elder } 2205a2acd00eSAlex Elder 2206bf0d5f50SAlex Elder /* 2207bf0d5f50SAlex Elder * Caller is responsible for filling in the list of object requests 2208bf0d5f50SAlex Elder * that comprises the image request, and the Linux request pointer 2209bf0d5f50SAlex Elder * (if there is one). 2210bf0d5f50SAlex Elder */ 2211cc344fa1SAlex Elder static struct rbd_img_request *rbd_img_request_create( 2212cc344fa1SAlex Elder struct rbd_device *rbd_dev, 2213bf0d5f50SAlex Elder u64 offset, u64 length, 22146d2940c8SGuangliang Zhao enum obj_operation_type op_type, 22154e752f0aSJosh Durgin struct ceph_snap_context *snapc) 2216bf0d5f50SAlex Elder { 2217bf0d5f50SAlex Elder struct rbd_img_request *img_request; 2218bf0d5f50SAlex Elder 22197a716aacSIlya Dryomov img_request = kmem_cache_alloc(rbd_img_request_cache, GFP_NOIO); 2220bf0d5f50SAlex Elder if (!img_request) 2221bf0d5f50SAlex Elder return NULL; 2222bf0d5f50SAlex Elder 2223bf0d5f50SAlex Elder img_request->rq = NULL; 2224bf0d5f50SAlex Elder img_request->rbd_dev = rbd_dev; 2225bf0d5f50SAlex Elder img_request->offset = offset; 2226bf0d5f50SAlex Elder img_request->length = length; 22270c425248SAlex Elder img_request->flags = 0; 222890e98c52SGuangliang Zhao if (op_type == OBJ_OP_DISCARD) { 222990e98c52SGuangliang Zhao img_request_discard_set(img_request); 223090e98c52SGuangliang Zhao img_request->snapc = snapc; 223190e98c52SGuangliang Zhao } else if (op_type == OBJ_OP_WRITE) { 22320c425248SAlex Elder img_request_write_set(img_request); 22334e752f0aSJosh Durgin img_request->snapc = snapc; 22340c425248SAlex Elder } else { 2235bf0d5f50SAlex Elder img_request->snap_id = rbd_dev->spec->snap_id; 22360c425248SAlex Elder } 2237a2acd00eSAlex Elder if (rbd_dev_parent_get(rbd_dev)) 2238d0b2e944SAlex Elder img_request_layered_set(img_request); 2239bf0d5f50SAlex Elder spin_lock_init(&img_request->completion_lock); 2240bf0d5f50SAlex Elder img_request->next_completion = 0; 2241bf0d5f50SAlex Elder img_request->callback = NULL; 2242a5a337d4SAlex Elder img_request->result = 0; 2243bf0d5f50SAlex Elder img_request->obj_request_count = 0; 2244bf0d5f50SAlex Elder INIT_LIST_HEAD(&img_request->obj_requests); 2245bf0d5f50SAlex Elder kref_init(&img_request->kref); 2246bf0d5f50SAlex Elder 224737206ee5SAlex Elder dout("%s: rbd_dev %p %s %llu/%llu -> img %p\n", __func__, rbd_dev, 22486d2940c8SGuangliang Zhao obj_op_name(op_type), offset, length, img_request); 224937206ee5SAlex Elder 2250bf0d5f50SAlex Elder return img_request; 2251bf0d5f50SAlex Elder } 2252bf0d5f50SAlex Elder 2253bf0d5f50SAlex Elder static void rbd_img_request_destroy(struct kref *kref) 2254bf0d5f50SAlex Elder { 2255bf0d5f50SAlex Elder struct rbd_img_request *img_request; 2256bf0d5f50SAlex Elder struct rbd_obj_request *obj_request; 2257bf0d5f50SAlex Elder struct rbd_obj_request *next_obj_request; 2258bf0d5f50SAlex Elder 2259bf0d5f50SAlex Elder img_request = container_of(kref, struct rbd_img_request, kref); 2260bf0d5f50SAlex Elder 226137206ee5SAlex Elder dout("%s: img %p\n", __func__, img_request); 226237206ee5SAlex Elder 2263bf0d5f50SAlex Elder for_each_obj_request_safe(img_request, obj_request, next_obj_request) 2264bf0d5f50SAlex Elder rbd_img_obj_request_del(img_request, obj_request); 226525dcf954SAlex Elder rbd_assert(img_request->obj_request_count == 0); 2266bf0d5f50SAlex Elder 2267a2acd00eSAlex Elder if (img_request_layered_test(img_request)) { 2268a2acd00eSAlex Elder img_request_layered_clear(img_request); 2269a2acd00eSAlex Elder rbd_dev_parent_put(img_request->rbd_dev); 2270a2acd00eSAlex Elder } 2271a2acd00eSAlex Elder 2272bef95455SJosh Durgin if (img_request_write_test(img_request) || 2273bef95455SJosh Durgin img_request_discard_test(img_request)) 2274812164f8SAlex Elder ceph_put_snap_context(img_request->snapc); 2275bf0d5f50SAlex Elder 22761c2a9dfeSAlex Elder kmem_cache_free(rbd_img_request_cache, img_request); 2277bf0d5f50SAlex Elder } 2278bf0d5f50SAlex Elder 2279e93f3152SAlex Elder static struct rbd_img_request *rbd_parent_request_create( 2280e93f3152SAlex Elder struct rbd_obj_request *obj_request, 2281e93f3152SAlex Elder u64 img_offset, u64 length) 2282e93f3152SAlex Elder { 2283e93f3152SAlex Elder struct rbd_img_request *parent_request; 2284e93f3152SAlex Elder struct rbd_device *rbd_dev; 2285e93f3152SAlex Elder 2286e93f3152SAlex Elder rbd_assert(obj_request->img_request); 2287e93f3152SAlex Elder rbd_dev = obj_request->img_request->rbd_dev; 2288e93f3152SAlex Elder 22894e752f0aSJosh Durgin parent_request = rbd_img_request_create(rbd_dev->parent, img_offset, 22906d2940c8SGuangliang Zhao length, OBJ_OP_READ, NULL); 2291e93f3152SAlex Elder if (!parent_request) 2292e93f3152SAlex Elder return NULL; 2293e93f3152SAlex Elder 2294e93f3152SAlex Elder img_request_child_set(parent_request); 2295e93f3152SAlex Elder rbd_obj_request_get(obj_request); 2296e93f3152SAlex Elder parent_request->obj_request = obj_request; 2297e93f3152SAlex Elder 2298e93f3152SAlex Elder return parent_request; 2299e93f3152SAlex Elder } 2300e93f3152SAlex Elder 2301e93f3152SAlex Elder static void rbd_parent_request_destroy(struct kref *kref) 2302e93f3152SAlex Elder { 2303e93f3152SAlex Elder struct rbd_img_request *parent_request; 2304e93f3152SAlex Elder struct rbd_obj_request *orig_request; 2305e93f3152SAlex Elder 2306e93f3152SAlex Elder parent_request = container_of(kref, struct rbd_img_request, kref); 2307e93f3152SAlex Elder orig_request = parent_request->obj_request; 2308e93f3152SAlex Elder 2309e93f3152SAlex Elder parent_request->obj_request = NULL; 2310e93f3152SAlex Elder rbd_obj_request_put(orig_request); 2311e93f3152SAlex Elder img_request_child_clear(parent_request); 2312e93f3152SAlex Elder 2313e93f3152SAlex Elder rbd_img_request_destroy(kref); 2314e93f3152SAlex Elder } 2315e93f3152SAlex Elder 23161217857fSAlex Elder static bool rbd_img_obj_end_request(struct rbd_obj_request *obj_request) 23171217857fSAlex Elder { 23186365d33aSAlex Elder struct rbd_img_request *img_request; 23191217857fSAlex Elder unsigned int xferred; 23201217857fSAlex Elder int result; 23218b3e1a56SAlex Elder bool more; 23221217857fSAlex Elder 23236365d33aSAlex Elder rbd_assert(obj_request_img_data_test(obj_request)); 23246365d33aSAlex Elder img_request = obj_request->img_request; 23256365d33aSAlex Elder 23261217857fSAlex Elder rbd_assert(obj_request->xferred <= (u64)UINT_MAX); 23271217857fSAlex Elder xferred = (unsigned int)obj_request->xferred; 23281217857fSAlex Elder result = obj_request->result; 23291217857fSAlex Elder if (result) { 23301217857fSAlex Elder struct rbd_device *rbd_dev = img_request->rbd_dev; 23316d2940c8SGuangliang Zhao enum obj_operation_type op_type; 23326d2940c8SGuangliang Zhao 233390e98c52SGuangliang Zhao if (img_request_discard_test(img_request)) 233490e98c52SGuangliang Zhao op_type = OBJ_OP_DISCARD; 233590e98c52SGuangliang Zhao else if (img_request_write_test(img_request)) 233690e98c52SGuangliang Zhao op_type = OBJ_OP_WRITE; 233790e98c52SGuangliang Zhao else 233890e98c52SGuangliang Zhao op_type = OBJ_OP_READ; 23391217857fSAlex Elder 23409584d508SIlya Dryomov rbd_warn(rbd_dev, "%s %llx at %llx (%llx)", 23416d2940c8SGuangliang Zhao obj_op_name(op_type), obj_request->length, 23426d2940c8SGuangliang Zhao obj_request->img_offset, obj_request->offset); 23439584d508SIlya Dryomov rbd_warn(rbd_dev, " result %d xferred %x", 23441217857fSAlex Elder result, xferred); 23451217857fSAlex Elder if (!img_request->result) 23461217857fSAlex Elder img_request->result = result; 2347082a75daSIlya Dryomov /* 2348082a75daSIlya Dryomov * Need to end I/O on the entire obj_request worth of 2349082a75daSIlya Dryomov * bytes in case of error. 2350082a75daSIlya Dryomov */ 2351082a75daSIlya Dryomov xferred = obj_request->length; 23521217857fSAlex Elder } 23531217857fSAlex Elder 23548b3e1a56SAlex Elder if (img_request_child_test(img_request)) { 23558b3e1a56SAlex Elder rbd_assert(img_request->obj_request != NULL); 23568b3e1a56SAlex Elder more = obj_request->which < img_request->obj_request_count - 1; 23578b3e1a56SAlex Elder } else { 23588b3e1a56SAlex Elder rbd_assert(img_request->rq != NULL); 23597ad18afaSChristoph Hellwig 23607ad18afaSChristoph Hellwig more = blk_update_request(img_request->rq, result, xferred); 23617ad18afaSChristoph Hellwig if (!more) 23627ad18afaSChristoph Hellwig __blk_mq_end_request(img_request->rq, result); 23638b3e1a56SAlex Elder } 23648b3e1a56SAlex Elder 23658b3e1a56SAlex Elder return more; 23661217857fSAlex Elder } 23671217857fSAlex Elder 23682169238dSAlex Elder static void rbd_img_obj_callback(struct rbd_obj_request *obj_request) 23692169238dSAlex Elder { 23702169238dSAlex Elder struct rbd_img_request *img_request; 23712169238dSAlex Elder u32 which = obj_request->which; 23722169238dSAlex Elder bool more = true; 23732169238dSAlex Elder 23746365d33aSAlex Elder rbd_assert(obj_request_img_data_test(obj_request)); 23752169238dSAlex Elder img_request = obj_request->img_request; 23762169238dSAlex Elder 23772169238dSAlex Elder dout("%s: img %p obj %p\n", __func__, img_request, obj_request); 23782169238dSAlex Elder rbd_assert(img_request != NULL); 23792169238dSAlex Elder rbd_assert(img_request->obj_request_count > 0); 23802169238dSAlex Elder rbd_assert(which != BAD_WHICH); 23812169238dSAlex Elder rbd_assert(which < img_request->obj_request_count); 23822169238dSAlex Elder 23832169238dSAlex Elder spin_lock_irq(&img_request->completion_lock); 23842169238dSAlex Elder if (which != img_request->next_completion) 23852169238dSAlex Elder goto out; 23862169238dSAlex Elder 23872169238dSAlex Elder for_each_obj_request_from(img_request, obj_request) { 23882169238dSAlex Elder rbd_assert(more); 23892169238dSAlex Elder rbd_assert(which < img_request->obj_request_count); 23902169238dSAlex Elder 23912169238dSAlex Elder if (!obj_request_done_test(obj_request)) 23922169238dSAlex Elder break; 23931217857fSAlex Elder more = rbd_img_obj_end_request(obj_request); 23942169238dSAlex Elder which++; 23952169238dSAlex Elder } 23962169238dSAlex Elder 23972169238dSAlex Elder rbd_assert(more ^ (which == img_request->obj_request_count)); 23982169238dSAlex Elder img_request->next_completion = which; 23992169238dSAlex Elder out: 24002169238dSAlex Elder spin_unlock_irq(&img_request->completion_lock); 24010f2d5be7SAlex Elder rbd_img_request_put(img_request); 24022169238dSAlex Elder 24032169238dSAlex Elder if (!more) 24042169238dSAlex Elder rbd_img_request_complete(img_request); 24052169238dSAlex Elder } 24062169238dSAlex Elder 2407f1a4739fSAlex Elder /* 24083b434a2aSJosh Durgin * Add individual osd ops to the given ceph_osd_request and prepare 24093b434a2aSJosh Durgin * them for submission. num_ops is the current number of 24103b434a2aSJosh Durgin * osd operations already to the object request. 24113b434a2aSJosh Durgin */ 24123b434a2aSJosh Durgin static void rbd_img_obj_request_fill(struct rbd_obj_request *obj_request, 24133b434a2aSJosh Durgin struct ceph_osd_request *osd_request, 24143b434a2aSJosh Durgin enum obj_operation_type op_type, 24153b434a2aSJosh Durgin unsigned int num_ops) 24163b434a2aSJosh Durgin { 24173b434a2aSJosh Durgin struct rbd_img_request *img_request = obj_request->img_request; 24183b434a2aSJosh Durgin struct rbd_device *rbd_dev = img_request->rbd_dev; 24193b434a2aSJosh Durgin u64 object_size = rbd_obj_bytes(&rbd_dev->header); 24203b434a2aSJosh Durgin u64 offset = obj_request->offset; 24213b434a2aSJosh Durgin u64 length = obj_request->length; 24223b434a2aSJosh Durgin u64 img_end; 24233b434a2aSJosh Durgin u16 opcode; 24243b434a2aSJosh Durgin 24253b434a2aSJosh Durgin if (op_type == OBJ_OP_DISCARD) { 2426d3246fb0SJosh Durgin if (!offset && length == object_size && 2427d3246fb0SJosh Durgin (!img_request_layered_test(img_request) || 2428d3246fb0SJosh Durgin !obj_request_overlaps_parent(obj_request))) { 24293b434a2aSJosh Durgin opcode = CEPH_OSD_OP_DELETE; 24303b434a2aSJosh Durgin } else if ((offset + length == object_size)) { 24313b434a2aSJosh Durgin opcode = CEPH_OSD_OP_TRUNCATE; 24323b434a2aSJosh Durgin } else { 24333b434a2aSJosh Durgin down_read(&rbd_dev->header_rwsem); 24343b434a2aSJosh Durgin img_end = rbd_dev->header.image_size; 24353b434a2aSJosh Durgin up_read(&rbd_dev->header_rwsem); 24363b434a2aSJosh Durgin 24373b434a2aSJosh Durgin if (obj_request->img_offset + length == img_end) 24383b434a2aSJosh Durgin opcode = CEPH_OSD_OP_TRUNCATE; 24393b434a2aSJosh Durgin else 24403b434a2aSJosh Durgin opcode = CEPH_OSD_OP_ZERO; 24413b434a2aSJosh Durgin } 24423b434a2aSJosh Durgin } else if (op_type == OBJ_OP_WRITE) { 2443e30b7577SIlya Dryomov if (!offset && length == object_size) 2444e30b7577SIlya Dryomov opcode = CEPH_OSD_OP_WRITEFULL; 2445e30b7577SIlya Dryomov else 24463b434a2aSJosh Durgin opcode = CEPH_OSD_OP_WRITE; 24473b434a2aSJosh Durgin osd_req_op_alloc_hint_init(osd_request, num_ops, 24483b434a2aSJosh Durgin object_size, object_size); 24493b434a2aSJosh Durgin num_ops++; 24503b434a2aSJosh Durgin } else { 24513b434a2aSJosh Durgin opcode = CEPH_OSD_OP_READ; 24523b434a2aSJosh Durgin } 24533b434a2aSJosh Durgin 24547e868b6eSIlya Dryomov if (opcode == CEPH_OSD_OP_DELETE) 2455144cba14SYan, Zheng osd_req_op_init(osd_request, num_ops, opcode, 0); 24567e868b6eSIlya Dryomov else 24577e868b6eSIlya Dryomov osd_req_op_extent_init(osd_request, num_ops, opcode, 24587e868b6eSIlya Dryomov offset, length, 0, 0); 24597e868b6eSIlya Dryomov 24603b434a2aSJosh Durgin if (obj_request->type == OBJ_REQUEST_BIO) 24613b434a2aSJosh Durgin osd_req_op_extent_osd_data_bio(osd_request, num_ops, 24623b434a2aSJosh Durgin obj_request->bio_list, length); 24633b434a2aSJosh Durgin else if (obj_request->type == OBJ_REQUEST_PAGES) 24643b434a2aSJosh Durgin osd_req_op_extent_osd_data_pages(osd_request, num_ops, 24653b434a2aSJosh Durgin obj_request->pages, length, 24663b434a2aSJosh Durgin offset & ~PAGE_MASK, false, false); 24673b434a2aSJosh Durgin 24683b434a2aSJosh Durgin /* Discards are also writes */ 24693b434a2aSJosh Durgin if (op_type == OBJ_OP_WRITE || op_type == OBJ_OP_DISCARD) 24703b434a2aSJosh Durgin rbd_osd_req_format_write(obj_request); 24713b434a2aSJosh Durgin else 24723b434a2aSJosh Durgin rbd_osd_req_format_read(obj_request); 24733b434a2aSJosh Durgin } 24743b434a2aSJosh Durgin 24753b434a2aSJosh Durgin /* 2476f1a4739fSAlex Elder * Split up an image request into one or more object requests, each 2477f1a4739fSAlex Elder * to a different object. The "type" parameter indicates whether 2478f1a4739fSAlex Elder * "data_desc" is the pointer to the head of a list of bio 2479f1a4739fSAlex Elder * structures, or the base of a page array. In either case this 2480f1a4739fSAlex Elder * function assumes data_desc describes memory sufficient to hold 2481f1a4739fSAlex Elder * all data described by the image request. 2482f1a4739fSAlex Elder */ 2483f1a4739fSAlex Elder static int rbd_img_request_fill(struct rbd_img_request *img_request, 2484f1a4739fSAlex Elder enum obj_request_type type, 2485f1a4739fSAlex Elder void *data_desc) 2486bf0d5f50SAlex Elder { 2487bf0d5f50SAlex Elder struct rbd_device *rbd_dev = img_request->rbd_dev; 2488bf0d5f50SAlex Elder struct rbd_obj_request *obj_request = NULL; 2489bf0d5f50SAlex Elder struct rbd_obj_request *next_obj_request; 2490a158073cSJingoo Han struct bio *bio_list = NULL; 2491f1a4739fSAlex Elder unsigned int bio_offset = 0; 2492a158073cSJingoo Han struct page **pages = NULL; 24936d2940c8SGuangliang Zhao enum obj_operation_type op_type; 24947da22d29SAlex Elder u64 img_offset; 2495bf0d5f50SAlex Elder u64 resid; 2496bf0d5f50SAlex Elder 2497f1a4739fSAlex Elder dout("%s: img %p type %d data_desc %p\n", __func__, img_request, 2498f1a4739fSAlex Elder (int)type, data_desc); 249937206ee5SAlex Elder 25007da22d29SAlex Elder img_offset = img_request->offset; 2501bf0d5f50SAlex Elder resid = img_request->length; 25024dda41d3SAlex Elder rbd_assert(resid > 0); 25033b434a2aSJosh Durgin op_type = rbd_img_request_op_type(img_request); 2504f1a4739fSAlex Elder 2505f1a4739fSAlex Elder if (type == OBJ_REQUEST_BIO) { 2506f1a4739fSAlex Elder bio_list = data_desc; 25074f024f37SKent Overstreet rbd_assert(img_offset == 25084f024f37SKent Overstreet bio_list->bi_iter.bi_sector << SECTOR_SHIFT); 250990e98c52SGuangliang Zhao } else if (type == OBJ_REQUEST_PAGES) { 2510f1a4739fSAlex Elder pages = data_desc; 2511f1a4739fSAlex Elder } 2512f1a4739fSAlex Elder 2513bf0d5f50SAlex Elder while (resid) { 25142fa12320SAlex Elder struct ceph_osd_request *osd_req; 2515bf0d5f50SAlex Elder const char *object_name; 251667e2b652SIlya Dryomov u64 offset = rbd_segment_offset(rbd_dev, img_offset); 251767e2b652SIlya Dryomov u64 length = rbd_segment_length(rbd_dev, img_offset, resid); 2518bf0d5f50SAlex Elder 25197da22d29SAlex Elder object_name = rbd_segment_name(rbd_dev, img_offset); 2520bf0d5f50SAlex Elder if (!object_name) 2521bf0d5f50SAlex Elder goto out_unwind; 252267e2b652SIlya Dryomov obj_request = rbd_obj_request_create(object_name, type); 252378c2a44aSAlex Elder /* object request has its own copy of the object name */ 252478c2a44aSAlex Elder rbd_segment_name_free(object_name); 2525bf0d5f50SAlex Elder if (!obj_request) 2526bf0d5f50SAlex Elder goto out_unwind; 252762054da6SIlya Dryomov 252867e2b652SIlya Dryomov obj_request->offset = offset; 252967e2b652SIlya Dryomov obj_request->length = length; 253067e2b652SIlya Dryomov 253103507db6SJosh Durgin /* 253203507db6SJosh Durgin * set obj_request->img_request before creating the 253303507db6SJosh Durgin * osd_request so that it gets the right snapc 253403507db6SJosh Durgin */ 253503507db6SJosh Durgin rbd_img_obj_request_add(img_request, obj_request); 2536bf0d5f50SAlex Elder 2537f1a4739fSAlex Elder if (type == OBJ_REQUEST_BIO) { 2538f1a4739fSAlex Elder unsigned int clone_size; 2539f1a4739fSAlex Elder 2540bf0d5f50SAlex Elder rbd_assert(length <= (u64)UINT_MAX); 2541bf0d5f50SAlex Elder clone_size = (unsigned int)length; 2542f1a4739fSAlex Elder obj_request->bio_list = 2543f1a4739fSAlex Elder bio_chain_clone_range(&bio_list, 2544f1a4739fSAlex Elder &bio_offset, 2545f1a4739fSAlex Elder clone_size, 25462224d879SDavid Disseldorp GFP_NOIO); 2547bf0d5f50SAlex Elder if (!obj_request->bio_list) 254862054da6SIlya Dryomov goto out_unwind; 254990e98c52SGuangliang Zhao } else if (type == OBJ_REQUEST_PAGES) { 2550f1a4739fSAlex Elder unsigned int page_count; 2551f1a4739fSAlex Elder 2552f1a4739fSAlex Elder obj_request->pages = pages; 2553f1a4739fSAlex Elder page_count = (u32)calc_pages_for(offset, length); 2554f1a4739fSAlex Elder obj_request->page_count = page_count; 2555f1a4739fSAlex Elder if ((offset + length) & ~PAGE_MASK) 2556f1a4739fSAlex Elder page_count--; /* more on last page */ 2557f1a4739fSAlex Elder pages += page_count; 2558f1a4739fSAlex Elder } 2559bf0d5f50SAlex Elder 25606d2940c8SGuangliang Zhao osd_req = rbd_osd_req_create(rbd_dev, op_type, 25616d2940c8SGuangliang Zhao (op_type == OBJ_OP_WRITE) ? 2 : 1, 25622fa12320SAlex Elder obj_request); 25632fa12320SAlex Elder if (!osd_req) 256462054da6SIlya Dryomov goto out_unwind; 25653b434a2aSJosh Durgin 25662fa12320SAlex Elder obj_request->osd_req = osd_req; 25672169238dSAlex Elder obj_request->callback = rbd_img_obj_callback; 25687da22d29SAlex Elder obj_request->img_offset = img_offset; 2569bf0d5f50SAlex Elder 25703b434a2aSJosh Durgin rbd_img_obj_request_fill(obj_request, osd_req, op_type, 0); 25713b434a2aSJosh Durgin 25727da22d29SAlex Elder img_offset += length; 2573bf0d5f50SAlex Elder resid -= length; 2574bf0d5f50SAlex Elder } 2575bf0d5f50SAlex Elder 2576bf0d5f50SAlex Elder return 0; 2577bf0d5f50SAlex Elder 2578bf0d5f50SAlex Elder out_unwind: 2579bf0d5f50SAlex Elder for_each_obj_request_safe(img_request, obj_request, next_obj_request) 258042dd037cSIlya Dryomov rbd_img_obj_request_del(img_request, obj_request); 2581bf0d5f50SAlex Elder 2582bf0d5f50SAlex Elder return -ENOMEM; 2583bf0d5f50SAlex Elder } 2584bf0d5f50SAlex Elder 25853d7efd18SAlex Elder static void 25862761713dSIlya Dryomov rbd_osd_copyup_callback(struct rbd_obj_request *obj_request) 25870eefd470SAlex Elder { 25880eefd470SAlex Elder struct rbd_img_request *img_request; 25890eefd470SAlex Elder struct rbd_device *rbd_dev; 2590ebda6408SAlex Elder struct page **pages; 25910eefd470SAlex Elder u32 page_count; 25920eefd470SAlex Elder 25932761713dSIlya Dryomov dout("%s: obj %p\n", __func__, obj_request); 25942761713dSIlya Dryomov 2595d3246fb0SJosh Durgin rbd_assert(obj_request->type == OBJ_REQUEST_BIO || 2596d3246fb0SJosh Durgin obj_request->type == OBJ_REQUEST_NODATA); 25970eefd470SAlex Elder rbd_assert(obj_request_img_data_test(obj_request)); 25980eefd470SAlex Elder img_request = obj_request->img_request; 25990eefd470SAlex Elder rbd_assert(img_request); 26000eefd470SAlex Elder 26010eefd470SAlex Elder rbd_dev = img_request->rbd_dev; 26020eefd470SAlex Elder rbd_assert(rbd_dev); 26030eefd470SAlex Elder 2604ebda6408SAlex Elder pages = obj_request->copyup_pages; 2605ebda6408SAlex Elder rbd_assert(pages != NULL); 26060eefd470SAlex Elder obj_request->copyup_pages = NULL; 2607ebda6408SAlex Elder page_count = obj_request->copyup_page_count; 2608ebda6408SAlex Elder rbd_assert(page_count); 2609ebda6408SAlex Elder obj_request->copyup_page_count = 0; 2610ebda6408SAlex Elder ceph_release_page_vector(pages, page_count); 26110eefd470SAlex Elder 26120eefd470SAlex Elder /* 26130eefd470SAlex Elder * We want the transfer count to reflect the size of the 26140eefd470SAlex Elder * original write request. There is no such thing as a 26150eefd470SAlex Elder * successful short write, so if the request was successful 26160eefd470SAlex Elder * we can just set it to the originally-requested length. 26170eefd470SAlex Elder */ 26180eefd470SAlex Elder if (!obj_request->result) 26190eefd470SAlex Elder obj_request->xferred = obj_request->length; 26200eefd470SAlex Elder 26212761713dSIlya Dryomov obj_request_done_set(obj_request); 26220eefd470SAlex Elder } 26230eefd470SAlex Elder 26240eefd470SAlex Elder static void 26253d7efd18SAlex Elder rbd_img_obj_parent_read_full_callback(struct rbd_img_request *img_request) 26263d7efd18SAlex Elder { 26273d7efd18SAlex Elder struct rbd_obj_request *orig_request; 26280eefd470SAlex Elder struct ceph_osd_request *osd_req; 26290eefd470SAlex Elder struct rbd_device *rbd_dev; 26303d7efd18SAlex Elder struct page **pages; 2631d3246fb0SJosh Durgin enum obj_operation_type op_type; 2632ebda6408SAlex Elder u32 page_count; 2633bbea1c1aSAlex Elder int img_result; 2634ebda6408SAlex Elder u64 parent_length; 26353d7efd18SAlex Elder 26363d7efd18SAlex Elder rbd_assert(img_request_child_test(img_request)); 26373d7efd18SAlex Elder 26383d7efd18SAlex Elder /* First get what we need from the image request */ 26393d7efd18SAlex Elder 26403d7efd18SAlex Elder pages = img_request->copyup_pages; 26413d7efd18SAlex Elder rbd_assert(pages != NULL); 26423d7efd18SAlex Elder img_request->copyup_pages = NULL; 2643ebda6408SAlex Elder page_count = img_request->copyup_page_count; 2644ebda6408SAlex Elder rbd_assert(page_count); 2645ebda6408SAlex Elder img_request->copyup_page_count = 0; 26463d7efd18SAlex Elder 26473d7efd18SAlex Elder orig_request = img_request->obj_request; 26483d7efd18SAlex Elder rbd_assert(orig_request != NULL); 2649b91f09f1SAlex Elder rbd_assert(obj_request_type_valid(orig_request->type)); 2650bbea1c1aSAlex Elder img_result = img_request->result; 2651ebda6408SAlex Elder parent_length = img_request->length; 2652fa355112SIlya Dryomov rbd_assert(img_result || parent_length == img_request->xferred); 26533d7efd18SAlex Elder rbd_img_request_put(img_request); 26543d7efd18SAlex Elder 265591c6febbSAlex Elder rbd_assert(orig_request->img_request); 265691c6febbSAlex Elder rbd_dev = orig_request->img_request->rbd_dev; 26573d7efd18SAlex Elder rbd_assert(rbd_dev); 26583d7efd18SAlex Elder 2659bbea1c1aSAlex Elder /* 2660bbea1c1aSAlex Elder * If the overlap has become 0 (most likely because the 2661bbea1c1aSAlex Elder * image has been flattened) we need to free the pages 2662bbea1c1aSAlex Elder * and re-submit the original write request. 2663bbea1c1aSAlex Elder */ 2664bbea1c1aSAlex Elder if (!rbd_dev->parent_overlap) { 2665bbea1c1aSAlex Elder ceph_release_page_vector(pages, page_count); 2666980917fcSIlya Dryomov rbd_obj_request_submit(orig_request); 2667bbea1c1aSAlex Elder return; 2668bbea1c1aSAlex Elder } 2669bbea1c1aSAlex Elder 2670bbea1c1aSAlex Elder if (img_result) 26710eefd470SAlex Elder goto out_err; 26723d7efd18SAlex Elder 26738785b1d4SAlex Elder /* 26748785b1d4SAlex Elder * The original osd request is of no use to use any more. 26750ccd5926SIlya Dryomov * We need a new one that can hold the three ops in a copyup 26768785b1d4SAlex Elder * request. Allocate the new copyup osd request for the 26778785b1d4SAlex Elder * original request, and release the old one. 26788785b1d4SAlex Elder */ 2679bbea1c1aSAlex Elder img_result = -ENOMEM; 26800eefd470SAlex Elder osd_req = rbd_osd_req_create_copyup(orig_request); 26810eefd470SAlex Elder if (!osd_req) 26820eefd470SAlex Elder goto out_err; 26838785b1d4SAlex Elder rbd_osd_req_destroy(orig_request->osd_req); 26840eefd470SAlex Elder orig_request->osd_req = osd_req; 26850eefd470SAlex Elder orig_request->copyup_pages = pages; 2686ebda6408SAlex Elder orig_request->copyup_page_count = page_count; 26873d7efd18SAlex Elder 26880eefd470SAlex Elder /* Initialize the copyup op */ 26890eefd470SAlex Elder 26900eefd470SAlex Elder osd_req_op_cls_init(osd_req, 0, CEPH_OSD_OP_CALL, "rbd", "copyup"); 2691ebda6408SAlex Elder osd_req_op_cls_request_data_pages(osd_req, 0, pages, parent_length, 0, 26920eefd470SAlex Elder false, false); 26930eefd470SAlex Elder 2694d3246fb0SJosh Durgin /* Add the other op(s) */ 26950ccd5926SIlya Dryomov 2696d3246fb0SJosh Durgin op_type = rbd_img_request_op_type(orig_request->img_request); 2697d3246fb0SJosh Durgin rbd_img_obj_request_fill(orig_request, osd_req, op_type, 1); 26980eefd470SAlex Elder 26990eefd470SAlex Elder /* All set, send it off. */ 27000eefd470SAlex Elder 2701980917fcSIlya Dryomov rbd_obj_request_submit(orig_request); 27020eefd470SAlex Elder return; 27030eefd470SAlex Elder 27040eefd470SAlex Elder out_err: 2705fa355112SIlya Dryomov ceph_release_page_vector(pages, page_count); 27060dcc685eSIlya Dryomov rbd_obj_request_error(orig_request, img_result); 27073d7efd18SAlex Elder } 27083d7efd18SAlex Elder 27093d7efd18SAlex Elder /* 27103d7efd18SAlex Elder * Read from the parent image the range of data that covers the 27113d7efd18SAlex Elder * entire target of the given object request. This is used for 27123d7efd18SAlex Elder * satisfying a layered image write request when the target of an 27133d7efd18SAlex Elder * object request from the image request does not exist. 27143d7efd18SAlex Elder * 27153d7efd18SAlex Elder * A page array big enough to hold the returned data is allocated 27163d7efd18SAlex Elder * and supplied to rbd_img_request_fill() as the "data descriptor." 27173d7efd18SAlex Elder * When the read completes, this page array will be transferred to 27183d7efd18SAlex Elder * the original object request for the copyup operation. 27193d7efd18SAlex Elder * 2720c2e82414SIlya Dryomov * If an error occurs, it is recorded as the result of the original 2721c2e82414SIlya Dryomov * object request in rbd_img_obj_exists_callback(). 27223d7efd18SAlex Elder */ 27233d7efd18SAlex Elder static int rbd_img_obj_parent_read_full(struct rbd_obj_request *obj_request) 27243d7efd18SAlex Elder { 2725058aa991SIlya Dryomov struct rbd_device *rbd_dev = obj_request->img_request->rbd_dev; 27263d7efd18SAlex Elder struct rbd_img_request *parent_request = NULL; 27273d7efd18SAlex Elder u64 img_offset; 27283d7efd18SAlex Elder u64 length; 27293d7efd18SAlex Elder struct page **pages = NULL; 27303d7efd18SAlex Elder u32 page_count; 27313d7efd18SAlex Elder int result; 27323d7efd18SAlex Elder 27333d7efd18SAlex Elder rbd_assert(rbd_dev->parent != NULL); 27343d7efd18SAlex Elder 27353d7efd18SAlex Elder /* 27363d7efd18SAlex Elder * Determine the byte range covered by the object in the 27373d7efd18SAlex Elder * child image to which the original request was to be sent. 27383d7efd18SAlex Elder */ 27393d7efd18SAlex Elder img_offset = obj_request->img_offset - obj_request->offset; 27405bc3fb17SIlya Dryomov length = rbd_obj_bytes(&rbd_dev->header); 27413d7efd18SAlex Elder 27423d7efd18SAlex Elder /* 2743a9e8ba2cSAlex Elder * There is no defined parent data beyond the parent 2744a9e8ba2cSAlex Elder * overlap, so limit what we read at that boundary if 2745a9e8ba2cSAlex Elder * necessary. 2746a9e8ba2cSAlex Elder */ 2747a9e8ba2cSAlex Elder if (img_offset + length > rbd_dev->parent_overlap) { 2748a9e8ba2cSAlex Elder rbd_assert(img_offset < rbd_dev->parent_overlap); 2749a9e8ba2cSAlex Elder length = rbd_dev->parent_overlap - img_offset; 2750a9e8ba2cSAlex Elder } 2751a9e8ba2cSAlex Elder 2752a9e8ba2cSAlex Elder /* 27533d7efd18SAlex Elder * Allocate a page array big enough to receive the data read 27543d7efd18SAlex Elder * from the parent. 27553d7efd18SAlex Elder */ 27563d7efd18SAlex Elder page_count = (u32)calc_pages_for(0, length); 27573d7efd18SAlex Elder pages = ceph_alloc_page_vector(page_count, GFP_KERNEL); 27583d7efd18SAlex Elder if (IS_ERR(pages)) { 27593d7efd18SAlex Elder result = PTR_ERR(pages); 27603d7efd18SAlex Elder pages = NULL; 27613d7efd18SAlex Elder goto out_err; 27623d7efd18SAlex Elder } 27633d7efd18SAlex Elder 27643d7efd18SAlex Elder result = -ENOMEM; 2765e93f3152SAlex Elder parent_request = rbd_parent_request_create(obj_request, 2766e93f3152SAlex Elder img_offset, length); 27673d7efd18SAlex Elder if (!parent_request) 27683d7efd18SAlex Elder goto out_err; 27693d7efd18SAlex Elder 27703d7efd18SAlex Elder result = rbd_img_request_fill(parent_request, OBJ_REQUEST_PAGES, pages); 27713d7efd18SAlex Elder if (result) 27723d7efd18SAlex Elder goto out_err; 2773058aa991SIlya Dryomov 27743d7efd18SAlex Elder parent_request->copyup_pages = pages; 2775ebda6408SAlex Elder parent_request->copyup_page_count = page_count; 27763d7efd18SAlex Elder parent_request->callback = rbd_img_obj_parent_read_full_callback; 2777058aa991SIlya Dryomov 27783d7efd18SAlex Elder result = rbd_img_request_submit(parent_request); 27793d7efd18SAlex Elder if (!result) 27803d7efd18SAlex Elder return 0; 27813d7efd18SAlex Elder 27823d7efd18SAlex Elder parent_request->copyup_pages = NULL; 2783ebda6408SAlex Elder parent_request->copyup_page_count = 0; 27843d7efd18SAlex Elder parent_request->obj_request = NULL; 27853d7efd18SAlex Elder rbd_obj_request_put(obj_request); 27863d7efd18SAlex Elder out_err: 27873d7efd18SAlex Elder if (pages) 27883d7efd18SAlex Elder ceph_release_page_vector(pages, page_count); 27893d7efd18SAlex Elder if (parent_request) 27903d7efd18SAlex Elder rbd_img_request_put(parent_request); 27913d7efd18SAlex Elder return result; 27923d7efd18SAlex Elder } 27933d7efd18SAlex Elder 2794c5b5ef6cSAlex Elder static void rbd_img_obj_exists_callback(struct rbd_obj_request *obj_request) 2795c5b5ef6cSAlex Elder { 2796c5b5ef6cSAlex Elder struct rbd_obj_request *orig_request; 2797638f5abeSAlex Elder struct rbd_device *rbd_dev; 2798c5b5ef6cSAlex Elder int result; 2799c5b5ef6cSAlex Elder 2800c5b5ef6cSAlex Elder rbd_assert(!obj_request_img_data_test(obj_request)); 2801c5b5ef6cSAlex Elder 2802c5b5ef6cSAlex Elder /* 2803c5b5ef6cSAlex Elder * All we need from the object request is the original 2804c5b5ef6cSAlex Elder * request and the result of the STAT op. Grab those, then 2805c5b5ef6cSAlex Elder * we're done with the request. 2806c5b5ef6cSAlex Elder */ 2807c5b5ef6cSAlex Elder orig_request = obj_request->obj_request; 2808c5b5ef6cSAlex Elder obj_request->obj_request = NULL; 2809912c317dSAlex Elder rbd_obj_request_put(orig_request); 2810c5b5ef6cSAlex Elder rbd_assert(orig_request); 2811c5b5ef6cSAlex Elder rbd_assert(orig_request->img_request); 2812c5b5ef6cSAlex Elder 2813c5b5ef6cSAlex Elder result = obj_request->result; 2814c5b5ef6cSAlex Elder obj_request->result = 0; 2815c5b5ef6cSAlex Elder 2816c5b5ef6cSAlex Elder dout("%s: obj %p for obj %p result %d %llu/%llu\n", __func__, 2817c5b5ef6cSAlex Elder obj_request, orig_request, result, 2818c5b5ef6cSAlex Elder obj_request->xferred, obj_request->length); 2819c5b5ef6cSAlex Elder rbd_obj_request_put(obj_request); 2820c5b5ef6cSAlex Elder 2821638f5abeSAlex Elder /* 2822638f5abeSAlex Elder * If the overlap has become 0 (most likely because the 2823980917fcSIlya Dryomov * image has been flattened) we need to re-submit the 2824980917fcSIlya Dryomov * original request. 2825638f5abeSAlex Elder */ 2826638f5abeSAlex Elder rbd_dev = orig_request->img_request->rbd_dev; 2827638f5abeSAlex Elder if (!rbd_dev->parent_overlap) { 2828980917fcSIlya Dryomov rbd_obj_request_submit(orig_request); 2829638f5abeSAlex Elder return; 2830638f5abeSAlex Elder } 2831c5b5ef6cSAlex Elder 2832c5b5ef6cSAlex Elder /* 2833c5b5ef6cSAlex Elder * Our only purpose here is to determine whether the object 2834c5b5ef6cSAlex Elder * exists, and we don't want to treat the non-existence as 2835c5b5ef6cSAlex Elder * an error. If something else comes back, transfer the 2836c5b5ef6cSAlex Elder * error to the original request and complete it now. 2837c5b5ef6cSAlex Elder */ 2838c5b5ef6cSAlex Elder if (!result) { 2839c5b5ef6cSAlex Elder obj_request_existence_set(orig_request, true); 2840c5b5ef6cSAlex Elder } else if (result == -ENOENT) { 2841c5b5ef6cSAlex Elder obj_request_existence_set(orig_request, false); 2842c2e82414SIlya Dryomov } else { 2843c2e82414SIlya Dryomov goto fail_orig_request; 2844c5b5ef6cSAlex Elder } 2845c5b5ef6cSAlex Elder 2846c5b5ef6cSAlex Elder /* 2847c5b5ef6cSAlex Elder * Resubmit the original request now that we have recorded 2848c5b5ef6cSAlex Elder * whether the target object exists. 2849c5b5ef6cSAlex Elder */ 2850c2e82414SIlya Dryomov result = rbd_img_obj_request_submit(orig_request); 2851c2e82414SIlya Dryomov if (result) 2852c2e82414SIlya Dryomov goto fail_orig_request; 2853c2e82414SIlya Dryomov 2854c2e82414SIlya Dryomov return; 2855c2e82414SIlya Dryomov 2856c2e82414SIlya Dryomov fail_orig_request: 28570dcc685eSIlya Dryomov rbd_obj_request_error(orig_request, result); 2858c5b5ef6cSAlex Elder } 2859c5b5ef6cSAlex Elder 2860c5b5ef6cSAlex Elder static int rbd_img_obj_exists_submit(struct rbd_obj_request *obj_request) 2861c5b5ef6cSAlex Elder { 2862058aa991SIlya Dryomov struct rbd_device *rbd_dev = obj_request->img_request->rbd_dev; 2863c5b5ef6cSAlex Elder struct rbd_obj_request *stat_request; 2864710214e3SIlya Dryomov struct page **pages; 2865c5b5ef6cSAlex Elder u32 page_count; 2866c5b5ef6cSAlex Elder size_t size; 2867c5b5ef6cSAlex Elder int ret; 2868c5b5ef6cSAlex Elder 286967e2b652SIlya Dryomov stat_request = rbd_obj_request_create(obj_request->object_name, 2870710214e3SIlya Dryomov OBJ_REQUEST_PAGES); 2871710214e3SIlya Dryomov if (!stat_request) 2872710214e3SIlya Dryomov return -ENOMEM; 2873710214e3SIlya Dryomov 2874710214e3SIlya Dryomov stat_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_READ, 1, 2875710214e3SIlya Dryomov stat_request); 2876710214e3SIlya Dryomov if (!stat_request->osd_req) { 2877710214e3SIlya Dryomov ret = -ENOMEM; 2878710214e3SIlya Dryomov goto fail_stat_request; 2879710214e3SIlya Dryomov } 2880710214e3SIlya Dryomov 2881c5b5ef6cSAlex Elder /* 2882c5b5ef6cSAlex Elder * The response data for a STAT call consists of: 2883c5b5ef6cSAlex Elder * le64 length; 2884c5b5ef6cSAlex Elder * struct { 2885c5b5ef6cSAlex Elder * le32 tv_sec; 2886c5b5ef6cSAlex Elder * le32 tv_nsec; 2887c5b5ef6cSAlex Elder * } mtime; 2888c5b5ef6cSAlex Elder */ 2889c5b5ef6cSAlex Elder size = sizeof (__le64) + sizeof (__le32) + sizeof (__le32); 2890c5b5ef6cSAlex Elder page_count = (u32)calc_pages_for(0, size); 2891c5b5ef6cSAlex Elder pages = ceph_alloc_page_vector(page_count, GFP_KERNEL); 2892710214e3SIlya Dryomov if (IS_ERR(pages)) { 2893710214e3SIlya Dryomov ret = PTR_ERR(pages); 2894710214e3SIlya Dryomov goto fail_stat_request; 2895710214e3SIlya Dryomov } 2896c5b5ef6cSAlex Elder 2897710214e3SIlya Dryomov osd_req_op_init(stat_request->osd_req, 0, CEPH_OSD_OP_STAT, 0); 2898710214e3SIlya Dryomov osd_req_op_raw_data_in_pages(stat_request->osd_req, 0, pages, size, 0, 2899710214e3SIlya Dryomov false, false); 2900c5b5ef6cSAlex Elder 2901c5b5ef6cSAlex Elder rbd_obj_request_get(obj_request); 2902c5b5ef6cSAlex Elder stat_request->obj_request = obj_request; 2903c5b5ef6cSAlex Elder stat_request->pages = pages; 2904c5b5ef6cSAlex Elder stat_request->page_count = page_count; 2905c5b5ef6cSAlex Elder stat_request->callback = rbd_img_obj_exists_callback; 2906c5b5ef6cSAlex Elder 2907980917fcSIlya Dryomov rbd_obj_request_submit(stat_request); 2908980917fcSIlya Dryomov return 0; 2909c5b5ef6cSAlex Elder 2910710214e3SIlya Dryomov fail_stat_request: 2911710214e3SIlya Dryomov rbd_obj_request_put(stat_request); 2912c5b5ef6cSAlex Elder return ret; 2913c5b5ef6cSAlex Elder } 2914c5b5ef6cSAlex Elder 291570d045f6SIlya Dryomov static bool img_obj_request_simple(struct rbd_obj_request *obj_request) 2916b454e36dSAlex Elder { 2917058aa991SIlya Dryomov struct rbd_img_request *img_request = obj_request->img_request; 2918058aa991SIlya Dryomov struct rbd_device *rbd_dev = img_request->rbd_dev; 2919b454e36dSAlex Elder 292070d045f6SIlya Dryomov /* Reads */ 29211c220881SJosh Durgin if (!img_request_write_test(img_request) && 29221c220881SJosh Durgin !img_request_discard_test(img_request)) 292370d045f6SIlya Dryomov return true; 2924b454e36dSAlex Elder 292570d045f6SIlya Dryomov /* Non-layered writes */ 292670d045f6SIlya Dryomov if (!img_request_layered_test(img_request)) 292770d045f6SIlya Dryomov return true; 292870d045f6SIlya Dryomov 292970d045f6SIlya Dryomov /* 293070d045f6SIlya Dryomov * Layered writes outside of the parent overlap range don't 293170d045f6SIlya Dryomov * share any data with the parent. 293270d045f6SIlya Dryomov */ 293370d045f6SIlya Dryomov if (!obj_request_overlaps_parent(obj_request)) 293470d045f6SIlya Dryomov return true; 293570d045f6SIlya Dryomov 293670d045f6SIlya Dryomov /* 2937c622d226SGuangliang Zhao * Entire-object layered writes - we will overwrite whatever 2938c622d226SGuangliang Zhao * parent data there is anyway. 2939c622d226SGuangliang Zhao */ 2940c622d226SGuangliang Zhao if (!obj_request->offset && 2941c622d226SGuangliang Zhao obj_request->length == rbd_obj_bytes(&rbd_dev->header)) 2942c622d226SGuangliang Zhao return true; 2943c622d226SGuangliang Zhao 2944c622d226SGuangliang Zhao /* 294570d045f6SIlya Dryomov * If the object is known to already exist, its parent data has 294670d045f6SIlya Dryomov * already been copied. 294770d045f6SIlya Dryomov */ 294870d045f6SIlya Dryomov if (obj_request_known_test(obj_request) && 294970d045f6SIlya Dryomov obj_request_exists_test(obj_request)) 295070d045f6SIlya Dryomov return true; 295170d045f6SIlya Dryomov 295270d045f6SIlya Dryomov return false; 295370d045f6SIlya Dryomov } 295470d045f6SIlya Dryomov 295570d045f6SIlya Dryomov static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request) 295670d045f6SIlya Dryomov { 2957058aa991SIlya Dryomov rbd_assert(obj_request_img_data_test(obj_request)); 2958058aa991SIlya Dryomov rbd_assert(obj_request_type_valid(obj_request->type)); 2959058aa991SIlya Dryomov rbd_assert(obj_request->img_request); 2960058aa991SIlya Dryomov 296170d045f6SIlya Dryomov if (img_obj_request_simple(obj_request)) { 2962980917fcSIlya Dryomov rbd_obj_request_submit(obj_request); 2963980917fcSIlya Dryomov return 0; 2964b454e36dSAlex Elder } 2965b454e36dSAlex Elder 2966b454e36dSAlex Elder /* 29673d7efd18SAlex Elder * It's a layered write. The target object might exist but 29683d7efd18SAlex Elder * we may not know that yet. If we know it doesn't exist, 29693d7efd18SAlex Elder * start by reading the data for the full target object from 29703d7efd18SAlex Elder * the parent so we can use it for a copyup to the target. 2971b454e36dSAlex Elder */ 297270d045f6SIlya Dryomov if (obj_request_known_test(obj_request)) 29733d7efd18SAlex Elder return rbd_img_obj_parent_read_full(obj_request); 29743d7efd18SAlex Elder 29753d7efd18SAlex Elder /* We don't know whether the target exists. Go find out. */ 2976b454e36dSAlex Elder 2977b454e36dSAlex Elder return rbd_img_obj_exists_submit(obj_request); 2978b454e36dSAlex Elder } 2979b454e36dSAlex Elder 2980bf0d5f50SAlex Elder static int rbd_img_request_submit(struct rbd_img_request *img_request) 2981bf0d5f50SAlex Elder { 2982bf0d5f50SAlex Elder struct rbd_obj_request *obj_request; 298346faeed4SAlex Elder struct rbd_obj_request *next_obj_request; 2984663ae2ccSIlya Dryomov int ret = 0; 2985bf0d5f50SAlex Elder 298637206ee5SAlex Elder dout("%s: img %p\n", __func__, img_request); 2987bf0d5f50SAlex Elder 2988663ae2ccSIlya Dryomov rbd_img_request_get(img_request); 2989663ae2ccSIlya Dryomov for_each_obj_request_safe(img_request, obj_request, next_obj_request) { 2990b454e36dSAlex Elder ret = rbd_img_obj_request_submit(obj_request); 2991bf0d5f50SAlex Elder if (ret) 2992663ae2ccSIlya Dryomov goto out_put_ireq; 2993bf0d5f50SAlex Elder } 2994bf0d5f50SAlex Elder 2995663ae2ccSIlya Dryomov out_put_ireq: 2996663ae2ccSIlya Dryomov rbd_img_request_put(img_request); 2997663ae2ccSIlya Dryomov return ret; 2998bf0d5f50SAlex Elder } 2999bf0d5f50SAlex Elder 30008b3e1a56SAlex Elder static void rbd_img_parent_read_callback(struct rbd_img_request *img_request) 30018b3e1a56SAlex Elder { 30028b3e1a56SAlex Elder struct rbd_obj_request *obj_request; 3003a9e8ba2cSAlex Elder struct rbd_device *rbd_dev; 3004a9e8ba2cSAlex Elder u64 obj_end; 300502c74fbaSAlex Elder u64 img_xferred; 300602c74fbaSAlex Elder int img_result; 30078b3e1a56SAlex Elder 30088b3e1a56SAlex Elder rbd_assert(img_request_child_test(img_request)); 30098b3e1a56SAlex Elder 301002c74fbaSAlex Elder /* First get what we need from the image request and release it */ 301102c74fbaSAlex Elder 30128b3e1a56SAlex Elder obj_request = img_request->obj_request; 301302c74fbaSAlex Elder img_xferred = img_request->xferred; 301402c74fbaSAlex Elder img_result = img_request->result; 301502c74fbaSAlex Elder rbd_img_request_put(img_request); 301602c74fbaSAlex Elder 301702c74fbaSAlex Elder /* 301802c74fbaSAlex Elder * If the overlap has become 0 (most likely because the 301902c74fbaSAlex Elder * image has been flattened) we need to re-submit the 302002c74fbaSAlex Elder * original request. 302102c74fbaSAlex Elder */ 3022a9e8ba2cSAlex Elder rbd_assert(obj_request); 3023a9e8ba2cSAlex Elder rbd_assert(obj_request->img_request); 302402c74fbaSAlex Elder rbd_dev = obj_request->img_request->rbd_dev; 302502c74fbaSAlex Elder if (!rbd_dev->parent_overlap) { 3026980917fcSIlya Dryomov rbd_obj_request_submit(obj_request); 302702c74fbaSAlex Elder return; 302802c74fbaSAlex Elder } 302902c74fbaSAlex Elder 303002c74fbaSAlex Elder obj_request->result = img_result; 3031a9e8ba2cSAlex Elder if (obj_request->result) 3032a9e8ba2cSAlex Elder goto out; 3033a9e8ba2cSAlex Elder 3034a9e8ba2cSAlex Elder /* 3035a9e8ba2cSAlex Elder * We need to zero anything beyond the parent overlap 3036a9e8ba2cSAlex Elder * boundary. Since rbd_img_obj_request_read_callback() 3037a9e8ba2cSAlex Elder * will zero anything beyond the end of a short read, an 3038a9e8ba2cSAlex Elder * easy way to do this is to pretend the data from the 3039a9e8ba2cSAlex Elder * parent came up short--ending at the overlap boundary. 3040a9e8ba2cSAlex Elder */ 3041a9e8ba2cSAlex Elder rbd_assert(obj_request->img_offset < U64_MAX - obj_request->length); 3042a9e8ba2cSAlex Elder obj_end = obj_request->img_offset + obj_request->length; 3043a9e8ba2cSAlex Elder if (obj_end > rbd_dev->parent_overlap) { 3044a9e8ba2cSAlex Elder u64 xferred = 0; 3045a9e8ba2cSAlex Elder 3046a9e8ba2cSAlex Elder if (obj_request->img_offset < rbd_dev->parent_overlap) 3047a9e8ba2cSAlex Elder xferred = rbd_dev->parent_overlap - 3048a9e8ba2cSAlex Elder obj_request->img_offset; 3049a9e8ba2cSAlex Elder 305002c74fbaSAlex Elder obj_request->xferred = min(img_xferred, xferred); 3051a9e8ba2cSAlex Elder } else { 305202c74fbaSAlex Elder obj_request->xferred = img_xferred; 3053a9e8ba2cSAlex Elder } 3054a9e8ba2cSAlex Elder out: 30558b3e1a56SAlex Elder rbd_img_obj_request_read_callback(obj_request); 30568b3e1a56SAlex Elder rbd_obj_request_complete(obj_request); 30578b3e1a56SAlex Elder } 30588b3e1a56SAlex Elder 30598b3e1a56SAlex Elder static void rbd_img_parent_read(struct rbd_obj_request *obj_request) 30608b3e1a56SAlex Elder { 30618b3e1a56SAlex Elder struct rbd_img_request *img_request; 30628b3e1a56SAlex Elder int result; 30638b3e1a56SAlex Elder 30648b3e1a56SAlex Elder rbd_assert(obj_request_img_data_test(obj_request)); 30658b3e1a56SAlex Elder rbd_assert(obj_request->img_request != NULL); 30668b3e1a56SAlex Elder rbd_assert(obj_request->result == (s32) -ENOENT); 30675b2ab72dSAlex Elder rbd_assert(obj_request_type_valid(obj_request->type)); 30688b3e1a56SAlex Elder 30698b3e1a56SAlex Elder /* rbd_read_finish(obj_request, obj_request->length); */ 3070e93f3152SAlex Elder img_request = rbd_parent_request_create(obj_request, 30718b3e1a56SAlex Elder obj_request->img_offset, 3072e93f3152SAlex Elder obj_request->length); 30738b3e1a56SAlex Elder result = -ENOMEM; 30748b3e1a56SAlex Elder if (!img_request) 30758b3e1a56SAlex Elder goto out_err; 30768b3e1a56SAlex Elder 30775b2ab72dSAlex Elder if (obj_request->type == OBJ_REQUEST_BIO) 3078f1a4739fSAlex Elder result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO, 3079f1a4739fSAlex Elder obj_request->bio_list); 30805b2ab72dSAlex Elder else 30815b2ab72dSAlex Elder result = rbd_img_request_fill(img_request, OBJ_REQUEST_PAGES, 30825b2ab72dSAlex Elder obj_request->pages); 30838b3e1a56SAlex Elder if (result) 30848b3e1a56SAlex Elder goto out_err; 30858b3e1a56SAlex Elder 30868b3e1a56SAlex Elder img_request->callback = rbd_img_parent_read_callback; 30878b3e1a56SAlex Elder result = rbd_img_request_submit(img_request); 30888b3e1a56SAlex Elder if (result) 30898b3e1a56SAlex Elder goto out_err; 30908b3e1a56SAlex Elder 30918b3e1a56SAlex Elder return; 30928b3e1a56SAlex Elder out_err: 30938b3e1a56SAlex Elder if (img_request) 30948b3e1a56SAlex Elder rbd_img_request_put(img_request); 30958b3e1a56SAlex Elder obj_request->result = result; 30968b3e1a56SAlex Elder obj_request->xferred = 0; 30978b3e1a56SAlex Elder obj_request_done_set(obj_request); 30988b3e1a56SAlex Elder } 30998b3e1a56SAlex Elder 3100ed95b21aSIlya Dryomov static const struct rbd_client_id rbd_empty_cid; 3101ed95b21aSIlya Dryomov 3102ed95b21aSIlya Dryomov static bool rbd_cid_equal(const struct rbd_client_id *lhs, 3103ed95b21aSIlya Dryomov const struct rbd_client_id *rhs) 3104ed95b21aSIlya Dryomov { 3105ed95b21aSIlya Dryomov return lhs->gid == rhs->gid && lhs->handle == rhs->handle; 3106ed95b21aSIlya Dryomov } 3107ed95b21aSIlya Dryomov 3108ed95b21aSIlya Dryomov static struct rbd_client_id rbd_get_cid(struct rbd_device *rbd_dev) 3109ed95b21aSIlya Dryomov { 3110ed95b21aSIlya Dryomov struct rbd_client_id cid; 3111ed95b21aSIlya Dryomov 3112ed95b21aSIlya Dryomov mutex_lock(&rbd_dev->watch_mutex); 3113ed95b21aSIlya Dryomov cid.gid = ceph_client_gid(rbd_dev->rbd_client->client); 3114ed95b21aSIlya Dryomov cid.handle = rbd_dev->watch_cookie; 3115ed95b21aSIlya Dryomov mutex_unlock(&rbd_dev->watch_mutex); 3116ed95b21aSIlya Dryomov return cid; 3117ed95b21aSIlya Dryomov } 3118ed95b21aSIlya Dryomov 3119ed95b21aSIlya Dryomov /* 3120ed95b21aSIlya Dryomov * lock_rwsem must be held for write 3121ed95b21aSIlya Dryomov */ 3122ed95b21aSIlya Dryomov static void rbd_set_owner_cid(struct rbd_device *rbd_dev, 3123ed95b21aSIlya Dryomov const struct rbd_client_id *cid) 3124ed95b21aSIlya Dryomov { 3125ed95b21aSIlya Dryomov dout("%s rbd_dev %p %llu-%llu -> %llu-%llu\n", __func__, rbd_dev, 3126ed95b21aSIlya Dryomov rbd_dev->owner_cid.gid, rbd_dev->owner_cid.handle, 3127ed95b21aSIlya Dryomov cid->gid, cid->handle); 3128ed95b21aSIlya Dryomov rbd_dev->owner_cid = *cid; /* struct */ 3129ed95b21aSIlya Dryomov } 3130ed95b21aSIlya Dryomov 3131ed95b21aSIlya Dryomov static void format_lock_cookie(struct rbd_device *rbd_dev, char *buf) 3132ed95b21aSIlya Dryomov { 3133ed95b21aSIlya Dryomov mutex_lock(&rbd_dev->watch_mutex); 3134ed95b21aSIlya Dryomov sprintf(buf, "%s %llu", RBD_LOCK_COOKIE_PREFIX, rbd_dev->watch_cookie); 3135ed95b21aSIlya Dryomov mutex_unlock(&rbd_dev->watch_mutex); 3136ed95b21aSIlya Dryomov } 3137ed95b21aSIlya Dryomov 3138ed95b21aSIlya Dryomov /* 3139ed95b21aSIlya Dryomov * lock_rwsem must be held for write 3140ed95b21aSIlya Dryomov */ 3141ed95b21aSIlya Dryomov static int rbd_lock(struct rbd_device *rbd_dev) 3142ed95b21aSIlya Dryomov { 3143ed95b21aSIlya Dryomov struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 3144ed95b21aSIlya Dryomov struct rbd_client_id cid = rbd_get_cid(rbd_dev); 3145ed95b21aSIlya Dryomov char cookie[32]; 3146ed95b21aSIlya Dryomov int ret; 3147ed95b21aSIlya Dryomov 3148ed95b21aSIlya Dryomov WARN_ON(__rbd_is_lock_owner(rbd_dev)); 3149ed95b21aSIlya Dryomov 3150ed95b21aSIlya Dryomov format_lock_cookie(rbd_dev, cookie); 3151ed95b21aSIlya Dryomov ret = ceph_cls_lock(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc, 3152ed95b21aSIlya Dryomov RBD_LOCK_NAME, CEPH_CLS_LOCK_EXCLUSIVE, cookie, 3153ed95b21aSIlya Dryomov RBD_LOCK_TAG, "", 0); 3154ed95b21aSIlya Dryomov if (ret) 3155ed95b21aSIlya Dryomov return ret; 3156ed95b21aSIlya Dryomov 3157ed95b21aSIlya Dryomov rbd_dev->lock_state = RBD_LOCK_STATE_LOCKED; 3158ed95b21aSIlya Dryomov rbd_set_owner_cid(rbd_dev, &cid); 3159ed95b21aSIlya Dryomov queue_work(rbd_dev->task_wq, &rbd_dev->acquired_lock_work); 3160ed95b21aSIlya Dryomov return 0; 3161ed95b21aSIlya Dryomov } 3162ed95b21aSIlya Dryomov 3163ed95b21aSIlya Dryomov /* 3164ed95b21aSIlya Dryomov * lock_rwsem must be held for write 3165ed95b21aSIlya Dryomov */ 3166ed95b21aSIlya Dryomov static int rbd_unlock(struct rbd_device *rbd_dev) 3167ed95b21aSIlya Dryomov { 3168ed95b21aSIlya Dryomov struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 3169ed95b21aSIlya Dryomov char cookie[32]; 3170ed95b21aSIlya Dryomov int ret; 3171ed95b21aSIlya Dryomov 3172ed95b21aSIlya Dryomov WARN_ON(!__rbd_is_lock_owner(rbd_dev)); 3173ed95b21aSIlya Dryomov 3174ed95b21aSIlya Dryomov rbd_dev->lock_state = RBD_LOCK_STATE_UNLOCKED; 3175ed95b21aSIlya Dryomov 3176ed95b21aSIlya Dryomov format_lock_cookie(rbd_dev, cookie); 3177ed95b21aSIlya Dryomov ret = ceph_cls_unlock(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc, 3178ed95b21aSIlya Dryomov RBD_LOCK_NAME, cookie); 3179ed95b21aSIlya Dryomov if (ret && ret != -ENOENT) { 3180ed95b21aSIlya Dryomov rbd_warn(rbd_dev, "cls_unlock failed: %d", ret); 3181ed95b21aSIlya Dryomov return ret; 3182ed95b21aSIlya Dryomov } 3183ed95b21aSIlya Dryomov 3184ed95b21aSIlya Dryomov rbd_set_owner_cid(rbd_dev, &rbd_empty_cid); 3185ed95b21aSIlya Dryomov queue_work(rbd_dev->task_wq, &rbd_dev->released_lock_work); 3186ed95b21aSIlya Dryomov return 0; 3187ed95b21aSIlya Dryomov } 3188ed95b21aSIlya Dryomov 3189ed95b21aSIlya Dryomov static int __rbd_notify_op_lock(struct rbd_device *rbd_dev, 3190ed95b21aSIlya Dryomov enum rbd_notify_op notify_op, 3191ed95b21aSIlya Dryomov struct page ***preply_pages, 3192ed95b21aSIlya Dryomov size_t *preply_len) 3193ed95b21aSIlya Dryomov { 3194ed95b21aSIlya Dryomov struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 3195ed95b21aSIlya Dryomov struct rbd_client_id cid = rbd_get_cid(rbd_dev); 3196ed95b21aSIlya Dryomov int buf_size = 4 + 8 + 8 + CEPH_ENCODING_START_BLK_LEN; 3197ed95b21aSIlya Dryomov char buf[buf_size]; 3198ed95b21aSIlya Dryomov void *p = buf; 3199ed95b21aSIlya Dryomov 3200ed95b21aSIlya Dryomov dout("%s rbd_dev %p notify_op %d\n", __func__, rbd_dev, notify_op); 3201ed95b21aSIlya Dryomov 3202ed95b21aSIlya Dryomov /* encode *LockPayload NotifyMessage (op + ClientId) */ 3203ed95b21aSIlya Dryomov ceph_start_encoding(&p, 2, 1, buf_size - CEPH_ENCODING_START_BLK_LEN); 3204ed95b21aSIlya Dryomov ceph_encode_32(&p, notify_op); 3205ed95b21aSIlya Dryomov ceph_encode_64(&p, cid.gid); 3206ed95b21aSIlya Dryomov ceph_encode_64(&p, cid.handle); 3207ed95b21aSIlya Dryomov 3208ed95b21aSIlya Dryomov return ceph_osdc_notify(osdc, &rbd_dev->header_oid, 3209ed95b21aSIlya Dryomov &rbd_dev->header_oloc, buf, buf_size, 3210ed95b21aSIlya Dryomov RBD_NOTIFY_TIMEOUT, preply_pages, preply_len); 3211ed95b21aSIlya Dryomov } 3212ed95b21aSIlya Dryomov 3213ed95b21aSIlya Dryomov static void rbd_notify_op_lock(struct rbd_device *rbd_dev, 3214ed95b21aSIlya Dryomov enum rbd_notify_op notify_op) 3215ed95b21aSIlya Dryomov { 3216ed95b21aSIlya Dryomov struct page **reply_pages; 3217ed95b21aSIlya Dryomov size_t reply_len; 3218ed95b21aSIlya Dryomov 3219ed95b21aSIlya Dryomov __rbd_notify_op_lock(rbd_dev, notify_op, &reply_pages, &reply_len); 3220ed95b21aSIlya Dryomov ceph_release_page_vector(reply_pages, calc_pages_for(0, reply_len)); 3221ed95b21aSIlya Dryomov } 3222ed95b21aSIlya Dryomov 3223ed95b21aSIlya Dryomov static void rbd_notify_acquired_lock(struct work_struct *work) 3224ed95b21aSIlya Dryomov { 3225ed95b21aSIlya Dryomov struct rbd_device *rbd_dev = container_of(work, struct rbd_device, 3226ed95b21aSIlya Dryomov acquired_lock_work); 3227ed95b21aSIlya Dryomov 3228ed95b21aSIlya Dryomov rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_ACQUIRED_LOCK); 3229ed95b21aSIlya Dryomov } 3230ed95b21aSIlya Dryomov 3231ed95b21aSIlya Dryomov static void rbd_notify_released_lock(struct work_struct *work) 3232ed95b21aSIlya Dryomov { 3233ed95b21aSIlya Dryomov struct rbd_device *rbd_dev = container_of(work, struct rbd_device, 3234ed95b21aSIlya Dryomov released_lock_work); 3235ed95b21aSIlya Dryomov 3236ed95b21aSIlya Dryomov rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_RELEASED_LOCK); 3237ed95b21aSIlya Dryomov } 3238ed95b21aSIlya Dryomov 3239ed95b21aSIlya Dryomov static int rbd_request_lock(struct rbd_device *rbd_dev) 3240ed95b21aSIlya Dryomov { 3241ed95b21aSIlya Dryomov struct page **reply_pages; 3242ed95b21aSIlya Dryomov size_t reply_len; 3243ed95b21aSIlya Dryomov bool lock_owner_responded = false; 3244ed95b21aSIlya Dryomov int ret; 3245ed95b21aSIlya Dryomov 3246ed95b21aSIlya Dryomov dout("%s rbd_dev %p\n", __func__, rbd_dev); 3247ed95b21aSIlya Dryomov 3248ed95b21aSIlya Dryomov ret = __rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_REQUEST_LOCK, 3249ed95b21aSIlya Dryomov &reply_pages, &reply_len); 3250ed95b21aSIlya Dryomov if (ret && ret != -ETIMEDOUT) { 3251ed95b21aSIlya Dryomov rbd_warn(rbd_dev, "failed to request lock: %d", ret); 3252ed95b21aSIlya Dryomov goto out; 3253ed95b21aSIlya Dryomov } 3254ed95b21aSIlya Dryomov 3255ed95b21aSIlya Dryomov if (reply_len > 0 && reply_len <= PAGE_SIZE) { 3256ed95b21aSIlya Dryomov void *p = page_address(reply_pages[0]); 3257ed95b21aSIlya Dryomov void *const end = p + reply_len; 3258ed95b21aSIlya Dryomov u32 n; 3259ed95b21aSIlya Dryomov 3260ed95b21aSIlya Dryomov ceph_decode_32_safe(&p, end, n, e_inval); /* num_acks */ 3261ed95b21aSIlya Dryomov while (n--) { 3262ed95b21aSIlya Dryomov u8 struct_v; 3263ed95b21aSIlya Dryomov u32 len; 3264ed95b21aSIlya Dryomov 3265ed95b21aSIlya Dryomov ceph_decode_need(&p, end, 8 + 8, e_inval); 3266ed95b21aSIlya Dryomov p += 8 + 8; /* skip gid and cookie */ 3267ed95b21aSIlya Dryomov 3268ed95b21aSIlya Dryomov ceph_decode_32_safe(&p, end, len, e_inval); 3269ed95b21aSIlya Dryomov if (!len) 3270ed95b21aSIlya Dryomov continue; 3271ed95b21aSIlya Dryomov 3272ed95b21aSIlya Dryomov if (lock_owner_responded) { 3273ed95b21aSIlya Dryomov rbd_warn(rbd_dev, 3274ed95b21aSIlya Dryomov "duplicate lock owners detected"); 3275ed95b21aSIlya Dryomov ret = -EIO; 3276ed95b21aSIlya Dryomov goto out; 3277ed95b21aSIlya Dryomov } 3278ed95b21aSIlya Dryomov 3279ed95b21aSIlya Dryomov lock_owner_responded = true; 3280ed95b21aSIlya Dryomov ret = ceph_start_decoding(&p, end, 1, "ResponseMessage", 3281ed95b21aSIlya Dryomov &struct_v, &len); 3282ed95b21aSIlya Dryomov if (ret) { 3283ed95b21aSIlya Dryomov rbd_warn(rbd_dev, 3284ed95b21aSIlya Dryomov "failed to decode ResponseMessage: %d", 3285ed95b21aSIlya Dryomov ret); 3286ed95b21aSIlya Dryomov goto e_inval; 3287ed95b21aSIlya Dryomov } 3288ed95b21aSIlya Dryomov 3289ed95b21aSIlya Dryomov ret = ceph_decode_32(&p); 3290ed95b21aSIlya Dryomov } 3291ed95b21aSIlya Dryomov } 3292ed95b21aSIlya Dryomov 3293ed95b21aSIlya Dryomov if (!lock_owner_responded) { 3294ed95b21aSIlya Dryomov rbd_warn(rbd_dev, "no lock owners detected"); 3295ed95b21aSIlya Dryomov ret = -ETIMEDOUT; 3296ed95b21aSIlya Dryomov } 3297ed95b21aSIlya Dryomov 3298ed95b21aSIlya Dryomov out: 3299ed95b21aSIlya Dryomov ceph_release_page_vector(reply_pages, calc_pages_for(0, reply_len)); 3300ed95b21aSIlya Dryomov return ret; 3301ed95b21aSIlya Dryomov 3302ed95b21aSIlya Dryomov e_inval: 3303ed95b21aSIlya Dryomov ret = -EINVAL; 3304ed95b21aSIlya Dryomov goto out; 3305ed95b21aSIlya Dryomov } 3306ed95b21aSIlya Dryomov 3307ed95b21aSIlya Dryomov static void wake_requests(struct rbd_device *rbd_dev, bool wake_all) 3308ed95b21aSIlya Dryomov { 3309ed95b21aSIlya Dryomov dout("%s rbd_dev %p wake_all %d\n", __func__, rbd_dev, wake_all); 3310ed95b21aSIlya Dryomov 3311ed95b21aSIlya Dryomov cancel_delayed_work(&rbd_dev->lock_dwork); 3312ed95b21aSIlya Dryomov if (wake_all) 3313ed95b21aSIlya Dryomov wake_up_all(&rbd_dev->lock_waitq); 3314ed95b21aSIlya Dryomov else 3315ed95b21aSIlya Dryomov wake_up(&rbd_dev->lock_waitq); 3316ed95b21aSIlya Dryomov } 3317ed95b21aSIlya Dryomov 3318ed95b21aSIlya Dryomov static int get_lock_owner_info(struct rbd_device *rbd_dev, 3319ed95b21aSIlya Dryomov struct ceph_locker **lockers, u32 *num_lockers) 3320ed95b21aSIlya Dryomov { 3321ed95b21aSIlya Dryomov struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 3322ed95b21aSIlya Dryomov u8 lock_type; 3323ed95b21aSIlya Dryomov char *lock_tag; 3324ed95b21aSIlya Dryomov int ret; 3325ed95b21aSIlya Dryomov 3326ed95b21aSIlya Dryomov dout("%s rbd_dev %p\n", __func__, rbd_dev); 3327ed95b21aSIlya Dryomov 3328ed95b21aSIlya Dryomov ret = ceph_cls_lock_info(osdc, &rbd_dev->header_oid, 3329ed95b21aSIlya Dryomov &rbd_dev->header_oloc, RBD_LOCK_NAME, 3330ed95b21aSIlya Dryomov &lock_type, &lock_tag, lockers, num_lockers); 3331ed95b21aSIlya Dryomov if (ret) 3332ed95b21aSIlya Dryomov return ret; 3333ed95b21aSIlya Dryomov 3334ed95b21aSIlya Dryomov if (*num_lockers == 0) { 3335ed95b21aSIlya Dryomov dout("%s rbd_dev %p no lockers detected\n", __func__, rbd_dev); 3336ed95b21aSIlya Dryomov goto out; 3337ed95b21aSIlya Dryomov } 3338ed95b21aSIlya Dryomov 3339ed95b21aSIlya Dryomov if (strcmp(lock_tag, RBD_LOCK_TAG)) { 3340ed95b21aSIlya Dryomov rbd_warn(rbd_dev, "locked by external mechanism, tag %s", 3341ed95b21aSIlya Dryomov lock_tag); 3342ed95b21aSIlya Dryomov ret = -EBUSY; 3343ed95b21aSIlya Dryomov goto out; 3344ed95b21aSIlya Dryomov } 3345ed95b21aSIlya Dryomov 3346ed95b21aSIlya Dryomov if (lock_type == CEPH_CLS_LOCK_SHARED) { 3347ed95b21aSIlya Dryomov rbd_warn(rbd_dev, "shared lock type detected"); 3348ed95b21aSIlya Dryomov ret = -EBUSY; 3349ed95b21aSIlya Dryomov goto out; 3350ed95b21aSIlya Dryomov } 3351ed95b21aSIlya Dryomov 3352ed95b21aSIlya Dryomov if (strncmp((*lockers)[0].id.cookie, RBD_LOCK_COOKIE_PREFIX, 3353ed95b21aSIlya Dryomov strlen(RBD_LOCK_COOKIE_PREFIX))) { 3354ed95b21aSIlya Dryomov rbd_warn(rbd_dev, "locked by external mechanism, cookie %s", 3355ed95b21aSIlya Dryomov (*lockers)[0].id.cookie); 3356ed95b21aSIlya Dryomov ret = -EBUSY; 3357ed95b21aSIlya Dryomov goto out; 3358ed95b21aSIlya Dryomov } 3359ed95b21aSIlya Dryomov 3360ed95b21aSIlya Dryomov out: 3361ed95b21aSIlya Dryomov kfree(lock_tag); 3362ed95b21aSIlya Dryomov return ret; 3363ed95b21aSIlya Dryomov } 3364ed95b21aSIlya Dryomov 3365ed95b21aSIlya Dryomov static int find_watcher(struct rbd_device *rbd_dev, 3366ed95b21aSIlya Dryomov const struct ceph_locker *locker) 3367ed95b21aSIlya Dryomov { 3368ed95b21aSIlya Dryomov struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 3369ed95b21aSIlya Dryomov struct ceph_watch_item *watchers; 3370ed95b21aSIlya Dryomov u32 num_watchers; 3371ed95b21aSIlya Dryomov u64 cookie; 3372ed95b21aSIlya Dryomov int i; 3373ed95b21aSIlya Dryomov int ret; 3374ed95b21aSIlya Dryomov 3375ed95b21aSIlya Dryomov ret = ceph_osdc_list_watchers(osdc, &rbd_dev->header_oid, 3376ed95b21aSIlya Dryomov &rbd_dev->header_oloc, &watchers, 3377ed95b21aSIlya Dryomov &num_watchers); 3378ed95b21aSIlya Dryomov if (ret) 3379ed95b21aSIlya Dryomov return ret; 3380ed95b21aSIlya Dryomov 3381ed95b21aSIlya Dryomov sscanf(locker->id.cookie, RBD_LOCK_COOKIE_PREFIX " %llu", &cookie); 3382ed95b21aSIlya Dryomov for (i = 0; i < num_watchers; i++) { 3383ed95b21aSIlya Dryomov if (!memcmp(&watchers[i].addr, &locker->info.addr, 3384ed95b21aSIlya Dryomov sizeof(locker->info.addr)) && 3385ed95b21aSIlya Dryomov watchers[i].cookie == cookie) { 3386ed95b21aSIlya Dryomov struct rbd_client_id cid = { 3387ed95b21aSIlya Dryomov .gid = le64_to_cpu(watchers[i].name.num), 3388ed95b21aSIlya Dryomov .handle = cookie, 3389ed95b21aSIlya Dryomov }; 3390ed95b21aSIlya Dryomov 3391ed95b21aSIlya Dryomov dout("%s rbd_dev %p found cid %llu-%llu\n", __func__, 3392ed95b21aSIlya Dryomov rbd_dev, cid.gid, cid.handle); 3393ed95b21aSIlya Dryomov rbd_set_owner_cid(rbd_dev, &cid); 3394ed95b21aSIlya Dryomov ret = 1; 3395ed95b21aSIlya Dryomov goto out; 3396ed95b21aSIlya Dryomov } 3397ed95b21aSIlya Dryomov } 3398ed95b21aSIlya Dryomov 3399ed95b21aSIlya Dryomov dout("%s rbd_dev %p no watchers\n", __func__, rbd_dev); 3400ed95b21aSIlya Dryomov ret = 0; 3401ed95b21aSIlya Dryomov out: 3402ed95b21aSIlya Dryomov kfree(watchers); 3403ed95b21aSIlya Dryomov return ret; 3404ed95b21aSIlya Dryomov } 3405ed95b21aSIlya Dryomov 3406ed95b21aSIlya Dryomov /* 3407ed95b21aSIlya Dryomov * lock_rwsem must be held for write 3408ed95b21aSIlya Dryomov */ 3409ed95b21aSIlya Dryomov static int rbd_try_lock(struct rbd_device *rbd_dev) 3410ed95b21aSIlya Dryomov { 3411ed95b21aSIlya Dryomov struct ceph_client *client = rbd_dev->rbd_client->client; 3412ed95b21aSIlya Dryomov struct ceph_locker *lockers; 3413ed95b21aSIlya Dryomov u32 num_lockers; 3414ed95b21aSIlya Dryomov int ret; 3415ed95b21aSIlya Dryomov 3416ed95b21aSIlya Dryomov for (;;) { 3417ed95b21aSIlya Dryomov ret = rbd_lock(rbd_dev); 3418ed95b21aSIlya Dryomov if (ret != -EBUSY) 3419ed95b21aSIlya Dryomov return ret; 3420ed95b21aSIlya Dryomov 3421ed95b21aSIlya Dryomov /* determine if the current lock holder is still alive */ 3422ed95b21aSIlya Dryomov ret = get_lock_owner_info(rbd_dev, &lockers, &num_lockers); 3423ed95b21aSIlya Dryomov if (ret) 3424ed95b21aSIlya Dryomov return ret; 3425ed95b21aSIlya Dryomov 3426ed95b21aSIlya Dryomov if (num_lockers == 0) 3427ed95b21aSIlya Dryomov goto again; 3428ed95b21aSIlya Dryomov 3429ed95b21aSIlya Dryomov ret = find_watcher(rbd_dev, lockers); 3430ed95b21aSIlya Dryomov if (ret) { 3431ed95b21aSIlya Dryomov if (ret > 0) 3432ed95b21aSIlya Dryomov ret = 0; /* have to request lock */ 3433ed95b21aSIlya Dryomov goto out; 3434ed95b21aSIlya Dryomov } 3435ed95b21aSIlya Dryomov 3436ed95b21aSIlya Dryomov rbd_warn(rbd_dev, "%s%llu seems dead, breaking lock", 3437ed95b21aSIlya Dryomov ENTITY_NAME(lockers[0].id.name)); 3438ed95b21aSIlya Dryomov 3439ed95b21aSIlya Dryomov ret = ceph_monc_blacklist_add(&client->monc, 3440ed95b21aSIlya Dryomov &lockers[0].info.addr); 3441ed95b21aSIlya Dryomov if (ret) { 3442ed95b21aSIlya Dryomov rbd_warn(rbd_dev, "blacklist of %s%llu failed: %d", 3443ed95b21aSIlya Dryomov ENTITY_NAME(lockers[0].id.name), ret); 3444ed95b21aSIlya Dryomov goto out; 3445ed95b21aSIlya Dryomov } 3446ed95b21aSIlya Dryomov 3447ed95b21aSIlya Dryomov ret = ceph_cls_break_lock(&client->osdc, &rbd_dev->header_oid, 3448ed95b21aSIlya Dryomov &rbd_dev->header_oloc, RBD_LOCK_NAME, 3449ed95b21aSIlya Dryomov lockers[0].id.cookie, 3450ed95b21aSIlya Dryomov &lockers[0].id.name); 3451ed95b21aSIlya Dryomov if (ret && ret != -ENOENT) 3452ed95b21aSIlya Dryomov goto out; 3453ed95b21aSIlya Dryomov 3454ed95b21aSIlya Dryomov again: 3455ed95b21aSIlya Dryomov ceph_free_lockers(lockers, num_lockers); 3456ed95b21aSIlya Dryomov } 3457ed95b21aSIlya Dryomov 3458ed95b21aSIlya Dryomov out: 3459ed95b21aSIlya Dryomov ceph_free_lockers(lockers, num_lockers); 3460ed95b21aSIlya Dryomov return ret; 3461ed95b21aSIlya Dryomov } 3462ed95b21aSIlya Dryomov 3463ed95b21aSIlya Dryomov /* 3464ed95b21aSIlya Dryomov * ret is set only if lock_state is RBD_LOCK_STATE_UNLOCKED 3465ed95b21aSIlya Dryomov */ 3466ed95b21aSIlya Dryomov static enum rbd_lock_state rbd_try_acquire_lock(struct rbd_device *rbd_dev, 3467ed95b21aSIlya Dryomov int *pret) 3468ed95b21aSIlya Dryomov { 3469ed95b21aSIlya Dryomov enum rbd_lock_state lock_state; 3470ed95b21aSIlya Dryomov 3471ed95b21aSIlya Dryomov down_read(&rbd_dev->lock_rwsem); 3472ed95b21aSIlya Dryomov dout("%s rbd_dev %p read lock_state %d\n", __func__, rbd_dev, 3473ed95b21aSIlya Dryomov rbd_dev->lock_state); 3474ed95b21aSIlya Dryomov if (__rbd_is_lock_owner(rbd_dev)) { 3475ed95b21aSIlya Dryomov lock_state = rbd_dev->lock_state; 3476ed95b21aSIlya Dryomov up_read(&rbd_dev->lock_rwsem); 3477ed95b21aSIlya Dryomov return lock_state; 3478ed95b21aSIlya Dryomov } 3479ed95b21aSIlya Dryomov 3480ed95b21aSIlya Dryomov up_read(&rbd_dev->lock_rwsem); 3481ed95b21aSIlya Dryomov down_write(&rbd_dev->lock_rwsem); 3482ed95b21aSIlya Dryomov dout("%s rbd_dev %p write lock_state %d\n", __func__, rbd_dev, 3483ed95b21aSIlya Dryomov rbd_dev->lock_state); 3484ed95b21aSIlya Dryomov if (!__rbd_is_lock_owner(rbd_dev)) { 3485ed95b21aSIlya Dryomov *pret = rbd_try_lock(rbd_dev); 3486ed95b21aSIlya Dryomov if (*pret) 3487ed95b21aSIlya Dryomov rbd_warn(rbd_dev, "failed to acquire lock: %d", *pret); 3488ed95b21aSIlya Dryomov } 3489ed95b21aSIlya Dryomov 3490ed95b21aSIlya Dryomov lock_state = rbd_dev->lock_state; 3491ed95b21aSIlya Dryomov up_write(&rbd_dev->lock_rwsem); 3492ed95b21aSIlya Dryomov return lock_state; 3493ed95b21aSIlya Dryomov } 3494ed95b21aSIlya Dryomov 3495ed95b21aSIlya Dryomov static void rbd_acquire_lock(struct work_struct *work) 3496ed95b21aSIlya Dryomov { 3497ed95b21aSIlya Dryomov struct rbd_device *rbd_dev = container_of(to_delayed_work(work), 3498ed95b21aSIlya Dryomov struct rbd_device, lock_dwork); 3499ed95b21aSIlya Dryomov enum rbd_lock_state lock_state; 3500ed95b21aSIlya Dryomov int ret; 3501ed95b21aSIlya Dryomov 3502ed95b21aSIlya Dryomov dout("%s rbd_dev %p\n", __func__, rbd_dev); 3503ed95b21aSIlya Dryomov again: 3504ed95b21aSIlya Dryomov lock_state = rbd_try_acquire_lock(rbd_dev, &ret); 3505ed95b21aSIlya Dryomov if (lock_state != RBD_LOCK_STATE_UNLOCKED || ret == -EBLACKLISTED) { 3506ed95b21aSIlya Dryomov if (lock_state == RBD_LOCK_STATE_LOCKED) 3507ed95b21aSIlya Dryomov wake_requests(rbd_dev, true); 3508ed95b21aSIlya Dryomov dout("%s rbd_dev %p lock_state %d ret %d - done\n", __func__, 3509ed95b21aSIlya Dryomov rbd_dev, lock_state, ret); 3510ed95b21aSIlya Dryomov return; 3511ed95b21aSIlya Dryomov } 3512ed95b21aSIlya Dryomov 3513ed95b21aSIlya Dryomov ret = rbd_request_lock(rbd_dev); 3514ed95b21aSIlya Dryomov if (ret == -ETIMEDOUT) { 3515ed95b21aSIlya Dryomov goto again; /* treat this as a dead client */ 3516ed95b21aSIlya Dryomov } else if (ret < 0) { 3517ed95b21aSIlya Dryomov rbd_warn(rbd_dev, "error requesting lock: %d", ret); 3518ed95b21aSIlya Dryomov mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 3519ed95b21aSIlya Dryomov RBD_RETRY_DELAY); 3520ed95b21aSIlya Dryomov } else { 3521ed95b21aSIlya Dryomov /* 3522ed95b21aSIlya Dryomov * lock owner acked, but resend if we don't see them 3523ed95b21aSIlya Dryomov * release the lock 3524ed95b21aSIlya Dryomov */ 3525ed95b21aSIlya Dryomov dout("%s rbd_dev %p requeueing lock_dwork\n", __func__, 3526ed95b21aSIlya Dryomov rbd_dev); 3527ed95b21aSIlya Dryomov mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 3528ed95b21aSIlya Dryomov msecs_to_jiffies(2 * RBD_NOTIFY_TIMEOUT * MSEC_PER_SEC)); 3529ed95b21aSIlya Dryomov } 3530ed95b21aSIlya Dryomov } 3531ed95b21aSIlya Dryomov 3532ed95b21aSIlya Dryomov /* 3533ed95b21aSIlya Dryomov * lock_rwsem must be held for write 3534ed95b21aSIlya Dryomov */ 3535ed95b21aSIlya Dryomov static bool rbd_release_lock(struct rbd_device *rbd_dev) 3536ed95b21aSIlya Dryomov { 3537ed95b21aSIlya Dryomov dout("%s rbd_dev %p read lock_state %d\n", __func__, rbd_dev, 3538ed95b21aSIlya Dryomov rbd_dev->lock_state); 3539ed95b21aSIlya Dryomov if (rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED) 3540ed95b21aSIlya Dryomov return false; 3541ed95b21aSIlya Dryomov 3542ed95b21aSIlya Dryomov rbd_dev->lock_state = RBD_LOCK_STATE_RELEASING; 3543ed95b21aSIlya Dryomov downgrade_write(&rbd_dev->lock_rwsem); 3544ed95b21aSIlya Dryomov /* 3545ed95b21aSIlya Dryomov * Ensure that all in-flight IO is flushed. 3546ed95b21aSIlya Dryomov * 3547ed95b21aSIlya Dryomov * FIXME: ceph_osdc_sync() flushes the entire OSD client, which 3548ed95b21aSIlya Dryomov * may be shared with other devices. 3549ed95b21aSIlya Dryomov */ 3550ed95b21aSIlya Dryomov ceph_osdc_sync(&rbd_dev->rbd_client->client->osdc); 3551ed95b21aSIlya Dryomov up_read(&rbd_dev->lock_rwsem); 3552ed95b21aSIlya Dryomov 3553ed95b21aSIlya Dryomov down_write(&rbd_dev->lock_rwsem); 3554ed95b21aSIlya Dryomov dout("%s rbd_dev %p write lock_state %d\n", __func__, rbd_dev, 3555ed95b21aSIlya Dryomov rbd_dev->lock_state); 3556ed95b21aSIlya Dryomov if (rbd_dev->lock_state != RBD_LOCK_STATE_RELEASING) 3557ed95b21aSIlya Dryomov return false; 3558ed95b21aSIlya Dryomov 3559ed95b21aSIlya Dryomov if (!rbd_unlock(rbd_dev)) 3560ed95b21aSIlya Dryomov /* 3561ed95b21aSIlya Dryomov * Give others a chance to grab the lock - we would re-acquire 3562ed95b21aSIlya Dryomov * almost immediately if we got new IO during ceph_osdc_sync() 3563ed95b21aSIlya Dryomov * otherwise. We need to ack our own notifications, so this 3564ed95b21aSIlya Dryomov * lock_dwork will be requeued from rbd_wait_state_locked() 3565ed95b21aSIlya Dryomov * after wake_requests() in rbd_handle_released_lock(). 3566ed95b21aSIlya Dryomov */ 3567ed95b21aSIlya Dryomov cancel_delayed_work(&rbd_dev->lock_dwork); 3568ed95b21aSIlya Dryomov 3569ed95b21aSIlya Dryomov return true; 3570ed95b21aSIlya Dryomov } 3571ed95b21aSIlya Dryomov 3572ed95b21aSIlya Dryomov static void rbd_release_lock_work(struct work_struct *work) 3573ed95b21aSIlya Dryomov { 3574ed95b21aSIlya Dryomov struct rbd_device *rbd_dev = container_of(work, struct rbd_device, 3575ed95b21aSIlya Dryomov unlock_work); 3576ed95b21aSIlya Dryomov 3577ed95b21aSIlya Dryomov down_write(&rbd_dev->lock_rwsem); 3578ed95b21aSIlya Dryomov rbd_release_lock(rbd_dev); 3579ed95b21aSIlya Dryomov up_write(&rbd_dev->lock_rwsem); 3580ed95b21aSIlya Dryomov } 3581ed95b21aSIlya Dryomov 3582ed95b21aSIlya Dryomov static void rbd_handle_acquired_lock(struct rbd_device *rbd_dev, u8 struct_v, 3583ed95b21aSIlya Dryomov void **p) 3584ed95b21aSIlya Dryomov { 3585ed95b21aSIlya Dryomov struct rbd_client_id cid = { 0 }; 3586ed95b21aSIlya Dryomov 3587ed95b21aSIlya Dryomov if (struct_v >= 2) { 3588ed95b21aSIlya Dryomov cid.gid = ceph_decode_64(p); 3589ed95b21aSIlya Dryomov cid.handle = ceph_decode_64(p); 3590ed95b21aSIlya Dryomov } 3591ed95b21aSIlya Dryomov 3592ed95b21aSIlya Dryomov dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid, 3593ed95b21aSIlya Dryomov cid.handle); 3594ed95b21aSIlya Dryomov if (!rbd_cid_equal(&cid, &rbd_empty_cid)) { 3595ed95b21aSIlya Dryomov down_write(&rbd_dev->lock_rwsem); 3596ed95b21aSIlya Dryomov if (rbd_cid_equal(&cid, &rbd_dev->owner_cid)) { 3597ed95b21aSIlya Dryomov /* 3598ed95b21aSIlya Dryomov * we already know that the remote client is 3599ed95b21aSIlya Dryomov * the owner 3600ed95b21aSIlya Dryomov */ 3601ed95b21aSIlya Dryomov up_write(&rbd_dev->lock_rwsem); 3602ed95b21aSIlya Dryomov return; 3603ed95b21aSIlya Dryomov } 3604ed95b21aSIlya Dryomov 3605ed95b21aSIlya Dryomov rbd_set_owner_cid(rbd_dev, &cid); 3606ed95b21aSIlya Dryomov downgrade_write(&rbd_dev->lock_rwsem); 3607ed95b21aSIlya Dryomov } else { 3608ed95b21aSIlya Dryomov down_read(&rbd_dev->lock_rwsem); 3609ed95b21aSIlya Dryomov } 3610ed95b21aSIlya Dryomov 3611ed95b21aSIlya Dryomov if (!__rbd_is_lock_owner(rbd_dev)) 3612ed95b21aSIlya Dryomov wake_requests(rbd_dev, false); 3613ed95b21aSIlya Dryomov up_read(&rbd_dev->lock_rwsem); 3614ed95b21aSIlya Dryomov } 3615ed95b21aSIlya Dryomov 3616ed95b21aSIlya Dryomov static void rbd_handle_released_lock(struct rbd_device *rbd_dev, u8 struct_v, 3617ed95b21aSIlya Dryomov void **p) 3618ed95b21aSIlya Dryomov { 3619ed95b21aSIlya Dryomov struct rbd_client_id cid = { 0 }; 3620ed95b21aSIlya Dryomov 3621ed95b21aSIlya Dryomov if (struct_v >= 2) { 3622ed95b21aSIlya Dryomov cid.gid = ceph_decode_64(p); 3623ed95b21aSIlya Dryomov cid.handle = ceph_decode_64(p); 3624ed95b21aSIlya Dryomov } 3625ed95b21aSIlya Dryomov 3626ed95b21aSIlya Dryomov dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid, 3627ed95b21aSIlya Dryomov cid.handle); 3628ed95b21aSIlya Dryomov if (!rbd_cid_equal(&cid, &rbd_empty_cid)) { 3629ed95b21aSIlya Dryomov down_write(&rbd_dev->lock_rwsem); 3630ed95b21aSIlya Dryomov if (!rbd_cid_equal(&cid, &rbd_dev->owner_cid)) { 3631ed95b21aSIlya Dryomov dout("%s rbd_dev %p unexpected owner, cid %llu-%llu != owner_cid %llu-%llu\n", 3632ed95b21aSIlya Dryomov __func__, rbd_dev, cid.gid, cid.handle, 3633ed95b21aSIlya Dryomov rbd_dev->owner_cid.gid, rbd_dev->owner_cid.handle); 3634ed95b21aSIlya Dryomov up_write(&rbd_dev->lock_rwsem); 3635ed95b21aSIlya Dryomov return; 3636ed95b21aSIlya Dryomov } 3637ed95b21aSIlya Dryomov 3638ed95b21aSIlya Dryomov rbd_set_owner_cid(rbd_dev, &rbd_empty_cid); 3639ed95b21aSIlya Dryomov downgrade_write(&rbd_dev->lock_rwsem); 3640ed95b21aSIlya Dryomov } else { 3641ed95b21aSIlya Dryomov down_read(&rbd_dev->lock_rwsem); 3642ed95b21aSIlya Dryomov } 3643ed95b21aSIlya Dryomov 3644ed95b21aSIlya Dryomov if (!__rbd_is_lock_owner(rbd_dev)) 3645ed95b21aSIlya Dryomov wake_requests(rbd_dev, false); 3646ed95b21aSIlya Dryomov up_read(&rbd_dev->lock_rwsem); 3647ed95b21aSIlya Dryomov } 3648ed95b21aSIlya Dryomov 3649ed95b21aSIlya Dryomov static bool rbd_handle_request_lock(struct rbd_device *rbd_dev, u8 struct_v, 3650ed95b21aSIlya Dryomov void **p) 3651ed95b21aSIlya Dryomov { 3652ed95b21aSIlya Dryomov struct rbd_client_id my_cid = rbd_get_cid(rbd_dev); 3653ed95b21aSIlya Dryomov struct rbd_client_id cid = { 0 }; 3654ed95b21aSIlya Dryomov bool need_to_send; 3655ed95b21aSIlya Dryomov 3656ed95b21aSIlya Dryomov if (struct_v >= 2) { 3657ed95b21aSIlya Dryomov cid.gid = ceph_decode_64(p); 3658ed95b21aSIlya Dryomov cid.handle = ceph_decode_64(p); 3659ed95b21aSIlya Dryomov } 3660ed95b21aSIlya Dryomov 3661ed95b21aSIlya Dryomov dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid, 3662ed95b21aSIlya Dryomov cid.handle); 3663ed95b21aSIlya Dryomov if (rbd_cid_equal(&cid, &my_cid)) 3664ed95b21aSIlya Dryomov return false; 3665ed95b21aSIlya Dryomov 3666ed95b21aSIlya Dryomov down_read(&rbd_dev->lock_rwsem); 3667ed95b21aSIlya Dryomov need_to_send = __rbd_is_lock_owner(rbd_dev); 3668ed95b21aSIlya Dryomov if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED) { 3669ed95b21aSIlya Dryomov if (!rbd_cid_equal(&rbd_dev->owner_cid, &rbd_empty_cid)) { 3670ed95b21aSIlya Dryomov dout("%s rbd_dev %p queueing unlock_work\n", __func__, 3671ed95b21aSIlya Dryomov rbd_dev); 3672ed95b21aSIlya Dryomov queue_work(rbd_dev->task_wq, &rbd_dev->unlock_work); 3673ed95b21aSIlya Dryomov } 3674ed95b21aSIlya Dryomov } 3675ed95b21aSIlya Dryomov up_read(&rbd_dev->lock_rwsem); 3676ed95b21aSIlya Dryomov return need_to_send; 3677ed95b21aSIlya Dryomov } 3678ed95b21aSIlya Dryomov 3679ed95b21aSIlya Dryomov static void __rbd_acknowledge_notify(struct rbd_device *rbd_dev, 3680ed95b21aSIlya Dryomov u64 notify_id, u64 cookie, s32 *result) 3681ed95b21aSIlya Dryomov { 3682ed95b21aSIlya Dryomov struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 3683ed95b21aSIlya Dryomov int buf_size = 4 + CEPH_ENCODING_START_BLK_LEN; 3684ed95b21aSIlya Dryomov char buf[buf_size]; 3685ed95b21aSIlya Dryomov int ret; 3686ed95b21aSIlya Dryomov 3687ed95b21aSIlya Dryomov if (result) { 3688ed95b21aSIlya Dryomov void *p = buf; 3689ed95b21aSIlya Dryomov 3690ed95b21aSIlya Dryomov /* encode ResponseMessage */ 3691ed95b21aSIlya Dryomov ceph_start_encoding(&p, 1, 1, 3692ed95b21aSIlya Dryomov buf_size - CEPH_ENCODING_START_BLK_LEN); 3693ed95b21aSIlya Dryomov ceph_encode_32(&p, *result); 3694ed95b21aSIlya Dryomov } else { 3695ed95b21aSIlya Dryomov buf_size = 0; 3696ed95b21aSIlya Dryomov } 3697ed95b21aSIlya Dryomov 3698ed95b21aSIlya Dryomov ret = ceph_osdc_notify_ack(osdc, &rbd_dev->header_oid, 3699ed95b21aSIlya Dryomov &rbd_dev->header_oloc, notify_id, cookie, 3700ed95b21aSIlya Dryomov buf, buf_size); 3701ed95b21aSIlya Dryomov if (ret) 3702ed95b21aSIlya Dryomov rbd_warn(rbd_dev, "acknowledge_notify failed: %d", ret); 3703ed95b21aSIlya Dryomov } 3704ed95b21aSIlya Dryomov 3705ed95b21aSIlya Dryomov static void rbd_acknowledge_notify(struct rbd_device *rbd_dev, u64 notify_id, 3706ed95b21aSIlya Dryomov u64 cookie) 3707ed95b21aSIlya Dryomov { 3708ed95b21aSIlya Dryomov dout("%s rbd_dev %p\n", __func__, rbd_dev); 3709ed95b21aSIlya Dryomov __rbd_acknowledge_notify(rbd_dev, notify_id, cookie, NULL); 3710ed95b21aSIlya Dryomov } 3711ed95b21aSIlya Dryomov 3712ed95b21aSIlya Dryomov static void rbd_acknowledge_notify_result(struct rbd_device *rbd_dev, 3713ed95b21aSIlya Dryomov u64 notify_id, u64 cookie, s32 result) 3714ed95b21aSIlya Dryomov { 3715ed95b21aSIlya Dryomov dout("%s rbd_dev %p result %d\n", __func__, rbd_dev, result); 3716ed95b21aSIlya Dryomov __rbd_acknowledge_notify(rbd_dev, notify_id, cookie, &result); 3717ed95b21aSIlya Dryomov } 3718922dab61SIlya Dryomov 3719922dab61SIlya Dryomov static void rbd_watch_cb(void *arg, u64 notify_id, u64 cookie, 3720922dab61SIlya Dryomov u64 notifier_id, void *data, size_t data_len) 3721b8d70035SAlex Elder { 3722922dab61SIlya Dryomov struct rbd_device *rbd_dev = arg; 3723ed95b21aSIlya Dryomov void *p = data; 3724ed95b21aSIlya Dryomov void *const end = p + data_len; 3725d4c2269bSIlya Dryomov u8 struct_v = 0; 3726ed95b21aSIlya Dryomov u32 len; 3727ed95b21aSIlya Dryomov u32 notify_op; 3728b8d70035SAlex Elder int ret; 3729b8d70035SAlex Elder 3730ed95b21aSIlya Dryomov dout("%s rbd_dev %p cookie %llu notify_id %llu data_len %zu\n", 3731ed95b21aSIlya Dryomov __func__, rbd_dev, cookie, notify_id, data_len); 3732ed95b21aSIlya Dryomov if (data_len) { 3733ed95b21aSIlya Dryomov ret = ceph_start_decoding(&p, end, 1, "NotifyMessage", 3734ed95b21aSIlya Dryomov &struct_v, &len); 3735ed95b21aSIlya Dryomov if (ret) { 3736ed95b21aSIlya Dryomov rbd_warn(rbd_dev, "failed to decode NotifyMessage: %d", 3737ed95b21aSIlya Dryomov ret); 3738ed95b21aSIlya Dryomov return; 3739ed95b21aSIlya Dryomov } 374052bb1f9bSIlya Dryomov 3741ed95b21aSIlya Dryomov notify_op = ceph_decode_32(&p); 3742ed95b21aSIlya Dryomov } else { 3743ed95b21aSIlya Dryomov /* legacy notification for header updates */ 3744ed95b21aSIlya Dryomov notify_op = RBD_NOTIFY_OP_HEADER_UPDATE; 3745ed95b21aSIlya Dryomov len = 0; 3746ed95b21aSIlya Dryomov } 3747ed95b21aSIlya Dryomov 3748ed95b21aSIlya Dryomov dout("%s rbd_dev %p notify_op %u\n", __func__, rbd_dev, notify_op); 3749ed95b21aSIlya Dryomov switch (notify_op) { 3750ed95b21aSIlya Dryomov case RBD_NOTIFY_OP_ACQUIRED_LOCK: 3751ed95b21aSIlya Dryomov rbd_handle_acquired_lock(rbd_dev, struct_v, &p); 3752ed95b21aSIlya Dryomov rbd_acknowledge_notify(rbd_dev, notify_id, cookie); 3753ed95b21aSIlya Dryomov break; 3754ed95b21aSIlya Dryomov case RBD_NOTIFY_OP_RELEASED_LOCK: 3755ed95b21aSIlya Dryomov rbd_handle_released_lock(rbd_dev, struct_v, &p); 3756ed95b21aSIlya Dryomov rbd_acknowledge_notify(rbd_dev, notify_id, cookie); 3757ed95b21aSIlya Dryomov break; 3758ed95b21aSIlya Dryomov case RBD_NOTIFY_OP_REQUEST_LOCK: 3759ed95b21aSIlya Dryomov if (rbd_handle_request_lock(rbd_dev, struct_v, &p)) 376052bb1f9bSIlya Dryomov /* 3761ed95b21aSIlya Dryomov * send ResponseMessage(0) back so the client 3762ed95b21aSIlya Dryomov * can detect a missing owner 376352bb1f9bSIlya Dryomov */ 3764ed95b21aSIlya Dryomov rbd_acknowledge_notify_result(rbd_dev, notify_id, 3765ed95b21aSIlya Dryomov cookie, 0); 3766ed95b21aSIlya Dryomov else 3767ed95b21aSIlya Dryomov rbd_acknowledge_notify(rbd_dev, notify_id, cookie); 3768ed95b21aSIlya Dryomov break; 3769ed95b21aSIlya Dryomov case RBD_NOTIFY_OP_HEADER_UPDATE: 3770e627db08SAlex Elder ret = rbd_dev_refresh(rbd_dev); 3771e627db08SAlex Elder if (ret) 37729584d508SIlya Dryomov rbd_warn(rbd_dev, "refresh failed: %d", ret); 3773b8d70035SAlex Elder 3774ed95b21aSIlya Dryomov rbd_acknowledge_notify(rbd_dev, notify_id, cookie); 3775ed95b21aSIlya Dryomov break; 3776ed95b21aSIlya Dryomov default: 3777ed95b21aSIlya Dryomov if (rbd_is_lock_owner(rbd_dev)) 3778ed95b21aSIlya Dryomov rbd_acknowledge_notify_result(rbd_dev, notify_id, 3779ed95b21aSIlya Dryomov cookie, -EOPNOTSUPP); 3780ed95b21aSIlya Dryomov else 3781ed95b21aSIlya Dryomov rbd_acknowledge_notify(rbd_dev, notify_id, cookie); 3782ed95b21aSIlya Dryomov break; 3783b8d70035SAlex Elder } 3784b8d70035SAlex Elder } 3785b8d70035SAlex Elder 378699d16943SIlya Dryomov static void __rbd_unregister_watch(struct rbd_device *rbd_dev); 37879969ebc5SAlex Elder 3788922dab61SIlya Dryomov static void rbd_watch_errcb(void *arg, u64 cookie, int err) 3789bb040aa0SIlya Dryomov { 3790922dab61SIlya Dryomov struct rbd_device *rbd_dev = arg; 3791bb040aa0SIlya Dryomov 3792922dab61SIlya Dryomov rbd_warn(rbd_dev, "encountered watch error: %d", err); 3793bb040aa0SIlya Dryomov 3794ed95b21aSIlya Dryomov down_write(&rbd_dev->lock_rwsem); 3795ed95b21aSIlya Dryomov rbd_set_owner_cid(rbd_dev, &rbd_empty_cid); 3796ed95b21aSIlya Dryomov up_write(&rbd_dev->lock_rwsem); 3797bb040aa0SIlya Dryomov 379899d16943SIlya Dryomov mutex_lock(&rbd_dev->watch_mutex); 379999d16943SIlya Dryomov if (rbd_dev->watch_state == RBD_WATCH_STATE_REGISTERED) { 380099d16943SIlya Dryomov __rbd_unregister_watch(rbd_dev); 380199d16943SIlya Dryomov rbd_dev->watch_state = RBD_WATCH_STATE_ERROR; 3802bb040aa0SIlya Dryomov 380399d16943SIlya Dryomov queue_delayed_work(rbd_dev->task_wq, &rbd_dev->watch_dwork, 0); 3804bb040aa0SIlya Dryomov } 380599d16943SIlya Dryomov mutex_unlock(&rbd_dev->watch_mutex); 3806bb040aa0SIlya Dryomov } 3807bb040aa0SIlya Dryomov 3808bb040aa0SIlya Dryomov /* 380999d16943SIlya Dryomov * watch_mutex must be locked 38109969ebc5SAlex Elder */ 381199d16943SIlya Dryomov static int __rbd_register_watch(struct rbd_device *rbd_dev) 38129969ebc5SAlex Elder { 38139969ebc5SAlex Elder struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 3814922dab61SIlya Dryomov struct ceph_osd_linger_request *handle; 38159969ebc5SAlex Elder 3816922dab61SIlya Dryomov rbd_assert(!rbd_dev->watch_handle); 381799d16943SIlya Dryomov dout("%s rbd_dev %p\n", __func__, rbd_dev); 38189969ebc5SAlex Elder 3819922dab61SIlya Dryomov handle = ceph_osdc_watch(osdc, &rbd_dev->header_oid, 3820922dab61SIlya Dryomov &rbd_dev->header_oloc, rbd_watch_cb, 3821922dab61SIlya Dryomov rbd_watch_errcb, rbd_dev); 3822922dab61SIlya Dryomov if (IS_ERR(handle)) 3823922dab61SIlya Dryomov return PTR_ERR(handle); 38249969ebc5SAlex Elder 3825922dab61SIlya Dryomov rbd_dev->watch_handle = handle; 38268eb87565SAlex Elder return 0; 38279969ebc5SAlex Elder } 38289969ebc5SAlex Elder 382999d16943SIlya Dryomov /* 383099d16943SIlya Dryomov * watch_mutex must be locked 383199d16943SIlya Dryomov */ 383299d16943SIlya Dryomov static void __rbd_unregister_watch(struct rbd_device *rbd_dev) 3833fca27065SIlya Dryomov { 3834922dab61SIlya Dryomov struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 3835922dab61SIlya Dryomov int ret; 3836b30a01f2SIlya Dryomov 383799d16943SIlya Dryomov rbd_assert(rbd_dev->watch_handle); 383899d16943SIlya Dryomov dout("%s rbd_dev %p\n", __func__, rbd_dev); 3839b30a01f2SIlya Dryomov 3840922dab61SIlya Dryomov ret = ceph_osdc_unwatch(osdc, rbd_dev->watch_handle); 3841922dab61SIlya Dryomov if (ret) 3842922dab61SIlya Dryomov rbd_warn(rbd_dev, "failed to unwatch: %d", ret); 3843b30a01f2SIlya Dryomov 3844922dab61SIlya Dryomov rbd_dev->watch_handle = NULL; 3845c525f036SIlya Dryomov } 3846c525f036SIlya Dryomov 384799d16943SIlya Dryomov static int rbd_register_watch(struct rbd_device *rbd_dev) 3848c525f036SIlya Dryomov { 384999d16943SIlya Dryomov int ret; 3850811c6688SIlya Dryomov 385199d16943SIlya Dryomov mutex_lock(&rbd_dev->watch_mutex); 385299d16943SIlya Dryomov rbd_assert(rbd_dev->watch_state == RBD_WATCH_STATE_UNREGISTERED); 385399d16943SIlya Dryomov ret = __rbd_register_watch(rbd_dev); 385499d16943SIlya Dryomov if (ret) 385599d16943SIlya Dryomov goto out; 385699d16943SIlya Dryomov 385799d16943SIlya Dryomov rbd_dev->watch_state = RBD_WATCH_STATE_REGISTERED; 385899d16943SIlya Dryomov rbd_dev->watch_cookie = rbd_dev->watch_handle->linger_id; 385999d16943SIlya Dryomov 386099d16943SIlya Dryomov out: 386199d16943SIlya Dryomov mutex_unlock(&rbd_dev->watch_mutex); 386299d16943SIlya Dryomov return ret; 386399d16943SIlya Dryomov } 386499d16943SIlya Dryomov 386599d16943SIlya Dryomov static void cancel_tasks_sync(struct rbd_device *rbd_dev) 386699d16943SIlya Dryomov { 386799d16943SIlya Dryomov dout("%s rbd_dev %p\n", __func__, rbd_dev); 386899d16943SIlya Dryomov 386999d16943SIlya Dryomov cancel_delayed_work_sync(&rbd_dev->watch_dwork); 3870ed95b21aSIlya Dryomov cancel_work_sync(&rbd_dev->acquired_lock_work); 3871ed95b21aSIlya Dryomov cancel_work_sync(&rbd_dev->released_lock_work); 3872ed95b21aSIlya Dryomov cancel_delayed_work_sync(&rbd_dev->lock_dwork); 3873ed95b21aSIlya Dryomov cancel_work_sync(&rbd_dev->unlock_work); 387499d16943SIlya Dryomov } 387599d16943SIlya Dryomov 387699d16943SIlya Dryomov static void rbd_unregister_watch(struct rbd_device *rbd_dev) 387799d16943SIlya Dryomov { 3878ed95b21aSIlya Dryomov WARN_ON(waitqueue_active(&rbd_dev->lock_waitq)); 387999d16943SIlya Dryomov cancel_tasks_sync(rbd_dev); 388099d16943SIlya Dryomov 388199d16943SIlya Dryomov mutex_lock(&rbd_dev->watch_mutex); 388299d16943SIlya Dryomov if (rbd_dev->watch_state == RBD_WATCH_STATE_REGISTERED) 388399d16943SIlya Dryomov __rbd_unregister_watch(rbd_dev); 388499d16943SIlya Dryomov rbd_dev->watch_state = RBD_WATCH_STATE_UNREGISTERED; 388599d16943SIlya Dryomov mutex_unlock(&rbd_dev->watch_mutex); 388699d16943SIlya Dryomov 3887811c6688SIlya Dryomov ceph_osdc_flush_notifies(&rbd_dev->rbd_client->client->osdc); 3888fca27065SIlya Dryomov } 3889fca27065SIlya Dryomov 389099d16943SIlya Dryomov static void rbd_reregister_watch(struct work_struct *work) 389199d16943SIlya Dryomov { 389299d16943SIlya Dryomov struct rbd_device *rbd_dev = container_of(to_delayed_work(work), 389399d16943SIlya Dryomov struct rbd_device, watch_dwork); 3894ed95b21aSIlya Dryomov bool was_lock_owner = false; 389587c0fdedSIlya Dryomov bool need_to_wake = false; 389699d16943SIlya Dryomov int ret; 389799d16943SIlya Dryomov 389899d16943SIlya Dryomov dout("%s rbd_dev %p\n", __func__, rbd_dev); 389999d16943SIlya Dryomov 3900ed95b21aSIlya Dryomov down_write(&rbd_dev->lock_rwsem); 3901ed95b21aSIlya Dryomov if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED) 3902ed95b21aSIlya Dryomov was_lock_owner = rbd_release_lock(rbd_dev); 3903ed95b21aSIlya Dryomov 390499d16943SIlya Dryomov mutex_lock(&rbd_dev->watch_mutex); 390587c0fdedSIlya Dryomov if (rbd_dev->watch_state != RBD_WATCH_STATE_ERROR) { 390687c0fdedSIlya Dryomov mutex_unlock(&rbd_dev->watch_mutex); 390787c0fdedSIlya Dryomov goto out; 390887c0fdedSIlya Dryomov } 390999d16943SIlya Dryomov 391099d16943SIlya Dryomov ret = __rbd_register_watch(rbd_dev); 391199d16943SIlya Dryomov if (ret) { 391299d16943SIlya Dryomov rbd_warn(rbd_dev, "failed to reregister watch: %d", ret); 39134d73644bSIlya Dryomov if (ret == -EBLACKLISTED || ret == -ENOENT) { 391487c0fdedSIlya Dryomov set_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags); 391587c0fdedSIlya Dryomov need_to_wake = true; 391687c0fdedSIlya Dryomov } else { 391799d16943SIlya Dryomov queue_delayed_work(rbd_dev->task_wq, 391899d16943SIlya Dryomov &rbd_dev->watch_dwork, 391999d16943SIlya Dryomov RBD_RETRY_DELAY); 392087c0fdedSIlya Dryomov } 392187c0fdedSIlya Dryomov mutex_unlock(&rbd_dev->watch_mutex); 392287c0fdedSIlya Dryomov goto out; 392399d16943SIlya Dryomov } 392499d16943SIlya Dryomov 392587c0fdedSIlya Dryomov need_to_wake = true; 392699d16943SIlya Dryomov rbd_dev->watch_state = RBD_WATCH_STATE_REGISTERED; 392799d16943SIlya Dryomov rbd_dev->watch_cookie = rbd_dev->watch_handle->linger_id; 392899d16943SIlya Dryomov mutex_unlock(&rbd_dev->watch_mutex); 392999d16943SIlya Dryomov 393099d16943SIlya Dryomov ret = rbd_dev_refresh(rbd_dev); 393199d16943SIlya Dryomov if (ret) 393299d16943SIlya Dryomov rbd_warn(rbd_dev, "reregisteration refresh failed: %d", ret); 393399d16943SIlya Dryomov 3934ed95b21aSIlya Dryomov if (was_lock_owner) { 3935ed95b21aSIlya Dryomov ret = rbd_try_lock(rbd_dev); 3936ed95b21aSIlya Dryomov if (ret) 3937ed95b21aSIlya Dryomov rbd_warn(rbd_dev, "reregisteration lock failed: %d", 3938ed95b21aSIlya Dryomov ret); 3939ed95b21aSIlya Dryomov } 3940ed95b21aSIlya Dryomov 394187c0fdedSIlya Dryomov out: 3942ed95b21aSIlya Dryomov up_write(&rbd_dev->lock_rwsem); 394387c0fdedSIlya Dryomov if (need_to_wake) 3944ed95b21aSIlya Dryomov wake_requests(rbd_dev, true); 394599d16943SIlya Dryomov } 394699d16943SIlya Dryomov 394736be9a76SAlex Elder /* 3948f40eb349SAlex Elder * Synchronous osd object method call. Returns the number of bytes 3949f40eb349SAlex Elder * returned in the outbound buffer, or a negative error code. 395036be9a76SAlex Elder */ 395136be9a76SAlex Elder static int rbd_obj_method_sync(struct rbd_device *rbd_dev, 3952ecd4a68aSIlya Dryomov struct ceph_object_id *oid, 3953ecd4a68aSIlya Dryomov struct ceph_object_locator *oloc, 395436be9a76SAlex Elder const char *method_name, 39554157976bSAlex Elder const void *outbound, 395636be9a76SAlex Elder size_t outbound_size, 39574157976bSAlex Elder void *inbound, 3958e2a58ee5SAlex Elder size_t inbound_size) 395936be9a76SAlex Elder { 3960ecd4a68aSIlya Dryomov struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 3961ecd4a68aSIlya Dryomov struct page *req_page = NULL; 3962ecd4a68aSIlya Dryomov struct page *reply_page; 396336be9a76SAlex Elder int ret; 396436be9a76SAlex Elder 396536be9a76SAlex Elder /* 39666010a451SAlex Elder * Method calls are ultimately read operations. The result 39676010a451SAlex Elder * should placed into the inbound buffer provided. They 39686010a451SAlex Elder * also supply outbound data--parameters for the object 39696010a451SAlex Elder * method. Currently if this is present it will be a 39706010a451SAlex Elder * snapshot id. 397136be9a76SAlex Elder */ 3972ecd4a68aSIlya Dryomov if (outbound) { 3973ecd4a68aSIlya Dryomov if (outbound_size > PAGE_SIZE) 3974ecd4a68aSIlya Dryomov return -E2BIG; 397536be9a76SAlex Elder 3976ecd4a68aSIlya Dryomov req_page = alloc_page(GFP_KERNEL); 3977ecd4a68aSIlya Dryomov if (!req_page) 3978ecd4a68aSIlya Dryomov return -ENOMEM; 397936be9a76SAlex Elder 3980ecd4a68aSIlya Dryomov memcpy(page_address(req_page), outbound, outbound_size); 398104017e29SAlex Elder } 3982430c28c3SAlex Elder 3983ecd4a68aSIlya Dryomov reply_page = alloc_page(GFP_KERNEL); 3984ecd4a68aSIlya Dryomov if (!reply_page) { 3985ecd4a68aSIlya Dryomov if (req_page) 3986ecd4a68aSIlya Dryomov __free_page(req_page); 3987ecd4a68aSIlya Dryomov return -ENOMEM; 3988ecd4a68aSIlya Dryomov } 398936be9a76SAlex Elder 3990ecd4a68aSIlya Dryomov ret = ceph_osdc_call(osdc, oid, oloc, RBD_DRV_NAME, method_name, 3991ecd4a68aSIlya Dryomov CEPH_OSD_FLAG_READ, req_page, outbound_size, 3992ecd4a68aSIlya Dryomov reply_page, &inbound_size); 3993ecd4a68aSIlya Dryomov if (!ret) { 3994ecd4a68aSIlya Dryomov memcpy(inbound, page_address(reply_page), inbound_size); 3995ecd4a68aSIlya Dryomov ret = inbound_size; 3996ecd4a68aSIlya Dryomov } 399757385b51SAlex Elder 3998ecd4a68aSIlya Dryomov if (req_page) 3999ecd4a68aSIlya Dryomov __free_page(req_page); 4000ecd4a68aSIlya Dryomov __free_page(reply_page); 400136be9a76SAlex Elder return ret; 400236be9a76SAlex Elder } 400336be9a76SAlex Elder 4004ed95b21aSIlya Dryomov /* 4005ed95b21aSIlya Dryomov * lock_rwsem must be held for read 4006ed95b21aSIlya Dryomov */ 4007ed95b21aSIlya Dryomov static void rbd_wait_state_locked(struct rbd_device *rbd_dev) 4008ed95b21aSIlya Dryomov { 4009ed95b21aSIlya Dryomov DEFINE_WAIT(wait); 4010ed95b21aSIlya Dryomov 4011ed95b21aSIlya Dryomov do { 4012ed95b21aSIlya Dryomov /* 4013ed95b21aSIlya Dryomov * Note the use of mod_delayed_work() in rbd_acquire_lock() 4014ed95b21aSIlya Dryomov * and cancel_delayed_work() in wake_requests(). 4015ed95b21aSIlya Dryomov */ 4016ed95b21aSIlya Dryomov dout("%s rbd_dev %p queueing lock_dwork\n", __func__, rbd_dev); 4017ed95b21aSIlya Dryomov queue_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0); 4018ed95b21aSIlya Dryomov prepare_to_wait_exclusive(&rbd_dev->lock_waitq, &wait, 4019ed95b21aSIlya Dryomov TASK_UNINTERRUPTIBLE); 4020ed95b21aSIlya Dryomov up_read(&rbd_dev->lock_rwsem); 4021ed95b21aSIlya Dryomov schedule(); 4022ed95b21aSIlya Dryomov down_read(&rbd_dev->lock_rwsem); 402387c0fdedSIlya Dryomov } while (rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED && 402487c0fdedSIlya Dryomov !test_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags)); 402587c0fdedSIlya Dryomov 4026ed95b21aSIlya Dryomov finish_wait(&rbd_dev->lock_waitq, &wait); 4027ed95b21aSIlya Dryomov } 4028ed95b21aSIlya Dryomov 40297ad18afaSChristoph Hellwig static void rbd_queue_workfn(struct work_struct *work) 4030bc1ecc65SIlya Dryomov { 40317ad18afaSChristoph Hellwig struct request *rq = blk_mq_rq_from_pdu(work); 40327ad18afaSChristoph Hellwig struct rbd_device *rbd_dev = rq->q->queuedata; 4033bc1ecc65SIlya Dryomov struct rbd_img_request *img_request; 40344e752f0aSJosh Durgin struct ceph_snap_context *snapc = NULL; 4035bc1ecc65SIlya Dryomov u64 offset = (u64)blk_rq_pos(rq) << SECTOR_SHIFT; 4036bc1ecc65SIlya Dryomov u64 length = blk_rq_bytes(rq); 40376d2940c8SGuangliang Zhao enum obj_operation_type op_type; 40384e752f0aSJosh Durgin u64 mapping_size; 403980de1912SIlya Dryomov bool must_be_locked; 4040bc1ecc65SIlya Dryomov int result; 4041bc1ecc65SIlya Dryomov 40427ad18afaSChristoph Hellwig if (rq->cmd_type != REQ_TYPE_FS) { 40437ad18afaSChristoph Hellwig dout("%s: non-fs request type %d\n", __func__, 40447ad18afaSChristoph Hellwig (int) rq->cmd_type); 40457ad18afaSChristoph Hellwig result = -EIO; 40467ad18afaSChristoph Hellwig goto err; 40477ad18afaSChristoph Hellwig } 40487ad18afaSChristoph Hellwig 4049c2df40dfSMike Christie if (req_op(rq) == REQ_OP_DISCARD) 405090e98c52SGuangliang Zhao op_type = OBJ_OP_DISCARD; 4051c2df40dfSMike Christie else if (req_op(rq) == REQ_OP_WRITE) 40526d2940c8SGuangliang Zhao op_type = OBJ_OP_WRITE; 40536d2940c8SGuangliang Zhao else 40546d2940c8SGuangliang Zhao op_type = OBJ_OP_READ; 40556d2940c8SGuangliang Zhao 4056bc1ecc65SIlya Dryomov /* Ignore/skip any zero-length requests */ 4057bc1ecc65SIlya Dryomov 4058bc1ecc65SIlya Dryomov if (!length) { 4059bc1ecc65SIlya Dryomov dout("%s: zero-length request\n", __func__); 4060bc1ecc65SIlya Dryomov result = 0; 4061bc1ecc65SIlya Dryomov goto err_rq; 4062bc1ecc65SIlya Dryomov } 4063bc1ecc65SIlya Dryomov 40646d2940c8SGuangliang Zhao /* Only reads are allowed to a read-only device */ 4065bc1ecc65SIlya Dryomov 40666d2940c8SGuangliang Zhao if (op_type != OBJ_OP_READ) { 4067bc1ecc65SIlya Dryomov if (rbd_dev->mapping.read_only) { 4068bc1ecc65SIlya Dryomov result = -EROFS; 4069bc1ecc65SIlya Dryomov goto err_rq; 4070bc1ecc65SIlya Dryomov } 4071bc1ecc65SIlya Dryomov rbd_assert(rbd_dev->spec->snap_id == CEPH_NOSNAP); 4072bc1ecc65SIlya Dryomov } 4073bc1ecc65SIlya Dryomov 4074bc1ecc65SIlya Dryomov /* 4075bc1ecc65SIlya Dryomov * Quit early if the mapped snapshot no longer exists. It's 4076bc1ecc65SIlya Dryomov * still possible the snapshot will have disappeared by the 4077bc1ecc65SIlya Dryomov * time our request arrives at the osd, but there's no sense in 4078bc1ecc65SIlya Dryomov * sending it if we already know. 4079bc1ecc65SIlya Dryomov */ 4080bc1ecc65SIlya Dryomov if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) { 4081bc1ecc65SIlya Dryomov dout("request for non-existent snapshot"); 4082bc1ecc65SIlya Dryomov rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP); 4083bc1ecc65SIlya Dryomov result = -ENXIO; 4084bc1ecc65SIlya Dryomov goto err_rq; 4085bc1ecc65SIlya Dryomov } 4086bc1ecc65SIlya Dryomov 4087bc1ecc65SIlya Dryomov if (offset && length > U64_MAX - offset + 1) { 4088bc1ecc65SIlya Dryomov rbd_warn(rbd_dev, "bad request range (%llu~%llu)", offset, 4089bc1ecc65SIlya Dryomov length); 4090bc1ecc65SIlya Dryomov result = -EINVAL; 4091bc1ecc65SIlya Dryomov goto err_rq; /* Shouldn't happen */ 4092bc1ecc65SIlya Dryomov } 4093bc1ecc65SIlya Dryomov 40947ad18afaSChristoph Hellwig blk_mq_start_request(rq); 40957ad18afaSChristoph Hellwig 40964e752f0aSJosh Durgin down_read(&rbd_dev->header_rwsem); 40974e752f0aSJosh Durgin mapping_size = rbd_dev->mapping.size; 40986d2940c8SGuangliang Zhao if (op_type != OBJ_OP_READ) { 40994e752f0aSJosh Durgin snapc = rbd_dev->header.snapc; 41004e752f0aSJosh Durgin ceph_get_snap_context(snapc); 4101ed95b21aSIlya Dryomov must_be_locked = rbd_is_lock_supported(rbd_dev); 410280de1912SIlya Dryomov } else { 410380de1912SIlya Dryomov must_be_locked = rbd_dev->opts->lock_on_read && 410480de1912SIlya Dryomov rbd_is_lock_supported(rbd_dev); 41054e752f0aSJosh Durgin } 41064e752f0aSJosh Durgin up_read(&rbd_dev->header_rwsem); 41074e752f0aSJosh Durgin 41084e752f0aSJosh Durgin if (offset + length > mapping_size) { 4109bc1ecc65SIlya Dryomov rbd_warn(rbd_dev, "beyond EOD (%llu~%llu > %llu)", offset, 41104e752f0aSJosh Durgin length, mapping_size); 4111bc1ecc65SIlya Dryomov result = -EIO; 4112bc1ecc65SIlya Dryomov goto err_rq; 4113bc1ecc65SIlya Dryomov } 4114bc1ecc65SIlya Dryomov 4115ed95b21aSIlya Dryomov if (must_be_locked) { 4116ed95b21aSIlya Dryomov down_read(&rbd_dev->lock_rwsem); 411787c0fdedSIlya Dryomov if (rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED && 411887c0fdedSIlya Dryomov !test_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags)) 4119ed95b21aSIlya Dryomov rbd_wait_state_locked(rbd_dev); 412087c0fdedSIlya Dryomov 412187c0fdedSIlya Dryomov WARN_ON((rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED) ^ 412287c0fdedSIlya Dryomov !test_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags)); 412387c0fdedSIlya Dryomov if (test_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags)) { 412487c0fdedSIlya Dryomov result = -EBLACKLISTED; 412587c0fdedSIlya Dryomov goto err_unlock; 412687c0fdedSIlya Dryomov } 4127ed95b21aSIlya Dryomov } 4128ed95b21aSIlya Dryomov 41296d2940c8SGuangliang Zhao img_request = rbd_img_request_create(rbd_dev, offset, length, op_type, 41304e752f0aSJosh Durgin snapc); 4131bc1ecc65SIlya Dryomov if (!img_request) { 4132bc1ecc65SIlya Dryomov result = -ENOMEM; 4133ed95b21aSIlya Dryomov goto err_unlock; 4134bc1ecc65SIlya Dryomov } 4135bc1ecc65SIlya Dryomov img_request->rq = rq; 413670b16db8SIlya Dryomov snapc = NULL; /* img_request consumes a ref */ 4137bc1ecc65SIlya Dryomov 413890e98c52SGuangliang Zhao if (op_type == OBJ_OP_DISCARD) 413990e98c52SGuangliang Zhao result = rbd_img_request_fill(img_request, OBJ_REQUEST_NODATA, 414090e98c52SGuangliang Zhao NULL); 414190e98c52SGuangliang Zhao else 414290e98c52SGuangliang Zhao result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO, 414390e98c52SGuangliang Zhao rq->bio); 4144bc1ecc65SIlya Dryomov if (result) 4145bc1ecc65SIlya Dryomov goto err_img_request; 4146bc1ecc65SIlya Dryomov 4147bc1ecc65SIlya Dryomov result = rbd_img_request_submit(img_request); 4148bc1ecc65SIlya Dryomov if (result) 4149bc1ecc65SIlya Dryomov goto err_img_request; 4150bc1ecc65SIlya Dryomov 4151ed95b21aSIlya Dryomov if (must_be_locked) 4152ed95b21aSIlya Dryomov up_read(&rbd_dev->lock_rwsem); 4153bc1ecc65SIlya Dryomov return; 4154bc1ecc65SIlya Dryomov 4155bc1ecc65SIlya Dryomov err_img_request: 4156bc1ecc65SIlya Dryomov rbd_img_request_put(img_request); 4157ed95b21aSIlya Dryomov err_unlock: 4158ed95b21aSIlya Dryomov if (must_be_locked) 4159ed95b21aSIlya Dryomov up_read(&rbd_dev->lock_rwsem); 4160bc1ecc65SIlya Dryomov err_rq: 4161bc1ecc65SIlya Dryomov if (result) 4162bc1ecc65SIlya Dryomov rbd_warn(rbd_dev, "%s %llx at %llx result %d", 41636d2940c8SGuangliang Zhao obj_op_name(op_type), length, offset, result); 41644e752f0aSJosh Durgin ceph_put_snap_context(snapc); 41657ad18afaSChristoph Hellwig err: 41667ad18afaSChristoph Hellwig blk_mq_end_request(rq, result); 4167bc1ecc65SIlya Dryomov } 4168bc1ecc65SIlya Dryomov 41697ad18afaSChristoph Hellwig static int rbd_queue_rq(struct blk_mq_hw_ctx *hctx, 41707ad18afaSChristoph Hellwig const struct blk_mq_queue_data *bd) 4171bc1ecc65SIlya Dryomov { 41727ad18afaSChristoph Hellwig struct request *rq = bd->rq; 41737ad18afaSChristoph Hellwig struct work_struct *work = blk_mq_rq_to_pdu(rq); 4174bc1ecc65SIlya Dryomov 41757ad18afaSChristoph Hellwig queue_work(rbd_wq, work); 41767ad18afaSChristoph Hellwig return BLK_MQ_RQ_QUEUE_OK; 4177bf0d5f50SAlex Elder } 4178bf0d5f50SAlex Elder 4179602adf40SYehuda Sadeh static void rbd_free_disk(struct rbd_device *rbd_dev) 4180602adf40SYehuda Sadeh { 4181602adf40SYehuda Sadeh struct gendisk *disk = rbd_dev->disk; 4182602adf40SYehuda Sadeh 4183602adf40SYehuda Sadeh if (!disk) 4184602adf40SYehuda Sadeh return; 4185602adf40SYehuda Sadeh 4186a0cab924SAlex Elder rbd_dev->disk = NULL; 4187a0cab924SAlex Elder if (disk->flags & GENHD_FL_UP) { 4188602adf40SYehuda Sadeh del_gendisk(disk); 4189602adf40SYehuda Sadeh if (disk->queue) 4190602adf40SYehuda Sadeh blk_cleanup_queue(disk->queue); 41917ad18afaSChristoph Hellwig blk_mq_free_tag_set(&rbd_dev->tag_set); 4192a0cab924SAlex Elder } 4193602adf40SYehuda Sadeh put_disk(disk); 4194602adf40SYehuda Sadeh } 4195602adf40SYehuda Sadeh 4196788e2df3SAlex Elder static int rbd_obj_read_sync(struct rbd_device *rbd_dev, 4197fe5478e0SIlya Dryomov struct ceph_object_id *oid, 4198fe5478e0SIlya Dryomov struct ceph_object_locator *oloc, 4199fe5478e0SIlya Dryomov void *buf, int buf_len) 4200788e2df3SAlex Elder 4201788e2df3SAlex Elder { 4202fe5478e0SIlya Dryomov struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 4203fe5478e0SIlya Dryomov struct ceph_osd_request *req; 4204fe5478e0SIlya Dryomov struct page **pages; 4205fe5478e0SIlya Dryomov int num_pages = calc_pages_for(0, buf_len); 4206788e2df3SAlex Elder int ret; 4207788e2df3SAlex Elder 4208fe5478e0SIlya Dryomov req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_KERNEL); 4209fe5478e0SIlya Dryomov if (!req) 4210fe5478e0SIlya Dryomov return -ENOMEM; 4211788e2df3SAlex Elder 4212fe5478e0SIlya Dryomov ceph_oid_copy(&req->r_base_oid, oid); 4213fe5478e0SIlya Dryomov ceph_oloc_copy(&req->r_base_oloc, oloc); 4214fe5478e0SIlya Dryomov req->r_flags = CEPH_OSD_FLAG_READ; 4215788e2df3SAlex Elder 4216fe5478e0SIlya Dryomov ret = ceph_osdc_alloc_messages(req, GFP_KERNEL); 4217788e2df3SAlex Elder if (ret) 4218fe5478e0SIlya Dryomov goto out_req; 4219788e2df3SAlex Elder 4220fe5478e0SIlya Dryomov pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL); 4221fe5478e0SIlya Dryomov if (IS_ERR(pages)) { 4222fe5478e0SIlya Dryomov ret = PTR_ERR(pages); 4223fe5478e0SIlya Dryomov goto out_req; 4224fe5478e0SIlya Dryomov } 42251ceae7efSAlex Elder 4226fe5478e0SIlya Dryomov osd_req_op_extent_init(req, 0, CEPH_OSD_OP_READ, 0, buf_len, 0, 0); 4227fe5478e0SIlya Dryomov osd_req_op_extent_osd_data_pages(req, 0, pages, buf_len, 0, false, 4228fe5478e0SIlya Dryomov true); 4229788e2df3SAlex Elder 4230fe5478e0SIlya Dryomov ceph_osdc_start_request(osdc, req, false); 4231fe5478e0SIlya Dryomov ret = ceph_osdc_wait_request(osdc, req); 4232fe5478e0SIlya Dryomov if (ret >= 0) 4233fe5478e0SIlya Dryomov ceph_copy_from_page_vector(pages, buf, 0, ret); 4234fe5478e0SIlya Dryomov 4235fe5478e0SIlya Dryomov out_req: 4236fe5478e0SIlya Dryomov ceph_osdc_put_request(req); 4237788e2df3SAlex Elder return ret; 4238788e2df3SAlex Elder } 4239788e2df3SAlex Elder 4240602adf40SYehuda Sadeh /* 4241662518b1SAlex Elder * Read the complete header for the given rbd device. On successful 4242662518b1SAlex Elder * return, the rbd_dev->header field will contain up-to-date 4243662518b1SAlex Elder * information about the image. 42444156d998SAlex Elder */ 424599a41ebcSAlex Elder static int rbd_dev_v1_header_info(struct rbd_device *rbd_dev) 42464156d998SAlex Elder { 42474156d998SAlex Elder struct rbd_image_header_ondisk *ondisk = NULL; 42484156d998SAlex Elder u32 snap_count = 0; 42494156d998SAlex Elder u64 names_size = 0; 42504156d998SAlex Elder u32 want_count; 42514156d998SAlex Elder int ret; 42524156d998SAlex Elder 42534156d998SAlex Elder /* 42544156d998SAlex Elder * The complete header will include an array of its 64-bit 42554156d998SAlex Elder * snapshot ids, followed by the names of those snapshots as 42564156d998SAlex Elder * a contiguous block of NUL-terminated strings. Note that 42574156d998SAlex Elder * the number of snapshots could change by the time we read 42584156d998SAlex Elder * it in, in which case we re-read it. 42594156d998SAlex Elder */ 42604156d998SAlex Elder do { 42614156d998SAlex Elder size_t size; 42624156d998SAlex Elder 42634156d998SAlex Elder kfree(ondisk); 42644156d998SAlex Elder 42654156d998SAlex Elder size = sizeof (*ondisk); 42664156d998SAlex Elder size += snap_count * sizeof (struct rbd_image_snap_ondisk); 42674156d998SAlex Elder size += names_size; 42684156d998SAlex Elder ondisk = kmalloc(size, GFP_KERNEL); 42694156d998SAlex Elder if (!ondisk) 4270662518b1SAlex Elder return -ENOMEM; 42714156d998SAlex Elder 4272fe5478e0SIlya Dryomov ret = rbd_obj_read_sync(rbd_dev, &rbd_dev->header_oid, 4273fe5478e0SIlya Dryomov &rbd_dev->header_oloc, ondisk, size); 42744156d998SAlex Elder if (ret < 0) 4275662518b1SAlex Elder goto out; 4276c0cd10dbSAlex Elder if ((size_t)ret < size) { 42774156d998SAlex Elder ret = -ENXIO; 427806ecc6cbSAlex Elder rbd_warn(rbd_dev, "short header read (want %zd got %d)", 427906ecc6cbSAlex Elder size, ret); 4280662518b1SAlex Elder goto out; 42814156d998SAlex Elder } 42824156d998SAlex Elder if (!rbd_dev_ondisk_valid(ondisk)) { 42834156d998SAlex Elder ret = -ENXIO; 428406ecc6cbSAlex Elder rbd_warn(rbd_dev, "invalid header"); 4285662518b1SAlex Elder goto out; 42864156d998SAlex Elder } 42874156d998SAlex Elder 42884156d998SAlex Elder names_size = le64_to_cpu(ondisk->snap_names_len); 42894156d998SAlex Elder want_count = snap_count; 42904156d998SAlex Elder snap_count = le32_to_cpu(ondisk->snap_count); 42914156d998SAlex Elder } while (snap_count != want_count); 42924156d998SAlex Elder 4293662518b1SAlex Elder ret = rbd_header_from_disk(rbd_dev, ondisk); 4294662518b1SAlex Elder out: 42954156d998SAlex Elder kfree(ondisk); 42964156d998SAlex Elder 4297dfc5606dSYehuda Sadeh return ret; 4298602adf40SYehuda Sadeh } 4299602adf40SYehuda Sadeh 430015228edeSAlex Elder /* 430115228edeSAlex Elder * Clear the rbd device's EXISTS flag if the snapshot it's mapped to 430215228edeSAlex Elder * has disappeared from the (just updated) snapshot context. 430315228edeSAlex Elder */ 430415228edeSAlex Elder static void rbd_exists_validate(struct rbd_device *rbd_dev) 430515228edeSAlex Elder { 430615228edeSAlex Elder u64 snap_id; 430715228edeSAlex Elder 430815228edeSAlex Elder if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) 430915228edeSAlex Elder return; 431015228edeSAlex Elder 431115228edeSAlex Elder snap_id = rbd_dev->spec->snap_id; 431215228edeSAlex Elder if (snap_id == CEPH_NOSNAP) 431315228edeSAlex Elder return; 431415228edeSAlex Elder 431515228edeSAlex Elder if (rbd_dev_snap_index(rbd_dev, snap_id) == BAD_SNAP_INDEX) 431615228edeSAlex Elder clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags); 431715228edeSAlex Elder } 431815228edeSAlex Elder 43199875201eSJosh Durgin static void rbd_dev_update_size(struct rbd_device *rbd_dev) 43209875201eSJosh Durgin { 43219875201eSJosh Durgin sector_t size; 43229875201eSJosh Durgin 43239875201eSJosh Durgin /* 4324811c6688SIlya Dryomov * If EXISTS is not set, rbd_dev->disk may be NULL, so don't 4325811c6688SIlya Dryomov * try to update its size. If REMOVING is set, updating size 4326811c6688SIlya Dryomov * is just useless work since the device can't be opened. 43279875201eSJosh Durgin */ 4328811c6688SIlya Dryomov if (test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags) && 4329811c6688SIlya Dryomov !test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags)) { 43309875201eSJosh Durgin size = (sector_t)rbd_dev->mapping.size / SECTOR_SIZE; 43319875201eSJosh Durgin dout("setting size to %llu sectors", (unsigned long long)size); 43329875201eSJosh Durgin set_capacity(rbd_dev->disk, size); 43339875201eSJosh Durgin revalidate_disk(rbd_dev->disk); 43349875201eSJosh Durgin } 43359875201eSJosh Durgin } 43369875201eSJosh Durgin 4337cc4a38bdSAlex Elder static int rbd_dev_refresh(struct rbd_device *rbd_dev) 43381fe5e993SAlex Elder { 4339e627db08SAlex Elder u64 mapping_size; 43401fe5e993SAlex Elder int ret; 43411fe5e993SAlex Elder 4342cfbf6377SAlex Elder down_write(&rbd_dev->header_rwsem); 43433b5cf2a2SAlex Elder mapping_size = rbd_dev->mapping.size; 4344a720ae09SIlya Dryomov 4345a720ae09SIlya Dryomov ret = rbd_dev_header_info(rbd_dev); 434652bb1f9bSIlya Dryomov if (ret) 434773e39e4dSIlya Dryomov goto out; 434815228edeSAlex Elder 4349e8f59b59SIlya Dryomov /* 4350e8f59b59SIlya Dryomov * If there is a parent, see if it has disappeared due to the 4351e8f59b59SIlya Dryomov * mapped image getting flattened. 4352e8f59b59SIlya Dryomov */ 4353e8f59b59SIlya Dryomov if (rbd_dev->parent) { 4354e8f59b59SIlya Dryomov ret = rbd_dev_v2_parent_info(rbd_dev); 4355e8f59b59SIlya Dryomov if (ret) 435673e39e4dSIlya Dryomov goto out; 4357e8f59b59SIlya Dryomov } 4358e8f59b59SIlya Dryomov 43595ff1108cSIlya Dryomov if (rbd_dev->spec->snap_id == CEPH_NOSNAP) { 43605ff1108cSIlya Dryomov rbd_dev->mapping.size = rbd_dev->header.image_size; 43615ff1108cSIlya Dryomov } else { 43625ff1108cSIlya Dryomov /* validate mapped snapshot's EXISTS flag */ 436315228edeSAlex Elder rbd_exists_validate(rbd_dev); 43645ff1108cSIlya Dryomov } 43655ff1108cSIlya Dryomov 436673e39e4dSIlya Dryomov out: 4367cfbf6377SAlex Elder up_write(&rbd_dev->header_rwsem); 436873e39e4dSIlya Dryomov if (!ret && mapping_size != rbd_dev->mapping.size) 43699875201eSJosh Durgin rbd_dev_update_size(rbd_dev); 43701fe5e993SAlex Elder 437173e39e4dSIlya Dryomov return ret; 43721fe5e993SAlex Elder } 43731fe5e993SAlex Elder 43747ad18afaSChristoph Hellwig static int rbd_init_request(void *data, struct request *rq, 43757ad18afaSChristoph Hellwig unsigned int hctx_idx, unsigned int request_idx, 43767ad18afaSChristoph Hellwig unsigned int numa_node) 43777ad18afaSChristoph Hellwig { 43787ad18afaSChristoph Hellwig struct work_struct *work = blk_mq_rq_to_pdu(rq); 43797ad18afaSChristoph Hellwig 43807ad18afaSChristoph Hellwig INIT_WORK(work, rbd_queue_workfn); 43817ad18afaSChristoph Hellwig return 0; 43827ad18afaSChristoph Hellwig } 43837ad18afaSChristoph Hellwig 43847ad18afaSChristoph Hellwig static struct blk_mq_ops rbd_mq_ops = { 43857ad18afaSChristoph Hellwig .queue_rq = rbd_queue_rq, 43867ad18afaSChristoph Hellwig .init_request = rbd_init_request, 43877ad18afaSChristoph Hellwig }; 43887ad18afaSChristoph Hellwig 4389602adf40SYehuda Sadeh static int rbd_init_disk(struct rbd_device *rbd_dev) 4390602adf40SYehuda Sadeh { 4391602adf40SYehuda Sadeh struct gendisk *disk; 4392602adf40SYehuda Sadeh struct request_queue *q; 4393593a9e7bSAlex Elder u64 segment_size; 43947ad18afaSChristoph Hellwig int err; 4395602adf40SYehuda Sadeh 4396602adf40SYehuda Sadeh /* create gendisk info */ 43977e513d43SIlya Dryomov disk = alloc_disk(single_major ? 43987e513d43SIlya Dryomov (1 << RBD_SINGLE_MAJOR_PART_SHIFT) : 43997e513d43SIlya Dryomov RBD_MINORS_PER_MAJOR); 4400602adf40SYehuda Sadeh if (!disk) 44011fcdb8aaSAlex Elder return -ENOMEM; 4402602adf40SYehuda Sadeh 4403f0f8cef5SAlex Elder snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d", 4404de71a297SAlex Elder rbd_dev->dev_id); 4405602adf40SYehuda Sadeh disk->major = rbd_dev->major; 4406dd82fff1SIlya Dryomov disk->first_minor = rbd_dev->minor; 44077e513d43SIlya Dryomov if (single_major) 44087e513d43SIlya Dryomov disk->flags |= GENHD_FL_EXT_DEVT; 4409602adf40SYehuda Sadeh disk->fops = &rbd_bd_ops; 4410602adf40SYehuda Sadeh disk->private_data = rbd_dev; 4411602adf40SYehuda Sadeh 44127ad18afaSChristoph Hellwig memset(&rbd_dev->tag_set, 0, sizeof(rbd_dev->tag_set)); 44137ad18afaSChristoph Hellwig rbd_dev->tag_set.ops = &rbd_mq_ops; 4414b5584180SIlya Dryomov rbd_dev->tag_set.queue_depth = rbd_dev->opts->queue_depth; 44157ad18afaSChristoph Hellwig rbd_dev->tag_set.numa_node = NUMA_NO_NODE; 4416b5584180SIlya Dryomov rbd_dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE; 44177ad18afaSChristoph Hellwig rbd_dev->tag_set.nr_hw_queues = 1; 44187ad18afaSChristoph Hellwig rbd_dev->tag_set.cmd_size = sizeof(struct work_struct); 44197ad18afaSChristoph Hellwig 44207ad18afaSChristoph Hellwig err = blk_mq_alloc_tag_set(&rbd_dev->tag_set); 44217ad18afaSChristoph Hellwig if (err) 4422602adf40SYehuda Sadeh goto out_disk; 4423029bcbd8SJosh Durgin 44247ad18afaSChristoph Hellwig q = blk_mq_init_queue(&rbd_dev->tag_set); 44257ad18afaSChristoph Hellwig if (IS_ERR(q)) { 44267ad18afaSChristoph Hellwig err = PTR_ERR(q); 44277ad18afaSChristoph Hellwig goto out_tag_set; 44287ad18afaSChristoph Hellwig } 44297ad18afaSChristoph Hellwig 4430d8a2c89cSIlya Dryomov queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q); 4431d8a2c89cSIlya Dryomov /* QUEUE_FLAG_ADD_RANDOM is off by default for blk-mq */ 4432593a9e7bSAlex Elder 4433029bcbd8SJosh Durgin /* set io sizes to object size */ 4434593a9e7bSAlex Elder segment_size = rbd_obj_bytes(&rbd_dev->header); 4435593a9e7bSAlex Elder blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE); 44360d9fde4fSIlya Dryomov q->limits.max_sectors = queue_max_hw_sectors(q); 4437d3834fefSIlya Dryomov blk_queue_max_segments(q, segment_size / SECTOR_SIZE); 4438593a9e7bSAlex Elder blk_queue_max_segment_size(q, segment_size); 4439593a9e7bSAlex Elder blk_queue_io_min(q, segment_size); 4440593a9e7bSAlex Elder blk_queue_io_opt(q, segment_size); 4441029bcbd8SJosh Durgin 444290e98c52SGuangliang Zhao /* enable the discard support */ 444390e98c52SGuangliang Zhao queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q); 444490e98c52SGuangliang Zhao q->limits.discard_granularity = segment_size; 444590e98c52SGuangliang Zhao q->limits.discard_alignment = segment_size; 44462bb4cd5cSJens Axboe blk_queue_max_discard_sectors(q, segment_size / SECTOR_SIZE); 4447b76f8239SJosh Durgin q->limits.discard_zeroes_data = 1; 444890e98c52SGuangliang Zhao 4449bae818eeSRonny Hegewald if (!ceph_test_opt(rbd_dev->rbd_client->client, NOCRC)) 4450bae818eeSRonny Hegewald q->backing_dev_info.capabilities |= BDI_CAP_STABLE_WRITES; 4451bae818eeSRonny Hegewald 4452602adf40SYehuda Sadeh disk->queue = q; 4453602adf40SYehuda Sadeh 4454602adf40SYehuda Sadeh q->queuedata = rbd_dev; 4455602adf40SYehuda Sadeh 4456602adf40SYehuda Sadeh rbd_dev->disk = disk; 4457602adf40SYehuda Sadeh 4458602adf40SYehuda Sadeh return 0; 44597ad18afaSChristoph Hellwig out_tag_set: 44607ad18afaSChristoph Hellwig blk_mq_free_tag_set(&rbd_dev->tag_set); 4461602adf40SYehuda Sadeh out_disk: 4462602adf40SYehuda Sadeh put_disk(disk); 44637ad18afaSChristoph Hellwig return err; 4464602adf40SYehuda Sadeh } 4465602adf40SYehuda Sadeh 4466dfc5606dSYehuda Sadeh /* 4467dfc5606dSYehuda Sadeh sysfs 4468dfc5606dSYehuda Sadeh */ 4469602adf40SYehuda Sadeh 4470593a9e7bSAlex Elder static struct rbd_device *dev_to_rbd_dev(struct device *dev) 4471593a9e7bSAlex Elder { 4472593a9e7bSAlex Elder return container_of(dev, struct rbd_device, dev); 4473593a9e7bSAlex Elder } 4474593a9e7bSAlex Elder 4475dfc5606dSYehuda Sadeh static ssize_t rbd_size_show(struct device *dev, 4476dfc5606dSYehuda Sadeh struct device_attribute *attr, char *buf) 4477602adf40SYehuda Sadeh { 4478593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 4479dfc5606dSYehuda Sadeh 4480fc71d833SAlex Elder return sprintf(buf, "%llu\n", 4481fc71d833SAlex Elder (unsigned long long)rbd_dev->mapping.size); 4482602adf40SYehuda Sadeh } 4483602adf40SYehuda Sadeh 448434b13184SAlex Elder /* 448534b13184SAlex Elder * Note this shows the features for whatever's mapped, which is not 448634b13184SAlex Elder * necessarily the base image. 448734b13184SAlex Elder */ 448834b13184SAlex Elder static ssize_t rbd_features_show(struct device *dev, 448934b13184SAlex Elder struct device_attribute *attr, char *buf) 449034b13184SAlex Elder { 449134b13184SAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 449234b13184SAlex Elder 449334b13184SAlex Elder return sprintf(buf, "0x%016llx\n", 449434b13184SAlex Elder (unsigned long long)rbd_dev->mapping.features); 449534b13184SAlex Elder } 449634b13184SAlex Elder 4497dfc5606dSYehuda Sadeh static ssize_t rbd_major_show(struct device *dev, 4498dfc5606dSYehuda Sadeh struct device_attribute *attr, char *buf) 4499602adf40SYehuda Sadeh { 4500593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 4501dfc5606dSYehuda Sadeh 4502fc71d833SAlex Elder if (rbd_dev->major) 4503dfc5606dSYehuda Sadeh return sprintf(buf, "%d\n", rbd_dev->major); 4504fc71d833SAlex Elder 4505fc71d833SAlex Elder return sprintf(buf, "(none)\n"); 4506dd82fff1SIlya Dryomov } 4507fc71d833SAlex Elder 4508dd82fff1SIlya Dryomov static ssize_t rbd_minor_show(struct device *dev, 4509dd82fff1SIlya Dryomov struct device_attribute *attr, char *buf) 4510dd82fff1SIlya Dryomov { 4511dd82fff1SIlya Dryomov struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 4512dd82fff1SIlya Dryomov 4513dd82fff1SIlya Dryomov return sprintf(buf, "%d\n", rbd_dev->minor); 4514dfc5606dSYehuda Sadeh } 4515dfc5606dSYehuda Sadeh 4516005a07bfSIlya Dryomov static ssize_t rbd_client_addr_show(struct device *dev, 4517005a07bfSIlya Dryomov struct device_attribute *attr, char *buf) 4518005a07bfSIlya Dryomov { 4519005a07bfSIlya Dryomov struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 4520005a07bfSIlya Dryomov struct ceph_entity_addr *client_addr = 4521005a07bfSIlya Dryomov ceph_client_addr(rbd_dev->rbd_client->client); 4522005a07bfSIlya Dryomov 4523005a07bfSIlya Dryomov return sprintf(buf, "%pISpc/%u\n", &client_addr->in_addr, 4524005a07bfSIlya Dryomov le32_to_cpu(client_addr->nonce)); 4525005a07bfSIlya Dryomov } 4526005a07bfSIlya Dryomov 4527dfc5606dSYehuda Sadeh static ssize_t rbd_client_id_show(struct device *dev, 4528dfc5606dSYehuda Sadeh struct device_attribute *attr, char *buf) 4529dfc5606dSYehuda Sadeh { 4530593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 4531dfc5606dSYehuda Sadeh 45321dbb4399SAlex Elder return sprintf(buf, "client%lld\n", 4533033268a5SIlya Dryomov ceph_client_gid(rbd_dev->rbd_client->client)); 4534dfc5606dSYehuda Sadeh } 4535dfc5606dSYehuda Sadeh 4536267fb90bSMike Christie static ssize_t rbd_cluster_fsid_show(struct device *dev, 4537267fb90bSMike Christie struct device_attribute *attr, char *buf) 4538267fb90bSMike Christie { 4539267fb90bSMike Christie struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 4540267fb90bSMike Christie 4541267fb90bSMike Christie return sprintf(buf, "%pU\n", &rbd_dev->rbd_client->client->fsid); 4542267fb90bSMike Christie } 4543267fb90bSMike Christie 45440d6d1e9cSMike Christie static ssize_t rbd_config_info_show(struct device *dev, 45450d6d1e9cSMike Christie struct device_attribute *attr, char *buf) 45460d6d1e9cSMike Christie { 45470d6d1e9cSMike Christie struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 45480d6d1e9cSMike Christie 45490d6d1e9cSMike Christie return sprintf(buf, "%s\n", rbd_dev->config_info); 4550dfc5606dSYehuda Sadeh } 4551dfc5606dSYehuda Sadeh 4552dfc5606dSYehuda Sadeh static ssize_t rbd_pool_show(struct device *dev, 4553dfc5606dSYehuda Sadeh struct device_attribute *attr, char *buf) 4554dfc5606dSYehuda Sadeh { 4555593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 4556dfc5606dSYehuda Sadeh 45570d7dbfceSAlex Elder return sprintf(buf, "%s\n", rbd_dev->spec->pool_name); 4558dfc5606dSYehuda Sadeh } 4559dfc5606dSYehuda Sadeh 45609bb2f334SAlex Elder static ssize_t rbd_pool_id_show(struct device *dev, 45619bb2f334SAlex Elder struct device_attribute *attr, char *buf) 45629bb2f334SAlex Elder { 45639bb2f334SAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 45649bb2f334SAlex Elder 45650d7dbfceSAlex Elder return sprintf(buf, "%llu\n", 45660d7dbfceSAlex Elder (unsigned long long) rbd_dev->spec->pool_id); 45679bb2f334SAlex Elder } 45689bb2f334SAlex Elder 4569dfc5606dSYehuda Sadeh static ssize_t rbd_name_show(struct device *dev, 4570dfc5606dSYehuda Sadeh struct device_attribute *attr, char *buf) 4571dfc5606dSYehuda Sadeh { 4572593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 4573dfc5606dSYehuda Sadeh 4574a92ffdf8SAlex Elder if (rbd_dev->spec->image_name) 45750d7dbfceSAlex Elder return sprintf(buf, "%s\n", rbd_dev->spec->image_name); 4576a92ffdf8SAlex Elder 4577a92ffdf8SAlex Elder return sprintf(buf, "(unknown)\n"); 4578dfc5606dSYehuda Sadeh } 4579dfc5606dSYehuda Sadeh 4580589d30e0SAlex Elder static ssize_t rbd_image_id_show(struct device *dev, 4581589d30e0SAlex Elder struct device_attribute *attr, char *buf) 4582589d30e0SAlex Elder { 4583589d30e0SAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 4584589d30e0SAlex Elder 45850d7dbfceSAlex Elder return sprintf(buf, "%s\n", rbd_dev->spec->image_id); 4586589d30e0SAlex Elder } 4587589d30e0SAlex Elder 458834b13184SAlex Elder /* 458934b13184SAlex Elder * Shows the name of the currently-mapped snapshot (or 459034b13184SAlex Elder * RBD_SNAP_HEAD_NAME for the base image). 459134b13184SAlex Elder */ 4592dfc5606dSYehuda Sadeh static ssize_t rbd_snap_show(struct device *dev, 4593dfc5606dSYehuda Sadeh struct device_attribute *attr, 4594dfc5606dSYehuda Sadeh char *buf) 4595dfc5606dSYehuda Sadeh { 4596593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 4597dfc5606dSYehuda Sadeh 45980d7dbfceSAlex Elder return sprintf(buf, "%s\n", rbd_dev->spec->snap_name); 4599dfc5606dSYehuda Sadeh } 4600dfc5606dSYehuda Sadeh 460192a58671SMike Christie static ssize_t rbd_snap_id_show(struct device *dev, 460292a58671SMike Christie struct device_attribute *attr, char *buf) 460392a58671SMike Christie { 460492a58671SMike Christie struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 460592a58671SMike Christie 460692a58671SMike Christie return sprintf(buf, "%llu\n", rbd_dev->spec->snap_id); 460792a58671SMike Christie } 460892a58671SMike Christie 460986b00e0dSAlex Elder /* 4610ff96128fSIlya Dryomov * For a v2 image, shows the chain of parent images, separated by empty 4611ff96128fSIlya Dryomov * lines. For v1 images or if there is no parent, shows "(no parent 4612ff96128fSIlya Dryomov * image)". 461386b00e0dSAlex Elder */ 461486b00e0dSAlex Elder static ssize_t rbd_parent_show(struct device *dev, 461586b00e0dSAlex Elder struct device_attribute *attr, 461686b00e0dSAlex Elder char *buf) 461786b00e0dSAlex Elder { 461886b00e0dSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 4619ff96128fSIlya Dryomov ssize_t count = 0; 462086b00e0dSAlex Elder 4621ff96128fSIlya Dryomov if (!rbd_dev->parent) 462286b00e0dSAlex Elder return sprintf(buf, "(no parent image)\n"); 462386b00e0dSAlex Elder 4624ff96128fSIlya Dryomov for ( ; rbd_dev->parent; rbd_dev = rbd_dev->parent) { 4625ff96128fSIlya Dryomov struct rbd_spec *spec = rbd_dev->parent_spec; 462686b00e0dSAlex Elder 4627ff96128fSIlya Dryomov count += sprintf(&buf[count], "%s" 4628ff96128fSIlya Dryomov "pool_id %llu\npool_name %s\n" 4629ff96128fSIlya Dryomov "image_id %s\nimage_name %s\n" 4630ff96128fSIlya Dryomov "snap_id %llu\nsnap_name %s\n" 4631ff96128fSIlya Dryomov "overlap %llu\n", 4632ff96128fSIlya Dryomov !count ? "" : "\n", /* first? */ 4633ff96128fSIlya Dryomov spec->pool_id, spec->pool_name, 4634ff96128fSIlya Dryomov spec->image_id, spec->image_name ?: "(unknown)", 4635ff96128fSIlya Dryomov spec->snap_id, spec->snap_name, 4636ff96128fSIlya Dryomov rbd_dev->parent_overlap); 4637ff96128fSIlya Dryomov } 463886b00e0dSAlex Elder 463986b00e0dSAlex Elder return count; 464086b00e0dSAlex Elder } 464186b00e0dSAlex Elder 4642dfc5606dSYehuda Sadeh static ssize_t rbd_image_refresh(struct device *dev, 4643dfc5606dSYehuda Sadeh struct device_attribute *attr, 4644dfc5606dSYehuda Sadeh const char *buf, 4645dfc5606dSYehuda Sadeh size_t size) 4646dfc5606dSYehuda Sadeh { 4647593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 4648b813623aSAlex Elder int ret; 4649602adf40SYehuda Sadeh 4650cc4a38bdSAlex Elder ret = rbd_dev_refresh(rbd_dev); 4651e627db08SAlex Elder if (ret) 465252bb1f9bSIlya Dryomov return ret; 4653b813623aSAlex Elder 465452bb1f9bSIlya Dryomov return size; 4655dfc5606dSYehuda Sadeh } 4656602adf40SYehuda Sadeh 4657dfc5606dSYehuda Sadeh static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL); 465834b13184SAlex Elder static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL); 4659dfc5606dSYehuda Sadeh static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL); 4660dd82fff1SIlya Dryomov static DEVICE_ATTR(minor, S_IRUGO, rbd_minor_show, NULL); 4661005a07bfSIlya Dryomov static DEVICE_ATTR(client_addr, S_IRUGO, rbd_client_addr_show, NULL); 4662dfc5606dSYehuda Sadeh static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL); 4663267fb90bSMike Christie static DEVICE_ATTR(cluster_fsid, S_IRUGO, rbd_cluster_fsid_show, NULL); 46640d6d1e9cSMike Christie static DEVICE_ATTR(config_info, S_IRUSR, rbd_config_info_show, NULL); 4665dfc5606dSYehuda Sadeh static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL); 46669bb2f334SAlex Elder static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL); 4667dfc5606dSYehuda Sadeh static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL); 4668589d30e0SAlex Elder static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL); 4669dfc5606dSYehuda Sadeh static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh); 4670dfc5606dSYehuda Sadeh static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL); 467192a58671SMike Christie static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL); 467286b00e0dSAlex Elder static DEVICE_ATTR(parent, S_IRUGO, rbd_parent_show, NULL); 4673dfc5606dSYehuda Sadeh 4674dfc5606dSYehuda Sadeh static struct attribute *rbd_attrs[] = { 4675dfc5606dSYehuda Sadeh &dev_attr_size.attr, 467634b13184SAlex Elder &dev_attr_features.attr, 4677dfc5606dSYehuda Sadeh &dev_attr_major.attr, 4678dd82fff1SIlya Dryomov &dev_attr_minor.attr, 4679005a07bfSIlya Dryomov &dev_attr_client_addr.attr, 4680dfc5606dSYehuda Sadeh &dev_attr_client_id.attr, 4681267fb90bSMike Christie &dev_attr_cluster_fsid.attr, 46820d6d1e9cSMike Christie &dev_attr_config_info.attr, 4683dfc5606dSYehuda Sadeh &dev_attr_pool.attr, 46849bb2f334SAlex Elder &dev_attr_pool_id.attr, 4685dfc5606dSYehuda Sadeh &dev_attr_name.attr, 4686589d30e0SAlex Elder &dev_attr_image_id.attr, 4687dfc5606dSYehuda Sadeh &dev_attr_current_snap.attr, 468892a58671SMike Christie &dev_attr_snap_id.attr, 468986b00e0dSAlex Elder &dev_attr_parent.attr, 4690dfc5606dSYehuda Sadeh &dev_attr_refresh.attr, 4691dfc5606dSYehuda Sadeh NULL 4692dfc5606dSYehuda Sadeh }; 4693dfc5606dSYehuda Sadeh 4694dfc5606dSYehuda Sadeh static struct attribute_group rbd_attr_group = { 4695dfc5606dSYehuda Sadeh .attrs = rbd_attrs, 4696dfc5606dSYehuda Sadeh }; 4697dfc5606dSYehuda Sadeh 4698dfc5606dSYehuda Sadeh static const struct attribute_group *rbd_attr_groups[] = { 4699dfc5606dSYehuda Sadeh &rbd_attr_group, 4700dfc5606dSYehuda Sadeh NULL 4701dfc5606dSYehuda Sadeh }; 4702dfc5606dSYehuda Sadeh 47036cac4695SIlya Dryomov static void rbd_dev_release(struct device *dev); 4704dfc5606dSYehuda Sadeh 4705dfc5606dSYehuda Sadeh static struct device_type rbd_device_type = { 4706dfc5606dSYehuda Sadeh .name = "rbd", 4707dfc5606dSYehuda Sadeh .groups = rbd_attr_groups, 47086cac4695SIlya Dryomov .release = rbd_dev_release, 4709dfc5606dSYehuda Sadeh }; 4710dfc5606dSYehuda Sadeh 47118b8fb99cSAlex Elder static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec) 47128b8fb99cSAlex Elder { 47138b8fb99cSAlex Elder kref_get(&spec->kref); 47148b8fb99cSAlex Elder 47158b8fb99cSAlex Elder return spec; 47168b8fb99cSAlex Elder } 47178b8fb99cSAlex Elder 47188b8fb99cSAlex Elder static void rbd_spec_free(struct kref *kref); 47198b8fb99cSAlex Elder static void rbd_spec_put(struct rbd_spec *spec) 47208b8fb99cSAlex Elder { 47218b8fb99cSAlex Elder if (spec) 47228b8fb99cSAlex Elder kref_put(&spec->kref, rbd_spec_free); 47238b8fb99cSAlex Elder } 47248b8fb99cSAlex Elder 47258b8fb99cSAlex Elder static struct rbd_spec *rbd_spec_alloc(void) 47268b8fb99cSAlex Elder { 47278b8fb99cSAlex Elder struct rbd_spec *spec; 47288b8fb99cSAlex Elder 47298b8fb99cSAlex Elder spec = kzalloc(sizeof (*spec), GFP_KERNEL); 47308b8fb99cSAlex Elder if (!spec) 47318b8fb99cSAlex Elder return NULL; 473204077599SIlya Dryomov 473304077599SIlya Dryomov spec->pool_id = CEPH_NOPOOL; 473404077599SIlya Dryomov spec->snap_id = CEPH_NOSNAP; 47358b8fb99cSAlex Elder kref_init(&spec->kref); 47368b8fb99cSAlex Elder 47378b8fb99cSAlex Elder return spec; 47388b8fb99cSAlex Elder } 47398b8fb99cSAlex Elder 47408b8fb99cSAlex Elder static void rbd_spec_free(struct kref *kref) 47418b8fb99cSAlex Elder { 47428b8fb99cSAlex Elder struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref); 47438b8fb99cSAlex Elder 47448b8fb99cSAlex Elder kfree(spec->pool_name); 47458b8fb99cSAlex Elder kfree(spec->image_id); 47468b8fb99cSAlex Elder kfree(spec->image_name); 47478b8fb99cSAlex Elder kfree(spec->snap_name); 47488b8fb99cSAlex Elder kfree(spec); 47498b8fb99cSAlex Elder } 47508b8fb99cSAlex Elder 47511643dfa4SIlya Dryomov static void rbd_dev_free(struct rbd_device *rbd_dev) 4752dd5ac32dSIlya Dryomov { 475399d16943SIlya Dryomov WARN_ON(rbd_dev->watch_state != RBD_WATCH_STATE_UNREGISTERED); 4754ed95b21aSIlya Dryomov WARN_ON(rbd_dev->lock_state != RBD_LOCK_STATE_UNLOCKED); 4755dd5ac32dSIlya Dryomov 4756c41d13a3SIlya Dryomov ceph_oid_destroy(&rbd_dev->header_oid); 47576b6dddbeSIlya Dryomov ceph_oloc_destroy(&rbd_dev->header_oloc); 47580d6d1e9cSMike Christie kfree(rbd_dev->config_info); 4759c41d13a3SIlya Dryomov 4760dd5ac32dSIlya Dryomov rbd_put_client(rbd_dev->rbd_client); 4761dd5ac32dSIlya Dryomov rbd_spec_put(rbd_dev->spec); 4762dd5ac32dSIlya Dryomov kfree(rbd_dev->opts); 4763dd5ac32dSIlya Dryomov kfree(rbd_dev); 47641643dfa4SIlya Dryomov } 47651643dfa4SIlya Dryomov 47661643dfa4SIlya Dryomov static void rbd_dev_release(struct device *dev) 47671643dfa4SIlya Dryomov { 47681643dfa4SIlya Dryomov struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 47691643dfa4SIlya Dryomov bool need_put = !!rbd_dev->opts; 47701643dfa4SIlya Dryomov 47711643dfa4SIlya Dryomov if (need_put) { 47721643dfa4SIlya Dryomov destroy_workqueue(rbd_dev->task_wq); 47731643dfa4SIlya Dryomov ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id); 47741643dfa4SIlya Dryomov } 47751643dfa4SIlya Dryomov 47761643dfa4SIlya Dryomov rbd_dev_free(rbd_dev); 4777dd5ac32dSIlya Dryomov 4778dd5ac32dSIlya Dryomov /* 4779dd5ac32dSIlya Dryomov * This is racy, but way better than putting module outside of 4780dd5ac32dSIlya Dryomov * the release callback. The race window is pretty small, so 4781dd5ac32dSIlya Dryomov * doing something similar to dm (dm-builtin.c) is overkill. 4782dd5ac32dSIlya Dryomov */ 4783dd5ac32dSIlya Dryomov if (need_put) 4784dd5ac32dSIlya Dryomov module_put(THIS_MODULE); 4785dd5ac32dSIlya Dryomov } 4786dd5ac32dSIlya Dryomov 47871643dfa4SIlya Dryomov static struct rbd_device *__rbd_dev_create(struct rbd_client *rbdc, 47881643dfa4SIlya Dryomov struct rbd_spec *spec) 4789c53d5893SAlex Elder { 4790c53d5893SAlex Elder struct rbd_device *rbd_dev; 4791c53d5893SAlex Elder 4792c53d5893SAlex Elder rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL); 4793c53d5893SAlex Elder if (!rbd_dev) 4794c53d5893SAlex Elder return NULL; 4795c53d5893SAlex Elder 4796c53d5893SAlex Elder spin_lock_init(&rbd_dev->lock); 4797c53d5893SAlex Elder INIT_LIST_HEAD(&rbd_dev->node); 4798c53d5893SAlex Elder init_rwsem(&rbd_dev->header_rwsem); 4799c53d5893SAlex Elder 48007e97332eSIlya Dryomov rbd_dev->header.data_pool_id = CEPH_NOPOOL; 4801c41d13a3SIlya Dryomov ceph_oid_init(&rbd_dev->header_oid); 4802431a02cdSIlya Dryomov rbd_dev->header_oloc.pool = spec->pool_id; 4803c41d13a3SIlya Dryomov 480499d16943SIlya Dryomov mutex_init(&rbd_dev->watch_mutex); 480599d16943SIlya Dryomov rbd_dev->watch_state = RBD_WATCH_STATE_UNREGISTERED; 480699d16943SIlya Dryomov INIT_DELAYED_WORK(&rbd_dev->watch_dwork, rbd_reregister_watch); 480799d16943SIlya Dryomov 4808ed95b21aSIlya Dryomov init_rwsem(&rbd_dev->lock_rwsem); 4809ed95b21aSIlya Dryomov rbd_dev->lock_state = RBD_LOCK_STATE_UNLOCKED; 4810ed95b21aSIlya Dryomov INIT_WORK(&rbd_dev->acquired_lock_work, rbd_notify_acquired_lock); 4811ed95b21aSIlya Dryomov INIT_WORK(&rbd_dev->released_lock_work, rbd_notify_released_lock); 4812ed95b21aSIlya Dryomov INIT_DELAYED_WORK(&rbd_dev->lock_dwork, rbd_acquire_lock); 4813ed95b21aSIlya Dryomov INIT_WORK(&rbd_dev->unlock_work, rbd_release_lock_work); 4814ed95b21aSIlya Dryomov init_waitqueue_head(&rbd_dev->lock_waitq); 4815ed95b21aSIlya Dryomov 4816dd5ac32dSIlya Dryomov rbd_dev->dev.bus = &rbd_bus_type; 4817dd5ac32dSIlya Dryomov rbd_dev->dev.type = &rbd_device_type; 4818dd5ac32dSIlya Dryomov rbd_dev->dev.parent = &rbd_root_dev; 4819dd5ac32dSIlya Dryomov device_initialize(&rbd_dev->dev); 4820dd5ac32dSIlya Dryomov 4821c53d5893SAlex Elder rbd_dev->rbd_client = rbdc; 4822d147543dSIlya Dryomov rbd_dev->spec = spec; 48230903e875SAlex Elder 48241643dfa4SIlya Dryomov return rbd_dev; 48251643dfa4SIlya Dryomov } 48261643dfa4SIlya Dryomov 4827dd5ac32dSIlya Dryomov /* 48281643dfa4SIlya Dryomov * Create a mapping rbd_dev. 4829dd5ac32dSIlya Dryomov */ 48301643dfa4SIlya Dryomov static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc, 48311643dfa4SIlya Dryomov struct rbd_spec *spec, 48321643dfa4SIlya Dryomov struct rbd_options *opts) 48331643dfa4SIlya Dryomov { 48341643dfa4SIlya Dryomov struct rbd_device *rbd_dev; 48351643dfa4SIlya Dryomov 48361643dfa4SIlya Dryomov rbd_dev = __rbd_dev_create(rbdc, spec); 48371643dfa4SIlya Dryomov if (!rbd_dev) 48381643dfa4SIlya Dryomov return NULL; 48391643dfa4SIlya Dryomov 48401643dfa4SIlya Dryomov rbd_dev->opts = opts; 48411643dfa4SIlya Dryomov 48421643dfa4SIlya Dryomov /* get an id and fill in device name */ 48431643dfa4SIlya Dryomov rbd_dev->dev_id = ida_simple_get(&rbd_dev_id_ida, 0, 48441643dfa4SIlya Dryomov minor_to_rbd_dev_id(1 << MINORBITS), 48451643dfa4SIlya Dryomov GFP_KERNEL); 48461643dfa4SIlya Dryomov if (rbd_dev->dev_id < 0) 48471643dfa4SIlya Dryomov goto fail_rbd_dev; 48481643dfa4SIlya Dryomov 48491643dfa4SIlya Dryomov sprintf(rbd_dev->name, RBD_DRV_NAME "%d", rbd_dev->dev_id); 48501643dfa4SIlya Dryomov rbd_dev->task_wq = alloc_ordered_workqueue("%s-tasks", WQ_MEM_RECLAIM, 48511643dfa4SIlya Dryomov rbd_dev->name); 48521643dfa4SIlya Dryomov if (!rbd_dev->task_wq) 48531643dfa4SIlya Dryomov goto fail_dev_id; 48541643dfa4SIlya Dryomov 48551643dfa4SIlya Dryomov /* we have a ref from do_rbd_add() */ 4856dd5ac32dSIlya Dryomov __module_get(THIS_MODULE); 4857dd5ac32dSIlya Dryomov 48581643dfa4SIlya Dryomov dout("%s rbd_dev %p dev_id %d\n", __func__, rbd_dev, rbd_dev->dev_id); 4859c53d5893SAlex Elder return rbd_dev; 48601643dfa4SIlya Dryomov 48611643dfa4SIlya Dryomov fail_dev_id: 48621643dfa4SIlya Dryomov ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id); 48631643dfa4SIlya Dryomov fail_rbd_dev: 48641643dfa4SIlya Dryomov rbd_dev_free(rbd_dev); 48651643dfa4SIlya Dryomov return NULL; 4866c53d5893SAlex Elder } 4867c53d5893SAlex Elder 4868c53d5893SAlex Elder static void rbd_dev_destroy(struct rbd_device *rbd_dev) 4869c53d5893SAlex Elder { 4870dd5ac32dSIlya Dryomov if (rbd_dev) 4871dd5ac32dSIlya Dryomov put_device(&rbd_dev->dev); 4872c53d5893SAlex Elder } 4873c53d5893SAlex Elder 4874dfc5606dSYehuda Sadeh /* 48759d475de5SAlex Elder * Get the size and object order for an image snapshot, or if 48769d475de5SAlex Elder * snap_id is CEPH_NOSNAP, gets this information for the base 48779d475de5SAlex Elder * image. 48789d475de5SAlex Elder */ 48799d475de5SAlex Elder static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id, 48809d475de5SAlex Elder u8 *order, u64 *snap_size) 48819d475de5SAlex Elder { 48829d475de5SAlex Elder __le64 snapid = cpu_to_le64(snap_id); 48839d475de5SAlex Elder int ret; 48849d475de5SAlex Elder struct { 48859d475de5SAlex Elder u8 order; 48869d475de5SAlex Elder __le64 size; 48879d475de5SAlex Elder } __attribute__ ((packed)) size_buf = { 0 }; 48889d475de5SAlex Elder 4889ecd4a68aSIlya Dryomov ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid, 4890ecd4a68aSIlya Dryomov &rbd_dev->header_oloc, "get_size", 48914157976bSAlex Elder &snapid, sizeof(snapid), 4892e2a58ee5SAlex Elder &size_buf, sizeof(size_buf)); 489336be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 48949d475de5SAlex Elder if (ret < 0) 48959d475de5SAlex Elder return ret; 489657385b51SAlex Elder if (ret < sizeof (size_buf)) 489757385b51SAlex Elder return -ERANGE; 48989d475de5SAlex Elder 4899c3545579SJosh Durgin if (order) { 49009d475de5SAlex Elder *order = size_buf.order; 4901c3545579SJosh Durgin dout(" order %u", (unsigned int)*order); 4902c3545579SJosh Durgin } 49039d475de5SAlex Elder *snap_size = le64_to_cpu(size_buf.size); 49049d475de5SAlex Elder 4905c3545579SJosh Durgin dout(" snap_id 0x%016llx snap_size = %llu\n", 4906c3545579SJosh Durgin (unsigned long long)snap_id, 49079d475de5SAlex Elder (unsigned long long)*snap_size); 49089d475de5SAlex Elder 49099d475de5SAlex Elder return 0; 49109d475de5SAlex Elder } 49119d475de5SAlex Elder 49129d475de5SAlex Elder static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev) 49139d475de5SAlex Elder { 49149d475de5SAlex Elder return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP, 49159d475de5SAlex Elder &rbd_dev->header.obj_order, 49169d475de5SAlex Elder &rbd_dev->header.image_size); 49179d475de5SAlex Elder } 49189d475de5SAlex Elder 49191e130199SAlex Elder static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev) 49201e130199SAlex Elder { 49211e130199SAlex Elder void *reply_buf; 49221e130199SAlex Elder int ret; 49231e130199SAlex Elder void *p; 49241e130199SAlex Elder 49251e130199SAlex Elder reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL); 49261e130199SAlex Elder if (!reply_buf) 49271e130199SAlex Elder return -ENOMEM; 49281e130199SAlex Elder 4929ecd4a68aSIlya Dryomov ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid, 4930ecd4a68aSIlya Dryomov &rbd_dev->header_oloc, "get_object_prefix", 4931ecd4a68aSIlya Dryomov NULL, 0, reply_buf, RBD_OBJ_PREFIX_LEN_MAX); 493236be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 49331e130199SAlex Elder if (ret < 0) 49341e130199SAlex Elder goto out; 49351e130199SAlex Elder 49361e130199SAlex Elder p = reply_buf; 49371e130199SAlex Elder rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p, 493857385b51SAlex Elder p + ret, NULL, GFP_NOIO); 493957385b51SAlex Elder ret = 0; 49401e130199SAlex Elder 49411e130199SAlex Elder if (IS_ERR(rbd_dev->header.object_prefix)) { 49421e130199SAlex Elder ret = PTR_ERR(rbd_dev->header.object_prefix); 49431e130199SAlex Elder rbd_dev->header.object_prefix = NULL; 49441e130199SAlex Elder } else { 49451e130199SAlex Elder dout(" object_prefix = %s\n", rbd_dev->header.object_prefix); 49461e130199SAlex Elder } 49471e130199SAlex Elder out: 49481e130199SAlex Elder kfree(reply_buf); 49491e130199SAlex Elder 49501e130199SAlex Elder return ret; 49511e130199SAlex Elder } 49521e130199SAlex Elder 4953b1b5402aSAlex Elder static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id, 4954b1b5402aSAlex Elder u64 *snap_features) 4955b1b5402aSAlex Elder { 4956b1b5402aSAlex Elder __le64 snapid = cpu_to_le64(snap_id); 4957b1b5402aSAlex Elder struct { 4958b1b5402aSAlex Elder __le64 features; 4959b1b5402aSAlex Elder __le64 incompat; 49604157976bSAlex Elder } __attribute__ ((packed)) features_buf = { 0 }; 4961d3767f0fSIlya Dryomov u64 unsup; 4962b1b5402aSAlex Elder int ret; 4963b1b5402aSAlex Elder 4964ecd4a68aSIlya Dryomov ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid, 4965ecd4a68aSIlya Dryomov &rbd_dev->header_oloc, "get_features", 49664157976bSAlex Elder &snapid, sizeof(snapid), 4967e2a58ee5SAlex Elder &features_buf, sizeof(features_buf)); 496836be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 4969b1b5402aSAlex Elder if (ret < 0) 4970b1b5402aSAlex Elder return ret; 497157385b51SAlex Elder if (ret < sizeof (features_buf)) 497257385b51SAlex Elder return -ERANGE; 4973d889140cSAlex Elder 4974d3767f0fSIlya Dryomov unsup = le64_to_cpu(features_buf.incompat) & ~RBD_FEATURES_SUPPORTED; 4975d3767f0fSIlya Dryomov if (unsup) { 4976d3767f0fSIlya Dryomov rbd_warn(rbd_dev, "image uses unsupported features: 0x%llx", 4977d3767f0fSIlya Dryomov unsup); 4978b8f5c6edSAlex Elder return -ENXIO; 4979d3767f0fSIlya Dryomov } 4980d889140cSAlex Elder 4981b1b5402aSAlex Elder *snap_features = le64_to_cpu(features_buf.features); 4982b1b5402aSAlex Elder 4983b1b5402aSAlex Elder dout(" snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n", 4984b1b5402aSAlex Elder (unsigned long long)snap_id, 4985b1b5402aSAlex Elder (unsigned long long)*snap_features, 4986b1b5402aSAlex Elder (unsigned long long)le64_to_cpu(features_buf.incompat)); 4987b1b5402aSAlex Elder 4988b1b5402aSAlex Elder return 0; 4989b1b5402aSAlex Elder } 4990b1b5402aSAlex Elder 4991b1b5402aSAlex Elder static int rbd_dev_v2_features(struct rbd_device *rbd_dev) 4992b1b5402aSAlex Elder { 4993b1b5402aSAlex Elder return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP, 4994b1b5402aSAlex Elder &rbd_dev->header.features); 4995b1b5402aSAlex Elder } 4996b1b5402aSAlex Elder 499786b00e0dSAlex Elder static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev) 499886b00e0dSAlex Elder { 499986b00e0dSAlex Elder struct rbd_spec *parent_spec; 500086b00e0dSAlex Elder size_t size; 500186b00e0dSAlex Elder void *reply_buf = NULL; 500286b00e0dSAlex Elder __le64 snapid; 500386b00e0dSAlex Elder void *p; 500486b00e0dSAlex Elder void *end; 5005642a2537SAlex Elder u64 pool_id; 500686b00e0dSAlex Elder char *image_id; 50073b5cf2a2SAlex Elder u64 snap_id; 500886b00e0dSAlex Elder u64 overlap; 500986b00e0dSAlex Elder int ret; 501086b00e0dSAlex Elder 501186b00e0dSAlex Elder parent_spec = rbd_spec_alloc(); 501286b00e0dSAlex Elder if (!parent_spec) 501386b00e0dSAlex Elder return -ENOMEM; 501486b00e0dSAlex Elder 501586b00e0dSAlex Elder size = sizeof (__le64) + /* pool_id */ 501686b00e0dSAlex Elder sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX + /* image_id */ 501786b00e0dSAlex Elder sizeof (__le64) + /* snap_id */ 501886b00e0dSAlex Elder sizeof (__le64); /* overlap */ 501986b00e0dSAlex Elder reply_buf = kmalloc(size, GFP_KERNEL); 502086b00e0dSAlex Elder if (!reply_buf) { 502186b00e0dSAlex Elder ret = -ENOMEM; 502286b00e0dSAlex Elder goto out_err; 502386b00e0dSAlex Elder } 502486b00e0dSAlex Elder 50254d9b67cdSIlya Dryomov snapid = cpu_to_le64(rbd_dev->spec->snap_id); 5026ecd4a68aSIlya Dryomov ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid, 5027ecd4a68aSIlya Dryomov &rbd_dev->header_oloc, "get_parent", 5028ecd4a68aSIlya Dryomov &snapid, sizeof(snapid), reply_buf, size); 502936be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 503086b00e0dSAlex Elder if (ret < 0) 503186b00e0dSAlex Elder goto out_err; 503286b00e0dSAlex Elder 503386b00e0dSAlex Elder p = reply_buf; 503457385b51SAlex Elder end = reply_buf + ret; 503557385b51SAlex Elder ret = -ERANGE; 5036642a2537SAlex Elder ceph_decode_64_safe(&p, end, pool_id, out_err); 5037392a9dadSAlex Elder if (pool_id == CEPH_NOPOOL) { 5038392a9dadSAlex Elder /* 5039392a9dadSAlex Elder * Either the parent never existed, or we have 5040392a9dadSAlex Elder * record of it but the image got flattened so it no 5041392a9dadSAlex Elder * longer has a parent. When the parent of a 5042392a9dadSAlex Elder * layered image disappears we immediately set the 5043392a9dadSAlex Elder * overlap to 0. The effect of this is that all new 5044392a9dadSAlex Elder * requests will be treated as if the image had no 5045392a9dadSAlex Elder * parent. 5046392a9dadSAlex Elder */ 5047392a9dadSAlex Elder if (rbd_dev->parent_overlap) { 5048392a9dadSAlex Elder rbd_dev->parent_overlap = 0; 5049392a9dadSAlex Elder rbd_dev_parent_put(rbd_dev); 5050392a9dadSAlex Elder pr_info("%s: clone image has been flattened\n", 5051392a9dadSAlex Elder rbd_dev->disk->disk_name); 5052392a9dadSAlex Elder } 5053392a9dadSAlex Elder 505486b00e0dSAlex Elder goto out; /* No parent? No problem. */ 5055392a9dadSAlex Elder } 505686b00e0dSAlex Elder 50570903e875SAlex Elder /* The ceph file layout needs to fit pool id in 32 bits */ 50580903e875SAlex Elder 50590903e875SAlex Elder ret = -EIO; 5060642a2537SAlex Elder if (pool_id > (u64)U32_MAX) { 50619584d508SIlya Dryomov rbd_warn(NULL, "parent pool id too large (%llu > %u)", 5062642a2537SAlex Elder (unsigned long long)pool_id, U32_MAX); 506357385b51SAlex Elder goto out_err; 5064c0cd10dbSAlex Elder } 50650903e875SAlex Elder 5066979ed480SAlex Elder image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL); 506786b00e0dSAlex Elder if (IS_ERR(image_id)) { 506886b00e0dSAlex Elder ret = PTR_ERR(image_id); 506986b00e0dSAlex Elder goto out_err; 507086b00e0dSAlex Elder } 50713b5cf2a2SAlex Elder ceph_decode_64_safe(&p, end, snap_id, out_err); 507286b00e0dSAlex Elder ceph_decode_64_safe(&p, end, overlap, out_err); 507386b00e0dSAlex Elder 50743b5cf2a2SAlex Elder /* 50753b5cf2a2SAlex Elder * The parent won't change (except when the clone is 50763b5cf2a2SAlex Elder * flattened, already handled that). So we only need to 50773b5cf2a2SAlex Elder * record the parent spec we have not already done so. 50783b5cf2a2SAlex Elder */ 50793b5cf2a2SAlex Elder if (!rbd_dev->parent_spec) { 50803b5cf2a2SAlex Elder parent_spec->pool_id = pool_id; 50813b5cf2a2SAlex Elder parent_spec->image_id = image_id; 50823b5cf2a2SAlex Elder parent_spec->snap_id = snap_id; 508386b00e0dSAlex Elder rbd_dev->parent_spec = parent_spec; 508486b00e0dSAlex Elder parent_spec = NULL; /* rbd_dev now owns this */ 5085fbba11b3SIlya Dryomov } else { 5086fbba11b3SIlya Dryomov kfree(image_id); 50873b5cf2a2SAlex Elder } 50883b5cf2a2SAlex Elder 50893b5cf2a2SAlex Elder /* 5090cf32bd9cSIlya Dryomov * We always update the parent overlap. If it's zero we issue 5091cf32bd9cSIlya Dryomov * a warning, as we will proceed as if there was no parent. 50923b5cf2a2SAlex Elder */ 50933b5cf2a2SAlex Elder if (!overlap) { 50943b5cf2a2SAlex Elder if (parent_spec) { 5095cf32bd9cSIlya Dryomov /* refresh, careful to warn just once */ 5096cf32bd9cSIlya Dryomov if (rbd_dev->parent_overlap) 5097cf32bd9cSIlya Dryomov rbd_warn(rbd_dev, 5098cf32bd9cSIlya Dryomov "clone now standalone (overlap became 0)"); 509970cf49cfSAlex Elder } else { 5100cf32bd9cSIlya Dryomov /* initial probe */ 5101cf32bd9cSIlya Dryomov rbd_warn(rbd_dev, "clone is standalone (overlap 0)"); 51023b5cf2a2SAlex Elder } 510370cf49cfSAlex Elder } 5104cf32bd9cSIlya Dryomov rbd_dev->parent_overlap = overlap; 5105cf32bd9cSIlya Dryomov 510686b00e0dSAlex Elder out: 510786b00e0dSAlex Elder ret = 0; 510886b00e0dSAlex Elder out_err: 510986b00e0dSAlex Elder kfree(reply_buf); 511086b00e0dSAlex Elder rbd_spec_put(parent_spec); 511186b00e0dSAlex Elder 511286b00e0dSAlex Elder return ret; 511386b00e0dSAlex Elder } 511486b00e0dSAlex Elder 5115cc070d59SAlex Elder static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev) 5116cc070d59SAlex Elder { 5117cc070d59SAlex Elder struct { 5118cc070d59SAlex Elder __le64 stripe_unit; 5119cc070d59SAlex Elder __le64 stripe_count; 5120cc070d59SAlex Elder } __attribute__ ((packed)) striping_info_buf = { 0 }; 5121cc070d59SAlex Elder size_t size = sizeof (striping_info_buf); 5122cc070d59SAlex Elder void *p; 5123cc070d59SAlex Elder u64 obj_size; 5124cc070d59SAlex Elder u64 stripe_unit; 5125cc070d59SAlex Elder u64 stripe_count; 5126cc070d59SAlex Elder int ret; 5127cc070d59SAlex Elder 5128ecd4a68aSIlya Dryomov ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid, 5129ecd4a68aSIlya Dryomov &rbd_dev->header_oloc, "get_stripe_unit_count", 5130ecd4a68aSIlya Dryomov NULL, 0, &striping_info_buf, size); 5131cc070d59SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 5132cc070d59SAlex Elder if (ret < 0) 5133cc070d59SAlex Elder return ret; 5134cc070d59SAlex Elder if (ret < size) 5135cc070d59SAlex Elder return -ERANGE; 5136cc070d59SAlex Elder 5137cc070d59SAlex Elder /* 5138cc070d59SAlex Elder * We don't actually support the "fancy striping" feature 5139cc070d59SAlex Elder * (STRIPINGV2) yet, but if the striping sizes are the 5140cc070d59SAlex Elder * defaults the behavior is the same as before. So find 5141cc070d59SAlex Elder * out, and only fail if the image has non-default values. 5142cc070d59SAlex Elder */ 5143cc070d59SAlex Elder ret = -EINVAL; 51445bc3fb17SIlya Dryomov obj_size = rbd_obj_bytes(&rbd_dev->header); 5145cc070d59SAlex Elder p = &striping_info_buf; 5146cc070d59SAlex Elder stripe_unit = ceph_decode_64(&p); 5147cc070d59SAlex Elder if (stripe_unit != obj_size) { 5148cc070d59SAlex Elder rbd_warn(rbd_dev, "unsupported stripe unit " 5149cc070d59SAlex Elder "(got %llu want %llu)", 5150cc070d59SAlex Elder stripe_unit, obj_size); 5151cc070d59SAlex Elder return -EINVAL; 5152cc070d59SAlex Elder } 5153cc070d59SAlex Elder stripe_count = ceph_decode_64(&p); 5154cc070d59SAlex Elder if (stripe_count != 1) { 5155cc070d59SAlex Elder rbd_warn(rbd_dev, "unsupported stripe count " 5156cc070d59SAlex Elder "(got %llu want 1)", stripe_count); 5157cc070d59SAlex Elder return -EINVAL; 5158cc070d59SAlex Elder } 5159500d0c0fSAlex Elder rbd_dev->header.stripe_unit = stripe_unit; 5160500d0c0fSAlex Elder rbd_dev->header.stripe_count = stripe_count; 5161cc070d59SAlex Elder 5162cc070d59SAlex Elder return 0; 5163cc070d59SAlex Elder } 5164cc070d59SAlex Elder 51657e97332eSIlya Dryomov static int rbd_dev_v2_data_pool(struct rbd_device *rbd_dev) 51667e97332eSIlya Dryomov { 51677e97332eSIlya Dryomov __le64 data_pool_id; 51687e97332eSIlya Dryomov int ret; 51697e97332eSIlya Dryomov 51707e97332eSIlya Dryomov ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid, 51717e97332eSIlya Dryomov &rbd_dev->header_oloc, "get_data_pool", 51727e97332eSIlya Dryomov NULL, 0, &data_pool_id, sizeof(data_pool_id)); 51737e97332eSIlya Dryomov if (ret < 0) 51747e97332eSIlya Dryomov return ret; 51757e97332eSIlya Dryomov if (ret < sizeof(data_pool_id)) 51767e97332eSIlya Dryomov return -EBADMSG; 51777e97332eSIlya Dryomov 51787e97332eSIlya Dryomov rbd_dev->header.data_pool_id = le64_to_cpu(data_pool_id); 51797e97332eSIlya Dryomov WARN_ON(rbd_dev->header.data_pool_id == CEPH_NOPOOL); 51807e97332eSIlya Dryomov return 0; 51817e97332eSIlya Dryomov } 51827e97332eSIlya Dryomov 51839e15b77dSAlex Elder static char *rbd_dev_image_name(struct rbd_device *rbd_dev) 51849e15b77dSAlex Elder { 5185ecd4a68aSIlya Dryomov CEPH_DEFINE_OID_ONSTACK(oid); 51869e15b77dSAlex Elder size_t image_id_size; 51879e15b77dSAlex Elder char *image_id; 51889e15b77dSAlex Elder void *p; 51899e15b77dSAlex Elder void *end; 51909e15b77dSAlex Elder size_t size; 51919e15b77dSAlex Elder void *reply_buf = NULL; 51929e15b77dSAlex Elder size_t len = 0; 51939e15b77dSAlex Elder char *image_name = NULL; 51949e15b77dSAlex Elder int ret; 51959e15b77dSAlex Elder 51969e15b77dSAlex Elder rbd_assert(!rbd_dev->spec->image_name); 51979e15b77dSAlex Elder 519869e7a02fSAlex Elder len = strlen(rbd_dev->spec->image_id); 519969e7a02fSAlex Elder image_id_size = sizeof (__le32) + len; 52009e15b77dSAlex Elder image_id = kmalloc(image_id_size, GFP_KERNEL); 52019e15b77dSAlex Elder if (!image_id) 52029e15b77dSAlex Elder return NULL; 52039e15b77dSAlex Elder 52049e15b77dSAlex Elder p = image_id; 52054157976bSAlex Elder end = image_id + image_id_size; 520669e7a02fSAlex Elder ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32)len); 52079e15b77dSAlex Elder 52089e15b77dSAlex Elder size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX; 52099e15b77dSAlex Elder reply_buf = kmalloc(size, GFP_KERNEL); 52109e15b77dSAlex Elder if (!reply_buf) 52119e15b77dSAlex Elder goto out; 52129e15b77dSAlex Elder 5213ecd4a68aSIlya Dryomov ceph_oid_printf(&oid, "%s", RBD_DIRECTORY); 5214ecd4a68aSIlya Dryomov ret = rbd_obj_method_sync(rbd_dev, &oid, &rbd_dev->header_oloc, 5215ecd4a68aSIlya Dryomov "dir_get_name", image_id, image_id_size, 5216e2a58ee5SAlex Elder reply_buf, size); 52179e15b77dSAlex Elder if (ret < 0) 52189e15b77dSAlex Elder goto out; 52199e15b77dSAlex Elder p = reply_buf; 5220f40eb349SAlex Elder end = reply_buf + ret; 5221f40eb349SAlex Elder 52229e15b77dSAlex Elder image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL); 52239e15b77dSAlex Elder if (IS_ERR(image_name)) 52249e15b77dSAlex Elder image_name = NULL; 52259e15b77dSAlex Elder else 52269e15b77dSAlex Elder dout("%s: name is %s len is %zd\n", __func__, image_name, len); 52279e15b77dSAlex Elder out: 52289e15b77dSAlex Elder kfree(reply_buf); 52299e15b77dSAlex Elder kfree(image_id); 52309e15b77dSAlex Elder 52319e15b77dSAlex Elder return image_name; 52329e15b77dSAlex Elder } 52339e15b77dSAlex Elder 52342ad3d716SAlex Elder static u64 rbd_v1_snap_id_by_name(struct rbd_device *rbd_dev, const char *name) 52352ad3d716SAlex Elder { 52362ad3d716SAlex Elder struct ceph_snap_context *snapc = rbd_dev->header.snapc; 52372ad3d716SAlex Elder const char *snap_name; 52382ad3d716SAlex Elder u32 which = 0; 52392ad3d716SAlex Elder 52402ad3d716SAlex Elder /* Skip over names until we find the one we are looking for */ 52412ad3d716SAlex Elder 52422ad3d716SAlex Elder snap_name = rbd_dev->header.snap_names; 52432ad3d716SAlex Elder while (which < snapc->num_snaps) { 52442ad3d716SAlex Elder if (!strcmp(name, snap_name)) 52452ad3d716SAlex Elder return snapc->snaps[which]; 52462ad3d716SAlex Elder snap_name += strlen(snap_name) + 1; 52472ad3d716SAlex Elder which++; 52482ad3d716SAlex Elder } 52492ad3d716SAlex Elder return CEPH_NOSNAP; 52502ad3d716SAlex Elder } 52512ad3d716SAlex Elder 52522ad3d716SAlex Elder static u64 rbd_v2_snap_id_by_name(struct rbd_device *rbd_dev, const char *name) 52532ad3d716SAlex Elder { 52542ad3d716SAlex Elder struct ceph_snap_context *snapc = rbd_dev->header.snapc; 52552ad3d716SAlex Elder u32 which; 52562ad3d716SAlex Elder bool found = false; 52572ad3d716SAlex Elder u64 snap_id; 52582ad3d716SAlex Elder 52592ad3d716SAlex Elder for (which = 0; !found && which < snapc->num_snaps; which++) { 52602ad3d716SAlex Elder const char *snap_name; 52612ad3d716SAlex Elder 52622ad3d716SAlex Elder snap_id = snapc->snaps[which]; 52632ad3d716SAlex Elder snap_name = rbd_dev_v2_snap_name(rbd_dev, snap_id); 5264efadc98aSJosh Durgin if (IS_ERR(snap_name)) { 5265efadc98aSJosh Durgin /* ignore no-longer existing snapshots */ 5266efadc98aSJosh Durgin if (PTR_ERR(snap_name) == -ENOENT) 5267efadc98aSJosh Durgin continue; 5268efadc98aSJosh Durgin else 52692ad3d716SAlex Elder break; 5270efadc98aSJosh Durgin } 52712ad3d716SAlex Elder found = !strcmp(name, snap_name); 52722ad3d716SAlex Elder kfree(snap_name); 52732ad3d716SAlex Elder } 52742ad3d716SAlex Elder return found ? snap_id : CEPH_NOSNAP; 52752ad3d716SAlex Elder } 52762ad3d716SAlex Elder 52772ad3d716SAlex Elder /* 52782ad3d716SAlex Elder * Assumes name is never RBD_SNAP_HEAD_NAME; returns CEPH_NOSNAP if 52792ad3d716SAlex Elder * no snapshot by that name is found, or if an error occurs. 52802ad3d716SAlex Elder */ 52812ad3d716SAlex Elder static u64 rbd_snap_id_by_name(struct rbd_device *rbd_dev, const char *name) 52822ad3d716SAlex Elder { 52832ad3d716SAlex Elder if (rbd_dev->image_format == 1) 52842ad3d716SAlex Elder return rbd_v1_snap_id_by_name(rbd_dev, name); 52852ad3d716SAlex Elder 52862ad3d716SAlex Elder return rbd_v2_snap_id_by_name(rbd_dev, name); 52872ad3d716SAlex Elder } 52882ad3d716SAlex Elder 52899e15b77dSAlex Elder /* 529004077599SIlya Dryomov * An image being mapped will have everything but the snap id. 52919e15b77dSAlex Elder */ 529204077599SIlya Dryomov static int rbd_spec_fill_snap_id(struct rbd_device *rbd_dev) 529304077599SIlya Dryomov { 529404077599SIlya Dryomov struct rbd_spec *spec = rbd_dev->spec; 529504077599SIlya Dryomov 529604077599SIlya Dryomov rbd_assert(spec->pool_id != CEPH_NOPOOL && spec->pool_name); 529704077599SIlya Dryomov rbd_assert(spec->image_id && spec->image_name); 529804077599SIlya Dryomov rbd_assert(spec->snap_name); 529904077599SIlya Dryomov 530004077599SIlya Dryomov if (strcmp(spec->snap_name, RBD_SNAP_HEAD_NAME)) { 530104077599SIlya Dryomov u64 snap_id; 530204077599SIlya Dryomov 530304077599SIlya Dryomov snap_id = rbd_snap_id_by_name(rbd_dev, spec->snap_name); 530404077599SIlya Dryomov if (snap_id == CEPH_NOSNAP) 530504077599SIlya Dryomov return -ENOENT; 530604077599SIlya Dryomov 530704077599SIlya Dryomov spec->snap_id = snap_id; 530804077599SIlya Dryomov } else { 530904077599SIlya Dryomov spec->snap_id = CEPH_NOSNAP; 531004077599SIlya Dryomov } 531104077599SIlya Dryomov 531204077599SIlya Dryomov return 0; 531304077599SIlya Dryomov } 531404077599SIlya Dryomov 531504077599SIlya Dryomov /* 531604077599SIlya Dryomov * A parent image will have all ids but none of the names. 531704077599SIlya Dryomov * 531804077599SIlya Dryomov * All names in an rbd spec are dynamically allocated. It's OK if we 531904077599SIlya Dryomov * can't figure out the name for an image id. 532004077599SIlya Dryomov */ 532104077599SIlya Dryomov static int rbd_spec_fill_names(struct rbd_device *rbd_dev) 53229e15b77dSAlex Elder { 53232e9f7f1cSAlex Elder struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 53242e9f7f1cSAlex Elder struct rbd_spec *spec = rbd_dev->spec; 53252e9f7f1cSAlex Elder const char *pool_name; 53262e9f7f1cSAlex Elder const char *image_name; 53272e9f7f1cSAlex Elder const char *snap_name; 53289e15b77dSAlex Elder int ret; 53299e15b77dSAlex Elder 533004077599SIlya Dryomov rbd_assert(spec->pool_id != CEPH_NOPOOL); 533104077599SIlya Dryomov rbd_assert(spec->image_id); 533204077599SIlya Dryomov rbd_assert(spec->snap_id != CEPH_NOSNAP); 53339e15b77dSAlex Elder 53342e9f7f1cSAlex Elder /* Get the pool name; we have to make our own copy of this */ 53359e15b77dSAlex Elder 53362e9f7f1cSAlex Elder pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, spec->pool_id); 53372e9f7f1cSAlex Elder if (!pool_name) { 53382e9f7f1cSAlex Elder rbd_warn(rbd_dev, "no pool with id %llu", spec->pool_id); 5339935dc89fSAlex Elder return -EIO; 5340935dc89fSAlex Elder } 53412e9f7f1cSAlex Elder pool_name = kstrdup(pool_name, GFP_KERNEL); 53422e9f7f1cSAlex Elder if (!pool_name) 53439e15b77dSAlex Elder return -ENOMEM; 53449e15b77dSAlex Elder 53459e15b77dSAlex Elder /* Fetch the image name; tolerate failure here */ 53469e15b77dSAlex Elder 53472e9f7f1cSAlex Elder image_name = rbd_dev_image_name(rbd_dev); 53482e9f7f1cSAlex Elder if (!image_name) 534906ecc6cbSAlex Elder rbd_warn(rbd_dev, "unable to get image name"); 53509e15b77dSAlex Elder 535104077599SIlya Dryomov /* Fetch the snapshot name */ 53529e15b77dSAlex Elder 53532e9f7f1cSAlex Elder snap_name = rbd_snap_name(rbd_dev, spec->snap_id); 5354da6a6b63SJosh Durgin if (IS_ERR(snap_name)) { 5355da6a6b63SJosh Durgin ret = PTR_ERR(snap_name); 53569e15b77dSAlex Elder goto out_err; 53572e9f7f1cSAlex Elder } 53582e9f7f1cSAlex Elder 53592e9f7f1cSAlex Elder spec->pool_name = pool_name; 53602e9f7f1cSAlex Elder spec->image_name = image_name; 53612e9f7f1cSAlex Elder spec->snap_name = snap_name; 53629e15b77dSAlex Elder 53639e15b77dSAlex Elder return 0; 536404077599SIlya Dryomov 53659e15b77dSAlex Elder out_err: 53662e9f7f1cSAlex Elder kfree(image_name); 53672e9f7f1cSAlex Elder kfree(pool_name); 53689e15b77dSAlex Elder return ret; 53699e15b77dSAlex Elder } 53709e15b77dSAlex Elder 5371cc4a38bdSAlex Elder static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev) 537235d489f9SAlex Elder { 537335d489f9SAlex Elder size_t size; 537435d489f9SAlex Elder int ret; 537535d489f9SAlex Elder void *reply_buf; 537635d489f9SAlex Elder void *p; 537735d489f9SAlex Elder void *end; 537835d489f9SAlex Elder u64 seq; 537935d489f9SAlex Elder u32 snap_count; 538035d489f9SAlex Elder struct ceph_snap_context *snapc; 538135d489f9SAlex Elder u32 i; 538235d489f9SAlex Elder 538335d489f9SAlex Elder /* 538435d489f9SAlex Elder * We'll need room for the seq value (maximum snapshot id), 538535d489f9SAlex Elder * snapshot count, and array of that many snapshot ids. 538635d489f9SAlex Elder * For now we have a fixed upper limit on the number we're 538735d489f9SAlex Elder * prepared to receive. 538835d489f9SAlex Elder */ 538935d489f9SAlex Elder size = sizeof (__le64) + sizeof (__le32) + 539035d489f9SAlex Elder RBD_MAX_SNAP_COUNT * sizeof (__le64); 539135d489f9SAlex Elder reply_buf = kzalloc(size, GFP_KERNEL); 539235d489f9SAlex Elder if (!reply_buf) 539335d489f9SAlex Elder return -ENOMEM; 539435d489f9SAlex Elder 5395ecd4a68aSIlya Dryomov ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid, 5396ecd4a68aSIlya Dryomov &rbd_dev->header_oloc, "get_snapcontext", 5397ecd4a68aSIlya Dryomov NULL, 0, reply_buf, size); 539836be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 539935d489f9SAlex Elder if (ret < 0) 540035d489f9SAlex Elder goto out; 540135d489f9SAlex Elder 540235d489f9SAlex Elder p = reply_buf; 540357385b51SAlex Elder end = reply_buf + ret; 540457385b51SAlex Elder ret = -ERANGE; 540535d489f9SAlex Elder ceph_decode_64_safe(&p, end, seq, out); 540635d489f9SAlex Elder ceph_decode_32_safe(&p, end, snap_count, out); 540735d489f9SAlex Elder 540835d489f9SAlex Elder /* 540935d489f9SAlex Elder * Make sure the reported number of snapshot ids wouldn't go 541035d489f9SAlex Elder * beyond the end of our buffer. But before checking that, 541135d489f9SAlex Elder * make sure the computed size of the snapshot context we 541235d489f9SAlex Elder * allocate is representable in a size_t. 541335d489f9SAlex Elder */ 541435d489f9SAlex Elder if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context)) 541535d489f9SAlex Elder / sizeof (u64)) { 541635d489f9SAlex Elder ret = -EINVAL; 541735d489f9SAlex Elder goto out; 541835d489f9SAlex Elder } 541935d489f9SAlex Elder if (!ceph_has_room(&p, end, snap_count * sizeof (__le64))) 542035d489f9SAlex Elder goto out; 5421468521c1SAlex Elder ret = 0; 542235d489f9SAlex Elder 5423812164f8SAlex Elder snapc = ceph_create_snap_context(snap_count, GFP_KERNEL); 542435d489f9SAlex Elder if (!snapc) { 542535d489f9SAlex Elder ret = -ENOMEM; 542635d489f9SAlex Elder goto out; 542735d489f9SAlex Elder } 542835d489f9SAlex Elder snapc->seq = seq; 542935d489f9SAlex Elder for (i = 0; i < snap_count; i++) 543035d489f9SAlex Elder snapc->snaps[i] = ceph_decode_64(&p); 543135d489f9SAlex Elder 543249ece554SAlex Elder ceph_put_snap_context(rbd_dev->header.snapc); 543335d489f9SAlex Elder rbd_dev->header.snapc = snapc; 543435d489f9SAlex Elder 543535d489f9SAlex Elder dout(" snap context seq = %llu, snap_count = %u\n", 543635d489f9SAlex Elder (unsigned long long)seq, (unsigned int)snap_count); 543735d489f9SAlex Elder out: 543835d489f9SAlex Elder kfree(reply_buf); 543935d489f9SAlex Elder 544057385b51SAlex Elder return ret; 544135d489f9SAlex Elder } 544235d489f9SAlex Elder 544354cac61fSAlex Elder static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, 544454cac61fSAlex Elder u64 snap_id) 5445b8b1e2dbSAlex Elder { 5446b8b1e2dbSAlex Elder size_t size; 5447b8b1e2dbSAlex Elder void *reply_buf; 544854cac61fSAlex Elder __le64 snapid; 5449b8b1e2dbSAlex Elder int ret; 5450b8b1e2dbSAlex Elder void *p; 5451b8b1e2dbSAlex Elder void *end; 5452b8b1e2dbSAlex Elder char *snap_name; 5453b8b1e2dbSAlex Elder 5454b8b1e2dbSAlex Elder size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN; 5455b8b1e2dbSAlex Elder reply_buf = kmalloc(size, GFP_KERNEL); 5456b8b1e2dbSAlex Elder if (!reply_buf) 5457b8b1e2dbSAlex Elder return ERR_PTR(-ENOMEM); 5458b8b1e2dbSAlex Elder 545954cac61fSAlex Elder snapid = cpu_to_le64(snap_id); 5460ecd4a68aSIlya Dryomov ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid, 5461ecd4a68aSIlya Dryomov &rbd_dev->header_oloc, "get_snapshot_name", 5462ecd4a68aSIlya Dryomov &snapid, sizeof(snapid), reply_buf, size); 546336be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 5464f40eb349SAlex Elder if (ret < 0) { 5465f40eb349SAlex Elder snap_name = ERR_PTR(ret); 5466b8b1e2dbSAlex Elder goto out; 5467f40eb349SAlex Elder } 5468b8b1e2dbSAlex Elder 5469b8b1e2dbSAlex Elder p = reply_buf; 5470f40eb349SAlex Elder end = reply_buf + ret; 5471e5c35534SAlex Elder snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL); 5472f40eb349SAlex Elder if (IS_ERR(snap_name)) 5473b8b1e2dbSAlex Elder goto out; 5474f40eb349SAlex Elder 5475b8b1e2dbSAlex Elder dout(" snap_id 0x%016llx snap_name = %s\n", 547654cac61fSAlex Elder (unsigned long long)snap_id, snap_name); 5477b8b1e2dbSAlex Elder out: 5478b8b1e2dbSAlex Elder kfree(reply_buf); 5479b8b1e2dbSAlex Elder 5480f40eb349SAlex Elder return snap_name; 5481b8b1e2dbSAlex Elder } 5482b8b1e2dbSAlex Elder 54832df3fac7SAlex Elder static int rbd_dev_v2_header_info(struct rbd_device *rbd_dev) 5484117973fbSAlex Elder { 54852df3fac7SAlex Elder bool first_time = rbd_dev->header.object_prefix == NULL; 5486117973fbSAlex Elder int ret; 5487117973fbSAlex Elder 54881617e40cSJosh Durgin ret = rbd_dev_v2_image_size(rbd_dev); 54891617e40cSJosh Durgin if (ret) 5490cfbf6377SAlex Elder return ret; 54911617e40cSJosh Durgin 54922df3fac7SAlex Elder if (first_time) { 54932df3fac7SAlex Elder ret = rbd_dev_v2_header_onetime(rbd_dev); 54942df3fac7SAlex Elder if (ret) 5495cfbf6377SAlex Elder return ret; 54962df3fac7SAlex Elder } 54972df3fac7SAlex Elder 5498cc4a38bdSAlex Elder ret = rbd_dev_v2_snap_context(rbd_dev); 5499d194cd1dSIlya Dryomov if (ret && first_time) { 5500d194cd1dSIlya Dryomov kfree(rbd_dev->header.object_prefix); 5501d194cd1dSIlya Dryomov rbd_dev->header.object_prefix = NULL; 5502d194cd1dSIlya Dryomov } 5503117973fbSAlex Elder 5504117973fbSAlex Elder return ret; 5505117973fbSAlex Elder } 5506117973fbSAlex Elder 5507a720ae09SIlya Dryomov static int rbd_dev_header_info(struct rbd_device *rbd_dev) 5508a720ae09SIlya Dryomov { 5509a720ae09SIlya Dryomov rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); 5510a720ae09SIlya Dryomov 5511a720ae09SIlya Dryomov if (rbd_dev->image_format == 1) 5512a720ae09SIlya Dryomov return rbd_dev_v1_header_info(rbd_dev); 5513a720ae09SIlya Dryomov 5514a720ae09SIlya Dryomov return rbd_dev_v2_header_info(rbd_dev); 5515a720ae09SIlya Dryomov } 5516a720ae09SIlya Dryomov 55171ddbe94eSAlex Elder /* 5518e28fff26SAlex Elder * Skips over white space at *buf, and updates *buf to point to the 5519e28fff26SAlex Elder * first found non-space character (if any). Returns the length of 5520593a9e7bSAlex Elder * the token (string of non-white space characters) found. Note 5521593a9e7bSAlex Elder * that *buf must be terminated with '\0'. 5522e28fff26SAlex Elder */ 5523e28fff26SAlex Elder static inline size_t next_token(const char **buf) 5524e28fff26SAlex Elder { 5525e28fff26SAlex Elder /* 5526e28fff26SAlex Elder * These are the characters that produce nonzero for 5527e28fff26SAlex Elder * isspace() in the "C" and "POSIX" locales. 5528e28fff26SAlex Elder */ 5529e28fff26SAlex Elder const char *spaces = " \f\n\r\t\v"; 5530e28fff26SAlex Elder 5531e28fff26SAlex Elder *buf += strspn(*buf, spaces); /* Find start of token */ 5532e28fff26SAlex Elder 5533e28fff26SAlex Elder return strcspn(*buf, spaces); /* Return token length */ 5534e28fff26SAlex Elder } 5535e28fff26SAlex Elder 5536e28fff26SAlex Elder /* 5537ea3352f4SAlex Elder * Finds the next token in *buf, dynamically allocates a buffer big 5538ea3352f4SAlex Elder * enough to hold a copy of it, and copies the token into the new 5539ea3352f4SAlex Elder * buffer. The copy is guaranteed to be terminated with '\0'. Note 5540ea3352f4SAlex Elder * that a duplicate buffer is created even for a zero-length token. 5541ea3352f4SAlex Elder * 5542ea3352f4SAlex Elder * Returns a pointer to the newly-allocated duplicate, or a null 5543ea3352f4SAlex Elder * pointer if memory for the duplicate was not available. If 5544ea3352f4SAlex Elder * the lenp argument is a non-null pointer, the length of the token 5545ea3352f4SAlex Elder * (not including the '\0') is returned in *lenp. 5546ea3352f4SAlex Elder * 5547ea3352f4SAlex Elder * If successful, the *buf pointer will be updated to point beyond 5548ea3352f4SAlex Elder * the end of the found token. 5549ea3352f4SAlex Elder * 5550ea3352f4SAlex Elder * Note: uses GFP_KERNEL for allocation. 5551ea3352f4SAlex Elder */ 5552ea3352f4SAlex Elder static inline char *dup_token(const char **buf, size_t *lenp) 5553ea3352f4SAlex Elder { 5554ea3352f4SAlex Elder char *dup; 5555ea3352f4SAlex Elder size_t len; 5556ea3352f4SAlex Elder 5557ea3352f4SAlex Elder len = next_token(buf); 55584caf35f9SAlex Elder dup = kmemdup(*buf, len + 1, GFP_KERNEL); 5559ea3352f4SAlex Elder if (!dup) 5560ea3352f4SAlex Elder return NULL; 5561ea3352f4SAlex Elder *(dup + len) = '\0'; 5562ea3352f4SAlex Elder *buf += len; 5563ea3352f4SAlex Elder 5564ea3352f4SAlex Elder if (lenp) 5565ea3352f4SAlex Elder *lenp = len; 5566ea3352f4SAlex Elder 5567ea3352f4SAlex Elder return dup; 5568ea3352f4SAlex Elder } 5569ea3352f4SAlex Elder 5570ea3352f4SAlex Elder /* 5571859c31dfSAlex Elder * Parse the options provided for an "rbd add" (i.e., rbd image 5572859c31dfSAlex Elder * mapping) request. These arrive via a write to /sys/bus/rbd/add, 5573859c31dfSAlex Elder * and the data written is passed here via a NUL-terminated buffer. 5574859c31dfSAlex Elder * Returns 0 if successful or an error code otherwise. 5575d22f76e7SAlex Elder * 5576859c31dfSAlex Elder * The information extracted from these options is recorded in 5577859c31dfSAlex Elder * the other parameters which return dynamically-allocated 5578859c31dfSAlex Elder * structures: 5579859c31dfSAlex Elder * ceph_opts 5580859c31dfSAlex Elder * The address of a pointer that will refer to a ceph options 5581859c31dfSAlex Elder * structure. Caller must release the returned pointer using 5582859c31dfSAlex Elder * ceph_destroy_options() when it is no longer needed. 5583859c31dfSAlex Elder * rbd_opts 5584859c31dfSAlex Elder * Address of an rbd options pointer. Fully initialized by 5585859c31dfSAlex Elder * this function; caller must release with kfree(). 5586859c31dfSAlex Elder * spec 5587859c31dfSAlex Elder * Address of an rbd image specification pointer. Fully 5588859c31dfSAlex Elder * initialized by this function based on parsed options. 5589859c31dfSAlex Elder * Caller must release with rbd_spec_put(). 5590859c31dfSAlex Elder * 5591859c31dfSAlex Elder * The options passed take this form: 5592859c31dfSAlex Elder * <mon_addrs> <options> <pool_name> <image_name> [<snap_id>] 5593859c31dfSAlex Elder * where: 5594859c31dfSAlex Elder * <mon_addrs> 5595859c31dfSAlex Elder * A comma-separated list of one or more monitor addresses. 5596859c31dfSAlex Elder * A monitor address is an ip address, optionally followed 5597859c31dfSAlex Elder * by a port number (separated by a colon). 5598859c31dfSAlex Elder * I.e.: ip1[:port1][,ip2[:port2]...] 5599859c31dfSAlex Elder * <options> 5600859c31dfSAlex Elder * A comma-separated list of ceph and/or rbd options. 5601859c31dfSAlex Elder * <pool_name> 5602859c31dfSAlex Elder * The name of the rados pool containing the rbd image. 5603859c31dfSAlex Elder * <image_name> 5604859c31dfSAlex Elder * The name of the image in that pool to map. 5605859c31dfSAlex Elder * <snap_id> 5606859c31dfSAlex Elder * An optional snapshot id. If provided, the mapping will 5607859c31dfSAlex Elder * present data from the image at the time that snapshot was 5608859c31dfSAlex Elder * created. The image head is used if no snapshot id is 5609859c31dfSAlex Elder * provided. Snapshot mappings are always read-only. 5610a725f65eSAlex Elder */ 5611859c31dfSAlex Elder static int rbd_add_parse_args(const char *buf, 5612dc79b113SAlex Elder struct ceph_options **ceph_opts, 5613859c31dfSAlex Elder struct rbd_options **opts, 5614859c31dfSAlex Elder struct rbd_spec **rbd_spec) 5615a725f65eSAlex Elder { 5616e28fff26SAlex Elder size_t len; 5617859c31dfSAlex Elder char *options; 56180ddebc0cSAlex Elder const char *mon_addrs; 5619ecb4dc22SAlex Elder char *snap_name; 56200ddebc0cSAlex Elder size_t mon_addrs_size; 5621859c31dfSAlex Elder struct rbd_spec *spec = NULL; 56224e9afebaSAlex Elder struct rbd_options *rbd_opts = NULL; 5623859c31dfSAlex Elder struct ceph_options *copts; 5624dc79b113SAlex Elder int ret; 5625e28fff26SAlex Elder 5626e28fff26SAlex Elder /* The first four tokens are required */ 5627e28fff26SAlex Elder 56287ef3214aSAlex Elder len = next_token(&buf); 56294fb5d671SAlex Elder if (!len) { 56304fb5d671SAlex Elder rbd_warn(NULL, "no monitor address(es) provided"); 56314fb5d671SAlex Elder return -EINVAL; 56324fb5d671SAlex Elder } 56330ddebc0cSAlex Elder mon_addrs = buf; 5634f28e565aSAlex Elder mon_addrs_size = len + 1; 56357ef3214aSAlex Elder buf += len; 5636a725f65eSAlex Elder 5637dc79b113SAlex Elder ret = -EINVAL; 5638f28e565aSAlex Elder options = dup_token(&buf, NULL); 5639f28e565aSAlex Elder if (!options) 5640dc79b113SAlex Elder return -ENOMEM; 56414fb5d671SAlex Elder if (!*options) { 56424fb5d671SAlex Elder rbd_warn(NULL, "no options provided"); 56434fb5d671SAlex Elder goto out_err; 56444fb5d671SAlex Elder } 5645a725f65eSAlex Elder 5646859c31dfSAlex Elder spec = rbd_spec_alloc(); 5647859c31dfSAlex Elder if (!spec) 5648f28e565aSAlex Elder goto out_mem; 5649859c31dfSAlex Elder 5650859c31dfSAlex Elder spec->pool_name = dup_token(&buf, NULL); 5651859c31dfSAlex Elder if (!spec->pool_name) 5652859c31dfSAlex Elder goto out_mem; 56534fb5d671SAlex Elder if (!*spec->pool_name) { 56544fb5d671SAlex Elder rbd_warn(NULL, "no pool name provided"); 56554fb5d671SAlex Elder goto out_err; 56564fb5d671SAlex Elder } 5657e28fff26SAlex Elder 565869e7a02fSAlex Elder spec->image_name = dup_token(&buf, NULL); 5659859c31dfSAlex Elder if (!spec->image_name) 5660f28e565aSAlex Elder goto out_mem; 56614fb5d671SAlex Elder if (!*spec->image_name) { 56624fb5d671SAlex Elder rbd_warn(NULL, "no image name provided"); 56634fb5d671SAlex Elder goto out_err; 56644fb5d671SAlex Elder } 5665e28fff26SAlex Elder 5666f28e565aSAlex Elder /* 5667f28e565aSAlex Elder * Snapshot name is optional; default is to use "-" 5668f28e565aSAlex Elder * (indicating the head/no snapshot). 5669f28e565aSAlex Elder */ 56703feeb894SAlex Elder len = next_token(&buf); 5671820a5f3eSAlex Elder if (!len) { 56723feeb894SAlex Elder buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */ 56733feeb894SAlex Elder len = sizeof (RBD_SNAP_HEAD_NAME) - 1; 5674f28e565aSAlex Elder } else if (len > RBD_MAX_SNAP_NAME_LEN) { 5675dc79b113SAlex Elder ret = -ENAMETOOLONG; 5676f28e565aSAlex Elder goto out_err; 5677849b4260SAlex Elder } 5678ecb4dc22SAlex Elder snap_name = kmemdup(buf, len + 1, GFP_KERNEL); 5679ecb4dc22SAlex Elder if (!snap_name) 5680f28e565aSAlex Elder goto out_mem; 5681ecb4dc22SAlex Elder *(snap_name + len) = '\0'; 5682ecb4dc22SAlex Elder spec->snap_name = snap_name; 5683e5c35534SAlex Elder 56840ddebc0cSAlex Elder /* Initialize all rbd options to the defaults */ 5685e28fff26SAlex Elder 56864e9afebaSAlex Elder rbd_opts = kzalloc(sizeof (*rbd_opts), GFP_KERNEL); 56874e9afebaSAlex Elder if (!rbd_opts) 56884e9afebaSAlex Elder goto out_mem; 56894e9afebaSAlex Elder 56904e9afebaSAlex Elder rbd_opts->read_only = RBD_READ_ONLY_DEFAULT; 5691b5584180SIlya Dryomov rbd_opts->queue_depth = RBD_QUEUE_DEPTH_DEFAULT; 569280de1912SIlya Dryomov rbd_opts->lock_on_read = RBD_LOCK_ON_READ_DEFAULT; 5693d22f76e7SAlex Elder 5694859c31dfSAlex Elder copts = ceph_parse_options(options, mon_addrs, 56950ddebc0cSAlex Elder mon_addrs + mon_addrs_size - 1, 56964e9afebaSAlex Elder parse_rbd_opts_token, rbd_opts); 5697859c31dfSAlex Elder if (IS_ERR(copts)) { 5698859c31dfSAlex Elder ret = PTR_ERR(copts); 5699dc79b113SAlex Elder goto out_err; 5700dc79b113SAlex Elder } 5701859c31dfSAlex Elder kfree(options); 5702859c31dfSAlex Elder 5703859c31dfSAlex Elder *ceph_opts = copts; 57044e9afebaSAlex Elder *opts = rbd_opts; 5705859c31dfSAlex Elder *rbd_spec = spec; 57060ddebc0cSAlex Elder 5707dc79b113SAlex Elder return 0; 5708f28e565aSAlex Elder out_mem: 5709dc79b113SAlex Elder ret = -ENOMEM; 5710d22f76e7SAlex Elder out_err: 5711859c31dfSAlex Elder kfree(rbd_opts); 5712859c31dfSAlex Elder rbd_spec_put(spec); 5713f28e565aSAlex Elder kfree(options); 5714d22f76e7SAlex Elder 5715dc79b113SAlex Elder return ret; 5716a725f65eSAlex Elder } 5717a725f65eSAlex Elder 5718589d30e0SAlex Elder /* 571930ba1f02SIlya Dryomov * Return pool id (>= 0) or a negative error code. 572030ba1f02SIlya Dryomov */ 572130ba1f02SIlya Dryomov static int rbd_add_get_pool_id(struct rbd_client *rbdc, const char *pool_name) 572230ba1f02SIlya Dryomov { 5723a319bf56SIlya Dryomov struct ceph_options *opts = rbdc->client->options; 572430ba1f02SIlya Dryomov u64 newest_epoch; 572530ba1f02SIlya Dryomov int tries = 0; 572630ba1f02SIlya Dryomov int ret; 572730ba1f02SIlya Dryomov 572830ba1f02SIlya Dryomov again: 572930ba1f02SIlya Dryomov ret = ceph_pg_poolid_by_name(rbdc->client->osdc.osdmap, pool_name); 573030ba1f02SIlya Dryomov if (ret == -ENOENT && tries++ < 1) { 5731d0b19705SIlya Dryomov ret = ceph_monc_get_version(&rbdc->client->monc, "osdmap", 573230ba1f02SIlya Dryomov &newest_epoch); 573330ba1f02SIlya Dryomov if (ret < 0) 573430ba1f02SIlya Dryomov return ret; 573530ba1f02SIlya Dryomov 573630ba1f02SIlya Dryomov if (rbdc->client->osdc.osdmap->epoch < newest_epoch) { 57377cca78c9SIlya Dryomov ceph_osdc_maybe_request_map(&rbdc->client->osdc); 573830ba1f02SIlya Dryomov (void) ceph_monc_wait_osdmap(&rbdc->client->monc, 5739a319bf56SIlya Dryomov newest_epoch, 5740a319bf56SIlya Dryomov opts->mount_timeout); 574130ba1f02SIlya Dryomov goto again; 574230ba1f02SIlya Dryomov } else { 574330ba1f02SIlya Dryomov /* the osdmap we have is new enough */ 574430ba1f02SIlya Dryomov return -ENOENT; 574530ba1f02SIlya Dryomov } 574630ba1f02SIlya Dryomov } 574730ba1f02SIlya Dryomov 574830ba1f02SIlya Dryomov return ret; 574930ba1f02SIlya Dryomov } 575030ba1f02SIlya Dryomov 575130ba1f02SIlya Dryomov /* 5752589d30e0SAlex Elder * An rbd format 2 image has a unique identifier, distinct from the 5753589d30e0SAlex Elder * name given to it by the user. Internally, that identifier is 5754589d30e0SAlex Elder * what's used to specify the names of objects related to the image. 5755589d30e0SAlex Elder * 5756589d30e0SAlex Elder * A special "rbd id" object is used to map an rbd image name to its 5757589d30e0SAlex Elder * id. If that object doesn't exist, then there is no v2 rbd image 5758589d30e0SAlex Elder * with the supplied name. 5759589d30e0SAlex Elder * 5760589d30e0SAlex Elder * This function will record the given rbd_dev's image_id field if 5761589d30e0SAlex Elder * it can be determined, and in that case will return 0. If any 5762589d30e0SAlex Elder * errors occur a negative errno will be returned and the rbd_dev's 5763589d30e0SAlex Elder * image_id field will be unchanged (and should be NULL). 5764589d30e0SAlex Elder */ 5765589d30e0SAlex Elder static int rbd_dev_image_id(struct rbd_device *rbd_dev) 5766589d30e0SAlex Elder { 5767589d30e0SAlex Elder int ret; 5768589d30e0SAlex Elder size_t size; 5769ecd4a68aSIlya Dryomov CEPH_DEFINE_OID_ONSTACK(oid); 5770589d30e0SAlex Elder void *response; 5771c0fba368SAlex Elder char *image_id; 57722f82ee54SAlex Elder 5773589d30e0SAlex Elder /* 57742c0d0a10SAlex Elder * When probing a parent image, the image id is already 57752c0d0a10SAlex Elder * known (and the image name likely is not). There's no 5776c0fba368SAlex Elder * need to fetch the image id again in this case. We 5777c0fba368SAlex Elder * do still need to set the image format though. 57782c0d0a10SAlex Elder */ 5779c0fba368SAlex Elder if (rbd_dev->spec->image_id) { 5780c0fba368SAlex Elder rbd_dev->image_format = *rbd_dev->spec->image_id ? 2 : 1; 5781c0fba368SAlex Elder 57822c0d0a10SAlex Elder return 0; 5783c0fba368SAlex Elder } 57842c0d0a10SAlex Elder 57852c0d0a10SAlex Elder /* 5786589d30e0SAlex Elder * First, see if the format 2 image id file exists, and if 5787589d30e0SAlex Elder * so, get the image's persistent id from it. 5788589d30e0SAlex Elder */ 5789ecd4a68aSIlya Dryomov ret = ceph_oid_aprintf(&oid, GFP_KERNEL, "%s%s", RBD_ID_PREFIX, 5790ecd4a68aSIlya Dryomov rbd_dev->spec->image_name); 5791ecd4a68aSIlya Dryomov if (ret) 5792ecd4a68aSIlya Dryomov return ret; 5793ecd4a68aSIlya Dryomov 5794ecd4a68aSIlya Dryomov dout("rbd id object name is %s\n", oid.name); 5795589d30e0SAlex Elder 5796589d30e0SAlex Elder /* Response will be an encoded string, which includes a length */ 5797589d30e0SAlex Elder 5798589d30e0SAlex Elder size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX; 5799589d30e0SAlex Elder response = kzalloc(size, GFP_NOIO); 5800589d30e0SAlex Elder if (!response) { 5801589d30e0SAlex Elder ret = -ENOMEM; 5802589d30e0SAlex Elder goto out; 5803589d30e0SAlex Elder } 5804589d30e0SAlex Elder 5805c0fba368SAlex Elder /* If it doesn't exist we'll assume it's a format 1 image */ 5806c0fba368SAlex Elder 5807ecd4a68aSIlya Dryomov ret = rbd_obj_method_sync(rbd_dev, &oid, &rbd_dev->header_oloc, 5808ecd4a68aSIlya Dryomov "get_id", NULL, 0, 5809e2a58ee5SAlex Elder response, RBD_IMAGE_ID_LEN_MAX); 581036be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 5811c0fba368SAlex Elder if (ret == -ENOENT) { 5812c0fba368SAlex Elder image_id = kstrdup("", GFP_KERNEL); 5813c0fba368SAlex Elder ret = image_id ? 0 : -ENOMEM; 5814c0fba368SAlex Elder if (!ret) 5815c0fba368SAlex Elder rbd_dev->image_format = 1; 58167dd440c9SIlya Dryomov } else if (ret >= 0) { 5817c0fba368SAlex Elder void *p = response; 5818589d30e0SAlex Elder 5819c0fba368SAlex Elder image_id = ceph_extract_encoded_string(&p, p + ret, 5820979ed480SAlex Elder NULL, GFP_NOIO); 5821461f758aSDuan Jiong ret = PTR_ERR_OR_ZERO(image_id); 5822c0fba368SAlex Elder if (!ret) 5823c0fba368SAlex Elder rbd_dev->image_format = 2; 5824c0fba368SAlex Elder } 5825c0fba368SAlex Elder 5826c0fba368SAlex Elder if (!ret) { 5827c0fba368SAlex Elder rbd_dev->spec->image_id = image_id; 5828c0fba368SAlex Elder dout("image_id is %s\n", image_id); 5829589d30e0SAlex Elder } 5830589d30e0SAlex Elder out: 5831589d30e0SAlex Elder kfree(response); 5832ecd4a68aSIlya Dryomov ceph_oid_destroy(&oid); 5833589d30e0SAlex Elder return ret; 5834589d30e0SAlex Elder } 5835589d30e0SAlex Elder 58363abef3b3SAlex Elder /* 58373abef3b3SAlex Elder * Undo whatever state changes are made by v1 or v2 header info 58383abef3b3SAlex Elder * call. 58393abef3b3SAlex Elder */ 58406fd48b3bSAlex Elder static void rbd_dev_unprobe(struct rbd_device *rbd_dev) 58416fd48b3bSAlex Elder { 58426fd48b3bSAlex Elder struct rbd_image_header *header; 58436fd48b3bSAlex Elder 5844a2acd00eSAlex Elder rbd_dev_parent_put(rbd_dev); 58456fd48b3bSAlex Elder 58466fd48b3bSAlex Elder /* Free dynamic fields from the header, then zero it out */ 58476fd48b3bSAlex Elder 58486fd48b3bSAlex Elder header = &rbd_dev->header; 5849812164f8SAlex Elder ceph_put_snap_context(header->snapc); 58506fd48b3bSAlex Elder kfree(header->snap_sizes); 58516fd48b3bSAlex Elder kfree(header->snap_names); 58526fd48b3bSAlex Elder kfree(header->object_prefix); 58536fd48b3bSAlex Elder memset(header, 0, sizeof (*header)); 58546fd48b3bSAlex Elder } 58556fd48b3bSAlex Elder 58562df3fac7SAlex Elder static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev) 5857a30b71b9SAlex Elder { 5858a30b71b9SAlex Elder int ret; 5859a30b71b9SAlex Elder 58601e130199SAlex Elder ret = rbd_dev_v2_object_prefix(rbd_dev); 586157385b51SAlex Elder if (ret) 58621e130199SAlex Elder goto out_err; 5863b1b5402aSAlex Elder 58642df3fac7SAlex Elder /* 58652df3fac7SAlex Elder * Get the and check features for the image. Currently the 58662df3fac7SAlex Elder * features are assumed to never change. 58672df3fac7SAlex Elder */ 5868b1b5402aSAlex Elder ret = rbd_dev_v2_features(rbd_dev); 586957385b51SAlex Elder if (ret) 5870b1b5402aSAlex Elder goto out_err; 587135d489f9SAlex Elder 5872cc070d59SAlex Elder /* If the image supports fancy striping, get its parameters */ 5873cc070d59SAlex Elder 5874cc070d59SAlex Elder if (rbd_dev->header.features & RBD_FEATURE_STRIPINGV2) { 5875cc070d59SAlex Elder ret = rbd_dev_v2_striping_info(rbd_dev); 5876cc070d59SAlex Elder if (ret < 0) 5877cc070d59SAlex Elder goto out_err; 5878cc070d59SAlex Elder } 5879a30b71b9SAlex Elder 58807e97332eSIlya Dryomov if (rbd_dev->header.features & RBD_FEATURE_DATA_POOL) { 58817e97332eSIlya Dryomov ret = rbd_dev_v2_data_pool(rbd_dev); 58827e97332eSIlya Dryomov if (ret) 58837e97332eSIlya Dryomov goto out_err; 58847e97332eSIlya Dryomov } 58857e97332eSIlya Dryomov 5886263423f8SIlya Dryomov rbd_init_layout(rbd_dev); 588735152979SAlex Elder return 0; 5888263423f8SIlya Dryomov 58899d475de5SAlex Elder out_err: 5890642a2537SAlex Elder rbd_dev->header.features = 0; 58911e130199SAlex Elder kfree(rbd_dev->header.object_prefix); 58921e130199SAlex Elder rbd_dev->header.object_prefix = NULL; 58939d475de5SAlex Elder return ret; 5894a30b71b9SAlex Elder } 5895a30b71b9SAlex Elder 58966d69bb53SIlya Dryomov /* 58976d69bb53SIlya Dryomov * @depth is rbd_dev_image_probe() -> rbd_dev_probe_parent() -> 58986d69bb53SIlya Dryomov * rbd_dev_image_probe() recursion depth, which means it's also the 58996d69bb53SIlya Dryomov * length of the already discovered part of the parent chain. 59006d69bb53SIlya Dryomov */ 59016d69bb53SIlya Dryomov static int rbd_dev_probe_parent(struct rbd_device *rbd_dev, int depth) 590283a06263SAlex Elder { 59032f82ee54SAlex Elder struct rbd_device *parent = NULL; 5904124afba2SAlex Elder int ret; 5905124afba2SAlex Elder 5906124afba2SAlex Elder if (!rbd_dev->parent_spec) 5907124afba2SAlex Elder return 0; 5908124afba2SAlex Elder 59096d69bb53SIlya Dryomov if (++depth > RBD_MAX_PARENT_CHAIN_LEN) { 59106d69bb53SIlya Dryomov pr_info("parent chain is too long (%d)\n", depth); 59116d69bb53SIlya Dryomov ret = -EINVAL; 59126d69bb53SIlya Dryomov goto out_err; 59136d69bb53SIlya Dryomov } 59146d69bb53SIlya Dryomov 59151643dfa4SIlya Dryomov parent = __rbd_dev_create(rbd_dev->rbd_client, rbd_dev->parent_spec); 59161f2c6651SIlya Dryomov if (!parent) { 5917124afba2SAlex Elder ret = -ENOMEM; 5918124afba2SAlex Elder goto out_err; 59191f2c6651SIlya Dryomov } 59201f2c6651SIlya Dryomov 59211f2c6651SIlya Dryomov /* 59221f2c6651SIlya Dryomov * Images related by parent/child relationships always share 59231f2c6651SIlya Dryomov * rbd_client and spec/parent_spec, so bump their refcounts. 59241f2c6651SIlya Dryomov */ 59251f2c6651SIlya Dryomov __rbd_get_client(rbd_dev->rbd_client); 59261f2c6651SIlya Dryomov rbd_spec_get(rbd_dev->parent_spec); 5927124afba2SAlex Elder 59286d69bb53SIlya Dryomov ret = rbd_dev_image_probe(parent, depth); 5929124afba2SAlex Elder if (ret < 0) 5930124afba2SAlex Elder goto out_err; 59311f2c6651SIlya Dryomov 5932124afba2SAlex Elder rbd_dev->parent = parent; 5933a2acd00eSAlex Elder atomic_set(&rbd_dev->parent_ref, 1); 5934124afba2SAlex Elder return 0; 5935124afba2SAlex Elder 59361f2c6651SIlya Dryomov out_err: 59371f2c6651SIlya Dryomov rbd_dev_unparent(rbd_dev); 59381f2c6651SIlya Dryomov rbd_dev_destroy(parent); 5939124afba2SAlex Elder return ret; 5940124afba2SAlex Elder } 5941124afba2SAlex Elder 5942811c6688SIlya Dryomov /* 5943811c6688SIlya Dryomov * rbd_dev->header_rwsem must be locked for write and will be unlocked 5944811c6688SIlya Dryomov * upon return. 5945811c6688SIlya Dryomov */ 5946200a6a8bSAlex Elder static int rbd_dev_device_setup(struct rbd_device *rbd_dev) 5947124afba2SAlex Elder { 594883a06263SAlex Elder int ret; 594983a06263SAlex Elder 59509b60e70bSIlya Dryomov /* Record our major and minor device numbers. */ 595183a06263SAlex Elder 59529b60e70bSIlya Dryomov if (!single_major) { 595383a06263SAlex Elder ret = register_blkdev(0, rbd_dev->name); 595483a06263SAlex Elder if (ret < 0) 59551643dfa4SIlya Dryomov goto err_out_unlock; 59569b60e70bSIlya Dryomov 595783a06263SAlex Elder rbd_dev->major = ret; 5958dd82fff1SIlya Dryomov rbd_dev->minor = 0; 59599b60e70bSIlya Dryomov } else { 59609b60e70bSIlya Dryomov rbd_dev->major = rbd_major; 59619b60e70bSIlya Dryomov rbd_dev->minor = rbd_dev_id_to_minor(rbd_dev->dev_id); 59629b60e70bSIlya Dryomov } 596383a06263SAlex Elder 596483a06263SAlex Elder /* Set up the blkdev mapping. */ 596583a06263SAlex Elder 596683a06263SAlex Elder ret = rbd_init_disk(rbd_dev); 596783a06263SAlex Elder if (ret) 596883a06263SAlex Elder goto err_out_blkdev; 596983a06263SAlex Elder 5970f35a4deeSAlex Elder ret = rbd_dev_mapping_set(rbd_dev); 597183a06263SAlex Elder if (ret) 597283a06263SAlex Elder goto err_out_disk; 5973bc1ecc65SIlya Dryomov 5974f35a4deeSAlex Elder set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE); 597522001f61SJosh Durgin set_disk_ro(rbd_dev->disk, rbd_dev->mapping.read_only); 5976f35a4deeSAlex Elder 5977dd5ac32dSIlya Dryomov dev_set_name(&rbd_dev->dev, "%d", rbd_dev->dev_id); 5978dd5ac32dSIlya Dryomov ret = device_add(&rbd_dev->dev); 5979f35a4deeSAlex Elder if (ret) 5980f5ee37bdSIlya Dryomov goto err_out_mapping; 598183a06263SAlex Elder 598283a06263SAlex Elder /* Everything's ready. Announce the disk to the world. */ 598383a06263SAlex Elder 5984129b79d4SAlex Elder set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags); 5985811c6688SIlya Dryomov up_write(&rbd_dev->header_rwsem); 598683a06263SAlex Elder 59871643dfa4SIlya Dryomov spin_lock(&rbd_dev_list_lock); 59881643dfa4SIlya Dryomov list_add_tail(&rbd_dev->node, &rbd_dev_list); 59891643dfa4SIlya Dryomov spin_unlock(&rbd_dev_list_lock); 59901643dfa4SIlya Dryomov 5991811c6688SIlya Dryomov add_disk(rbd_dev->disk); 5992ca7909e8SIlya Dryomov pr_info("%s: capacity %llu features 0x%llx\n", rbd_dev->disk->disk_name, 5993ca7909e8SIlya Dryomov (unsigned long long)get_capacity(rbd_dev->disk) << SECTOR_SHIFT, 5994ca7909e8SIlya Dryomov rbd_dev->header.features); 599583a06263SAlex Elder 599683a06263SAlex Elder return ret; 59972f82ee54SAlex Elder 5998f35a4deeSAlex Elder err_out_mapping: 5999f35a4deeSAlex Elder rbd_dev_mapping_clear(rbd_dev); 600083a06263SAlex Elder err_out_disk: 600183a06263SAlex Elder rbd_free_disk(rbd_dev); 600283a06263SAlex Elder err_out_blkdev: 60039b60e70bSIlya Dryomov if (!single_major) 600483a06263SAlex Elder unregister_blkdev(rbd_dev->major, rbd_dev->name); 6005811c6688SIlya Dryomov err_out_unlock: 6006811c6688SIlya Dryomov up_write(&rbd_dev->header_rwsem); 600783a06263SAlex Elder return ret; 600883a06263SAlex Elder } 600983a06263SAlex Elder 6010332bb12dSAlex Elder static int rbd_dev_header_name(struct rbd_device *rbd_dev) 6011332bb12dSAlex Elder { 6012332bb12dSAlex Elder struct rbd_spec *spec = rbd_dev->spec; 6013c41d13a3SIlya Dryomov int ret; 6014332bb12dSAlex Elder 6015332bb12dSAlex Elder /* Record the header object name for this rbd image. */ 6016332bb12dSAlex Elder 6017332bb12dSAlex Elder rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); 6018332bb12dSAlex Elder if (rbd_dev->image_format == 1) 6019c41d13a3SIlya Dryomov ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s", 6020332bb12dSAlex Elder spec->image_name, RBD_SUFFIX); 6021332bb12dSAlex Elder else 6022c41d13a3SIlya Dryomov ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s", 6023332bb12dSAlex Elder RBD_HEADER_PREFIX, spec->image_id); 6024c41d13a3SIlya Dryomov 6025c41d13a3SIlya Dryomov return ret; 6026332bb12dSAlex Elder } 6027332bb12dSAlex Elder 6028200a6a8bSAlex Elder static void rbd_dev_image_release(struct rbd_device *rbd_dev) 6029200a6a8bSAlex Elder { 60306fd48b3bSAlex Elder rbd_dev_unprobe(rbd_dev); 60316fd48b3bSAlex Elder rbd_dev->image_format = 0; 60326fd48b3bSAlex Elder kfree(rbd_dev->spec->image_id); 60336fd48b3bSAlex Elder rbd_dev->spec->image_id = NULL; 60346fd48b3bSAlex Elder 6035200a6a8bSAlex Elder rbd_dev_destroy(rbd_dev); 6036200a6a8bSAlex Elder } 6037200a6a8bSAlex Elder 6038a30b71b9SAlex Elder /* 6039a30b71b9SAlex Elder * Probe for the existence of the header object for the given rbd 60401f3ef788SAlex Elder * device. If this image is the one being mapped (i.e., not a 60411f3ef788SAlex Elder * parent), initiate a watch on its header object before using that 60421f3ef788SAlex Elder * object to get detailed information about the rbd image. 6043a30b71b9SAlex Elder */ 60446d69bb53SIlya Dryomov static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth) 6045a30b71b9SAlex Elder { 6046a30b71b9SAlex Elder int ret; 6047a30b71b9SAlex Elder 6048a30b71b9SAlex Elder /* 60493abef3b3SAlex Elder * Get the id from the image id object. Unless there's an 60503abef3b3SAlex Elder * error, rbd_dev->spec->image_id will be filled in with 60513abef3b3SAlex Elder * a dynamically-allocated string, and rbd_dev->image_format 60523abef3b3SAlex Elder * will be set to either 1 or 2. 6053a30b71b9SAlex Elder */ 6054a30b71b9SAlex Elder ret = rbd_dev_image_id(rbd_dev); 6055a30b71b9SAlex Elder if (ret) 6056c0fba368SAlex Elder return ret; 6057c0fba368SAlex Elder 6058332bb12dSAlex Elder ret = rbd_dev_header_name(rbd_dev); 6059332bb12dSAlex Elder if (ret) 6060332bb12dSAlex Elder goto err_out_format; 6061332bb12dSAlex Elder 60626d69bb53SIlya Dryomov if (!depth) { 606399d16943SIlya Dryomov ret = rbd_register_watch(rbd_dev); 60641fe48023SIlya Dryomov if (ret) { 60651fe48023SIlya Dryomov if (ret == -ENOENT) 60661fe48023SIlya Dryomov pr_info("image %s/%s does not exist\n", 60671fe48023SIlya Dryomov rbd_dev->spec->pool_name, 60681fe48023SIlya Dryomov rbd_dev->spec->image_name); 6069c41d13a3SIlya Dryomov goto err_out_format; 60701f3ef788SAlex Elder } 60711fe48023SIlya Dryomov } 6072b644de2bSAlex Elder 6073a720ae09SIlya Dryomov ret = rbd_dev_header_info(rbd_dev); 60745655c4d9SAlex Elder if (ret) 6075b644de2bSAlex Elder goto err_out_watch; 6076a30b71b9SAlex Elder 607704077599SIlya Dryomov /* 607804077599SIlya Dryomov * If this image is the one being mapped, we have pool name and 607904077599SIlya Dryomov * id, image name and id, and snap name - need to fill snap id. 608004077599SIlya Dryomov * Otherwise this is a parent image, identified by pool, image 608104077599SIlya Dryomov * and snap ids - need to fill in names for those ids. 608204077599SIlya Dryomov */ 60836d69bb53SIlya Dryomov if (!depth) 608404077599SIlya Dryomov ret = rbd_spec_fill_snap_id(rbd_dev); 608504077599SIlya Dryomov else 608604077599SIlya Dryomov ret = rbd_spec_fill_names(rbd_dev); 60871fe48023SIlya Dryomov if (ret) { 60881fe48023SIlya Dryomov if (ret == -ENOENT) 60891fe48023SIlya Dryomov pr_info("snap %s/%s@%s does not exist\n", 60901fe48023SIlya Dryomov rbd_dev->spec->pool_name, 60911fe48023SIlya Dryomov rbd_dev->spec->image_name, 60921fe48023SIlya Dryomov rbd_dev->spec->snap_name); 609333dca39fSAlex Elder goto err_out_probe; 60941fe48023SIlya Dryomov } 60959bb81c9bSAlex Elder 6096e8f59b59SIlya Dryomov if (rbd_dev->header.features & RBD_FEATURE_LAYERING) { 6097e8f59b59SIlya Dryomov ret = rbd_dev_v2_parent_info(rbd_dev); 6098e8f59b59SIlya Dryomov if (ret) 6099e8f59b59SIlya Dryomov goto err_out_probe; 6100e8f59b59SIlya Dryomov 6101e8f59b59SIlya Dryomov /* 6102e8f59b59SIlya Dryomov * Need to warn users if this image is the one being 6103e8f59b59SIlya Dryomov * mapped and has a parent. 6104e8f59b59SIlya Dryomov */ 61056d69bb53SIlya Dryomov if (!depth && rbd_dev->parent_spec) 6106e8f59b59SIlya Dryomov rbd_warn(rbd_dev, 6107e8f59b59SIlya Dryomov "WARNING: kernel layering is EXPERIMENTAL!"); 6108e8f59b59SIlya Dryomov } 6109e8f59b59SIlya Dryomov 61106d69bb53SIlya Dryomov ret = rbd_dev_probe_parent(rbd_dev, depth); 611130d60ba2SAlex Elder if (ret) 611230d60ba2SAlex Elder goto err_out_probe; 611383a06263SAlex Elder 611430d60ba2SAlex Elder dout("discovered format %u image, header name is %s\n", 6115c41d13a3SIlya Dryomov rbd_dev->image_format, rbd_dev->header_oid.name); 611630d60ba2SAlex Elder return 0; 6117e8f59b59SIlya Dryomov 61186fd48b3bSAlex Elder err_out_probe: 61196fd48b3bSAlex Elder rbd_dev_unprobe(rbd_dev); 6120b644de2bSAlex Elder err_out_watch: 61216d69bb53SIlya Dryomov if (!depth) 612299d16943SIlya Dryomov rbd_unregister_watch(rbd_dev); 6123332bb12dSAlex Elder err_out_format: 6124332bb12dSAlex Elder rbd_dev->image_format = 0; 61255655c4d9SAlex Elder kfree(rbd_dev->spec->image_id); 61265655c4d9SAlex Elder rbd_dev->spec->image_id = NULL; 61275655c4d9SAlex Elder return ret; 612883a06263SAlex Elder } 612983a06263SAlex Elder 61309b60e70bSIlya Dryomov static ssize_t do_rbd_add(struct bus_type *bus, 613159c2be1eSYehuda Sadeh const char *buf, 613259c2be1eSYehuda Sadeh size_t count) 6133602adf40SYehuda Sadeh { 6134cb8627c7SAlex Elder struct rbd_device *rbd_dev = NULL; 6135dc79b113SAlex Elder struct ceph_options *ceph_opts = NULL; 61364e9afebaSAlex Elder struct rbd_options *rbd_opts = NULL; 6137859c31dfSAlex Elder struct rbd_spec *spec = NULL; 61389d3997fdSAlex Elder struct rbd_client *rbdc; 613951344a38SAlex Elder bool read_only; 6140b51c83c2SIlya Dryomov int rc; 6141602adf40SYehuda Sadeh 6142602adf40SYehuda Sadeh if (!try_module_get(THIS_MODULE)) 6143602adf40SYehuda Sadeh return -ENODEV; 6144602adf40SYehuda Sadeh 6145a725f65eSAlex Elder /* parse add command */ 6146859c31dfSAlex Elder rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec); 6147dc79b113SAlex Elder if (rc < 0) 6148dd5ac32dSIlya Dryomov goto out; 6149a725f65eSAlex Elder 61509d3997fdSAlex Elder rbdc = rbd_get_client(ceph_opts); 61519d3997fdSAlex Elder if (IS_ERR(rbdc)) { 61529d3997fdSAlex Elder rc = PTR_ERR(rbdc); 61530ddebc0cSAlex Elder goto err_out_args; 61549d3997fdSAlex Elder } 6155602adf40SYehuda Sadeh 6156602adf40SYehuda Sadeh /* pick the pool */ 615730ba1f02SIlya Dryomov rc = rbd_add_get_pool_id(rbdc, spec->pool_name); 61581fe48023SIlya Dryomov if (rc < 0) { 61591fe48023SIlya Dryomov if (rc == -ENOENT) 61601fe48023SIlya Dryomov pr_info("pool %s does not exist\n", spec->pool_name); 6161602adf40SYehuda Sadeh goto err_out_client; 61621fe48023SIlya Dryomov } 6163859c31dfSAlex Elder spec->pool_id = (u64)rc; 6164859c31dfSAlex Elder 6165d147543dSIlya Dryomov rbd_dev = rbd_dev_create(rbdc, spec, rbd_opts); 6166b51c83c2SIlya Dryomov if (!rbd_dev) { 6167b51c83c2SIlya Dryomov rc = -ENOMEM; 6168bd4ba655SAlex Elder goto err_out_client; 6169b51c83c2SIlya Dryomov } 6170c53d5893SAlex Elder rbdc = NULL; /* rbd_dev now owns this */ 6171c53d5893SAlex Elder spec = NULL; /* rbd_dev now owns this */ 6172d147543dSIlya Dryomov rbd_opts = NULL; /* rbd_dev now owns this */ 6173602adf40SYehuda Sadeh 61740d6d1e9cSMike Christie rbd_dev->config_info = kstrdup(buf, GFP_KERNEL); 61750d6d1e9cSMike Christie if (!rbd_dev->config_info) { 61760d6d1e9cSMike Christie rc = -ENOMEM; 61770d6d1e9cSMike Christie goto err_out_rbd_dev; 61780d6d1e9cSMike Christie } 61790d6d1e9cSMike Christie 6180811c6688SIlya Dryomov down_write(&rbd_dev->header_rwsem); 61816d69bb53SIlya Dryomov rc = rbd_dev_image_probe(rbd_dev, 0); 61820d6d1e9cSMike Christie if (rc < 0) { 61830d6d1e9cSMike Christie up_write(&rbd_dev->header_rwsem); 6184c53d5893SAlex Elder goto err_out_rbd_dev; 61850d6d1e9cSMike Christie } 618605fd6f6fSAlex Elder 61877ce4eef7SAlex Elder /* If we are mapping a snapshot it must be marked read-only */ 61887ce4eef7SAlex Elder 6189d147543dSIlya Dryomov read_only = rbd_dev->opts->read_only; 61907ce4eef7SAlex Elder if (rbd_dev->spec->snap_id != CEPH_NOSNAP) 61917ce4eef7SAlex Elder read_only = true; 61927ce4eef7SAlex Elder rbd_dev->mapping.read_only = read_only; 61937ce4eef7SAlex Elder 6194b536f69aSAlex Elder rc = rbd_dev_device_setup(rbd_dev); 61953abef3b3SAlex Elder if (rc) { 6196e37180c0SIlya Dryomov /* 619799d16943SIlya Dryomov * rbd_unregister_watch() can't be moved into 6198e37180c0SIlya Dryomov * rbd_dev_image_release() without refactoring, see 6199e37180c0SIlya Dryomov * commit 1f3ef78861ac. 6200e37180c0SIlya Dryomov */ 620199d16943SIlya Dryomov rbd_unregister_watch(rbd_dev); 62023abef3b3SAlex Elder rbd_dev_image_release(rbd_dev); 6203dd5ac32dSIlya Dryomov goto out; 62043abef3b3SAlex Elder } 62053abef3b3SAlex Elder 6206dd5ac32dSIlya Dryomov rc = count; 6207dd5ac32dSIlya Dryomov out: 6208dd5ac32dSIlya Dryomov module_put(THIS_MODULE); 6209dd5ac32dSIlya Dryomov return rc; 6210b536f69aSAlex Elder 6211c53d5893SAlex Elder err_out_rbd_dev: 6212c53d5893SAlex Elder rbd_dev_destroy(rbd_dev); 6213bd4ba655SAlex Elder err_out_client: 62149d3997fdSAlex Elder rbd_put_client(rbdc); 62150ddebc0cSAlex Elder err_out_args: 6216859c31dfSAlex Elder rbd_spec_put(spec); 6217d147543dSIlya Dryomov kfree(rbd_opts); 6218dd5ac32dSIlya Dryomov goto out; 6219602adf40SYehuda Sadeh } 6220602adf40SYehuda Sadeh 62219b60e70bSIlya Dryomov static ssize_t rbd_add(struct bus_type *bus, 62229b60e70bSIlya Dryomov const char *buf, 62239b60e70bSIlya Dryomov size_t count) 62249b60e70bSIlya Dryomov { 62259b60e70bSIlya Dryomov if (single_major) 62269b60e70bSIlya Dryomov return -EINVAL; 62279b60e70bSIlya Dryomov 62289b60e70bSIlya Dryomov return do_rbd_add(bus, buf, count); 62299b60e70bSIlya Dryomov } 62309b60e70bSIlya Dryomov 62319b60e70bSIlya Dryomov static ssize_t rbd_add_single_major(struct bus_type *bus, 62329b60e70bSIlya Dryomov const char *buf, 62339b60e70bSIlya Dryomov size_t count) 62349b60e70bSIlya Dryomov { 62359b60e70bSIlya Dryomov return do_rbd_add(bus, buf, count); 62369b60e70bSIlya Dryomov } 62379b60e70bSIlya Dryomov 6238dd5ac32dSIlya Dryomov static void rbd_dev_device_release(struct rbd_device *rbd_dev) 6239602adf40SYehuda Sadeh { 6240602adf40SYehuda Sadeh rbd_free_disk(rbd_dev); 62411643dfa4SIlya Dryomov 62421643dfa4SIlya Dryomov spin_lock(&rbd_dev_list_lock); 62431643dfa4SIlya Dryomov list_del_init(&rbd_dev->node); 62441643dfa4SIlya Dryomov spin_unlock(&rbd_dev_list_lock); 62451643dfa4SIlya Dryomov 6246200a6a8bSAlex Elder clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags); 6247dd5ac32dSIlya Dryomov device_del(&rbd_dev->dev); 62486d80b130SAlex Elder rbd_dev_mapping_clear(rbd_dev); 62499b60e70bSIlya Dryomov if (!single_major) 6250602adf40SYehuda Sadeh unregister_blkdev(rbd_dev->major, rbd_dev->name); 6251602adf40SYehuda Sadeh } 6252602adf40SYehuda Sadeh 625305a46afdSAlex Elder static void rbd_dev_remove_parent(struct rbd_device *rbd_dev) 625405a46afdSAlex Elder { 6255ad945fc1SAlex Elder while (rbd_dev->parent) { 625605a46afdSAlex Elder struct rbd_device *first = rbd_dev; 625705a46afdSAlex Elder struct rbd_device *second = first->parent; 625805a46afdSAlex Elder struct rbd_device *third; 625905a46afdSAlex Elder 626005a46afdSAlex Elder /* 626105a46afdSAlex Elder * Follow to the parent with no grandparent and 626205a46afdSAlex Elder * remove it. 626305a46afdSAlex Elder */ 626405a46afdSAlex Elder while (second && (third = second->parent)) { 626505a46afdSAlex Elder first = second; 626605a46afdSAlex Elder second = third; 626705a46afdSAlex Elder } 6268ad945fc1SAlex Elder rbd_assert(second); 62698ad42cd0SAlex Elder rbd_dev_image_release(second); 6270ad945fc1SAlex Elder first->parent = NULL; 6271ad945fc1SAlex Elder first->parent_overlap = 0; 6272ad945fc1SAlex Elder 6273ad945fc1SAlex Elder rbd_assert(first->parent_spec); 627405a46afdSAlex Elder rbd_spec_put(first->parent_spec); 627505a46afdSAlex Elder first->parent_spec = NULL; 627605a46afdSAlex Elder } 627705a46afdSAlex Elder } 627805a46afdSAlex Elder 62799b60e70bSIlya Dryomov static ssize_t do_rbd_remove(struct bus_type *bus, 6280602adf40SYehuda Sadeh const char *buf, 6281602adf40SYehuda Sadeh size_t count) 6282602adf40SYehuda Sadeh { 6283602adf40SYehuda Sadeh struct rbd_device *rbd_dev = NULL; 6284751cc0e3SAlex Elder struct list_head *tmp; 6285751cc0e3SAlex Elder int dev_id; 62860276dca6SMike Christie char opt_buf[6]; 628782a442d2SAlex Elder bool already = false; 62880276dca6SMike Christie bool force = false; 62890d8189e1SAlex Elder int ret; 6290602adf40SYehuda Sadeh 62910276dca6SMike Christie dev_id = -1; 62920276dca6SMike Christie opt_buf[0] = '\0'; 62930276dca6SMike Christie sscanf(buf, "%d %5s", &dev_id, opt_buf); 62940276dca6SMike Christie if (dev_id < 0) { 62950276dca6SMike Christie pr_err("dev_id out of range\n"); 6296602adf40SYehuda Sadeh return -EINVAL; 62970276dca6SMike Christie } 62980276dca6SMike Christie if (opt_buf[0] != '\0') { 62990276dca6SMike Christie if (!strcmp(opt_buf, "force")) { 63000276dca6SMike Christie force = true; 63010276dca6SMike Christie } else { 63020276dca6SMike Christie pr_err("bad remove option at '%s'\n", opt_buf); 63030276dca6SMike Christie return -EINVAL; 63040276dca6SMike Christie } 63050276dca6SMike Christie } 6306602adf40SYehuda Sadeh 6307602adf40SYehuda Sadeh ret = -ENOENT; 6308751cc0e3SAlex Elder spin_lock(&rbd_dev_list_lock); 6309751cc0e3SAlex Elder list_for_each(tmp, &rbd_dev_list) { 6310751cc0e3SAlex Elder rbd_dev = list_entry(tmp, struct rbd_device, node); 6311751cc0e3SAlex Elder if (rbd_dev->dev_id == dev_id) { 6312751cc0e3SAlex Elder ret = 0; 6313751cc0e3SAlex Elder break; 6314602adf40SYehuda Sadeh } 6315751cc0e3SAlex Elder } 6316751cc0e3SAlex Elder if (!ret) { 6317a14ea269SAlex Elder spin_lock_irq(&rbd_dev->lock); 63180276dca6SMike Christie if (rbd_dev->open_count && !force) 631942382b70SAlex Elder ret = -EBUSY; 6320b82d167bSAlex Elder else 632182a442d2SAlex Elder already = test_and_set_bit(RBD_DEV_FLAG_REMOVING, 632282a442d2SAlex Elder &rbd_dev->flags); 6323a14ea269SAlex Elder spin_unlock_irq(&rbd_dev->lock); 6324751cc0e3SAlex Elder } 6325751cc0e3SAlex Elder spin_unlock(&rbd_dev_list_lock); 632682a442d2SAlex Elder if (ret < 0 || already) 63271ba0f1e7SAlex Elder return ret; 6328751cc0e3SAlex Elder 63290276dca6SMike Christie if (force) { 63300276dca6SMike Christie /* 63310276dca6SMike Christie * Prevent new IO from being queued and wait for existing 63320276dca6SMike Christie * IO to complete/fail. 63330276dca6SMike Christie */ 63340276dca6SMike Christie blk_mq_freeze_queue(rbd_dev->disk->queue); 63350276dca6SMike Christie blk_set_queue_dying(rbd_dev->disk->queue); 63360276dca6SMike Christie } 63370276dca6SMike Christie 6338ed95b21aSIlya Dryomov down_write(&rbd_dev->lock_rwsem); 6339ed95b21aSIlya Dryomov if (__rbd_is_lock_owner(rbd_dev)) 6340ed95b21aSIlya Dryomov rbd_unlock(rbd_dev); 6341ed95b21aSIlya Dryomov up_write(&rbd_dev->lock_rwsem); 634299d16943SIlya Dryomov rbd_unregister_watch(rbd_dev); 6343fca27065SIlya Dryomov 63449875201eSJosh Durgin /* 63459875201eSJosh Durgin * Don't free anything from rbd_dev->disk until after all 63469875201eSJosh Durgin * notifies are completely processed. Otherwise 63479875201eSJosh Durgin * rbd_bus_del_dev() will race with rbd_watch_cb(), resulting 63489875201eSJosh Durgin * in a potential use after free of rbd_dev->disk or rbd_dev. 63499875201eSJosh Durgin */ 6350dd5ac32dSIlya Dryomov rbd_dev_device_release(rbd_dev); 63518ad42cd0SAlex Elder rbd_dev_image_release(rbd_dev); 6352aafb230eSAlex Elder 63531ba0f1e7SAlex Elder return count; 6354602adf40SYehuda Sadeh } 6355602adf40SYehuda Sadeh 63569b60e70bSIlya Dryomov static ssize_t rbd_remove(struct bus_type *bus, 63579b60e70bSIlya Dryomov const char *buf, 63589b60e70bSIlya Dryomov size_t count) 63599b60e70bSIlya Dryomov { 63609b60e70bSIlya Dryomov if (single_major) 63619b60e70bSIlya Dryomov return -EINVAL; 63629b60e70bSIlya Dryomov 63639b60e70bSIlya Dryomov return do_rbd_remove(bus, buf, count); 63649b60e70bSIlya Dryomov } 63659b60e70bSIlya Dryomov 63669b60e70bSIlya Dryomov static ssize_t rbd_remove_single_major(struct bus_type *bus, 63679b60e70bSIlya Dryomov const char *buf, 63689b60e70bSIlya Dryomov size_t count) 63699b60e70bSIlya Dryomov { 63709b60e70bSIlya Dryomov return do_rbd_remove(bus, buf, count); 63719b60e70bSIlya Dryomov } 63729b60e70bSIlya Dryomov 6373602adf40SYehuda Sadeh /* 6374602adf40SYehuda Sadeh * create control files in sysfs 6375dfc5606dSYehuda Sadeh * /sys/bus/rbd/... 6376602adf40SYehuda Sadeh */ 6377602adf40SYehuda Sadeh static int rbd_sysfs_init(void) 6378602adf40SYehuda Sadeh { 6379dfc5606dSYehuda Sadeh int ret; 6380602adf40SYehuda Sadeh 6381fed4c143SAlex Elder ret = device_register(&rbd_root_dev); 6382dfc5606dSYehuda Sadeh if (ret < 0) 6383dfc5606dSYehuda Sadeh return ret; 6384602adf40SYehuda Sadeh 6385fed4c143SAlex Elder ret = bus_register(&rbd_bus_type); 6386fed4c143SAlex Elder if (ret < 0) 6387fed4c143SAlex Elder device_unregister(&rbd_root_dev); 6388602adf40SYehuda Sadeh 6389602adf40SYehuda Sadeh return ret; 6390602adf40SYehuda Sadeh } 6391602adf40SYehuda Sadeh 6392602adf40SYehuda Sadeh static void rbd_sysfs_cleanup(void) 6393602adf40SYehuda Sadeh { 6394dfc5606dSYehuda Sadeh bus_unregister(&rbd_bus_type); 6395fed4c143SAlex Elder device_unregister(&rbd_root_dev); 6396602adf40SYehuda Sadeh } 6397602adf40SYehuda Sadeh 63981c2a9dfeSAlex Elder static int rbd_slab_init(void) 63991c2a9dfeSAlex Elder { 64001c2a9dfeSAlex Elder rbd_assert(!rbd_img_request_cache); 640103d94406SGeliang Tang rbd_img_request_cache = KMEM_CACHE(rbd_img_request, 0); 6402868311b1SAlex Elder if (!rbd_img_request_cache) 6403868311b1SAlex Elder return -ENOMEM; 6404868311b1SAlex Elder 6405868311b1SAlex Elder rbd_assert(!rbd_obj_request_cache); 640603d94406SGeliang Tang rbd_obj_request_cache = KMEM_CACHE(rbd_obj_request, 0); 640778c2a44aSAlex Elder if (!rbd_obj_request_cache) 640878c2a44aSAlex Elder goto out_err; 640978c2a44aSAlex Elder 641078c2a44aSAlex Elder rbd_assert(!rbd_segment_name_cache); 641178c2a44aSAlex Elder rbd_segment_name_cache = kmem_cache_create("rbd_segment_name", 64122d0ebc5dSIlya Dryomov CEPH_MAX_OID_NAME_LEN + 1, 1, 0, NULL); 641378c2a44aSAlex Elder if (rbd_segment_name_cache) 64141c2a9dfeSAlex Elder return 0; 641578c2a44aSAlex Elder out_err: 641678c2a44aSAlex Elder kmem_cache_destroy(rbd_obj_request_cache); 641778c2a44aSAlex Elder rbd_obj_request_cache = NULL; 64181c2a9dfeSAlex Elder 6419868311b1SAlex Elder kmem_cache_destroy(rbd_img_request_cache); 6420868311b1SAlex Elder rbd_img_request_cache = NULL; 6421868311b1SAlex Elder 64221c2a9dfeSAlex Elder return -ENOMEM; 64231c2a9dfeSAlex Elder } 64241c2a9dfeSAlex Elder 64251c2a9dfeSAlex Elder static void rbd_slab_exit(void) 64261c2a9dfeSAlex Elder { 642778c2a44aSAlex Elder rbd_assert(rbd_segment_name_cache); 642878c2a44aSAlex Elder kmem_cache_destroy(rbd_segment_name_cache); 642978c2a44aSAlex Elder rbd_segment_name_cache = NULL; 643078c2a44aSAlex Elder 6431868311b1SAlex Elder rbd_assert(rbd_obj_request_cache); 6432868311b1SAlex Elder kmem_cache_destroy(rbd_obj_request_cache); 6433868311b1SAlex Elder rbd_obj_request_cache = NULL; 6434868311b1SAlex Elder 64351c2a9dfeSAlex Elder rbd_assert(rbd_img_request_cache); 64361c2a9dfeSAlex Elder kmem_cache_destroy(rbd_img_request_cache); 64371c2a9dfeSAlex Elder rbd_img_request_cache = NULL; 64381c2a9dfeSAlex Elder } 64391c2a9dfeSAlex Elder 6440cc344fa1SAlex Elder static int __init rbd_init(void) 6441602adf40SYehuda Sadeh { 6442602adf40SYehuda Sadeh int rc; 6443602adf40SYehuda Sadeh 64441e32d34cSAlex Elder if (!libceph_compatible(NULL)) { 64451e32d34cSAlex Elder rbd_warn(NULL, "libceph incompatibility (quitting)"); 64461e32d34cSAlex Elder return -EINVAL; 64471e32d34cSAlex Elder } 6448e1b4d96dSIlya Dryomov 64491c2a9dfeSAlex Elder rc = rbd_slab_init(); 6450602adf40SYehuda Sadeh if (rc) 6451602adf40SYehuda Sadeh return rc; 6452e1b4d96dSIlya Dryomov 6453f5ee37bdSIlya Dryomov /* 6454f5ee37bdSIlya Dryomov * The number of active work items is limited by the number of 6455f77303bdSIlya Dryomov * rbd devices * queue depth, so leave @max_active at default. 6456f5ee37bdSIlya Dryomov */ 6457f5ee37bdSIlya Dryomov rbd_wq = alloc_workqueue(RBD_DRV_NAME, WQ_MEM_RECLAIM, 0); 6458f5ee37bdSIlya Dryomov if (!rbd_wq) { 6459f5ee37bdSIlya Dryomov rc = -ENOMEM; 6460f5ee37bdSIlya Dryomov goto err_out_slab; 6461f5ee37bdSIlya Dryomov } 6462f5ee37bdSIlya Dryomov 64639b60e70bSIlya Dryomov if (single_major) { 64649b60e70bSIlya Dryomov rbd_major = register_blkdev(0, RBD_DRV_NAME); 64659b60e70bSIlya Dryomov if (rbd_major < 0) { 64669b60e70bSIlya Dryomov rc = rbd_major; 6467f5ee37bdSIlya Dryomov goto err_out_wq; 64689b60e70bSIlya Dryomov } 64699b60e70bSIlya Dryomov } 64709b60e70bSIlya Dryomov 64711c2a9dfeSAlex Elder rc = rbd_sysfs_init(); 64721c2a9dfeSAlex Elder if (rc) 64739b60e70bSIlya Dryomov goto err_out_blkdev; 64741c2a9dfeSAlex Elder 64759b60e70bSIlya Dryomov if (single_major) 64769b60e70bSIlya Dryomov pr_info("loaded (major %d)\n", rbd_major); 64779b60e70bSIlya Dryomov else 6478e1b4d96dSIlya Dryomov pr_info("loaded\n"); 64799b60e70bSIlya Dryomov 6480e1b4d96dSIlya Dryomov return 0; 6481e1b4d96dSIlya Dryomov 64829b60e70bSIlya Dryomov err_out_blkdev: 64839b60e70bSIlya Dryomov if (single_major) 64849b60e70bSIlya Dryomov unregister_blkdev(rbd_major, RBD_DRV_NAME); 6485f5ee37bdSIlya Dryomov err_out_wq: 6486f5ee37bdSIlya Dryomov destroy_workqueue(rbd_wq); 6487e1b4d96dSIlya Dryomov err_out_slab: 6488e1b4d96dSIlya Dryomov rbd_slab_exit(); 64891c2a9dfeSAlex Elder return rc; 6490602adf40SYehuda Sadeh } 6491602adf40SYehuda Sadeh 6492cc344fa1SAlex Elder static void __exit rbd_exit(void) 6493602adf40SYehuda Sadeh { 6494ffe312cfSIlya Dryomov ida_destroy(&rbd_dev_id_ida); 6495602adf40SYehuda Sadeh rbd_sysfs_cleanup(); 64969b60e70bSIlya Dryomov if (single_major) 64979b60e70bSIlya Dryomov unregister_blkdev(rbd_major, RBD_DRV_NAME); 6498f5ee37bdSIlya Dryomov destroy_workqueue(rbd_wq); 64991c2a9dfeSAlex Elder rbd_slab_exit(); 6500602adf40SYehuda Sadeh } 6501602adf40SYehuda Sadeh 6502602adf40SYehuda Sadeh module_init(rbd_init); 6503602adf40SYehuda Sadeh module_exit(rbd_exit); 6504602adf40SYehuda Sadeh 6505d552c619SAlex Elder MODULE_AUTHOR("Alex Elder <elder@inktank.com>"); 6506602adf40SYehuda Sadeh MODULE_AUTHOR("Sage Weil <sage@newdream.net>"); 6507602adf40SYehuda Sadeh MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>"); 6508602adf40SYehuda Sadeh /* following authorship retained from original osdblk.c */ 6509602adf40SYehuda Sadeh MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>"); 6510602adf40SYehuda Sadeh 651190da258bSIlya Dryomov MODULE_DESCRIPTION("RADOS Block Device (RBD) driver"); 6512602adf40SYehuda Sadeh MODULE_LICENSE("GPL"); 6513