1e2a58ee5SAlex Elder 2602adf40SYehuda Sadeh /* 3602adf40SYehuda Sadeh rbd.c -- Export ceph rados objects as a Linux block device 4602adf40SYehuda Sadeh 5602adf40SYehuda Sadeh 6602adf40SYehuda Sadeh based on drivers/block/osdblk.c: 7602adf40SYehuda Sadeh 8602adf40SYehuda Sadeh Copyright 2009 Red Hat, Inc. 9602adf40SYehuda Sadeh 10602adf40SYehuda Sadeh This program is free software; you can redistribute it and/or modify 11602adf40SYehuda Sadeh it under the terms of the GNU General Public License as published by 12602adf40SYehuda Sadeh the Free Software Foundation. 13602adf40SYehuda Sadeh 14602adf40SYehuda Sadeh This program is distributed in the hope that it will be useful, 15602adf40SYehuda Sadeh but WITHOUT ANY WARRANTY; without even the implied warranty of 16602adf40SYehuda Sadeh MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17602adf40SYehuda Sadeh GNU General Public License for more details. 18602adf40SYehuda Sadeh 19602adf40SYehuda Sadeh You should have received a copy of the GNU General Public License 20602adf40SYehuda Sadeh along with this program; see the file COPYING. If not, write to 21602adf40SYehuda Sadeh the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. 
22602adf40SYehuda Sadeh 23602adf40SYehuda Sadeh 24602adf40SYehuda Sadeh 25dfc5606dSYehuda Sadeh For usage instructions, please refer to: 26602adf40SYehuda Sadeh 27dfc5606dSYehuda Sadeh Documentation/ABI/testing/sysfs-bus-rbd 28602adf40SYehuda Sadeh 29602adf40SYehuda Sadeh */ 30602adf40SYehuda Sadeh 31602adf40SYehuda Sadeh #include <linux/ceph/libceph.h> 32602adf40SYehuda Sadeh #include <linux/ceph/osd_client.h> 33602adf40SYehuda Sadeh #include <linux/ceph/mon_client.h> 34ed95b21aSIlya Dryomov #include <linux/ceph/cls_lock_client.h> 35602adf40SYehuda Sadeh #include <linux/ceph/decode.h> 3659c2be1eSYehuda Sadeh #include <linux/parser.h> 3730d1cff8SAlex Elder #include <linux/bsearch.h> 38602adf40SYehuda Sadeh 39602adf40SYehuda Sadeh #include <linux/kernel.h> 40602adf40SYehuda Sadeh #include <linux/device.h> 41602adf40SYehuda Sadeh #include <linux/module.h> 427ad18afaSChristoph Hellwig #include <linux/blk-mq.h> 43602adf40SYehuda Sadeh #include <linux/fs.h> 44602adf40SYehuda Sadeh #include <linux/blkdev.h> 451c2a9dfeSAlex Elder #include <linux/slab.h> 46f8a22fc2SIlya Dryomov #include <linux/idr.h> 47bc1ecc65SIlya Dryomov #include <linux/workqueue.h> 48602adf40SYehuda Sadeh 49602adf40SYehuda Sadeh #include "rbd_types.h" 50602adf40SYehuda Sadeh 51aafb230eSAlex Elder #define RBD_DEBUG /* Activate rbd_assert() calls */ 52aafb230eSAlex Elder 53593a9e7bSAlex Elder /* 54593a9e7bSAlex Elder * The basic unit of block I/O is a sector. It is interpreted in a 55593a9e7bSAlex Elder * number of contexts in Linux (blk, bio, genhd), but the default is 56593a9e7bSAlex Elder * universally 512 bytes. These symbols are just slightly more 57593a9e7bSAlex Elder * meaningful than the bare numbers they represent. 58593a9e7bSAlex Elder */ 59593a9e7bSAlex Elder #define SECTOR_SHIFT 9 60593a9e7bSAlex Elder #define SECTOR_SIZE (1ULL << SECTOR_SHIFT) 61593a9e7bSAlex Elder 62a2acd00eSAlex Elder /* 63a2acd00eSAlex Elder * Increment the given counter and return its updated value. 
64a2acd00eSAlex Elder * If the counter is already 0 it will not be incremented. 65a2acd00eSAlex Elder * If the counter is already at its maximum value returns 66a2acd00eSAlex Elder * -EINVAL without updating it. 67a2acd00eSAlex Elder */ 68a2acd00eSAlex Elder static int atomic_inc_return_safe(atomic_t *v) 69a2acd00eSAlex Elder { 70a2acd00eSAlex Elder unsigned int counter; 71a2acd00eSAlex Elder 72a2acd00eSAlex Elder counter = (unsigned int)__atomic_add_unless(v, 1, 0); 73a2acd00eSAlex Elder if (counter <= (unsigned int)INT_MAX) 74a2acd00eSAlex Elder return (int)counter; 75a2acd00eSAlex Elder 76a2acd00eSAlex Elder atomic_dec(v); 77a2acd00eSAlex Elder 78a2acd00eSAlex Elder return -EINVAL; 79a2acd00eSAlex Elder } 80a2acd00eSAlex Elder 81a2acd00eSAlex Elder /* Decrement the counter. Return the resulting value, or -EINVAL */ 82a2acd00eSAlex Elder static int atomic_dec_return_safe(atomic_t *v) 83a2acd00eSAlex Elder { 84a2acd00eSAlex Elder int counter; 85a2acd00eSAlex Elder 86a2acd00eSAlex Elder counter = atomic_dec_return(v); 87a2acd00eSAlex Elder if (counter >= 0) 88a2acd00eSAlex Elder return counter; 89a2acd00eSAlex Elder 90a2acd00eSAlex Elder atomic_inc(v); 91a2acd00eSAlex Elder 92a2acd00eSAlex Elder return -EINVAL; 93a2acd00eSAlex Elder } 94a2acd00eSAlex Elder 95f0f8cef5SAlex Elder #define RBD_DRV_NAME "rbd" 96602adf40SYehuda Sadeh 977e513d43SIlya Dryomov #define RBD_MINORS_PER_MAJOR 256 987e513d43SIlya Dryomov #define RBD_SINGLE_MAJOR_PART_SHIFT 4 99602adf40SYehuda Sadeh 1006d69bb53SIlya Dryomov #define RBD_MAX_PARENT_CHAIN_LEN 16 1016d69bb53SIlya Dryomov 102d4b125e9SAlex Elder #define RBD_SNAP_DEV_NAME_PREFIX "snap_" 103d4b125e9SAlex Elder #define RBD_MAX_SNAP_NAME_LEN \ 104d4b125e9SAlex Elder (NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1)) 105d4b125e9SAlex Elder 10635d489f9SAlex Elder #define RBD_MAX_SNAP_COUNT 510 /* allows max snapc to fit in 4KB */ 107602adf40SYehuda Sadeh 108602adf40SYehuda Sadeh #define RBD_SNAP_HEAD_NAME "-" 109602adf40SYehuda Sadeh 
1109682fc6dSAlex Elder #define BAD_SNAP_INDEX U32_MAX /* invalid index into snap array */ 1119682fc6dSAlex Elder 1129e15b77dSAlex Elder /* This allows a single page to hold an image name sent by OSD */ 1139e15b77dSAlex Elder #define RBD_IMAGE_NAME_LEN_MAX (PAGE_SIZE - sizeof (__le32) - 1) 114589d30e0SAlex Elder #define RBD_IMAGE_ID_LEN_MAX 64 1159e15b77dSAlex Elder 1161e130199SAlex Elder #define RBD_OBJ_PREFIX_LEN_MAX 64 117589d30e0SAlex Elder 118ed95b21aSIlya Dryomov #define RBD_NOTIFY_TIMEOUT 5 /* seconds */ 11999d16943SIlya Dryomov #define RBD_RETRY_DELAY msecs_to_jiffies(1000) 12099d16943SIlya Dryomov 121d889140cSAlex Elder /* Feature bits */ 122d889140cSAlex Elder 1235cbf6f12SAlex Elder #define RBD_FEATURE_LAYERING (1<<0) 1245cbf6f12SAlex Elder #define RBD_FEATURE_STRIPINGV2 (1<<1) 125ed95b21aSIlya Dryomov #define RBD_FEATURE_EXCLUSIVE_LOCK (1<<2) 1267e97332eSIlya Dryomov #define RBD_FEATURE_DATA_POOL (1<<7) 127ed95b21aSIlya Dryomov #define RBD_FEATURES_ALL (RBD_FEATURE_LAYERING | \ 128ed95b21aSIlya Dryomov RBD_FEATURE_STRIPINGV2 | \ 1297e97332eSIlya Dryomov RBD_FEATURE_EXCLUSIVE_LOCK | \ 1307e97332eSIlya Dryomov RBD_FEATURE_DATA_POOL) 131d889140cSAlex Elder 132d889140cSAlex Elder /* Features supported by this (client software) implementation. */ 133d889140cSAlex Elder 134770eba6eSAlex Elder #define RBD_FEATURES_SUPPORTED (RBD_FEATURES_ALL) 135d889140cSAlex Elder 13681a89793SAlex Elder /* 13781a89793SAlex Elder * An RBD device name will be "rbd#", where the "rbd" comes from 13881a89793SAlex Elder * RBD_DRV_NAME above, and # is a unique integer identifier. 
13981a89793SAlex Elder */ 140602adf40SYehuda Sadeh #define DEV_NAME_LEN 32 141602adf40SYehuda Sadeh 142602adf40SYehuda Sadeh /* 143602adf40SYehuda Sadeh * block device image metadata (in-memory version) 144602adf40SYehuda Sadeh */ 145602adf40SYehuda Sadeh struct rbd_image_header { 146f35a4deeSAlex Elder /* These six fields never change for a given rbd image */ 147849b4260SAlex Elder char *object_prefix; 148602adf40SYehuda Sadeh __u8 obj_order; 149f35a4deeSAlex Elder u64 stripe_unit; 150f35a4deeSAlex Elder u64 stripe_count; 1517e97332eSIlya Dryomov s64 data_pool_id; 152f35a4deeSAlex Elder u64 features; /* Might be changeable someday? */ 153602adf40SYehuda Sadeh 154f84344f3SAlex Elder /* The remaining fields need to be updated occasionally */ 155f84344f3SAlex Elder u64 image_size; 156f84344f3SAlex Elder struct ceph_snap_context *snapc; 157f35a4deeSAlex Elder char *snap_names; /* format 1 only */ 158f35a4deeSAlex Elder u64 *snap_sizes; /* format 1 only */ 15959c2be1eSYehuda Sadeh }; 16059c2be1eSYehuda Sadeh 1610d7dbfceSAlex Elder /* 1620d7dbfceSAlex Elder * An rbd image specification. 1630d7dbfceSAlex Elder * 1640d7dbfceSAlex Elder * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely 165c66c6e0cSAlex Elder * identify an image. Each rbd_dev structure includes a pointer to 166c66c6e0cSAlex Elder * an rbd_spec structure that encapsulates this identity. 167c66c6e0cSAlex Elder * 168c66c6e0cSAlex Elder * Each of the id's in an rbd_spec has an associated name. For a 169c66c6e0cSAlex Elder * user-mapped image, the names are supplied and the id's associated 170c66c6e0cSAlex Elder * with them are looked up. For a layered image, a parent image is 171c66c6e0cSAlex Elder * defined by the tuple, and the names are looked up. 172c66c6e0cSAlex Elder * 173c66c6e0cSAlex Elder * An rbd_dev structure contains a parent_spec pointer which is 174c66c6e0cSAlex Elder * non-null if the image it represents is a child in a layered 175c66c6e0cSAlex Elder * image. 
This pointer will refer to the rbd_spec structure used 176c66c6e0cSAlex Elder * by the parent rbd_dev for its own identity (i.e., the structure 177c66c6e0cSAlex Elder * is shared between the parent and child). 178c66c6e0cSAlex Elder * 179c66c6e0cSAlex Elder * Since these structures are populated once, during the discovery 180c66c6e0cSAlex Elder * phase of image construction, they are effectively immutable so 181c66c6e0cSAlex Elder * we make no effort to synchronize access to them. 182c66c6e0cSAlex Elder * 183c66c6e0cSAlex Elder * Note that code herein does not assume the image name is known (it 184c66c6e0cSAlex Elder * could be a null pointer). 1850d7dbfceSAlex Elder */ 1860d7dbfceSAlex Elder struct rbd_spec { 1870d7dbfceSAlex Elder u64 pool_id; 188ecb4dc22SAlex Elder const char *pool_name; 1890d7dbfceSAlex Elder 190ecb4dc22SAlex Elder const char *image_id; 191ecb4dc22SAlex Elder const char *image_name; 1920d7dbfceSAlex Elder 1930d7dbfceSAlex Elder u64 snap_id; 194ecb4dc22SAlex Elder const char *snap_name; 1950d7dbfceSAlex Elder 1960d7dbfceSAlex Elder struct kref kref; 1970d7dbfceSAlex Elder }; 1980d7dbfceSAlex Elder 199602adf40SYehuda Sadeh /* 200f0f8cef5SAlex Elder * an instance of the client. multiple devices may share an rbd client. 201602adf40SYehuda Sadeh */ 202602adf40SYehuda Sadeh struct rbd_client { 203602adf40SYehuda Sadeh struct ceph_client *client; 204602adf40SYehuda Sadeh struct kref kref; 205602adf40SYehuda Sadeh struct list_head node; 206602adf40SYehuda Sadeh }; 207602adf40SYehuda Sadeh 208bf0d5f50SAlex Elder struct rbd_img_request; 209bf0d5f50SAlex Elder typedef void (*rbd_img_callback_t)(struct rbd_img_request *); 210bf0d5f50SAlex Elder 211bf0d5f50SAlex Elder #define BAD_WHICH U32_MAX /* Good which or bad which, which? 
*/ 212bf0d5f50SAlex Elder 213bf0d5f50SAlex Elder struct rbd_obj_request; 214bf0d5f50SAlex Elder typedef void (*rbd_obj_callback_t)(struct rbd_obj_request *); 215bf0d5f50SAlex Elder 2169969ebc5SAlex Elder enum obj_request_type { 2179969ebc5SAlex Elder OBJ_REQUEST_NODATA, OBJ_REQUEST_BIO, OBJ_REQUEST_PAGES 2189969ebc5SAlex Elder }; 219bf0d5f50SAlex Elder 2206d2940c8SGuangliang Zhao enum obj_operation_type { 2216d2940c8SGuangliang Zhao OBJ_OP_WRITE, 2226d2940c8SGuangliang Zhao OBJ_OP_READ, 22390e98c52SGuangliang Zhao OBJ_OP_DISCARD, 2246d2940c8SGuangliang Zhao }; 2256d2940c8SGuangliang Zhao 226926f9b3fSAlex Elder enum obj_req_flags { 227926f9b3fSAlex Elder OBJ_REQ_DONE, /* completion flag: not done = 0, done = 1 */ 2286365d33aSAlex Elder OBJ_REQ_IMG_DATA, /* object usage: standalone = 0, image = 1 */ 2295679c59fSAlex Elder OBJ_REQ_KNOWN, /* EXISTS flag valid: no = 0, yes = 1 */ 2305679c59fSAlex Elder OBJ_REQ_EXISTS, /* target exists: no = 0, yes = 1 */ 231926f9b3fSAlex Elder }; 232926f9b3fSAlex Elder 233bf0d5f50SAlex Elder struct rbd_obj_request { 234a90bb0c1SIlya Dryomov u64 object_no; 235bf0d5f50SAlex Elder u64 offset; /* object start byte */ 236bf0d5f50SAlex Elder u64 length; /* bytes from offset */ 237926f9b3fSAlex Elder unsigned long flags; 238bf0d5f50SAlex Elder 239c5b5ef6cSAlex Elder /* 240c5b5ef6cSAlex Elder * An object request associated with an image will have its 241c5b5ef6cSAlex Elder * img_data flag set; a standalone object request will not. 242c5b5ef6cSAlex Elder * 243c5b5ef6cSAlex Elder * A standalone object request will have which == BAD_WHICH 244c5b5ef6cSAlex Elder * and a null obj_request pointer. 245c5b5ef6cSAlex Elder * 246c5b5ef6cSAlex Elder * An object request initiated in support of a layered image 247c5b5ef6cSAlex Elder * object (to check for its existence before a write) will 248c5b5ef6cSAlex Elder * have which == BAD_WHICH and a non-null obj_request pointer. 
249c5b5ef6cSAlex Elder * 250c5b5ef6cSAlex Elder * Finally, an object request for rbd image data will have 251c5b5ef6cSAlex Elder * which != BAD_WHICH, and will have a non-null img_request 252c5b5ef6cSAlex Elder * pointer. The value of which will be in the range 253c5b5ef6cSAlex Elder * 0..(img_request->obj_request_count-1). 254c5b5ef6cSAlex Elder */ 255c5b5ef6cSAlex Elder union { 256c5b5ef6cSAlex Elder struct rbd_obj_request *obj_request; /* STAT op */ 257c5b5ef6cSAlex Elder struct { 258bf0d5f50SAlex Elder struct rbd_img_request *img_request; 259c5b5ef6cSAlex Elder u64 img_offset; 260c5b5ef6cSAlex Elder /* links for img_request->obj_requests list */ 261c5b5ef6cSAlex Elder struct list_head links; 262c5b5ef6cSAlex Elder }; 263c5b5ef6cSAlex Elder }; 264bf0d5f50SAlex Elder u32 which; /* posn image request list */ 265bf0d5f50SAlex Elder 266bf0d5f50SAlex Elder enum obj_request_type type; 267788e2df3SAlex Elder union { 268bf0d5f50SAlex Elder struct bio *bio_list; 269788e2df3SAlex Elder struct { 270788e2df3SAlex Elder struct page **pages; 271788e2df3SAlex Elder u32 page_count; 272788e2df3SAlex Elder }; 273788e2df3SAlex Elder }; 2740eefd470SAlex Elder struct page **copyup_pages; 275ebda6408SAlex Elder u32 copyup_page_count; 276bf0d5f50SAlex Elder 277bf0d5f50SAlex Elder struct ceph_osd_request *osd_req; 278bf0d5f50SAlex Elder 279bf0d5f50SAlex Elder u64 xferred; /* bytes transferred */ 2801b83bef2SSage Weil int result; 281bf0d5f50SAlex Elder 282bf0d5f50SAlex Elder rbd_obj_callback_t callback; 283788e2df3SAlex Elder struct completion completion; 284bf0d5f50SAlex Elder 285bf0d5f50SAlex Elder struct kref kref; 286bf0d5f50SAlex Elder }; 287bf0d5f50SAlex Elder 2880c425248SAlex Elder enum img_req_flags { 2899849e986SAlex Elder IMG_REQ_WRITE, /* I/O direction: read = 0, write = 1 */ 2909849e986SAlex Elder IMG_REQ_CHILD, /* initiator: block = 0, child image = 1 */ 291d0b2e944SAlex Elder IMG_REQ_LAYERED, /* ENOENT handling: normal = 0, layered = 1 */ 29290e98c52SGuangliang Zhao 
IMG_REQ_DISCARD, /* discard: normal = 0, discard request = 1 */ 2930c425248SAlex Elder }; 2940c425248SAlex Elder 295bf0d5f50SAlex Elder struct rbd_img_request { 296bf0d5f50SAlex Elder struct rbd_device *rbd_dev; 297bf0d5f50SAlex Elder u64 offset; /* starting image byte offset */ 298bf0d5f50SAlex Elder u64 length; /* byte count from offset */ 2990c425248SAlex Elder unsigned long flags; 300bf0d5f50SAlex Elder union { 301bf0d5f50SAlex Elder u64 snap_id; /* for reads */ 3029849e986SAlex Elder struct ceph_snap_context *snapc; /* for writes */ 3039849e986SAlex Elder }; 3049849e986SAlex Elder union { 3059849e986SAlex Elder struct request *rq; /* block request */ 3069849e986SAlex Elder struct rbd_obj_request *obj_request; /* obj req initiator */ 307bf0d5f50SAlex Elder }; 3083d7efd18SAlex Elder struct page **copyup_pages; 309ebda6408SAlex Elder u32 copyup_page_count; 310bf0d5f50SAlex Elder spinlock_t completion_lock;/* protects next_completion */ 311bf0d5f50SAlex Elder u32 next_completion; 312bf0d5f50SAlex Elder rbd_img_callback_t callback; 31355f27e09SAlex Elder u64 xferred;/* aggregate bytes transferred */ 314a5a337d4SAlex Elder int result; /* first nonzero obj_request result */ 315bf0d5f50SAlex Elder 316bf0d5f50SAlex Elder u32 obj_request_count; 317bf0d5f50SAlex Elder struct list_head obj_requests; /* rbd_obj_request structs */ 318bf0d5f50SAlex Elder 319bf0d5f50SAlex Elder struct kref kref; 320bf0d5f50SAlex Elder }; 321bf0d5f50SAlex Elder 322bf0d5f50SAlex Elder #define for_each_obj_request(ireq, oreq) \ 323ef06f4d3SAlex Elder list_for_each_entry(oreq, &(ireq)->obj_requests, links) 324bf0d5f50SAlex Elder #define for_each_obj_request_from(ireq, oreq) \ 325ef06f4d3SAlex Elder list_for_each_entry_from(oreq, &(ireq)->obj_requests, links) 326bf0d5f50SAlex Elder #define for_each_obj_request_safe(ireq, oreq, n) \ 327ef06f4d3SAlex Elder list_for_each_entry_safe_reverse(oreq, n, &(ireq)->obj_requests, links) 328bf0d5f50SAlex Elder 32999d16943SIlya Dryomov enum rbd_watch_state { 
33099d16943SIlya Dryomov RBD_WATCH_STATE_UNREGISTERED, 33199d16943SIlya Dryomov RBD_WATCH_STATE_REGISTERED, 33299d16943SIlya Dryomov RBD_WATCH_STATE_ERROR, 33399d16943SIlya Dryomov }; 33499d16943SIlya Dryomov 335ed95b21aSIlya Dryomov enum rbd_lock_state { 336ed95b21aSIlya Dryomov RBD_LOCK_STATE_UNLOCKED, 337ed95b21aSIlya Dryomov RBD_LOCK_STATE_LOCKED, 338ed95b21aSIlya Dryomov RBD_LOCK_STATE_RELEASING, 339ed95b21aSIlya Dryomov }; 340ed95b21aSIlya Dryomov 341ed95b21aSIlya Dryomov /* WatchNotify::ClientId */ 342ed95b21aSIlya Dryomov struct rbd_client_id { 343ed95b21aSIlya Dryomov u64 gid; 344ed95b21aSIlya Dryomov u64 handle; 345ed95b21aSIlya Dryomov }; 346ed95b21aSIlya Dryomov 347f84344f3SAlex Elder struct rbd_mapping { 34899c1f08fSAlex Elder u64 size; 34934b13184SAlex Elder u64 features; 350f84344f3SAlex Elder bool read_only; 351f84344f3SAlex Elder }; 352f84344f3SAlex Elder 353602adf40SYehuda Sadeh /* 354602adf40SYehuda Sadeh * a single device 355602adf40SYehuda Sadeh */ 356602adf40SYehuda Sadeh struct rbd_device { 357de71a297SAlex Elder int dev_id; /* blkdev unique id */ 358602adf40SYehuda Sadeh 359602adf40SYehuda Sadeh int major; /* blkdev assigned major */ 360dd82fff1SIlya Dryomov int minor; 361602adf40SYehuda Sadeh struct gendisk *disk; /* blkdev's gendisk and rq */ 362602adf40SYehuda Sadeh 363a30b71b9SAlex Elder u32 image_format; /* Either 1 or 2 */ 364602adf40SYehuda Sadeh struct rbd_client *rbd_client; 365602adf40SYehuda Sadeh 366602adf40SYehuda Sadeh char name[DEV_NAME_LEN]; /* blkdev name, e.g. 
rbd3 */ 367602adf40SYehuda Sadeh 368b82d167bSAlex Elder spinlock_t lock; /* queue, flags, open_count */ 369602adf40SYehuda Sadeh 370602adf40SYehuda Sadeh struct rbd_image_header header; 371b82d167bSAlex Elder unsigned long flags; /* possibly lock protected */ 3720d7dbfceSAlex Elder struct rbd_spec *spec; 373d147543dSIlya Dryomov struct rbd_options *opts; 3740d6d1e9cSMike Christie char *config_info; /* add{,_single_major} string */ 375602adf40SYehuda Sadeh 376c41d13a3SIlya Dryomov struct ceph_object_id header_oid; 377922dab61SIlya Dryomov struct ceph_object_locator header_oloc; 378971f839aSAlex Elder 3791643dfa4SIlya Dryomov struct ceph_file_layout layout; /* used for all rbd requests */ 3800903e875SAlex Elder 38199d16943SIlya Dryomov struct mutex watch_mutex; 38299d16943SIlya Dryomov enum rbd_watch_state watch_state; 383922dab61SIlya Dryomov struct ceph_osd_linger_request *watch_handle; 38499d16943SIlya Dryomov u64 watch_cookie; 38599d16943SIlya Dryomov struct delayed_work watch_dwork; 38659c2be1eSYehuda Sadeh 387ed95b21aSIlya Dryomov struct rw_semaphore lock_rwsem; 388ed95b21aSIlya Dryomov enum rbd_lock_state lock_state; 389ed95b21aSIlya Dryomov struct rbd_client_id owner_cid; 390ed95b21aSIlya Dryomov struct work_struct acquired_lock_work; 391ed95b21aSIlya Dryomov struct work_struct released_lock_work; 392ed95b21aSIlya Dryomov struct delayed_work lock_dwork; 393ed95b21aSIlya Dryomov struct work_struct unlock_work; 394ed95b21aSIlya Dryomov wait_queue_head_t lock_waitq; 395ed95b21aSIlya Dryomov 3961643dfa4SIlya Dryomov struct workqueue_struct *task_wq; 397602adf40SYehuda Sadeh 39886b00e0dSAlex Elder struct rbd_spec *parent_spec; 39986b00e0dSAlex Elder u64 parent_overlap; 400a2acd00eSAlex Elder atomic_t parent_ref; 4012f82ee54SAlex Elder struct rbd_device *parent; 40286b00e0dSAlex Elder 4037ad18afaSChristoph Hellwig /* Block layer tags. 
*/ 4047ad18afaSChristoph Hellwig struct blk_mq_tag_set tag_set; 4057ad18afaSChristoph Hellwig 406c666601aSJosh Durgin /* protects updating the header */ 407c666601aSJosh Durgin struct rw_semaphore header_rwsem; 408f84344f3SAlex Elder 409f84344f3SAlex Elder struct rbd_mapping mapping; 410602adf40SYehuda Sadeh 411602adf40SYehuda Sadeh struct list_head node; 412dfc5606dSYehuda Sadeh 413dfc5606dSYehuda Sadeh /* sysfs related */ 414dfc5606dSYehuda Sadeh struct device dev; 415b82d167bSAlex Elder unsigned long open_count; /* protected by lock */ 416dfc5606dSYehuda Sadeh }; 417dfc5606dSYehuda Sadeh 418b82d167bSAlex Elder /* 41987c0fdedSIlya Dryomov * Flag bits for rbd_dev->flags: 42087c0fdedSIlya Dryomov * - REMOVING (which is coupled with rbd_dev->open_count) is protected 42187c0fdedSIlya Dryomov * by rbd_dev->lock 42287c0fdedSIlya Dryomov * - BLACKLISTED is protected by rbd_dev->lock_rwsem 423b82d167bSAlex Elder */ 4246d292906SAlex Elder enum rbd_dev_flags { 4256d292906SAlex Elder RBD_DEV_FLAG_EXISTS, /* mapped snapshot has not been deleted */ 426b82d167bSAlex Elder RBD_DEV_FLAG_REMOVING, /* this mapping is being removed */ 42787c0fdedSIlya Dryomov RBD_DEV_FLAG_BLACKLISTED, /* our ceph_client is blacklisted */ 4286d292906SAlex Elder }; 4296d292906SAlex Elder 430cfbf6377SAlex Elder static DEFINE_MUTEX(client_mutex); /* Serialize client creation */ 431e124a82fSAlex Elder 432602adf40SYehuda Sadeh static LIST_HEAD(rbd_dev_list); /* devices */ 433e124a82fSAlex Elder static DEFINE_SPINLOCK(rbd_dev_list_lock); 434e124a82fSAlex Elder 435602adf40SYehuda Sadeh static LIST_HEAD(rbd_client_list); /* clients */ 436432b8587SAlex Elder static DEFINE_SPINLOCK(rbd_client_list_lock); 437602adf40SYehuda Sadeh 43878c2a44aSAlex Elder /* Slab caches for frequently-allocated structures */ 43978c2a44aSAlex Elder 4401c2a9dfeSAlex Elder static struct kmem_cache *rbd_img_request_cache; 441868311b1SAlex Elder static struct kmem_cache *rbd_obj_request_cache; 4421c2a9dfeSAlex Elder 4439b60e70bSIlya 
Dryomov static int rbd_major; 444f8a22fc2SIlya Dryomov static DEFINE_IDA(rbd_dev_id_ida); 445f8a22fc2SIlya Dryomov 446f5ee37bdSIlya Dryomov static struct workqueue_struct *rbd_wq; 447f5ee37bdSIlya Dryomov 4489b60e70bSIlya Dryomov /* 4499b60e70bSIlya Dryomov * Default to false for now, as single-major requires >= 0.75 version of 4509b60e70bSIlya Dryomov * userspace rbd utility. 4519b60e70bSIlya Dryomov */ 4529b60e70bSIlya Dryomov static bool single_major = false; 4539b60e70bSIlya Dryomov module_param(single_major, bool, S_IRUGO); 4549b60e70bSIlya Dryomov MODULE_PARM_DESC(single_major, "Use a single major number for all rbd devices (default: false)"); 4559b60e70bSIlya Dryomov 4563d7efd18SAlex Elder static int rbd_img_request_submit(struct rbd_img_request *img_request); 4573d7efd18SAlex Elder 458f0f8cef5SAlex Elder static ssize_t rbd_add(struct bus_type *bus, const char *buf, 459f0f8cef5SAlex Elder size_t count); 460f0f8cef5SAlex Elder static ssize_t rbd_remove(struct bus_type *bus, const char *buf, 461f0f8cef5SAlex Elder size_t count); 4629b60e70bSIlya Dryomov static ssize_t rbd_add_single_major(struct bus_type *bus, const char *buf, 4639b60e70bSIlya Dryomov size_t count); 4649b60e70bSIlya Dryomov static ssize_t rbd_remove_single_major(struct bus_type *bus, const char *buf, 4659b60e70bSIlya Dryomov size_t count); 4666d69bb53SIlya Dryomov static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth); 467a2acd00eSAlex Elder static void rbd_spec_put(struct rbd_spec *spec); 468f0f8cef5SAlex Elder 4699b60e70bSIlya Dryomov static int rbd_dev_id_to_minor(int dev_id) 4709b60e70bSIlya Dryomov { 4717e513d43SIlya Dryomov return dev_id << RBD_SINGLE_MAJOR_PART_SHIFT; 4729b60e70bSIlya Dryomov } 4739b60e70bSIlya Dryomov 4749b60e70bSIlya Dryomov static int minor_to_rbd_dev_id(int minor) 4759b60e70bSIlya Dryomov { 4767e513d43SIlya Dryomov return minor >> RBD_SINGLE_MAJOR_PART_SHIFT; 4779b60e70bSIlya Dryomov } 4789b60e70bSIlya Dryomov 479ed95b21aSIlya Dryomov static bool 
rbd_is_lock_supported(struct rbd_device *rbd_dev) 480ed95b21aSIlya Dryomov { 481ed95b21aSIlya Dryomov return (rbd_dev->header.features & RBD_FEATURE_EXCLUSIVE_LOCK) && 482ed95b21aSIlya Dryomov rbd_dev->spec->snap_id == CEPH_NOSNAP && 483ed95b21aSIlya Dryomov !rbd_dev->mapping.read_only; 484ed95b21aSIlya Dryomov } 485ed95b21aSIlya Dryomov 486ed95b21aSIlya Dryomov static bool __rbd_is_lock_owner(struct rbd_device *rbd_dev) 487ed95b21aSIlya Dryomov { 488ed95b21aSIlya Dryomov return rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED || 489ed95b21aSIlya Dryomov rbd_dev->lock_state == RBD_LOCK_STATE_RELEASING; 490ed95b21aSIlya Dryomov } 491ed95b21aSIlya Dryomov 492ed95b21aSIlya Dryomov static bool rbd_is_lock_owner(struct rbd_device *rbd_dev) 493ed95b21aSIlya Dryomov { 494ed95b21aSIlya Dryomov bool is_lock_owner; 495ed95b21aSIlya Dryomov 496ed95b21aSIlya Dryomov down_read(&rbd_dev->lock_rwsem); 497ed95b21aSIlya Dryomov is_lock_owner = __rbd_is_lock_owner(rbd_dev); 498ed95b21aSIlya Dryomov up_read(&rbd_dev->lock_rwsem); 499ed95b21aSIlya Dryomov return is_lock_owner; 500ed95b21aSIlya Dryomov } 501ed95b21aSIlya Dryomov 502b15a21ddSGreg Kroah-Hartman static BUS_ATTR(add, S_IWUSR, NULL, rbd_add); 503b15a21ddSGreg Kroah-Hartman static BUS_ATTR(remove, S_IWUSR, NULL, rbd_remove); 5049b60e70bSIlya Dryomov static BUS_ATTR(add_single_major, S_IWUSR, NULL, rbd_add_single_major); 5059b60e70bSIlya Dryomov static BUS_ATTR(remove_single_major, S_IWUSR, NULL, rbd_remove_single_major); 506b15a21ddSGreg Kroah-Hartman 507b15a21ddSGreg Kroah-Hartman static struct attribute *rbd_bus_attrs[] = { 508b15a21ddSGreg Kroah-Hartman &bus_attr_add.attr, 509b15a21ddSGreg Kroah-Hartman &bus_attr_remove.attr, 5109b60e70bSIlya Dryomov &bus_attr_add_single_major.attr, 5119b60e70bSIlya Dryomov &bus_attr_remove_single_major.attr, 512b15a21ddSGreg Kroah-Hartman NULL, 513f0f8cef5SAlex Elder }; 51492c76dc0SIlya Dryomov 51592c76dc0SIlya Dryomov static umode_t rbd_bus_is_visible(struct kobject *kobj, 51692c76dc0SIlya 
Dryomov struct attribute *attr, int index) 51792c76dc0SIlya Dryomov { 5189b60e70bSIlya Dryomov if (!single_major && 5199b60e70bSIlya Dryomov (attr == &bus_attr_add_single_major.attr || 5209b60e70bSIlya Dryomov attr == &bus_attr_remove_single_major.attr)) 5219b60e70bSIlya Dryomov return 0; 5229b60e70bSIlya Dryomov 52392c76dc0SIlya Dryomov return attr->mode; 52492c76dc0SIlya Dryomov } 52592c76dc0SIlya Dryomov 52692c76dc0SIlya Dryomov static const struct attribute_group rbd_bus_group = { 52792c76dc0SIlya Dryomov .attrs = rbd_bus_attrs, 52892c76dc0SIlya Dryomov .is_visible = rbd_bus_is_visible, 52992c76dc0SIlya Dryomov }; 53092c76dc0SIlya Dryomov __ATTRIBUTE_GROUPS(rbd_bus); 531f0f8cef5SAlex Elder 532f0f8cef5SAlex Elder static struct bus_type rbd_bus_type = { 533f0f8cef5SAlex Elder .name = "rbd", 534b15a21ddSGreg Kroah-Hartman .bus_groups = rbd_bus_groups, 535f0f8cef5SAlex Elder }; 536f0f8cef5SAlex Elder 537f0f8cef5SAlex Elder static void rbd_root_dev_release(struct device *dev) 538f0f8cef5SAlex Elder { 539f0f8cef5SAlex Elder } 540f0f8cef5SAlex Elder 541f0f8cef5SAlex Elder static struct device rbd_root_dev = { 542f0f8cef5SAlex Elder .init_name = "rbd", 543f0f8cef5SAlex Elder .release = rbd_root_dev_release, 544f0f8cef5SAlex Elder }; 545f0f8cef5SAlex Elder 54606ecc6cbSAlex Elder static __printf(2, 3) 54706ecc6cbSAlex Elder void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...) 
54806ecc6cbSAlex Elder { 54906ecc6cbSAlex Elder struct va_format vaf; 55006ecc6cbSAlex Elder va_list args; 55106ecc6cbSAlex Elder 55206ecc6cbSAlex Elder va_start(args, fmt); 55306ecc6cbSAlex Elder vaf.fmt = fmt; 55406ecc6cbSAlex Elder vaf.va = &args; 55506ecc6cbSAlex Elder 55606ecc6cbSAlex Elder if (!rbd_dev) 55706ecc6cbSAlex Elder printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf); 55806ecc6cbSAlex Elder else if (rbd_dev->disk) 55906ecc6cbSAlex Elder printk(KERN_WARNING "%s: %s: %pV\n", 56006ecc6cbSAlex Elder RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf); 56106ecc6cbSAlex Elder else if (rbd_dev->spec && rbd_dev->spec->image_name) 56206ecc6cbSAlex Elder printk(KERN_WARNING "%s: image %s: %pV\n", 56306ecc6cbSAlex Elder RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf); 56406ecc6cbSAlex Elder else if (rbd_dev->spec && rbd_dev->spec->image_id) 56506ecc6cbSAlex Elder printk(KERN_WARNING "%s: id %s: %pV\n", 56606ecc6cbSAlex Elder RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf); 56706ecc6cbSAlex Elder else /* punt */ 56806ecc6cbSAlex Elder printk(KERN_WARNING "%s: rbd_dev %p: %pV\n", 56906ecc6cbSAlex Elder RBD_DRV_NAME, rbd_dev, &vaf); 57006ecc6cbSAlex Elder va_end(args); 57106ecc6cbSAlex Elder } 57206ecc6cbSAlex Elder 573aafb230eSAlex Elder #ifdef RBD_DEBUG 574aafb230eSAlex Elder #define rbd_assert(expr) \ 575aafb230eSAlex Elder if (unlikely(!(expr))) { \ 576aafb230eSAlex Elder printk(KERN_ERR "\nAssertion failure in %s() " \ 577aafb230eSAlex Elder "at line %d:\n\n" \ 578aafb230eSAlex Elder "\trbd_assert(%s);\n\n", \ 579aafb230eSAlex Elder __func__, __LINE__, #expr); \ 580aafb230eSAlex Elder BUG(); \ 581aafb230eSAlex Elder } 582aafb230eSAlex Elder #else /* !RBD_DEBUG */ 583aafb230eSAlex Elder # define rbd_assert(expr) ((void) 0) 584aafb230eSAlex Elder #endif /* !RBD_DEBUG */ 585dfc5606dSYehuda Sadeh 5862761713dSIlya Dryomov static void rbd_osd_copyup_callback(struct rbd_obj_request *obj_request); 587b454e36dSAlex Elder static int rbd_img_obj_request_submit(struct 
rbd_obj_request *obj_request); 58805a46afdSAlex Elder static void rbd_img_parent_read(struct rbd_obj_request *obj_request); 58905a46afdSAlex Elder static void rbd_dev_remove_parent(struct rbd_device *rbd_dev); 5908b3e1a56SAlex Elder 591cc4a38bdSAlex Elder static int rbd_dev_refresh(struct rbd_device *rbd_dev); 5922df3fac7SAlex Elder static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev); 593a720ae09SIlya Dryomov static int rbd_dev_header_info(struct rbd_device *rbd_dev); 594e8f59b59SIlya Dryomov static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev); 59554cac61fSAlex Elder static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, 59654cac61fSAlex Elder u64 snap_id); 5972ad3d716SAlex Elder static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id, 5982ad3d716SAlex Elder u8 *order, u64 *snap_size); 5992ad3d716SAlex Elder static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id, 6002ad3d716SAlex Elder u64 *snap_features); 60159c2be1eSYehuda Sadeh 602602adf40SYehuda Sadeh static int rbd_open(struct block_device *bdev, fmode_t mode) 603602adf40SYehuda Sadeh { 604f0f8cef5SAlex Elder struct rbd_device *rbd_dev = bdev->bd_disk->private_data; 605b82d167bSAlex Elder bool removing = false; 606602adf40SYehuda Sadeh 607f84344f3SAlex Elder if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only) 608602adf40SYehuda Sadeh return -EROFS; 609602adf40SYehuda Sadeh 610a14ea269SAlex Elder spin_lock_irq(&rbd_dev->lock); 611b82d167bSAlex Elder if (test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags)) 612b82d167bSAlex Elder removing = true; 613b82d167bSAlex Elder else 614b82d167bSAlex Elder rbd_dev->open_count++; 615a14ea269SAlex Elder spin_unlock_irq(&rbd_dev->lock); 616b82d167bSAlex Elder if (removing) 617b82d167bSAlex Elder return -ENOENT; 618b82d167bSAlex Elder 619c3e946ceSAlex Elder (void) get_device(&rbd_dev->dev); 620340c7a2bSAlex Elder 621602adf40SYehuda Sadeh return 0; 622602adf40SYehuda Sadeh } 623602adf40SYehuda Sadeh 
624db2a144bSAl Viro static void rbd_release(struct gendisk *disk, fmode_t mode) 625dfc5606dSYehuda Sadeh { 626dfc5606dSYehuda Sadeh struct rbd_device *rbd_dev = disk->private_data; 627b82d167bSAlex Elder unsigned long open_count_before; 628b82d167bSAlex Elder 629a14ea269SAlex Elder spin_lock_irq(&rbd_dev->lock); 630b82d167bSAlex Elder open_count_before = rbd_dev->open_count--; 631a14ea269SAlex Elder spin_unlock_irq(&rbd_dev->lock); 632b82d167bSAlex Elder rbd_assert(open_count_before > 0); 633dfc5606dSYehuda Sadeh 634c3e946ceSAlex Elder put_device(&rbd_dev->dev); 635dfc5606dSYehuda Sadeh } 636dfc5606dSYehuda Sadeh 637131fd9f6SGuangliang Zhao static int rbd_ioctl_set_ro(struct rbd_device *rbd_dev, unsigned long arg) 638131fd9f6SGuangliang Zhao { 63977f33c03SJosh Durgin int ret = 0; 640131fd9f6SGuangliang Zhao int val; 641131fd9f6SGuangliang Zhao bool ro; 64277f33c03SJosh Durgin bool ro_changed = false; 643131fd9f6SGuangliang Zhao 64477f33c03SJosh Durgin /* get_user() may sleep, so call it before taking rbd_dev->lock */ 645131fd9f6SGuangliang Zhao if (get_user(val, (int __user *)(arg))) 646131fd9f6SGuangliang Zhao return -EFAULT; 647131fd9f6SGuangliang Zhao 648131fd9f6SGuangliang Zhao ro = val ? 
true : false; 649131fd9f6SGuangliang Zhao /* Snapshot doesn't allow to write*/ 650131fd9f6SGuangliang Zhao if (rbd_dev->spec->snap_id != CEPH_NOSNAP && !ro) 651131fd9f6SGuangliang Zhao return -EROFS; 652131fd9f6SGuangliang Zhao 65377f33c03SJosh Durgin spin_lock_irq(&rbd_dev->lock); 65477f33c03SJosh Durgin /* prevent others open this device */ 65577f33c03SJosh Durgin if (rbd_dev->open_count > 1) { 65677f33c03SJosh Durgin ret = -EBUSY; 65777f33c03SJosh Durgin goto out; 658131fd9f6SGuangliang Zhao } 659131fd9f6SGuangliang Zhao 66077f33c03SJosh Durgin if (rbd_dev->mapping.read_only != ro) { 66177f33c03SJosh Durgin rbd_dev->mapping.read_only = ro; 66277f33c03SJosh Durgin ro_changed = true; 66377f33c03SJosh Durgin } 66477f33c03SJosh Durgin 66577f33c03SJosh Durgin out: 66677f33c03SJosh Durgin spin_unlock_irq(&rbd_dev->lock); 66777f33c03SJosh Durgin /* set_disk_ro() may sleep, so call it after releasing rbd_dev->lock */ 66877f33c03SJosh Durgin if (ret == 0 && ro_changed) 66977f33c03SJosh Durgin set_disk_ro(rbd_dev->disk, ro ? 
1 : 0); 67077f33c03SJosh Durgin 67177f33c03SJosh Durgin return ret; 672131fd9f6SGuangliang Zhao } 673131fd9f6SGuangliang Zhao 674131fd9f6SGuangliang Zhao static int rbd_ioctl(struct block_device *bdev, fmode_t mode, 675131fd9f6SGuangliang Zhao unsigned int cmd, unsigned long arg) 676131fd9f6SGuangliang Zhao { 677131fd9f6SGuangliang Zhao struct rbd_device *rbd_dev = bdev->bd_disk->private_data; 678131fd9f6SGuangliang Zhao int ret = 0; 679131fd9f6SGuangliang Zhao 680131fd9f6SGuangliang Zhao switch (cmd) { 681131fd9f6SGuangliang Zhao case BLKROSET: 682131fd9f6SGuangliang Zhao ret = rbd_ioctl_set_ro(rbd_dev, arg); 683131fd9f6SGuangliang Zhao break; 684131fd9f6SGuangliang Zhao default: 685131fd9f6SGuangliang Zhao ret = -ENOTTY; 686131fd9f6SGuangliang Zhao } 687131fd9f6SGuangliang Zhao 688131fd9f6SGuangliang Zhao return ret; 689131fd9f6SGuangliang Zhao } 690131fd9f6SGuangliang Zhao 691131fd9f6SGuangliang Zhao #ifdef CONFIG_COMPAT 692131fd9f6SGuangliang Zhao static int rbd_compat_ioctl(struct block_device *bdev, fmode_t mode, 693131fd9f6SGuangliang Zhao unsigned int cmd, unsigned long arg) 694131fd9f6SGuangliang Zhao { 695131fd9f6SGuangliang Zhao return rbd_ioctl(bdev, mode, cmd, arg); 696131fd9f6SGuangliang Zhao } 697131fd9f6SGuangliang Zhao #endif /* CONFIG_COMPAT */ 698131fd9f6SGuangliang Zhao 699602adf40SYehuda Sadeh static const struct block_device_operations rbd_bd_ops = { 700602adf40SYehuda Sadeh .owner = THIS_MODULE, 701602adf40SYehuda Sadeh .open = rbd_open, 702dfc5606dSYehuda Sadeh .release = rbd_release, 703131fd9f6SGuangliang Zhao .ioctl = rbd_ioctl, 704131fd9f6SGuangliang Zhao #ifdef CONFIG_COMPAT 705131fd9f6SGuangliang Zhao .compat_ioctl = rbd_compat_ioctl, 706131fd9f6SGuangliang Zhao #endif 707602adf40SYehuda Sadeh }; 708602adf40SYehuda Sadeh 709602adf40SYehuda Sadeh /* 7107262cfcaSAlex Elder * Initialize an rbd client instance. Success or not, this function 711cfbf6377SAlex Elder * consumes ceph_opts. Caller holds client_mutex. 
712602adf40SYehuda Sadeh */ 713f8c38929SAlex Elder static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts) 714602adf40SYehuda Sadeh { 715602adf40SYehuda Sadeh struct rbd_client *rbdc; 716602adf40SYehuda Sadeh int ret = -ENOMEM; 717602adf40SYehuda Sadeh 71837206ee5SAlex Elder dout("%s:\n", __func__); 719602adf40SYehuda Sadeh rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL); 720602adf40SYehuda Sadeh if (!rbdc) 721602adf40SYehuda Sadeh goto out_opt; 722602adf40SYehuda Sadeh 723602adf40SYehuda Sadeh kref_init(&rbdc->kref); 724602adf40SYehuda Sadeh INIT_LIST_HEAD(&rbdc->node); 725602adf40SYehuda Sadeh 72643ae4701SAlex Elder rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0); 727602adf40SYehuda Sadeh if (IS_ERR(rbdc->client)) 72808f75463SAlex Elder goto out_rbdc; 72943ae4701SAlex Elder ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */ 730602adf40SYehuda Sadeh 731602adf40SYehuda Sadeh ret = ceph_open_session(rbdc->client); 732602adf40SYehuda Sadeh if (ret < 0) 73308f75463SAlex Elder goto out_client; 734602adf40SYehuda Sadeh 735432b8587SAlex Elder spin_lock(&rbd_client_list_lock); 736602adf40SYehuda Sadeh list_add_tail(&rbdc->node, &rbd_client_list); 737432b8587SAlex Elder spin_unlock(&rbd_client_list_lock); 738602adf40SYehuda Sadeh 73937206ee5SAlex Elder dout("%s: rbdc %p\n", __func__, rbdc); 740bc534d86SAlex Elder 741602adf40SYehuda Sadeh return rbdc; 74208f75463SAlex Elder out_client: 743602adf40SYehuda Sadeh ceph_destroy_client(rbdc->client); 74408f75463SAlex Elder out_rbdc: 745602adf40SYehuda Sadeh kfree(rbdc); 746602adf40SYehuda Sadeh out_opt: 74743ae4701SAlex Elder if (ceph_opts) 74843ae4701SAlex Elder ceph_destroy_options(ceph_opts); 74937206ee5SAlex Elder dout("%s: error %d\n", __func__, ret); 75037206ee5SAlex Elder 75128f259b7SVasiliy Kulikov return ERR_PTR(ret); 752602adf40SYehuda Sadeh } 753602adf40SYehuda Sadeh 7542f82ee54SAlex Elder static struct rbd_client *__rbd_get_client(struct rbd_client *rbdc) 
7552f82ee54SAlex Elder { 7562f82ee54SAlex Elder kref_get(&rbdc->kref); 7572f82ee54SAlex Elder 7582f82ee54SAlex Elder return rbdc; 7592f82ee54SAlex Elder } 7602f82ee54SAlex Elder 761602adf40SYehuda Sadeh /* 7621f7ba331SAlex Elder * Find a ceph client with specific addr and configuration. If 7631f7ba331SAlex Elder * found, bump its reference count. 764602adf40SYehuda Sadeh */ 7651f7ba331SAlex Elder static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts) 766602adf40SYehuda Sadeh { 767602adf40SYehuda Sadeh struct rbd_client *client_node; 7681f7ba331SAlex Elder bool found = false; 769602adf40SYehuda Sadeh 77043ae4701SAlex Elder if (ceph_opts->flags & CEPH_OPT_NOSHARE) 771602adf40SYehuda Sadeh return NULL; 772602adf40SYehuda Sadeh 7731f7ba331SAlex Elder spin_lock(&rbd_client_list_lock); 7741f7ba331SAlex Elder list_for_each_entry(client_node, &rbd_client_list, node) { 7751f7ba331SAlex Elder if (!ceph_compare_options(ceph_opts, client_node->client)) { 7762f82ee54SAlex Elder __rbd_get_client(client_node); 7772f82ee54SAlex Elder 7781f7ba331SAlex Elder found = true; 7791f7ba331SAlex Elder break; 7801f7ba331SAlex Elder } 7811f7ba331SAlex Elder } 7821f7ba331SAlex Elder spin_unlock(&rbd_client_list_lock); 7831f7ba331SAlex Elder 7841f7ba331SAlex Elder return found ? 
client_node : NULL; 785602adf40SYehuda Sadeh } 786602adf40SYehuda Sadeh 787602adf40SYehuda Sadeh /* 788210c104cSIlya Dryomov * (Per device) rbd map options 78959c2be1eSYehuda Sadeh */ 79059c2be1eSYehuda Sadeh enum { 791b5584180SIlya Dryomov Opt_queue_depth, 79259c2be1eSYehuda Sadeh Opt_last_int, 79359c2be1eSYehuda Sadeh /* int args above */ 79459c2be1eSYehuda Sadeh Opt_last_string, 79559c2be1eSYehuda Sadeh /* string args above */ 796cc0538b6SAlex Elder Opt_read_only, 797cc0538b6SAlex Elder Opt_read_write, 79880de1912SIlya Dryomov Opt_lock_on_read, 799210c104cSIlya Dryomov Opt_err 80059c2be1eSYehuda Sadeh }; 80159c2be1eSYehuda Sadeh 80243ae4701SAlex Elder static match_table_t rbd_opts_tokens = { 803b5584180SIlya Dryomov {Opt_queue_depth, "queue_depth=%d"}, 80459c2be1eSYehuda Sadeh /* int args above */ 80559c2be1eSYehuda Sadeh /* string args above */ 806be466c1cSAlex Elder {Opt_read_only, "read_only"}, 807cc0538b6SAlex Elder {Opt_read_only, "ro"}, /* Alternate spelling */ 808cc0538b6SAlex Elder {Opt_read_write, "read_write"}, 809cc0538b6SAlex Elder {Opt_read_write, "rw"}, /* Alternate spelling */ 81080de1912SIlya Dryomov {Opt_lock_on_read, "lock_on_read"}, 811210c104cSIlya Dryomov {Opt_err, NULL} 81259c2be1eSYehuda Sadeh }; 81359c2be1eSYehuda Sadeh 81498571b5aSAlex Elder struct rbd_options { 815b5584180SIlya Dryomov int queue_depth; 81698571b5aSAlex Elder bool read_only; 81780de1912SIlya Dryomov bool lock_on_read; 81898571b5aSAlex Elder }; 81998571b5aSAlex Elder 820b5584180SIlya Dryomov #define RBD_QUEUE_DEPTH_DEFAULT BLKDEV_MAX_RQ 82198571b5aSAlex Elder #define RBD_READ_ONLY_DEFAULT false 82280de1912SIlya Dryomov #define RBD_LOCK_ON_READ_DEFAULT false 82398571b5aSAlex Elder 82459c2be1eSYehuda Sadeh static int parse_rbd_opts_token(char *c, void *private) 82559c2be1eSYehuda Sadeh { 82643ae4701SAlex Elder struct rbd_options *rbd_opts = private; 82759c2be1eSYehuda Sadeh substring_t argstr[MAX_OPT_ARGS]; 82859c2be1eSYehuda Sadeh int token, intval, ret; 82959c2be1eSYehuda 
Sadeh 83043ae4701SAlex Elder token = match_token(c, rbd_opts_tokens, argstr); 83159c2be1eSYehuda Sadeh if (token < Opt_last_int) { 83259c2be1eSYehuda Sadeh ret = match_int(&argstr[0], &intval); 83359c2be1eSYehuda Sadeh if (ret < 0) { 834210c104cSIlya Dryomov pr_err("bad mount option arg (not int) at '%s'\n", c); 83559c2be1eSYehuda Sadeh return ret; 83659c2be1eSYehuda Sadeh } 83759c2be1eSYehuda Sadeh dout("got int token %d val %d\n", token, intval); 83859c2be1eSYehuda Sadeh } else if (token > Opt_last_int && token < Opt_last_string) { 839210c104cSIlya Dryomov dout("got string token %d val %s\n", token, argstr[0].from); 84059c2be1eSYehuda Sadeh } else { 84159c2be1eSYehuda Sadeh dout("got token %d\n", token); 84259c2be1eSYehuda Sadeh } 84359c2be1eSYehuda Sadeh 84459c2be1eSYehuda Sadeh switch (token) { 845b5584180SIlya Dryomov case Opt_queue_depth: 846b5584180SIlya Dryomov if (intval < 1) { 847b5584180SIlya Dryomov pr_err("queue_depth out of range\n"); 848b5584180SIlya Dryomov return -EINVAL; 849b5584180SIlya Dryomov } 850b5584180SIlya Dryomov rbd_opts->queue_depth = intval; 851b5584180SIlya Dryomov break; 852cc0538b6SAlex Elder case Opt_read_only: 853cc0538b6SAlex Elder rbd_opts->read_only = true; 854cc0538b6SAlex Elder break; 855cc0538b6SAlex Elder case Opt_read_write: 856cc0538b6SAlex Elder rbd_opts->read_only = false; 857cc0538b6SAlex Elder break; 85880de1912SIlya Dryomov case Opt_lock_on_read: 85980de1912SIlya Dryomov rbd_opts->lock_on_read = true; 86080de1912SIlya Dryomov break; 86159c2be1eSYehuda Sadeh default: 862210c104cSIlya Dryomov /* libceph prints "bad option" msg */ 863210c104cSIlya Dryomov return -EINVAL; 86459c2be1eSYehuda Sadeh } 865210c104cSIlya Dryomov 86659c2be1eSYehuda Sadeh return 0; 86759c2be1eSYehuda Sadeh } 86859c2be1eSYehuda Sadeh 8696d2940c8SGuangliang Zhao static char* obj_op_name(enum obj_operation_type op_type) 8706d2940c8SGuangliang Zhao { 8716d2940c8SGuangliang Zhao switch (op_type) { 8726d2940c8SGuangliang Zhao case OBJ_OP_READ: 
8736d2940c8SGuangliang Zhao return "read"; 8746d2940c8SGuangliang Zhao case OBJ_OP_WRITE: 8756d2940c8SGuangliang Zhao return "write"; 87690e98c52SGuangliang Zhao case OBJ_OP_DISCARD: 87790e98c52SGuangliang Zhao return "discard"; 8786d2940c8SGuangliang Zhao default: 8796d2940c8SGuangliang Zhao return "???"; 8806d2940c8SGuangliang Zhao } 8816d2940c8SGuangliang Zhao } 8826d2940c8SGuangliang Zhao 88359c2be1eSYehuda Sadeh /* 884602adf40SYehuda Sadeh * Get a ceph client with specific addr and configuration, if one does 8857262cfcaSAlex Elder * not exist create it. Either way, ceph_opts is consumed by this 8867262cfcaSAlex Elder * function. 887602adf40SYehuda Sadeh */ 8889d3997fdSAlex Elder static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts) 889602adf40SYehuda Sadeh { 890f8c38929SAlex Elder struct rbd_client *rbdc; 89159c2be1eSYehuda Sadeh 892cfbf6377SAlex Elder mutex_lock_nested(&client_mutex, SINGLE_DEPTH_NESTING); 8931f7ba331SAlex Elder rbdc = rbd_client_find(ceph_opts); 8949d3997fdSAlex Elder if (rbdc) /* using an existing client */ 89543ae4701SAlex Elder ceph_destroy_options(ceph_opts); 8969d3997fdSAlex Elder else 897f8c38929SAlex Elder rbdc = rbd_client_create(ceph_opts); 898cfbf6377SAlex Elder mutex_unlock(&client_mutex); 899d720bcb0SAlex Elder 9009d3997fdSAlex Elder return rbdc; 901602adf40SYehuda Sadeh } 902602adf40SYehuda Sadeh 903602adf40SYehuda Sadeh /* 904602adf40SYehuda Sadeh * Destroy ceph client 905d23a4b3fSAlex Elder * 906432b8587SAlex Elder * Caller must hold rbd_client_list_lock. 
907602adf40SYehuda Sadeh */ 908602adf40SYehuda Sadeh static void rbd_client_release(struct kref *kref) 909602adf40SYehuda Sadeh { 910602adf40SYehuda Sadeh struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref); 911602adf40SYehuda Sadeh 91237206ee5SAlex Elder dout("%s: rbdc %p\n", __func__, rbdc); 913cd9d9f5dSAlex Elder spin_lock(&rbd_client_list_lock); 914602adf40SYehuda Sadeh list_del(&rbdc->node); 915cd9d9f5dSAlex Elder spin_unlock(&rbd_client_list_lock); 916602adf40SYehuda Sadeh 917602adf40SYehuda Sadeh ceph_destroy_client(rbdc->client); 918602adf40SYehuda Sadeh kfree(rbdc); 919602adf40SYehuda Sadeh } 920602adf40SYehuda Sadeh 921602adf40SYehuda Sadeh /* 922602adf40SYehuda Sadeh * Drop reference to ceph client node. If it's not referenced anymore, release 923602adf40SYehuda Sadeh * it. 924602adf40SYehuda Sadeh */ 9259d3997fdSAlex Elder static void rbd_put_client(struct rbd_client *rbdc) 926602adf40SYehuda Sadeh { 927c53d5893SAlex Elder if (rbdc) 9289d3997fdSAlex Elder kref_put(&rbdc->kref, rbd_client_release); 929602adf40SYehuda Sadeh } 930602adf40SYehuda Sadeh 931a30b71b9SAlex Elder static bool rbd_image_format_valid(u32 image_format) 932a30b71b9SAlex Elder { 933a30b71b9SAlex Elder return image_format == 1 || image_format == 2; 934a30b71b9SAlex Elder } 935a30b71b9SAlex Elder 9368e94af8eSAlex Elder static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk) 9378e94af8eSAlex Elder { 938103a150fSAlex Elder size_t size; 939103a150fSAlex Elder u32 snap_count; 940103a150fSAlex Elder 941103a150fSAlex Elder /* The header has to start with the magic rbd header text */ 942103a150fSAlex Elder if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT))) 943103a150fSAlex Elder return false; 944103a150fSAlex Elder 945db2388b6SAlex Elder /* The bio layer requires at least sector-sized I/O */ 946db2388b6SAlex Elder 947db2388b6SAlex Elder if (ondisk->options.order < SECTOR_SHIFT) 948db2388b6SAlex Elder return false; 949db2388b6SAlex Elder 
950db2388b6SAlex Elder /* If we use u64 in a few spots we may be able to loosen this */ 951db2388b6SAlex Elder 952db2388b6SAlex Elder if (ondisk->options.order > 8 * sizeof (int) - 1) 953db2388b6SAlex Elder return false; 954db2388b6SAlex Elder 955103a150fSAlex Elder /* 956103a150fSAlex Elder * The size of a snapshot header has to fit in a size_t, and 957103a150fSAlex Elder * that limits the number of snapshots. 958103a150fSAlex Elder */ 959103a150fSAlex Elder snap_count = le32_to_cpu(ondisk->snap_count); 960103a150fSAlex Elder size = SIZE_MAX - sizeof (struct ceph_snap_context); 961103a150fSAlex Elder if (snap_count > size / sizeof (__le64)) 962103a150fSAlex Elder return false; 963103a150fSAlex Elder 964103a150fSAlex Elder /* 965103a150fSAlex Elder * Not only that, but the size of the entire the snapshot 966103a150fSAlex Elder * header must also be representable in a size_t. 967103a150fSAlex Elder */ 968103a150fSAlex Elder size -= snap_count * sizeof (__le64); 969103a150fSAlex Elder if ((u64) size < le64_to_cpu(ondisk->snap_names_len)) 970103a150fSAlex Elder return false; 971103a150fSAlex Elder 972103a150fSAlex Elder return true; 9738e94af8eSAlex Elder } 9748e94af8eSAlex Elder 975602adf40SYehuda Sadeh /* 9765bc3fb17SIlya Dryomov * returns the size of an object in the image 9775bc3fb17SIlya Dryomov */ 9785bc3fb17SIlya Dryomov static u32 rbd_obj_bytes(struct rbd_image_header *header) 9795bc3fb17SIlya Dryomov { 9805bc3fb17SIlya Dryomov return 1U << header->obj_order; 9815bc3fb17SIlya Dryomov } 9825bc3fb17SIlya Dryomov 983263423f8SIlya Dryomov static void rbd_init_layout(struct rbd_device *rbd_dev) 984263423f8SIlya Dryomov { 985263423f8SIlya Dryomov if (rbd_dev->header.stripe_unit == 0 || 986263423f8SIlya Dryomov rbd_dev->header.stripe_count == 0) { 987263423f8SIlya Dryomov rbd_dev->header.stripe_unit = rbd_obj_bytes(&rbd_dev->header); 988263423f8SIlya Dryomov rbd_dev->header.stripe_count = 1; 989263423f8SIlya Dryomov } 990263423f8SIlya Dryomov 991263423f8SIlya Dryomov 
rbd_dev->layout.stripe_unit = rbd_dev->header.stripe_unit; 992263423f8SIlya Dryomov rbd_dev->layout.stripe_count = rbd_dev->header.stripe_count; 993263423f8SIlya Dryomov rbd_dev->layout.object_size = rbd_obj_bytes(&rbd_dev->header); 9947e97332eSIlya Dryomov rbd_dev->layout.pool_id = rbd_dev->header.data_pool_id == CEPH_NOPOOL ? 9957e97332eSIlya Dryomov rbd_dev->spec->pool_id : rbd_dev->header.data_pool_id; 996263423f8SIlya Dryomov RCU_INIT_POINTER(rbd_dev->layout.pool_ns, NULL); 997263423f8SIlya Dryomov } 998263423f8SIlya Dryomov 9995bc3fb17SIlya Dryomov /* 1000bb23e37aSAlex Elder * Fill an rbd image header with information from the given format 1 1001bb23e37aSAlex Elder * on-disk header. 1002602adf40SYehuda Sadeh */ 1003662518b1SAlex Elder static int rbd_header_from_disk(struct rbd_device *rbd_dev, 10044156d998SAlex Elder struct rbd_image_header_ondisk *ondisk) 1005602adf40SYehuda Sadeh { 1006662518b1SAlex Elder struct rbd_image_header *header = &rbd_dev->header; 1007bb23e37aSAlex Elder bool first_time = header->object_prefix == NULL; 1008bb23e37aSAlex Elder struct ceph_snap_context *snapc; 1009bb23e37aSAlex Elder char *object_prefix = NULL; 1010bb23e37aSAlex Elder char *snap_names = NULL; 1011bb23e37aSAlex Elder u64 *snap_sizes = NULL; 1012ccece235SAlex Elder u32 snap_count; 1013bb23e37aSAlex Elder int ret = -ENOMEM; 1014621901d6SAlex Elder u32 i; 1015602adf40SYehuda Sadeh 1016bb23e37aSAlex Elder /* Allocate this now to avoid having to handle failure below */ 1017103a150fSAlex Elder 1018bb23e37aSAlex Elder if (first_time) { 1019848d796cSIlya Dryomov object_prefix = kstrndup(ondisk->object_prefix, 1020848d796cSIlya Dryomov sizeof(ondisk->object_prefix), 1021848d796cSIlya Dryomov GFP_KERNEL); 1022bb23e37aSAlex Elder if (!object_prefix) 1023602adf40SYehuda Sadeh return -ENOMEM; 1024bb23e37aSAlex Elder } 102500f1f36fSAlex Elder 1026bb23e37aSAlex Elder /* Allocate the snapshot context and fill it in */ 1027d2bb24e5SAlex Elder 1028602adf40SYehuda Sadeh snap_count = 
le32_to_cpu(ondisk->snap_count); 1029bb23e37aSAlex Elder snapc = ceph_create_snap_context(snap_count, GFP_KERNEL); 1030bb23e37aSAlex Elder if (!snapc) 1031bb23e37aSAlex Elder goto out_err; 1032bb23e37aSAlex Elder snapc->seq = le64_to_cpu(ondisk->snap_seq); 1033602adf40SYehuda Sadeh if (snap_count) { 1034bb23e37aSAlex Elder struct rbd_image_snap_ondisk *snaps; 1035f785cc1dSAlex Elder u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len); 1036f785cc1dSAlex Elder 1037bb23e37aSAlex Elder /* We'll keep a copy of the snapshot names... */ 1038621901d6SAlex Elder 1039f785cc1dSAlex Elder if (snap_names_len > (u64)SIZE_MAX) 1040bb23e37aSAlex Elder goto out_2big; 1041bb23e37aSAlex Elder snap_names = kmalloc(snap_names_len, GFP_KERNEL); 1042bb23e37aSAlex Elder if (!snap_names) 1043602adf40SYehuda Sadeh goto out_err; 1044bb23e37aSAlex Elder 1045bb23e37aSAlex Elder /* ...as well as the array of their sizes. */ 104688a25a5fSMarkus Elfring snap_sizes = kmalloc_array(snap_count, 104788a25a5fSMarkus Elfring sizeof(*header->snap_sizes), 104888a25a5fSMarkus Elfring GFP_KERNEL); 1049bb23e37aSAlex Elder if (!snap_sizes) 1050bb23e37aSAlex Elder goto out_err; 1051bb23e37aSAlex Elder 1052f785cc1dSAlex Elder /* 1053bb23e37aSAlex Elder * Copy the names, and fill in each snapshot's id 1054bb23e37aSAlex Elder * and size. 1055bb23e37aSAlex Elder * 105699a41ebcSAlex Elder * Note that rbd_dev_v1_header_info() guarantees the 1057bb23e37aSAlex Elder * ondisk buffer we're working with has 1058f785cc1dSAlex Elder * snap_names_len bytes beyond the end of the 1059f785cc1dSAlex Elder * snapshot id array, this memcpy() is safe. 
1060f785cc1dSAlex Elder */ 1061bb23e37aSAlex Elder memcpy(snap_names, &ondisk->snaps[snap_count], snap_names_len); 1062bb23e37aSAlex Elder snaps = ondisk->snaps; 1063bb23e37aSAlex Elder for (i = 0; i < snap_count; i++) { 1064bb23e37aSAlex Elder snapc->snaps[i] = le64_to_cpu(snaps[i].id); 1065bb23e37aSAlex Elder snap_sizes[i] = le64_to_cpu(snaps[i].image_size); 1066bb23e37aSAlex Elder } 1067602adf40SYehuda Sadeh } 1068849b4260SAlex Elder 1069bb23e37aSAlex Elder /* We won't fail any more, fill in the header */ 1070bb23e37aSAlex Elder 1071bb23e37aSAlex Elder if (first_time) { 1072bb23e37aSAlex Elder header->object_prefix = object_prefix; 1073602adf40SYehuda Sadeh header->obj_order = ondisk->options.order; 1074263423f8SIlya Dryomov rbd_init_layout(rbd_dev); 1075662518b1SAlex Elder } else { 1076662518b1SAlex Elder ceph_put_snap_context(header->snapc); 1077662518b1SAlex Elder kfree(header->snap_names); 1078662518b1SAlex Elder kfree(header->snap_sizes); 1079bb23e37aSAlex Elder } 10806a52325fSAlex Elder 1081bb23e37aSAlex Elder /* The remaining fields always get updated (when we refresh) */ 1082621901d6SAlex Elder 1083f84344f3SAlex Elder header->image_size = le64_to_cpu(ondisk->image_size); 1084bb23e37aSAlex Elder header->snapc = snapc; 1085bb23e37aSAlex Elder header->snap_names = snap_names; 1086bb23e37aSAlex Elder header->snap_sizes = snap_sizes; 1087468521c1SAlex Elder 1088602adf40SYehuda Sadeh return 0; 1089bb23e37aSAlex Elder out_2big: 1090bb23e37aSAlex Elder ret = -EIO; 10916a52325fSAlex Elder out_err: 1092bb23e37aSAlex Elder kfree(snap_sizes); 1093bb23e37aSAlex Elder kfree(snap_names); 1094bb23e37aSAlex Elder ceph_put_snap_context(snapc); 1095bb23e37aSAlex Elder kfree(object_prefix); 1096ccece235SAlex Elder 1097bb23e37aSAlex Elder return ret; 1098602adf40SYehuda Sadeh } 1099602adf40SYehuda Sadeh 11009682fc6dSAlex Elder static const char *_rbd_dev_v1_snap_name(struct rbd_device *rbd_dev, u32 which) 11019682fc6dSAlex Elder { 11029682fc6dSAlex Elder const char 
*snap_name; 11039682fc6dSAlex Elder 11049682fc6dSAlex Elder rbd_assert(which < rbd_dev->header.snapc->num_snaps); 11059682fc6dSAlex Elder 11069682fc6dSAlex Elder /* Skip over names until we find the one we are looking for */ 11079682fc6dSAlex Elder 11089682fc6dSAlex Elder snap_name = rbd_dev->header.snap_names; 11099682fc6dSAlex Elder while (which--) 11109682fc6dSAlex Elder snap_name += strlen(snap_name) + 1; 11119682fc6dSAlex Elder 11129682fc6dSAlex Elder return kstrdup(snap_name, GFP_KERNEL); 11139682fc6dSAlex Elder } 11149682fc6dSAlex Elder 111530d1cff8SAlex Elder /* 111630d1cff8SAlex Elder * Snapshot id comparison function for use with qsort()/bsearch(). 111730d1cff8SAlex Elder * Note that result is for snapshots in *descending* order. 111830d1cff8SAlex Elder */ 111930d1cff8SAlex Elder static int snapid_compare_reverse(const void *s1, const void *s2) 112030d1cff8SAlex Elder { 112130d1cff8SAlex Elder u64 snap_id1 = *(u64 *)s1; 112230d1cff8SAlex Elder u64 snap_id2 = *(u64 *)s2; 112330d1cff8SAlex Elder 112430d1cff8SAlex Elder if (snap_id1 < snap_id2) 112530d1cff8SAlex Elder return 1; 112630d1cff8SAlex Elder return snap_id1 == snap_id2 ? 0 : -1; 112730d1cff8SAlex Elder } 112830d1cff8SAlex Elder 112930d1cff8SAlex Elder /* 113030d1cff8SAlex Elder * Search a snapshot context to see if the given snapshot id is 113130d1cff8SAlex Elder * present. 113230d1cff8SAlex Elder * 113330d1cff8SAlex Elder * Returns the position of the snapshot id in the array if it's found, 113430d1cff8SAlex Elder * or BAD_SNAP_INDEX otherwise. 113530d1cff8SAlex Elder * 113630d1cff8SAlex Elder * Note: The snapshot array is in kept sorted (by the osd) in 113730d1cff8SAlex Elder * reverse order, highest snapshot id first. 
113830d1cff8SAlex Elder */ 11399682fc6dSAlex Elder static u32 rbd_dev_snap_index(struct rbd_device *rbd_dev, u64 snap_id) 11409682fc6dSAlex Elder { 11419682fc6dSAlex Elder struct ceph_snap_context *snapc = rbd_dev->header.snapc; 114230d1cff8SAlex Elder u64 *found; 11439682fc6dSAlex Elder 114430d1cff8SAlex Elder found = bsearch(&snap_id, &snapc->snaps, snapc->num_snaps, 114530d1cff8SAlex Elder sizeof (snap_id), snapid_compare_reverse); 11469682fc6dSAlex Elder 114730d1cff8SAlex Elder return found ? (u32)(found - &snapc->snaps[0]) : BAD_SNAP_INDEX; 11489682fc6dSAlex Elder } 11499682fc6dSAlex Elder 11502ad3d716SAlex Elder static const char *rbd_dev_v1_snap_name(struct rbd_device *rbd_dev, 11512ad3d716SAlex Elder u64 snap_id) 115254cac61fSAlex Elder { 115354cac61fSAlex Elder u32 which; 1154da6a6b63SJosh Durgin const char *snap_name; 115554cac61fSAlex Elder 115654cac61fSAlex Elder which = rbd_dev_snap_index(rbd_dev, snap_id); 115754cac61fSAlex Elder if (which == BAD_SNAP_INDEX) 1158da6a6b63SJosh Durgin return ERR_PTR(-ENOENT); 115954cac61fSAlex Elder 1160da6a6b63SJosh Durgin snap_name = _rbd_dev_v1_snap_name(rbd_dev, which); 1161da6a6b63SJosh Durgin return snap_name ? 
snap_name : ERR_PTR(-ENOMEM); 116254cac61fSAlex Elder } 116354cac61fSAlex Elder 11649e15b77dSAlex Elder static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id) 11659e15b77dSAlex Elder { 11669e15b77dSAlex Elder if (snap_id == CEPH_NOSNAP) 11679e15b77dSAlex Elder return RBD_SNAP_HEAD_NAME; 11689e15b77dSAlex Elder 116954cac61fSAlex Elder rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); 117054cac61fSAlex Elder if (rbd_dev->image_format == 1) 117154cac61fSAlex Elder return rbd_dev_v1_snap_name(rbd_dev, snap_id); 11729e15b77dSAlex Elder 117354cac61fSAlex Elder return rbd_dev_v2_snap_name(rbd_dev, snap_id); 11749e15b77dSAlex Elder } 11759e15b77dSAlex Elder 11762ad3d716SAlex Elder static int rbd_snap_size(struct rbd_device *rbd_dev, u64 snap_id, 11772ad3d716SAlex Elder u64 *snap_size) 1178602adf40SYehuda Sadeh { 11792ad3d716SAlex Elder rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); 11802ad3d716SAlex Elder if (snap_id == CEPH_NOSNAP) { 11812ad3d716SAlex Elder *snap_size = rbd_dev->header.image_size; 11822ad3d716SAlex Elder } else if (rbd_dev->image_format == 1) { 11832ad3d716SAlex Elder u32 which; 118400f1f36fSAlex Elder 11852ad3d716SAlex Elder which = rbd_dev_snap_index(rbd_dev, snap_id); 11862ad3d716SAlex Elder if (which == BAD_SNAP_INDEX) 11872ad3d716SAlex Elder return -ENOENT; 118800f1f36fSAlex Elder 11892ad3d716SAlex Elder *snap_size = rbd_dev->header.snap_sizes[which]; 11902ad3d716SAlex Elder } else { 11912ad3d716SAlex Elder u64 size = 0; 11922ad3d716SAlex Elder int ret; 11932ad3d716SAlex Elder 11942ad3d716SAlex Elder ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, NULL, &size); 11952ad3d716SAlex Elder if (ret) 11962ad3d716SAlex Elder return ret; 11972ad3d716SAlex Elder 11982ad3d716SAlex Elder *snap_size = size; 11992ad3d716SAlex Elder } 12002ad3d716SAlex Elder return 0; 12012ad3d716SAlex Elder } 12022ad3d716SAlex Elder 12032ad3d716SAlex Elder static int rbd_snap_features(struct rbd_device *rbd_dev, u64 snap_id, 12042ad3d716SAlex 
Elder u64 *snap_features) 12052ad3d716SAlex Elder { 12062ad3d716SAlex Elder rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); 12072ad3d716SAlex Elder if (snap_id == CEPH_NOSNAP) { 12082ad3d716SAlex Elder *snap_features = rbd_dev->header.features; 12092ad3d716SAlex Elder } else if (rbd_dev->image_format == 1) { 12102ad3d716SAlex Elder *snap_features = 0; /* No features for format 1 */ 12112ad3d716SAlex Elder } else { 12122ad3d716SAlex Elder u64 features = 0; 12132ad3d716SAlex Elder int ret; 12142ad3d716SAlex Elder 12152ad3d716SAlex Elder ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, &features); 12162ad3d716SAlex Elder if (ret) 12172ad3d716SAlex Elder return ret; 12182ad3d716SAlex Elder 12192ad3d716SAlex Elder *snap_features = features; 12202ad3d716SAlex Elder } 12212ad3d716SAlex Elder return 0; 122200f1f36fSAlex Elder } 1223602adf40SYehuda Sadeh 1224d1cf5788SAlex Elder static int rbd_dev_mapping_set(struct rbd_device *rbd_dev) 1225602adf40SYehuda Sadeh { 12268f4b7d98SAlex Elder u64 snap_id = rbd_dev->spec->snap_id; 12272ad3d716SAlex Elder u64 size = 0; 12282ad3d716SAlex Elder u64 features = 0; 12292ad3d716SAlex Elder int ret; 12308b0241f8SAlex Elder 12312ad3d716SAlex Elder ret = rbd_snap_size(rbd_dev, snap_id, &size); 12322ad3d716SAlex Elder if (ret) 12332ad3d716SAlex Elder return ret; 12342ad3d716SAlex Elder ret = rbd_snap_features(rbd_dev, snap_id, &features); 12352ad3d716SAlex Elder if (ret) 12362ad3d716SAlex Elder return ret; 12372ad3d716SAlex Elder 12382ad3d716SAlex Elder rbd_dev->mapping.size = size; 12392ad3d716SAlex Elder rbd_dev->mapping.features = features; 12402ad3d716SAlex Elder 12418b0241f8SAlex Elder return 0; 1242602adf40SYehuda Sadeh } 1243602adf40SYehuda Sadeh 1244d1cf5788SAlex Elder static void rbd_dev_mapping_clear(struct rbd_device *rbd_dev) 1245d1cf5788SAlex Elder { 1246d1cf5788SAlex Elder rbd_dev->mapping.size = 0; 1247d1cf5788SAlex Elder rbd_dev->mapping.features = 0; 1248200a6a8bSAlex Elder } 1249200a6a8bSAlex Elder 
125065ccfe21SAlex Elder static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset) 125165ccfe21SAlex Elder { 12525bc3fb17SIlya Dryomov u64 segment_size = rbd_obj_bytes(&rbd_dev->header); 1253602adf40SYehuda Sadeh 125465ccfe21SAlex Elder return offset & (segment_size - 1); 125565ccfe21SAlex Elder } 125665ccfe21SAlex Elder 125765ccfe21SAlex Elder static u64 rbd_segment_length(struct rbd_device *rbd_dev, 125865ccfe21SAlex Elder u64 offset, u64 length) 125965ccfe21SAlex Elder { 12605bc3fb17SIlya Dryomov u64 segment_size = rbd_obj_bytes(&rbd_dev->header); 126165ccfe21SAlex Elder 126265ccfe21SAlex Elder offset &= segment_size - 1; 126365ccfe21SAlex Elder 1264aafb230eSAlex Elder rbd_assert(length <= U64_MAX - offset); 126565ccfe21SAlex Elder if (offset + length > segment_size) 126665ccfe21SAlex Elder length = segment_size - offset; 126765ccfe21SAlex Elder 126865ccfe21SAlex Elder return length; 1269602adf40SYehuda Sadeh } 1270602adf40SYehuda Sadeh 1271602adf40SYehuda Sadeh /* 1272602adf40SYehuda Sadeh * bio helpers 1273602adf40SYehuda Sadeh */ 1274602adf40SYehuda Sadeh 1275602adf40SYehuda Sadeh static void bio_chain_put(struct bio *chain) 1276602adf40SYehuda Sadeh { 1277602adf40SYehuda Sadeh struct bio *tmp; 1278602adf40SYehuda Sadeh 1279602adf40SYehuda Sadeh while (chain) { 1280602adf40SYehuda Sadeh tmp = chain; 1281602adf40SYehuda Sadeh chain = chain->bi_next; 1282602adf40SYehuda Sadeh bio_put(tmp); 1283602adf40SYehuda Sadeh } 1284602adf40SYehuda Sadeh } 1285602adf40SYehuda Sadeh 1286602adf40SYehuda Sadeh /* 1287602adf40SYehuda Sadeh * zeros a bio chain, starting at specific offset 1288602adf40SYehuda Sadeh */ 1289602adf40SYehuda Sadeh static void zero_bio_chain(struct bio *chain, int start_ofs) 1290602adf40SYehuda Sadeh { 12917988613bSKent Overstreet struct bio_vec bv; 12927988613bSKent Overstreet struct bvec_iter iter; 1293602adf40SYehuda Sadeh unsigned long flags; 1294602adf40SYehuda Sadeh void *buf; 1295602adf40SYehuda Sadeh int pos = 0; 
1296602adf40SYehuda Sadeh 1297602adf40SYehuda Sadeh while (chain) { 12987988613bSKent Overstreet bio_for_each_segment(bv, chain, iter) { 12997988613bSKent Overstreet if (pos + bv.bv_len > start_ofs) { 1300602adf40SYehuda Sadeh int remainder = max(start_ofs - pos, 0); 13017988613bSKent Overstreet buf = bvec_kmap_irq(&bv, &flags); 1302602adf40SYehuda Sadeh memset(buf + remainder, 0, 13037988613bSKent Overstreet bv.bv_len - remainder); 13047988613bSKent Overstreet flush_dcache_page(bv.bv_page); 130585b5aaa6SDan Carpenter bvec_kunmap_irq(buf, &flags); 1306602adf40SYehuda Sadeh } 13077988613bSKent Overstreet pos += bv.bv_len; 1308602adf40SYehuda Sadeh } 1309602adf40SYehuda Sadeh 1310602adf40SYehuda Sadeh chain = chain->bi_next; 1311602adf40SYehuda Sadeh } 1312602adf40SYehuda Sadeh } 1313602adf40SYehuda Sadeh 1314602adf40SYehuda Sadeh /* 1315b9434c5bSAlex Elder * similar to zero_bio_chain(), zeros data defined by a page array, 1316b9434c5bSAlex Elder * starting at the given byte offset from the start of the array and 1317b9434c5bSAlex Elder * continuing up to the given end offset. The pages array is 1318b9434c5bSAlex Elder * assumed to be big enough to hold all bytes up to the end. 
1319b9434c5bSAlex Elder */ 1320b9434c5bSAlex Elder static void zero_pages(struct page **pages, u64 offset, u64 end) 1321b9434c5bSAlex Elder { 1322b9434c5bSAlex Elder struct page **page = &pages[offset >> PAGE_SHIFT]; 1323b9434c5bSAlex Elder 1324b9434c5bSAlex Elder rbd_assert(end > offset); 1325b9434c5bSAlex Elder rbd_assert(end - offset <= (u64)SIZE_MAX); 1326b9434c5bSAlex Elder while (offset < end) { 1327b9434c5bSAlex Elder size_t page_offset; 1328b9434c5bSAlex Elder size_t length; 1329b9434c5bSAlex Elder unsigned long flags; 1330b9434c5bSAlex Elder void *kaddr; 1331b9434c5bSAlex Elder 1332491205a8SGeert Uytterhoeven page_offset = offset & ~PAGE_MASK; 1333491205a8SGeert Uytterhoeven length = min_t(size_t, PAGE_SIZE - page_offset, end - offset); 1334b9434c5bSAlex Elder local_irq_save(flags); 1335b9434c5bSAlex Elder kaddr = kmap_atomic(*page); 1336b9434c5bSAlex Elder memset(kaddr + page_offset, 0, length); 1337e2156054SAlex Elder flush_dcache_page(*page); 1338b9434c5bSAlex Elder kunmap_atomic(kaddr); 1339b9434c5bSAlex Elder local_irq_restore(flags); 1340b9434c5bSAlex Elder 1341b9434c5bSAlex Elder offset += length; 1342b9434c5bSAlex Elder page++; 1343b9434c5bSAlex Elder } 1344b9434c5bSAlex Elder } 1345b9434c5bSAlex Elder 1346b9434c5bSAlex Elder /* 1347f7760dadSAlex Elder * Clone a portion of a bio, starting at the given byte offset 1348f7760dadSAlex Elder * and continuing for the number of bytes indicated. 
1349602adf40SYehuda Sadeh */ 1350f7760dadSAlex Elder static struct bio *bio_clone_range(struct bio *bio_src, 1351f7760dadSAlex Elder unsigned int offset, 1352f7760dadSAlex Elder unsigned int len, 1353f7760dadSAlex Elder gfp_t gfpmask) 1354602adf40SYehuda Sadeh { 1355f7760dadSAlex Elder struct bio *bio; 1356602adf40SYehuda Sadeh 13575341a627SKent Overstreet bio = bio_clone(bio_src, gfpmask); 1358f7760dadSAlex Elder if (!bio) 1359f7760dadSAlex Elder return NULL; /* ENOMEM */ 1360f7760dadSAlex Elder 13615341a627SKent Overstreet bio_advance(bio, offset); 13624f024f37SKent Overstreet bio->bi_iter.bi_size = len; 1363602adf40SYehuda Sadeh 1364f7760dadSAlex Elder return bio; 1365602adf40SYehuda Sadeh } 1366602adf40SYehuda Sadeh 1367f7760dadSAlex Elder /* 1368f7760dadSAlex Elder * Clone a portion of a bio chain, starting at the given byte offset 1369f7760dadSAlex Elder * into the first bio in the source chain and continuing for the 1370f7760dadSAlex Elder * number of bytes indicated. The result is another bio chain of 1371f7760dadSAlex Elder * exactly the given length, or a null pointer on error. 1372f7760dadSAlex Elder * 1373f7760dadSAlex Elder * The bio_src and offset parameters are both in-out. On entry they 1374f7760dadSAlex Elder * refer to the first source bio and the offset into that bio where 1375f7760dadSAlex Elder * the start of data to be cloned is located. 1376f7760dadSAlex Elder * 1377f7760dadSAlex Elder * On return, bio_src is updated to refer to the bio in the source 1378f7760dadSAlex Elder * chain that contains first un-cloned byte, and *offset will 1379f7760dadSAlex Elder * contain the offset of that byte within that bio. 
1380f7760dadSAlex Elder */ 1381f7760dadSAlex Elder static struct bio *bio_chain_clone_range(struct bio **bio_src, 1382f7760dadSAlex Elder unsigned int *offset, 1383f7760dadSAlex Elder unsigned int len, 1384f7760dadSAlex Elder gfp_t gfpmask) 1385f7760dadSAlex Elder { 1386f7760dadSAlex Elder struct bio *bi = *bio_src; 1387f7760dadSAlex Elder unsigned int off = *offset; 1388f7760dadSAlex Elder struct bio *chain = NULL; 1389f7760dadSAlex Elder struct bio **end; 1390602adf40SYehuda Sadeh 1391f7760dadSAlex Elder /* Build up a chain of clone bios up to the limit */ 1392602adf40SYehuda Sadeh 13934f024f37SKent Overstreet if (!bi || off >= bi->bi_iter.bi_size || !len) 1394f7760dadSAlex Elder return NULL; /* Nothing to clone */ 1395602adf40SYehuda Sadeh 1396f7760dadSAlex Elder end = &chain; 1397f7760dadSAlex Elder while (len) { 1398f7760dadSAlex Elder unsigned int bi_size; 1399f7760dadSAlex Elder struct bio *bio; 1400f7760dadSAlex Elder 1401f5400b7aSAlex Elder if (!bi) { 1402f5400b7aSAlex Elder rbd_warn(NULL, "bio_chain exhausted with %u left", len); 1403f7760dadSAlex Elder goto out_err; /* EINVAL; ran out of bio's */ 1404f5400b7aSAlex Elder } 14054f024f37SKent Overstreet bi_size = min_t(unsigned int, bi->bi_iter.bi_size - off, len); 1406f7760dadSAlex Elder bio = bio_clone_range(bi, off, bi_size, gfpmask); 1407f7760dadSAlex Elder if (!bio) 1408f7760dadSAlex Elder goto out_err; /* ENOMEM */ 1409f7760dadSAlex Elder 1410f7760dadSAlex Elder *end = bio; 1411f7760dadSAlex Elder end = &bio->bi_next; 1412f7760dadSAlex Elder 1413f7760dadSAlex Elder off += bi_size; 14144f024f37SKent Overstreet if (off == bi->bi_iter.bi_size) { 1415f7760dadSAlex Elder bi = bi->bi_next; 1416f7760dadSAlex Elder off = 0; 1417f7760dadSAlex Elder } 1418f7760dadSAlex Elder len -= bi_size; 1419f7760dadSAlex Elder } 1420f7760dadSAlex Elder *bio_src = bi; 1421f7760dadSAlex Elder *offset = off; 1422f7760dadSAlex Elder 1423f7760dadSAlex Elder return chain; 1424f7760dadSAlex Elder out_err: 1425f7760dadSAlex Elder 
bio_chain_put(chain); 1426f7760dadSAlex Elder 1427602adf40SYehuda Sadeh return NULL; 1428602adf40SYehuda Sadeh } 1429602adf40SYehuda Sadeh 1430926f9b3fSAlex Elder /* 1431926f9b3fSAlex Elder * The default/initial value for all object request flags is 0. For 1432926f9b3fSAlex Elder * each flag, once its value is set to 1 it is never reset to 0 1433926f9b3fSAlex Elder * again. 1434926f9b3fSAlex Elder */ 14356365d33aSAlex Elder static void obj_request_img_data_set(struct rbd_obj_request *obj_request) 14366365d33aSAlex Elder { 14376365d33aSAlex Elder if (test_and_set_bit(OBJ_REQ_IMG_DATA, &obj_request->flags)) { 14386365d33aSAlex Elder struct rbd_device *rbd_dev; 14396365d33aSAlex Elder 144057acbaa7SAlex Elder rbd_dev = obj_request->img_request->rbd_dev; 14419584d508SIlya Dryomov rbd_warn(rbd_dev, "obj_request %p already marked img_data", 14426365d33aSAlex Elder obj_request); 14436365d33aSAlex Elder } 14446365d33aSAlex Elder } 14456365d33aSAlex Elder 14466365d33aSAlex Elder static bool obj_request_img_data_test(struct rbd_obj_request *obj_request) 14476365d33aSAlex Elder { 14486365d33aSAlex Elder smp_mb(); 14496365d33aSAlex Elder return test_bit(OBJ_REQ_IMG_DATA, &obj_request->flags) != 0; 14506365d33aSAlex Elder } 14516365d33aSAlex Elder 145257acbaa7SAlex Elder static void obj_request_done_set(struct rbd_obj_request *obj_request) 145357acbaa7SAlex Elder { 145457acbaa7SAlex Elder if (test_and_set_bit(OBJ_REQ_DONE, &obj_request->flags)) { 145557acbaa7SAlex Elder struct rbd_device *rbd_dev = NULL; 145657acbaa7SAlex Elder 145757acbaa7SAlex Elder if (obj_request_img_data_test(obj_request)) 145857acbaa7SAlex Elder rbd_dev = obj_request->img_request->rbd_dev; 14599584d508SIlya Dryomov rbd_warn(rbd_dev, "obj_request %p already marked done", 146057acbaa7SAlex Elder obj_request); 146157acbaa7SAlex Elder } 146257acbaa7SAlex Elder } 146357acbaa7SAlex Elder 146457acbaa7SAlex Elder static bool obj_request_done_test(struct rbd_obj_request *obj_request) 146557acbaa7SAlex Elder { 
146657acbaa7SAlex Elder smp_mb(); 146757acbaa7SAlex Elder return test_bit(OBJ_REQ_DONE, &obj_request->flags) != 0; 146857acbaa7SAlex Elder } 146957acbaa7SAlex Elder 14705679c59fSAlex Elder /* 14715679c59fSAlex Elder * This sets the KNOWN flag after (possibly) setting the EXISTS 14725679c59fSAlex Elder * flag. The latter is set based on the "exists" value provided. 14735679c59fSAlex Elder * 14745679c59fSAlex Elder * Note that for our purposes once an object exists it never goes 14755679c59fSAlex Elder * away again. It's possible that the response from two existence 14765679c59fSAlex Elder * checks are separated by the creation of the target object, and 14775679c59fSAlex Elder * the first ("doesn't exist") response arrives *after* the second 14785679c59fSAlex Elder * ("does exist"). In that case we ignore the second one. 14795679c59fSAlex Elder */ 14805679c59fSAlex Elder static void obj_request_existence_set(struct rbd_obj_request *obj_request, 14815679c59fSAlex Elder bool exists) 14825679c59fSAlex Elder { 14835679c59fSAlex Elder if (exists) 14845679c59fSAlex Elder set_bit(OBJ_REQ_EXISTS, &obj_request->flags); 14855679c59fSAlex Elder set_bit(OBJ_REQ_KNOWN, &obj_request->flags); 14865679c59fSAlex Elder smp_mb(); 14875679c59fSAlex Elder } 14885679c59fSAlex Elder 14895679c59fSAlex Elder static bool obj_request_known_test(struct rbd_obj_request *obj_request) 14905679c59fSAlex Elder { 14915679c59fSAlex Elder smp_mb(); 14925679c59fSAlex Elder return test_bit(OBJ_REQ_KNOWN, &obj_request->flags) != 0; 14935679c59fSAlex Elder } 14945679c59fSAlex Elder 14955679c59fSAlex Elder static bool obj_request_exists_test(struct rbd_obj_request *obj_request) 14965679c59fSAlex Elder { 14975679c59fSAlex Elder smp_mb(); 14985679c59fSAlex Elder return test_bit(OBJ_REQ_EXISTS, &obj_request->flags) != 0; 14995679c59fSAlex Elder } 15005679c59fSAlex Elder 15019638556aSIlya Dryomov static bool obj_request_overlaps_parent(struct rbd_obj_request *obj_request) 15029638556aSIlya Dryomov { 
15039638556aSIlya Dryomov struct rbd_device *rbd_dev = obj_request->img_request->rbd_dev; 15049638556aSIlya Dryomov 15059638556aSIlya Dryomov return obj_request->img_offset < 15069638556aSIlya Dryomov round_up(rbd_dev->parent_overlap, rbd_obj_bytes(&rbd_dev->header)); 15079638556aSIlya Dryomov } 15089638556aSIlya Dryomov 1509bf0d5f50SAlex Elder static void rbd_obj_request_get(struct rbd_obj_request *obj_request) 1510bf0d5f50SAlex Elder { 151137206ee5SAlex Elder dout("%s: obj %p (was %d)\n", __func__, obj_request, 151237206ee5SAlex Elder atomic_read(&obj_request->kref.refcount)); 1513bf0d5f50SAlex Elder kref_get(&obj_request->kref); 1514bf0d5f50SAlex Elder } 1515bf0d5f50SAlex Elder 1516bf0d5f50SAlex Elder static void rbd_obj_request_destroy(struct kref *kref); 1517bf0d5f50SAlex Elder static void rbd_obj_request_put(struct rbd_obj_request *obj_request) 1518bf0d5f50SAlex Elder { 1519bf0d5f50SAlex Elder rbd_assert(obj_request != NULL); 152037206ee5SAlex Elder dout("%s: obj %p (was %d)\n", __func__, obj_request, 152137206ee5SAlex Elder atomic_read(&obj_request->kref.refcount)); 1522bf0d5f50SAlex Elder kref_put(&obj_request->kref, rbd_obj_request_destroy); 1523bf0d5f50SAlex Elder } 1524bf0d5f50SAlex Elder 15250f2d5be7SAlex Elder static void rbd_img_request_get(struct rbd_img_request *img_request) 15260f2d5be7SAlex Elder { 15270f2d5be7SAlex Elder dout("%s: img %p (was %d)\n", __func__, img_request, 15280f2d5be7SAlex Elder atomic_read(&img_request->kref.refcount)); 15290f2d5be7SAlex Elder kref_get(&img_request->kref); 15300f2d5be7SAlex Elder } 15310f2d5be7SAlex Elder 1532e93f3152SAlex Elder static bool img_request_child_test(struct rbd_img_request *img_request); 1533e93f3152SAlex Elder static void rbd_parent_request_destroy(struct kref *kref); 1534bf0d5f50SAlex Elder static void rbd_img_request_destroy(struct kref *kref); 1535bf0d5f50SAlex Elder static void rbd_img_request_put(struct rbd_img_request *img_request) 1536bf0d5f50SAlex Elder { 1537bf0d5f50SAlex Elder 
rbd_assert(img_request != NULL); 153837206ee5SAlex Elder dout("%s: img %p (was %d)\n", __func__, img_request, 153937206ee5SAlex Elder atomic_read(&img_request->kref.refcount)); 1540e93f3152SAlex Elder if (img_request_child_test(img_request)) 1541e93f3152SAlex Elder kref_put(&img_request->kref, rbd_parent_request_destroy); 1542e93f3152SAlex Elder else 1543bf0d5f50SAlex Elder kref_put(&img_request->kref, rbd_img_request_destroy); 1544bf0d5f50SAlex Elder } 1545bf0d5f50SAlex Elder 1546bf0d5f50SAlex Elder static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request, 1547bf0d5f50SAlex Elder struct rbd_obj_request *obj_request) 1548bf0d5f50SAlex Elder { 154925dcf954SAlex Elder rbd_assert(obj_request->img_request == NULL); 155025dcf954SAlex Elder 1551b155e86cSAlex Elder /* Image request now owns object's original reference */ 1552bf0d5f50SAlex Elder obj_request->img_request = img_request; 155325dcf954SAlex Elder obj_request->which = img_request->obj_request_count; 15546365d33aSAlex Elder rbd_assert(!obj_request_img_data_test(obj_request)); 15556365d33aSAlex Elder obj_request_img_data_set(obj_request); 1556bf0d5f50SAlex Elder rbd_assert(obj_request->which != BAD_WHICH); 155725dcf954SAlex Elder img_request->obj_request_count++; 155825dcf954SAlex Elder list_add_tail(&obj_request->links, &img_request->obj_requests); 155937206ee5SAlex Elder dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request, 156037206ee5SAlex Elder obj_request->which); 1561bf0d5f50SAlex Elder } 1562bf0d5f50SAlex Elder 1563bf0d5f50SAlex Elder static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request, 1564bf0d5f50SAlex Elder struct rbd_obj_request *obj_request) 1565bf0d5f50SAlex Elder { 1566bf0d5f50SAlex Elder rbd_assert(obj_request->which != BAD_WHICH); 156725dcf954SAlex Elder 156837206ee5SAlex Elder dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request, 156937206ee5SAlex Elder obj_request->which); 1570bf0d5f50SAlex Elder 
list_del(&obj_request->links); 157125dcf954SAlex Elder rbd_assert(img_request->obj_request_count > 0); 157225dcf954SAlex Elder img_request->obj_request_count--; 157325dcf954SAlex Elder rbd_assert(obj_request->which == img_request->obj_request_count); 157425dcf954SAlex Elder obj_request->which = BAD_WHICH; 15756365d33aSAlex Elder rbd_assert(obj_request_img_data_test(obj_request)); 1576bf0d5f50SAlex Elder rbd_assert(obj_request->img_request == img_request); 1577bf0d5f50SAlex Elder obj_request->img_request = NULL; 157825dcf954SAlex Elder obj_request->callback = NULL; 1579bf0d5f50SAlex Elder rbd_obj_request_put(obj_request); 1580bf0d5f50SAlex Elder } 1581bf0d5f50SAlex Elder 1582bf0d5f50SAlex Elder static bool obj_request_type_valid(enum obj_request_type type) 1583bf0d5f50SAlex Elder { 1584bf0d5f50SAlex Elder switch (type) { 15859969ebc5SAlex Elder case OBJ_REQUEST_NODATA: 1586bf0d5f50SAlex Elder case OBJ_REQUEST_BIO: 1587788e2df3SAlex Elder case OBJ_REQUEST_PAGES: 1588bf0d5f50SAlex Elder return true; 1589bf0d5f50SAlex Elder default: 1590bf0d5f50SAlex Elder return false; 1591bf0d5f50SAlex Elder } 1592bf0d5f50SAlex Elder } 1593bf0d5f50SAlex Elder 15944a17dadcSIlya Dryomov static void rbd_img_obj_callback(struct rbd_obj_request *obj_request); 15954a17dadcSIlya Dryomov 1596980917fcSIlya Dryomov static void rbd_obj_request_submit(struct rbd_obj_request *obj_request) 1597bf0d5f50SAlex Elder { 1598980917fcSIlya Dryomov struct ceph_osd_request *osd_req = obj_request->osd_req; 1599980917fcSIlya Dryomov 1600a90bb0c1SIlya Dryomov dout("%s %p object_no %016llx %llu~%llu osd_req %p\n", __func__, 1601a90bb0c1SIlya Dryomov obj_request, obj_request->object_no, obj_request->offset, 160267e2b652SIlya Dryomov obj_request->length, osd_req); 16034a17dadcSIlya Dryomov if (obj_request_img_data_test(obj_request)) { 16044a17dadcSIlya Dryomov WARN_ON(obj_request->callback != rbd_img_obj_callback); 16054a17dadcSIlya Dryomov rbd_img_request_get(obj_request->img_request); 16064a17dadcSIlya Dryomov 
} 1607980917fcSIlya Dryomov ceph_osdc_start_request(osd_req->r_osdc, osd_req, false); 1608bf0d5f50SAlex Elder } 1609bf0d5f50SAlex Elder 1610bf0d5f50SAlex Elder static void rbd_img_request_complete(struct rbd_img_request *img_request) 1611bf0d5f50SAlex Elder { 161255f27e09SAlex Elder 161337206ee5SAlex Elder dout("%s: img %p\n", __func__, img_request); 161455f27e09SAlex Elder 161555f27e09SAlex Elder /* 161655f27e09SAlex Elder * If no error occurred, compute the aggregate transfer 161755f27e09SAlex Elder * count for the image request. We could instead use 161855f27e09SAlex Elder * atomic64_cmpxchg() to update it as each object request 161955f27e09SAlex Elder * completes; not clear which way is better off hand. 162055f27e09SAlex Elder */ 162155f27e09SAlex Elder if (!img_request->result) { 162255f27e09SAlex Elder struct rbd_obj_request *obj_request; 162355f27e09SAlex Elder u64 xferred = 0; 162455f27e09SAlex Elder 162555f27e09SAlex Elder for_each_obj_request(img_request, obj_request) 162655f27e09SAlex Elder xferred += obj_request->xferred; 162755f27e09SAlex Elder img_request->xferred = xferred; 162855f27e09SAlex Elder } 162955f27e09SAlex Elder 1630bf0d5f50SAlex Elder if (img_request->callback) 1631bf0d5f50SAlex Elder img_request->callback(img_request); 1632bf0d5f50SAlex Elder else 1633bf0d5f50SAlex Elder rbd_img_request_put(img_request); 1634bf0d5f50SAlex Elder } 1635bf0d5f50SAlex Elder 16360c425248SAlex Elder /* 16370c425248SAlex Elder * The default/initial value for all image request flags is 0. Each 16380c425248SAlex Elder * is conditionally set to 1 at image request initialization time 16390c425248SAlex Elder * and currently never change thereafter. 
16400c425248SAlex Elder */ 16410c425248SAlex Elder static void img_request_write_set(struct rbd_img_request *img_request) 16420c425248SAlex Elder { 16430c425248SAlex Elder set_bit(IMG_REQ_WRITE, &img_request->flags); 16440c425248SAlex Elder smp_mb(); 16450c425248SAlex Elder } 16460c425248SAlex Elder 16470c425248SAlex Elder static bool img_request_write_test(struct rbd_img_request *img_request) 16480c425248SAlex Elder { 16490c425248SAlex Elder smp_mb(); 16500c425248SAlex Elder return test_bit(IMG_REQ_WRITE, &img_request->flags) != 0; 16510c425248SAlex Elder } 16520c425248SAlex Elder 165390e98c52SGuangliang Zhao /* 165490e98c52SGuangliang Zhao * Set the discard flag when the img_request is an discard request 165590e98c52SGuangliang Zhao */ 165690e98c52SGuangliang Zhao static void img_request_discard_set(struct rbd_img_request *img_request) 165790e98c52SGuangliang Zhao { 165890e98c52SGuangliang Zhao set_bit(IMG_REQ_DISCARD, &img_request->flags); 165990e98c52SGuangliang Zhao smp_mb(); 166090e98c52SGuangliang Zhao } 166190e98c52SGuangliang Zhao 166290e98c52SGuangliang Zhao static bool img_request_discard_test(struct rbd_img_request *img_request) 166390e98c52SGuangliang Zhao { 166490e98c52SGuangliang Zhao smp_mb(); 166590e98c52SGuangliang Zhao return test_bit(IMG_REQ_DISCARD, &img_request->flags) != 0; 166690e98c52SGuangliang Zhao } 166790e98c52SGuangliang Zhao 16689849e986SAlex Elder static void img_request_child_set(struct rbd_img_request *img_request) 16699849e986SAlex Elder { 16709849e986SAlex Elder set_bit(IMG_REQ_CHILD, &img_request->flags); 16719849e986SAlex Elder smp_mb(); 16729849e986SAlex Elder } 16739849e986SAlex Elder 1674e93f3152SAlex Elder static void img_request_child_clear(struct rbd_img_request *img_request) 1675e93f3152SAlex Elder { 1676e93f3152SAlex Elder clear_bit(IMG_REQ_CHILD, &img_request->flags); 1677e93f3152SAlex Elder smp_mb(); 1678e93f3152SAlex Elder } 1679e93f3152SAlex Elder 16809849e986SAlex Elder static bool img_request_child_test(struct 
rbd_img_request *img_request) 16819849e986SAlex Elder { 16829849e986SAlex Elder smp_mb(); 16839849e986SAlex Elder return test_bit(IMG_REQ_CHILD, &img_request->flags) != 0; 16849849e986SAlex Elder } 16859849e986SAlex Elder 1686d0b2e944SAlex Elder static void img_request_layered_set(struct rbd_img_request *img_request) 1687d0b2e944SAlex Elder { 1688d0b2e944SAlex Elder set_bit(IMG_REQ_LAYERED, &img_request->flags); 1689d0b2e944SAlex Elder smp_mb(); 1690d0b2e944SAlex Elder } 1691d0b2e944SAlex Elder 1692a2acd00eSAlex Elder static void img_request_layered_clear(struct rbd_img_request *img_request) 1693a2acd00eSAlex Elder { 1694a2acd00eSAlex Elder clear_bit(IMG_REQ_LAYERED, &img_request->flags); 1695a2acd00eSAlex Elder smp_mb(); 1696a2acd00eSAlex Elder } 1697a2acd00eSAlex Elder 1698d0b2e944SAlex Elder static bool img_request_layered_test(struct rbd_img_request *img_request) 1699d0b2e944SAlex Elder { 1700d0b2e944SAlex Elder smp_mb(); 1701d0b2e944SAlex Elder return test_bit(IMG_REQ_LAYERED, &img_request->flags) != 0; 1702d0b2e944SAlex Elder } 1703d0b2e944SAlex Elder 17043b434a2aSJosh Durgin static enum obj_operation_type 17053b434a2aSJosh Durgin rbd_img_request_op_type(struct rbd_img_request *img_request) 17063b434a2aSJosh Durgin { 17073b434a2aSJosh Durgin if (img_request_write_test(img_request)) 17083b434a2aSJosh Durgin return OBJ_OP_WRITE; 17093b434a2aSJosh Durgin else if (img_request_discard_test(img_request)) 17103b434a2aSJosh Durgin return OBJ_OP_DISCARD; 17113b434a2aSJosh Durgin else 17123b434a2aSJosh Durgin return OBJ_OP_READ; 17133b434a2aSJosh Durgin } 17143b434a2aSJosh Durgin 17156e2a4505SAlex Elder static void 17166e2a4505SAlex Elder rbd_img_obj_request_read_callback(struct rbd_obj_request *obj_request) 17176e2a4505SAlex Elder { 1718b9434c5bSAlex Elder u64 xferred = obj_request->xferred; 1719b9434c5bSAlex Elder u64 length = obj_request->length; 1720b9434c5bSAlex Elder 17216e2a4505SAlex Elder dout("%s: obj %p img %p result %d %llu/%llu\n", __func__, 
17226e2a4505SAlex Elder obj_request, obj_request->img_request, obj_request->result, 1723b9434c5bSAlex Elder xferred, length); 17246e2a4505SAlex Elder /* 172517c1cc1dSJosh Durgin * ENOENT means a hole in the image. We zero-fill the entire 172617c1cc1dSJosh Durgin * length of the request. A short read also implies zero-fill 172717c1cc1dSJosh Durgin * to the end of the request. An error requires the whole 172817c1cc1dSJosh Durgin * length of the request to be reported finished with an error 172917c1cc1dSJosh Durgin * to the block layer. In each case we update the xferred 173017c1cc1dSJosh Durgin * count to indicate the whole request was satisfied. 17316e2a4505SAlex Elder */ 1732b9434c5bSAlex Elder rbd_assert(obj_request->type != OBJ_REQUEST_NODATA); 17336e2a4505SAlex Elder if (obj_request->result == -ENOENT) { 1734b9434c5bSAlex Elder if (obj_request->type == OBJ_REQUEST_BIO) 17356e2a4505SAlex Elder zero_bio_chain(obj_request->bio_list, 0); 1736b9434c5bSAlex Elder else 1737b9434c5bSAlex Elder zero_pages(obj_request->pages, 0, length); 17386e2a4505SAlex Elder obj_request->result = 0; 1739b9434c5bSAlex Elder } else if (xferred < length && !obj_request->result) { 1740b9434c5bSAlex Elder if (obj_request->type == OBJ_REQUEST_BIO) 1741b9434c5bSAlex Elder zero_bio_chain(obj_request->bio_list, xferred); 1742b9434c5bSAlex Elder else 1743b9434c5bSAlex Elder zero_pages(obj_request->pages, xferred, length); 17446e2a4505SAlex Elder } 174517c1cc1dSJosh Durgin obj_request->xferred = length; 17466e2a4505SAlex Elder obj_request_done_set(obj_request); 17476e2a4505SAlex Elder } 17486e2a4505SAlex Elder 1749bf0d5f50SAlex Elder static void rbd_obj_request_complete(struct rbd_obj_request *obj_request) 1750bf0d5f50SAlex Elder { 175137206ee5SAlex Elder dout("%s: obj %p cb %p\n", __func__, obj_request, 175237206ee5SAlex Elder obj_request->callback); 1753bf0d5f50SAlex Elder if (obj_request->callback) 1754bf0d5f50SAlex Elder obj_request->callback(obj_request); 1755788e2df3SAlex Elder else 
1756788e2df3SAlex Elder complete_all(&obj_request->completion); 1757bf0d5f50SAlex Elder } 1758bf0d5f50SAlex Elder 17590dcc685eSIlya Dryomov static void rbd_obj_request_error(struct rbd_obj_request *obj_request, int err) 17600dcc685eSIlya Dryomov { 17610dcc685eSIlya Dryomov obj_request->result = err; 17620dcc685eSIlya Dryomov obj_request->xferred = 0; 17630dcc685eSIlya Dryomov /* 17640dcc685eSIlya Dryomov * kludge - mirror rbd_obj_request_submit() to match a put in 17650dcc685eSIlya Dryomov * rbd_img_obj_callback() 17660dcc685eSIlya Dryomov */ 17670dcc685eSIlya Dryomov if (obj_request_img_data_test(obj_request)) { 17680dcc685eSIlya Dryomov WARN_ON(obj_request->callback != rbd_img_obj_callback); 17690dcc685eSIlya Dryomov rbd_img_request_get(obj_request->img_request); 17700dcc685eSIlya Dryomov } 17710dcc685eSIlya Dryomov obj_request_done_set(obj_request); 17720dcc685eSIlya Dryomov rbd_obj_request_complete(obj_request); 17730dcc685eSIlya Dryomov } 17740dcc685eSIlya Dryomov 1775c47f9371SAlex Elder static void rbd_osd_read_callback(struct rbd_obj_request *obj_request) 1776bf0d5f50SAlex Elder { 177757acbaa7SAlex Elder struct rbd_img_request *img_request = NULL; 1778a9e8ba2cSAlex Elder struct rbd_device *rbd_dev = NULL; 177957acbaa7SAlex Elder bool layered = false; 178057acbaa7SAlex Elder 178157acbaa7SAlex Elder if (obj_request_img_data_test(obj_request)) { 178257acbaa7SAlex Elder img_request = obj_request->img_request; 178357acbaa7SAlex Elder layered = img_request && img_request_layered_test(img_request); 1784a9e8ba2cSAlex Elder rbd_dev = img_request->rbd_dev; 178557acbaa7SAlex Elder } 17868b3e1a56SAlex Elder 17878b3e1a56SAlex Elder dout("%s: obj %p img %p result %d %llu/%llu\n", __func__, 17888b3e1a56SAlex Elder obj_request, img_request, obj_request->result, 17898b3e1a56SAlex Elder obj_request->xferred, obj_request->length); 1790a9e8ba2cSAlex Elder if (layered && obj_request->result == -ENOENT && 1791a9e8ba2cSAlex Elder obj_request->img_offset < rbd_dev->parent_overlap) 
17928b3e1a56SAlex Elder rbd_img_parent_read(obj_request); 17938b3e1a56SAlex Elder else if (img_request) 17946e2a4505SAlex Elder rbd_img_obj_request_read_callback(obj_request); 17956e2a4505SAlex Elder else 179607741308SAlex Elder obj_request_done_set(obj_request); 1797bf0d5f50SAlex Elder } 1798bf0d5f50SAlex Elder 1799c47f9371SAlex Elder static void rbd_osd_write_callback(struct rbd_obj_request *obj_request) 1800bf0d5f50SAlex Elder { 18011b83bef2SSage Weil dout("%s: obj %p result %d %llu\n", __func__, obj_request, 18021b83bef2SSage Weil obj_request->result, obj_request->length); 18031b83bef2SSage Weil /* 18048b3e1a56SAlex Elder * There is no such thing as a successful short write. Set 18058b3e1a56SAlex Elder * it to our originally-requested length. 18061b83bef2SSage Weil */ 18071b83bef2SSage Weil obj_request->xferred = obj_request->length; 180807741308SAlex Elder obj_request_done_set(obj_request); 1809bf0d5f50SAlex Elder } 1810bf0d5f50SAlex Elder 181190e98c52SGuangliang Zhao static void rbd_osd_discard_callback(struct rbd_obj_request *obj_request) 181290e98c52SGuangliang Zhao { 181390e98c52SGuangliang Zhao dout("%s: obj %p result %d %llu\n", __func__, obj_request, 181490e98c52SGuangliang Zhao obj_request->result, obj_request->length); 181590e98c52SGuangliang Zhao /* 181690e98c52SGuangliang Zhao * There is no such thing as a successful short discard. Set 181790e98c52SGuangliang Zhao * it to our originally-requested length. 181890e98c52SGuangliang Zhao */ 181990e98c52SGuangliang Zhao obj_request->xferred = obj_request->length; 1820d0265de7SJosh Durgin /* discarding a non-existent object is not a problem */ 1821d0265de7SJosh Durgin if (obj_request->result == -ENOENT) 1822d0265de7SJosh Durgin obj_request->result = 0; 182390e98c52SGuangliang Zhao obj_request_done_set(obj_request); 182490e98c52SGuangliang Zhao } 182590e98c52SGuangliang Zhao 1826fbfab539SAlex Elder /* 1827fbfab539SAlex Elder * For a simple stat call there's nothing to do. 
We'll do more if 1828fbfab539SAlex Elder * this is part of a write sequence for a layered image. 1829fbfab539SAlex Elder */ 1830c47f9371SAlex Elder static void rbd_osd_stat_callback(struct rbd_obj_request *obj_request) 1831fbfab539SAlex Elder { 183237206ee5SAlex Elder dout("%s: obj %p\n", __func__, obj_request); 1833fbfab539SAlex Elder obj_request_done_set(obj_request); 1834fbfab539SAlex Elder } 1835fbfab539SAlex Elder 18362761713dSIlya Dryomov static void rbd_osd_call_callback(struct rbd_obj_request *obj_request) 18372761713dSIlya Dryomov { 18382761713dSIlya Dryomov dout("%s: obj %p\n", __func__, obj_request); 18392761713dSIlya Dryomov 18402761713dSIlya Dryomov if (obj_request_img_data_test(obj_request)) 18412761713dSIlya Dryomov rbd_osd_copyup_callback(obj_request); 18422761713dSIlya Dryomov else 18432761713dSIlya Dryomov obj_request_done_set(obj_request); 18442761713dSIlya Dryomov } 18452761713dSIlya Dryomov 184685e084feSIlya Dryomov static void rbd_osd_req_callback(struct ceph_osd_request *osd_req) 1847bf0d5f50SAlex Elder { 1848bf0d5f50SAlex Elder struct rbd_obj_request *obj_request = osd_req->r_priv; 1849bf0d5f50SAlex Elder u16 opcode; 1850bf0d5f50SAlex Elder 185185e084feSIlya Dryomov dout("%s: osd_req %p\n", __func__, osd_req); 1852bf0d5f50SAlex Elder rbd_assert(osd_req == obj_request->osd_req); 185357acbaa7SAlex Elder if (obj_request_img_data_test(obj_request)) { 185457acbaa7SAlex Elder rbd_assert(obj_request->img_request); 185557acbaa7SAlex Elder rbd_assert(obj_request->which != BAD_WHICH); 185657acbaa7SAlex Elder } else { 185757acbaa7SAlex Elder rbd_assert(obj_request->which == BAD_WHICH); 185857acbaa7SAlex Elder } 1859bf0d5f50SAlex Elder 18601b83bef2SSage Weil if (osd_req->r_result < 0) 18611b83bef2SSage Weil obj_request->result = osd_req->r_result; 1862bf0d5f50SAlex Elder 1863c47f9371SAlex Elder /* 1864c47f9371SAlex Elder * We support a 64-bit length, but ultimately it has to be 18657ad18afaSChristoph Hellwig * passed to the block layer, which just 
supports a 32-bit 18667ad18afaSChristoph Hellwig * length field. 1867c47f9371SAlex Elder */ 18687665d85bSYan, Zheng obj_request->xferred = osd_req->r_ops[0].outdata_len; 1869c47f9371SAlex Elder rbd_assert(obj_request->xferred < (u64)UINT_MAX); 18700ccd5926SIlya Dryomov 187179528734SAlex Elder opcode = osd_req->r_ops[0].op; 1872bf0d5f50SAlex Elder switch (opcode) { 1873bf0d5f50SAlex Elder case CEPH_OSD_OP_READ: 1874c47f9371SAlex Elder rbd_osd_read_callback(obj_request); 1875bf0d5f50SAlex Elder break; 18760ccd5926SIlya Dryomov case CEPH_OSD_OP_SETALLOCHINT: 1877e30b7577SIlya Dryomov rbd_assert(osd_req->r_ops[1].op == CEPH_OSD_OP_WRITE || 1878e30b7577SIlya Dryomov osd_req->r_ops[1].op == CEPH_OSD_OP_WRITEFULL); 18790ccd5926SIlya Dryomov /* fall through */ 1880bf0d5f50SAlex Elder case CEPH_OSD_OP_WRITE: 1881e30b7577SIlya Dryomov case CEPH_OSD_OP_WRITEFULL: 1882c47f9371SAlex Elder rbd_osd_write_callback(obj_request); 1883bf0d5f50SAlex Elder break; 1884fbfab539SAlex Elder case CEPH_OSD_OP_STAT: 1885c47f9371SAlex Elder rbd_osd_stat_callback(obj_request); 1886fbfab539SAlex Elder break; 188790e98c52SGuangliang Zhao case CEPH_OSD_OP_DELETE: 188890e98c52SGuangliang Zhao case CEPH_OSD_OP_TRUNCATE: 188990e98c52SGuangliang Zhao case CEPH_OSD_OP_ZERO: 189090e98c52SGuangliang Zhao rbd_osd_discard_callback(obj_request); 189190e98c52SGuangliang Zhao break; 189236be9a76SAlex Elder case CEPH_OSD_OP_CALL: 18932761713dSIlya Dryomov rbd_osd_call_callback(obj_request); 18942761713dSIlya Dryomov break; 1895bf0d5f50SAlex Elder default: 1896a90bb0c1SIlya Dryomov rbd_warn(NULL, "unexpected OSD op: object_no %016llx opcode %d", 1897a90bb0c1SIlya Dryomov obj_request->object_no, opcode); 1898bf0d5f50SAlex Elder break; 1899bf0d5f50SAlex Elder } 1900bf0d5f50SAlex Elder 190107741308SAlex Elder if (obj_request_done_test(obj_request)) 1902bf0d5f50SAlex Elder rbd_obj_request_complete(obj_request); 1903bf0d5f50SAlex Elder } 1904bf0d5f50SAlex Elder 19059d4df01fSAlex Elder static void 
rbd_osd_req_format_read(struct rbd_obj_request *obj_request) 1906430c28c3SAlex Elder { 19078c042b0dSAlex Elder struct ceph_osd_request *osd_req = obj_request->osd_req; 1908430c28c3SAlex Elder 19097c84883aSIlya Dryomov rbd_assert(obj_request_img_data_test(obj_request)); 19107c84883aSIlya Dryomov osd_req->r_snapid = obj_request->img_request->snap_id; 19119d4df01fSAlex Elder } 19129d4df01fSAlex Elder 19139d4df01fSAlex Elder static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request) 19149d4df01fSAlex Elder { 19159d4df01fSAlex Elder struct ceph_osd_request *osd_req = obj_request->osd_req; 19169d4df01fSAlex Elder 1917bb873b53SIlya Dryomov osd_req->r_mtime = CURRENT_TIME; 1918bb873b53SIlya Dryomov osd_req->r_data_offset = obj_request->offset; 1919430c28c3SAlex Elder } 1920430c28c3SAlex Elder 1921bc81207eSIlya Dryomov static struct ceph_osd_request * 1922bc81207eSIlya Dryomov __rbd_osd_req_create(struct rbd_device *rbd_dev, 1923bc81207eSIlya Dryomov struct ceph_snap_context *snapc, 1924bc81207eSIlya Dryomov int num_ops, unsigned int flags, 1925bc81207eSIlya Dryomov struct rbd_obj_request *obj_request) 1926bc81207eSIlya Dryomov { 1927bc81207eSIlya Dryomov struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 1928bc81207eSIlya Dryomov struct ceph_osd_request *req; 1929a90bb0c1SIlya Dryomov const char *name_format = rbd_dev->image_format == 1 ? 
1930a90bb0c1SIlya Dryomov RBD_V1_DATA_FORMAT : RBD_V2_DATA_FORMAT; 1931bc81207eSIlya Dryomov 1932bc81207eSIlya Dryomov req = ceph_osdc_alloc_request(osdc, snapc, num_ops, false, GFP_NOIO); 1933bc81207eSIlya Dryomov if (!req) 1934bc81207eSIlya Dryomov return NULL; 1935bc81207eSIlya Dryomov 1936bc81207eSIlya Dryomov req->r_flags = flags; 1937bc81207eSIlya Dryomov req->r_callback = rbd_osd_req_callback; 1938bc81207eSIlya Dryomov req->r_priv = obj_request; 1939bc81207eSIlya Dryomov 1940bc81207eSIlya Dryomov req->r_base_oloc.pool = rbd_dev->layout.pool_id; 1941a90bb0c1SIlya Dryomov if (ceph_oid_aprintf(&req->r_base_oid, GFP_NOIO, name_format, 1942a90bb0c1SIlya Dryomov rbd_dev->header.object_prefix, obj_request->object_no)) 1943bc81207eSIlya Dryomov goto err_req; 1944bc81207eSIlya Dryomov 1945bc81207eSIlya Dryomov if (ceph_osdc_alloc_messages(req, GFP_NOIO)) 1946bc81207eSIlya Dryomov goto err_req; 1947bc81207eSIlya Dryomov 1948bc81207eSIlya Dryomov return req; 1949bc81207eSIlya Dryomov 1950bc81207eSIlya Dryomov err_req: 1951bc81207eSIlya Dryomov ceph_osdc_put_request(req); 1952bc81207eSIlya Dryomov return NULL; 1953bc81207eSIlya Dryomov } 1954bc81207eSIlya Dryomov 19550ccd5926SIlya Dryomov /* 19560ccd5926SIlya Dryomov * Create an osd request. A read request has one osd op (read). 19570ccd5926SIlya Dryomov * A write request has either one (watch) or two (hint+write) osd ops. 19580ccd5926SIlya Dryomov * (All rbd data writes are prefixed with an allocation hint op, but 19590ccd5926SIlya Dryomov * technically osd watch is a write request, hence this distinction.) 
19600ccd5926SIlya Dryomov */ 1961bf0d5f50SAlex Elder static struct ceph_osd_request *rbd_osd_req_create( 1962bf0d5f50SAlex Elder struct rbd_device *rbd_dev, 19636d2940c8SGuangliang Zhao enum obj_operation_type op_type, 1964deb236b3SIlya Dryomov unsigned int num_ops, 1965430c28c3SAlex Elder struct rbd_obj_request *obj_request) 1966bf0d5f50SAlex Elder { 1967bf0d5f50SAlex Elder struct ceph_snap_context *snapc = NULL; 1968bf0d5f50SAlex Elder 196990e98c52SGuangliang Zhao if (obj_request_img_data_test(obj_request) && 197090e98c52SGuangliang Zhao (op_type == OBJ_OP_DISCARD || op_type == OBJ_OP_WRITE)) { 19716365d33aSAlex Elder struct rbd_img_request *img_request = obj_request->img_request; 197290e98c52SGuangliang Zhao if (op_type == OBJ_OP_WRITE) { 19736d2940c8SGuangliang Zhao rbd_assert(img_request_write_test(img_request)); 197490e98c52SGuangliang Zhao } else { 197590e98c52SGuangliang Zhao rbd_assert(img_request_discard_test(img_request)); 197690e98c52SGuangliang Zhao } 1977bf0d5f50SAlex Elder snapc = img_request->snapc; 1978bf0d5f50SAlex Elder } 1979bf0d5f50SAlex Elder 19806d2940c8SGuangliang Zhao rbd_assert(num_ops == 1 || ((op_type == OBJ_OP_WRITE) && num_ops == 2)); 1981deb236b3SIlya Dryomov 1982bc81207eSIlya Dryomov return __rbd_osd_req_create(rbd_dev, snapc, num_ops, 1983bc81207eSIlya Dryomov (op_type == OBJ_OP_WRITE || op_type == OBJ_OP_DISCARD) ? 1984bc81207eSIlya Dryomov CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK : CEPH_OSD_FLAG_READ, 1985bc81207eSIlya Dryomov obj_request); 1986bf0d5f50SAlex Elder } 1987bf0d5f50SAlex Elder 19880eefd470SAlex Elder /* 1989d3246fb0SJosh Durgin * Create a copyup osd request based on the information in the object 1990d3246fb0SJosh Durgin * request supplied. A copyup request has two or three osd ops, a 1991d3246fb0SJosh Durgin * copyup method call, potentially a hint op, and a write or truncate 1992d3246fb0SJosh Durgin * or zero op. 
19930eefd470SAlex Elder */ 19940eefd470SAlex Elder static struct ceph_osd_request * 19950eefd470SAlex Elder rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request) 19960eefd470SAlex Elder { 19970eefd470SAlex Elder struct rbd_img_request *img_request; 1998d3246fb0SJosh Durgin int num_osd_ops = 3; 19990eefd470SAlex Elder 20000eefd470SAlex Elder rbd_assert(obj_request_img_data_test(obj_request)); 20010eefd470SAlex Elder img_request = obj_request->img_request; 20020eefd470SAlex Elder rbd_assert(img_request); 2003d3246fb0SJosh Durgin rbd_assert(img_request_write_test(img_request) || 2004d3246fb0SJosh Durgin img_request_discard_test(img_request)); 20050eefd470SAlex Elder 2006d3246fb0SJosh Durgin if (img_request_discard_test(img_request)) 2007d3246fb0SJosh Durgin num_osd_ops = 2; 2008d3246fb0SJosh Durgin 2009bc81207eSIlya Dryomov return __rbd_osd_req_create(img_request->rbd_dev, 2010bc81207eSIlya Dryomov img_request->snapc, num_osd_ops, 2011bc81207eSIlya Dryomov CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK, 2012bc81207eSIlya Dryomov obj_request); 20130eefd470SAlex Elder } 20140eefd470SAlex Elder 2015bf0d5f50SAlex Elder static void rbd_osd_req_destroy(struct ceph_osd_request *osd_req) 2016bf0d5f50SAlex Elder { 2017bf0d5f50SAlex Elder ceph_osdc_put_request(osd_req); 2018bf0d5f50SAlex Elder } 2019bf0d5f50SAlex Elder 20206c696d85SIlya Dryomov static struct rbd_obj_request * 20216c696d85SIlya Dryomov rbd_obj_request_create(enum obj_request_type type) 2022bf0d5f50SAlex Elder { 2023bf0d5f50SAlex Elder struct rbd_obj_request *obj_request; 2024bf0d5f50SAlex Elder 2025bf0d5f50SAlex Elder rbd_assert(obj_request_type_valid(type)); 2026bf0d5f50SAlex Elder 20275a60e876SIlya Dryomov obj_request = kmem_cache_zalloc(rbd_obj_request_cache, GFP_NOIO); 20286c696d85SIlya Dryomov if (!obj_request) 2029f907ad55SAlex Elder return NULL; 2030f907ad55SAlex Elder 2031bf0d5f50SAlex Elder obj_request->which = BAD_WHICH; 2032bf0d5f50SAlex Elder obj_request->type = type; 2033bf0d5f50SAlex Elder 
INIT_LIST_HEAD(&obj_request->links); 2034788e2df3SAlex Elder init_completion(&obj_request->completion); 2035bf0d5f50SAlex Elder kref_init(&obj_request->kref); 2036bf0d5f50SAlex Elder 203767e2b652SIlya Dryomov dout("%s %p\n", __func__, obj_request); 2038bf0d5f50SAlex Elder return obj_request; 2039bf0d5f50SAlex Elder } 2040bf0d5f50SAlex Elder 2041bf0d5f50SAlex Elder static void rbd_obj_request_destroy(struct kref *kref) 2042bf0d5f50SAlex Elder { 2043bf0d5f50SAlex Elder struct rbd_obj_request *obj_request; 2044bf0d5f50SAlex Elder 2045bf0d5f50SAlex Elder obj_request = container_of(kref, struct rbd_obj_request, kref); 2046bf0d5f50SAlex Elder 204737206ee5SAlex Elder dout("%s: obj %p\n", __func__, obj_request); 204837206ee5SAlex Elder 2049bf0d5f50SAlex Elder rbd_assert(obj_request->img_request == NULL); 2050bf0d5f50SAlex Elder rbd_assert(obj_request->which == BAD_WHICH); 2051bf0d5f50SAlex Elder 2052bf0d5f50SAlex Elder if (obj_request->osd_req) 2053bf0d5f50SAlex Elder rbd_osd_req_destroy(obj_request->osd_req); 2054bf0d5f50SAlex Elder 2055bf0d5f50SAlex Elder rbd_assert(obj_request_type_valid(obj_request->type)); 2056bf0d5f50SAlex Elder switch (obj_request->type) { 20579969ebc5SAlex Elder case OBJ_REQUEST_NODATA: 20589969ebc5SAlex Elder break; /* Nothing to do */ 2059bf0d5f50SAlex Elder case OBJ_REQUEST_BIO: 2060bf0d5f50SAlex Elder if (obj_request->bio_list) 2061bf0d5f50SAlex Elder bio_chain_put(obj_request->bio_list); 2062bf0d5f50SAlex Elder break; 2063788e2df3SAlex Elder case OBJ_REQUEST_PAGES: 206404dc923cSIlya Dryomov /* img_data requests don't own their page array */ 206504dc923cSIlya Dryomov if (obj_request->pages && 206604dc923cSIlya Dryomov !obj_request_img_data_test(obj_request)) 2067788e2df3SAlex Elder ceph_release_page_vector(obj_request->pages, 2068788e2df3SAlex Elder obj_request->page_count); 2069788e2df3SAlex Elder break; 2070bf0d5f50SAlex Elder } 2071bf0d5f50SAlex Elder 2072868311b1SAlex Elder kmem_cache_free(rbd_obj_request_cache, obj_request); 
2073bf0d5f50SAlex Elder } 2074bf0d5f50SAlex Elder 2075fb65d228SAlex Elder /* It's OK to call this for a device with no parent */ 2076fb65d228SAlex Elder 2077fb65d228SAlex Elder static void rbd_spec_put(struct rbd_spec *spec); 2078fb65d228SAlex Elder static void rbd_dev_unparent(struct rbd_device *rbd_dev) 2079fb65d228SAlex Elder { 2080fb65d228SAlex Elder rbd_dev_remove_parent(rbd_dev); 2081fb65d228SAlex Elder rbd_spec_put(rbd_dev->parent_spec); 2082fb65d228SAlex Elder rbd_dev->parent_spec = NULL; 2083fb65d228SAlex Elder rbd_dev->parent_overlap = 0; 2084fb65d228SAlex Elder } 2085fb65d228SAlex Elder 2086bf0d5f50SAlex Elder /* 2087a2acd00eSAlex Elder * Parent image reference counting is used to determine when an 2088a2acd00eSAlex Elder * image's parent fields can be safely torn down--after there are no 2089a2acd00eSAlex Elder * more in-flight requests to the parent image. When the last 2090a2acd00eSAlex Elder * reference is dropped, cleaning them up is safe. 2091a2acd00eSAlex Elder */ 2092a2acd00eSAlex Elder static void rbd_dev_parent_put(struct rbd_device *rbd_dev) 2093a2acd00eSAlex Elder { 2094a2acd00eSAlex Elder int counter; 2095a2acd00eSAlex Elder 2096a2acd00eSAlex Elder if (!rbd_dev->parent_spec) 2097a2acd00eSAlex Elder return; 2098a2acd00eSAlex Elder 2099a2acd00eSAlex Elder counter = atomic_dec_return_safe(&rbd_dev->parent_ref); 2100a2acd00eSAlex Elder if (counter > 0) 2101a2acd00eSAlex Elder return; 2102a2acd00eSAlex Elder 2103a2acd00eSAlex Elder /* Last reference; clean up parent data structures */ 2104a2acd00eSAlex Elder 2105a2acd00eSAlex Elder if (!counter) 2106a2acd00eSAlex Elder rbd_dev_unparent(rbd_dev); 2107a2acd00eSAlex Elder else 21089584d508SIlya Dryomov rbd_warn(rbd_dev, "parent reference underflow"); 2109a2acd00eSAlex Elder } 2110a2acd00eSAlex Elder 2111a2acd00eSAlex Elder /* 2112a2acd00eSAlex Elder * If an image has a non-zero parent overlap, get a reference to its 2113a2acd00eSAlex Elder * parent. 
2114a2acd00eSAlex Elder * 2115a2acd00eSAlex Elder * Returns true if the rbd device has a parent with a non-zero 2116a2acd00eSAlex Elder * overlap and a reference for it was successfully taken, or 2117a2acd00eSAlex Elder * false otherwise. 2118a2acd00eSAlex Elder */ 2119a2acd00eSAlex Elder static bool rbd_dev_parent_get(struct rbd_device *rbd_dev) 2120a2acd00eSAlex Elder { 2121ae43e9d0SIlya Dryomov int counter = 0; 2122a2acd00eSAlex Elder 2123a2acd00eSAlex Elder if (!rbd_dev->parent_spec) 2124a2acd00eSAlex Elder return false; 2125a2acd00eSAlex Elder 2126ae43e9d0SIlya Dryomov down_read(&rbd_dev->header_rwsem); 2127ae43e9d0SIlya Dryomov if (rbd_dev->parent_overlap) 2128a2acd00eSAlex Elder counter = atomic_inc_return_safe(&rbd_dev->parent_ref); 2129ae43e9d0SIlya Dryomov up_read(&rbd_dev->header_rwsem); 2130a2acd00eSAlex Elder 2131a2acd00eSAlex Elder if (counter < 0) 21329584d508SIlya Dryomov rbd_warn(rbd_dev, "parent reference overflow"); 2133a2acd00eSAlex Elder 2134ae43e9d0SIlya Dryomov return counter > 0; 2135a2acd00eSAlex Elder } 2136a2acd00eSAlex Elder 2137bf0d5f50SAlex Elder /* 2138bf0d5f50SAlex Elder * Caller is responsible for filling in the list of object requests 2139bf0d5f50SAlex Elder * that comprises the image request, and the Linux request pointer 2140bf0d5f50SAlex Elder * (if there is one). 
2141bf0d5f50SAlex Elder */ 2142cc344fa1SAlex Elder static struct rbd_img_request *rbd_img_request_create( 2143cc344fa1SAlex Elder struct rbd_device *rbd_dev, 2144bf0d5f50SAlex Elder u64 offset, u64 length, 21456d2940c8SGuangliang Zhao enum obj_operation_type op_type, 21464e752f0aSJosh Durgin struct ceph_snap_context *snapc) 2147bf0d5f50SAlex Elder { 2148bf0d5f50SAlex Elder struct rbd_img_request *img_request; 2149bf0d5f50SAlex Elder 21507a716aacSIlya Dryomov img_request = kmem_cache_alloc(rbd_img_request_cache, GFP_NOIO); 2151bf0d5f50SAlex Elder if (!img_request) 2152bf0d5f50SAlex Elder return NULL; 2153bf0d5f50SAlex Elder 2154bf0d5f50SAlex Elder img_request->rq = NULL; 2155bf0d5f50SAlex Elder img_request->rbd_dev = rbd_dev; 2156bf0d5f50SAlex Elder img_request->offset = offset; 2157bf0d5f50SAlex Elder img_request->length = length; 21580c425248SAlex Elder img_request->flags = 0; 215990e98c52SGuangliang Zhao if (op_type == OBJ_OP_DISCARD) { 216090e98c52SGuangliang Zhao img_request_discard_set(img_request); 216190e98c52SGuangliang Zhao img_request->snapc = snapc; 216290e98c52SGuangliang Zhao } else if (op_type == OBJ_OP_WRITE) { 21630c425248SAlex Elder img_request_write_set(img_request); 21644e752f0aSJosh Durgin img_request->snapc = snapc; 21650c425248SAlex Elder } else { 2166bf0d5f50SAlex Elder img_request->snap_id = rbd_dev->spec->snap_id; 21670c425248SAlex Elder } 2168a2acd00eSAlex Elder if (rbd_dev_parent_get(rbd_dev)) 2169d0b2e944SAlex Elder img_request_layered_set(img_request); 2170bf0d5f50SAlex Elder spin_lock_init(&img_request->completion_lock); 2171bf0d5f50SAlex Elder img_request->next_completion = 0; 2172bf0d5f50SAlex Elder img_request->callback = NULL; 2173a5a337d4SAlex Elder img_request->result = 0; 2174bf0d5f50SAlex Elder img_request->obj_request_count = 0; 2175bf0d5f50SAlex Elder INIT_LIST_HEAD(&img_request->obj_requests); 2176bf0d5f50SAlex Elder kref_init(&img_request->kref); 2177bf0d5f50SAlex Elder 217837206ee5SAlex Elder dout("%s: rbd_dev %p %s 
%llu/%llu -> img %p\n", __func__, rbd_dev, 21796d2940c8SGuangliang Zhao obj_op_name(op_type), offset, length, img_request); 218037206ee5SAlex Elder 2181bf0d5f50SAlex Elder return img_request; 2182bf0d5f50SAlex Elder } 2183bf0d5f50SAlex Elder 2184bf0d5f50SAlex Elder static void rbd_img_request_destroy(struct kref *kref) 2185bf0d5f50SAlex Elder { 2186bf0d5f50SAlex Elder struct rbd_img_request *img_request; 2187bf0d5f50SAlex Elder struct rbd_obj_request *obj_request; 2188bf0d5f50SAlex Elder struct rbd_obj_request *next_obj_request; 2189bf0d5f50SAlex Elder 2190bf0d5f50SAlex Elder img_request = container_of(kref, struct rbd_img_request, kref); 2191bf0d5f50SAlex Elder 219237206ee5SAlex Elder dout("%s: img %p\n", __func__, img_request); 219337206ee5SAlex Elder 2194bf0d5f50SAlex Elder for_each_obj_request_safe(img_request, obj_request, next_obj_request) 2195bf0d5f50SAlex Elder rbd_img_obj_request_del(img_request, obj_request); 219625dcf954SAlex Elder rbd_assert(img_request->obj_request_count == 0); 2197bf0d5f50SAlex Elder 2198a2acd00eSAlex Elder if (img_request_layered_test(img_request)) { 2199a2acd00eSAlex Elder img_request_layered_clear(img_request); 2200a2acd00eSAlex Elder rbd_dev_parent_put(img_request->rbd_dev); 2201a2acd00eSAlex Elder } 2202a2acd00eSAlex Elder 2203bef95455SJosh Durgin if (img_request_write_test(img_request) || 2204bef95455SJosh Durgin img_request_discard_test(img_request)) 2205812164f8SAlex Elder ceph_put_snap_context(img_request->snapc); 2206bf0d5f50SAlex Elder 22071c2a9dfeSAlex Elder kmem_cache_free(rbd_img_request_cache, img_request); 2208bf0d5f50SAlex Elder } 2209bf0d5f50SAlex Elder 2210e93f3152SAlex Elder static struct rbd_img_request *rbd_parent_request_create( 2211e93f3152SAlex Elder struct rbd_obj_request *obj_request, 2212e93f3152SAlex Elder u64 img_offset, u64 length) 2213e93f3152SAlex Elder { 2214e93f3152SAlex Elder struct rbd_img_request *parent_request; 2215e93f3152SAlex Elder struct rbd_device *rbd_dev; 2216e93f3152SAlex Elder 
2217e93f3152SAlex Elder rbd_assert(obj_request->img_request); 2218e93f3152SAlex Elder rbd_dev = obj_request->img_request->rbd_dev; 2219e93f3152SAlex Elder 22204e752f0aSJosh Durgin parent_request = rbd_img_request_create(rbd_dev->parent, img_offset, 22216d2940c8SGuangliang Zhao length, OBJ_OP_READ, NULL); 2222e93f3152SAlex Elder if (!parent_request) 2223e93f3152SAlex Elder return NULL; 2224e93f3152SAlex Elder 2225e93f3152SAlex Elder img_request_child_set(parent_request); 2226e93f3152SAlex Elder rbd_obj_request_get(obj_request); 2227e93f3152SAlex Elder parent_request->obj_request = obj_request; 2228e93f3152SAlex Elder 2229e93f3152SAlex Elder return parent_request; 2230e93f3152SAlex Elder } 2231e93f3152SAlex Elder 2232e93f3152SAlex Elder static void rbd_parent_request_destroy(struct kref *kref) 2233e93f3152SAlex Elder { 2234e93f3152SAlex Elder struct rbd_img_request *parent_request; 2235e93f3152SAlex Elder struct rbd_obj_request *orig_request; 2236e93f3152SAlex Elder 2237e93f3152SAlex Elder parent_request = container_of(kref, struct rbd_img_request, kref); 2238e93f3152SAlex Elder orig_request = parent_request->obj_request; 2239e93f3152SAlex Elder 2240e93f3152SAlex Elder parent_request->obj_request = NULL; 2241e93f3152SAlex Elder rbd_obj_request_put(orig_request); 2242e93f3152SAlex Elder img_request_child_clear(parent_request); 2243e93f3152SAlex Elder 2244e93f3152SAlex Elder rbd_img_request_destroy(kref); 2245e93f3152SAlex Elder } 2246e93f3152SAlex Elder 22471217857fSAlex Elder static bool rbd_img_obj_end_request(struct rbd_obj_request *obj_request) 22481217857fSAlex Elder { 22496365d33aSAlex Elder struct rbd_img_request *img_request; 22501217857fSAlex Elder unsigned int xferred; 22511217857fSAlex Elder int result; 22528b3e1a56SAlex Elder bool more; 22531217857fSAlex Elder 22546365d33aSAlex Elder rbd_assert(obj_request_img_data_test(obj_request)); 22556365d33aSAlex Elder img_request = obj_request->img_request; 22566365d33aSAlex Elder 22571217857fSAlex Elder 
rbd_assert(obj_request->xferred <= (u64)UINT_MAX); 22581217857fSAlex Elder xferred = (unsigned int)obj_request->xferred; 22591217857fSAlex Elder result = obj_request->result; 22601217857fSAlex Elder if (result) { 22611217857fSAlex Elder struct rbd_device *rbd_dev = img_request->rbd_dev; 22626d2940c8SGuangliang Zhao enum obj_operation_type op_type; 22636d2940c8SGuangliang Zhao 226490e98c52SGuangliang Zhao if (img_request_discard_test(img_request)) 226590e98c52SGuangliang Zhao op_type = OBJ_OP_DISCARD; 226690e98c52SGuangliang Zhao else if (img_request_write_test(img_request)) 226790e98c52SGuangliang Zhao op_type = OBJ_OP_WRITE; 226890e98c52SGuangliang Zhao else 226990e98c52SGuangliang Zhao op_type = OBJ_OP_READ; 22701217857fSAlex Elder 22719584d508SIlya Dryomov rbd_warn(rbd_dev, "%s %llx at %llx (%llx)", 22726d2940c8SGuangliang Zhao obj_op_name(op_type), obj_request->length, 22736d2940c8SGuangliang Zhao obj_request->img_offset, obj_request->offset); 22749584d508SIlya Dryomov rbd_warn(rbd_dev, " result %d xferred %x", 22751217857fSAlex Elder result, xferred); 22761217857fSAlex Elder if (!img_request->result) 22771217857fSAlex Elder img_request->result = result; 2278082a75daSIlya Dryomov /* 2279082a75daSIlya Dryomov * Need to end I/O on the entire obj_request worth of 2280082a75daSIlya Dryomov * bytes in case of error. 
2281082a75daSIlya Dryomov */ 2282082a75daSIlya Dryomov xferred = obj_request->length; 22831217857fSAlex Elder } 22841217857fSAlex Elder 22858b3e1a56SAlex Elder if (img_request_child_test(img_request)) { 22868b3e1a56SAlex Elder rbd_assert(img_request->obj_request != NULL); 22878b3e1a56SAlex Elder more = obj_request->which < img_request->obj_request_count - 1; 22888b3e1a56SAlex Elder } else { 22898b3e1a56SAlex Elder rbd_assert(img_request->rq != NULL); 22907ad18afaSChristoph Hellwig 22917ad18afaSChristoph Hellwig more = blk_update_request(img_request->rq, result, xferred); 22927ad18afaSChristoph Hellwig if (!more) 22937ad18afaSChristoph Hellwig __blk_mq_end_request(img_request->rq, result); 22948b3e1a56SAlex Elder } 22958b3e1a56SAlex Elder 22968b3e1a56SAlex Elder return more; 22971217857fSAlex Elder } 22981217857fSAlex Elder 22992169238dSAlex Elder static void rbd_img_obj_callback(struct rbd_obj_request *obj_request) 23002169238dSAlex Elder { 23012169238dSAlex Elder struct rbd_img_request *img_request; 23022169238dSAlex Elder u32 which = obj_request->which; 23032169238dSAlex Elder bool more = true; 23042169238dSAlex Elder 23056365d33aSAlex Elder rbd_assert(obj_request_img_data_test(obj_request)); 23062169238dSAlex Elder img_request = obj_request->img_request; 23072169238dSAlex Elder 23082169238dSAlex Elder dout("%s: img %p obj %p\n", __func__, img_request, obj_request); 23092169238dSAlex Elder rbd_assert(img_request != NULL); 23102169238dSAlex Elder rbd_assert(img_request->obj_request_count > 0); 23112169238dSAlex Elder rbd_assert(which != BAD_WHICH); 23122169238dSAlex Elder rbd_assert(which < img_request->obj_request_count); 23132169238dSAlex Elder 23142169238dSAlex Elder spin_lock_irq(&img_request->completion_lock); 23152169238dSAlex Elder if (which != img_request->next_completion) 23162169238dSAlex Elder goto out; 23172169238dSAlex Elder 23182169238dSAlex Elder for_each_obj_request_from(img_request, obj_request) { 23192169238dSAlex Elder rbd_assert(more); 
23202169238dSAlex Elder rbd_assert(which < img_request->obj_request_count); 23212169238dSAlex Elder 23222169238dSAlex Elder if (!obj_request_done_test(obj_request)) 23232169238dSAlex Elder break; 23241217857fSAlex Elder more = rbd_img_obj_end_request(obj_request); 23252169238dSAlex Elder which++; 23262169238dSAlex Elder } 23272169238dSAlex Elder 23282169238dSAlex Elder rbd_assert(more ^ (which == img_request->obj_request_count)); 23292169238dSAlex Elder img_request->next_completion = which; 23302169238dSAlex Elder out: 23312169238dSAlex Elder spin_unlock_irq(&img_request->completion_lock); 23320f2d5be7SAlex Elder rbd_img_request_put(img_request); 23332169238dSAlex Elder 23342169238dSAlex Elder if (!more) 23352169238dSAlex Elder rbd_img_request_complete(img_request); 23362169238dSAlex Elder } 23372169238dSAlex Elder 2338f1a4739fSAlex Elder /* 23393b434a2aSJosh Durgin * Add individual osd ops to the given ceph_osd_request and prepare 23403b434a2aSJosh Durgin * them for submission. num_ops is the current number of 23413b434a2aSJosh Durgin * osd operations already to the object request. 
23423b434a2aSJosh Durgin */ 23433b434a2aSJosh Durgin static void rbd_img_obj_request_fill(struct rbd_obj_request *obj_request, 23443b434a2aSJosh Durgin struct ceph_osd_request *osd_request, 23453b434a2aSJosh Durgin enum obj_operation_type op_type, 23463b434a2aSJosh Durgin unsigned int num_ops) 23473b434a2aSJosh Durgin { 23483b434a2aSJosh Durgin struct rbd_img_request *img_request = obj_request->img_request; 23493b434a2aSJosh Durgin struct rbd_device *rbd_dev = img_request->rbd_dev; 23503b434a2aSJosh Durgin u64 object_size = rbd_obj_bytes(&rbd_dev->header); 23513b434a2aSJosh Durgin u64 offset = obj_request->offset; 23523b434a2aSJosh Durgin u64 length = obj_request->length; 23533b434a2aSJosh Durgin u64 img_end; 23543b434a2aSJosh Durgin u16 opcode; 23553b434a2aSJosh Durgin 23563b434a2aSJosh Durgin if (op_type == OBJ_OP_DISCARD) { 2357d3246fb0SJosh Durgin if (!offset && length == object_size && 2358d3246fb0SJosh Durgin (!img_request_layered_test(img_request) || 2359d3246fb0SJosh Durgin !obj_request_overlaps_parent(obj_request))) { 23603b434a2aSJosh Durgin opcode = CEPH_OSD_OP_DELETE; 23613b434a2aSJosh Durgin } else if ((offset + length == object_size)) { 23623b434a2aSJosh Durgin opcode = CEPH_OSD_OP_TRUNCATE; 23633b434a2aSJosh Durgin } else { 23643b434a2aSJosh Durgin down_read(&rbd_dev->header_rwsem); 23653b434a2aSJosh Durgin img_end = rbd_dev->header.image_size; 23663b434a2aSJosh Durgin up_read(&rbd_dev->header_rwsem); 23673b434a2aSJosh Durgin 23683b434a2aSJosh Durgin if (obj_request->img_offset + length == img_end) 23693b434a2aSJosh Durgin opcode = CEPH_OSD_OP_TRUNCATE; 23703b434a2aSJosh Durgin else 23713b434a2aSJosh Durgin opcode = CEPH_OSD_OP_ZERO; 23723b434a2aSJosh Durgin } 23733b434a2aSJosh Durgin } else if (op_type == OBJ_OP_WRITE) { 2374e30b7577SIlya Dryomov if (!offset && length == object_size) 2375e30b7577SIlya Dryomov opcode = CEPH_OSD_OP_WRITEFULL; 2376e30b7577SIlya Dryomov else 23773b434a2aSJosh Durgin opcode = CEPH_OSD_OP_WRITE; 23783b434a2aSJosh Durgin 
osd_req_op_alloc_hint_init(osd_request, num_ops, 23793b434a2aSJosh Durgin object_size, object_size); 23803b434a2aSJosh Durgin num_ops++; 23813b434a2aSJosh Durgin } else { 23823b434a2aSJosh Durgin opcode = CEPH_OSD_OP_READ; 23833b434a2aSJosh Durgin } 23843b434a2aSJosh Durgin 23857e868b6eSIlya Dryomov if (opcode == CEPH_OSD_OP_DELETE) 2386144cba14SYan, Zheng osd_req_op_init(osd_request, num_ops, opcode, 0); 23877e868b6eSIlya Dryomov else 23887e868b6eSIlya Dryomov osd_req_op_extent_init(osd_request, num_ops, opcode, 23897e868b6eSIlya Dryomov offset, length, 0, 0); 23907e868b6eSIlya Dryomov 23913b434a2aSJosh Durgin if (obj_request->type == OBJ_REQUEST_BIO) 23923b434a2aSJosh Durgin osd_req_op_extent_osd_data_bio(osd_request, num_ops, 23933b434a2aSJosh Durgin obj_request->bio_list, length); 23943b434a2aSJosh Durgin else if (obj_request->type == OBJ_REQUEST_PAGES) 23953b434a2aSJosh Durgin osd_req_op_extent_osd_data_pages(osd_request, num_ops, 23963b434a2aSJosh Durgin obj_request->pages, length, 23973b434a2aSJosh Durgin offset & ~PAGE_MASK, false, false); 23983b434a2aSJosh Durgin 23993b434a2aSJosh Durgin /* Discards are also writes */ 24003b434a2aSJosh Durgin if (op_type == OBJ_OP_WRITE || op_type == OBJ_OP_DISCARD) 24013b434a2aSJosh Durgin rbd_osd_req_format_write(obj_request); 24023b434a2aSJosh Durgin else 24033b434a2aSJosh Durgin rbd_osd_req_format_read(obj_request); 24043b434a2aSJosh Durgin } 24053b434a2aSJosh Durgin 24063b434a2aSJosh Durgin /* 2407f1a4739fSAlex Elder * Split up an image request into one or more object requests, each 2408f1a4739fSAlex Elder * to a different object. The "type" parameter indicates whether 2409f1a4739fSAlex Elder * "data_desc" is the pointer to the head of a list of bio 2410f1a4739fSAlex Elder * structures, or the base of a page array. In either case this 2411f1a4739fSAlex Elder * function assumes data_desc describes memory sufficient to hold 2412f1a4739fSAlex Elder * all data described by the image request. 
2413f1a4739fSAlex Elder */ 2414f1a4739fSAlex Elder static int rbd_img_request_fill(struct rbd_img_request *img_request, 2415f1a4739fSAlex Elder enum obj_request_type type, 2416f1a4739fSAlex Elder void *data_desc) 2417bf0d5f50SAlex Elder { 2418bf0d5f50SAlex Elder struct rbd_device *rbd_dev = img_request->rbd_dev; 2419bf0d5f50SAlex Elder struct rbd_obj_request *obj_request = NULL; 2420bf0d5f50SAlex Elder struct rbd_obj_request *next_obj_request; 2421a158073cSJingoo Han struct bio *bio_list = NULL; 2422f1a4739fSAlex Elder unsigned int bio_offset = 0; 2423a158073cSJingoo Han struct page **pages = NULL; 24246d2940c8SGuangliang Zhao enum obj_operation_type op_type; 24257da22d29SAlex Elder u64 img_offset; 2426bf0d5f50SAlex Elder u64 resid; 2427bf0d5f50SAlex Elder 2428f1a4739fSAlex Elder dout("%s: img %p type %d data_desc %p\n", __func__, img_request, 2429f1a4739fSAlex Elder (int)type, data_desc); 243037206ee5SAlex Elder 24317da22d29SAlex Elder img_offset = img_request->offset; 2432bf0d5f50SAlex Elder resid = img_request->length; 24334dda41d3SAlex Elder rbd_assert(resid > 0); 24343b434a2aSJosh Durgin op_type = rbd_img_request_op_type(img_request); 2435f1a4739fSAlex Elder 2436f1a4739fSAlex Elder if (type == OBJ_REQUEST_BIO) { 2437f1a4739fSAlex Elder bio_list = data_desc; 24384f024f37SKent Overstreet rbd_assert(img_offset == 24394f024f37SKent Overstreet bio_list->bi_iter.bi_sector << SECTOR_SHIFT); 244090e98c52SGuangliang Zhao } else if (type == OBJ_REQUEST_PAGES) { 2441f1a4739fSAlex Elder pages = data_desc; 2442f1a4739fSAlex Elder } 2443f1a4739fSAlex Elder 2444bf0d5f50SAlex Elder while (resid) { 24452fa12320SAlex Elder struct ceph_osd_request *osd_req; 2446a90bb0c1SIlya Dryomov u64 object_no = img_offset >> rbd_dev->header.obj_order; 244767e2b652SIlya Dryomov u64 offset = rbd_segment_offset(rbd_dev, img_offset); 244867e2b652SIlya Dryomov u64 length = rbd_segment_length(rbd_dev, img_offset, resid); 2449bf0d5f50SAlex Elder 24506c696d85SIlya Dryomov obj_request = 
rbd_obj_request_create(type); 2451bf0d5f50SAlex Elder if (!obj_request) 2452bf0d5f50SAlex Elder goto out_unwind; 245362054da6SIlya Dryomov 2454a90bb0c1SIlya Dryomov obj_request->object_no = object_no; 245567e2b652SIlya Dryomov obj_request->offset = offset; 245667e2b652SIlya Dryomov obj_request->length = length; 245767e2b652SIlya Dryomov 245803507db6SJosh Durgin /* 245903507db6SJosh Durgin * set obj_request->img_request before creating the 246003507db6SJosh Durgin * osd_request so that it gets the right snapc 246103507db6SJosh Durgin */ 246203507db6SJosh Durgin rbd_img_obj_request_add(img_request, obj_request); 2463bf0d5f50SAlex Elder 2464f1a4739fSAlex Elder if (type == OBJ_REQUEST_BIO) { 2465f1a4739fSAlex Elder unsigned int clone_size; 2466f1a4739fSAlex Elder 2467bf0d5f50SAlex Elder rbd_assert(length <= (u64)UINT_MAX); 2468bf0d5f50SAlex Elder clone_size = (unsigned int)length; 2469f1a4739fSAlex Elder obj_request->bio_list = 2470f1a4739fSAlex Elder bio_chain_clone_range(&bio_list, 2471f1a4739fSAlex Elder &bio_offset, 2472f1a4739fSAlex Elder clone_size, 24732224d879SDavid Disseldorp GFP_NOIO); 2474bf0d5f50SAlex Elder if (!obj_request->bio_list) 247562054da6SIlya Dryomov goto out_unwind; 247690e98c52SGuangliang Zhao } else if (type == OBJ_REQUEST_PAGES) { 2477f1a4739fSAlex Elder unsigned int page_count; 2478f1a4739fSAlex Elder 2479f1a4739fSAlex Elder obj_request->pages = pages; 2480f1a4739fSAlex Elder page_count = (u32)calc_pages_for(offset, length); 2481f1a4739fSAlex Elder obj_request->page_count = page_count; 2482f1a4739fSAlex Elder if ((offset + length) & ~PAGE_MASK) 2483f1a4739fSAlex Elder page_count--; /* more on last page */ 2484f1a4739fSAlex Elder pages += page_count; 2485f1a4739fSAlex Elder } 2486bf0d5f50SAlex Elder 24876d2940c8SGuangliang Zhao osd_req = rbd_osd_req_create(rbd_dev, op_type, 24886d2940c8SGuangliang Zhao (op_type == OBJ_OP_WRITE) ? 
2 : 1, 24892fa12320SAlex Elder obj_request); 24902fa12320SAlex Elder if (!osd_req) 249162054da6SIlya Dryomov goto out_unwind; 24923b434a2aSJosh Durgin 24932fa12320SAlex Elder obj_request->osd_req = osd_req; 24942169238dSAlex Elder obj_request->callback = rbd_img_obj_callback; 24957da22d29SAlex Elder obj_request->img_offset = img_offset; 2496bf0d5f50SAlex Elder 24973b434a2aSJosh Durgin rbd_img_obj_request_fill(obj_request, osd_req, op_type, 0); 24983b434a2aSJosh Durgin 24997da22d29SAlex Elder img_offset += length; 2500bf0d5f50SAlex Elder resid -= length; 2501bf0d5f50SAlex Elder } 2502bf0d5f50SAlex Elder 2503bf0d5f50SAlex Elder return 0; 2504bf0d5f50SAlex Elder 2505bf0d5f50SAlex Elder out_unwind: 2506bf0d5f50SAlex Elder for_each_obj_request_safe(img_request, obj_request, next_obj_request) 250742dd037cSIlya Dryomov rbd_img_obj_request_del(img_request, obj_request); 2508bf0d5f50SAlex Elder 2509bf0d5f50SAlex Elder return -ENOMEM; 2510bf0d5f50SAlex Elder } 2511bf0d5f50SAlex Elder 25123d7efd18SAlex Elder static void 25132761713dSIlya Dryomov rbd_osd_copyup_callback(struct rbd_obj_request *obj_request) 25140eefd470SAlex Elder { 25150eefd470SAlex Elder struct rbd_img_request *img_request; 25160eefd470SAlex Elder struct rbd_device *rbd_dev; 2517ebda6408SAlex Elder struct page **pages; 25180eefd470SAlex Elder u32 page_count; 25190eefd470SAlex Elder 25202761713dSIlya Dryomov dout("%s: obj %p\n", __func__, obj_request); 25212761713dSIlya Dryomov 2522d3246fb0SJosh Durgin rbd_assert(obj_request->type == OBJ_REQUEST_BIO || 2523d3246fb0SJosh Durgin obj_request->type == OBJ_REQUEST_NODATA); 25240eefd470SAlex Elder rbd_assert(obj_request_img_data_test(obj_request)); 25250eefd470SAlex Elder img_request = obj_request->img_request; 25260eefd470SAlex Elder rbd_assert(img_request); 25270eefd470SAlex Elder 25280eefd470SAlex Elder rbd_dev = img_request->rbd_dev; 25290eefd470SAlex Elder rbd_assert(rbd_dev); 25300eefd470SAlex Elder 2531ebda6408SAlex Elder pages = obj_request->copyup_pages; 
2532ebda6408SAlex Elder rbd_assert(pages != NULL); 25330eefd470SAlex Elder obj_request->copyup_pages = NULL; 2534ebda6408SAlex Elder page_count = obj_request->copyup_page_count; 2535ebda6408SAlex Elder rbd_assert(page_count); 2536ebda6408SAlex Elder obj_request->copyup_page_count = 0; 2537ebda6408SAlex Elder ceph_release_page_vector(pages, page_count); 25380eefd470SAlex Elder 25390eefd470SAlex Elder /* 25400eefd470SAlex Elder * We want the transfer count to reflect the size of the 25410eefd470SAlex Elder * original write request. There is no such thing as a 25420eefd470SAlex Elder * successful short write, so if the request was successful 25430eefd470SAlex Elder * we can just set it to the originally-requested length. 25440eefd470SAlex Elder */ 25450eefd470SAlex Elder if (!obj_request->result) 25460eefd470SAlex Elder obj_request->xferred = obj_request->length; 25470eefd470SAlex Elder 25482761713dSIlya Dryomov obj_request_done_set(obj_request); 25490eefd470SAlex Elder } 25500eefd470SAlex Elder 25510eefd470SAlex Elder static void 25523d7efd18SAlex Elder rbd_img_obj_parent_read_full_callback(struct rbd_img_request *img_request) 25533d7efd18SAlex Elder { 25543d7efd18SAlex Elder struct rbd_obj_request *orig_request; 25550eefd470SAlex Elder struct ceph_osd_request *osd_req; 25560eefd470SAlex Elder struct rbd_device *rbd_dev; 25573d7efd18SAlex Elder struct page **pages; 2558d3246fb0SJosh Durgin enum obj_operation_type op_type; 2559ebda6408SAlex Elder u32 page_count; 2560bbea1c1aSAlex Elder int img_result; 2561ebda6408SAlex Elder u64 parent_length; 25623d7efd18SAlex Elder 25633d7efd18SAlex Elder rbd_assert(img_request_child_test(img_request)); 25643d7efd18SAlex Elder 25653d7efd18SAlex Elder /* First get what we need from the image request */ 25663d7efd18SAlex Elder 25673d7efd18SAlex Elder pages = img_request->copyup_pages; 25683d7efd18SAlex Elder rbd_assert(pages != NULL); 25693d7efd18SAlex Elder img_request->copyup_pages = NULL; 2570ebda6408SAlex Elder page_count = 
img_request->copyup_page_count; 2571ebda6408SAlex Elder rbd_assert(page_count); 2572ebda6408SAlex Elder img_request->copyup_page_count = 0; 25733d7efd18SAlex Elder 25743d7efd18SAlex Elder orig_request = img_request->obj_request; 25753d7efd18SAlex Elder rbd_assert(orig_request != NULL); 2576b91f09f1SAlex Elder rbd_assert(obj_request_type_valid(orig_request->type)); 2577bbea1c1aSAlex Elder img_result = img_request->result; 2578ebda6408SAlex Elder parent_length = img_request->length; 2579fa355112SIlya Dryomov rbd_assert(img_result || parent_length == img_request->xferred); 25803d7efd18SAlex Elder rbd_img_request_put(img_request); 25813d7efd18SAlex Elder 258291c6febbSAlex Elder rbd_assert(orig_request->img_request); 258391c6febbSAlex Elder rbd_dev = orig_request->img_request->rbd_dev; 25843d7efd18SAlex Elder rbd_assert(rbd_dev); 25853d7efd18SAlex Elder 2586bbea1c1aSAlex Elder /* 2587bbea1c1aSAlex Elder * If the overlap has become 0 (most likely because the 2588bbea1c1aSAlex Elder * image has been flattened) we need to free the pages 2589bbea1c1aSAlex Elder * and re-submit the original write request. 2590bbea1c1aSAlex Elder */ 2591bbea1c1aSAlex Elder if (!rbd_dev->parent_overlap) { 2592bbea1c1aSAlex Elder ceph_release_page_vector(pages, page_count); 2593980917fcSIlya Dryomov rbd_obj_request_submit(orig_request); 2594bbea1c1aSAlex Elder return; 2595bbea1c1aSAlex Elder } 2596bbea1c1aSAlex Elder 2597bbea1c1aSAlex Elder if (img_result) 25980eefd470SAlex Elder goto out_err; 25993d7efd18SAlex Elder 26008785b1d4SAlex Elder /* 26018785b1d4SAlex Elder * The original osd request is of no use to use any more. 26020ccd5926SIlya Dryomov * We need a new one that can hold the three ops in a copyup 26038785b1d4SAlex Elder * request. Allocate the new copyup osd request for the 26048785b1d4SAlex Elder * original request, and release the old one. 
26058785b1d4SAlex Elder */ 2606bbea1c1aSAlex Elder img_result = -ENOMEM; 26070eefd470SAlex Elder osd_req = rbd_osd_req_create_copyup(orig_request); 26080eefd470SAlex Elder if (!osd_req) 26090eefd470SAlex Elder goto out_err; 26108785b1d4SAlex Elder rbd_osd_req_destroy(orig_request->osd_req); 26110eefd470SAlex Elder orig_request->osd_req = osd_req; 26120eefd470SAlex Elder orig_request->copyup_pages = pages; 2613ebda6408SAlex Elder orig_request->copyup_page_count = page_count; 26143d7efd18SAlex Elder 26150eefd470SAlex Elder /* Initialize the copyup op */ 26160eefd470SAlex Elder 26170eefd470SAlex Elder osd_req_op_cls_init(osd_req, 0, CEPH_OSD_OP_CALL, "rbd", "copyup"); 2618ebda6408SAlex Elder osd_req_op_cls_request_data_pages(osd_req, 0, pages, parent_length, 0, 26190eefd470SAlex Elder false, false); 26200eefd470SAlex Elder 2621d3246fb0SJosh Durgin /* Add the other op(s) */ 26220ccd5926SIlya Dryomov 2623d3246fb0SJosh Durgin op_type = rbd_img_request_op_type(orig_request->img_request); 2624d3246fb0SJosh Durgin rbd_img_obj_request_fill(orig_request, osd_req, op_type, 1); 26250eefd470SAlex Elder 26260eefd470SAlex Elder /* All set, send it off. */ 26270eefd470SAlex Elder 2628980917fcSIlya Dryomov rbd_obj_request_submit(orig_request); 26290eefd470SAlex Elder return; 26300eefd470SAlex Elder 26310eefd470SAlex Elder out_err: 2632fa355112SIlya Dryomov ceph_release_page_vector(pages, page_count); 26330dcc685eSIlya Dryomov rbd_obj_request_error(orig_request, img_result); 26343d7efd18SAlex Elder } 26353d7efd18SAlex Elder 26363d7efd18SAlex Elder /* 26373d7efd18SAlex Elder * Read from the parent image the range of data that covers the 26383d7efd18SAlex Elder * entire target of the given object request. This is used for 26393d7efd18SAlex Elder * satisfying a layered image write request when the target of an 26403d7efd18SAlex Elder * object request from the image request does not exist. 
26413d7efd18SAlex Elder * 26423d7efd18SAlex Elder * A page array big enough to hold the returned data is allocated 26433d7efd18SAlex Elder * and supplied to rbd_img_request_fill() as the "data descriptor." 26443d7efd18SAlex Elder * When the read completes, this page array will be transferred to 26453d7efd18SAlex Elder * the original object request for the copyup operation. 26463d7efd18SAlex Elder * 2647c2e82414SIlya Dryomov * If an error occurs, it is recorded as the result of the original 2648c2e82414SIlya Dryomov * object request in rbd_img_obj_exists_callback(). 26493d7efd18SAlex Elder */ 26503d7efd18SAlex Elder static int rbd_img_obj_parent_read_full(struct rbd_obj_request *obj_request) 26513d7efd18SAlex Elder { 2652058aa991SIlya Dryomov struct rbd_device *rbd_dev = obj_request->img_request->rbd_dev; 26533d7efd18SAlex Elder struct rbd_img_request *parent_request = NULL; 26543d7efd18SAlex Elder u64 img_offset; 26553d7efd18SAlex Elder u64 length; 26563d7efd18SAlex Elder struct page **pages = NULL; 26573d7efd18SAlex Elder u32 page_count; 26583d7efd18SAlex Elder int result; 26593d7efd18SAlex Elder 26603d7efd18SAlex Elder rbd_assert(rbd_dev->parent != NULL); 26613d7efd18SAlex Elder 26623d7efd18SAlex Elder /* 26633d7efd18SAlex Elder * Determine the byte range covered by the object in the 26643d7efd18SAlex Elder * child image to which the original request was to be sent. 26653d7efd18SAlex Elder */ 26663d7efd18SAlex Elder img_offset = obj_request->img_offset - obj_request->offset; 26675bc3fb17SIlya Dryomov length = rbd_obj_bytes(&rbd_dev->header); 26683d7efd18SAlex Elder 26693d7efd18SAlex Elder /* 2670a9e8ba2cSAlex Elder * There is no defined parent data beyond the parent 2671a9e8ba2cSAlex Elder * overlap, so limit what we read at that boundary if 2672a9e8ba2cSAlex Elder * necessary. 
2673a9e8ba2cSAlex Elder */ 2674a9e8ba2cSAlex Elder if (img_offset + length > rbd_dev->parent_overlap) { 2675a9e8ba2cSAlex Elder rbd_assert(img_offset < rbd_dev->parent_overlap); 2676a9e8ba2cSAlex Elder length = rbd_dev->parent_overlap - img_offset; 2677a9e8ba2cSAlex Elder } 2678a9e8ba2cSAlex Elder 2679a9e8ba2cSAlex Elder /* 26803d7efd18SAlex Elder * Allocate a page array big enough to receive the data read 26813d7efd18SAlex Elder * from the parent. 26823d7efd18SAlex Elder */ 26833d7efd18SAlex Elder page_count = (u32)calc_pages_for(0, length); 26843d7efd18SAlex Elder pages = ceph_alloc_page_vector(page_count, GFP_KERNEL); 26853d7efd18SAlex Elder if (IS_ERR(pages)) { 26863d7efd18SAlex Elder result = PTR_ERR(pages); 26873d7efd18SAlex Elder pages = NULL; 26883d7efd18SAlex Elder goto out_err; 26893d7efd18SAlex Elder } 26903d7efd18SAlex Elder 26913d7efd18SAlex Elder result = -ENOMEM; 2692e93f3152SAlex Elder parent_request = rbd_parent_request_create(obj_request, 2693e93f3152SAlex Elder img_offset, length); 26943d7efd18SAlex Elder if (!parent_request) 26953d7efd18SAlex Elder goto out_err; 26963d7efd18SAlex Elder 26973d7efd18SAlex Elder result = rbd_img_request_fill(parent_request, OBJ_REQUEST_PAGES, pages); 26983d7efd18SAlex Elder if (result) 26993d7efd18SAlex Elder goto out_err; 2700058aa991SIlya Dryomov 27013d7efd18SAlex Elder parent_request->copyup_pages = pages; 2702ebda6408SAlex Elder parent_request->copyup_page_count = page_count; 27033d7efd18SAlex Elder parent_request->callback = rbd_img_obj_parent_read_full_callback; 2704058aa991SIlya Dryomov 27053d7efd18SAlex Elder result = rbd_img_request_submit(parent_request); 27063d7efd18SAlex Elder if (!result) 27073d7efd18SAlex Elder return 0; 27083d7efd18SAlex Elder 27093d7efd18SAlex Elder parent_request->copyup_pages = NULL; 2710ebda6408SAlex Elder parent_request->copyup_page_count = 0; 27113d7efd18SAlex Elder parent_request->obj_request = NULL; 27123d7efd18SAlex Elder rbd_obj_request_put(obj_request); 27133d7efd18SAlex 
Elder out_err: 27143d7efd18SAlex Elder if (pages) 27153d7efd18SAlex Elder ceph_release_page_vector(pages, page_count); 27163d7efd18SAlex Elder if (parent_request) 27173d7efd18SAlex Elder rbd_img_request_put(parent_request); 27183d7efd18SAlex Elder return result; 27193d7efd18SAlex Elder } 27203d7efd18SAlex Elder 2721c5b5ef6cSAlex Elder static void rbd_img_obj_exists_callback(struct rbd_obj_request *obj_request) 2722c5b5ef6cSAlex Elder { 2723c5b5ef6cSAlex Elder struct rbd_obj_request *orig_request; 2724638f5abeSAlex Elder struct rbd_device *rbd_dev; 2725c5b5ef6cSAlex Elder int result; 2726c5b5ef6cSAlex Elder 2727c5b5ef6cSAlex Elder rbd_assert(!obj_request_img_data_test(obj_request)); 2728c5b5ef6cSAlex Elder 2729c5b5ef6cSAlex Elder /* 2730c5b5ef6cSAlex Elder * All we need from the object request is the original 2731c5b5ef6cSAlex Elder * request and the result of the STAT op. Grab those, then 2732c5b5ef6cSAlex Elder * we're done with the request. 2733c5b5ef6cSAlex Elder */ 2734c5b5ef6cSAlex Elder orig_request = obj_request->obj_request; 2735c5b5ef6cSAlex Elder obj_request->obj_request = NULL; 2736912c317dSAlex Elder rbd_obj_request_put(orig_request); 2737c5b5ef6cSAlex Elder rbd_assert(orig_request); 2738c5b5ef6cSAlex Elder rbd_assert(orig_request->img_request); 2739c5b5ef6cSAlex Elder 2740c5b5ef6cSAlex Elder result = obj_request->result; 2741c5b5ef6cSAlex Elder obj_request->result = 0; 2742c5b5ef6cSAlex Elder 2743c5b5ef6cSAlex Elder dout("%s: obj %p for obj %p result %d %llu/%llu\n", __func__, 2744c5b5ef6cSAlex Elder obj_request, orig_request, result, 2745c5b5ef6cSAlex Elder obj_request->xferred, obj_request->length); 2746c5b5ef6cSAlex Elder rbd_obj_request_put(obj_request); 2747c5b5ef6cSAlex Elder 2748638f5abeSAlex Elder /* 2749638f5abeSAlex Elder * If the overlap has become 0 (most likely because the 2750980917fcSIlya Dryomov * image has been flattened) we need to re-submit the 2751980917fcSIlya Dryomov * original request. 
2752638f5abeSAlex Elder */ 2753638f5abeSAlex Elder rbd_dev = orig_request->img_request->rbd_dev; 2754638f5abeSAlex Elder if (!rbd_dev->parent_overlap) { 2755980917fcSIlya Dryomov rbd_obj_request_submit(orig_request); 2756638f5abeSAlex Elder return; 2757638f5abeSAlex Elder } 2758c5b5ef6cSAlex Elder 2759c5b5ef6cSAlex Elder /* 2760c5b5ef6cSAlex Elder * Our only purpose here is to determine whether the object 2761c5b5ef6cSAlex Elder * exists, and we don't want to treat the non-existence as 2762c5b5ef6cSAlex Elder * an error. If something else comes back, transfer the 2763c5b5ef6cSAlex Elder * error to the original request and complete it now. 2764c5b5ef6cSAlex Elder */ 2765c5b5ef6cSAlex Elder if (!result) { 2766c5b5ef6cSAlex Elder obj_request_existence_set(orig_request, true); 2767c5b5ef6cSAlex Elder } else if (result == -ENOENT) { 2768c5b5ef6cSAlex Elder obj_request_existence_set(orig_request, false); 2769c2e82414SIlya Dryomov } else { 2770c2e82414SIlya Dryomov goto fail_orig_request; 2771c5b5ef6cSAlex Elder } 2772c5b5ef6cSAlex Elder 2773c5b5ef6cSAlex Elder /* 2774c5b5ef6cSAlex Elder * Resubmit the original request now that we have recorded 2775c5b5ef6cSAlex Elder * whether the target object exists. 
2776c5b5ef6cSAlex Elder */ 2777c2e82414SIlya Dryomov result = rbd_img_obj_request_submit(orig_request); 2778c2e82414SIlya Dryomov if (result) 2779c2e82414SIlya Dryomov goto fail_orig_request; 2780c2e82414SIlya Dryomov 2781c2e82414SIlya Dryomov return; 2782c2e82414SIlya Dryomov 2783c2e82414SIlya Dryomov fail_orig_request: 27840dcc685eSIlya Dryomov rbd_obj_request_error(orig_request, result); 2785c5b5ef6cSAlex Elder } 2786c5b5ef6cSAlex Elder 2787c5b5ef6cSAlex Elder static int rbd_img_obj_exists_submit(struct rbd_obj_request *obj_request) 2788c5b5ef6cSAlex Elder { 2789058aa991SIlya Dryomov struct rbd_device *rbd_dev = obj_request->img_request->rbd_dev; 2790c5b5ef6cSAlex Elder struct rbd_obj_request *stat_request; 2791710214e3SIlya Dryomov struct page **pages; 2792c5b5ef6cSAlex Elder u32 page_count; 2793c5b5ef6cSAlex Elder size_t size; 2794c5b5ef6cSAlex Elder int ret; 2795c5b5ef6cSAlex Elder 27966c696d85SIlya Dryomov stat_request = rbd_obj_request_create(OBJ_REQUEST_PAGES); 2797710214e3SIlya Dryomov if (!stat_request) 2798710214e3SIlya Dryomov return -ENOMEM; 2799710214e3SIlya Dryomov 2800a90bb0c1SIlya Dryomov stat_request->object_no = obj_request->object_no; 2801a90bb0c1SIlya Dryomov 2802710214e3SIlya Dryomov stat_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_READ, 1, 2803710214e3SIlya Dryomov stat_request); 2804710214e3SIlya Dryomov if (!stat_request->osd_req) { 2805710214e3SIlya Dryomov ret = -ENOMEM; 2806710214e3SIlya Dryomov goto fail_stat_request; 2807710214e3SIlya Dryomov } 2808710214e3SIlya Dryomov 2809c5b5ef6cSAlex Elder /* 2810c5b5ef6cSAlex Elder * The response data for a STAT call consists of: 2811c5b5ef6cSAlex Elder * le64 length; 2812c5b5ef6cSAlex Elder * struct { 2813c5b5ef6cSAlex Elder * le32 tv_sec; 2814c5b5ef6cSAlex Elder * le32 tv_nsec; 2815c5b5ef6cSAlex Elder * } mtime; 2816c5b5ef6cSAlex Elder */ 2817c5b5ef6cSAlex Elder size = sizeof (__le64) + sizeof (__le32) + sizeof (__le32); 2818c5b5ef6cSAlex Elder page_count = (u32)calc_pages_for(0, 
size); 2819c5b5ef6cSAlex Elder pages = ceph_alloc_page_vector(page_count, GFP_KERNEL); 2820710214e3SIlya Dryomov if (IS_ERR(pages)) { 2821710214e3SIlya Dryomov ret = PTR_ERR(pages); 2822710214e3SIlya Dryomov goto fail_stat_request; 2823710214e3SIlya Dryomov } 2824c5b5ef6cSAlex Elder 2825710214e3SIlya Dryomov osd_req_op_init(stat_request->osd_req, 0, CEPH_OSD_OP_STAT, 0); 2826710214e3SIlya Dryomov osd_req_op_raw_data_in_pages(stat_request->osd_req, 0, pages, size, 0, 2827710214e3SIlya Dryomov false, false); 2828c5b5ef6cSAlex Elder 2829c5b5ef6cSAlex Elder rbd_obj_request_get(obj_request); 2830c5b5ef6cSAlex Elder stat_request->obj_request = obj_request; 2831c5b5ef6cSAlex Elder stat_request->pages = pages; 2832c5b5ef6cSAlex Elder stat_request->page_count = page_count; 2833c5b5ef6cSAlex Elder stat_request->callback = rbd_img_obj_exists_callback; 2834c5b5ef6cSAlex Elder 2835980917fcSIlya Dryomov rbd_obj_request_submit(stat_request); 2836980917fcSIlya Dryomov return 0; 2837c5b5ef6cSAlex Elder 2838710214e3SIlya Dryomov fail_stat_request: 2839710214e3SIlya Dryomov rbd_obj_request_put(stat_request); 2840c5b5ef6cSAlex Elder return ret; 2841c5b5ef6cSAlex Elder } 2842c5b5ef6cSAlex Elder 284370d045f6SIlya Dryomov static bool img_obj_request_simple(struct rbd_obj_request *obj_request) 2844b454e36dSAlex Elder { 2845058aa991SIlya Dryomov struct rbd_img_request *img_request = obj_request->img_request; 2846058aa991SIlya Dryomov struct rbd_device *rbd_dev = img_request->rbd_dev; 2847b454e36dSAlex Elder 284870d045f6SIlya Dryomov /* Reads */ 28491c220881SJosh Durgin if (!img_request_write_test(img_request) && 28501c220881SJosh Durgin !img_request_discard_test(img_request)) 285170d045f6SIlya Dryomov return true; 2852b454e36dSAlex Elder 285370d045f6SIlya Dryomov /* Non-layered writes */ 285470d045f6SIlya Dryomov if (!img_request_layered_test(img_request)) 285570d045f6SIlya Dryomov return true; 285670d045f6SIlya Dryomov 285770d045f6SIlya Dryomov /* 285870d045f6SIlya Dryomov * Layered 
writes outside of the parent overlap range don't 285970d045f6SIlya Dryomov * share any data with the parent. 286070d045f6SIlya Dryomov */ 286170d045f6SIlya Dryomov if (!obj_request_overlaps_parent(obj_request)) 286270d045f6SIlya Dryomov return true; 286370d045f6SIlya Dryomov 286470d045f6SIlya Dryomov /* 2865c622d226SGuangliang Zhao * Entire-object layered writes - we will overwrite whatever 2866c622d226SGuangliang Zhao * parent data there is anyway. 2867c622d226SGuangliang Zhao */ 2868c622d226SGuangliang Zhao if (!obj_request->offset && 2869c622d226SGuangliang Zhao obj_request->length == rbd_obj_bytes(&rbd_dev->header)) 2870c622d226SGuangliang Zhao return true; 2871c622d226SGuangliang Zhao 2872c622d226SGuangliang Zhao /* 287370d045f6SIlya Dryomov * If the object is known to already exist, its parent data has 287470d045f6SIlya Dryomov * already been copied. 287570d045f6SIlya Dryomov */ 287670d045f6SIlya Dryomov if (obj_request_known_test(obj_request) && 287770d045f6SIlya Dryomov obj_request_exists_test(obj_request)) 287870d045f6SIlya Dryomov return true; 287970d045f6SIlya Dryomov 288070d045f6SIlya Dryomov return false; 288170d045f6SIlya Dryomov } 288270d045f6SIlya Dryomov 288370d045f6SIlya Dryomov static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request) 288470d045f6SIlya Dryomov { 2885058aa991SIlya Dryomov rbd_assert(obj_request_img_data_test(obj_request)); 2886058aa991SIlya Dryomov rbd_assert(obj_request_type_valid(obj_request->type)); 2887058aa991SIlya Dryomov rbd_assert(obj_request->img_request); 2888058aa991SIlya Dryomov 288970d045f6SIlya Dryomov if (img_obj_request_simple(obj_request)) { 2890980917fcSIlya Dryomov rbd_obj_request_submit(obj_request); 2891980917fcSIlya Dryomov return 0; 2892b454e36dSAlex Elder } 2893b454e36dSAlex Elder 2894b454e36dSAlex Elder /* 28953d7efd18SAlex Elder * It's a layered write. The target object might exist but 28963d7efd18SAlex Elder * we may not know that yet. 
If we know it doesn't exist, 28973d7efd18SAlex Elder * start by reading the data for the full target object from 28983d7efd18SAlex Elder * the parent so we can use it for a copyup to the target. 2899b454e36dSAlex Elder */ 290070d045f6SIlya Dryomov if (obj_request_known_test(obj_request)) 29013d7efd18SAlex Elder return rbd_img_obj_parent_read_full(obj_request); 29023d7efd18SAlex Elder 29033d7efd18SAlex Elder /* We don't know whether the target exists. Go find out. */ 2904b454e36dSAlex Elder 2905b454e36dSAlex Elder return rbd_img_obj_exists_submit(obj_request); 2906b454e36dSAlex Elder } 2907b454e36dSAlex Elder 2908bf0d5f50SAlex Elder static int rbd_img_request_submit(struct rbd_img_request *img_request) 2909bf0d5f50SAlex Elder { 2910bf0d5f50SAlex Elder struct rbd_obj_request *obj_request; 291146faeed4SAlex Elder struct rbd_obj_request *next_obj_request; 2912663ae2ccSIlya Dryomov int ret = 0; 2913bf0d5f50SAlex Elder 291437206ee5SAlex Elder dout("%s: img %p\n", __func__, img_request); 2915bf0d5f50SAlex Elder 2916663ae2ccSIlya Dryomov rbd_img_request_get(img_request); 2917663ae2ccSIlya Dryomov for_each_obj_request_safe(img_request, obj_request, next_obj_request) { 2918b454e36dSAlex Elder ret = rbd_img_obj_request_submit(obj_request); 2919bf0d5f50SAlex Elder if (ret) 2920663ae2ccSIlya Dryomov goto out_put_ireq; 2921bf0d5f50SAlex Elder } 2922bf0d5f50SAlex Elder 2923663ae2ccSIlya Dryomov out_put_ireq: 2924663ae2ccSIlya Dryomov rbd_img_request_put(img_request); 2925663ae2ccSIlya Dryomov return ret; 2926bf0d5f50SAlex Elder } 2927bf0d5f50SAlex Elder 29288b3e1a56SAlex Elder static void rbd_img_parent_read_callback(struct rbd_img_request *img_request) 29298b3e1a56SAlex Elder { 29308b3e1a56SAlex Elder struct rbd_obj_request *obj_request; 2931a9e8ba2cSAlex Elder struct rbd_device *rbd_dev; 2932a9e8ba2cSAlex Elder u64 obj_end; 293302c74fbaSAlex Elder u64 img_xferred; 293402c74fbaSAlex Elder int img_result; 29358b3e1a56SAlex Elder 29368b3e1a56SAlex Elder 
rbd_assert(img_request_child_test(img_request)); 29378b3e1a56SAlex Elder 293802c74fbaSAlex Elder /* First get what we need from the image request and release it */ 293902c74fbaSAlex Elder 29408b3e1a56SAlex Elder obj_request = img_request->obj_request; 294102c74fbaSAlex Elder img_xferred = img_request->xferred; 294202c74fbaSAlex Elder img_result = img_request->result; 294302c74fbaSAlex Elder rbd_img_request_put(img_request); 294402c74fbaSAlex Elder 294502c74fbaSAlex Elder /* 294602c74fbaSAlex Elder * If the overlap has become 0 (most likely because the 294702c74fbaSAlex Elder * image has been flattened) we need to re-submit the 294802c74fbaSAlex Elder * original request. 294902c74fbaSAlex Elder */ 2950a9e8ba2cSAlex Elder rbd_assert(obj_request); 2951a9e8ba2cSAlex Elder rbd_assert(obj_request->img_request); 295202c74fbaSAlex Elder rbd_dev = obj_request->img_request->rbd_dev; 295302c74fbaSAlex Elder if (!rbd_dev->parent_overlap) { 2954980917fcSIlya Dryomov rbd_obj_request_submit(obj_request); 295502c74fbaSAlex Elder return; 295602c74fbaSAlex Elder } 295702c74fbaSAlex Elder 295802c74fbaSAlex Elder obj_request->result = img_result; 2959a9e8ba2cSAlex Elder if (obj_request->result) 2960a9e8ba2cSAlex Elder goto out; 2961a9e8ba2cSAlex Elder 2962a9e8ba2cSAlex Elder /* 2963a9e8ba2cSAlex Elder * We need to zero anything beyond the parent overlap 2964a9e8ba2cSAlex Elder * boundary. Since rbd_img_obj_request_read_callback() 2965a9e8ba2cSAlex Elder * will zero anything beyond the end of a short read, an 2966a9e8ba2cSAlex Elder * easy way to do this is to pretend the data from the 2967a9e8ba2cSAlex Elder * parent came up short--ending at the overlap boundary. 
2968a9e8ba2cSAlex Elder */ 2969a9e8ba2cSAlex Elder rbd_assert(obj_request->img_offset < U64_MAX - obj_request->length); 2970a9e8ba2cSAlex Elder obj_end = obj_request->img_offset + obj_request->length; 2971a9e8ba2cSAlex Elder if (obj_end > rbd_dev->parent_overlap) { 2972a9e8ba2cSAlex Elder u64 xferred = 0; 2973a9e8ba2cSAlex Elder 2974a9e8ba2cSAlex Elder if (obj_request->img_offset < rbd_dev->parent_overlap) 2975a9e8ba2cSAlex Elder xferred = rbd_dev->parent_overlap - 2976a9e8ba2cSAlex Elder obj_request->img_offset; 2977a9e8ba2cSAlex Elder 297802c74fbaSAlex Elder obj_request->xferred = min(img_xferred, xferred); 2979a9e8ba2cSAlex Elder } else { 298002c74fbaSAlex Elder obj_request->xferred = img_xferred; 2981a9e8ba2cSAlex Elder } 2982a9e8ba2cSAlex Elder out: 29838b3e1a56SAlex Elder rbd_img_obj_request_read_callback(obj_request); 29848b3e1a56SAlex Elder rbd_obj_request_complete(obj_request); 29858b3e1a56SAlex Elder } 29868b3e1a56SAlex Elder 29878b3e1a56SAlex Elder static void rbd_img_parent_read(struct rbd_obj_request *obj_request) 29888b3e1a56SAlex Elder { 29898b3e1a56SAlex Elder struct rbd_img_request *img_request; 29908b3e1a56SAlex Elder int result; 29918b3e1a56SAlex Elder 29928b3e1a56SAlex Elder rbd_assert(obj_request_img_data_test(obj_request)); 29938b3e1a56SAlex Elder rbd_assert(obj_request->img_request != NULL); 29948b3e1a56SAlex Elder rbd_assert(obj_request->result == (s32) -ENOENT); 29955b2ab72dSAlex Elder rbd_assert(obj_request_type_valid(obj_request->type)); 29968b3e1a56SAlex Elder 29978b3e1a56SAlex Elder /* rbd_read_finish(obj_request, obj_request->length); */ 2998e93f3152SAlex Elder img_request = rbd_parent_request_create(obj_request, 29998b3e1a56SAlex Elder obj_request->img_offset, 3000e93f3152SAlex Elder obj_request->length); 30018b3e1a56SAlex Elder result = -ENOMEM; 30028b3e1a56SAlex Elder if (!img_request) 30038b3e1a56SAlex Elder goto out_err; 30048b3e1a56SAlex Elder 30055b2ab72dSAlex Elder if (obj_request->type == OBJ_REQUEST_BIO) 3006f1a4739fSAlex 
Elder result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO, 3007f1a4739fSAlex Elder obj_request->bio_list); 30085b2ab72dSAlex Elder else 30095b2ab72dSAlex Elder result = rbd_img_request_fill(img_request, OBJ_REQUEST_PAGES, 30105b2ab72dSAlex Elder obj_request->pages); 30118b3e1a56SAlex Elder if (result) 30128b3e1a56SAlex Elder goto out_err; 30138b3e1a56SAlex Elder 30148b3e1a56SAlex Elder img_request->callback = rbd_img_parent_read_callback; 30158b3e1a56SAlex Elder result = rbd_img_request_submit(img_request); 30168b3e1a56SAlex Elder if (result) 30178b3e1a56SAlex Elder goto out_err; 30188b3e1a56SAlex Elder 30198b3e1a56SAlex Elder return; 30208b3e1a56SAlex Elder out_err: 30218b3e1a56SAlex Elder if (img_request) 30228b3e1a56SAlex Elder rbd_img_request_put(img_request); 30238b3e1a56SAlex Elder obj_request->result = result; 30248b3e1a56SAlex Elder obj_request->xferred = 0; 30258b3e1a56SAlex Elder obj_request_done_set(obj_request); 30268b3e1a56SAlex Elder } 30278b3e1a56SAlex Elder 3028ed95b21aSIlya Dryomov static const struct rbd_client_id rbd_empty_cid; 3029ed95b21aSIlya Dryomov 3030ed95b21aSIlya Dryomov static bool rbd_cid_equal(const struct rbd_client_id *lhs, 3031ed95b21aSIlya Dryomov const struct rbd_client_id *rhs) 3032ed95b21aSIlya Dryomov { 3033ed95b21aSIlya Dryomov return lhs->gid == rhs->gid && lhs->handle == rhs->handle; 3034ed95b21aSIlya Dryomov } 3035ed95b21aSIlya Dryomov 3036ed95b21aSIlya Dryomov static struct rbd_client_id rbd_get_cid(struct rbd_device *rbd_dev) 3037ed95b21aSIlya Dryomov { 3038ed95b21aSIlya Dryomov struct rbd_client_id cid; 3039ed95b21aSIlya Dryomov 3040ed95b21aSIlya Dryomov mutex_lock(&rbd_dev->watch_mutex); 3041ed95b21aSIlya Dryomov cid.gid = ceph_client_gid(rbd_dev->rbd_client->client); 3042ed95b21aSIlya Dryomov cid.handle = rbd_dev->watch_cookie; 3043ed95b21aSIlya Dryomov mutex_unlock(&rbd_dev->watch_mutex); 3044ed95b21aSIlya Dryomov return cid; 3045ed95b21aSIlya Dryomov } 3046ed95b21aSIlya Dryomov 3047ed95b21aSIlya Dryomov /* 
3048ed95b21aSIlya Dryomov * lock_rwsem must be held for write 3049ed95b21aSIlya Dryomov */ 3050ed95b21aSIlya Dryomov static void rbd_set_owner_cid(struct rbd_device *rbd_dev, 3051ed95b21aSIlya Dryomov const struct rbd_client_id *cid) 3052ed95b21aSIlya Dryomov { 3053ed95b21aSIlya Dryomov dout("%s rbd_dev %p %llu-%llu -> %llu-%llu\n", __func__, rbd_dev, 3054ed95b21aSIlya Dryomov rbd_dev->owner_cid.gid, rbd_dev->owner_cid.handle, 3055ed95b21aSIlya Dryomov cid->gid, cid->handle); 3056ed95b21aSIlya Dryomov rbd_dev->owner_cid = *cid; /* struct */ 3057ed95b21aSIlya Dryomov } 3058ed95b21aSIlya Dryomov 3059ed95b21aSIlya Dryomov static void format_lock_cookie(struct rbd_device *rbd_dev, char *buf) 3060ed95b21aSIlya Dryomov { 3061ed95b21aSIlya Dryomov mutex_lock(&rbd_dev->watch_mutex); 3062ed95b21aSIlya Dryomov sprintf(buf, "%s %llu", RBD_LOCK_COOKIE_PREFIX, rbd_dev->watch_cookie); 3063ed95b21aSIlya Dryomov mutex_unlock(&rbd_dev->watch_mutex); 3064ed95b21aSIlya Dryomov } 3065ed95b21aSIlya Dryomov 3066ed95b21aSIlya Dryomov /* 3067ed95b21aSIlya Dryomov * lock_rwsem must be held for write 3068ed95b21aSIlya Dryomov */ 3069ed95b21aSIlya Dryomov static int rbd_lock(struct rbd_device *rbd_dev) 3070ed95b21aSIlya Dryomov { 3071ed95b21aSIlya Dryomov struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 3072ed95b21aSIlya Dryomov struct rbd_client_id cid = rbd_get_cid(rbd_dev); 3073ed95b21aSIlya Dryomov char cookie[32]; 3074ed95b21aSIlya Dryomov int ret; 3075ed95b21aSIlya Dryomov 3076ed95b21aSIlya Dryomov WARN_ON(__rbd_is_lock_owner(rbd_dev)); 3077ed95b21aSIlya Dryomov 3078ed95b21aSIlya Dryomov format_lock_cookie(rbd_dev, cookie); 3079ed95b21aSIlya Dryomov ret = ceph_cls_lock(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc, 3080ed95b21aSIlya Dryomov RBD_LOCK_NAME, CEPH_CLS_LOCK_EXCLUSIVE, cookie, 3081ed95b21aSIlya Dryomov RBD_LOCK_TAG, "", 0); 3082ed95b21aSIlya Dryomov if (ret) 3083ed95b21aSIlya Dryomov return ret; 3084ed95b21aSIlya Dryomov 3085ed95b21aSIlya Dryomov 
rbd_dev->lock_state = RBD_LOCK_STATE_LOCKED; 3086ed95b21aSIlya Dryomov rbd_set_owner_cid(rbd_dev, &cid); 3087ed95b21aSIlya Dryomov queue_work(rbd_dev->task_wq, &rbd_dev->acquired_lock_work); 3088ed95b21aSIlya Dryomov return 0; 3089ed95b21aSIlya Dryomov } 3090ed95b21aSIlya Dryomov 3091ed95b21aSIlya Dryomov /* 3092ed95b21aSIlya Dryomov * lock_rwsem must be held for write 3093ed95b21aSIlya Dryomov */ 3094ed95b21aSIlya Dryomov static int rbd_unlock(struct rbd_device *rbd_dev) 3095ed95b21aSIlya Dryomov { 3096ed95b21aSIlya Dryomov struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 3097ed95b21aSIlya Dryomov char cookie[32]; 3098ed95b21aSIlya Dryomov int ret; 3099ed95b21aSIlya Dryomov 3100ed95b21aSIlya Dryomov WARN_ON(!__rbd_is_lock_owner(rbd_dev)); 3101ed95b21aSIlya Dryomov 3102ed95b21aSIlya Dryomov rbd_dev->lock_state = RBD_LOCK_STATE_UNLOCKED; 3103ed95b21aSIlya Dryomov 3104ed95b21aSIlya Dryomov format_lock_cookie(rbd_dev, cookie); 3105ed95b21aSIlya Dryomov ret = ceph_cls_unlock(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc, 3106ed95b21aSIlya Dryomov RBD_LOCK_NAME, cookie); 3107ed95b21aSIlya Dryomov if (ret && ret != -ENOENT) { 3108ed95b21aSIlya Dryomov rbd_warn(rbd_dev, "cls_unlock failed: %d", ret); 3109ed95b21aSIlya Dryomov return ret; 3110ed95b21aSIlya Dryomov } 3111ed95b21aSIlya Dryomov 3112ed95b21aSIlya Dryomov rbd_set_owner_cid(rbd_dev, &rbd_empty_cid); 3113ed95b21aSIlya Dryomov queue_work(rbd_dev->task_wq, &rbd_dev->released_lock_work); 3114ed95b21aSIlya Dryomov return 0; 3115ed95b21aSIlya Dryomov } 3116ed95b21aSIlya Dryomov 3117ed95b21aSIlya Dryomov static int __rbd_notify_op_lock(struct rbd_device *rbd_dev, 3118ed95b21aSIlya Dryomov enum rbd_notify_op notify_op, 3119ed95b21aSIlya Dryomov struct page ***preply_pages, 3120ed95b21aSIlya Dryomov size_t *preply_len) 3121ed95b21aSIlya Dryomov { 3122ed95b21aSIlya Dryomov struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 3123ed95b21aSIlya Dryomov struct rbd_client_id cid = 
rbd_get_cid(rbd_dev); 3124ed95b21aSIlya Dryomov int buf_size = 4 + 8 + 8 + CEPH_ENCODING_START_BLK_LEN; 3125ed95b21aSIlya Dryomov char buf[buf_size]; 3126ed95b21aSIlya Dryomov void *p = buf; 3127ed95b21aSIlya Dryomov 3128ed95b21aSIlya Dryomov dout("%s rbd_dev %p notify_op %d\n", __func__, rbd_dev, notify_op); 3129ed95b21aSIlya Dryomov 3130ed95b21aSIlya Dryomov /* encode *LockPayload NotifyMessage (op + ClientId) */ 3131ed95b21aSIlya Dryomov ceph_start_encoding(&p, 2, 1, buf_size - CEPH_ENCODING_START_BLK_LEN); 3132ed95b21aSIlya Dryomov ceph_encode_32(&p, notify_op); 3133ed95b21aSIlya Dryomov ceph_encode_64(&p, cid.gid); 3134ed95b21aSIlya Dryomov ceph_encode_64(&p, cid.handle); 3135ed95b21aSIlya Dryomov 3136ed95b21aSIlya Dryomov return ceph_osdc_notify(osdc, &rbd_dev->header_oid, 3137ed95b21aSIlya Dryomov &rbd_dev->header_oloc, buf, buf_size, 3138ed95b21aSIlya Dryomov RBD_NOTIFY_TIMEOUT, preply_pages, preply_len); 3139ed95b21aSIlya Dryomov } 3140ed95b21aSIlya Dryomov 3141ed95b21aSIlya Dryomov static void rbd_notify_op_lock(struct rbd_device *rbd_dev, 3142ed95b21aSIlya Dryomov enum rbd_notify_op notify_op) 3143ed95b21aSIlya Dryomov { 3144ed95b21aSIlya Dryomov struct page **reply_pages; 3145ed95b21aSIlya Dryomov size_t reply_len; 3146ed95b21aSIlya Dryomov 3147ed95b21aSIlya Dryomov __rbd_notify_op_lock(rbd_dev, notify_op, &reply_pages, &reply_len); 3148ed95b21aSIlya Dryomov ceph_release_page_vector(reply_pages, calc_pages_for(0, reply_len)); 3149ed95b21aSIlya Dryomov } 3150ed95b21aSIlya Dryomov 3151ed95b21aSIlya Dryomov static void rbd_notify_acquired_lock(struct work_struct *work) 3152ed95b21aSIlya Dryomov { 3153ed95b21aSIlya Dryomov struct rbd_device *rbd_dev = container_of(work, struct rbd_device, 3154ed95b21aSIlya Dryomov acquired_lock_work); 3155ed95b21aSIlya Dryomov 3156ed95b21aSIlya Dryomov rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_ACQUIRED_LOCK); 3157ed95b21aSIlya Dryomov } 3158ed95b21aSIlya Dryomov 3159ed95b21aSIlya Dryomov static void 
rbd_notify_released_lock(struct work_struct *work) 3160ed95b21aSIlya Dryomov { 3161ed95b21aSIlya Dryomov struct rbd_device *rbd_dev = container_of(work, struct rbd_device, 3162ed95b21aSIlya Dryomov released_lock_work); 3163ed95b21aSIlya Dryomov 3164ed95b21aSIlya Dryomov rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_RELEASED_LOCK); 3165ed95b21aSIlya Dryomov } 3166ed95b21aSIlya Dryomov 3167ed95b21aSIlya Dryomov static int rbd_request_lock(struct rbd_device *rbd_dev) 3168ed95b21aSIlya Dryomov { 3169ed95b21aSIlya Dryomov struct page **reply_pages; 3170ed95b21aSIlya Dryomov size_t reply_len; 3171ed95b21aSIlya Dryomov bool lock_owner_responded = false; 3172ed95b21aSIlya Dryomov int ret; 3173ed95b21aSIlya Dryomov 3174ed95b21aSIlya Dryomov dout("%s rbd_dev %p\n", __func__, rbd_dev); 3175ed95b21aSIlya Dryomov 3176ed95b21aSIlya Dryomov ret = __rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_REQUEST_LOCK, 3177ed95b21aSIlya Dryomov &reply_pages, &reply_len); 3178ed95b21aSIlya Dryomov if (ret && ret != -ETIMEDOUT) { 3179ed95b21aSIlya Dryomov rbd_warn(rbd_dev, "failed to request lock: %d", ret); 3180ed95b21aSIlya Dryomov goto out; 3181ed95b21aSIlya Dryomov } 3182ed95b21aSIlya Dryomov 3183ed95b21aSIlya Dryomov if (reply_len > 0 && reply_len <= PAGE_SIZE) { 3184ed95b21aSIlya Dryomov void *p = page_address(reply_pages[0]); 3185ed95b21aSIlya Dryomov void *const end = p + reply_len; 3186ed95b21aSIlya Dryomov u32 n; 3187ed95b21aSIlya Dryomov 3188ed95b21aSIlya Dryomov ceph_decode_32_safe(&p, end, n, e_inval); /* num_acks */ 3189ed95b21aSIlya Dryomov while (n--) { 3190ed95b21aSIlya Dryomov u8 struct_v; 3191ed95b21aSIlya Dryomov u32 len; 3192ed95b21aSIlya Dryomov 3193ed95b21aSIlya Dryomov ceph_decode_need(&p, end, 8 + 8, e_inval); 3194ed95b21aSIlya Dryomov p += 8 + 8; /* skip gid and cookie */ 3195ed95b21aSIlya Dryomov 3196ed95b21aSIlya Dryomov ceph_decode_32_safe(&p, end, len, e_inval); 3197ed95b21aSIlya Dryomov if (!len) 3198ed95b21aSIlya Dryomov continue; 3199ed95b21aSIlya Dryomov 
3200ed95b21aSIlya Dryomov if (lock_owner_responded) { 3201ed95b21aSIlya Dryomov rbd_warn(rbd_dev, 3202ed95b21aSIlya Dryomov "duplicate lock owners detected"); 3203ed95b21aSIlya Dryomov ret = -EIO; 3204ed95b21aSIlya Dryomov goto out; 3205ed95b21aSIlya Dryomov } 3206ed95b21aSIlya Dryomov 3207ed95b21aSIlya Dryomov lock_owner_responded = true; 3208ed95b21aSIlya Dryomov ret = ceph_start_decoding(&p, end, 1, "ResponseMessage", 3209ed95b21aSIlya Dryomov &struct_v, &len); 3210ed95b21aSIlya Dryomov if (ret) { 3211ed95b21aSIlya Dryomov rbd_warn(rbd_dev, 3212ed95b21aSIlya Dryomov "failed to decode ResponseMessage: %d", 3213ed95b21aSIlya Dryomov ret); 3214ed95b21aSIlya Dryomov goto e_inval; 3215ed95b21aSIlya Dryomov } 3216ed95b21aSIlya Dryomov 3217ed95b21aSIlya Dryomov ret = ceph_decode_32(&p); 3218ed95b21aSIlya Dryomov } 3219ed95b21aSIlya Dryomov } 3220ed95b21aSIlya Dryomov 3221ed95b21aSIlya Dryomov if (!lock_owner_responded) { 3222ed95b21aSIlya Dryomov rbd_warn(rbd_dev, "no lock owners detected"); 3223ed95b21aSIlya Dryomov ret = -ETIMEDOUT; 3224ed95b21aSIlya Dryomov } 3225ed95b21aSIlya Dryomov 3226ed95b21aSIlya Dryomov out: 3227ed95b21aSIlya Dryomov ceph_release_page_vector(reply_pages, calc_pages_for(0, reply_len)); 3228ed95b21aSIlya Dryomov return ret; 3229ed95b21aSIlya Dryomov 3230ed95b21aSIlya Dryomov e_inval: 3231ed95b21aSIlya Dryomov ret = -EINVAL; 3232ed95b21aSIlya Dryomov goto out; 3233ed95b21aSIlya Dryomov } 3234ed95b21aSIlya Dryomov 3235ed95b21aSIlya Dryomov static void wake_requests(struct rbd_device *rbd_dev, bool wake_all) 3236ed95b21aSIlya Dryomov { 3237ed95b21aSIlya Dryomov dout("%s rbd_dev %p wake_all %d\n", __func__, rbd_dev, wake_all); 3238ed95b21aSIlya Dryomov 3239ed95b21aSIlya Dryomov cancel_delayed_work(&rbd_dev->lock_dwork); 3240ed95b21aSIlya Dryomov if (wake_all) 3241ed95b21aSIlya Dryomov wake_up_all(&rbd_dev->lock_waitq); 3242ed95b21aSIlya Dryomov else 3243ed95b21aSIlya Dryomov wake_up(&rbd_dev->lock_waitq); 3244ed95b21aSIlya Dryomov } 
3245ed95b21aSIlya Dryomov 3246ed95b21aSIlya Dryomov static int get_lock_owner_info(struct rbd_device *rbd_dev, 3247ed95b21aSIlya Dryomov struct ceph_locker **lockers, u32 *num_lockers) 3248ed95b21aSIlya Dryomov { 3249ed95b21aSIlya Dryomov struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 3250ed95b21aSIlya Dryomov u8 lock_type; 3251ed95b21aSIlya Dryomov char *lock_tag; 3252ed95b21aSIlya Dryomov int ret; 3253ed95b21aSIlya Dryomov 3254ed95b21aSIlya Dryomov dout("%s rbd_dev %p\n", __func__, rbd_dev); 3255ed95b21aSIlya Dryomov 3256ed95b21aSIlya Dryomov ret = ceph_cls_lock_info(osdc, &rbd_dev->header_oid, 3257ed95b21aSIlya Dryomov &rbd_dev->header_oloc, RBD_LOCK_NAME, 3258ed95b21aSIlya Dryomov &lock_type, &lock_tag, lockers, num_lockers); 3259ed95b21aSIlya Dryomov if (ret) 3260ed95b21aSIlya Dryomov return ret; 3261ed95b21aSIlya Dryomov 3262ed95b21aSIlya Dryomov if (*num_lockers == 0) { 3263ed95b21aSIlya Dryomov dout("%s rbd_dev %p no lockers detected\n", __func__, rbd_dev); 3264ed95b21aSIlya Dryomov goto out; 3265ed95b21aSIlya Dryomov } 3266ed95b21aSIlya Dryomov 3267ed95b21aSIlya Dryomov if (strcmp(lock_tag, RBD_LOCK_TAG)) { 3268ed95b21aSIlya Dryomov rbd_warn(rbd_dev, "locked by external mechanism, tag %s", 3269ed95b21aSIlya Dryomov lock_tag); 3270ed95b21aSIlya Dryomov ret = -EBUSY; 3271ed95b21aSIlya Dryomov goto out; 3272ed95b21aSIlya Dryomov } 3273ed95b21aSIlya Dryomov 3274ed95b21aSIlya Dryomov if (lock_type == CEPH_CLS_LOCK_SHARED) { 3275ed95b21aSIlya Dryomov rbd_warn(rbd_dev, "shared lock type detected"); 3276ed95b21aSIlya Dryomov ret = -EBUSY; 3277ed95b21aSIlya Dryomov goto out; 3278ed95b21aSIlya Dryomov } 3279ed95b21aSIlya Dryomov 3280ed95b21aSIlya Dryomov if (strncmp((*lockers)[0].id.cookie, RBD_LOCK_COOKIE_PREFIX, 3281ed95b21aSIlya Dryomov strlen(RBD_LOCK_COOKIE_PREFIX))) { 3282ed95b21aSIlya Dryomov rbd_warn(rbd_dev, "locked by external mechanism, cookie %s", 3283ed95b21aSIlya Dryomov (*lockers)[0].id.cookie); 3284ed95b21aSIlya Dryomov ret = 
-EBUSY; 3285ed95b21aSIlya Dryomov goto out; 3286ed95b21aSIlya Dryomov } 3287ed95b21aSIlya Dryomov 3288ed95b21aSIlya Dryomov out: 3289ed95b21aSIlya Dryomov kfree(lock_tag); 3290ed95b21aSIlya Dryomov return ret; 3291ed95b21aSIlya Dryomov } 3292ed95b21aSIlya Dryomov 3293ed95b21aSIlya Dryomov static int find_watcher(struct rbd_device *rbd_dev, 3294ed95b21aSIlya Dryomov const struct ceph_locker *locker) 3295ed95b21aSIlya Dryomov { 3296ed95b21aSIlya Dryomov struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 3297ed95b21aSIlya Dryomov struct ceph_watch_item *watchers; 3298ed95b21aSIlya Dryomov u32 num_watchers; 3299ed95b21aSIlya Dryomov u64 cookie; 3300ed95b21aSIlya Dryomov int i; 3301ed95b21aSIlya Dryomov int ret; 3302ed95b21aSIlya Dryomov 3303ed95b21aSIlya Dryomov ret = ceph_osdc_list_watchers(osdc, &rbd_dev->header_oid, 3304ed95b21aSIlya Dryomov &rbd_dev->header_oloc, &watchers, 3305ed95b21aSIlya Dryomov &num_watchers); 3306ed95b21aSIlya Dryomov if (ret) 3307ed95b21aSIlya Dryomov return ret; 3308ed95b21aSIlya Dryomov 3309ed95b21aSIlya Dryomov sscanf(locker->id.cookie, RBD_LOCK_COOKIE_PREFIX " %llu", &cookie); 3310ed95b21aSIlya Dryomov for (i = 0; i < num_watchers; i++) { 3311ed95b21aSIlya Dryomov if (!memcmp(&watchers[i].addr, &locker->info.addr, 3312ed95b21aSIlya Dryomov sizeof(locker->info.addr)) && 3313ed95b21aSIlya Dryomov watchers[i].cookie == cookie) { 3314ed95b21aSIlya Dryomov struct rbd_client_id cid = { 3315ed95b21aSIlya Dryomov .gid = le64_to_cpu(watchers[i].name.num), 3316ed95b21aSIlya Dryomov .handle = cookie, 3317ed95b21aSIlya Dryomov }; 3318ed95b21aSIlya Dryomov 3319ed95b21aSIlya Dryomov dout("%s rbd_dev %p found cid %llu-%llu\n", __func__, 3320ed95b21aSIlya Dryomov rbd_dev, cid.gid, cid.handle); 3321ed95b21aSIlya Dryomov rbd_set_owner_cid(rbd_dev, &cid); 3322ed95b21aSIlya Dryomov ret = 1; 3323ed95b21aSIlya Dryomov goto out; 3324ed95b21aSIlya Dryomov } 3325ed95b21aSIlya Dryomov } 3326ed95b21aSIlya Dryomov 3327ed95b21aSIlya Dryomov dout("%s 
rbd_dev %p no watchers\n", __func__, rbd_dev); 3328ed95b21aSIlya Dryomov ret = 0; 3329ed95b21aSIlya Dryomov out: 3330ed95b21aSIlya Dryomov kfree(watchers); 3331ed95b21aSIlya Dryomov return ret; 3332ed95b21aSIlya Dryomov } 3333ed95b21aSIlya Dryomov 3334ed95b21aSIlya Dryomov /* 3335ed95b21aSIlya Dryomov * lock_rwsem must be held for write 3336ed95b21aSIlya Dryomov */ 3337ed95b21aSIlya Dryomov static int rbd_try_lock(struct rbd_device *rbd_dev) 3338ed95b21aSIlya Dryomov { 3339ed95b21aSIlya Dryomov struct ceph_client *client = rbd_dev->rbd_client->client; 3340ed95b21aSIlya Dryomov struct ceph_locker *lockers; 3341ed95b21aSIlya Dryomov u32 num_lockers; 3342ed95b21aSIlya Dryomov int ret; 3343ed95b21aSIlya Dryomov 3344ed95b21aSIlya Dryomov for (;;) { 3345ed95b21aSIlya Dryomov ret = rbd_lock(rbd_dev); 3346ed95b21aSIlya Dryomov if (ret != -EBUSY) 3347ed95b21aSIlya Dryomov return ret; 3348ed95b21aSIlya Dryomov 3349ed95b21aSIlya Dryomov /* determine if the current lock holder is still alive */ 3350ed95b21aSIlya Dryomov ret = get_lock_owner_info(rbd_dev, &lockers, &num_lockers); 3351ed95b21aSIlya Dryomov if (ret) 3352ed95b21aSIlya Dryomov return ret; 3353ed95b21aSIlya Dryomov 3354ed95b21aSIlya Dryomov if (num_lockers == 0) 3355ed95b21aSIlya Dryomov goto again; 3356ed95b21aSIlya Dryomov 3357ed95b21aSIlya Dryomov ret = find_watcher(rbd_dev, lockers); 3358ed95b21aSIlya Dryomov if (ret) { 3359ed95b21aSIlya Dryomov if (ret > 0) 3360ed95b21aSIlya Dryomov ret = 0; /* have to request lock */ 3361ed95b21aSIlya Dryomov goto out; 3362ed95b21aSIlya Dryomov } 3363ed95b21aSIlya Dryomov 3364ed95b21aSIlya Dryomov rbd_warn(rbd_dev, "%s%llu seems dead, breaking lock", 3365ed95b21aSIlya Dryomov ENTITY_NAME(lockers[0].id.name)); 3366ed95b21aSIlya Dryomov 3367ed95b21aSIlya Dryomov ret = ceph_monc_blacklist_add(&client->monc, 3368ed95b21aSIlya Dryomov &lockers[0].info.addr); 3369ed95b21aSIlya Dryomov if (ret) { 3370ed95b21aSIlya Dryomov rbd_warn(rbd_dev, "blacklist of %s%llu failed: %d", 
3371ed95b21aSIlya Dryomov ENTITY_NAME(lockers[0].id.name), ret); 3372ed95b21aSIlya Dryomov goto out; 3373ed95b21aSIlya Dryomov } 3374ed95b21aSIlya Dryomov 3375ed95b21aSIlya Dryomov ret = ceph_cls_break_lock(&client->osdc, &rbd_dev->header_oid, 3376ed95b21aSIlya Dryomov &rbd_dev->header_oloc, RBD_LOCK_NAME, 3377ed95b21aSIlya Dryomov lockers[0].id.cookie, 3378ed95b21aSIlya Dryomov &lockers[0].id.name); 3379ed95b21aSIlya Dryomov if (ret && ret != -ENOENT) 3380ed95b21aSIlya Dryomov goto out; 3381ed95b21aSIlya Dryomov 3382ed95b21aSIlya Dryomov again: 3383ed95b21aSIlya Dryomov ceph_free_lockers(lockers, num_lockers); 3384ed95b21aSIlya Dryomov } 3385ed95b21aSIlya Dryomov 3386ed95b21aSIlya Dryomov out: 3387ed95b21aSIlya Dryomov ceph_free_lockers(lockers, num_lockers); 3388ed95b21aSIlya Dryomov return ret; 3389ed95b21aSIlya Dryomov } 3390ed95b21aSIlya Dryomov 3391ed95b21aSIlya Dryomov /* 3392ed95b21aSIlya Dryomov * ret is set only if lock_state is RBD_LOCK_STATE_UNLOCKED 3393ed95b21aSIlya Dryomov */ 3394ed95b21aSIlya Dryomov static enum rbd_lock_state rbd_try_acquire_lock(struct rbd_device *rbd_dev, 3395ed95b21aSIlya Dryomov int *pret) 3396ed95b21aSIlya Dryomov { 3397ed95b21aSIlya Dryomov enum rbd_lock_state lock_state; 3398ed95b21aSIlya Dryomov 3399ed95b21aSIlya Dryomov down_read(&rbd_dev->lock_rwsem); 3400ed95b21aSIlya Dryomov dout("%s rbd_dev %p read lock_state %d\n", __func__, rbd_dev, 3401ed95b21aSIlya Dryomov rbd_dev->lock_state); 3402ed95b21aSIlya Dryomov if (__rbd_is_lock_owner(rbd_dev)) { 3403ed95b21aSIlya Dryomov lock_state = rbd_dev->lock_state; 3404ed95b21aSIlya Dryomov up_read(&rbd_dev->lock_rwsem); 3405ed95b21aSIlya Dryomov return lock_state; 3406ed95b21aSIlya Dryomov } 3407ed95b21aSIlya Dryomov 3408ed95b21aSIlya Dryomov up_read(&rbd_dev->lock_rwsem); 3409ed95b21aSIlya Dryomov down_write(&rbd_dev->lock_rwsem); 3410ed95b21aSIlya Dryomov dout("%s rbd_dev %p write lock_state %d\n", __func__, rbd_dev, 3411ed95b21aSIlya Dryomov rbd_dev->lock_state); 
3412ed95b21aSIlya Dryomov if (!__rbd_is_lock_owner(rbd_dev)) { 3413ed95b21aSIlya Dryomov *pret = rbd_try_lock(rbd_dev); 3414ed95b21aSIlya Dryomov if (*pret) 3415ed95b21aSIlya Dryomov rbd_warn(rbd_dev, "failed to acquire lock: %d", *pret); 3416ed95b21aSIlya Dryomov } 3417ed95b21aSIlya Dryomov 3418ed95b21aSIlya Dryomov lock_state = rbd_dev->lock_state; 3419ed95b21aSIlya Dryomov up_write(&rbd_dev->lock_rwsem); 3420ed95b21aSIlya Dryomov return lock_state; 3421ed95b21aSIlya Dryomov } 3422ed95b21aSIlya Dryomov 3423ed95b21aSIlya Dryomov static void rbd_acquire_lock(struct work_struct *work) 3424ed95b21aSIlya Dryomov { 3425ed95b21aSIlya Dryomov struct rbd_device *rbd_dev = container_of(to_delayed_work(work), 3426ed95b21aSIlya Dryomov struct rbd_device, lock_dwork); 3427ed95b21aSIlya Dryomov enum rbd_lock_state lock_state; 3428ed95b21aSIlya Dryomov int ret; 3429ed95b21aSIlya Dryomov 3430ed95b21aSIlya Dryomov dout("%s rbd_dev %p\n", __func__, rbd_dev); 3431ed95b21aSIlya Dryomov again: 3432ed95b21aSIlya Dryomov lock_state = rbd_try_acquire_lock(rbd_dev, &ret); 3433ed95b21aSIlya Dryomov if (lock_state != RBD_LOCK_STATE_UNLOCKED || ret == -EBLACKLISTED) { 3434ed95b21aSIlya Dryomov if (lock_state == RBD_LOCK_STATE_LOCKED) 3435ed95b21aSIlya Dryomov wake_requests(rbd_dev, true); 3436ed95b21aSIlya Dryomov dout("%s rbd_dev %p lock_state %d ret %d - done\n", __func__, 3437ed95b21aSIlya Dryomov rbd_dev, lock_state, ret); 3438ed95b21aSIlya Dryomov return; 3439ed95b21aSIlya Dryomov } 3440ed95b21aSIlya Dryomov 3441ed95b21aSIlya Dryomov ret = rbd_request_lock(rbd_dev); 3442ed95b21aSIlya Dryomov if (ret == -ETIMEDOUT) { 3443ed95b21aSIlya Dryomov goto again; /* treat this as a dead client */ 3444ed95b21aSIlya Dryomov } else if (ret < 0) { 3445ed95b21aSIlya Dryomov rbd_warn(rbd_dev, "error requesting lock: %d", ret); 3446ed95b21aSIlya Dryomov mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 3447ed95b21aSIlya Dryomov RBD_RETRY_DELAY); 3448ed95b21aSIlya Dryomov } else { 
3449ed95b21aSIlya Dryomov /* 3450ed95b21aSIlya Dryomov * lock owner acked, but resend if we don't see them 3451ed95b21aSIlya Dryomov * release the lock 3452ed95b21aSIlya Dryomov */ 3453ed95b21aSIlya Dryomov dout("%s rbd_dev %p requeueing lock_dwork\n", __func__, 3454ed95b21aSIlya Dryomov rbd_dev); 3455ed95b21aSIlya Dryomov mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 3456ed95b21aSIlya Dryomov msecs_to_jiffies(2 * RBD_NOTIFY_TIMEOUT * MSEC_PER_SEC)); 3457ed95b21aSIlya Dryomov } 3458ed95b21aSIlya Dryomov } 3459ed95b21aSIlya Dryomov 3460ed95b21aSIlya Dryomov /* 3461ed95b21aSIlya Dryomov * lock_rwsem must be held for write 3462ed95b21aSIlya Dryomov */ 3463ed95b21aSIlya Dryomov static bool rbd_release_lock(struct rbd_device *rbd_dev) 3464ed95b21aSIlya Dryomov { 3465ed95b21aSIlya Dryomov dout("%s rbd_dev %p read lock_state %d\n", __func__, rbd_dev, 3466ed95b21aSIlya Dryomov rbd_dev->lock_state); 3467ed95b21aSIlya Dryomov if (rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED) 3468ed95b21aSIlya Dryomov return false; 3469ed95b21aSIlya Dryomov 3470ed95b21aSIlya Dryomov rbd_dev->lock_state = RBD_LOCK_STATE_RELEASING; 3471ed95b21aSIlya Dryomov downgrade_write(&rbd_dev->lock_rwsem); 3472ed95b21aSIlya Dryomov /* 3473ed95b21aSIlya Dryomov * Ensure that all in-flight IO is flushed. 3474ed95b21aSIlya Dryomov * 3475ed95b21aSIlya Dryomov * FIXME: ceph_osdc_sync() flushes the entire OSD client, which 3476ed95b21aSIlya Dryomov * may be shared with other devices. 
3477ed95b21aSIlya Dryomov */ 3478ed95b21aSIlya Dryomov ceph_osdc_sync(&rbd_dev->rbd_client->client->osdc); 3479ed95b21aSIlya Dryomov up_read(&rbd_dev->lock_rwsem); 3480ed95b21aSIlya Dryomov 3481ed95b21aSIlya Dryomov down_write(&rbd_dev->lock_rwsem); 3482ed95b21aSIlya Dryomov dout("%s rbd_dev %p write lock_state %d\n", __func__, rbd_dev, 3483ed95b21aSIlya Dryomov rbd_dev->lock_state); 3484ed95b21aSIlya Dryomov if (rbd_dev->lock_state != RBD_LOCK_STATE_RELEASING) 3485ed95b21aSIlya Dryomov return false; 3486ed95b21aSIlya Dryomov 3487ed95b21aSIlya Dryomov if (!rbd_unlock(rbd_dev)) 3488ed95b21aSIlya Dryomov /* 3489ed95b21aSIlya Dryomov * Give others a chance to grab the lock - we would re-acquire 3490ed95b21aSIlya Dryomov * almost immediately if we got new IO during ceph_osdc_sync() 3491ed95b21aSIlya Dryomov * otherwise. We need to ack our own notifications, so this 3492ed95b21aSIlya Dryomov * lock_dwork will be requeued from rbd_wait_state_locked() 3493ed95b21aSIlya Dryomov * after wake_requests() in rbd_handle_released_lock(). 
3494ed95b21aSIlya Dryomov */ 3495ed95b21aSIlya Dryomov cancel_delayed_work(&rbd_dev->lock_dwork); 3496ed95b21aSIlya Dryomov 3497ed95b21aSIlya Dryomov return true; 3498ed95b21aSIlya Dryomov } 3499ed95b21aSIlya Dryomov 3500ed95b21aSIlya Dryomov static void rbd_release_lock_work(struct work_struct *work) 3501ed95b21aSIlya Dryomov { 3502ed95b21aSIlya Dryomov struct rbd_device *rbd_dev = container_of(work, struct rbd_device, 3503ed95b21aSIlya Dryomov unlock_work); 3504ed95b21aSIlya Dryomov 3505ed95b21aSIlya Dryomov down_write(&rbd_dev->lock_rwsem); 3506ed95b21aSIlya Dryomov rbd_release_lock(rbd_dev); 3507ed95b21aSIlya Dryomov up_write(&rbd_dev->lock_rwsem); 3508ed95b21aSIlya Dryomov } 3509ed95b21aSIlya Dryomov 3510ed95b21aSIlya Dryomov static void rbd_handle_acquired_lock(struct rbd_device *rbd_dev, u8 struct_v, 3511ed95b21aSIlya Dryomov void **p) 3512ed95b21aSIlya Dryomov { 3513ed95b21aSIlya Dryomov struct rbd_client_id cid = { 0 }; 3514ed95b21aSIlya Dryomov 3515ed95b21aSIlya Dryomov if (struct_v >= 2) { 3516ed95b21aSIlya Dryomov cid.gid = ceph_decode_64(p); 3517ed95b21aSIlya Dryomov cid.handle = ceph_decode_64(p); 3518ed95b21aSIlya Dryomov } 3519ed95b21aSIlya Dryomov 3520ed95b21aSIlya Dryomov dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid, 3521ed95b21aSIlya Dryomov cid.handle); 3522ed95b21aSIlya Dryomov if (!rbd_cid_equal(&cid, &rbd_empty_cid)) { 3523ed95b21aSIlya Dryomov down_write(&rbd_dev->lock_rwsem); 3524ed95b21aSIlya Dryomov if (rbd_cid_equal(&cid, &rbd_dev->owner_cid)) { 3525ed95b21aSIlya Dryomov /* 3526ed95b21aSIlya Dryomov * we already know that the remote client is 3527ed95b21aSIlya Dryomov * the owner 3528ed95b21aSIlya Dryomov */ 3529ed95b21aSIlya Dryomov up_write(&rbd_dev->lock_rwsem); 3530ed95b21aSIlya Dryomov return; 3531ed95b21aSIlya Dryomov } 3532ed95b21aSIlya Dryomov 3533ed95b21aSIlya Dryomov rbd_set_owner_cid(rbd_dev, &cid); 3534ed95b21aSIlya Dryomov downgrade_write(&rbd_dev->lock_rwsem); 3535ed95b21aSIlya Dryomov } else { 
3536ed95b21aSIlya Dryomov down_read(&rbd_dev->lock_rwsem); 3537ed95b21aSIlya Dryomov } 3538ed95b21aSIlya Dryomov 3539ed95b21aSIlya Dryomov if (!__rbd_is_lock_owner(rbd_dev)) 3540ed95b21aSIlya Dryomov wake_requests(rbd_dev, false); 3541ed95b21aSIlya Dryomov up_read(&rbd_dev->lock_rwsem); 3542ed95b21aSIlya Dryomov } 3543ed95b21aSIlya Dryomov 3544ed95b21aSIlya Dryomov static void rbd_handle_released_lock(struct rbd_device *rbd_dev, u8 struct_v, 3545ed95b21aSIlya Dryomov void **p) 3546ed95b21aSIlya Dryomov { 3547ed95b21aSIlya Dryomov struct rbd_client_id cid = { 0 }; 3548ed95b21aSIlya Dryomov 3549ed95b21aSIlya Dryomov if (struct_v >= 2) { 3550ed95b21aSIlya Dryomov cid.gid = ceph_decode_64(p); 3551ed95b21aSIlya Dryomov cid.handle = ceph_decode_64(p); 3552ed95b21aSIlya Dryomov } 3553ed95b21aSIlya Dryomov 3554ed95b21aSIlya Dryomov dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid, 3555ed95b21aSIlya Dryomov cid.handle); 3556ed95b21aSIlya Dryomov if (!rbd_cid_equal(&cid, &rbd_empty_cid)) { 3557ed95b21aSIlya Dryomov down_write(&rbd_dev->lock_rwsem); 3558ed95b21aSIlya Dryomov if (!rbd_cid_equal(&cid, &rbd_dev->owner_cid)) { 3559ed95b21aSIlya Dryomov dout("%s rbd_dev %p unexpected owner, cid %llu-%llu != owner_cid %llu-%llu\n", 3560ed95b21aSIlya Dryomov __func__, rbd_dev, cid.gid, cid.handle, 3561ed95b21aSIlya Dryomov rbd_dev->owner_cid.gid, rbd_dev->owner_cid.handle); 3562ed95b21aSIlya Dryomov up_write(&rbd_dev->lock_rwsem); 3563ed95b21aSIlya Dryomov return; 3564ed95b21aSIlya Dryomov } 3565ed95b21aSIlya Dryomov 3566ed95b21aSIlya Dryomov rbd_set_owner_cid(rbd_dev, &rbd_empty_cid); 3567ed95b21aSIlya Dryomov downgrade_write(&rbd_dev->lock_rwsem); 3568ed95b21aSIlya Dryomov } else { 3569ed95b21aSIlya Dryomov down_read(&rbd_dev->lock_rwsem); 3570ed95b21aSIlya Dryomov } 3571ed95b21aSIlya Dryomov 3572ed95b21aSIlya Dryomov if (!__rbd_is_lock_owner(rbd_dev)) 3573ed95b21aSIlya Dryomov wake_requests(rbd_dev, false); 3574ed95b21aSIlya Dryomov up_read(&rbd_dev->lock_rwsem); 
3575ed95b21aSIlya Dryomov } 3576ed95b21aSIlya Dryomov 3577ed95b21aSIlya Dryomov static bool rbd_handle_request_lock(struct rbd_device *rbd_dev, u8 struct_v, 3578ed95b21aSIlya Dryomov void **p) 3579ed95b21aSIlya Dryomov { 3580ed95b21aSIlya Dryomov struct rbd_client_id my_cid = rbd_get_cid(rbd_dev); 3581ed95b21aSIlya Dryomov struct rbd_client_id cid = { 0 }; 3582ed95b21aSIlya Dryomov bool need_to_send; 3583ed95b21aSIlya Dryomov 3584ed95b21aSIlya Dryomov if (struct_v >= 2) { 3585ed95b21aSIlya Dryomov cid.gid = ceph_decode_64(p); 3586ed95b21aSIlya Dryomov cid.handle = ceph_decode_64(p); 3587ed95b21aSIlya Dryomov } 3588ed95b21aSIlya Dryomov 3589ed95b21aSIlya Dryomov dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid, 3590ed95b21aSIlya Dryomov cid.handle); 3591ed95b21aSIlya Dryomov if (rbd_cid_equal(&cid, &my_cid)) 3592ed95b21aSIlya Dryomov return false; 3593ed95b21aSIlya Dryomov 3594ed95b21aSIlya Dryomov down_read(&rbd_dev->lock_rwsem); 3595ed95b21aSIlya Dryomov need_to_send = __rbd_is_lock_owner(rbd_dev); 3596ed95b21aSIlya Dryomov if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED) { 3597ed95b21aSIlya Dryomov if (!rbd_cid_equal(&rbd_dev->owner_cid, &rbd_empty_cid)) { 3598ed95b21aSIlya Dryomov dout("%s rbd_dev %p queueing unlock_work\n", __func__, 3599ed95b21aSIlya Dryomov rbd_dev); 3600ed95b21aSIlya Dryomov queue_work(rbd_dev->task_wq, &rbd_dev->unlock_work); 3601ed95b21aSIlya Dryomov } 3602ed95b21aSIlya Dryomov } 3603ed95b21aSIlya Dryomov up_read(&rbd_dev->lock_rwsem); 3604ed95b21aSIlya Dryomov return need_to_send; 3605ed95b21aSIlya Dryomov } 3606ed95b21aSIlya Dryomov 3607ed95b21aSIlya Dryomov static void __rbd_acknowledge_notify(struct rbd_device *rbd_dev, 3608ed95b21aSIlya Dryomov u64 notify_id, u64 cookie, s32 *result) 3609ed95b21aSIlya Dryomov { 3610ed95b21aSIlya Dryomov struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 3611ed95b21aSIlya Dryomov int buf_size = 4 + CEPH_ENCODING_START_BLK_LEN; 3612ed95b21aSIlya Dryomov char 
buf[buf_size]; 3613ed95b21aSIlya Dryomov int ret; 3614ed95b21aSIlya Dryomov 3615ed95b21aSIlya Dryomov if (result) { 3616ed95b21aSIlya Dryomov void *p = buf; 3617ed95b21aSIlya Dryomov 3618ed95b21aSIlya Dryomov /* encode ResponseMessage */ 3619ed95b21aSIlya Dryomov ceph_start_encoding(&p, 1, 1, 3620ed95b21aSIlya Dryomov buf_size - CEPH_ENCODING_START_BLK_LEN); 3621ed95b21aSIlya Dryomov ceph_encode_32(&p, *result); 3622ed95b21aSIlya Dryomov } else { 3623ed95b21aSIlya Dryomov buf_size = 0; 3624ed95b21aSIlya Dryomov } 3625ed95b21aSIlya Dryomov 3626ed95b21aSIlya Dryomov ret = ceph_osdc_notify_ack(osdc, &rbd_dev->header_oid, 3627ed95b21aSIlya Dryomov &rbd_dev->header_oloc, notify_id, cookie, 3628ed95b21aSIlya Dryomov buf, buf_size); 3629ed95b21aSIlya Dryomov if (ret) 3630ed95b21aSIlya Dryomov rbd_warn(rbd_dev, "acknowledge_notify failed: %d", ret); 3631ed95b21aSIlya Dryomov } 3632ed95b21aSIlya Dryomov 3633ed95b21aSIlya Dryomov static void rbd_acknowledge_notify(struct rbd_device *rbd_dev, u64 notify_id, 3634ed95b21aSIlya Dryomov u64 cookie) 3635ed95b21aSIlya Dryomov { 3636ed95b21aSIlya Dryomov dout("%s rbd_dev %p\n", __func__, rbd_dev); 3637ed95b21aSIlya Dryomov __rbd_acknowledge_notify(rbd_dev, notify_id, cookie, NULL); 3638ed95b21aSIlya Dryomov } 3639ed95b21aSIlya Dryomov 3640ed95b21aSIlya Dryomov static void rbd_acknowledge_notify_result(struct rbd_device *rbd_dev, 3641ed95b21aSIlya Dryomov u64 notify_id, u64 cookie, s32 result) 3642ed95b21aSIlya Dryomov { 3643ed95b21aSIlya Dryomov dout("%s rbd_dev %p result %d\n", __func__, rbd_dev, result); 3644ed95b21aSIlya Dryomov __rbd_acknowledge_notify(rbd_dev, notify_id, cookie, &result); 3645ed95b21aSIlya Dryomov } 3646922dab61SIlya Dryomov 3647922dab61SIlya Dryomov static void rbd_watch_cb(void *arg, u64 notify_id, u64 cookie, 3648922dab61SIlya Dryomov u64 notifier_id, void *data, size_t data_len) 3649b8d70035SAlex Elder { 3650922dab61SIlya Dryomov struct rbd_device *rbd_dev = arg; 3651ed95b21aSIlya Dryomov void *p = data; 
3652ed95b21aSIlya Dryomov void *const end = p + data_len; 3653d4c2269bSIlya Dryomov u8 struct_v = 0; 3654ed95b21aSIlya Dryomov u32 len; 3655ed95b21aSIlya Dryomov u32 notify_op; 3656b8d70035SAlex Elder int ret; 3657b8d70035SAlex Elder 3658ed95b21aSIlya Dryomov dout("%s rbd_dev %p cookie %llu notify_id %llu data_len %zu\n", 3659ed95b21aSIlya Dryomov __func__, rbd_dev, cookie, notify_id, data_len); 3660ed95b21aSIlya Dryomov if (data_len) { 3661ed95b21aSIlya Dryomov ret = ceph_start_decoding(&p, end, 1, "NotifyMessage", 3662ed95b21aSIlya Dryomov &struct_v, &len); 3663ed95b21aSIlya Dryomov if (ret) { 3664ed95b21aSIlya Dryomov rbd_warn(rbd_dev, "failed to decode NotifyMessage: %d", 3665ed95b21aSIlya Dryomov ret); 3666ed95b21aSIlya Dryomov return; 3667ed95b21aSIlya Dryomov } 366852bb1f9bSIlya Dryomov 3669ed95b21aSIlya Dryomov notify_op = ceph_decode_32(&p); 3670ed95b21aSIlya Dryomov } else { 3671ed95b21aSIlya Dryomov /* legacy notification for header updates */ 3672ed95b21aSIlya Dryomov notify_op = RBD_NOTIFY_OP_HEADER_UPDATE; 3673ed95b21aSIlya Dryomov len = 0; 3674ed95b21aSIlya Dryomov } 3675ed95b21aSIlya Dryomov 3676ed95b21aSIlya Dryomov dout("%s rbd_dev %p notify_op %u\n", __func__, rbd_dev, notify_op); 3677ed95b21aSIlya Dryomov switch (notify_op) { 3678ed95b21aSIlya Dryomov case RBD_NOTIFY_OP_ACQUIRED_LOCK: 3679ed95b21aSIlya Dryomov rbd_handle_acquired_lock(rbd_dev, struct_v, &p); 3680ed95b21aSIlya Dryomov rbd_acknowledge_notify(rbd_dev, notify_id, cookie); 3681ed95b21aSIlya Dryomov break; 3682ed95b21aSIlya Dryomov case RBD_NOTIFY_OP_RELEASED_LOCK: 3683ed95b21aSIlya Dryomov rbd_handle_released_lock(rbd_dev, struct_v, &p); 3684ed95b21aSIlya Dryomov rbd_acknowledge_notify(rbd_dev, notify_id, cookie); 3685ed95b21aSIlya Dryomov break; 3686ed95b21aSIlya Dryomov case RBD_NOTIFY_OP_REQUEST_LOCK: 3687ed95b21aSIlya Dryomov if (rbd_handle_request_lock(rbd_dev, struct_v, &p)) 368852bb1f9bSIlya Dryomov /* 3689ed95b21aSIlya Dryomov * send ResponseMessage(0) back so the client 
3690ed95b21aSIlya Dryomov * can detect a missing owner 369152bb1f9bSIlya Dryomov */ 3692ed95b21aSIlya Dryomov rbd_acknowledge_notify_result(rbd_dev, notify_id, 3693ed95b21aSIlya Dryomov cookie, 0); 3694ed95b21aSIlya Dryomov else 3695ed95b21aSIlya Dryomov rbd_acknowledge_notify(rbd_dev, notify_id, cookie); 3696ed95b21aSIlya Dryomov break; 3697ed95b21aSIlya Dryomov case RBD_NOTIFY_OP_HEADER_UPDATE: 3698e627db08SAlex Elder ret = rbd_dev_refresh(rbd_dev); 3699e627db08SAlex Elder if (ret) 37009584d508SIlya Dryomov rbd_warn(rbd_dev, "refresh failed: %d", ret); 3701b8d70035SAlex Elder 3702ed95b21aSIlya Dryomov rbd_acknowledge_notify(rbd_dev, notify_id, cookie); 3703ed95b21aSIlya Dryomov break; 3704ed95b21aSIlya Dryomov default: 3705ed95b21aSIlya Dryomov if (rbd_is_lock_owner(rbd_dev)) 3706ed95b21aSIlya Dryomov rbd_acknowledge_notify_result(rbd_dev, notify_id, 3707ed95b21aSIlya Dryomov cookie, -EOPNOTSUPP); 3708ed95b21aSIlya Dryomov else 3709ed95b21aSIlya Dryomov rbd_acknowledge_notify(rbd_dev, notify_id, cookie); 3710ed95b21aSIlya Dryomov break; 3711b8d70035SAlex Elder } 3712b8d70035SAlex Elder } 3713b8d70035SAlex Elder 371499d16943SIlya Dryomov static void __rbd_unregister_watch(struct rbd_device *rbd_dev); 37159969ebc5SAlex Elder 3716922dab61SIlya Dryomov static void rbd_watch_errcb(void *arg, u64 cookie, int err) 3717bb040aa0SIlya Dryomov { 3718922dab61SIlya Dryomov struct rbd_device *rbd_dev = arg; 3719bb040aa0SIlya Dryomov 3720922dab61SIlya Dryomov rbd_warn(rbd_dev, "encountered watch error: %d", err); 3721bb040aa0SIlya Dryomov 3722ed95b21aSIlya Dryomov down_write(&rbd_dev->lock_rwsem); 3723ed95b21aSIlya Dryomov rbd_set_owner_cid(rbd_dev, &rbd_empty_cid); 3724ed95b21aSIlya Dryomov up_write(&rbd_dev->lock_rwsem); 3725bb040aa0SIlya Dryomov 372699d16943SIlya Dryomov mutex_lock(&rbd_dev->watch_mutex); 372799d16943SIlya Dryomov if (rbd_dev->watch_state == RBD_WATCH_STATE_REGISTERED) { 372899d16943SIlya Dryomov __rbd_unregister_watch(rbd_dev); 372999d16943SIlya Dryomov 
rbd_dev->watch_state = RBD_WATCH_STATE_ERROR; 3730bb040aa0SIlya Dryomov 373199d16943SIlya Dryomov queue_delayed_work(rbd_dev->task_wq, &rbd_dev->watch_dwork, 0); 3732bb040aa0SIlya Dryomov } 373399d16943SIlya Dryomov mutex_unlock(&rbd_dev->watch_mutex); 3734bb040aa0SIlya Dryomov } 3735bb040aa0SIlya Dryomov 3736bb040aa0SIlya Dryomov /* 373799d16943SIlya Dryomov * watch_mutex must be locked 37389969ebc5SAlex Elder */ 373999d16943SIlya Dryomov static int __rbd_register_watch(struct rbd_device *rbd_dev) 37409969ebc5SAlex Elder { 37419969ebc5SAlex Elder struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 3742922dab61SIlya Dryomov struct ceph_osd_linger_request *handle; 37439969ebc5SAlex Elder 3744922dab61SIlya Dryomov rbd_assert(!rbd_dev->watch_handle); 374599d16943SIlya Dryomov dout("%s rbd_dev %p\n", __func__, rbd_dev); 37469969ebc5SAlex Elder 3747922dab61SIlya Dryomov handle = ceph_osdc_watch(osdc, &rbd_dev->header_oid, 3748922dab61SIlya Dryomov &rbd_dev->header_oloc, rbd_watch_cb, 3749922dab61SIlya Dryomov rbd_watch_errcb, rbd_dev); 3750922dab61SIlya Dryomov if (IS_ERR(handle)) 3751922dab61SIlya Dryomov return PTR_ERR(handle); 37529969ebc5SAlex Elder 3753922dab61SIlya Dryomov rbd_dev->watch_handle = handle; 37548eb87565SAlex Elder return 0; 37559969ebc5SAlex Elder } 37569969ebc5SAlex Elder 375799d16943SIlya Dryomov /* 375899d16943SIlya Dryomov * watch_mutex must be locked 375999d16943SIlya Dryomov */ 376099d16943SIlya Dryomov static void __rbd_unregister_watch(struct rbd_device *rbd_dev) 3761fca27065SIlya Dryomov { 3762922dab61SIlya Dryomov struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 3763922dab61SIlya Dryomov int ret; 3764b30a01f2SIlya Dryomov 376599d16943SIlya Dryomov rbd_assert(rbd_dev->watch_handle); 376699d16943SIlya Dryomov dout("%s rbd_dev %p\n", __func__, rbd_dev); 3767b30a01f2SIlya Dryomov 3768922dab61SIlya Dryomov ret = ceph_osdc_unwatch(osdc, rbd_dev->watch_handle); 3769922dab61SIlya Dryomov if (ret) 3770922dab61SIlya 
Dryomov rbd_warn(rbd_dev, "failed to unwatch: %d", ret); 3771b30a01f2SIlya Dryomov 3772922dab61SIlya Dryomov rbd_dev->watch_handle = NULL; 3773c525f036SIlya Dryomov } 3774c525f036SIlya Dryomov 377599d16943SIlya Dryomov static int rbd_register_watch(struct rbd_device *rbd_dev) 3776c525f036SIlya Dryomov { 377799d16943SIlya Dryomov int ret; 3778811c6688SIlya Dryomov 377999d16943SIlya Dryomov mutex_lock(&rbd_dev->watch_mutex); 378099d16943SIlya Dryomov rbd_assert(rbd_dev->watch_state == RBD_WATCH_STATE_UNREGISTERED); 378199d16943SIlya Dryomov ret = __rbd_register_watch(rbd_dev); 378299d16943SIlya Dryomov if (ret) 378399d16943SIlya Dryomov goto out; 378499d16943SIlya Dryomov 378599d16943SIlya Dryomov rbd_dev->watch_state = RBD_WATCH_STATE_REGISTERED; 378699d16943SIlya Dryomov rbd_dev->watch_cookie = rbd_dev->watch_handle->linger_id; 378799d16943SIlya Dryomov 378899d16943SIlya Dryomov out: 378999d16943SIlya Dryomov mutex_unlock(&rbd_dev->watch_mutex); 379099d16943SIlya Dryomov return ret; 379199d16943SIlya Dryomov } 379299d16943SIlya Dryomov 379399d16943SIlya Dryomov static void cancel_tasks_sync(struct rbd_device *rbd_dev) 379499d16943SIlya Dryomov { 379599d16943SIlya Dryomov dout("%s rbd_dev %p\n", __func__, rbd_dev); 379699d16943SIlya Dryomov 379799d16943SIlya Dryomov cancel_delayed_work_sync(&rbd_dev->watch_dwork); 3798ed95b21aSIlya Dryomov cancel_work_sync(&rbd_dev->acquired_lock_work); 3799ed95b21aSIlya Dryomov cancel_work_sync(&rbd_dev->released_lock_work); 3800ed95b21aSIlya Dryomov cancel_delayed_work_sync(&rbd_dev->lock_dwork); 3801ed95b21aSIlya Dryomov cancel_work_sync(&rbd_dev->unlock_work); 380299d16943SIlya Dryomov } 380399d16943SIlya Dryomov 380499d16943SIlya Dryomov static void rbd_unregister_watch(struct rbd_device *rbd_dev) 380599d16943SIlya Dryomov { 3806ed95b21aSIlya Dryomov WARN_ON(waitqueue_active(&rbd_dev->lock_waitq)); 380799d16943SIlya Dryomov cancel_tasks_sync(rbd_dev); 380899d16943SIlya Dryomov 380999d16943SIlya Dryomov 
mutex_lock(&rbd_dev->watch_mutex); 381099d16943SIlya Dryomov if (rbd_dev->watch_state == RBD_WATCH_STATE_REGISTERED) 381199d16943SIlya Dryomov __rbd_unregister_watch(rbd_dev); 381299d16943SIlya Dryomov rbd_dev->watch_state = RBD_WATCH_STATE_UNREGISTERED; 381399d16943SIlya Dryomov mutex_unlock(&rbd_dev->watch_mutex); 381499d16943SIlya Dryomov 3815811c6688SIlya Dryomov ceph_osdc_flush_notifies(&rbd_dev->rbd_client->client->osdc); 3816fca27065SIlya Dryomov } 3817fca27065SIlya Dryomov 381899d16943SIlya Dryomov static void rbd_reregister_watch(struct work_struct *work) 381999d16943SIlya Dryomov { 382099d16943SIlya Dryomov struct rbd_device *rbd_dev = container_of(to_delayed_work(work), 382199d16943SIlya Dryomov struct rbd_device, watch_dwork); 3822ed95b21aSIlya Dryomov bool was_lock_owner = false; 382387c0fdedSIlya Dryomov bool need_to_wake = false; 382499d16943SIlya Dryomov int ret; 382599d16943SIlya Dryomov 382699d16943SIlya Dryomov dout("%s rbd_dev %p\n", __func__, rbd_dev); 382799d16943SIlya Dryomov 3828ed95b21aSIlya Dryomov down_write(&rbd_dev->lock_rwsem); 3829ed95b21aSIlya Dryomov if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED) 3830ed95b21aSIlya Dryomov was_lock_owner = rbd_release_lock(rbd_dev); 3831ed95b21aSIlya Dryomov 383299d16943SIlya Dryomov mutex_lock(&rbd_dev->watch_mutex); 383387c0fdedSIlya Dryomov if (rbd_dev->watch_state != RBD_WATCH_STATE_ERROR) { 383487c0fdedSIlya Dryomov mutex_unlock(&rbd_dev->watch_mutex); 383587c0fdedSIlya Dryomov goto out; 383687c0fdedSIlya Dryomov } 383799d16943SIlya Dryomov 383899d16943SIlya Dryomov ret = __rbd_register_watch(rbd_dev); 383999d16943SIlya Dryomov if (ret) { 384099d16943SIlya Dryomov rbd_warn(rbd_dev, "failed to reregister watch: %d", ret); 38414d73644bSIlya Dryomov if (ret == -EBLACKLISTED || ret == -ENOENT) { 384287c0fdedSIlya Dryomov set_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags); 384387c0fdedSIlya Dryomov need_to_wake = true; 384487c0fdedSIlya Dryomov } else { 384599d16943SIlya Dryomov 
queue_delayed_work(rbd_dev->task_wq, 384699d16943SIlya Dryomov &rbd_dev->watch_dwork, 384799d16943SIlya Dryomov RBD_RETRY_DELAY); 384887c0fdedSIlya Dryomov } 384987c0fdedSIlya Dryomov mutex_unlock(&rbd_dev->watch_mutex); 385087c0fdedSIlya Dryomov goto out; 385199d16943SIlya Dryomov } 385299d16943SIlya Dryomov 385387c0fdedSIlya Dryomov need_to_wake = true; 385499d16943SIlya Dryomov rbd_dev->watch_state = RBD_WATCH_STATE_REGISTERED; 385599d16943SIlya Dryomov rbd_dev->watch_cookie = rbd_dev->watch_handle->linger_id; 385699d16943SIlya Dryomov mutex_unlock(&rbd_dev->watch_mutex); 385799d16943SIlya Dryomov 385899d16943SIlya Dryomov ret = rbd_dev_refresh(rbd_dev); 385999d16943SIlya Dryomov if (ret) 386099d16943SIlya Dryomov rbd_warn(rbd_dev, "reregisteration refresh failed: %d", ret); 386199d16943SIlya Dryomov 3862ed95b21aSIlya Dryomov if (was_lock_owner) { 3863ed95b21aSIlya Dryomov ret = rbd_try_lock(rbd_dev); 3864ed95b21aSIlya Dryomov if (ret) 3865ed95b21aSIlya Dryomov rbd_warn(rbd_dev, "reregisteration lock failed: %d", 3866ed95b21aSIlya Dryomov ret); 3867ed95b21aSIlya Dryomov } 3868ed95b21aSIlya Dryomov 386987c0fdedSIlya Dryomov out: 3870ed95b21aSIlya Dryomov up_write(&rbd_dev->lock_rwsem); 387187c0fdedSIlya Dryomov if (need_to_wake) 3872ed95b21aSIlya Dryomov wake_requests(rbd_dev, true); 387399d16943SIlya Dryomov } 387499d16943SIlya Dryomov 387536be9a76SAlex Elder /* 3876f40eb349SAlex Elder * Synchronous osd object method call. Returns the number of bytes 3877f40eb349SAlex Elder * returned in the outbound buffer, or a negative error code. 
387836be9a76SAlex Elder */ 387936be9a76SAlex Elder static int rbd_obj_method_sync(struct rbd_device *rbd_dev, 3880ecd4a68aSIlya Dryomov struct ceph_object_id *oid, 3881ecd4a68aSIlya Dryomov struct ceph_object_locator *oloc, 388236be9a76SAlex Elder const char *method_name, 38834157976bSAlex Elder const void *outbound, 388436be9a76SAlex Elder size_t outbound_size, 38854157976bSAlex Elder void *inbound, 3886e2a58ee5SAlex Elder size_t inbound_size) 388736be9a76SAlex Elder { 3888ecd4a68aSIlya Dryomov struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 3889ecd4a68aSIlya Dryomov struct page *req_page = NULL; 3890ecd4a68aSIlya Dryomov struct page *reply_page; 389136be9a76SAlex Elder int ret; 389236be9a76SAlex Elder 389336be9a76SAlex Elder /* 38946010a451SAlex Elder * Method calls are ultimately read operations. The result 38956010a451SAlex Elder * should placed into the inbound buffer provided. They 38966010a451SAlex Elder * also supply outbound data--parameters for the object 38976010a451SAlex Elder * method. Currently if this is present it will be a 38986010a451SAlex Elder * snapshot id. 
389936be9a76SAlex Elder */ 3900ecd4a68aSIlya Dryomov if (outbound) { 3901ecd4a68aSIlya Dryomov if (outbound_size > PAGE_SIZE) 3902ecd4a68aSIlya Dryomov return -E2BIG; 390336be9a76SAlex Elder 3904ecd4a68aSIlya Dryomov req_page = alloc_page(GFP_KERNEL); 3905ecd4a68aSIlya Dryomov if (!req_page) 3906ecd4a68aSIlya Dryomov return -ENOMEM; 390736be9a76SAlex Elder 3908ecd4a68aSIlya Dryomov memcpy(page_address(req_page), outbound, outbound_size); 390904017e29SAlex Elder } 3910430c28c3SAlex Elder 3911ecd4a68aSIlya Dryomov reply_page = alloc_page(GFP_KERNEL); 3912ecd4a68aSIlya Dryomov if (!reply_page) { 3913ecd4a68aSIlya Dryomov if (req_page) 3914ecd4a68aSIlya Dryomov __free_page(req_page); 3915ecd4a68aSIlya Dryomov return -ENOMEM; 3916ecd4a68aSIlya Dryomov } 391736be9a76SAlex Elder 3918ecd4a68aSIlya Dryomov ret = ceph_osdc_call(osdc, oid, oloc, RBD_DRV_NAME, method_name, 3919ecd4a68aSIlya Dryomov CEPH_OSD_FLAG_READ, req_page, outbound_size, 3920ecd4a68aSIlya Dryomov reply_page, &inbound_size); 3921ecd4a68aSIlya Dryomov if (!ret) { 3922ecd4a68aSIlya Dryomov memcpy(inbound, page_address(reply_page), inbound_size); 3923ecd4a68aSIlya Dryomov ret = inbound_size; 3924ecd4a68aSIlya Dryomov } 392557385b51SAlex Elder 3926ecd4a68aSIlya Dryomov if (req_page) 3927ecd4a68aSIlya Dryomov __free_page(req_page); 3928ecd4a68aSIlya Dryomov __free_page(reply_page); 392936be9a76SAlex Elder return ret; 393036be9a76SAlex Elder } 393136be9a76SAlex Elder 3932ed95b21aSIlya Dryomov /* 3933ed95b21aSIlya Dryomov * lock_rwsem must be held for read 3934ed95b21aSIlya Dryomov */ 3935ed95b21aSIlya Dryomov static void rbd_wait_state_locked(struct rbd_device *rbd_dev) 3936ed95b21aSIlya Dryomov { 3937ed95b21aSIlya Dryomov DEFINE_WAIT(wait); 3938ed95b21aSIlya Dryomov 3939ed95b21aSIlya Dryomov do { 3940ed95b21aSIlya Dryomov /* 3941ed95b21aSIlya Dryomov * Note the use of mod_delayed_work() in rbd_acquire_lock() 3942ed95b21aSIlya Dryomov * and cancel_delayed_work() in wake_requests(). 
3943ed95b21aSIlya Dryomov */ 3944ed95b21aSIlya Dryomov dout("%s rbd_dev %p queueing lock_dwork\n", __func__, rbd_dev); 3945ed95b21aSIlya Dryomov queue_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0); 3946ed95b21aSIlya Dryomov prepare_to_wait_exclusive(&rbd_dev->lock_waitq, &wait, 3947ed95b21aSIlya Dryomov TASK_UNINTERRUPTIBLE); 3948ed95b21aSIlya Dryomov up_read(&rbd_dev->lock_rwsem); 3949ed95b21aSIlya Dryomov schedule(); 3950ed95b21aSIlya Dryomov down_read(&rbd_dev->lock_rwsem); 395187c0fdedSIlya Dryomov } while (rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED && 395287c0fdedSIlya Dryomov !test_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags)); 395387c0fdedSIlya Dryomov 3954ed95b21aSIlya Dryomov finish_wait(&rbd_dev->lock_waitq, &wait); 3955ed95b21aSIlya Dryomov } 3956ed95b21aSIlya Dryomov 39577ad18afaSChristoph Hellwig static void rbd_queue_workfn(struct work_struct *work) 3958bc1ecc65SIlya Dryomov { 39597ad18afaSChristoph Hellwig struct request *rq = blk_mq_rq_from_pdu(work); 39607ad18afaSChristoph Hellwig struct rbd_device *rbd_dev = rq->q->queuedata; 3961bc1ecc65SIlya Dryomov struct rbd_img_request *img_request; 39624e752f0aSJosh Durgin struct ceph_snap_context *snapc = NULL; 3963bc1ecc65SIlya Dryomov u64 offset = (u64)blk_rq_pos(rq) << SECTOR_SHIFT; 3964bc1ecc65SIlya Dryomov u64 length = blk_rq_bytes(rq); 39656d2940c8SGuangliang Zhao enum obj_operation_type op_type; 39664e752f0aSJosh Durgin u64 mapping_size; 396780de1912SIlya Dryomov bool must_be_locked; 3968bc1ecc65SIlya Dryomov int result; 3969bc1ecc65SIlya Dryomov 39707ad18afaSChristoph Hellwig if (rq->cmd_type != REQ_TYPE_FS) { 39717ad18afaSChristoph Hellwig dout("%s: non-fs request type %d\n", __func__, 39727ad18afaSChristoph Hellwig (int) rq->cmd_type); 39737ad18afaSChristoph Hellwig result = -EIO; 39747ad18afaSChristoph Hellwig goto err; 39757ad18afaSChristoph Hellwig } 39767ad18afaSChristoph Hellwig 3977c2df40dfSMike Christie if (req_op(rq) == REQ_OP_DISCARD) 397890e98c52SGuangliang Zhao op_type = 
OBJ_OP_DISCARD; 3979c2df40dfSMike Christie else if (req_op(rq) == REQ_OP_WRITE) 39806d2940c8SGuangliang Zhao op_type = OBJ_OP_WRITE; 39816d2940c8SGuangliang Zhao else 39826d2940c8SGuangliang Zhao op_type = OBJ_OP_READ; 39836d2940c8SGuangliang Zhao 3984bc1ecc65SIlya Dryomov /* Ignore/skip any zero-length requests */ 3985bc1ecc65SIlya Dryomov 3986bc1ecc65SIlya Dryomov if (!length) { 3987bc1ecc65SIlya Dryomov dout("%s: zero-length request\n", __func__); 3988bc1ecc65SIlya Dryomov result = 0; 3989bc1ecc65SIlya Dryomov goto err_rq; 3990bc1ecc65SIlya Dryomov } 3991bc1ecc65SIlya Dryomov 39926d2940c8SGuangliang Zhao /* Only reads are allowed to a read-only device */ 3993bc1ecc65SIlya Dryomov 39946d2940c8SGuangliang Zhao if (op_type != OBJ_OP_READ) { 3995bc1ecc65SIlya Dryomov if (rbd_dev->mapping.read_only) { 3996bc1ecc65SIlya Dryomov result = -EROFS; 3997bc1ecc65SIlya Dryomov goto err_rq; 3998bc1ecc65SIlya Dryomov } 3999bc1ecc65SIlya Dryomov rbd_assert(rbd_dev->spec->snap_id == CEPH_NOSNAP); 4000bc1ecc65SIlya Dryomov } 4001bc1ecc65SIlya Dryomov 4002bc1ecc65SIlya Dryomov /* 4003bc1ecc65SIlya Dryomov * Quit early if the mapped snapshot no longer exists. It's 4004bc1ecc65SIlya Dryomov * still possible the snapshot will have disappeared by the 4005bc1ecc65SIlya Dryomov * time our request arrives at the osd, but there's no sense in 4006bc1ecc65SIlya Dryomov * sending it if we already know. 
4007bc1ecc65SIlya Dryomov */ 4008bc1ecc65SIlya Dryomov if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) { 4009bc1ecc65SIlya Dryomov dout("request for non-existent snapshot"); 4010bc1ecc65SIlya Dryomov rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP); 4011bc1ecc65SIlya Dryomov result = -ENXIO; 4012bc1ecc65SIlya Dryomov goto err_rq; 4013bc1ecc65SIlya Dryomov } 4014bc1ecc65SIlya Dryomov 4015bc1ecc65SIlya Dryomov if (offset && length > U64_MAX - offset + 1) { 4016bc1ecc65SIlya Dryomov rbd_warn(rbd_dev, "bad request range (%llu~%llu)", offset, 4017bc1ecc65SIlya Dryomov length); 4018bc1ecc65SIlya Dryomov result = -EINVAL; 4019bc1ecc65SIlya Dryomov goto err_rq; /* Shouldn't happen */ 4020bc1ecc65SIlya Dryomov } 4021bc1ecc65SIlya Dryomov 40227ad18afaSChristoph Hellwig blk_mq_start_request(rq); 40237ad18afaSChristoph Hellwig 40244e752f0aSJosh Durgin down_read(&rbd_dev->header_rwsem); 40254e752f0aSJosh Durgin mapping_size = rbd_dev->mapping.size; 40266d2940c8SGuangliang Zhao if (op_type != OBJ_OP_READ) { 40274e752f0aSJosh Durgin snapc = rbd_dev->header.snapc; 40284e752f0aSJosh Durgin ceph_get_snap_context(snapc); 4029ed95b21aSIlya Dryomov must_be_locked = rbd_is_lock_supported(rbd_dev); 403080de1912SIlya Dryomov } else { 403180de1912SIlya Dryomov must_be_locked = rbd_dev->opts->lock_on_read && 403280de1912SIlya Dryomov rbd_is_lock_supported(rbd_dev); 40334e752f0aSJosh Durgin } 40344e752f0aSJosh Durgin up_read(&rbd_dev->header_rwsem); 40354e752f0aSJosh Durgin 40364e752f0aSJosh Durgin if (offset + length > mapping_size) { 4037bc1ecc65SIlya Dryomov rbd_warn(rbd_dev, "beyond EOD (%llu~%llu > %llu)", offset, 40384e752f0aSJosh Durgin length, mapping_size); 4039bc1ecc65SIlya Dryomov result = -EIO; 4040bc1ecc65SIlya Dryomov goto err_rq; 4041bc1ecc65SIlya Dryomov } 4042bc1ecc65SIlya Dryomov 4043ed95b21aSIlya Dryomov if (must_be_locked) { 4044ed95b21aSIlya Dryomov down_read(&rbd_dev->lock_rwsem); 404587c0fdedSIlya Dryomov if (rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED && 
404687c0fdedSIlya Dryomov !test_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags)) 4047ed95b21aSIlya Dryomov rbd_wait_state_locked(rbd_dev); 404887c0fdedSIlya Dryomov 404987c0fdedSIlya Dryomov WARN_ON((rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED) ^ 405087c0fdedSIlya Dryomov !test_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags)); 405187c0fdedSIlya Dryomov if (test_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags)) { 405287c0fdedSIlya Dryomov result = -EBLACKLISTED; 405387c0fdedSIlya Dryomov goto err_unlock; 405487c0fdedSIlya Dryomov } 4055ed95b21aSIlya Dryomov } 4056ed95b21aSIlya Dryomov 40576d2940c8SGuangliang Zhao img_request = rbd_img_request_create(rbd_dev, offset, length, op_type, 40584e752f0aSJosh Durgin snapc); 4059bc1ecc65SIlya Dryomov if (!img_request) { 4060bc1ecc65SIlya Dryomov result = -ENOMEM; 4061ed95b21aSIlya Dryomov goto err_unlock; 4062bc1ecc65SIlya Dryomov } 4063bc1ecc65SIlya Dryomov img_request->rq = rq; 406470b16db8SIlya Dryomov snapc = NULL; /* img_request consumes a ref */ 4065bc1ecc65SIlya Dryomov 406690e98c52SGuangliang Zhao if (op_type == OBJ_OP_DISCARD) 406790e98c52SGuangliang Zhao result = rbd_img_request_fill(img_request, OBJ_REQUEST_NODATA, 406890e98c52SGuangliang Zhao NULL); 406990e98c52SGuangliang Zhao else 407090e98c52SGuangliang Zhao result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO, 407190e98c52SGuangliang Zhao rq->bio); 4072bc1ecc65SIlya Dryomov if (result) 4073bc1ecc65SIlya Dryomov goto err_img_request; 4074bc1ecc65SIlya Dryomov 4075bc1ecc65SIlya Dryomov result = rbd_img_request_submit(img_request); 4076bc1ecc65SIlya Dryomov if (result) 4077bc1ecc65SIlya Dryomov goto err_img_request; 4078bc1ecc65SIlya Dryomov 4079ed95b21aSIlya Dryomov if (must_be_locked) 4080ed95b21aSIlya Dryomov up_read(&rbd_dev->lock_rwsem); 4081bc1ecc65SIlya Dryomov return; 4082bc1ecc65SIlya Dryomov 4083bc1ecc65SIlya Dryomov err_img_request: 4084bc1ecc65SIlya Dryomov rbd_img_request_put(img_request); 4085ed95b21aSIlya Dryomov err_unlock: 4086ed95b21aSIlya 
Dryomov if (must_be_locked) 4087ed95b21aSIlya Dryomov up_read(&rbd_dev->lock_rwsem); 4088bc1ecc65SIlya Dryomov err_rq: 4089bc1ecc65SIlya Dryomov if (result) 4090bc1ecc65SIlya Dryomov rbd_warn(rbd_dev, "%s %llx at %llx result %d", 40916d2940c8SGuangliang Zhao obj_op_name(op_type), length, offset, result); 40924e752f0aSJosh Durgin ceph_put_snap_context(snapc); 40937ad18afaSChristoph Hellwig err: 40947ad18afaSChristoph Hellwig blk_mq_end_request(rq, result); 4095bc1ecc65SIlya Dryomov } 4096bc1ecc65SIlya Dryomov 40977ad18afaSChristoph Hellwig static int rbd_queue_rq(struct blk_mq_hw_ctx *hctx, 40987ad18afaSChristoph Hellwig const struct blk_mq_queue_data *bd) 4099bc1ecc65SIlya Dryomov { 41007ad18afaSChristoph Hellwig struct request *rq = bd->rq; 41017ad18afaSChristoph Hellwig struct work_struct *work = blk_mq_rq_to_pdu(rq); 4102bc1ecc65SIlya Dryomov 41037ad18afaSChristoph Hellwig queue_work(rbd_wq, work); 41047ad18afaSChristoph Hellwig return BLK_MQ_RQ_QUEUE_OK; 4105bf0d5f50SAlex Elder } 4106bf0d5f50SAlex Elder 4107602adf40SYehuda Sadeh static void rbd_free_disk(struct rbd_device *rbd_dev) 4108602adf40SYehuda Sadeh { 4109602adf40SYehuda Sadeh struct gendisk *disk = rbd_dev->disk; 4110602adf40SYehuda Sadeh 4111602adf40SYehuda Sadeh if (!disk) 4112602adf40SYehuda Sadeh return; 4113602adf40SYehuda Sadeh 4114a0cab924SAlex Elder rbd_dev->disk = NULL; 4115a0cab924SAlex Elder if (disk->flags & GENHD_FL_UP) { 4116602adf40SYehuda Sadeh del_gendisk(disk); 4117602adf40SYehuda Sadeh if (disk->queue) 4118602adf40SYehuda Sadeh blk_cleanup_queue(disk->queue); 41197ad18afaSChristoph Hellwig blk_mq_free_tag_set(&rbd_dev->tag_set); 4120a0cab924SAlex Elder } 4121602adf40SYehuda Sadeh put_disk(disk); 4122602adf40SYehuda Sadeh } 4123602adf40SYehuda Sadeh 4124788e2df3SAlex Elder static int rbd_obj_read_sync(struct rbd_device *rbd_dev, 4125fe5478e0SIlya Dryomov struct ceph_object_id *oid, 4126fe5478e0SIlya Dryomov struct ceph_object_locator *oloc, 4127fe5478e0SIlya Dryomov void *buf, int 
buf_len) 4128788e2df3SAlex Elder 4129788e2df3SAlex Elder { 4130fe5478e0SIlya Dryomov struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 4131fe5478e0SIlya Dryomov struct ceph_osd_request *req; 4132fe5478e0SIlya Dryomov struct page **pages; 4133fe5478e0SIlya Dryomov int num_pages = calc_pages_for(0, buf_len); 4134788e2df3SAlex Elder int ret; 4135788e2df3SAlex Elder 4136fe5478e0SIlya Dryomov req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_KERNEL); 4137fe5478e0SIlya Dryomov if (!req) 4138fe5478e0SIlya Dryomov return -ENOMEM; 4139788e2df3SAlex Elder 4140fe5478e0SIlya Dryomov ceph_oid_copy(&req->r_base_oid, oid); 4141fe5478e0SIlya Dryomov ceph_oloc_copy(&req->r_base_oloc, oloc); 4142fe5478e0SIlya Dryomov req->r_flags = CEPH_OSD_FLAG_READ; 4143788e2df3SAlex Elder 4144fe5478e0SIlya Dryomov ret = ceph_osdc_alloc_messages(req, GFP_KERNEL); 4145788e2df3SAlex Elder if (ret) 4146fe5478e0SIlya Dryomov goto out_req; 4147788e2df3SAlex Elder 4148fe5478e0SIlya Dryomov pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL); 4149fe5478e0SIlya Dryomov if (IS_ERR(pages)) { 4150fe5478e0SIlya Dryomov ret = PTR_ERR(pages); 4151fe5478e0SIlya Dryomov goto out_req; 4152fe5478e0SIlya Dryomov } 41531ceae7efSAlex Elder 4154fe5478e0SIlya Dryomov osd_req_op_extent_init(req, 0, CEPH_OSD_OP_READ, 0, buf_len, 0, 0); 4155fe5478e0SIlya Dryomov osd_req_op_extent_osd_data_pages(req, 0, pages, buf_len, 0, false, 4156fe5478e0SIlya Dryomov true); 4157788e2df3SAlex Elder 4158fe5478e0SIlya Dryomov ceph_osdc_start_request(osdc, req, false); 4159fe5478e0SIlya Dryomov ret = ceph_osdc_wait_request(osdc, req); 4160fe5478e0SIlya Dryomov if (ret >= 0) 4161fe5478e0SIlya Dryomov ceph_copy_from_page_vector(pages, buf, 0, ret); 4162fe5478e0SIlya Dryomov 4163fe5478e0SIlya Dryomov out_req: 4164fe5478e0SIlya Dryomov ceph_osdc_put_request(req); 4165788e2df3SAlex Elder return ret; 4166788e2df3SAlex Elder } 4167788e2df3SAlex Elder 4168602adf40SYehuda Sadeh /* 4169662518b1SAlex Elder * Read the complete 
header for the given rbd device. On successful 4170662518b1SAlex Elder * return, the rbd_dev->header field will contain up-to-date 4171662518b1SAlex Elder * information about the image. 41724156d998SAlex Elder */ 417399a41ebcSAlex Elder static int rbd_dev_v1_header_info(struct rbd_device *rbd_dev) 41744156d998SAlex Elder { 41754156d998SAlex Elder struct rbd_image_header_ondisk *ondisk = NULL; 41764156d998SAlex Elder u32 snap_count = 0; 41774156d998SAlex Elder u64 names_size = 0; 41784156d998SAlex Elder u32 want_count; 41794156d998SAlex Elder int ret; 41804156d998SAlex Elder 41814156d998SAlex Elder /* 41824156d998SAlex Elder * The complete header will include an array of its 64-bit 41834156d998SAlex Elder * snapshot ids, followed by the names of those snapshots as 41844156d998SAlex Elder * a contiguous block of NUL-terminated strings. Note that 41854156d998SAlex Elder * the number of snapshots could change by the time we read 41864156d998SAlex Elder * it in, in which case we re-read it. 41874156d998SAlex Elder */ 41884156d998SAlex Elder do { 41894156d998SAlex Elder size_t size; 41904156d998SAlex Elder 41914156d998SAlex Elder kfree(ondisk); 41924156d998SAlex Elder 41934156d998SAlex Elder size = sizeof (*ondisk); 41944156d998SAlex Elder size += snap_count * sizeof (struct rbd_image_snap_ondisk); 41954156d998SAlex Elder size += names_size; 41964156d998SAlex Elder ondisk = kmalloc(size, GFP_KERNEL); 41974156d998SAlex Elder if (!ondisk) 4198662518b1SAlex Elder return -ENOMEM; 41994156d998SAlex Elder 4200fe5478e0SIlya Dryomov ret = rbd_obj_read_sync(rbd_dev, &rbd_dev->header_oid, 4201fe5478e0SIlya Dryomov &rbd_dev->header_oloc, ondisk, size); 42024156d998SAlex Elder if (ret < 0) 4203662518b1SAlex Elder goto out; 4204c0cd10dbSAlex Elder if ((size_t)ret < size) { 42054156d998SAlex Elder ret = -ENXIO; 420606ecc6cbSAlex Elder rbd_warn(rbd_dev, "short header read (want %zd got %d)", 420706ecc6cbSAlex Elder size, ret); 4208662518b1SAlex Elder goto out; 42094156d998SAlex Elder } 
42104156d998SAlex Elder if (!rbd_dev_ondisk_valid(ondisk)) { 42114156d998SAlex Elder ret = -ENXIO; 421206ecc6cbSAlex Elder rbd_warn(rbd_dev, "invalid header"); 4213662518b1SAlex Elder goto out; 42144156d998SAlex Elder } 42154156d998SAlex Elder 42164156d998SAlex Elder names_size = le64_to_cpu(ondisk->snap_names_len); 42174156d998SAlex Elder want_count = snap_count; 42184156d998SAlex Elder snap_count = le32_to_cpu(ondisk->snap_count); 42194156d998SAlex Elder } while (snap_count != want_count); 42204156d998SAlex Elder 4221662518b1SAlex Elder ret = rbd_header_from_disk(rbd_dev, ondisk); 4222662518b1SAlex Elder out: 42234156d998SAlex Elder kfree(ondisk); 42244156d998SAlex Elder 4225dfc5606dSYehuda Sadeh return ret; 4226602adf40SYehuda Sadeh } 4227602adf40SYehuda Sadeh 422815228edeSAlex Elder /* 422915228edeSAlex Elder * Clear the rbd device's EXISTS flag if the snapshot it's mapped to 423015228edeSAlex Elder * has disappeared from the (just updated) snapshot context. 423115228edeSAlex Elder */ 423215228edeSAlex Elder static void rbd_exists_validate(struct rbd_device *rbd_dev) 423315228edeSAlex Elder { 423415228edeSAlex Elder u64 snap_id; 423515228edeSAlex Elder 423615228edeSAlex Elder if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) 423715228edeSAlex Elder return; 423815228edeSAlex Elder 423915228edeSAlex Elder snap_id = rbd_dev->spec->snap_id; 424015228edeSAlex Elder if (snap_id == CEPH_NOSNAP) 424115228edeSAlex Elder return; 424215228edeSAlex Elder 424315228edeSAlex Elder if (rbd_dev_snap_index(rbd_dev, snap_id) == BAD_SNAP_INDEX) 424415228edeSAlex Elder clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags); 424515228edeSAlex Elder } 424615228edeSAlex Elder 42479875201eSJosh Durgin static void rbd_dev_update_size(struct rbd_device *rbd_dev) 42489875201eSJosh Durgin { 42499875201eSJosh Durgin sector_t size; 42509875201eSJosh Durgin 42519875201eSJosh Durgin /* 4252811c6688SIlya Dryomov * If EXISTS is not set, rbd_dev->disk may be NULL, so don't 4253811c6688SIlya Dryomov * 
try to update its size. If REMOVING is set, updating size 4254811c6688SIlya Dryomov * is just useless work since the device can't be opened. 42559875201eSJosh Durgin */ 4256811c6688SIlya Dryomov if (test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags) && 4257811c6688SIlya Dryomov !test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags)) { 42589875201eSJosh Durgin size = (sector_t)rbd_dev->mapping.size / SECTOR_SIZE; 42599875201eSJosh Durgin dout("setting size to %llu sectors", (unsigned long long)size); 42609875201eSJosh Durgin set_capacity(rbd_dev->disk, size); 42619875201eSJosh Durgin revalidate_disk(rbd_dev->disk); 42629875201eSJosh Durgin } 42639875201eSJosh Durgin } 42649875201eSJosh Durgin 4265cc4a38bdSAlex Elder static int rbd_dev_refresh(struct rbd_device *rbd_dev) 42661fe5e993SAlex Elder { 4267e627db08SAlex Elder u64 mapping_size; 42681fe5e993SAlex Elder int ret; 42691fe5e993SAlex Elder 4270cfbf6377SAlex Elder down_write(&rbd_dev->header_rwsem); 42713b5cf2a2SAlex Elder mapping_size = rbd_dev->mapping.size; 4272a720ae09SIlya Dryomov 4273a720ae09SIlya Dryomov ret = rbd_dev_header_info(rbd_dev); 427452bb1f9bSIlya Dryomov if (ret) 427573e39e4dSIlya Dryomov goto out; 427615228edeSAlex Elder 4277e8f59b59SIlya Dryomov /* 4278e8f59b59SIlya Dryomov * If there is a parent, see if it has disappeared due to the 4279e8f59b59SIlya Dryomov * mapped image getting flattened. 
4280e8f59b59SIlya Dryomov */ 4281e8f59b59SIlya Dryomov if (rbd_dev->parent) { 4282e8f59b59SIlya Dryomov ret = rbd_dev_v2_parent_info(rbd_dev); 4283e8f59b59SIlya Dryomov if (ret) 428473e39e4dSIlya Dryomov goto out; 4285e8f59b59SIlya Dryomov } 4286e8f59b59SIlya Dryomov 42875ff1108cSIlya Dryomov if (rbd_dev->spec->snap_id == CEPH_NOSNAP) { 42885ff1108cSIlya Dryomov rbd_dev->mapping.size = rbd_dev->header.image_size; 42895ff1108cSIlya Dryomov } else { 42905ff1108cSIlya Dryomov /* validate mapped snapshot's EXISTS flag */ 429115228edeSAlex Elder rbd_exists_validate(rbd_dev); 42925ff1108cSIlya Dryomov } 42935ff1108cSIlya Dryomov 429473e39e4dSIlya Dryomov out: 4295cfbf6377SAlex Elder up_write(&rbd_dev->header_rwsem); 429673e39e4dSIlya Dryomov if (!ret && mapping_size != rbd_dev->mapping.size) 42979875201eSJosh Durgin rbd_dev_update_size(rbd_dev); 42981fe5e993SAlex Elder 429973e39e4dSIlya Dryomov return ret; 43001fe5e993SAlex Elder } 43011fe5e993SAlex Elder 43027ad18afaSChristoph Hellwig static int rbd_init_request(void *data, struct request *rq, 43037ad18afaSChristoph Hellwig unsigned int hctx_idx, unsigned int request_idx, 43047ad18afaSChristoph Hellwig unsigned int numa_node) 43057ad18afaSChristoph Hellwig { 43067ad18afaSChristoph Hellwig struct work_struct *work = blk_mq_rq_to_pdu(rq); 43077ad18afaSChristoph Hellwig 43087ad18afaSChristoph Hellwig INIT_WORK(work, rbd_queue_workfn); 43097ad18afaSChristoph Hellwig return 0; 43107ad18afaSChristoph Hellwig } 43117ad18afaSChristoph Hellwig 43127ad18afaSChristoph Hellwig static struct blk_mq_ops rbd_mq_ops = { 43137ad18afaSChristoph Hellwig .queue_rq = rbd_queue_rq, 43147ad18afaSChristoph Hellwig .init_request = rbd_init_request, 43157ad18afaSChristoph Hellwig }; 43167ad18afaSChristoph Hellwig 4317602adf40SYehuda Sadeh static int rbd_init_disk(struct rbd_device *rbd_dev) 4318602adf40SYehuda Sadeh { 4319602adf40SYehuda Sadeh struct gendisk *disk; 4320602adf40SYehuda Sadeh struct request_queue *q; 4321593a9e7bSAlex Elder u64 
segment_size; 43227ad18afaSChristoph Hellwig int err; 4323602adf40SYehuda Sadeh 4324602adf40SYehuda Sadeh /* create gendisk info */ 43257e513d43SIlya Dryomov disk = alloc_disk(single_major ? 43267e513d43SIlya Dryomov (1 << RBD_SINGLE_MAJOR_PART_SHIFT) : 43277e513d43SIlya Dryomov RBD_MINORS_PER_MAJOR); 4328602adf40SYehuda Sadeh if (!disk) 43291fcdb8aaSAlex Elder return -ENOMEM; 4330602adf40SYehuda Sadeh 4331f0f8cef5SAlex Elder snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d", 4332de71a297SAlex Elder rbd_dev->dev_id); 4333602adf40SYehuda Sadeh disk->major = rbd_dev->major; 4334dd82fff1SIlya Dryomov disk->first_minor = rbd_dev->minor; 43357e513d43SIlya Dryomov if (single_major) 43367e513d43SIlya Dryomov disk->flags |= GENHD_FL_EXT_DEVT; 4337602adf40SYehuda Sadeh disk->fops = &rbd_bd_ops; 4338602adf40SYehuda Sadeh disk->private_data = rbd_dev; 4339602adf40SYehuda Sadeh 43407ad18afaSChristoph Hellwig memset(&rbd_dev->tag_set, 0, sizeof(rbd_dev->tag_set)); 43417ad18afaSChristoph Hellwig rbd_dev->tag_set.ops = &rbd_mq_ops; 4342b5584180SIlya Dryomov rbd_dev->tag_set.queue_depth = rbd_dev->opts->queue_depth; 43437ad18afaSChristoph Hellwig rbd_dev->tag_set.numa_node = NUMA_NO_NODE; 4344b5584180SIlya Dryomov rbd_dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE; 43457ad18afaSChristoph Hellwig rbd_dev->tag_set.nr_hw_queues = 1; 43467ad18afaSChristoph Hellwig rbd_dev->tag_set.cmd_size = sizeof(struct work_struct); 43477ad18afaSChristoph Hellwig 43487ad18afaSChristoph Hellwig err = blk_mq_alloc_tag_set(&rbd_dev->tag_set); 43497ad18afaSChristoph Hellwig if (err) 4350602adf40SYehuda Sadeh goto out_disk; 4351029bcbd8SJosh Durgin 43527ad18afaSChristoph Hellwig q = blk_mq_init_queue(&rbd_dev->tag_set); 43537ad18afaSChristoph Hellwig if (IS_ERR(q)) { 43547ad18afaSChristoph Hellwig err = PTR_ERR(q); 43557ad18afaSChristoph Hellwig goto out_tag_set; 43567ad18afaSChristoph Hellwig } 43577ad18afaSChristoph Hellwig 4358d8a2c89cSIlya Dryomov 
queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q); 4359d8a2c89cSIlya Dryomov /* QUEUE_FLAG_ADD_RANDOM is off by default for blk-mq */ 4360593a9e7bSAlex Elder 4361029bcbd8SJosh Durgin /* set io sizes to object size */ 4362593a9e7bSAlex Elder segment_size = rbd_obj_bytes(&rbd_dev->header); 4363593a9e7bSAlex Elder blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE); 43640d9fde4fSIlya Dryomov q->limits.max_sectors = queue_max_hw_sectors(q); 4365d3834fefSIlya Dryomov blk_queue_max_segments(q, segment_size / SECTOR_SIZE); 4366593a9e7bSAlex Elder blk_queue_max_segment_size(q, segment_size); 4367593a9e7bSAlex Elder blk_queue_io_min(q, segment_size); 4368593a9e7bSAlex Elder blk_queue_io_opt(q, segment_size); 4369029bcbd8SJosh Durgin 437090e98c52SGuangliang Zhao /* enable the discard support */ 437190e98c52SGuangliang Zhao queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q); 437290e98c52SGuangliang Zhao q->limits.discard_granularity = segment_size; 437390e98c52SGuangliang Zhao q->limits.discard_alignment = segment_size; 43742bb4cd5cSJens Axboe blk_queue_max_discard_sectors(q, segment_size / SECTOR_SIZE); 4375b76f8239SJosh Durgin q->limits.discard_zeroes_data = 1; 437690e98c52SGuangliang Zhao 4377bae818eeSRonny Hegewald if (!ceph_test_opt(rbd_dev->rbd_client->client, NOCRC)) 4378bae818eeSRonny Hegewald q->backing_dev_info.capabilities |= BDI_CAP_STABLE_WRITES; 4379bae818eeSRonny Hegewald 4380602adf40SYehuda Sadeh disk->queue = q; 4381602adf40SYehuda Sadeh 4382602adf40SYehuda Sadeh q->queuedata = rbd_dev; 4383602adf40SYehuda Sadeh 4384602adf40SYehuda Sadeh rbd_dev->disk = disk; 4385602adf40SYehuda Sadeh 4386602adf40SYehuda Sadeh return 0; 43877ad18afaSChristoph Hellwig out_tag_set: 43887ad18afaSChristoph Hellwig blk_mq_free_tag_set(&rbd_dev->tag_set); 4389602adf40SYehuda Sadeh out_disk: 4390602adf40SYehuda Sadeh put_disk(disk); 43917ad18afaSChristoph Hellwig return err; 4392602adf40SYehuda Sadeh } 4393602adf40SYehuda Sadeh 4394dfc5606dSYehuda Sadeh /* 4395dfc5606dSYehuda Sadeh 
sysfs 4396dfc5606dSYehuda Sadeh */ 4397602adf40SYehuda Sadeh 4398593a9e7bSAlex Elder static struct rbd_device *dev_to_rbd_dev(struct device *dev) 4399593a9e7bSAlex Elder { 4400593a9e7bSAlex Elder return container_of(dev, struct rbd_device, dev); 4401593a9e7bSAlex Elder } 4402593a9e7bSAlex Elder 4403dfc5606dSYehuda Sadeh static ssize_t rbd_size_show(struct device *dev, 4404dfc5606dSYehuda Sadeh struct device_attribute *attr, char *buf) 4405602adf40SYehuda Sadeh { 4406593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 4407dfc5606dSYehuda Sadeh 4408fc71d833SAlex Elder return sprintf(buf, "%llu\n", 4409fc71d833SAlex Elder (unsigned long long)rbd_dev->mapping.size); 4410602adf40SYehuda Sadeh } 4411602adf40SYehuda Sadeh 441234b13184SAlex Elder /* 441334b13184SAlex Elder * Note this shows the features for whatever's mapped, which is not 441434b13184SAlex Elder * necessarily the base image. 441534b13184SAlex Elder */ 441634b13184SAlex Elder static ssize_t rbd_features_show(struct device *dev, 441734b13184SAlex Elder struct device_attribute *attr, char *buf) 441834b13184SAlex Elder { 441934b13184SAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 442034b13184SAlex Elder 442134b13184SAlex Elder return sprintf(buf, "0x%016llx\n", 442234b13184SAlex Elder (unsigned long long)rbd_dev->mapping.features); 442334b13184SAlex Elder } 442434b13184SAlex Elder 4425dfc5606dSYehuda Sadeh static ssize_t rbd_major_show(struct device *dev, 4426dfc5606dSYehuda Sadeh struct device_attribute *attr, char *buf) 4427602adf40SYehuda Sadeh { 4428593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 4429dfc5606dSYehuda Sadeh 4430fc71d833SAlex Elder if (rbd_dev->major) 4431dfc5606dSYehuda Sadeh return sprintf(buf, "%d\n", rbd_dev->major); 4432fc71d833SAlex Elder 4433fc71d833SAlex Elder return sprintf(buf, "(none)\n"); 4434dd82fff1SIlya Dryomov } 4435fc71d833SAlex Elder 4436dd82fff1SIlya Dryomov static ssize_t rbd_minor_show(struct device *dev, 
4437dd82fff1SIlya Dryomov struct device_attribute *attr, char *buf) 4438dd82fff1SIlya Dryomov { 4439dd82fff1SIlya Dryomov struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 4440dd82fff1SIlya Dryomov 4441dd82fff1SIlya Dryomov return sprintf(buf, "%d\n", rbd_dev->minor); 4442dfc5606dSYehuda Sadeh } 4443dfc5606dSYehuda Sadeh 4444005a07bfSIlya Dryomov static ssize_t rbd_client_addr_show(struct device *dev, 4445005a07bfSIlya Dryomov struct device_attribute *attr, char *buf) 4446005a07bfSIlya Dryomov { 4447005a07bfSIlya Dryomov struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 4448005a07bfSIlya Dryomov struct ceph_entity_addr *client_addr = 4449005a07bfSIlya Dryomov ceph_client_addr(rbd_dev->rbd_client->client); 4450005a07bfSIlya Dryomov 4451005a07bfSIlya Dryomov return sprintf(buf, "%pISpc/%u\n", &client_addr->in_addr, 4452005a07bfSIlya Dryomov le32_to_cpu(client_addr->nonce)); 4453005a07bfSIlya Dryomov } 4454005a07bfSIlya Dryomov 4455dfc5606dSYehuda Sadeh static ssize_t rbd_client_id_show(struct device *dev, 4456dfc5606dSYehuda Sadeh struct device_attribute *attr, char *buf) 4457dfc5606dSYehuda Sadeh { 4458593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 4459dfc5606dSYehuda Sadeh 44601dbb4399SAlex Elder return sprintf(buf, "client%lld\n", 4461033268a5SIlya Dryomov ceph_client_gid(rbd_dev->rbd_client->client)); 4462dfc5606dSYehuda Sadeh } 4463dfc5606dSYehuda Sadeh 4464267fb90bSMike Christie static ssize_t rbd_cluster_fsid_show(struct device *dev, 4465267fb90bSMike Christie struct device_attribute *attr, char *buf) 4466267fb90bSMike Christie { 4467267fb90bSMike Christie struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 4468267fb90bSMike Christie 4469267fb90bSMike Christie return sprintf(buf, "%pU\n", &rbd_dev->rbd_client->client->fsid); 4470267fb90bSMike Christie } 4471267fb90bSMike Christie 44720d6d1e9cSMike Christie static ssize_t rbd_config_info_show(struct device *dev, 44730d6d1e9cSMike Christie struct device_attribute *attr, char *buf) 
44740d6d1e9cSMike Christie { 44750d6d1e9cSMike Christie struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 44760d6d1e9cSMike Christie 44770d6d1e9cSMike Christie return sprintf(buf, "%s\n", rbd_dev->config_info); 4478dfc5606dSYehuda Sadeh } 4479dfc5606dSYehuda Sadeh 4480dfc5606dSYehuda Sadeh static ssize_t rbd_pool_show(struct device *dev, 4481dfc5606dSYehuda Sadeh struct device_attribute *attr, char *buf) 4482dfc5606dSYehuda Sadeh { 4483593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 4484dfc5606dSYehuda Sadeh 44850d7dbfceSAlex Elder return sprintf(buf, "%s\n", rbd_dev->spec->pool_name); 4486dfc5606dSYehuda Sadeh } 4487dfc5606dSYehuda Sadeh 44889bb2f334SAlex Elder static ssize_t rbd_pool_id_show(struct device *dev, 44899bb2f334SAlex Elder struct device_attribute *attr, char *buf) 44909bb2f334SAlex Elder { 44919bb2f334SAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 44929bb2f334SAlex Elder 44930d7dbfceSAlex Elder return sprintf(buf, "%llu\n", 44940d7dbfceSAlex Elder (unsigned long long) rbd_dev->spec->pool_id); 44959bb2f334SAlex Elder } 44969bb2f334SAlex Elder 4497dfc5606dSYehuda Sadeh static ssize_t rbd_name_show(struct device *dev, 4498dfc5606dSYehuda Sadeh struct device_attribute *attr, char *buf) 4499dfc5606dSYehuda Sadeh { 4500593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 4501dfc5606dSYehuda Sadeh 4502a92ffdf8SAlex Elder if (rbd_dev->spec->image_name) 45030d7dbfceSAlex Elder return sprintf(buf, "%s\n", rbd_dev->spec->image_name); 4504a92ffdf8SAlex Elder 4505a92ffdf8SAlex Elder return sprintf(buf, "(unknown)\n"); 4506dfc5606dSYehuda Sadeh } 4507dfc5606dSYehuda Sadeh 4508589d30e0SAlex Elder static ssize_t rbd_image_id_show(struct device *dev, 4509589d30e0SAlex Elder struct device_attribute *attr, char *buf) 4510589d30e0SAlex Elder { 4511589d30e0SAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 4512589d30e0SAlex Elder 45130d7dbfceSAlex Elder return sprintf(buf, "%s\n", rbd_dev->spec->image_id); 
4514589d30e0SAlex Elder } 4515589d30e0SAlex Elder 451634b13184SAlex Elder /* 451734b13184SAlex Elder * Shows the name of the currently-mapped snapshot (or 451834b13184SAlex Elder * RBD_SNAP_HEAD_NAME for the base image). 451934b13184SAlex Elder */ 4520dfc5606dSYehuda Sadeh static ssize_t rbd_snap_show(struct device *dev, 4521dfc5606dSYehuda Sadeh struct device_attribute *attr, 4522dfc5606dSYehuda Sadeh char *buf) 4523dfc5606dSYehuda Sadeh { 4524593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 4525dfc5606dSYehuda Sadeh 45260d7dbfceSAlex Elder return sprintf(buf, "%s\n", rbd_dev->spec->snap_name); 4527dfc5606dSYehuda Sadeh } 4528dfc5606dSYehuda Sadeh 452992a58671SMike Christie static ssize_t rbd_snap_id_show(struct device *dev, 453092a58671SMike Christie struct device_attribute *attr, char *buf) 453192a58671SMike Christie { 453292a58671SMike Christie struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 453392a58671SMike Christie 453492a58671SMike Christie return sprintf(buf, "%llu\n", rbd_dev->spec->snap_id); 453592a58671SMike Christie } 453692a58671SMike Christie 453786b00e0dSAlex Elder /* 4538ff96128fSIlya Dryomov * For a v2 image, shows the chain of parent images, separated by empty 4539ff96128fSIlya Dryomov * lines. For v1 images or if there is no parent, shows "(no parent 4540ff96128fSIlya Dryomov * image)". 
454186b00e0dSAlex Elder */ 454286b00e0dSAlex Elder static ssize_t rbd_parent_show(struct device *dev, 454386b00e0dSAlex Elder struct device_attribute *attr, 454486b00e0dSAlex Elder char *buf) 454586b00e0dSAlex Elder { 454686b00e0dSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 4547ff96128fSIlya Dryomov ssize_t count = 0; 454886b00e0dSAlex Elder 4549ff96128fSIlya Dryomov if (!rbd_dev->parent) 455086b00e0dSAlex Elder return sprintf(buf, "(no parent image)\n"); 455186b00e0dSAlex Elder 4552ff96128fSIlya Dryomov for ( ; rbd_dev->parent; rbd_dev = rbd_dev->parent) { 4553ff96128fSIlya Dryomov struct rbd_spec *spec = rbd_dev->parent_spec; 455486b00e0dSAlex Elder 4555ff96128fSIlya Dryomov count += sprintf(&buf[count], "%s" 4556ff96128fSIlya Dryomov "pool_id %llu\npool_name %s\n" 4557ff96128fSIlya Dryomov "image_id %s\nimage_name %s\n" 4558ff96128fSIlya Dryomov "snap_id %llu\nsnap_name %s\n" 4559ff96128fSIlya Dryomov "overlap %llu\n", 4560ff96128fSIlya Dryomov !count ? "" : "\n", /* first? 
*/ 4561ff96128fSIlya Dryomov spec->pool_id, spec->pool_name, 4562ff96128fSIlya Dryomov spec->image_id, spec->image_name ?: "(unknown)", 4563ff96128fSIlya Dryomov spec->snap_id, spec->snap_name, 4564ff96128fSIlya Dryomov rbd_dev->parent_overlap); 4565ff96128fSIlya Dryomov } 456686b00e0dSAlex Elder 456786b00e0dSAlex Elder return count; 456886b00e0dSAlex Elder } 456986b00e0dSAlex Elder 4570dfc5606dSYehuda Sadeh static ssize_t rbd_image_refresh(struct device *dev, 4571dfc5606dSYehuda Sadeh struct device_attribute *attr, 4572dfc5606dSYehuda Sadeh const char *buf, 4573dfc5606dSYehuda Sadeh size_t size) 4574dfc5606dSYehuda Sadeh { 4575593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 4576b813623aSAlex Elder int ret; 4577602adf40SYehuda Sadeh 4578cc4a38bdSAlex Elder ret = rbd_dev_refresh(rbd_dev); 4579e627db08SAlex Elder if (ret) 458052bb1f9bSIlya Dryomov return ret; 4581b813623aSAlex Elder 458252bb1f9bSIlya Dryomov return size; 4583dfc5606dSYehuda Sadeh } 4584602adf40SYehuda Sadeh 4585dfc5606dSYehuda Sadeh static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL); 458634b13184SAlex Elder static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL); 4587dfc5606dSYehuda Sadeh static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL); 4588dd82fff1SIlya Dryomov static DEVICE_ATTR(minor, S_IRUGO, rbd_minor_show, NULL); 4589005a07bfSIlya Dryomov static DEVICE_ATTR(client_addr, S_IRUGO, rbd_client_addr_show, NULL); 4590dfc5606dSYehuda Sadeh static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL); 4591267fb90bSMike Christie static DEVICE_ATTR(cluster_fsid, S_IRUGO, rbd_cluster_fsid_show, NULL); 45920d6d1e9cSMike Christie static DEVICE_ATTR(config_info, S_IRUSR, rbd_config_info_show, NULL); 4593dfc5606dSYehuda Sadeh static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL); 45949bb2f334SAlex Elder static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL); 4595dfc5606dSYehuda Sadeh static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL); 4596589d30e0SAlex 
Elder static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL); 4597dfc5606dSYehuda Sadeh static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh); 4598dfc5606dSYehuda Sadeh static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL); 459992a58671SMike Christie static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL); 460086b00e0dSAlex Elder static DEVICE_ATTR(parent, S_IRUGO, rbd_parent_show, NULL); 4601dfc5606dSYehuda Sadeh 4602dfc5606dSYehuda Sadeh static struct attribute *rbd_attrs[] = { 4603dfc5606dSYehuda Sadeh &dev_attr_size.attr, 460434b13184SAlex Elder &dev_attr_features.attr, 4605dfc5606dSYehuda Sadeh &dev_attr_major.attr, 4606dd82fff1SIlya Dryomov &dev_attr_minor.attr, 4607005a07bfSIlya Dryomov &dev_attr_client_addr.attr, 4608dfc5606dSYehuda Sadeh &dev_attr_client_id.attr, 4609267fb90bSMike Christie &dev_attr_cluster_fsid.attr, 46100d6d1e9cSMike Christie &dev_attr_config_info.attr, 4611dfc5606dSYehuda Sadeh &dev_attr_pool.attr, 46129bb2f334SAlex Elder &dev_attr_pool_id.attr, 4613dfc5606dSYehuda Sadeh &dev_attr_name.attr, 4614589d30e0SAlex Elder &dev_attr_image_id.attr, 4615dfc5606dSYehuda Sadeh &dev_attr_current_snap.attr, 461692a58671SMike Christie &dev_attr_snap_id.attr, 461786b00e0dSAlex Elder &dev_attr_parent.attr, 4618dfc5606dSYehuda Sadeh &dev_attr_refresh.attr, 4619dfc5606dSYehuda Sadeh NULL 4620dfc5606dSYehuda Sadeh }; 4621dfc5606dSYehuda Sadeh 4622dfc5606dSYehuda Sadeh static struct attribute_group rbd_attr_group = { 4623dfc5606dSYehuda Sadeh .attrs = rbd_attrs, 4624dfc5606dSYehuda Sadeh }; 4625dfc5606dSYehuda Sadeh 4626dfc5606dSYehuda Sadeh static const struct attribute_group *rbd_attr_groups[] = { 4627dfc5606dSYehuda Sadeh &rbd_attr_group, 4628dfc5606dSYehuda Sadeh NULL 4629dfc5606dSYehuda Sadeh }; 4630dfc5606dSYehuda Sadeh 46316cac4695SIlya Dryomov static void rbd_dev_release(struct device *dev); 4632dfc5606dSYehuda Sadeh 4633dfc5606dSYehuda Sadeh static struct device_type rbd_device_type = { 4634dfc5606dSYehuda Sadeh .name 
= "rbd", 4635dfc5606dSYehuda Sadeh .groups = rbd_attr_groups, 46366cac4695SIlya Dryomov .release = rbd_dev_release, 4637dfc5606dSYehuda Sadeh }; 4638dfc5606dSYehuda Sadeh 46398b8fb99cSAlex Elder static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec) 46408b8fb99cSAlex Elder { 46418b8fb99cSAlex Elder kref_get(&spec->kref); 46428b8fb99cSAlex Elder 46438b8fb99cSAlex Elder return spec; 46448b8fb99cSAlex Elder } 46458b8fb99cSAlex Elder 46468b8fb99cSAlex Elder static void rbd_spec_free(struct kref *kref); 46478b8fb99cSAlex Elder static void rbd_spec_put(struct rbd_spec *spec) 46488b8fb99cSAlex Elder { 46498b8fb99cSAlex Elder if (spec) 46508b8fb99cSAlex Elder kref_put(&spec->kref, rbd_spec_free); 46518b8fb99cSAlex Elder } 46528b8fb99cSAlex Elder 46538b8fb99cSAlex Elder static struct rbd_spec *rbd_spec_alloc(void) 46548b8fb99cSAlex Elder { 46558b8fb99cSAlex Elder struct rbd_spec *spec; 46568b8fb99cSAlex Elder 46578b8fb99cSAlex Elder spec = kzalloc(sizeof (*spec), GFP_KERNEL); 46588b8fb99cSAlex Elder if (!spec) 46598b8fb99cSAlex Elder return NULL; 466004077599SIlya Dryomov 466104077599SIlya Dryomov spec->pool_id = CEPH_NOPOOL; 466204077599SIlya Dryomov spec->snap_id = CEPH_NOSNAP; 46638b8fb99cSAlex Elder kref_init(&spec->kref); 46648b8fb99cSAlex Elder 46658b8fb99cSAlex Elder return spec; 46668b8fb99cSAlex Elder } 46678b8fb99cSAlex Elder 46688b8fb99cSAlex Elder static void rbd_spec_free(struct kref *kref) 46698b8fb99cSAlex Elder { 46708b8fb99cSAlex Elder struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref); 46718b8fb99cSAlex Elder 46728b8fb99cSAlex Elder kfree(spec->pool_name); 46738b8fb99cSAlex Elder kfree(spec->image_id); 46748b8fb99cSAlex Elder kfree(spec->image_name); 46758b8fb99cSAlex Elder kfree(spec->snap_name); 46768b8fb99cSAlex Elder kfree(spec); 46778b8fb99cSAlex Elder } 46788b8fb99cSAlex Elder 46791643dfa4SIlya Dryomov static void rbd_dev_free(struct rbd_device *rbd_dev) 4680dd5ac32dSIlya Dryomov { 468199d16943SIlya Dryomov 
WARN_ON(rbd_dev->watch_state != RBD_WATCH_STATE_UNREGISTERED); 4682ed95b21aSIlya Dryomov WARN_ON(rbd_dev->lock_state != RBD_LOCK_STATE_UNLOCKED); 4683dd5ac32dSIlya Dryomov 4684c41d13a3SIlya Dryomov ceph_oid_destroy(&rbd_dev->header_oid); 46856b6dddbeSIlya Dryomov ceph_oloc_destroy(&rbd_dev->header_oloc); 46860d6d1e9cSMike Christie kfree(rbd_dev->config_info); 4687c41d13a3SIlya Dryomov 4688dd5ac32dSIlya Dryomov rbd_put_client(rbd_dev->rbd_client); 4689dd5ac32dSIlya Dryomov rbd_spec_put(rbd_dev->spec); 4690dd5ac32dSIlya Dryomov kfree(rbd_dev->opts); 4691dd5ac32dSIlya Dryomov kfree(rbd_dev); 46921643dfa4SIlya Dryomov } 46931643dfa4SIlya Dryomov 46941643dfa4SIlya Dryomov static void rbd_dev_release(struct device *dev) 46951643dfa4SIlya Dryomov { 46961643dfa4SIlya Dryomov struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 46971643dfa4SIlya Dryomov bool need_put = !!rbd_dev->opts; 46981643dfa4SIlya Dryomov 46991643dfa4SIlya Dryomov if (need_put) { 47001643dfa4SIlya Dryomov destroy_workqueue(rbd_dev->task_wq); 47011643dfa4SIlya Dryomov ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id); 47021643dfa4SIlya Dryomov } 47031643dfa4SIlya Dryomov 47041643dfa4SIlya Dryomov rbd_dev_free(rbd_dev); 4705dd5ac32dSIlya Dryomov 4706dd5ac32dSIlya Dryomov /* 4707dd5ac32dSIlya Dryomov * This is racy, but way better than putting module outside of 4708dd5ac32dSIlya Dryomov * the release callback. The race window is pretty small, so 4709dd5ac32dSIlya Dryomov * doing something similar to dm (dm-builtin.c) is overkill. 
4710dd5ac32dSIlya Dryomov */ 4711dd5ac32dSIlya Dryomov if (need_put) 4712dd5ac32dSIlya Dryomov module_put(THIS_MODULE); 4713dd5ac32dSIlya Dryomov } 4714dd5ac32dSIlya Dryomov 47151643dfa4SIlya Dryomov static struct rbd_device *__rbd_dev_create(struct rbd_client *rbdc, 47161643dfa4SIlya Dryomov struct rbd_spec *spec) 4717c53d5893SAlex Elder { 4718c53d5893SAlex Elder struct rbd_device *rbd_dev; 4719c53d5893SAlex Elder 4720c53d5893SAlex Elder rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL); 4721c53d5893SAlex Elder if (!rbd_dev) 4722c53d5893SAlex Elder return NULL; 4723c53d5893SAlex Elder 4724c53d5893SAlex Elder spin_lock_init(&rbd_dev->lock); 4725c53d5893SAlex Elder INIT_LIST_HEAD(&rbd_dev->node); 4726c53d5893SAlex Elder init_rwsem(&rbd_dev->header_rwsem); 4727c53d5893SAlex Elder 47287e97332eSIlya Dryomov rbd_dev->header.data_pool_id = CEPH_NOPOOL; 4729c41d13a3SIlya Dryomov ceph_oid_init(&rbd_dev->header_oid); 4730431a02cdSIlya Dryomov rbd_dev->header_oloc.pool = spec->pool_id; 4731c41d13a3SIlya Dryomov 473299d16943SIlya Dryomov mutex_init(&rbd_dev->watch_mutex); 473399d16943SIlya Dryomov rbd_dev->watch_state = RBD_WATCH_STATE_UNREGISTERED; 473499d16943SIlya Dryomov INIT_DELAYED_WORK(&rbd_dev->watch_dwork, rbd_reregister_watch); 473599d16943SIlya Dryomov 4736ed95b21aSIlya Dryomov init_rwsem(&rbd_dev->lock_rwsem); 4737ed95b21aSIlya Dryomov rbd_dev->lock_state = RBD_LOCK_STATE_UNLOCKED; 4738ed95b21aSIlya Dryomov INIT_WORK(&rbd_dev->acquired_lock_work, rbd_notify_acquired_lock); 4739ed95b21aSIlya Dryomov INIT_WORK(&rbd_dev->released_lock_work, rbd_notify_released_lock); 4740ed95b21aSIlya Dryomov INIT_DELAYED_WORK(&rbd_dev->lock_dwork, rbd_acquire_lock); 4741ed95b21aSIlya Dryomov INIT_WORK(&rbd_dev->unlock_work, rbd_release_lock_work); 4742ed95b21aSIlya Dryomov init_waitqueue_head(&rbd_dev->lock_waitq); 4743ed95b21aSIlya Dryomov 4744dd5ac32dSIlya Dryomov rbd_dev->dev.bus = &rbd_bus_type; 4745dd5ac32dSIlya Dryomov rbd_dev->dev.type = &rbd_device_type; 4746dd5ac32dSIlya 
Dryomov rbd_dev->dev.parent = &rbd_root_dev; 4747dd5ac32dSIlya Dryomov device_initialize(&rbd_dev->dev); 4748dd5ac32dSIlya Dryomov 4749c53d5893SAlex Elder rbd_dev->rbd_client = rbdc; 4750d147543dSIlya Dryomov rbd_dev->spec = spec; 47510903e875SAlex Elder 47521643dfa4SIlya Dryomov return rbd_dev; 47531643dfa4SIlya Dryomov } 47541643dfa4SIlya Dryomov 4755dd5ac32dSIlya Dryomov /* 47561643dfa4SIlya Dryomov * Create a mapping rbd_dev. 4757dd5ac32dSIlya Dryomov */ 47581643dfa4SIlya Dryomov static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc, 47591643dfa4SIlya Dryomov struct rbd_spec *spec, 47601643dfa4SIlya Dryomov struct rbd_options *opts) 47611643dfa4SIlya Dryomov { 47621643dfa4SIlya Dryomov struct rbd_device *rbd_dev; 47631643dfa4SIlya Dryomov 47641643dfa4SIlya Dryomov rbd_dev = __rbd_dev_create(rbdc, spec); 47651643dfa4SIlya Dryomov if (!rbd_dev) 47661643dfa4SIlya Dryomov return NULL; 47671643dfa4SIlya Dryomov 47681643dfa4SIlya Dryomov rbd_dev->opts = opts; 47691643dfa4SIlya Dryomov 47701643dfa4SIlya Dryomov /* get an id and fill in device name */ 47711643dfa4SIlya Dryomov rbd_dev->dev_id = ida_simple_get(&rbd_dev_id_ida, 0, 47721643dfa4SIlya Dryomov minor_to_rbd_dev_id(1 << MINORBITS), 47731643dfa4SIlya Dryomov GFP_KERNEL); 47741643dfa4SIlya Dryomov if (rbd_dev->dev_id < 0) 47751643dfa4SIlya Dryomov goto fail_rbd_dev; 47761643dfa4SIlya Dryomov 47771643dfa4SIlya Dryomov sprintf(rbd_dev->name, RBD_DRV_NAME "%d", rbd_dev->dev_id); 47781643dfa4SIlya Dryomov rbd_dev->task_wq = alloc_ordered_workqueue("%s-tasks", WQ_MEM_RECLAIM, 47791643dfa4SIlya Dryomov rbd_dev->name); 47801643dfa4SIlya Dryomov if (!rbd_dev->task_wq) 47811643dfa4SIlya Dryomov goto fail_dev_id; 47821643dfa4SIlya Dryomov 47831643dfa4SIlya Dryomov /* we have a ref from do_rbd_add() */ 4784dd5ac32dSIlya Dryomov __module_get(THIS_MODULE); 4785dd5ac32dSIlya Dryomov 47861643dfa4SIlya Dryomov dout("%s rbd_dev %p dev_id %d\n", __func__, rbd_dev, rbd_dev->dev_id); 4787c53d5893SAlex Elder return 
rbd_dev; 47881643dfa4SIlya Dryomov 47891643dfa4SIlya Dryomov fail_dev_id: 47901643dfa4SIlya Dryomov ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id); 47911643dfa4SIlya Dryomov fail_rbd_dev: 47921643dfa4SIlya Dryomov rbd_dev_free(rbd_dev); 47931643dfa4SIlya Dryomov return NULL; 4794c53d5893SAlex Elder } 4795c53d5893SAlex Elder 4796c53d5893SAlex Elder static void rbd_dev_destroy(struct rbd_device *rbd_dev) 4797c53d5893SAlex Elder { 4798dd5ac32dSIlya Dryomov if (rbd_dev) 4799dd5ac32dSIlya Dryomov put_device(&rbd_dev->dev); 4800c53d5893SAlex Elder } 4801c53d5893SAlex Elder 4802dfc5606dSYehuda Sadeh /* 48039d475de5SAlex Elder * Get the size and object order for an image snapshot, or if 48049d475de5SAlex Elder * snap_id is CEPH_NOSNAP, gets this information for the base 48059d475de5SAlex Elder * image. 48069d475de5SAlex Elder */ 48079d475de5SAlex Elder static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id, 48089d475de5SAlex Elder u8 *order, u64 *snap_size) 48099d475de5SAlex Elder { 48109d475de5SAlex Elder __le64 snapid = cpu_to_le64(snap_id); 48119d475de5SAlex Elder int ret; 48129d475de5SAlex Elder struct { 48139d475de5SAlex Elder u8 order; 48149d475de5SAlex Elder __le64 size; 48159d475de5SAlex Elder } __attribute__ ((packed)) size_buf = { 0 }; 48169d475de5SAlex Elder 4817ecd4a68aSIlya Dryomov ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid, 4818ecd4a68aSIlya Dryomov &rbd_dev->header_oloc, "get_size", 48194157976bSAlex Elder &snapid, sizeof(snapid), 4820e2a58ee5SAlex Elder &size_buf, sizeof(size_buf)); 482136be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 48229d475de5SAlex Elder if (ret < 0) 48239d475de5SAlex Elder return ret; 482457385b51SAlex Elder if (ret < sizeof (size_buf)) 482557385b51SAlex Elder return -ERANGE; 48269d475de5SAlex Elder 4827c3545579SJosh Durgin if (order) { 48289d475de5SAlex Elder *order = size_buf.order; 4829c3545579SJosh Durgin dout(" order %u", (unsigned int)*order); 4830c3545579SJosh 
Durgin } 48319d475de5SAlex Elder *snap_size = le64_to_cpu(size_buf.size); 48329d475de5SAlex Elder 4833c3545579SJosh Durgin dout(" snap_id 0x%016llx snap_size = %llu\n", 4834c3545579SJosh Durgin (unsigned long long)snap_id, 48359d475de5SAlex Elder (unsigned long long)*snap_size); 48369d475de5SAlex Elder 48379d475de5SAlex Elder return 0; 48389d475de5SAlex Elder } 48399d475de5SAlex Elder 48409d475de5SAlex Elder static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev) 48419d475de5SAlex Elder { 48429d475de5SAlex Elder return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP, 48439d475de5SAlex Elder &rbd_dev->header.obj_order, 48449d475de5SAlex Elder &rbd_dev->header.image_size); 48459d475de5SAlex Elder } 48469d475de5SAlex Elder 48471e130199SAlex Elder static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev) 48481e130199SAlex Elder { 48491e130199SAlex Elder void *reply_buf; 48501e130199SAlex Elder int ret; 48511e130199SAlex Elder void *p; 48521e130199SAlex Elder 48531e130199SAlex Elder reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL); 48541e130199SAlex Elder if (!reply_buf) 48551e130199SAlex Elder return -ENOMEM; 48561e130199SAlex Elder 4857ecd4a68aSIlya Dryomov ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid, 4858ecd4a68aSIlya Dryomov &rbd_dev->header_oloc, "get_object_prefix", 4859ecd4a68aSIlya Dryomov NULL, 0, reply_buf, RBD_OBJ_PREFIX_LEN_MAX); 486036be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 48611e130199SAlex Elder if (ret < 0) 48621e130199SAlex Elder goto out; 48631e130199SAlex Elder 48641e130199SAlex Elder p = reply_buf; 48651e130199SAlex Elder rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p, 486657385b51SAlex Elder p + ret, NULL, GFP_NOIO); 486757385b51SAlex Elder ret = 0; 48681e130199SAlex Elder 48691e130199SAlex Elder if (IS_ERR(rbd_dev->header.object_prefix)) { 48701e130199SAlex Elder ret = PTR_ERR(rbd_dev->header.object_prefix); 48711e130199SAlex Elder rbd_dev->header.object_prefix = NULL; 
48721e130199SAlex Elder } else { 48731e130199SAlex Elder dout(" object_prefix = %s\n", rbd_dev->header.object_prefix); 48741e130199SAlex Elder } 48751e130199SAlex Elder out: 48761e130199SAlex Elder kfree(reply_buf); 48771e130199SAlex Elder 48781e130199SAlex Elder return ret; 48791e130199SAlex Elder } 48801e130199SAlex Elder 4881b1b5402aSAlex Elder static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id, 4882b1b5402aSAlex Elder u64 *snap_features) 4883b1b5402aSAlex Elder { 4884b1b5402aSAlex Elder __le64 snapid = cpu_to_le64(snap_id); 4885b1b5402aSAlex Elder struct { 4886b1b5402aSAlex Elder __le64 features; 4887b1b5402aSAlex Elder __le64 incompat; 48884157976bSAlex Elder } __attribute__ ((packed)) features_buf = { 0 }; 4889d3767f0fSIlya Dryomov u64 unsup; 4890b1b5402aSAlex Elder int ret; 4891b1b5402aSAlex Elder 4892ecd4a68aSIlya Dryomov ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid, 4893ecd4a68aSIlya Dryomov &rbd_dev->header_oloc, "get_features", 48944157976bSAlex Elder &snapid, sizeof(snapid), 4895e2a58ee5SAlex Elder &features_buf, sizeof(features_buf)); 489636be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 4897b1b5402aSAlex Elder if (ret < 0) 4898b1b5402aSAlex Elder return ret; 489957385b51SAlex Elder if (ret < sizeof (features_buf)) 490057385b51SAlex Elder return -ERANGE; 4901d889140cSAlex Elder 4902d3767f0fSIlya Dryomov unsup = le64_to_cpu(features_buf.incompat) & ~RBD_FEATURES_SUPPORTED; 4903d3767f0fSIlya Dryomov if (unsup) { 4904d3767f0fSIlya Dryomov rbd_warn(rbd_dev, "image uses unsupported features: 0x%llx", 4905d3767f0fSIlya Dryomov unsup); 4906b8f5c6edSAlex Elder return -ENXIO; 4907d3767f0fSIlya Dryomov } 4908d889140cSAlex Elder 4909b1b5402aSAlex Elder *snap_features = le64_to_cpu(features_buf.features); 4910b1b5402aSAlex Elder 4911b1b5402aSAlex Elder dout(" snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n", 4912b1b5402aSAlex Elder (unsigned long long)snap_id, 4913b1b5402aSAlex 
Elder (unsigned long long)*snap_features, 4914b1b5402aSAlex Elder (unsigned long long)le64_to_cpu(features_buf.incompat)); 4915b1b5402aSAlex Elder 4916b1b5402aSAlex Elder return 0; 4917b1b5402aSAlex Elder } 4918b1b5402aSAlex Elder 4919b1b5402aSAlex Elder static int rbd_dev_v2_features(struct rbd_device *rbd_dev) 4920b1b5402aSAlex Elder { 4921b1b5402aSAlex Elder return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP, 4922b1b5402aSAlex Elder &rbd_dev->header.features); 4923b1b5402aSAlex Elder } 4924b1b5402aSAlex Elder 492586b00e0dSAlex Elder static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev) 492686b00e0dSAlex Elder { 492786b00e0dSAlex Elder struct rbd_spec *parent_spec; 492886b00e0dSAlex Elder size_t size; 492986b00e0dSAlex Elder void *reply_buf = NULL; 493086b00e0dSAlex Elder __le64 snapid; 493186b00e0dSAlex Elder void *p; 493286b00e0dSAlex Elder void *end; 4933642a2537SAlex Elder u64 pool_id; 493486b00e0dSAlex Elder char *image_id; 49353b5cf2a2SAlex Elder u64 snap_id; 493686b00e0dSAlex Elder u64 overlap; 493786b00e0dSAlex Elder int ret; 493886b00e0dSAlex Elder 493986b00e0dSAlex Elder parent_spec = rbd_spec_alloc(); 494086b00e0dSAlex Elder if (!parent_spec) 494186b00e0dSAlex Elder return -ENOMEM; 494286b00e0dSAlex Elder 494386b00e0dSAlex Elder size = sizeof (__le64) + /* pool_id */ 494486b00e0dSAlex Elder sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX + /* image_id */ 494586b00e0dSAlex Elder sizeof (__le64) + /* snap_id */ 494686b00e0dSAlex Elder sizeof (__le64); /* overlap */ 494786b00e0dSAlex Elder reply_buf = kmalloc(size, GFP_KERNEL); 494886b00e0dSAlex Elder if (!reply_buf) { 494986b00e0dSAlex Elder ret = -ENOMEM; 495086b00e0dSAlex Elder goto out_err; 495186b00e0dSAlex Elder } 495286b00e0dSAlex Elder 49534d9b67cdSIlya Dryomov snapid = cpu_to_le64(rbd_dev->spec->snap_id); 4954ecd4a68aSIlya Dryomov ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid, 4955ecd4a68aSIlya Dryomov &rbd_dev->header_oloc, "get_parent", 4956ecd4a68aSIlya Dryomov &snapid, 
sizeof(snapid), reply_buf, size); 495736be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 495886b00e0dSAlex Elder if (ret < 0) 495986b00e0dSAlex Elder goto out_err; 496086b00e0dSAlex Elder 496186b00e0dSAlex Elder p = reply_buf; 496257385b51SAlex Elder end = reply_buf + ret; 496357385b51SAlex Elder ret = -ERANGE; 4964642a2537SAlex Elder ceph_decode_64_safe(&p, end, pool_id, out_err); 4965392a9dadSAlex Elder if (pool_id == CEPH_NOPOOL) { 4966392a9dadSAlex Elder /* 4967392a9dadSAlex Elder * Either the parent never existed, or we have 4968392a9dadSAlex Elder * record of it but the image got flattened so it no 4969392a9dadSAlex Elder * longer has a parent. When the parent of a 4970392a9dadSAlex Elder * layered image disappears we immediately set the 4971392a9dadSAlex Elder * overlap to 0. The effect of this is that all new 4972392a9dadSAlex Elder * requests will be treated as if the image had no 4973392a9dadSAlex Elder * parent. 4974392a9dadSAlex Elder */ 4975392a9dadSAlex Elder if (rbd_dev->parent_overlap) { 4976392a9dadSAlex Elder rbd_dev->parent_overlap = 0; 4977392a9dadSAlex Elder rbd_dev_parent_put(rbd_dev); 4978392a9dadSAlex Elder pr_info("%s: clone image has been flattened\n", 4979392a9dadSAlex Elder rbd_dev->disk->disk_name); 4980392a9dadSAlex Elder } 4981392a9dadSAlex Elder 498286b00e0dSAlex Elder goto out; /* No parent? No problem. 
*/ 4983392a9dadSAlex Elder } 498486b00e0dSAlex Elder 49850903e875SAlex Elder /* The ceph file layout needs to fit pool id in 32 bits */ 49860903e875SAlex Elder 49870903e875SAlex Elder ret = -EIO; 4988642a2537SAlex Elder if (pool_id > (u64)U32_MAX) { 49899584d508SIlya Dryomov rbd_warn(NULL, "parent pool id too large (%llu > %u)", 4990642a2537SAlex Elder (unsigned long long)pool_id, U32_MAX); 499157385b51SAlex Elder goto out_err; 4992c0cd10dbSAlex Elder } 49930903e875SAlex Elder 4994979ed480SAlex Elder image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL); 499586b00e0dSAlex Elder if (IS_ERR(image_id)) { 499686b00e0dSAlex Elder ret = PTR_ERR(image_id); 499786b00e0dSAlex Elder goto out_err; 499886b00e0dSAlex Elder } 49993b5cf2a2SAlex Elder ceph_decode_64_safe(&p, end, snap_id, out_err); 500086b00e0dSAlex Elder ceph_decode_64_safe(&p, end, overlap, out_err); 500186b00e0dSAlex Elder 50023b5cf2a2SAlex Elder /* 50033b5cf2a2SAlex Elder * The parent won't change (except when the clone is 50043b5cf2a2SAlex Elder * flattened, already handled that). So we only need to 50053b5cf2a2SAlex Elder * record the parent spec we have not already done so. 50063b5cf2a2SAlex Elder */ 50073b5cf2a2SAlex Elder if (!rbd_dev->parent_spec) { 50083b5cf2a2SAlex Elder parent_spec->pool_id = pool_id; 50093b5cf2a2SAlex Elder parent_spec->image_id = image_id; 50103b5cf2a2SAlex Elder parent_spec->snap_id = snap_id; 501186b00e0dSAlex Elder rbd_dev->parent_spec = parent_spec; 501286b00e0dSAlex Elder parent_spec = NULL; /* rbd_dev now owns this */ 5013fbba11b3SIlya Dryomov } else { 5014fbba11b3SIlya Dryomov kfree(image_id); 50153b5cf2a2SAlex Elder } 50163b5cf2a2SAlex Elder 50173b5cf2a2SAlex Elder /* 5018cf32bd9cSIlya Dryomov * We always update the parent overlap. If it's zero we issue 5019cf32bd9cSIlya Dryomov * a warning, as we will proceed as if there was no parent. 
50203b5cf2a2SAlex Elder */ 50213b5cf2a2SAlex Elder if (!overlap) { 50223b5cf2a2SAlex Elder if (parent_spec) { 5023cf32bd9cSIlya Dryomov /* refresh, careful to warn just once */ 5024cf32bd9cSIlya Dryomov if (rbd_dev->parent_overlap) 5025cf32bd9cSIlya Dryomov rbd_warn(rbd_dev, 5026cf32bd9cSIlya Dryomov "clone now standalone (overlap became 0)"); 502770cf49cfSAlex Elder } else { 5028cf32bd9cSIlya Dryomov /* initial probe */ 5029cf32bd9cSIlya Dryomov rbd_warn(rbd_dev, "clone is standalone (overlap 0)"); 50303b5cf2a2SAlex Elder } 503170cf49cfSAlex Elder } 5032cf32bd9cSIlya Dryomov rbd_dev->parent_overlap = overlap; 5033cf32bd9cSIlya Dryomov 503486b00e0dSAlex Elder out: 503586b00e0dSAlex Elder ret = 0; 503686b00e0dSAlex Elder out_err: 503786b00e0dSAlex Elder kfree(reply_buf); 503886b00e0dSAlex Elder rbd_spec_put(parent_spec); 503986b00e0dSAlex Elder 504086b00e0dSAlex Elder return ret; 504186b00e0dSAlex Elder } 504286b00e0dSAlex Elder 5043cc070d59SAlex Elder static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev) 5044cc070d59SAlex Elder { 5045cc070d59SAlex Elder struct { 5046cc070d59SAlex Elder __le64 stripe_unit; 5047cc070d59SAlex Elder __le64 stripe_count; 5048cc070d59SAlex Elder } __attribute__ ((packed)) striping_info_buf = { 0 }; 5049cc070d59SAlex Elder size_t size = sizeof (striping_info_buf); 5050cc070d59SAlex Elder void *p; 5051cc070d59SAlex Elder u64 obj_size; 5052cc070d59SAlex Elder u64 stripe_unit; 5053cc070d59SAlex Elder u64 stripe_count; 5054cc070d59SAlex Elder int ret; 5055cc070d59SAlex Elder 5056ecd4a68aSIlya Dryomov ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid, 5057ecd4a68aSIlya Dryomov &rbd_dev->header_oloc, "get_stripe_unit_count", 5058ecd4a68aSIlya Dryomov NULL, 0, &striping_info_buf, size); 5059cc070d59SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 5060cc070d59SAlex Elder if (ret < 0) 5061cc070d59SAlex Elder return ret; 5062cc070d59SAlex Elder if (ret < size) 5063cc070d59SAlex Elder return -ERANGE; 
5064cc070d59SAlex Elder 5065cc070d59SAlex Elder /* 5066cc070d59SAlex Elder * We don't actually support the "fancy striping" feature 5067cc070d59SAlex Elder * (STRIPINGV2) yet, but if the striping sizes are the 5068cc070d59SAlex Elder * defaults the behavior is the same as before. So find 5069cc070d59SAlex Elder * out, and only fail if the image has non-default values. 5070cc070d59SAlex Elder */ 5071cc070d59SAlex Elder ret = -EINVAL; 50725bc3fb17SIlya Dryomov obj_size = rbd_obj_bytes(&rbd_dev->header); 5073cc070d59SAlex Elder p = &striping_info_buf; 5074cc070d59SAlex Elder stripe_unit = ceph_decode_64(&p); 5075cc070d59SAlex Elder if (stripe_unit != obj_size) { 5076cc070d59SAlex Elder rbd_warn(rbd_dev, "unsupported stripe unit " 5077cc070d59SAlex Elder "(got %llu want %llu)", 5078cc070d59SAlex Elder stripe_unit, obj_size); 5079cc070d59SAlex Elder return -EINVAL; 5080cc070d59SAlex Elder } 5081cc070d59SAlex Elder stripe_count = ceph_decode_64(&p); 5082cc070d59SAlex Elder if (stripe_count != 1) { 5083cc070d59SAlex Elder rbd_warn(rbd_dev, "unsupported stripe count " 5084cc070d59SAlex Elder "(got %llu want 1)", stripe_count); 5085cc070d59SAlex Elder return -EINVAL; 5086cc070d59SAlex Elder } 5087500d0c0fSAlex Elder rbd_dev->header.stripe_unit = stripe_unit; 5088500d0c0fSAlex Elder rbd_dev->header.stripe_count = stripe_count; 5089cc070d59SAlex Elder 5090cc070d59SAlex Elder return 0; 5091cc070d59SAlex Elder } 5092cc070d59SAlex Elder 50937e97332eSIlya Dryomov static int rbd_dev_v2_data_pool(struct rbd_device *rbd_dev) 50947e97332eSIlya Dryomov { 50957e97332eSIlya Dryomov __le64 data_pool_id; 50967e97332eSIlya Dryomov int ret; 50977e97332eSIlya Dryomov 50987e97332eSIlya Dryomov ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid, 50997e97332eSIlya Dryomov &rbd_dev->header_oloc, "get_data_pool", 51007e97332eSIlya Dryomov NULL, 0, &data_pool_id, sizeof(data_pool_id)); 51017e97332eSIlya Dryomov if (ret < 0) 51027e97332eSIlya Dryomov return ret; 51037e97332eSIlya Dryomov if 
(ret < sizeof(data_pool_id)) 51047e97332eSIlya Dryomov return -EBADMSG; 51057e97332eSIlya Dryomov 51067e97332eSIlya Dryomov rbd_dev->header.data_pool_id = le64_to_cpu(data_pool_id); 51077e97332eSIlya Dryomov WARN_ON(rbd_dev->header.data_pool_id == CEPH_NOPOOL); 51087e97332eSIlya Dryomov return 0; 51097e97332eSIlya Dryomov } 51107e97332eSIlya Dryomov 51119e15b77dSAlex Elder static char *rbd_dev_image_name(struct rbd_device *rbd_dev) 51129e15b77dSAlex Elder { 5113ecd4a68aSIlya Dryomov CEPH_DEFINE_OID_ONSTACK(oid); 51149e15b77dSAlex Elder size_t image_id_size; 51159e15b77dSAlex Elder char *image_id; 51169e15b77dSAlex Elder void *p; 51179e15b77dSAlex Elder void *end; 51189e15b77dSAlex Elder size_t size; 51199e15b77dSAlex Elder void *reply_buf = NULL; 51209e15b77dSAlex Elder size_t len = 0; 51219e15b77dSAlex Elder char *image_name = NULL; 51229e15b77dSAlex Elder int ret; 51239e15b77dSAlex Elder 51249e15b77dSAlex Elder rbd_assert(!rbd_dev->spec->image_name); 51259e15b77dSAlex Elder 512669e7a02fSAlex Elder len = strlen(rbd_dev->spec->image_id); 512769e7a02fSAlex Elder image_id_size = sizeof (__le32) + len; 51289e15b77dSAlex Elder image_id = kmalloc(image_id_size, GFP_KERNEL); 51299e15b77dSAlex Elder if (!image_id) 51309e15b77dSAlex Elder return NULL; 51319e15b77dSAlex Elder 51329e15b77dSAlex Elder p = image_id; 51334157976bSAlex Elder end = image_id + image_id_size; 513469e7a02fSAlex Elder ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32)len); 51359e15b77dSAlex Elder 51369e15b77dSAlex Elder size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX; 51379e15b77dSAlex Elder reply_buf = kmalloc(size, GFP_KERNEL); 51389e15b77dSAlex Elder if (!reply_buf) 51399e15b77dSAlex Elder goto out; 51409e15b77dSAlex Elder 5141ecd4a68aSIlya Dryomov ceph_oid_printf(&oid, "%s", RBD_DIRECTORY); 5142ecd4a68aSIlya Dryomov ret = rbd_obj_method_sync(rbd_dev, &oid, &rbd_dev->header_oloc, 5143ecd4a68aSIlya Dryomov "dir_get_name", image_id, image_id_size, 5144e2a58ee5SAlex Elder reply_buf, size); 
51459e15b77dSAlex Elder if (ret < 0) 51469e15b77dSAlex Elder goto out; 51479e15b77dSAlex Elder p = reply_buf; 5148f40eb349SAlex Elder end = reply_buf + ret; 5149f40eb349SAlex Elder 51509e15b77dSAlex Elder image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL); 51519e15b77dSAlex Elder if (IS_ERR(image_name)) 51529e15b77dSAlex Elder image_name = NULL; 51539e15b77dSAlex Elder else 51549e15b77dSAlex Elder dout("%s: name is %s len is %zd\n", __func__, image_name, len); 51559e15b77dSAlex Elder out: 51569e15b77dSAlex Elder kfree(reply_buf); 51579e15b77dSAlex Elder kfree(image_id); 51589e15b77dSAlex Elder 51599e15b77dSAlex Elder return image_name; 51609e15b77dSAlex Elder } 51619e15b77dSAlex Elder 51622ad3d716SAlex Elder static u64 rbd_v1_snap_id_by_name(struct rbd_device *rbd_dev, const char *name) 51632ad3d716SAlex Elder { 51642ad3d716SAlex Elder struct ceph_snap_context *snapc = rbd_dev->header.snapc; 51652ad3d716SAlex Elder const char *snap_name; 51662ad3d716SAlex Elder u32 which = 0; 51672ad3d716SAlex Elder 51682ad3d716SAlex Elder /* Skip over names until we find the one we are looking for */ 51692ad3d716SAlex Elder 51702ad3d716SAlex Elder snap_name = rbd_dev->header.snap_names; 51712ad3d716SAlex Elder while (which < snapc->num_snaps) { 51722ad3d716SAlex Elder if (!strcmp(name, snap_name)) 51732ad3d716SAlex Elder return snapc->snaps[which]; 51742ad3d716SAlex Elder snap_name += strlen(snap_name) + 1; 51752ad3d716SAlex Elder which++; 51762ad3d716SAlex Elder } 51772ad3d716SAlex Elder return CEPH_NOSNAP; 51782ad3d716SAlex Elder } 51792ad3d716SAlex Elder 51802ad3d716SAlex Elder static u64 rbd_v2_snap_id_by_name(struct rbd_device *rbd_dev, const char *name) 51812ad3d716SAlex Elder { 51822ad3d716SAlex Elder struct ceph_snap_context *snapc = rbd_dev->header.snapc; 51832ad3d716SAlex Elder u32 which; 51842ad3d716SAlex Elder bool found = false; 51852ad3d716SAlex Elder u64 snap_id; 51862ad3d716SAlex Elder 51872ad3d716SAlex Elder for (which = 0; !found && which < 
snapc->num_snaps; which++) { 51882ad3d716SAlex Elder const char *snap_name; 51892ad3d716SAlex Elder 51902ad3d716SAlex Elder snap_id = snapc->snaps[which]; 51912ad3d716SAlex Elder snap_name = rbd_dev_v2_snap_name(rbd_dev, snap_id); 5192efadc98aSJosh Durgin if (IS_ERR(snap_name)) { 5193efadc98aSJosh Durgin /* ignore no-longer existing snapshots */ 5194efadc98aSJosh Durgin if (PTR_ERR(snap_name) == -ENOENT) 5195efadc98aSJosh Durgin continue; 5196efadc98aSJosh Durgin else 51972ad3d716SAlex Elder break; 5198efadc98aSJosh Durgin } 51992ad3d716SAlex Elder found = !strcmp(name, snap_name); 52002ad3d716SAlex Elder kfree(snap_name); 52012ad3d716SAlex Elder } 52022ad3d716SAlex Elder return found ? snap_id : CEPH_NOSNAP; 52032ad3d716SAlex Elder } 52042ad3d716SAlex Elder 52052ad3d716SAlex Elder /* 52062ad3d716SAlex Elder * Assumes name is never RBD_SNAP_HEAD_NAME; returns CEPH_NOSNAP if 52072ad3d716SAlex Elder * no snapshot by that name is found, or if an error occurs. 52082ad3d716SAlex Elder */ 52092ad3d716SAlex Elder static u64 rbd_snap_id_by_name(struct rbd_device *rbd_dev, const char *name) 52102ad3d716SAlex Elder { 52112ad3d716SAlex Elder if (rbd_dev->image_format == 1) 52122ad3d716SAlex Elder return rbd_v1_snap_id_by_name(rbd_dev, name); 52132ad3d716SAlex Elder 52142ad3d716SAlex Elder return rbd_v2_snap_id_by_name(rbd_dev, name); 52152ad3d716SAlex Elder } 52162ad3d716SAlex Elder 52179e15b77dSAlex Elder /* 521804077599SIlya Dryomov * An image being mapped will have everything but the snap id. 
52199e15b77dSAlex Elder */ 522004077599SIlya Dryomov static int rbd_spec_fill_snap_id(struct rbd_device *rbd_dev) 522104077599SIlya Dryomov { 522204077599SIlya Dryomov struct rbd_spec *spec = rbd_dev->spec; 522304077599SIlya Dryomov 522404077599SIlya Dryomov rbd_assert(spec->pool_id != CEPH_NOPOOL && spec->pool_name); 522504077599SIlya Dryomov rbd_assert(spec->image_id && spec->image_name); 522604077599SIlya Dryomov rbd_assert(spec->snap_name); 522704077599SIlya Dryomov 522804077599SIlya Dryomov if (strcmp(spec->snap_name, RBD_SNAP_HEAD_NAME)) { 522904077599SIlya Dryomov u64 snap_id; 523004077599SIlya Dryomov 523104077599SIlya Dryomov snap_id = rbd_snap_id_by_name(rbd_dev, spec->snap_name); 523204077599SIlya Dryomov if (snap_id == CEPH_NOSNAP) 523304077599SIlya Dryomov return -ENOENT; 523404077599SIlya Dryomov 523504077599SIlya Dryomov spec->snap_id = snap_id; 523604077599SIlya Dryomov } else { 523704077599SIlya Dryomov spec->snap_id = CEPH_NOSNAP; 523804077599SIlya Dryomov } 523904077599SIlya Dryomov 524004077599SIlya Dryomov return 0; 524104077599SIlya Dryomov } 524204077599SIlya Dryomov 524304077599SIlya Dryomov /* 524404077599SIlya Dryomov * A parent image will have all ids but none of the names. 524504077599SIlya Dryomov * 524604077599SIlya Dryomov * All names in an rbd spec are dynamically allocated. It's OK if we 524704077599SIlya Dryomov * can't figure out the name for an image id. 
524804077599SIlya Dryomov */ 524904077599SIlya Dryomov static int rbd_spec_fill_names(struct rbd_device *rbd_dev) 52509e15b77dSAlex Elder { 52512e9f7f1cSAlex Elder struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 52522e9f7f1cSAlex Elder struct rbd_spec *spec = rbd_dev->spec; 52532e9f7f1cSAlex Elder const char *pool_name; 52542e9f7f1cSAlex Elder const char *image_name; 52552e9f7f1cSAlex Elder const char *snap_name; 52569e15b77dSAlex Elder int ret; 52579e15b77dSAlex Elder 525804077599SIlya Dryomov rbd_assert(spec->pool_id != CEPH_NOPOOL); 525904077599SIlya Dryomov rbd_assert(spec->image_id); 526004077599SIlya Dryomov rbd_assert(spec->snap_id != CEPH_NOSNAP); 52619e15b77dSAlex Elder 52622e9f7f1cSAlex Elder /* Get the pool name; we have to make our own copy of this */ 52639e15b77dSAlex Elder 52642e9f7f1cSAlex Elder pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, spec->pool_id); 52652e9f7f1cSAlex Elder if (!pool_name) { 52662e9f7f1cSAlex Elder rbd_warn(rbd_dev, "no pool with id %llu", spec->pool_id); 5267935dc89fSAlex Elder return -EIO; 5268935dc89fSAlex Elder } 52692e9f7f1cSAlex Elder pool_name = kstrdup(pool_name, GFP_KERNEL); 52702e9f7f1cSAlex Elder if (!pool_name) 52719e15b77dSAlex Elder return -ENOMEM; 52729e15b77dSAlex Elder 52739e15b77dSAlex Elder /* Fetch the image name; tolerate failure here */ 52749e15b77dSAlex Elder 52752e9f7f1cSAlex Elder image_name = rbd_dev_image_name(rbd_dev); 52762e9f7f1cSAlex Elder if (!image_name) 527706ecc6cbSAlex Elder rbd_warn(rbd_dev, "unable to get image name"); 52789e15b77dSAlex Elder 527904077599SIlya Dryomov /* Fetch the snapshot name */ 52809e15b77dSAlex Elder 52812e9f7f1cSAlex Elder snap_name = rbd_snap_name(rbd_dev, spec->snap_id); 5282da6a6b63SJosh Durgin if (IS_ERR(snap_name)) { 5283da6a6b63SJosh Durgin ret = PTR_ERR(snap_name); 52849e15b77dSAlex Elder goto out_err; 52852e9f7f1cSAlex Elder } 52862e9f7f1cSAlex Elder 52872e9f7f1cSAlex Elder spec->pool_name = pool_name; 52882e9f7f1cSAlex Elder 
spec->image_name = image_name; 52892e9f7f1cSAlex Elder spec->snap_name = snap_name; 52909e15b77dSAlex Elder 52919e15b77dSAlex Elder return 0; 529204077599SIlya Dryomov 52939e15b77dSAlex Elder out_err: 52942e9f7f1cSAlex Elder kfree(image_name); 52952e9f7f1cSAlex Elder kfree(pool_name); 52969e15b77dSAlex Elder return ret; 52979e15b77dSAlex Elder } 52989e15b77dSAlex Elder 5299cc4a38bdSAlex Elder static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev) 530035d489f9SAlex Elder { 530135d489f9SAlex Elder size_t size; 530235d489f9SAlex Elder int ret; 530335d489f9SAlex Elder void *reply_buf; 530435d489f9SAlex Elder void *p; 530535d489f9SAlex Elder void *end; 530635d489f9SAlex Elder u64 seq; 530735d489f9SAlex Elder u32 snap_count; 530835d489f9SAlex Elder struct ceph_snap_context *snapc; 530935d489f9SAlex Elder u32 i; 531035d489f9SAlex Elder 531135d489f9SAlex Elder /* 531235d489f9SAlex Elder * We'll need room for the seq value (maximum snapshot id), 531335d489f9SAlex Elder * snapshot count, and array of that many snapshot ids. 531435d489f9SAlex Elder * For now we have a fixed upper limit on the number we're 531535d489f9SAlex Elder * prepared to receive. 
531635d489f9SAlex Elder */ 531735d489f9SAlex Elder size = sizeof (__le64) + sizeof (__le32) + 531835d489f9SAlex Elder RBD_MAX_SNAP_COUNT * sizeof (__le64); 531935d489f9SAlex Elder reply_buf = kzalloc(size, GFP_KERNEL); 532035d489f9SAlex Elder if (!reply_buf) 532135d489f9SAlex Elder return -ENOMEM; 532235d489f9SAlex Elder 5323ecd4a68aSIlya Dryomov ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid, 5324ecd4a68aSIlya Dryomov &rbd_dev->header_oloc, "get_snapcontext", 5325ecd4a68aSIlya Dryomov NULL, 0, reply_buf, size); 532636be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 532735d489f9SAlex Elder if (ret < 0) 532835d489f9SAlex Elder goto out; 532935d489f9SAlex Elder 533035d489f9SAlex Elder p = reply_buf; 533157385b51SAlex Elder end = reply_buf + ret; 533257385b51SAlex Elder ret = -ERANGE; 533335d489f9SAlex Elder ceph_decode_64_safe(&p, end, seq, out); 533435d489f9SAlex Elder ceph_decode_32_safe(&p, end, snap_count, out); 533535d489f9SAlex Elder 533635d489f9SAlex Elder /* 533735d489f9SAlex Elder * Make sure the reported number of snapshot ids wouldn't go 533835d489f9SAlex Elder * beyond the end of our buffer. But before checking that, 533935d489f9SAlex Elder * make sure the computed size of the snapshot context we 534035d489f9SAlex Elder * allocate is representable in a size_t. 
534135d489f9SAlex Elder */ 534235d489f9SAlex Elder if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context)) 534335d489f9SAlex Elder / sizeof (u64)) { 534435d489f9SAlex Elder ret = -EINVAL; 534535d489f9SAlex Elder goto out; 534635d489f9SAlex Elder } 534735d489f9SAlex Elder if (!ceph_has_room(&p, end, snap_count * sizeof (__le64))) 534835d489f9SAlex Elder goto out; 5349468521c1SAlex Elder ret = 0; 535035d489f9SAlex Elder 5351812164f8SAlex Elder snapc = ceph_create_snap_context(snap_count, GFP_KERNEL); 535235d489f9SAlex Elder if (!snapc) { 535335d489f9SAlex Elder ret = -ENOMEM; 535435d489f9SAlex Elder goto out; 535535d489f9SAlex Elder } 535635d489f9SAlex Elder snapc->seq = seq; 535735d489f9SAlex Elder for (i = 0; i < snap_count; i++) 535835d489f9SAlex Elder snapc->snaps[i] = ceph_decode_64(&p); 535935d489f9SAlex Elder 536049ece554SAlex Elder ceph_put_snap_context(rbd_dev->header.snapc); 536135d489f9SAlex Elder rbd_dev->header.snapc = snapc; 536235d489f9SAlex Elder 536335d489f9SAlex Elder dout(" snap context seq = %llu, snap_count = %u\n", 536435d489f9SAlex Elder (unsigned long long)seq, (unsigned int)snap_count); 536535d489f9SAlex Elder out: 536635d489f9SAlex Elder kfree(reply_buf); 536735d489f9SAlex Elder 536857385b51SAlex Elder return ret; 536935d489f9SAlex Elder } 537035d489f9SAlex Elder 537154cac61fSAlex Elder static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, 537254cac61fSAlex Elder u64 snap_id) 5373b8b1e2dbSAlex Elder { 5374b8b1e2dbSAlex Elder size_t size; 5375b8b1e2dbSAlex Elder void *reply_buf; 537654cac61fSAlex Elder __le64 snapid; 5377b8b1e2dbSAlex Elder int ret; 5378b8b1e2dbSAlex Elder void *p; 5379b8b1e2dbSAlex Elder void *end; 5380b8b1e2dbSAlex Elder char *snap_name; 5381b8b1e2dbSAlex Elder 5382b8b1e2dbSAlex Elder size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN; 5383b8b1e2dbSAlex Elder reply_buf = kmalloc(size, GFP_KERNEL); 5384b8b1e2dbSAlex Elder if (!reply_buf) 5385b8b1e2dbSAlex Elder return ERR_PTR(-ENOMEM); 5386b8b1e2dbSAlex 
Elder 538754cac61fSAlex Elder snapid = cpu_to_le64(snap_id); 5388ecd4a68aSIlya Dryomov ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid, 5389ecd4a68aSIlya Dryomov &rbd_dev->header_oloc, "get_snapshot_name", 5390ecd4a68aSIlya Dryomov &snapid, sizeof(snapid), reply_buf, size); 539136be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 5392f40eb349SAlex Elder if (ret < 0) { 5393f40eb349SAlex Elder snap_name = ERR_PTR(ret); 5394b8b1e2dbSAlex Elder goto out; 5395f40eb349SAlex Elder } 5396b8b1e2dbSAlex Elder 5397b8b1e2dbSAlex Elder p = reply_buf; 5398f40eb349SAlex Elder end = reply_buf + ret; 5399e5c35534SAlex Elder snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL); 5400f40eb349SAlex Elder if (IS_ERR(snap_name)) 5401b8b1e2dbSAlex Elder goto out; 5402f40eb349SAlex Elder 5403b8b1e2dbSAlex Elder dout(" snap_id 0x%016llx snap_name = %s\n", 540454cac61fSAlex Elder (unsigned long long)snap_id, snap_name); 5405b8b1e2dbSAlex Elder out: 5406b8b1e2dbSAlex Elder kfree(reply_buf); 5407b8b1e2dbSAlex Elder 5408f40eb349SAlex Elder return snap_name; 5409b8b1e2dbSAlex Elder } 5410b8b1e2dbSAlex Elder 54112df3fac7SAlex Elder static int rbd_dev_v2_header_info(struct rbd_device *rbd_dev) 5412117973fbSAlex Elder { 54132df3fac7SAlex Elder bool first_time = rbd_dev->header.object_prefix == NULL; 5414117973fbSAlex Elder int ret; 5415117973fbSAlex Elder 54161617e40cSJosh Durgin ret = rbd_dev_v2_image_size(rbd_dev); 54171617e40cSJosh Durgin if (ret) 5418cfbf6377SAlex Elder return ret; 54191617e40cSJosh Durgin 54202df3fac7SAlex Elder if (first_time) { 54212df3fac7SAlex Elder ret = rbd_dev_v2_header_onetime(rbd_dev); 54222df3fac7SAlex Elder if (ret) 5423cfbf6377SAlex Elder return ret; 54242df3fac7SAlex Elder } 54252df3fac7SAlex Elder 5426cc4a38bdSAlex Elder ret = rbd_dev_v2_snap_context(rbd_dev); 5427d194cd1dSIlya Dryomov if (ret && first_time) { 5428d194cd1dSIlya Dryomov kfree(rbd_dev->header.object_prefix); 5429d194cd1dSIlya Dryomov 
rbd_dev->header.object_prefix = NULL; 5430d194cd1dSIlya Dryomov } 5431117973fbSAlex Elder 5432117973fbSAlex Elder return ret; 5433117973fbSAlex Elder } 5434117973fbSAlex Elder 5435a720ae09SIlya Dryomov static int rbd_dev_header_info(struct rbd_device *rbd_dev) 5436a720ae09SIlya Dryomov { 5437a720ae09SIlya Dryomov rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); 5438a720ae09SIlya Dryomov 5439a720ae09SIlya Dryomov if (rbd_dev->image_format == 1) 5440a720ae09SIlya Dryomov return rbd_dev_v1_header_info(rbd_dev); 5441a720ae09SIlya Dryomov 5442a720ae09SIlya Dryomov return rbd_dev_v2_header_info(rbd_dev); 5443a720ae09SIlya Dryomov } 5444a720ae09SIlya Dryomov 54451ddbe94eSAlex Elder /* 5446e28fff26SAlex Elder * Skips over white space at *buf, and updates *buf to point to the 5447e28fff26SAlex Elder * first found non-space character (if any). Returns the length of 5448593a9e7bSAlex Elder * the token (string of non-white space characters) found. Note 5449593a9e7bSAlex Elder * that *buf must be terminated with '\0'. 5450e28fff26SAlex Elder */ 5451e28fff26SAlex Elder static inline size_t next_token(const char **buf) 5452e28fff26SAlex Elder { 5453e28fff26SAlex Elder /* 5454e28fff26SAlex Elder * These are the characters that produce nonzero for 5455e28fff26SAlex Elder * isspace() in the "C" and "POSIX" locales. 5456e28fff26SAlex Elder */ 5457e28fff26SAlex Elder const char *spaces = " \f\n\r\t\v"; 5458e28fff26SAlex Elder 5459e28fff26SAlex Elder *buf += strspn(*buf, spaces); /* Find start of token */ 5460e28fff26SAlex Elder 5461e28fff26SAlex Elder return strcspn(*buf, spaces); /* Return token length */ 5462e28fff26SAlex Elder } 5463e28fff26SAlex Elder 5464e28fff26SAlex Elder /* 5465ea3352f4SAlex Elder * Finds the next token in *buf, dynamically allocates a buffer big 5466ea3352f4SAlex Elder * enough to hold a copy of it, and copies the token into the new 5467ea3352f4SAlex Elder * buffer. The copy is guaranteed to be terminated with '\0'. 
Note 5468ea3352f4SAlex Elder * that a duplicate buffer is created even for a zero-length token. 5469ea3352f4SAlex Elder * 5470ea3352f4SAlex Elder * Returns a pointer to the newly-allocated duplicate, or a null 5471ea3352f4SAlex Elder * pointer if memory for the duplicate was not available. If 5472ea3352f4SAlex Elder * the lenp argument is a non-null pointer, the length of the token 5473ea3352f4SAlex Elder * (not including the '\0') is returned in *lenp. 5474ea3352f4SAlex Elder * 5475ea3352f4SAlex Elder * If successful, the *buf pointer will be updated to point beyond 5476ea3352f4SAlex Elder * the end of the found token. 5477ea3352f4SAlex Elder * 5478ea3352f4SAlex Elder * Note: uses GFP_KERNEL for allocation. 5479ea3352f4SAlex Elder */ 5480ea3352f4SAlex Elder static inline char *dup_token(const char **buf, size_t *lenp) 5481ea3352f4SAlex Elder { 5482ea3352f4SAlex Elder char *dup; 5483ea3352f4SAlex Elder size_t len; 5484ea3352f4SAlex Elder 5485ea3352f4SAlex Elder len = next_token(buf); 54864caf35f9SAlex Elder dup = kmemdup(*buf, len + 1, GFP_KERNEL); 5487ea3352f4SAlex Elder if (!dup) 5488ea3352f4SAlex Elder return NULL; 5489ea3352f4SAlex Elder *(dup + len) = '\0'; 5490ea3352f4SAlex Elder *buf += len; 5491ea3352f4SAlex Elder 5492ea3352f4SAlex Elder if (lenp) 5493ea3352f4SAlex Elder *lenp = len; 5494ea3352f4SAlex Elder 5495ea3352f4SAlex Elder return dup; 5496ea3352f4SAlex Elder } 5497ea3352f4SAlex Elder 5498ea3352f4SAlex Elder /* 5499859c31dfSAlex Elder * Parse the options provided for an "rbd add" (i.e., rbd image 5500859c31dfSAlex Elder * mapping) request. These arrive via a write to /sys/bus/rbd/add, 5501859c31dfSAlex Elder * and the data written is passed here via a NUL-terminated buffer. 5502859c31dfSAlex Elder * Returns 0 if successful or an error code otherwise. 
 *
 * The information extracted from these options is recorded in
 * the other parameters which return dynamically-allocated
 * structures:
 *  ceph_opts
 *      The address of a pointer that will refer to a ceph options
 *      structure.  Caller must release the returned pointer using
 *      ceph_destroy_options() when it is no longer needed.
 *  opts
 *      Address of an rbd options pointer.  Fully initialized by
 *      this function; caller must release with kfree().
 *  rbd_spec
 *      Address of an rbd image specification pointer.  Fully
 *      initialized by this function based on parsed options.
 *      Caller must release with rbd_spec_put().
 *
 * The options passed take this form:
 *  <mon_addrs> <options> <pool_name> <image_name> [<snap_name>]
 * where:
 *  <mon_addrs>
 *      A comma-separated list of one or more monitor addresses.
 *      A monitor address is an ip address, optionally followed
 *      by a port number (separated by a colon).
 *        I.e.:  ip1[:port1][,ip2[:port2]...]
 *  <options>
 *      A comma-separated list of ceph and/or rbd options.
 *  <pool_name>
 *      The name of the rados pool containing the rbd image.
 *  <image_name>
 *      The name of the image in that pool to map.
 *  <snap_name>
 *      An optional snapshot name.
If provided, the mapping will 5535859c31dfSAlex Elder * present data from the image at the time that snapshot was 5536859c31dfSAlex Elder * created. The image head is used if no snapshot id is 5537859c31dfSAlex Elder * provided. Snapshot mappings are always read-only. 5538a725f65eSAlex Elder */ 5539859c31dfSAlex Elder static int rbd_add_parse_args(const char *buf, 5540dc79b113SAlex Elder struct ceph_options **ceph_opts, 5541859c31dfSAlex Elder struct rbd_options **opts, 5542859c31dfSAlex Elder struct rbd_spec **rbd_spec) 5543a725f65eSAlex Elder { 5544e28fff26SAlex Elder size_t len; 5545859c31dfSAlex Elder char *options; 55460ddebc0cSAlex Elder const char *mon_addrs; 5547ecb4dc22SAlex Elder char *snap_name; 55480ddebc0cSAlex Elder size_t mon_addrs_size; 5549859c31dfSAlex Elder struct rbd_spec *spec = NULL; 55504e9afebaSAlex Elder struct rbd_options *rbd_opts = NULL; 5551859c31dfSAlex Elder struct ceph_options *copts; 5552dc79b113SAlex Elder int ret; 5553e28fff26SAlex Elder 5554e28fff26SAlex Elder /* The first four tokens are required */ 5555e28fff26SAlex Elder 55567ef3214aSAlex Elder len = next_token(&buf); 55574fb5d671SAlex Elder if (!len) { 55584fb5d671SAlex Elder rbd_warn(NULL, "no monitor address(es) provided"); 55594fb5d671SAlex Elder return -EINVAL; 55604fb5d671SAlex Elder } 55610ddebc0cSAlex Elder mon_addrs = buf; 5562f28e565aSAlex Elder mon_addrs_size = len + 1; 55637ef3214aSAlex Elder buf += len; 5564a725f65eSAlex Elder 5565dc79b113SAlex Elder ret = -EINVAL; 5566f28e565aSAlex Elder options = dup_token(&buf, NULL); 5567f28e565aSAlex Elder if (!options) 5568dc79b113SAlex Elder return -ENOMEM; 55694fb5d671SAlex Elder if (!*options) { 55704fb5d671SAlex Elder rbd_warn(NULL, "no options provided"); 55714fb5d671SAlex Elder goto out_err; 55724fb5d671SAlex Elder } 5573a725f65eSAlex Elder 5574859c31dfSAlex Elder spec = rbd_spec_alloc(); 5575859c31dfSAlex Elder if (!spec) 5576f28e565aSAlex Elder goto out_mem; 5577859c31dfSAlex Elder 5578859c31dfSAlex Elder 
spec->pool_name = dup_token(&buf, NULL); 5579859c31dfSAlex Elder if (!spec->pool_name) 5580859c31dfSAlex Elder goto out_mem; 55814fb5d671SAlex Elder if (!*spec->pool_name) { 55824fb5d671SAlex Elder rbd_warn(NULL, "no pool name provided"); 55834fb5d671SAlex Elder goto out_err; 55844fb5d671SAlex Elder } 5585e28fff26SAlex Elder 558669e7a02fSAlex Elder spec->image_name = dup_token(&buf, NULL); 5587859c31dfSAlex Elder if (!spec->image_name) 5588f28e565aSAlex Elder goto out_mem; 55894fb5d671SAlex Elder if (!*spec->image_name) { 55904fb5d671SAlex Elder rbd_warn(NULL, "no image name provided"); 55914fb5d671SAlex Elder goto out_err; 55924fb5d671SAlex Elder } 5593e28fff26SAlex Elder 5594f28e565aSAlex Elder /* 5595f28e565aSAlex Elder * Snapshot name is optional; default is to use "-" 5596f28e565aSAlex Elder * (indicating the head/no snapshot). 5597f28e565aSAlex Elder */ 55983feeb894SAlex Elder len = next_token(&buf); 5599820a5f3eSAlex Elder if (!len) { 56003feeb894SAlex Elder buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */ 56013feeb894SAlex Elder len = sizeof (RBD_SNAP_HEAD_NAME) - 1; 5602f28e565aSAlex Elder } else if (len > RBD_MAX_SNAP_NAME_LEN) { 5603dc79b113SAlex Elder ret = -ENAMETOOLONG; 5604f28e565aSAlex Elder goto out_err; 5605849b4260SAlex Elder } 5606ecb4dc22SAlex Elder snap_name = kmemdup(buf, len + 1, GFP_KERNEL); 5607ecb4dc22SAlex Elder if (!snap_name) 5608f28e565aSAlex Elder goto out_mem; 5609ecb4dc22SAlex Elder *(snap_name + len) = '\0'; 5610ecb4dc22SAlex Elder spec->snap_name = snap_name; 5611e5c35534SAlex Elder 56120ddebc0cSAlex Elder /* Initialize all rbd options to the defaults */ 5613e28fff26SAlex Elder 56144e9afebaSAlex Elder rbd_opts = kzalloc(sizeof (*rbd_opts), GFP_KERNEL); 56154e9afebaSAlex Elder if (!rbd_opts) 56164e9afebaSAlex Elder goto out_mem; 56174e9afebaSAlex Elder 56184e9afebaSAlex Elder rbd_opts->read_only = RBD_READ_ONLY_DEFAULT; 5619b5584180SIlya Dryomov rbd_opts->queue_depth = RBD_QUEUE_DEPTH_DEFAULT; 562080de1912SIlya Dryomov 
rbd_opts->lock_on_read = RBD_LOCK_ON_READ_DEFAULT; 5621d22f76e7SAlex Elder 5622859c31dfSAlex Elder copts = ceph_parse_options(options, mon_addrs, 56230ddebc0cSAlex Elder mon_addrs + mon_addrs_size - 1, 56244e9afebaSAlex Elder parse_rbd_opts_token, rbd_opts); 5625859c31dfSAlex Elder if (IS_ERR(copts)) { 5626859c31dfSAlex Elder ret = PTR_ERR(copts); 5627dc79b113SAlex Elder goto out_err; 5628dc79b113SAlex Elder } 5629859c31dfSAlex Elder kfree(options); 5630859c31dfSAlex Elder 5631859c31dfSAlex Elder *ceph_opts = copts; 56324e9afebaSAlex Elder *opts = rbd_opts; 5633859c31dfSAlex Elder *rbd_spec = spec; 56340ddebc0cSAlex Elder 5635dc79b113SAlex Elder return 0; 5636f28e565aSAlex Elder out_mem: 5637dc79b113SAlex Elder ret = -ENOMEM; 5638d22f76e7SAlex Elder out_err: 5639859c31dfSAlex Elder kfree(rbd_opts); 5640859c31dfSAlex Elder rbd_spec_put(spec); 5641f28e565aSAlex Elder kfree(options); 5642d22f76e7SAlex Elder 5643dc79b113SAlex Elder return ret; 5644a725f65eSAlex Elder } 5645a725f65eSAlex Elder 5646589d30e0SAlex Elder /* 564730ba1f02SIlya Dryomov * Return pool id (>= 0) or a negative error code. 
564830ba1f02SIlya Dryomov */ 564930ba1f02SIlya Dryomov static int rbd_add_get_pool_id(struct rbd_client *rbdc, const char *pool_name) 565030ba1f02SIlya Dryomov { 5651a319bf56SIlya Dryomov struct ceph_options *opts = rbdc->client->options; 565230ba1f02SIlya Dryomov u64 newest_epoch; 565330ba1f02SIlya Dryomov int tries = 0; 565430ba1f02SIlya Dryomov int ret; 565530ba1f02SIlya Dryomov 565630ba1f02SIlya Dryomov again: 565730ba1f02SIlya Dryomov ret = ceph_pg_poolid_by_name(rbdc->client->osdc.osdmap, pool_name); 565830ba1f02SIlya Dryomov if (ret == -ENOENT && tries++ < 1) { 5659d0b19705SIlya Dryomov ret = ceph_monc_get_version(&rbdc->client->monc, "osdmap", 566030ba1f02SIlya Dryomov &newest_epoch); 566130ba1f02SIlya Dryomov if (ret < 0) 566230ba1f02SIlya Dryomov return ret; 566330ba1f02SIlya Dryomov 566430ba1f02SIlya Dryomov if (rbdc->client->osdc.osdmap->epoch < newest_epoch) { 56657cca78c9SIlya Dryomov ceph_osdc_maybe_request_map(&rbdc->client->osdc); 566630ba1f02SIlya Dryomov (void) ceph_monc_wait_osdmap(&rbdc->client->monc, 5667a319bf56SIlya Dryomov newest_epoch, 5668a319bf56SIlya Dryomov opts->mount_timeout); 566930ba1f02SIlya Dryomov goto again; 567030ba1f02SIlya Dryomov } else { 567130ba1f02SIlya Dryomov /* the osdmap we have is new enough */ 567230ba1f02SIlya Dryomov return -ENOENT; 567330ba1f02SIlya Dryomov } 567430ba1f02SIlya Dryomov } 567530ba1f02SIlya Dryomov 567630ba1f02SIlya Dryomov return ret; 567730ba1f02SIlya Dryomov } 567830ba1f02SIlya Dryomov 567930ba1f02SIlya Dryomov /* 5680589d30e0SAlex Elder * An rbd format 2 image has a unique identifier, distinct from the 5681589d30e0SAlex Elder * name given to it by the user. Internally, that identifier is 5682589d30e0SAlex Elder * what's used to specify the names of objects related to the image. 5683589d30e0SAlex Elder * 5684589d30e0SAlex Elder * A special "rbd id" object is used to map an rbd image name to its 5685589d30e0SAlex Elder * id. 
If that object doesn't exist, then there is no v2 rbd image 5686589d30e0SAlex Elder * with the supplied name. 5687589d30e0SAlex Elder * 5688589d30e0SAlex Elder * This function will record the given rbd_dev's image_id field if 5689589d30e0SAlex Elder * it can be determined, and in that case will return 0. If any 5690589d30e0SAlex Elder * errors occur a negative errno will be returned and the rbd_dev's 5691589d30e0SAlex Elder * image_id field will be unchanged (and should be NULL). 5692589d30e0SAlex Elder */ 5693589d30e0SAlex Elder static int rbd_dev_image_id(struct rbd_device *rbd_dev) 5694589d30e0SAlex Elder { 5695589d30e0SAlex Elder int ret; 5696589d30e0SAlex Elder size_t size; 5697ecd4a68aSIlya Dryomov CEPH_DEFINE_OID_ONSTACK(oid); 5698589d30e0SAlex Elder void *response; 5699c0fba368SAlex Elder char *image_id; 57002f82ee54SAlex Elder 5701589d30e0SAlex Elder /* 57022c0d0a10SAlex Elder * When probing a parent image, the image id is already 57032c0d0a10SAlex Elder * known (and the image name likely is not). There's no 5704c0fba368SAlex Elder * need to fetch the image id again in this case. We 5705c0fba368SAlex Elder * do still need to set the image format though. 57062c0d0a10SAlex Elder */ 5707c0fba368SAlex Elder if (rbd_dev->spec->image_id) { 5708c0fba368SAlex Elder rbd_dev->image_format = *rbd_dev->spec->image_id ? 2 : 1; 5709c0fba368SAlex Elder 57102c0d0a10SAlex Elder return 0; 5711c0fba368SAlex Elder } 57122c0d0a10SAlex Elder 57132c0d0a10SAlex Elder /* 5714589d30e0SAlex Elder * First, see if the format 2 image id file exists, and if 5715589d30e0SAlex Elder * so, get the image's persistent id from it. 
5716589d30e0SAlex Elder */ 5717ecd4a68aSIlya Dryomov ret = ceph_oid_aprintf(&oid, GFP_KERNEL, "%s%s", RBD_ID_PREFIX, 5718ecd4a68aSIlya Dryomov rbd_dev->spec->image_name); 5719ecd4a68aSIlya Dryomov if (ret) 5720ecd4a68aSIlya Dryomov return ret; 5721ecd4a68aSIlya Dryomov 5722ecd4a68aSIlya Dryomov dout("rbd id object name is %s\n", oid.name); 5723589d30e0SAlex Elder 5724589d30e0SAlex Elder /* Response will be an encoded string, which includes a length */ 5725589d30e0SAlex Elder 5726589d30e0SAlex Elder size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX; 5727589d30e0SAlex Elder response = kzalloc(size, GFP_NOIO); 5728589d30e0SAlex Elder if (!response) { 5729589d30e0SAlex Elder ret = -ENOMEM; 5730589d30e0SAlex Elder goto out; 5731589d30e0SAlex Elder } 5732589d30e0SAlex Elder 5733c0fba368SAlex Elder /* If it doesn't exist we'll assume it's a format 1 image */ 5734c0fba368SAlex Elder 5735ecd4a68aSIlya Dryomov ret = rbd_obj_method_sync(rbd_dev, &oid, &rbd_dev->header_oloc, 5736ecd4a68aSIlya Dryomov "get_id", NULL, 0, 5737e2a58ee5SAlex Elder response, RBD_IMAGE_ID_LEN_MAX); 573836be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 5739c0fba368SAlex Elder if (ret == -ENOENT) { 5740c0fba368SAlex Elder image_id = kstrdup("", GFP_KERNEL); 5741c0fba368SAlex Elder ret = image_id ? 
0 : -ENOMEM; 5742c0fba368SAlex Elder if (!ret) 5743c0fba368SAlex Elder rbd_dev->image_format = 1; 57447dd440c9SIlya Dryomov } else if (ret >= 0) { 5745c0fba368SAlex Elder void *p = response; 5746589d30e0SAlex Elder 5747c0fba368SAlex Elder image_id = ceph_extract_encoded_string(&p, p + ret, 5748979ed480SAlex Elder NULL, GFP_NOIO); 5749461f758aSDuan Jiong ret = PTR_ERR_OR_ZERO(image_id); 5750c0fba368SAlex Elder if (!ret) 5751c0fba368SAlex Elder rbd_dev->image_format = 2; 5752c0fba368SAlex Elder } 5753c0fba368SAlex Elder 5754c0fba368SAlex Elder if (!ret) { 5755c0fba368SAlex Elder rbd_dev->spec->image_id = image_id; 5756c0fba368SAlex Elder dout("image_id is %s\n", image_id); 5757589d30e0SAlex Elder } 5758589d30e0SAlex Elder out: 5759589d30e0SAlex Elder kfree(response); 5760ecd4a68aSIlya Dryomov ceph_oid_destroy(&oid); 5761589d30e0SAlex Elder return ret; 5762589d30e0SAlex Elder } 5763589d30e0SAlex Elder 57643abef3b3SAlex Elder /* 57653abef3b3SAlex Elder * Undo whatever state changes are made by v1 or v2 header info 57663abef3b3SAlex Elder * call. 
57673abef3b3SAlex Elder */ 57686fd48b3bSAlex Elder static void rbd_dev_unprobe(struct rbd_device *rbd_dev) 57696fd48b3bSAlex Elder { 57706fd48b3bSAlex Elder struct rbd_image_header *header; 57716fd48b3bSAlex Elder 5772a2acd00eSAlex Elder rbd_dev_parent_put(rbd_dev); 57736fd48b3bSAlex Elder 57746fd48b3bSAlex Elder /* Free dynamic fields from the header, then zero it out */ 57756fd48b3bSAlex Elder 57766fd48b3bSAlex Elder header = &rbd_dev->header; 5777812164f8SAlex Elder ceph_put_snap_context(header->snapc); 57786fd48b3bSAlex Elder kfree(header->snap_sizes); 57796fd48b3bSAlex Elder kfree(header->snap_names); 57806fd48b3bSAlex Elder kfree(header->object_prefix); 57816fd48b3bSAlex Elder memset(header, 0, sizeof (*header)); 57826fd48b3bSAlex Elder } 57836fd48b3bSAlex Elder 57842df3fac7SAlex Elder static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev) 5785a30b71b9SAlex Elder { 5786a30b71b9SAlex Elder int ret; 5787a30b71b9SAlex Elder 57881e130199SAlex Elder ret = rbd_dev_v2_object_prefix(rbd_dev); 578957385b51SAlex Elder if (ret) 57901e130199SAlex Elder goto out_err; 5791b1b5402aSAlex Elder 57922df3fac7SAlex Elder /* 57932df3fac7SAlex Elder * Get the and check features for the image. Currently the 57942df3fac7SAlex Elder * features are assumed to never change. 
57952df3fac7SAlex Elder */ 5796b1b5402aSAlex Elder ret = rbd_dev_v2_features(rbd_dev); 579757385b51SAlex Elder if (ret) 5798b1b5402aSAlex Elder goto out_err; 579935d489f9SAlex Elder 5800cc070d59SAlex Elder /* If the image supports fancy striping, get its parameters */ 5801cc070d59SAlex Elder 5802cc070d59SAlex Elder if (rbd_dev->header.features & RBD_FEATURE_STRIPINGV2) { 5803cc070d59SAlex Elder ret = rbd_dev_v2_striping_info(rbd_dev); 5804cc070d59SAlex Elder if (ret < 0) 5805cc070d59SAlex Elder goto out_err; 5806cc070d59SAlex Elder } 5807a30b71b9SAlex Elder 58087e97332eSIlya Dryomov if (rbd_dev->header.features & RBD_FEATURE_DATA_POOL) { 58097e97332eSIlya Dryomov ret = rbd_dev_v2_data_pool(rbd_dev); 58107e97332eSIlya Dryomov if (ret) 58117e97332eSIlya Dryomov goto out_err; 58127e97332eSIlya Dryomov } 58137e97332eSIlya Dryomov 5814263423f8SIlya Dryomov rbd_init_layout(rbd_dev); 581535152979SAlex Elder return 0; 5816263423f8SIlya Dryomov 58179d475de5SAlex Elder out_err: 5818642a2537SAlex Elder rbd_dev->header.features = 0; 58191e130199SAlex Elder kfree(rbd_dev->header.object_prefix); 58201e130199SAlex Elder rbd_dev->header.object_prefix = NULL; 58219d475de5SAlex Elder return ret; 5822a30b71b9SAlex Elder } 5823a30b71b9SAlex Elder 58246d69bb53SIlya Dryomov /* 58256d69bb53SIlya Dryomov * @depth is rbd_dev_image_probe() -> rbd_dev_probe_parent() -> 58266d69bb53SIlya Dryomov * rbd_dev_image_probe() recursion depth, which means it's also the 58276d69bb53SIlya Dryomov * length of the already discovered part of the parent chain. 
58286d69bb53SIlya Dryomov */ 58296d69bb53SIlya Dryomov static int rbd_dev_probe_parent(struct rbd_device *rbd_dev, int depth) 583083a06263SAlex Elder { 58312f82ee54SAlex Elder struct rbd_device *parent = NULL; 5832124afba2SAlex Elder int ret; 5833124afba2SAlex Elder 5834124afba2SAlex Elder if (!rbd_dev->parent_spec) 5835124afba2SAlex Elder return 0; 5836124afba2SAlex Elder 58376d69bb53SIlya Dryomov if (++depth > RBD_MAX_PARENT_CHAIN_LEN) { 58386d69bb53SIlya Dryomov pr_info("parent chain is too long (%d)\n", depth); 58396d69bb53SIlya Dryomov ret = -EINVAL; 58406d69bb53SIlya Dryomov goto out_err; 58416d69bb53SIlya Dryomov } 58426d69bb53SIlya Dryomov 58431643dfa4SIlya Dryomov parent = __rbd_dev_create(rbd_dev->rbd_client, rbd_dev->parent_spec); 58441f2c6651SIlya Dryomov if (!parent) { 5845124afba2SAlex Elder ret = -ENOMEM; 5846124afba2SAlex Elder goto out_err; 58471f2c6651SIlya Dryomov } 58481f2c6651SIlya Dryomov 58491f2c6651SIlya Dryomov /* 58501f2c6651SIlya Dryomov * Images related by parent/child relationships always share 58511f2c6651SIlya Dryomov * rbd_client and spec/parent_spec, so bump their refcounts. 
58521f2c6651SIlya Dryomov */ 58531f2c6651SIlya Dryomov __rbd_get_client(rbd_dev->rbd_client); 58541f2c6651SIlya Dryomov rbd_spec_get(rbd_dev->parent_spec); 5855124afba2SAlex Elder 58566d69bb53SIlya Dryomov ret = rbd_dev_image_probe(parent, depth); 5857124afba2SAlex Elder if (ret < 0) 5858124afba2SAlex Elder goto out_err; 58591f2c6651SIlya Dryomov 5860124afba2SAlex Elder rbd_dev->parent = parent; 5861a2acd00eSAlex Elder atomic_set(&rbd_dev->parent_ref, 1); 5862124afba2SAlex Elder return 0; 5863124afba2SAlex Elder 58641f2c6651SIlya Dryomov out_err: 58651f2c6651SIlya Dryomov rbd_dev_unparent(rbd_dev); 58661f2c6651SIlya Dryomov rbd_dev_destroy(parent); 5867124afba2SAlex Elder return ret; 5868124afba2SAlex Elder } 5869124afba2SAlex Elder 5870811c6688SIlya Dryomov /* 5871811c6688SIlya Dryomov * rbd_dev->header_rwsem must be locked for write and will be unlocked 5872811c6688SIlya Dryomov * upon return. 5873811c6688SIlya Dryomov */ 5874200a6a8bSAlex Elder static int rbd_dev_device_setup(struct rbd_device *rbd_dev) 5875124afba2SAlex Elder { 587683a06263SAlex Elder int ret; 587783a06263SAlex Elder 58789b60e70bSIlya Dryomov /* Record our major and minor device numbers. */ 587983a06263SAlex Elder 58809b60e70bSIlya Dryomov if (!single_major) { 588183a06263SAlex Elder ret = register_blkdev(0, rbd_dev->name); 588283a06263SAlex Elder if (ret < 0) 58831643dfa4SIlya Dryomov goto err_out_unlock; 58849b60e70bSIlya Dryomov 588583a06263SAlex Elder rbd_dev->major = ret; 5886dd82fff1SIlya Dryomov rbd_dev->minor = 0; 58879b60e70bSIlya Dryomov } else { 58889b60e70bSIlya Dryomov rbd_dev->major = rbd_major; 58899b60e70bSIlya Dryomov rbd_dev->minor = rbd_dev_id_to_minor(rbd_dev->dev_id); 58909b60e70bSIlya Dryomov } 589183a06263SAlex Elder 589283a06263SAlex Elder /* Set up the blkdev mapping. 
*/ 589383a06263SAlex Elder 589483a06263SAlex Elder ret = rbd_init_disk(rbd_dev); 589583a06263SAlex Elder if (ret) 589683a06263SAlex Elder goto err_out_blkdev; 589783a06263SAlex Elder 5898f35a4deeSAlex Elder ret = rbd_dev_mapping_set(rbd_dev); 589983a06263SAlex Elder if (ret) 590083a06263SAlex Elder goto err_out_disk; 5901bc1ecc65SIlya Dryomov 5902f35a4deeSAlex Elder set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE); 590322001f61SJosh Durgin set_disk_ro(rbd_dev->disk, rbd_dev->mapping.read_only); 5904f35a4deeSAlex Elder 5905dd5ac32dSIlya Dryomov dev_set_name(&rbd_dev->dev, "%d", rbd_dev->dev_id); 5906dd5ac32dSIlya Dryomov ret = device_add(&rbd_dev->dev); 5907f35a4deeSAlex Elder if (ret) 5908f5ee37bdSIlya Dryomov goto err_out_mapping; 590983a06263SAlex Elder 591083a06263SAlex Elder /* Everything's ready. Announce the disk to the world. */ 591183a06263SAlex Elder 5912129b79d4SAlex Elder set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags); 5913811c6688SIlya Dryomov up_write(&rbd_dev->header_rwsem); 591483a06263SAlex Elder 59151643dfa4SIlya Dryomov spin_lock(&rbd_dev_list_lock); 59161643dfa4SIlya Dryomov list_add_tail(&rbd_dev->node, &rbd_dev_list); 59171643dfa4SIlya Dryomov spin_unlock(&rbd_dev_list_lock); 59181643dfa4SIlya Dryomov 5919811c6688SIlya Dryomov add_disk(rbd_dev->disk); 5920ca7909e8SIlya Dryomov pr_info("%s: capacity %llu features 0x%llx\n", rbd_dev->disk->disk_name, 5921ca7909e8SIlya Dryomov (unsigned long long)get_capacity(rbd_dev->disk) << SECTOR_SHIFT, 5922ca7909e8SIlya Dryomov rbd_dev->header.features); 592383a06263SAlex Elder 592483a06263SAlex Elder return ret; 59252f82ee54SAlex Elder 5926f35a4deeSAlex Elder err_out_mapping: 5927f35a4deeSAlex Elder rbd_dev_mapping_clear(rbd_dev); 592883a06263SAlex Elder err_out_disk: 592983a06263SAlex Elder rbd_free_disk(rbd_dev); 593083a06263SAlex Elder err_out_blkdev: 59319b60e70bSIlya Dryomov if (!single_major) 593283a06263SAlex Elder unregister_blkdev(rbd_dev->major, rbd_dev->name); 5933811c6688SIlya 
Dryomov err_out_unlock: 5934811c6688SIlya Dryomov up_write(&rbd_dev->header_rwsem); 593583a06263SAlex Elder return ret; 593683a06263SAlex Elder } 593783a06263SAlex Elder 5938332bb12dSAlex Elder static int rbd_dev_header_name(struct rbd_device *rbd_dev) 5939332bb12dSAlex Elder { 5940332bb12dSAlex Elder struct rbd_spec *spec = rbd_dev->spec; 5941c41d13a3SIlya Dryomov int ret; 5942332bb12dSAlex Elder 5943332bb12dSAlex Elder /* Record the header object name for this rbd image. */ 5944332bb12dSAlex Elder 5945332bb12dSAlex Elder rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); 5946332bb12dSAlex Elder if (rbd_dev->image_format == 1) 5947c41d13a3SIlya Dryomov ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s", 5948332bb12dSAlex Elder spec->image_name, RBD_SUFFIX); 5949332bb12dSAlex Elder else 5950c41d13a3SIlya Dryomov ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s", 5951332bb12dSAlex Elder RBD_HEADER_PREFIX, spec->image_id); 5952c41d13a3SIlya Dryomov 5953c41d13a3SIlya Dryomov return ret; 5954332bb12dSAlex Elder } 5955332bb12dSAlex Elder 5956200a6a8bSAlex Elder static void rbd_dev_image_release(struct rbd_device *rbd_dev) 5957200a6a8bSAlex Elder { 59586fd48b3bSAlex Elder rbd_dev_unprobe(rbd_dev); 59596fd48b3bSAlex Elder rbd_dev->image_format = 0; 59606fd48b3bSAlex Elder kfree(rbd_dev->spec->image_id); 59616fd48b3bSAlex Elder rbd_dev->spec->image_id = NULL; 59626fd48b3bSAlex Elder 5963200a6a8bSAlex Elder rbd_dev_destroy(rbd_dev); 5964200a6a8bSAlex Elder } 5965200a6a8bSAlex Elder 5966a30b71b9SAlex Elder /* 5967a30b71b9SAlex Elder * Probe for the existence of the header object for the given rbd 59681f3ef788SAlex Elder * device. If this image is the one being mapped (i.e., not a 59691f3ef788SAlex Elder * parent), initiate a watch on its header object before using that 59701f3ef788SAlex Elder * object to get detailed information about the rbd image. 
5971a30b71b9SAlex Elder */ 59726d69bb53SIlya Dryomov static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth) 5973a30b71b9SAlex Elder { 5974a30b71b9SAlex Elder int ret; 5975a30b71b9SAlex Elder 5976a30b71b9SAlex Elder /* 59773abef3b3SAlex Elder * Get the id from the image id object. Unless there's an 59783abef3b3SAlex Elder * error, rbd_dev->spec->image_id will be filled in with 59793abef3b3SAlex Elder * a dynamically-allocated string, and rbd_dev->image_format 59803abef3b3SAlex Elder * will be set to either 1 or 2. 5981a30b71b9SAlex Elder */ 5982a30b71b9SAlex Elder ret = rbd_dev_image_id(rbd_dev); 5983a30b71b9SAlex Elder if (ret) 5984c0fba368SAlex Elder return ret; 5985c0fba368SAlex Elder 5986332bb12dSAlex Elder ret = rbd_dev_header_name(rbd_dev); 5987332bb12dSAlex Elder if (ret) 5988332bb12dSAlex Elder goto err_out_format; 5989332bb12dSAlex Elder 59906d69bb53SIlya Dryomov if (!depth) { 599199d16943SIlya Dryomov ret = rbd_register_watch(rbd_dev); 59921fe48023SIlya Dryomov if (ret) { 59931fe48023SIlya Dryomov if (ret == -ENOENT) 59941fe48023SIlya Dryomov pr_info("image %s/%s does not exist\n", 59951fe48023SIlya Dryomov rbd_dev->spec->pool_name, 59961fe48023SIlya Dryomov rbd_dev->spec->image_name); 5997c41d13a3SIlya Dryomov goto err_out_format; 59981f3ef788SAlex Elder } 59991fe48023SIlya Dryomov } 6000b644de2bSAlex Elder 6001a720ae09SIlya Dryomov ret = rbd_dev_header_info(rbd_dev); 60025655c4d9SAlex Elder if (ret) 6003b644de2bSAlex Elder goto err_out_watch; 6004a30b71b9SAlex Elder 600504077599SIlya Dryomov /* 600604077599SIlya Dryomov * If this image is the one being mapped, we have pool name and 600704077599SIlya Dryomov * id, image name and id, and snap name - need to fill snap id. 600804077599SIlya Dryomov * Otherwise this is a parent image, identified by pool, image 600904077599SIlya Dryomov * and snap ids - need to fill in names for those ids. 
601004077599SIlya Dryomov */ 60116d69bb53SIlya Dryomov if (!depth) 601204077599SIlya Dryomov ret = rbd_spec_fill_snap_id(rbd_dev); 601304077599SIlya Dryomov else 601404077599SIlya Dryomov ret = rbd_spec_fill_names(rbd_dev); 60151fe48023SIlya Dryomov if (ret) { 60161fe48023SIlya Dryomov if (ret == -ENOENT) 60171fe48023SIlya Dryomov pr_info("snap %s/%s@%s does not exist\n", 60181fe48023SIlya Dryomov rbd_dev->spec->pool_name, 60191fe48023SIlya Dryomov rbd_dev->spec->image_name, 60201fe48023SIlya Dryomov rbd_dev->spec->snap_name); 602133dca39fSAlex Elder goto err_out_probe; 60221fe48023SIlya Dryomov } 60239bb81c9bSAlex Elder 6024e8f59b59SIlya Dryomov if (rbd_dev->header.features & RBD_FEATURE_LAYERING) { 6025e8f59b59SIlya Dryomov ret = rbd_dev_v2_parent_info(rbd_dev); 6026e8f59b59SIlya Dryomov if (ret) 6027e8f59b59SIlya Dryomov goto err_out_probe; 6028e8f59b59SIlya Dryomov 6029e8f59b59SIlya Dryomov /* 6030e8f59b59SIlya Dryomov * Need to warn users if this image is the one being 6031e8f59b59SIlya Dryomov * mapped and has a parent. 
6032e8f59b59SIlya Dryomov */ 60336d69bb53SIlya Dryomov if (!depth && rbd_dev->parent_spec) 6034e8f59b59SIlya Dryomov rbd_warn(rbd_dev, 6035e8f59b59SIlya Dryomov "WARNING: kernel layering is EXPERIMENTAL!"); 6036e8f59b59SIlya Dryomov } 6037e8f59b59SIlya Dryomov 60386d69bb53SIlya Dryomov ret = rbd_dev_probe_parent(rbd_dev, depth); 603930d60ba2SAlex Elder if (ret) 604030d60ba2SAlex Elder goto err_out_probe; 604183a06263SAlex Elder 604230d60ba2SAlex Elder dout("discovered format %u image, header name is %s\n", 6043c41d13a3SIlya Dryomov rbd_dev->image_format, rbd_dev->header_oid.name); 604430d60ba2SAlex Elder return 0; 6045e8f59b59SIlya Dryomov 60466fd48b3bSAlex Elder err_out_probe: 60476fd48b3bSAlex Elder rbd_dev_unprobe(rbd_dev); 6048b644de2bSAlex Elder err_out_watch: 60496d69bb53SIlya Dryomov if (!depth) 605099d16943SIlya Dryomov rbd_unregister_watch(rbd_dev); 6051332bb12dSAlex Elder err_out_format: 6052332bb12dSAlex Elder rbd_dev->image_format = 0; 60535655c4d9SAlex Elder kfree(rbd_dev->spec->image_id); 60545655c4d9SAlex Elder rbd_dev->spec->image_id = NULL; 60555655c4d9SAlex Elder return ret; 605683a06263SAlex Elder } 605783a06263SAlex Elder 60589b60e70bSIlya Dryomov static ssize_t do_rbd_add(struct bus_type *bus, 605959c2be1eSYehuda Sadeh const char *buf, 606059c2be1eSYehuda Sadeh size_t count) 6061602adf40SYehuda Sadeh { 6062cb8627c7SAlex Elder struct rbd_device *rbd_dev = NULL; 6063dc79b113SAlex Elder struct ceph_options *ceph_opts = NULL; 60644e9afebaSAlex Elder struct rbd_options *rbd_opts = NULL; 6065859c31dfSAlex Elder struct rbd_spec *spec = NULL; 60669d3997fdSAlex Elder struct rbd_client *rbdc; 606751344a38SAlex Elder bool read_only; 6068b51c83c2SIlya Dryomov int rc; 6069602adf40SYehuda Sadeh 6070602adf40SYehuda Sadeh if (!try_module_get(THIS_MODULE)) 6071602adf40SYehuda Sadeh return -ENODEV; 6072602adf40SYehuda Sadeh 6073a725f65eSAlex Elder /* parse add command */ 6074859c31dfSAlex Elder rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec); 
6075dc79b113SAlex Elder if (rc < 0) 6076dd5ac32dSIlya Dryomov goto out; 6077a725f65eSAlex Elder 60789d3997fdSAlex Elder rbdc = rbd_get_client(ceph_opts); 60799d3997fdSAlex Elder if (IS_ERR(rbdc)) { 60809d3997fdSAlex Elder rc = PTR_ERR(rbdc); 60810ddebc0cSAlex Elder goto err_out_args; 60829d3997fdSAlex Elder } 6083602adf40SYehuda Sadeh 6084602adf40SYehuda Sadeh /* pick the pool */ 608530ba1f02SIlya Dryomov rc = rbd_add_get_pool_id(rbdc, spec->pool_name); 60861fe48023SIlya Dryomov if (rc < 0) { 60871fe48023SIlya Dryomov if (rc == -ENOENT) 60881fe48023SIlya Dryomov pr_info("pool %s does not exist\n", spec->pool_name); 6089602adf40SYehuda Sadeh goto err_out_client; 60901fe48023SIlya Dryomov } 6091859c31dfSAlex Elder spec->pool_id = (u64)rc; 6092859c31dfSAlex Elder 6093d147543dSIlya Dryomov rbd_dev = rbd_dev_create(rbdc, spec, rbd_opts); 6094b51c83c2SIlya Dryomov if (!rbd_dev) { 6095b51c83c2SIlya Dryomov rc = -ENOMEM; 6096bd4ba655SAlex Elder goto err_out_client; 6097b51c83c2SIlya Dryomov } 6098c53d5893SAlex Elder rbdc = NULL; /* rbd_dev now owns this */ 6099c53d5893SAlex Elder spec = NULL; /* rbd_dev now owns this */ 6100d147543dSIlya Dryomov rbd_opts = NULL; /* rbd_dev now owns this */ 6101602adf40SYehuda Sadeh 61020d6d1e9cSMike Christie rbd_dev->config_info = kstrdup(buf, GFP_KERNEL); 61030d6d1e9cSMike Christie if (!rbd_dev->config_info) { 61040d6d1e9cSMike Christie rc = -ENOMEM; 61050d6d1e9cSMike Christie goto err_out_rbd_dev; 61060d6d1e9cSMike Christie } 61070d6d1e9cSMike Christie 6108811c6688SIlya Dryomov down_write(&rbd_dev->header_rwsem); 61096d69bb53SIlya Dryomov rc = rbd_dev_image_probe(rbd_dev, 0); 61100d6d1e9cSMike Christie if (rc < 0) { 61110d6d1e9cSMike Christie up_write(&rbd_dev->header_rwsem); 6112c53d5893SAlex Elder goto err_out_rbd_dev; 61130d6d1e9cSMike Christie } 611405fd6f6fSAlex Elder 61157ce4eef7SAlex Elder /* If we are mapping a snapshot it must be marked read-only */ 61167ce4eef7SAlex Elder 6117d147543dSIlya Dryomov read_only = 
rbd_dev->opts->read_only; 61187ce4eef7SAlex Elder if (rbd_dev->spec->snap_id != CEPH_NOSNAP) 61197ce4eef7SAlex Elder read_only = true; 61207ce4eef7SAlex Elder rbd_dev->mapping.read_only = read_only; 61217ce4eef7SAlex Elder 6122b536f69aSAlex Elder rc = rbd_dev_device_setup(rbd_dev); 61233abef3b3SAlex Elder if (rc) { 6124e37180c0SIlya Dryomov /* 612599d16943SIlya Dryomov * rbd_unregister_watch() can't be moved into 6126e37180c0SIlya Dryomov * rbd_dev_image_release() without refactoring, see 6127e37180c0SIlya Dryomov * commit 1f3ef78861ac. 6128e37180c0SIlya Dryomov */ 612999d16943SIlya Dryomov rbd_unregister_watch(rbd_dev); 61303abef3b3SAlex Elder rbd_dev_image_release(rbd_dev); 6131dd5ac32dSIlya Dryomov goto out; 61323abef3b3SAlex Elder } 61333abef3b3SAlex Elder 6134dd5ac32dSIlya Dryomov rc = count; 6135dd5ac32dSIlya Dryomov out: 6136dd5ac32dSIlya Dryomov module_put(THIS_MODULE); 6137dd5ac32dSIlya Dryomov return rc; 6138b536f69aSAlex Elder 6139c53d5893SAlex Elder err_out_rbd_dev: 6140c53d5893SAlex Elder rbd_dev_destroy(rbd_dev); 6141bd4ba655SAlex Elder err_out_client: 61429d3997fdSAlex Elder rbd_put_client(rbdc); 61430ddebc0cSAlex Elder err_out_args: 6144859c31dfSAlex Elder rbd_spec_put(spec); 6145d147543dSIlya Dryomov kfree(rbd_opts); 6146dd5ac32dSIlya Dryomov goto out; 6147602adf40SYehuda Sadeh } 6148602adf40SYehuda Sadeh 61499b60e70bSIlya Dryomov static ssize_t rbd_add(struct bus_type *bus, 61509b60e70bSIlya Dryomov const char *buf, 61519b60e70bSIlya Dryomov size_t count) 61529b60e70bSIlya Dryomov { 61539b60e70bSIlya Dryomov if (single_major) 61549b60e70bSIlya Dryomov return -EINVAL; 61559b60e70bSIlya Dryomov 61569b60e70bSIlya Dryomov return do_rbd_add(bus, buf, count); 61579b60e70bSIlya Dryomov } 61589b60e70bSIlya Dryomov 61599b60e70bSIlya Dryomov static ssize_t rbd_add_single_major(struct bus_type *bus, 61609b60e70bSIlya Dryomov const char *buf, 61619b60e70bSIlya Dryomov size_t count) 61629b60e70bSIlya Dryomov { 61639b60e70bSIlya Dryomov return do_rbd_add(bus, 
buf, count); 61649b60e70bSIlya Dryomov } 61659b60e70bSIlya Dryomov 6166dd5ac32dSIlya Dryomov static void rbd_dev_device_release(struct rbd_device *rbd_dev) 6167602adf40SYehuda Sadeh { 6168602adf40SYehuda Sadeh rbd_free_disk(rbd_dev); 61691643dfa4SIlya Dryomov 61701643dfa4SIlya Dryomov spin_lock(&rbd_dev_list_lock); 61711643dfa4SIlya Dryomov list_del_init(&rbd_dev->node); 61721643dfa4SIlya Dryomov spin_unlock(&rbd_dev_list_lock); 61731643dfa4SIlya Dryomov 6174200a6a8bSAlex Elder clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags); 6175dd5ac32dSIlya Dryomov device_del(&rbd_dev->dev); 61766d80b130SAlex Elder rbd_dev_mapping_clear(rbd_dev); 61779b60e70bSIlya Dryomov if (!single_major) 6178602adf40SYehuda Sadeh unregister_blkdev(rbd_dev->major, rbd_dev->name); 6179602adf40SYehuda Sadeh } 6180602adf40SYehuda Sadeh 618105a46afdSAlex Elder static void rbd_dev_remove_parent(struct rbd_device *rbd_dev) 618205a46afdSAlex Elder { 6183ad945fc1SAlex Elder while (rbd_dev->parent) { 618405a46afdSAlex Elder struct rbd_device *first = rbd_dev; 618505a46afdSAlex Elder struct rbd_device *second = first->parent; 618605a46afdSAlex Elder struct rbd_device *third; 618705a46afdSAlex Elder 618805a46afdSAlex Elder /* 618905a46afdSAlex Elder * Follow to the parent with no grandparent and 619005a46afdSAlex Elder * remove it. 
619105a46afdSAlex Elder */ 619205a46afdSAlex Elder while (second && (third = second->parent)) { 619305a46afdSAlex Elder first = second; 619405a46afdSAlex Elder second = third; 619505a46afdSAlex Elder } 6196ad945fc1SAlex Elder rbd_assert(second); 61978ad42cd0SAlex Elder rbd_dev_image_release(second); 6198ad945fc1SAlex Elder first->parent = NULL; 6199ad945fc1SAlex Elder first->parent_overlap = 0; 6200ad945fc1SAlex Elder 6201ad945fc1SAlex Elder rbd_assert(first->parent_spec); 620205a46afdSAlex Elder rbd_spec_put(first->parent_spec); 620305a46afdSAlex Elder first->parent_spec = NULL; 620405a46afdSAlex Elder } 620505a46afdSAlex Elder } 620605a46afdSAlex Elder 62079b60e70bSIlya Dryomov static ssize_t do_rbd_remove(struct bus_type *bus, 6208602adf40SYehuda Sadeh const char *buf, 6209602adf40SYehuda Sadeh size_t count) 6210602adf40SYehuda Sadeh { 6211602adf40SYehuda Sadeh struct rbd_device *rbd_dev = NULL; 6212751cc0e3SAlex Elder struct list_head *tmp; 6213751cc0e3SAlex Elder int dev_id; 62140276dca6SMike Christie char opt_buf[6]; 621582a442d2SAlex Elder bool already = false; 62160276dca6SMike Christie bool force = false; 62170d8189e1SAlex Elder int ret; 6218602adf40SYehuda Sadeh 62190276dca6SMike Christie dev_id = -1; 62200276dca6SMike Christie opt_buf[0] = '\0'; 62210276dca6SMike Christie sscanf(buf, "%d %5s", &dev_id, opt_buf); 62220276dca6SMike Christie if (dev_id < 0) { 62230276dca6SMike Christie pr_err("dev_id out of range\n"); 6224602adf40SYehuda Sadeh return -EINVAL; 62250276dca6SMike Christie } 62260276dca6SMike Christie if (opt_buf[0] != '\0') { 62270276dca6SMike Christie if (!strcmp(opt_buf, "force")) { 62280276dca6SMike Christie force = true; 62290276dca6SMike Christie } else { 62300276dca6SMike Christie pr_err("bad remove option at '%s'\n", opt_buf); 62310276dca6SMike Christie return -EINVAL; 62320276dca6SMike Christie } 62330276dca6SMike Christie } 6234602adf40SYehuda Sadeh 6235602adf40SYehuda Sadeh ret = -ENOENT; 6236751cc0e3SAlex Elder 
spin_lock(&rbd_dev_list_lock); 6237751cc0e3SAlex Elder list_for_each(tmp, &rbd_dev_list) { 6238751cc0e3SAlex Elder rbd_dev = list_entry(tmp, struct rbd_device, node); 6239751cc0e3SAlex Elder if (rbd_dev->dev_id == dev_id) { 6240751cc0e3SAlex Elder ret = 0; 6241751cc0e3SAlex Elder break; 6242602adf40SYehuda Sadeh } 6243751cc0e3SAlex Elder } 6244751cc0e3SAlex Elder if (!ret) { 6245a14ea269SAlex Elder spin_lock_irq(&rbd_dev->lock); 62460276dca6SMike Christie if (rbd_dev->open_count && !force) 624742382b70SAlex Elder ret = -EBUSY; 6248b82d167bSAlex Elder else 624982a442d2SAlex Elder already = test_and_set_bit(RBD_DEV_FLAG_REMOVING, 625082a442d2SAlex Elder &rbd_dev->flags); 6251a14ea269SAlex Elder spin_unlock_irq(&rbd_dev->lock); 6252751cc0e3SAlex Elder } 6253751cc0e3SAlex Elder spin_unlock(&rbd_dev_list_lock); 625482a442d2SAlex Elder if (ret < 0 || already) 62551ba0f1e7SAlex Elder return ret; 6256751cc0e3SAlex Elder 62570276dca6SMike Christie if (force) { 62580276dca6SMike Christie /* 62590276dca6SMike Christie * Prevent new IO from being queued and wait for existing 62600276dca6SMike Christie * IO to complete/fail. 62610276dca6SMike Christie */ 62620276dca6SMike Christie blk_mq_freeze_queue(rbd_dev->disk->queue); 62630276dca6SMike Christie blk_set_queue_dying(rbd_dev->disk->queue); 62640276dca6SMike Christie } 62650276dca6SMike Christie 6266ed95b21aSIlya Dryomov down_write(&rbd_dev->lock_rwsem); 6267ed95b21aSIlya Dryomov if (__rbd_is_lock_owner(rbd_dev)) 6268ed95b21aSIlya Dryomov rbd_unlock(rbd_dev); 6269ed95b21aSIlya Dryomov up_write(&rbd_dev->lock_rwsem); 627099d16943SIlya Dryomov rbd_unregister_watch(rbd_dev); 6271fca27065SIlya Dryomov 62729875201eSJosh Durgin /* 62739875201eSJosh Durgin * Don't free anything from rbd_dev->disk until after all 62749875201eSJosh Durgin * notifies are completely processed. 
Otherwise 62759875201eSJosh Durgin * rbd_bus_del_dev() will race with rbd_watch_cb(), resulting 62769875201eSJosh Durgin * in a potential use after free of rbd_dev->disk or rbd_dev. 62779875201eSJosh Durgin */ 6278dd5ac32dSIlya Dryomov rbd_dev_device_release(rbd_dev); 62798ad42cd0SAlex Elder rbd_dev_image_release(rbd_dev); 6280aafb230eSAlex Elder 62811ba0f1e7SAlex Elder return count; 6282602adf40SYehuda Sadeh } 6283602adf40SYehuda Sadeh 62849b60e70bSIlya Dryomov static ssize_t rbd_remove(struct bus_type *bus, 62859b60e70bSIlya Dryomov const char *buf, 62869b60e70bSIlya Dryomov size_t count) 62879b60e70bSIlya Dryomov { 62889b60e70bSIlya Dryomov if (single_major) 62899b60e70bSIlya Dryomov return -EINVAL; 62909b60e70bSIlya Dryomov 62919b60e70bSIlya Dryomov return do_rbd_remove(bus, buf, count); 62929b60e70bSIlya Dryomov } 62939b60e70bSIlya Dryomov 62949b60e70bSIlya Dryomov static ssize_t rbd_remove_single_major(struct bus_type *bus, 62959b60e70bSIlya Dryomov const char *buf, 62969b60e70bSIlya Dryomov size_t count) 62979b60e70bSIlya Dryomov { 62989b60e70bSIlya Dryomov return do_rbd_remove(bus, buf, count); 62999b60e70bSIlya Dryomov } 63009b60e70bSIlya Dryomov 6301602adf40SYehuda Sadeh /* 6302602adf40SYehuda Sadeh * create control files in sysfs 6303dfc5606dSYehuda Sadeh * /sys/bus/rbd/... 
6304602adf40SYehuda Sadeh */ 6305602adf40SYehuda Sadeh static int rbd_sysfs_init(void) 6306602adf40SYehuda Sadeh { 6307dfc5606dSYehuda Sadeh int ret; 6308602adf40SYehuda Sadeh 6309fed4c143SAlex Elder ret = device_register(&rbd_root_dev); 6310dfc5606dSYehuda Sadeh if (ret < 0) 6311dfc5606dSYehuda Sadeh return ret; 6312602adf40SYehuda Sadeh 6313fed4c143SAlex Elder ret = bus_register(&rbd_bus_type); 6314fed4c143SAlex Elder if (ret < 0) 6315fed4c143SAlex Elder device_unregister(&rbd_root_dev); 6316602adf40SYehuda Sadeh 6317602adf40SYehuda Sadeh return ret; 6318602adf40SYehuda Sadeh } 6319602adf40SYehuda Sadeh 6320602adf40SYehuda Sadeh static void rbd_sysfs_cleanup(void) 6321602adf40SYehuda Sadeh { 6322dfc5606dSYehuda Sadeh bus_unregister(&rbd_bus_type); 6323fed4c143SAlex Elder device_unregister(&rbd_root_dev); 6324602adf40SYehuda Sadeh } 6325602adf40SYehuda Sadeh 63261c2a9dfeSAlex Elder static int rbd_slab_init(void) 63271c2a9dfeSAlex Elder { 63281c2a9dfeSAlex Elder rbd_assert(!rbd_img_request_cache); 632903d94406SGeliang Tang rbd_img_request_cache = KMEM_CACHE(rbd_img_request, 0); 6330868311b1SAlex Elder if (!rbd_img_request_cache) 6331868311b1SAlex Elder return -ENOMEM; 6332868311b1SAlex Elder 6333868311b1SAlex Elder rbd_assert(!rbd_obj_request_cache); 633403d94406SGeliang Tang rbd_obj_request_cache = KMEM_CACHE(rbd_obj_request, 0); 633578c2a44aSAlex Elder if (!rbd_obj_request_cache) 633678c2a44aSAlex Elder goto out_err; 633778c2a44aSAlex Elder 63381c2a9dfeSAlex Elder return 0; 63391c2a9dfeSAlex Elder 63406c696d85SIlya Dryomov out_err: 6341868311b1SAlex Elder kmem_cache_destroy(rbd_img_request_cache); 6342868311b1SAlex Elder rbd_img_request_cache = NULL; 63431c2a9dfeSAlex Elder return -ENOMEM; 63441c2a9dfeSAlex Elder } 63451c2a9dfeSAlex Elder 63461c2a9dfeSAlex Elder static void rbd_slab_exit(void) 63471c2a9dfeSAlex Elder { 6348868311b1SAlex Elder rbd_assert(rbd_obj_request_cache); 6349868311b1SAlex Elder kmem_cache_destroy(rbd_obj_request_cache); 6350868311b1SAlex 
Elder rbd_obj_request_cache = NULL; 6351868311b1SAlex Elder 63521c2a9dfeSAlex Elder rbd_assert(rbd_img_request_cache); 63531c2a9dfeSAlex Elder kmem_cache_destroy(rbd_img_request_cache); 63541c2a9dfeSAlex Elder rbd_img_request_cache = NULL; 63551c2a9dfeSAlex Elder } 63561c2a9dfeSAlex Elder 6357cc344fa1SAlex Elder static int __init rbd_init(void) 6358602adf40SYehuda Sadeh { 6359602adf40SYehuda Sadeh int rc; 6360602adf40SYehuda Sadeh 63611e32d34cSAlex Elder if (!libceph_compatible(NULL)) { 63621e32d34cSAlex Elder rbd_warn(NULL, "libceph incompatibility (quitting)"); 63631e32d34cSAlex Elder return -EINVAL; 63641e32d34cSAlex Elder } 6365e1b4d96dSIlya Dryomov 63661c2a9dfeSAlex Elder rc = rbd_slab_init(); 6367602adf40SYehuda Sadeh if (rc) 6368602adf40SYehuda Sadeh return rc; 6369e1b4d96dSIlya Dryomov 6370f5ee37bdSIlya Dryomov /* 6371f5ee37bdSIlya Dryomov * The number of active work items is limited by the number of 6372f77303bdSIlya Dryomov * rbd devices * queue depth, so leave @max_active at default. 
6373f5ee37bdSIlya Dryomov */ 6374f5ee37bdSIlya Dryomov rbd_wq = alloc_workqueue(RBD_DRV_NAME, WQ_MEM_RECLAIM, 0); 6375f5ee37bdSIlya Dryomov if (!rbd_wq) { 6376f5ee37bdSIlya Dryomov rc = -ENOMEM; 6377f5ee37bdSIlya Dryomov goto err_out_slab; 6378f5ee37bdSIlya Dryomov } 6379f5ee37bdSIlya Dryomov 63809b60e70bSIlya Dryomov if (single_major) { 63819b60e70bSIlya Dryomov rbd_major = register_blkdev(0, RBD_DRV_NAME); 63829b60e70bSIlya Dryomov if (rbd_major < 0) { 63839b60e70bSIlya Dryomov rc = rbd_major; 6384f5ee37bdSIlya Dryomov goto err_out_wq; 63859b60e70bSIlya Dryomov } 63869b60e70bSIlya Dryomov } 63879b60e70bSIlya Dryomov 63881c2a9dfeSAlex Elder rc = rbd_sysfs_init(); 63891c2a9dfeSAlex Elder if (rc) 63909b60e70bSIlya Dryomov goto err_out_blkdev; 63911c2a9dfeSAlex Elder 63929b60e70bSIlya Dryomov if (single_major) 63939b60e70bSIlya Dryomov pr_info("loaded (major %d)\n", rbd_major); 63949b60e70bSIlya Dryomov else 6395e1b4d96dSIlya Dryomov pr_info("loaded\n"); 63969b60e70bSIlya Dryomov 6397e1b4d96dSIlya Dryomov return 0; 6398e1b4d96dSIlya Dryomov 63999b60e70bSIlya Dryomov err_out_blkdev: 64009b60e70bSIlya Dryomov if (single_major) 64019b60e70bSIlya Dryomov unregister_blkdev(rbd_major, RBD_DRV_NAME); 6402f5ee37bdSIlya Dryomov err_out_wq: 6403f5ee37bdSIlya Dryomov destroy_workqueue(rbd_wq); 6404e1b4d96dSIlya Dryomov err_out_slab: 6405e1b4d96dSIlya Dryomov rbd_slab_exit(); 64061c2a9dfeSAlex Elder return rc; 6407602adf40SYehuda Sadeh } 6408602adf40SYehuda Sadeh 6409cc344fa1SAlex Elder static void __exit rbd_exit(void) 6410602adf40SYehuda Sadeh { 6411ffe312cfSIlya Dryomov ida_destroy(&rbd_dev_id_ida); 6412602adf40SYehuda Sadeh rbd_sysfs_cleanup(); 64139b60e70bSIlya Dryomov if (single_major) 64149b60e70bSIlya Dryomov unregister_blkdev(rbd_major, RBD_DRV_NAME); 6415f5ee37bdSIlya Dryomov destroy_workqueue(rbd_wq); 64161c2a9dfeSAlex Elder rbd_slab_exit(); 6417602adf40SYehuda Sadeh } 6418602adf40SYehuda Sadeh 6419602adf40SYehuda Sadeh module_init(rbd_init); 6420602adf40SYehuda Sadeh 
module_exit(rbd_exit); 6421602adf40SYehuda Sadeh 6422d552c619SAlex Elder MODULE_AUTHOR("Alex Elder <elder@inktank.com>"); 6423602adf40SYehuda Sadeh MODULE_AUTHOR("Sage Weil <sage@newdream.net>"); 6424602adf40SYehuda Sadeh MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>"); 6425602adf40SYehuda Sadeh /* following authorship retained from original osdblk.c */ 6426602adf40SYehuda Sadeh MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>"); 6427602adf40SYehuda Sadeh 642890da258bSIlya Dryomov MODULE_DESCRIPTION("RADOS Block Device (RBD) driver"); 6429602adf40SYehuda Sadeh MODULE_LICENSE("GPL"); 6430