1e2a58ee5SAlex Elder 2602adf40SYehuda Sadeh /* 3602adf40SYehuda Sadeh rbd.c -- Export ceph rados objects as a Linux block device 4602adf40SYehuda Sadeh 5602adf40SYehuda Sadeh 6602adf40SYehuda Sadeh based on drivers/block/osdblk.c: 7602adf40SYehuda Sadeh 8602adf40SYehuda Sadeh Copyright 2009 Red Hat, Inc. 9602adf40SYehuda Sadeh 10602adf40SYehuda Sadeh This program is free software; you can redistribute it and/or modify 11602adf40SYehuda Sadeh it under the terms of the GNU General Public License as published by 12602adf40SYehuda Sadeh the Free Software Foundation. 13602adf40SYehuda Sadeh 14602adf40SYehuda Sadeh This program is distributed in the hope that it will be useful, 15602adf40SYehuda Sadeh but WITHOUT ANY WARRANTY; without even the implied warranty of 16602adf40SYehuda Sadeh MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17602adf40SYehuda Sadeh GNU General Public License for more details. 18602adf40SYehuda Sadeh 19602adf40SYehuda Sadeh You should have received a copy of the GNU General Public License 20602adf40SYehuda Sadeh along with this program; see the file COPYING. If not, write to 21602adf40SYehuda Sadeh the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. 
22602adf40SYehuda Sadeh 23602adf40SYehuda Sadeh 24602adf40SYehuda Sadeh 25dfc5606dSYehuda Sadeh For usage instructions, please refer to: 26602adf40SYehuda Sadeh 27dfc5606dSYehuda Sadeh Documentation/ABI/testing/sysfs-bus-rbd 28602adf40SYehuda Sadeh 29602adf40SYehuda Sadeh */ 30602adf40SYehuda Sadeh 31602adf40SYehuda Sadeh #include <linux/ceph/libceph.h> 32602adf40SYehuda Sadeh #include <linux/ceph/osd_client.h> 33602adf40SYehuda Sadeh #include <linux/ceph/mon_client.h> 34ed95b21aSIlya Dryomov #include <linux/ceph/cls_lock_client.h> 35602adf40SYehuda Sadeh #include <linux/ceph/decode.h> 3659c2be1eSYehuda Sadeh #include <linux/parser.h> 3730d1cff8SAlex Elder #include <linux/bsearch.h> 38602adf40SYehuda Sadeh 39602adf40SYehuda Sadeh #include <linux/kernel.h> 40602adf40SYehuda Sadeh #include <linux/device.h> 41602adf40SYehuda Sadeh #include <linux/module.h> 427ad18afaSChristoph Hellwig #include <linux/blk-mq.h> 43602adf40SYehuda Sadeh #include <linux/fs.h> 44602adf40SYehuda Sadeh #include <linux/blkdev.h> 451c2a9dfeSAlex Elder #include <linux/slab.h> 46f8a22fc2SIlya Dryomov #include <linux/idr.h> 47bc1ecc65SIlya Dryomov #include <linux/workqueue.h> 48602adf40SYehuda Sadeh 49602adf40SYehuda Sadeh #include "rbd_types.h" 50602adf40SYehuda Sadeh 51aafb230eSAlex Elder #define RBD_DEBUG /* Activate rbd_assert() calls */ 52aafb230eSAlex Elder 53593a9e7bSAlex Elder /* 54593a9e7bSAlex Elder * The basic unit of block I/O is a sector. It is interpreted in a 55593a9e7bSAlex Elder * number of contexts in Linux (blk, bio, genhd), but the default is 56593a9e7bSAlex Elder * universally 512 bytes. These symbols are just slightly more 57593a9e7bSAlex Elder * meaningful than the bare numbers they represent. 58593a9e7bSAlex Elder */ 59593a9e7bSAlex Elder #define SECTOR_SHIFT 9 60593a9e7bSAlex Elder #define SECTOR_SIZE (1ULL << SECTOR_SHIFT) 61593a9e7bSAlex Elder 62a2acd00eSAlex Elder /* 63a2acd00eSAlex Elder * Increment the given counter and return its updated value. 
64a2acd00eSAlex Elder * If the counter is already 0 it will not be incremented. 65a2acd00eSAlex Elder * If the counter is already at its maximum value returns 66a2acd00eSAlex Elder * -EINVAL without updating it. 67a2acd00eSAlex Elder */ 68a2acd00eSAlex Elder static int atomic_inc_return_safe(atomic_t *v) 69a2acd00eSAlex Elder { 70a2acd00eSAlex Elder unsigned int counter; 71a2acd00eSAlex Elder 72a2acd00eSAlex Elder counter = (unsigned int)__atomic_add_unless(v, 1, 0); 73a2acd00eSAlex Elder if (counter <= (unsigned int)INT_MAX) 74a2acd00eSAlex Elder return (int)counter; 75a2acd00eSAlex Elder 76a2acd00eSAlex Elder atomic_dec(v); 77a2acd00eSAlex Elder 78a2acd00eSAlex Elder return -EINVAL; 79a2acd00eSAlex Elder } 80a2acd00eSAlex Elder 81a2acd00eSAlex Elder /* Decrement the counter. Return the resulting value, or -EINVAL */ 82a2acd00eSAlex Elder static int atomic_dec_return_safe(atomic_t *v) 83a2acd00eSAlex Elder { 84a2acd00eSAlex Elder int counter; 85a2acd00eSAlex Elder 86a2acd00eSAlex Elder counter = atomic_dec_return(v); 87a2acd00eSAlex Elder if (counter >= 0) 88a2acd00eSAlex Elder return counter; 89a2acd00eSAlex Elder 90a2acd00eSAlex Elder atomic_inc(v); 91a2acd00eSAlex Elder 92a2acd00eSAlex Elder return -EINVAL; 93a2acd00eSAlex Elder } 94a2acd00eSAlex Elder 95f0f8cef5SAlex Elder #define RBD_DRV_NAME "rbd" 96602adf40SYehuda Sadeh 977e513d43SIlya Dryomov #define RBD_MINORS_PER_MAJOR 256 987e513d43SIlya Dryomov #define RBD_SINGLE_MAJOR_PART_SHIFT 4 99602adf40SYehuda Sadeh 1006d69bb53SIlya Dryomov #define RBD_MAX_PARENT_CHAIN_LEN 16 1016d69bb53SIlya Dryomov 102d4b125e9SAlex Elder #define RBD_SNAP_DEV_NAME_PREFIX "snap_" 103d4b125e9SAlex Elder #define RBD_MAX_SNAP_NAME_LEN \ 104d4b125e9SAlex Elder (NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1)) 105d4b125e9SAlex Elder 10635d489f9SAlex Elder #define RBD_MAX_SNAP_COUNT 510 /* allows max snapc to fit in 4KB */ 107602adf40SYehuda Sadeh 108602adf40SYehuda Sadeh #define RBD_SNAP_HEAD_NAME "-" 109602adf40SYehuda Sadeh 
1109682fc6dSAlex Elder #define BAD_SNAP_INDEX U32_MAX /* invalid index into snap array */ 1119682fc6dSAlex Elder 1129e15b77dSAlex Elder /* This allows a single page to hold an image name sent by OSD */ 1139e15b77dSAlex Elder #define RBD_IMAGE_NAME_LEN_MAX (PAGE_SIZE - sizeof (__le32) - 1) 114589d30e0SAlex Elder #define RBD_IMAGE_ID_LEN_MAX 64 1159e15b77dSAlex Elder 1161e130199SAlex Elder #define RBD_OBJ_PREFIX_LEN_MAX 64 117589d30e0SAlex Elder 118ed95b21aSIlya Dryomov #define RBD_NOTIFY_TIMEOUT 5 /* seconds */ 11999d16943SIlya Dryomov #define RBD_RETRY_DELAY msecs_to_jiffies(1000) 12099d16943SIlya Dryomov 121d889140cSAlex Elder /* Feature bits */ 122d889140cSAlex Elder 1235cbf6f12SAlex Elder #define RBD_FEATURE_LAYERING (1<<0) 1245cbf6f12SAlex Elder #define RBD_FEATURE_STRIPINGV2 (1<<1) 125ed95b21aSIlya Dryomov #define RBD_FEATURE_EXCLUSIVE_LOCK (1<<2) 126ed95b21aSIlya Dryomov #define RBD_FEATURES_ALL (RBD_FEATURE_LAYERING | \ 127ed95b21aSIlya Dryomov RBD_FEATURE_STRIPINGV2 | \ 128ed95b21aSIlya Dryomov RBD_FEATURE_EXCLUSIVE_LOCK) 129d889140cSAlex Elder 130d889140cSAlex Elder /* Features supported by this (client software) implementation. */ 131d889140cSAlex Elder 132770eba6eSAlex Elder #define RBD_FEATURES_SUPPORTED (RBD_FEATURES_ALL) 133d889140cSAlex Elder 13481a89793SAlex Elder /* 13581a89793SAlex Elder * An RBD device name will be "rbd#", where the "rbd" comes from 13681a89793SAlex Elder * RBD_DRV_NAME above, and # is a unique integer identifier. 
13781a89793SAlex Elder */ 138602adf40SYehuda Sadeh #define DEV_NAME_LEN 32 139602adf40SYehuda Sadeh 140602adf40SYehuda Sadeh /* 141602adf40SYehuda Sadeh * block device image metadata (in-memory version) 142602adf40SYehuda Sadeh */ 143602adf40SYehuda Sadeh struct rbd_image_header { 144f35a4deeSAlex Elder /* These six fields never change for a given rbd image */ 145849b4260SAlex Elder char *object_prefix; 146602adf40SYehuda Sadeh __u8 obj_order; 147602adf40SYehuda Sadeh __u8 crypt_type; 148602adf40SYehuda Sadeh __u8 comp_type; 149f35a4deeSAlex Elder u64 stripe_unit; 150f35a4deeSAlex Elder u64 stripe_count; 151f35a4deeSAlex Elder u64 features; /* Might be changeable someday? */ 152602adf40SYehuda Sadeh 153f84344f3SAlex Elder /* The remaining fields need to be updated occasionally */ 154f84344f3SAlex Elder u64 image_size; 155f84344f3SAlex Elder struct ceph_snap_context *snapc; 156f35a4deeSAlex Elder char *snap_names; /* format 1 only */ 157f35a4deeSAlex Elder u64 *snap_sizes; /* format 1 only */ 15859c2be1eSYehuda Sadeh }; 15959c2be1eSYehuda Sadeh 1600d7dbfceSAlex Elder /* 1610d7dbfceSAlex Elder * An rbd image specification. 1620d7dbfceSAlex Elder * 1630d7dbfceSAlex Elder * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely 164c66c6e0cSAlex Elder * identify an image. Each rbd_dev structure includes a pointer to 165c66c6e0cSAlex Elder * an rbd_spec structure that encapsulates this identity. 166c66c6e0cSAlex Elder * 167c66c6e0cSAlex Elder * Each of the id's in an rbd_spec has an associated name. For a 168c66c6e0cSAlex Elder * user-mapped image, the names are supplied and the id's associated 169c66c6e0cSAlex Elder * with them are looked up. For a layered image, a parent image is 170c66c6e0cSAlex Elder * defined by the tuple, and the names are looked up. 
171c66c6e0cSAlex Elder * 172c66c6e0cSAlex Elder * An rbd_dev structure contains a parent_spec pointer which is 173c66c6e0cSAlex Elder * non-null if the image it represents is a child in a layered 174c66c6e0cSAlex Elder * image. This pointer will refer to the rbd_spec structure used 175c66c6e0cSAlex Elder * by the parent rbd_dev for its own identity (i.e., the structure 176c66c6e0cSAlex Elder * is shared between the parent and child). 177c66c6e0cSAlex Elder * 178c66c6e0cSAlex Elder * Since these structures are populated once, during the discovery 179c66c6e0cSAlex Elder * phase of image construction, they are effectively immutable so 180c66c6e0cSAlex Elder * we make no effort to synchronize access to them. 181c66c6e0cSAlex Elder * 182c66c6e0cSAlex Elder * Note that code herein does not assume the image name is known (it 183c66c6e0cSAlex Elder * could be a null pointer). 1840d7dbfceSAlex Elder */ 1850d7dbfceSAlex Elder struct rbd_spec { 1860d7dbfceSAlex Elder u64 pool_id; 187ecb4dc22SAlex Elder const char *pool_name; 1880d7dbfceSAlex Elder 189ecb4dc22SAlex Elder const char *image_id; 190ecb4dc22SAlex Elder const char *image_name; 1910d7dbfceSAlex Elder 1920d7dbfceSAlex Elder u64 snap_id; 193ecb4dc22SAlex Elder const char *snap_name; 1940d7dbfceSAlex Elder 1950d7dbfceSAlex Elder struct kref kref; 1960d7dbfceSAlex Elder }; 1970d7dbfceSAlex Elder 198602adf40SYehuda Sadeh /* 199f0f8cef5SAlex Elder * an instance of the client. multiple devices may share an rbd client. 200602adf40SYehuda Sadeh */ 201602adf40SYehuda Sadeh struct rbd_client { 202602adf40SYehuda Sadeh struct ceph_client *client; 203602adf40SYehuda Sadeh struct kref kref; 204602adf40SYehuda Sadeh struct list_head node; 205602adf40SYehuda Sadeh }; 206602adf40SYehuda Sadeh 207bf0d5f50SAlex Elder struct rbd_img_request; 208bf0d5f50SAlex Elder typedef void (*rbd_img_callback_t)(struct rbd_img_request *); 209bf0d5f50SAlex Elder 210bf0d5f50SAlex Elder #define BAD_WHICH U32_MAX /* Good which or bad which, which? 
*/ 211bf0d5f50SAlex Elder 212bf0d5f50SAlex Elder struct rbd_obj_request; 213bf0d5f50SAlex Elder typedef void (*rbd_obj_callback_t)(struct rbd_obj_request *); 214bf0d5f50SAlex Elder 2159969ebc5SAlex Elder enum obj_request_type { 2169969ebc5SAlex Elder OBJ_REQUEST_NODATA, OBJ_REQUEST_BIO, OBJ_REQUEST_PAGES 2179969ebc5SAlex Elder }; 218bf0d5f50SAlex Elder 2196d2940c8SGuangliang Zhao enum obj_operation_type { 2206d2940c8SGuangliang Zhao OBJ_OP_WRITE, 2216d2940c8SGuangliang Zhao OBJ_OP_READ, 22290e98c52SGuangliang Zhao OBJ_OP_DISCARD, 2236d2940c8SGuangliang Zhao }; 2246d2940c8SGuangliang Zhao 225926f9b3fSAlex Elder enum obj_req_flags { 226926f9b3fSAlex Elder OBJ_REQ_DONE, /* completion flag: not done = 0, done = 1 */ 2276365d33aSAlex Elder OBJ_REQ_IMG_DATA, /* object usage: standalone = 0, image = 1 */ 2285679c59fSAlex Elder OBJ_REQ_KNOWN, /* EXISTS flag valid: no = 0, yes = 1 */ 2295679c59fSAlex Elder OBJ_REQ_EXISTS, /* target exists: no = 0, yes = 1 */ 230926f9b3fSAlex Elder }; 231926f9b3fSAlex Elder 232bf0d5f50SAlex Elder struct rbd_obj_request { 233bf0d5f50SAlex Elder const char *object_name; 234bf0d5f50SAlex Elder u64 offset; /* object start byte */ 235bf0d5f50SAlex Elder u64 length; /* bytes from offset */ 236926f9b3fSAlex Elder unsigned long flags; 237bf0d5f50SAlex Elder 238c5b5ef6cSAlex Elder /* 239c5b5ef6cSAlex Elder * An object request associated with an image will have its 240c5b5ef6cSAlex Elder * img_data flag set; a standalone object request will not. 241c5b5ef6cSAlex Elder * 242c5b5ef6cSAlex Elder * A standalone object request will have which == BAD_WHICH 243c5b5ef6cSAlex Elder * and a null obj_request pointer. 244c5b5ef6cSAlex Elder * 245c5b5ef6cSAlex Elder * An object request initiated in support of a layered image 246c5b5ef6cSAlex Elder * object (to check for its existence before a write) will 247c5b5ef6cSAlex Elder * have which == BAD_WHICH and a non-null obj_request pointer. 
248c5b5ef6cSAlex Elder * 249c5b5ef6cSAlex Elder * Finally, an object request for rbd image data will have 250c5b5ef6cSAlex Elder * which != BAD_WHICH, and will have a non-null img_request 251c5b5ef6cSAlex Elder * pointer. The value of which will be in the range 252c5b5ef6cSAlex Elder * 0..(img_request->obj_request_count-1). 253c5b5ef6cSAlex Elder */ 254c5b5ef6cSAlex Elder union { 255c5b5ef6cSAlex Elder struct rbd_obj_request *obj_request; /* STAT op */ 256c5b5ef6cSAlex Elder struct { 257bf0d5f50SAlex Elder struct rbd_img_request *img_request; 258c5b5ef6cSAlex Elder u64 img_offset; 259c5b5ef6cSAlex Elder /* links for img_request->obj_requests list */ 260c5b5ef6cSAlex Elder struct list_head links; 261c5b5ef6cSAlex Elder }; 262c5b5ef6cSAlex Elder }; 263bf0d5f50SAlex Elder u32 which; /* posn image request list */ 264bf0d5f50SAlex Elder 265bf0d5f50SAlex Elder enum obj_request_type type; 266788e2df3SAlex Elder union { 267bf0d5f50SAlex Elder struct bio *bio_list; 268788e2df3SAlex Elder struct { 269788e2df3SAlex Elder struct page **pages; 270788e2df3SAlex Elder u32 page_count; 271788e2df3SAlex Elder }; 272788e2df3SAlex Elder }; 2730eefd470SAlex Elder struct page **copyup_pages; 274ebda6408SAlex Elder u32 copyup_page_count; 275bf0d5f50SAlex Elder 276bf0d5f50SAlex Elder struct ceph_osd_request *osd_req; 277bf0d5f50SAlex Elder 278bf0d5f50SAlex Elder u64 xferred; /* bytes transferred */ 2791b83bef2SSage Weil int result; 280bf0d5f50SAlex Elder 281bf0d5f50SAlex Elder rbd_obj_callback_t callback; 282788e2df3SAlex Elder struct completion completion; 283bf0d5f50SAlex Elder 284bf0d5f50SAlex Elder struct kref kref; 285bf0d5f50SAlex Elder }; 286bf0d5f50SAlex Elder 2870c425248SAlex Elder enum img_req_flags { 2889849e986SAlex Elder IMG_REQ_WRITE, /* I/O direction: read = 0, write = 1 */ 2899849e986SAlex Elder IMG_REQ_CHILD, /* initiator: block = 0, child image = 1 */ 290d0b2e944SAlex Elder IMG_REQ_LAYERED, /* ENOENT handling: normal = 0, layered = 1 */ 29190e98c52SGuangliang Zhao 
IMG_REQ_DISCARD, /* discard: normal = 0, discard request = 1 */ 2920c425248SAlex Elder }; 2930c425248SAlex Elder 294bf0d5f50SAlex Elder struct rbd_img_request { 295bf0d5f50SAlex Elder struct rbd_device *rbd_dev; 296bf0d5f50SAlex Elder u64 offset; /* starting image byte offset */ 297bf0d5f50SAlex Elder u64 length; /* byte count from offset */ 2980c425248SAlex Elder unsigned long flags; 299bf0d5f50SAlex Elder union { 300bf0d5f50SAlex Elder u64 snap_id; /* for reads */ 3019849e986SAlex Elder struct ceph_snap_context *snapc; /* for writes */ 3029849e986SAlex Elder }; 3039849e986SAlex Elder union { 3049849e986SAlex Elder struct request *rq; /* block request */ 3059849e986SAlex Elder struct rbd_obj_request *obj_request; /* obj req initiator */ 306bf0d5f50SAlex Elder }; 3073d7efd18SAlex Elder struct page **copyup_pages; 308ebda6408SAlex Elder u32 copyup_page_count; 309bf0d5f50SAlex Elder spinlock_t completion_lock;/* protects next_completion */ 310bf0d5f50SAlex Elder u32 next_completion; 311bf0d5f50SAlex Elder rbd_img_callback_t callback; 31255f27e09SAlex Elder u64 xferred;/* aggregate bytes transferred */ 313a5a337d4SAlex Elder int result; /* first nonzero obj_request result */ 314bf0d5f50SAlex Elder 315bf0d5f50SAlex Elder u32 obj_request_count; 316bf0d5f50SAlex Elder struct list_head obj_requests; /* rbd_obj_request structs */ 317bf0d5f50SAlex Elder 318bf0d5f50SAlex Elder struct kref kref; 319bf0d5f50SAlex Elder }; 320bf0d5f50SAlex Elder 321bf0d5f50SAlex Elder #define for_each_obj_request(ireq, oreq) \ 322ef06f4d3SAlex Elder list_for_each_entry(oreq, &(ireq)->obj_requests, links) 323bf0d5f50SAlex Elder #define for_each_obj_request_from(ireq, oreq) \ 324ef06f4d3SAlex Elder list_for_each_entry_from(oreq, &(ireq)->obj_requests, links) 325bf0d5f50SAlex Elder #define for_each_obj_request_safe(ireq, oreq, n) \ 326ef06f4d3SAlex Elder list_for_each_entry_safe_reverse(oreq, n, &(ireq)->obj_requests, links) 327bf0d5f50SAlex Elder 32899d16943SIlya Dryomov enum rbd_watch_state { 
32999d16943SIlya Dryomov RBD_WATCH_STATE_UNREGISTERED, 33099d16943SIlya Dryomov RBD_WATCH_STATE_REGISTERED, 33199d16943SIlya Dryomov RBD_WATCH_STATE_ERROR, 33299d16943SIlya Dryomov }; 33399d16943SIlya Dryomov 334ed95b21aSIlya Dryomov enum rbd_lock_state { 335ed95b21aSIlya Dryomov RBD_LOCK_STATE_UNLOCKED, 336ed95b21aSIlya Dryomov RBD_LOCK_STATE_LOCKED, 337ed95b21aSIlya Dryomov RBD_LOCK_STATE_RELEASING, 338ed95b21aSIlya Dryomov }; 339ed95b21aSIlya Dryomov 340ed95b21aSIlya Dryomov /* WatchNotify::ClientId */ 341ed95b21aSIlya Dryomov struct rbd_client_id { 342ed95b21aSIlya Dryomov u64 gid; 343ed95b21aSIlya Dryomov u64 handle; 344ed95b21aSIlya Dryomov }; 345ed95b21aSIlya Dryomov 346f84344f3SAlex Elder struct rbd_mapping { 34799c1f08fSAlex Elder u64 size; 34834b13184SAlex Elder u64 features; 349f84344f3SAlex Elder bool read_only; 350f84344f3SAlex Elder }; 351f84344f3SAlex Elder 352602adf40SYehuda Sadeh /* 353602adf40SYehuda Sadeh * a single device 354602adf40SYehuda Sadeh */ 355602adf40SYehuda Sadeh struct rbd_device { 356de71a297SAlex Elder int dev_id; /* blkdev unique id */ 357602adf40SYehuda Sadeh 358602adf40SYehuda Sadeh int major; /* blkdev assigned major */ 359dd82fff1SIlya Dryomov int minor; 360602adf40SYehuda Sadeh struct gendisk *disk; /* blkdev's gendisk and rq */ 361602adf40SYehuda Sadeh 362a30b71b9SAlex Elder u32 image_format; /* Either 1 or 2 */ 363602adf40SYehuda Sadeh struct rbd_client *rbd_client; 364602adf40SYehuda Sadeh 365602adf40SYehuda Sadeh char name[DEV_NAME_LEN]; /* blkdev name, e.g. 
rbd3 */ 366602adf40SYehuda Sadeh 367b82d167bSAlex Elder spinlock_t lock; /* queue, flags, open_count */ 368602adf40SYehuda Sadeh 369602adf40SYehuda Sadeh struct rbd_image_header header; 370b82d167bSAlex Elder unsigned long flags; /* possibly lock protected */ 3710d7dbfceSAlex Elder struct rbd_spec *spec; 372d147543dSIlya Dryomov struct rbd_options *opts; 3730d6d1e9cSMike Christie char *config_info; /* add{,_single_major} string */ 374602adf40SYehuda Sadeh 375c41d13a3SIlya Dryomov struct ceph_object_id header_oid; 376922dab61SIlya Dryomov struct ceph_object_locator header_oloc; 377971f839aSAlex Elder 3781643dfa4SIlya Dryomov struct ceph_file_layout layout; /* used for all rbd requests */ 3790903e875SAlex Elder 38099d16943SIlya Dryomov struct mutex watch_mutex; 38199d16943SIlya Dryomov enum rbd_watch_state watch_state; 382922dab61SIlya Dryomov struct ceph_osd_linger_request *watch_handle; 38399d16943SIlya Dryomov u64 watch_cookie; 38499d16943SIlya Dryomov struct delayed_work watch_dwork; 38559c2be1eSYehuda Sadeh 386ed95b21aSIlya Dryomov struct rw_semaphore lock_rwsem; 387ed95b21aSIlya Dryomov enum rbd_lock_state lock_state; 388ed95b21aSIlya Dryomov struct rbd_client_id owner_cid; 389ed95b21aSIlya Dryomov struct work_struct acquired_lock_work; 390ed95b21aSIlya Dryomov struct work_struct released_lock_work; 391ed95b21aSIlya Dryomov struct delayed_work lock_dwork; 392ed95b21aSIlya Dryomov struct work_struct unlock_work; 393ed95b21aSIlya Dryomov wait_queue_head_t lock_waitq; 394ed95b21aSIlya Dryomov 3951643dfa4SIlya Dryomov struct workqueue_struct *task_wq; 3961643dfa4SIlya Dryomov 39786b00e0dSAlex Elder struct rbd_spec *parent_spec; 39886b00e0dSAlex Elder u64 parent_overlap; 399a2acd00eSAlex Elder atomic_t parent_ref; 4002f82ee54SAlex Elder struct rbd_device *parent; 40186b00e0dSAlex Elder 4027ad18afaSChristoph Hellwig /* Block layer tags. 
*/ 4037ad18afaSChristoph Hellwig struct blk_mq_tag_set tag_set; 4047ad18afaSChristoph Hellwig 405c666601aSJosh Durgin /* protects updating the header */ 406c666601aSJosh Durgin struct rw_semaphore header_rwsem; 407f84344f3SAlex Elder 408f84344f3SAlex Elder struct rbd_mapping mapping; 409602adf40SYehuda Sadeh 410602adf40SYehuda Sadeh struct list_head node; 411dfc5606dSYehuda Sadeh 412dfc5606dSYehuda Sadeh /* sysfs related */ 413dfc5606dSYehuda Sadeh struct device dev; 414b82d167bSAlex Elder unsigned long open_count; /* protected by lock */ 415dfc5606dSYehuda Sadeh }; 416dfc5606dSYehuda Sadeh 417b82d167bSAlex Elder /* 418b82d167bSAlex Elder * Flag bits for rbd_dev->flags. If atomicity is required, 419b82d167bSAlex Elder * rbd_dev->lock is used to protect access. 420b82d167bSAlex Elder * 421b82d167bSAlex Elder * Currently, only the "removing" flag (which is coupled with the 422b82d167bSAlex Elder * "open_count" field) requires atomic access. 423b82d167bSAlex Elder */ 4246d292906SAlex Elder enum rbd_dev_flags { 4256d292906SAlex Elder RBD_DEV_FLAG_EXISTS, /* mapped snapshot has not been deleted */ 426b82d167bSAlex Elder RBD_DEV_FLAG_REMOVING, /* this mapping is being removed */ 4276d292906SAlex Elder }; 4286d292906SAlex Elder 429cfbf6377SAlex Elder static DEFINE_MUTEX(client_mutex); /* Serialize client creation */ 430e124a82fSAlex Elder 431602adf40SYehuda Sadeh static LIST_HEAD(rbd_dev_list); /* devices */ 432e124a82fSAlex Elder static DEFINE_SPINLOCK(rbd_dev_list_lock); 433e124a82fSAlex Elder 434602adf40SYehuda Sadeh static LIST_HEAD(rbd_client_list); /* clients */ 435432b8587SAlex Elder static DEFINE_SPINLOCK(rbd_client_list_lock); 436602adf40SYehuda Sadeh 43778c2a44aSAlex Elder /* Slab caches for frequently-allocated structures */ 43878c2a44aSAlex Elder 4391c2a9dfeSAlex Elder static struct kmem_cache *rbd_img_request_cache; 440868311b1SAlex Elder static struct kmem_cache *rbd_obj_request_cache; 44178c2a44aSAlex Elder static struct kmem_cache *rbd_segment_name_cache; 
4421c2a9dfeSAlex Elder 4439b60e70bSIlya Dryomov static int rbd_major; 444f8a22fc2SIlya Dryomov static DEFINE_IDA(rbd_dev_id_ida); 445f8a22fc2SIlya Dryomov 446f5ee37bdSIlya Dryomov static struct workqueue_struct *rbd_wq; 447f5ee37bdSIlya Dryomov 4489b60e70bSIlya Dryomov /* 4499b60e70bSIlya Dryomov * Default to false for now, as single-major requires >= 0.75 version of 4509b60e70bSIlya Dryomov * userspace rbd utility. 4519b60e70bSIlya Dryomov */ 4529b60e70bSIlya Dryomov static bool single_major = false; 4539b60e70bSIlya Dryomov module_param(single_major, bool, S_IRUGO); 4549b60e70bSIlya Dryomov MODULE_PARM_DESC(single_major, "Use a single major number for all rbd devices (default: false)"); 4559b60e70bSIlya Dryomov 4563d7efd18SAlex Elder static int rbd_img_request_submit(struct rbd_img_request *img_request); 4573d7efd18SAlex Elder 458f0f8cef5SAlex Elder static ssize_t rbd_add(struct bus_type *bus, const char *buf, 459f0f8cef5SAlex Elder size_t count); 460f0f8cef5SAlex Elder static ssize_t rbd_remove(struct bus_type *bus, const char *buf, 461f0f8cef5SAlex Elder size_t count); 4629b60e70bSIlya Dryomov static ssize_t rbd_add_single_major(struct bus_type *bus, const char *buf, 4639b60e70bSIlya Dryomov size_t count); 4649b60e70bSIlya Dryomov static ssize_t rbd_remove_single_major(struct bus_type *bus, const char *buf, 4659b60e70bSIlya Dryomov size_t count); 4666d69bb53SIlya Dryomov static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth); 467a2acd00eSAlex Elder static void rbd_spec_put(struct rbd_spec *spec); 468f0f8cef5SAlex Elder 4699b60e70bSIlya Dryomov static int rbd_dev_id_to_minor(int dev_id) 4709b60e70bSIlya Dryomov { 4717e513d43SIlya Dryomov return dev_id << RBD_SINGLE_MAJOR_PART_SHIFT; 4729b60e70bSIlya Dryomov } 4739b60e70bSIlya Dryomov 4749b60e70bSIlya Dryomov static int minor_to_rbd_dev_id(int minor) 4759b60e70bSIlya Dryomov { 4767e513d43SIlya Dryomov return minor >> RBD_SINGLE_MAJOR_PART_SHIFT; 4779b60e70bSIlya Dryomov } 4789b60e70bSIlya Dryomov 
479ed95b21aSIlya Dryomov static bool rbd_is_lock_supported(struct rbd_device *rbd_dev) 480ed95b21aSIlya Dryomov { 481ed95b21aSIlya Dryomov return (rbd_dev->header.features & RBD_FEATURE_EXCLUSIVE_LOCK) && 482ed95b21aSIlya Dryomov rbd_dev->spec->snap_id == CEPH_NOSNAP && 483ed95b21aSIlya Dryomov !rbd_dev->mapping.read_only; 484ed95b21aSIlya Dryomov } 485ed95b21aSIlya Dryomov 486ed95b21aSIlya Dryomov static bool __rbd_is_lock_owner(struct rbd_device *rbd_dev) 487ed95b21aSIlya Dryomov { 488ed95b21aSIlya Dryomov return rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED || 489ed95b21aSIlya Dryomov rbd_dev->lock_state == RBD_LOCK_STATE_RELEASING; 490ed95b21aSIlya Dryomov } 491ed95b21aSIlya Dryomov 492ed95b21aSIlya Dryomov static bool rbd_is_lock_owner(struct rbd_device *rbd_dev) 493ed95b21aSIlya Dryomov { 494ed95b21aSIlya Dryomov bool is_lock_owner; 495ed95b21aSIlya Dryomov 496ed95b21aSIlya Dryomov down_read(&rbd_dev->lock_rwsem); 497ed95b21aSIlya Dryomov is_lock_owner = __rbd_is_lock_owner(rbd_dev); 498ed95b21aSIlya Dryomov up_read(&rbd_dev->lock_rwsem); 499ed95b21aSIlya Dryomov return is_lock_owner; 500ed95b21aSIlya Dryomov } 501ed95b21aSIlya Dryomov 502b15a21ddSGreg Kroah-Hartman static BUS_ATTR(add, S_IWUSR, NULL, rbd_add); 503b15a21ddSGreg Kroah-Hartman static BUS_ATTR(remove, S_IWUSR, NULL, rbd_remove); 5049b60e70bSIlya Dryomov static BUS_ATTR(add_single_major, S_IWUSR, NULL, rbd_add_single_major); 5059b60e70bSIlya Dryomov static BUS_ATTR(remove_single_major, S_IWUSR, NULL, rbd_remove_single_major); 506b15a21ddSGreg Kroah-Hartman 507b15a21ddSGreg Kroah-Hartman static struct attribute *rbd_bus_attrs[] = { 508b15a21ddSGreg Kroah-Hartman &bus_attr_add.attr, 509b15a21ddSGreg Kroah-Hartman &bus_attr_remove.attr, 5109b60e70bSIlya Dryomov &bus_attr_add_single_major.attr, 5119b60e70bSIlya Dryomov &bus_attr_remove_single_major.attr, 512b15a21ddSGreg Kroah-Hartman NULL, 513f0f8cef5SAlex Elder }; 51492c76dc0SIlya Dryomov 51592c76dc0SIlya Dryomov static umode_t 
rbd_bus_is_visible(struct kobject *kobj, 51692c76dc0SIlya Dryomov struct attribute *attr, int index) 51792c76dc0SIlya Dryomov { 5189b60e70bSIlya Dryomov if (!single_major && 5199b60e70bSIlya Dryomov (attr == &bus_attr_add_single_major.attr || 5209b60e70bSIlya Dryomov attr == &bus_attr_remove_single_major.attr)) 5219b60e70bSIlya Dryomov return 0; 5229b60e70bSIlya Dryomov 52392c76dc0SIlya Dryomov return attr->mode; 52492c76dc0SIlya Dryomov } 52592c76dc0SIlya Dryomov 52692c76dc0SIlya Dryomov static const struct attribute_group rbd_bus_group = { 52792c76dc0SIlya Dryomov .attrs = rbd_bus_attrs, 52892c76dc0SIlya Dryomov .is_visible = rbd_bus_is_visible, 52992c76dc0SIlya Dryomov }; 53092c76dc0SIlya Dryomov __ATTRIBUTE_GROUPS(rbd_bus); 531f0f8cef5SAlex Elder 532f0f8cef5SAlex Elder static struct bus_type rbd_bus_type = { 533f0f8cef5SAlex Elder .name = "rbd", 534b15a21ddSGreg Kroah-Hartman .bus_groups = rbd_bus_groups, 535f0f8cef5SAlex Elder }; 536f0f8cef5SAlex Elder 537f0f8cef5SAlex Elder static void rbd_root_dev_release(struct device *dev) 538f0f8cef5SAlex Elder { 539f0f8cef5SAlex Elder } 540f0f8cef5SAlex Elder 541f0f8cef5SAlex Elder static struct device rbd_root_dev = { 542f0f8cef5SAlex Elder .init_name = "rbd", 543f0f8cef5SAlex Elder .release = rbd_root_dev_release, 544f0f8cef5SAlex Elder }; 545f0f8cef5SAlex Elder 54606ecc6cbSAlex Elder static __printf(2, 3) 54706ecc6cbSAlex Elder void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...) 
54806ecc6cbSAlex Elder { 54906ecc6cbSAlex Elder struct va_format vaf; 55006ecc6cbSAlex Elder va_list args; 55106ecc6cbSAlex Elder 55206ecc6cbSAlex Elder va_start(args, fmt); 55306ecc6cbSAlex Elder vaf.fmt = fmt; 55406ecc6cbSAlex Elder vaf.va = &args; 55506ecc6cbSAlex Elder 55606ecc6cbSAlex Elder if (!rbd_dev) 55706ecc6cbSAlex Elder printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf); 55806ecc6cbSAlex Elder else if (rbd_dev->disk) 55906ecc6cbSAlex Elder printk(KERN_WARNING "%s: %s: %pV\n", 56006ecc6cbSAlex Elder RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf); 56106ecc6cbSAlex Elder else if (rbd_dev->spec && rbd_dev->spec->image_name) 56206ecc6cbSAlex Elder printk(KERN_WARNING "%s: image %s: %pV\n", 56306ecc6cbSAlex Elder RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf); 56406ecc6cbSAlex Elder else if (rbd_dev->spec && rbd_dev->spec->image_id) 56506ecc6cbSAlex Elder printk(KERN_WARNING "%s: id %s: %pV\n", 56606ecc6cbSAlex Elder RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf); 56706ecc6cbSAlex Elder else /* punt */ 56806ecc6cbSAlex Elder printk(KERN_WARNING "%s: rbd_dev %p: %pV\n", 56906ecc6cbSAlex Elder RBD_DRV_NAME, rbd_dev, &vaf); 57006ecc6cbSAlex Elder va_end(args); 57106ecc6cbSAlex Elder } 57206ecc6cbSAlex Elder 573aafb230eSAlex Elder #ifdef RBD_DEBUG 574aafb230eSAlex Elder #define rbd_assert(expr) \ 575aafb230eSAlex Elder if (unlikely(!(expr))) { \ 576aafb230eSAlex Elder printk(KERN_ERR "\nAssertion failure in %s() " \ 577aafb230eSAlex Elder "at line %d:\n\n" \ 578aafb230eSAlex Elder "\trbd_assert(%s);\n\n", \ 579aafb230eSAlex Elder __func__, __LINE__, #expr); \ 580aafb230eSAlex Elder BUG(); \ 581aafb230eSAlex Elder } 582aafb230eSAlex Elder #else /* !RBD_DEBUG */ 583aafb230eSAlex Elder # define rbd_assert(expr) ((void) 0) 584aafb230eSAlex Elder #endif /* !RBD_DEBUG */ 585dfc5606dSYehuda Sadeh 5862761713dSIlya Dryomov static void rbd_osd_copyup_callback(struct rbd_obj_request *obj_request); 587b454e36dSAlex Elder static int rbd_img_obj_request_submit(struct 
rbd_obj_request *obj_request); 58805a46afdSAlex Elder static void rbd_img_parent_read(struct rbd_obj_request *obj_request); 58905a46afdSAlex Elder static void rbd_dev_remove_parent(struct rbd_device *rbd_dev); 5908b3e1a56SAlex Elder 591cc4a38bdSAlex Elder static int rbd_dev_refresh(struct rbd_device *rbd_dev); 5922df3fac7SAlex Elder static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev); 593a720ae09SIlya Dryomov static int rbd_dev_header_info(struct rbd_device *rbd_dev); 594e8f59b59SIlya Dryomov static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev); 59554cac61fSAlex Elder static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, 59654cac61fSAlex Elder u64 snap_id); 5972ad3d716SAlex Elder static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id, 5982ad3d716SAlex Elder u8 *order, u64 *snap_size); 5992ad3d716SAlex Elder static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id, 6002ad3d716SAlex Elder u64 *snap_features); 60159c2be1eSYehuda Sadeh 602602adf40SYehuda Sadeh static int rbd_open(struct block_device *bdev, fmode_t mode) 603602adf40SYehuda Sadeh { 604f0f8cef5SAlex Elder struct rbd_device *rbd_dev = bdev->bd_disk->private_data; 605b82d167bSAlex Elder bool removing = false; 606602adf40SYehuda Sadeh 607f84344f3SAlex Elder if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only) 608602adf40SYehuda Sadeh return -EROFS; 609602adf40SYehuda Sadeh 610a14ea269SAlex Elder spin_lock_irq(&rbd_dev->lock); 611b82d167bSAlex Elder if (test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags)) 612b82d167bSAlex Elder removing = true; 613b82d167bSAlex Elder else 614b82d167bSAlex Elder rbd_dev->open_count++; 615a14ea269SAlex Elder spin_unlock_irq(&rbd_dev->lock); 616b82d167bSAlex Elder if (removing) 617b82d167bSAlex Elder return -ENOENT; 618b82d167bSAlex Elder 619c3e946ceSAlex Elder (void) get_device(&rbd_dev->dev); 620340c7a2bSAlex Elder 621602adf40SYehuda Sadeh return 0; 622602adf40SYehuda Sadeh } 623602adf40SYehuda Sadeh 
624db2a144bSAl Viro static void rbd_release(struct gendisk *disk, fmode_t mode) 625dfc5606dSYehuda Sadeh { 626dfc5606dSYehuda Sadeh struct rbd_device *rbd_dev = disk->private_data; 627b82d167bSAlex Elder unsigned long open_count_before; 628b82d167bSAlex Elder 629a14ea269SAlex Elder spin_lock_irq(&rbd_dev->lock); 630b82d167bSAlex Elder open_count_before = rbd_dev->open_count--; 631a14ea269SAlex Elder spin_unlock_irq(&rbd_dev->lock); 632b82d167bSAlex Elder rbd_assert(open_count_before > 0); 633dfc5606dSYehuda Sadeh 634c3e946ceSAlex Elder put_device(&rbd_dev->dev); 635dfc5606dSYehuda Sadeh } 636dfc5606dSYehuda Sadeh 637131fd9f6SGuangliang Zhao static int rbd_ioctl_set_ro(struct rbd_device *rbd_dev, unsigned long arg) 638131fd9f6SGuangliang Zhao { 63977f33c03SJosh Durgin int ret = 0; 640131fd9f6SGuangliang Zhao int val; 641131fd9f6SGuangliang Zhao bool ro; 64277f33c03SJosh Durgin bool ro_changed = false; 643131fd9f6SGuangliang Zhao 64477f33c03SJosh Durgin /* get_user() may sleep, so call it before taking rbd_dev->lock */ 645131fd9f6SGuangliang Zhao if (get_user(val, (int __user *)(arg))) 646131fd9f6SGuangliang Zhao return -EFAULT; 647131fd9f6SGuangliang Zhao 648131fd9f6SGuangliang Zhao ro = val ? 
true : false; 649131fd9f6SGuangliang Zhao /* Snapshot doesn't allow to write*/ 650131fd9f6SGuangliang Zhao if (rbd_dev->spec->snap_id != CEPH_NOSNAP && !ro) 651131fd9f6SGuangliang Zhao return -EROFS; 652131fd9f6SGuangliang Zhao 65377f33c03SJosh Durgin spin_lock_irq(&rbd_dev->lock); 65477f33c03SJosh Durgin /* prevent others open this device */ 65577f33c03SJosh Durgin if (rbd_dev->open_count > 1) { 65677f33c03SJosh Durgin ret = -EBUSY; 65777f33c03SJosh Durgin goto out; 658131fd9f6SGuangliang Zhao } 659131fd9f6SGuangliang Zhao 66077f33c03SJosh Durgin if (rbd_dev->mapping.read_only != ro) { 66177f33c03SJosh Durgin rbd_dev->mapping.read_only = ro; 66277f33c03SJosh Durgin ro_changed = true; 66377f33c03SJosh Durgin } 66477f33c03SJosh Durgin 66577f33c03SJosh Durgin out: 66677f33c03SJosh Durgin spin_unlock_irq(&rbd_dev->lock); 66777f33c03SJosh Durgin /* set_disk_ro() may sleep, so call it after releasing rbd_dev->lock */ 66877f33c03SJosh Durgin if (ret == 0 && ro_changed) 66977f33c03SJosh Durgin set_disk_ro(rbd_dev->disk, ro ? 
1 : 0); 67077f33c03SJosh Durgin 67177f33c03SJosh Durgin return ret; 672131fd9f6SGuangliang Zhao } 673131fd9f6SGuangliang Zhao 674131fd9f6SGuangliang Zhao static int rbd_ioctl(struct block_device *bdev, fmode_t mode, 675131fd9f6SGuangliang Zhao unsigned int cmd, unsigned long arg) 676131fd9f6SGuangliang Zhao { 677131fd9f6SGuangliang Zhao struct rbd_device *rbd_dev = bdev->bd_disk->private_data; 678131fd9f6SGuangliang Zhao int ret = 0; 679131fd9f6SGuangliang Zhao 680131fd9f6SGuangliang Zhao switch (cmd) { 681131fd9f6SGuangliang Zhao case BLKROSET: 682131fd9f6SGuangliang Zhao ret = rbd_ioctl_set_ro(rbd_dev, arg); 683131fd9f6SGuangliang Zhao break; 684131fd9f6SGuangliang Zhao default: 685131fd9f6SGuangliang Zhao ret = -ENOTTY; 686131fd9f6SGuangliang Zhao } 687131fd9f6SGuangliang Zhao 688131fd9f6SGuangliang Zhao return ret; 689131fd9f6SGuangliang Zhao } 690131fd9f6SGuangliang Zhao 691131fd9f6SGuangliang Zhao #ifdef CONFIG_COMPAT 692131fd9f6SGuangliang Zhao static int rbd_compat_ioctl(struct block_device *bdev, fmode_t mode, 693131fd9f6SGuangliang Zhao unsigned int cmd, unsigned long arg) 694131fd9f6SGuangliang Zhao { 695131fd9f6SGuangliang Zhao return rbd_ioctl(bdev, mode, cmd, arg); 696131fd9f6SGuangliang Zhao } 697131fd9f6SGuangliang Zhao #endif /* CONFIG_COMPAT */ 698131fd9f6SGuangliang Zhao 699602adf40SYehuda Sadeh static const struct block_device_operations rbd_bd_ops = { 700602adf40SYehuda Sadeh .owner = THIS_MODULE, 701602adf40SYehuda Sadeh .open = rbd_open, 702dfc5606dSYehuda Sadeh .release = rbd_release, 703131fd9f6SGuangliang Zhao .ioctl = rbd_ioctl, 704131fd9f6SGuangliang Zhao #ifdef CONFIG_COMPAT 705131fd9f6SGuangliang Zhao .compat_ioctl = rbd_compat_ioctl, 706131fd9f6SGuangliang Zhao #endif 707602adf40SYehuda Sadeh }; 708602adf40SYehuda Sadeh 709602adf40SYehuda Sadeh /* 7107262cfcaSAlex Elder * Initialize an rbd client instance. Success or not, this function 711cfbf6377SAlex Elder * consumes ceph_opts. Caller holds client_mutex. 
712602adf40SYehuda Sadeh */ 713f8c38929SAlex Elder static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts) 714602adf40SYehuda Sadeh { 715602adf40SYehuda Sadeh struct rbd_client *rbdc; 716602adf40SYehuda Sadeh int ret = -ENOMEM; 717602adf40SYehuda Sadeh 71837206ee5SAlex Elder dout("%s:\n", __func__); 719602adf40SYehuda Sadeh rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL); 720602adf40SYehuda Sadeh if (!rbdc) 721602adf40SYehuda Sadeh goto out_opt; 722602adf40SYehuda Sadeh 723602adf40SYehuda Sadeh kref_init(&rbdc->kref); 724602adf40SYehuda Sadeh INIT_LIST_HEAD(&rbdc->node); 725602adf40SYehuda Sadeh 72643ae4701SAlex Elder rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0); 727602adf40SYehuda Sadeh if (IS_ERR(rbdc->client)) 72808f75463SAlex Elder goto out_rbdc; 72943ae4701SAlex Elder ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */ 730602adf40SYehuda Sadeh 731602adf40SYehuda Sadeh ret = ceph_open_session(rbdc->client); 732602adf40SYehuda Sadeh if (ret < 0) 73308f75463SAlex Elder goto out_client; 734602adf40SYehuda Sadeh 735432b8587SAlex Elder spin_lock(&rbd_client_list_lock); 736602adf40SYehuda Sadeh list_add_tail(&rbdc->node, &rbd_client_list); 737432b8587SAlex Elder spin_unlock(&rbd_client_list_lock); 738602adf40SYehuda Sadeh 73937206ee5SAlex Elder dout("%s: rbdc %p\n", __func__, rbdc); 740bc534d86SAlex Elder 741602adf40SYehuda Sadeh return rbdc; 74208f75463SAlex Elder out_client: 743602adf40SYehuda Sadeh ceph_destroy_client(rbdc->client); 74408f75463SAlex Elder out_rbdc: 745602adf40SYehuda Sadeh kfree(rbdc); 746602adf40SYehuda Sadeh out_opt: 74743ae4701SAlex Elder if (ceph_opts) 74843ae4701SAlex Elder ceph_destroy_options(ceph_opts); 74937206ee5SAlex Elder dout("%s: error %d\n", __func__, ret); 75037206ee5SAlex Elder 75128f259b7SVasiliy Kulikov return ERR_PTR(ret); 752602adf40SYehuda Sadeh } 753602adf40SYehuda Sadeh 7542f82ee54SAlex Elder static struct rbd_client *__rbd_get_client(struct rbd_client *rbdc) 
7552f82ee54SAlex Elder { 7562f82ee54SAlex Elder kref_get(&rbdc->kref); 7572f82ee54SAlex Elder 7582f82ee54SAlex Elder return rbdc; 7592f82ee54SAlex Elder } 7602f82ee54SAlex Elder 761602adf40SYehuda Sadeh /* 7621f7ba331SAlex Elder * Find a ceph client with specific addr and configuration. If 7631f7ba331SAlex Elder * found, bump its reference count. 764602adf40SYehuda Sadeh */ 7651f7ba331SAlex Elder static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts) 766602adf40SYehuda Sadeh { 767602adf40SYehuda Sadeh struct rbd_client *client_node; 7681f7ba331SAlex Elder bool found = false; 769602adf40SYehuda Sadeh 77043ae4701SAlex Elder if (ceph_opts->flags & CEPH_OPT_NOSHARE) 771602adf40SYehuda Sadeh return NULL; 772602adf40SYehuda Sadeh 7731f7ba331SAlex Elder spin_lock(&rbd_client_list_lock); 7741f7ba331SAlex Elder list_for_each_entry(client_node, &rbd_client_list, node) { 7751f7ba331SAlex Elder if (!ceph_compare_options(ceph_opts, client_node->client)) { 7762f82ee54SAlex Elder __rbd_get_client(client_node); 7772f82ee54SAlex Elder 7781f7ba331SAlex Elder found = true; 7791f7ba331SAlex Elder break; 7801f7ba331SAlex Elder } 7811f7ba331SAlex Elder } 7821f7ba331SAlex Elder spin_unlock(&rbd_client_list_lock); 7831f7ba331SAlex Elder 7841f7ba331SAlex Elder return found ? 
client_node : NULL; 785602adf40SYehuda Sadeh } 786602adf40SYehuda Sadeh 787602adf40SYehuda Sadeh /* 788210c104cSIlya Dryomov * (Per device) rbd map options 78959c2be1eSYehuda Sadeh */ 79059c2be1eSYehuda Sadeh enum { 791b5584180SIlya Dryomov Opt_queue_depth, 79259c2be1eSYehuda Sadeh Opt_last_int, 79359c2be1eSYehuda Sadeh /* int args above */ 79459c2be1eSYehuda Sadeh Opt_last_string, 79559c2be1eSYehuda Sadeh /* string args above */ 796cc0538b6SAlex Elder Opt_read_only, 797cc0538b6SAlex Elder Opt_read_write, 79880de1912SIlya Dryomov Opt_lock_on_read, 799210c104cSIlya Dryomov Opt_err 80059c2be1eSYehuda Sadeh }; 80159c2be1eSYehuda Sadeh 80243ae4701SAlex Elder static match_table_t rbd_opts_tokens = { 803b5584180SIlya Dryomov {Opt_queue_depth, "queue_depth=%d"}, 80459c2be1eSYehuda Sadeh /* int args above */ 80559c2be1eSYehuda Sadeh /* string args above */ 806be466c1cSAlex Elder {Opt_read_only, "read_only"}, 807cc0538b6SAlex Elder {Opt_read_only, "ro"}, /* Alternate spelling */ 808cc0538b6SAlex Elder {Opt_read_write, "read_write"}, 809cc0538b6SAlex Elder {Opt_read_write, "rw"}, /* Alternate spelling */ 81080de1912SIlya Dryomov {Opt_lock_on_read, "lock_on_read"}, 811210c104cSIlya Dryomov {Opt_err, NULL} 81259c2be1eSYehuda Sadeh }; 81359c2be1eSYehuda Sadeh 81498571b5aSAlex Elder struct rbd_options { 815b5584180SIlya Dryomov int queue_depth; 81698571b5aSAlex Elder bool read_only; 81780de1912SIlya Dryomov bool lock_on_read; 81898571b5aSAlex Elder }; 81998571b5aSAlex Elder 820b5584180SIlya Dryomov #define RBD_QUEUE_DEPTH_DEFAULT BLKDEV_MAX_RQ 82198571b5aSAlex Elder #define RBD_READ_ONLY_DEFAULT false 82280de1912SIlya Dryomov #define RBD_LOCK_ON_READ_DEFAULT false 82398571b5aSAlex Elder 82459c2be1eSYehuda Sadeh static int parse_rbd_opts_token(char *c, void *private) 82559c2be1eSYehuda Sadeh { 82643ae4701SAlex Elder struct rbd_options *rbd_opts = private; 82759c2be1eSYehuda Sadeh substring_t argstr[MAX_OPT_ARGS]; 82859c2be1eSYehuda Sadeh int token, intval, ret; 82959c2be1eSYehuda 
Sadeh 83043ae4701SAlex Elder token = match_token(c, rbd_opts_tokens, argstr); 83159c2be1eSYehuda Sadeh if (token < Opt_last_int) { 83259c2be1eSYehuda Sadeh ret = match_int(&argstr[0], &intval); 83359c2be1eSYehuda Sadeh if (ret < 0) { 834210c104cSIlya Dryomov pr_err("bad mount option arg (not int) at '%s'\n", c); 83559c2be1eSYehuda Sadeh return ret; 83659c2be1eSYehuda Sadeh } 83759c2be1eSYehuda Sadeh dout("got int token %d val %d\n", token, intval); 83859c2be1eSYehuda Sadeh } else if (token > Opt_last_int && token < Opt_last_string) { 839210c104cSIlya Dryomov dout("got string token %d val %s\n", token, argstr[0].from); 84059c2be1eSYehuda Sadeh } else { 84159c2be1eSYehuda Sadeh dout("got token %d\n", token); 84259c2be1eSYehuda Sadeh } 84359c2be1eSYehuda Sadeh 84459c2be1eSYehuda Sadeh switch (token) { 845b5584180SIlya Dryomov case Opt_queue_depth: 846b5584180SIlya Dryomov if (intval < 1) { 847b5584180SIlya Dryomov pr_err("queue_depth out of range\n"); 848b5584180SIlya Dryomov return -EINVAL; 849b5584180SIlya Dryomov } 850b5584180SIlya Dryomov rbd_opts->queue_depth = intval; 851b5584180SIlya Dryomov break; 852cc0538b6SAlex Elder case Opt_read_only: 853cc0538b6SAlex Elder rbd_opts->read_only = true; 854cc0538b6SAlex Elder break; 855cc0538b6SAlex Elder case Opt_read_write: 856cc0538b6SAlex Elder rbd_opts->read_only = false; 857cc0538b6SAlex Elder break; 85880de1912SIlya Dryomov case Opt_lock_on_read: 85980de1912SIlya Dryomov rbd_opts->lock_on_read = true; 86080de1912SIlya Dryomov break; 86159c2be1eSYehuda Sadeh default: 862210c104cSIlya Dryomov /* libceph prints "bad option" msg */ 863210c104cSIlya Dryomov return -EINVAL; 86459c2be1eSYehuda Sadeh } 865210c104cSIlya Dryomov 86659c2be1eSYehuda Sadeh return 0; 86759c2be1eSYehuda Sadeh } 86859c2be1eSYehuda Sadeh 8696d2940c8SGuangliang Zhao static char* obj_op_name(enum obj_operation_type op_type) 8706d2940c8SGuangliang Zhao { 8716d2940c8SGuangliang Zhao switch (op_type) { 8726d2940c8SGuangliang Zhao case OBJ_OP_READ: 
8736d2940c8SGuangliang Zhao return "read"; 8746d2940c8SGuangliang Zhao case OBJ_OP_WRITE: 8756d2940c8SGuangliang Zhao return "write"; 87690e98c52SGuangliang Zhao case OBJ_OP_DISCARD: 87790e98c52SGuangliang Zhao return "discard"; 8786d2940c8SGuangliang Zhao default: 8796d2940c8SGuangliang Zhao return "???"; 8806d2940c8SGuangliang Zhao } 8816d2940c8SGuangliang Zhao } 8826d2940c8SGuangliang Zhao 88359c2be1eSYehuda Sadeh /* 884602adf40SYehuda Sadeh * Get a ceph client with specific addr and configuration, if one does 8857262cfcaSAlex Elder * not exist create it. Either way, ceph_opts is consumed by this 8867262cfcaSAlex Elder * function. 887602adf40SYehuda Sadeh */ 8889d3997fdSAlex Elder static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts) 889602adf40SYehuda Sadeh { 890f8c38929SAlex Elder struct rbd_client *rbdc; 89159c2be1eSYehuda Sadeh 892cfbf6377SAlex Elder mutex_lock_nested(&client_mutex, SINGLE_DEPTH_NESTING); 8931f7ba331SAlex Elder rbdc = rbd_client_find(ceph_opts); 8949d3997fdSAlex Elder if (rbdc) /* using an existing client */ 89543ae4701SAlex Elder ceph_destroy_options(ceph_opts); 8969d3997fdSAlex Elder else 897f8c38929SAlex Elder rbdc = rbd_client_create(ceph_opts); 898cfbf6377SAlex Elder mutex_unlock(&client_mutex); 899d720bcb0SAlex Elder 9009d3997fdSAlex Elder return rbdc; 901602adf40SYehuda Sadeh } 902602adf40SYehuda Sadeh 903602adf40SYehuda Sadeh /* 904602adf40SYehuda Sadeh * Destroy ceph client 905d23a4b3fSAlex Elder * 906432b8587SAlex Elder * Caller must hold rbd_client_list_lock. 
907602adf40SYehuda Sadeh */ 908602adf40SYehuda Sadeh static void rbd_client_release(struct kref *kref) 909602adf40SYehuda Sadeh { 910602adf40SYehuda Sadeh struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref); 911602adf40SYehuda Sadeh 91237206ee5SAlex Elder dout("%s: rbdc %p\n", __func__, rbdc); 913cd9d9f5dSAlex Elder spin_lock(&rbd_client_list_lock); 914602adf40SYehuda Sadeh list_del(&rbdc->node); 915cd9d9f5dSAlex Elder spin_unlock(&rbd_client_list_lock); 916602adf40SYehuda Sadeh 917602adf40SYehuda Sadeh ceph_destroy_client(rbdc->client); 918602adf40SYehuda Sadeh kfree(rbdc); 919602adf40SYehuda Sadeh } 920602adf40SYehuda Sadeh 921602adf40SYehuda Sadeh /* 922602adf40SYehuda Sadeh * Drop reference to ceph client node. If it's not referenced anymore, release 923602adf40SYehuda Sadeh * it. 924602adf40SYehuda Sadeh */ 9259d3997fdSAlex Elder static void rbd_put_client(struct rbd_client *rbdc) 926602adf40SYehuda Sadeh { 927c53d5893SAlex Elder if (rbdc) 9289d3997fdSAlex Elder kref_put(&rbdc->kref, rbd_client_release); 929602adf40SYehuda Sadeh } 930602adf40SYehuda Sadeh 931a30b71b9SAlex Elder static bool rbd_image_format_valid(u32 image_format) 932a30b71b9SAlex Elder { 933a30b71b9SAlex Elder return image_format == 1 || image_format == 2; 934a30b71b9SAlex Elder } 935a30b71b9SAlex Elder 9368e94af8eSAlex Elder static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk) 9378e94af8eSAlex Elder { 938103a150fSAlex Elder size_t size; 939103a150fSAlex Elder u32 snap_count; 940103a150fSAlex Elder 941103a150fSAlex Elder /* The header has to start with the magic rbd header text */ 942103a150fSAlex Elder if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT))) 943103a150fSAlex Elder return false; 944103a150fSAlex Elder 945db2388b6SAlex Elder /* The bio layer requires at least sector-sized I/O */ 946db2388b6SAlex Elder 947db2388b6SAlex Elder if (ondisk->options.order < SECTOR_SHIFT) 948db2388b6SAlex Elder return false; 949db2388b6SAlex Elder 
950db2388b6SAlex Elder /* If we use u64 in a few spots we may be able to loosen this */ 951db2388b6SAlex Elder 952db2388b6SAlex Elder if (ondisk->options.order > 8 * sizeof (int) - 1) 953db2388b6SAlex Elder return false; 954db2388b6SAlex Elder 955103a150fSAlex Elder /* 956103a150fSAlex Elder * The size of a snapshot header has to fit in a size_t, and 957103a150fSAlex Elder * that limits the number of snapshots. 958103a150fSAlex Elder */ 959103a150fSAlex Elder snap_count = le32_to_cpu(ondisk->snap_count); 960103a150fSAlex Elder size = SIZE_MAX - sizeof (struct ceph_snap_context); 961103a150fSAlex Elder if (snap_count > size / sizeof (__le64)) 962103a150fSAlex Elder return false; 963103a150fSAlex Elder 964103a150fSAlex Elder /* 965103a150fSAlex Elder * Not only that, but the size of the entire the snapshot 966103a150fSAlex Elder * header must also be representable in a size_t. 967103a150fSAlex Elder */ 968103a150fSAlex Elder size -= snap_count * sizeof (__le64); 969103a150fSAlex Elder if ((u64) size < le64_to_cpu(ondisk->snap_names_len)) 970103a150fSAlex Elder return false; 971103a150fSAlex Elder 972103a150fSAlex Elder return true; 9738e94af8eSAlex Elder } 9748e94af8eSAlex Elder 975602adf40SYehuda Sadeh /* 976bb23e37aSAlex Elder * Fill an rbd image header with information from the given format 1 977bb23e37aSAlex Elder * on-disk header. 
978602adf40SYehuda Sadeh */ 979662518b1SAlex Elder static int rbd_header_from_disk(struct rbd_device *rbd_dev, 9804156d998SAlex Elder struct rbd_image_header_ondisk *ondisk) 981602adf40SYehuda Sadeh { 982662518b1SAlex Elder struct rbd_image_header *header = &rbd_dev->header; 983bb23e37aSAlex Elder bool first_time = header->object_prefix == NULL; 984bb23e37aSAlex Elder struct ceph_snap_context *snapc; 985bb23e37aSAlex Elder char *object_prefix = NULL; 986bb23e37aSAlex Elder char *snap_names = NULL; 987bb23e37aSAlex Elder u64 *snap_sizes = NULL; 988ccece235SAlex Elder u32 snap_count; 989d2bb24e5SAlex Elder size_t size; 990bb23e37aSAlex Elder int ret = -ENOMEM; 991621901d6SAlex Elder u32 i; 992602adf40SYehuda Sadeh 993bb23e37aSAlex Elder /* Allocate this now to avoid having to handle failure below */ 994103a150fSAlex Elder 995bb23e37aSAlex Elder if (first_time) { 996bb23e37aSAlex Elder size_t len; 997bb23e37aSAlex Elder 998bb23e37aSAlex Elder len = strnlen(ondisk->object_prefix, 999bb23e37aSAlex Elder sizeof (ondisk->object_prefix)); 1000bb23e37aSAlex Elder object_prefix = kmalloc(len + 1, GFP_KERNEL); 1001bb23e37aSAlex Elder if (!object_prefix) 1002602adf40SYehuda Sadeh return -ENOMEM; 1003bb23e37aSAlex Elder memcpy(object_prefix, ondisk->object_prefix, len); 1004bb23e37aSAlex Elder object_prefix[len] = '\0'; 1005bb23e37aSAlex Elder } 100600f1f36fSAlex Elder 1007bb23e37aSAlex Elder /* Allocate the snapshot context and fill it in */ 1008d2bb24e5SAlex Elder 1009602adf40SYehuda Sadeh snap_count = le32_to_cpu(ondisk->snap_count); 1010bb23e37aSAlex Elder snapc = ceph_create_snap_context(snap_count, GFP_KERNEL); 1011bb23e37aSAlex Elder if (!snapc) 1012bb23e37aSAlex Elder goto out_err; 1013bb23e37aSAlex Elder snapc->seq = le64_to_cpu(ondisk->snap_seq); 1014602adf40SYehuda Sadeh if (snap_count) { 1015bb23e37aSAlex Elder struct rbd_image_snap_ondisk *snaps; 1016f785cc1dSAlex Elder u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len); 1017f785cc1dSAlex Elder 
1018bb23e37aSAlex Elder /* We'll keep a copy of the snapshot names... */ 1019621901d6SAlex Elder 1020f785cc1dSAlex Elder if (snap_names_len > (u64)SIZE_MAX) 1021bb23e37aSAlex Elder goto out_2big; 1022bb23e37aSAlex Elder snap_names = kmalloc(snap_names_len, GFP_KERNEL); 1023bb23e37aSAlex Elder if (!snap_names) 1024602adf40SYehuda Sadeh goto out_err; 1025bb23e37aSAlex Elder 1026bb23e37aSAlex Elder /* ...as well as the array of their sizes. */ 1027bb23e37aSAlex Elder 1028bb23e37aSAlex Elder size = snap_count * sizeof (*header->snap_sizes); 1029bb23e37aSAlex Elder snap_sizes = kmalloc(size, GFP_KERNEL); 1030bb23e37aSAlex Elder if (!snap_sizes) 1031bb23e37aSAlex Elder goto out_err; 1032bb23e37aSAlex Elder 1033f785cc1dSAlex Elder /* 1034bb23e37aSAlex Elder * Copy the names, and fill in each snapshot's id 1035bb23e37aSAlex Elder * and size. 1036bb23e37aSAlex Elder * 103799a41ebcSAlex Elder * Note that rbd_dev_v1_header_info() guarantees the 1038bb23e37aSAlex Elder * ondisk buffer we're working with has 1039f785cc1dSAlex Elder * snap_names_len bytes beyond the end of the 1040f785cc1dSAlex Elder * snapshot id array, this memcpy() is safe. 
1041f785cc1dSAlex Elder */ 1042bb23e37aSAlex Elder memcpy(snap_names, &ondisk->snaps[snap_count], snap_names_len); 1043bb23e37aSAlex Elder snaps = ondisk->snaps; 1044bb23e37aSAlex Elder for (i = 0; i < snap_count; i++) { 1045bb23e37aSAlex Elder snapc->snaps[i] = le64_to_cpu(snaps[i].id); 1046bb23e37aSAlex Elder snap_sizes[i] = le64_to_cpu(snaps[i].image_size); 1047bb23e37aSAlex Elder } 1048602adf40SYehuda Sadeh } 1049849b4260SAlex Elder 1050bb23e37aSAlex Elder /* We won't fail any more, fill in the header */ 1051bb23e37aSAlex Elder 1052bb23e37aSAlex Elder if (first_time) { 1053bb23e37aSAlex Elder header->object_prefix = object_prefix; 1054602adf40SYehuda Sadeh header->obj_order = ondisk->options.order; 1055602adf40SYehuda Sadeh header->crypt_type = ondisk->options.crypt_type; 1056602adf40SYehuda Sadeh header->comp_type = ondisk->options.comp_type; 1057bb23e37aSAlex Elder /* The rest aren't used for format 1 images */ 1058bb23e37aSAlex Elder header->stripe_unit = 0; 1059bb23e37aSAlex Elder header->stripe_count = 0; 1060bb23e37aSAlex Elder header->features = 0; 1061662518b1SAlex Elder } else { 1062662518b1SAlex Elder ceph_put_snap_context(header->snapc); 1063662518b1SAlex Elder kfree(header->snap_names); 1064662518b1SAlex Elder kfree(header->snap_sizes); 1065bb23e37aSAlex Elder } 10666a52325fSAlex Elder 1067bb23e37aSAlex Elder /* The remaining fields always get updated (when we refresh) */ 1068621901d6SAlex Elder 1069f84344f3SAlex Elder header->image_size = le64_to_cpu(ondisk->image_size); 1070bb23e37aSAlex Elder header->snapc = snapc; 1071bb23e37aSAlex Elder header->snap_names = snap_names; 1072bb23e37aSAlex Elder header->snap_sizes = snap_sizes; 1073468521c1SAlex Elder 1074602adf40SYehuda Sadeh return 0; 1075bb23e37aSAlex Elder out_2big: 1076bb23e37aSAlex Elder ret = -EIO; 10776a52325fSAlex Elder out_err: 1078bb23e37aSAlex Elder kfree(snap_sizes); 1079bb23e37aSAlex Elder kfree(snap_names); 1080bb23e37aSAlex Elder ceph_put_snap_context(snapc); 1081bb23e37aSAlex 
Elder kfree(object_prefix); 1082ccece235SAlex Elder 1083bb23e37aSAlex Elder return ret; 1084602adf40SYehuda Sadeh } 1085602adf40SYehuda Sadeh 10869682fc6dSAlex Elder static const char *_rbd_dev_v1_snap_name(struct rbd_device *rbd_dev, u32 which) 10879682fc6dSAlex Elder { 10889682fc6dSAlex Elder const char *snap_name; 10899682fc6dSAlex Elder 10909682fc6dSAlex Elder rbd_assert(which < rbd_dev->header.snapc->num_snaps); 10919682fc6dSAlex Elder 10929682fc6dSAlex Elder /* Skip over names until we find the one we are looking for */ 10939682fc6dSAlex Elder 10949682fc6dSAlex Elder snap_name = rbd_dev->header.snap_names; 10959682fc6dSAlex Elder while (which--) 10969682fc6dSAlex Elder snap_name += strlen(snap_name) + 1; 10979682fc6dSAlex Elder 10989682fc6dSAlex Elder return kstrdup(snap_name, GFP_KERNEL); 10999682fc6dSAlex Elder } 11009682fc6dSAlex Elder 110130d1cff8SAlex Elder /* 110230d1cff8SAlex Elder * Snapshot id comparison function for use with qsort()/bsearch(). 110330d1cff8SAlex Elder * Note that result is for snapshots in *descending* order. 110430d1cff8SAlex Elder */ 110530d1cff8SAlex Elder static int snapid_compare_reverse(const void *s1, const void *s2) 110630d1cff8SAlex Elder { 110730d1cff8SAlex Elder u64 snap_id1 = *(u64 *)s1; 110830d1cff8SAlex Elder u64 snap_id2 = *(u64 *)s2; 110930d1cff8SAlex Elder 111030d1cff8SAlex Elder if (snap_id1 < snap_id2) 111130d1cff8SAlex Elder return 1; 111230d1cff8SAlex Elder return snap_id1 == snap_id2 ? 0 : -1; 111330d1cff8SAlex Elder } 111430d1cff8SAlex Elder 111530d1cff8SAlex Elder /* 111630d1cff8SAlex Elder * Search a snapshot context to see if the given snapshot id is 111730d1cff8SAlex Elder * present. 111830d1cff8SAlex Elder * 111930d1cff8SAlex Elder * Returns the position of the snapshot id in the array if it's found, 112030d1cff8SAlex Elder * or BAD_SNAP_INDEX otherwise. 
112130d1cff8SAlex Elder * 112230d1cff8SAlex Elder * Note: The snapshot array is in kept sorted (by the osd) in 112330d1cff8SAlex Elder * reverse order, highest snapshot id first. 112430d1cff8SAlex Elder */ 11259682fc6dSAlex Elder static u32 rbd_dev_snap_index(struct rbd_device *rbd_dev, u64 snap_id) 11269682fc6dSAlex Elder { 11279682fc6dSAlex Elder struct ceph_snap_context *snapc = rbd_dev->header.snapc; 112830d1cff8SAlex Elder u64 *found; 11299682fc6dSAlex Elder 113030d1cff8SAlex Elder found = bsearch(&snap_id, &snapc->snaps, snapc->num_snaps, 113130d1cff8SAlex Elder sizeof (snap_id), snapid_compare_reverse); 11329682fc6dSAlex Elder 113330d1cff8SAlex Elder return found ? (u32)(found - &snapc->snaps[0]) : BAD_SNAP_INDEX; 11349682fc6dSAlex Elder } 11359682fc6dSAlex Elder 11362ad3d716SAlex Elder static const char *rbd_dev_v1_snap_name(struct rbd_device *rbd_dev, 11372ad3d716SAlex Elder u64 snap_id) 113854cac61fSAlex Elder { 113954cac61fSAlex Elder u32 which; 1140da6a6b63SJosh Durgin const char *snap_name; 114154cac61fSAlex Elder 114254cac61fSAlex Elder which = rbd_dev_snap_index(rbd_dev, snap_id); 114354cac61fSAlex Elder if (which == BAD_SNAP_INDEX) 1144da6a6b63SJosh Durgin return ERR_PTR(-ENOENT); 114554cac61fSAlex Elder 1146da6a6b63SJosh Durgin snap_name = _rbd_dev_v1_snap_name(rbd_dev, which); 1147da6a6b63SJosh Durgin return snap_name ? 
snap_name : ERR_PTR(-ENOMEM); 114854cac61fSAlex Elder } 114954cac61fSAlex Elder 11509e15b77dSAlex Elder static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id) 11519e15b77dSAlex Elder { 11529e15b77dSAlex Elder if (snap_id == CEPH_NOSNAP) 11539e15b77dSAlex Elder return RBD_SNAP_HEAD_NAME; 11549e15b77dSAlex Elder 115554cac61fSAlex Elder rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); 115654cac61fSAlex Elder if (rbd_dev->image_format == 1) 115754cac61fSAlex Elder return rbd_dev_v1_snap_name(rbd_dev, snap_id); 11589e15b77dSAlex Elder 115954cac61fSAlex Elder return rbd_dev_v2_snap_name(rbd_dev, snap_id); 11609e15b77dSAlex Elder } 11619e15b77dSAlex Elder 11622ad3d716SAlex Elder static int rbd_snap_size(struct rbd_device *rbd_dev, u64 snap_id, 11632ad3d716SAlex Elder u64 *snap_size) 1164602adf40SYehuda Sadeh { 11652ad3d716SAlex Elder rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); 11662ad3d716SAlex Elder if (snap_id == CEPH_NOSNAP) { 11672ad3d716SAlex Elder *snap_size = rbd_dev->header.image_size; 11682ad3d716SAlex Elder } else if (rbd_dev->image_format == 1) { 11692ad3d716SAlex Elder u32 which; 117000f1f36fSAlex Elder 11712ad3d716SAlex Elder which = rbd_dev_snap_index(rbd_dev, snap_id); 11722ad3d716SAlex Elder if (which == BAD_SNAP_INDEX) 11732ad3d716SAlex Elder return -ENOENT; 117400f1f36fSAlex Elder 11752ad3d716SAlex Elder *snap_size = rbd_dev->header.snap_sizes[which]; 11762ad3d716SAlex Elder } else { 11772ad3d716SAlex Elder u64 size = 0; 11782ad3d716SAlex Elder int ret; 11792ad3d716SAlex Elder 11802ad3d716SAlex Elder ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, NULL, &size); 11812ad3d716SAlex Elder if (ret) 11822ad3d716SAlex Elder return ret; 11832ad3d716SAlex Elder 11842ad3d716SAlex Elder *snap_size = size; 11852ad3d716SAlex Elder } 11862ad3d716SAlex Elder return 0; 11872ad3d716SAlex Elder } 11882ad3d716SAlex Elder 11892ad3d716SAlex Elder static int rbd_snap_features(struct rbd_device *rbd_dev, u64 snap_id, 11902ad3d716SAlex 
Elder u64 *snap_features) 11912ad3d716SAlex Elder { 11922ad3d716SAlex Elder rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); 11932ad3d716SAlex Elder if (snap_id == CEPH_NOSNAP) { 11942ad3d716SAlex Elder *snap_features = rbd_dev->header.features; 11952ad3d716SAlex Elder } else if (rbd_dev->image_format == 1) { 11962ad3d716SAlex Elder *snap_features = 0; /* No features for format 1 */ 11972ad3d716SAlex Elder } else { 11982ad3d716SAlex Elder u64 features = 0; 11992ad3d716SAlex Elder int ret; 12002ad3d716SAlex Elder 12012ad3d716SAlex Elder ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, &features); 12022ad3d716SAlex Elder if (ret) 12032ad3d716SAlex Elder return ret; 12042ad3d716SAlex Elder 12052ad3d716SAlex Elder *snap_features = features; 12062ad3d716SAlex Elder } 12072ad3d716SAlex Elder return 0; 120800f1f36fSAlex Elder } 1209602adf40SYehuda Sadeh 1210d1cf5788SAlex Elder static int rbd_dev_mapping_set(struct rbd_device *rbd_dev) 1211602adf40SYehuda Sadeh { 12128f4b7d98SAlex Elder u64 snap_id = rbd_dev->spec->snap_id; 12132ad3d716SAlex Elder u64 size = 0; 12142ad3d716SAlex Elder u64 features = 0; 12152ad3d716SAlex Elder int ret; 12168b0241f8SAlex Elder 12172ad3d716SAlex Elder ret = rbd_snap_size(rbd_dev, snap_id, &size); 12182ad3d716SAlex Elder if (ret) 12192ad3d716SAlex Elder return ret; 12202ad3d716SAlex Elder ret = rbd_snap_features(rbd_dev, snap_id, &features); 12212ad3d716SAlex Elder if (ret) 12222ad3d716SAlex Elder return ret; 12232ad3d716SAlex Elder 12242ad3d716SAlex Elder rbd_dev->mapping.size = size; 12252ad3d716SAlex Elder rbd_dev->mapping.features = features; 12262ad3d716SAlex Elder 12278b0241f8SAlex Elder return 0; 1228602adf40SYehuda Sadeh } 1229602adf40SYehuda Sadeh 1230d1cf5788SAlex Elder static void rbd_dev_mapping_clear(struct rbd_device *rbd_dev) 1231d1cf5788SAlex Elder { 1232d1cf5788SAlex Elder rbd_dev->mapping.size = 0; 1233d1cf5788SAlex Elder rbd_dev->mapping.features = 0; 1234200a6a8bSAlex Elder } 1235200a6a8bSAlex Elder 
12367d5079aaSHimangi Saraogi static void rbd_segment_name_free(const char *name) 12377d5079aaSHimangi Saraogi { 12387d5079aaSHimangi Saraogi /* The explicit cast here is needed to drop the const qualifier */ 12397d5079aaSHimangi Saraogi 12407d5079aaSHimangi Saraogi kmem_cache_free(rbd_segment_name_cache, (void *)name); 12417d5079aaSHimangi Saraogi } 12427d5079aaSHimangi Saraogi 124398571b5aSAlex Elder static const char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset) 1244602adf40SYehuda Sadeh { 124565ccfe21SAlex Elder char *name; 124665ccfe21SAlex Elder u64 segment; 124765ccfe21SAlex Elder int ret; 12483a96d5cdSJosh Durgin char *name_format; 1249602adf40SYehuda Sadeh 125078c2a44aSAlex Elder name = kmem_cache_alloc(rbd_segment_name_cache, GFP_NOIO); 125165ccfe21SAlex Elder if (!name) 125265ccfe21SAlex Elder return NULL; 125365ccfe21SAlex Elder segment = offset >> rbd_dev->header.obj_order; 12543a96d5cdSJosh Durgin name_format = "%s.%012llx"; 12553a96d5cdSJosh Durgin if (rbd_dev->image_format == 2) 12563a96d5cdSJosh Durgin name_format = "%s.%016llx"; 12572d0ebc5dSIlya Dryomov ret = snprintf(name, CEPH_MAX_OID_NAME_LEN + 1, name_format, 125865ccfe21SAlex Elder rbd_dev->header.object_prefix, segment); 12592d0ebc5dSIlya Dryomov if (ret < 0 || ret > CEPH_MAX_OID_NAME_LEN) { 126065ccfe21SAlex Elder pr_err("error formatting segment name for #%llu (%d)\n", 126165ccfe21SAlex Elder segment, ret); 12627d5079aaSHimangi Saraogi rbd_segment_name_free(name); 126365ccfe21SAlex Elder name = NULL; 126465ccfe21SAlex Elder } 1265602adf40SYehuda Sadeh 126665ccfe21SAlex Elder return name; 126765ccfe21SAlex Elder } 1268602adf40SYehuda Sadeh 126965ccfe21SAlex Elder static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset) 127065ccfe21SAlex Elder { 127165ccfe21SAlex Elder u64 segment_size = (u64) 1 << rbd_dev->header.obj_order; 1272602adf40SYehuda Sadeh 127365ccfe21SAlex Elder return offset & (segment_size - 1); 127465ccfe21SAlex Elder } 127565ccfe21SAlex Elder 
127665ccfe21SAlex Elder static u64 rbd_segment_length(struct rbd_device *rbd_dev, 127765ccfe21SAlex Elder u64 offset, u64 length) 127865ccfe21SAlex Elder { 127965ccfe21SAlex Elder u64 segment_size = (u64) 1 << rbd_dev->header.obj_order; 128065ccfe21SAlex Elder 128165ccfe21SAlex Elder offset &= segment_size - 1; 128265ccfe21SAlex Elder 1283aafb230eSAlex Elder rbd_assert(length <= U64_MAX - offset); 128465ccfe21SAlex Elder if (offset + length > segment_size) 128565ccfe21SAlex Elder length = segment_size - offset; 128665ccfe21SAlex Elder 128765ccfe21SAlex Elder return length; 1288602adf40SYehuda Sadeh } 1289602adf40SYehuda Sadeh 1290602adf40SYehuda Sadeh /* 1291029bcbd8SJosh Durgin * returns the size of an object in the image 1292029bcbd8SJosh Durgin */ 1293029bcbd8SJosh Durgin static u64 rbd_obj_bytes(struct rbd_image_header *header) 1294029bcbd8SJosh Durgin { 1295029bcbd8SJosh Durgin return 1 << header->obj_order; 1296029bcbd8SJosh Durgin } 1297029bcbd8SJosh Durgin 1298029bcbd8SJosh Durgin /* 1299602adf40SYehuda Sadeh * bio helpers 1300602adf40SYehuda Sadeh */ 1301602adf40SYehuda Sadeh 1302602adf40SYehuda Sadeh static void bio_chain_put(struct bio *chain) 1303602adf40SYehuda Sadeh { 1304602adf40SYehuda Sadeh struct bio *tmp; 1305602adf40SYehuda Sadeh 1306602adf40SYehuda Sadeh while (chain) { 1307602adf40SYehuda Sadeh tmp = chain; 1308602adf40SYehuda Sadeh chain = chain->bi_next; 1309602adf40SYehuda Sadeh bio_put(tmp); 1310602adf40SYehuda Sadeh } 1311602adf40SYehuda Sadeh } 1312602adf40SYehuda Sadeh 1313602adf40SYehuda Sadeh /* 1314602adf40SYehuda Sadeh * zeros a bio chain, starting at specific offset 1315602adf40SYehuda Sadeh */ 1316602adf40SYehuda Sadeh static void zero_bio_chain(struct bio *chain, int start_ofs) 1317602adf40SYehuda Sadeh { 13187988613bSKent Overstreet struct bio_vec bv; 13197988613bSKent Overstreet struct bvec_iter iter; 1320602adf40SYehuda Sadeh unsigned long flags; 1321602adf40SYehuda Sadeh void *buf; 1322602adf40SYehuda Sadeh int pos = 0; 
1323602adf40SYehuda Sadeh 1324602adf40SYehuda Sadeh while (chain) { 13257988613bSKent Overstreet bio_for_each_segment(bv, chain, iter) { 13267988613bSKent Overstreet if (pos + bv.bv_len > start_ofs) { 1327602adf40SYehuda Sadeh int remainder = max(start_ofs - pos, 0); 13287988613bSKent Overstreet buf = bvec_kmap_irq(&bv, &flags); 1329602adf40SYehuda Sadeh memset(buf + remainder, 0, 13307988613bSKent Overstreet bv.bv_len - remainder); 13317988613bSKent Overstreet flush_dcache_page(bv.bv_page); 133285b5aaa6SDan Carpenter bvec_kunmap_irq(buf, &flags); 1333602adf40SYehuda Sadeh } 13347988613bSKent Overstreet pos += bv.bv_len; 1335602adf40SYehuda Sadeh } 1336602adf40SYehuda Sadeh 1337602adf40SYehuda Sadeh chain = chain->bi_next; 1338602adf40SYehuda Sadeh } 1339602adf40SYehuda Sadeh } 1340602adf40SYehuda Sadeh 1341602adf40SYehuda Sadeh /* 1342b9434c5bSAlex Elder * similar to zero_bio_chain(), zeros data defined by a page array, 1343b9434c5bSAlex Elder * starting at the given byte offset from the start of the array and 1344b9434c5bSAlex Elder * continuing up to the given end offset. The pages array is 1345b9434c5bSAlex Elder * assumed to be big enough to hold all bytes up to the end. 
1346b9434c5bSAlex Elder */ 1347b9434c5bSAlex Elder static void zero_pages(struct page **pages, u64 offset, u64 end) 1348b9434c5bSAlex Elder { 1349b9434c5bSAlex Elder struct page **page = &pages[offset >> PAGE_SHIFT]; 1350b9434c5bSAlex Elder 1351b9434c5bSAlex Elder rbd_assert(end > offset); 1352b9434c5bSAlex Elder rbd_assert(end - offset <= (u64)SIZE_MAX); 1353b9434c5bSAlex Elder while (offset < end) { 1354b9434c5bSAlex Elder size_t page_offset; 1355b9434c5bSAlex Elder size_t length; 1356b9434c5bSAlex Elder unsigned long flags; 1357b9434c5bSAlex Elder void *kaddr; 1358b9434c5bSAlex Elder 1359491205a8SGeert Uytterhoeven page_offset = offset & ~PAGE_MASK; 1360491205a8SGeert Uytterhoeven length = min_t(size_t, PAGE_SIZE - page_offset, end - offset); 1361b9434c5bSAlex Elder local_irq_save(flags); 1362b9434c5bSAlex Elder kaddr = kmap_atomic(*page); 1363b9434c5bSAlex Elder memset(kaddr + page_offset, 0, length); 1364e2156054SAlex Elder flush_dcache_page(*page); 1365b9434c5bSAlex Elder kunmap_atomic(kaddr); 1366b9434c5bSAlex Elder local_irq_restore(flags); 1367b9434c5bSAlex Elder 1368b9434c5bSAlex Elder offset += length; 1369b9434c5bSAlex Elder page++; 1370b9434c5bSAlex Elder } 1371b9434c5bSAlex Elder } 1372b9434c5bSAlex Elder 1373b9434c5bSAlex Elder /* 1374f7760dadSAlex Elder * Clone a portion of a bio, starting at the given byte offset 1375f7760dadSAlex Elder * and continuing for the number of bytes indicated. 
1376602adf40SYehuda Sadeh */ 1377f7760dadSAlex Elder static struct bio *bio_clone_range(struct bio *bio_src, 1378f7760dadSAlex Elder unsigned int offset, 1379f7760dadSAlex Elder unsigned int len, 1380f7760dadSAlex Elder gfp_t gfpmask) 1381602adf40SYehuda Sadeh { 1382f7760dadSAlex Elder struct bio *bio; 1383602adf40SYehuda Sadeh 13845341a627SKent Overstreet bio = bio_clone(bio_src, gfpmask); 1385f7760dadSAlex Elder if (!bio) 1386f7760dadSAlex Elder return NULL; /* ENOMEM */ 1387f7760dadSAlex Elder 13885341a627SKent Overstreet bio_advance(bio, offset); 13894f024f37SKent Overstreet bio->bi_iter.bi_size = len; 1390602adf40SYehuda Sadeh 1391f7760dadSAlex Elder return bio; 1392602adf40SYehuda Sadeh } 1393602adf40SYehuda Sadeh 1394f7760dadSAlex Elder /* 1395f7760dadSAlex Elder * Clone a portion of a bio chain, starting at the given byte offset 1396f7760dadSAlex Elder * into the first bio in the source chain and continuing for the 1397f7760dadSAlex Elder * number of bytes indicated. The result is another bio chain of 1398f7760dadSAlex Elder * exactly the given length, or a null pointer on error. 1399f7760dadSAlex Elder * 1400f7760dadSAlex Elder * The bio_src and offset parameters are both in-out. On entry they 1401f7760dadSAlex Elder * refer to the first source bio and the offset into that bio where 1402f7760dadSAlex Elder * the start of data to be cloned is located. 1403f7760dadSAlex Elder * 1404f7760dadSAlex Elder * On return, bio_src is updated to refer to the bio in the source 1405f7760dadSAlex Elder * chain that contains first un-cloned byte, and *offset will 1406f7760dadSAlex Elder * contain the offset of that byte within that bio. 
1407f7760dadSAlex Elder */ 1408f7760dadSAlex Elder static struct bio *bio_chain_clone_range(struct bio **bio_src, 1409f7760dadSAlex Elder unsigned int *offset, 1410f7760dadSAlex Elder unsigned int len, 1411f7760dadSAlex Elder gfp_t gfpmask) 1412f7760dadSAlex Elder { 1413f7760dadSAlex Elder struct bio *bi = *bio_src; 1414f7760dadSAlex Elder unsigned int off = *offset; 1415f7760dadSAlex Elder struct bio *chain = NULL; 1416f7760dadSAlex Elder struct bio **end; 1417602adf40SYehuda Sadeh 1418f7760dadSAlex Elder /* Build up a chain of clone bios up to the limit */ 1419602adf40SYehuda Sadeh 14204f024f37SKent Overstreet if (!bi || off >= bi->bi_iter.bi_size || !len) 1421f7760dadSAlex Elder return NULL; /* Nothing to clone */ 1422602adf40SYehuda Sadeh 1423f7760dadSAlex Elder end = &chain; 1424f7760dadSAlex Elder while (len) { 1425f7760dadSAlex Elder unsigned int bi_size; 1426f7760dadSAlex Elder struct bio *bio; 1427f7760dadSAlex Elder 1428f5400b7aSAlex Elder if (!bi) { 1429f5400b7aSAlex Elder rbd_warn(NULL, "bio_chain exhausted with %u left", len); 1430f7760dadSAlex Elder goto out_err; /* EINVAL; ran out of bio's */ 1431f5400b7aSAlex Elder } 14324f024f37SKent Overstreet bi_size = min_t(unsigned int, bi->bi_iter.bi_size - off, len); 1433f7760dadSAlex Elder bio = bio_clone_range(bi, off, bi_size, gfpmask); 1434f7760dadSAlex Elder if (!bio) 1435f7760dadSAlex Elder goto out_err; /* ENOMEM */ 1436f7760dadSAlex Elder 1437f7760dadSAlex Elder *end = bio; 1438f7760dadSAlex Elder end = &bio->bi_next; 1439f7760dadSAlex Elder 1440f7760dadSAlex Elder off += bi_size; 14414f024f37SKent Overstreet if (off == bi->bi_iter.bi_size) { 1442f7760dadSAlex Elder bi = bi->bi_next; 1443f7760dadSAlex Elder off = 0; 1444f7760dadSAlex Elder } 1445f7760dadSAlex Elder len -= bi_size; 1446f7760dadSAlex Elder } 1447f7760dadSAlex Elder *bio_src = bi; 1448f7760dadSAlex Elder *offset = off; 1449f7760dadSAlex Elder 1450f7760dadSAlex Elder return chain; 1451f7760dadSAlex Elder out_err: 1452f7760dadSAlex Elder 
bio_chain_put(chain); 1453f7760dadSAlex Elder 1454602adf40SYehuda Sadeh return NULL; 1455602adf40SYehuda Sadeh } 1456602adf40SYehuda Sadeh 1457926f9b3fSAlex Elder /* 1458926f9b3fSAlex Elder * The default/initial value for all object request flags is 0. For 1459926f9b3fSAlex Elder * each flag, once its value is set to 1 it is never reset to 0 1460926f9b3fSAlex Elder * again. 1461926f9b3fSAlex Elder */ 14626365d33aSAlex Elder static void obj_request_img_data_set(struct rbd_obj_request *obj_request) 14636365d33aSAlex Elder { 14646365d33aSAlex Elder if (test_and_set_bit(OBJ_REQ_IMG_DATA, &obj_request->flags)) { 14656365d33aSAlex Elder struct rbd_device *rbd_dev; 14666365d33aSAlex Elder 146757acbaa7SAlex Elder rbd_dev = obj_request->img_request->rbd_dev; 14689584d508SIlya Dryomov rbd_warn(rbd_dev, "obj_request %p already marked img_data", 14696365d33aSAlex Elder obj_request); 14706365d33aSAlex Elder } 14716365d33aSAlex Elder } 14726365d33aSAlex Elder 14736365d33aSAlex Elder static bool obj_request_img_data_test(struct rbd_obj_request *obj_request) 14746365d33aSAlex Elder { 14756365d33aSAlex Elder smp_mb(); 14766365d33aSAlex Elder return test_bit(OBJ_REQ_IMG_DATA, &obj_request->flags) != 0; 14776365d33aSAlex Elder } 14786365d33aSAlex Elder 147957acbaa7SAlex Elder static void obj_request_done_set(struct rbd_obj_request *obj_request) 148057acbaa7SAlex Elder { 148157acbaa7SAlex Elder if (test_and_set_bit(OBJ_REQ_DONE, &obj_request->flags)) { 148257acbaa7SAlex Elder struct rbd_device *rbd_dev = NULL; 148357acbaa7SAlex Elder 148457acbaa7SAlex Elder if (obj_request_img_data_test(obj_request)) 148557acbaa7SAlex Elder rbd_dev = obj_request->img_request->rbd_dev; 14869584d508SIlya Dryomov rbd_warn(rbd_dev, "obj_request %p already marked done", 148757acbaa7SAlex Elder obj_request); 148857acbaa7SAlex Elder } 148957acbaa7SAlex Elder } 149057acbaa7SAlex Elder 149157acbaa7SAlex Elder static bool obj_request_done_test(struct rbd_obj_request *obj_request) 149257acbaa7SAlex Elder { 
149357acbaa7SAlex Elder smp_mb(); 149457acbaa7SAlex Elder return test_bit(OBJ_REQ_DONE, &obj_request->flags) != 0; 149557acbaa7SAlex Elder } 149657acbaa7SAlex Elder 14975679c59fSAlex Elder /* 14985679c59fSAlex Elder * This sets the KNOWN flag after (possibly) setting the EXISTS 14995679c59fSAlex Elder * flag. The latter is set based on the "exists" value provided. 15005679c59fSAlex Elder * 15015679c59fSAlex Elder * Note that for our purposes once an object exists it never goes 15025679c59fSAlex Elder * away again. It's possible that the response from two existence 15035679c59fSAlex Elder * checks are separated by the creation of the target object, and 15045679c59fSAlex Elder * the first ("doesn't exist") response arrives *after* the second 15055679c59fSAlex Elder * ("does exist"). In that case we ignore the second one. 15065679c59fSAlex Elder */ 15075679c59fSAlex Elder static void obj_request_existence_set(struct rbd_obj_request *obj_request, 15085679c59fSAlex Elder bool exists) 15095679c59fSAlex Elder { 15105679c59fSAlex Elder if (exists) 15115679c59fSAlex Elder set_bit(OBJ_REQ_EXISTS, &obj_request->flags); 15125679c59fSAlex Elder set_bit(OBJ_REQ_KNOWN, &obj_request->flags); 15135679c59fSAlex Elder smp_mb(); 15145679c59fSAlex Elder } 15155679c59fSAlex Elder 15165679c59fSAlex Elder static bool obj_request_known_test(struct rbd_obj_request *obj_request) 15175679c59fSAlex Elder { 15185679c59fSAlex Elder smp_mb(); 15195679c59fSAlex Elder return test_bit(OBJ_REQ_KNOWN, &obj_request->flags) != 0; 15205679c59fSAlex Elder } 15215679c59fSAlex Elder 15225679c59fSAlex Elder static bool obj_request_exists_test(struct rbd_obj_request *obj_request) 15235679c59fSAlex Elder { 15245679c59fSAlex Elder smp_mb(); 15255679c59fSAlex Elder return test_bit(OBJ_REQ_EXISTS, &obj_request->flags) != 0; 15265679c59fSAlex Elder } 15275679c59fSAlex Elder 15289638556aSIlya Dryomov static bool obj_request_overlaps_parent(struct rbd_obj_request *obj_request) 15299638556aSIlya Dryomov { 
15309638556aSIlya Dryomov struct rbd_device *rbd_dev = obj_request->img_request->rbd_dev; 15319638556aSIlya Dryomov 15329638556aSIlya Dryomov return obj_request->img_offset < 15339638556aSIlya Dryomov round_up(rbd_dev->parent_overlap, rbd_obj_bytes(&rbd_dev->header)); 15349638556aSIlya Dryomov } 15359638556aSIlya Dryomov 1536bf0d5f50SAlex Elder static void rbd_obj_request_get(struct rbd_obj_request *obj_request) 1537bf0d5f50SAlex Elder { 153837206ee5SAlex Elder dout("%s: obj %p (was %d)\n", __func__, obj_request, 153937206ee5SAlex Elder atomic_read(&obj_request->kref.refcount)); 1540bf0d5f50SAlex Elder kref_get(&obj_request->kref); 1541bf0d5f50SAlex Elder } 1542bf0d5f50SAlex Elder 1543bf0d5f50SAlex Elder static void rbd_obj_request_destroy(struct kref *kref); 1544bf0d5f50SAlex Elder static void rbd_obj_request_put(struct rbd_obj_request *obj_request) 1545bf0d5f50SAlex Elder { 1546bf0d5f50SAlex Elder rbd_assert(obj_request != NULL); 154737206ee5SAlex Elder dout("%s: obj %p (was %d)\n", __func__, obj_request, 154837206ee5SAlex Elder atomic_read(&obj_request->kref.refcount)); 1549bf0d5f50SAlex Elder kref_put(&obj_request->kref, rbd_obj_request_destroy); 1550bf0d5f50SAlex Elder } 1551bf0d5f50SAlex Elder 15520f2d5be7SAlex Elder static void rbd_img_request_get(struct rbd_img_request *img_request) 15530f2d5be7SAlex Elder { 15540f2d5be7SAlex Elder dout("%s: img %p (was %d)\n", __func__, img_request, 15550f2d5be7SAlex Elder atomic_read(&img_request->kref.refcount)); 15560f2d5be7SAlex Elder kref_get(&img_request->kref); 15570f2d5be7SAlex Elder } 15580f2d5be7SAlex Elder 1559e93f3152SAlex Elder static bool img_request_child_test(struct rbd_img_request *img_request); 1560e93f3152SAlex Elder static void rbd_parent_request_destroy(struct kref *kref); 1561bf0d5f50SAlex Elder static void rbd_img_request_destroy(struct kref *kref); 1562bf0d5f50SAlex Elder static void rbd_img_request_put(struct rbd_img_request *img_request) 1563bf0d5f50SAlex Elder { 1564bf0d5f50SAlex Elder 
rbd_assert(img_request != NULL); 156537206ee5SAlex Elder dout("%s: img %p (was %d)\n", __func__, img_request, 156637206ee5SAlex Elder atomic_read(&img_request->kref.refcount)); 1567e93f3152SAlex Elder if (img_request_child_test(img_request)) 1568e93f3152SAlex Elder kref_put(&img_request->kref, rbd_parent_request_destroy); 1569e93f3152SAlex Elder else 1570bf0d5f50SAlex Elder kref_put(&img_request->kref, rbd_img_request_destroy); 1571bf0d5f50SAlex Elder } 1572bf0d5f50SAlex Elder 1573bf0d5f50SAlex Elder static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request, 1574bf0d5f50SAlex Elder struct rbd_obj_request *obj_request) 1575bf0d5f50SAlex Elder { 157625dcf954SAlex Elder rbd_assert(obj_request->img_request == NULL); 157725dcf954SAlex Elder 1578b155e86cSAlex Elder /* Image request now owns object's original reference */ 1579bf0d5f50SAlex Elder obj_request->img_request = img_request; 158025dcf954SAlex Elder obj_request->which = img_request->obj_request_count; 15816365d33aSAlex Elder rbd_assert(!obj_request_img_data_test(obj_request)); 15826365d33aSAlex Elder obj_request_img_data_set(obj_request); 1583bf0d5f50SAlex Elder rbd_assert(obj_request->which != BAD_WHICH); 158425dcf954SAlex Elder img_request->obj_request_count++; 158525dcf954SAlex Elder list_add_tail(&obj_request->links, &img_request->obj_requests); 158637206ee5SAlex Elder dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request, 158737206ee5SAlex Elder obj_request->which); 1588bf0d5f50SAlex Elder } 1589bf0d5f50SAlex Elder 1590bf0d5f50SAlex Elder static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request, 1591bf0d5f50SAlex Elder struct rbd_obj_request *obj_request) 1592bf0d5f50SAlex Elder { 1593bf0d5f50SAlex Elder rbd_assert(obj_request->which != BAD_WHICH); 159425dcf954SAlex Elder 159537206ee5SAlex Elder dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request, 159637206ee5SAlex Elder obj_request->which); 1597bf0d5f50SAlex Elder 
list_del(&obj_request->links); 159825dcf954SAlex Elder rbd_assert(img_request->obj_request_count > 0); 159925dcf954SAlex Elder img_request->obj_request_count--; 160025dcf954SAlex Elder rbd_assert(obj_request->which == img_request->obj_request_count); 160125dcf954SAlex Elder obj_request->which = BAD_WHICH; 16026365d33aSAlex Elder rbd_assert(obj_request_img_data_test(obj_request)); 1603bf0d5f50SAlex Elder rbd_assert(obj_request->img_request == img_request); 1604bf0d5f50SAlex Elder obj_request->img_request = NULL; 160525dcf954SAlex Elder obj_request->callback = NULL; 1606bf0d5f50SAlex Elder rbd_obj_request_put(obj_request); 1607bf0d5f50SAlex Elder } 1608bf0d5f50SAlex Elder 1609bf0d5f50SAlex Elder static bool obj_request_type_valid(enum obj_request_type type) 1610bf0d5f50SAlex Elder { 1611bf0d5f50SAlex Elder switch (type) { 16129969ebc5SAlex Elder case OBJ_REQUEST_NODATA: 1613bf0d5f50SAlex Elder case OBJ_REQUEST_BIO: 1614788e2df3SAlex Elder case OBJ_REQUEST_PAGES: 1615bf0d5f50SAlex Elder return true; 1616bf0d5f50SAlex Elder default: 1617bf0d5f50SAlex Elder return false; 1618bf0d5f50SAlex Elder } 1619bf0d5f50SAlex Elder } 1620bf0d5f50SAlex Elder 16214a17dadcSIlya Dryomov static void rbd_img_obj_callback(struct rbd_obj_request *obj_request); 16224a17dadcSIlya Dryomov 1623980917fcSIlya Dryomov static void rbd_obj_request_submit(struct rbd_obj_request *obj_request) 1624bf0d5f50SAlex Elder { 1625980917fcSIlya Dryomov struct ceph_osd_request *osd_req = obj_request->osd_req; 1626980917fcSIlya Dryomov 1627980917fcSIlya Dryomov dout("%s %p osd_req %p\n", __func__, obj_request, osd_req); 16284a17dadcSIlya Dryomov if (obj_request_img_data_test(obj_request)) { 16294a17dadcSIlya Dryomov WARN_ON(obj_request->callback != rbd_img_obj_callback); 16304a17dadcSIlya Dryomov rbd_img_request_get(obj_request->img_request); 16314a17dadcSIlya Dryomov } 1632980917fcSIlya Dryomov ceph_osdc_start_request(osd_req->r_osdc, osd_req, false); 1633bf0d5f50SAlex Elder } 1634bf0d5f50SAlex Elder 
163571c20a06SIlya Dryomov static void rbd_obj_request_end(struct rbd_obj_request *obj_request) 163671c20a06SIlya Dryomov { 163771c20a06SIlya Dryomov dout("%s %p\n", __func__, obj_request); 163871c20a06SIlya Dryomov ceph_osdc_cancel_request(obj_request->osd_req); 163971c20a06SIlya Dryomov } 164071c20a06SIlya Dryomov 164171c20a06SIlya Dryomov /* 164271c20a06SIlya Dryomov * Wait for an object request to complete. If interrupted, cancel the 164371c20a06SIlya Dryomov * underlying osd request. 16442894e1d7SIlya Dryomov * 16452894e1d7SIlya Dryomov * @timeout: in jiffies, 0 means "wait forever" 164671c20a06SIlya Dryomov */ 16472894e1d7SIlya Dryomov static int __rbd_obj_request_wait(struct rbd_obj_request *obj_request, 16482894e1d7SIlya Dryomov unsigned long timeout) 164971c20a06SIlya Dryomov { 16502894e1d7SIlya Dryomov long ret; 165171c20a06SIlya Dryomov 165271c20a06SIlya Dryomov dout("%s %p\n", __func__, obj_request); 16532894e1d7SIlya Dryomov ret = wait_for_completion_interruptible_timeout( 16542894e1d7SIlya Dryomov &obj_request->completion, 16552894e1d7SIlya Dryomov ceph_timeout_jiffies(timeout)); 16562894e1d7SIlya Dryomov if (ret <= 0) { 16572894e1d7SIlya Dryomov if (ret == 0) 16582894e1d7SIlya Dryomov ret = -ETIMEDOUT; 165971c20a06SIlya Dryomov rbd_obj_request_end(obj_request); 16602894e1d7SIlya Dryomov } else { 16612894e1d7SIlya Dryomov ret = 0; 16622894e1d7SIlya Dryomov } 16632894e1d7SIlya Dryomov 16642894e1d7SIlya Dryomov dout("%s %p ret %d\n", __func__, obj_request, (int)ret); 166571c20a06SIlya Dryomov return ret; 166671c20a06SIlya Dryomov } 166771c20a06SIlya Dryomov 16682894e1d7SIlya Dryomov static int rbd_obj_request_wait(struct rbd_obj_request *obj_request) 16692894e1d7SIlya Dryomov { 16702894e1d7SIlya Dryomov return __rbd_obj_request_wait(obj_request, 0); 16712894e1d7SIlya Dryomov } 16722894e1d7SIlya Dryomov 1673bf0d5f50SAlex Elder static void rbd_img_request_complete(struct rbd_img_request *img_request) 1674bf0d5f50SAlex Elder { 167555f27e09SAlex Elder 
167637206ee5SAlex Elder dout("%s: img %p\n", __func__, img_request); 167755f27e09SAlex Elder 167855f27e09SAlex Elder /* 167955f27e09SAlex Elder * If no error occurred, compute the aggregate transfer 168055f27e09SAlex Elder * count for the image request. We could instead use 168155f27e09SAlex Elder * atomic64_cmpxchg() to update it as each object request 168255f27e09SAlex Elder * completes; not clear which way is better off hand. 168355f27e09SAlex Elder */ 168455f27e09SAlex Elder if (!img_request->result) { 168555f27e09SAlex Elder struct rbd_obj_request *obj_request; 168655f27e09SAlex Elder u64 xferred = 0; 168755f27e09SAlex Elder 168855f27e09SAlex Elder for_each_obj_request(img_request, obj_request) 168955f27e09SAlex Elder xferred += obj_request->xferred; 169055f27e09SAlex Elder img_request->xferred = xferred; 169155f27e09SAlex Elder } 169255f27e09SAlex Elder 1693bf0d5f50SAlex Elder if (img_request->callback) 1694bf0d5f50SAlex Elder img_request->callback(img_request); 1695bf0d5f50SAlex Elder else 1696bf0d5f50SAlex Elder rbd_img_request_put(img_request); 1697bf0d5f50SAlex Elder } 1698bf0d5f50SAlex Elder 16990c425248SAlex Elder /* 17000c425248SAlex Elder * The default/initial value for all image request flags is 0. Each 17010c425248SAlex Elder * is conditionally set to 1 at image request initialization time 17020c425248SAlex Elder * and currently never change thereafter. 
17030c425248SAlex Elder */ 17040c425248SAlex Elder static void img_request_write_set(struct rbd_img_request *img_request) 17050c425248SAlex Elder { 17060c425248SAlex Elder set_bit(IMG_REQ_WRITE, &img_request->flags); 17070c425248SAlex Elder smp_mb(); 17080c425248SAlex Elder } 17090c425248SAlex Elder 17100c425248SAlex Elder static bool img_request_write_test(struct rbd_img_request *img_request) 17110c425248SAlex Elder { 17120c425248SAlex Elder smp_mb(); 17130c425248SAlex Elder return test_bit(IMG_REQ_WRITE, &img_request->flags) != 0; 17140c425248SAlex Elder } 17150c425248SAlex Elder 171690e98c52SGuangliang Zhao /* 171790e98c52SGuangliang Zhao * Set the discard flag when the img_request is an discard request 171890e98c52SGuangliang Zhao */ 171990e98c52SGuangliang Zhao static void img_request_discard_set(struct rbd_img_request *img_request) 172090e98c52SGuangliang Zhao { 172190e98c52SGuangliang Zhao set_bit(IMG_REQ_DISCARD, &img_request->flags); 172290e98c52SGuangliang Zhao smp_mb(); 172390e98c52SGuangliang Zhao } 172490e98c52SGuangliang Zhao 172590e98c52SGuangliang Zhao static bool img_request_discard_test(struct rbd_img_request *img_request) 172690e98c52SGuangliang Zhao { 172790e98c52SGuangliang Zhao smp_mb(); 172890e98c52SGuangliang Zhao return test_bit(IMG_REQ_DISCARD, &img_request->flags) != 0; 172990e98c52SGuangliang Zhao } 173090e98c52SGuangliang Zhao 17319849e986SAlex Elder static void img_request_child_set(struct rbd_img_request *img_request) 17329849e986SAlex Elder { 17339849e986SAlex Elder set_bit(IMG_REQ_CHILD, &img_request->flags); 17349849e986SAlex Elder smp_mb(); 17359849e986SAlex Elder } 17369849e986SAlex Elder 1737e93f3152SAlex Elder static void img_request_child_clear(struct rbd_img_request *img_request) 1738e93f3152SAlex Elder { 1739e93f3152SAlex Elder clear_bit(IMG_REQ_CHILD, &img_request->flags); 1740e93f3152SAlex Elder smp_mb(); 1741e93f3152SAlex Elder } 1742e93f3152SAlex Elder 17439849e986SAlex Elder static bool img_request_child_test(struct 
rbd_img_request *img_request) 17449849e986SAlex Elder { 17459849e986SAlex Elder smp_mb(); 17469849e986SAlex Elder return test_bit(IMG_REQ_CHILD, &img_request->flags) != 0; 17479849e986SAlex Elder } 17489849e986SAlex Elder 1749d0b2e944SAlex Elder static void img_request_layered_set(struct rbd_img_request *img_request) 1750d0b2e944SAlex Elder { 1751d0b2e944SAlex Elder set_bit(IMG_REQ_LAYERED, &img_request->flags); 1752d0b2e944SAlex Elder smp_mb(); 1753d0b2e944SAlex Elder } 1754d0b2e944SAlex Elder 1755a2acd00eSAlex Elder static void img_request_layered_clear(struct rbd_img_request *img_request) 1756a2acd00eSAlex Elder { 1757a2acd00eSAlex Elder clear_bit(IMG_REQ_LAYERED, &img_request->flags); 1758a2acd00eSAlex Elder smp_mb(); 1759a2acd00eSAlex Elder } 1760a2acd00eSAlex Elder 1761d0b2e944SAlex Elder static bool img_request_layered_test(struct rbd_img_request *img_request) 1762d0b2e944SAlex Elder { 1763d0b2e944SAlex Elder smp_mb(); 1764d0b2e944SAlex Elder return test_bit(IMG_REQ_LAYERED, &img_request->flags) != 0; 1765d0b2e944SAlex Elder } 1766d0b2e944SAlex Elder 17673b434a2aSJosh Durgin static enum obj_operation_type 17683b434a2aSJosh Durgin rbd_img_request_op_type(struct rbd_img_request *img_request) 17693b434a2aSJosh Durgin { 17703b434a2aSJosh Durgin if (img_request_write_test(img_request)) 17713b434a2aSJosh Durgin return OBJ_OP_WRITE; 17723b434a2aSJosh Durgin else if (img_request_discard_test(img_request)) 17733b434a2aSJosh Durgin return OBJ_OP_DISCARD; 17743b434a2aSJosh Durgin else 17753b434a2aSJosh Durgin return OBJ_OP_READ; 17763b434a2aSJosh Durgin } 17773b434a2aSJosh Durgin 17786e2a4505SAlex Elder static void 17796e2a4505SAlex Elder rbd_img_obj_request_read_callback(struct rbd_obj_request *obj_request) 17806e2a4505SAlex Elder { 1781b9434c5bSAlex Elder u64 xferred = obj_request->xferred; 1782b9434c5bSAlex Elder u64 length = obj_request->length; 1783b9434c5bSAlex Elder 17846e2a4505SAlex Elder dout("%s: obj %p img %p result %d %llu/%llu\n", __func__, 
17856e2a4505SAlex Elder obj_request, obj_request->img_request, obj_request->result, 1786b9434c5bSAlex Elder xferred, length); 17876e2a4505SAlex Elder /* 178817c1cc1dSJosh Durgin * ENOENT means a hole in the image. We zero-fill the entire 178917c1cc1dSJosh Durgin * length of the request. A short read also implies zero-fill 179017c1cc1dSJosh Durgin * to the end of the request. An error requires the whole 179117c1cc1dSJosh Durgin * length of the request to be reported finished with an error 179217c1cc1dSJosh Durgin * to the block layer. In each case we update the xferred 179317c1cc1dSJosh Durgin * count to indicate the whole request was satisfied. 17946e2a4505SAlex Elder */ 1795b9434c5bSAlex Elder rbd_assert(obj_request->type != OBJ_REQUEST_NODATA); 17966e2a4505SAlex Elder if (obj_request->result == -ENOENT) { 1797b9434c5bSAlex Elder if (obj_request->type == OBJ_REQUEST_BIO) 17986e2a4505SAlex Elder zero_bio_chain(obj_request->bio_list, 0); 1799b9434c5bSAlex Elder else 1800b9434c5bSAlex Elder zero_pages(obj_request->pages, 0, length); 18016e2a4505SAlex Elder obj_request->result = 0; 1802b9434c5bSAlex Elder } else if (xferred < length && !obj_request->result) { 1803b9434c5bSAlex Elder if (obj_request->type == OBJ_REQUEST_BIO) 1804b9434c5bSAlex Elder zero_bio_chain(obj_request->bio_list, xferred); 1805b9434c5bSAlex Elder else 1806b9434c5bSAlex Elder zero_pages(obj_request->pages, xferred, length); 18076e2a4505SAlex Elder } 180817c1cc1dSJosh Durgin obj_request->xferred = length; 18096e2a4505SAlex Elder obj_request_done_set(obj_request); 18106e2a4505SAlex Elder } 18116e2a4505SAlex Elder 1812bf0d5f50SAlex Elder static void rbd_obj_request_complete(struct rbd_obj_request *obj_request) 1813bf0d5f50SAlex Elder { 181437206ee5SAlex Elder dout("%s: obj %p cb %p\n", __func__, obj_request, 181537206ee5SAlex Elder obj_request->callback); 1816bf0d5f50SAlex Elder if (obj_request->callback) 1817bf0d5f50SAlex Elder obj_request->callback(obj_request); 1818788e2df3SAlex Elder else 
1819788e2df3SAlex Elder complete_all(&obj_request->completion); 1820bf0d5f50SAlex Elder } 1821bf0d5f50SAlex Elder 1822c47f9371SAlex Elder static void rbd_osd_read_callback(struct rbd_obj_request *obj_request) 1823bf0d5f50SAlex Elder { 182457acbaa7SAlex Elder struct rbd_img_request *img_request = NULL; 1825a9e8ba2cSAlex Elder struct rbd_device *rbd_dev = NULL; 182657acbaa7SAlex Elder bool layered = false; 182757acbaa7SAlex Elder 182857acbaa7SAlex Elder if (obj_request_img_data_test(obj_request)) { 182957acbaa7SAlex Elder img_request = obj_request->img_request; 183057acbaa7SAlex Elder layered = img_request && img_request_layered_test(img_request); 1831a9e8ba2cSAlex Elder rbd_dev = img_request->rbd_dev; 183257acbaa7SAlex Elder } 18338b3e1a56SAlex Elder 18348b3e1a56SAlex Elder dout("%s: obj %p img %p result %d %llu/%llu\n", __func__, 18358b3e1a56SAlex Elder obj_request, img_request, obj_request->result, 18368b3e1a56SAlex Elder obj_request->xferred, obj_request->length); 1837a9e8ba2cSAlex Elder if (layered && obj_request->result == -ENOENT && 1838a9e8ba2cSAlex Elder obj_request->img_offset < rbd_dev->parent_overlap) 18398b3e1a56SAlex Elder rbd_img_parent_read(obj_request); 18408b3e1a56SAlex Elder else if (img_request) 18416e2a4505SAlex Elder rbd_img_obj_request_read_callback(obj_request); 18426e2a4505SAlex Elder else 184307741308SAlex Elder obj_request_done_set(obj_request); 1844bf0d5f50SAlex Elder } 1845bf0d5f50SAlex Elder 1846c47f9371SAlex Elder static void rbd_osd_write_callback(struct rbd_obj_request *obj_request) 1847bf0d5f50SAlex Elder { 18481b83bef2SSage Weil dout("%s: obj %p result %d %llu\n", __func__, obj_request, 18491b83bef2SSage Weil obj_request->result, obj_request->length); 18501b83bef2SSage Weil /* 18518b3e1a56SAlex Elder * There is no such thing as a successful short write. Set 18528b3e1a56SAlex Elder * it to our originally-requested length. 
18531b83bef2SSage Weil */ 18541b83bef2SSage Weil obj_request->xferred = obj_request->length; 185507741308SAlex Elder obj_request_done_set(obj_request); 1856bf0d5f50SAlex Elder } 1857bf0d5f50SAlex Elder 185890e98c52SGuangliang Zhao static void rbd_osd_discard_callback(struct rbd_obj_request *obj_request) 185990e98c52SGuangliang Zhao { 186090e98c52SGuangliang Zhao dout("%s: obj %p result %d %llu\n", __func__, obj_request, 186190e98c52SGuangliang Zhao obj_request->result, obj_request->length); 186290e98c52SGuangliang Zhao /* 186390e98c52SGuangliang Zhao * There is no such thing as a successful short discard. Set 186490e98c52SGuangliang Zhao * it to our originally-requested length. 186590e98c52SGuangliang Zhao */ 186690e98c52SGuangliang Zhao obj_request->xferred = obj_request->length; 1867d0265de7SJosh Durgin /* discarding a non-existent object is not a problem */ 1868d0265de7SJosh Durgin if (obj_request->result == -ENOENT) 1869d0265de7SJosh Durgin obj_request->result = 0; 187090e98c52SGuangliang Zhao obj_request_done_set(obj_request); 187190e98c52SGuangliang Zhao } 187290e98c52SGuangliang Zhao 1873fbfab539SAlex Elder /* 1874fbfab539SAlex Elder * For a simple stat call there's nothing to do. We'll do more if 1875fbfab539SAlex Elder * this is part of a write sequence for a layered image. 
1876fbfab539SAlex Elder */ 1877c47f9371SAlex Elder static void rbd_osd_stat_callback(struct rbd_obj_request *obj_request) 1878fbfab539SAlex Elder { 187937206ee5SAlex Elder dout("%s: obj %p\n", __func__, obj_request); 1880fbfab539SAlex Elder obj_request_done_set(obj_request); 1881fbfab539SAlex Elder } 1882fbfab539SAlex Elder 18832761713dSIlya Dryomov static void rbd_osd_call_callback(struct rbd_obj_request *obj_request) 18842761713dSIlya Dryomov { 18852761713dSIlya Dryomov dout("%s: obj %p\n", __func__, obj_request); 18862761713dSIlya Dryomov 18872761713dSIlya Dryomov if (obj_request_img_data_test(obj_request)) 18882761713dSIlya Dryomov rbd_osd_copyup_callback(obj_request); 18892761713dSIlya Dryomov else 18902761713dSIlya Dryomov obj_request_done_set(obj_request); 18912761713dSIlya Dryomov } 18922761713dSIlya Dryomov 189385e084feSIlya Dryomov static void rbd_osd_req_callback(struct ceph_osd_request *osd_req) 1894bf0d5f50SAlex Elder { 1895bf0d5f50SAlex Elder struct rbd_obj_request *obj_request = osd_req->r_priv; 1896bf0d5f50SAlex Elder u16 opcode; 1897bf0d5f50SAlex Elder 189885e084feSIlya Dryomov dout("%s: osd_req %p\n", __func__, osd_req); 1899bf0d5f50SAlex Elder rbd_assert(osd_req == obj_request->osd_req); 190057acbaa7SAlex Elder if (obj_request_img_data_test(obj_request)) { 190157acbaa7SAlex Elder rbd_assert(obj_request->img_request); 190257acbaa7SAlex Elder rbd_assert(obj_request->which != BAD_WHICH); 190357acbaa7SAlex Elder } else { 190457acbaa7SAlex Elder rbd_assert(obj_request->which == BAD_WHICH); 190557acbaa7SAlex Elder } 1906bf0d5f50SAlex Elder 19071b83bef2SSage Weil if (osd_req->r_result < 0) 19081b83bef2SSage Weil obj_request->result = osd_req->r_result; 1909bf0d5f50SAlex Elder 1910c47f9371SAlex Elder /* 1911c47f9371SAlex Elder * We support a 64-bit length, but ultimately it has to be 19127ad18afaSChristoph Hellwig * passed to the block layer, which just supports a 32-bit 19137ad18afaSChristoph Hellwig * length field. 
1914c47f9371SAlex Elder */ 19157665d85bSYan, Zheng obj_request->xferred = osd_req->r_ops[0].outdata_len; 1916c47f9371SAlex Elder rbd_assert(obj_request->xferred < (u64)UINT_MAX); 19170ccd5926SIlya Dryomov 191879528734SAlex Elder opcode = osd_req->r_ops[0].op; 1919bf0d5f50SAlex Elder switch (opcode) { 1920bf0d5f50SAlex Elder case CEPH_OSD_OP_READ: 1921c47f9371SAlex Elder rbd_osd_read_callback(obj_request); 1922bf0d5f50SAlex Elder break; 19230ccd5926SIlya Dryomov case CEPH_OSD_OP_SETALLOCHINT: 1924e30b7577SIlya Dryomov rbd_assert(osd_req->r_ops[1].op == CEPH_OSD_OP_WRITE || 1925e30b7577SIlya Dryomov osd_req->r_ops[1].op == CEPH_OSD_OP_WRITEFULL); 19260ccd5926SIlya Dryomov /* fall through */ 1927bf0d5f50SAlex Elder case CEPH_OSD_OP_WRITE: 1928e30b7577SIlya Dryomov case CEPH_OSD_OP_WRITEFULL: 1929c47f9371SAlex Elder rbd_osd_write_callback(obj_request); 1930bf0d5f50SAlex Elder break; 1931fbfab539SAlex Elder case CEPH_OSD_OP_STAT: 1932c47f9371SAlex Elder rbd_osd_stat_callback(obj_request); 1933fbfab539SAlex Elder break; 193490e98c52SGuangliang Zhao case CEPH_OSD_OP_DELETE: 193590e98c52SGuangliang Zhao case CEPH_OSD_OP_TRUNCATE: 193690e98c52SGuangliang Zhao case CEPH_OSD_OP_ZERO: 193790e98c52SGuangliang Zhao rbd_osd_discard_callback(obj_request); 193890e98c52SGuangliang Zhao break; 193936be9a76SAlex Elder case CEPH_OSD_OP_CALL: 19402761713dSIlya Dryomov rbd_osd_call_callback(obj_request); 19412761713dSIlya Dryomov break; 1942bf0d5f50SAlex Elder default: 19439584d508SIlya Dryomov rbd_warn(NULL, "%s: unsupported op %hu", 1944bf0d5f50SAlex Elder obj_request->object_name, (unsigned short) opcode); 1945bf0d5f50SAlex Elder break; 1946bf0d5f50SAlex Elder } 1947bf0d5f50SAlex Elder 194807741308SAlex Elder if (obj_request_done_test(obj_request)) 1949bf0d5f50SAlex Elder rbd_obj_request_complete(obj_request); 1950bf0d5f50SAlex Elder } 1951bf0d5f50SAlex Elder 19529d4df01fSAlex Elder static void rbd_osd_req_format_read(struct rbd_obj_request *obj_request) 1953430c28c3SAlex Elder { 
19548c042b0dSAlex Elder struct ceph_osd_request *osd_req = obj_request->osd_req; 1955430c28c3SAlex Elder 19567c84883aSIlya Dryomov rbd_assert(obj_request_img_data_test(obj_request)); 19577c84883aSIlya Dryomov osd_req->r_snapid = obj_request->img_request->snap_id; 19589d4df01fSAlex Elder } 19599d4df01fSAlex Elder 19609d4df01fSAlex Elder static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request) 19619d4df01fSAlex Elder { 19629d4df01fSAlex Elder struct ceph_osd_request *osd_req = obj_request->osd_req; 19639d4df01fSAlex Elder 1964bb873b53SIlya Dryomov osd_req->r_mtime = CURRENT_TIME; 1965bb873b53SIlya Dryomov osd_req->r_data_offset = obj_request->offset; 1966430c28c3SAlex Elder } 1967430c28c3SAlex Elder 19680ccd5926SIlya Dryomov /* 19690ccd5926SIlya Dryomov * Create an osd request. A read request has one osd op (read). 19700ccd5926SIlya Dryomov * A write request has either one (watch) or two (hint+write) osd ops. 19710ccd5926SIlya Dryomov * (All rbd data writes are prefixed with an allocation hint op, but 19720ccd5926SIlya Dryomov * technically osd watch is a write request, hence this distinction.) 
19730ccd5926SIlya Dryomov */ 1974bf0d5f50SAlex Elder static struct ceph_osd_request *rbd_osd_req_create( 1975bf0d5f50SAlex Elder struct rbd_device *rbd_dev, 19766d2940c8SGuangliang Zhao enum obj_operation_type op_type, 1977deb236b3SIlya Dryomov unsigned int num_ops, 1978430c28c3SAlex Elder struct rbd_obj_request *obj_request) 1979bf0d5f50SAlex Elder { 1980bf0d5f50SAlex Elder struct ceph_snap_context *snapc = NULL; 1981bf0d5f50SAlex Elder struct ceph_osd_client *osdc; 1982bf0d5f50SAlex Elder struct ceph_osd_request *osd_req; 1983bf0d5f50SAlex Elder 198490e98c52SGuangliang Zhao if (obj_request_img_data_test(obj_request) && 198590e98c52SGuangliang Zhao (op_type == OBJ_OP_DISCARD || op_type == OBJ_OP_WRITE)) { 19866365d33aSAlex Elder struct rbd_img_request *img_request = obj_request->img_request; 198790e98c52SGuangliang Zhao if (op_type == OBJ_OP_WRITE) { 19886d2940c8SGuangliang Zhao rbd_assert(img_request_write_test(img_request)); 198990e98c52SGuangliang Zhao } else { 199090e98c52SGuangliang Zhao rbd_assert(img_request_discard_test(img_request)); 199190e98c52SGuangliang Zhao } 1992bf0d5f50SAlex Elder snapc = img_request->snapc; 1993bf0d5f50SAlex Elder } 1994bf0d5f50SAlex Elder 19956d2940c8SGuangliang Zhao rbd_assert(num_ops == 1 || ((op_type == OBJ_OP_WRITE) && num_ops == 2)); 1996deb236b3SIlya Dryomov 1997deb236b3SIlya Dryomov /* Allocate and initialize the request, for the num_ops ops */ 1998bf0d5f50SAlex Elder 1999bf0d5f50SAlex Elder osdc = &rbd_dev->rbd_client->client->osdc; 2000deb236b3SIlya Dryomov osd_req = ceph_osdc_alloc_request(osdc, snapc, num_ops, false, 20012224d879SDavid Disseldorp GFP_NOIO); 2002bf0d5f50SAlex Elder if (!osd_req) 200313d1ad16SIlya Dryomov goto fail; 2004bf0d5f50SAlex Elder 200590e98c52SGuangliang Zhao if (op_type == OBJ_OP_WRITE || op_type == OBJ_OP_DISCARD) 2006bf0d5f50SAlex Elder osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK; 2007430c28c3SAlex Elder else 2008bf0d5f50SAlex Elder osd_req->r_flags = CEPH_OSD_FLAG_READ; 
2009bf0d5f50SAlex Elder 2010bf0d5f50SAlex Elder osd_req->r_callback = rbd_osd_req_callback; 2011bf0d5f50SAlex Elder osd_req->r_priv = obj_request; 2012bf0d5f50SAlex Elder 20137627151eSYan, Zheng osd_req->r_base_oloc.pool = rbd_dev->layout.pool_id; 2014d30291b9SIlya Dryomov if (ceph_oid_aprintf(&osd_req->r_base_oid, GFP_NOIO, "%s", 2015d30291b9SIlya Dryomov obj_request->object_name)) 2016d30291b9SIlya Dryomov goto fail; 2017bf0d5f50SAlex Elder 201813d1ad16SIlya Dryomov if (ceph_osdc_alloc_messages(osd_req, GFP_NOIO)) 201913d1ad16SIlya Dryomov goto fail; 202013d1ad16SIlya Dryomov 2021bf0d5f50SAlex Elder return osd_req; 202213d1ad16SIlya Dryomov 202313d1ad16SIlya Dryomov fail: 202413d1ad16SIlya Dryomov ceph_osdc_put_request(osd_req); 202513d1ad16SIlya Dryomov return NULL; 2026bf0d5f50SAlex Elder } 2027bf0d5f50SAlex Elder 20280eefd470SAlex Elder /* 2029d3246fb0SJosh Durgin * Create a copyup osd request based on the information in the object 2030d3246fb0SJosh Durgin * request supplied. A copyup request has two or three osd ops, a 2031d3246fb0SJosh Durgin * copyup method call, potentially a hint op, and a write or truncate 2032d3246fb0SJosh Durgin * or zero op. 
20330eefd470SAlex Elder */ 20340eefd470SAlex Elder static struct ceph_osd_request * 20350eefd470SAlex Elder rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request) 20360eefd470SAlex Elder { 20370eefd470SAlex Elder struct rbd_img_request *img_request; 20380eefd470SAlex Elder struct ceph_snap_context *snapc; 20390eefd470SAlex Elder struct rbd_device *rbd_dev; 20400eefd470SAlex Elder struct ceph_osd_client *osdc; 20410eefd470SAlex Elder struct ceph_osd_request *osd_req; 2042d3246fb0SJosh Durgin int num_osd_ops = 3; 20430eefd470SAlex Elder 20440eefd470SAlex Elder rbd_assert(obj_request_img_data_test(obj_request)); 20450eefd470SAlex Elder img_request = obj_request->img_request; 20460eefd470SAlex Elder rbd_assert(img_request); 2047d3246fb0SJosh Durgin rbd_assert(img_request_write_test(img_request) || 2048d3246fb0SJosh Durgin img_request_discard_test(img_request)); 20490eefd470SAlex Elder 2050d3246fb0SJosh Durgin if (img_request_discard_test(img_request)) 2051d3246fb0SJosh Durgin num_osd_ops = 2; 2052d3246fb0SJosh Durgin 2053d3246fb0SJosh Durgin /* Allocate and initialize the request, for all the ops */ 20540eefd470SAlex Elder 20550eefd470SAlex Elder snapc = img_request->snapc; 20560eefd470SAlex Elder rbd_dev = img_request->rbd_dev; 20570eefd470SAlex Elder osdc = &rbd_dev->rbd_client->client->osdc; 2058d3246fb0SJosh Durgin osd_req = ceph_osdc_alloc_request(osdc, snapc, num_osd_ops, 20592224d879SDavid Disseldorp false, GFP_NOIO); 20600eefd470SAlex Elder if (!osd_req) 206113d1ad16SIlya Dryomov goto fail; 20620eefd470SAlex Elder 20630eefd470SAlex Elder osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK; 20640eefd470SAlex Elder osd_req->r_callback = rbd_osd_req_callback; 20650eefd470SAlex Elder osd_req->r_priv = obj_request; 20660eefd470SAlex Elder 20677627151eSYan, Zheng osd_req->r_base_oloc.pool = rbd_dev->layout.pool_id; 2068d30291b9SIlya Dryomov if (ceph_oid_aprintf(&osd_req->r_base_oid, GFP_NOIO, "%s", 2069d30291b9SIlya Dryomov 
obj_request->object_name)) 2070d30291b9SIlya Dryomov goto fail; 20710eefd470SAlex Elder 207213d1ad16SIlya Dryomov if (ceph_osdc_alloc_messages(osd_req, GFP_NOIO)) 207313d1ad16SIlya Dryomov goto fail; 207413d1ad16SIlya Dryomov 20750eefd470SAlex Elder return osd_req; 207613d1ad16SIlya Dryomov 207713d1ad16SIlya Dryomov fail: 207813d1ad16SIlya Dryomov ceph_osdc_put_request(osd_req); 207913d1ad16SIlya Dryomov return NULL; 20800eefd470SAlex Elder } 20810eefd470SAlex Elder 20820eefd470SAlex Elder 2083bf0d5f50SAlex Elder static void rbd_osd_req_destroy(struct ceph_osd_request *osd_req) 2084bf0d5f50SAlex Elder { 2085bf0d5f50SAlex Elder ceph_osdc_put_request(osd_req); 2086bf0d5f50SAlex Elder } 2087bf0d5f50SAlex Elder 2088bf0d5f50SAlex Elder /* object_name is assumed to be a non-null pointer and NUL-terminated */ 2089bf0d5f50SAlex Elder 2090bf0d5f50SAlex Elder static struct rbd_obj_request *rbd_obj_request_create(const char *object_name, 2091bf0d5f50SAlex Elder u64 offset, u64 length, 2092bf0d5f50SAlex Elder enum obj_request_type type) 2093bf0d5f50SAlex Elder { 2094bf0d5f50SAlex Elder struct rbd_obj_request *obj_request; 2095bf0d5f50SAlex Elder size_t size; 2096bf0d5f50SAlex Elder char *name; 2097bf0d5f50SAlex Elder 2098bf0d5f50SAlex Elder rbd_assert(obj_request_type_valid(type)); 2099bf0d5f50SAlex Elder 2100bf0d5f50SAlex Elder size = strlen(object_name) + 1; 21015a60e876SIlya Dryomov name = kmalloc(size, GFP_NOIO); 2102f907ad55SAlex Elder if (!name) 2103bf0d5f50SAlex Elder return NULL; 2104bf0d5f50SAlex Elder 21055a60e876SIlya Dryomov obj_request = kmem_cache_zalloc(rbd_obj_request_cache, GFP_NOIO); 2106f907ad55SAlex Elder if (!obj_request) { 2107f907ad55SAlex Elder kfree(name); 2108f907ad55SAlex Elder return NULL; 2109f907ad55SAlex Elder } 2110f907ad55SAlex Elder 2111bf0d5f50SAlex Elder obj_request->object_name = memcpy(name, object_name, size); 2112bf0d5f50SAlex Elder obj_request->offset = offset; 2113bf0d5f50SAlex Elder obj_request->length = length; 2114926f9b3fSAlex 
Elder obj_request->flags = 0; 2115bf0d5f50SAlex Elder obj_request->which = BAD_WHICH; 2116bf0d5f50SAlex Elder obj_request->type = type; 2117bf0d5f50SAlex Elder INIT_LIST_HEAD(&obj_request->links); 2118788e2df3SAlex Elder init_completion(&obj_request->completion); 2119bf0d5f50SAlex Elder kref_init(&obj_request->kref); 2120bf0d5f50SAlex Elder 212137206ee5SAlex Elder dout("%s: \"%s\" %llu/%llu %d -> obj %p\n", __func__, object_name, 212237206ee5SAlex Elder offset, length, (int)type, obj_request); 212337206ee5SAlex Elder 2124bf0d5f50SAlex Elder return obj_request; 2125bf0d5f50SAlex Elder } 2126bf0d5f50SAlex Elder 2127bf0d5f50SAlex Elder static void rbd_obj_request_destroy(struct kref *kref) 2128bf0d5f50SAlex Elder { 2129bf0d5f50SAlex Elder struct rbd_obj_request *obj_request; 2130bf0d5f50SAlex Elder 2131bf0d5f50SAlex Elder obj_request = container_of(kref, struct rbd_obj_request, kref); 2132bf0d5f50SAlex Elder 213337206ee5SAlex Elder dout("%s: obj %p\n", __func__, obj_request); 213437206ee5SAlex Elder 2135bf0d5f50SAlex Elder rbd_assert(obj_request->img_request == NULL); 2136bf0d5f50SAlex Elder rbd_assert(obj_request->which == BAD_WHICH); 2137bf0d5f50SAlex Elder 2138bf0d5f50SAlex Elder if (obj_request->osd_req) 2139bf0d5f50SAlex Elder rbd_osd_req_destroy(obj_request->osd_req); 2140bf0d5f50SAlex Elder 2141bf0d5f50SAlex Elder rbd_assert(obj_request_type_valid(obj_request->type)); 2142bf0d5f50SAlex Elder switch (obj_request->type) { 21439969ebc5SAlex Elder case OBJ_REQUEST_NODATA: 21449969ebc5SAlex Elder break; /* Nothing to do */ 2145bf0d5f50SAlex Elder case OBJ_REQUEST_BIO: 2146bf0d5f50SAlex Elder if (obj_request->bio_list) 2147bf0d5f50SAlex Elder bio_chain_put(obj_request->bio_list); 2148bf0d5f50SAlex Elder break; 2149788e2df3SAlex Elder case OBJ_REQUEST_PAGES: 2150788e2df3SAlex Elder if (obj_request->pages) 2151788e2df3SAlex Elder ceph_release_page_vector(obj_request->pages, 2152788e2df3SAlex Elder obj_request->page_count); 2153788e2df3SAlex Elder break; 
2154bf0d5f50SAlex Elder } 2155bf0d5f50SAlex Elder 2156f907ad55SAlex Elder kfree(obj_request->object_name); 2157868311b1SAlex Elder obj_request->object_name = NULL; 2158868311b1SAlex Elder kmem_cache_free(rbd_obj_request_cache, obj_request); 2159bf0d5f50SAlex Elder } 2160bf0d5f50SAlex Elder 2161fb65d228SAlex Elder /* It's OK to call this for a device with no parent */ 2162fb65d228SAlex Elder 2163fb65d228SAlex Elder static void rbd_spec_put(struct rbd_spec *spec); 2164fb65d228SAlex Elder static void rbd_dev_unparent(struct rbd_device *rbd_dev) 2165fb65d228SAlex Elder { 2166fb65d228SAlex Elder rbd_dev_remove_parent(rbd_dev); 2167fb65d228SAlex Elder rbd_spec_put(rbd_dev->parent_spec); 2168fb65d228SAlex Elder rbd_dev->parent_spec = NULL; 2169fb65d228SAlex Elder rbd_dev->parent_overlap = 0; 2170fb65d228SAlex Elder } 2171fb65d228SAlex Elder 2172bf0d5f50SAlex Elder /* 2173a2acd00eSAlex Elder * Parent image reference counting is used to determine when an 2174a2acd00eSAlex Elder * image's parent fields can be safely torn down--after there are no 2175a2acd00eSAlex Elder * more in-flight requests to the parent image. When the last 2176a2acd00eSAlex Elder * reference is dropped, cleaning them up is safe. 
2177a2acd00eSAlex Elder */ 2178a2acd00eSAlex Elder static void rbd_dev_parent_put(struct rbd_device *rbd_dev) 2179a2acd00eSAlex Elder { 2180a2acd00eSAlex Elder int counter; 2181a2acd00eSAlex Elder 2182a2acd00eSAlex Elder if (!rbd_dev->parent_spec) 2183a2acd00eSAlex Elder return; 2184a2acd00eSAlex Elder 2185a2acd00eSAlex Elder counter = atomic_dec_return_safe(&rbd_dev->parent_ref); 2186a2acd00eSAlex Elder if (counter > 0) 2187a2acd00eSAlex Elder return; 2188a2acd00eSAlex Elder 2189a2acd00eSAlex Elder /* Last reference; clean up parent data structures */ 2190a2acd00eSAlex Elder 2191a2acd00eSAlex Elder if (!counter) 2192a2acd00eSAlex Elder rbd_dev_unparent(rbd_dev); 2193a2acd00eSAlex Elder else 21949584d508SIlya Dryomov rbd_warn(rbd_dev, "parent reference underflow"); 2195a2acd00eSAlex Elder } 2196a2acd00eSAlex Elder 2197a2acd00eSAlex Elder /* 2198a2acd00eSAlex Elder * If an image has a non-zero parent overlap, get a reference to its 2199a2acd00eSAlex Elder * parent. 2200a2acd00eSAlex Elder * 2201a2acd00eSAlex Elder * Returns true if the rbd device has a parent with a non-zero 2202a2acd00eSAlex Elder * overlap and a reference for it was successfully taken, or 2203a2acd00eSAlex Elder * false otherwise. 
2204a2acd00eSAlex Elder */ 2205a2acd00eSAlex Elder static bool rbd_dev_parent_get(struct rbd_device *rbd_dev) 2206a2acd00eSAlex Elder { 2207ae43e9d0SIlya Dryomov int counter = 0; 2208a2acd00eSAlex Elder 2209a2acd00eSAlex Elder if (!rbd_dev->parent_spec) 2210a2acd00eSAlex Elder return false; 2211a2acd00eSAlex Elder 2212ae43e9d0SIlya Dryomov down_read(&rbd_dev->header_rwsem); 2213ae43e9d0SIlya Dryomov if (rbd_dev->parent_overlap) 2214a2acd00eSAlex Elder counter = atomic_inc_return_safe(&rbd_dev->parent_ref); 2215ae43e9d0SIlya Dryomov up_read(&rbd_dev->header_rwsem); 2216a2acd00eSAlex Elder 2217a2acd00eSAlex Elder if (counter < 0) 22189584d508SIlya Dryomov rbd_warn(rbd_dev, "parent reference overflow"); 2219a2acd00eSAlex Elder 2220ae43e9d0SIlya Dryomov return counter > 0; 2221a2acd00eSAlex Elder } 2222a2acd00eSAlex Elder 2223bf0d5f50SAlex Elder /* 2224bf0d5f50SAlex Elder * Caller is responsible for filling in the list of object requests 2225bf0d5f50SAlex Elder * that comprises the image request, and the Linux request pointer 2226bf0d5f50SAlex Elder * (if there is one). 
2227bf0d5f50SAlex Elder */ 2228cc344fa1SAlex Elder static struct rbd_img_request *rbd_img_request_create( 2229cc344fa1SAlex Elder struct rbd_device *rbd_dev, 2230bf0d5f50SAlex Elder u64 offset, u64 length, 22316d2940c8SGuangliang Zhao enum obj_operation_type op_type, 22324e752f0aSJosh Durgin struct ceph_snap_context *snapc) 2233bf0d5f50SAlex Elder { 2234bf0d5f50SAlex Elder struct rbd_img_request *img_request; 2235bf0d5f50SAlex Elder 22367a716aacSIlya Dryomov img_request = kmem_cache_alloc(rbd_img_request_cache, GFP_NOIO); 2237bf0d5f50SAlex Elder if (!img_request) 2238bf0d5f50SAlex Elder return NULL; 2239bf0d5f50SAlex Elder 2240bf0d5f50SAlex Elder img_request->rq = NULL; 2241bf0d5f50SAlex Elder img_request->rbd_dev = rbd_dev; 2242bf0d5f50SAlex Elder img_request->offset = offset; 2243bf0d5f50SAlex Elder img_request->length = length; 22440c425248SAlex Elder img_request->flags = 0; 224590e98c52SGuangliang Zhao if (op_type == OBJ_OP_DISCARD) { 224690e98c52SGuangliang Zhao img_request_discard_set(img_request); 224790e98c52SGuangliang Zhao img_request->snapc = snapc; 224890e98c52SGuangliang Zhao } else if (op_type == OBJ_OP_WRITE) { 22490c425248SAlex Elder img_request_write_set(img_request); 22504e752f0aSJosh Durgin img_request->snapc = snapc; 22510c425248SAlex Elder } else { 2252bf0d5f50SAlex Elder img_request->snap_id = rbd_dev->spec->snap_id; 22530c425248SAlex Elder } 2254a2acd00eSAlex Elder if (rbd_dev_parent_get(rbd_dev)) 2255d0b2e944SAlex Elder img_request_layered_set(img_request); 2256bf0d5f50SAlex Elder spin_lock_init(&img_request->completion_lock); 2257bf0d5f50SAlex Elder img_request->next_completion = 0; 2258bf0d5f50SAlex Elder img_request->callback = NULL; 2259a5a337d4SAlex Elder img_request->result = 0; 2260bf0d5f50SAlex Elder img_request->obj_request_count = 0; 2261bf0d5f50SAlex Elder INIT_LIST_HEAD(&img_request->obj_requests); 2262bf0d5f50SAlex Elder kref_init(&img_request->kref); 2263bf0d5f50SAlex Elder 226437206ee5SAlex Elder dout("%s: rbd_dev %p %s 
%llu/%llu -> img %p\n", __func__, rbd_dev, 22656d2940c8SGuangliang Zhao obj_op_name(op_type), offset, length, img_request); 226637206ee5SAlex Elder 2267bf0d5f50SAlex Elder return img_request; 2268bf0d5f50SAlex Elder } 2269bf0d5f50SAlex Elder 2270bf0d5f50SAlex Elder static void rbd_img_request_destroy(struct kref *kref) 2271bf0d5f50SAlex Elder { 2272bf0d5f50SAlex Elder struct rbd_img_request *img_request; 2273bf0d5f50SAlex Elder struct rbd_obj_request *obj_request; 2274bf0d5f50SAlex Elder struct rbd_obj_request *next_obj_request; 2275bf0d5f50SAlex Elder 2276bf0d5f50SAlex Elder img_request = container_of(kref, struct rbd_img_request, kref); 2277bf0d5f50SAlex Elder 227837206ee5SAlex Elder dout("%s: img %p\n", __func__, img_request); 227937206ee5SAlex Elder 2280bf0d5f50SAlex Elder for_each_obj_request_safe(img_request, obj_request, next_obj_request) 2281bf0d5f50SAlex Elder rbd_img_obj_request_del(img_request, obj_request); 228225dcf954SAlex Elder rbd_assert(img_request->obj_request_count == 0); 2283bf0d5f50SAlex Elder 2284a2acd00eSAlex Elder if (img_request_layered_test(img_request)) { 2285a2acd00eSAlex Elder img_request_layered_clear(img_request); 2286a2acd00eSAlex Elder rbd_dev_parent_put(img_request->rbd_dev); 2287a2acd00eSAlex Elder } 2288a2acd00eSAlex Elder 2289bef95455SJosh Durgin if (img_request_write_test(img_request) || 2290bef95455SJosh Durgin img_request_discard_test(img_request)) 2291812164f8SAlex Elder ceph_put_snap_context(img_request->snapc); 2292bf0d5f50SAlex Elder 22931c2a9dfeSAlex Elder kmem_cache_free(rbd_img_request_cache, img_request); 2294bf0d5f50SAlex Elder } 2295bf0d5f50SAlex Elder 2296e93f3152SAlex Elder static struct rbd_img_request *rbd_parent_request_create( 2297e93f3152SAlex Elder struct rbd_obj_request *obj_request, 2298e93f3152SAlex Elder u64 img_offset, u64 length) 2299e93f3152SAlex Elder { 2300e93f3152SAlex Elder struct rbd_img_request *parent_request; 2301e93f3152SAlex Elder struct rbd_device *rbd_dev; 2302e93f3152SAlex Elder 
2303e93f3152SAlex Elder rbd_assert(obj_request->img_request); 2304e93f3152SAlex Elder rbd_dev = obj_request->img_request->rbd_dev; 2305e93f3152SAlex Elder 23064e752f0aSJosh Durgin parent_request = rbd_img_request_create(rbd_dev->parent, img_offset, 23076d2940c8SGuangliang Zhao length, OBJ_OP_READ, NULL); 2308e93f3152SAlex Elder if (!parent_request) 2309e93f3152SAlex Elder return NULL; 2310e93f3152SAlex Elder 2311e93f3152SAlex Elder img_request_child_set(parent_request); 2312e93f3152SAlex Elder rbd_obj_request_get(obj_request); 2313e93f3152SAlex Elder parent_request->obj_request = obj_request; 2314e93f3152SAlex Elder 2315e93f3152SAlex Elder return parent_request; 2316e93f3152SAlex Elder } 2317e93f3152SAlex Elder 2318e93f3152SAlex Elder static void rbd_parent_request_destroy(struct kref *kref) 2319e93f3152SAlex Elder { 2320e93f3152SAlex Elder struct rbd_img_request *parent_request; 2321e93f3152SAlex Elder struct rbd_obj_request *orig_request; 2322e93f3152SAlex Elder 2323e93f3152SAlex Elder parent_request = container_of(kref, struct rbd_img_request, kref); 2324e93f3152SAlex Elder orig_request = parent_request->obj_request; 2325e93f3152SAlex Elder 2326e93f3152SAlex Elder parent_request->obj_request = NULL; 2327e93f3152SAlex Elder rbd_obj_request_put(orig_request); 2328e93f3152SAlex Elder img_request_child_clear(parent_request); 2329e93f3152SAlex Elder 2330e93f3152SAlex Elder rbd_img_request_destroy(kref); 2331e93f3152SAlex Elder } 2332e93f3152SAlex Elder 23331217857fSAlex Elder static bool rbd_img_obj_end_request(struct rbd_obj_request *obj_request) 23341217857fSAlex Elder { 23356365d33aSAlex Elder struct rbd_img_request *img_request; 23361217857fSAlex Elder unsigned int xferred; 23371217857fSAlex Elder int result; 23388b3e1a56SAlex Elder bool more; 23391217857fSAlex Elder 23406365d33aSAlex Elder rbd_assert(obj_request_img_data_test(obj_request)); 23416365d33aSAlex Elder img_request = obj_request->img_request; 23426365d33aSAlex Elder 23431217857fSAlex Elder 
rbd_assert(obj_request->xferred <= (u64)UINT_MAX); 23441217857fSAlex Elder xferred = (unsigned int)obj_request->xferred; 23451217857fSAlex Elder result = obj_request->result; 23461217857fSAlex Elder if (result) { 23471217857fSAlex Elder struct rbd_device *rbd_dev = img_request->rbd_dev; 23486d2940c8SGuangliang Zhao enum obj_operation_type op_type; 23496d2940c8SGuangliang Zhao 235090e98c52SGuangliang Zhao if (img_request_discard_test(img_request)) 235190e98c52SGuangliang Zhao op_type = OBJ_OP_DISCARD; 235290e98c52SGuangliang Zhao else if (img_request_write_test(img_request)) 235390e98c52SGuangliang Zhao op_type = OBJ_OP_WRITE; 235490e98c52SGuangliang Zhao else 235590e98c52SGuangliang Zhao op_type = OBJ_OP_READ; 23561217857fSAlex Elder 23579584d508SIlya Dryomov rbd_warn(rbd_dev, "%s %llx at %llx (%llx)", 23586d2940c8SGuangliang Zhao obj_op_name(op_type), obj_request->length, 23596d2940c8SGuangliang Zhao obj_request->img_offset, obj_request->offset); 23609584d508SIlya Dryomov rbd_warn(rbd_dev, " result %d xferred %x", 23611217857fSAlex Elder result, xferred); 23621217857fSAlex Elder if (!img_request->result) 23631217857fSAlex Elder img_request->result = result; 2364082a75daSIlya Dryomov /* 2365082a75daSIlya Dryomov * Need to end I/O on the entire obj_request worth of 2366082a75daSIlya Dryomov * bytes in case of error. 
2367082a75daSIlya Dryomov */ 2368082a75daSIlya Dryomov xferred = obj_request->length; 23691217857fSAlex Elder } 23701217857fSAlex Elder 2371f1a4739fSAlex Elder /* Image object requests don't own their page array */ 2372f1a4739fSAlex Elder 2373f1a4739fSAlex Elder if (obj_request->type == OBJ_REQUEST_PAGES) { 2374f1a4739fSAlex Elder obj_request->pages = NULL; 2375f1a4739fSAlex Elder obj_request->page_count = 0; 2376f1a4739fSAlex Elder } 2377f1a4739fSAlex Elder 23788b3e1a56SAlex Elder if (img_request_child_test(img_request)) { 23798b3e1a56SAlex Elder rbd_assert(img_request->obj_request != NULL); 23808b3e1a56SAlex Elder more = obj_request->which < img_request->obj_request_count - 1; 23818b3e1a56SAlex Elder } else { 23828b3e1a56SAlex Elder rbd_assert(img_request->rq != NULL); 23837ad18afaSChristoph Hellwig 23847ad18afaSChristoph Hellwig more = blk_update_request(img_request->rq, result, xferred); 23857ad18afaSChristoph Hellwig if (!more) 23867ad18afaSChristoph Hellwig __blk_mq_end_request(img_request->rq, result); 23878b3e1a56SAlex Elder } 23888b3e1a56SAlex Elder 23898b3e1a56SAlex Elder return more; 23901217857fSAlex Elder } 23911217857fSAlex Elder 23922169238dSAlex Elder static void rbd_img_obj_callback(struct rbd_obj_request *obj_request) 23932169238dSAlex Elder { 23942169238dSAlex Elder struct rbd_img_request *img_request; 23952169238dSAlex Elder u32 which = obj_request->which; 23962169238dSAlex Elder bool more = true; 23972169238dSAlex Elder 23986365d33aSAlex Elder rbd_assert(obj_request_img_data_test(obj_request)); 23992169238dSAlex Elder img_request = obj_request->img_request; 24002169238dSAlex Elder 24012169238dSAlex Elder dout("%s: img %p obj %p\n", __func__, img_request, obj_request); 24022169238dSAlex Elder rbd_assert(img_request != NULL); 24032169238dSAlex Elder rbd_assert(img_request->obj_request_count > 0); 24042169238dSAlex Elder rbd_assert(which != BAD_WHICH); 24052169238dSAlex Elder rbd_assert(which < img_request->obj_request_count); 24062169238dSAlex 
Elder 24072169238dSAlex Elder spin_lock_irq(&img_request->completion_lock); 24082169238dSAlex Elder if (which != img_request->next_completion) 24092169238dSAlex Elder goto out; 24102169238dSAlex Elder 24112169238dSAlex Elder for_each_obj_request_from(img_request, obj_request) { 24122169238dSAlex Elder rbd_assert(more); 24132169238dSAlex Elder rbd_assert(which < img_request->obj_request_count); 24142169238dSAlex Elder 24152169238dSAlex Elder if (!obj_request_done_test(obj_request)) 24162169238dSAlex Elder break; 24171217857fSAlex Elder more = rbd_img_obj_end_request(obj_request); 24182169238dSAlex Elder which++; 24192169238dSAlex Elder } 24202169238dSAlex Elder 24212169238dSAlex Elder rbd_assert(more ^ (which == img_request->obj_request_count)); 24222169238dSAlex Elder img_request->next_completion = which; 24232169238dSAlex Elder out: 24242169238dSAlex Elder spin_unlock_irq(&img_request->completion_lock); 24250f2d5be7SAlex Elder rbd_img_request_put(img_request); 24262169238dSAlex Elder 24272169238dSAlex Elder if (!more) 24282169238dSAlex Elder rbd_img_request_complete(img_request); 24292169238dSAlex Elder } 24302169238dSAlex Elder 2431f1a4739fSAlex Elder /* 24323b434a2aSJosh Durgin * Add individual osd ops to the given ceph_osd_request and prepare 24333b434a2aSJosh Durgin * them for submission. num_ops is the current number of 24343b434a2aSJosh Durgin * osd operations already to the object request. 
24353b434a2aSJosh Durgin */ 24363b434a2aSJosh Durgin static void rbd_img_obj_request_fill(struct rbd_obj_request *obj_request, 24373b434a2aSJosh Durgin struct ceph_osd_request *osd_request, 24383b434a2aSJosh Durgin enum obj_operation_type op_type, 24393b434a2aSJosh Durgin unsigned int num_ops) 24403b434a2aSJosh Durgin { 24413b434a2aSJosh Durgin struct rbd_img_request *img_request = obj_request->img_request; 24423b434a2aSJosh Durgin struct rbd_device *rbd_dev = img_request->rbd_dev; 24433b434a2aSJosh Durgin u64 object_size = rbd_obj_bytes(&rbd_dev->header); 24443b434a2aSJosh Durgin u64 offset = obj_request->offset; 24453b434a2aSJosh Durgin u64 length = obj_request->length; 24463b434a2aSJosh Durgin u64 img_end; 24473b434a2aSJosh Durgin u16 opcode; 24483b434a2aSJosh Durgin 24493b434a2aSJosh Durgin if (op_type == OBJ_OP_DISCARD) { 2450d3246fb0SJosh Durgin if (!offset && length == object_size && 2451d3246fb0SJosh Durgin (!img_request_layered_test(img_request) || 2452d3246fb0SJosh Durgin !obj_request_overlaps_parent(obj_request))) { 24533b434a2aSJosh Durgin opcode = CEPH_OSD_OP_DELETE; 24543b434a2aSJosh Durgin } else if ((offset + length == object_size)) { 24553b434a2aSJosh Durgin opcode = CEPH_OSD_OP_TRUNCATE; 24563b434a2aSJosh Durgin } else { 24573b434a2aSJosh Durgin down_read(&rbd_dev->header_rwsem); 24583b434a2aSJosh Durgin img_end = rbd_dev->header.image_size; 24593b434a2aSJosh Durgin up_read(&rbd_dev->header_rwsem); 24603b434a2aSJosh Durgin 24613b434a2aSJosh Durgin if (obj_request->img_offset + length == img_end) 24623b434a2aSJosh Durgin opcode = CEPH_OSD_OP_TRUNCATE; 24633b434a2aSJosh Durgin else 24643b434a2aSJosh Durgin opcode = CEPH_OSD_OP_ZERO; 24653b434a2aSJosh Durgin } 24663b434a2aSJosh Durgin } else if (op_type == OBJ_OP_WRITE) { 2467e30b7577SIlya Dryomov if (!offset && length == object_size) 2468e30b7577SIlya Dryomov opcode = CEPH_OSD_OP_WRITEFULL; 2469e30b7577SIlya Dryomov else 24703b434a2aSJosh Durgin opcode = CEPH_OSD_OP_WRITE; 24713b434a2aSJosh Durgin 
osd_req_op_alloc_hint_init(osd_request, num_ops, 24723b434a2aSJosh Durgin object_size, object_size); 24733b434a2aSJosh Durgin num_ops++; 24743b434a2aSJosh Durgin } else { 24753b434a2aSJosh Durgin opcode = CEPH_OSD_OP_READ; 24763b434a2aSJosh Durgin } 24773b434a2aSJosh Durgin 24787e868b6eSIlya Dryomov if (opcode == CEPH_OSD_OP_DELETE) 2479144cba14SYan, Zheng osd_req_op_init(osd_request, num_ops, opcode, 0); 24807e868b6eSIlya Dryomov else 24817e868b6eSIlya Dryomov osd_req_op_extent_init(osd_request, num_ops, opcode, 24827e868b6eSIlya Dryomov offset, length, 0, 0); 24837e868b6eSIlya Dryomov 24843b434a2aSJosh Durgin if (obj_request->type == OBJ_REQUEST_BIO) 24853b434a2aSJosh Durgin osd_req_op_extent_osd_data_bio(osd_request, num_ops, 24863b434a2aSJosh Durgin obj_request->bio_list, length); 24873b434a2aSJosh Durgin else if (obj_request->type == OBJ_REQUEST_PAGES) 24883b434a2aSJosh Durgin osd_req_op_extent_osd_data_pages(osd_request, num_ops, 24893b434a2aSJosh Durgin obj_request->pages, length, 24903b434a2aSJosh Durgin offset & ~PAGE_MASK, false, false); 24913b434a2aSJosh Durgin 24923b434a2aSJosh Durgin /* Discards are also writes */ 24933b434a2aSJosh Durgin if (op_type == OBJ_OP_WRITE || op_type == OBJ_OP_DISCARD) 24943b434a2aSJosh Durgin rbd_osd_req_format_write(obj_request); 24953b434a2aSJosh Durgin else 24963b434a2aSJosh Durgin rbd_osd_req_format_read(obj_request); 24973b434a2aSJosh Durgin } 24983b434a2aSJosh Durgin 24993b434a2aSJosh Durgin /* 2500f1a4739fSAlex Elder * Split up an image request into one or more object requests, each 2501f1a4739fSAlex Elder * to a different object. The "type" parameter indicates whether 2502f1a4739fSAlex Elder * "data_desc" is the pointer to the head of a list of bio 2503f1a4739fSAlex Elder * structures, or the base of a page array. In either case this 2504f1a4739fSAlex Elder * function assumes data_desc describes memory sufficient to hold 2505f1a4739fSAlex Elder * all data described by the image request. 
2506f1a4739fSAlex Elder */ 2507f1a4739fSAlex Elder static int rbd_img_request_fill(struct rbd_img_request *img_request, 2508f1a4739fSAlex Elder enum obj_request_type type, 2509f1a4739fSAlex Elder void *data_desc) 2510bf0d5f50SAlex Elder { 2511bf0d5f50SAlex Elder struct rbd_device *rbd_dev = img_request->rbd_dev; 2512bf0d5f50SAlex Elder struct rbd_obj_request *obj_request = NULL; 2513bf0d5f50SAlex Elder struct rbd_obj_request *next_obj_request; 2514a158073cSJingoo Han struct bio *bio_list = NULL; 2515f1a4739fSAlex Elder unsigned int bio_offset = 0; 2516a158073cSJingoo Han struct page **pages = NULL; 25176d2940c8SGuangliang Zhao enum obj_operation_type op_type; 25187da22d29SAlex Elder u64 img_offset; 2519bf0d5f50SAlex Elder u64 resid; 2520bf0d5f50SAlex Elder 2521f1a4739fSAlex Elder dout("%s: img %p type %d data_desc %p\n", __func__, img_request, 2522f1a4739fSAlex Elder (int)type, data_desc); 252337206ee5SAlex Elder 25247da22d29SAlex Elder img_offset = img_request->offset; 2525bf0d5f50SAlex Elder resid = img_request->length; 25264dda41d3SAlex Elder rbd_assert(resid > 0); 25273b434a2aSJosh Durgin op_type = rbd_img_request_op_type(img_request); 2528f1a4739fSAlex Elder 2529f1a4739fSAlex Elder if (type == OBJ_REQUEST_BIO) { 2530f1a4739fSAlex Elder bio_list = data_desc; 25314f024f37SKent Overstreet rbd_assert(img_offset == 25324f024f37SKent Overstreet bio_list->bi_iter.bi_sector << SECTOR_SHIFT); 253390e98c52SGuangliang Zhao } else if (type == OBJ_REQUEST_PAGES) { 2534f1a4739fSAlex Elder pages = data_desc; 2535f1a4739fSAlex Elder } 2536f1a4739fSAlex Elder 2537bf0d5f50SAlex Elder while (resid) { 25382fa12320SAlex Elder struct ceph_osd_request *osd_req; 2539bf0d5f50SAlex Elder const char *object_name; 2540bf0d5f50SAlex Elder u64 offset; 2541bf0d5f50SAlex Elder u64 length; 2542bf0d5f50SAlex Elder 25437da22d29SAlex Elder object_name = rbd_segment_name(rbd_dev, img_offset); 2544bf0d5f50SAlex Elder if (!object_name) 2545bf0d5f50SAlex Elder goto out_unwind; 25467da22d29SAlex 
Elder offset = rbd_segment_offset(rbd_dev, img_offset); 25477da22d29SAlex Elder length = rbd_segment_length(rbd_dev, img_offset, resid); 2548bf0d5f50SAlex Elder obj_request = rbd_obj_request_create(object_name, 2549f1a4739fSAlex Elder offset, length, type); 255078c2a44aSAlex Elder /* object request has its own copy of the object name */ 255178c2a44aSAlex Elder rbd_segment_name_free(object_name); 2552bf0d5f50SAlex Elder if (!obj_request) 2553bf0d5f50SAlex Elder goto out_unwind; 255462054da6SIlya Dryomov 255503507db6SJosh Durgin /* 255603507db6SJosh Durgin * set obj_request->img_request before creating the 255703507db6SJosh Durgin * osd_request so that it gets the right snapc 255803507db6SJosh Durgin */ 255903507db6SJosh Durgin rbd_img_obj_request_add(img_request, obj_request); 2560bf0d5f50SAlex Elder 2561f1a4739fSAlex Elder if (type == OBJ_REQUEST_BIO) { 2562f1a4739fSAlex Elder unsigned int clone_size; 2563f1a4739fSAlex Elder 2564bf0d5f50SAlex Elder rbd_assert(length <= (u64)UINT_MAX); 2565bf0d5f50SAlex Elder clone_size = (unsigned int)length; 2566f1a4739fSAlex Elder obj_request->bio_list = 2567f1a4739fSAlex Elder bio_chain_clone_range(&bio_list, 2568f1a4739fSAlex Elder &bio_offset, 2569f1a4739fSAlex Elder clone_size, 25702224d879SDavid Disseldorp GFP_NOIO); 2571bf0d5f50SAlex Elder if (!obj_request->bio_list) 257262054da6SIlya Dryomov goto out_unwind; 257390e98c52SGuangliang Zhao } else if (type == OBJ_REQUEST_PAGES) { 2574f1a4739fSAlex Elder unsigned int page_count; 2575f1a4739fSAlex Elder 2576f1a4739fSAlex Elder obj_request->pages = pages; 2577f1a4739fSAlex Elder page_count = (u32)calc_pages_for(offset, length); 2578f1a4739fSAlex Elder obj_request->page_count = page_count; 2579f1a4739fSAlex Elder if ((offset + length) & ~PAGE_MASK) 2580f1a4739fSAlex Elder page_count--; /* more on last page */ 2581f1a4739fSAlex Elder pages += page_count; 2582f1a4739fSAlex Elder } 2583bf0d5f50SAlex Elder 25846d2940c8SGuangliang Zhao osd_req = rbd_osd_req_create(rbd_dev, op_type, 
25856d2940c8SGuangliang Zhao (op_type == OBJ_OP_WRITE) ? 2 : 1, 25862fa12320SAlex Elder obj_request); 25872fa12320SAlex Elder if (!osd_req) 258862054da6SIlya Dryomov goto out_unwind; 25893b434a2aSJosh Durgin 25902fa12320SAlex Elder obj_request->osd_req = osd_req; 25912169238dSAlex Elder obj_request->callback = rbd_img_obj_callback; 25927da22d29SAlex Elder obj_request->img_offset = img_offset; 2593bf0d5f50SAlex Elder 25943b434a2aSJosh Durgin rbd_img_obj_request_fill(obj_request, osd_req, op_type, 0); 25953b434a2aSJosh Durgin 25967da22d29SAlex Elder img_offset += length; 2597bf0d5f50SAlex Elder resid -= length; 2598bf0d5f50SAlex Elder } 2599bf0d5f50SAlex Elder 2600bf0d5f50SAlex Elder return 0; 2601bf0d5f50SAlex Elder 2602bf0d5f50SAlex Elder out_unwind: 2603bf0d5f50SAlex Elder for_each_obj_request_safe(img_request, obj_request, next_obj_request) 260442dd037cSIlya Dryomov rbd_img_obj_request_del(img_request, obj_request); 2605bf0d5f50SAlex Elder 2606bf0d5f50SAlex Elder return -ENOMEM; 2607bf0d5f50SAlex Elder } 2608bf0d5f50SAlex Elder 26093d7efd18SAlex Elder static void 26102761713dSIlya Dryomov rbd_osd_copyup_callback(struct rbd_obj_request *obj_request) 26110eefd470SAlex Elder { 26120eefd470SAlex Elder struct rbd_img_request *img_request; 26130eefd470SAlex Elder struct rbd_device *rbd_dev; 2614ebda6408SAlex Elder struct page **pages; 26150eefd470SAlex Elder u32 page_count; 26160eefd470SAlex Elder 26172761713dSIlya Dryomov dout("%s: obj %p\n", __func__, obj_request); 26182761713dSIlya Dryomov 2619d3246fb0SJosh Durgin rbd_assert(obj_request->type == OBJ_REQUEST_BIO || 2620d3246fb0SJosh Durgin obj_request->type == OBJ_REQUEST_NODATA); 26210eefd470SAlex Elder rbd_assert(obj_request_img_data_test(obj_request)); 26220eefd470SAlex Elder img_request = obj_request->img_request; 26230eefd470SAlex Elder rbd_assert(img_request); 26240eefd470SAlex Elder 26250eefd470SAlex Elder rbd_dev = img_request->rbd_dev; 26260eefd470SAlex Elder rbd_assert(rbd_dev); 26270eefd470SAlex Elder 
2628ebda6408SAlex Elder pages = obj_request->copyup_pages; 2629ebda6408SAlex Elder rbd_assert(pages != NULL); 26300eefd470SAlex Elder obj_request->copyup_pages = NULL; 2631ebda6408SAlex Elder page_count = obj_request->copyup_page_count; 2632ebda6408SAlex Elder rbd_assert(page_count); 2633ebda6408SAlex Elder obj_request->copyup_page_count = 0; 2634ebda6408SAlex Elder ceph_release_page_vector(pages, page_count); 26350eefd470SAlex Elder 26360eefd470SAlex Elder /* 26370eefd470SAlex Elder * We want the transfer count to reflect the size of the 26380eefd470SAlex Elder * original write request. There is no such thing as a 26390eefd470SAlex Elder * successful short write, so if the request was successful 26400eefd470SAlex Elder * we can just set it to the originally-requested length. 26410eefd470SAlex Elder */ 26420eefd470SAlex Elder if (!obj_request->result) 26430eefd470SAlex Elder obj_request->xferred = obj_request->length; 26440eefd470SAlex Elder 26452761713dSIlya Dryomov obj_request_done_set(obj_request); 26460eefd470SAlex Elder } 26470eefd470SAlex Elder 26480eefd470SAlex Elder static void 26493d7efd18SAlex Elder rbd_img_obj_parent_read_full_callback(struct rbd_img_request *img_request) 26503d7efd18SAlex Elder { 26513d7efd18SAlex Elder struct rbd_obj_request *orig_request; 26520eefd470SAlex Elder struct ceph_osd_request *osd_req; 26530eefd470SAlex Elder struct rbd_device *rbd_dev; 26543d7efd18SAlex Elder struct page **pages; 2655d3246fb0SJosh Durgin enum obj_operation_type op_type; 2656ebda6408SAlex Elder u32 page_count; 2657bbea1c1aSAlex Elder int img_result; 2658ebda6408SAlex Elder u64 parent_length; 26593d7efd18SAlex Elder 26603d7efd18SAlex Elder rbd_assert(img_request_child_test(img_request)); 26613d7efd18SAlex Elder 26623d7efd18SAlex Elder /* First get what we need from the image request */ 26633d7efd18SAlex Elder 26643d7efd18SAlex Elder pages = img_request->copyup_pages; 26653d7efd18SAlex Elder rbd_assert(pages != NULL); 26663d7efd18SAlex Elder 
img_request->copyup_pages = NULL; 2667ebda6408SAlex Elder page_count = img_request->copyup_page_count; 2668ebda6408SAlex Elder rbd_assert(page_count); 2669ebda6408SAlex Elder img_request->copyup_page_count = 0; 26703d7efd18SAlex Elder 26713d7efd18SAlex Elder orig_request = img_request->obj_request; 26723d7efd18SAlex Elder rbd_assert(orig_request != NULL); 2673b91f09f1SAlex Elder rbd_assert(obj_request_type_valid(orig_request->type)); 2674bbea1c1aSAlex Elder img_result = img_request->result; 2675ebda6408SAlex Elder parent_length = img_request->length; 2676fa355112SIlya Dryomov rbd_assert(img_result || parent_length == img_request->xferred); 26773d7efd18SAlex Elder rbd_img_request_put(img_request); 26783d7efd18SAlex Elder 267991c6febbSAlex Elder rbd_assert(orig_request->img_request); 268091c6febbSAlex Elder rbd_dev = orig_request->img_request->rbd_dev; 26813d7efd18SAlex Elder rbd_assert(rbd_dev); 26823d7efd18SAlex Elder 2683bbea1c1aSAlex Elder /* 2684bbea1c1aSAlex Elder * If the overlap has become 0 (most likely because the 2685bbea1c1aSAlex Elder * image has been flattened) we need to free the pages 2686bbea1c1aSAlex Elder * and re-submit the original write request. 2687bbea1c1aSAlex Elder */ 2688bbea1c1aSAlex Elder if (!rbd_dev->parent_overlap) { 2689bbea1c1aSAlex Elder ceph_release_page_vector(pages, page_count); 2690980917fcSIlya Dryomov rbd_obj_request_submit(orig_request); 2691bbea1c1aSAlex Elder return; 2692bbea1c1aSAlex Elder } 2693bbea1c1aSAlex Elder 2694bbea1c1aSAlex Elder if (img_result) 26950eefd470SAlex Elder goto out_err; 26963d7efd18SAlex Elder 26978785b1d4SAlex Elder /* 26988785b1d4SAlex Elder * The original osd request is of no use to use any more. 26990ccd5926SIlya Dryomov * We need a new one that can hold the three ops in a copyup 27008785b1d4SAlex Elder * request. Allocate the new copyup osd request for the 27018785b1d4SAlex Elder * original request, and release the old one. 
27028785b1d4SAlex Elder */ 2703bbea1c1aSAlex Elder img_result = -ENOMEM; 27040eefd470SAlex Elder osd_req = rbd_osd_req_create_copyup(orig_request); 27050eefd470SAlex Elder if (!osd_req) 27060eefd470SAlex Elder goto out_err; 27078785b1d4SAlex Elder rbd_osd_req_destroy(orig_request->osd_req); 27080eefd470SAlex Elder orig_request->osd_req = osd_req; 27090eefd470SAlex Elder orig_request->copyup_pages = pages; 2710ebda6408SAlex Elder orig_request->copyup_page_count = page_count; 27113d7efd18SAlex Elder 27120eefd470SAlex Elder /* Initialize the copyup op */ 27130eefd470SAlex Elder 27140eefd470SAlex Elder osd_req_op_cls_init(osd_req, 0, CEPH_OSD_OP_CALL, "rbd", "copyup"); 2715ebda6408SAlex Elder osd_req_op_cls_request_data_pages(osd_req, 0, pages, parent_length, 0, 27160eefd470SAlex Elder false, false); 27170eefd470SAlex Elder 2718d3246fb0SJosh Durgin /* Add the other op(s) */ 27190ccd5926SIlya Dryomov 2720d3246fb0SJosh Durgin op_type = rbd_img_request_op_type(orig_request->img_request); 2721d3246fb0SJosh Durgin rbd_img_obj_request_fill(orig_request, osd_req, op_type, 1); 27220eefd470SAlex Elder 27230eefd470SAlex Elder /* All set, send it off. */ 27240eefd470SAlex Elder 2725980917fcSIlya Dryomov rbd_obj_request_submit(orig_request); 27260eefd470SAlex Elder return; 2727980917fcSIlya Dryomov 27280eefd470SAlex Elder out_err: 2729fa355112SIlya Dryomov ceph_release_page_vector(pages, page_count); 2730bbea1c1aSAlex Elder orig_request->result = img_result; 27310eefd470SAlex Elder orig_request->xferred = 0; 27324a17dadcSIlya Dryomov rbd_img_request_get(orig_request->img_request); 27333d7efd18SAlex Elder obj_request_done_set(orig_request); 27343d7efd18SAlex Elder rbd_obj_request_complete(orig_request); 27353d7efd18SAlex Elder } 27363d7efd18SAlex Elder 27373d7efd18SAlex Elder /* 27383d7efd18SAlex Elder * Read from the parent image the range of data that covers the 27393d7efd18SAlex Elder * entire target of the given object request. 
This is used for 27403d7efd18SAlex Elder * satisfying a layered image write request when the target of an 27413d7efd18SAlex Elder * object request from the image request does not exist. 27423d7efd18SAlex Elder * 27433d7efd18SAlex Elder * A page array big enough to hold the returned data is allocated 27443d7efd18SAlex Elder * and supplied to rbd_img_request_fill() as the "data descriptor." 27453d7efd18SAlex Elder * When the read completes, this page array will be transferred to 27463d7efd18SAlex Elder * the original object request for the copyup operation. 27473d7efd18SAlex Elder * 2748c2e82414SIlya Dryomov * If an error occurs, it is recorded as the result of the original 2749c2e82414SIlya Dryomov * object request in rbd_img_obj_exists_callback(). 27503d7efd18SAlex Elder */ 27513d7efd18SAlex Elder static int rbd_img_obj_parent_read_full(struct rbd_obj_request *obj_request) 27523d7efd18SAlex Elder { 2753058aa991SIlya Dryomov struct rbd_device *rbd_dev = obj_request->img_request->rbd_dev; 27543d7efd18SAlex Elder struct rbd_img_request *parent_request = NULL; 27553d7efd18SAlex Elder u64 img_offset; 27563d7efd18SAlex Elder u64 length; 27573d7efd18SAlex Elder struct page **pages = NULL; 27583d7efd18SAlex Elder u32 page_count; 27593d7efd18SAlex Elder int result; 27603d7efd18SAlex Elder 27613d7efd18SAlex Elder rbd_assert(rbd_dev->parent != NULL); 27623d7efd18SAlex Elder 27633d7efd18SAlex Elder /* 27643d7efd18SAlex Elder * Determine the byte range covered by the object in the 27653d7efd18SAlex Elder * child image to which the original request was to be sent. 27663d7efd18SAlex Elder */ 27673d7efd18SAlex Elder img_offset = obj_request->img_offset - obj_request->offset; 27683d7efd18SAlex Elder length = (u64)1 << rbd_dev->header.obj_order; 27693d7efd18SAlex Elder 27703d7efd18SAlex Elder /* 2771a9e8ba2cSAlex Elder * There is no defined parent data beyond the parent 2772a9e8ba2cSAlex Elder * overlap, so limit what we read at that boundary if 2773a9e8ba2cSAlex Elder * necessary. 
2774a9e8ba2cSAlex Elder */ 2775a9e8ba2cSAlex Elder if (img_offset + length > rbd_dev->parent_overlap) { 2776a9e8ba2cSAlex Elder rbd_assert(img_offset < rbd_dev->parent_overlap); 2777a9e8ba2cSAlex Elder length = rbd_dev->parent_overlap - img_offset; 2778a9e8ba2cSAlex Elder } 2779a9e8ba2cSAlex Elder 2780a9e8ba2cSAlex Elder /* 27813d7efd18SAlex Elder * Allocate a page array big enough to receive the data read 27823d7efd18SAlex Elder * from the parent. 27833d7efd18SAlex Elder */ 27843d7efd18SAlex Elder page_count = (u32)calc_pages_for(0, length); 27853d7efd18SAlex Elder pages = ceph_alloc_page_vector(page_count, GFP_KERNEL); 27863d7efd18SAlex Elder if (IS_ERR(pages)) { 27873d7efd18SAlex Elder result = PTR_ERR(pages); 27883d7efd18SAlex Elder pages = NULL; 27893d7efd18SAlex Elder goto out_err; 27903d7efd18SAlex Elder } 27913d7efd18SAlex Elder 27923d7efd18SAlex Elder result = -ENOMEM; 2793e93f3152SAlex Elder parent_request = rbd_parent_request_create(obj_request, 2794e93f3152SAlex Elder img_offset, length); 27953d7efd18SAlex Elder if (!parent_request) 27963d7efd18SAlex Elder goto out_err; 27973d7efd18SAlex Elder 27983d7efd18SAlex Elder result = rbd_img_request_fill(parent_request, OBJ_REQUEST_PAGES, pages); 27993d7efd18SAlex Elder if (result) 28003d7efd18SAlex Elder goto out_err; 2801058aa991SIlya Dryomov 28023d7efd18SAlex Elder parent_request->copyup_pages = pages; 2803ebda6408SAlex Elder parent_request->copyup_page_count = page_count; 28043d7efd18SAlex Elder parent_request->callback = rbd_img_obj_parent_read_full_callback; 2805058aa991SIlya Dryomov 28063d7efd18SAlex Elder result = rbd_img_request_submit(parent_request); 28073d7efd18SAlex Elder if (!result) 28083d7efd18SAlex Elder return 0; 28093d7efd18SAlex Elder 28103d7efd18SAlex Elder parent_request->copyup_pages = NULL; 2811ebda6408SAlex Elder parent_request->copyup_page_count = 0; 28123d7efd18SAlex Elder parent_request->obj_request = NULL; 28133d7efd18SAlex Elder rbd_obj_request_put(obj_request); 28143d7efd18SAlex 
Elder out_err: 28153d7efd18SAlex Elder if (pages) 28163d7efd18SAlex Elder ceph_release_page_vector(pages, page_count); 28173d7efd18SAlex Elder if (parent_request) 28183d7efd18SAlex Elder rbd_img_request_put(parent_request); 28193d7efd18SAlex Elder return result; 28203d7efd18SAlex Elder } 28213d7efd18SAlex Elder 2822c5b5ef6cSAlex Elder static void rbd_img_obj_exists_callback(struct rbd_obj_request *obj_request) 2823c5b5ef6cSAlex Elder { 2824c5b5ef6cSAlex Elder struct rbd_obj_request *orig_request; 2825638f5abeSAlex Elder struct rbd_device *rbd_dev; 2826c5b5ef6cSAlex Elder int result; 2827c5b5ef6cSAlex Elder 2828c5b5ef6cSAlex Elder rbd_assert(!obj_request_img_data_test(obj_request)); 2829c5b5ef6cSAlex Elder 2830c5b5ef6cSAlex Elder /* 2831c5b5ef6cSAlex Elder * All we need from the object request is the original 2832c5b5ef6cSAlex Elder * request and the result of the STAT op. Grab those, then 2833c5b5ef6cSAlex Elder * we're done with the request. 2834c5b5ef6cSAlex Elder */ 2835c5b5ef6cSAlex Elder orig_request = obj_request->obj_request; 2836c5b5ef6cSAlex Elder obj_request->obj_request = NULL; 2837912c317dSAlex Elder rbd_obj_request_put(orig_request); 2838c5b5ef6cSAlex Elder rbd_assert(orig_request); 2839c5b5ef6cSAlex Elder rbd_assert(orig_request->img_request); 2840c5b5ef6cSAlex Elder 2841c5b5ef6cSAlex Elder result = obj_request->result; 2842c5b5ef6cSAlex Elder obj_request->result = 0; 2843c5b5ef6cSAlex Elder 2844c5b5ef6cSAlex Elder dout("%s: obj %p for obj %p result %d %llu/%llu\n", __func__, 2845c5b5ef6cSAlex Elder obj_request, orig_request, result, 2846c5b5ef6cSAlex Elder obj_request->xferred, obj_request->length); 2847c5b5ef6cSAlex Elder rbd_obj_request_put(obj_request); 2848c5b5ef6cSAlex Elder 2849638f5abeSAlex Elder /* 2850638f5abeSAlex Elder * If the overlap has become 0 (most likely because the 2851980917fcSIlya Dryomov * image has been flattened) we need to re-submit the 2852980917fcSIlya Dryomov * original request. 
2853638f5abeSAlex Elder */ 2854638f5abeSAlex Elder rbd_dev = orig_request->img_request->rbd_dev; 2855638f5abeSAlex Elder if (!rbd_dev->parent_overlap) { 2856980917fcSIlya Dryomov rbd_obj_request_submit(orig_request); 2857638f5abeSAlex Elder return; 2858638f5abeSAlex Elder } 2859c5b5ef6cSAlex Elder 2860c5b5ef6cSAlex Elder /* 2861c5b5ef6cSAlex Elder * Our only purpose here is to determine whether the object 2862c5b5ef6cSAlex Elder * exists, and we don't want to treat the non-existence as 2863c5b5ef6cSAlex Elder * an error. If something else comes back, transfer the 2864c5b5ef6cSAlex Elder * error to the original request and complete it now. 2865c5b5ef6cSAlex Elder */ 2866c5b5ef6cSAlex Elder if (!result) { 2867c5b5ef6cSAlex Elder obj_request_existence_set(orig_request, true); 2868c5b5ef6cSAlex Elder } else if (result == -ENOENT) { 2869c5b5ef6cSAlex Elder obj_request_existence_set(orig_request, false); 2870c2e82414SIlya Dryomov } else { 2871c2e82414SIlya Dryomov goto fail_orig_request; 2872c5b5ef6cSAlex Elder } 2873c5b5ef6cSAlex Elder 2874c5b5ef6cSAlex Elder /* 2875c5b5ef6cSAlex Elder * Resubmit the original request now that we have recorded 2876c5b5ef6cSAlex Elder * whether the target object exists. 
2877c5b5ef6cSAlex Elder */ 2878c2e82414SIlya Dryomov result = rbd_img_obj_request_submit(orig_request); 2879c2e82414SIlya Dryomov if (result) 2880c2e82414SIlya Dryomov goto fail_orig_request; 2881c2e82414SIlya Dryomov 2882c2e82414SIlya Dryomov return; 2883c2e82414SIlya Dryomov 2884c2e82414SIlya Dryomov fail_orig_request: 2885c2e82414SIlya Dryomov orig_request->result = result; 2886c2e82414SIlya Dryomov orig_request->xferred = 0; 28874a17dadcSIlya Dryomov rbd_img_request_get(orig_request->img_request); 2888c2e82414SIlya Dryomov obj_request_done_set(orig_request); 2889c5b5ef6cSAlex Elder rbd_obj_request_complete(orig_request); 2890c5b5ef6cSAlex Elder } 2891c5b5ef6cSAlex Elder 2892c5b5ef6cSAlex Elder static int rbd_img_obj_exists_submit(struct rbd_obj_request *obj_request) 2893c5b5ef6cSAlex Elder { 2894058aa991SIlya Dryomov struct rbd_device *rbd_dev = obj_request->img_request->rbd_dev; 2895c5b5ef6cSAlex Elder struct rbd_obj_request *stat_request; 2896710214e3SIlya Dryomov struct page **pages; 2897c5b5ef6cSAlex Elder u32 page_count; 2898c5b5ef6cSAlex Elder size_t size; 2899c5b5ef6cSAlex Elder int ret; 2900c5b5ef6cSAlex Elder 2901710214e3SIlya Dryomov stat_request = rbd_obj_request_create(obj_request->object_name, 0, 0, 2902710214e3SIlya Dryomov OBJ_REQUEST_PAGES); 2903710214e3SIlya Dryomov if (!stat_request) 2904710214e3SIlya Dryomov return -ENOMEM; 2905710214e3SIlya Dryomov 2906710214e3SIlya Dryomov stat_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_READ, 1, 2907710214e3SIlya Dryomov stat_request); 2908710214e3SIlya Dryomov if (!stat_request->osd_req) { 2909710214e3SIlya Dryomov ret = -ENOMEM; 2910710214e3SIlya Dryomov goto fail_stat_request; 2911710214e3SIlya Dryomov } 2912710214e3SIlya Dryomov 2913c5b5ef6cSAlex Elder /* 2914c5b5ef6cSAlex Elder * The response data for a STAT call consists of: 2915c5b5ef6cSAlex Elder * le64 length; 2916c5b5ef6cSAlex Elder * struct { 2917c5b5ef6cSAlex Elder * le32 tv_sec; 2918c5b5ef6cSAlex Elder * le32 tv_nsec; 
2919c5b5ef6cSAlex Elder * } mtime; 2920c5b5ef6cSAlex Elder */ 2921c5b5ef6cSAlex Elder size = sizeof (__le64) + sizeof (__le32) + sizeof (__le32); 2922c5b5ef6cSAlex Elder page_count = (u32)calc_pages_for(0, size); 2923c5b5ef6cSAlex Elder pages = ceph_alloc_page_vector(page_count, GFP_KERNEL); 2924710214e3SIlya Dryomov if (IS_ERR(pages)) { 2925710214e3SIlya Dryomov ret = PTR_ERR(pages); 2926710214e3SIlya Dryomov goto fail_stat_request; 2927710214e3SIlya Dryomov } 2928c5b5ef6cSAlex Elder 2929710214e3SIlya Dryomov osd_req_op_init(stat_request->osd_req, 0, CEPH_OSD_OP_STAT, 0); 2930710214e3SIlya Dryomov osd_req_op_raw_data_in_pages(stat_request->osd_req, 0, pages, size, 0, 2931710214e3SIlya Dryomov false, false); 2932c5b5ef6cSAlex Elder 2933c5b5ef6cSAlex Elder rbd_obj_request_get(obj_request); 2934c5b5ef6cSAlex Elder stat_request->obj_request = obj_request; 2935c5b5ef6cSAlex Elder stat_request->pages = pages; 2936c5b5ef6cSAlex Elder stat_request->page_count = page_count; 2937c5b5ef6cSAlex Elder stat_request->callback = rbd_img_obj_exists_callback; 2938c5b5ef6cSAlex Elder 2939980917fcSIlya Dryomov rbd_obj_request_submit(stat_request); 2940980917fcSIlya Dryomov return 0; 2941980917fcSIlya Dryomov 2942710214e3SIlya Dryomov fail_stat_request: 2943710214e3SIlya Dryomov rbd_obj_request_put(stat_request); 2944c5b5ef6cSAlex Elder return ret; 2945c5b5ef6cSAlex Elder } 2946c5b5ef6cSAlex Elder 294770d045f6SIlya Dryomov static bool img_obj_request_simple(struct rbd_obj_request *obj_request) 2948b454e36dSAlex Elder { 2949058aa991SIlya Dryomov struct rbd_img_request *img_request = obj_request->img_request; 2950058aa991SIlya Dryomov struct rbd_device *rbd_dev = img_request->rbd_dev; 2951b454e36dSAlex Elder 295270d045f6SIlya Dryomov /* Reads */ 29531c220881SJosh Durgin if (!img_request_write_test(img_request) && 29541c220881SJosh Durgin !img_request_discard_test(img_request)) 295570d045f6SIlya Dryomov return true; 2956b454e36dSAlex Elder 295770d045f6SIlya Dryomov /* Non-layered writes 
*/ 295870d045f6SIlya Dryomov if (!img_request_layered_test(img_request)) 295970d045f6SIlya Dryomov return true; 296070d045f6SIlya Dryomov 296170d045f6SIlya Dryomov /* 296270d045f6SIlya Dryomov * Layered writes outside of the parent overlap range don't 296370d045f6SIlya Dryomov * share any data with the parent. 296470d045f6SIlya Dryomov */ 296570d045f6SIlya Dryomov if (!obj_request_overlaps_parent(obj_request)) 296670d045f6SIlya Dryomov return true; 296770d045f6SIlya Dryomov 296870d045f6SIlya Dryomov /* 2969c622d226SGuangliang Zhao * Entire-object layered writes - we will overwrite whatever 2970c622d226SGuangliang Zhao * parent data there is anyway. 2971c622d226SGuangliang Zhao */ 2972c622d226SGuangliang Zhao if (!obj_request->offset && 2973c622d226SGuangliang Zhao obj_request->length == rbd_obj_bytes(&rbd_dev->header)) 2974c622d226SGuangliang Zhao return true; 2975c622d226SGuangliang Zhao 2976c622d226SGuangliang Zhao /* 297770d045f6SIlya Dryomov * If the object is known to already exist, its parent data has 297870d045f6SIlya Dryomov * already been copied. 
297970d045f6SIlya Dryomov */ 298070d045f6SIlya Dryomov if (obj_request_known_test(obj_request) && 298170d045f6SIlya Dryomov obj_request_exists_test(obj_request)) 298270d045f6SIlya Dryomov return true; 298370d045f6SIlya Dryomov 298470d045f6SIlya Dryomov return false; 298570d045f6SIlya Dryomov } 298670d045f6SIlya Dryomov 298770d045f6SIlya Dryomov static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request) 298870d045f6SIlya Dryomov { 2989058aa991SIlya Dryomov rbd_assert(obj_request_img_data_test(obj_request)); 2990058aa991SIlya Dryomov rbd_assert(obj_request_type_valid(obj_request->type)); 2991058aa991SIlya Dryomov rbd_assert(obj_request->img_request); 2992058aa991SIlya Dryomov 299370d045f6SIlya Dryomov if (img_obj_request_simple(obj_request)) { 2994980917fcSIlya Dryomov rbd_obj_request_submit(obj_request); 2995980917fcSIlya Dryomov return 0; 2996b454e36dSAlex Elder } 2997b454e36dSAlex Elder 2998b454e36dSAlex Elder /* 29993d7efd18SAlex Elder * It's a layered write. The target object might exist but 30003d7efd18SAlex Elder * we may not know that yet. If we know it doesn't exist, 30013d7efd18SAlex Elder * start by reading the data for the full target object from 30023d7efd18SAlex Elder * the parent so we can use it for a copyup to the target. 3003b454e36dSAlex Elder */ 300470d045f6SIlya Dryomov if (obj_request_known_test(obj_request)) 30053d7efd18SAlex Elder return rbd_img_obj_parent_read_full(obj_request); 30063d7efd18SAlex Elder 30073d7efd18SAlex Elder /* We don't know whether the target exists. Go find out. 
*/ 3008b454e36dSAlex Elder 3009b454e36dSAlex Elder return rbd_img_obj_exists_submit(obj_request); 3010b454e36dSAlex Elder } 3011b454e36dSAlex Elder 3012bf0d5f50SAlex Elder static int rbd_img_request_submit(struct rbd_img_request *img_request) 3013bf0d5f50SAlex Elder { 3014bf0d5f50SAlex Elder struct rbd_obj_request *obj_request; 301546faeed4SAlex Elder struct rbd_obj_request *next_obj_request; 3016663ae2ccSIlya Dryomov int ret = 0; 3017bf0d5f50SAlex Elder 301837206ee5SAlex Elder dout("%s: img %p\n", __func__, img_request); 3019bf0d5f50SAlex Elder 3020663ae2ccSIlya Dryomov rbd_img_request_get(img_request); 3021663ae2ccSIlya Dryomov for_each_obj_request_safe(img_request, obj_request, next_obj_request) { 3022b454e36dSAlex Elder ret = rbd_img_obj_request_submit(obj_request); 3023bf0d5f50SAlex Elder if (ret) 3024663ae2ccSIlya Dryomov goto out_put_ireq; 3025bf0d5f50SAlex Elder } 3026bf0d5f50SAlex Elder 3027663ae2ccSIlya Dryomov out_put_ireq: 3028663ae2ccSIlya Dryomov rbd_img_request_put(img_request); 3029663ae2ccSIlya Dryomov return ret; 3030bf0d5f50SAlex Elder } 3031bf0d5f50SAlex Elder 30328b3e1a56SAlex Elder static void rbd_img_parent_read_callback(struct rbd_img_request *img_request) 30338b3e1a56SAlex Elder { 30348b3e1a56SAlex Elder struct rbd_obj_request *obj_request; 3035a9e8ba2cSAlex Elder struct rbd_device *rbd_dev; 3036a9e8ba2cSAlex Elder u64 obj_end; 303702c74fbaSAlex Elder u64 img_xferred; 303802c74fbaSAlex Elder int img_result; 30398b3e1a56SAlex Elder 30408b3e1a56SAlex Elder rbd_assert(img_request_child_test(img_request)); 30418b3e1a56SAlex Elder 304202c74fbaSAlex Elder /* First get what we need from the image request and release it */ 304302c74fbaSAlex Elder 30448b3e1a56SAlex Elder obj_request = img_request->obj_request; 304502c74fbaSAlex Elder img_xferred = img_request->xferred; 304602c74fbaSAlex Elder img_result = img_request->result; 304702c74fbaSAlex Elder rbd_img_request_put(img_request); 304802c74fbaSAlex Elder 304902c74fbaSAlex Elder /* 
305002c74fbaSAlex Elder * If the overlap has become 0 (most likely because the 305102c74fbaSAlex Elder * image has been flattened) we need to re-submit the 305202c74fbaSAlex Elder * original request. 305302c74fbaSAlex Elder */ 3054a9e8ba2cSAlex Elder rbd_assert(obj_request); 3055a9e8ba2cSAlex Elder rbd_assert(obj_request->img_request); 305602c74fbaSAlex Elder rbd_dev = obj_request->img_request->rbd_dev; 305702c74fbaSAlex Elder if (!rbd_dev->parent_overlap) { 3058980917fcSIlya Dryomov rbd_obj_request_submit(obj_request); 305902c74fbaSAlex Elder return; 306002c74fbaSAlex Elder } 306102c74fbaSAlex Elder 306202c74fbaSAlex Elder obj_request->result = img_result; 3063a9e8ba2cSAlex Elder if (obj_request->result) 3064a9e8ba2cSAlex Elder goto out; 3065a9e8ba2cSAlex Elder 3066a9e8ba2cSAlex Elder /* 3067a9e8ba2cSAlex Elder * We need to zero anything beyond the parent overlap 3068a9e8ba2cSAlex Elder * boundary. Since rbd_img_obj_request_read_callback() 3069a9e8ba2cSAlex Elder * will zero anything beyond the end of a short read, an 3070a9e8ba2cSAlex Elder * easy way to do this is to pretend the data from the 3071a9e8ba2cSAlex Elder * parent came up short--ending at the overlap boundary. 
3072a9e8ba2cSAlex Elder */ 3073a9e8ba2cSAlex Elder rbd_assert(obj_request->img_offset < U64_MAX - obj_request->length); 3074a9e8ba2cSAlex Elder obj_end = obj_request->img_offset + obj_request->length; 3075a9e8ba2cSAlex Elder if (obj_end > rbd_dev->parent_overlap) { 3076a9e8ba2cSAlex Elder u64 xferred = 0; 3077a9e8ba2cSAlex Elder 3078a9e8ba2cSAlex Elder if (obj_request->img_offset < rbd_dev->parent_overlap) 3079a9e8ba2cSAlex Elder xferred = rbd_dev->parent_overlap - 3080a9e8ba2cSAlex Elder obj_request->img_offset; 3081a9e8ba2cSAlex Elder 308202c74fbaSAlex Elder obj_request->xferred = min(img_xferred, xferred); 3083a9e8ba2cSAlex Elder } else { 308402c74fbaSAlex Elder obj_request->xferred = img_xferred; 3085a9e8ba2cSAlex Elder } 3086a9e8ba2cSAlex Elder out: 30878b3e1a56SAlex Elder rbd_img_obj_request_read_callback(obj_request); 30888b3e1a56SAlex Elder rbd_obj_request_complete(obj_request); 30898b3e1a56SAlex Elder } 30908b3e1a56SAlex Elder 30918b3e1a56SAlex Elder static void rbd_img_parent_read(struct rbd_obj_request *obj_request) 30928b3e1a56SAlex Elder { 30938b3e1a56SAlex Elder struct rbd_img_request *img_request; 30948b3e1a56SAlex Elder int result; 30958b3e1a56SAlex Elder 30968b3e1a56SAlex Elder rbd_assert(obj_request_img_data_test(obj_request)); 30978b3e1a56SAlex Elder rbd_assert(obj_request->img_request != NULL); 30988b3e1a56SAlex Elder rbd_assert(obj_request->result == (s32) -ENOENT); 30995b2ab72dSAlex Elder rbd_assert(obj_request_type_valid(obj_request->type)); 31008b3e1a56SAlex Elder 31018b3e1a56SAlex Elder /* rbd_read_finish(obj_request, obj_request->length); */ 3102e93f3152SAlex Elder img_request = rbd_parent_request_create(obj_request, 31038b3e1a56SAlex Elder obj_request->img_offset, 3104e93f3152SAlex Elder obj_request->length); 31058b3e1a56SAlex Elder result = -ENOMEM; 31068b3e1a56SAlex Elder if (!img_request) 31078b3e1a56SAlex Elder goto out_err; 31088b3e1a56SAlex Elder 31095b2ab72dSAlex Elder if (obj_request->type == OBJ_REQUEST_BIO) 3110f1a4739fSAlex 
Elder result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO, 3111f1a4739fSAlex Elder obj_request->bio_list); 31125b2ab72dSAlex Elder else 31135b2ab72dSAlex Elder result = rbd_img_request_fill(img_request, OBJ_REQUEST_PAGES, 31145b2ab72dSAlex Elder obj_request->pages); 31158b3e1a56SAlex Elder if (result) 31168b3e1a56SAlex Elder goto out_err; 31178b3e1a56SAlex Elder 31188b3e1a56SAlex Elder img_request->callback = rbd_img_parent_read_callback; 31198b3e1a56SAlex Elder result = rbd_img_request_submit(img_request); 31208b3e1a56SAlex Elder if (result) 31218b3e1a56SAlex Elder goto out_err; 31228b3e1a56SAlex Elder 31238b3e1a56SAlex Elder return; 31248b3e1a56SAlex Elder out_err: 31258b3e1a56SAlex Elder if (img_request) 31268b3e1a56SAlex Elder rbd_img_request_put(img_request); 31278b3e1a56SAlex Elder obj_request->result = result; 31288b3e1a56SAlex Elder obj_request->xferred = 0; 31298b3e1a56SAlex Elder obj_request_done_set(obj_request); 31308b3e1a56SAlex Elder } 31318b3e1a56SAlex Elder 3132ed95b21aSIlya Dryomov static const struct rbd_client_id rbd_empty_cid; 3133ed95b21aSIlya Dryomov 3134ed95b21aSIlya Dryomov static bool rbd_cid_equal(const struct rbd_client_id *lhs, 3135ed95b21aSIlya Dryomov const struct rbd_client_id *rhs) 3136ed95b21aSIlya Dryomov { 3137ed95b21aSIlya Dryomov return lhs->gid == rhs->gid && lhs->handle == rhs->handle; 3138ed95b21aSIlya Dryomov } 3139ed95b21aSIlya Dryomov 3140ed95b21aSIlya Dryomov static struct rbd_client_id rbd_get_cid(struct rbd_device *rbd_dev) 3141ed95b21aSIlya Dryomov { 3142ed95b21aSIlya Dryomov struct rbd_client_id cid; 3143ed95b21aSIlya Dryomov 3144ed95b21aSIlya Dryomov mutex_lock(&rbd_dev->watch_mutex); 3145ed95b21aSIlya Dryomov cid.gid = ceph_client_gid(rbd_dev->rbd_client->client); 3146ed95b21aSIlya Dryomov cid.handle = rbd_dev->watch_cookie; 3147ed95b21aSIlya Dryomov mutex_unlock(&rbd_dev->watch_mutex); 3148ed95b21aSIlya Dryomov return cid; 3149ed95b21aSIlya Dryomov } 3150ed95b21aSIlya Dryomov 3151ed95b21aSIlya Dryomov /* 
3152ed95b21aSIlya Dryomov * lock_rwsem must be held for write 3153ed95b21aSIlya Dryomov */ 3154ed95b21aSIlya Dryomov static void rbd_set_owner_cid(struct rbd_device *rbd_dev, 3155ed95b21aSIlya Dryomov const struct rbd_client_id *cid) 3156ed95b21aSIlya Dryomov { 3157ed95b21aSIlya Dryomov dout("%s rbd_dev %p %llu-%llu -> %llu-%llu\n", __func__, rbd_dev, 3158ed95b21aSIlya Dryomov rbd_dev->owner_cid.gid, rbd_dev->owner_cid.handle, 3159ed95b21aSIlya Dryomov cid->gid, cid->handle); 3160ed95b21aSIlya Dryomov rbd_dev->owner_cid = *cid; /* struct */ 3161ed95b21aSIlya Dryomov } 3162ed95b21aSIlya Dryomov 3163ed95b21aSIlya Dryomov static void format_lock_cookie(struct rbd_device *rbd_dev, char *buf) 3164ed95b21aSIlya Dryomov { 3165ed95b21aSIlya Dryomov mutex_lock(&rbd_dev->watch_mutex); 3166ed95b21aSIlya Dryomov sprintf(buf, "%s %llu", RBD_LOCK_COOKIE_PREFIX, rbd_dev->watch_cookie); 3167ed95b21aSIlya Dryomov mutex_unlock(&rbd_dev->watch_mutex); 3168ed95b21aSIlya Dryomov } 3169ed95b21aSIlya Dryomov 3170ed95b21aSIlya Dryomov /* 3171ed95b21aSIlya Dryomov * lock_rwsem must be held for write 3172ed95b21aSIlya Dryomov */ 3173ed95b21aSIlya Dryomov static int rbd_lock(struct rbd_device *rbd_dev) 3174ed95b21aSIlya Dryomov { 3175ed95b21aSIlya Dryomov struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 3176ed95b21aSIlya Dryomov struct rbd_client_id cid = rbd_get_cid(rbd_dev); 3177ed95b21aSIlya Dryomov char cookie[32]; 3178ed95b21aSIlya Dryomov int ret; 3179ed95b21aSIlya Dryomov 3180ed95b21aSIlya Dryomov WARN_ON(__rbd_is_lock_owner(rbd_dev)); 3181ed95b21aSIlya Dryomov 3182ed95b21aSIlya Dryomov format_lock_cookie(rbd_dev, cookie); 3183ed95b21aSIlya Dryomov ret = ceph_cls_lock(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc, 3184ed95b21aSIlya Dryomov RBD_LOCK_NAME, CEPH_CLS_LOCK_EXCLUSIVE, cookie, 3185ed95b21aSIlya Dryomov RBD_LOCK_TAG, "", 0); 3186ed95b21aSIlya Dryomov if (ret) 3187ed95b21aSIlya Dryomov return ret; 3188ed95b21aSIlya Dryomov 3189ed95b21aSIlya Dryomov 
rbd_dev->lock_state = RBD_LOCK_STATE_LOCKED; 3190ed95b21aSIlya Dryomov rbd_set_owner_cid(rbd_dev, &cid); 3191ed95b21aSIlya Dryomov queue_work(rbd_dev->task_wq, &rbd_dev->acquired_lock_work); 3192ed95b21aSIlya Dryomov return 0; 3193ed95b21aSIlya Dryomov } 3194ed95b21aSIlya Dryomov 3195ed95b21aSIlya Dryomov /* 3196ed95b21aSIlya Dryomov * lock_rwsem must be held for write 3197ed95b21aSIlya Dryomov */ 3198ed95b21aSIlya Dryomov static int rbd_unlock(struct rbd_device *rbd_dev) 3199ed95b21aSIlya Dryomov { 3200ed95b21aSIlya Dryomov struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 3201ed95b21aSIlya Dryomov char cookie[32]; 3202ed95b21aSIlya Dryomov int ret; 3203ed95b21aSIlya Dryomov 3204ed95b21aSIlya Dryomov WARN_ON(!__rbd_is_lock_owner(rbd_dev)); 3205ed95b21aSIlya Dryomov 3206ed95b21aSIlya Dryomov rbd_dev->lock_state = RBD_LOCK_STATE_UNLOCKED; 3207ed95b21aSIlya Dryomov 3208ed95b21aSIlya Dryomov format_lock_cookie(rbd_dev, cookie); 3209ed95b21aSIlya Dryomov ret = ceph_cls_unlock(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc, 3210ed95b21aSIlya Dryomov RBD_LOCK_NAME, cookie); 3211ed95b21aSIlya Dryomov if (ret && ret != -ENOENT) { 3212ed95b21aSIlya Dryomov rbd_warn(rbd_dev, "cls_unlock failed: %d", ret); 3213ed95b21aSIlya Dryomov return ret; 3214ed95b21aSIlya Dryomov } 3215ed95b21aSIlya Dryomov 3216ed95b21aSIlya Dryomov rbd_set_owner_cid(rbd_dev, &rbd_empty_cid); 3217ed95b21aSIlya Dryomov queue_work(rbd_dev->task_wq, &rbd_dev->released_lock_work); 3218ed95b21aSIlya Dryomov return 0; 3219ed95b21aSIlya Dryomov } 3220ed95b21aSIlya Dryomov 3221ed95b21aSIlya Dryomov static int __rbd_notify_op_lock(struct rbd_device *rbd_dev, 3222ed95b21aSIlya Dryomov enum rbd_notify_op notify_op, 3223ed95b21aSIlya Dryomov struct page ***preply_pages, 3224ed95b21aSIlya Dryomov size_t *preply_len) 3225ed95b21aSIlya Dryomov { 3226ed95b21aSIlya Dryomov struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 3227ed95b21aSIlya Dryomov struct rbd_client_id cid = 
rbd_get_cid(rbd_dev); 3228ed95b21aSIlya Dryomov int buf_size = 4 + 8 + 8 + CEPH_ENCODING_START_BLK_LEN; 3229ed95b21aSIlya Dryomov char buf[buf_size]; 3230ed95b21aSIlya Dryomov void *p = buf; 3231ed95b21aSIlya Dryomov 3232ed95b21aSIlya Dryomov dout("%s rbd_dev %p notify_op %d\n", __func__, rbd_dev, notify_op); 3233ed95b21aSIlya Dryomov 3234ed95b21aSIlya Dryomov /* encode *LockPayload NotifyMessage (op + ClientId) */ 3235ed95b21aSIlya Dryomov ceph_start_encoding(&p, 2, 1, buf_size - CEPH_ENCODING_START_BLK_LEN); 3236ed95b21aSIlya Dryomov ceph_encode_32(&p, notify_op); 3237ed95b21aSIlya Dryomov ceph_encode_64(&p, cid.gid); 3238ed95b21aSIlya Dryomov ceph_encode_64(&p, cid.handle); 3239ed95b21aSIlya Dryomov 3240ed95b21aSIlya Dryomov return ceph_osdc_notify(osdc, &rbd_dev->header_oid, 3241ed95b21aSIlya Dryomov &rbd_dev->header_oloc, buf, buf_size, 3242ed95b21aSIlya Dryomov RBD_NOTIFY_TIMEOUT, preply_pages, preply_len); 3243ed95b21aSIlya Dryomov } 3244ed95b21aSIlya Dryomov 3245ed95b21aSIlya Dryomov static void rbd_notify_op_lock(struct rbd_device *rbd_dev, 3246ed95b21aSIlya Dryomov enum rbd_notify_op notify_op) 3247ed95b21aSIlya Dryomov { 3248ed95b21aSIlya Dryomov struct page **reply_pages; 3249ed95b21aSIlya Dryomov size_t reply_len; 3250ed95b21aSIlya Dryomov 3251ed95b21aSIlya Dryomov __rbd_notify_op_lock(rbd_dev, notify_op, &reply_pages, &reply_len); 3252ed95b21aSIlya Dryomov ceph_release_page_vector(reply_pages, calc_pages_for(0, reply_len)); 3253ed95b21aSIlya Dryomov } 3254ed95b21aSIlya Dryomov 3255ed95b21aSIlya Dryomov static void rbd_notify_acquired_lock(struct work_struct *work) 3256ed95b21aSIlya Dryomov { 3257ed95b21aSIlya Dryomov struct rbd_device *rbd_dev = container_of(work, struct rbd_device, 3258ed95b21aSIlya Dryomov acquired_lock_work); 3259ed95b21aSIlya Dryomov 3260ed95b21aSIlya Dryomov rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_ACQUIRED_LOCK); 3261ed95b21aSIlya Dryomov } 3262ed95b21aSIlya Dryomov 3263ed95b21aSIlya Dryomov static void 
rbd_notify_released_lock(struct work_struct *work) 3264ed95b21aSIlya Dryomov { 3265ed95b21aSIlya Dryomov struct rbd_device *rbd_dev = container_of(work, struct rbd_device, 3266ed95b21aSIlya Dryomov released_lock_work); 3267ed95b21aSIlya Dryomov 3268ed95b21aSIlya Dryomov rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_RELEASED_LOCK); 3269ed95b21aSIlya Dryomov } 3270ed95b21aSIlya Dryomov 3271ed95b21aSIlya Dryomov static int rbd_request_lock(struct rbd_device *rbd_dev) 3272ed95b21aSIlya Dryomov { 3273ed95b21aSIlya Dryomov struct page **reply_pages; 3274ed95b21aSIlya Dryomov size_t reply_len; 3275ed95b21aSIlya Dryomov bool lock_owner_responded = false; 3276ed95b21aSIlya Dryomov int ret; 3277ed95b21aSIlya Dryomov 3278ed95b21aSIlya Dryomov dout("%s rbd_dev %p\n", __func__, rbd_dev); 3279ed95b21aSIlya Dryomov 3280ed95b21aSIlya Dryomov ret = __rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_REQUEST_LOCK, 3281ed95b21aSIlya Dryomov &reply_pages, &reply_len); 3282ed95b21aSIlya Dryomov if (ret && ret != -ETIMEDOUT) { 3283ed95b21aSIlya Dryomov rbd_warn(rbd_dev, "failed to request lock: %d", ret); 3284ed95b21aSIlya Dryomov goto out; 3285ed95b21aSIlya Dryomov } 3286ed95b21aSIlya Dryomov 3287ed95b21aSIlya Dryomov if (reply_len > 0 && reply_len <= PAGE_SIZE) { 3288ed95b21aSIlya Dryomov void *p = page_address(reply_pages[0]); 3289ed95b21aSIlya Dryomov void *const end = p + reply_len; 3290ed95b21aSIlya Dryomov u32 n; 3291ed95b21aSIlya Dryomov 3292ed95b21aSIlya Dryomov ceph_decode_32_safe(&p, end, n, e_inval); /* num_acks */ 3293ed95b21aSIlya Dryomov while (n--) { 3294ed95b21aSIlya Dryomov u8 struct_v; 3295ed95b21aSIlya Dryomov u32 len; 3296ed95b21aSIlya Dryomov 3297ed95b21aSIlya Dryomov ceph_decode_need(&p, end, 8 + 8, e_inval); 3298ed95b21aSIlya Dryomov p += 8 + 8; /* skip gid and cookie */ 3299ed95b21aSIlya Dryomov 3300ed95b21aSIlya Dryomov ceph_decode_32_safe(&p, end, len, e_inval); 3301ed95b21aSIlya Dryomov if (!len) 3302ed95b21aSIlya Dryomov continue; 3303ed95b21aSIlya Dryomov 
3304ed95b21aSIlya Dryomov if (lock_owner_responded) { 3305ed95b21aSIlya Dryomov rbd_warn(rbd_dev, 3306ed95b21aSIlya Dryomov "duplicate lock owners detected"); 3307ed95b21aSIlya Dryomov ret = -EIO; 3308ed95b21aSIlya Dryomov goto out; 3309ed95b21aSIlya Dryomov } 3310ed95b21aSIlya Dryomov 3311ed95b21aSIlya Dryomov lock_owner_responded = true; 3312ed95b21aSIlya Dryomov ret = ceph_start_decoding(&p, end, 1, "ResponseMessage", 3313ed95b21aSIlya Dryomov &struct_v, &len); 3314ed95b21aSIlya Dryomov if (ret) { 3315ed95b21aSIlya Dryomov rbd_warn(rbd_dev, 3316ed95b21aSIlya Dryomov "failed to decode ResponseMessage: %d", 3317ed95b21aSIlya Dryomov ret); 3318ed95b21aSIlya Dryomov goto e_inval; 3319ed95b21aSIlya Dryomov } 3320ed95b21aSIlya Dryomov 3321ed95b21aSIlya Dryomov ret = ceph_decode_32(&p); 3322ed95b21aSIlya Dryomov } 3323ed95b21aSIlya Dryomov } 3324ed95b21aSIlya Dryomov 3325ed95b21aSIlya Dryomov if (!lock_owner_responded) { 3326ed95b21aSIlya Dryomov rbd_warn(rbd_dev, "no lock owners detected"); 3327ed95b21aSIlya Dryomov ret = -ETIMEDOUT; 3328ed95b21aSIlya Dryomov } 3329ed95b21aSIlya Dryomov 3330ed95b21aSIlya Dryomov out: 3331ed95b21aSIlya Dryomov ceph_release_page_vector(reply_pages, calc_pages_for(0, reply_len)); 3332ed95b21aSIlya Dryomov return ret; 3333ed95b21aSIlya Dryomov 3334ed95b21aSIlya Dryomov e_inval: 3335ed95b21aSIlya Dryomov ret = -EINVAL; 3336ed95b21aSIlya Dryomov goto out; 3337ed95b21aSIlya Dryomov } 3338ed95b21aSIlya Dryomov 3339ed95b21aSIlya Dryomov static void wake_requests(struct rbd_device *rbd_dev, bool wake_all) 3340ed95b21aSIlya Dryomov { 3341ed95b21aSIlya Dryomov dout("%s rbd_dev %p wake_all %d\n", __func__, rbd_dev, wake_all); 3342ed95b21aSIlya Dryomov 3343ed95b21aSIlya Dryomov cancel_delayed_work(&rbd_dev->lock_dwork); 3344ed95b21aSIlya Dryomov if (wake_all) 3345ed95b21aSIlya Dryomov wake_up_all(&rbd_dev->lock_waitq); 3346ed95b21aSIlya Dryomov else 3347ed95b21aSIlya Dryomov wake_up(&rbd_dev->lock_waitq); 3348ed95b21aSIlya Dryomov } 
3349ed95b21aSIlya Dryomov 3350ed95b21aSIlya Dryomov static int get_lock_owner_info(struct rbd_device *rbd_dev, 3351ed95b21aSIlya Dryomov struct ceph_locker **lockers, u32 *num_lockers) 3352ed95b21aSIlya Dryomov { 3353ed95b21aSIlya Dryomov struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 3354ed95b21aSIlya Dryomov u8 lock_type; 3355ed95b21aSIlya Dryomov char *lock_tag; 3356ed95b21aSIlya Dryomov int ret; 3357ed95b21aSIlya Dryomov 3358ed95b21aSIlya Dryomov dout("%s rbd_dev %p\n", __func__, rbd_dev); 3359ed95b21aSIlya Dryomov 3360ed95b21aSIlya Dryomov ret = ceph_cls_lock_info(osdc, &rbd_dev->header_oid, 3361ed95b21aSIlya Dryomov &rbd_dev->header_oloc, RBD_LOCK_NAME, 3362ed95b21aSIlya Dryomov &lock_type, &lock_tag, lockers, num_lockers); 3363ed95b21aSIlya Dryomov if (ret) 3364ed95b21aSIlya Dryomov return ret; 3365ed95b21aSIlya Dryomov 3366ed95b21aSIlya Dryomov if (*num_lockers == 0) { 3367ed95b21aSIlya Dryomov dout("%s rbd_dev %p no lockers detected\n", __func__, rbd_dev); 3368ed95b21aSIlya Dryomov goto out; 3369ed95b21aSIlya Dryomov } 3370ed95b21aSIlya Dryomov 3371ed95b21aSIlya Dryomov if (strcmp(lock_tag, RBD_LOCK_TAG)) { 3372ed95b21aSIlya Dryomov rbd_warn(rbd_dev, "locked by external mechanism, tag %s", 3373ed95b21aSIlya Dryomov lock_tag); 3374ed95b21aSIlya Dryomov ret = -EBUSY; 3375ed95b21aSIlya Dryomov goto out; 3376ed95b21aSIlya Dryomov } 3377ed95b21aSIlya Dryomov 3378ed95b21aSIlya Dryomov if (lock_type == CEPH_CLS_LOCK_SHARED) { 3379ed95b21aSIlya Dryomov rbd_warn(rbd_dev, "shared lock type detected"); 3380ed95b21aSIlya Dryomov ret = -EBUSY; 3381ed95b21aSIlya Dryomov goto out; 3382ed95b21aSIlya Dryomov } 3383ed95b21aSIlya Dryomov 3384ed95b21aSIlya Dryomov if (strncmp((*lockers)[0].id.cookie, RBD_LOCK_COOKIE_PREFIX, 3385ed95b21aSIlya Dryomov strlen(RBD_LOCK_COOKIE_PREFIX))) { 3386ed95b21aSIlya Dryomov rbd_warn(rbd_dev, "locked by external mechanism, cookie %s", 3387ed95b21aSIlya Dryomov (*lockers)[0].id.cookie); 3388ed95b21aSIlya Dryomov ret = 
-EBUSY; 3389ed95b21aSIlya Dryomov goto out; 3390ed95b21aSIlya Dryomov } 3391ed95b21aSIlya Dryomov 3392ed95b21aSIlya Dryomov out: 3393ed95b21aSIlya Dryomov kfree(lock_tag); 3394ed95b21aSIlya Dryomov return ret; 3395ed95b21aSIlya Dryomov } 3396ed95b21aSIlya Dryomov 3397ed95b21aSIlya Dryomov static int find_watcher(struct rbd_device *rbd_dev, 3398ed95b21aSIlya Dryomov const struct ceph_locker *locker) 3399ed95b21aSIlya Dryomov { 3400ed95b21aSIlya Dryomov struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 3401ed95b21aSIlya Dryomov struct ceph_watch_item *watchers; 3402ed95b21aSIlya Dryomov u32 num_watchers; 3403ed95b21aSIlya Dryomov u64 cookie; 3404ed95b21aSIlya Dryomov int i; 3405ed95b21aSIlya Dryomov int ret; 3406ed95b21aSIlya Dryomov 3407ed95b21aSIlya Dryomov ret = ceph_osdc_list_watchers(osdc, &rbd_dev->header_oid, 3408ed95b21aSIlya Dryomov &rbd_dev->header_oloc, &watchers, 3409ed95b21aSIlya Dryomov &num_watchers); 3410ed95b21aSIlya Dryomov if (ret) 3411ed95b21aSIlya Dryomov return ret; 3412ed95b21aSIlya Dryomov 3413ed95b21aSIlya Dryomov sscanf(locker->id.cookie, RBD_LOCK_COOKIE_PREFIX " %llu", &cookie); 3414ed95b21aSIlya Dryomov for (i = 0; i < num_watchers; i++) { 3415ed95b21aSIlya Dryomov if (!memcmp(&watchers[i].addr, &locker->info.addr, 3416ed95b21aSIlya Dryomov sizeof(locker->info.addr)) && 3417ed95b21aSIlya Dryomov watchers[i].cookie == cookie) { 3418ed95b21aSIlya Dryomov struct rbd_client_id cid = { 3419ed95b21aSIlya Dryomov .gid = le64_to_cpu(watchers[i].name.num), 3420ed95b21aSIlya Dryomov .handle = cookie, 3421ed95b21aSIlya Dryomov }; 3422ed95b21aSIlya Dryomov 3423ed95b21aSIlya Dryomov dout("%s rbd_dev %p found cid %llu-%llu\n", __func__, 3424ed95b21aSIlya Dryomov rbd_dev, cid.gid, cid.handle); 3425ed95b21aSIlya Dryomov rbd_set_owner_cid(rbd_dev, &cid); 3426ed95b21aSIlya Dryomov ret = 1; 3427ed95b21aSIlya Dryomov goto out; 3428ed95b21aSIlya Dryomov } 3429ed95b21aSIlya Dryomov } 3430ed95b21aSIlya Dryomov 3431ed95b21aSIlya Dryomov dout("%s 
rbd_dev %p no watchers\n", __func__, rbd_dev); 3432ed95b21aSIlya Dryomov ret = 0; 3433ed95b21aSIlya Dryomov out: 3434ed95b21aSIlya Dryomov kfree(watchers); 3435ed95b21aSIlya Dryomov return ret; 3436ed95b21aSIlya Dryomov } 3437ed95b21aSIlya Dryomov 3438ed95b21aSIlya Dryomov /* 3439ed95b21aSIlya Dryomov * lock_rwsem must be held for write 3440ed95b21aSIlya Dryomov */ 3441ed95b21aSIlya Dryomov static int rbd_try_lock(struct rbd_device *rbd_dev) 3442ed95b21aSIlya Dryomov { 3443ed95b21aSIlya Dryomov struct ceph_client *client = rbd_dev->rbd_client->client; 3444ed95b21aSIlya Dryomov struct ceph_locker *lockers; 3445ed95b21aSIlya Dryomov u32 num_lockers; 3446ed95b21aSIlya Dryomov int ret; 3447ed95b21aSIlya Dryomov 3448ed95b21aSIlya Dryomov for (;;) { 3449ed95b21aSIlya Dryomov ret = rbd_lock(rbd_dev); 3450ed95b21aSIlya Dryomov if (ret != -EBUSY) 3451ed95b21aSIlya Dryomov return ret; 3452ed95b21aSIlya Dryomov 3453ed95b21aSIlya Dryomov /* determine if the current lock holder is still alive */ 3454ed95b21aSIlya Dryomov ret = get_lock_owner_info(rbd_dev, &lockers, &num_lockers); 3455ed95b21aSIlya Dryomov if (ret) 3456ed95b21aSIlya Dryomov return ret; 3457ed95b21aSIlya Dryomov 3458ed95b21aSIlya Dryomov if (num_lockers == 0) 3459ed95b21aSIlya Dryomov goto again; 3460ed95b21aSIlya Dryomov 3461ed95b21aSIlya Dryomov ret = find_watcher(rbd_dev, lockers); 3462ed95b21aSIlya Dryomov if (ret) { 3463ed95b21aSIlya Dryomov if (ret > 0) 3464ed95b21aSIlya Dryomov ret = 0; /* have to request lock */ 3465ed95b21aSIlya Dryomov goto out; 3466ed95b21aSIlya Dryomov } 3467ed95b21aSIlya Dryomov 3468ed95b21aSIlya Dryomov rbd_warn(rbd_dev, "%s%llu seems dead, breaking lock", 3469ed95b21aSIlya Dryomov ENTITY_NAME(lockers[0].id.name)); 3470ed95b21aSIlya Dryomov 3471ed95b21aSIlya Dryomov ret = ceph_monc_blacklist_add(&client->monc, 3472ed95b21aSIlya Dryomov &lockers[0].info.addr); 3473ed95b21aSIlya Dryomov if (ret) { 3474ed95b21aSIlya Dryomov rbd_warn(rbd_dev, "blacklist of %s%llu failed: %d", 
3475ed95b21aSIlya Dryomov ENTITY_NAME(lockers[0].id.name), ret); 3476ed95b21aSIlya Dryomov goto out; 3477ed95b21aSIlya Dryomov } 3478ed95b21aSIlya Dryomov 3479ed95b21aSIlya Dryomov ret = ceph_cls_break_lock(&client->osdc, &rbd_dev->header_oid, 3480ed95b21aSIlya Dryomov &rbd_dev->header_oloc, RBD_LOCK_NAME, 3481ed95b21aSIlya Dryomov lockers[0].id.cookie, 3482ed95b21aSIlya Dryomov &lockers[0].id.name); 3483ed95b21aSIlya Dryomov if (ret && ret != -ENOENT) 3484ed95b21aSIlya Dryomov goto out; 3485ed95b21aSIlya Dryomov 3486ed95b21aSIlya Dryomov again: 3487ed95b21aSIlya Dryomov ceph_free_lockers(lockers, num_lockers); 3488ed95b21aSIlya Dryomov } 3489ed95b21aSIlya Dryomov 3490ed95b21aSIlya Dryomov out: 3491ed95b21aSIlya Dryomov ceph_free_lockers(lockers, num_lockers); 3492ed95b21aSIlya Dryomov return ret; 3493ed95b21aSIlya Dryomov } 3494ed95b21aSIlya Dryomov 3495ed95b21aSIlya Dryomov /* 3496ed95b21aSIlya Dryomov * ret is set only if lock_state is RBD_LOCK_STATE_UNLOCKED 3497ed95b21aSIlya Dryomov */ 3498ed95b21aSIlya Dryomov static enum rbd_lock_state rbd_try_acquire_lock(struct rbd_device *rbd_dev, 3499ed95b21aSIlya Dryomov int *pret) 3500ed95b21aSIlya Dryomov { 3501ed95b21aSIlya Dryomov enum rbd_lock_state lock_state; 3502ed95b21aSIlya Dryomov 3503ed95b21aSIlya Dryomov down_read(&rbd_dev->lock_rwsem); 3504ed95b21aSIlya Dryomov dout("%s rbd_dev %p read lock_state %d\n", __func__, rbd_dev, 3505ed95b21aSIlya Dryomov rbd_dev->lock_state); 3506ed95b21aSIlya Dryomov if (__rbd_is_lock_owner(rbd_dev)) { 3507ed95b21aSIlya Dryomov lock_state = rbd_dev->lock_state; 3508ed95b21aSIlya Dryomov up_read(&rbd_dev->lock_rwsem); 3509ed95b21aSIlya Dryomov return lock_state; 3510ed95b21aSIlya Dryomov } 3511ed95b21aSIlya Dryomov 3512ed95b21aSIlya Dryomov up_read(&rbd_dev->lock_rwsem); 3513ed95b21aSIlya Dryomov down_write(&rbd_dev->lock_rwsem); 3514ed95b21aSIlya Dryomov dout("%s rbd_dev %p write lock_state %d\n", __func__, rbd_dev, 3515ed95b21aSIlya Dryomov rbd_dev->lock_state); 
3516ed95b21aSIlya Dryomov if (!__rbd_is_lock_owner(rbd_dev)) { 3517ed95b21aSIlya Dryomov *pret = rbd_try_lock(rbd_dev); 3518ed95b21aSIlya Dryomov if (*pret) 3519ed95b21aSIlya Dryomov rbd_warn(rbd_dev, "failed to acquire lock: %d", *pret); 3520ed95b21aSIlya Dryomov } 3521ed95b21aSIlya Dryomov 3522ed95b21aSIlya Dryomov lock_state = rbd_dev->lock_state; 3523ed95b21aSIlya Dryomov up_write(&rbd_dev->lock_rwsem); 3524ed95b21aSIlya Dryomov return lock_state; 3525ed95b21aSIlya Dryomov } 3526ed95b21aSIlya Dryomov 3527ed95b21aSIlya Dryomov static void rbd_acquire_lock(struct work_struct *work) 3528ed95b21aSIlya Dryomov { 3529ed95b21aSIlya Dryomov struct rbd_device *rbd_dev = container_of(to_delayed_work(work), 3530ed95b21aSIlya Dryomov struct rbd_device, lock_dwork); 3531ed95b21aSIlya Dryomov enum rbd_lock_state lock_state; 3532ed95b21aSIlya Dryomov int ret; 3533ed95b21aSIlya Dryomov 3534ed95b21aSIlya Dryomov dout("%s rbd_dev %p\n", __func__, rbd_dev); 3535ed95b21aSIlya Dryomov again: 3536ed95b21aSIlya Dryomov lock_state = rbd_try_acquire_lock(rbd_dev, &ret); 3537ed95b21aSIlya Dryomov if (lock_state != RBD_LOCK_STATE_UNLOCKED || ret == -EBLACKLISTED) { 3538ed95b21aSIlya Dryomov if (lock_state == RBD_LOCK_STATE_LOCKED) 3539ed95b21aSIlya Dryomov wake_requests(rbd_dev, true); 3540ed95b21aSIlya Dryomov dout("%s rbd_dev %p lock_state %d ret %d - done\n", __func__, 3541ed95b21aSIlya Dryomov rbd_dev, lock_state, ret); 3542ed95b21aSIlya Dryomov return; 3543ed95b21aSIlya Dryomov } 3544ed95b21aSIlya Dryomov 3545ed95b21aSIlya Dryomov ret = rbd_request_lock(rbd_dev); 3546ed95b21aSIlya Dryomov if (ret == -ETIMEDOUT) { 3547ed95b21aSIlya Dryomov goto again; /* treat this as a dead client */ 3548ed95b21aSIlya Dryomov } else if (ret < 0) { 3549ed95b21aSIlya Dryomov rbd_warn(rbd_dev, "error requesting lock: %d", ret); 3550ed95b21aSIlya Dryomov mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 3551ed95b21aSIlya Dryomov RBD_RETRY_DELAY); 3552ed95b21aSIlya Dryomov } else { 
3553ed95b21aSIlya Dryomov /* 3554ed95b21aSIlya Dryomov * lock owner acked, but resend if we don't see them 3555ed95b21aSIlya Dryomov * release the lock 3556ed95b21aSIlya Dryomov */ 3557ed95b21aSIlya Dryomov dout("%s rbd_dev %p requeueing lock_dwork\n", __func__, 3558ed95b21aSIlya Dryomov rbd_dev); 3559ed95b21aSIlya Dryomov mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 3560ed95b21aSIlya Dryomov msecs_to_jiffies(2 * RBD_NOTIFY_TIMEOUT * MSEC_PER_SEC)); 3561ed95b21aSIlya Dryomov } 3562ed95b21aSIlya Dryomov } 3563ed95b21aSIlya Dryomov 3564ed95b21aSIlya Dryomov /* 3565ed95b21aSIlya Dryomov * lock_rwsem must be held for write 3566ed95b21aSIlya Dryomov */ 3567ed95b21aSIlya Dryomov static bool rbd_release_lock(struct rbd_device *rbd_dev) 3568ed95b21aSIlya Dryomov { 3569ed95b21aSIlya Dryomov dout("%s rbd_dev %p read lock_state %d\n", __func__, rbd_dev, 3570ed95b21aSIlya Dryomov rbd_dev->lock_state); 3571ed95b21aSIlya Dryomov if (rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED) 3572ed95b21aSIlya Dryomov return false; 3573ed95b21aSIlya Dryomov 3574ed95b21aSIlya Dryomov rbd_dev->lock_state = RBD_LOCK_STATE_RELEASING; 3575ed95b21aSIlya Dryomov downgrade_write(&rbd_dev->lock_rwsem); 3576ed95b21aSIlya Dryomov /* 3577ed95b21aSIlya Dryomov * Ensure that all in-flight IO is flushed. 3578ed95b21aSIlya Dryomov * 3579ed95b21aSIlya Dryomov * FIXME: ceph_osdc_sync() flushes the entire OSD client, which 3580ed95b21aSIlya Dryomov * may be shared with other devices. 
3581ed95b21aSIlya Dryomov */ 3582ed95b21aSIlya Dryomov ceph_osdc_sync(&rbd_dev->rbd_client->client->osdc); 3583ed95b21aSIlya Dryomov up_read(&rbd_dev->lock_rwsem); 3584ed95b21aSIlya Dryomov 3585ed95b21aSIlya Dryomov down_write(&rbd_dev->lock_rwsem); 3586ed95b21aSIlya Dryomov dout("%s rbd_dev %p write lock_state %d\n", __func__, rbd_dev, 3587ed95b21aSIlya Dryomov rbd_dev->lock_state); 3588ed95b21aSIlya Dryomov if (rbd_dev->lock_state != RBD_LOCK_STATE_RELEASING) 3589ed95b21aSIlya Dryomov return false; 3590ed95b21aSIlya Dryomov 3591ed95b21aSIlya Dryomov if (!rbd_unlock(rbd_dev)) 3592ed95b21aSIlya Dryomov /* 3593ed95b21aSIlya Dryomov * Give others a chance to grab the lock - we would re-acquire 3594ed95b21aSIlya Dryomov * almost immediately if we got new IO during ceph_osdc_sync() 3595ed95b21aSIlya Dryomov * otherwise. We need to ack our own notifications, so this 3596ed95b21aSIlya Dryomov * lock_dwork will be requeued from rbd_wait_state_locked() 3597ed95b21aSIlya Dryomov * after wake_requests() in rbd_handle_released_lock(). 
3598ed95b21aSIlya Dryomov */ 3599ed95b21aSIlya Dryomov cancel_delayed_work(&rbd_dev->lock_dwork); 3600ed95b21aSIlya Dryomov 3601ed95b21aSIlya Dryomov return true; 3602ed95b21aSIlya Dryomov } 3603ed95b21aSIlya Dryomov 3604ed95b21aSIlya Dryomov static void rbd_release_lock_work(struct work_struct *work) 3605ed95b21aSIlya Dryomov { 3606ed95b21aSIlya Dryomov struct rbd_device *rbd_dev = container_of(work, struct rbd_device, 3607ed95b21aSIlya Dryomov unlock_work); 3608ed95b21aSIlya Dryomov 3609ed95b21aSIlya Dryomov down_write(&rbd_dev->lock_rwsem); 3610ed95b21aSIlya Dryomov rbd_release_lock(rbd_dev); 3611ed95b21aSIlya Dryomov up_write(&rbd_dev->lock_rwsem); 3612ed95b21aSIlya Dryomov } 3613ed95b21aSIlya Dryomov 3614ed95b21aSIlya Dryomov static void rbd_handle_acquired_lock(struct rbd_device *rbd_dev, u8 struct_v, 3615ed95b21aSIlya Dryomov void **p) 3616ed95b21aSIlya Dryomov { 3617ed95b21aSIlya Dryomov struct rbd_client_id cid = { 0 }; 3618ed95b21aSIlya Dryomov 3619ed95b21aSIlya Dryomov if (struct_v >= 2) { 3620ed95b21aSIlya Dryomov cid.gid = ceph_decode_64(p); 3621ed95b21aSIlya Dryomov cid.handle = ceph_decode_64(p); 3622ed95b21aSIlya Dryomov } 3623ed95b21aSIlya Dryomov 3624ed95b21aSIlya Dryomov dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid, 3625ed95b21aSIlya Dryomov cid.handle); 3626ed95b21aSIlya Dryomov if (!rbd_cid_equal(&cid, &rbd_empty_cid)) { 3627ed95b21aSIlya Dryomov down_write(&rbd_dev->lock_rwsem); 3628ed95b21aSIlya Dryomov if (rbd_cid_equal(&cid, &rbd_dev->owner_cid)) { 3629ed95b21aSIlya Dryomov /* 3630ed95b21aSIlya Dryomov * we already know that the remote client is 3631ed95b21aSIlya Dryomov * the owner 3632ed95b21aSIlya Dryomov */ 3633ed95b21aSIlya Dryomov up_write(&rbd_dev->lock_rwsem); 3634ed95b21aSIlya Dryomov return; 3635ed95b21aSIlya Dryomov } 3636ed95b21aSIlya Dryomov 3637ed95b21aSIlya Dryomov rbd_set_owner_cid(rbd_dev, &cid); 3638ed95b21aSIlya Dryomov downgrade_write(&rbd_dev->lock_rwsem); 3639ed95b21aSIlya Dryomov } else { 
3640ed95b21aSIlya Dryomov down_read(&rbd_dev->lock_rwsem); 3641ed95b21aSIlya Dryomov } 3642ed95b21aSIlya Dryomov 3643ed95b21aSIlya Dryomov if (!__rbd_is_lock_owner(rbd_dev)) 3644ed95b21aSIlya Dryomov wake_requests(rbd_dev, false); 3645ed95b21aSIlya Dryomov up_read(&rbd_dev->lock_rwsem); 3646ed95b21aSIlya Dryomov } 3647ed95b21aSIlya Dryomov 3648ed95b21aSIlya Dryomov static void rbd_handle_released_lock(struct rbd_device *rbd_dev, u8 struct_v, 3649ed95b21aSIlya Dryomov void **p) 3650ed95b21aSIlya Dryomov { 3651ed95b21aSIlya Dryomov struct rbd_client_id cid = { 0 }; 3652ed95b21aSIlya Dryomov 3653ed95b21aSIlya Dryomov if (struct_v >= 2) { 3654ed95b21aSIlya Dryomov cid.gid = ceph_decode_64(p); 3655ed95b21aSIlya Dryomov cid.handle = ceph_decode_64(p); 3656ed95b21aSIlya Dryomov } 3657ed95b21aSIlya Dryomov 3658ed95b21aSIlya Dryomov dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid, 3659ed95b21aSIlya Dryomov cid.handle); 3660ed95b21aSIlya Dryomov if (!rbd_cid_equal(&cid, &rbd_empty_cid)) { 3661ed95b21aSIlya Dryomov down_write(&rbd_dev->lock_rwsem); 3662ed95b21aSIlya Dryomov if (!rbd_cid_equal(&cid, &rbd_dev->owner_cid)) { 3663ed95b21aSIlya Dryomov dout("%s rbd_dev %p unexpected owner, cid %llu-%llu != owner_cid %llu-%llu\n", 3664ed95b21aSIlya Dryomov __func__, rbd_dev, cid.gid, cid.handle, 3665ed95b21aSIlya Dryomov rbd_dev->owner_cid.gid, rbd_dev->owner_cid.handle); 3666ed95b21aSIlya Dryomov up_write(&rbd_dev->lock_rwsem); 3667ed95b21aSIlya Dryomov return; 3668ed95b21aSIlya Dryomov } 3669ed95b21aSIlya Dryomov 3670ed95b21aSIlya Dryomov rbd_set_owner_cid(rbd_dev, &rbd_empty_cid); 3671ed95b21aSIlya Dryomov downgrade_write(&rbd_dev->lock_rwsem); 3672ed95b21aSIlya Dryomov } else { 3673ed95b21aSIlya Dryomov down_read(&rbd_dev->lock_rwsem); 3674ed95b21aSIlya Dryomov } 3675ed95b21aSIlya Dryomov 3676ed95b21aSIlya Dryomov if (!__rbd_is_lock_owner(rbd_dev)) 3677ed95b21aSIlya Dryomov wake_requests(rbd_dev, false); 3678ed95b21aSIlya Dryomov up_read(&rbd_dev->lock_rwsem); 
3679ed95b21aSIlya Dryomov } 3680ed95b21aSIlya Dryomov 3681ed95b21aSIlya Dryomov static bool rbd_handle_request_lock(struct rbd_device *rbd_dev, u8 struct_v, 3682ed95b21aSIlya Dryomov void **p) 3683ed95b21aSIlya Dryomov { 3684ed95b21aSIlya Dryomov struct rbd_client_id my_cid = rbd_get_cid(rbd_dev); 3685ed95b21aSIlya Dryomov struct rbd_client_id cid = { 0 }; 3686ed95b21aSIlya Dryomov bool need_to_send; 3687ed95b21aSIlya Dryomov 3688ed95b21aSIlya Dryomov if (struct_v >= 2) { 3689ed95b21aSIlya Dryomov cid.gid = ceph_decode_64(p); 3690ed95b21aSIlya Dryomov cid.handle = ceph_decode_64(p); 3691ed95b21aSIlya Dryomov } 3692ed95b21aSIlya Dryomov 3693ed95b21aSIlya Dryomov dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid, 3694ed95b21aSIlya Dryomov cid.handle); 3695ed95b21aSIlya Dryomov if (rbd_cid_equal(&cid, &my_cid)) 3696ed95b21aSIlya Dryomov return false; 3697ed95b21aSIlya Dryomov 3698ed95b21aSIlya Dryomov down_read(&rbd_dev->lock_rwsem); 3699ed95b21aSIlya Dryomov need_to_send = __rbd_is_lock_owner(rbd_dev); 3700ed95b21aSIlya Dryomov if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED) { 3701ed95b21aSIlya Dryomov if (!rbd_cid_equal(&rbd_dev->owner_cid, &rbd_empty_cid)) { 3702ed95b21aSIlya Dryomov dout("%s rbd_dev %p queueing unlock_work\n", __func__, 3703ed95b21aSIlya Dryomov rbd_dev); 3704ed95b21aSIlya Dryomov queue_work(rbd_dev->task_wq, &rbd_dev->unlock_work); 3705ed95b21aSIlya Dryomov } 3706ed95b21aSIlya Dryomov } 3707ed95b21aSIlya Dryomov up_read(&rbd_dev->lock_rwsem); 3708ed95b21aSIlya Dryomov return need_to_send; 3709ed95b21aSIlya Dryomov } 3710ed95b21aSIlya Dryomov 3711ed95b21aSIlya Dryomov static void __rbd_acknowledge_notify(struct rbd_device *rbd_dev, 3712ed95b21aSIlya Dryomov u64 notify_id, u64 cookie, s32 *result) 3713ed95b21aSIlya Dryomov { 3714ed95b21aSIlya Dryomov struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 3715ed95b21aSIlya Dryomov int buf_size = 4 + CEPH_ENCODING_START_BLK_LEN; 3716ed95b21aSIlya Dryomov char 
buf[buf_size]; 3717ed95b21aSIlya Dryomov int ret; 3718ed95b21aSIlya Dryomov 3719ed95b21aSIlya Dryomov if (result) { 3720ed95b21aSIlya Dryomov void *p = buf; 3721ed95b21aSIlya Dryomov 3722ed95b21aSIlya Dryomov /* encode ResponseMessage */ 3723ed95b21aSIlya Dryomov ceph_start_encoding(&p, 1, 1, 3724ed95b21aSIlya Dryomov buf_size - CEPH_ENCODING_START_BLK_LEN); 3725ed95b21aSIlya Dryomov ceph_encode_32(&p, *result); 3726ed95b21aSIlya Dryomov } else { 3727ed95b21aSIlya Dryomov buf_size = 0; 3728ed95b21aSIlya Dryomov } 3729ed95b21aSIlya Dryomov 3730ed95b21aSIlya Dryomov ret = ceph_osdc_notify_ack(osdc, &rbd_dev->header_oid, 3731ed95b21aSIlya Dryomov &rbd_dev->header_oloc, notify_id, cookie, 3732ed95b21aSIlya Dryomov buf, buf_size); 3733ed95b21aSIlya Dryomov if (ret) 3734ed95b21aSIlya Dryomov rbd_warn(rbd_dev, "acknowledge_notify failed: %d", ret); 3735ed95b21aSIlya Dryomov } 3736ed95b21aSIlya Dryomov 3737ed95b21aSIlya Dryomov static void rbd_acknowledge_notify(struct rbd_device *rbd_dev, u64 notify_id, 3738ed95b21aSIlya Dryomov u64 cookie) 3739ed95b21aSIlya Dryomov { 3740ed95b21aSIlya Dryomov dout("%s rbd_dev %p\n", __func__, rbd_dev); 3741ed95b21aSIlya Dryomov __rbd_acknowledge_notify(rbd_dev, notify_id, cookie, NULL); 3742ed95b21aSIlya Dryomov } 3743ed95b21aSIlya Dryomov 3744ed95b21aSIlya Dryomov static void rbd_acknowledge_notify_result(struct rbd_device *rbd_dev, 3745ed95b21aSIlya Dryomov u64 notify_id, u64 cookie, s32 result) 3746ed95b21aSIlya Dryomov { 3747ed95b21aSIlya Dryomov dout("%s rbd_dev %p result %d\n", __func__, rbd_dev, result); 3748ed95b21aSIlya Dryomov __rbd_acknowledge_notify(rbd_dev, notify_id, cookie, &result); 3749ed95b21aSIlya Dryomov } 3750ed95b21aSIlya Dryomov 3751922dab61SIlya Dryomov static void rbd_watch_cb(void *arg, u64 notify_id, u64 cookie, 3752922dab61SIlya Dryomov u64 notifier_id, void *data, size_t data_len) 3753b8d70035SAlex Elder { 3754922dab61SIlya Dryomov struct rbd_device *rbd_dev = arg; 3755ed95b21aSIlya Dryomov void *p = data; 
3756ed95b21aSIlya Dryomov void *const end = p + data_len; 3757ed95b21aSIlya Dryomov u8 struct_v; 3758ed95b21aSIlya Dryomov u32 len; 3759ed95b21aSIlya Dryomov u32 notify_op; 3760b8d70035SAlex Elder int ret; 3761b8d70035SAlex Elder 3762ed95b21aSIlya Dryomov dout("%s rbd_dev %p cookie %llu notify_id %llu data_len %zu\n", 3763ed95b21aSIlya Dryomov __func__, rbd_dev, cookie, notify_id, data_len); 3764ed95b21aSIlya Dryomov if (data_len) { 3765ed95b21aSIlya Dryomov ret = ceph_start_decoding(&p, end, 1, "NotifyMessage", 3766ed95b21aSIlya Dryomov &struct_v, &len); 3767ed95b21aSIlya Dryomov if (ret) { 3768ed95b21aSIlya Dryomov rbd_warn(rbd_dev, "failed to decode NotifyMessage: %d", 3769ed95b21aSIlya Dryomov ret); 3770ed95b21aSIlya Dryomov return; 3771ed95b21aSIlya Dryomov } 377252bb1f9bSIlya Dryomov 3773ed95b21aSIlya Dryomov notify_op = ceph_decode_32(&p); 3774ed95b21aSIlya Dryomov } else { 3775ed95b21aSIlya Dryomov /* legacy notification for header updates */ 3776ed95b21aSIlya Dryomov notify_op = RBD_NOTIFY_OP_HEADER_UPDATE; 3777ed95b21aSIlya Dryomov len = 0; 3778ed95b21aSIlya Dryomov } 3779ed95b21aSIlya Dryomov 3780ed95b21aSIlya Dryomov dout("%s rbd_dev %p notify_op %u\n", __func__, rbd_dev, notify_op); 3781ed95b21aSIlya Dryomov switch (notify_op) { 3782ed95b21aSIlya Dryomov case RBD_NOTIFY_OP_ACQUIRED_LOCK: 3783ed95b21aSIlya Dryomov rbd_handle_acquired_lock(rbd_dev, struct_v, &p); 3784ed95b21aSIlya Dryomov rbd_acknowledge_notify(rbd_dev, notify_id, cookie); 3785ed95b21aSIlya Dryomov break; 3786ed95b21aSIlya Dryomov case RBD_NOTIFY_OP_RELEASED_LOCK: 3787ed95b21aSIlya Dryomov rbd_handle_released_lock(rbd_dev, struct_v, &p); 3788ed95b21aSIlya Dryomov rbd_acknowledge_notify(rbd_dev, notify_id, cookie); 3789ed95b21aSIlya Dryomov break; 3790ed95b21aSIlya Dryomov case RBD_NOTIFY_OP_REQUEST_LOCK: 3791ed95b21aSIlya Dryomov if (rbd_handle_request_lock(rbd_dev, struct_v, &p)) 379252bb1f9bSIlya Dryomov /* 3793ed95b21aSIlya Dryomov * send ResponseMessage(0) back so the client 
3794ed95b21aSIlya Dryomov * can detect a missing owner 379552bb1f9bSIlya Dryomov */ 3796ed95b21aSIlya Dryomov rbd_acknowledge_notify_result(rbd_dev, notify_id, 3797ed95b21aSIlya Dryomov cookie, 0); 3798ed95b21aSIlya Dryomov else 3799ed95b21aSIlya Dryomov rbd_acknowledge_notify(rbd_dev, notify_id, cookie); 3800ed95b21aSIlya Dryomov break; 3801ed95b21aSIlya Dryomov case RBD_NOTIFY_OP_HEADER_UPDATE: 3802e627db08SAlex Elder ret = rbd_dev_refresh(rbd_dev); 3803e627db08SAlex Elder if (ret) 38049584d508SIlya Dryomov rbd_warn(rbd_dev, "refresh failed: %d", ret); 3805b8d70035SAlex Elder 3806ed95b21aSIlya Dryomov rbd_acknowledge_notify(rbd_dev, notify_id, cookie); 3807ed95b21aSIlya Dryomov break; 3808ed95b21aSIlya Dryomov default: 3809ed95b21aSIlya Dryomov if (rbd_is_lock_owner(rbd_dev)) 3810ed95b21aSIlya Dryomov rbd_acknowledge_notify_result(rbd_dev, notify_id, 3811ed95b21aSIlya Dryomov cookie, -EOPNOTSUPP); 3812ed95b21aSIlya Dryomov else 3813ed95b21aSIlya Dryomov rbd_acknowledge_notify(rbd_dev, notify_id, cookie); 3814ed95b21aSIlya Dryomov break; 3815ed95b21aSIlya Dryomov } 3816b8d70035SAlex Elder } 3817b8d70035SAlex Elder 381899d16943SIlya Dryomov static void __rbd_unregister_watch(struct rbd_device *rbd_dev); 381999d16943SIlya Dryomov 3820922dab61SIlya Dryomov static void rbd_watch_errcb(void *arg, u64 cookie, int err) 3821bb040aa0SIlya Dryomov { 3822922dab61SIlya Dryomov struct rbd_device *rbd_dev = arg; 3823bb040aa0SIlya Dryomov 3824922dab61SIlya Dryomov rbd_warn(rbd_dev, "encountered watch error: %d", err); 3825bb040aa0SIlya Dryomov 3826ed95b21aSIlya Dryomov down_write(&rbd_dev->lock_rwsem); 3827ed95b21aSIlya Dryomov rbd_set_owner_cid(rbd_dev, &rbd_empty_cid); 3828ed95b21aSIlya Dryomov up_write(&rbd_dev->lock_rwsem); 3829ed95b21aSIlya Dryomov 383099d16943SIlya Dryomov mutex_lock(&rbd_dev->watch_mutex); 383199d16943SIlya Dryomov if (rbd_dev->watch_state == RBD_WATCH_STATE_REGISTERED) { 383299d16943SIlya Dryomov __rbd_unregister_watch(rbd_dev); 383399d16943SIlya Dryomov 
rbd_dev->watch_state = RBD_WATCH_STATE_ERROR; 3834bb040aa0SIlya Dryomov 383599d16943SIlya Dryomov queue_delayed_work(rbd_dev->task_wq, &rbd_dev->watch_dwork, 0); 3836bb040aa0SIlya Dryomov } 383799d16943SIlya Dryomov mutex_unlock(&rbd_dev->watch_mutex); 3838bb040aa0SIlya Dryomov } 3839bb040aa0SIlya Dryomov 3840bb040aa0SIlya Dryomov /* 384199d16943SIlya Dryomov * watch_mutex must be locked 38429969ebc5SAlex Elder */ 384399d16943SIlya Dryomov static int __rbd_register_watch(struct rbd_device *rbd_dev) 38449969ebc5SAlex Elder { 38459969ebc5SAlex Elder struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 3846922dab61SIlya Dryomov struct ceph_osd_linger_request *handle; 38479969ebc5SAlex Elder 3848922dab61SIlya Dryomov rbd_assert(!rbd_dev->watch_handle); 384999d16943SIlya Dryomov dout("%s rbd_dev %p\n", __func__, rbd_dev); 38509969ebc5SAlex Elder 3851922dab61SIlya Dryomov handle = ceph_osdc_watch(osdc, &rbd_dev->header_oid, 3852922dab61SIlya Dryomov &rbd_dev->header_oloc, rbd_watch_cb, 3853922dab61SIlya Dryomov rbd_watch_errcb, rbd_dev); 3854922dab61SIlya Dryomov if (IS_ERR(handle)) 3855922dab61SIlya Dryomov return PTR_ERR(handle); 38569969ebc5SAlex Elder 3857922dab61SIlya Dryomov rbd_dev->watch_handle = handle; 38588eb87565SAlex Elder return 0; 38599969ebc5SAlex Elder } 38609969ebc5SAlex Elder 386199d16943SIlya Dryomov /* 386299d16943SIlya Dryomov * watch_mutex must be locked 386399d16943SIlya Dryomov */ 386499d16943SIlya Dryomov static void __rbd_unregister_watch(struct rbd_device *rbd_dev) 3865fca27065SIlya Dryomov { 3866922dab61SIlya Dryomov struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 3867922dab61SIlya Dryomov int ret; 3868b30a01f2SIlya Dryomov 386999d16943SIlya Dryomov rbd_assert(rbd_dev->watch_handle); 387099d16943SIlya Dryomov dout("%s rbd_dev %p\n", __func__, rbd_dev); 3871b30a01f2SIlya Dryomov 3872922dab61SIlya Dryomov ret = ceph_osdc_unwatch(osdc, rbd_dev->watch_handle); 3873922dab61SIlya Dryomov if (ret) 3874922dab61SIlya 
Dryomov rbd_warn(rbd_dev, "failed to unwatch: %d", ret); 3875b30a01f2SIlya Dryomov 3876922dab61SIlya Dryomov rbd_dev->watch_handle = NULL; 3877c525f036SIlya Dryomov } 3878c525f036SIlya Dryomov 387999d16943SIlya Dryomov static int rbd_register_watch(struct rbd_device *rbd_dev) 3880c525f036SIlya Dryomov { 388199d16943SIlya Dryomov int ret; 3882811c6688SIlya Dryomov 388399d16943SIlya Dryomov mutex_lock(&rbd_dev->watch_mutex); 388499d16943SIlya Dryomov rbd_assert(rbd_dev->watch_state == RBD_WATCH_STATE_UNREGISTERED); 388599d16943SIlya Dryomov ret = __rbd_register_watch(rbd_dev); 388699d16943SIlya Dryomov if (ret) 388799d16943SIlya Dryomov goto out; 388899d16943SIlya Dryomov 388999d16943SIlya Dryomov rbd_dev->watch_state = RBD_WATCH_STATE_REGISTERED; 389099d16943SIlya Dryomov rbd_dev->watch_cookie = rbd_dev->watch_handle->linger_id; 389199d16943SIlya Dryomov 389299d16943SIlya Dryomov out: 389399d16943SIlya Dryomov mutex_unlock(&rbd_dev->watch_mutex); 389499d16943SIlya Dryomov return ret; 389599d16943SIlya Dryomov } 389699d16943SIlya Dryomov 389799d16943SIlya Dryomov static void cancel_tasks_sync(struct rbd_device *rbd_dev) 389899d16943SIlya Dryomov { 389999d16943SIlya Dryomov dout("%s rbd_dev %p\n", __func__, rbd_dev); 390099d16943SIlya Dryomov 390199d16943SIlya Dryomov cancel_delayed_work_sync(&rbd_dev->watch_dwork); 3902ed95b21aSIlya Dryomov cancel_work_sync(&rbd_dev->acquired_lock_work); 3903ed95b21aSIlya Dryomov cancel_work_sync(&rbd_dev->released_lock_work); 3904ed95b21aSIlya Dryomov cancel_delayed_work_sync(&rbd_dev->lock_dwork); 3905ed95b21aSIlya Dryomov cancel_work_sync(&rbd_dev->unlock_work); 390699d16943SIlya Dryomov } 390799d16943SIlya Dryomov 390899d16943SIlya Dryomov static void rbd_unregister_watch(struct rbd_device *rbd_dev) 390999d16943SIlya Dryomov { 3910ed95b21aSIlya Dryomov WARN_ON(waitqueue_active(&rbd_dev->lock_waitq)); 391199d16943SIlya Dryomov cancel_tasks_sync(rbd_dev); 391299d16943SIlya Dryomov 391399d16943SIlya Dryomov 
mutex_lock(&rbd_dev->watch_mutex); 391499d16943SIlya Dryomov if (rbd_dev->watch_state == RBD_WATCH_STATE_REGISTERED) 391599d16943SIlya Dryomov __rbd_unregister_watch(rbd_dev); 391699d16943SIlya Dryomov rbd_dev->watch_state = RBD_WATCH_STATE_UNREGISTERED; 391799d16943SIlya Dryomov mutex_unlock(&rbd_dev->watch_mutex); 391899d16943SIlya Dryomov 3919811c6688SIlya Dryomov ceph_osdc_flush_notifies(&rbd_dev->rbd_client->client->osdc); 3920fca27065SIlya Dryomov } 3921fca27065SIlya Dryomov 392299d16943SIlya Dryomov static void rbd_reregister_watch(struct work_struct *work) 392399d16943SIlya Dryomov { 392499d16943SIlya Dryomov struct rbd_device *rbd_dev = container_of(to_delayed_work(work), 392599d16943SIlya Dryomov struct rbd_device, watch_dwork); 3926ed95b21aSIlya Dryomov bool was_lock_owner = false; 392799d16943SIlya Dryomov int ret; 392899d16943SIlya Dryomov 392999d16943SIlya Dryomov dout("%s rbd_dev %p\n", __func__, rbd_dev); 393099d16943SIlya Dryomov 3931ed95b21aSIlya Dryomov down_write(&rbd_dev->lock_rwsem); 3932ed95b21aSIlya Dryomov if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED) 3933ed95b21aSIlya Dryomov was_lock_owner = rbd_release_lock(rbd_dev); 3934ed95b21aSIlya Dryomov 393599d16943SIlya Dryomov mutex_lock(&rbd_dev->watch_mutex); 393699d16943SIlya Dryomov if (rbd_dev->watch_state != RBD_WATCH_STATE_ERROR) 393799d16943SIlya Dryomov goto fail_unlock; 393899d16943SIlya Dryomov 393999d16943SIlya Dryomov ret = __rbd_register_watch(rbd_dev); 394099d16943SIlya Dryomov if (ret) { 394199d16943SIlya Dryomov rbd_warn(rbd_dev, "failed to reregister watch: %d", ret); 394299d16943SIlya Dryomov if (ret != -EBLACKLISTED) 394399d16943SIlya Dryomov queue_delayed_work(rbd_dev->task_wq, 394499d16943SIlya Dryomov &rbd_dev->watch_dwork, 394599d16943SIlya Dryomov RBD_RETRY_DELAY); 394699d16943SIlya Dryomov goto fail_unlock; 394799d16943SIlya Dryomov } 394899d16943SIlya Dryomov 394999d16943SIlya Dryomov rbd_dev->watch_state = RBD_WATCH_STATE_REGISTERED; 395099d16943SIlya Dryomov 
rbd_dev->watch_cookie = rbd_dev->watch_handle->linger_id; 395199d16943SIlya Dryomov mutex_unlock(&rbd_dev->watch_mutex); 395299d16943SIlya Dryomov 395399d16943SIlya Dryomov ret = rbd_dev_refresh(rbd_dev); 395499d16943SIlya Dryomov if (ret) 395599d16943SIlya Dryomov rbd_warn(rbd_dev, "reregisteration refresh failed: %d", ret); 395699d16943SIlya Dryomov 3957ed95b21aSIlya Dryomov if (was_lock_owner) { 3958ed95b21aSIlya Dryomov ret = rbd_try_lock(rbd_dev); 3959ed95b21aSIlya Dryomov if (ret) 3960ed95b21aSIlya Dryomov rbd_warn(rbd_dev, "reregisteration lock failed: %d", 3961ed95b21aSIlya Dryomov ret); 3962ed95b21aSIlya Dryomov } 3963ed95b21aSIlya Dryomov 3964ed95b21aSIlya Dryomov up_write(&rbd_dev->lock_rwsem); 3965ed95b21aSIlya Dryomov wake_requests(rbd_dev, true); 396699d16943SIlya Dryomov return; 396799d16943SIlya Dryomov 396899d16943SIlya Dryomov fail_unlock: 396999d16943SIlya Dryomov mutex_unlock(&rbd_dev->watch_mutex); 3970ed95b21aSIlya Dryomov up_write(&rbd_dev->lock_rwsem); 397199d16943SIlya Dryomov } 397299d16943SIlya Dryomov 397336be9a76SAlex Elder /* 3974f40eb349SAlex Elder * Synchronous osd object method call. Returns the number of bytes 3975f40eb349SAlex Elder * returned in the outbound buffer, or a negative error code. 
397636be9a76SAlex Elder */ 397736be9a76SAlex Elder static int rbd_obj_method_sync(struct rbd_device *rbd_dev, 397836be9a76SAlex Elder const char *object_name, 397936be9a76SAlex Elder const char *class_name, 398036be9a76SAlex Elder const char *method_name, 39814157976bSAlex Elder const void *outbound, 398236be9a76SAlex Elder size_t outbound_size, 39834157976bSAlex Elder void *inbound, 3984e2a58ee5SAlex Elder size_t inbound_size) 398536be9a76SAlex Elder { 398636be9a76SAlex Elder struct rbd_obj_request *obj_request; 398736be9a76SAlex Elder struct page **pages; 398836be9a76SAlex Elder u32 page_count; 398936be9a76SAlex Elder int ret; 399036be9a76SAlex Elder 399136be9a76SAlex Elder /* 39926010a451SAlex Elder * Method calls are ultimately read operations. The result 39936010a451SAlex Elder * should placed into the inbound buffer provided. They 39946010a451SAlex Elder * also supply outbound data--parameters for the object 39956010a451SAlex Elder * method. Currently if this is present it will be a 39966010a451SAlex Elder * snapshot id. 
399736be9a76SAlex Elder */ 399836be9a76SAlex Elder page_count = (u32)calc_pages_for(0, inbound_size); 399936be9a76SAlex Elder pages = ceph_alloc_page_vector(page_count, GFP_KERNEL); 400036be9a76SAlex Elder if (IS_ERR(pages)) 400136be9a76SAlex Elder return PTR_ERR(pages); 400236be9a76SAlex Elder 400336be9a76SAlex Elder ret = -ENOMEM; 40046010a451SAlex Elder obj_request = rbd_obj_request_create(object_name, 0, inbound_size, 400536be9a76SAlex Elder OBJ_REQUEST_PAGES); 400636be9a76SAlex Elder if (!obj_request) 400736be9a76SAlex Elder goto out; 400836be9a76SAlex Elder 400936be9a76SAlex Elder obj_request->pages = pages; 401036be9a76SAlex Elder obj_request->page_count = page_count; 401136be9a76SAlex Elder 40126d2940c8SGuangliang Zhao obj_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_READ, 1, 4013deb236b3SIlya Dryomov obj_request); 401436be9a76SAlex Elder if (!obj_request->osd_req) 401536be9a76SAlex Elder goto out; 401636be9a76SAlex Elder 4017c99d2d4aSAlex Elder osd_req_op_cls_init(obj_request->osd_req, 0, CEPH_OSD_OP_CALL, 401804017e29SAlex Elder class_name, method_name); 401904017e29SAlex Elder if (outbound_size) { 402004017e29SAlex Elder struct ceph_pagelist *pagelist; 402104017e29SAlex Elder 402204017e29SAlex Elder pagelist = kmalloc(sizeof (*pagelist), GFP_NOFS); 402304017e29SAlex Elder if (!pagelist) 402404017e29SAlex Elder goto out; 402504017e29SAlex Elder 402604017e29SAlex Elder ceph_pagelist_init(pagelist); 402704017e29SAlex Elder ceph_pagelist_append(pagelist, outbound, outbound_size); 402804017e29SAlex Elder osd_req_op_cls_request_data_pagelist(obj_request->osd_req, 0, 402904017e29SAlex Elder pagelist); 403004017e29SAlex Elder } 4031a4ce40a9SAlex Elder osd_req_op_cls_response_data_pages(obj_request->osd_req, 0, 4032a4ce40a9SAlex Elder obj_request->pages, inbound_size, 403344cd188dSAlex Elder 0, false, false); 4034430c28c3SAlex Elder 4035980917fcSIlya Dryomov rbd_obj_request_submit(obj_request); 403636be9a76SAlex Elder ret = 
rbd_obj_request_wait(obj_request); 403736be9a76SAlex Elder if (ret) 403836be9a76SAlex Elder goto out; 403936be9a76SAlex Elder 404036be9a76SAlex Elder ret = obj_request->result; 404136be9a76SAlex Elder if (ret < 0) 404236be9a76SAlex Elder goto out; 404357385b51SAlex Elder 404457385b51SAlex Elder rbd_assert(obj_request->xferred < (u64)INT_MAX); 404557385b51SAlex Elder ret = (int)obj_request->xferred; 4046903bb32eSAlex Elder ceph_copy_from_page_vector(pages, inbound, 0, obj_request->xferred); 404736be9a76SAlex Elder out: 404836be9a76SAlex Elder if (obj_request) 404936be9a76SAlex Elder rbd_obj_request_put(obj_request); 405036be9a76SAlex Elder else 405136be9a76SAlex Elder ceph_release_page_vector(pages, page_count); 405236be9a76SAlex Elder 405336be9a76SAlex Elder return ret; 405436be9a76SAlex Elder } 405536be9a76SAlex Elder 4056ed95b21aSIlya Dryomov /* 4057ed95b21aSIlya Dryomov * lock_rwsem must be held for read 4058ed95b21aSIlya Dryomov */ 4059ed95b21aSIlya Dryomov static void rbd_wait_state_locked(struct rbd_device *rbd_dev) 4060ed95b21aSIlya Dryomov { 4061ed95b21aSIlya Dryomov DEFINE_WAIT(wait); 4062ed95b21aSIlya Dryomov 4063ed95b21aSIlya Dryomov do { 4064ed95b21aSIlya Dryomov /* 4065ed95b21aSIlya Dryomov * Note the use of mod_delayed_work() in rbd_acquire_lock() 4066ed95b21aSIlya Dryomov * and cancel_delayed_work() in wake_requests(). 
4067ed95b21aSIlya Dryomov */ 4068ed95b21aSIlya Dryomov dout("%s rbd_dev %p queueing lock_dwork\n", __func__, rbd_dev); 4069ed95b21aSIlya Dryomov queue_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0); 4070ed95b21aSIlya Dryomov prepare_to_wait_exclusive(&rbd_dev->lock_waitq, &wait, 4071ed95b21aSIlya Dryomov TASK_UNINTERRUPTIBLE); 4072ed95b21aSIlya Dryomov up_read(&rbd_dev->lock_rwsem); 4073ed95b21aSIlya Dryomov schedule(); 4074ed95b21aSIlya Dryomov down_read(&rbd_dev->lock_rwsem); 4075ed95b21aSIlya Dryomov } while (rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED); 4076ed95b21aSIlya Dryomov finish_wait(&rbd_dev->lock_waitq, &wait); 4077ed95b21aSIlya Dryomov } 4078ed95b21aSIlya Dryomov 40797ad18afaSChristoph Hellwig static void rbd_queue_workfn(struct work_struct *work) 4080bc1ecc65SIlya Dryomov { 40817ad18afaSChristoph Hellwig struct request *rq = blk_mq_rq_from_pdu(work); 40827ad18afaSChristoph Hellwig struct rbd_device *rbd_dev = rq->q->queuedata; 4083bc1ecc65SIlya Dryomov struct rbd_img_request *img_request; 40844e752f0aSJosh Durgin struct ceph_snap_context *snapc = NULL; 4085bc1ecc65SIlya Dryomov u64 offset = (u64)blk_rq_pos(rq) << SECTOR_SHIFT; 4086bc1ecc65SIlya Dryomov u64 length = blk_rq_bytes(rq); 40876d2940c8SGuangliang Zhao enum obj_operation_type op_type; 40884e752f0aSJosh Durgin u64 mapping_size; 408980de1912SIlya Dryomov bool must_be_locked; 4090bc1ecc65SIlya Dryomov int result; 4091bc1ecc65SIlya Dryomov 40927ad18afaSChristoph Hellwig if (rq->cmd_type != REQ_TYPE_FS) { 40937ad18afaSChristoph Hellwig dout("%s: non-fs request type %d\n", __func__, 40947ad18afaSChristoph Hellwig (int) rq->cmd_type); 40957ad18afaSChristoph Hellwig result = -EIO; 40967ad18afaSChristoph Hellwig goto err; 40977ad18afaSChristoph Hellwig } 40987ad18afaSChristoph Hellwig 4099c2df40dfSMike Christie if (req_op(rq) == REQ_OP_DISCARD) 410090e98c52SGuangliang Zhao op_type = OBJ_OP_DISCARD; 4101c2df40dfSMike Christie else if (req_op(rq) == REQ_OP_WRITE) 41026d2940c8SGuangliang Zhao 
op_type = OBJ_OP_WRITE; 41036d2940c8SGuangliang Zhao else 41046d2940c8SGuangliang Zhao op_type = OBJ_OP_READ; 41056d2940c8SGuangliang Zhao 4106bc1ecc65SIlya Dryomov /* Ignore/skip any zero-length requests */ 4107bc1ecc65SIlya Dryomov 4108bc1ecc65SIlya Dryomov if (!length) { 4109bc1ecc65SIlya Dryomov dout("%s: zero-length request\n", __func__); 4110bc1ecc65SIlya Dryomov result = 0; 4111bc1ecc65SIlya Dryomov goto err_rq; 4112bc1ecc65SIlya Dryomov } 4113bc1ecc65SIlya Dryomov 41146d2940c8SGuangliang Zhao /* Only reads are allowed to a read-only device */ 4115bc1ecc65SIlya Dryomov 41166d2940c8SGuangliang Zhao if (op_type != OBJ_OP_READ) { 4117bc1ecc65SIlya Dryomov if (rbd_dev->mapping.read_only) { 4118bc1ecc65SIlya Dryomov result = -EROFS; 4119bc1ecc65SIlya Dryomov goto err_rq; 4120bc1ecc65SIlya Dryomov } 4121bc1ecc65SIlya Dryomov rbd_assert(rbd_dev->spec->snap_id == CEPH_NOSNAP); 4122bc1ecc65SIlya Dryomov } 4123bc1ecc65SIlya Dryomov 4124bc1ecc65SIlya Dryomov /* 4125bc1ecc65SIlya Dryomov * Quit early if the mapped snapshot no longer exists. It's 4126bc1ecc65SIlya Dryomov * still possible the snapshot will have disappeared by the 4127bc1ecc65SIlya Dryomov * time our request arrives at the osd, but there's no sense in 4128bc1ecc65SIlya Dryomov * sending it if we already know. 
4129bc1ecc65SIlya Dryomov */ 4130bc1ecc65SIlya Dryomov if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) { 4131bc1ecc65SIlya Dryomov dout("request for non-existent snapshot"); 4132bc1ecc65SIlya Dryomov rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP); 4133bc1ecc65SIlya Dryomov result = -ENXIO; 4134bc1ecc65SIlya Dryomov goto err_rq; 4135bc1ecc65SIlya Dryomov } 4136bc1ecc65SIlya Dryomov 4137bc1ecc65SIlya Dryomov if (offset && length > U64_MAX - offset + 1) { 4138bc1ecc65SIlya Dryomov rbd_warn(rbd_dev, "bad request range (%llu~%llu)", offset, 4139bc1ecc65SIlya Dryomov length); 4140bc1ecc65SIlya Dryomov result = -EINVAL; 4141bc1ecc65SIlya Dryomov goto err_rq; /* Shouldn't happen */ 4142bc1ecc65SIlya Dryomov } 4143bc1ecc65SIlya Dryomov 41447ad18afaSChristoph Hellwig blk_mq_start_request(rq); 41457ad18afaSChristoph Hellwig 41464e752f0aSJosh Durgin down_read(&rbd_dev->header_rwsem); 41474e752f0aSJosh Durgin mapping_size = rbd_dev->mapping.size; 41486d2940c8SGuangliang Zhao if (op_type != OBJ_OP_READ) { 41494e752f0aSJosh Durgin snapc = rbd_dev->header.snapc; 41504e752f0aSJosh Durgin ceph_get_snap_context(snapc); 4151ed95b21aSIlya Dryomov must_be_locked = rbd_is_lock_supported(rbd_dev); 415280de1912SIlya Dryomov } else { 415380de1912SIlya Dryomov must_be_locked = rbd_dev->opts->lock_on_read && 415480de1912SIlya Dryomov rbd_is_lock_supported(rbd_dev); 41554e752f0aSJosh Durgin } 41564e752f0aSJosh Durgin up_read(&rbd_dev->header_rwsem); 41574e752f0aSJosh Durgin 41584e752f0aSJosh Durgin if (offset + length > mapping_size) { 4159bc1ecc65SIlya Dryomov rbd_warn(rbd_dev, "beyond EOD (%llu~%llu > %llu)", offset, 41604e752f0aSJosh Durgin length, mapping_size); 4161bc1ecc65SIlya Dryomov result = -EIO; 4162bc1ecc65SIlya Dryomov goto err_rq; 4163bc1ecc65SIlya Dryomov } 4164bc1ecc65SIlya Dryomov 4165ed95b21aSIlya Dryomov if (must_be_locked) { 4166ed95b21aSIlya Dryomov down_read(&rbd_dev->lock_rwsem); 4167ed95b21aSIlya Dryomov if (rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED) 
4168ed95b21aSIlya Dryomov rbd_wait_state_locked(rbd_dev); 4169ed95b21aSIlya Dryomov } 4170ed95b21aSIlya Dryomov 41716d2940c8SGuangliang Zhao img_request = rbd_img_request_create(rbd_dev, offset, length, op_type, 41724e752f0aSJosh Durgin snapc); 4173bc1ecc65SIlya Dryomov if (!img_request) { 4174bc1ecc65SIlya Dryomov result = -ENOMEM; 4175ed95b21aSIlya Dryomov goto err_unlock; 4176bc1ecc65SIlya Dryomov } 4177bc1ecc65SIlya Dryomov img_request->rq = rq; 417870b16db8SIlya Dryomov snapc = NULL; /* img_request consumes a ref */ 4179bc1ecc65SIlya Dryomov 418090e98c52SGuangliang Zhao if (op_type == OBJ_OP_DISCARD) 418190e98c52SGuangliang Zhao result = rbd_img_request_fill(img_request, OBJ_REQUEST_NODATA, 418290e98c52SGuangliang Zhao NULL); 418390e98c52SGuangliang Zhao else 418490e98c52SGuangliang Zhao result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO, 418590e98c52SGuangliang Zhao rq->bio); 4186bc1ecc65SIlya Dryomov if (result) 4187bc1ecc65SIlya Dryomov goto err_img_request; 4188bc1ecc65SIlya Dryomov 4189bc1ecc65SIlya Dryomov result = rbd_img_request_submit(img_request); 4190bc1ecc65SIlya Dryomov if (result) 4191bc1ecc65SIlya Dryomov goto err_img_request; 4192bc1ecc65SIlya Dryomov 4193ed95b21aSIlya Dryomov if (must_be_locked) 4194ed95b21aSIlya Dryomov up_read(&rbd_dev->lock_rwsem); 4195bc1ecc65SIlya Dryomov return; 4196bc1ecc65SIlya Dryomov 4197bc1ecc65SIlya Dryomov err_img_request: 4198bc1ecc65SIlya Dryomov rbd_img_request_put(img_request); 4199ed95b21aSIlya Dryomov err_unlock: 4200ed95b21aSIlya Dryomov if (must_be_locked) 4201ed95b21aSIlya Dryomov up_read(&rbd_dev->lock_rwsem); 4202bc1ecc65SIlya Dryomov err_rq: 4203bc1ecc65SIlya Dryomov if (result) 4204bc1ecc65SIlya Dryomov rbd_warn(rbd_dev, "%s %llx at %llx result %d", 42056d2940c8SGuangliang Zhao obj_op_name(op_type), length, offset, result); 42064e752f0aSJosh Durgin ceph_put_snap_context(snapc); 42077ad18afaSChristoph Hellwig err: 42087ad18afaSChristoph Hellwig blk_mq_end_request(rq, result); 4209bc1ecc65SIlya 
Dryomov } 4210bc1ecc65SIlya Dryomov 42117ad18afaSChristoph Hellwig static int rbd_queue_rq(struct blk_mq_hw_ctx *hctx, 42127ad18afaSChristoph Hellwig const struct blk_mq_queue_data *bd) 4213bc1ecc65SIlya Dryomov { 42147ad18afaSChristoph Hellwig struct request *rq = bd->rq; 42157ad18afaSChristoph Hellwig struct work_struct *work = blk_mq_rq_to_pdu(rq); 4216bc1ecc65SIlya Dryomov 42177ad18afaSChristoph Hellwig queue_work(rbd_wq, work); 42187ad18afaSChristoph Hellwig return BLK_MQ_RQ_QUEUE_OK; 4219bf0d5f50SAlex Elder } 4220bf0d5f50SAlex Elder 4221602adf40SYehuda Sadeh static void rbd_free_disk(struct rbd_device *rbd_dev) 4222602adf40SYehuda Sadeh { 4223602adf40SYehuda Sadeh struct gendisk *disk = rbd_dev->disk; 4224602adf40SYehuda Sadeh 4225602adf40SYehuda Sadeh if (!disk) 4226602adf40SYehuda Sadeh return; 4227602adf40SYehuda Sadeh 4228a0cab924SAlex Elder rbd_dev->disk = NULL; 4229a0cab924SAlex Elder if (disk->flags & GENHD_FL_UP) { 4230602adf40SYehuda Sadeh del_gendisk(disk); 4231602adf40SYehuda Sadeh if (disk->queue) 4232602adf40SYehuda Sadeh blk_cleanup_queue(disk->queue); 42337ad18afaSChristoph Hellwig blk_mq_free_tag_set(&rbd_dev->tag_set); 4234a0cab924SAlex Elder } 4235602adf40SYehuda Sadeh put_disk(disk); 4236602adf40SYehuda Sadeh } 4237602adf40SYehuda Sadeh 4238788e2df3SAlex Elder static int rbd_obj_read_sync(struct rbd_device *rbd_dev, 4239788e2df3SAlex Elder const char *object_name, 42407097f8dfSAlex Elder u64 offset, u64 length, void *buf) 4241788e2df3SAlex Elder 4242788e2df3SAlex Elder { 4243788e2df3SAlex Elder struct rbd_obj_request *obj_request; 4244788e2df3SAlex Elder struct page **pages = NULL; 4245788e2df3SAlex Elder u32 page_count; 42461ceae7efSAlex Elder size_t size; 4247788e2df3SAlex Elder int ret; 4248788e2df3SAlex Elder 4249788e2df3SAlex Elder page_count = (u32) calc_pages_for(offset, length); 4250788e2df3SAlex Elder pages = ceph_alloc_page_vector(page_count, GFP_KERNEL); 4251788e2df3SAlex Elder if (IS_ERR(pages)) 4252a8d42056SJan Kara return 
PTR_ERR(pages); 4253788e2df3SAlex Elder 4254788e2df3SAlex Elder ret = -ENOMEM; 4255788e2df3SAlex Elder obj_request = rbd_obj_request_create(object_name, offset, length, 4256788e2df3SAlex Elder OBJ_REQUEST_PAGES); 4257788e2df3SAlex Elder if (!obj_request) 4258788e2df3SAlex Elder goto out; 4259788e2df3SAlex Elder 4260788e2df3SAlex Elder obj_request->pages = pages; 4261788e2df3SAlex Elder obj_request->page_count = page_count; 4262788e2df3SAlex Elder 42636d2940c8SGuangliang Zhao obj_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_READ, 1, 4264deb236b3SIlya Dryomov obj_request); 4265788e2df3SAlex Elder if (!obj_request->osd_req) 4266788e2df3SAlex Elder goto out; 4267788e2df3SAlex Elder 4268c99d2d4aSAlex Elder osd_req_op_extent_init(obj_request->osd_req, 0, CEPH_OSD_OP_READ, 4269c99d2d4aSAlex Elder offset, length, 0, 0); 4270406e2c9fSAlex Elder osd_req_op_extent_osd_data_pages(obj_request->osd_req, 0, 4271a4ce40a9SAlex Elder obj_request->pages, 427244cd188dSAlex Elder obj_request->length, 427344cd188dSAlex Elder obj_request->offset & ~PAGE_MASK, 427444cd188dSAlex Elder false, false); 4275430c28c3SAlex Elder 4276980917fcSIlya Dryomov rbd_obj_request_submit(obj_request); 4277788e2df3SAlex Elder ret = rbd_obj_request_wait(obj_request); 4278788e2df3SAlex Elder if (ret) 4279788e2df3SAlex Elder goto out; 4280788e2df3SAlex Elder 4281788e2df3SAlex Elder ret = obj_request->result; 4282788e2df3SAlex Elder if (ret < 0) 4283788e2df3SAlex Elder goto out; 42841ceae7efSAlex Elder 42851ceae7efSAlex Elder rbd_assert(obj_request->xferred <= (u64) SIZE_MAX); 42861ceae7efSAlex Elder size = (size_t) obj_request->xferred; 4287903bb32eSAlex Elder ceph_copy_from_page_vector(pages, buf, 0, size); 428823ed6e13SAlex Elder rbd_assert(size <= (size_t)INT_MAX); 428923ed6e13SAlex Elder ret = (int)size; 4290788e2df3SAlex Elder out: 4291788e2df3SAlex Elder if (obj_request) 4292788e2df3SAlex Elder rbd_obj_request_put(obj_request); 4293788e2df3SAlex Elder else 4294788e2df3SAlex Elder 
ceph_release_page_vector(pages, page_count); 4295788e2df3SAlex Elder 4296788e2df3SAlex Elder return ret; 4297788e2df3SAlex Elder } 4298788e2df3SAlex Elder 4299602adf40SYehuda Sadeh /* 4300662518b1SAlex Elder * Read the complete header for the given rbd device. On successful 4301662518b1SAlex Elder * return, the rbd_dev->header field will contain up-to-date 4302662518b1SAlex Elder * information about the image. 43034156d998SAlex Elder */ 430499a41ebcSAlex Elder static int rbd_dev_v1_header_info(struct rbd_device *rbd_dev) 43054156d998SAlex Elder { 43064156d998SAlex Elder struct rbd_image_header_ondisk *ondisk = NULL; 43074156d998SAlex Elder u32 snap_count = 0; 43084156d998SAlex Elder u64 names_size = 0; 43094156d998SAlex Elder u32 want_count; 43104156d998SAlex Elder int ret; 43114156d998SAlex Elder 43124156d998SAlex Elder /* 43134156d998SAlex Elder * The complete header will include an array of its 64-bit 43144156d998SAlex Elder * snapshot ids, followed by the names of those snapshots as 43154156d998SAlex Elder * a contiguous block of NUL-terminated strings. Note that 43164156d998SAlex Elder * the number of snapshots could change by the time we read 43174156d998SAlex Elder * it in, in which case we re-read it. 
43184156d998SAlex Elder */ 43194156d998SAlex Elder do { 43204156d998SAlex Elder size_t size; 43214156d998SAlex Elder 43224156d998SAlex Elder kfree(ondisk); 43234156d998SAlex Elder 43244156d998SAlex Elder size = sizeof (*ondisk); 43254156d998SAlex Elder size += snap_count * sizeof (struct rbd_image_snap_ondisk); 43264156d998SAlex Elder size += names_size; 43274156d998SAlex Elder ondisk = kmalloc(size, GFP_KERNEL); 43284156d998SAlex Elder if (!ondisk) 4329662518b1SAlex Elder return -ENOMEM; 43304156d998SAlex Elder 4331c41d13a3SIlya Dryomov ret = rbd_obj_read_sync(rbd_dev, rbd_dev->header_oid.name, 43327097f8dfSAlex Elder 0, size, ondisk); 43334156d998SAlex Elder if (ret < 0) 4334662518b1SAlex Elder goto out; 4335c0cd10dbSAlex Elder if ((size_t)ret < size) { 43364156d998SAlex Elder ret = -ENXIO; 433706ecc6cbSAlex Elder rbd_warn(rbd_dev, "short header read (want %zd got %d)", 433806ecc6cbSAlex Elder size, ret); 4339662518b1SAlex Elder goto out; 43404156d998SAlex Elder } 43414156d998SAlex Elder if (!rbd_dev_ondisk_valid(ondisk)) { 43424156d998SAlex Elder ret = -ENXIO; 434306ecc6cbSAlex Elder rbd_warn(rbd_dev, "invalid header"); 4344662518b1SAlex Elder goto out; 43454156d998SAlex Elder } 43464156d998SAlex Elder 43474156d998SAlex Elder names_size = le64_to_cpu(ondisk->snap_names_len); 43484156d998SAlex Elder want_count = snap_count; 43494156d998SAlex Elder snap_count = le32_to_cpu(ondisk->snap_count); 43504156d998SAlex Elder } while (snap_count != want_count); 43514156d998SAlex Elder 4352662518b1SAlex Elder ret = rbd_header_from_disk(rbd_dev, ondisk); 4353662518b1SAlex Elder out: 43544156d998SAlex Elder kfree(ondisk); 43554156d998SAlex Elder 4356dfc5606dSYehuda Sadeh return ret; 4357602adf40SYehuda Sadeh } 4358602adf40SYehuda Sadeh 435915228edeSAlex Elder /* 436015228edeSAlex Elder * Clear the rbd device's EXISTS flag if the snapshot it's mapped to 436115228edeSAlex Elder * has disappeared from the (just updated) snapshot context. 
436215228edeSAlex Elder */ 436315228edeSAlex Elder static void rbd_exists_validate(struct rbd_device *rbd_dev) 436415228edeSAlex Elder { 436515228edeSAlex Elder u64 snap_id; 436615228edeSAlex Elder 436715228edeSAlex Elder if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) 436815228edeSAlex Elder return; 436915228edeSAlex Elder 437015228edeSAlex Elder snap_id = rbd_dev->spec->snap_id; 437115228edeSAlex Elder if (snap_id == CEPH_NOSNAP) 437215228edeSAlex Elder return; 437315228edeSAlex Elder 437415228edeSAlex Elder if (rbd_dev_snap_index(rbd_dev, snap_id) == BAD_SNAP_INDEX) 437515228edeSAlex Elder clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags); 437615228edeSAlex Elder } 437715228edeSAlex Elder 43789875201eSJosh Durgin static void rbd_dev_update_size(struct rbd_device *rbd_dev) 43799875201eSJosh Durgin { 43809875201eSJosh Durgin sector_t size; 43819875201eSJosh Durgin 43829875201eSJosh Durgin /* 4383811c6688SIlya Dryomov * If EXISTS is not set, rbd_dev->disk may be NULL, so don't 4384811c6688SIlya Dryomov * try to update its size. If REMOVING is set, updating size 4385811c6688SIlya Dryomov * is just useless work since the device can't be opened. 
43869875201eSJosh Durgin */ 4387811c6688SIlya Dryomov if (test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags) && 4388811c6688SIlya Dryomov !test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags)) { 43899875201eSJosh Durgin size = (sector_t)rbd_dev->mapping.size / SECTOR_SIZE; 43909875201eSJosh Durgin dout("setting size to %llu sectors", (unsigned long long)size); 43919875201eSJosh Durgin set_capacity(rbd_dev->disk, size); 43929875201eSJosh Durgin revalidate_disk(rbd_dev->disk); 43939875201eSJosh Durgin } 43949875201eSJosh Durgin } 43959875201eSJosh Durgin 4396cc4a38bdSAlex Elder static int rbd_dev_refresh(struct rbd_device *rbd_dev) 43971fe5e993SAlex Elder { 4398e627db08SAlex Elder u64 mapping_size; 43991fe5e993SAlex Elder int ret; 44001fe5e993SAlex Elder 4401cfbf6377SAlex Elder down_write(&rbd_dev->header_rwsem); 44023b5cf2a2SAlex Elder mapping_size = rbd_dev->mapping.size; 4403a720ae09SIlya Dryomov 4404a720ae09SIlya Dryomov ret = rbd_dev_header_info(rbd_dev); 440552bb1f9bSIlya Dryomov if (ret) 440673e39e4dSIlya Dryomov goto out; 440715228edeSAlex Elder 4408e8f59b59SIlya Dryomov /* 4409e8f59b59SIlya Dryomov * If there is a parent, see if it has disappeared due to the 4410e8f59b59SIlya Dryomov * mapped image getting flattened. 
4411e8f59b59SIlya Dryomov */ 4412e8f59b59SIlya Dryomov if (rbd_dev->parent) { 4413e8f59b59SIlya Dryomov ret = rbd_dev_v2_parent_info(rbd_dev); 4414e8f59b59SIlya Dryomov if (ret) 441573e39e4dSIlya Dryomov goto out; 4416e8f59b59SIlya Dryomov } 4417e8f59b59SIlya Dryomov 44185ff1108cSIlya Dryomov if (rbd_dev->spec->snap_id == CEPH_NOSNAP) { 44195ff1108cSIlya Dryomov rbd_dev->mapping.size = rbd_dev->header.image_size; 44205ff1108cSIlya Dryomov } else { 44215ff1108cSIlya Dryomov /* validate mapped snapshot's EXISTS flag */ 442215228edeSAlex Elder rbd_exists_validate(rbd_dev); 44235ff1108cSIlya Dryomov } 44245ff1108cSIlya Dryomov 442573e39e4dSIlya Dryomov out: 4426cfbf6377SAlex Elder up_write(&rbd_dev->header_rwsem); 442773e39e4dSIlya Dryomov if (!ret && mapping_size != rbd_dev->mapping.size) 44289875201eSJosh Durgin rbd_dev_update_size(rbd_dev); 44291fe5e993SAlex Elder 443073e39e4dSIlya Dryomov return ret; 44311fe5e993SAlex Elder } 44321fe5e993SAlex Elder 44337ad18afaSChristoph Hellwig static int rbd_init_request(void *data, struct request *rq, 44347ad18afaSChristoph Hellwig unsigned int hctx_idx, unsigned int request_idx, 44357ad18afaSChristoph Hellwig unsigned int numa_node) 44367ad18afaSChristoph Hellwig { 44377ad18afaSChristoph Hellwig struct work_struct *work = blk_mq_rq_to_pdu(rq); 44387ad18afaSChristoph Hellwig 44397ad18afaSChristoph Hellwig INIT_WORK(work, rbd_queue_workfn); 44407ad18afaSChristoph Hellwig return 0; 44417ad18afaSChristoph Hellwig } 44427ad18afaSChristoph Hellwig 44437ad18afaSChristoph Hellwig static struct blk_mq_ops rbd_mq_ops = { 44447ad18afaSChristoph Hellwig .queue_rq = rbd_queue_rq, 44457ad18afaSChristoph Hellwig .map_queue = blk_mq_map_queue, 44467ad18afaSChristoph Hellwig .init_request = rbd_init_request, 44477ad18afaSChristoph Hellwig }; 44487ad18afaSChristoph Hellwig 4449602adf40SYehuda Sadeh static int rbd_init_disk(struct rbd_device *rbd_dev) 4450602adf40SYehuda Sadeh { 4451602adf40SYehuda Sadeh struct gendisk *disk; 4452602adf40SYehuda 
Sadeh struct request_queue *q; 4453593a9e7bSAlex Elder u64 segment_size; 44547ad18afaSChristoph Hellwig int err; 4455602adf40SYehuda Sadeh 4456602adf40SYehuda Sadeh /* create gendisk info */ 44577e513d43SIlya Dryomov disk = alloc_disk(single_major ? 44587e513d43SIlya Dryomov (1 << RBD_SINGLE_MAJOR_PART_SHIFT) : 44597e513d43SIlya Dryomov RBD_MINORS_PER_MAJOR); 4460602adf40SYehuda Sadeh if (!disk) 44611fcdb8aaSAlex Elder return -ENOMEM; 4462602adf40SYehuda Sadeh 4463f0f8cef5SAlex Elder snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d", 4464de71a297SAlex Elder rbd_dev->dev_id); 4465602adf40SYehuda Sadeh disk->major = rbd_dev->major; 4466dd82fff1SIlya Dryomov disk->first_minor = rbd_dev->minor; 44677e513d43SIlya Dryomov if (single_major) 44687e513d43SIlya Dryomov disk->flags |= GENHD_FL_EXT_DEVT; 4469602adf40SYehuda Sadeh disk->fops = &rbd_bd_ops; 4470602adf40SYehuda Sadeh disk->private_data = rbd_dev; 4471602adf40SYehuda Sadeh 44727ad18afaSChristoph Hellwig memset(&rbd_dev->tag_set, 0, sizeof(rbd_dev->tag_set)); 44737ad18afaSChristoph Hellwig rbd_dev->tag_set.ops = &rbd_mq_ops; 4474b5584180SIlya Dryomov rbd_dev->tag_set.queue_depth = rbd_dev->opts->queue_depth; 44757ad18afaSChristoph Hellwig rbd_dev->tag_set.numa_node = NUMA_NO_NODE; 4476b5584180SIlya Dryomov rbd_dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE; 44777ad18afaSChristoph Hellwig rbd_dev->tag_set.nr_hw_queues = 1; 44787ad18afaSChristoph Hellwig rbd_dev->tag_set.cmd_size = sizeof(struct work_struct); 44797ad18afaSChristoph Hellwig 44807ad18afaSChristoph Hellwig err = blk_mq_alloc_tag_set(&rbd_dev->tag_set); 44817ad18afaSChristoph Hellwig if (err) 4482602adf40SYehuda Sadeh goto out_disk; 4483029bcbd8SJosh Durgin 44847ad18afaSChristoph Hellwig q = blk_mq_init_queue(&rbd_dev->tag_set); 44857ad18afaSChristoph Hellwig if (IS_ERR(q)) { 44867ad18afaSChristoph Hellwig err = PTR_ERR(q); 44877ad18afaSChristoph Hellwig goto out_tag_set; 44887ad18afaSChristoph Hellwig } 
44897ad18afaSChristoph Hellwig 4490d8a2c89cSIlya Dryomov queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q); 4491d8a2c89cSIlya Dryomov /* QUEUE_FLAG_ADD_RANDOM is off by default for blk-mq */ 4492593a9e7bSAlex Elder 4493029bcbd8SJosh Durgin /* set io sizes to object size */ 4494593a9e7bSAlex Elder segment_size = rbd_obj_bytes(&rbd_dev->header); 4495593a9e7bSAlex Elder blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE); 44960d9fde4fSIlya Dryomov q->limits.max_sectors = queue_max_hw_sectors(q); 4497d3834fefSIlya Dryomov blk_queue_max_segments(q, segment_size / SECTOR_SIZE); 4498593a9e7bSAlex Elder blk_queue_max_segment_size(q, segment_size); 4499593a9e7bSAlex Elder blk_queue_io_min(q, segment_size); 4500593a9e7bSAlex Elder blk_queue_io_opt(q, segment_size); 4501029bcbd8SJosh Durgin 450290e98c52SGuangliang Zhao /* enable the discard support */ 450390e98c52SGuangliang Zhao queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q); 450490e98c52SGuangliang Zhao q->limits.discard_granularity = segment_size; 450590e98c52SGuangliang Zhao q->limits.discard_alignment = segment_size; 45062bb4cd5cSJens Axboe blk_queue_max_discard_sectors(q, segment_size / SECTOR_SIZE); 4507b76f8239SJosh Durgin q->limits.discard_zeroes_data = 1; 450890e98c52SGuangliang Zhao 4509bae818eeSRonny Hegewald if (!ceph_test_opt(rbd_dev->rbd_client->client, NOCRC)) 4510bae818eeSRonny Hegewald q->backing_dev_info.capabilities |= BDI_CAP_STABLE_WRITES; 4511bae818eeSRonny Hegewald 4512602adf40SYehuda Sadeh disk->queue = q; 4513602adf40SYehuda Sadeh 4514602adf40SYehuda Sadeh q->queuedata = rbd_dev; 4515602adf40SYehuda Sadeh 4516602adf40SYehuda Sadeh rbd_dev->disk = disk; 4517602adf40SYehuda Sadeh 4518602adf40SYehuda Sadeh return 0; 45197ad18afaSChristoph Hellwig out_tag_set: 45207ad18afaSChristoph Hellwig blk_mq_free_tag_set(&rbd_dev->tag_set); 4521602adf40SYehuda Sadeh out_disk: 4522602adf40SYehuda Sadeh put_disk(disk); 45237ad18afaSChristoph Hellwig return err; 4524602adf40SYehuda Sadeh } 4525602adf40SYehuda Sadeh 
4526dfc5606dSYehuda Sadeh /* 4527dfc5606dSYehuda Sadeh sysfs 4528dfc5606dSYehuda Sadeh */ 4529602adf40SYehuda Sadeh 4530593a9e7bSAlex Elder static struct rbd_device *dev_to_rbd_dev(struct device *dev) 4531593a9e7bSAlex Elder { 4532593a9e7bSAlex Elder return container_of(dev, struct rbd_device, dev); 4533593a9e7bSAlex Elder } 4534593a9e7bSAlex Elder 4535dfc5606dSYehuda Sadeh static ssize_t rbd_size_show(struct device *dev, 4536dfc5606dSYehuda Sadeh struct device_attribute *attr, char *buf) 4537602adf40SYehuda Sadeh { 4538593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 4539dfc5606dSYehuda Sadeh 4540fc71d833SAlex Elder return sprintf(buf, "%llu\n", 4541fc71d833SAlex Elder (unsigned long long)rbd_dev->mapping.size); 4542602adf40SYehuda Sadeh } 4543602adf40SYehuda Sadeh 454434b13184SAlex Elder /* 454534b13184SAlex Elder * Note this shows the features for whatever's mapped, which is not 454634b13184SAlex Elder * necessarily the base image. 454734b13184SAlex Elder */ 454834b13184SAlex Elder static ssize_t rbd_features_show(struct device *dev, 454934b13184SAlex Elder struct device_attribute *attr, char *buf) 455034b13184SAlex Elder { 455134b13184SAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 455234b13184SAlex Elder 455334b13184SAlex Elder return sprintf(buf, "0x%016llx\n", 455434b13184SAlex Elder (unsigned long long)rbd_dev->mapping.features); 455534b13184SAlex Elder } 455634b13184SAlex Elder 4557dfc5606dSYehuda Sadeh static ssize_t rbd_major_show(struct device *dev, 4558dfc5606dSYehuda Sadeh struct device_attribute *attr, char *buf) 4559602adf40SYehuda Sadeh { 4560593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 4561dfc5606dSYehuda Sadeh 4562fc71d833SAlex Elder if (rbd_dev->major) 4563dfc5606dSYehuda Sadeh return sprintf(buf, "%d\n", rbd_dev->major); 4564fc71d833SAlex Elder 4565fc71d833SAlex Elder return sprintf(buf, "(none)\n"); 4566dd82fff1SIlya Dryomov } 4567fc71d833SAlex Elder 4568dd82fff1SIlya Dryomov static 
ssize_t rbd_minor_show(struct device *dev, 4569dd82fff1SIlya Dryomov struct device_attribute *attr, char *buf) 4570dd82fff1SIlya Dryomov { 4571dd82fff1SIlya Dryomov struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 4572dd82fff1SIlya Dryomov 4573dd82fff1SIlya Dryomov return sprintf(buf, "%d\n", rbd_dev->minor); 4574dfc5606dSYehuda Sadeh } 4575dfc5606dSYehuda Sadeh 4576005a07bfSIlya Dryomov static ssize_t rbd_client_addr_show(struct device *dev, 4577005a07bfSIlya Dryomov struct device_attribute *attr, char *buf) 4578005a07bfSIlya Dryomov { 4579005a07bfSIlya Dryomov struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 4580005a07bfSIlya Dryomov struct ceph_entity_addr *client_addr = 4581005a07bfSIlya Dryomov ceph_client_addr(rbd_dev->rbd_client->client); 4582005a07bfSIlya Dryomov 4583005a07bfSIlya Dryomov return sprintf(buf, "%pISpc/%u\n", &client_addr->in_addr, 4584005a07bfSIlya Dryomov le32_to_cpu(client_addr->nonce)); 4585005a07bfSIlya Dryomov } 4586005a07bfSIlya Dryomov 4587dfc5606dSYehuda Sadeh static ssize_t rbd_client_id_show(struct device *dev, 4588dfc5606dSYehuda Sadeh struct device_attribute *attr, char *buf) 4589dfc5606dSYehuda Sadeh { 4590593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 4591dfc5606dSYehuda Sadeh 45921dbb4399SAlex Elder return sprintf(buf, "client%lld\n", 4593033268a5SIlya Dryomov ceph_client_gid(rbd_dev->rbd_client->client)); 4594dfc5606dSYehuda Sadeh } 4595dfc5606dSYehuda Sadeh 4596267fb90bSMike Christie static ssize_t rbd_cluster_fsid_show(struct device *dev, 4597267fb90bSMike Christie struct device_attribute *attr, char *buf) 4598267fb90bSMike Christie { 4599267fb90bSMike Christie struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 4600267fb90bSMike Christie 4601267fb90bSMike Christie return sprintf(buf, "%pU\n", &rbd_dev->rbd_client->client->fsid); 4602267fb90bSMike Christie } 4603267fb90bSMike Christie 46040d6d1e9cSMike Christie static ssize_t rbd_config_info_show(struct device *dev, 46050d6d1e9cSMike Christie struct 
device_attribute *attr, char *buf) 46060d6d1e9cSMike Christie { 46070d6d1e9cSMike Christie struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 46080d6d1e9cSMike Christie 46090d6d1e9cSMike Christie return sprintf(buf, "%s\n", rbd_dev->config_info); 46100d6d1e9cSMike Christie } 46110d6d1e9cSMike Christie 4612dfc5606dSYehuda Sadeh static ssize_t rbd_pool_show(struct device *dev, 4613dfc5606dSYehuda Sadeh struct device_attribute *attr, char *buf) 4614dfc5606dSYehuda Sadeh { 4615593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 4616dfc5606dSYehuda Sadeh 46170d7dbfceSAlex Elder return sprintf(buf, "%s\n", rbd_dev->spec->pool_name); 4618dfc5606dSYehuda Sadeh } 4619dfc5606dSYehuda Sadeh 46209bb2f334SAlex Elder static ssize_t rbd_pool_id_show(struct device *dev, 46219bb2f334SAlex Elder struct device_attribute *attr, char *buf) 46229bb2f334SAlex Elder { 46239bb2f334SAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 46249bb2f334SAlex Elder 46250d7dbfceSAlex Elder return sprintf(buf, "%llu\n", 46260d7dbfceSAlex Elder (unsigned long long) rbd_dev->spec->pool_id); 46279bb2f334SAlex Elder } 46289bb2f334SAlex Elder 4629dfc5606dSYehuda Sadeh static ssize_t rbd_name_show(struct device *dev, 4630dfc5606dSYehuda Sadeh struct device_attribute *attr, char *buf) 4631dfc5606dSYehuda Sadeh { 4632593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 4633dfc5606dSYehuda Sadeh 4634a92ffdf8SAlex Elder if (rbd_dev->spec->image_name) 46350d7dbfceSAlex Elder return sprintf(buf, "%s\n", rbd_dev->spec->image_name); 4636a92ffdf8SAlex Elder 4637a92ffdf8SAlex Elder return sprintf(buf, "(unknown)\n"); 4638dfc5606dSYehuda Sadeh } 4639dfc5606dSYehuda Sadeh 4640589d30e0SAlex Elder static ssize_t rbd_image_id_show(struct device *dev, 4641589d30e0SAlex Elder struct device_attribute *attr, char *buf) 4642589d30e0SAlex Elder { 4643589d30e0SAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 4644589d30e0SAlex Elder 46450d7dbfceSAlex Elder return 
sprintf(buf, "%s\n", rbd_dev->spec->image_id); 4646589d30e0SAlex Elder } 4647589d30e0SAlex Elder 464834b13184SAlex Elder /* 464934b13184SAlex Elder * Shows the name of the currently-mapped snapshot (or 465034b13184SAlex Elder * RBD_SNAP_HEAD_NAME for the base image). 465134b13184SAlex Elder */ 4652dfc5606dSYehuda Sadeh static ssize_t rbd_snap_show(struct device *dev, 4653dfc5606dSYehuda Sadeh struct device_attribute *attr, 4654dfc5606dSYehuda Sadeh char *buf) 4655dfc5606dSYehuda Sadeh { 4656593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 4657dfc5606dSYehuda Sadeh 46580d7dbfceSAlex Elder return sprintf(buf, "%s\n", rbd_dev->spec->snap_name); 4659dfc5606dSYehuda Sadeh } 4660dfc5606dSYehuda Sadeh 466192a58671SMike Christie static ssize_t rbd_snap_id_show(struct device *dev, 466292a58671SMike Christie struct device_attribute *attr, char *buf) 466392a58671SMike Christie { 466492a58671SMike Christie struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 466592a58671SMike Christie 466692a58671SMike Christie return sprintf(buf, "%llu\n", rbd_dev->spec->snap_id); 466792a58671SMike Christie } 466892a58671SMike Christie 466986b00e0dSAlex Elder /* 4670ff96128fSIlya Dryomov * For a v2 image, shows the chain of parent images, separated by empty 4671ff96128fSIlya Dryomov * lines. For v1 images or if there is no parent, shows "(no parent 4672ff96128fSIlya Dryomov * image)". 
467386b00e0dSAlex Elder */ 467486b00e0dSAlex Elder static ssize_t rbd_parent_show(struct device *dev, 467586b00e0dSAlex Elder struct device_attribute *attr, 467686b00e0dSAlex Elder char *buf) 467786b00e0dSAlex Elder { 467886b00e0dSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 4679ff96128fSIlya Dryomov ssize_t count = 0; 468086b00e0dSAlex Elder 4681ff96128fSIlya Dryomov if (!rbd_dev->parent) 468286b00e0dSAlex Elder return sprintf(buf, "(no parent image)\n"); 468386b00e0dSAlex Elder 4684ff96128fSIlya Dryomov for ( ; rbd_dev->parent; rbd_dev = rbd_dev->parent) { 4685ff96128fSIlya Dryomov struct rbd_spec *spec = rbd_dev->parent_spec; 468686b00e0dSAlex Elder 4687ff96128fSIlya Dryomov count += sprintf(&buf[count], "%s" 4688ff96128fSIlya Dryomov "pool_id %llu\npool_name %s\n" 4689ff96128fSIlya Dryomov "image_id %s\nimage_name %s\n" 4690ff96128fSIlya Dryomov "snap_id %llu\nsnap_name %s\n" 4691ff96128fSIlya Dryomov "overlap %llu\n", 4692ff96128fSIlya Dryomov !count ? "" : "\n", /* first? 
*/ 4693ff96128fSIlya Dryomov spec->pool_id, spec->pool_name, 4694ff96128fSIlya Dryomov spec->image_id, spec->image_name ?: "(unknown)", 4695ff96128fSIlya Dryomov spec->snap_id, spec->snap_name, 4696ff96128fSIlya Dryomov rbd_dev->parent_overlap); 4697ff96128fSIlya Dryomov } 469886b00e0dSAlex Elder 469986b00e0dSAlex Elder return count; 470086b00e0dSAlex Elder } 470186b00e0dSAlex Elder 4702dfc5606dSYehuda Sadeh static ssize_t rbd_image_refresh(struct device *dev, 4703dfc5606dSYehuda Sadeh struct device_attribute *attr, 4704dfc5606dSYehuda Sadeh const char *buf, 4705dfc5606dSYehuda Sadeh size_t size) 4706dfc5606dSYehuda Sadeh { 4707593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 4708b813623aSAlex Elder int ret; 4709602adf40SYehuda Sadeh 4710cc4a38bdSAlex Elder ret = rbd_dev_refresh(rbd_dev); 4711e627db08SAlex Elder if (ret) 471252bb1f9bSIlya Dryomov return ret; 4713b813623aSAlex Elder 471452bb1f9bSIlya Dryomov return size; 4715dfc5606dSYehuda Sadeh } 4716602adf40SYehuda Sadeh 4717dfc5606dSYehuda Sadeh static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL); 471834b13184SAlex Elder static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL); 4719dfc5606dSYehuda Sadeh static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL); 4720dd82fff1SIlya Dryomov static DEVICE_ATTR(minor, S_IRUGO, rbd_minor_show, NULL); 4721005a07bfSIlya Dryomov static DEVICE_ATTR(client_addr, S_IRUGO, rbd_client_addr_show, NULL); 4722dfc5606dSYehuda Sadeh static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL); 4723267fb90bSMike Christie static DEVICE_ATTR(cluster_fsid, S_IRUGO, rbd_cluster_fsid_show, NULL); 47240d6d1e9cSMike Christie static DEVICE_ATTR(config_info, S_IRUSR, rbd_config_info_show, NULL); 4725dfc5606dSYehuda Sadeh static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL); 47269bb2f334SAlex Elder static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL); 4727dfc5606dSYehuda Sadeh static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL); 4728589d30e0SAlex 
Elder static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL); 4729dfc5606dSYehuda Sadeh static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh); 4730dfc5606dSYehuda Sadeh static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL); 473192a58671SMike Christie static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL); 473286b00e0dSAlex Elder static DEVICE_ATTR(parent, S_IRUGO, rbd_parent_show, NULL); 4733dfc5606dSYehuda Sadeh 4734dfc5606dSYehuda Sadeh static struct attribute *rbd_attrs[] = { 4735dfc5606dSYehuda Sadeh &dev_attr_size.attr, 473634b13184SAlex Elder &dev_attr_features.attr, 4737dfc5606dSYehuda Sadeh &dev_attr_major.attr, 4738dd82fff1SIlya Dryomov &dev_attr_minor.attr, 4739005a07bfSIlya Dryomov &dev_attr_client_addr.attr, 4740dfc5606dSYehuda Sadeh &dev_attr_client_id.attr, 4741267fb90bSMike Christie &dev_attr_cluster_fsid.attr, 47420d6d1e9cSMike Christie &dev_attr_config_info.attr, 4743dfc5606dSYehuda Sadeh &dev_attr_pool.attr, 47449bb2f334SAlex Elder &dev_attr_pool_id.attr, 4745dfc5606dSYehuda Sadeh &dev_attr_name.attr, 4746589d30e0SAlex Elder &dev_attr_image_id.attr, 4747dfc5606dSYehuda Sadeh &dev_attr_current_snap.attr, 474892a58671SMike Christie &dev_attr_snap_id.attr, 474986b00e0dSAlex Elder &dev_attr_parent.attr, 4750dfc5606dSYehuda Sadeh &dev_attr_refresh.attr, 4751dfc5606dSYehuda Sadeh NULL 4752dfc5606dSYehuda Sadeh }; 4753dfc5606dSYehuda Sadeh 4754dfc5606dSYehuda Sadeh static struct attribute_group rbd_attr_group = { 4755dfc5606dSYehuda Sadeh .attrs = rbd_attrs, 4756dfc5606dSYehuda Sadeh }; 4757dfc5606dSYehuda Sadeh 4758dfc5606dSYehuda Sadeh static const struct attribute_group *rbd_attr_groups[] = { 4759dfc5606dSYehuda Sadeh &rbd_attr_group, 4760dfc5606dSYehuda Sadeh NULL 4761dfc5606dSYehuda Sadeh }; 4762dfc5606dSYehuda Sadeh 47636cac4695SIlya Dryomov static void rbd_dev_release(struct device *dev); 4764dfc5606dSYehuda Sadeh 4765dfc5606dSYehuda Sadeh static struct device_type rbd_device_type = { 4766dfc5606dSYehuda Sadeh .name 
= "rbd", 4767dfc5606dSYehuda Sadeh .groups = rbd_attr_groups, 47686cac4695SIlya Dryomov .release = rbd_dev_release, 4769dfc5606dSYehuda Sadeh }; 4770dfc5606dSYehuda Sadeh 47718b8fb99cSAlex Elder static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec) 47728b8fb99cSAlex Elder { 47738b8fb99cSAlex Elder kref_get(&spec->kref); 47748b8fb99cSAlex Elder 47758b8fb99cSAlex Elder return spec; 47768b8fb99cSAlex Elder } 47778b8fb99cSAlex Elder 47788b8fb99cSAlex Elder static void rbd_spec_free(struct kref *kref); 47798b8fb99cSAlex Elder static void rbd_spec_put(struct rbd_spec *spec) 47808b8fb99cSAlex Elder { 47818b8fb99cSAlex Elder if (spec) 47828b8fb99cSAlex Elder kref_put(&spec->kref, rbd_spec_free); 47838b8fb99cSAlex Elder } 47848b8fb99cSAlex Elder 47858b8fb99cSAlex Elder static struct rbd_spec *rbd_spec_alloc(void) 47868b8fb99cSAlex Elder { 47878b8fb99cSAlex Elder struct rbd_spec *spec; 47888b8fb99cSAlex Elder 47898b8fb99cSAlex Elder spec = kzalloc(sizeof (*spec), GFP_KERNEL); 47908b8fb99cSAlex Elder if (!spec) 47918b8fb99cSAlex Elder return NULL; 479204077599SIlya Dryomov 479304077599SIlya Dryomov spec->pool_id = CEPH_NOPOOL; 479404077599SIlya Dryomov spec->snap_id = CEPH_NOSNAP; 47958b8fb99cSAlex Elder kref_init(&spec->kref); 47968b8fb99cSAlex Elder 47978b8fb99cSAlex Elder return spec; 47988b8fb99cSAlex Elder } 47998b8fb99cSAlex Elder 48008b8fb99cSAlex Elder static void rbd_spec_free(struct kref *kref) 48018b8fb99cSAlex Elder { 48028b8fb99cSAlex Elder struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref); 48038b8fb99cSAlex Elder 48048b8fb99cSAlex Elder kfree(spec->pool_name); 48058b8fb99cSAlex Elder kfree(spec->image_id); 48068b8fb99cSAlex Elder kfree(spec->image_name); 48078b8fb99cSAlex Elder kfree(spec->snap_name); 48088b8fb99cSAlex Elder kfree(spec); 48098b8fb99cSAlex Elder } 48108b8fb99cSAlex Elder 48111643dfa4SIlya Dryomov static void rbd_dev_free(struct rbd_device *rbd_dev) 4812dd5ac32dSIlya Dryomov { 481399d16943SIlya Dryomov 
WARN_ON(rbd_dev->watch_state != RBD_WATCH_STATE_UNREGISTERED); 4814ed95b21aSIlya Dryomov WARN_ON(rbd_dev->lock_state != RBD_LOCK_STATE_UNLOCKED); 481599d16943SIlya Dryomov 4816c41d13a3SIlya Dryomov ceph_oid_destroy(&rbd_dev->header_oid); 48176b6dddbeSIlya Dryomov ceph_oloc_destroy(&rbd_dev->header_oloc); 48180d6d1e9cSMike Christie kfree(rbd_dev->config_info); 4819c41d13a3SIlya Dryomov 4820dd5ac32dSIlya Dryomov rbd_put_client(rbd_dev->rbd_client); 4821dd5ac32dSIlya Dryomov rbd_spec_put(rbd_dev->spec); 4822dd5ac32dSIlya Dryomov kfree(rbd_dev->opts); 4823dd5ac32dSIlya Dryomov kfree(rbd_dev); 48241643dfa4SIlya Dryomov } 48251643dfa4SIlya Dryomov 48261643dfa4SIlya Dryomov static void rbd_dev_release(struct device *dev) 48271643dfa4SIlya Dryomov { 48281643dfa4SIlya Dryomov struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 48291643dfa4SIlya Dryomov bool need_put = !!rbd_dev->opts; 48301643dfa4SIlya Dryomov 48311643dfa4SIlya Dryomov if (need_put) { 48321643dfa4SIlya Dryomov destroy_workqueue(rbd_dev->task_wq); 48331643dfa4SIlya Dryomov ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id); 48341643dfa4SIlya Dryomov } 48351643dfa4SIlya Dryomov 48361643dfa4SIlya Dryomov rbd_dev_free(rbd_dev); 4837dd5ac32dSIlya Dryomov 4838dd5ac32dSIlya Dryomov /* 4839dd5ac32dSIlya Dryomov * This is racy, but way better than putting module outside of 4840dd5ac32dSIlya Dryomov * the release callback. The race window is pretty small, so 4841dd5ac32dSIlya Dryomov * doing something similar to dm (dm-builtin.c) is overkill. 
4842dd5ac32dSIlya Dryomov */ 4843dd5ac32dSIlya Dryomov if (need_put) 4844dd5ac32dSIlya Dryomov module_put(THIS_MODULE); 4845dd5ac32dSIlya Dryomov } 4846dd5ac32dSIlya Dryomov 48471643dfa4SIlya Dryomov static struct rbd_device *__rbd_dev_create(struct rbd_client *rbdc, 48481643dfa4SIlya Dryomov struct rbd_spec *spec) 4849c53d5893SAlex Elder { 4850c53d5893SAlex Elder struct rbd_device *rbd_dev; 4851c53d5893SAlex Elder 4852c53d5893SAlex Elder rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL); 4853c53d5893SAlex Elder if (!rbd_dev) 4854c53d5893SAlex Elder return NULL; 4855c53d5893SAlex Elder 4856c53d5893SAlex Elder spin_lock_init(&rbd_dev->lock); 4857c53d5893SAlex Elder INIT_LIST_HEAD(&rbd_dev->node); 4858c53d5893SAlex Elder init_rwsem(&rbd_dev->header_rwsem); 4859c53d5893SAlex Elder 4860c41d13a3SIlya Dryomov ceph_oid_init(&rbd_dev->header_oid); 4861922dab61SIlya Dryomov ceph_oloc_init(&rbd_dev->header_oloc); 4862c41d13a3SIlya Dryomov 486399d16943SIlya Dryomov mutex_init(&rbd_dev->watch_mutex); 486499d16943SIlya Dryomov rbd_dev->watch_state = RBD_WATCH_STATE_UNREGISTERED; 486599d16943SIlya Dryomov INIT_DELAYED_WORK(&rbd_dev->watch_dwork, rbd_reregister_watch); 486699d16943SIlya Dryomov 4867ed95b21aSIlya Dryomov init_rwsem(&rbd_dev->lock_rwsem); 4868ed95b21aSIlya Dryomov rbd_dev->lock_state = RBD_LOCK_STATE_UNLOCKED; 4869ed95b21aSIlya Dryomov INIT_WORK(&rbd_dev->acquired_lock_work, rbd_notify_acquired_lock); 4870ed95b21aSIlya Dryomov INIT_WORK(&rbd_dev->released_lock_work, rbd_notify_released_lock); 4871ed95b21aSIlya Dryomov INIT_DELAYED_WORK(&rbd_dev->lock_dwork, rbd_acquire_lock); 4872ed95b21aSIlya Dryomov INIT_WORK(&rbd_dev->unlock_work, rbd_release_lock_work); 4873ed95b21aSIlya Dryomov init_waitqueue_head(&rbd_dev->lock_waitq); 4874ed95b21aSIlya Dryomov 4875dd5ac32dSIlya Dryomov rbd_dev->dev.bus = &rbd_bus_type; 4876dd5ac32dSIlya Dryomov rbd_dev->dev.type = &rbd_device_type; 4877dd5ac32dSIlya Dryomov rbd_dev->dev.parent = &rbd_root_dev; 4878dd5ac32dSIlya Dryomov 
device_initialize(&rbd_dev->dev); 4879dd5ac32dSIlya Dryomov 4880c53d5893SAlex Elder rbd_dev->rbd_client = rbdc; 4881d147543dSIlya Dryomov rbd_dev->spec = spec; 48820903e875SAlex Elder 48837627151eSYan, Zheng rbd_dev->layout.stripe_unit = 1 << RBD_MAX_OBJ_ORDER; 48847627151eSYan, Zheng rbd_dev->layout.stripe_count = 1; 48857627151eSYan, Zheng rbd_dev->layout.object_size = 1 << RBD_MAX_OBJ_ORDER; 48867627151eSYan, Zheng rbd_dev->layout.pool_id = spec->pool_id; 488730c156d9SYan, Zheng RCU_INIT_POINTER(rbd_dev->layout.pool_ns, NULL); 48880903e875SAlex Elder 48891643dfa4SIlya Dryomov return rbd_dev; 48901643dfa4SIlya Dryomov } 48911643dfa4SIlya Dryomov 4892dd5ac32dSIlya Dryomov /* 48931643dfa4SIlya Dryomov * Create a mapping rbd_dev. 4894dd5ac32dSIlya Dryomov */ 48951643dfa4SIlya Dryomov static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc, 48961643dfa4SIlya Dryomov struct rbd_spec *spec, 48971643dfa4SIlya Dryomov struct rbd_options *opts) 48981643dfa4SIlya Dryomov { 48991643dfa4SIlya Dryomov struct rbd_device *rbd_dev; 49001643dfa4SIlya Dryomov 49011643dfa4SIlya Dryomov rbd_dev = __rbd_dev_create(rbdc, spec); 49021643dfa4SIlya Dryomov if (!rbd_dev) 49031643dfa4SIlya Dryomov return NULL; 49041643dfa4SIlya Dryomov 49051643dfa4SIlya Dryomov rbd_dev->opts = opts; 49061643dfa4SIlya Dryomov 49071643dfa4SIlya Dryomov /* get an id and fill in device name */ 49081643dfa4SIlya Dryomov rbd_dev->dev_id = ida_simple_get(&rbd_dev_id_ida, 0, 49091643dfa4SIlya Dryomov minor_to_rbd_dev_id(1 << MINORBITS), 49101643dfa4SIlya Dryomov GFP_KERNEL); 49111643dfa4SIlya Dryomov if (rbd_dev->dev_id < 0) 49121643dfa4SIlya Dryomov goto fail_rbd_dev; 49131643dfa4SIlya Dryomov 49141643dfa4SIlya Dryomov sprintf(rbd_dev->name, RBD_DRV_NAME "%d", rbd_dev->dev_id); 49151643dfa4SIlya Dryomov rbd_dev->task_wq = alloc_ordered_workqueue("%s-tasks", WQ_MEM_RECLAIM, 49161643dfa4SIlya Dryomov rbd_dev->name); 49171643dfa4SIlya Dryomov if (!rbd_dev->task_wq) 49181643dfa4SIlya Dryomov goto 
fail_dev_id; 49191643dfa4SIlya Dryomov 49201643dfa4SIlya Dryomov /* we have a ref from do_rbd_add() */ 4921dd5ac32dSIlya Dryomov __module_get(THIS_MODULE); 4922dd5ac32dSIlya Dryomov 49231643dfa4SIlya Dryomov dout("%s rbd_dev %p dev_id %d\n", __func__, rbd_dev, rbd_dev->dev_id); 4924c53d5893SAlex Elder return rbd_dev; 49251643dfa4SIlya Dryomov 49261643dfa4SIlya Dryomov fail_dev_id: 49271643dfa4SIlya Dryomov ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id); 49281643dfa4SIlya Dryomov fail_rbd_dev: 49291643dfa4SIlya Dryomov rbd_dev_free(rbd_dev); 49301643dfa4SIlya Dryomov return NULL; 4931c53d5893SAlex Elder } 4932c53d5893SAlex Elder 4933c53d5893SAlex Elder static void rbd_dev_destroy(struct rbd_device *rbd_dev) 4934c53d5893SAlex Elder { 4935dd5ac32dSIlya Dryomov if (rbd_dev) 4936dd5ac32dSIlya Dryomov put_device(&rbd_dev->dev); 4937c53d5893SAlex Elder } 4938c53d5893SAlex Elder 4939dfc5606dSYehuda Sadeh /* 49409d475de5SAlex Elder * Get the size and object order for an image snapshot, or if 49419d475de5SAlex Elder * snap_id is CEPH_NOSNAP, gets this information for the base 49429d475de5SAlex Elder * image. 
49439d475de5SAlex Elder */ 49449d475de5SAlex Elder static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id, 49459d475de5SAlex Elder u8 *order, u64 *snap_size) 49469d475de5SAlex Elder { 49479d475de5SAlex Elder __le64 snapid = cpu_to_le64(snap_id); 49489d475de5SAlex Elder int ret; 49499d475de5SAlex Elder struct { 49509d475de5SAlex Elder u8 order; 49519d475de5SAlex Elder __le64 size; 49529d475de5SAlex Elder } __attribute__ ((packed)) size_buf = { 0 }; 49539d475de5SAlex Elder 4954c41d13a3SIlya Dryomov ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name, 49559d475de5SAlex Elder "rbd", "get_size", 49564157976bSAlex Elder &snapid, sizeof (snapid), 4957e2a58ee5SAlex Elder &size_buf, sizeof (size_buf)); 495836be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 49599d475de5SAlex Elder if (ret < 0) 49609d475de5SAlex Elder return ret; 496157385b51SAlex Elder if (ret < sizeof (size_buf)) 496257385b51SAlex Elder return -ERANGE; 49639d475de5SAlex Elder 4964c3545579SJosh Durgin if (order) { 49659d475de5SAlex Elder *order = size_buf.order; 4966c3545579SJosh Durgin dout(" order %u", (unsigned int)*order); 4967c3545579SJosh Durgin } 49689d475de5SAlex Elder *snap_size = le64_to_cpu(size_buf.size); 49699d475de5SAlex Elder 4970c3545579SJosh Durgin dout(" snap_id 0x%016llx snap_size = %llu\n", 4971c3545579SJosh Durgin (unsigned long long)snap_id, 49729d475de5SAlex Elder (unsigned long long)*snap_size); 49739d475de5SAlex Elder 49749d475de5SAlex Elder return 0; 49759d475de5SAlex Elder } 49769d475de5SAlex Elder 49779d475de5SAlex Elder static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev) 49789d475de5SAlex Elder { 49799d475de5SAlex Elder return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP, 49809d475de5SAlex Elder &rbd_dev->header.obj_order, 49819d475de5SAlex Elder &rbd_dev->header.image_size); 49829d475de5SAlex Elder } 49839d475de5SAlex Elder 49841e130199SAlex Elder static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev) 
49851e130199SAlex Elder { 49861e130199SAlex Elder void *reply_buf; 49871e130199SAlex Elder int ret; 49881e130199SAlex Elder void *p; 49891e130199SAlex Elder 49901e130199SAlex Elder reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL); 49911e130199SAlex Elder if (!reply_buf) 49921e130199SAlex Elder return -ENOMEM; 49931e130199SAlex Elder 4994c41d13a3SIlya Dryomov ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name, 49954157976bSAlex Elder "rbd", "get_object_prefix", NULL, 0, 4996e2a58ee5SAlex Elder reply_buf, RBD_OBJ_PREFIX_LEN_MAX); 499736be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 49981e130199SAlex Elder if (ret < 0) 49991e130199SAlex Elder goto out; 50001e130199SAlex Elder 50011e130199SAlex Elder p = reply_buf; 50021e130199SAlex Elder rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p, 500357385b51SAlex Elder p + ret, NULL, GFP_NOIO); 500457385b51SAlex Elder ret = 0; 50051e130199SAlex Elder 50061e130199SAlex Elder if (IS_ERR(rbd_dev->header.object_prefix)) { 50071e130199SAlex Elder ret = PTR_ERR(rbd_dev->header.object_prefix); 50081e130199SAlex Elder rbd_dev->header.object_prefix = NULL; 50091e130199SAlex Elder } else { 50101e130199SAlex Elder dout(" object_prefix = %s\n", rbd_dev->header.object_prefix); 50111e130199SAlex Elder } 50121e130199SAlex Elder out: 50131e130199SAlex Elder kfree(reply_buf); 50141e130199SAlex Elder 50151e130199SAlex Elder return ret; 50161e130199SAlex Elder } 50171e130199SAlex Elder 5018b1b5402aSAlex Elder static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id, 5019b1b5402aSAlex Elder u64 *snap_features) 5020b1b5402aSAlex Elder { 5021b1b5402aSAlex Elder __le64 snapid = cpu_to_le64(snap_id); 5022b1b5402aSAlex Elder struct { 5023b1b5402aSAlex Elder __le64 features; 5024b1b5402aSAlex Elder __le64 incompat; 50254157976bSAlex Elder } __attribute__ ((packed)) features_buf = { 0 }; 5026d3767f0fSIlya Dryomov u64 unsup; 5027b1b5402aSAlex Elder int ret; 
5028b1b5402aSAlex Elder 5029c41d13a3SIlya Dryomov ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name, 5030b1b5402aSAlex Elder "rbd", "get_features", 50314157976bSAlex Elder &snapid, sizeof (snapid), 5032e2a58ee5SAlex Elder &features_buf, sizeof (features_buf)); 503336be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 5034b1b5402aSAlex Elder if (ret < 0) 5035b1b5402aSAlex Elder return ret; 503657385b51SAlex Elder if (ret < sizeof (features_buf)) 503757385b51SAlex Elder return -ERANGE; 5038d889140cSAlex Elder 5039d3767f0fSIlya Dryomov unsup = le64_to_cpu(features_buf.incompat) & ~RBD_FEATURES_SUPPORTED; 5040d3767f0fSIlya Dryomov if (unsup) { 5041d3767f0fSIlya Dryomov rbd_warn(rbd_dev, "image uses unsupported features: 0x%llx", 5042d3767f0fSIlya Dryomov unsup); 5043b8f5c6edSAlex Elder return -ENXIO; 5044d3767f0fSIlya Dryomov } 5045d889140cSAlex Elder 5046b1b5402aSAlex Elder *snap_features = le64_to_cpu(features_buf.features); 5047b1b5402aSAlex Elder 5048b1b5402aSAlex Elder dout(" snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n", 5049b1b5402aSAlex Elder (unsigned long long)snap_id, 5050b1b5402aSAlex Elder (unsigned long long)*snap_features, 5051b1b5402aSAlex Elder (unsigned long long)le64_to_cpu(features_buf.incompat)); 5052b1b5402aSAlex Elder 5053b1b5402aSAlex Elder return 0; 5054b1b5402aSAlex Elder } 5055b1b5402aSAlex Elder 5056b1b5402aSAlex Elder static int rbd_dev_v2_features(struct rbd_device *rbd_dev) 5057b1b5402aSAlex Elder { 5058b1b5402aSAlex Elder return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP, 5059b1b5402aSAlex Elder &rbd_dev->header.features); 5060b1b5402aSAlex Elder } 5061b1b5402aSAlex Elder 506286b00e0dSAlex Elder static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev) 506386b00e0dSAlex Elder { 506486b00e0dSAlex Elder struct rbd_spec *parent_spec; 506586b00e0dSAlex Elder size_t size; 506686b00e0dSAlex Elder void *reply_buf = NULL; 506786b00e0dSAlex Elder __le64 snapid; 506886b00e0dSAlex Elder 
void *p; 506986b00e0dSAlex Elder void *end; 5070642a2537SAlex Elder u64 pool_id; 507186b00e0dSAlex Elder char *image_id; 50723b5cf2a2SAlex Elder u64 snap_id; 507386b00e0dSAlex Elder u64 overlap; 507486b00e0dSAlex Elder int ret; 507586b00e0dSAlex Elder 507686b00e0dSAlex Elder parent_spec = rbd_spec_alloc(); 507786b00e0dSAlex Elder if (!parent_spec) 507886b00e0dSAlex Elder return -ENOMEM; 507986b00e0dSAlex Elder 508086b00e0dSAlex Elder size = sizeof (__le64) + /* pool_id */ 508186b00e0dSAlex Elder sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX + /* image_id */ 508286b00e0dSAlex Elder sizeof (__le64) + /* snap_id */ 508386b00e0dSAlex Elder sizeof (__le64); /* overlap */ 508486b00e0dSAlex Elder reply_buf = kmalloc(size, GFP_KERNEL); 508586b00e0dSAlex Elder if (!reply_buf) { 508686b00e0dSAlex Elder ret = -ENOMEM; 508786b00e0dSAlex Elder goto out_err; 508886b00e0dSAlex Elder } 508986b00e0dSAlex Elder 50904d9b67cdSIlya Dryomov snapid = cpu_to_le64(rbd_dev->spec->snap_id); 5091c41d13a3SIlya Dryomov ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name, 509286b00e0dSAlex Elder "rbd", "get_parent", 50934157976bSAlex Elder &snapid, sizeof (snapid), 5094e2a58ee5SAlex Elder reply_buf, size); 509536be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 509686b00e0dSAlex Elder if (ret < 0) 509786b00e0dSAlex Elder goto out_err; 509886b00e0dSAlex Elder 509986b00e0dSAlex Elder p = reply_buf; 510057385b51SAlex Elder end = reply_buf + ret; 510157385b51SAlex Elder ret = -ERANGE; 5102642a2537SAlex Elder ceph_decode_64_safe(&p, end, pool_id, out_err); 5103392a9dadSAlex Elder if (pool_id == CEPH_NOPOOL) { 5104392a9dadSAlex Elder /* 5105392a9dadSAlex Elder * Either the parent never existed, or we have 5106392a9dadSAlex Elder * record of it but the image got flattened so it no 5107392a9dadSAlex Elder * longer has a parent. When the parent of a 5108392a9dadSAlex Elder * layered image disappears we immediately set the 5109392a9dadSAlex Elder * overlap to 0. 
The effect of this is that all new 5110392a9dadSAlex Elder * requests will be treated as if the image had no 5111392a9dadSAlex Elder * parent. 5112392a9dadSAlex Elder */ 5113392a9dadSAlex Elder if (rbd_dev->parent_overlap) { 5114392a9dadSAlex Elder rbd_dev->parent_overlap = 0; 5115392a9dadSAlex Elder rbd_dev_parent_put(rbd_dev); 5116392a9dadSAlex Elder pr_info("%s: clone image has been flattened\n", 5117392a9dadSAlex Elder rbd_dev->disk->disk_name); 5118392a9dadSAlex Elder } 5119392a9dadSAlex Elder 512086b00e0dSAlex Elder goto out; /* No parent? No problem. */ 5121392a9dadSAlex Elder } 512286b00e0dSAlex Elder 51230903e875SAlex Elder /* The ceph file layout needs to fit pool id in 32 bits */ 51240903e875SAlex Elder 51250903e875SAlex Elder ret = -EIO; 5126642a2537SAlex Elder if (pool_id > (u64)U32_MAX) { 51279584d508SIlya Dryomov rbd_warn(NULL, "parent pool id too large (%llu > %u)", 5128642a2537SAlex Elder (unsigned long long)pool_id, U32_MAX); 512957385b51SAlex Elder goto out_err; 5130c0cd10dbSAlex Elder } 51310903e875SAlex Elder 5132979ed480SAlex Elder image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL); 513386b00e0dSAlex Elder if (IS_ERR(image_id)) { 513486b00e0dSAlex Elder ret = PTR_ERR(image_id); 513586b00e0dSAlex Elder goto out_err; 513686b00e0dSAlex Elder } 51373b5cf2a2SAlex Elder ceph_decode_64_safe(&p, end, snap_id, out_err); 513886b00e0dSAlex Elder ceph_decode_64_safe(&p, end, overlap, out_err); 513986b00e0dSAlex Elder 51403b5cf2a2SAlex Elder /* 51413b5cf2a2SAlex Elder * The parent won't change (except when the clone is 51423b5cf2a2SAlex Elder * flattened, already handled that). So we only need to 51433b5cf2a2SAlex Elder * record the parent spec we have not already done so. 
51443b5cf2a2SAlex Elder */ 51453b5cf2a2SAlex Elder if (!rbd_dev->parent_spec) { 51463b5cf2a2SAlex Elder parent_spec->pool_id = pool_id; 51473b5cf2a2SAlex Elder parent_spec->image_id = image_id; 51483b5cf2a2SAlex Elder parent_spec->snap_id = snap_id; 514986b00e0dSAlex Elder rbd_dev->parent_spec = parent_spec; 515086b00e0dSAlex Elder parent_spec = NULL; /* rbd_dev now owns this */ 5151fbba11b3SIlya Dryomov } else { 5152fbba11b3SIlya Dryomov kfree(image_id); 51533b5cf2a2SAlex Elder } 51543b5cf2a2SAlex Elder 51553b5cf2a2SAlex Elder /* 5156cf32bd9cSIlya Dryomov * We always update the parent overlap. If it's zero we issue 5157cf32bd9cSIlya Dryomov * a warning, as we will proceed as if there was no parent. 51583b5cf2a2SAlex Elder */ 51593b5cf2a2SAlex Elder if (!overlap) { 51603b5cf2a2SAlex Elder if (parent_spec) { 5161cf32bd9cSIlya Dryomov /* refresh, careful to warn just once */ 5162cf32bd9cSIlya Dryomov if (rbd_dev->parent_overlap) 5163cf32bd9cSIlya Dryomov rbd_warn(rbd_dev, 5164cf32bd9cSIlya Dryomov "clone now standalone (overlap became 0)"); 516570cf49cfSAlex Elder } else { 5166cf32bd9cSIlya Dryomov /* initial probe */ 5167cf32bd9cSIlya Dryomov rbd_warn(rbd_dev, "clone is standalone (overlap 0)"); 51683b5cf2a2SAlex Elder } 516970cf49cfSAlex Elder } 5170cf32bd9cSIlya Dryomov rbd_dev->parent_overlap = overlap; 5171cf32bd9cSIlya Dryomov 517286b00e0dSAlex Elder out: 517386b00e0dSAlex Elder ret = 0; 517486b00e0dSAlex Elder out_err: 517586b00e0dSAlex Elder kfree(reply_buf); 517686b00e0dSAlex Elder rbd_spec_put(parent_spec); 517786b00e0dSAlex Elder 517886b00e0dSAlex Elder return ret; 517986b00e0dSAlex Elder } 518086b00e0dSAlex Elder 5181cc070d59SAlex Elder static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev) 5182cc070d59SAlex Elder { 5183cc070d59SAlex Elder struct { 5184cc070d59SAlex Elder __le64 stripe_unit; 5185cc070d59SAlex Elder __le64 stripe_count; 5186cc070d59SAlex Elder } __attribute__ ((packed)) striping_info_buf = { 0 }; 5187cc070d59SAlex Elder size_t 
size = sizeof (striping_info_buf); 5188cc070d59SAlex Elder void *p; 5189cc070d59SAlex Elder u64 obj_size; 5190cc070d59SAlex Elder u64 stripe_unit; 5191cc070d59SAlex Elder u64 stripe_count; 5192cc070d59SAlex Elder int ret; 5193cc070d59SAlex Elder 5194c41d13a3SIlya Dryomov ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name, 5195cc070d59SAlex Elder "rbd", "get_stripe_unit_count", NULL, 0, 5196e2a58ee5SAlex Elder (char *)&striping_info_buf, size); 5197cc070d59SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 5198cc070d59SAlex Elder if (ret < 0) 5199cc070d59SAlex Elder return ret; 5200cc070d59SAlex Elder if (ret < size) 5201cc070d59SAlex Elder return -ERANGE; 5202cc070d59SAlex Elder 5203cc070d59SAlex Elder /* 5204cc070d59SAlex Elder * We don't actually support the "fancy striping" feature 5205cc070d59SAlex Elder * (STRIPINGV2) yet, but if the striping sizes are the 5206cc070d59SAlex Elder * defaults the behavior is the same as before. So find 5207cc070d59SAlex Elder * out, and only fail if the image has non-default values. 
5208cc070d59SAlex Elder */ 5209cc070d59SAlex Elder ret = -EINVAL; 5210cc070d59SAlex Elder obj_size = (u64)1 << rbd_dev->header.obj_order; 5211cc070d59SAlex Elder p = &striping_info_buf; 5212cc070d59SAlex Elder stripe_unit = ceph_decode_64(&p); 5213cc070d59SAlex Elder if (stripe_unit != obj_size) { 5214cc070d59SAlex Elder rbd_warn(rbd_dev, "unsupported stripe unit " 5215cc070d59SAlex Elder "(got %llu want %llu)", 5216cc070d59SAlex Elder stripe_unit, obj_size); 5217cc070d59SAlex Elder return -EINVAL; 5218cc070d59SAlex Elder } 5219cc070d59SAlex Elder stripe_count = ceph_decode_64(&p); 5220cc070d59SAlex Elder if (stripe_count != 1) { 5221cc070d59SAlex Elder rbd_warn(rbd_dev, "unsupported stripe count " 5222cc070d59SAlex Elder "(got %llu want 1)", stripe_count); 5223cc070d59SAlex Elder return -EINVAL; 5224cc070d59SAlex Elder } 5225500d0c0fSAlex Elder rbd_dev->header.stripe_unit = stripe_unit; 5226500d0c0fSAlex Elder rbd_dev->header.stripe_count = stripe_count; 5227cc070d59SAlex Elder 5228cc070d59SAlex Elder return 0; 5229cc070d59SAlex Elder } 5230cc070d59SAlex Elder 52319e15b77dSAlex Elder static char *rbd_dev_image_name(struct rbd_device *rbd_dev) 52329e15b77dSAlex Elder { 52339e15b77dSAlex Elder size_t image_id_size; 52349e15b77dSAlex Elder char *image_id; 52359e15b77dSAlex Elder void *p; 52369e15b77dSAlex Elder void *end; 52379e15b77dSAlex Elder size_t size; 52389e15b77dSAlex Elder void *reply_buf = NULL; 52399e15b77dSAlex Elder size_t len = 0; 52409e15b77dSAlex Elder char *image_name = NULL; 52419e15b77dSAlex Elder int ret; 52429e15b77dSAlex Elder 52439e15b77dSAlex Elder rbd_assert(!rbd_dev->spec->image_name); 52449e15b77dSAlex Elder 524569e7a02fSAlex Elder len = strlen(rbd_dev->spec->image_id); 524669e7a02fSAlex Elder image_id_size = sizeof (__le32) + len; 52479e15b77dSAlex Elder image_id = kmalloc(image_id_size, GFP_KERNEL); 52489e15b77dSAlex Elder if (!image_id) 52499e15b77dSAlex Elder return NULL; 52509e15b77dSAlex Elder 52519e15b77dSAlex Elder p = image_id; 
52524157976bSAlex Elder end = image_id + image_id_size; 525369e7a02fSAlex Elder ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32)len); 52549e15b77dSAlex Elder 52559e15b77dSAlex Elder size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX; 52569e15b77dSAlex Elder reply_buf = kmalloc(size, GFP_KERNEL); 52579e15b77dSAlex Elder if (!reply_buf) 52589e15b77dSAlex Elder goto out; 52599e15b77dSAlex Elder 526036be9a76SAlex Elder ret = rbd_obj_method_sync(rbd_dev, RBD_DIRECTORY, 52619e15b77dSAlex Elder "rbd", "dir_get_name", 52629e15b77dSAlex Elder image_id, image_id_size, 5263e2a58ee5SAlex Elder reply_buf, size); 52649e15b77dSAlex Elder if (ret < 0) 52659e15b77dSAlex Elder goto out; 52669e15b77dSAlex Elder p = reply_buf; 5267f40eb349SAlex Elder end = reply_buf + ret; 5268f40eb349SAlex Elder 52699e15b77dSAlex Elder image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL); 52709e15b77dSAlex Elder if (IS_ERR(image_name)) 52719e15b77dSAlex Elder image_name = NULL; 52729e15b77dSAlex Elder else 52739e15b77dSAlex Elder dout("%s: name is %s len is %zd\n", __func__, image_name, len); 52749e15b77dSAlex Elder out: 52759e15b77dSAlex Elder kfree(reply_buf); 52769e15b77dSAlex Elder kfree(image_id); 52779e15b77dSAlex Elder 52789e15b77dSAlex Elder return image_name; 52799e15b77dSAlex Elder } 52809e15b77dSAlex Elder 52812ad3d716SAlex Elder static u64 rbd_v1_snap_id_by_name(struct rbd_device *rbd_dev, const char *name) 52822ad3d716SAlex Elder { 52832ad3d716SAlex Elder struct ceph_snap_context *snapc = rbd_dev->header.snapc; 52842ad3d716SAlex Elder const char *snap_name; 52852ad3d716SAlex Elder u32 which = 0; 52862ad3d716SAlex Elder 52872ad3d716SAlex Elder /* Skip over names until we find the one we are looking for */ 52882ad3d716SAlex Elder 52892ad3d716SAlex Elder snap_name = rbd_dev->header.snap_names; 52902ad3d716SAlex Elder while (which < snapc->num_snaps) { 52912ad3d716SAlex Elder if (!strcmp(name, snap_name)) 52922ad3d716SAlex Elder return snapc->snaps[which]; 
52932ad3d716SAlex Elder snap_name += strlen(snap_name) + 1; 52942ad3d716SAlex Elder which++; 52952ad3d716SAlex Elder } 52962ad3d716SAlex Elder return CEPH_NOSNAP; 52972ad3d716SAlex Elder } 52982ad3d716SAlex Elder 52992ad3d716SAlex Elder static u64 rbd_v2_snap_id_by_name(struct rbd_device *rbd_dev, const char *name) 53002ad3d716SAlex Elder { 53012ad3d716SAlex Elder struct ceph_snap_context *snapc = rbd_dev->header.snapc; 53022ad3d716SAlex Elder u32 which; 53032ad3d716SAlex Elder bool found = false; 53042ad3d716SAlex Elder u64 snap_id; 53052ad3d716SAlex Elder 53062ad3d716SAlex Elder for (which = 0; !found && which < snapc->num_snaps; which++) { 53072ad3d716SAlex Elder const char *snap_name; 53082ad3d716SAlex Elder 53092ad3d716SAlex Elder snap_id = snapc->snaps[which]; 53102ad3d716SAlex Elder snap_name = rbd_dev_v2_snap_name(rbd_dev, snap_id); 5311efadc98aSJosh Durgin if (IS_ERR(snap_name)) { 5312efadc98aSJosh Durgin /* ignore no-longer existing snapshots */ 5313efadc98aSJosh Durgin if (PTR_ERR(snap_name) == -ENOENT) 5314efadc98aSJosh Durgin continue; 5315efadc98aSJosh Durgin else 53162ad3d716SAlex Elder break; 5317efadc98aSJosh Durgin } 53182ad3d716SAlex Elder found = !strcmp(name, snap_name); 53192ad3d716SAlex Elder kfree(snap_name); 53202ad3d716SAlex Elder } 53212ad3d716SAlex Elder return found ? snap_id : CEPH_NOSNAP; 53222ad3d716SAlex Elder } 53232ad3d716SAlex Elder 53242ad3d716SAlex Elder /* 53252ad3d716SAlex Elder * Assumes name is never RBD_SNAP_HEAD_NAME; returns CEPH_NOSNAP if 53262ad3d716SAlex Elder * no snapshot by that name is found, or if an error occurs. 
53272ad3d716SAlex Elder */ 53282ad3d716SAlex Elder static u64 rbd_snap_id_by_name(struct rbd_device *rbd_dev, const char *name) 53292ad3d716SAlex Elder { 53302ad3d716SAlex Elder if (rbd_dev->image_format == 1) 53312ad3d716SAlex Elder return rbd_v1_snap_id_by_name(rbd_dev, name); 53322ad3d716SAlex Elder 53332ad3d716SAlex Elder return rbd_v2_snap_id_by_name(rbd_dev, name); 53342ad3d716SAlex Elder } 53352ad3d716SAlex Elder 53369e15b77dSAlex Elder /* 533704077599SIlya Dryomov * An image being mapped will have everything but the snap id. 53389e15b77dSAlex Elder */ 533904077599SIlya Dryomov static int rbd_spec_fill_snap_id(struct rbd_device *rbd_dev) 534004077599SIlya Dryomov { 534104077599SIlya Dryomov struct rbd_spec *spec = rbd_dev->spec; 534204077599SIlya Dryomov 534304077599SIlya Dryomov rbd_assert(spec->pool_id != CEPH_NOPOOL && spec->pool_name); 534404077599SIlya Dryomov rbd_assert(spec->image_id && spec->image_name); 534504077599SIlya Dryomov rbd_assert(spec->snap_name); 534604077599SIlya Dryomov 534704077599SIlya Dryomov if (strcmp(spec->snap_name, RBD_SNAP_HEAD_NAME)) { 534804077599SIlya Dryomov u64 snap_id; 534904077599SIlya Dryomov 535004077599SIlya Dryomov snap_id = rbd_snap_id_by_name(rbd_dev, spec->snap_name); 535104077599SIlya Dryomov if (snap_id == CEPH_NOSNAP) 535204077599SIlya Dryomov return -ENOENT; 535304077599SIlya Dryomov 535404077599SIlya Dryomov spec->snap_id = snap_id; 535504077599SIlya Dryomov } else { 535604077599SIlya Dryomov spec->snap_id = CEPH_NOSNAP; 535704077599SIlya Dryomov } 535804077599SIlya Dryomov 535904077599SIlya Dryomov return 0; 536004077599SIlya Dryomov } 536104077599SIlya Dryomov 536204077599SIlya Dryomov /* 536304077599SIlya Dryomov * A parent image will have all ids but none of the names. 536404077599SIlya Dryomov * 536504077599SIlya Dryomov * All names in an rbd spec are dynamically allocated. It's OK if we 536604077599SIlya Dryomov * can't figure out the name for an image id. 
536704077599SIlya Dryomov */ 536804077599SIlya Dryomov static int rbd_spec_fill_names(struct rbd_device *rbd_dev) 53699e15b77dSAlex Elder { 53702e9f7f1cSAlex Elder struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 53712e9f7f1cSAlex Elder struct rbd_spec *spec = rbd_dev->spec; 53722e9f7f1cSAlex Elder const char *pool_name; 53732e9f7f1cSAlex Elder const char *image_name; 53742e9f7f1cSAlex Elder const char *snap_name; 53759e15b77dSAlex Elder int ret; 53769e15b77dSAlex Elder 537704077599SIlya Dryomov rbd_assert(spec->pool_id != CEPH_NOPOOL); 537804077599SIlya Dryomov rbd_assert(spec->image_id); 537904077599SIlya Dryomov rbd_assert(spec->snap_id != CEPH_NOSNAP); 53809e15b77dSAlex Elder 53812e9f7f1cSAlex Elder /* Get the pool name; we have to make our own copy of this */ 53829e15b77dSAlex Elder 53832e9f7f1cSAlex Elder pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, spec->pool_id); 53842e9f7f1cSAlex Elder if (!pool_name) { 53852e9f7f1cSAlex Elder rbd_warn(rbd_dev, "no pool with id %llu", spec->pool_id); 5386935dc89fSAlex Elder return -EIO; 5387935dc89fSAlex Elder } 53882e9f7f1cSAlex Elder pool_name = kstrdup(pool_name, GFP_KERNEL); 53892e9f7f1cSAlex Elder if (!pool_name) 53909e15b77dSAlex Elder return -ENOMEM; 53919e15b77dSAlex Elder 53929e15b77dSAlex Elder /* Fetch the image name; tolerate failure here */ 53939e15b77dSAlex Elder 53942e9f7f1cSAlex Elder image_name = rbd_dev_image_name(rbd_dev); 53952e9f7f1cSAlex Elder if (!image_name) 539606ecc6cbSAlex Elder rbd_warn(rbd_dev, "unable to get image name"); 53979e15b77dSAlex Elder 539804077599SIlya Dryomov /* Fetch the snapshot name */ 53999e15b77dSAlex Elder 54002e9f7f1cSAlex Elder snap_name = rbd_snap_name(rbd_dev, spec->snap_id); 5401da6a6b63SJosh Durgin if (IS_ERR(snap_name)) { 5402da6a6b63SJosh Durgin ret = PTR_ERR(snap_name); 54039e15b77dSAlex Elder goto out_err; 54042e9f7f1cSAlex Elder } 54052e9f7f1cSAlex Elder 54062e9f7f1cSAlex Elder spec->pool_name = pool_name; 54072e9f7f1cSAlex Elder 
spec->image_name = image_name; 54082e9f7f1cSAlex Elder spec->snap_name = snap_name; 54099e15b77dSAlex Elder 54109e15b77dSAlex Elder return 0; 541104077599SIlya Dryomov 54129e15b77dSAlex Elder out_err: 54132e9f7f1cSAlex Elder kfree(image_name); 54142e9f7f1cSAlex Elder kfree(pool_name); 54159e15b77dSAlex Elder return ret; 54169e15b77dSAlex Elder } 54179e15b77dSAlex Elder 5418cc4a38bdSAlex Elder static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev) 541935d489f9SAlex Elder { 542035d489f9SAlex Elder size_t size; 542135d489f9SAlex Elder int ret; 542235d489f9SAlex Elder void *reply_buf; 542335d489f9SAlex Elder void *p; 542435d489f9SAlex Elder void *end; 542535d489f9SAlex Elder u64 seq; 542635d489f9SAlex Elder u32 snap_count; 542735d489f9SAlex Elder struct ceph_snap_context *snapc; 542835d489f9SAlex Elder u32 i; 542935d489f9SAlex Elder 543035d489f9SAlex Elder /* 543135d489f9SAlex Elder * We'll need room for the seq value (maximum snapshot id), 543235d489f9SAlex Elder * snapshot count, and array of that many snapshot ids. 543335d489f9SAlex Elder * For now we have a fixed upper limit on the number we're 543435d489f9SAlex Elder * prepared to receive. 
543535d489f9SAlex Elder */ 543635d489f9SAlex Elder size = sizeof (__le64) + sizeof (__le32) + 543735d489f9SAlex Elder RBD_MAX_SNAP_COUNT * sizeof (__le64); 543835d489f9SAlex Elder reply_buf = kzalloc(size, GFP_KERNEL); 543935d489f9SAlex Elder if (!reply_buf) 544035d489f9SAlex Elder return -ENOMEM; 544135d489f9SAlex Elder 5442c41d13a3SIlya Dryomov ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name, 54434157976bSAlex Elder "rbd", "get_snapcontext", NULL, 0, 5444e2a58ee5SAlex Elder reply_buf, size); 544536be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 544635d489f9SAlex Elder if (ret < 0) 544735d489f9SAlex Elder goto out; 544835d489f9SAlex Elder 544935d489f9SAlex Elder p = reply_buf; 545057385b51SAlex Elder end = reply_buf + ret; 545157385b51SAlex Elder ret = -ERANGE; 545235d489f9SAlex Elder ceph_decode_64_safe(&p, end, seq, out); 545335d489f9SAlex Elder ceph_decode_32_safe(&p, end, snap_count, out); 545435d489f9SAlex Elder 545535d489f9SAlex Elder /* 545635d489f9SAlex Elder * Make sure the reported number of snapshot ids wouldn't go 545735d489f9SAlex Elder * beyond the end of our buffer. But before checking that, 545835d489f9SAlex Elder * make sure the computed size of the snapshot context we 545935d489f9SAlex Elder * allocate is representable in a size_t. 
546035d489f9SAlex Elder */ 546135d489f9SAlex Elder if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context)) 546235d489f9SAlex Elder / sizeof (u64)) { 546335d489f9SAlex Elder ret = -EINVAL; 546435d489f9SAlex Elder goto out; 546535d489f9SAlex Elder } 546635d489f9SAlex Elder if (!ceph_has_room(&p, end, snap_count * sizeof (__le64))) 546735d489f9SAlex Elder goto out; 5468468521c1SAlex Elder ret = 0; 546935d489f9SAlex Elder 5470812164f8SAlex Elder snapc = ceph_create_snap_context(snap_count, GFP_KERNEL); 547135d489f9SAlex Elder if (!snapc) { 547235d489f9SAlex Elder ret = -ENOMEM; 547335d489f9SAlex Elder goto out; 547435d489f9SAlex Elder } 547535d489f9SAlex Elder snapc->seq = seq; 547635d489f9SAlex Elder for (i = 0; i < snap_count; i++) 547735d489f9SAlex Elder snapc->snaps[i] = ceph_decode_64(&p); 547835d489f9SAlex Elder 547949ece554SAlex Elder ceph_put_snap_context(rbd_dev->header.snapc); 548035d489f9SAlex Elder rbd_dev->header.snapc = snapc; 548135d489f9SAlex Elder 548235d489f9SAlex Elder dout(" snap context seq = %llu, snap_count = %u\n", 548335d489f9SAlex Elder (unsigned long long)seq, (unsigned int)snap_count); 548435d489f9SAlex Elder out: 548535d489f9SAlex Elder kfree(reply_buf); 548635d489f9SAlex Elder 548757385b51SAlex Elder return ret; 548835d489f9SAlex Elder } 548935d489f9SAlex Elder 549054cac61fSAlex Elder static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, 549154cac61fSAlex Elder u64 snap_id) 5492b8b1e2dbSAlex Elder { 5493b8b1e2dbSAlex Elder size_t size; 5494b8b1e2dbSAlex Elder void *reply_buf; 549554cac61fSAlex Elder __le64 snapid; 5496b8b1e2dbSAlex Elder int ret; 5497b8b1e2dbSAlex Elder void *p; 5498b8b1e2dbSAlex Elder void *end; 5499b8b1e2dbSAlex Elder char *snap_name; 5500b8b1e2dbSAlex Elder 5501b8b1e2dbSAlex Elder size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN; 5502b8b1e2dbSAlex Elder reply_buf = kmalloc(size, GFP_KERNEL); 5503b8b1e2dbSAlex Elder if (!reply_buf) 5504b8b1e2dbSAlex Elder return ERR_PTR(-ENOMEM); 5505b8b1e2dbSAlex 
Elder 550654cac61fSAlex Elder snapid = cpu_to_le64(snap_id); 5507c41d13a3SIlya Dryomov ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name, 5508b8b1e2dbSAlex Elder "rbd", "get_snapshot_name", 550954cac61fSAlex Elder &snapid, sizeof (snapid), 5510e2a58ee5SAlex Elder reply_buf, size); 551136be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 5512f40eb349SAlex Elder if (ret < 0) { 5513f40eb349SAlex Elder snap_name = ERR_PTR(ret); 5514b8b1e2dbSAlex Elder goto out; 5515f40eb349SAlex Elder } 5516b8b1e2dbSAlex Elder 5517b8b1e2dbSAlex Elder p = reply_buf; 5518f40eb349SAlex Elder end = reply_buf + ret; 5519e5c35534SAlex Elder snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL); 5520f40eb349SAlex Elder if (IS_ERR(snap_name)) 5521b8b1e2dbSAlex Elder goto out; 5522f40eb349SAlex Elder 5523b8b1e2dbSAlex Elder dout(" snap_id 0x%016llx snap_name = %s\n", 552454cac61fSAlex Elder (unsigned long long)snap_id, snap_name); 5525b8b1e2dbSAlex Elder out: 5526b8b1e2dbSAlex Elder kfree(reply_buf); 5527b8b1e2dbSAlex Elder 5528f40eb349SAlex Elder return snap_name; 5529b8b1e2dbSAlex Elder } 5530b8b1e2dbSAlex Elder 55312df3fac7SAlex Elder static int rbd_dev_v2_header_info(struct rbd_device *rbd_dev) 5532117973fbSAlex Elder { 55332df3fac7SAlex Elder bool first_time = rbd_dev->header.object_prefix == NULL; 5534117973fbSAlex Elder int ret; 5535117973fbSAlex Elder 55361617e40cSJosh Durgin ret = rbd_dev_v2_image_size(rbd_dev); 55371617e40cSJosh Durgin if (ret) 5538cfbf6377SAlex Elder return ret; 55391617e40cSJosh Durgin 55402df3fac7SAlex Elder if (first_time) { 55412df3fac7SAlex Elder ret = rbd_dev_v2_header_onetime(rbd_dev); 55422df3fac7SAlex Elder if (ret) 5543cfbf6377SAlex Elder return ret; 55442df3fac7SAlex Elder } 55452df3fac7SAlex Elder 5546cc4a38bdSAlex Elder ret = rbd_dev_v2_snap_context(rbd_dev); 5547d194cd1dSIlya Dryomov if (ret && first_time) { 5548d194cd1dSIlya Dryomov kfree(rbd_dev->header.object_prefix); 5549d194cd1dSIlya Dryomov 
rbd_dev->header.object_prefix = NULL; 5550d194cd1dSIlya Dryomov } 5551117973fbSAlex Elder 5552117973fbSAlex Elder return ret; 5553117973fbSAlex Elder } 5554117973fbSAlex Elder 5555a720ae09SIlya Dryomov static int rbd_dev_header_info(struct rbd_device *rbd_dev) 5556a720ae09SIlya Dryomov { 5557a720ae09SIlya Dryomov rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); 5558a720ae09SIlya Dryomov 5559a720ae09SIlya Dryomov if (rbd_dev->image_format == 1) 5560a720ae09SIlya Dryomov return rbd_dev_v1_header_info(rbd_dev); 5561a720ae09SIlya Dryomov 5562a720ae09SIlya Dryomov return rbd_dev_v2_header_info(rbd_dev); 5563a720ae09SIlya Dryomov } 5564a720ae09SIlya Dryomov 55651ddbe94eSAlex Elder /* 5566e28fff26SAlex Elder * Skips over white space at *buf, and updates *buf to point to the 5567e28fff26SAlex Elder * first found non-space character (if any). Returns the length of 5568593a9e7bSAlex Elder * the token (string of non-white space characters) found. Note 5569593a9e7bSAlex Elder * that *buf must be terminated with '\0'. 5570e28fff26SAlex Elder */ 5571e28fff26SAlex Elder static inline size_t next_token(const char **buf) 5572e28fff26SAlex Elder { 5573e28fff26SAlex Elder /* 5574e28fff26SAlex Elder * These are the characters that produce nonzero for 5575e28fff26SAlex Elder * isspace() in the "C" and "POSIX" locales. 5576e28fff26SAlex Elder */ 5577e28fff26SAlex Elder const char *spaces = " \f\n\r\t\v"; 5578e28fff26SAlex Elder 5579e28fff26SAlex Elder *buf += strspn(*buf, spaces); /* Find start of token */ 5580e28fff26SAlex Elder 5581e28fff26SAlex Elder return strcspn(*buf, spaces); /* Return token length */ 5582e28fff26SAlex Elder } 5583e28fff26SAlex Elder 5584e28fff26SAlex Elder /* 5585ea3352f4SAlex Elder * Finds the next token in *buf, dynamically allocates a buffer big 5586ea3352f4SAlex Elder * enough to hold a copy of it, and copies the token into the new 5587ea3352f4SAlex Elder * buffer. The copy is guaranteed to be terminated with '\0'. 
Note 5588ea3352f4SAlex Elder * that a duplicate buffer is created even for a zero-length token. 5589ea3352f4SAlex Elder * 5590ea3352f4SAlex Elder * Returns a pointer to the newly-allocated duplicate, or a null 5591ea3352f4SAlex Elder * pointer if memory for the duplicate was not available. If 5592ea3352f4SAlex Elder * the lenp argument is a non-null pointer, the length of the token 5593ea3352f4SAlex Elder * (not including the '\0') is returned in *lenp. 5594ea3352f4SAlex Elder * 5595ea3352f4SAlex Elder * If successful, the *buf pointer will be updated to point beyond 5596ea3352f4SAlex Elder * the end of the found token. 5597ea3352f4SAlex Elder * 5598ea3352f4SAlex Elder * Note: uses GFP_KERNEL for allocation. 5599ea3352f4SAlex Elder */ 5600ea3352f4SAlex Elder static inline char *dup_token(const char **buf, size_t *lenp) 5601ea3352f4SAlex Elder { 5602ea3352f4SAlex Elder char *dup; 5603ea3352f4SAlex Elder size_t len; 5604ea3352f4SAlex Elder 5605ea3352f4SAlex Elder len = next_token(buf); 56064caf35f9SAlex Elder dup = kmemdup(*buf, len + 1, GFP_KERNEL); 5607ea3352f4SAlex Elder if (!dup) 5608ea3352f4SAlex Elder return NULL; 5609ea3352f4SAlex Elder *(dup + len) = '\0'; 5610ea3352f4SAlex Elder *buf += len; 5611ea3352f4SAlex Elder 5612ea3352f4SAlex Elder if (lenp) 5613ea3352f4SAlex Elder *lenp = len; 5614ea3352f4SAlex Elder 5615ea3352f4SAlex Elder return dup; 5616ea3352f4SAlex Elder } 5617ea3352f4SAlex Elder 5618ea3352f4SAlex Elder /* 5619859c31dfSAlex Elder * Parse the options provided for an "rbd add" (i.e., rbd image 5620859c31dfSAlex Elder * mapping) request. These arrive via a write to /sys/bus/rbd/add, 5621859c31dfSAlex Elder * and the data written is passed here via a NUL-terminated buffer. 5622859c31dfSAlex Elder * Returns 0 if successful or an error code otherwise. 
*
 * The information extracted from these options is recorded in
 * the other parameters which return dynamically-allocated
 * structures (they are only written when 0 is returned):
 *  ceph_opts
 *      The address of a pointer that will refer to a ceph options
 *      structure.  Caller must release the returned pointer using
 *      ceph_destroy_options() when it is no longer needed.
 *  opts
 *	Address of an rbd options pointer.  Fully initialized by
 *	this function; caller must release with kfree().
 *  rbd_spec
 *	Address of an rbd image specification pointer.  Fully
 *	initialized by this function based on parsed options.
 *	Caller must release with rbd_spec_put().
 *
 * The options passed take this form:
 *  <mon_addrs> <options> <pool_name> <image_name> [<snap_name>]
 * where:
 *  <mon_addrs>
 *      A comma-separated list of one or more monitor addresses.
 *      A monitor address is an ip address, optionally followed
 *      by a port number (separated by a colon).
 *        I.e.:  ip1[:port1][,ip2[:port2]...]
 *  <options>
 *      A comma-separated list of ceph and/or rbd options.
 *  <pool_name>
 *      The name of the rados pool containing the rbd image.
 *  <image_name>
 *      The name of the image in that pool to map.
 *  <snap_name>
 *      An optional snapshot name, at most RBD_MAX_SNAP_NAME_LEN
 *      characters long.  If provided, the mapping will
 *      present data from the image at the time that snapshot was
 *      created.  The image head (RBD_SNAP_HEAD_NAME) is used if no
 *      snapshot name is provided.  Snapshot mappings are always
 *      read-only.
 *
 * Errors: -EINVAL if one of the four required tokens is missing,
 * -ENAMETOOLONG if the snapshot name is too long, -ENOMEM on
 * allocation failure, or whatever ceph_parse_options() reports.
 */
static int rbd_add_parse_args(const char *buf,
				struct ceph_options **ceph_opts,
				struct rbd_options **opts,
				struct rbd_spec **rbd_spec)
{
	size_t len;
	char *options;
	const char *mon_addrs;
	char *snap_name;
	size_t mon_addrs_size;
	struct rbd_spec *spec = NULL;
	struct rbd_options *rbd_opts = NULL;
	struct ceph_options *copts;
	int ret;

	/* The first four tokens are required */

	len = next_token(&buf);
	if (!len) {
		rbd_warn(NULL, "no monitor address(es) provided");
		return -EINVAL;
	}
	/*
	 * The monitor addresses are not copied: mon_addrs points into
	 * the caller's buffer and is handed to ceph_parse_options()
	 * below together with an explicit end pointer.
	 */
	mon_addrs = buf;
	mon_addrs_size = len + 1;
	buf += len;

	/* Preset so that the plain "goto out_err" paths report -EINVAL */
	ret = -EINVAL;
	options = dup_token(&buf, NULL);
	if (!options)
		return -ENOMEM;
	if (!*options) {
		rbd_warn(NULL, "no options provided");
		goto out_err;
	}

	spec = rbd_spec_alloc();
	if (!spec)
		goto out_mem;

	spec->pool_name = dup_token(&buf, NULL);
	if (!spec->pool_name)
		goto out_mem;
	if (!*spec->pool_name) {
		rbd_warn(NULL, "no pool name provided");
		goto out_err;
	}

	spec->image_name = dup_token(&buf, NULL);
	if (!spec->image_name)
		goto out_mem;
	if (!*spec->image_name) {
		rbd_warn(NULL, "no image name provided");
		goto out_err;
	}

	/*
	 * Snapshot name is optional; default is to use "-"
	 * (indicating the head/no snapshot).
	 */
	len = next_token(&buf);
	if (!len) {
		buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
		len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
	} else if (len > RBD_MAX_SNAP_NAME_LEN) {
		ret = -ENAMETOOLONG;
		goto out_err;
	}
	/* Duplicate len bytes plus one, then terminate the copy ourselves */
	snap_name = kmemdup(buf, len + 1, GFP_KERNEL);
	if (!snap_name)
		goto out_mem;
	*(snap_name + len) = '\0';
	spec->snap_name = snap_name;

	/* Initialize all rbd options to the defaults */

	rbd_opts = kzalloc(sizeof (*rbd_opts), GFP_KERNEL);
	if (!rbd_opts)
		goto out_mem;

	rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;
	rbd_opts->queue_depth = RBD_QUEUE_DEPTH_DEFAULT;
	rbd_opts->lock_on_read = RBD_LOCK_ON_READ_DEFAULT;

	/* parse_rbd_opts_token is passed as the callback, with rbd_opts as its argument */
	copts = ceph_parse_options(options, mon_addrs,
					mon_addrs + mon_addrs_size - 1,
					parse_rbd_opts_token, rbd_opts);
	if (IS_ERR(copts)) {
		ret = PTR_ERR(copts);
		goto out_err;
	}
	/* The options string was only needed for parsing */
	kfree(options);

	*ceph_opts = copts;
	*opts = rbd_opts;
	*rbd_spec = spec;

	return 0;
out_mem:
	ret = -ENOMEM;
	/* fall through into the common cleanup */
out_err:
	kfree(rbd_opts);
	/*
	 * NOTE(review): spec is still NULL when the options token is
	 * empty; this relies on rbd_spec_put() accepting NULL - confirm.
	 */
	rbd_spec_put(spec);
	kfree(options);

	return ret;
}

/*
 * Return pool id (>= 0) or a negative error code.
576830ba1f02SIlya Dryomov */ 576930ba1f02SIlya Dryomov static int rbd_add_get_pool_id(struct rbd_client *rbdc, const char *pool_name) 577030ba1f02SIlya Dryomov { 5771a319bf56SIlya Dryomov struct ceph_options *opts = rbdc->client->options; 577230ba1f02SIlya Dryomov u64 newest_epoch; 577330ba1f02SIlya Dryomov int tries = 0; 577430ba1f02SIlya Dryomov int ret; 577530ba1f02SIlya Dryomov 577630ba1f02SIlya Dryomov again: 577730ba1f02SIlya Dryomov ret = ceph_pg_poolid_by_name(rbdc->client->osdc.osdmap, pool_name); 577830ba1f02SIlya Dryomov if (ret == -ENOENT && tries++ < 1) { 5779d0b19705SIlya Dryomov ret = ceph_monc_get_version(&rbdc->client->monc, "osdmap", 578030ba1f02SIlya Dryomov &newest_epoch); 578130ba1f02SIlya Dryomov if (ret < 0) 578230ba1f02SIlya Dryomov return ret; 578330ba1f02SIlya Dryomov 578430ba1f02SIlya Dryomov if (rbdc->client->osdc.osdmap->epoch < newest_epoch) { 57857cca78c9SIlya Dryomov ceph_osdc_maybe_request_map(&rbdc->client->osdc); 578630ba1f02SIlya Dryomov (void) ceph_monc_wait_osdmap(&rbdc->client->monc, 5787a319bf56SIlya Dryomov newest_epoch, 5788a319bf56SIlya Dryomov opts->mount_timeout); 578930ba1f02SIlya Dryomov goto again; 579030ba1f02SIlya Dryomov } else { 579130ba1f02SIlya Dryomov /* the osdmap we have is new enough */ 579230ba1f02SIlya Dryomov return -ENOENT; 579330ba1f02SIlya Dryomov } 579430ba1f02SIlya Dryomov } 579530ba1f02SIlya Dryomov 579630ba1f02SIlya Dryomov return ret; 579730ba1f02SIlya Dryomov } 579830ba1f02SIlya Dryomov 579930ba1f02SIlya Dryomov /* 5800589d30e0SAlex Elder * An rbd format 2 image has a unique identifier, distinct from the 5801589d30e0SAlex Elder * name given to it by the user. Internally, that identifier is 5802589d30e0SAlex Elder * what's used to specify the names of objects related to the image. 5803589d30e0SAlex Elder * 5804589d30e0SAlex Elder * A special "rbd id" object is used to map an rbd image name to its 5805589d30e0SAlex Elder * id. 
If that object doesn't exist, then there is no v2 rbd image 5806589d30e0SAlex Elder * with the supplied name. 5807589d30e0SAlex Elder * 5808589d30e0SAlex Elder * This function will record the given rbd_dev's image_id field if 5809589d30e0SAlex Elder * it can be determined, and in that case will return 0. If any 5810589d30e0SAlex Elder * errors occur a negative errno will be returned and the rbd_dev's 5811589d30e0SAlex Elder * image_id field will be unchanged (and should be NULL). 5812589d30e0SAlex Elder */ 5813589d30e0SAlex Elder static int rbd_dev_image_id(struct rbd_device *rbd_dev) 5814589d30e0SAlex Elder { 5815589d30e0SAlex Elder int ret; 5816589d30e0SAlex Elder size_t size; 5817589d30e0SAlex Elder char *object_name; 5818589d30e0SAlex Elder void *response; 5819c0fba368SAlex Elder char *image_id; 58202f82ee54SAlex Elder 5821589d30e0SAlex Elder /* 58222c0d0a10SAlex Elder * When probing a parent image, the image id is already 58232c0d0a10SAlex Elder * known (and the image name likely is not). There's no 5824c0fba368SAlex Elder * need to fetch the image id again in this case. We 5825c0fba368SAlex Elder * do still need to set the image format though. 58262c0d0a10SAlex Elder */ 5827c0fba368SAlex Elder if (rbd_dev->spec->image_id) { 5828c0fba368SAlex Elder rbd_dev->image_format = *rbd_dev->spec->image_id ? 2 : 1; 5829c0fba368SAlex Elder 58302c0d0a10SAlex Elder return 0; 5831c0fba368SAlex Elder } 58322c0d0a10SAlex Elder 58332c0d0a10SAlex Elder /* 5834589d30e0SAlex Elder * First, see if the format 2 image id file exists, and if 5835589d30e0SAlex Elder * so, get the image's persistent id from it. 
5836589d30e0SAlex Elder */ 583769e7a02fSAlex Elder size = sizeof (RBD_ID_PREFIX) + strlen(rbd_dev->spec->image_name); 5838589d30e0SAlex Elder object_name = kmalloc(size, GFP_NOIO); 5839589d30e0SAlex Elder if (!object_name) 5840589d30e0SAlex Elder return -ENOMEM; 58410d7dbfceSAlex Elder sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->spec->image_name); 5842589d30e0SAlex Elder dout("rbd id object name is %s\n", object_name); 5843589d30e0SAlex Elder 5844589d30e0SAlex Elder /* Response will be an encoded string, which includes a length */ 5845589d30e0SAlex Elder 5846589d30e0SAlex Elder size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX; 5847589d30e0SAlex Elder response = kzalloc(size, GFP_NOIO); 5848589d30e0SAlex Elder if (!response) { 5849589d30e0SAlex Elder ret = -ENOMEM; 5850589d30e0SAlex Elder goto out; 5851589d30e0SAlex Elder } 5852589d30e0SAlex Elder 5853c0fba368SAlex Elder /* If it doesn't exist we'll assume it's a format 1 image */ 5854c0fba368SAlex Elder 585536be9a76SAlex Elder ret = rbd_obj_method_sync(rbd_dev, object_name, 58564157976bSAlex Elder "rbd", "get_id", NULL, 0, 5857e2a58ee5SAlex Elder response, RBD_IMAGE_ID_LEN_MAX); 585836be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 5859c0fba368SAlex Elder if (ret == -ENOENT) { 5860c0fba368SAlex Elder image_id = kstrdup("", GFP_KERNEL); 5861c0fba368SAlex Elder ret = image_id ? 
0 : -ENOMEM; 5862c0fba368SAlex Elder if (!ret) 5863c0fba368SAlex Elder rbd_dev->image_format = 1; 58647dd440c9SIlya Dryomov } else if (ret >= 0) { 5865c0fba368SAlex Elder void *p = response; 5866589d30e0SAlex Elder 5867c0fba368SAlex Elder image_id = ceph_extract_encoded_string(&p, p + ret, 5868979ed480SAlex Elder NULL, GFP_NOIO); 5869461f758aSDuan Jiong ret = PTR_ERR_OR_ZERO(image_id); 5870c0fba368SAlex Elder if (!ret) 5871c0fba368SAlex Elder rbd_dev->image_format = 2; 5872c0fba368SAlex Elder } 5873c0fba368SAlex Elder 5874c0fba368SAlex Elder if (!ret) { 5875c0fba368SAlex Elder rbd_dev->spec->image_id = image_id; 5876c0fba368SAlex Elder dout("image_id is %s\n", image_id); 5877589d30e0SAlex Elder } 5878589d30e0SAlex Elder out: 5879589d30e0SAlex Elder kfree(response); 5880589d30e0SAlex Elder kfree(object_name); 5881589d30e0SAlex Elder 5882589d30e0SAlex Elder return ret; 5883589d30e0SAlex Elder } 5884589d30e0SAlex Elder 58853abef3b3SAlex Elder /* 58863abef3b3SAlex Elder * Undo whatever state changes are made by v1 or v2 header info 58873abef3b3SAlex Elder * call. 
58883abef3b3SAlex Elder */ 58896fd48b3bSAlex Elder static void rbd_dev_unprobe(struct rbd_device *rbd_dev) 58906fd48b3bSAlex Elder { 58916fd48b3bSAlex Elder struct rbd_image_header *header; 58926fd48b3bSAlex Elder 5893a2acd00eSAlex Elder rbd_dev_parent_put(rbd_dev); 58946fd48b3bSAlex Elder 58956fd48b3bSAlex Elder /* Free dynamic fields from the header, then zero it out */ 58966fd48b3bSAlex Elder 58976fd48b3bSAlex Elder header = &rbd_dev->header; 5898812164f8SAlex Elder ceph_put_snap_context(header->snapc); 58996fd48b3bSAlex Elder kfree(header->snap_sizes); 59006fd48b3bSAlex Elder kfree(header->snap_names); 59016fd48b3bSAlex Elder kfree(header->object_prefix); 59026fd48b3bSAlex Elder memset(header, 0, sizeof (*header)); 59036fd48b3bSAlex Elder } 59046fd48b3bSAlex Elder 59052df3fac7SAlex Elder static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev) 5906a30b71b9SAlex Elder { 5907a30b71b9SAlex Elder int ret; 5908a30b71b9SAlex Elder 59091e130199SAlex Elder ret = rbd_dev_v2_object_prefix(rbd_dev); 591057385b51SAlex Elder if (ret) 59111e130199SAlex Elder goto out_err; 5912b1b5402aSAlex Elder 59132df3fac7SAlex Elder /* 59142df3fac7SAlex Elder * Get the and check features for the image. Currently the 59152df3fac7SAlex Elder * features are assumed to never change. 
59162df3fac7SAlex Elder */ 5917b1b5402aSAlex Elder ret = rbd_dev_v2_features(rbd_dev); 591857385b51SAlex Elder if (ret) 5919b1b5402aSAlex Elder goto out_err; 592035d489f9SAlex Elder 5921cc070d59SAlex Elder /* If the image supports fancy striping, get its parameters */ 5922cc070d59SAlex Elder 5923cc070d59SAlex Elder if (rbd_dev->header.features & RBD_FEATURE_STRIPINGV2) { 5924cc070d59SAlex Elder ret = rbd_dev_v2_striping_info(rbd_dev); 5925cc070d59SAlex Elder if (ret < 0) 5926cc070d59SAlex Elder goto out_err; 5927cc070d59SAlex Elder } 59282df3fac7SAlex Elder /* No support for crypto and compression type format 2 images */ 5929a30b71b9SAlex Elder 593035152979SAlex Elder return 0; 59319d475de5SAlex Elder out_err: 5932642a2537SAlex Elder rbd_dev->header.features = 0; 59331e130199SAlex Elder kfree(rbd_dev->header.object_prefix); 59341e130199SAlex Elder rbd_dev->header.object_prefix = NULL; 59359d475de5SAlex Elder 59369d475de5SAlex Elder return ret; 5937a30b71b9SAlex Elder } 5938a30b71b9SAlex Elder 59396d69bb53SIlya Dryomov /* 59406d69bb53SIlya Dryomov * @depth is rbd_dev_image_probe() -> rbd_dev_probe_parent() -> 59416d69bb53SIlya Dryomov * rbd_dev_image_probe() recursion depth, which means it's also the 59426d69bb53SIlya Dryomov * length of the already discovered part of the parent chain. 
59436d69bb53SIlya Dryomov */ 59446d69bb53SIlya Dryomov static int rbd_dev_probe_parent(struct rbd_device *rbd_dev, int depth) 594583a06263SAlex Elder { 59462f82ee54SAlex Elder struct rbd_device *parent = NULL; 5947124afba2SAlex Elder int ret; 5948124afba2SAlex Elder 5949124afba2SAlex Elder if (!rbd_dev->parent_spec) 5950124afba2SAlex Elder return 0; 5951124afba2SAlex Elder 59526d69bb53SIlya Dryomov if (++depth > RBD_MAX_PARENT_CHAIN_LEN) { 59536d69bb53SIlya Dryomov pr_info("parent chain is too long (%d)\n", depth); 59546d69bb53SIlya Dryomov ret = -EINVAL; 59556d69bb53SIlya Dryomov goto out_err; 59566d69bb53SIlya Dryomov } 59576d69bb53SIlya Dryomov 59581643dfa4SIlya Dryomov parent = __rbd_dev_create(rbd_dev->rbd_client, rbd_dev->parent_spec); 59591f2c6651SIlya Dryomov if (!parent) { 5960124afba2SAlex Elder ret = -ENOMEM; 5961124afba2SAlex Elder goto out_err; 59621f2c6651SIlya Dryomov } 59631f2c6651SIlya Dryomov 59641f2c6651SIlya Dryomov /* 59651f2c6651SIlya Dryomov * Images related by parent/child relationships always share 59661f2c6651SIlya Dryomov * rbd_client and spec/parent_spec, so bump their refcounts. 
59671f2c6651SIlya Dryomov */ 59681f2c6651SIlya Dryomov __rbd_get_client(rbd_dev->rbd_client); 59691f2c6651SIlya Dryomov rbd_spec_get(rbd_dev->parent_spec); 5970124afba2SAlex Elder 59716d69bb53SIlya Dryomov ret = rbd_dev_image_probe(parent, depth); 5972124afba2SAlex Elder if (ret < 0) 5973124afba2SAlex Elder goto out_err; 59741f2c6651SIlya Dryomov 5975124afba2SAlex Elder rbd_dev->parent = parent; 5976a2acd00eSAlex Elder atomic_set(&rbd_dev->parent_ref, 1); 5977124afba2SAlex Elder return 0; 5978124afba2SAlex Elder 59791f2c6651SIlya Dryomov out_err: 59801f2c6651SIlya Dryomov rbd_dev_unparent(rbd_dev); 59811f2c6651SIlya Dryomov rbd_dev_destroy(parent); 5982124afba2SAlex Elder return ret; 5983124afba2SAlex Elder } 5984124afba2SAlex Elder 5985811c6688SIlya Dryomov /* 5986811c6688SIlya Dryomov * rbd_dev->header_rwsem must be locked for write and will be unlocked 5987811c6688SIlya Dryomov * upon return. 5988811c6688SIlya Dryomov */ 5989200a6a8bSAlex Elder static int rbd_dev_device_setup(struct rbd_device *rbd_dev) 5990124afba2SAlex Elder { 599183a06263SAlex Elder int ret; 599283a06263SAlex Elder 59939b60e70bSIlya Dryomov /* Record our major and minor device numbers. */ 599483a06263SAlex Elder 59959b60e70bSIlya Dryomov if (!single_major) { 599683a06263SAlex Elder ret = register_blkdev(0, rbd_dev->name); 599783a06263SAlex Elder if (ret < 0) 59981643dfa4SIlya Dryomov goto err_out_unlock; 59999b60e70bSIlya Dryomov 600083a06263SAlex Elder rbd_dev->major = ret; 6001dd82fff1SIlya Dryomov rbd_dev->minor = 0; 60029b60e70bSIlya Dryomov } else { 60039b60e70bSIlya Dryomov rbd_dev->major = rbd_major; 60049b60e70bSIlya Dryomov rbd_dev->minor = rbd_dev_id_to_minor(rbd_dev->dev_id); 60059b60e70bSIlya Dryomov } 600683a06263SAlex Elder 600783a06263SAlex Elder /* Set up the blkdev mapping. 
*/ 600883a06263SAlex Elder 600983a06263SAlex Elder ret = rbd_init_disk(rbd_dev); 601083a06263SAlex Elder if (ret) 601183a06263SAlex Elder goto err_out_blkdev; 601283a06263SAlex Elder 6013f35a4deeSAlex Elder ret = rbd_dev_mapping_set(rbd_dev); 601483a06263SAlex Elder if (ret) 601583a06263SAlex Elder goto err_out_disk; 6016bc1ecc65SIlya Dryomov 6017f35a4deeSAlex Elder set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE); 601822001f61SJosh Durgin set_disk_ro(rbd_dev->disk, rbd_dev->mapping.read_only); 6019f35a4deeSAlex Elder 6020dd5ac32dSIlya Dryomov dev_set_name(&rbd_dev->dev, "%d", rbd_dev->dev_id); 6021dd5ac32dSIlya Dryomov ret = device_add(&rbd_dev->dev); 6022f35a4deeSAlex Elder if (ret) 6023f5ee37bdSIlya Dryomov goto err_out_mapping; 602483a06263SAlex Elder 602583a06263SAlex Elder /* Everything's ready. Announce the disk to the world. */ 602683a06263SAlex Elder 6027129b79d4SAlex Elder set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags); 6028811c6688SIlya Dryomov up_write(&rbd_dev->header_rwsem); 602983a06263SAlex Elder 60301643dfa4SIlya Dryomov spin_lock(&rbd_dev_list_lock); 60311643dfa4SIlya Dryomov list_add_tail(&rbd_dev->node, &rbd_dev_list); 60321643dfa4SIlya Dryomov spin_unlock(&rbd_dev_list_lock); 60331643dfa4SIlya Dryomov 6034811c6688SIlya Dryomov add_disk(rbd_dev->disk); 6035ca7909e8SIlya Dryomov pr_info("%s: capacity %llu features 0x%llx\n", rbd_dev->disk->disk_name, 6036ca7909e8SIlya Dryomov (unsigned long long)get_capacity(rbd_dev->disk) << SECTOR_SHIFT, 6037ca7909e8SIlya Dryomov rbd_dev->header.features); 603883a06263SAlex Elder 603983a06263SAlex Elder return ret; 60402f82ee54SAlex Elder 6041f35a4deeSAlex Elder err_out_mapping: 6042f35a4deeSAlex Elder rbd_dev_mapping_clear(rbd_dev); 604383a06263SAlex Elder err_out_disk: 604483a06263SAlex Elder rbd_free_disk(rbd_dev); 604583a06263SAlex Elder err_out_blkdev: 60469b60e70bSIlya Dryomov if (!single_major) 604783a06263SAlex Elder unregister_blkdev(rbd_dev->major, rbd_dev->name); 6048811c6688SIlya 
Dryomov err_out_unlock: 6049811c6688SIlya Dryomov up_write(&rbd_dev->header_rwsem); 605083a06263SAlex Elder return ret; 605183a06263SAlex Elder } 605283a06263SAlex Elder 6053332bb12dSAlex Elder static int rbd_dev_header_name(struct rbd_device *rbd_dev) 6054332bb12dSAlex Elder { 6055332bb12dSAlex Elder struct rbd_spec *spec = rbd_dev->spec; 6056c41d13a3SIlya Dryomov int ret; 6057332bb12dSAlex Elder 6058332bb12dSAlex Elder /* Record the header object name for this rbd image. */ 6059332bb12dSAlex Elder 6060332bb12dSAlex Elder rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); 6061332bb12dSAlex Elder 60627627151eSYan, Zheng rbd_dev->header_oloc.pool = rbd_dev->layout.pool_id; 6063332bb12dSAlex Elder if (rbd_dev->image_format == 1) 6064c41d13a3SIlya Dryomov ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s", 6065332bb12dSAlex Elder spec->image_name, RBD_SUFFIX); 6066332bb12dSAlex Elder else 6067c41d13a3SIlya Dryomov ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s", 6068332bb12dSAlex Elder RBD_HEADER_PREFIX, spec->image_id); 6069c41d13a3SIlya Dryomov 6070c41d13a3SIlya Dryomov return ret; 6071332bb12dSAlex Elder } 6072332bb12dSAlex Elder 6073200a6a8bSAlex Elder static void rbd_dev_image_release(struct rbd_device *rbd_dev) 6074200a6a8bSAlex Elder { 60756fd48b3bSAlex Elder rbd_dev_unprobe(rbd_dev); 60766fd48b3bSAlex Elder rbd_dev->image_format = 0; 60776fd48b3bSAlex Elder kfree(rbd_dev->spec->image_id); 60786fd48b3bSAlex Elder rbd_dev->spec->image_id = NULL; 60796fd48b3bSAlex Elder 6080200a6a8bSAlex Elder rbd_dev_destroy(rbd_dev); 6081200a6a8bSAlex Elder } 6082200a6a8bSAlex Elder 6083a30b71b9SAlex Elder /* 6084a30b71b9SAlex Elder * Probe for the existence of the header object for the given rbd 60851f3ef788SAlex Elder * device. 
If this image is the one being mapped (i.e., not a 60861f3ef788SAlex Elder * parent), initiate a watch on its header object before using that 60871f3ef788SAlex Elder * object to get detailed information about the rbd image. 6088a30b71b9SAlex Elder */ 60896d69bb53SIlya Dryomov static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth) 6090a30b71b9SAlex Elder { 6091a30b71b9SAlex Elder int ret; 6092a30b71b9SAlex Elder 6093a30b71b9SAlex Elder /* 60943abef3b3SAlex Elder * Get the id from the image id object. Unless there's an 60953abef3b3SAlex Elder * error, rbd_dev->spec->image_id will be filled in with 60963abef3b3SAlex Elder * a dynamically-allocated string, and rbd_dev->image_format 60973abef3b3SAlex Elder * will be set to either 1 or 2. 6098a30b71b9SAlex Elder */ 6099a30b71b9SAlex Elder ret = rbd_dev_image_id(rbd_dev); 6100a30b71b9SAlex Elder if (ret) 6101c0fba368SAlex Elder return ret; 6102c0fba368SAlex Elder 6103332bb12dSAlex Elder ret = rbd_dev_header_name(rbd_dev); 6104332bb12dSAlex Elder if (ret) 6105332bb12dSAlex Elder goto err_out_format; 6106332bb12dSAlex Elder 61076d69bb53SIlya Dryomov if (!depth) { 610899d16943SIlya Dryomov ret = rbd_register_watch(rbd_dev); 61091fe48023SIlya Dryomov if (ret) { 61101fe48023SIlya Dryomov if (ret == -ENOENT) 61111fe48023SIlya Dryomov pr_info("image %s/%s does not exist\n", 61121fe48023SIlya Dryomov rbd_dev->spec->pool_name, 61131fe48023SIlya Dryomov rbd_dev->spec->image_name); 6114c41d13a3SIlya Dryomov goto err_out_format; 61151f3ef788SAlex Elder } 61161fe48023SIlya Dryomov } 6117b644de2bSAlex Elder 6118a720ae09SIlya Dryomov ret = rbd_dev_header_info(rbd_dev); 61195655c4d9SAlex Elder if (ret) 6120b644de2bSAlex Elder goto err_out_watch; 6121a30b71b9SAlex Elder 612204077599SIlya Dryomov /* 612304077599SIlya Dryomov * If this image is the one being mapped, we have pool name and 612404077599SIlya Dryomov * id, image name and id, and snap name - need to fill snap id. 
612504077599SIlya Dryomov * Otherwise this is a parent image, identified by pool, image 612604077599SIlya Dryomov * and snap ids - need to fill in names for those ids. 612704077599SIlya Dryomov */ 61286d69bb53SIlya Dryomov if (!depth) 612904077599SIlya Dryomov ret = rbd_spec_fill_snap_id(rbd_dev); 613004077599SIlya Dryomov else 613104077599SIlya Dryomov ret = rbd_spec_fill_names(rbd_dev); 61321fe48023SIlya Dryomov if (ret) { 61331fe48023SIlya Dryomov if (ret == -ENOENT) 61341fe48023SIlya Dryomov pr_info("snap %s/%s@%s does not exist\n", 61351fe48023SIlya Dryomov rbd_dev->spec->pool_name, 61361fe48023SIlya Dryomov rbd_dev->spec->image_name, 61371fe48023SIlya Dryomov rbd_dev->spec->snap_name); 613833dca39fSAlex Elder goto err_out_probe; 61391fe48023SIlya Dryomov } 61409bb81c9bSAlex Elder 6141e8f59b59SIlya Dryomov if (rbd_dev->header.features & RBD_FEATURE_LAYERING) { 6142e8f59b59SIlya Dryomov ret = rbd_dev_v2_parent_info(rbd_dev); 6143e8f59b59SIlya Dryomov if (ret) 6144e8f59b59SIlya Dryomov goto err_out_probe; 6145e8f59b59SIlya Dryomov 6146e8f59b59SIlya Dryomov /* 6147e8f59b59SIlya Dryomov * Need to warn users if this image is the one being 6148e8f59b59SIlya Dryomov * mapped and has a parent. 
6149e8f59b59SIlya Dryomov */ 61506d69bb53SIlya Dryomov if (!depth && rbd_dev->parent_spec) 6151e8f59b59SIlya Dryomov rbd_warn(rbd_dev, 6152e8f59b59SIlya Dryomov "WARNING: kernel layering is EXPERIMENTAL!"); 6153e8f59b59SIlya Dryomov } 6154e8f59b59SIlya Dryomov 61556d69bb53SIlya Dryomov ret = rbd_dev_probe_parent(rbd_dev, depth); 615630d60ba2SAlex Elder if (ret) 615730d60ba2SAlex Elder goto err_out_probe; 615883a06263SAlex Elder 615930d60ba2SAlex Elder dout("discovered format %u image, header name is %s\n", 6160c41d13a3SIlya Dryomov rbd_dev->image_format, rbd_dev->header_oid.name); 616130d60ba2SAlex Elder return 0; 6162e8f59b59SIlya Dryomov 61636fd48b3bSAlex Elder err_out_probe: 61646fd48b3bSAlex Elder rbd_dev_unprobe(rbd_dev); 6165b644de2bSAlex Elder err_out_watch: 61666d69bb53SIlya Dryomov if (!depth) 616799d16943SIlya Dryomov rbd_unregister_watch(rbd_dev); 6168332bb12dSAlex Elder err_out_format: 6169332bb12dSAlex Elder rbd_dev->image_format = 0; 61705655c4d9SAlex Elder kfree(rbd_dev->spec->image_id); 61715655c4d9SAlex Elder rbd_dev->spec->image_id = NULL; 61725655c4d9SAlex Elder return ret; 617383a06263SAlex Elder } 617483a06263SAlex Elder 61759b60e70bSIlya Dryomov static ssize_t do_rbd_add(struct bus_type *bus, 617659c2be1eSYehuda Sadeh const char *buf, 617759c2be1eSYehuda Sadeh size_t count) 6178602adf40SYehuda Sadeh { 6179cb8627c7SAlex Elder struct rbd_device *rbd_dev = NULL; 6180dc79b113SAlex Elder struct ceph_options *ceph_opts = NULL; 61814e9afebaSAlex Elder struct rbd_options *rbd_opts = NULL; 6182859c31dfSAlex Elder struct rbd_spec *spec = NULL; 61839d3997fdSAlex Elder struct rbd_client *rbdc; 618451344a38SAlex Elder bool read_only; 6185b51c83c2SIlya Dryomov int rc; 6186602adf40SYehuda Sadeh 6187602adf40SYehuda Sadeh if (!try_module_get(THIS_MODULE)) 6188602adf40SYehuda Sadeh return -ENODEV; 6189602adf40SYehuda Sadeh 6190a725f65eSAlex Elder /* parse add command */ 6191859c31dfSAlex Elder rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec); 
6192dc79b113SAlex Elder if (rc < 0) 6193dd5ac32dSIlya Dryomov goto out; 6194a725f65eSAlex Elder 61959d3997fdSAlex Elder rbdc = rbd_get_client(ceph_opts); 61969d3997fdSAlex Elder if (IS_ERR(rbdc)) { 61979d3997fdSAlex Elder rc = PTR_ERR(rbdc); 61980ddebc0cSAlex Elder goto err_out_args; 61999d3997fdSAlex Elder } 6200602adf40SYehuda Sadeh 6201602adf40SYehuda Sadeh /* pick the pool */ 620230ba1f02SIlya Dryomov rc = rbd_add_get_pool_id(rbdc, spec->pool_name); 62031fe48023SIlya Dryomov if (rc < 0) { 62041fe48023SIlya Dryomov if (rc == -ENOENT) 62051fe48023SIlya Dryomov pr_info("pool %s does not exist\n", spec->pool_name); 6206602adf40SYehuda Sadeh goto err_out_client; 62071fe48023SIlya Dryomov } 6208859c31dfSAlex Elder spec->pool_id = (u64)rc; 6209859c31dfSAlex Elder 6210d147543dSIlya Dryomov rbd_dev = rbd_dev_create(rbdc, spec, rbd_opts); 6211b51c83c2SIlya Dryomov if (!rbd_dev) { 6212b51c83c2SIlya Dryomov rc = -ENOMEM; 6213bd4ba655SAlex Elder goto err_out_client; 6214b51c83c2SIlya Dryomov } 6215c53d5893SAlex Elder rbdc = NULL; /* rbd_dev now owns this */ 6216c53d5893SAlex Elder spec = NULL; /* rbd_dev now owns this */ 6217d147543dSIlya Dryomov rbd_opts = NULL; /* rbd_dev now owns this */ 6218602adf40SYehuda Sadeh 62190d6d1e9cSMike Christie rbd_dev->config_info = kstrdup(buf, GFP_KERNEL); 62200d6d1e9cSMike Christie if (!rbd_dev->config_info) { 62210d6d1e9cSMike Christie rc = -ENOMEM; 62220d6d1e9cSMike Christie goto err_out_rbd_dev; 62230d6d1e9cSMike Christie } 62240d6d1e9cSMike Christie 6225811c6688SIlya Dryomov down_write(&rbd_dev->header_rwsem); 62266d69bb53SIlya Dryomov rc = rbd_dev_image_probe(rbd_dev, 0); 62270d6d1e9cSMike Christie if (rc < 0) { 62280d6d1e9cSMike Christie up_write(&rbd_dev->header_rwsem); 6229c53d5893SAlex Elder goto err_out_rbd_dev; 62300d6d1e9cSMike Christie } 623105fd6f6fSAlex Elder 62327ce4eef7SAlex Elder /* If we are mapping a snapshot it must be marked read-only */ 62337ce4eef7SAlex Elder 6234d147543dSIlya Dryomov read_only = 
rbd_dev->opts->read_only; 62357ce4eef7SAlex Elder if (rbd_dev->spec->snap_id != CEPH_NOSNAP) 62367ce4eef7SAlex Elder read_only = true; 62377ce4eef7SAlex Elder rbd_dev->mapping.read_only = read_only; 62387ce4eef7SAlex Elder 6239b536f69aSAlex Elder rc = rbd_dev_device_setup(rbd_dev); 62403abef3b3SAlex Elder if (rc) { 6241e37180c0SIlya Dryomov /* 624299d16943SIlya Dryomov * rbd_unregister_watch() can't be moved into 6243e37180c0SIlya Dryomov * rbd_dev_image_release() without refactoring, see 6244e37180c0SIlya Dryomov * commit 1f3ef78861ac. 6245e37180c0SIlya Dryomov */ 624699d16943SIlya Dryomov rbd_unregister_watch(rbd_dev); 62473abef3b3SAlex Elder rbd_dev_image_release(rbd_dev); 6248dd5ac32dSIlya Dryomov goto out; 62493abef3b3SAlex Elder } 62503abef3b3SAlex Elder 6251dd5ac32dSIlya Dryomov rc = count; 6252dd5ac32dSIlya Dryomov out: 6253dd5ac32dSIlya Dryomov module_put(THIS_MODULE); 6254dd5ac32dSIlya Dryomov return rc; 6255b536f69aSAlex Elder 6256c53d5893SAlex Elder err_out_rbd_dev: 6257c53d5893SAlex Elder rbd_dev_destroy(rbd_dev); 6258bd4ba655SAlex Elder err_out_client: 62599d3997fdSAlex Elder rbd_put_client(rbdc); 62600ddebc0cSAlex Elder err_out_args: 6261859c31dfSAlex Elder rbd_spec_put(spec); 6262d147543dSIlya Dryomov kfree(rbd_opts); 6263dd5ac32dSIlya Dryomov goto out; 6264602adf40SYehuda Sadeh } 6265602adf40SYehuda Sadeh 62669b60e70bSIlya Dryomov static ssize_t rbd_add(struct bus_type *bus, 62679b60e70bSIlya Dryomov const char *buf, 62689b60e70bSIlya Dryomov size_t count) 62699b60e70bSIlya Dryomov { 62709b60e70bSIlya Dryomov if (single_major) 62719b60e70bSIlya Dryomov return -EINVAL; 62729b60e70bSIlya Dryomov 62739b60e70bSIlya Dryomov return do_rbd_add(bus, buf, count); 62749b60e70bSIlya Dryomov } 62759b60e70bSIlya Dryomov 62769b60e70bSIlya Dryomov static ssize_t rbd_add_single_major(struct bus_type *bus, 62779b60e70bSIlya Dryomov const char *buf, 62789b60e70bSIlya Dryomov size_t count) 62799b60e70bSIlya Dryomov { 62809b60e70bSIlya Dryomov return do_rbd_add(bus, 
buf, count); 62819b60e70bSIlya Dryomov } 62829b60e70bSIlya Dryomov 6283dd5ac32dSIlya Dryomov static void rbd_dev_device_release(struct rbd_device *rbd_dev) 6284602adf40SYehuda Sadeh { 6285602adf40SYehuda Sadeh rbd_free_disk(rbd_dev); 62861643dfa4SIlya Dryomov 62871643dfa4SIlya Dryomov spin_lock(&rbd_dev_list_lock); 62881643dfa4SIlya Dryomov list_del_init(&rbd_dev->node); 62891643dfa4SIlya Dryomov spin_unlock(&rbd_dev_list_lock); 62901643dfa4SIlya Dryomov 6291200a6a8bSAlex Elder clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags); 6292dd5ac32dSIlya Dryomov device_del(&rbd_dev->dev); 62936d80b130SAlex Elder rbd_dev_mapping_clear(rbd_dev); 62949b60e70bSIlya Dryomov if (!single_major) 6295602adf40SYehuda Sadeh unregister_blkdev(rbd_dev->major, rbd_dev->name); 6296602adf40SYehuda Sadeh } 6297602adf40SYehuda Sadeh 629805a46afdSAlex Elder static void rbd_dev_remove_parent(struct rbd_device *rbd_dev) 629905a46afdSAlex Elder { 6300ad945fc1SAlex Elder while (rbd_dev->parent) { 630105a46afdSAlex Elder struct rbd_device *first = rbd_dev; 630205a46afdSAlex Elder struct rbd_device *second = first->parent; 630305a46afdSAlex Elder struct rbd_device *third; 630405a46afdSAlex Elder 630505a46afdSAlex Elder /* 630605a46afdSAlex Elder * Follow to the parent with no grandparent and 630705a46afdSAlex Elder * remove it. 
630805a46afdSAlex Elder */ 630905a46afdSAlex Elder while (second && (third = second->parent)) { 631005a46afdSAlex Elder first = second; 631105a46afdSAlex Elder second = third; 631205a46afdSAlex Elder } 6313ad945fc1SAlex Elder rbd_assert(second); 63148ad42cd0SAlex Elder rbd_dev_image_release(second); 6315ad945fc1SAlex Elder first->parent = NULL; 6316ad945fc1SAlex Elder first->parent_overlap = 0; 6317ad945fc1SAlex Elder 6318ad945fc1SAlex Elder rbd_assert(first->parent_spec); 631905a46afdSAlex Elder rbd_spec_put(first->parent_spec); 632005a46afdSAlex Elder first->parent_spec = NULL; 632105a46afdSAlex Elder } 632205a46afdSAlex Elder } 632305a46afdSAlex Elder 63249b60e70bSIlya Dryomov static ssize_t do_rbd_remove(struct bus_type *bus, 6325602adf40SYehuda Sadeh const char *buf, 6326602adf40SYehuda Sadeh size_t count) 6327602adf40SYehuda Sadeh { 6328602adf40SYehuda Sadeh struct rbd_device *rbd_dev = NULL; 6329751cc0e3SAlex Elder struct list_head *tmp; 6330751cc0e3SAlex Elder int dev_id; 63310276dca6SMike Christie char opt_buf[6]; 633282a442d2SAlex Elder bool already = false; 63330276dca6SMike Christie bool force = false; 63340d8189e1SAlex Elder int ret; 6335602adf40SYehuda Sadeh 63360276dca6SMike Christie dev_id = -1; 63370276dca6SMike Christie opt_buf[0] = '\0'; 63380276dca6SMike Christie sscanf(buf, "%d %5s", &dev_id, opt_buf); 63390276dca6SMike Christie if (dev_id < 0) { 63400276dca6SMike Christie pr_err("dev_id out of range\n"); 6341602adf40SYehuda Sadeh return -EINVAL; 63420276dca6SMike Christie } 63430276dca6SMike Christie if (opt_buf[0] != '\0') { 63440276dca6SMike Christie if (!strcmp(opt_buf, "force")) { 63450276dca6SMike Christie force = true; 63460276dca6SMike Christie } else { 63470276dca6SMike Christie pr_err("bad remove option at '%s'\n", opt_buf); 63480276dca6SMike Christie return -EINVAL; 63490276dca6SMike Christie } 63500276dca6SMike Christie } 6351602adf40SYehuda Sadeh 6352602adf40SYehuda Sadeh ret = -ENOENT; 6353751cc0e3SAlex Elder 
spin_lock(&rbd_dev_list_lock); 6354751cc0e3SAlex Elder list_for_each(tmp, &rbd_dev_list) { 6355751cc0e3SAlex Elder rbd_dev = list_entry(tmp, struct rbd_device, node); 6356751cc0e3SAlex Elder if (rbd_dev->dev_id == dev_id) { 6357751cc0e3SAlex Elder ret = 0; 6358751cc0e3SAlex Elder break; 6359602adf40SYehuda Sadeh } 6360751cc0e3SAlex Elder } 6361751cc0e3SAlex Elder if (!ret) { 6362a14ea269SAlex Elder spin_lock_irq(&rbd_dev->lock); 63630276dca6SMike Christie if (rbd_dev->open_count && !force) 636442382b70SAlex Elder ret = -EBUSY; 6365b82d167bSAlex Elder else 636682a442d2SAlex Elder already = test_and_set_bit(RBD_DEV_FLAG_REMOVING, 636782a442d2SAlex Elder &rbd_dev->flags); 6368a14ea269SAlex Elder spin_unlock_irq(&rbd_dev->lock); 6369751cc0e3SAlex Elder } 6370751cc0e3SAlex Elder spin_unlock(&rbd_dev_list_lock); 637182a442d2SAlex Elder if (ret < 0 || already) 63721ba0f1e7SAlex Elder return ret; 6373751cc0e3SAlex Elder 63740276dca6SMike Christie if (force) { 63750276dca6SMike Christie /* 63760276dca6SMike Christie * Prevent new IO from being queued and wait for existing 63770276dca6SMike Christie * IO to complete/fail. 63780276dca6SMike Christie */ 63790276dca6SMike Christie blk_mq_freeze_queue(rbd_dev->disk->queue); 63800276dca6SMike Christie blk_set_queue_dying(rbd_dev->disk->queue); 63810276dca6SMike Christie } 63820276dca6SMike Christie 6383ed95b21aSIlya Dryomov down_write(&rbd_dev->lock_rwsem); 6384ed95b21aSIlya Dryomov if (__rbd_is_lock_owner(rbd_dev)) 6385ed95b21aSIlya Dryomov rbd_unlock(rbd_dev); 6386ed95b21aSIlya Dryomov up_write(&rbd_dev->lock_rwsem); 638799d16943SIlya Dryomov rbd_unregister_watch(rbd_dev); 6388fca27065SIlya Dryomov 63899875201eSJosh Durgin /* 63909875201eSJosh Durgin * Don't free anything from rbd_dev->disk until after all 63919875201eSJosh Durgin * notifies are completely processed. 
Otherwise 63929875201eSJosh Durgin * rbd_bus_del_dev() will race with rbd_watch_cb(), resulting 63939875201eSJosh Durgin * in a potential use after free of rbd_dev->disk or rbd_dev. 63949875201eSJosh Durgin */ 6395dd5ac32dSIlya Dryomov rbd_dev_device_release(rbd_dev); 63968ad42cd0SAlex Elder rbd_dev_image_release(rbd_dev); 6397aafb230eSAlex Elder 63981ba0f1e7SAlex Elder return count; 6399602adf40SYehuda Sadeh } 6400602adf40SYehuda Sadeh 64019b60e70bSIlya Dryomov static ssize_t rbd_remove(struct bus_type *bus, 64029b60e70bSIlya Dryomov const char *buf, 64039b60e70bSIlya Dryomov size_t count) 64049b60e70bSIlya Dryomov { 64059b60e70bSIlya Dryomov if (single_major) 64069b60e70bSIlya Dryomov return -EINVAL; 64079b60e70bSIlya Dryomov 64089b60e70bSIlya Dryomov return do_rbd_remove(bus, buf, count); 64099b60e70bSIlya Dryomov } 64109b60e70bSIlya Dryomov 64119b60e70bSIlya Dryomov static ssize_t rbd_remove_single_major(struct bus_type *bus, 64129b60e70bSIlya Dryomov const char *buf, 64139b60e70bSIlya Dryomov size_t count) 64149b60e70bSIlya Dryomov { 64159b60e70bSIlya Dryomov return do_rbd_remove(bus, buf, count); 64169b60e70bSIlya Dryomov } 64179b60e70bSIlya Dryomov 6418602adf40SYehuda Sadeh /* 6419602adf40SYehuda Sadeh * create control files in sysfs 6420dfc5606dSYehuda Sadeh * /sys/bus/rbd/... 
6421602adf40SYehuda Sadeh */ 6422602adf40SYehuda Sadeh static int rbd_sysfs_init(void) 6423602adf40SYehuda Sadeh { 6424dfc5606dSYehuda Sadeh int ret; 6425602adf40SYehuda Sadeh 6426fed4c143SAlex Elder ret = device_register(&rbd_root_dev); 6427dfc5606dSYehuda Sadeh if (ret < 0) 6428dfc5606dSYehuda Sadeh return ret; 6429602adf40SYehuda Sadeh 6430fed4c143SAlex Elder ret = bus_register(&rbd_bus_type); 6431fed4c143SAlex Elder if (ret < 0) 6432fed4c143SAlex Elder device_unregister(&rbd_root_dev); 6433602adf40SYehuda Sadeh 6434602adf40SYehuda Sadeh return ret; 6435602adf40SYehuda Sadeh } 6436602adf40SYehuda Sadeh 6437602adf40SYehuda Sadeh static void rbd_sysfs_cleanup(void) 6438602adf40SYehuda Sadeh { 6439dfc5606dSYehuda Sadeh bus_unregister(&rbd_bus_type); 6440fed4c143SAlex Elder device_unregister(&rbd_root_dev); 6441602adf40SYehuda Sadeh } 6442602adf40SYehuda Sadeh 64431c2a9dfeSAlex Elder static int rbd_slab_init(void) 64441c2a9dfeSAlex Elder { 64451c2a9dfeSAlex Elder rbd_assert(!rbd_img_request_cache); 644603d94406SGeliang Tang rbd_img_request_cache = KMEM_CACHE(rbd_img_request, 0); 6447868311b1SAlex Elder if (!rbd_img_request_cache) 6448868311b1SAlex Elder return -ENOMEM; 6449868311b1SAlex Elder 6450868311b1SAlex Elder rbd_assert(!rbd_obj_request_cache); 645103d94406SGeliang Tang rbd_obj_request_cache = KMEM_CACHE(rbd_obj_request, 0); 645278c2a44aSAlex Elder if (!rbd_obj_request_cache) 645378c2a44aSAlex Elder goto out_err; 645478c2a44aSAlex Elder 645578c2a44aSAlex Elder rbd_assert(!rbd_segment_name_cache); 645678c2a44aSAlex Elder rbd_segment_name_cache = kmem_cache_create("rbd_segment_name", 64572d0ebc5dSIlya Dryomov CEPH_MAX_OID_NAME_LEN + 1, 1, 0, NULL); 645878c2a44aSAlex Elder if (rbd_segment_name_cache) 64591c2a9dfeSAlex Elder return 0; 646078c2a44aSAlex Elder out_err: 646178c2a44aSAlex Elder kmem_cache_destroy(rbd_obj_request_cache); 646278c2a44aSAlex Elder rbd_obj_request_cache = NULL; 64631c2a9dfeSAlex Elder 6464868311b1SAlex Elder 
kmem_cache_destroy(rbd_img_request_cache); 6465868311b1SAlex Elder rbd_img_request_cache = NULL; 6466868311b1SAlex Elder 64671c2a9dfeSAlex Elder return -ENOMEM; 64681c2a9dfeSAlex Elder } 64691c2a9dfeSAlex Elder 64701c2a9dfeSAlex Elder static void rbd_slab_exit(void) 64711c2a9dfeSAlex Elder { 647278c2a44aSAlex Elder rbd_assert(rbd_segment_name_cache); 647378c2a44aSAlex Elder kmem_cache_destroy(rbd_segment_name_cache); 647478c2a44aSAlex Elder rbd_segment_name_cache = NULL; 647578c2a44aSAlex Elder 6476868311b1SAlex Elder rbd_assert(rbd_obj_request_cache); 6477868311b1SAlex Elder kmem_cache_destroy(rbd_obj_request_cache); 6478868311b1SAlex Elder rbd_obj_request_cache = NULL; 6479868311b1SAlex Elder 64801c2a9dfeSAlex Elder rbd_assert(rbd_img_request_cache); 64811c2a9dfeSAlex Elder kmem_cache_destroy(rbd_img_request_cache); 64821c2a9dfeSAlex Elder rbd_img_request_cache = NULL; 64831c2a9dfeSAlex Elder } 64841c2a9dfeSAlex Elder 6485cc344fa1SAlex Elder static int __init rbd_init(void) 6486602adf40SYehuda Sadeh { 6487602adf40SYehuda Sadeh int rc; 6488602adf40SYehuda Sadeh 64891e32d34cSAlex Elder if (!libceph_compatible(NULL)) { 64901e32d34cSAlex Elder rbd_warn(NULL, "libceph incompatibility (quitting)"); 64911e32d34cSAlex Elder return -EINVAL; 64921e32d34cSAlex Elder } 6493e1b4d96dSIlya Dryomov 64941c2a9dfeSAlex Elder rc = rbd_slab_init(); 6495602adf40SYehuda Sadeh if (rc) 6496602adf40SYehuda Sadeh return rc; 6497e1b4d96dSIlya Dryomov 6498f5ee37bdSIlya Dryomov /* 6499f5ee37bdSIlya Dryomov * The number of active work items is limited by the number of 6500f77303bdSIlya Dryomov * rbd devices * queue depth, so leave @max_active at default. 
6501f5ee37bdSIlya Dryomov */ 6502f5ee37bdSIlya Dryomov rbd_wq = alloc_workqueue(RBD_DRV_NAME, WQ_MEM_RECLAIM, 0); 6503f5ee37bdSIlya Dryomov if (!rbd_wq) { 6504f5ee37bdSIlya Dryomov rc = -ENOMEM; 6505f5ee37bdSIlya Dryomov goto err_out_slab; 6506f5ee37bdSIlya Dryomov } 6507f5ee37bdSIlya Dryomov 65089b60e70bSIlya Dryomov if (single_major) { 65099b60e70bSIlya Dryomov rbd_major = register_blkdev(0, RBD_DRV_NAME); 65109b60e70bSIlya Dryomov if (rbd_major < 0) { 65119b60e70bSIlya Dryomov rc = rbd_major; 6512f5ee37bdSIlya Dryomov goto err_out_wq; 65139b60e70bSIlya Dryomov } 65149b60e70bSIlya Dryomov } 65159b60e70bSIlya Dryomov 65161c2a9dfeSAlex Elder rc = rbd_sysfs_init(); 65171c2a9dfeSAlex Elder if (rc) 65189b60e70bSIlya Dryomov goto err_out_blkdev; 65191c2a9dfeSAlex Elder 65209b60e70bSIlya Dryomov if (single_major) 65219b60e70bSIlya Dryomov pr_info("loaded (major %d)\n", rbd_major); 65229b60e70bSIlya Dryomov else 6523e1b4d96dSIlya Dryomov pr_info("loaded\n"); 65249b60e70bSIlya Dryomov 6525e1b4d96dSIlya Dryomov return 0; 6526e1b4d96dSIlya Dryomov 65279b60e70bSIlya Dryomov err_out_blkdev: 65289b60e70bSIlya Dryomov if (single_major) 65299b60e70bSIlya Dryomov unregister_blkdev(rbd_major, RBD_DRV_NAME); 6530f5ee37bdSIlya Dryomov err_out_wq: 6531f5ee37bdSIlya Dryomov destroy_workqueue(rbd_wq); 6532e1b4d96dSIlya Dryomov err_out_slab: 6533e1b4d96dSIlya Dryomov rbd_slab_exit(); 65341c2a9dfeSAlex Elder return rc; 6535602adf40SYehuda Sadeh } 6536602adf40SYehuda Sadeh 6537cc344fa1SAlex Elder static void __exit rbd_exit(void) 6538602adf40SYehuda Sadeh { 6539ffe312cfSIlya Dryomov ida_destroy(&rbd_dev_id_ida); 6540602adf40SYehuda Sadeh rbd_sysfs_cleanup(); 65419b60e70bSIlya Dryomov if (single_major) 65429b60e70bSIlya Dryomov unregister_blkdev(rbd_major, RBD_DRV_NAME); 6543f5ee37bdSIlya Dryomov destroy_workqueue(rbd_wq); 65441c2a9dfeSAlex Elder rbd_slab_exit(); 6545602adf40SYehuda Sadeh } 6546602adf40SYehuda Sadeh 6547602adf40SYehuda Sadeh module_init(rbd_init); 6548602adf40SYehuda Sadeh 
module_exit(rbd_exit); 6549602adf40SYehuda Sadeh 6550d552c619SAlex Elder MODULE_AUTHOR("Alex Elder <elder@inktank.com>"); 6551602adf40SYehuda Sadeh MODULE_AUTHOR("Sage Weil <sage@newdream.net>"); 6552602adf40SYehuda Sadeh MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>"); 6553602adf40SYehuda Sadeh /* following authorship retained from original osdblk.c */ 6554602adf40SYehuda Sadeh MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>"); 6555602adf40SYehuda Sadeh 655690da258bSIlya Dryomov MODULE_DESCRIPTION("RADOS Block Device (RBD) driver"); 6557602adf40SYehuda Sadeh MODULE_LICENSE("GPL"); 6558