1e2a58ee5SAlex Elder 2602adf40SYehuda Sadeh /* 3602adf40SYehuda Sadeh rbd.c -- Export ceph rados objects as a Linux block device 4602adf40SYehuda Sadeh 5602adf40SYehuda Sadeh 6602adf40SYehuda Sadeh based on drivers/block/osdblk.c: 7602adf40SYehuda Sadeh 8602adf40SYehuda Sadeh Copyright 2009 Red Hat, Inc. 9602adf40SYehuda Sadeh 10602adf40SYehuda Sadeh This program is free software; you can redistribute it and/or modify 11602adf40SYehuda Sadeh it under the terms of the GNU General Public License as published by 12602adf40SYehuda Sadeh the Free Software Foundation. 13602adf40SYehuda Sadeh 14602adf40SYehuda Sadeh This program is distributed in the hope that it will be useful, 15602adf40SYehuda Sadeh but WITHOUT ANY WARRANTY; without even the implied warranty of 16602adf40SYehuda Sadeh MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17602adf40SYehuda Sadeh GNU General Public License for more details. 18602adf40SYehuda Sadeh 19602adf40SYehuda Sadeh You should have received a copy of the GNU General Public License 20602adf40SYehuda Sadeh along with this program; see the file COPYING. If not, write to 21602adf40SYehuda Sadeh the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. 
22602adf40SYehuda Sadeh 23602adf40SYehuda Sadeh 24602adf40SYehuda Sadeh 25dfc5606dSYehuda Sadeh For usage instructions, please refer to: 26602adf40SYehuda Sadeh 27dfc5606dSYehuda Sadeh Documentation/ABI/testing/sysfs-bus-rbd 28602adf40SYehuda Sadeh 29602adf40SYehuda Sadeh */ 30602adf40SYehuda Sadeh 31602adf40SYehuda Sadeh #include <linux/ceph/libceph.h> 32602adf40SYehuda Sadeh #include <linux/ceph/osd_client.h> 33602adf40SYehuda Sadeh #include <linux/ceph/mon_client.h> 34ed95b21aSIlya Dryomov #include <linux/ceph/cls_lock_client.h> 35602adf40SYehuda Sadeh #include <linux/ceph/decode.h> 3659c2be1eSYehuda Sadeh #include <linux/parser.h> 3730d1cff8SAlex Elder #include <linux/bsearch.h> 38602adf40SYehuda Sadeh 39602adf40SYehuda Sadeh #include <linux/kernel.h> 40602adf40SYehuda Sadeh #include <linux/device.h> 41602adf40SYehuda Sadeh #include <linux/module.h> 427ad18afaSChristoph Hellwig #include <linux/blk-mq.h> 43602adf40SYehuda Sadeh #include <linux/fs.h> 44602adf40SYehuda Sadeh #include <linux/blkdev.h> 451c2a9dfeSAlex Elder #include <linux/slab.h> 46f8a22fc2SIlya Dryomov #include <linux/idr.h> 47bc1ecc65SIlya Dryomov #include <linux/workqueue.h> 48602adf40SYehuda Sadeh 49602adf40SYehuda Sadeh #include "rbd_types.h" 50602adf40SYehuda Sadeh 51aafb230eSAlex Elder #define RBD_DEBUG /* Activate rbd_assert() calls */ 52aafb230eSAlex Elder 53593a9e7bSAlex Elder /* 54593a9e7bSAlex Elder * The basic unit of block I/O is a sector. It is interpreted in a 55593a9e7bSAlex Elder * number of contexts in Linux (blk, bio, genhd), but the default is 56593a9e7bSAlex Elder * universally 512 bytes. These symbols are just slightly more 57593a9e7bSAlex Elder * meaningful than the bare numbers they represent. 58593a9e7bSAlex Elder */ 59593a9e7bSAlex Elder #define SECTOR_SHIFT 9 60593a9e7bSAlex Elder #define SECTOR_SIZE (1ULL << SECTOR_SHIFT) 61593a9e7bSAlex Elder 62a2acd00eSAlex Elder /* 63a2acd00eSAlex Elder * Increment the given counter and return its updated value. 
64a2acd00eSAlex Elder * If the counter is already 0 it will not be incremented. 65a2acd00eSAlex Elder * If the counter is already at its maximum value returns 66a2acd00eSAlex Elder * -EINVAL without updating it. 67a2acd00eSAlex Elder */ 68a2acd00eSAlex Elder static int atomic_inc_return_safe(atomic_t *v) 69a2acd00eSAlex Elder { 70a2acd00eSAlex Elder unsigned int counter; 71a2acd00eSAlex Elder 72a2acd00eSAlex Elder counter = (unsigned int)__atomic_add_unless(v, 1, 0); 73a2acd00eSAlex Elder if (counter <= (unsigned int)INT_MAX) 74a2acd00eSAlex Elder return (int)counter; 75a2acd00eSAlex Elder 76a2acd00eSAlex Elder atomic_dec(v); 77a2acd00eSAlex Elder 78a2acd00eSAlex Elder return -EINVAL; 79a2acd00eSAlex Elder } 80a2acd00eSAlex Elder 81a2acd00eSAlex Elder /* Decrement the counter. Return the resulting value, or -EINVAL */ 82a2acd00eSAlex Elder static int atomic_dec_return_safe(atomic_t *v) 83a2acd00eSAlex Elder { 84a2acd00eSAlex Elder int counter; 85a2acd00eSAlex Elder 86a2acd00eSAlex Elder counter = atomic_dec_return(v); 87a2acd00eSAlex Elder if (counter >= 0) 88a2acd00eSAlex Elder return counter; 89a2acd00eSAlex Elder 90a2acd00eSAlex Elder atomic_inc(v); 91a2acd00eSAlex Elder 92a2acd00eSAlex Elder return -EINVAL; 93a2acd00eSAlex Elder } 94a2acd00eSAlex Elder 95f0f8cef5SAlex Elder #define RBD_DRV_NAME "rbd" 96602adf40SYehuda Sadeh 977e513d43SIlya Dryomov #define RBD_MINORS_PER_MAJOR 256 987e513d43SIlya Dryomov #define RBD_SINGLE_MAJOR_PART_SHIFT 4 99602adf40SYehuda Sadeh 1006d69bb53SIlya Dryomov #define RBD_MAX_PARENT_CHAIN_LEN 16 1016d69bb53SIlya Dryomov 102d4b125e9SAlex Elder #define RBD_SNAP_DEV_NAME_PREFIX "snap_" 103d4b125e9SAlex Elder #define RBD_MAX_SNAP_NAME_LEN \ 104d4b125e9SAlex Elder (NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1)) 105d4b125e9SAlex Elder 10635d489f9SAlex Elder #define RBD_MAX_SNAP_COUNT 510 /* allows max snapc to fit in 4KB */ 107602adf40SYehuda Sadeh 108602adf40SYehuda Sadeh #define RBD_SNAP_HEAD_NAME "-" 109602adf40SYehuda Sadeh 
1109682fc6dSAlex Elder #define BAD_SNAP_INDEX U32_MAX /* invalid index into snap array */ 1119682fc6dSAlex Elder 1129e15b77dSAlex Elder /* This allows a single page to hold an image name sent by OSD */ 1139e15b77dSAlex Elder #define RBD_IMAGE_NAME_LEN_MAX (PAGE_SIZE - sizeof (__le32) - 1) 114589d30e0SAlex Elder #define RBD_IMAGE_ID_LEN_MAX 64 1159e15b77dSAlex Elder 1161e130199SAlex Elder #define RBD_OBJ_PREFIX_LEN_MAX 64 117589d30e0SAlex Elder 118ed95b21aSIlya Dryomov #define RBD_NOTIFY_TIMEOUT 5 /* seconds */ 11999d16943SIlya Dryomov #define RBD_RETRY_DELAY msecs_to_jiffies(1000) 12099d16943SIlya Dryomov 121d889140cSAlex Elder /* Feature bits */ 122d889140cSAlex Elder 1235cbf6f12SAlex Elder #define RBD_FEATURE_LAYERING (1<<0) 1245cbf6f12SAlex Elder #define RBD_FEATURE_STRIPINGV2 (1<<1) 125ed95b21aSIlya Dryomov #define RBD_FEATURE_EXCLUSIVE_LOCK (1<<2) 126ed95b21aSIlya Dryomov #define RBD_FEATURES_ALL (RBD_FEATURE_LAYERING | \ 127ed95b21aSIlya Dryomov RBD_FEATURE_STRIPINGV2 | \ 128ed95b21aSIlya Dryomov RBD_FEATURE_EXCLUSIVE_LOCK) 129d889140cSAlex Elder 130d889140cSAlex Elder /* Features supported by this (client software) implementation. */ 131d889140cSAlex Elder 132770eba6eSAlex Elder #define RBD_FEATURES_SUPPORTED (RBD_FEATURES_ALL) 133d889140cSAlex Elder 13481a89793SAlex Elder /* 13581a89793SAlex Elder * An RBD device name will be "rbd#", where the "rbd" comes from 13681a89793SAlex Elder * RBD_DRV_NAME above, and # is a unique integer identifier. 
13781a89793SAlex Elder */ 138602adf40SYehuda Sadeh #define DEV_NAME_LEN 32 139602adf40SYehuda Sadeh 140602adf40SYehuda Sadeh /* 141602adf40SYehuda Sadeh * block device image metadata (in-memory version) 142602adf40SYehuda Sadeh */ 143602adf40SYehuda Sadeh struct rbd_image_header { 144f35a4deeSAlex Elder /* These six fields never change for a given rbd image */ 145849b4260SAlex Elder char *object_prefix; 146602adf40SYehuda Sadeh __u8 obj_order; 147602adf40SYehuda Sadeh __u8 crypt_type; 148602adf40SYehuda Sadeh __u8 comp_type; 149f35a4deeSAlex Elder u64 stripe_unit; 150f35a4deeSAlex Elder u64 stripe_count; 151f35a4deeSAlex Elder u64 features; /* Might be changeable someday? */ 152602adf40SYehuda Sadeh 153f84344f3SAlex Elder /* The remaining fields need to be updated occasionally */ 154f84344f3SAlex Elder u64 image_size; 155f84344f3SAlex Elder struct ceph_snap_context *snapc; 156f35a4deeSAlex Elder char *snap_names; /* format 1 only */ 157f35a4deeSAlex Elder u64 *snap_sizes; /* format 1 only */ 15859c2be1eSYehuda Sadeh }; 15959c2be1eSYehuda Sadeh 1600d7dbfceSAlex Elder /* 1610d7dbfceSAlex Elder * An rbd image specification. 1620d7dbfceSAlex Elder * 1630d7dbfceSAlex Elder * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely 164c66c6e0cSAlex Elder * identify an image. Each rbd_dev structure includes a pointer to 165c66c6e0cSAlex Elder * an rbd_spec structure that encapsulates this identity. 166c66c6e0cSAlex Elder * 167c66c6e0cSAlex Elder * Each of the id's in an rbd_spec has an associated name. For a 168c66c6e0cSAlex Elder * user-mapped image, the names are supplied and the id's associated 169c66c6e0cSAlex Elder * with them are looked up. For a layered image, a parent image is 170c66c6e0cSAlex Elder * defined by the tuple, and the names are looked up. 
171c66c6e0cSAlex Elder * 172c66c6e0cSAlex Elder * An rbd_dev structure contains a parent_spec pointer which is 173c66c6e0cSAlex Elder * non-null if the image it represents is a child in a layered 174c66c6e0cSAlex Elder * image. This pointer will refer to the rbd_spec structure used 175c66c6e0cSAlex Elder * by the parent rbd_dev for its own identity (i.e., the structure 176c66c6e0cSAlex Elder * is shared between the parent and child). 177c66c6e0cSAlex Elder * 178c66c6e0cSAlex Elder * Since these structures are populated once, during the discovery 179c66c6e0cSAlex Elder * phase of image construction, they are effectively immutable so 180c66c6e0cSAlex Elder * we make no effort to synchronize access to them. 181c66c6e0cSAlex Elder * 182c66c6e0cSAlex Elder * Note that code herein does not assume the image name is known (it 183c66c6e0cSAlex Elder * could be a null pointer). 1840d7dbfceSAlex Elder */ 1850d7dbfceSAlex Elder struct rbd_spec { 1860d7dbfceSAlex Elder u64 pool_id; 187ecb4dc22SAlex Elder const char *pool_name; 1880d7dbfceSAlex Elder 189ecb4dc22SAlex Elder const char *image_id; 190ecb4dc22SAlex Elder const char *image_name; 1910d7dbfceSAlex Elder 1920d7dbfceSAlex Elder u64 snap_id; 193ecb4dc22SAlex Elder const char *snap_name; 1940d7dbfceSAlex Elder 1950d7dbfceSAlex Elder struct kref kref; 1960d7dbfceSAlex Elder }; 1970d7dbfceSAlex Elder 198602adf40SYehuda Sadeh /* 199f0f8cef5SAlex Elder * an instance of the client. multiple devices may share an rbd client. 200602adf40SYehuda Sadeh */ 201602adf40SYehuda Sadeh struct rbd_client { 202602adf40SYehuda Sadeh struct ceph_client *client; 203602adf40SYehuda Sadeh struct kref kref; 204602adf40SYehuda Sadeh struct list_head node; 205602adf40SYehuda Sadeh }; 206602adf40SYehuda Sadeh 207bf0d5f50SAlex Elder struct rbd_img_request; 208bf0d5f50SAlex Elder typedef void (*rbd_img_callback_t)(struct rbd_img_request *); 209bf0d5f50SAlex Elder 210bf0d5f50SAlex Elder #define BAD_WHICH U32_MAX /* Good which or bad which, which? 
*/ 211bf0d5f50SAlex Elder 212bf0d5f50SAlex Elder struct rbd_obj_request; 213bf0d5f50SAlex Elder typedef void (*rbd_obj_callback_t)(struct rbd_obj_request *); 214bf0d5f50SAlex Elder 2159969ebc5SAlex Elder enum obj_request_type { 2169969ebc5SAlex Elder OBJ_REQUEST_NODATA, OBJ_REQUEST_BIO, OBJ_REQUEST_PAGES 2179969ebc5SAlex Elder }; 218bf0d5f50SAlex Elder 2196d2940c8SGuangliang Zhao enum obj_operation_type { 2206d2940c8SGuangliang Zhao OBJ_OP_WRITE, 2216d2940c8SGuangliang Zhao OBJ_OP_READ, 22290e98c52SGuangliang Zhao OBJ_OP_DISCARD, 2236d2940c8SGuangliang Zhao }; 2246d2940c8SGuangliang Zhao 225926f9b3fSAlex Elder enum obj_req_flags { 226926f9b3fSAlex Elder OBJ_REQ_DONE, /* completion flag: not done = 0, done = 1 */ 2276365d33aSAlex Elder OBJ_REQ_IMG_DATA, /* object usage: standalone = 0, image = 1 */ 2285679c59fSAlex Elder OBJ_REQ_KNOWN, /* EXISTS flag valid: no = 0, yes = 1 */ 2295679c59fSAlex Elder OBJ_REQ_EXISTS, /* target exists: no = 0, yes = 1 */ 230926f9b3fSAlex Elder }; 231926f9b3fSAlex Elder 232bf0d5f50SAlex Elder struct rbd_obj_request { 233bf0d5f50SAlex Elder const char *object_name; 234bf0d5f50SAlex Elder u64 offset; /* object start byte */ 235bf0d5f50SAlex Elder u64 length; /* bytes from offset */ 236926f9b3fSAlex Elder unsigned long flags; 237bf0d5f50SAlex Elder 238c5b5ef6cSAlex Elder /* 239c5b5ef6cSAlex Elder * An object request associated with an image will have its 240c5b5ef6cSAlex Elder * img_data flag set; a standalone object request will not. 241c5b5ef6cSAlex Elder * 242c5b5ef6cSAlex Elder * A standalone object request will have which == BAD_WHICH 243c5b5ef6cSAlex Elder * and a null obj_request pointer. 244c5b5ef6cSAlex Elder * 245c5b5ef6cSAlex Elder * An object request initiated in support of a layered image 246c5b5ef6cSAlex Elder * object (to check for its existence before a write) will 247c5b5ef6cSAlex Elder * have which == BAD_WHICH and a non-null obj_request pointer. 
248c5b5ef6cSAlex Elder * 249c5b5ef6cSAlex Elder * Finally, an object request for rbd image data will have 250c5b5ef6cSAlex Elder * which != BAD_WHICH, and will have a non-null img_request 251c5b5ef6cSAlex Elder * pointer. The value of which will be in the range 252c5b5ef6cSAlex Elder * 0..(img_request->obj_request_count-1). 253c5b5ef6cSAlex Elder */ 254c5b5ef6cSAlex Elder union { 255c5b5ef6cSAlex Elder struct rbd_obj_request *obj_request; /* STAT op */ 256c5b5ef6cSAlex Elder struct { 257bf0d5f50SAlex Elder struct rbd_img_request *img_request; 258c5b5ef6cSAlex Elder u64 img_offset; 259c5b5ef6cSAlex Elder /* links for img_request->obj_requests list */ 260c5b5ef6cSAlex Elder struct list_head links; 261c5b5ef6cSAlex Elder }; 262c5b5ef6cSAlex Elder }; 263bf0d5f50SAlex Elder u32 which; /* posn image request list */ 264bf0d5f50SAlex Elder 265bf0d5f50SAlex Elder enum obj_request_type type; 266788e2df3SAlex Elder union { 267bf0d5f50SAlex Elder struct bio *bio_list; 268788e2df3SAlex Elder struct { 269788e2df3SAlex Elder struct page **pages; 270788e2df3SAlex Elder u32 page_count; 271788e2df3SAlex Elder }; 272788e2df3SAlex Elder }; 2730eefd470SAlex Elder struct page **copyup_pages; 274ebda6408SAlex Elder u32 copyup_page_count; 275bf0d5f50SAlex Elder 276bf0d5f50SAlex Elder struct ceph_osd_request *osd_req; 277bf0d5f50SAlex Elder 278bf0d5f50SAlex Elder u64 xferred; /* bytes transferred */ 2791b83bef2SSage Weil int result; 280bf0d5f50SAlex Elder 281bf0d5f50SAlex Elder rbd_obj_callback_t callback; 282788e2df3SAlex Elder struct completion completion; 283bf0d5f50SAlex Elder 284bf0d5f50SAlex Elder struct kref kref; 285bf0d5f50SAlex Elder }; 286bf0d5f50SAlex Elder 2870c425248SAlex Elder enum img_req_flags { 2889849e986SAlex Elder IMG_REQ_WRITE, /* I/O direction: read = 0, write = 1 */ 2899849e986SAlex Elder IMG_REQ_CHILD, /* initiator: block = 0, child image = 1 */ 290d0b2e944SAlex Elder IMG_REQ_LAYERED, /* ENOENT handling: normal = 0, layered = 1 */ 29190e98c52SGuangliang Zhao 
IMG_REQ_DISCARD, /* discard: normal = 0, discard request = 1 */ 2920c425248SAlex Elder }; 2930c425248SAlex Elder 294bf0d5f50SAlex Elder struct rbd_img_request { 295bf0d5f50SAlex Elder struct rbd_device *rbd_dev; 296bf0d5f50SAlex Elder u64 offset; /* starting image byte offset */ 297bf0d5f50SAlex Elder u64 length; /* byte count from offset */ 2980c425248SAlex Elder unsigned long flags; 299bf0d5f50SAlex Elder union { 300bf0d5f50SAlex Elder u64 snap_id; /* for reads */ 3019849e986SAlex Elder struct ceph_snap_context *snapc; /* for writes */ 3029849e986SAlex Elder }; 3039849e986SAlex Elder union { 3049849e986SAlex Elder struct request *rq; /* block request */ 3059849e986SAlex Elder struct rbd_obj_request *obj_request; /* obj req initiator */ 306bf0d5f50SAlex Elder }; 3073d7efd18SAlex Elder struct page **copyup_pages; 308ebda6408SAlex Elder u32 copyup_page_count; 309bf0d5f50SAlex Elder spinlock_t completion_lock;/* protects next_completion */ 310bf0d5f50SAlex Elder u32 next_completion; 311bf0d5f50SAlex Elder rbd_img_callback_t callback; 31255f27e09SAlex Elder u64 xferred;/* aggregate bytes transferred */ 313a5a337d4SAlex Elder int result; /* first nonzero obj_request result */ 314bf0d5f50SAlex Elder 315bf0d5f50SAlex Elder u32 obj_request_count; 316bf0d5f50SAlex Elder struct list_head obj_requests; /* rbd_obj_request structs */ 317bf0d5f50SAlex Elder 318bf0d5f50SAlex Elder struct kref kref; 319bf0d5f50SAlex Elder }; 320bf0d5f50SAlex Elder 321bf0d5f50SAlex Elder #define for_each_obj_request(ireq, oreq) \ 322ef06f4d3SAlex Elder list_for_each_entry(oreq, &(ireq)->obj_requests, links) 323bf0d5f50SAlex Elder #define for_each_obj_request_from(ireq, oreq) \ 324ef06f4d3SAlex Elder list_for_each_entry_from(oreq, &(ireq)->obj_requests, links) 325bf0d5f50SAlex Elder #define for_each_obj_request_safe(ireq, oreq, n) \ 326ef06f4d3SAlex Elder list_for_each_entry_safe_reverse(oreq, n, &(ireq)->obj_requests, links) 327bf0d5f50SAlex Elder 32899d16943SIlya Dryomov enum rbd_watch_state { 
32999d16943SIlya Dryomov RBD_WATCH_STATE_UNREGISTERED, 33099d16943SIlya Dryomov RBD_WATCH_STATE_REGISTERED, 33199d16943SIlya Dryomov RBD_WATCH_STATE_ERROR, 33299d16943SIlya Dryomov }; 33399d16943SIlya Dryomov 334ed95b21aSIlya Dryomov enum rbd_lock_state { 335ed95b21aSIlya Dryomov RBD_LOCK_STATE_UNLOCKED, 336ed95b21aSIlya Dryomov RBD_LOCK_STATE_LOCKED, 337ed95b21aSIlya Dryomov RBD_LOCK_STATE_RELEASING, 338ed95b21aSIlya Dryomov }; 339ed95b21aSIlya Dryomov 340ed95b21aSIlya Dryomov /* WatchNotify::ClientId */ 341ed95b21aSIlya Dryomov struct rbd_client_id { 342ed95b21aSIlya Dryomov u64 gid; 343ed95b21aSIlya Dryomov u64 handle; 344ed95b21aSIlya Dryomov }; 345ed95b21aSIlya Dryomov 346f84344f3SAlex Elder struct rbd_mapping { 34799c1f08fSAlex Elder u64 size; 34834b13184SAlex Elder u64 features; 349f84344f3SAlex Elder bool read_only; 350f84344f3SAlex Elder }; 351f84344f3SAlex Elder 352602adf40SYehuda Sadeh /* 353602adf40SYehuda Sadeh * a single device 354602adf40SYehuda Sadeh */ 355602adf40SYehuda Sadeh struct rbd_device { 356de71a297SAlex Elder int dev_id; /* blkdev unique id */ 357602adf40SYehuda Sadeh 358602adf40SYehuda Sadeh int major; /* blkdev assigned major */ 359dd82fff1SIlya Dryomov int minor; 360602adf40SYehuda Sadeh struct gendisk *disk; /* blkdev's gendisk and rq */ 361602adf40SYehuda Sadeh 362a30b71b9SAlex Elder u32 image_format; /* Either 1 or 2 */ 363602adf40SYehuda Sadeh struct rbd_client *rbd_client; 364602adf40SYehuda Sadeh 365602adf40SYehuda Sadeh char name[DEV_NAME_LEN]; /* blkdev name, e.g. 
rbd3 */ 366602adf40SYehuda Sadeh 367b82d167bSAlex Elder spinlock_t lock; /* queue, flags, open_count */ 368602adf40SYehuda Sadeh 369602adf40SYehuda Sadeh struct rbd_image_header header; 370b82d167bSAlex Elder unsigned long flags; /* possibly lock protected */ 3710d7dbfceSAlex Elder struct rbd_spec *spec; 372d147543dSIlya Dryomov struct rbd_options *opts; 373602adf40SYehuda Sadeh 374c41d13a3SIlya Dryomov struct ceph_object_id header_oid; 375922dab61SIlya Dryomov struct ceph_object_locator header_oloc; 376971f839aSAlex Elder 3771643dfa4SIlya Dryomov struct ceph_file_layout layout; /* used for all rbd requests */ 3780903e875SAlex Elder 37999d16943SIlya Dryomov struct mutex watch_mutex; 38099d16943SIlya Dryomov enum rbd_watch_state watch_state; 381922dab61SIlya Dryomov struct ceph_osd_linger_request *watch_handle; 38299d16943SIlya Dryomov u64 watch_cookie; 38399d16943SIlya Dryomov struct delayed_work watch_dwork; 38459c2be1eSYehuda Sadeh 385ed95b21aSIlya Dryomov struct rw_semaphore lock_rwsem; 386ed95b21aSIlya Dryomov enum rbd_lock_state lock_state; 387ed95b21aSIlya Dryomov struct rbd_client_id owner_cid; 388ed95b21aSIlya Dryomov struct work_struct acquired_lock_work; 389ed95b21aSIlya Dryomov struct work_struct released_lock_work; 390ed95b21aSIlya Dryomov struct delayed_work lock_dwork; 391ed95b21aSIlya Dryomov struct work_struct unlock_work; 392ed95b21aSIlya Dryomov wait_queue_head_t lock_waitq; 393ed95b21aSIlya Dryomov 3941643dfa4SIlya Dryomov struct workqueue_struct *task_wq; 3951643dfa4SIlya Dryomov 39686b00e0dSAlex Elder struct rbd_spec *parent_spec; 39786b00e0dSAlex Elder u64 parent_overlap; 398a2acd00eSAlex Elder atomic_t parent_ref; 3992f82ee54SAlex Elder struct rbd_device *parent; 40086b00e0dSAlex Elder 4017ad18afaSChristoph Hellwig /* Block layer tags. 
*/ 4027ad18afaSChristoph Hellwig struct blk_mq_tag_set tag_set; 4037ad18afaSChristoph Hellwig 404c666601aSJosh Durgin /* protects updating the header */ 405c666601aSJosh Durgin struct rw_semaphore header_rwsem; 406f84344f3SAlex Elder 407f84344f3SAlex Elder struct rbd_mapping mapping; 408602adf40SYehuda Sadeh 409602adf40SYehuda Sadeh struct list_head node; 410dfc5606dSYehuda Sadeh 411dfc5606dSYehuda Sadeh /* sysfs related */ 412dfc5606dSYehuda Sadeh struct device dev; 413b82d167bSAlex Elder unsigned long open_count; /* protected by lock */ 414dfc5606dSYehuda Sadeh }; 415dfc5606dSYehuda Sadeh 416b82d167bSAlex Elder /* 417b82d167bSAlex Elder * Flag bits for rbd_dev->flags. If atomicity is required, 418b82d167bSAlex Elder * rbd_dev->lock is used to protect access. 419b82d167bSAlex Elder * 420b82d167bSAlex Elder * Currently, only the "removing" flag (which is coupled with the 421b82d167bSAlex Elder * "open_count" field) requires atomic access. 422b82d167bSAlex Elder */ 4236d292906SAlex Elder enum rbd_dev_flags { 4246d292906SAlex Elder RBD_DEV_FLAG_EXISTS, /* mapped snapshot has not been deleted */ 425b82d167bSAlex Elder RBD_DEV_FLAG_REMOVING, /* this mapping is being removed */ 4266d292906SAlex Elder }; 4276d292906SAlex Elder 428cfbf6377SAlex Elder static DEFINE_MUTEX(client_mutex); /* Serialize client creation */ 429e124a82fSAlex Elder 430602adf40SYehuda Sadeh static LIST_HEAD(rbd_dev_list); /* devices */ 431e124a82fSAlex Elder static DEFINE_SPINLOCK(rbd_dev_list_lock); 432e124a82fSAlex Elder 433602adf40SYehuda Sadeh static LIST_HEAD(rbd_client_list); /* clients */ 434432b8587SAlex Elder static DEFINE_SPINLOCK(rbd_client_list_lock); 435602adf40SYehuda Sadeh 43678c2a44aSAlex Elder /* Slab caches for frequently-allocated structures */ 43778c2a44aSAlex Elder 4381c2a9dfeSAlex Elder static struct kmem_cache *rbd_img_request_cache; 439868311b1SAlex Elder static struct kmem_cache *rbd_obj_request_cache; 44078c2a44aSAlex Elder static struct kmem_cache *rbd_segment_name_cache; 
4411c2a9dfeSAlex Elder 4429b60e70bSIlya Dryomov static int rbd_major; 443f8a22fc2SIlya Dryomov static DEFINE_IDA(rbd_dev_id_ida); 444f8a22fc2SIlya Dryomov 445f5ee37bdSIlya Dryomov static struct workqueue_struct *rbd_wq; 446f5ee37bdSIlya Dryomov 4479b60e70bSIlya Dryomov /* 4489b60e70bSIlya Dryomov * Default to false for now, as single-major requires >= 0.75 version of 4499b60e70bSIlya Dryomov * userspace rbd utility. 4509b60e70bSIlya Dryomov */ 4519b60e70bSIlya Dryomov static bool single_major = false; 4529b60e70bSIlya Dryomov module_param(single_major, bool, S_IRUGO); 4539b60e70bSIlya Dryomov MODULE_PARM_DESC(single_major, "Use a single major number for all rbd devices (default: false)"); 4549b60e70bSIlya Dryomov 4553d7efd18SAlex Elder static int rbd_img_request_submit(struct rbd_img_request *img_request); 4563d7efd18SAlex Elder 457f0f8cef5SAlex Elder static ssize_t rbd_add(struct bus_type *bus, const char *buf, 458f0f8cef5SAlex Elder size_t count); 459f0f8cef5SAlex Elder static ssize_t rbd_remove(struct bus_type *bus, const char *buf, 460f0f8cef5SAlex Elder size_t count); 4619b60e70bSIlya Dryomov static ssize_t rbd_add_single_major(struct bus_type *bus, const char *buf, 4629b60e70bSIlya Dryomov size_t count); 4639b60e70bSIlya Dryomov static ssize_t rbd_remove_single_major(struct bus_type *bus, const char *buf, 4649b60e70bSIlya Dryomov size_t count); 4656d69bb53SIlya Dryomov static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth); 466a2acd00eSAlex Elder static void rbd_spec_put(struct rbd_spec *spec); 467f0f8cef5SAlex Elder 4689b60e70bSIlya Dryomov static int rbd_dev_id_to_minor(int dev_id) 4699b60e70bSIlya Dryomov { 4707e513d43SIlya Dryomov return dev_id << RBD_SINGLE_MAJOR_PART_SHIFT; 4719b60e70bSIlya Dryomov } 4729b60e70bSIlya Dryomov 4739b60e70bSIlya Dryomov static int minor_to_rbd_dev_id(int minor) 4749b60e70bSIlya Dryomov { 4757e513d43SIlya Dryomov return minor >> RBD_SINGLE_MAJOR_PART_SHIFT; 4769b60e70bSIlya Dryomov } 4779b60e70bSIlya Dryomov 
478ed95b21aSIlya Dryomov static bool rbd_is_lock_supported(struct rbd_device *rbd_dev) 479ed95b21aSIlya Dryomov { 480ed95b21aSIlya Dryomov return (rbd_dev->header.features & RBD_FEATURE_EXCLUSIVE_LOCK) && 481ed95b21aSIlya Dryomov rbd_dev->spec->snap_id == CEPH_NOSNAP && 482ed95b21aSIlya Dryomov !rbd_dev->mapping.read_only; 483ed95b21aSIlya Dryomov } 484ed95b21aSIlya Dryomov 485ed95b21aSIlya Dryomov static bool __rbd_is_lock_owner(struct rbd_device *rbd_dev) 486ed95b21aSIlya Dryomov { 487ed95b21aSIlya Dryomov return rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED || 488ed95b21aSIlya Dryomov rbd_dev->lock_state == RBD_LOCK_STATE_RELEASING; 489ed95b21aSIlya Dryomov } 490ed95b21aSIlya Dryomov 491ed95b21aSIlya Dryomov static bool rbd_is_lock_owner(struct rbd_device *rbd_dev) 492ed95b21aSIlya Dryomov { 493ed95b21aSIlya Dryomov bool is_lock_owner; 494ed95b21aSIlya Dryomov 495ed95b21aSIlya Dryomov down_read(&rbd_dev->lock_rwsem); 496ed95b21aSIlya Dryomov is_lock_owner = __rbd_is_lock_owner(rbd_dev); 497ed95b21aSIlya Dryomov up_read(&rbd_dev->lock_rwsem); 498ed95b21aSIlya Dryomov return is_lock_owner; 499ed95b21aSIlya Dryomov } 500ed95b21aSIlya Dryomov 501b15a21ddSGreg Kroah-Hartman static BUS_ATTR(add, S_IWUSR, NULL, rbd_add); 502b15a21ddSGreg Kroah-Hartman static BUS_ATTR(remove, S_IWUSR, NULL, rbd_remove); 5039b60e70bSIlya Dryomov static BUS_ATTR(add_single_major, S_IWUSR, NULL, rbd_add_single_major); 5049b60e70bSIlya Dryomov static BUS_ATTR(remove_single_major, S_IWUSR, NULL, rbd_remove_single_major); 505b15a21ddSGreg Kroah-Hartman 506b15a21ddSGreg Kroah-Hartman static struct attribute *rbd_bus_attrs[] = { 507b15a21ddSGreg Kroah-Hartman &bus_attr_add.attr, 508b15a21ddSGreg Kroah-Hartman &bus_attr_remove.attr, 5099b60e70bSIlya Dryomov &bus_attr_add_single_major.attr, 5109b60e70bSIlya Dryomov &bus_attr_remove_single_major.attr, 511b15a21ddSGreg Kroah-Hartman NULL, 512f0f8cef5SAlex Elder }; 51392c76dc0SIlya Dryomov 51492c76dc0SIlya Dryomov static umode_t 
rbd_bus_is_visible(struct kobject *kobj, 51592c76dc0SIlya Dryomov struct attribute *attr, int index) 51692c76dc0SIlya Dryomov { 5179b60e70bSIlya Dryomov if (!single_major && 5189b60e70bSIlya Dryomov (attr == &bus_attr_add_single_major.attr || 5199b60e70bSIlya Dryomov attr == &bus_attr_remove_single_major.attr)) 5209b60e70bSIlya Dryomov return 0; 5219b60e70bSIlya Dryomov 52292c76dc0SIlya Dryomov return attr->mode; 52392c76dc0SIlya Dryomov } 52492c76dc0SIlya Dryomov 52592c76dc0SIlya Dryomov static const struct attribute_group rbd_bus_group = { 52692c76dc0SIlya Dryomov .attrs = rbd_bus_attrs, 52792c76dc0SIlya Dryomov .is_visible = rbd_bus_is_visible, 52892c76dc0SIlya Dryomov }; 52992c76dc0SIlya Dryomov __ATTRIBUTE_GROUPS(rbd_bus); 530f0f8cef5SAlex Elder 531f0f8cef5SAlex Elder static struct bus_type rbd_bus_type = { 532f0f8cef5SAlex Elder .name = "rbd", 533b15a21ddSGreg Kroah-Hartman .bus_groups = rbd_bus_groups, 534f0f8cef5SAlex Elder }; 535f0f8cef5SAlex Elder 536f0f8cef5SAlex Elder static void rbd_root_dev_release(struct device *dev) 537f0f8cef5SAlex Elder { 538f0f8cef5SAlex Elder } 539f0f8cef5SAlex Elder 540f0f8cef5SAlex Elder static struct device rbd_root_dev = { 541f0f8cef5SAlex Elder .init_name = "rbd", 542f0f8cef5SAlex Elder .release = rbd_root_dev_release, 543f0f8cef5SAlex Elder }; 544f0f8cef5SAlex Elder 54506ecc6cbSAlex Elder static __printf(2, 3) 54606ecc6cbSAlex Elder void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...) 
54706ecc6cbSAlex Elder { 54806ecc6cbSAlex Elder struct va_format vaf; 54906ecc6cbSAlex Elder va_list args; 55006ecc6cbSAlex Elder 55106ecc6cbSAlex Elder va_start(args, fmt); 55206ecc6cbSAlex Elder vaf.fmt = fmt; 55306ecc6cbSAlex Elder vaf.va = &args; 55406ecc6cbSAlex Elder 55506ecc6cbSAlex Elder if (!rbd_dev) 55606ecc6cbSAlex Elder printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf); 55706ecc6cbSAlex Elder else if (rbd_dev->disk) 55806ecc6cbSAlex Elder printk(KERN_WARNING "%s: %s: %pV\n", 55906ecc6cbSAlex Elder RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf); 56006ecc6cbSAlex Elder else if (rbd_dev->spec && rbd_dev->spec->image_name) 56106ecc6cbSAlex Elder printk(KERN_WARNING "%s: image %s: %pV\n", 56206ecc6cbSAlex Elder RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf); 56306ecc6cbSAlex Elder else if (rbd_dev->spec && rbd_dev->spec->image_id) 56406ecc6cbSAlex Elder printk(KERN_WARNING "%s: id %s: %pV\n", 56506ecc6cbSAlex Elder RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf); 56606ecc6cbSAlex Elder else /* punt */ 56706ecc6cbSAlex Elder printk(KERN_WARNING "%s: rbd_dev %p: %pV\n", 56806ecc6cbSAlex Elder RBD_DRV_NAME, rbd_dev, &vaf); 56906ecc6cbSAlex Elder va_end(args); 57006ecc6cbSAlex Elder } 57106ecc6cbSAlex Elder 572aafb230eSAlex Elder #ifdef RBD_DEBUG 573aafb230eSAlex Elder #define rbd_assert(expr) \ 574aafb230eSAlex Elder if (unlikely(!(expr))) { \ 575aafb230eSAlex Elder printk(KERN_ERR "\nAssertion failure in %s() " \ 576aafb230eSAlex Elder "at line %d:\n\n" \ 577aafb230eSAlex Elder "\trbd_assert(%s);\n\n", \ 578aafb230eSAlex Elder __func__, __LINE__, #expr); \ 579aafb230eSAlex Elder BUG(); \ 580aafb230eSAlex Elder } 581aafb230eSAlex Elder #else /* !RBD_DEBUG */ 582aafb230eSAlex Elder # define rbd_assert(expr) ((void) 0) 583aafb230eSAlex Elder #endif /* !RBD_DEBUG */ 584dfc5606dSYehuda Sadeh 5852761713dSIlya Dryomov static void rbd_osd_copyup_callback(struct rbd_obj_request *obj_request); 586b454e36dSAlex Elder static int rbd_img_obj_request_submit(struct 
rbd_obj_request *obj_request); 58705a46afdSAlex Elder static void rbd_img_parent_read(struct rbd_obj_request *obj_request); 58805a46afdSAlex Elder static void rbd_dev_remove_parent(struct rbd_device *rbd_dev); 5898b3e1a56SAlex Elder 590cc4a38bdSAlex Elder static int rbd_dev_refresh(struct rbd_device *rbd_dev); 5912df3fac7SAlex Elder static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev); 592a720ae09SIlya Dryomov static int rbd_dev_header_info(struct rbd_device *rbd_dev); 593e8f59b59SIlya Dryomov static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev); 59454cac61fSAlex Elder static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, 59554cac61fSAlex Elder u64 snap_id); 5962ad3d716SAlex Elder static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id, 5972ad3d716SAlex Elder u8 *order, u64 *snap_size); 5982ad3d716SAlex Elder static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id, 5992ad3d716SAlex Elder u64 *snap_features); 60059c2be1eSYehuda Sadeh 601602adf40SYehuda Sadeh static int rbd_open(struct block_device *bdev, fmode_t mode) 602602adf40SYehuda Sadeh { 603f0f8cef5SAlex Elder struct rbd_device *rbd_dev = bdev->bd_disk->private_data; 604b82d167bSAlex Elder bool removing = false; 605602adf40SYehuda Sadeh 606f84344f3SAlex Elder if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only) 607602adf40SYehuda Sadeh return -EROFS; 608602adf40SYehuda Sadeh 609a14ea269SAlex Elder spin_lock_irq(&rbd_dev->lock); 610b82d167bSAlex Elder if (test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags)) 611b82d167bSAlex Elder removing = true; 612b82d167bSAlex Elder else 613b82d167bSAlex Elder rbd_dev->open_count++; 614a14ea269SAlex Elder spin_unlock_irq(&rbd_dev->lock); 615b82d167bSAlex Elder if (removing) 616b82d167bSAlex Elder return -ENOENT; 617b82d167bSAlex Elder 618c3e946ceSAlex Elder (void) get_device(&rbd_dev->dev); 619340c7a2bSAlex Elder 620602adf40SYehuda Sadeh return 0; 621602adf40SYehuda Sadeh } 622602adf40SYehuda Sadeh 
623db2a144bSAl Viro static void rbd_release(struct gendisk *disk, fmode_t mode) 624dfc5606dSYehuda Sadeh { 625dfc5606dSYehuda Sadeh struct rbd_device *rbd_dev = disk->private_data; 626b82d167bSAlex Elder unsigned long open_count_before; 627b82d167bSAlex Elder 628a14ea269SAlex Elder spin_lock_irq(&rbd_dev->lock); 629b82d167bSAlex Elder open_count_before = rbd_dev->open_count--; 630a14ea269SAlex Elder spin_unlock_irq(&rbd_dev->lock); 631b82d167bSAlex Elder rbd_assert(open_count_before > 0); 632dfc5606dSYehuda Sadeh 633c3e946ceSAlex Elder put_device(&rbd_dev->dev); 634dfc5606dSYehuda Sadeh } 635dfc5606dSYehuda Sadeh 636131fd9f6SGuangliang Zhao static int rbd_ioctl_set_ro(struct rbd_device *rbd_dev, unsigned long arg) 637131fd9f6SGuangliang Zhao { 63877f33c03SJosh Durgin int ret = 0; 639131fd9f6SGuangliang Zhao int val; 640131fd9f6SGuangliang Zhao bool ro; 64177f33c03SJosh Durgin bool ro_changed = false; 642131fd9f6SGuangliang Zhao 64377f33c03SJosh Durgin /* get_user() may sleep, so call it before taking rbd_dev->lock */ 644131fd9f6SGuangliang Zhao if (get_user(val, (int __user *)(arg))) 645131fd9f6SGuangliang Zhao return -EFAULT; 646131fd9f6SGuangliang Zhao 647131fd9f6SGuangliang Zhao ro = val ? 
true : false; 648131fd9f6SGuangliang Zhao /* Snapshot doesn't allow to write*/ 649131fd9f6SGuangliang Zhao if (rbd_dev->spec->snap_id != CEPH_NOSNAP && !ro) 650131fd9f6SGuangliang Zhao return -EROFS; 651131fd9f6SGuangliang Zhao 65277f33c03SJosh Durgin spin_lock_irq(&rbd_dev->lock); 65377f33c03SJosh Durgin /* prevent others open this device */ 65477f33c03SJosh Durgin if (rbd_dev->open_count > 1) { 65577f33c03SJosh Durgin ret = -EBUSY; 65677f33c03SJosh Durgin goto out; 657131fd9f6SGuangliang Zhao } 658131fd9f6SGuangliang Zhao 65977f33c03SJosh Durgin if (rbd_dev->mapping.read_only != ro) { 66077f33c03SJosh Durgin rbd_dev->mapping.read_only = ro; 66177f33c03SJosh Durgin ro_changed = true; 66277f33c03SJosh Durgin } 66377f33c03SJosh Durgin 66477f33c03SJosh Durgin out: 66577f33c03SJosh Durgin spin_unlock_irq(&rbd_dev->lock); 66677f33c03SJosh Durgin /* set_disk_ro() may sleep, so call it after releasing rbd_dev->lock */ 66777f33c03SJosh Durgin if (ret == 0 && ro_changed) 66877f33c03SJosh Durgin set_disk_ro(rbd_dev->disk, ro ? 
1 : 0); 66977f33c03SJosh Durgin 67077f33c03SJosh Durgin return ret; 671131fd9f6SGuangliang Zhao } 672131fd9f6SGuangliang Zhao 673131fd9f6SGuangliang Zhao static int rbd_ioctl(struct block_device *bdev, fmode_t mode, 674131fd9f6SGuangliang Zhao unsigned int cmd, unsigned long arg) 675131fd9f6SGuangliang Zhao { 676131fd9f6SGuangliang Zhao struct rbd_device *rbd_dev = bdev->bd_disk->private_data; 677131fd9f6SGuangliang Zhao int ret = 0; 678131fd9f6SGuangliang Zhao 679131fd9f6SGuangliang Zhao switch (cmd) { 680131fd9f6SGuangliang Zhao case BLKROSET: 681131fd9f6SGuangliang Zhao ret = rbd_ioctl_set_ro(rbd_dev, arg); 682131fd9f6SGuangliang Zhao break; 683131fd9f6SGuangliang Zhao default: 684131fd9f6SGuangliang Zhao ret = -ENOTTY; 685131fd9f6SGuangliang Zhao } 686131fd9f6SGuangliang Zhao 687131fd9f6SGuangliang Zhao return ret; 688131fd9f6SGuangliang Zhao } 689131fd9f6SGuangliang Zhao 690131fd9f6SGuangliang Zhao #ifdef CONFIG_COMPAT 691131fd9f6SGuangliang Zhao static int rbd_compat_ioctl(struct block_device *bdev, fmode_t mode, 692131fd9f6SGuangliang Zhao unsigned int cmd, unsigned long arg) 693131fd9f6SGuangliang Zhao { 694131fd9f6SGuangliang Zhao return rbd_ioctl(bdev, mode, cmd, arg); 695131fd9f6SGuangliang Zhao } 696131fd9f6SGuangliang Zhao #endif /* CONFIG_COMPAT */ 697131fd9f6SGuangliang Zhao 698602adf40SYehuda Sadeh static const struct block_device_operations rbd_bd_ops = { 699602adf40SYehuda Sadeh .owner = THIS_MODULE, 700602adf40SYehuda Sadeh .open = rbd_open, 701dfc5606dSYehuda Sadeh .release = rbd_release, 702131fd9f6SGuangliang Zhao .ioctl = rbd_ioctl, 703131fd9f6SGuangliang Zhao #ifdef CONFIG_COMPAT 704131fd9f6SGuangliang Zhao .compat_ioctl = rbd_compat_ioctl, 705131fd9f6SGuangliang Zhao #endif 706602adf40SYehuda Sadeh }; 707602adf40SYehuda Sadeh 708602adf40SYehuda Sadeh /* 7097262cfcaSAlex Elder * Initialize an rbd client instance. Success or not, this function 710cfbf6377SAlex Elder * consumes ceph_opts. Caller holds client_mutex. 
711602adf40SYehuda Sadeh */ 712f8c38929SAlex Elder static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts) 713602adf40SYehuda Sadeh { 714602adf40SYehuda Sadeh struct rbd_client *rbdc; 715602adf40SYehuda Sadeh int ret = -ENOMEM; 716602adf40SYehuda Sadeh 71737206ee5SAlex Elder dout("%s:\n", __func__); 718602adf40SYehuda Sadeh rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL); 719602adf40SYehuda Sadeh if (!rbdc) 720602adf40SYehuda Sadeh goto out_opt; 721602adf40SYehuda Sadeh 722602adf40SYehuda Sadeh kref_init(&rbdc->kref); 723602adf40SYehuda Sadeh INIT_LIST_HEAD(&rbdc->node); 724602adf40SYehuda Sadeh 72543ae4701SAlex Elder rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0); 726602adf40SYehuda Sadeh if (IS_ERR(rbdc->client)) 72708f75463SAlex Elder goto out_rbdc; 72843ae4701SAlex Elder ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */ 729602adf40SYehuda Sadeh 730602adf40SYehuda Sadeh ret = ceph_open_session(rbdc->client); 731602adf40SYehuda Sadeh if (ret < 0) 73208f75463SAlex Elder goto out_client; 733602adf40SYehuda Sadeh 734432b8587SAlex Elder spin_lock(&rbd_client_list_lock); 735602adf40SYehuda Sadeh list_add_tail(&rbdc->node, &rbd_client_list); 736432b8587SAlex Elder spin_unlock(&rbd_client_list_lock); 737602adf40SYehuda Sadeh 73837206ee5SAlex Elder dout("%s: rbdc %p\n", __func__, rbdc); 739bc534d86SAlex Elder 740602adf40SYehuda Sadeh return rbdc; 74108f75463SAlex Elder out_client: 742602adf40SYehuda Sadeh ceph_destroy_client(rbdc->client); 74308f75463SAlex Elder out_rbdc: 744602adf40SYehuda Sadeh kfree(rbdc); 745602adf40SYehuda Sadeh out_opt: 74643ae4701SAlex Elder if (ceph_opts) 74743ae4701SAlex Elder ceph_destroy_options(ceph_opts); 74837206ee5SAlex Elder dout("%s: error %d\n", __func__, ret); 74937206ee5SAlex Elder 75028f259b7SVasiliy Kulikov return ERR_PTR(ret); 751602adf40SYehuda Sadeh } 752602adf40SYehuda Sadeh 7532f82ee54SAlex Elder static struct rbd_client *__rbd_get_client(struct rbd_client *rbdc) 
7542f82ee54SAlex Elder { 7552f82ee54SAlex Elder kref_get(&rbdc->kref); 7562f82ee54SAlex Elder 7572f82ee54SAlex Elder return rbdc; 7582f82ee54SAlex Elder } 7592f82ee54SAlex Elder 760602adf40SYehuda Sadeh /* 7611f7ba331SAlex Elder * Find a ceph client with specific addr and configuration. If 7621f7ba331SAlex Elder * found, bump its reference count. 763602adf40SYehuda Sadeh */ 7641f7ba331SAlex Elder static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts) 765602adf40SYehuda Sadeh { 766602adf40SYehuda Sadeh struct rbd_client *client_node; 7671f7ba331SAlex Elder bool found = false; 768602adf40SYehuda Sadeh 76943ae4701SAlex Elder if (ceph_opts->flags & CEPH_OPT_NOSHARE) 770602adf40SYehuda Sadeh return NULL; 771602adf40SYehuda Sadeh 7721f7ba331SAlex Elder spin_lock(&rbd_client_list_lock); 7731f7ba331SAlex Elder list_for_each_entry(client_node, &rbd_client_list, node) { 7741f7ba331SAlex Elder if (!ceph_compare_options(ceph_opts, client_node->client)) { 7752f82ee54SAlex Elder __rbd_get_client(client_node); 7762f82ee54SAlex Elder 7771f7ba331SAlex Elder found = true; 7781f7ba331SAlex Elder break; 7791f7ba331SAlex Elder } 7801f7ba331SAlex Elder } 7811f7ba331SAlex Elder spin_unlock(&rbd_client_list_lock); 7821f7ba331SAlex Elder 7831f7ba331SAlex Elder return found ? 
client_node : NULL; 784602adf40SYehuda Sadeh } 785602adf40SYehuda Sadeh 786602adf40SYehuda Sadeh /* 787210c104cSIlya Dryomov * (Per device) rbd map options 78859c2be1eSYehuda Sadeh */ 78959c2be1eSYehuda Sadeh enum { 790b5584180SIlya Dryomov Opt_queue_depth, 79159c2be1eSYehuda Sadeh Opt_last_int, 79259c2be1eSYehuda Sadeh /* int args above */ 79359c2be1eSYehuda Sadeh Opt_last_string, 79459c2be1eSYehuda Sadeh /* string args above */ 795cc0538b6SAlex Elder Opt_read_only, 796cc0538b6SAlex Elder Opt_read_write, 797210c104cSIlya Dryomov Opt_err 79859c2be1eSYehuda Sadeh }; 79959c2be1eSYehuda Sadeh 80043ae4701SAlex Elder static match_table_t rbd_opts_tokens = { 801b5584180SIlya Dryomov {Opt_queue_depth, "queue_depth=%d"}, 80259c2be1eSYehuda Sadeh /* int args above */ 80359c2be1eSYehuda Sadeh /* string args above */ 804be466c1cSAlex Elder {Opt_read_only, "read_only"}, 805cc0538b6SAlex Elder {Opt_read_only, "ro"}, /* Alternate spelling */ 806cc0538b6SAlex Elder {Opt_read_write, "read_write"}, 807cc0538b6SAlex Elder {Opt_read_write, "rw"}, /* Alternate spelling */ 808210c104cSIlya Dryomov {Opt_err, NULL} 80959c2be1eSYehuda Sadeh }; 81059c2be1eSYehuda Sadeh 81198571b5aSAlex Elder struct rbd_options { 812b5584180SIlya Dryomov int queue_depth; 81398571b5aSAlex Elder bool read_only; 81498571b5aSAlex Elder }; 81598571b5aSAlex Elder 816b5584180SIlya Dryomov #define RBD_QUEUE_DEPTH_DEFAULT BLKDEV_MAX_RQ 81798571b5aSAlex Elder #define RBD_READ_ONLY_DEFAULT false 81898571b5aSAlex Elder 81959c2be1eSYehuda Sadeh static int parse_rbd_opts_token(char *c, void *private) 82059c2be1eSYehuda Sadeh { 82143ae4701SAlex Elder struct rbd_options *rbd_opts = private; 82259c2be1eSYehuda Sadeh substring_t argstr[MAX_OPT_ARGS]; 82359c2be1eSYehuda Sadeh int token, intval, ret; 82459c2be1eSYehuda Sadeh 82543ae4701SAlex Elder token = match_token(c, rbd_opts_tokens, argstr); 82659c2be1eSYehuda Sadeh if (token < Opt_last_int) { 82759c2be1eSYehuda Sadeh ret = match_int(&argstr[0], &intval); 
82859c2be1eSYehuda Sadeh if (ret < 0) { 829210c104cSIlya Dryomov pr_err("bad mount option arg (not int) at '%s'\n", c); 83059c2be1eSYehuda Sadeh return ret; 83159c2be1eSYehuda Sadeh } 83259c2be1eSYehuda Sadeh dout("got int token %d val %d\n", token, intval); 83359c2be1eSYehuda Sadeh } else if (token > Opt_last_int && token < Opt_last_string) { 834210c104cSIlya Dryomov dout("got string token %d val %s\n", token, argstr[0].from); 83559c2be1eSYehuda Sadeh } else { 83659c2be1eSYehuda Sadeh dout("got token %d\n", token); 83759c2be1eSYehuda Sadeh } 83859c2be1eSYehuda Sadeh 83959c2be1eSYehuda Sadeh switch (token) { 840b5584180SIlya Dryomov case Opt_queue_depth: 841b5584180SIlya Dryomov if (intval < 1) { 842b5584180SIlya Dryomov pr_err("queue_depth out of range\n"); 843b5584180SIlya Dryomov return -EINVAL; 844b5584180SIlya Dryomov } 845b5584180SIlya Dryomov rbd_opts->queue_depth = intval; 846b5584180SIlya Dryomov break; 847cc0538b6SAlex Elder case Opt_read_only: 848cc0538b6SAlex Elder rbd_opts->read_only = true; 849cc0538b6SAlex Elder break; 850cc0538b6SAlex Elder case Opt_read_write: 851cc0538b6SAlex Elder rbd_opts->read_only = false; 852cc0538b6SAlex Elder break; 85359c2be1eSYehuda Sadeh default: 854210c104cSIlya Dryomov /* libceph prints "bad option" msg */ 855210c104cSIlya Dryomov return -EINVAL; 85659c2be1eSYehuda Sadeh } 857210c104cSIlya Dryomov 85859c2be1eSYehuda Sadeh return 0; 85959c2be1eSYehuda Sadeh } 86059c2be1eSYehuda Sadeh 8616d2940c8SGuangliang Zhao static char* obj_op_name(enum obj_operation_type op_type) 8626d2940c8SGuangliang Zhao { 8636d2940c8SGuangliang Zhao switch (op_type) { 8646d2940c8SGuangliang Zhao case OBJ_OP_READ: 8656d2940c8SGuangliang Zhao return "read"; 8666d2940c8SGuangliang Zhao case OBJ_OP_WRITE: 8676d2940c8SGuangliang Zhao return "write"; 86890e98c52SGuangliang Zhao case OBJ_OP_DISCARD: 86990e98c52SGuangliang Zhao return "discard"; 8706d2940c8SGuangliang Zhao default: 8716d2940c8SGuangliang Zhao return "???"; 8726d2940c8SGuangliang Zhao } 
8736d2940c8SGuangliang Zhao } 8746d2940c8SGuangliang Zhao 87559c2be1eSYehuda Sadeh /* 876602adf40SYehuda Sadeh * Get a ceph client with specific addr and configuration, if one does 8777262cfcaSAlex Elder * not exist create it. Either way, ceph_opts is consumed by this 8787262cfcaSAlex Elder * function. 879602adf40SYehuda Sadeh */ 8809d3997fdSAlex Elder static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts) 881602adf40SYehuda Sadeh { 882f8c38929SAlex Elder struct rbd_client *rbdc; 88359c2be1eSYehuda Sadeh 884cfbf6377SAlex Elder mutex_lock_nested(&client_mutex, SINGLE_DEPTH_NESTING); 8851f7ba331SAlex Elder rbdc = rbd_client_find(ceph_opts); 8869d3997fdSAlex Elder if (rbdc) /* using an existing client */ 88743ae4701SAlex Elder ceph_destroy_options(ceph_opts); 8889d3997fdSAlex Elder else 889f8c38929SAlex Elder rbdc = rbd_client_create(ceph_opts); 890cfbf6377SAlex Elder mutex_unlock(&client_mutex); 891d720bcb0SAlex Elder 8929d3997fdSAlex Elder return rbdc; 893602adf40SYehuda Sadeh } 894602adf40SYehuda Sadeh 895602adf40SYehuda Sadeh /* 896602adf40SYehuda Sadeh * Destroy ceph client 897d23a4b3fSAlex Elder * 898432b8587SAlex Elder * Caller must hold rbd_client_list_lock. 899602adf40SYehuda Sadeh */ 900602adf40SYehuda Sadeh static void rbd_client_release(struct kref *kref) 901602adf40SYehuda Sadeh { 902602adf40SYehuda Sadeh struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref); 903602adf40SYehuda Sadeh 90437206ee5SAlex Elder dout("%s: rbdc %p\n", __func__, rbdc); 905cd9d9f5dSAlex Elder spin_lock(&rbd_client_list_lock); 906602adf40SYehuda Sadeh list_del(&rbdc->node); 907cd9d9f5dSAlex Elder spin_unlock(&rbd_client_list_lock); 908602adf40SYehuda Sadeh 909602adf40SYehuda Sadeh ceph_destroy_client(rbdc->client); 910602adf40SYehuda Sadeh kfree(rbdc); 911602adf40SYehuda Sadeh } 912602adf40SYehuda Sadeh 913602adf40SYehuda Sadeh /* 914602adf40SYehuda Sadeh * Drop reference to ceph client node. 
If it's not referenced anymore, release 915602adf40SYehuda Sadeh * it. 916602adf40SYehuda Sadeh */ 9179d3997fdSAlex Elder static void rbd_put_client(struct rbd_client *rbdc) 918602adf40SYehuda Sadeh { 919c53d5893SAlex Elder if (rbdc) 9209d3997fdSAlex Elder kref_put(&rbdc->kref, rbd_client_release); 921602adf40SYehuda Sadeh } 922602adf40SYehuda Sadeh 923a30b71b9SAlex Elder static bool rbd_image_format_valid(u32 image_format) 924a30b71b9SAlex Elder { 925a30b71b9SAlex Elder return image_format == 1 || image_format == 2; 926a30b71b9SAlex Elder } 927a30b71b9SAlex Elder 9288e94af8eSAlex Elder static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk) 9298e94af8eSAlex Elder { 930103a150fSAlex Elder size_t size; 931103a150fSAlex Elder u32 snap_count; 932103a150fSAlex Elder 933103a150fSAlex Elder /* The header has to start with the magic rbd header text */ 934103a150fSAlex Elder if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT))) 935103a150fSAlex Elder return false; 936103a150fSAlex Elder 937db2388b6SAlex Elder /* The bio layer requires at least sector-sized I/O */ 938db2388b6SAlex Elder 939db2388b6SAlex Elder if (ondisk->options.order < SECTOR_SHIFT) 940db2388b6SAlex Elder return false; 941db2388b6SAlex Elder 942db2388b6SAlex Elder /* If we use u64 in a few spots we may be able to loosen this */ 943db2388b6SAlex Elder 944db2388b6SAlex Elder if (ondisk->options.order > 8 * sizeof (int) - 1) 945db2388b6SAlex Elder return false; 946db2388b6SAlex Elder 947103a150fSAlex Elder /* 948103a150fSAlex Elder * The size of a snapshot header has to fit in a size_t, and 949103a150fSAlex Elder * that limits the number of snapshots. 
950103a150fSAlex Elder */ 951103a150fSAlex Elder snap_count = le32_to_cpu(ondisk->snap_count); 952103a150fSAlex Elder size = SIZE_MAX - sizeof (struct ceph_snap_context); 953103a150fSAlex Elder if (snap_count > size / sizeof (__le64)) 954103a150fSAlex Elder return false; 955103a150fSAlex Elder 956103a150fSAlex Elder /* 957103a150fSAlex Elder * Not only that, but the size of the entire the snapshot 958103a150fSAlex Elder * header must also be representable in a size_t. 959103a150fSAlex Elder */ 960103a150fSAlex Elder size -= snap_count * sizeof (__le64); 961103a150fSAlex Elder if ((u64) size < le64_to_cpu(ondisk->snap_names_len)) 962103a150fSAlex Elder return false; 963103a150fSAlex Elder 964103a150fSAlex Elder return true; 9658e94af8eSAlex Elder } 9668e94af8eSAlex Elder 967602adf40SYehuda Sadeh /* 968bb23e37aSAlex Elder * Fill an rbd image header with information from the given format 1 969bb23e37aSAlex Elder * on-disk header. 970602adf40SYehuda Sadeh */ 971662518b1SAlex Elder static int rbd_header_from_disk(struct rbd_device *rbd_dev, 9724156d998SAlex Elder struct rbd_image_header_ondisk *ondisk) 973602adf40SYehuda Sadeh { 974662518b1SAlex Elder struct rbd_image_header *header = &rbd_dev->header; 975bb23e37aSAlex Elder bool first_time = header->object_prefix == NULL; 976bb23e37aSAlex Elder struct ceph_snap_context *snapc; 977bb23e37aSAlex Elder char *object_prefix = NULL; 978bb23e37aSAlex Elder char *snap_names = NULL; 979bb23e37aSAlex Elder u64 *snap_sizes = NULL; 980ccece235SAlex Elder u32 snap_count; 981d2bb24e5SAlex Elder size_t size; 982bb23e37aSAlex Elder int ret = -ENOMEM; 983621901d6SAlex Elder u32 i; 984602adf40SYehuda Sadeh 985bb23e37aSAlex Elder /* Allocate this now to avoid having to handle failure below */ 986103a150fSAlex Elder 987bb23e37aSAlex Elder if (first_time) { 988bb23e37aSAlex Elder size_t len; 989bb23e37aSAlex Elder 990bb23e37aSAlex Elder len = strnlen(ondisk->object_prefix, 991bb23e37aSAlex Elder sizeof (ondisk->object_prefix)); 
992bb23e37aSAlex Elder object_prefix = kmalloc(len + 1, GFP_KERNEL); 993bb23e37aSAlex Elder if (!object_prefix) 994602adf40SYehuda Sadeh return -ENOMEM; 995bb23e37aSAlex Elder memcpy(object_prefix, ondisk->object_prefix, len); 996bb23e37aSAlex Elder object_prefix[len] = '\0'; 997bb23e37aSAlex Elder } 99800f1f36fSAlex Elder 999bb23e37aSAlex Elder /* Allocate the snapshot context and fill it in */ 1000d2bb24e5SAlex Elder 1001602adf40SYehuda Sadeh snap_count = le32_to_cpu(ondisk->snap_count); 1002bb23e37aSAlex Elder snapc = ceph_create_snap_context(snap_count, GFP_KERNEL); 1003bb23e37aSAlex Elder if (!snapc) 1004bb23e37aSAlex Elder goto out_err; 1005bb23e37aSAlex Elder snapc->seq = le64_to_cpu(ondisk->snap_seq); 1006602adf40SYehuda Sadeh if (snap_count) { 1007bb23e37aSAlex Elder struct rbd_image_snap_ondisk *snaps; 1008f785cc1dSAlex Elder u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len); 1009f785cc1dSAlex Elder 1010bb23e37aSAlex Elder /* We'll keep a copy of the snapshot names... */ 1011621901d6SAlex Elder 1012f785cc1dSAlex Elder if (snap_names_len > (u64)SIZE_MAX) 1013bb23e37aSAlex Elder goto out_2big; 1014bb23e37aSAlex Elder snap_names = kmalloc(snap_names_len, GFP_KERNEL); 1015bb23e37aSAlex Elder if (!snap_names) 1016602adf40SYehuda Sadeh goto out_err; 1017bb23e37aSAlex Elder 1018bb23e37aSAlex Elder /* ...as well as the array of their sizes. */ 1019bb23e37aSAlex Elder 1020bb23e37aSAlex Elder size = snap_count * sizeof (*header->snap_sizes); 1021bb23e37aSAlex Elder snap_sizes = kmalloc(size, GFP_KERNEL); 1022bb23e37aSAlex Elder if (!snap_sizes) 1023bb23e37aSAlex Elder goto out_err; 1024bb23e37aSAlex Elder 1025f785cc1dSAlex Elder /* 1026bb23e37aSAlex Elder * Copy the names, and fill in each snapshot's id 1027bb23e37aSAlex Elder * and size. 
1028bb23e37aSAlex Elder * 102999a41ebcSAlex Elder * Note that rbd_dev_v1_header_info() guarantees the 1030bb23e37aSAlex Elder * ondisk buffer we're working with has 1031f785cc1dSAlex Elder * snap_names_len bytes beyond the end of the 1032f785cc1dSAlex Elder * snapshot id array, this memcpy() is safe. 1033f785cc1dSAlex Elder */ 1034bb23e37aSAlex Elder memcpy(snap_names, &ondisk->snaps[snap_count], snap_names_len); 1035bb23e37aSAlex Elder snaps = ondisk->snaps; 1036bb23e37aSAlex Elder for (i = 0; i < snap_count; i++) { 1037bb23e37aSAlex Elder snapc->snaps[i] = le64_to_cpu(snaps[i].id); 1038bb23e37aSAlex Elder snap_sizes[i] = le64_to_cpu(snaps[i].image_size); 1039bb23e37aSAlex Elder } 1040602adf40SYehuda Sadeh } 1041849b4260SAlex Elder 1042bb23e37aSAlex Elder /* We won't fail any more, fill in the header */ 1043bb23e37aSAlex Elder 1044bb23e37aSAlex Elder if (first_time) { 1045bb23e37aSAlex Elder header->object_prefix = object_prefix; 1046602adf40SYehuda Sadeh header->obj_order = ondisk->options.order; 1047602adf40SYehuda Sadeh header->crypt_type = ondisk->options.crypt_type; 1048602adf40SYehuda Sadeh header->comp_type = ondisk->options.comp_type; 1049bb23e37aSAlex Elder /* The rest aren't used for format 1 images */ 1050bb23e37aSAlex Elder header->stripe_unit = 0; 1051bb23e37aSAlex Elder header->stripe_count = 0; 1052bb23e37aSAlex Elder header->features = 0; 1053662518b1SAlex Elder } else { 1054662518b1SAlex Elder ceph_put_snap_context(header->snapc); 1055662518b1SAlex Elder kfree(header->snap_names); 1056662518b1SAlex Elder kfree(header->snap_sizes); 1057bb23e37aSAlex Elder } 10586a52325fSAlex Elder 1059bb23e37aSAlex Elder /* The remaining fields always get updated (when we refresh) */ 1060621901d6SAlex Elder 1061f84344f3SAlex Elder header->image_size = le64_to_cpu(ondisk->image_size); 1062bb23e37aSAlex Elder header->snapc = snapc; 1063bb23e37aSAlex Elder header->snap_names = snap_names; 1064bb23e37aSAlex Elder header->snap_sizes = snap_sizes; 1065468521c1SAlex Elder 
1066602adf40SYehuda Sadeh return 0; 1067bb23e37aSAlex Elder out_2big: 1068bb23e37aSAlex Elder ret = -EIO; 10696a52325fSAlex Elder out_err: 1070bb23e37aSAlex Elder kfree(snap_sizes); 1071bb23e37aSAlex Elder kfree(snap_names); 1072bb23e37aSAlex Elder ceph_put_snap_context(snapc); 1073bb23e37aSAlex Elder kfree(object_prefix); 1074ccece235SAlex Elder 1075bb23e37aSAlex Elder return ret; 1076602adf40SYehuda Sadeh } 1077602adf40SYehuda Sadeh 10789682fc6dSAlex Elder static const char *_rbd_dev_v1_snap_name(struct rbd_device *rbd_dev, u32 which) 10799682fc6dSAlex Elder { 10809682fc6dSAlex Elder const char *snap_name; 10819682fc6dSAlex Elder 10829682fc6dSAlex Elder rbd_assert(which < rbd_dev->header.snapc->num_snaps); 10839682fc6dSAlex Elder 10849682fc6dSAlex Elder /* Skip over names until we find the one we are looking for */ 10859682fc6dSAlex Elder 10869682fc6dSAlex Elder snap_name = rbd_dev->header.snap_names; 10879682fc6dSAlex Elder while (which--) 10889682fc6dSAlex Elder snap_name += strlen(snap_name) + 1; 10899682fc6dSAlex Elder 10909682fc6dSAlex Elder return kstrdup(snap_name, GFP_KERNEL); 10919682fc6dSAlex Elder } 10929682fc6dSAlex Elder 109330d1cff8SAlex Elder /* 109430d1cff8SAlex Elder * Snapshot id comparison function for use with qsort()/bsearch(). 109530d1cff8SAlex Elder * Note that result is for snapshots in *descending* order. 109630d1cff8SAlex Elder */ 109730d1cff8SAlex Elder static int snapid_compare_reverse(const void *s1, const void *s2) 109830d1cff8SAlex Elder { 109930d1cff8SAlex Elder u64 snap_id1 = *(u64 *)s1; 110030d1cff8SAlex Elder u64 snap_id2 = *(u64 *)s2; 110130d1cff8SAlex Elder 110230d1cff8SAlex Elder if (snap_id1 < snap_id2) 110330d1cff8SAlex Elder return 1; 110430d1cff8SAlex Elder return snap_id1 == snap_id2 ? 0 : -1; 110530d1cff8SAlex Elder } 110630d1cff8SAlex Elder 110730d1cff8SAlex Elder /* 110830d1cff8SAlex Elder * Search a snapshot context to see if the given snapshot id is 110930d1cff8SAlex Elder * present. 
111030d1cff8SAlex Elder * 111130d1cff8SAlex Elder * Returns the position of the snapshot id in the array if it's found, 111230d1cff8SAlex Elder * or BAD_SNAP_INDEX otherwise. 111330d1cff8SAlex Elder * 111430d1cff8SAlex Elder * Note: The snapshot array is in kept sorted (by the osd) in 111530d1cff8SAlex Elder * reverse order, highest snapshot id first. 111630d1cff8SAlex Elder */ 11179682fc6dSAlex Elder static u32 rbd_dev_snap_index(struct rbd_device *rbd_dev, u64 snap_id) 11189682fc6dSAlex Elder { 11199682fc6dSAlex Elder struct ceph_snap_context *snapc = rbd_dev->header.snapc; 112030d1cff8SAlex Elder u64 *found; 11219682fc6dSAlex Elder 112230d1cff8SAlex Elder found = bsearch(&snap_id, &snapc->snaps, snapc->num_snaps, 112330d1cff8SAlex Elder sizeof (snap_id), snapid_compare_reverse); 11249682fc6dSAlex Elder 112530d1cff8SAlex Elder return found ? (u32)(found - &snapc->snaps[0]) : BAD_SNAP_INDEX; 11269682fc6dSAlex Elder } 11279682fc6dSAlex Elder 11282ad3d716SAlex Elder static const char *rbd_dev_v1_snap_name(struct rbd_device *rbd_dev, 11292ad3d716SAlex Elder u64 snap_id) 113054cac61fSAlex Elder { 113154cac61fSAlex Elder u32 which; 1132da6a6b63SJosh Durgin const char *snap_name; 113354cac61fSAlex Elder 113454cac61fSAlex Elder which = rbd_dev_snap_index(rbd_dev, snap_id); 113554cac61fSAlex Elder if (which == BAD_SNAP_INDEX) 1136da6a6b63SJosh Durgin return ERR_PTR(-ENOENT); 113754cac61fSAlex Elder 1138da6a6b63SJosh Durgin snap_name = _rbd_dev_v1_snap_name(rbd_dev, which); 1139da6a6b63SJosh Durgin return snap_name ? 
snap_name : ERR_PTR(-ENOMEM); 114054cac61fSAlex Elder } 114154cac61fSAlex Elder 11429e15b77dSAlex Elder static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id) 11439e15b77dSAlex Elder { 11449e15b77dSAlex Elder if (snap_id == CEPH_NOSNAP) 11459e15b77dSAlex Elder return RBD_SNAP_HEAD_NAME; 11469e15b77dSAlex Elder 114754cac61fSAlex Elder rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); 114854cac61fSAlex Elder if (rbd_dev->image_format == 1) 114954cac61fSAlex Elder return rbd_dev_v1_snap_name(rbd_dev, snap_id); 11509e15b77dSAlex Elder 115154cac61fSAlex Elder return rbd_dev_v2_snap_name(rbd_dev, snap_id); 11529e15b77dSAlex Elder } 11539e15b77dSAlex Elder 11542ad3d716SAlex Elder static int rbd_snap_size(struct rbd_device *rbd_dev, u64 snap_id, 11552ad3d716SAlex Elder u64 *snap_size) 1156602adf40SYehuda Sadeh { 11572ad3d716SAlex Elder rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); 11582ad3d716SAlex Elder if (snap_id == CEPH_NOSNAP) { 11592ad3d716SAlex Elder *snap_size = rbd_dev->header.image_size; 11602ad3d716SAlex Elder } else if (rbd_dev->image_format == 1) { 11612ad3d716SAlex Elder u32 which; 116200f1f36fSAlex Elder 11632ad3d716SAlex Elder which = rbd_dev_snap_index(rbd_dev, snap_id); 11642ad3d716SAlex Elder if (which == BAD_SNAP_INDEX) 11652ad3d716SAlex Elder return -ENOENT; 116600f1f36fSAlex Elder 11672ad3d716SAlex Elder *snap_size = rbd_dev->header.snap_sizes[which]; 11682ad3d716SAlex Elder } else { 11692ad3d716SAlex Elder u64 size = 0; 11702ad3d716SAlex Elder int ret; 11712ad3d716SAlex Elder 11722ad3d716SAlex Elder ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, NULL, &size); 11732ad3d716SAlex Elder if (ret) 11742ad3d716SAlex Elder return ret; 11752ad3d716SAlex Elder 11762ad3d716SAlex Elder *snap_size = size; 11772ad3d716SAlex Elder } 11782ad3d716SAlex Elder return 0; 11792ad3d716SAlex Elder } 11802ad3d716SAlex Elder 11812ad3d716SAlex Elder static int rbd_snap_features(struct rbd_device *rbd_dev, u64 snap_id, 11822ad3d716SAlex 
Elder u64 *snap_features) 11832ad3d716SAlex Elder { 11842ad3d716SAlex Elder rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); 11852ad3d716SAlex Elder if (snap_id == CEPH_NOSNAP) { 11862ad3d716SAlex Elder *snap_features = rbd_dev->header.features; 11872ad3d716SAlex Elder } else if (rbd_dev->image_format == 1) { 11882ad3d716SAlex Elder *snap_features = 0; /* No features for format 1 */ 11892ad3d716SAlex Elder } else { 11902ad3d716SAlex Elder u64 features = 0; 11912ad3d716SAlex Elder int ret; 11922ad3d716SAlex Elder 11932ad3d716SAlex Elder ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, &features); 11942ad3d716SAlex Elder if (ret) 11952ad3d716SAlex Elder return ret; 11962ad3d716SAlex Elder 11972ad3d716SAlex Elder *snap_features = features; 11982ad3d716SAlex Elder } 11992ad3d716SAlex Elder return 0; 120000f1f36fSAlex Elder } 1201602adf40SYehuda Sadeh 1202d1cf5788SAlex Elder static int rbd_dev_mapping_set(struct rbd_device *rbd_dev) 1203602adf40SYehuda Sadeh { 12048f4b7d98SAlex Elder u64 snap_id = rbd_dev->spec->snap_id; 12052ad3d716SAlex Elder u64 size = 0; 12062ad3d716SAlex Elder u64 features = 0; 12072ad3d716SAlex Elder int ret; 12088b0241f8SAlex Elder 12092ad3d716SAlex Elder ret = rbd_snap_size(rbd_dev, snap_id, &size); 12102ad3d716SAlex Elder if (ret) 12112ad3d716SAlex Elder return ret; 12122ad3d716SAlex Elder ret = rbd_snap_features(rbd_dev, snap_id, &features); 12132ad3d716SAlex Elder if (ret) 12142ad3d716SAlex Elder return ret; 12152ad3d716SAlex Elder 12162ad3d716SAlex Elder rbd_dev->mapping.size = size; 12172ad3d716SAlex Elder rbd_dev->mapping.features = features; 12182ad3d716SAlex Elder 12198b0241f8SAlex Elder return 0; 1220602adf40SYehuda Sadeh } 1221602adf40SYehuda Sadeh 1222d1cf5788SAlex Elder static void rbd_dev_mapping_clear(struct rbd_device *rbd_dev) 1223d1cf5788SAlex Elder { 1224d1cf5788SAlex Elder rbd_dev->mapping.size = 0; 1225d1cf5788SAlex Elder rbd_dev->mapping.features = 0; 1226200a6a8bSAlex Elder } 1227200a6a8bSAlex Elder 
12287d5079aaSHimangi Saraogi static void rbd_segment_name_free(const char *name) 12297d5079aaSHimangi Saraogi { 12307d5079aaSHimangi Saraogi /* The explicit cast here is needed to drop the const qualifier */ 12317d5079aaSHimangi Saraogi 12327d5079aaSHimangi Saraogi kmem_cache_free(rbd_segment_name_cache, (void *)name); 12337d5079aaSHimangi Saraogi } 12347d5079aaSHimangi Saraogi 123598571b5aSAlex Elder static const char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset) 1236602adf40SYehuda Sadeh { 123765ccfe21SAlex Elder char *name; 123865ccfe21SAlex Elder u64 segment; 123965ccfe21SAlex Elder int ret; 12403a96d5cdSJosh Durgin char *name_format; 1241602adf40SYehuda Sadeh 124278c2a44aSAlex Elder name = kmem_cache_alloc(rbd_segment_name_cache, GFP_NOIO); 124365ccfe21SAlex Elder if (!name) 124465ccfe21SAlex Elder return NULL; 124565ccfe21SAlex Elder segment = offset >> rbd_dev->header.obj_order; 12463a96d5cdSJosh Durgin name_format = "%s.%012llx"; 12473a96d5cdSJosh Durgin if (rbd_dev->image_format == 2) 12483a96d5cdSJosh Durgin name_format = "%s.%016llx"; 12492d0ebc5dSIlya Dryomov ret = snprintf(name, CEPH_MAX_OID_NAME_LEN + 1, name_format, 125065ccfe21SAlex Elder rbd_dev->header.object_prefix, segment); 12512d0ebc5dSIlya Dryomov if (ret < 0 || ret > CEPH_MAX_OID_NAME_LEN) { 125265ccfe21SAlex Elder pr_err("error formatting segment name for #%llu (%d)\n", 125365ccfe21SAlex Elder segment, ret); 12547d5079aaSHimangi Saraogi rbd_segment_name_free(name); 125565ccfe21SAlex Elder name = NULL; 125665ccfe21SAlex Elder } 1257602adf40SYehuda Sadeh 125865ccfe21SAlex Elder return name; 125965ccfe21SAlex Elder } 1260602adf40SYehuda Sadeh 126165ccfe21SAlex Elder static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset) 126265ccfe21SAlex Elder { 126365ccfe21SAlex Elder u64 segment_size = (u64) 1 << rbd_dev->header.obj_order; 1264602adf40SYehuda Sadeh 126565ccfe21SAlex Elder return offset & (segment_size - 1); 126665ccfe21SAlex Elder } 126765ccfe21SAlex Elder 
126865ccfe21SAlex Elder static u64 rbd_segment_length(struct rbd_device *rbd_dev, 126965ccfe21SAlex Elder u64 offset, u64 length) 127065ccfe21SAlex Elder { 127165ccfe21SAlex Elder u64 segment_size = (u64) 1 << rbd_dev->header.obj_order; 127265ccfe21SAlex Elder 127365ccfe21SAlex Elder offset &= segment_size - 1; 127465ccfe21SAlex Elder 1275aafb230eSAlex Elder rbd_assert(length <= U64_MAX - offset); 127665ccfe21SAlex Elder if (offset + length > segment_size) 127765ccfe21SAlex Elder length = segment_size - offset; 127865ccfe21SAlex Elder 127965ccfe21SAlex Elder return length; 1280602adf40SYehuda Sadeh } 1281602adf40SYehuda Sadeh 1282602adf40SYehuda Sadeh /* 1283029bcbd8SJosh Durgin * returns the size of an object in the image 1284029bcbd8SJosh Durgin */ 1285029bcbd8SJosh Durgin static u64 rbd_obj_bytes(struct rbd_image_header *header) 1286029bcbd8SJosh Durgin { 1287029bcbd8SJosh Durgin return 1 << header->obj_order; 1288029bcbd8SJosh Durgin } 1289029bcbd8SJosh Durgin 1290029bcbd8SJosh Durgin /* 1291602adf40SYehuda Sadeh * bio helpers 1292602adf40SYehuda Sadeh */ 1293602adf40SYehuda Sadeh 1294602adf40SYehuda Sadeh static void bio_chain_put(struct bio *chain) 1295602adf40SYehuda Sadeh { 1296602adf40SYehuda Sadeh struct bio *tmp; 1297602adf40SYehuda Sadeh 1298602adf40SYehuda Sadeh while (chain) { 1299602adf40SYehuda Sadeh tmp = chain; 1300602adf40SYehuda Sadeh chain = chain->bi_next; 1301602adf40SYehuda Sadeh bio_put(tmp); 1302602adf40SYehuda Sadeh } 1303602adf40SYehuda Sadeh } 1304602adf40SYehuda Sadeh 1305602adf40SYehuda Sadeh /* 1306602adf40SYehuda Sadeh * zeros a bio chain, starting at specific offset 1307602adf40SYehuda Sadeh */ 1308602adf40SYehuda Sadeh static void zero_bio_chain(struct bio *chain, int start_ofs) 1309602adf40SYehuda Sadeh { 13107988613bSKent Overstreet struct bio_vec bv; 13117988613bSKent Overstreet struct bvec_iter iter; 1312602adf40SYehuda Sadeh unsigned long flags; 1313602adf40SYehuda Sadeh void *buf; 1314602adf40SYehuda Sadeh int pos = 0; 
1315602adf40SYehuda Sadeh 1316602adf40SYehuda Sadeh while (chain) { 13177988613bSKent Overstreet bio_for_each_segment(bv, chain, iter) { 13187988613bSKent Overstreet if (pos + bv.bv_len > start_ofs) { 1319602adf40SYehuda Sadeh int remainder = max(start_ofs - pos, 0); 13207988613bSKent Overstreet buf = bvec_kmap_irq(&bv, &flags); 1321602adf40SYehuda Sadeh memset(buf + remainder, 0, 13227988613bSKent Overstreet bv.bv_len - remainder); 13237988613bSKent Overstreet flush_dcache_page(bv.bv_page); 132485b5aaa6SDan Carpenter bvec_kunmap_irq(buf, &flags); 1325602adf40SYehuda Sadeh } 13267988613bSKent Overstreet pos += bv.bv_len; 1327602adf40SYehuda Sadeh } 1328602adf40SYehuda Sadeh 1329602adf40SYehuda Sadeh chain = chain->bi_next; 1330602adf40SYehuda Sadeh } 1331602adf40SYehuda Sadeh } 1332602adf40SYehuda Sadeh 1333602adf40SYehuda Sadeh /* 1334b9434c5bSAlex Elder * similar to zero_bio_chain(), zeros data defined by a page array, 1335b9434c5bSAlex Elder * starting at the given byte offset from the start of the array and 1336b9434c5bSAlex Elder * continuing up to the given end offset. The pages array is 1337b9434c5bSAlex Elder * assumed to be big enough to hold all bytes up to the end. 
1338b9434c5bSAlex Elder */ 1339b9434c5bSAlex Elder static void zero_pages(struct page **pages, u64 offset, u64 end) 1340b9434c5bSAlex Elder { 1341b9434c5bSAlex Elder struct page **page = &pages[offset >> PAGE_SHIFT]; 1342b9434c5bSAlex Elder 1343b9434c5bSAlex Elder rbd_assert(end > offset); 1344b9434c5bSAlex Elder rbd_assert(end - offset <= (u64)SIZE_MAX); 1345b9434c5bSAlex Elder while (offset < end) { 1346b9434c5bSAlex Elder size_t page_offset; 1347b9434c5bSAlex Elder size_t length; 1348b9434c5bSAlex Elder unsigned long flags; 1349b9434c5bSAlex Elder void *kaddr; 1350b9434c5bSAlex Elder 1351491205a8SGeert Uytterhoeven page_offset = offset & ~PAGE_MASK; 1352491205a8SGeert Uytterhoeven length = min_t(size_t, PAGE_SIZE - page_offset, end - offset); 1353b9434c5bSAlex Elder local_irq_save(flags); 1354b9434c5bSAlex Elder kaddr = kmap_atomic(*page); 1355b9434c5bSAlex Elder memset(kaddr + page_offset, 0, length); 1356e2156054SAlex Elder flush_dcache_page(*page); 1357b9434c5bSAlex Elder kunmap_atomic(kaddr); 1358b9434c5bSAlex Elder local_irq_restore(flags); 1359b9434c5bSAlex Elder 1360b9434c5bSAlex Elder offset += length; 1361b9434c5bSAlex Elder page++; 1362b9434c5bSAlex Elder } 1363b9434c5bSAlex Elder } 1364b9434c5bSAlex Elder 1365b9434c5bSAlex Elder /* 1366f7760dadSAlex Elder * Clone a portion of a bio, starting at the given byte offset 1367f7760dadSAlex Elder * and continuing for the number of bytes indicated. 
1368602adf40SYehuda Sadeh */ 1369f7760dadSAlex Elder static struct bio *bio_clone_range(struct bio *bio_src, 1370f7760dadSAlex Elder unsigned int offset, 1371f7760dadSAlex Elder unsigned int len, 1372f7760dadSAlex Elder gfp_t gfpmask) 1373602adf40SYehuda Sadeh { 1374f7760dadSAlex Elder struct bio *bio; 1375602adf40SYehuda Sadeh 13765341a627SKent Overstreet bio = bio_clone(bio_src, gfpmask); 1377f7760dadSAlex Elder if (!bio) 1378f7760dadSAlex Elder return NULL; /* ENOMEM */ 1379f7760dadSAlex Elder 13805341a627SKent Overstreet bio_advance(bio, offset); 13814f024f37SKent Overstreet bio->bi_iter.bi_size = len; 1382602adf40SYehuda Sadeh 1383f7760dadSAlex Elder return bio; 1384602adf40SYehuda Sadeh } 1385602adf40SYehuda Sadeh 1386f7760dadSAlex Elder /* 1387f7760dadSAlex Elder * Clone a portion of a bio chain, starting at the given byte offset 1388f7760dadSAlex Elder * into the first bio in the source chain and continuing for the 1389f7760dadSAlex Elder * number of bytes indicated. The result is another bio chain of 1390f7760dadSAlex Elder * exactly the given length, or a null pointer on error. 1391f7760dadSAlex Elder * 1392f7760dadSAlex Elder * The bio_src and offset parameters are both in-out. On entry they 1393f7760dadSAlex Elder * refer to the first source bio and the offset into that bio where 1394f7760dadSAlex Elder * the start of data to be cloned is located. 1395f7760dadSAlex Elder * 1396f7760dadSAlex Elder * On return, bio_src is updated to refer to the bio in the source 1397f7760dadSAlex Elder * chain that contains first un-cloned byte, and *offset will 1398f7760dadSAlex Elder * contain the offset of that byte within that bio. 
1399f7760dadSAlex Elder */ 1400f7760dadSAlex Elder static struct bio *bio_chain_clone_range(struct bio **bio_src, 1401f7760dadSAlex Elder unsigned int *offset, 1402f7760dadSAlex Elder unsigned int len, 1403f7760dadSAlex Elder gfp_t gfpmask) 1404f7760dadSAlex Elder { 1405f7760dadSAlex Elder struct bio *bi = *bio_src; 1406f7760dadSAlex Elder unsigned int off = *offset; 1407f7760dadSAlex Elder struct bio *chain = NULL; 1408f7760dadSAlex Elder struct bio **end; 1409602adf40SYehuda Sadeh 1410f7760dadSAlex Elder /* Build up a chain of clone bios up to the limit */ 1411602adf40SYehuda Sadeh 14124f024f37SKent Overstreet if (!bi || off >= bi->bi_iter.bi_size || !len) 1413f7760dadSAlex Elder return NULL; /* Nothing to clone */ 1414602adf40SYehuda Sadeh 1415f7760dadSAlex Elder end = &chain; 1416f7760dadSAlex Elder while (len) { 1417f7760dadSAlex Elder unsigned int bi_size; 1418f7760dadSAlex Elder struct bio *bio; 1419f7760dadSAlex Elder 1420f5400b7aSAlex Elder if (!bi) { 1421f5400b7aSAlex Elder rbd_warn(NULL, "bio_chain exhausted with %u left", len); 1422f7760dadSAlex Elder goto out_err; /* EINVAL; ran out of bio's */ 1423f5400b7aSAlex Elder } 14244f024f37SKent Overstreet bi_size = min_t(unsigned int, bi->bi_iter.bi_size - off, len); 1425f7760dadSAlex Elder bio = bio_clone_range(bi, off, bi_size, gfpmask); 1426f7760dadSAlex Elder if (!bio) 1427f7760dadSAlex Elder goto out_err; /* ENOMEM */ 1428f7760dadSAlex Elder 1429f7760dadSAlex Elder *end = bio; 1430f7760dadSAlex Elder end = &bio->bi_next; 1431f7760dadSAlex Elder 1432f7760dadSAlex Elder off += bi_size; 14334f024f37SKent Overstreet if (off == bi->bi_iter.bi_size) { 1434f7760dadSAlex Elder bi = bi->bi_next; 1435f7760dadSAlex Elder off = 0; 1436f7760dadSAlex Elder } 1437f7760dadSAlex Elder len -= bi_size; 1438f7760dadSAlex Elder } 1439f7760dadSAlex Elder *bio_src = bi; 1440f7760dadSAlex Elder *offset = off; 1441f7760dadSAlex Elder 1442f7760dadSAlex Elder return chain; 1443f7760dadSAlex Elder out_err: 1444f7760dadSAlex Elder 
bio_chain_put(chain); 1445f7760dadSAlex Elder 1446602adf40SYehuda Sadeh return NULL; 1447602adf40SYehuda Sadeh } 1448602adf40SYehuda Sadeh 1449926f9b3fSAlex Elder /* 1450926f9b3fSAlex Elder * The default/initial value for all object request flags is 0. For 1451926f9b3fSAlex Elder * each flag, once its value is set to 1 it is never reset to 0 1452926f9b3fSAlex Elder * again. 1453926f9b3fSAlex Elder */ 14546365d33aSAlex Elder static void obj_request_img_data_set(struct rbd_obj_request *obj_request) 14556365d33aSAlex Elder { 14566365d33aSAlex Elder if (test_and_set_bit(OBJ_REQ_IMG_DATA, &obj_request->flags)) { 14576365d33aSAlex Elder struct rbd_device *rbd_dev; 14586365d33aSAlex Elder 145957acbaa7SAlex Elder rbd_dev = obj_request->img_request->rbd_dev; 14609584d508SIlya Dryomov rbd_warn(rbd_dev, "obj_request %p already marked img_data", 14616365d33aSAlex Elder obj_request); 14626365d33aSAlex Elder } 14636365d33aSAlex Elder } 14646365d33aSAlex Elder 14656365d33aSAlex Elder static bool obj_request_img_data_test(struct rbd_obj_request *obj_request) 14666365d33aSAlex Elder { 14676365d33aSAlex Elder smp_mb(); 14686365d33aSAlex Elder return test_bit(OBJ_REQ_IMG_DATA, &obj_request->flags) != 0; 14696365d33aSAlex Elder } 14706365d33aSAlex Elder 147157acbaa7SAlex Elder static void obj_request_done_set(struct rbd_obj_request *obj_request) 147257acbaa7SAlex Elder { 147357acbaa7SAlex Elder if (test_and_set_bit(OBJ_REQ_DONE, &obj_request->flags)) { 147457acbaa7SAlex Elder struct rbd_device *rbd_dev = NULL; 147557acbaa7SAlex Elder 147657acbaa7SAlex Elder if (obj_request_img_data_test(obj_request)) 147757acbaa7SAlex Elder rbd_dev = obj_request->img_request->rbd_dev; 14789584d508SIlya Dryomov rbd_warn(rbd_dev, "obj_request %p already marked done", 147957acbaa7SAlex Elder obj_request); 148057acbaa7SAlex Elder } 148157acbaa7SAlex Elder } 148257acbaa7SAlex Elder 148357acbaa7SAlex Elder static bool obj_request_done_test(struct rbd_obj_request *obj_request) 148457acbaa7SAlex Elder { 
148557acbaa7SAlex Elder smp_mb(); 148657acbaa7SAlex Elder return test_bit(OBJ_REQ_DONE, &obj_request->flags) != 0; 148757acbaa7SAlex Elder } 148857acbaa7SAlex Elder 14895679c59fSAlex Elder /* 14905679c59fSAlex Elder * This sets the KNOWN flag after (possibly) setting the EXISTS 14915679c59fSAlex Elder * flag. The latter is set based on the "exists" value provided. 14925679c59fSAlex Elder * 14935679c59fSAlex Elder * Note that for our purposes once an object exists it never goes 14945679c59fSAlex Elder * away again. It's possible that the response from two existence 14955679c59fSAlex Elder * checks are separated by the creation of the target object, and 14965679c59fSAlex Elder * the first ("doesn't exist") response arrives *after* the second 14975679c59fSAlex Elder * ("does exist"). In that case we ignore the second one. 14985679c59fSAlex Elder */ 14995679c59fSAlex Elder static void obj_request_existence_set(struct rbd_obj_request *obj_request, 15005679c59fSAlex Elder bool exists) 15015679c59fSAlex Elder { 15025679c59fSAlex Elder if (exists) 15035679c59fSAlex Elder set_bit(OBJ_REQ_EXISTS, &obj_request->flags); 15045679c59fSAlex Elder set_bit(OBJ_REQ_KNOWN, &obj_request->flags); 15055679c59fSAlex Elder smp_mb(); 15065679c59fSAlex Elder } 15075679c59fSAlex Elder 15085679c59fSAlex Elder static bool obj_request_known_test(struct rbd_obj_request *obj_request) 15095679c59fSAlex Elder { 15105679c59fSAlex Elder smp_mb(); 15115679c59fSAlex Elder return test_bit(OBJ_REQ_KNOWN, &obj_request->flags) != 0; 15125679c59fSAlex Elder } 15135679c59fSAlex Elder 15145679c59fSAlex Elder static bool obj_request_exists_test(struct rbd_obj_request *obj_request) 15155679c59fSAlex Elder { 15165679c59fSAlex Elder smp_mb(); 15175679c59fSAlex Elder return test_bit(OBJ_REQ_EXISTS, &obj_request->flags) != 0; 15185679c59fSAlex Elder } 15195679c59fSAlex Elder 15209638556aSIlya Dryomov static bool obj_request_overlaps_parent(struct rbd_obj_request *obj_request) 15219638556aSIlya Dryomov { 
15229638556aSIlya Dryomov struct rbd_device *rbd_dev = obj_request->img_request->rbd_dev; 15239638556aSIlya Dryomov 15249638556aSIlya Dryomov return obj_request->img_offset < 15259638556aSIlya Dryomov round_up(rbd_dev->parent_overlap, rbd_obj_bytes(&rbd_dev->header)); 15269638556aSIlya Dryomov } 15279638556aSIlya Dryomov 1528bf0d5f50SAlex Elder static void rbd_obj_request_get(struct rbd_obj_request *obj_request) 1529bf0d5f50SAlex Elder { 153037206ee5SAlex Elder dout("%s: obj %p (was %d)\n", __func__, obj_request, 153137206ee5SAlex Elder atomic_read(&obj_request->kref.refcount)); 1532bf0d5f50SAlex Elder kref_get(&obj_request->kref); 1533bf0d5f50SAlex Elder } 1534bf0d5f50SAlex Elder 1535bf0d5f50SAlex Elder static void rbd_obj_request_destroy(struct kref *kref); 1536bf0d5f50SAlex Elder static void rbd_obj_request_put(struct rbd_obj_request *obj_request) 1537bf0d5f50SAlex Elder { 1538bf0d5f50SAlex Elder rbd_assert(obj_request != NULL); 153937206ee5SAlex Elder dout("%s: obj %p (was %d)\n", __func__, obj_request, 154037206ee5SAlex Elder atomic_read(&obj_request->kref.refcount)); 1541bf0d5f50SAlex Elder kref_put(&obj_request->kref, rbd_obj_request_destroy); 1542bf0d5f50SAlex Elder } 1543bf0d5f50SAlex Elder 15440f2d5be7SAlex Elder static void rbd_img_request_get(struct rbd_img_request *img_request) 15450f2d5be7SAlex Elder { 15460f2d5be7SAlex Elder dout("%s: img %p (was %d)\n", __func__, img_request, 15470f2d5be7SAlex Elder atomic_read(&img_request->kref.refcount)); 15480f2d5be7SAlex Elder kref_get(&img_request->kref); 15490f2d5be7SAlex Elder } 15500f2d5be7SAlex Elder 1551e93f3152SAlex Elder static bool img_request_child_test(struct rbd_img_request *img_request); 1552e93f3152SAlex Elder static void rbd_parent_request_destroy(struct kref *kref); 1553bf0d5f50SAlex Elder static void rbd_img_request_destroy(struct kref *kref); 1554bf0d5f50SAlex Elder static void rbd_img_request_put(struct rbd_img_request *img_request) 1555bf0d5f50SAlex Elder { 1556bf0d5f50SAlex Elder 
rbd_assert(img_request != NULL); 155737206ee5SAlex Elder dout("%s: img %p (was %d)\n", __func__, img_request, 155837206ee5SAlex Elder atomic_read(&img_request->kref.refcount)); 1559e93f3152SAlex Elder if (img_request_child_test(img_request)) 1560e93f3152SAlex Elder kref_put(&img_request->kref, rbd_parent_request_destroy); 1561e93f3152SAlex Elder else 1562bf0d5f50SAlex Elder kref_put(&img_request->kref, rbd_img_request_destroy); 1563bf0d5f50SAlex Elder } 1564bf0d5f50SAlex Elder 1565bf0d5f50SAlex Elder static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request, 1566bf0d5f50SAlex Elder struct rbd_obj_request *obj_request) 1567bf0d5f50SAlex Elder { 156825dcf954SAlex Elder rbd_assert(obj_request->img_request == NULL); 156925dcf954SAlex Elder 1570b155e86cSAlex Elder /* Image request now owns object's original reference */ 1571bf0d5f50SAlex Elder obj_request->img_request = img_request; 157225dcf954SAlex Elder obj_request->which = img_request->obj_request_count; 15736365d33aSAlex Elder rbd_assert(!obj_request_img_data_test(obj_request)); 15746365d33aSAlex Elder obj_request_img_data_set(obj_request); 1575bf0d5f50SAlex Elder rbd_assert(obj_request->which != BAD_WHICH); 157625dcf954SAlex Elder img_request->obj_request_count++; 157725dcf954SAlex Elder list_add_tail(&obj_request->links, &img_request->obj_requests); 157837206ee5SAlex Elder dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request, 157937206ee5SAlex Elder obj_request->which); 1580bf0d5f50SAlex Elder } 1581bf0d5f50SAlex Elder 1582bf0d5f50SAlex Elder static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request, 1583bf0d5f50SAlex Elder struct rbd_obj_request *obj_request) 1584bf0d5f50SAlex Elder { 1585bf0d5f50SAlex Elder rbd_assert(obj_request->which != BAD_WHICH); 158625dcf954SAlex Elder 158737206ee5SAlex Elder dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request, 158837206ee5SAlex Elder obj_request->which); 1589bf0d5f50SAlex Elder 
list_del(&obj_request->links); 159025dcf954SAlex Elder rbd_assert(img_request->obj_request_count > 0); 159125dcf954SAlex Elder img_request->obj_request_count--; 159225dcf954SAlex Elder rbd_assert(obj_request->which == img_request->obj_request_count); 159325dcf954SAlex Elder obj_request->which = BAD_WHICH; 15946365d33aSAlex Elder rbd_assert(obj_request_img_data_test(obj_request)); 1595bf0d5f50SAlex Elder rbd_assert(obj_request->img_request == img_request); 1596bf0d5f50SAlex Elder obj_request->img_request = NULL; 159725dcf954SAlex Elder obj_request->callback = NULL; 1598bf0d5f50SAlex Elder rbd_obj_request_put(obj_request); 1599bf0d5f50SAlex Elder } 1600bf0d5f50SAlex Elder 1601bf0d5f50SAlex Elder static bool obj_request_type_valid(enum obj_request_type type) 1602bf0d5f50SAlex Elder { 1603bf0d5f50SAlex Elder switch (type) { 16049969ebc5SAlex Elder case OBJ_REQUEST_NODATA: 1605bf0d5f50SAlex Elder case OBJ_REQUEST_BIO: 1606788e2df3SAlex Elder case OBJ_REQUEST_PAGES: 1607bf0d5f50SAlex Elder return true; 1608bf0d5f50SAlex Elder default: 1609bf0d5f50SAlex Elder return false; 1610bf0d5f50SAlex Elder } 1611bf0d5f50SAlex Elder } 1612bf0d5f50SAlex Elder 1613bf0d5f50SAlex Elder static int rbd_obj_request_submit(struct ceph_osd_client *osdc, 1614bf0d5f50SAlex Elder struct rbd_obj_request *obj_request) 1615bf0d5f50SAlex Elder { 161671c20a06SIlya Dryomov dout("%s %p\n", __func__, obj_request); 1617bf0d5f50SAlex Elder return ceph_osdc_start_request(osdc, obj_request->osd_req, false); 1618bf0d5f50SAlex Elder } 1619bf0d5f50SAlex Elder 162071c20a06SIlya Dryomov static void rbd_obj_request_end(struct rbd_obj_request *obj_request) 162171c20a06SIlya Dryomov { 162271c20a06SIlya Dryomov dout("%s %p\n", __func__, obj_request); 162371c20a06SIlya Dryomov ceph_osdc_cancel_request(obj_request->osd_req); 162471c20a06SIlya Dryomov } 162571c20a06SIlya Dryomov 162671c20a06SIlya Dryomov /* 162771c20a06SIlya Dryomov * Wait for an object request to complete. 
If interrupted, cancel the 162871c20a06SIlya Dryomov * underlying osd request. 16292894e1d7SIlya Dryomov * 16302894e1d7SIlya Dryomov * @timeout: in jiffies, 0 means "wait forever" 163171c20a06SIlya Dryomov */ 16322894e1d7SIlya Dryomov static int __rbd_obj_request_wait(struct rbd_obj_request *obj_request, 16332894e1d7SIlya Dryomov unsigned long timeout) 163471c20a06SIlya Dryomov { 16352894e1d7SIlya Dryomov long ret; 163671c20a06SIlya Dryomov 163771c20a06SIlya Dryomov dout("%s %p\n", __func__, obj_request); 16382894e1d7SIlya Dryomov ret = wait_for_completion_interruptible_timeout( 16392894e1d7SIlya Dryomov &obj_request->completion, 16402894e1d7SIlya Dryomov ceph_timeout_jiffies(timeout)); 16412894e1d7SIlya Dryomov if (ret <= 0) { 16422894e1d7SIlya Dryomov if (ret == 0) 16432894e1d7SIlya Dryomov ret = -ETIMEDOUT; 164471c20a06SIlya Dryomov rbd_obj_request_end(obj_request); 16452894e1d7SIlya Dryomov } else { 16462894e1d7SIlya Dryomov ret = 0; 16472894e1d7SIlya Dryomov } 16482894e1d7SIlya Dryomov 16492894e1d7SIlya Dryomov dout("%s %p ret %d\n", __func__, obj_request, (int)ret); 165071c20a06SIlya Dryomov return ret; 165171c20a06SIlya Dryomov } 165271c20a06SIlya Dryomov 16532894e1d7SIlya Dryomov static int rbd_obj_request_wait(struct rbd_obj_request *obj_request) 16542894e1d7SIlya Dryomov { 16552894e1d7SIlya Dryomov return __rbd_obj_request_wait(obj_request, 0); 16562894e1d7SIlya Dryomov } 16572894e1d7SIlya Dryomov 1658bf0d5f50SAlex Elder static void rbd_img_request_complete(struct rbd_img_request *img_request) 1659bf0d5f50SAlex Elder { 166055f27e09SAlex Elder 166137206ee5SAlex Elder dout("%s: img %p\n", __func__, img_request); 166255f27e09SAlex Elder 166355f27e09SAlex Elder /* 166455f27e09SAlex Elder * If no error occurred, compute the aggregate transfer 166555f27e09SAlex Elder * count for the image request. 
We could instead use 166655f27e09SAlex Elder * atomic64_cmpxchg() to update it as each object request 166755f27e09SAlex Elder * completes; not clear which way is better off hand. 166855f27e09SAlex Elder */ 166955f27e09SAlex Elder if (!img_request->result) { 167055f27e09SAlex Elder struct rbd_obj_request *obj_request; 167155f27e09SAlex Elder u64 xferred = 0; 167255f27e09SAlex Elder 167355f27e09SAlex Elder for_each_obj_request(img_request, obj_request) 167455f27e09SAlex Elder xferred += obj_request->xferred; 167555f27e09SAlex Elder img_request->xferred = xferred; 167655f27e09SAlex Elder } 167755f27e09SAlex Elder 1678bf0d5f50SAlex Elder if (img_request->callback) 1679bf0d5f50SAlex Elder img_request->callback(img_request); 1680bf0d5f50SAlex Elder else 1681bf0d5f50SAlex Elder rbd_img_request_put(img_request); 1682bf0d5f50SAlex Elder } 1683bf0d5f50SAlex Elder 16840c425248SAlex Elder /* 16850c425248SAlex Elder * The default/initial value for all image request flags is 0. Each 16860c425248SAlex Elder * is conditionally set to 1 at image request initialization time 16870c425248SAlex Elder * and currently never change thereafter. 
16880c425248SAlex Elder */ 16890c425248SAlex Elder static void img_request_write_set(struct rbd_img_request *img_request) 16900c425248SAlex Elder { 16910c425248SAlex Elder set_bit(IMG_REQ_WRITE, &img_request->flags); 16920c425248SAlex Elder smp_mb(); 16930c425248SAlex Elder } 16940c425248SAlex Elder 16950c425248SAlex Elder static bool img_request_write_test(struct rbd_img_request *img_request) 16960c425248SAlex Elder { 16970c425248SAlex Elder smp_mb(); 16980c425248SAlex Elder return test_bit(IMG_REQ_WRITE, &img_request->flags) != 0; 16990c425248SAlex Elder } 17000c425248SAlex Elder 170190e98c52SGuangliang Zhao /* 170290e98c52SGuangliang Zhao * Set the discard flag when the img_request is an discard request 170390e98c52SGuangliang Zhao */ 170490e98c52SGuangliang Zhao static void img_request_discard_set(struct rbd_img_request *img_request) 170590e98c52SGuangliang Zhao { 170690e98c52SGuangliang Zhao set_bit(IMG_REQ_DISCARD, &img_request->flags); 170790e98c52SGuangliang Zhao smp_mb(); 170890e98c52SGuangliang Zhao } 170990e98c52SGuangliang Zhao 171090e98c52SGuangliang Zhao static bool img_request_discard_test(struct rbd_img_request *img_request) 171190e98c52SGuangliang Zhao { 171290e98c52SGuangliang Zhao smp_mb(); 171390e98c52SGuangliang Zhao return test_bit(IMG_REQ_DISCARD, &img_request->flags) != 0; 171490e98c52SGuangliang Zhao } 171590e98c52SGuangliang Zhao 17169849e986SAlex Elder static void img_request_child_set(struct rbd_img_request *img_request) 17179849e986SAlex Elder { 17189849e986SAlex Elder set_bit(IMG_REQ_CHILD, &img_request->flags); 17199849e986SAlex Elder smp_mb(); 17209849e986SAlex Elder } 17219849e986SAlex Elder 1722e93f3152SAlex Elder static void img_request_child_clear(struct rbd_img_request *img_request) 1723e93f3152SAlex Elder { 1724e93f3152SAlex Elder clear_bit(IMG_REQ_CHILD, &img_request->flags); 1725e93f3152SAlex Elder smp_mb(); 1726e93f3152SAlex Elder } 1727e93f3152SAlex Elder 17289849e986SAlex Elder static bool img_request_child_test(struct 
rbd_img_request *img_request) 17299849e986SAlex Elder { 17309849e986SAlex Elder smp_mb(); 17319849e986SAlex Elder return test_bit(IMG_REQ_CHILD, &img_request->flags) != 0; 17329849e986SAlex Elder } 17339849e986SAlex Elder 1734d0b2e944SAlex Elder static void img_request_layered_set(struct rbd_img_request *img_request) 1735d0b2e944SAlex Elder { 1736d0b2e944SAlex Elder set_bit(IMG_REQ_LAYERED, &img_request->flags); 1737d0b2e944SAlex Elder smp_mb(); 1738d0b2e944SAlex Elder } 1739d0b2e944SAlex Elder 1740a2acd00eSAlex Elder static void img_request_layered_clear(struct rbd_img_request *img_request) 1741a2acd00eSAlex Elder { 1742a2acd00eSAlex Elder clear_bit(IMG_REQ_LAYERED, &img_request->flags); 1743a2acd00eSAlex Elder smp_mb(); 1744a2acd00eSAlex Elder } 1745a2acd00eSAlex Elder 1746d0b2e944SAlex Elder static bool img_request_layered_test(struct rbd_img_request *img_request) 1747d0b2e944SAlex Elder { 1748d0b2e944SAlex Elder smp_mb(); 1749d0b2e944SAlex Elder return test_bit(IMG_REQ_LAYERED, &img_request->flags) != 0; 1750d0b2e944SAlex Elder } 1751d0b2e944SAlex Elder 17523b434a2aSJosh Durgin static enum obj_operation_type 17533b434a2aSJosh Durgin rbd_img_request_op_type(struct rbd_img_request *img_request) 17543b434a2aSJosh Durgin { 17553b434a2aSJosh Durgin if (img_request_write_test(img_request)) 17563b434a2aSJosh Durgin return OBJ_OP_WRITE; 17573b434a2aSJosh Durgin else if (img_request_discard_test(img_request)) 17583b434a2aSJosh Durgin return OBJ_OP_DISCARD; 17593b434a2aSJosh Durgin else 17603b434a2aSJosh Durgin return OBJ_OP_READ; 17613b434a2aSJosh Durgin } 17623b434a2aSJosh Durgin 17636e2a4505SAlex Elder static void 17646e2a4505SAlex Elder rbd_img_obj_request_read_callback(struct rbd_obj_request *obj_request) 17656e2a4505SAlex Elder { 1766b9434c5bSAlex Elder u64 xferred = obj_request->xferred; 1767b9434c5bSAlex Elder u64 length = obj_request->length; 1768b9434c5bSAlex Elder 17696e2a4505SAlex Elder dout("%s: obj %p img %p result %d %llu/%llu\n", __func__, 
17706e2a4505SAlex Elder obj_request, obj_request->img_request, obj_request->result, 1771b9434c5bSAlex Elder xferred, length); 17726e2a4505SAlex Elder /* 177317c1cc1dSJosh Durgin * ENOENT means a hole in the image. We zero-fill the entire 177417c1cc1dSJosh Durgin * length of the request. A short read also implies zero-fill 177517c1cc1dSJosh Durgin * to the end of the request. An error requires the whole 177617c1cc1dSJosh Durgin * length of the request to be reported finished with an error 177717c1cc1dSJosh Durgin * to the block layer. In each case we update the xferred 177817c1cc1dSJosh Durgin * count to indicate the whole request was satisfied. 17796e2a4505SAlex Elder */ 1780b9434c5bSAlex Elder rbd_assert(obj_request->type != OBJ_REQUEST_NODATA); 17816e2a4505SAlex Elder if (obj_request->result == -ENOENT) { 1782b9434c5bSAlex Elder if (obj_request->type == OBJ_REQUEST_BIO) 17836e2a4505SAlex Elder zero_bio_chain(obj_request->bio_list, 0); 1784b9434c5bSAlex Elder else 1785b9434c5bSAlex Elder zero_pages(obj_request->pages, 0, length); 17866e2a4505SAlex Elder obj_request->result = 0; 1787b9434c5bSAlex Elder } else if (xferred < length && !obj_request->result) { 1788b9434c5bSAlex Elder if (obj_request->type == OBJ_REQUEST_BIO) 1789b9434c5bSAlex Elder zero_bio_chain(obj_request->bio_list, xferred); 1790b9434c5bSAlex Elder else 1791b9434c5bSAlex Elder zero_pages(obj_request->pages, xferred, length); 17926e2a4505SAlex Elder } 179317c1cc1dSJosh Durgin obj_request->xferred = length; 17946e2a4505SAlex Elder obj_request_done_set(obj_request); 17956e2a4505SAlex Elder } 17966e2a4505SAlex Elder 1797bf0d5f50SAlex Elder static void rbd_obj_request_complete(struct rbd_obj_request *obj_request) 1798bf0d5f50SAlex Elder { 179937206ee5SAlex Elder dout("%s: obj %p cb %p\n", __func__, obj_request, 180037206ee5SAlex Elder obj_request->callback); 1801bf0d5f50SAlex Elder if (obj_request->callback) 1802bf0d5f50SAlex Elder obj_request->callback(obj_request); 1803788e2df3SAlex Elder else 
1804788e2df3SAlex Elder complete_all(&obj_request->completion); 1805bf0d5f50SAlex Elder } 1806bf0d5f50SAlex Elder 1807c47f9371SAlex Elder static void rbd_osd_read_callback(struct rbd_obj_request *obj_request) 1808bf0d5f50SAlex Elder { 180957acbaa7SAlex Elder struct rbd_img_request *img_request = NULL; 1810a9e8ba2cSAlex Elder struct rbd_device *rbd_dev = NULL; 181157acbaa7SAlex Elder bool layered = false; 181257acbaa7SAlex Elder 181357acbaa7SAlex Elder if (obj_request_img_data_test(obj_request)) { 181457acbaa7SAlex Elder img_request = obj_request->img_request; 181557acbaa7SAlex Elder layered = img_request && img_request_layered_test(img_request); 1816a9e8ba2cSAlex Elder rbd_dev = img_request->rbd_dev; 181757acbaa7SAlex Elder } 18188b3e1a56SAlex Elder 18198b3e1a56SAlex Elder dout("%s: obj %p img %p result %d %llu/%llu\n", __func__, 18208b3e1a56SAlex Elder obj_request, img_request, obj_request->result, 18218b3e1a56SAlex Elder obj_request->xferred, obj_request->length); 1822a9e8ba2cSAlex Elder if (layered && obj_request->result == -ENOENT && 1823a9e8ba2cSAlex Elder obj_request->img_offset < rbd_dev->parent_overlap) 18248b3e1a56SAlex Elder rbd_img_parent_read(obj_request); 18258b3e1a56SAlex Elder else if (img_request) 18266e2a4505SAlex Elder rbd_img_obj_request_read_callback(obj_request); 18276e2a4505SAlex Elder else 182807741308SAlex Elder obj_request_done_set(obj_request); 1829bf0d5f50SAlex Elder } 1830bf0d5f50SAlex Elder 1831c47f9371SAlex Elder static void rbd_osd_write_callback(struct rbd_obj_request *obj_request) 1832bf0d5f50SAlex Elder { 18331b83bef2SSage Weil dout("%s: obj %p result %d %llu\n", __func__, obj_request, 18341b83bef2SSage Weil obj_request->result, obj_request->length); 18351b83bef2SSage Weil /* 18368b3e1a56SAlex Elder * There is no such thing as a successful short write. Set 18378b3e1a56SAlex Elder * it to our originally-requested length. 
18381b83bef2SSage Weil */ 18391b83bef2SSage Weil obj_request->xferred = obj_request->length; 184007741308SAlex Elder obj_request_done_set(obj_request); 1841bf0d5f50SAlex Elder } 1842bf0d5f50SAlex Elder 184390e98c52SGuangliang Zhao static void rbd_osd_discard_callback(struct rbd_obj_request *obj_request) 184490e98c52SGuangliang Zhao { 184590e98c52SGuangliang Zhao dout("%s: obj %p result %d %llu\n", __func__, obj_request, 184690e98c52SGuangliang Zhao obj_request->result, obj_request->length); 184790e98c52SGuangliang Zhao /* 184890e98c52SGuangliang Zhao * There is no such thing as a successful short discard. Set 184990e98c52SGuangliang Zhao * it to our originally-requested length. 185090e98c52SGuangliang Zhao */ 185190e98c52SGuangliang Zhao obj_request->xferred = obj_request->length; 1852d0265de7SJosh Durgin /* discarding a non-existent object is not a problem */ 1853d0265de7SJosh Durgin if (obj_request->result == -ENOENT) 1854d0265de7SJosh Durgin obj_request->result = 0; 185590e98c52SGuangliang Zhao obj_request_done_set(obj_request); 185690e98c52SGuangliang Zhao } 185790e98c52SGuangliang Zhao 1858fbfab539SAlex Elder /* 1859fbfab539SAlex Elder * For a simple stat call there's nothing to do. We'll do more if 1860fbfab539SAlex Elder * this is part of a write sequence for a layered image. 
1861fbfab539SAlex Elder */ 1862c47f9371SAlex Elder static void rbd_osd_stat_callback(struct rbd_obj_request *obj_request) 1863fbfab539SAlex Elder { 186437206ee5SAlex Elder dout("%s: obj %p\n", __func__, obj_request); 1865fbfab539SAlex Elder obj_request_done_set(obj_request); 1866fbfab539SAlex Elder } 1867fbfab539SAlex Elder 18682761713dSIlya Dryomov static void rbd_osd_call_callback(struct rbd_obj_request *obj_request) 18692761713dSIlya Dryomov { 18702761713dSIlya Dryomov dout("%s: obj %p\n", __func__, obj_request); 18712761713dSIlya Dryomov 18722761713dSIlya Dryomov if (obj_request_img_data_test(obj_request)) 18732761713dSIlya Dryomov rbd_osd_copyup_callback(obj_request); 18742761713dSIlya Dryomov else 18752761713dSIlya Dryomov obj_request_done_set(obj_request); 18762761713dSIlya Dryomov } 18772761713dSIlya Dryomov 187885e084feSIlya Dryomov static void rbd_osd_req_callback(struct ceph_osd_request *osd_req) 1879bf0d5f50SAlex Elder { 1880bf0d5f50SAlex Elder struct rbd_obj_request *obj_request = osd_req->r_priv; 1881bf0d5f50SAlex Elder u16 opcode; 1882bf0d5f50SAlex Elder 188385e084feSIlya Dryomov dout("%s: osd_req %p\n", __func__, osd_req); 1884bf0d5f50SAlex Elder rbd_assert(osd_req == obj_request->osd_req); 188557acbaa7SAlex Elder if (obj_request_img_data_test(obj_request)) { 188657acbaa7SAlex Elder rbd_assert(obj_request->img_request); 188757acbaa7SAlex Elder rbd_assert(obj_request->which != BAD_WHICH); 188857acbaa7SAlex Elder } else { 188957acbaa7SAlex Elder rbd_assert(obj_request->which == BAD_WHICH); 189057acbaa7SAlex Elder } 1891bf0d5f50SAlex Elder 18921b83bef2SSage Weil if (osd_req->r_result < 0) 18931b83bef2SSage Weil obj_request->result = osd_req->r_result; 1894bf0d5f50SAlex Elder 1895c47f9371SAlex Elder /* 1896c47f9371SAlex Elder * We support a 64-bit length, but ultimately it has to be 18977ad18afaSChristoph Hellwig * passed to the block layer, which just supports a 32-bit 18987ad18afaSChristoph Hellwig * length field. 
1899c47f9371SAlex Elder */ 19007665d85bSYan, Zheng obj_request->xferred = osd_req->r_ops[0].outdata_len; 1901c47f9371SAlex Elder rbd_assert(obj_request->xferred < (u64)UINT_MAX); 19020ccd5926SIlya Dryomov 190379528734SAlex Elder opcode = osd_req->r_ops[0].op; 1904bf0d5f50SAlex Elder switch (opcode) { 1905bf0d5f50SAlex Elder case CEPH_OSD_OP_READ: 1906c47f9371SAlex Elder rbd_osd_read_callback(obj_request); 1907bf0d5f50SAlex Elder break; 19080ccd5926SIlya Dryomov case CEPH_OSD_OP_SETALLOCHINT: 1909e30b7577SIlya Dryomov rbd_assert(osd_req->r_ops[1].op == CEPH_OSD_OP_WRITE || 1910e30b7577SIlya Dryomov osd_req->r_ops[1].op == CEPH_OSD_OP_WRITEFULL); 19110ccd5926SIlya Dryomov /* fall through */ 1912bf0d5f50SAlex Elder case CEPH_OSD_OP_WRITE: 1913e30b7577SIlya Dryomov case CEPH_OSD_OP_WRITEFULL: 1914c47f9371SAlex Elder rbd_osd_write_callback(obj_request); 1915bf0d5f50SAlex Elder break; 1916fbfab539SAlex Elder case CEPH_OSD_OP_STAT: 1917c47f9371SAlex Elder rbd_osd_stat_callback(obj_request); 1918fbfab539SAlex Elder break; 191990e98c52SGuangliang Zhao case CEPH_OSD_OP_DELETE: 192090e98c52SGuangliang Zhao case CEPH_OSD_OP_TRUNCATE: 192190e98c52SGuangliang Zhao case CEPH_OSD_OP_ZERO: 192290e98c52SGuangliang Zhao rbd_osd_discard_callback(obj_request); 192390e98c52SGuangliang Zhao break; 192436be9a76SAlex Elder case CEPH_OSD_OP_CALL: 19252761713dSIlya Dryomov rbd_osd_call_callback(obj_request); 19262761713dSIlya Dryomov break; 1927bf0d5f50SAlex Elder default: 19289584d508SIlya Dryomov rbd_warn(NULL, "%s: unsupported op %hu", 1929bf0d5f50SAlex Elder obj_request->object_name, (unsigned short) opcode); 1930bf0d5f50SAlex Elder break; 1931bf0d5f50SAlex Elder } 1932bf0d5f50SAlex Elder 193307741308SAlex Elder if (obj_request_done_test(obj_request)) 1934bf0d5f50SAlex Elder rbd_obj_request_complete(obj_request); 1935bf0d5f50SAlex Elder } 1936bf0d5f50SAlex Elder 19379d4df01fSAlex Elder static void rbd_osd_req_format_read(struct rbd_obj_request *obj_request) 1938430c28c3SAlex Elder { 
1939430c28c3SAlex Elder struct rbd_img_request *img_request = obj_request->img_request; 19408c042b0dSAlex Elder struct ceph_osd_request *osd_req = obj_request->osd_req; 1941430c28c3SAlex Elder 1942bb873b53SIlya Dryomov if (img_request) 1943bb873b53SIlya Dryomov osd_req->r_snapid = img_request->snap_id; 19449d4df01fSAlex Elder } 19459d4df01fSAlex Elder 19469d4df01fSAlex Elder static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request) 19479d4df01fSAlex Elder { 19489d4df01fSAlex Elder struct ceph_osd_request *osd_req = obj_request->osd_req; 19499d4df01fSAlex Elder 1950bb873b53SIlya Dryomov osd_req->r_mtime = CURRENT_TIME; 1951bb873b53SIlya Dryomov osd_req->r_data_offset = obj_request->offset; 1952430c28c3SAlex Elder } 1953430c28c3SAlex Elder 19540ccd5926SIlya Dryomov /* 19550ccd5926SIlya Dryomov * Create an osd request. A read request has one osd op (read). 19560ccd5926SIlya Dryomov * A write request has either one (watch) or two (hint+write) osd ops. 19570ccd5926SIlya Dryomov * (All rbd data writes are prefixed with an allocation hint op, but 19580ccd5926SIlya Dryomov * technically osd watch is a write request, hence this distinction.) 
19590ccd5926SIlya Dryomov */ 1960bf0d5f50SAlex Elder static struct ceph_osd_request *rbd_osd_req_create( 1961bf0d5f50SAlex Elder struct rbd_device *rbd_dev, 19626d2940c8SGuangliang Zhao enum obj_operation_type op_type, 1963deb236b3SIlya Dryomov unsigned int num_ops, 1964430c28c3SAlex Elder struct rbd_obj_request *obj_request) 1965bf0d5f50SAlex Elder { 1966bf0d5f50SAlex Elder struct ceph_snap_context *snapc = NULL; 1967bf0d5f50SAlex Elder struct ceph_osd_client *osdc; 1968bf0d5f50SAlex Elder struct ceph_osd_request *osd_req; 1969bf0d5f50SAlex Elder 197090e98c52SGuangliang Zhao if (obj_request_img_data_test(obj_request) && 197190e98c52SGuangliang Zhao (op_type == OBJ_OP_DISCARD || op_type == OBJ_OP_WRITE)) { 19726365d33aSAlex Elder struct rbd_img_request *img_request = obj_request->img_request; 197390e98c52SGuangliang Zhao if (op_type == OBJ_OP_WRITE) { 19746d2940c8SGuangliang Zhao rbd_assert(img_request_write_test(img_request)); 197590e98c52SGuangliang Zhao } else { 197690e98c52SGuangliang Zhao rbd_assert(img_request_discard_test(img_request)); 197790e98c52SGuangliang Zhao } 1978bf0d5f50SAlex Elder snapc = img_request->snapc; 1979bf0d5f50SAlex Elder } 1980bf0d5f50SAlex Elder 19816d2940c8SGuangliang Zhao rbd_assert(num_ops == 1 || ((op_type == OBJ_OP_WRITE) && num_ops == 2)); 1982deb236b3SIlya Dryomov 1983deb236b3SIlya Dryomov /* Allocate and initialize the request, for the num_ops ops */ 1984bf0d5f50SAlex Elder 1985bf0d5f50SAlex Elder osdc = &rbd_dev->rbd_client->client->osdc; 1986deb236b3SIlya Dryomov osd_req = ceph_osdc_alloc_request(osdc, snapc, num_ops, false, 19872224d879SDavid Disseldorp GFP_NOIO); 1988bf0d5f50SAlex Elder if (!osd_req) 198913d1ad16SIlya Dryomov goto fail; 1990bf0d5f50SAlex Elder 199190e98c52SGuangliang Zhao if (op_type == OBJ_OP_WRITE || op_type == OBJ_OP_DISCARD) 1992bf0d5f50SAlex Elder osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK; 1993430c28c3SAlex Elder else 1994bf0d5f50SAlex Elder osd_req->r_flags = CEPH_OSD_FLAG_READ; 
1995bf0d5f50SAlex Elder 1996bf0d5f50SAlex Elder osd_req->r_callback = rbd_osd_req_callback; 1997bf0d5f50SAlex Elder osd_req->r_priv = obj_request; 1998bf0d5f50SAlex Elder 19997627151eSYan, Zheng osd_req->r_base_oloc.pool = rbd_dev->layout.pool_id; 2000d30291b9SIlya Dryomov if (ceph_oid_aprintf(&osd_req->r_base_oid, GFP_NOIO, "%s", 2001d30291b9SIlya Dryomov obj_request->object_name)) 2002d30291b9SIlya Dryomov goto fail; 2003bf0d5f50SAlex Elder 200413d1ad16SIlya Dryomov if (ceph_osdc_alloc_messages(osd_req, GFP_NOIO)) 200513d1ad16SIlya Dryomov goto fail; 200613d1ad16SIlya Dryomov 2007bf0d5f50SAlex Elder return osd_req; 200813d1ad16SIlya Dryomov 200913d1ad16SIlya Dryomov fail: 201013d1ad16SIlya Dryomov ceph_osdc_put_request(osd_req); 201113d1ad16SIlya Dryomov return NULL; 2012bf0d5f50SAlex Elder } 2013bf0d5f50SAlex Elder 20140eefd470SAlex Elder /* 2015d3246fb0SJosh Durgin * Create a copyup osd request based on the information in the object 2016d3246fb0SJosh Durgin * request supplied. A copyup request has two or three osd ops, a 2017d3246fb0SJosh Durgin * copyup method call, potentially a hint op, and a write or truncate 2018d3246fb0SJosh Durgin * or zero op. 
20190eefd470SAlex Elder */ 20200eefd470SAlex Elder static struct ceph_osd_request * 20210eefd470SAlex Elder rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request) 20220eefd470SAlex Elder { 20230eefd470SAlex Elder struct rbd_img_request *img_request; 20240eefd470SAlex Elder struct ceph_snap_context *snapc; 20250eefd470SAlex Elder struct rbd_device *rbd_dev; 20260eefd470SAlex Elder struct ceph_osd_client *osdc; 20270eefd470SAlex Elder struct ceph_osd_request *osd_req; 2028d3246fb0SJosh Durgin int num_osd_ops = 3; 20290eefd470SAlex Elder 20300eefd470SAlex Elder rbd_assert(obj_request_img_data_test(obj_request)); 20310eefd470SAlex Elder img_request = obj_request->img_request; 20320eefd470SAlex Elder rbd_assert(img_request); 2033d3246fb0SJosh Durgin rbd_assert(img_request_write_test(img_request) || 2034d3246fb0SJosh Durgin img_request_discard_test(img_request)); 20350eefd470SAlex Elder 2036d3246fb0SJosh Durgin if (img_request_discard_test(img_request)) 2037d3246fb0SJosh Durgin num_osd_ops = 2; 2038d3246fb0SJosh Durgin 2039d3246fb0SJosh Durgin /* Allocate and initialize the request, for all the ops */ 20400eefd470SAlex Elder 20410eefd470SAlex Elder snapc = img_request->snapc; 20420eefd470SAlex Elder rbd_dev = img_request->rbd_dev; 20430eefd470SAlex Elder osdc = &rbd_dev->rbd_client->client->osdc; 2044d3246fb0SJosh Durgin osd_req = ceph_osdc_alloc_request(osdc, snapc, num_osd_ops, 20452224d879SDavid Disseldorp false, GFP_NOIO); 20460eefd470SAlex Elder if (!osd_req) 204713d1ad16SIlya Dryomov goto fail; 20480eefd470SAlex Elder 20490eefd470SAlex Elder osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK; 20500eefd470SAlex Elder osd_req->r_callback = rbd_osd_req_callback; 20510eefd470SAlex Elder osd_req->r_priv = obj_request; 20520eefd470SAlex Elder 20537627151eSYan, Zheng osd_req->r_base_oloc.pool = rbd_dev->layout.pool_id; 2054d30291b9SIlya Dryomov if (ceph_oid_aprintf(&osd_req->r_base_oid, GFP_NOIO, "%s", 2055d30291b9SIlya Dryomov 
obj_request->object_name)) 2056d30291b9SIlya Dryomov goto fail; 20570eefd470SAlex Elder 205813d1ad16SIlya Dryomov if (ceph_osdc_alloc_messages(osd_req, GFP_NOIO)) 205913d1ad16SIlya Dryomov goto fail; 206013d1ad16SIlya Dryomov 20610eefd470SAlex Elder return osd_req; 206213d1ad16SIlya Dryomov 206313d1ad16SIlya Dryomov fail: 206413d1ad16SIlya Dryomov ceph_osdc_put_request(osd_req); 206513d1ad16SIlya Dryomov return NULL; 20660eefd470SAlex Elder } 20670eefd470SAlex Elder 20680eefd470SAlex Elder 2069bf0d5f50SAlex Elder static void rbd_osd_req_destroy(struct ceph_osd_request *osd_req) 2070bf0d5f50SAlex Elder { 2071bf0d5f50SAlex Elder ceph_osdc_put_request(osd_req); 2072bf0d5f50SAlex Elder } 2073bf0d5f50SAlex Elder 2074bf0d5f50SAlex Elder /* object_name is assumed to be a non-null pointer and NUL-terminated */ 2075bf0d5f50SAlex Elder 2076bf0d5f50SAlex Elder static struct rbd_obj_request *rbd_obj_request_create(const char *object_name, 2077bf0d5f50SAlex Elder u64 offset, u64 length, 2078bf0d5f50SAlex Elder enum obj_request_type type) 2079bf0d5f50SAlex Elder { 2080bf0d5f50SAlex Elder struct rbd_obj_request *obj_request; 2081bf0d5f50SAlex Elder size_t size; 2082bf0d5f50SAlex Elder char *name; 2083bf0d5f50SAlex Elder 2084bf0d5f50SAlex Elder rbd_assert(obj_request_type_valid(type)); 2085bf0d5f50SAlex Elder 2086bf0d5f50SAlex Elder size = strlen(object_name) + 1; 20875a60e876SIlya Dryomov name = kmalloc(size, GFP_NOIO); 2088f907ad55SAlex Elder if (!name) 2089bf0d5f50SAlex Elder return NULL; 2090bf0d5f50SAlex Elder 20915a60e876SIlya Dryomov obj_request = kmem_cache_zalloc(rbd_obj_request_cache, GFP_NOIO); 2092f907ad55SAlex Elder if (!obj_request) { 2093f907ad55SAlex Elder kfree(name); 2094f907ad55SAlex Elder return NULL; 2095f907ad55SAlex Elder } 2096f907ad55SAlex Elder 2097bf0d5f50SAlex Elder obj_request->object_name = memcpy(name, object_name, size); 2098bf0d5f50SAlex Elder obj_request->offset = offset; 2099bf0d5f50SAlex Elder obj_request->length = length; 2100926f9b3fSAlex 
Elder obj_request->flags = 0; 2101bf0d5f50SAlex Elder obj_request->which = BAD_WHICH; 2102bf0d5f50SAlex Elder obj_request->type = type; 2103bf0d5f50SAlex Elder INIT_LIST_HEAD(&obj_request->links); 2104788e2df3SAlex Elder init_completion(&obj_request->completion); 2105bf0d5f50SAlex Elder kref_init(&obj_request->kref); 2106bf0d5f50SAlex Elder 210737206ee5SAlex Elder dout("%s: \"%s\" %llu/%llu %d -> obj %p\n", __func__, object_name, 210837206ee5SAlex Elder offset, length, (int)type, obj_request); 210937206ee5SAlex Elder 2110bf0d5f50SAlex Elder return obj_request; 2111bf0d5f50SAlex Elder } 2112bf0d5f50SAlex Elder 2113bf0d5f50SAlex Elder static void rbd_obj_request_destroy(struct kref *kref) 2114bf0d5f50SAlex Elder { 2115bf0d5f50SAlex Elder struct rbd_obj_request *obj_request; 2116bf0d5f50SAlex Elder 2117bf0d5f50SAlex Elder obj_request = container_of(kref, struct rbd_obj_request, kref); 2118bf0d5f50SAlex Elder 211937206ee5SAlex Elder dout("%s: obj %p\n", __func__, obj_request); 212037206ee5SAlex Elder 2121bf0d5f50SAlex Elder rbd_assert(obj_request->img_request == NULL); 2122bf0d5f50SAlex Elder rbd_assert(obj_request->which == BAD_WHICH); 2123bf0d5f50SAlex Elder 2124bf0d5f50SAlex Elder if (obj_request->osd_req) 2125bf0d5f50SAlex Elder rbd_osd_req_destroy(obj_request->osd_req); 2126bf0d5f50SAlex Elder 2127bf0d5f50SAlex Elder rbd_assert(obj_request_type_valid(obj_request->type)); 2128bf0d5f50SAlex Elder switch (obj_request->type) { 21299969ebc5SAlex Elder case OBJ_REQUEST_NODATA: 21309969ebc5SAlex Elder break; /* Nothing to do */ 2131bf0d5f50SAlex Elder case OBJ_REQUEST_BIO: 2132bf0d5f50SAlex Elder if (obj_request->bio_list) 2133bf0d5f50SAlex Elder bio_chain_put(obj_request->bio_list); 2134bf0d5f50SAlex Elder break; 2135788e2df3SAlex Elder case OBJ_REQUEST_PAGES: 2136788e2df3SAlex Elder if (obj_request->pages) 2137788e2df3SAlex Elder ceph_release_page_vector(obj_request->pages, 2138788e2df3SAlex Elder obj_request->page_count); 2139788e2df3SAlex Elder break; 
2140bf0d5f50SAlex Elder } 2141bf0d5f50SAlex Elder 2142f907ad55SAlex Elder kfree(obj_request->object_name); 2143868311b1SAlex Elder obj_request->object_name = NULL; 2144868311b1SAlex Elder kmem_cache_free(rbd_obj_request_cache, obj_request); 2145bf0d5f50SAlex Elder } 2146bf0d5f50SAlex Elder 2147fb65d228SAlex Elder /* It's OK to call this for a device with no parent */ 2148fb65d228SAlex Elder 2149fb65d228SAlex Elder static void rbd_spec_put(struct rbd_spec *spec); 2150fb65d228SAlex Elder static void rbd_dev_unparent(struct rbd_device *rbd_dev) 2151fb65d228SAlex Elder { 2152fb65d228SAlex Elder rbd_dev_remove_parent(rbd_dev); 2153fb65d228SAlex Elder rbd_spec_put(rbd_dev->parent_spec); 2154fb65d228SAlex Elder rbd_dev->parent_spec = NULL; 2155fb65d228SAlex Elder rbd_dev->parent_overlap = 0; 2156fb65d228SAlex Elder } 2157fb65d228SAlex Elder 2158bf0d5f50SAlex Elder /* 2159a2acd00eSAlex Elder * Parent image reference counting is used to determine when an 2160a2acd00eSAlex Elder * image's parent fields can be safely torn down--after there are no 2161a2acd00eSAlex Elder * more in-flight requests to the parent image. When the last 2162a2acd00eSAlex Elder * reference is dropped, cleaning them up is safe. 
2163a2acd00eSAlex Elder */ 2164a2acd00eSAlex Elder static void rbd_dev_parent_put(struct rbd_device *rbd_dev) 2165a2acd00eSAlex Elder { 2166a2acd00eSAlex Elder int counter; 2167a2acd00eSAlex Elder 2168a2acd00eSAlex Elder if (!rbd_dev->parent_spec) 2169a2acd00eSAlex Elder return; 2170a2acd00eSAlex Elder 2171a2acd00eSAlex Elder counter = atomic_dec_return_safe(&rbd_dev->parent_ref); 2172a2acd00eSAlex Elder if (counter > 0) 2173a2acd00eSAlex Elder return; 2174a2acd00eSAlex Elder 2175a2acd00eSAlex Elder /* Last reference; clean up parent data structures */ 2176a2acd00eSAlex Elder 2177a2acd00eSAlex Elder if (!counter) 2178a2acd00eSAlex Elder rbd_dev_unparent(rbd_dev); 2179a2acd00eSAlex Elder else 21809584d508SIlya Dryomov rbd_warn(rbd_dev, "parent reference underflow"); 2181a2acd00eSAlex Elder } 2182a2acd00eSAlex Elder 2183a2acd00eSAlex Elder /* 2184a2acd00eSAlex Elder * If an image has a non-zero parent overlap, get a reference to its 2185a2acd00eSAlex Elder * parent. 2186a2acd00eSAlex Elder * 2187a2acd00eSAlex Elder * Returns true if the rbd device has a parent with a non-zero 2188a2acd00eSAlex Elder * overlap and a reference for it was successfully taken, or 2189a2acd00eSAlex Elder * false otherwise. 
2190a2acd00eSAlex Elder */ 2191a2acd00eSAlex Elder static bool rbd_dev_parent_get(struct rbd_device *rbd_dev) 2192a2acd00eSAlex Elder { 2193ae43e9d0SIlya Dryomov int counter = 0; 2194a2acd00eSAlex Elder 2195a2acd00eSAlex Elder if (!rbd_dev->parent_spec) 2196a2acd00eSAlex Elder return false; 2197a2acd00eSAlex Elder 2198ae43e9d0SIlya Dryomov down_read(&rbd_dev->header_rwsem); 2199ae43e9d0SIlya Dryomov if (rbd_dev->parent_overlap) 2200a2acd00eSAlex Elder counter = atomic_inc_return_safe(&rbd_dev->parent_ref); 2201ae43e9d0SIlya Dryomov up_read(&rbd_dev->header_rwsem); 2202a2acd00eSAlex Elder 2203a2acd00eSAlex Elder if (counter < 0) 22049584d508SIlya Dryomov rbd_warn(rbd_dev, "parent reference overflow"); 2205a2acd00eSAlex Elder 2206ae43e9d0SIlya Dryomov return counter > 0; 2207a2acd00eSAlex Elder } 2208a2acd00eSAlex Elder 2209bf0d5f50SAlex Elder /* 2210bf0d5f50SAlex Elder * Caller is responsible for filling in the list of object requests 2211bf0d5f50SAlex Elder * that comprises the image request, and the Linux request pointer 2212bf0d5f50SAlex Elder * (if there is one). 
2213bf0d5f50SAlex Elder */ 2214cc344fa1SAlex Elder static struct rbd_img_request *rbd_img_request_create( 2215cc344fa1SAlex Elder struct rbd_device *rbd_dev, 2216bf0d5f50SAlex Elder u64 offset, u64 length, 22176d2940c8SGuangliang Zhao enum obj_operation_type op_type, 22184e752f0aSJosh Durgin struct ceph_snap_context *snapc) 2219bf0d5f50SAlex Elder { 2220bf0d5f50SAlex Elder struct rbd_img_request *img_request; 2221bf0d5f50SAlex Elder 22227a716aacSIlya Dryomov img_request = kmem_cache_alloc(rbd_img_request_cache, GFP_NOIO); 2223bf0d5f50SAlex Elder if (!img_request) 2224bf0d5f50SAlex Elder return NULL; 2225bf0d5f50SAlex Elder 2226bf0d5f50SAlex Elder img_request->rq = NULL; 2227bf0d5f50SAlex Elder img_request->rbd_dev = rbd_dev; 2228bf0d5f50SAlex Elder img_request->offset = offset; 2229bf0d5f50SAlex Elder img_request->length = length; 22300c425248SAlex Elder img_request->flags = 0; 223190e98c52SGuangliang Zhao if (op_type == OBJ_OP_DISCARD) { 223290e98c52SGuangliang Zhao img_request_discard_set(img_request); 223390e98c52SGuangliang Zhao img_request->snapc = snapc; 223490e98c52SGuangliang Zhao } else if (op_type == OBJ_OP_WRITE) { 22350c425248SAlex Elder img_request_write_set(img_request); 22364e752f0aSJosh Durgin img_request->snapc = snapc; 22370c425248SAlex Elder } else { 2238bf0d5f50SAlex Elder img_request->snap_id = rbd_dev->spec->snap_id; 22390c425248SAlex Elder } 2240a2acd00eSAlex Elder if (rbd_dev_parent_get(rbd_dev)) 2241d0b2e944SAlex Elder img_request_layered_set(img_request); 2242bf0d5f50SAlex Elder spin_lock_init(&img_request->completion_lock); 2243bf0d5f50SAlex Elder img_request->next_completion = 0; 2244bf0d5f50SAlex Elder img_request->callback = NULL; 2245a5a337d4SAlex Elder img_request->result = 0; 2246bf0d5f50SAlex Elder img_request->obj_request_count = 0; 2247bf0d5f50SAlex Elder INIT_LIST_HEAD(&img_request->obj_requests); 2248bf0d5f50SAlex Elder kref_init(&img_request->kref); 2249bf0d5f50SAlex Elder 225037206ee5SAlex Elder dout("%s: rbd_dev %p %s 
%llu/%llu -> img %p\n", __func__, rbd_dev, 22516d2940c8SGuangliang Zhao obj_op_name(op_type), offset, length, img_request); 225237206ee5SAlex Elder 2253bf0d5f50SAlex Elder return img_request; 2254bf0d5f50SAlex Elder } 2255bf0d5f50SAlex Elder 2256bf0d5f50SAlex Elder static void rbd_img_request_destroy(struct kref *kref) 2257bf0d5f50SAlex Elder { 2258bf0d5f50SAlex Elder struct rbd_img_request *img_request; 2259bf0d5f50SAlex Elder struct rbd_obj_request *obj_request; 2260bf0d5f50SAlex Elder struct rbd_obj_request *next_obj_request; 2261bf0d5f50SAlex Elder 2262bf0d5f50SAlex Elder img_request = container_of(kref, struct rbd_img_request, kref); 2263bf0d5f50SAlex Elder 226437206ee5SAlex Elder dout("%s: img %p\n", __func__, img_request); 226537206ee5SAlex Elder 2266bf0d5f50SAlex Elder for_each_obj_request_safe(img_request, obj_request, next_obj_request) 2267bf0d5f50SAlex Elder rbd_img_obj_request_del(img_request, obj_request); 226825dcf954SAlex Elder rbd_assert(img_request->obj_request_count == 0); 2269bf0d5f50SAlex Elder 2270a2acd00eSAlex Elder if (img_request_layered_test(img_request)) { 2271a2acd00eSAlex Elder img_request_layered_clear(img_request); 2272a2acd00eSAlex Elder rbd_dev_parent_put(img_request->rbd_dev); 2273a2acd00eSAlex Elder } 2274a2acd00eSAlex Elder 2275bef95455SJosh Durgin if (img_request_write_test(img_request) || 2276bef95455SJosh Durgin img_request_discard_test(img_request)) 2277812164f8SAlex Elder ceph_put_snap_context(img_request->snapc); 2278bf0d5f50SAlex Elder 22791c2a9dfeSAlex Elder kmem_cache_free(rbd_img_request_cache, img_request); 2280bf0d5f50SAlex Elder } 2281bf0d5f50SAlex Elder 2282e93f3152SAlex Elder static struct rbd_img_request *rbd_parent_request_create( 2283e93f3152SAlex Elder struct rbd_obj_request *obj_request, 2284e93f3152SAlex Elder u64 img_offset, u64 length) 2285e93f3152SAlex Elder { 2286e93f3152SAlex Elder struct rbd_img_request *parent_request; 2287e93f3152SAlex Elder struct rbd_device *rbd_dev; 2288e93f3152SAlex Elder 
2289e93f3152SAlex Elder rbd_assert(obj_request->img_request); 2290e93f3152SAlex Elder rbd_dev = obj_request->img_request->rbd_dev; 2291e93f3152SAlex Elder 22924e752f0aSJosh Durgin parent_request = rbd_img_request_create(rbd_dev->parent, img_offset, 22936d2940c8SGuangliang Zhao length, OBJ_OP_READ, NULL); 2294e93f3152SAlex Elder if (!parent_request) 2295e93f3152SAlex Elder return NULL; 2296e93f3152SAlex Elder 2297e93f3152SAlex Elder img_request_child_set(parent_request); 2298e93f3152SAlex Elder rbd_obj_request_get(obj_request); 2299e93f3152SAlex Elder parent_request->obj_request = obj_request; 2300e93f3152SAlex Elder 2301e93f3152SAlex Elder return parent_request; 2302e93f3152SAlex Elder } 2303e93f3152SAlex Elder 2304e93f3152SAlex Elder static void rbd_parent_request_destroy(struct kref *kref) 2305e93f3152SAlex Elder { 2306e93f3152SAlex Elder struct rbd_img_request *parent_request; 2307e93f3152SAlex Elder struct rbd_obj_request *orig_request; 2308e93f3152SAlex Elder 2309e93f3152SAlex Elder parent_request = container_of(kref, struct rbd_img_request, kref); 2310e93f3152SAlex Elder orig_request = parent_request->obj_request; 2311e93f3152SAlex Elder 2312e93f3152SAlex Elder parent_request->obj_request = NULL; 2313e93f3152SAlex Elder rbd_obj_request_put(orig_request); 2314e93f3152SAlex Elder img_request_child_clear(parent_request); 2315e93f3152SAlex Elder 2316e93f3152SAlex Elder rbd_img_request_destroy(kref); 2317e93f3152SAlex Elder } 2318e93f3152SAlex Elder 23191217857fSAlex Elder static bool rbd_img_obj_end_request(struct rbd_obj_request *obj_request) 23201217857fSAlex Elder { 23216365d33aSAlex Elder struct rbd_img_request *img_request; 23221217857fSAlex Elder unsigned int xferred; 23231217857fSAlex Elder int result; 23248b3e1a56SAlex Elder bool more; 23251217857fSAlex Elder 23266365d33aSAlex Elder rbd_assert(obj_request_img_data_test(obj_request)); 23276365d33aSAlex Elder img_request = obj_request->img_request; 23286365d33aSAlex Elder 23291217857fSAlex Elder 
rbd_assert(obj_request->xferred <= (u64)UINT_MAX); 23301217857fSAlex Elder xferred = (unsigned int)obj_request->xferred; 23311217857fSAlex Elder result = obj_request->result; 23321217857fSAlex Elder if (result) { 23331217857fSAlex Elder struct rbd_device *rbd_dev = img_request->rbd_dev; 23346d2940c8SGuangliang Zhao enum obj_operation_type op_type; 23356d2940c8SGuangliang Zhao 233690e98c52SGuangliang Zhao if (img_request_discard_test(img_request)) 233790e98c52SGuangliang Zhao op_type = OBJ_OP_DISCARD; 233890e98c52SGuangliang Zhao else if (img_request_write_test(img_request)) 233990e98c52SGuangliang Zhao op_type = OBJ_OP_WRITE; 234090e98c52SGuangliang Zhao else 234190e98c52SGuangliang Zhao op_type = OBJ_OP_READ; 23421217857fSAlex Elder 23439584d508SIlya Dryomov rbd_warn(rbd_dev, "%s %llx at %llx (%llx)", 23446d2940c8SGuangliang Zhao obj_op_name(op_type), obj_request->length, 23456d2940c8SGuangliang Zhao obj_request->img_offset, obj_request->offset); 23469584d508SIlya Dryomov rbd_warn(rbd_dev, " result %d xferred %x", 23471217857fSAlex Elder result, xferred); 23481217857fSAlex Elder if (!img_request->result) 23491217857fSAlex Elder img_request->result = result; 2350082a75daSIlya Dryomov /* 2351082a75daSIlya Dryomov * Need to end I/O on the entire obj_request worth of 2352082a75daSIlya Dryomov * bytes in case of error. 
2353082a75daSIlya Dryomov */ 2354082a75daSIlya Dryomov xferred = obj_request->length; 23551217857fSAlex Elder } 23561217857fSAlex Elder 2357f1a4739fSAlex Elder /* Image object requests don't own their page array */ 2358f1a4739fSAlex Elder 2359f1a4739fSAlex Elder if (obj_request->type == OBJ_REQUEST_PAGES) { 2360f1a4739fSAlex Elder obj_request->pages = NULL; 2361f1a4739fSAlex Elder obj_request->page_count = 0; 2362f1a4739fSAlex Elder } 2363f1a4739fSAlex Elder 23648b3e1a56SAlex Elder if (img_request_child_test(img_request)) { 23658b3e1a56SAlex Elder rbd_assert(img_request->obj_request != NULL); 23668b3e1a56SAlex Elder more = obj_request->which < img_request->obj_request_count - 1; 23678b3e1a56SAlex Elder } else { 23688b3e1a56SAlex Elder rbd_assert(img_request->rq != NULL); 23697ad18afaSChristoph Hellwig 23707ad18afaSChristoph Hellwig more = blk_update_request(img_request->rq, result, xferred); 23717ad18afaSChristoph Hellwig if (!more) 23727ad18afaSChristoph Hellwig __blk_mq_end_request(img_request->rq, result); 23738b3e1a56SAlex Elder } 23748b3e1a56SAlex Elder 23758b3e1a56SAlex Elder return more; 23761217857fSAlex Elder } 23771217857fSAlex Elder 23782169238dSAlex Elder static void rbd_img_obj_callback(struct rbd_obj_request *obj_request) 23792169238dSAlex Elder { 23802169238dSAlex Elder struct rbd_img_request *img_request; 23812169238dSAlex Elder u32 which = obj_request->which; 23822169238dSAlex Elder bool more = true; 23832169238dSAlex Elder 23846365d33aSAlex Elder rbd_assert(obj_request_img_data_test(obj_request)); 23852169238dSAlex Elder img_request = obj_request->img_request; 23862169238dSAlex Elder 23872169238dSAlex Elder dout("%s: img %p obj %p\n", __func__, img_request, obj_request); 23882169238dSAlex Elder rbd_assert(img_request != NULL); 23892169238dSAlex Elder rbd_assert(img_request->obj_request_count > 0); 23902169238dSAlex Elder rbd_assert(which != BAD_WHICH); 23912169238dSAlex Elder rbd_assert(which < img_request->obj_request_count); 23922169238dSAlex 
Elder 23932169238dSAlex Elder spin_lock_irq(&img_request->completion_lock); 23942169238dSAlex Elder if (which != img_request->next_completion) 23952169238dSAlex Elder goto out; 23962169238dSAlex Elder 23972169238dSAlex Elder for_each_obj_request_from(img_request, obj_request) { 23982169238dSAlex Elder rbd_assert(more); 23992169238dSAlex Elder rbd_assert(which < img_request->obj_request_count); 24002169238dSAlex Elder 24012169238dSAlex Elder if (!obj_request_done_test(obj_request)) 24022169238dSAlex Elder break; 24031217857fSAlex Elder more = rbd_img_obj_end_request(obj_request); 24042169238dSAlex Elder which++; 24052169238dSAlex Elder } 24062169238dSAlex Elder 24072169238dSAlex Elder rbd_assert(more ^ (which == img_request->obj_request_count)); 24082169238dSAlex Elder img_request->next_completion = which; 24092169238dSAlex Elder out: 24102169238dSAlex Elder spin_unlock_irq(&img_request->completion_lock); 24110f2d5be7SAlex Elder rbd_img_request_put(img_request); 24122169238dSAlex Elder 24132169238dSAlex Elder if (!more) 24142169238dSAlex Elder rbd_img_request_complete(img_request); 24152169238dSAlex Elder } 24162169238dSAlex Elder 2417f1a4739fSAlex Elder /* 24183b434a2aSJosh Durgin * Add individual osd ops to the given ceph_osd_request and prepare 24193b434a2aSJosh Durgin * them for submission. num_ops is the current number of 24203b434a2aSJosh Durgin * osd operations already to the object request. 
24213b434a2aSJosh Durgin */ 24223b434a2aSJosh Durgin static void rbd_img_obj_request_fill(struct rbd_obj_request *obj_request, 24233b434a2aSJosh Durgin struct ceph_osd_request *osd_request, 24243b434a2aSJosh Durgin enum obj_operation_type op_type, 24253b434a2aSJosh Durgin unsigned int num_ops) 24263b434a2aSJosh Durgin { 24273b434a2aSJosh Durgin struct rbd_img_request *img_request = obj_request->img_request; 24283b434a2aSJosh Durgin struct rbd_device *rbd_dev = img_request->rbd_dev; 24293b434a2aSJosh Durgin u64 object_size = rbd_obj_bytes(&rbd_dev->header); 24303b434a2aSJosh Durgin u64 offset = obj_request->offset; 24313b434a2aSJosh Durgin u64 length = obj_request->length; 24323b434a2aSJosh Durgin u64 img_end; 24333b434a2aSJosh Durgin u16 opcode; 24343b434a2aSJosh Durgin 24353b434a2aSJosh Durgin if (op_type == OBJ_OP_DISCARD) { 2436d3246fb0SJosh Durgin if (!offset && length == object_size && 2437d3246fb0SJosh Durgin (!img_request_layered_test(img_request) || 2438d3246fb0SJosh Durgin !obj_request_overlaps_parent(obj_request))) { 24393b434a2aSJosh Durgin opcode = CEPH_OSD_OP_DELETE; 24403b434a2aSJosh Durgin } else if ((offset + length == object_size)) { 24413b434a2aSJosh Durgin opcode = CEPH_OSD_OP_TRUNCATE; 24423b434a2aSJosh Durgin } else { 24433b434a2aSJosh Durgin down_read(&rbd_dev->header_rwsem); 24443b434a2aSJosh Durgin img_end = rbd_dev->header.image_size; 24453b434a2aSJosh Durgin up_read(&rbd_dev->header_rwsem); 24463b434a2aSJosh Durgin 24473b434a2aSJosh Durgin if (obj_request->img_offset + length == img_end) 24483b434a2aSJosh Durgin opcode = CEPH_OSD_OP_TRUNCATE; 24493b434a2aSJosh Durgin else 24503b434a2aSJosh Durgin opcode = CEPH_OSD_OP_ZERO; 24513b434a2aSJosh Durgin } 24523b434a2aSJosh Durgin } else if (op_type == OBJ_OP_WRITE) { 2453e30b7577SIlya Dryomov if (!offset && length == object_size) 2454e30b7577SIlya Dryomov opcode = CEPH_OSD_OP_WRITEFULL; 2455e30b7577SIlya Dryomov else 24563b434a2aSJosh Durgin opcode = CEPH_OSD_OP_WRITE; 24573b434a2aSJosh Durgin 
osd_req_op_alloc_hint_init(osd_request, num_ops, 24583b434a2aSJosh Durgin object_size, object_size); 24593b434a2aSJosh Durgin num_ops++; 24603b434a2aSJosh Durgin } else { 24613b434a2aSJosh Durgin opcode = CEPH_OSD_OP_READ; 24623b434a2aSJosh Durgin } 24633b434a2aSJosh Durgin 24647e868b6eSIlya Dryomov if (opcode == CEPH_OSD_OP_DELETE) 2465144cba14SYan, Zheng osd_req_op_init(osd_request, num_ops, opcode, 0); 24667e868b6eSIlya Dryomov else 24677e868b6eSIlya Dryomov osd_req_op_extent_init(osd_request, num_ops, opcode, 24687e868b6eSIlya Dryomov offset, length, 0, 0); 24697e868b6eSIlya Dryomov 24703b434a2aSJosh Durgin if (obj_request->type == OBJ_REQUEST_BIO) 24713b434a2aSJosh Durgin osd_req_op_extent_osd_data_bio(osd_request, num_ops, 24723b434a2aSJosh Durgin obj_request->bio_list, length); 24733b434a2aSJosh Durgin else if (obj_request->type == OBJ_REQUEST_PAGES) 24743b434a2aSJosh Durgin osd_req_op_extent_osd_data_pages(osd_request, num_ops, 24753b434a2aSJosh Durgin obj_request->pages, length, 24763b434a2aSJosh Durgin offset & ~PAGE_MASK, false, false); 24773b434a2aSJosh Durgin 24783b434a2aSJosh Durgin /* Discards are also writes */ 24793b434a2aSJosh Durgin if (op_type == OBJ_OP_WRITE || op_type == OBJ_OP_DISCARD) 24803b434a2aSJosh Durgin rbd_osd_req_format_write(obj_request); 24813b434a2aSJosh Durgin else 24823b434a2aSJosh Durgin rbd_osd_req_format_read(obj_request); 24833b434a2aSJosh Durgin } 24843b434a2aSJosh Durgin 24853b434a2aSJosh Durgin /* 2486f1a4739fSAlex Elder * Split up an image request into one or more object requests, each 2487f1a4739fSAlex Elder * to a different object. The "type" parameter indicates whether 2488f1a4739fSAlex Elder * "data_desc" is the pointer to the head of a list of bio 2489f1a4739fSAlex Elder * structures, or the base of a page array. In either case this 2490f1a4739fSAlex Elder * function assumes data_desc describes memory sufficient to hold 2491f1a4739fSAlex Elder * all data described by the image request. 
2492f1a4739fSAlex Elder */ 2493f1a4739fSAlex Elder static int rbd_img_request_fill(struct rbd_img_request *img_request, 2494f1a4739fSAlex Elder enum obj_request_type type, 2495f1a4739fSAlex Elder void *data_desc) 2496bf0d5f50SAlex Elder { 2497bf0d5f50SAlex Elder struct rbd_device *rbd_dev = img_request->rbd_dev; 2498bf0d5f50SAlex Elder struct rbd_obj_request *obj_request = NULL; 2499bf0d5f50SAlex Elder struct rbd_obj_request *next_obj_request; 2500a158073cSJingoo Han struct bio *bio_list = NULL; 2501f1a4739fSAlex Elder unsigned int bio_offset = 0; 2502a158073cSJingoo Han struct page **pages = NULL; 25036d2940c8SGuangliang Zhao enum obj_operation_type op_type; 25047da22d29SAlex Elder u64 img_offset; 2505bf0d5f50SAlex Elder u64 resid; 2506bf0d5f50SAlex Elder 2507f1a4739fSAlex Elder dout("%s: img %p type %d data_desc %p\n", __func__, img_request, 2508f1a4739fSAlex Elder (int)type, data_desc); 250937206ee5SAlex Elder 25107da22d29SAlex Elder img_offset = img_request->offset; 2511bf0d5f50SAlex Elder resid = img_request->length; 25124dda41d3SAlex Elder rbd_assert(resid > 0); 25133b434a2aSJosh Durgin op_type = rbd_img_request_op_type(img_request); 2514f1a4739fSAlex Elder 2515f1a4739fSAlex Elder if (type == OBJ_REQUEST_BIO) { 2516f1a4739fSAlex Elder bio_list = data_desc; 25174f024f37SKent Overstreet rbd_assert(img_offset == 25184f024f37SKent Overstreet bio_list->bi_iter.bi_sector << SECTOR_SHIFT); 251990e98c52SGuangliang Zhao } else if (type == OBJ_REQUEST_PAGES) { 2520f1a4739fSAlex Elder pages = data_desc; 2521f1a4739fSAlex Elder } 2522f1a4739fSAlex Elder 2523bf0d5f50SAlex Elder while (resid) { 25242fa12320SAlex Elder struct ceph_osd_request *osd_req; 2525bf0d5f50SAlex Elder const char *object_name; 2526bf0d5f50SAlex Elder u64 offset; 2527bf0d5f50SAlex Elder u64 length; 2528bf0d5f50SAlex Elder 25297da22d29SAlex Elder object_name = rbd_segment_name(rbd_dev, img_offset); 2530bf0d5f50SAlex Elder if (!object_name) 2531bf0d5f50SAlex Elder goto out_unwind; 25327da22d29SAlex 
Elder offset = rbd_segment_offset(rbd_dev, img_offset); 25337da22d29SAlex Elder length = rbd_segment_length(rbd_dev, img_offset, resid); 2534bf0d5f50SAlex Elder obj_request = rbd_obj_request_create(object_name, 2535f1a4739fSAlex Elder offset, length, type); 253678c2a44aSAlex Elder /* object request has its own copy of the object name */ 253778c2a44aSAlex Elder rbd_segment_name_free(object_name); 2538bf0d5f50SAlex Elder if (!obj_request) 2539bf0d5f50SAlex Elder goto out_unwind; 254062054da6SIlya Dryomov 254103507db6SJosh Durgin /* 254203507db6SJosh Durgin * set obj_request->img_request before creating the 254303507db6SJosh Durgin * osd_request so that it gets the right snapc 254403507db6SJosh Durgin */ 254503507db6SJosh Durgin rbd_img_obj_request_add(img_request, obj_request); 2546bf0d5f50SAlex Elder 2547f1a4739fSAlex Elder if (type == OBJ_REQUEST_BIO) { 2548f1a4739fSAlex Elder unsigned int clone_size; 2549f1a4739fSAlex Elder 2550bf0d5f50SAlex Elder rbd_assert(length <= (u64)UINT_MAX); 2551bf0d5f50SAlex Elder clone_size = (unsigned int)length; 2552f1a4739fSAlex Elder obj_request->bio_list = 2553f1a4739fSAlex Elder bio_chain_clone_range(&bio_list, 2554f1a4739fSAlex Elder &bio_offset, 2555f1a4739fSAlex Elder clone_size, 25562224d879SDavid Disseldorp GFP_NOIO); 2557bf0d5f50SAlex Elder if (!obj_request->bio_list) 255862054da6SIlya Dryomov goto out_unwind; 255990e98c52SGuangliang Zhao } else if (type == OBJ_REQUEST_PAGES) { 2560f1a4739fSAlex Elder unsigned int page_count; 2561f1a4739fSAlex Elder 2562f1a4739fSAlex Elder obj_request->pages = pages; 2563f1a4739fSAlex Elder page_count = (u32)calc_pages_for(offset, length); 2564f1a4739fSAlex Elder obj_request->page_count = page_count; 2565f1a4739fSAlex Elder if ((offset + length) & ~PAGE_MASK) 2566f1a4739fSAlex Elder page_count--; /* more on last page */ 2567f1a4739fSAlex Elder pages += page_count; 2568f1a4739fSAlex Elder } 2569bf0d5f50SAlex Elder 25706d2940c8SGuangliang Zhao osd_req = rbd_osd_req_create(rbd_dev, op_type, 
25716d2940c8SGuangliang Zhao (op_type == OBJ_OP_WRITE) ? 2 : 1, 25722fa12320SAlex Elder obj_request); 25732fa12320SAlex Elder if (!osd_req) 257462054da6SIlya Dryomov goto out_unwind; 25753b434a2aSJosh Durgin 25762fa12320SAlex Elder obj_request->osd_req = osd_req; 25772169238dSAlex Elder obj_request->callback = rbd_img_obj_callback; 25787da22d29SAlex Elder obj_request->img_offset = img_offset; 2579bf0d5f50SAlex Elder 25803b434a2aSJosh Durgin rbd_img_obj_request_fill(obj_request, osd_req, op_type, 0); 25813b434a2aSJosh Durgin 25823b434a2aSJosh Durgin rbd_img_request_get(img_request); 25833b434a2aSJosh Durgin 25847da22d29SAlex Elder img_offset += length; 2585bf0d5f50SAlex Elder resid -= length; 2586bf0d5f50SAlex Elder } 2587bf0d5f50SAlex Elder 2588bf0d5f50SAlex Elder return 0; 2589bf0d5f50SAlex Elder 2590bf0d5f50SAlex Elder out_unwind: 2591bf0d5f50SAlex Elder for_each_obj_request_safe(img_request, obj_request, next_obj_request) 259242dd037cSIlya Dryomov rbd_img_obj_request_del(img_request, obj_request); 2593bf0d5f50SAlex Elder 2594bf0d5f50SAlex Elder return -ENOMEM; 2595bf0d5f50SAlex Elder } 2596bf0d5f50SAlex Elder 25973d7efd18SAlex Elder static void 25982761713dSIlya Dryomov rbd_osd_copyup_callback(struct rbd_obj_request *obj_request) 25990eefd470SAlex Elder { 26000eefd470SAlex Elder struct rbd_img_request *img_request; 26010eefd470SAlex Elder struct rbd_device *rbd_dev; 2602ebda6408SAlex Elder struct page **pages; 26030eefd470SAlex Elder u32 page_count; 26040eefd470SAlex Elder 26052761713dSIlya Dryomov dout("%s: obj %p\n", __func__, obj_request); 26062761713dSIlya Dryomov 2607d3246fb0SJosh Durgin rbd_assert(obj_request->type == OBJ_REQUEST_BIO || 2608d3246fb0SJosh Durgin obj_request->type == OBJ_REQUEST_NODATA); 26090eefd470SAlex Elder rbd_assert(obj_request_img_data_test(obj_request)); 26100eefd470SAlex Elder img_request = obj_request->img_request; 26110eefd470SAlex Elder rbd_assert(img_request); 26120eefd470SAlex Elder 26130eefd470SAlex Elder rbd_dev = 
img_request->rbd_dev; 26140eefd470SAlex Elder rbd_assert(rbd_dev); 26150eefd470SAlex Elder 2616ebda6408SAlex Elder pages = obj_request->copyup_pages; 2617ebda6408SAlex Elder rbd_assert(pages != NULL); 26180eefd470SAlex Elder obj_request->copyup_pages = NULL; 2619ebda6408SAlex Elder page_count = obj_request->copyup_page_count; 2620ebda6408SAlex Elder rbd_assert(page_count); 2621ebda6408SAlex Elder obj_request->copyup_page_count = 0; 2622ebda6408SAlex Elder ceph_release_page_vector(pages, page_count); 26230eefd470SAlex Elder 26240eefd470SAlex Elder /* 26250eefd470SAlex Elder * We want the transfer count to reflect the size of the 26260eefd470SAlex Elder * original write request. There is no such thing as a 26270eefd470SAlex Elder * successful short write, so if the request was successful 26280eefd470SAlex Elder * we can just set it to the originally-requested length. 26290eefd470SAlex Elder */ 26300eefd470SAlex Elder if (!obj_request->result) 26310eefd470SAlex Elder obj_request->xferred = obj_request->length; 26320eefd470SAlex Elder 26332761713dSIlya Dryomov obj_request_done_set(obj_request); 26340eefd470SAlex Elder } 26350eefd470SAlex Elder 26360eefd470SAlex Elder static void 26373d7efd18SAlex Elder rbd_img_obj_parent_read_full_callback(struct rbd_img_request *img_request) 26383d7efd18SAlex Elder { 26393d7efd18SAlex Elder struct rbd_obj_request *orig_request; 26400eefd470SAlex Elder struct ceph_osd_request *osd_req; 26410eefd470SAlex Elder struct ceph_osd_client *osdc; 26420eefd470SAlex Elder struct rbd_device *rbd_dev; 26433d7efd18SAlex Elder struct page **pages; 2644d3246fb0SJosh Durgin enum obj_operation_type op_type; 2645ebda6408SAlex Elder u32 page_count; 2646bbea1c1aSAlex Elder int img_result; 2647ebda6408SAlex Elder u64 parent_length; 26483d7efd18SAlex Elder 26493d7efd18SAlex Elder rbd_assert(img_request_child_test(img_request)); 26503d7efd18SAlex Elder 26513d7efd18SAlex Elder /* First get what we need from the image request */ 26523d7efd18SAlex Elder 
26533d7efd18SAlex Elder pages = img_request->copyup_pages; 26543d7efd18SAlex Elder rbd_assert(pages != NULL); 26553d7efd18SAlex Elder img_request->copyup_pages = NULL; 2656ebda6408SAlex Elder page_count = img_request->copyup_page_count; 2657ebda6408SAlex Elder rbd_assert(page_count); 2658ebda6408SAlex Elder img_request->copyup_page_count = 0; 26593d7efd18SAlex Elder 26603d7efd18SAlex Elder orig_request = img_request->obj_request; 26613d7efd18SAlex Elder rbd_assert(orig_request != NULL); 2662b91f09f1SAlex Elder rbd_assert(obj_request_type_valid(orig_request->type)); 2663bbea1c1aSAlex Elder img_result = img_request->result; 2664ebda6408SAlex Elder parent_length = img_request->length; 2665ebda6408SAlex Elder rbd_assert(parent_length == img_request->xferred); 26663d7efd18SAlex Elder rbd_img_request_put(img_request); 26673d7efd18SAlex Elder 266891c6febbSAlex Elder rbd_assert(orig_request->img_request); 266991c6febbSAlex Elder rbd_dev = orig_request->img_request->rbd_dev; 26703d7efd18SAlex Elder rbd_assert(rbd_dev); 26713d7efd18SAlex Elder 2672bbea1c1aSAlex Elder /* 2673bbea1c1aSAlex Elder * If the overlap has become 0 (most likely because the 2674bbea1c1aSAlex Elder * image has been flattened) we need to free the pages 2675bbea1c1aSAlex Elder * and re-submit the original write request. 
2676bbea1c1aSAlex Elder */ 2677bbea1c1aSAlex Elder if (!rbd_dev->parent_overlap) { 2678bbea1c1aSAlex Elder struct ceph_osd_client *osdc; 2679bbea1c1aSAlex Elder 2680bbea1c1aSAlex Elder ceph_release_page_vector(pages, page_count); 2681bbea1c1aSAlex Elder osdc = &rbd_dev->rbd_client->client->osdc; 2682bbea1c1aSAlex Elder img_result = rbd_obj_request_submit(osdc, orig_request); 2683bbea1c1aSAlex Elder if (!img_result) 2684bbea1c1aSAlex Elder return; 2685bbea1c1aSAlex Elder } 2686bbea1c1aSAlex Elder 2687bbea1c1aSAlex Elder if (img_result) 26880eefd470SAlex Elder goto out_err; 26893d7efd18SAlex Elder 26908785b1d4SAlex Elder /* 26918785b1d4SAlex Elder * The original osd request is of no use to use any more. 26920ccd5926SIlya Dryomov * We need a new one that can hold the three ops in a copyup 26938785b1d4SAlex Elder * request. Allocate the new copyup osd request for the 26948785b1d4SAlex Elder * original request, and release the old one. 26958785b1d4SAlex Elder */ 2696bbea1c1aSAlex Elder img_result = -ENOMEM; 26970eefd470SAlex Elder osd_req = rbd_osd_req_create_copyup(orig_request); 26980eefd470SAlex Elder if (!osd_req) 26990eefd470SAlex Elder goto out_err; 27008785b1d4SAlex Elder rbd_osd_req_destroy(orig_request->osd_req); 27010eefd470SAlex Elder orig_request->osd_req = osd_req; 27020eefd470SAlex Elder orig_request->copyup_pages = pages; 2703ebda6408SAlex Elder orig_request->copyup_page_count = page_count; 27043d7efd18SAlex Elder 27050eefd470SAlex Elder /* Initialize the copyup op */ 27060eefd470SAlex Elder 27070eefd470SAlex Elder osd_req_op_cls_init(osd_req, 0, CEPH_OSD_OP_CALL, "rbd", "copyup"); 2708ebda6408SAlex Elder osd_req_op_cls_request_data_pages(osd_req, 0, pages, parent_length, 0, 27090eefd470SAlex Elder false, false); 27100eefd470SAlex Elder 2711d3246fb0SJosh Durgin /* Add the other op(s) */ 27120ccd5926SIlya Dryomov 2713d3246fb0SJosh Durgin op_type = rbd_img_request_op_type(orig_request->img_request); 2714d3246fb0SJosh Durgin 
rbd_img_obj_request_fill(orig_request, osd_req, op_type, 1); 27150eefd470SAlex Elder 27160eefd470SAlex Elder /* All set, send it off. */ 27170eefd470SAlex Elder 27180eefd470SAlex Elder osdc = &rbd_dev->rbd_client->client->osdc; 2719bbea1c1aSAlex Elder img_result = rbd_obj_request_submit(osdc, orig_request); 2720bbea1c1aSAlex Elder if (!img_result) 27210eefd470SAlex Elder return; 27220eefd470SAlex Elder out_err: 27230eefd470SAlex Elder /* Record the error code and complete the request */ 27240eefd470SAlex Elder 2725bbea1c1aSAlex Elder orig_request->result = img_result; 27260eefd470SAlex Elder orig_request->xferred = 0; 27273d7efd18SAlex Elder obj_request_done_set(orig_request); 27283d7efd18SAlex Elder rbd_obj_request_complete(orig_request); 27293d7efd18SAlex Elder } 27303d7efd18SAlex Elder 27313d7efd18SAlex Elder /* 27323d7efd18SAlex Elder * Read from the parent image the range of data that covers the 27333d7efd18SAlex Elder * entire target of the given object request. This is used for 27343d7efd18SAlex Elder * satisfying a layered image write request when the target of an 27353d7efd18SAlex Elder * object request from the image request does not exist. 27363d7efd18SAlex Elder * 27373d7efd18SAlex Elder * A page array big enough to hold the returned data is allocated 27383d7efd18SAlex Elder * and supplied to rbd_img_request_fill() as the "data descriptor." 27393d7efd18SAlex Elder * When the read completes, this page array will be transferred to 27403d7efd18SAlex Elder * the original object request for the copyup operation. 27413d7efd18SAlex Elder * 27423d7efd18SAlex Elder * If an error occurs, record it as the result of the original 27433d7efd18SAlex Elder * object request and mark it done so it gets completed. 
27443d7efd18SAlex Elder */ 27453d7efd18SAlex Elder static int rbd_img_obj_parent_read_full(struct rbd_obj_request *obj_request) 27463d7efd18SAlex Elder { 27473d7efd18SAlex Elder struct rbd_img_request *img_request = NULL; 27483d7efd18SAlex Elder struct rbd_img_request *parent_request = NULL; 27493d7efd18SAlex Elder struct rbd_device *rbd_dev; 27503d7efd18SAlex Elder u64 img_offset; 27513d7efd18SAlex Elder u64 length; 27523d7efd18SAlex Elder struct page **pages = NULL; 27533d7efd18SAlex Elder u32 page_count; 27543d7efd18SAlex Elder int result; 27553d7efd18SAlex Elder 27563d7efd18SAlex Elder rbd_assert(obj_request_img_data_test(obj_request)); 2757b91f09f1SAlex Elder rbd_assert(obj_request_type_valid(obj_request->type)); 27583d7efd18SAlex Elder 27593d7efd18SAlex Elder img_request = obj_request->img_request; 27603d7efd18SAlex Elder rbd_assert(img_request != NULL); 27613d7efd18SAlex Elder rbd_dev = img_request->rbd_dev; 27623d7efd18SAlex Elder rbd_assert(rbd_dev->parent != NULL); 27633d7efd18SAlex Elder 27643d7efd18SAlex Elder /* 27653d7efd18SAlex Elder * Determine the byte range covered by the object in the 27663d7efd18SAlex Elder * child image to which the original request was to be sent. 27673d7efd18SAlex Elder */ 27683d7efd18SAlex Elder img_offset = obj_request->img_offset - obj_request->offset; 27693d7efd18SAlex Elder length = (u64)1 << rbd_dev->header.obj_order; 27703d7efd18SAlex Elder 27713d7efd18SAlex Elder /* 2772a9e8ba2cSAlex Elder * There is no defined parent data beyond the parent 2773a9e8ba2cSAlex Elder * overlap, so limit what we read at that boundary if 2774a9e8ba2cSAlex Elder * necessary. 
2775a9e8ba2cSAlex Elder */ 2776a9e8ba2cSAlex Elder if (img_offset + length > rbd_dev->parent_overlap) { 2777a9e8ba2cSAlex Elder rbd_assert(img_offset < rbd_dev->parent_overlap); 2778a9e8ba2cSAlex Elder length = rbd_dev->parent_overlap - img_offset; 2779a9e8ba2cSAlex Elder } 2780a9e8ba2cSAlex Elder 2781a9e8ba2cSAlex Elder /* 27823d7efd18SAlex Elder * Allocate a page array big enough to receive the data read 27833d7efd18SAlex Elder * from the parent. 27843d7efd18SAlex Elder */ 27853d7efd18SAlex Elder page_count = (u32)calc_pages_for(0, length); 27863d7efd18SAlex Elder pages = ceph_alloc_page_vector(page_count, GFP_KERNEL); 27873d7efd18SAlex Elder if (IS_ERR(pages)) { 27883d7efd18SAlex Elder result = PTR_ERR(pages); 27893d7efd18SAlex Elder pages = NULL; 27903d7efd18SAlex Elder goto out_err; 27913d7efd18SAlex Elder } 27923d7efd18SAlex Elder 27933d7efd18SAlex Elder result = -ENOMEM; 2794e93f3152SAlex Elder parent_request = rbd_parent_request_create(obj_request, 2795e93f3152SAlex Elder img_offset, length); 27963d7efd18SAlex Elder if (!parent_request) 27973d7efd18SAlex Elder goto out_err; 27983d7efd18SAlex Elder 27993d7efd18SAlex Elder result = rbd_img_request_fill(parent_request, OBJ_REQUEST_PAGES, pages); 28003d7efd18SAlex Elder if (result) 28013d7efd18SAlex Elder goto out_err; 28023d7efd18SAlex Elder parent_request->copyup_pages = pages; 2803ebda6408SAlex Elder parent_request->copyup_page_count = page_count; 28043d7efd18SAlex Elder 28053d7efd18SAlex Elder parent_request->callback = rbd_img_obj_parent_read_full_callback; 28063d7efd18SAlex Elder result = rbd_img_request_submit(parent_request); 28073d7efd18SAlex Elder if (!result) 28083d7efd18SAlex Elder return 0; 28093d7efd18SAlex Elder 28103d7efd18SAlex Elder parent_request->copyup_pages = NULL; 2811ebda6408SAlex Elder parent_request->copyup_page_count = 0; 28123d7efd18SAlex Elder parent_request->obj_request = NULL; 28133d7efd18SAlex Elder rbd_obj_request_put(obj_request); 28143d7efd18SAlex Elder out_err: 
28153d7efd18SAlex Elder if (pages) 28163d7efd18SAlex Elder ceph_release_page_vector(pages, page_count); 28173d7efd18SAlex Elder if (parent_request) 28183d7efd18SAlex Elder rbd_img_request_put(parent_request); 28193d7efd18SAlex Elder obj_request->result = result; 28203d7efd18SAlex Elder obj_request->xferred = 0; 28213d7efd18SAlex Elder obj_request_done_set(obj_request); 28223d7efd18SAlex Elder 28233d7efd18SAlex Elder return result; 28243d7efd18SAlex Elder } 28253d7efd18SAlex Elder 2826c5b5ef6cSAlex Elder static void rbd_img_obj_exists_callback(struct rbd_obj_request *obj_request) 2827c5b5ef6cSAlex Elder { 2828c5b5ef6cSAlex Elder struct rbd_obj_request *orig_request; 2829638f5abeSAlex Elder struct rbd_device *rbd_dev; 2830c5b5ef6cSAlex Elder int result; 2831c5b5ef6cSAlex Elder 2832c5b5ef6cSAlex Elder rbd_assert(!obj_request_img_data_test(obj_request)); 2833c5b5ef6cSAlex Elder 2834c5b5ef6cSAlex Elder /* 2835c5b5ef6cSAlex Elder * All we need from the object request is the original 2836c5b5ef6cSAlex Elder * request and the result of the STAT op. Grab those, then 2837c5b5ef6cSAlex Elder * we're done with the request. 
2838c5b5ef6cSAlex Elder */ 2839c5b5ef6cSAlex Elder orig_request = obj_request->obj_request; 2840c5b5ef6cSAlex Elder obj_request->obj_request = NULL; 2841912c317dSAlex Elder rbd_obj_request_put(orig_request); 2842c5b5ef6cSAlex Elder rbd_assert(orig_request); 2843c5b5ef6cSAlex Elder rbd_assert(orig_request->img_request); 2844c5b5ef6cSAlex Elder 2845c5b5ef6cSAlex Elder result = obj_request->result; 2846c5b5ef6cSAlex Elder obj_request->result = 0; 2847c5b5ef6cSAlex Elder 2848c5b5ef6cSAlex Elder dout("%s: obj %p for obj %p result %d %llu/%llu\n", __func__, 2849c5b5ef6cSAlex Elder obj_request, orig_request, result, 2850c5b5ef6cSAlex Elder obj_request->xferred, obj_request->length); 2851c5b5ef6cSAlex Elder rbd_obj_request_put(obj_request); 2852c5b5ef6cSAlex Elder 2853638f5abeSAlex Elder /* 2854638f5abeSAlex Elder * If the overlap has become 0 (most likely because the 2855638f5abeSAlex Elder * image has been flattened) we need to free the pages 2856638f5abeSAlex Elder * and re-submit the original write request. 2857638f5abeSAlex Elder */ 2858638f5abeSAlex Elder rbd_dev = orig_request->img_request->rbd_dev; 2859638f5abeSAlex Elder if (!rbd_dev->parent_overlap) { 2860638f5abeSAlex Elder struct ceph_osd_client *osdc; 2861638f5abeSAlex Elder 2862638f5abeSAlex Elder osdc = &rbd_dev->rbd_client->client->osdc; 2863638f5abeSAlex Elder result = rbd_obj_request_submit(osdc, orig_request); 2864638f5abeSAlex Elder if (!result) 2865638f5abeSAlex Elder return; 2866638f5abeSAlex Elder } 2867c5b5ef6cSAlex Elder 2868c5b5ef6cSAlex Elder /* 2869c5b5ef6cSAlex Elder * Our only purpose here is to determine whether the object 2870c5b5ef6cSAlex Elder * exists, and we don't want to treat the non-existence as 2871c5b5ef6cSAlex Elder * an error. If something else comes back, transfer the 2872c5b5ef6cSAlex Elder * error to the original request and complete it now. 
2873c5b5ef6cSAlex Elder */ 2874c5b5ef6cSAlex Elder if (!result) { 2875c5b5ef6cSAlex Elder obj_request_existence_set(orig_request, true); 2876c5b5ef6cSAlex Elder } else if (result == -ENOENT) { 2877c5b5ef6cSAlex Elder obj_request_existence_set(orig_request, false); 2878c5b5ef6cSAlex Elder } else if (result) { 2879c5b5ef6cSAlex Elder orig_request->result = result; 28803d7efd18SAlex Elder goto out; 2881c5b5ef6cSAlex Elder } 2882c5b5ef6cSAlex Elder 2883c5b5ef6cSAlex Elder /* 2884c5b5ef6cSAlex Elder * Resubmit the original request now that we have recorded 2885c5b5ef6cSAlex Elder * whether the target object exists. 2886c5b5ef6cSAlex Elder */ 2887b454e36dSAlex Elder orig_request->result = rbd_img_obj_request_submit(orig_request); 28883d7efd18SAlex Elder out: 2889c5b5ef6cSAlex Elder if (orig_request->result) 2890c5b5ef6cSAlex Elder rbd_obj_request_complete(orig_request); 2891c5b5ef6cSAlex Elder } 2892c5b5ef6cSAlex Elder 2893c5b5ef6cSAlex Elder static int rbd_img_obj_exists_submit(struct rbd_obj_request *obj_request) 2894c5b5ef6cSAlex Elder { 2895c5b5ef6cSAlex Elder struct rbd_obj_request *stat_request; 2896c5b5ef6cSAlex Elder struct rbd_device *rbd_dev; 2897c5b5ef6cSAlex Elder struct ceph_osd_client *osdc; 2898c5b5ef6cSAlex Elder struct page **pages = NULL; 2899c5b5ef6cSAlex Elder u32 page_count; 2900c5b5ef6cSAlex Elder size_t size; 2901c5b5ef6cSAlex Elder int ret; 2902c5b5ef6cSAlex Elder 2903c5b5ef6cSAlex Elder /* 2904c5b5ef6cSAlex Elder * The response data for a STAT call consists of: 2905c5b5ef6cSAlex Elder * le64 length; 2906c5b5ef6cSAlex Elder * struct { 2907c5b5ef6cSAlex Elder * le32 tv_sec; 2908c5b5ef6cSAlex Elder * le32 tv_nsec; 2909c5b5ef6cSAlex Elder * } mtime; 2910c5b5ef6cSAlex Elder */ 2911c5b5ef6cSAlex Elder size = sizeof (__le64) + sizeof (__le32) + sizeof (__le32); 2912c5b5ef6cSAlex Elder page_count = (u32)calc_pages_for(0, size); 2913c5b5ef6cSAlex Elder pages = ceph_alloc_page_vector(page_count, GFP_KERNEL); 2914c5b5ef6cSAlex Elder if (IS_ERR(pages)) 
2915c5b5ef6cSAlex Elder return PTR_ERR(pages); 2916c5b5ef6cSAlex Elder 2917c5b5ef6cSAlex Elder ret = -ENOMEM; 2918c5b5ef6cSAlex Elder stat_request = rbd_obj_request_create(obj_request->object_name, 0, 0, 2919c5b5ef6cSAlex Elder OBJ_REQUEST_PAGES); 2920c5b5ef6cSAlex Elder if (!stat_request) 2921c5b5ef6cSAlex Elder goto out; 2922c5b5ef6cSAlex Elder 2923c5b5ef6cSAlex Elder rbd_obj_request_get(obj_request); 2924c5b5ef6cSAlex Elder stat_request->obj_request = obj_request; 2925c5b5ef6cSAlex Elder stat_request->pages = pages; 2926c5b5ef6cSAlex Elder stat_request->page_count = page_count; 2927c5b5ef6cSAlex Elder 2928c5b5ef6cSAlex Elder rbd_assert(obj_request->img_request); 2929c5b5ef6cSAlex Elder rbd_dev = obj_request->img_request->rbd_dev; 29306d2940c8SGuangliang Zhao stat_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_READ, 1, 2931c5b5ef6cSAlex Elder stat_request); 2932c5b5ef6cSAlex Elder if (!stat_request->osd_req) 2933c5b5ef6cSAlex Elder goto out; 2934c5b5ef6cSAlex Elder stat_request->callback = rbd_img_obj_exists_callback; 2935c5b5ef6cSAlex Elder 2936144cba14SYan, Zheng osd_req_op_init(stat_request->osd_req, 0, CEPH_OSD_OP_STAT, 0); 2937c5b5ef6cSAlex Elder osd_req_op_raw_data_in_pages(stat_request->osd_req, 0, pages, size, 0, 2938c5b5ef6cSAlex Elder false, false); 29399d4df01fSAlex Elder rbd_osd_req_format_read(stat_request); 2940c5b5ef6cSAlex Elder 2941c5b5ef6cSAlex Elder osdc = &rbd_dev->rbd_client->client->osdc; 2942c5b5ef6cSAlex Elder ret = rbd_obj_request_submit(osdc, stat_request); 2943c5b5ef6cSAlex Elder out: 2944c5b5ef6cSAlex Elder if (ret) 2945c5b5ef6cSAlex Elder rbd_obj_request_put(obj_request); 2946c5b5ef6cSAlex Elder 2947c5b5ef6cSAlex Elder return ret; 2948c5b5ef6cSAlex Elder } 2949c5b5ef6cSAlex Elder 295070d045f6SIlya Dryomov static bool img_obj_request_simple(struct rbd_obj_request *obj_request) 2951b454e36dSAlex Elder { 2952b454e36dSAlex Elder struct rbd_img_request *img_request; 2953a9e8ba2cSAlex Elder struct rbd_device *rbd_dev; 
2954b454e36dSAlex Elder 2955b454e36dSAlex Elder rbd_assert(obj_request_img_data_test(obj_request)); 2956b454e36dSAlex Elder 2957b454e36dSAlex Elder img_request = obj_request->img_request; 2958b454e36dSAlex Elder rbd_assert(img_request); 2959a9e8ba2cSAlex Elder rbd_dev = img_request->rbd_dev; 2960b454e36dSAlex Elder 296170d045f6SIlya Dryomov /* Reads */ 29621c220881SJosh Durgin if (!img_request_write_test(img_request) && 29631c220881SJosh Durgin !img_request_discard_test(img_request)) 296470d045f6SIlya Dryomov return true; 2965b454e36dSAlex Elder 296670d045f6SIlya Dryomov /* Non-layered writes */ 296770d045f6SIlya Dryomov if (!img_request_layered_test(img_request)) 296870d045f6SIlya Dryomov return true; 296970d045f6SIlya Dryomov 297070d045f6SIlya Dryomov /* 297170d045f6SIlya Dryomov * Layered writes outside of the parent overlap range don't 297270d045f6SIlya Dryomov * share any data with the parent. 297370d045f6SIlya Dryomov */ 297470d045f6SIlya Dryomov if (!obj_request_overlaps_parent(obj_request)) 297570d045f6SIlya Dryomov return true; 297670d045f6SIlya Dryomov 297770d045f6SIlya Dryomov /* 2978c622d226SGuangliang Zhao * Entire-object layered writes - we will overwrite whatever 2979c622d226SGuangliang Zhao * parent data there is anyway. 2980c622d226SGuangliang Zhao */ 2981c622d226SGuangliang Zhao if (!obj_request->offset && 2982c622d226SGuangliang Zhao obj_request->length == rbd_obj_bytes(&rbd_dev->header)) 2983c622d226SGuangliang Zhao return true; 2984c622d226SGuangliang Zhao 2985c622d226SGuangliang Zhao /* 298670d045f6SIlya Dryomov * If the object is known to already exist, its parent data has 298770d045f6SIlya Dryomov * already been copied. 
298870d045f6SIlya Dryomov */ 298970d045f6SIlya Dryomov if (obj_request_known_test(obj_request) && 299070d045f6SIlya Dryomov obj_request_exists_test(obj_request)) 299170d045f6SIlya Dryomov return true; 299270d045f6SIlya Dryomov 299370d045f6SIlya Dryomov return false; 299470d045f6SIlya Dryomov } 299570d045f6SIlya Dryomov 299670d045f6SIlya Dryomov static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request) 299770d045f6SIlya Dryomov { 299870d045f6SIlya Dryomov if (img_obj_request_simple(obj_request)) { 2999b454e36dSAlex Elder struct rbd_device *rbd_dev; 3000b454e36dSAlex Elder struct ceph_osd_client *osdc; 3001b454e36dSAlex Elder 3002b454e36dSAlex Elder rbd_dev = obj_request->img_request->rbd_dev; 3003b454e36dSAlex Elder osdc = &rbd_dev->rbd_client->client->osdc; 3004b454e36dSAlex Elder 3005b454e36dSAlex Elder return rbd_obj_request_submit(osdc, obj_request); 3006b454e36dSAlex Elder } 3007b454e36dSAlex Elder 3008b454e36dSAlex Elder /* 30093d7efd18SAlex Elder * It's a layered write. The target object might exist but 30103d7efd18SAlex Elder * we may not know that yet. If we know it doesn't exist, 30113d7efd18SAlex Elder * start by reading the data for the full target object from 30123d7efd18SAlex Elder * the parent so we can use it for a copyup to the target. 3013b454e36dSAlex Elder */ 301470d045f6SIlya Dryomov if (obj_request_known_test(obj_request)) 30153d7efd18SAlex Elder return rbd_img_obj_parent_read_full(obj_request); 30163d7efd18SAlex Elder 30173d7efd18SAlex Elder /* We don't know whether the target exists. Go find out. 
*/ 3018b454e36dSAlex Elder 3019b454e36dSAlex Elder return rbd_img_obj_exists_submit(obj_request); 3020b454e36dSAlex Elder } 3021b454e36dSAlex Elder 3022bf0d5f50SAlex Elder static int rbd_img_request_submit(struct rbd_img_request *img_request) 3023bf0d5f50SAlex Elder { 3024bf0d5f50SAlex Elder struct rbd_obj_request *obj_request; 302546faeed4SAlex Elder struct rbd_obj_request *next_obj_request; 3026663ae2ccSIlya Dryomov int ret = 0; 3027bf0d5f50SAlex Elder 302837206ee5SAlex Elder dout("%s: img %p\n", __func__, img_request); 3029bf0d5f50SAlex Elder 3030663ae2ccSIlya Dryomov rbd_img_request_get(img_request); 3031663ae2ccSIlya Dryomov for_each_obj_request_safe(img_request, obj_request, next_obj_request) { 3032b454e36dSAlex Elder ret = rbd_img_obj_request_submit(obj_request); 3033bf0d5f50SAlex Elder if (ret) 3034663ae2ccSIlya Dryomov goto out_put_ireq; 3035bf0d5f50SAlex Elder } 3036bf0d5f50SAlex Elder 3037663ae2ccSIlya Dryomov out_put_ireq: 3038663ae2ccSIlya Dryomov rbd_img_request_put(img_request); 3039663ae2ccSIlya Dryomov return ret; 3040bf0d5f50SAlex Elder } 3041bf0d5f50SAlex Elder 30428b3e1a56SAlex Elder static void rbd_img_parent_read_callback(struct rbd_img_request *img_request) 30438b3e1a56SAlex Elder { 30448b3e1a56SAlex Elder struct rbd_obj_request *obj_request; 3045a9e8ba2cSAlex Elder struct rbd_device *rbd_dev; 3046a9e8ba2cSAlex Elder u64 obj_end; 304702c74fbaSAlex Elder u64 img_xferred; 304802c74fbaSAlex Elder int img_result; 30498b3e1a56SAlex Elder 30508b3e1a56SAlex Elder rbd_assert(img_request_child_test(img_request)); 30518b3e1a56SAlex Elder 305202c74fbaSAlex Elder /* First get what we need from the image request and release it */ 305302c74fbaSAlex Elder 30548b3e1a56SAlex Elder obj_request = img_request->obj_request; 305502c74fbaSAlex Elder img_xferred = img_request->xferred; 305602c74fbaSAlex Elder img_result = img_request->result; 305702c74fbaSAlex Elder rbd_img_request_put(img_request); 305802c74fbaSAlex Elder 305902c74fbaSAlex Elder /* 
306002c74fbaSAlex Elder * If the overlap has become 0 (most likely because the 306102c74fbaSAlex Elder * image has been flattened) we need to re-submit the 306202c74fbaSAlex Elder * original request. 306302c74fbaSAlex Elder */ 3064a9e8ba2cSAlex Elder rbd_assert(obj_request); 3065a9e8ba2cSAlex Elder rbd_assert(obj_request->img_request); 306602c74fbaSAlex Elder rbd_dev = obj_request->img_request->rbd_dev; 306702c74fbaSAlex Elder if (!rbd_dev->parent_overlap) { 306802c74fbaSAlex Elder struct ceph_osd_client *osdc; 30698b3e1a56SAlex Elder 307002c74fbaSAlex Elder osdc = &rbd_dev->rbd_client->client->osdc; 307102c74fbaSAlex Elder img_result = rbd_obj_request_submit(osdc, obj_request); 307202c74fbaSAlex Elder if (!img_result) 307302c74fbaSAlex Elder return; 307402c74fbaSAlex Elder } 307502c74fbaSAlex Elder 307602c74fbaSAlex Elder obj_request->result = img_result; 3077a9e8ba2cSAlex Elder if (obj_request->result) 3078a9e8ba2cSAlex Elder goto out; 3079a9e8ba2cSAlex Elder 3080a9e8ba2cSAlex Elder /* 3081a9e8ba2cSAlex Elder * We need to zero anything beyond the parent overlap 3082a9e8ba2cSAlex Elder * boundary. Since rbd_img_obj_request_read_callback() 3083a9e8ba2cSAlex Elder * will zero anything beyond the end of a short read, an 3084a9e8ba2cSAlex Elder * easy way to do this is to pretend the data from the 3085a9e8ba2cSAlex Elder * parent came up short--ending at the overlap boundary. 
3086a9e8ba2cSAlex Elder */ 3087a9e8ba2cSAlex Elder rbd_assert(obj_request->img_offset < U64_MAX - obj_request->length); 3088a9e8ba2cSAlex Elder obj_end = obj_request->img_offset + obj_request->length; 3089a9e8ba2cSAlex Elder if (obj_end > rbd_dev->parent_overlap) { 3090a9e8ba2cSAlex Elder u64 xferred = 0; 3091a9e8ba2cSAlex Elder 3092a9e8ba2cSAlex Elder if (obj_request->img_offset < rbd_dev->parent_overlap) 3093a9e8ba2cSAlex Elder xferred = rbd_dev->parent_overlap - 3094a9e8ba2cSAlex Elder obj_request->img_offset; 3095a9e8ba2cSAlex Elder 309602c74fbaSAlex Elder obj_request->xferred = min(img_xferred, xferred); 3097a9e8ba2cSAlex Elder } else { 309802c74fbaSAlex Elder obj_request->xferred = img_xferred; 3099a9e8ba2cSAlex Elder } 3100a9e8ba2cSAlex Elder out: 31018b3e1a56SAlex Elder rbd_img_obj_request_read_callback(obj_request); 31028b3e1a56SAlex Elder rbd_obj_request_complete(obj_request); 31038b3e1a56SAlex Elder } 31048b3e1a56SAlex Elder 31058b3e1a56SAlex Elder static void rbd_img_parent_read(struct rbd_obj_request *obj_request) 31068b3e1a56SAlex Elder { 31078b3e1a56SAlex Elder struct rbd_img_request *img_request; 31088b3e1a56SAlex Elder int result; 31098b3e1a56SAlex Elder 31108b3e1a56SAlex Elder rbd_assert(obj_request_img_data_test(obj_request)); 31118b3e1a56SAlex Elder rbd_assert(obj_request->img_request != NULL); 31128b3e1a56SAlex Elder rbd_assert(obj_request->result == (s32) -ENOENT); 31135b2ab72dSAlex Elder rbd_assert(obj_request_type_valid(obj_request->type)); 31148b3e1a56SAlex Elder 31158b3e1a56SAlex Elder /* rbd_read_finish(obj_request, obj_request->length); */ 3116e93f3152SAlex Elder img_request = rbd_parent_request_create(obj_request, 31178b3e1a56SAlex Elder obj_request->img_offset, 3118e93f3152SAlex Elder obj_request->length); 31198b3e1a56SAlex Elder result = -ENOMEM; 31208b3e1a56SAlex Elder if (!img_request) 31218b3e1a56SAlex Elder goto out_err; 31228b3e1a56SAlex Elder 31235b2ab72dSAlex Elder if (obj_request->type == OBJ_REQUEST_BIO) 3124f1a4739fSAlex 
Elder result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO, 3125f1a4739fSAlex Elder obj_request->bio_list); 31265b2ab72dSAlex Elder else 31275b2ab72dSAlex Elder result = rbd_img_request_fill(img_request, OBJ_REQUEST_PAGES, 31285b2ab72dSAlex Elder obj_request->pages); 31298b3e1a56SAlex Elder if (result) 31308b3e1a56SAlex Elder goto out_err; 31318b3e1a56SAlex Elder 31328b3e1a56SAlex Elder img_request->callback = rbd_img_parent_read_callback; 31338b3e1a56SAlex Elder result = rbd_img_request_submit(img_request); 31348b3e1a56SAlex Elder if (result) 31358b3e1a56SAlex Elder goto out_err; 31368b3e1a56SAlex Elder 31378b3e1a56SAlex Elder return; 31388b3e1a56SAlex Elder out_err: 31398b3e1a56SAlex Elder if (img_request) 31408b3e1a56SAlex Elder rbd_img_request_put(img_request); 31418b3e1a56SAlex Elder obj_request->result = result; 31428b3e1a56SAlex Elder obj_request->xferred = 0; 31438b3e1a56SAlex Elder obj_request_done_set(obj_request); 31448b3e1a56SAlex Elder } 31458b3e1a56SAlex Elder 3146ed95b21aSIlya Dryomov static const struct rbd_client_id rbd_empty_cid; 3147ed95b21aSIlya Dryomov 3148ed95b21aSIlya Dryomov static bool rbd_cid_equal(const struct rbd_client_id *lhs, 3149ed95b21aSIlya Dryomov const struct rbd_client_id *rhs) 3150ed95b21aSIlya Dryomov { 3151ed95b21aSIlya Dryomov return lhs->gid == rhs->gid && lhs->handle == rhs->handle; 3152ed95b21aSIlya Dryomov } 3153ed95b21aSIlya Dryomov 3154ed95b21aSIlya Dryomov static struct rbd_client_id rbd_get_cid(struct rbd_device *rbd_dev) 3155ed95b21aSIlya Dryomov { 3156ed95b21aSIlya Dryomov struct rbd_client_id cid; 3157ed95b21aSIlya Dryomov 3158ed95b21aSIlya Dryomov mutex_lock(&rbd_dev->watch_mutex); 3159ed95b21aSIlya Dryomov cid.gid = ceph_client_gid(rbd_dev->rbd_client->client); 3160ed95b21aSIlya Dryomov cid.handle = rbd_dev->watch_cookie; 3161ed95b21aSIlya Dryomov mutex_unlock(&rbd_dev->watch_mutex); 3162ed95b21aSIlya Dryomov return cid; 3163ed95b21aSIlya Dryomov } 3164ed95b21aSIlya Dryomov 3165ed95b21aSIlya Dryomov /* 
3166ed95b21aSIlya Dryomov * lock_rwsem must be held for write 3167ed95b21aSIlya Dryomov */ 3168ed95b21aSIlya Dryomov static void rbd_set_owner_cid(struct rbd_device *rbd_dev, 3169ed95b21aSIlya Dryomov const struct rbd_client_id *cid) 3170ed95b21aSIlya Dryomov { 3171ed95b21aSIlya Dryomov dout("%s rbd_dev %p %llu-%llu -> %llu-%llu\n", __func__, rbd_dev, 3172ed95b21aSIlya Dryomov rbd_dev->owner_cid.gid, rbd_dev->owner_cid.handle, 3173ed95b21aSIlya Dryomov cid->gid, cid->handle); 3174ed95b21aSIlya Dryomov rbd_dev->owner_cid = *cid; /* struct */ 3175ed95b21aSIlya Dryomov } 3176ed95b21aSIlya Dryomov 3177ed95b21aSIlya Dryomov static void format_lock_cookie(struct rbd_device *rbd_dev, char *buf) 3178ed95b21aSIlya Dryomov { 3179ed95b21aSIlya Dryomov mutex_lock(&rbd_dev->watch_mutex); 3180ed95b21aSIlya Dryomov sprintf(buf, "%s %llu", RBD_LOCK_COOKIE_PREFIX, rbd_dev->watch_cookie); 3181ed95b21aSIlya Dryomov mutex_unlock(&rbd_dev->watch_mutex); 3182ed95b21aSIlya Dryomov } 3183ed95b21aSIlya Dryomov 3184ed95b21aSIlya Dryomov /* 3185ed95b21aSIlya Dryomov * lock_rwsem must be held for write 3186ed95b21aSIlya Dryomov */ 3187ed95b21aSIlya Dryomov static int rbd_lock(struct rbd_device *rbd_dev) 3188ed95b21aSIlya Dryomov { 3189ed95b21aSIlya Dryomov struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 3190ed95b21aSIlya Dryomov struct rbd_client_id cid = rbd_get_cid(rbd_dev); 3191ed95b21aSIlya Dryomov char cookie[32]; 3192ed95b21aSIlya Dryomov int ret; 3193ed95b21aSIlya Dryomov 3194ed95b21aSIlya Dryomov WARN_ON(__rbd_is_lock_owner(rbd_dev)); 3195ed95b21aSIlya Dryomov 3196ed95b21aSIlya Dryomov format_lock_cookie(rbd_dev, cookie); 3197ed95b21aSIlya Dryomov ret = ceph_cls_lock(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc, 3198ed95b21aSIlya Dryomov RBD_LOCK_NAME, CEPH_CLS_LOCK_EXCLUSIVE, cookie, 3199ed95b21aSIlya Dryomov RBD_LOCK_TAG, "", 0); 3200ed95b21aSIlya Dryomov if (ret) 3201ed95b21aSIlya Dryomov return ret; 3202ed95b21aSIlya Dryomov 3203ed95b21aSIlya Dryomov 
rbd_dev->lock_state = RBD_LOCK_STATE_LOCKED; 3204ed95b21aSIlya Dryomov rbd_set_owner_cid(rbd_dev, &cid); 3205ed95b21aSIlya Dryomov queue_work(rbd_dev->task_wq, &rbd_dev->acquired_lock_work); 3206ed95b21aSIlya Dryomov return 0; 3207ed95b21aSIlya Dryomov } 3208ed95b21aSIlya Dryomov 3209ed95b21aSIlya Dryomov /* 3210ed95b21aSIlya Dryomov * lock_rwsem must be held for write 3211ed95b21aSIlya Dryomov */ 3212ed95b21aSIlya Dryomov static int rbd_unlock(struct rbd_device *rbd_dev) 3213ed95b21aSIlya Dryomov { 3214ed95b21aSIlya Dryomov struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 3215ed95b21aSIlya Dryomov char cookie[32]; 3216ed95b21aSIlya Dryomov int ret; 3217ed95b21aSIlya Dryomov 3218ed95b21aSIlya Dryomov WARN_ON(!__rbd_is_lock_owner(rbd_dev)); 3219ed95b21aSIlya Dryomov 3220ed95b21aSIlya Dryomov rbd_dev->lock_state = RBD_LOCK_STATE_UNLOCKED; 3221ed95b21aSIlya Dryomov 3222ed95b21aSIlya Dryomov format_lock_cookie(rbd_dev, cookie); 3223ed95b21aSIlya Dryomov ret = ceph_cls_unlock(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc, 3224ed95b21aSIlya Dryomov RBD_LOCK_NAME, cookie); 3225ed95b21aSIlya Dryomov if (ret && ret != -ENOENT) { 3226ed95b21aSIlya Dryomov rbd_warn(rbd_dev, "cls_unlock failed: %d", ret); 3227ed95b21aSIlya Dryomov return ret; 3228ed95b21aSIlya Dryomov } 3229ed95b21aSIlya Dryomov 3230ed95b21aSIlya Dryomov rbd_set_owner_cid(rbd_dev, &rbd_empty_cid); 3231ed95b21aSIlya Dryomov queue_work(rbd_dev->task_wq, &rbd_dev->released_lock_work); 3232ed95b21aSIlya Dryomov return 0; 3233ed95b21aSIlya Dryomov } 3234ed95b21aSIlya Dryomov 3235ed95b21aSIlya Dryomov static int __rbd_notify_op_lock(struct rbd_device *rbd_dev, 3236ed95b21aSIlya Dryomov enum rbd_notify_op notify_op, 3237ed95b21aSIlya Dryomov struct page ***preply_pages, 3238ed95b21aSIlya Dryomov size_t *preply_len) 3239ed95b21aSIlya Dryomov { 3240ed95b21aSIlya Dryomov struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 3241ed95b21aSIlya Dryomov struct rbd_client_id cid = 
rbd_get_cid(rbd_dev); 3242ed95b21aSIlya Dryomov int buf_size = 4 + 8 + 8 + CEPH_ENCODING_START_BLK_LEN; 3243ed95b21aSIlya Dryomov char buf[buf_size]; 3244ed95b21aSIlya Dryomov void *p = buf; 3245ed95b21aSIlya Dryomov 3246ed95b21aSIlya Dryomov dout("%s rbd_dev %p notify_op %d\n", __func__, rbd_dev, notify_op); 3247ed95b21aSIlya Dryomov 3248ed95b21aSIlya Dryomov /* encode *LockPayload NotifyMessage (op + ClientId) */ 3249ed95b21aSIlya Dryomov ceph_start_encoding(&p, 2, 1, buf_size - CEPH_ENCODING_START_BLK_LEN); 3250ed95b21aSIlya Dryomov ceph_encode_32(&p, notify_op); 3251ed95b21aSIlya Dryomov ceph_encode_64(&p, cid.gid); 3252ed95b21aSIlya Dryomov ceph_encode_64(&p, cid.handle); 3253ed95b21aSIlya Dryomov 3254ed95b21aSIlya Dryomov return ceph_osdc_notify(osdc, &rbd_dev->header_oid, 3255ed95b21aSIlya Dryomov &rbd_dev->header_oloc, buf, buf_size, 3256ed95b21aSIlya Dryomov RBD_NOTIFY_TIMEOUT, preply_pages, preply_len); 3257ed95b21aSIlya Dryomov } 3258ed95b21aSIlya Dryomov 3259ed95b21aSIlya Dryomov static void rbd_notify_op_lock(struct rbd_device *rbd_dev, 3260ed95b21aSIlya Dryomov enum rbd_notify_op notify_op) 3261ed95b21aSIlya Dryomov { 3262ed95b21aSIlya Dryomov struct page **reply_pages; 3263ed95b21aSIlya Dryomov size_t reply_len; 3264ed95b21aSIlya Dryomov 3265ed95b21aSIlya Dryomov __rbd_notify_op_lock(rbd_dev, notify_op, &reply_pages, &reply_len); 3266ed95b21aSIlya Dryomov ceph_release_page_vector(reply_pages, calc_pages_for(0, reply_len)); 3267ed95b21aSIlya Dryomov } 3268ed95b21aSIlya Dryomov 3269ed95b21aSIlya Dryomov static void rbd_notify_acquired_lock(struct work_struct *work) 3270ed95b21aSIlya Dryomov { 3271ed95b21aSIlya Dryomov struct rbd_device *rbd_dev = container_of(work, struct rbd_device, 3272ed95b21aSIlya Dryomov acquired_lock_work); 3273ed95b21aSIlya Dryomov 3274ed95b21aSIlya Dryomov rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_ACQUIRED_LOCK); 3275ed95b21aSIlya Dryomov } 3276ed95b21aSIlya Dryomov 3277ed95b21aSIlya Dryomov static void 
rbd_notify_released_lock(struct work_struct *work) 3278ed95b21aSIlya Dryomov { 3279ed95b21aSIlya Dryomov struct rbd_device *rbd_dev = container_of(work, struct rbd_device, 3280ed95b21aSIlya Dryomov released_lock_work); 3281ed95b21aSIlya Dryomov 3282ed95b21aSIlya Dryomov rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_RELEASED_LOCK); 3283ed95b21aSIlya Dryomov } 3284ed95b21aSIlya Dryomov 3285ed95b21aSIlya Dryomov static int rbd_request_lock(struct rbd_device *rbd_dev) 3286ed95b21aSIlya Dryomov { 3287ed95b21aSIlya Dryomov struct page **reply_pages; 3288ed95b21aSIlya Dryomov size_t reply_len; 3289ed95b21aSIlya Dryomov bool lock_owner_responded = false; 3290ed95b21aSIlya Dryomov int ret; 3291ed95b21aSIlya Dryomov 3292ed95b21aSIlya Dryomov dout("%s rbd_dev %p\n", __func__, rbd_dev); 3293ed95b21aSIlya Dryomov 3294ed95b21aSIlya Dryomov ret = __rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_REQUEST_LOCK, 3295ed95b21aSIlya Dryomov &reply_pages, &reply_len); 3296ed95b21aSIlya Dryomov if (ret && ret != -ETIMEDOUT) { 3297ed95b21aSIlya Dryomov rbd_warn(rbd_dev, "failed to request lock: %d", ret); 3298ed95b21aSIlya Dryomov goto out; 3299ed95b21aSIlya Dryomov } 3300ed95b21aSIlya Dryomov 3301ed95b21aSIlya Dryomov if (reply_len > 0 && reply_len <= PAGE_SIZE) { 3302ed95b21aSIlya Dryomov void *p = page_address(reply_pages[0]); 3303ed95b21aSIlya Dryomov void *const end = p + reply_len; 3304ed95b21aSIlya Dryomov u32 n; 3305ed95b21aSIlya Dryomov 3306ed95b21aSIlya Dryomov ceph_decode_32_safe(&p, end, n, e_inval); /* num_acks */ 3307ed95b21aSIlya Dryomov while (n--) { 3308ed95b21aSIlya Dryomov u8 struct_v; 3309ed95b21aSIlya Dryomov u32 len; 3310ed95b21aSIlya Dryomov 3311ed95b21aSIlya Dryomov ceph_decode_need(&p, end, 8 + 8, e_inval); 3312ed95b21aSIlya Dryomov p += 8 + 8; /* skip gid and cookie */ 3313ed95b21aSIlya Dryomov 3314ed95b21aSIlya Dryomov ceph_decode_32_safe(&p, end, len, e_inval); 3315ed95b21aSIlya Dryomov if (!len) 3316ed95b21aSIlya Dryomov continue; 3317ed95b21aSIlya Dryomov 
3318ed95b21aSIlya Dryomov if (lock_owner_responded) { 3319ed95b21aSIlya Dryomov rbd_warn(rbd_dev, 3320ed95b21aSIlya Dryomov "duplicate lock owners detected"); 3321ed95b21aSIlya Dryomov ret = -EIO; 3322ed95b21aSIlya Dryomov goto out; 3323ed95b21aSIlya Dryomov } 3324ed95b21aSIlya Dryomov 3325ed95b21aSIlya Dryomov lock_owner_responded = true; 3326ed95b21aSIlya Dryomov ret = ceph_start_decoding(&p, end, 1, "ResponseMessage", 3327ed95b21aSIlya Dryomov &struct_v, &len); 3328ed95b21aSIlya Dryomov if (ret) { 3329ed95b21aSIlya Dryomov rbd_warn(rbd_dev, 3330ed95b21aSIlya Dryomov "failed to decode ResponseMessage: %d", 3331ed95b21aSIlya Dryomov ret); 3332ed95b21aSIlya Dryomov goto e_inval; 3333ed95b21aSIlya Dryomov } 3334ed95b21aSIlya Dryomov 3335ed95b21aSIlya Dryomov ret = ceph_decode_32(&p); 3336ed95b21aSIlya Dryomov } 3337ed95b21aSIlya Dryomov } 3338ed95b21aSIlya Dryomov 3339ed95b21aSIlya Dryomov if (!lock_owner_responded) { 3340ed95b21aSIlya Dryomov rbd_warn(rbd_dev, "no lock owners detected"); 3341ed95b21aSIlya Dryomov ret = -ETIMEDOUT; 3342ed95b21aSIlya Dryomov } 3343ed95b21aSIlya Dryomov 3344ed95b21aSIlya Dryomov out: 3345ed95b21aSIlya Dryomov ceph_release_page_vector(reply_pages, calc_pages_for(0, reply_len)); 3346ed95b21aSIlya Dryomov return ret; 3347ed95b21aSIlya Dryomov 3348ed95b21aSIlya Dryomov e_inval: 3349ed95b21aSIlya Dryomov ret = -EINVAL; 3350ed95b21aSIlya Dryomov goto out; 3351ed95b21aSIlya Dryomov } 3352ed95b21aSIlya Dryomov 3353ed95b21aSIlya Dryomov static void wake_requests(struct rbd_device *rbd_dev, bool wake_all) 3354ed95b21aSIlya Dryomov { 3355ed95b21aSIlya Dryomov dout("%s rbd_dev %p wake_all %d\n", __func__, rbd_dev, wake_all); 3356ed95b21aSIlya Dryomov 3357ed95b21aSIlya Dryomov cancel_delayed_work(&rbd_dev->lock_dwork); 3358ed95b21aSIlya Dryomov if (wake_all) 3359ed95b21aSIlya Dryomov wake_up_all(&rbd_dev->lock_waitq); 3360ed95b21aSIlya Dryomov else 3361ed95b21aSIlya Dryomov wake_up(&rbd_dev->lock_waitq); 3362ed95b21aSIlya Dryomov } 
3363ed95b21aSIlya Dryomov 3364ed95b21aSIlya Dryomov static int get_lock_owner_info(struct rbd_device *rbd_dev, 3365ed95b21aSIlya Dryomov struct ceph_locker **lockers, u32 *num_lockers) 3366ed95b21aSIlya Dryomov { 3367ed95b21aSIlya Dryomov struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 3368ed95b21aSIlya Dryomov u8 lock_type; 3369ed95b21aSIlya Dryomov char *lock_tag; 3370ed95b21aSIlya Dryomov int ret; 3371ed95b21aSIlya Dryomov 3372ed95b21aSIlya Dryomov dout("%s rbd_dev %p\n", __func__, rbd_dev); 3373ed95b21aSIlya Dryomov 3374ed95b21aSIlya Dryomov ret = ceph_cls_lock_info(osdc, &rbd_dev->header_oid, 3375ed95b21aSIlya Dryomov &rbd_dev->header_oloc, RBD_LOCK_NAME, 3376ed95b21aSIlya Dryomov &lock_type, &lock_tag, lockers, num_lockers); 3377ed95b21aSIlya Dryomov if (ret) 3378ed95b21aSIlya Dryomov return ret; 3379ed95b21aSIlya Dryomov 3380ed95b21aSIlya Dryomov if (*num_lockers == 0) { 3381ed95b21aSIlya Dryomov dout("%s rbd_dev %p no lockers detected\n", __func__, rbd_dev); 3382ed95b21aSIlya Dryomov goto out; 3383ed95b21aSIlya Dryomov } 3384ed95b21aSIlya Dryomov 3385ed95b21aSIlya Dryomov if (strcmp(lock_tag, RBD_LOCK_TAG)) { 3386ed95b21aSIlya Dryomov rbd_warn(rbd_dev, "locked by external mechanism, tag %s", 3387ed95b21aSIlya Dryomov lock_tag); 3388ed95b21aSIlya Dryomov ret = -EBUSY; 3389ed95b21aSIlya Dryomov goto out; 3390ed95b21aSIlya Dryomov } 3391ed95b21aSIlya Dryomov 3392ed95b21aSIlya Dryomov if (lock_type == CEPH_CLS_LOCK_SHARED) { 3393ed95b21aSIlya Dryomov rbd_warn(rbd_dev, "shared lock type detected"); 3394ed95b21aSIlya Dryomov ret = -EBUSY; 3395ed95b21aSIlya Dryomov goto out; 3396ed95b21aSIlya Dryomov } 3397ed95b21aSIlya Dryomov 3398ed95b21aSIlya Dryomov if (strncmp((*lockers)[0].id.cookie, RBD_LOCK_COOKIE_PREFIX, 3399ed95b21aSIlya Dryomov strlen(RBD_LOCK_COOKIE_PREFIX))) { 3400ed95b21aSIlya Dryomov rbd_warn(rbd_dev, "locked by external mechanism, cookie %s", 3401ed95b21aSIlya Dryomov (*lockers)[0].id.cookie); 3402ed95b21aSIlya Dryomov ret = 
-EBUSY; 3403ed95b21aSIlya Dryomov goto out; 3404ed95b21aSIlya Dryomov } 3405ed95b21aSIlya Dryomov 3406ed95b21aSIlya Dryomov out: 3407ed95b21aSIlya Dryomov kfree(lock_tag); 3408ed95b21aSIlya Dryomov return ret; 3409ed95b21aSIlya Dryomov } 3410ed95b21aSIlya Dryomov 3411ed95b21aSIlya Dryomov static int find_watcher(struct rbd_device *rbd_dev, 3412ed95b21aSIlya Dryomov const struct ceph_locker *locker) 3413ed95b21aSIlya Dryomov { 3414ed95b21aSIlya Dryomov struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 3415ed95b21aSIlya Dryomov struct ceph_watch_item *watchers; 3416ed95b21aSIlya Dryomov u32 num_watchers; 3417ed95b21aSIlya Dryomov u64 cookie; 3418ed95b21aSIlya Dryomov int i; 3419ed95b21aSIlya Dryomov int ret; 3420ed95b21aSIlya Dryomov 3421ed95b21aSIlya Dryomov ret = ceph_osdc_list_watchers(osdc, &rbd_dev->header_oid, 3422ed95b21aSIlya Dryomov &rbd_dev->header_oloc, &watchers, 3423ed95b21aSIlya Dryomov &num_watchers); 3424ed95b21aSIlya Dryomov if (ret) 3425ed95b21aSIlya Dryomov return ret; 3426ed95b21aSIlya Dryomov 3427ed95b21aSIlya Dryomov sscanf(locker->id.cookie, RBD_LOCK_COOKIE_PREFIX " %llu", &cookie); 3428ed95b21aSIlya Dryomov for (i = 0; i < num_watchers; i++) { 3429ed95b21aSIlya Dryomov if (!memcmp(&watchers[i].addr, &locker->info.addr, 3430ed95b21aSIlya Dryomov sizeof(locker->info.addr)) && 3431ed95b21aSIlya Dryomov watchers[i].cookie == cookie) { 3432ed95b21aSIlya Dryomov struct rbd_client_id cid = { 3433ed95b21aSIlya Dryomov .gid = le64_to_cpu(watchers[i].name.num), 3434ed95b21aSIlya Dryomov .handle = cookie, 3435ed95b21aSIlya Dryomov }; 3436ed95b21aSIlya Dryomov 3437ed95b21aSIlya Dryomov dout("%s rbd_dev %p found cid %llu-%llu\n", __func__, 3438ed95b21aSIlya Dryomov rbd_dev, cid.gid, cid.handle); 3439ed95b21aSIlya Dryomov rbd_set_owner_cid(rbd_dev, &cid); 3440ed95b21aSIlya Dryomov ret = 1; 3441ed95b21aSIlya Dryomov goto out; 3442ed95b21aSIlya Dryomov } 3443ed95b21aSIlya Dryomov } 3444ed95b21aSIlya Dryomov 3445ed95b21aSIlya Dryomov dout("%s 
rbd_dev %p no watchers\n", __func__, rbd_dev); 3446ed95b21aSIlya Dryomov ret = 0; 3447ed95b21aSIlya Dryomov out: 3448ed95b21aSIlya Dryomov kfree(watchers); 3449ed95b21aSIlya Dryomov return ret; 3450ed95b21aSIlya Dryomov } 3451ed95b21aSIlya Dryomov 3452ed95b21aSIlya Dryomov /* 3453ed95b21aSIlya Dryomov * lock_rwsem must be held for write 3454ed95b21aSIlya Dryomov */ 3455ed95b21aSIlya Dryomov static int rbd_try_lock(struct rbd_device *rbd_dev) 3456ed95b21aSIlya Dryomov { 3457ed95b21aSIlya Dryomov struct ceph_client *client = rbd_dev->rbd_client->client; 3458ed95b21aSIlya Dryomov struct ceph_locker *lockers; 3459ed95b21aSIlya Dryomov u32 num_lockers; 3460ed95b21aSIlya Dryomov int ret; 3461ed95b21aSIlya Dryomov 3462ed95b21aSIlya Dryomov for (;;) { 3463ed95b21aSIlya Dryomov ret = rbd_lock(rbd_dev); 3464ed95b21aSIlya Dryomov if (ret != -EBUSY) 3465ed95b21aSIlya Dryomov return ret; 3466ed95b21aSIlya Dryomov 3467ed95b21aSIlya Dryomov /* determine if the current lock holder is still alive */ 3468ed95b21aSIlya Dryomov ret = get_lock_owner_info(rbd_dev, &lockers, &num_lockers); 3469ed95b21aSIlya Dryomov if (ret) 3470ed95b21aSIlya Dryomov return ret; 3471ed95b21aSIlya Dryomov 3472ed95b21aSIlya Dryomov if (num_lockers == 0) 3473ed95b21aSIlya Dryomov goto again; 3474ed95b21aSIlya Dryomov 3475ed95b21aSIlya Dryomov ret = find_watcher(rbd_dev, lockers); 3476ed95b21aSIlya Dryomov if (ret) { 3477ed95b21aSIlya Dryomov if (ret > 0) 3478ed95b21aSIlya Dryomov ret = 0; /* have to request lock */ 3479ed95b21aSIlya Dryomov goto out; 3480ed95b21aSIlya Dryomov } 3481ed95b21aSIlya Dryomov 3482ed95b21aSIlya Dryomov rbd_warn(rbd_dev, "%s%llu seems dead, breaking lock", 3483ed95b21aSIlya Dryomov ENTITY_NAME(lockers[0].id.name)); 3484ed95b21aSIlya Dryomov 3485ed95b21aSIlya Dryomov ret = ceph_monc_blacklist_add(&client->monc, 3486ed95b21aSIlya Dryomov &lockers[0].info.addr); 3487ed95b21aSIlya Dryomov if (ret) { 3488ed95b21aSIlya Dryomov rbd_warn(rbd_dev, "blacklist of %s%llu failed: %d", 
3489ed95b21aSIlya Dryomov ENTITY_NAME(lockers[0].id.name), ret); 3490ed95b21aSIlya Dryomov goto out; 3491ed95b21aSIlya Dryomov } 3492ed95b21aSIlya Dryomov 3493ed95b21aSIlya Dryomov ret = ceph_cls_break_lock(&client->osdc, &rbd_dev->header_oid, 3494ed95b21aSIlya Dryomov &rbd_dev->header_oloc, RBD_LOCK_NAME, 3495ed95b21aSIlya Dryomov lockers[0].id.cookie, 3496ed95b21aSIlya Dryomov &lockers[0].id.name); 3497ed95b21aSIlya Dryomov if (ret && ret != -ENOENT) 3498ed95b21aSIlya Dryomov goto out; 3499ed95b21aSIlya Dryomov 3500ed95b21aSIlya Dryomov again: 3501ed95b21aSIlya Dryomov ceph_free_lockers(lockers, num_lockers); 3502ed95b21aSIlya Dryomov } 3503ed95b21aSIlya Dryomov 3504ed95b21aSIlya Dryomov out: 3505ed95b21aSIlya Dryomov ceph_free_lockers(lockers, num_lockers); 3506ed95b21aSIlya Dryomov return ret; 3507ed95b21aSIlya Dryomov } 3508ed95b21aSIlya Dryomov 3509ed95b21aSIlya Dryomov /* 3510ed95b21aSIlya Dryomov * ret is set only if lock_state is RBD_LOCK_STATE_UNLOCKED 3511ed95b21aSIlya Dryomov */ 3512ed95b21aSIlya Dryomov static enum rbd_lock_state rbd_try_acquire_lock(struct rbd_device *rbd_dev, 3513ed95b21aSIlya Dryomov int *pret) 3514ed95b21aSIlya Dryomov { 3515ed95b21aSIlya Dryomov enum rbd_lock_state lock_state; 3516ed95b21aSIlya Dryomov 3517ed95b21aSIlya Dryomov down_read(&rbd_dev->lock_rwsem); 3518ed95b21aSIlya Dryomov dout("%s rbd_dev %p read lock_state %d\n", __func__, rbd_dev, 3519ed95b21aSIlya Dryomov rbd_dev->lock_state); 3520ed95b21aSIlya Dryomov if (__rbd_is_lock_owner(rbd_dev)) { 3521ed95b21aSIlya Dryomov lock_state = rbd_dev->lock_state; 3522ed95b21aSIlya Dryomov up_read(&rbd_dev->lock_rwsem); 3523ed95b21aSIlya Dryomov return lock_state; 3524ed95b21aSIlya Dryomov } 3525ed95b21aSIlya Dryomov 3526ed95b21aSIlya Dryomov up_read(&rbd_dev->lock_rwsem); 3527ed95b21aSIlya Dryomov down_write(&rbd_dev->lock_rwsem); 3528ed95b21aSIlya Dryomov dout("%s rbd_dev %p write lock_state %d\n", __func__, rbd_dev, 3529ed95b21aSIlya Dryomov rbd_dev->lock_state); 
3530ed95b21aSIlya Dryomov if (!__rbd_is_lock_owner(rbd_dev)) { 3531ed95b21aSIlya Dryomov *pret = rbd_try_lock(rbd_dev); 3532ed95b21aSIlya Dryomov if (*pret) 3533ed95b21aSIlya Dryomov rbd_warn(rbd_dev, "failed to acquire lock: %d", *pret); 3534ed95b21aSIlya Dryomov } 3535ed95b21aSIlya Dryomov 3536ed95b21aSIlya Dryomov lock_state = rbd_dev->lock_state; 3537ed95b21aSIlya Dryomov up_write(&rbd_dev->lock_rwsem); 3538ed95b21aSIlya Dryomov return lock_state; 3539ed95b21aSIlya Dryomov } 3540ed95b21aSIlya Dryomov 3541ed95b21aSIlya Dryomov static void rbd_acquire_lock(struct work_struct *work) 3542ed95b21aSIlya Dryomov { 3543ed95b21aSIlya Dryomov struct rbd_device *rbd_dev = container_of(to_delayed_work(work), 3544ed95b21aSIlya Dryomov struct rbd_device, lock_dwork); 3545ed95b21aSIlya Dryomov enum rbd_lock_state lock_state; 3546ed95b21aSIlya Dryomov int ret; 3547ed95b21aSIlya Dryomov 3548ed95b21aSIlya Dryomov dout("%s rbd_dev %p\n", __func__, rbd_dev); 3549ed95b21aSIlya Dryomov again: 3550ed95b21aSIlya Dryomov lock_state = rbd_try_acquire_lock(rbd_dev, &ret); 3551ed95b21aSIlya Dryomov if (lock_state != RBD_LOCK_STATE_UNLOCKED || ret == -EBLACKLISTED) { 3552ed95b21aSIlya Dryomov if (lock_state == RBD_LOCK_STATE_LOCKED) 3553ed95b21aSIlya Dryomov wake_requests(rbd_dev, true); 3554ed95b21aSIlya Dryomov dout("%s rbd_dev %p lock_state %d ret %d - done\n", __func__, 3555ed95b21aSIlya Dryomov rbd_dev, lock_state, ret); 3556ed95b21aSIlya Dryomov return; 3557ed95b21aSIlya Dryomov } 3558ed95b21aSIlya Dryomov 3559ed95b21aSIlya Dryomov ret = rbd_request_lock(rbd_dev); 3560ed95b21aSIlya Dryomov if (ret == -ETIMEDOUT) { 3561ed95b21aSIlya Dryomov goto again; /* treat this as a dead client */ 3562ed95b21aSIlya Dryomov } else if (ret < 0) { 3563ed95b21aSIlya Dryomov rbd_warn(rbd_dev, "error requesting lock: %d", ret); 3564ed95b21aSIlya Dryomov mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 3565ed95b21aSIlya Dryomov RBD_RETRY_DELAY); 3566ed95b21aSIlya Dryomov } else { 
3567ed95b21aSIlya Dryomov /* 3568ed95b21aSIlya Dryomov * lock owner acked, but resend if we don't see them 3569ed95b21aSIlya Dryomov * release the lock 3570ed95b21aSIlya Dryomov */ 3571ed95b21aSIlya Dryomov dout("%s rbd_dev %p requeueing lock_dwork\n", __func__, 3572ed95b21aSIlya Dryomov rbd_dev); 3573ed95b21aSIlya Dryomov mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 3574ed95b21aSIlya Dryomov msecs_to_jiffies(2 * RBD_NOTIFY_TIMEOUT * MSEC_PER_SEC)); 3575ed95b21aSIlya Dryomov } 3576ed95b21aSIlya Dryomov } 3577ed95b21aSIlya Dryomov 3578ed95b21aSIlya Dryomov /* 3579ed95b21aSIlya Dryomov * lock_rwsem must be held for write 3580ed95b21aSIlya Dryomov */ 3581ed95b21aSIlya Dryomov static bool rbd_release_lock(struct rbd_device *rbd_dev) 3582ed95b21aSIlya Dryomov { 3583ed95b21aSIlya Dryomov dout("%s rbd_dev %p read lock_state %d\n", __func__, rbd_dev, 3584ed95b21aSIlya Dryomov rbd_dev->lock_state); 3585ed95b21aSIlya Dryomov if (rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED) 3586ed95b21aSIlya Dryomov return false; 3587ed95b21aSIlya Dryomov 3588ed95b21aSIlya Dryomov rbd_dev->lock_state = RBD_LOCK_STATE_RELEASING; 3589ed95b21aSIlya Dryomov downgrade_write(&rbd_dev->lock_rwsem); 3590ed95b21aSIlya Dryomov /* 3591ed95b21aSIlya Dryomov * Ensure that all in-flight IO is flushed. 3592ed95b21aSIlya Dryomov * 3593ed95b21aSIlya Dryomov * FIXME: ceph_osdc_sync() flushes the entire OSD client, which 3594ed95b21aSIlya Dryomov * may be shared with other devices. 
3595ed95b21aSIlya Dryomov */ 3596ed95b21aSIlya Dryomov ceph_osdc_sync(&rbd_dev->rbd_client->client->osdc); 3597ed95b21aSIlya Dryomov up_read(&rbd_dev->lock_rwsem); 3598ed95b21aSIlya Dryomov 3599ed95b21aSIlya Dryomov down_write(&rbd_dev->lock_rwsem); 3600ed95b21aSIlya Dryomov dout("%s rbd_dev %p write lock_state %d\n", __func__, rbd_dev, 3601ed95b21aSIlya Dryomov rbd_dev->lock_state); 3602ed95b21aSIlya Dryomov if (rbd_dev->lock_state != RBD_LOCK_STATE_RELEASING) 3603ed95b21aSIlya Dryomov return false; 3604ed95b21aSIlya Dryomov 3605ed95b21aSIlya Dryomov if (!rbd_unlock(rbd_dev)) 3606ed95b21aSIlya Dryomov /* 3607ed95b21aSIlya Dryomov * Give others a chance to grab the lock - we would re-acquire 3608ed95b21aSIlya Dryomov * almost immediately if we got new IO during ceph_osdc_sync() 3609ed95b21aSIlya Dryomov * otherwise. We need to ack our own notifications, so this 3610ed95b21aSIlya Dryomov * lock_dwork will be requeued from rbd_wait_state_locked() 3611ed95b21aSIlya Dryomov * after wake_requests() in rbd_handle_released_lock(). 
3612ed95b21aSIlya Dryomov */ 3613ed95b21aSIlya Dryomov cancel_delayed_work(&rbd_dev->lock_dwork); 3614ed95b21aSIlya Dryomov 3615ed95b21aSIlya Dryomov return true; 3616ed95b21aSIlya Dryomov } 3617ed95b21aSIlya Dryomov 3618ed95b21aSIlya Dryomov static void rbd_release_lock_work(struct work_struct *work) 3619ed95b21aSIlya Dryomov { 3620ed95b21aSIlya Dryomov struct rbd_device *rbd_dev = container_of(work, struct rbd_device, 3621ed95b21aSIlya Dryomov unlock_work); 3622ed95b21aSIlya Dryomov 3623ed95b21aSIlya Dryomov down_write(&rbd_dev->lock_rwsem); 3624ed95b21aSIlya Dryomov rbd_release_lock(rbd_dev); 3625ed95b21aSIlya Dryomov up_write(&rbd_dev->lock_rwsem); 3626ed95b21aSIlya Dryomov } 3627ed95b21aSIlya Dryomov 3628ed95b21aSIlya Dryomov static void rbd_handle_acquired_lock(struct rbd_device *rbd_dev, u8 struct_v, 3629ed95b21aSIlya Dryomov void **p) 3630ed95b21aSIlya Dryomov { 3631ed95b21aSIlya Dryomov struct rbd_client_id cid = { 0 }; 3632ed95b21aSIlya Dryomov 3633ed95b21aSIlya Dryomov if (struct_v >= 2) { 3634ed95b21aSIlya Dryomov cid.gid = ceph_decode_64(p); 3635ed95b21aSIlya Dryomov cid.handle = ceph_decode_64(p); 3636ed95b21aSIlya Dryomov } 3637ed95b21aSIlya Dryomov 3638ed95b21aSIlya Dryomov dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid, 3639ed95b21aSIlya Dryomov cid.handle); 3640ed95b21aSIlya Dryomov if (!rbd_cid_equal(&cid, &rbd_empty_cid)) { 3641ed95b21aSIlya Dryomov down_write(&rbd_dev->lock_rwsem); 3642ed95b21aSIlya Dryomov if (rbd_cid_equal(&cid, &rbd_dev->owner_cid)) { 3643ed95b21aSIlya Dryomov /* 3644ed95b21aSIlya Dryomov * we already know that the remote client is 3645ed95b21aSIlya Dryomov * the owner 3646ed95b21aSIlya Dryomov */ 3647ed95b21aSIlya Dryomov up_write(&rbd_dev->lock_rwsem); 3648ed95b21aSIlya Dryomov return; 3649ed95b21aSIlya Dryomov } 3650ed95b21aSIlya Dryomov 3651ed95b21aSIlya Dryomov rbd_set_owner_cid(rbd_dev, &cid); 3652ed95b21aSIlya Dryomov downgrade_write(&rbd_dev->lock_rwsem); 3653ed95b21aSIlya Dryomov } else { 
3654ed95b21aSIlya Dryomov down_read(&rbd_dev->lock_rwsem); 3655ed95b21aSIlya Dryomov } 3656ed95b21aSIlya Dryomov 3657ed95b21aSIlya Dryomov if (!__rbd_is_lock_owner(rbd_dev)) 3658ed95b21aSIlya Dryomov wake_requests(rbd_dev, false); 3659ed95b21aSIlya Dryomov up_read(&rbd_dev->lock_rwsem); 3660ed95b21aSIlya Dryomov } 3661ed95b21aSIlya Dryomov 3662ed95b21aSIlya Dryomov static void rbd_handle_released_lock(struct rbd_device *rbd_dev, u8 struct_v, 3663ed95b21aSIlya Dryomov void **p) 3664ed95b21aSIlya Dryomov { 3665ed95b21aSIlya Dryomov struct rbd_client_id cid = { 0 }; 3666ed95b21aSIlya Dryomov 3667ed95b21aSIlya Dryomov if (struct_v >= 2) { 3668ed95b21aSIlya Dryomov cid.gid = ceph_decode_64(p); 3669ed95b21aSIlya Dryomov cid.handle = ceph_decode_64(p); 3670ed95b21aSIlya Dryomov } 3671ed95b21aSIlya Dryomov 3672ed95b21aSIlya Dryomov dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid, 3673ed95b21aSIlya Dryomov cid.handle); 3674ed95b21aSIlya Dryomov if (!rbd_cid_equal(&cid, &rbd_empty_cid)) { 3675ed95b21aSIlya Dryomov down_write(&rbd_dev->lock_rwsem); 3676ed95b21aSIlya Dryomov if (!rbd_cid_equal(&cid, &rbd_dev->owner_cid)) { 3677ed95b21aSIlya Dryomov dout("%s rbd_dev %p unexpected owner, cid %llu-%llu != owner_cid %llu-%llu\n", 3678ed95b21aSIlya Dryomov __func__, rbd_dev, cid.gid, cid.handle, 3679ed95b21aSIlya Dryomov rbd_dev->owner_cid.gid, rbd_dev->owner_cid.handle); 3680ed95b21aSIlya Dryomov up_write(&rbd_dev->lock_rwsem); 3681ed95b21aSIlya Dryomov return; 3682ed95b21aSIlya Dryomov } 3683ed95b21aSIlya Dryomov 3684ed95b21aSIlya Dryomov rbd_set_owner_cid(rbd_dev, &rbd_empty_cid); 3685ed95b21aSIlya Dryomov downgrade_write(&rbd_dev->lock_rwsem); 3686ed95b21aSIlya Dryomov } else { 3687ed95b21aSIlya Dryomov down_read(&rbd_dev->lock_rwsem); 3688ed95b21aSIlya Dryomov } 3689ed95b21aSIlya Dryomov 3690ed95b21aSIlya Dryomov if (!__rbd_is_lock_owner(rbd_dev)) 3691ed95b21aSIlya Dryomov wake_requests(rbd_dev, false); 3692ed95b21aSIlya Dryomov up_read(&rbd_dev->lock_rwsem); 
3693ed95b21aSIlya Dryomov } 3694ed95b21aSIlya Dryomov 3695ed95b21aSIlya Dryomov static bool rbd_handle_request_lock(struct rbd_device *rbd_dev, u8 struct_v, 3696ed95b21aSIlya Dryomov void **p) 3697ed95b21aSIlya Dryomov { 3698ed95b21aSIlya Dryomov struct rbd_client_id my_cid = rbd_get_cid(rbd_dev); 3699ed95b21aSIlya Dryomov struct rbd_client_id cid = { 0 }; 3700ed95b21aSIlya Dryomov bool need_to_send; 3701ed95b21aSIlya Dryomov 3702ed95b21aSIlya Dryomov if (struct_v >= 2) { 3703ed95b21aSIlya Dryomov cid.gid = ceph_decode_64(p); 3704ed95b21aSIlya Dryomov cid.handle = ceph_decode_64(p); 3705ed95b21aSIlya Dryomov } 3706ed95b21aSIlya Dryomov 3707ed95b21aSIlya Dryomov dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid, 3708ed95b21aSIlya Dryomov cid.handle); 3709ed95b21aSIlya Dryomov if (rbd_cid_equal(&cid, &my_cid)) 3710ed95b21aSIlya Dryomov return false; 3711ed95b21aSIlya Dryomov 3712ed95b21aSIlya Dryomov down_read(&rbd_dev->lock_rwsem); 3713ed95b21aSIlya Dryomov need_to_send = __rbd_is_lock_owner(rbd_dev); 3714ed95b21aSIlya Dryomov if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED) { 3715ed95b21aSIlya Dryomov if (!rbd_cid_equal(&rbd_dev->owner_cid, &rbd_empty_cid)) { 3716ed95b21aSIlya Dryomov dout("%s rbd_dev %p queueing unlock_work\n", __func__, 3717ed95b21aSIlya Dryomov rbd_dev); 3718ed95b21aSIlya Dryomov queue_work(rbd_dev->task_wq, &rbd_dev->unlock_work); 3719ed95b21aSIlya Dryomov } 3720ed95b21aSIlya Dryomov } 3721ed95b21aSIlya Dryomov up_read(&rbd_dev->lock_rwsem); 3722ed95b21aSIlya Dryomov return need_to_send; 3723ed95b21aSIlya Dryomov } 3724ed95b21aSIlya Dryomov 3725ed95b21aSIlya Dryomov static void __rbd_acknowledge_notify(struct rbd_device *rbd_dev, 3726ed95b21aSIlya Dryomov u64 notify_id, u64 cookie, s32 *result) 3727ed95b21aSIlya Dryomov { 3728ed95b21aSIlya Dryomov struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 3729ed95b21aSIlya Dryomov int buf_size = 4 + CEPH_ENCODING_START_BLK_LEN; 3730ed95b21aSIlya Dryomov char 
buf[buf_size]; 3731ed95b21aSIlya Dryomov int ret; 3732ed95b21aSIlya Dryomov 3733ed95b21aSIlya Dryomov if (result) { 3734ed95b21aSIlya Dryomov void *p = buf; 3735ed95b21aSIlya Dryomov 3736ed95b21aSIlya Dryomov /* encode ResponseMessage */ 3737ed95b21aSIlya Dryomov ceph_start_encoding(&p, 1, 1, 3738ed95b21aSIlya Dryomov buf_size - CEPH_ENCODING_START_BLK_LEN); 3739ed95b21aSIlya Dryomov ceph_encode_32(&p, *result); 3740ed95b21aSIlya Dryomov } else { 3741ed95b21aSIlya Dryomov buf_size = 0; 3742ed95b21aSIlya Dryomov } 3743ed95b21aSIlya Dryomov 3744ed95b21aSIlya Dryomov ret = ceph_osdc_notify_ack(osdc, &rbd_dev->header_oid, 3745ed95b21aSIlya Dryomov &rbd_dev->header_oloc, notify_id, cookie, 3746ed95b21aSIlya Dryomov buf, buf_size); 3747ed95b21aSIlya Dryomov if (ret) 3748ed95b21aSIlya Dryomov rbd_warn(rbd_dev, "acknowledge_notify failed: %d", ret); 3749ed95b21aSIlya Dryomov } 3750ed95b21aSIlya Dryomov 3751ed95b21aSIlya Dryomov static void rbd_acknowledge_notify(struct rbd_device *rbd_dev, u64 notify_id, 3752ed95b21aSIlya Dryomov u64 cookie) 3753ed95b21aSIlya Dryomov { 3754ed95b21aSIlya Dryomov dout("%s rbd_dev %p\n", __func__, rbd_dev); 3755ed95b21aSIlya Dryomov __rbd_acknowledge_notify(rbd_dev, notify_id, cookie, NULL); 3756ed95b21aSIlya Dryomov } 3757ed95b21aSIlya Dryomov 3758ed95b21aSIlya Dryomov static void rbd_acknowledge_notify_result(struct rbd_device *rbd_dev, 3759ed95b21aSIlya Dryomov u64 notify_id, u64 cookie, s32 result) 3760ed95b21aSIlya Dryomov { 3761ed95b21aSIlya Dryomov dout("%s rbd_dev %p result %d\n", __func__, rbd_dev, result); 3762ed95b21aSIlya Dryomov __rbd_acknowledge_notify(rbd_dev, notify_id, cookie, &result); 3763ed95b21aSIlya Dryomov } 3764ed95b21aSIlya Dryomov 3765922dab61SIlya Dryomov static void rbd_watch_cb(void *arg, u64 notify_id, u64 cookie, 3766922dab61SIlya Dryomov u64 notifier_id, void *data, size_t data_len) 3767b8d70035SAlex Elder { 3768922dab61SIlya Dryomov struct rbd_device *rbd_dev = arg; 3769ed95b21aSIlya Dryomov void *p = data; 
3770ed95b21aSIlya Dryomov void *const end = p + data_len; 3771ed95b21aSIlya Dryomov u8 struct_v; 3772ed95b21aSIlya Dryomov u32 len; 3773ed95b21aSIlya Dryomov u32 notify_op; 3774b8d70035SAlex Elder int ret; 3775b8d70035SAlex Elder 3776ed95b21aSIlya Dryomov dout("%s rbd_dev %p cookie %llu notify_id %llu data_len %zu\n", 3777ed95b21aSIlya Dryomov __func__, rbd_dev, cookie, notify_id, data_len); 3778ed95b21aSIlya Dryomov if (data_len) { 3779ed95b21aSIlya Dryomov ret = ceph_start_decoding(&p, end, 1, "NotifyMessage", 3780ed95b21aSIlya Dryomov &struct_v, &len); 3781ed95b21aSIlya Dryomov if (ret) { 3782ed95b21aSIlya Dryomov rbd_warn(rbd_dev, "failed to decode NotifyMessage: %d", 3783ed95b21aSIlya Dryomov ret); 3784ed95b21aSIlya Dryomov return; 3785ed95b21aSIlya Dryomov } 378652bb1f9bSIlya Dryomov 3787ed95b21aSIlya Dryomov notify_op = ceph_decode_32(&p); 3788ed95b21aSIlya Dryomov } else { 3789ed95b21aSIlya Dryomov /* legacy notification for header updates */ 3790ed95b21aSIlya Dryomov notify_op = RBD_NOTIFY_OP_HEADER_UPDATE; 3791ed95b21aSIlya Dryomov len = 0; 3792ed95b21aSIlya Dryomov } 3793ed95b21aSIlya Dryomov 3794ed95b21aSIlya Dryomov dout("%s rbd_dev %p notify_op %u\n", __func__, rbd_dev, notify_op); 3795ed95b21aSIlya Dryomov switch (notify_op) { 3796ed95b21aSIlya Dryomov case RBD_NOTIFY_OP_ACQUIRED_LOCK: 3797ed95b21aSIlya Dryomov rbd_handle_acquired_lock(rbd_dev, struct_v, &p); 3798ed95b21aSIlya Dryomov rbd_acknowledge_notify(rbd_dev, notify_id, cookie); 3799ed95b21aSIlya Dryomov break; 3800ed95b21aSIlya Dryomov case RBD_NOTIFY_OP_RELEASED_LOCK: 3801ed95b21aSIlya Dryomov rbd_handle_released_lock(rbd_dev, struct_v, &p); 3802ed95b21aSIlya Dryomov rbd_acknowledge_notify(rbd_dev, notify_id, cookie); 3803ed95b21aSIlya Dryomov break; 3804ed95b21aSIlya Dryomov case RBD_NOTIFY_OP_REQUEST_LOCK: 3805ed95b21aSIlya Dryomov if (rbd_handle_request_lock(rbd_dev, struct_v, &p)) 380652bb1f9bSIlya Dryomov /* 3807ed95b21aSIlya Dryomov * send ResponseMessage(0) back so the client 
3808ed95b21aSIlya Dryomov * can detect a missing owner 380952bb1f9bSIlya Dryomov */ 3810ed95b21aSIlya Dryomov rbd_acknowledge_notify_result(rbd_dev, notify_id, 3811ed95b21aSIlya Dryomov cookie, 0); 3812ed95b21aSIlya Dryomov else 3813ed95b21aSIlya Dryomov rbd_acknowledge_notify(rbd_dev, notify_id, cookie); 3814ed95b21aSIlya Dryomov break; 3815ed95b21aSIlya Dryomov case RBD_NOTIFY_OP_HEADER_UPDATE: 3816e627db08SAlex Elder ret = rbd_dev_refresh(rbd_dev); 3817e627db08SAlex Elder if (ret) 38189584d508SIlya Dryomov rbd_warn(rbd_dev, "refresh failed: %d", ret); 3819b8d70035SAlex Elder 3820ed95b21aSIlya Dryomov rbd_acknowledge_notify(rbd_dev, notify_id, cookie); 3821ed95b21aSIlya Dryomov break; 3822ed95b21aSIlya Dryomov default: 3823ed95b21aSIlya Dryomov if (rbd_is_lock_owner(rbd_dev)) 3824ed95b21aSIlya Dryomov rbd_acknowledge_notify_result(rbd_dev, notify_id, 3825ed95b21aSIlya Dryomov cookie, -EOPNOTSUPP); 3826ed95b21aSIlya Dryomov else 3827ed95b21aSIlya Dryomov rbd_acknowledge_notify(rbd_dev, notify_id, cookie); 3828ed95b21aSIlya Dryomov break; 3829ed95b21aSIlya Dryomov } 3830b8d70035SAlex Elder } 3831b8d70035SAlex Elder 383299d16943SIlya Dryomov static void __rbd_unregister_watch(struct rbd_device *rbd_dev); 383399d16943SIlya Dryomov 3834922dab61SIlya Dryomov static void rbd_watch_errcb(void *arg, u64 cookie, int err) 3835bb040aa0SIlya Dryomov { 3836922dab61SIlya Dryomov struct rbd_device *rbd_dev = arg; 3837bb040aa0SIlya Dryomov 3838922dab61SIlya Dryomov rbd_warn(rbd_dev, "encountered watch error: %d", err); 3839bb040aa0SIlya Dryomov 3840ed95b21aSIlya Dryomov down_write(&rbd_dev->lock_rwsem); 3841ed95b21aSIlya Dryomov rbd_set_owner_cid(rbd_dev, &rbd_empty_cid); 3842ed95b21aSIlya Dryomov up_write(&rbd_dev->lock_rwsem); 3843ed95b21aSIlya Dryomov 384499d16943SIlya Dryomov mutex_lock(&rbd_dev->watch_mutex); 384599d16943SIlya Dryomov if (rbd_dev->watch_state == RBD_WATCH_STATE_REGISTERED) { 384699d16943SIlya Dryomov __rbd_unregister_watch(rbd_dev); 384799d16943SIlya Dryomov 
rbd_dev->watch_state = RBD_WATCH_STATE_ERROR; 3848bb040aa0SIlya Dryomov 384999d16943SIlya Dryomov queue_delayed_work(rbd_dev->task_wq, &rbd_dev->watch_dwork, 0); 3850bb040aa0SIlya Dryomov } 385199d16943SIlya Dryomov mutex_unlock(&rbd_dev->watch_mutex); 3852bb040aa0SIlya Dryomov } 3853bb040aa0SIlya Dryomov 3854bb040aa0SIlya Dryomov /* 385599d16943SIlya Dryomov * watch_mutex must be locked 38569969ebc5SAlex Elder */ 385799d16943SIlya Dryomov static int __rbd_register_watch(struct rbd_device *rbd_dev) 38589969ebc5SAlex Elder { 38599969ebc5SAlex Elder struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 3860922dab61SIlya Dryomov struct ceph_osd_linger_request *handle; 38619969ebc5SAlex Elder 3862922dab61SIlya Dryomov rbd_assert(!rbd_dev->watch_handle); 386399d16943SIlya Dryomov dout("%s rbd_dev %p\n", __func__, rbd_dev); 38649969ebc5SAlex Elder 3865922dab61SIlya Dryomov handle = ceph_osdc_watch(osdc, &rbd_dev->header_oid, 3866922dab61SIlya Dryomov &rbd_dev->header_oloc, rbd_watch_cb, 3867922dab61SIlya Dryomov rbd_watch_errcb, rbd_dev); 3868922dab61SIlya Dryomov if (IS_ERR(handle)) 3869922dab61SIlya Dryomov return PTR_ERR(handle); 38709969ebc5SAlex Elder 3871922dab61SIlya Dryomov rbd_dev->watch_handle = handle; 38728eb87565SAlex Elder return 0; 38739969ebc5SAlex Elder } 38749969ebc5SAlex Elder 387599d16943SIlya Dryomov /* 387699d16943SIlya Dryomov * watch_mutex must be locked 387799d16943SIlya Dryomov */ 387899d16943SIlya Dryomov static void __rbd_unregister_watch(struct rbd_device *rbd_dev) 3879fca27065SIlya Dryomov { 3880922dab61SIlya Dryomov struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 3881922dab61SIlya Dryomov int ret; 3882b30a01f2SIlya Dryomov 388399d16943SIlya Dryomov rbd_assert(rbd_dev->watch_handle); 388499d16943SIlya Dryomov dout("%s rbd_dev %p\n", __func__, rbd_dev); 3885b30a01f2SIlya Dryomov 3886922dab61SIlya Dryomov ret = ceph_osdc_unwatch(osdc, rbd_dev->watch_handle); 3887922dab61SIlya Dryomov if (ret) 3888922dab61SIlya 
Dryomov rbd_warn(rbd_dev, "failed to unwatch: %d", ret); 3889b30a01f2SIlya Dryomov 3890922dab61SIlya Dryomov rbd_dev->watch_handle = NULL; 3891c525f036SIlya Dryomov } 3892c525f036SIlya Dryomov 389399d16943SIlya Dryomov static int rbd_register_watch(struct rbd_device *rbd_dev) 3894c525f036SIlya Dryomov { 389599d16943SIlya Dryomov int ret; 3896811c6688SIlya Dryomov 389799d16943SIlya Dryomov mutex_lock(&rbd_dev->watch_mutex); 389899d16943SIlya Dryomov rbd_assert(rbd_dev->watch_state == RBD_WATCH_STATE_UNREGISTERED); 389999d16943SIlya Dryomov ret = __rbd_register_watch(rbd_dev); 390099d16943SIlya Dryomov if (ret) 390199d16943SIlya Dryomov goto out; 390299d16943SIlya Dryomov 390399d16943SIlya Dryomov rbd_dev->watch_state = RBD_WATCH_STATE_REGISTERED; 390499d16943SIlya Dryomov rbd_dev->watch_cookie = rbd_dev->watch_handle->linger_id; 390599d16943SIlya Dryomov 390699d16943SIlya Dryomov out: 390799d16943SIlya Dryomov mutex_unlock(&rbd_dev->watch_mutex); 390899d16943SIlya Dryomov return ret; 390999d16943SIlya Dryomov } 391099d16943SIlya Dryomov 391199d16943SIlya Dryomov static void cancel_tasks_sync(struct rbd_device *rbd_dev) 391299d16943SIlya Dryomov { 391399d16943SIlya Dryomov dout("%s rbd_dev %p\n", __func__, rbd_dev); 391499d16943SIlya Dryomov 391599d16943SIlya Dryomov cancel_delayed_work_sync(&rbd_dev->watch_dwork); 3916ed95b21aSIlya Dryomov cancel_work_sync(&rbd_dev->acquired_lock_work); 3917ed95b21aSIlya Dryomov cancel_work_sync(&rbd_dev->released_lock_work); 3918ed95b21aSIlya Dryomov cancel_delayed_work_sync(&rbd_dev->lock_dwork); 3919ed95b21aSIlya Dryomov cancel_work_sync(&rbd_dev->unlock_work); 392099d16943SIlya Dryomov } 392199d16943SIlya Dryomov 392299d16943SIlya Dryomov static void rbd_unregister_watch(struct rbd_device *rbd_dev) 392399d16943SIlya Dryomov { 3924ed95b21aSIlya Dryomov WARN_ON(waitqueue_active(&rbd_dev->lock_waitq)); 392599d16943SIlya Dryomov cancel_tasks_sync(rbd_dev); 392699d16943SIlya Dryomov 392799d16943SIlya Dryomov 
mutex_lock(&rbd_dev->watch_mutex); 392899d16943SIlya Dryomov if (rbd_dev->watch_state == RBD_WATCH_STATE_REGISTERED) 392999d16943SIlya Dryomov __rbd_unregister_watch(rbd_dev); 393099d16943SIlya Dryomov rbd_dev->watch_state = RBD_WATCH_STATE_UNREGISTERED; 393199d16943SIlya Dryomov mutex_unlock(&rbd_dev->watch_mutex); 393299d16943SIlya Dryomov 3933811c6688SIlya Dryomov ceph_osdc_flush_notifies(&rbd_dev->rbd_client->client->osdc); 3934fca27065SIlya Dryomov } 3935fca27065SIlya Dryomov 393699d16943SIlya Dryomov static void rbd_reregister_watch(struct work_struct *work) 393799d16943SIlya Dryomov { 393899d16943SIlya Dryomov struct rbd_device *rbd_dev = container_of(to_delayed_work(work), 393999d16943SIlya Dryomov struct rbd_device, watch_dwork); 3940ed95b21aSIlya Dryomov bool was_lock_owner = false; 394199d16943SIlya Dryomov int ret; 394299d16943SIlya Dryomov 394399d16943SIlya Dryomov dout("%s rbd_dev %p\n", __func__, rbd_dev); 394499d16943SIlya Dryomov 3945ed95b21aSIlya Dryomov down_write(&rbd_dev->lock_rwsem); 3946ed95b21aSIlya Dryomov if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED) 3947ed95b21aSIlya Dryomov was_lock_owner = rbd_release_lock(rbd_dev); 3948ed95b21aSIlya Dryomov 394999d16943SIlya Dryomov mutex_lock(&rbd_dev->watch_mutex); 395099d16943SIlya Dryomov if (rbd_dev->watch_state != RBD_WATCH_STATE_ERROR) 395199d16943SIlya Dryomov goto fail_unlock; 395299d16943SIlya Dryomov 395399d16943SIlya Dryomov ret = __rbd_register_watch(rbd_dev); 395499d16943SIlya Dryomov if (ret) { 395599d16943SIlya Dryomov rbd_warn(rbd_dev, "failed to reregister watch: %d", ret); 395699d16943SIlya Dryomov if (ret != -EBLACKLISTED) 395799d16943SIlya Dryomov queue_delayed_work(rbd_dev->task_wq, 395899d16943SIlya Dryomov &rbd_dev->watch_dwork, 395999d16943SIlya Dryomov RBD_RETRY_DELAY); 396099d16943SIlya Dryomov goto fail_unlock; 396199d16943SIlya Dryomov } 396299d16943SIlya Dryomov 396399d16943SIlya Dryomov rbd_dev->watch_state = RBD_WATCH_STATE_REGISTERED; 396499d16943SIlya Dryomov 
rbd_dev->watch_cookie = rbd_dev->watch_handle->linger_id; 396599d16943SIlya Dryomov mutex_unlock(&rbd_dev->watch_mutex); 396699d16943SIlya Dryomov 396799d16943SIlya Dryomov ret = rbd_dev_refresh(rbd_dev); 396899d16943SIlya Dryomov if (ret) 396999d16943SIlya Dryomov rbd_warn(rbd_dev, "reregisteration refresh failed: %d", ret); 397099d16943SIlya Dryomov 3971ed95b21aSIlya Dryomov if (was_lock_owner) { 3972ed95b21aSIlya Dryomov ret = rbd_try_lock(rbd_dev); 3973ed95b21aSIlya Dryomov if (ret) 3974ed95b21aSIlya Dryomov rbd_warn(rbd_dev, "reregisteration lock failed: %d", 3975ed95b21aSIlya Dryomov ret); 3976ed95b21aSIlya Dryomov } 3977ed95b21aSIlya Dryomov 3978ed95b21aSIlya Dryomov up_write(&rbd_dev->lock_rwsem); 3979ed95b21aSIlya Dryomov wake_requests(rbd_dev, true); 398099d16943SIlya Dryomov return; 398199d16943SIlya Dryomov 398299d16943SIlya Dryomov fail_unlock: 398399d16943SIlya Dryomov mutex_unlock(&rbd_dev->watch_mutex); 3984ed95b21aSIlya Dryomov up_write(&rbd_dev->lock_rwsem); 398599d16943SIlya Dryomov } 398699d16943SIlya Dryomov 398736be9a76SAlex Elder /* 3988f40eb349SAlex Elder * Synchronous osd object method call. Returns the number of bytes 3989f40eb349SAlex Elder * returned in the outbound buffer, or a negative error code. 
399036be9a76SAlex Elder */ 399136be9a76SAlex Elder static int rbd_obj_method_sync(struct rbd_device *rbd_dev, 399236be9a76SAlex Elder const char *object_name, 399336be9a76SAlex Elder const char *class_name, 399436be9a76SAlex Elder const char *method_name, 39954157976bSAlex Elder const void *outbound, 399636be9a76SAlex Elder size_t outbound_size, 39974157976bSAlex Elder void *inbound, 3998e2a58ee5SAlex Elder size_t inbound_size) 399936be9a76SAlex Elder { 40002169238dSAlex Elder struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 400136be9a76SAlex Elder struct rbd_obj_request *obj_request; 400236be9a76SAlex Elder struct page **pages; 400336be9a76SAlex Elder u32 page_count; 400436be9a76SAlex Elder int ret; 400536be9a76SAlex Elder 400636be9a76SAlex Elder /* 40076010a451SAlex Elder * Method calls are ultimately read operations. The result 40086010a451SAlex Elder * should placed into the inbound buffer provided. They 40096010a451SAlex Elder * also supply outbound data--parameters for the object 40106010a451SAlex Elder * method. Currently if this is present it will be a 40116010a451SAlex Elder * snapshot id. 
401236be9a76SAlex Elder */ 401336be9a76SAlex Elder page_count = (u32)calc_pages_for(0, inbound_size); 401436be9a76SAlex Elder pages = ceph_alloc_page_vector(page_count, GFP_KERNEL); 401536be9a76SAlex Elder if (IS_ERR(pages)) 401636be9a76SAlex Elder return PTR_ERR(pages); 401736be9a76SAlex Elder 401836be9a76SAlex Elder ret = -ENOMEM; 40196010a451SAlex Elder obj_request = rbd_obj_request_create(object_name, 0, inbound_size, 402036be9a76SAlex Elder OBJ_REQUEST_PAGES); 402136be9a76SAlex Elder if (!obj_request) 402236be9a76SAlex Elder goto out; 402336be9a76SAlex Elder 402436be9a76SAlex Elder obj_request->pages = pages; 402536be9a76SAlex Elder obj_request->page_count = page_count; 402636be9a76SAlex Elder 40276d2940c8SGuangliang Zhao obj_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_READ, 1, 4028deb236b3SIlya Dryomov obj_request); 402936be9a76SAlex Elder if (!obj_request->osd_req) 403036be9a76SAlex Elder goto out; 403136be9a76SAlex Elder 4032c99d2d4aSAlex Elder osd_req_op_cls_init(obj_request->osd_req, 0, CEPH_OSD_OP_CALL, 403304017e29SAlex Elder class_name, method_name); 403404017e29SAlex Elder if (outbound_size) { 403504017e29SAlex Elder struct ceph_pagelist *pagelist; 403604017e29SAlex Elder 403704017e29SAlex Elder pagelist = kmalloc(sizeof (*pagelist), GFP_NOFS); 403804017e29SAlex Elder if (!pagelist) 403904017e29SAlex Elder goto out; 404004017e29SAlex Elder 404104017e29SAlex Elder ceph_pagelist_init(pagelist); 404204017e29SAlex Elder ceph_pagelist_append(pagelist, outbound, outbound_size); 404304017e29SAlex Elder osd_req_op_cls_request_data_pagelist(obj_request->osd_req, 0, 404404017e29SAlex Elder pagelist); 404504017e29SAlex Elder } 4046a4ce40a9SAlex Elder osd_req_op_cls_response_data_pages(obj_request->osd_req, 0, 4047a4ce40a9SAlex Elder obj_request->pages, inbound_size, 404844cd188dSAlex Elder 0, false, false); 40499d4df01fSAlex Elder rbd_osd_req_format_read(obj_request); 4050430c28c3SAlex Elder 405136be9a76SAlex Elder ret = rbd_obj_request_submit(osdc, 
obj_request); 405236be9a76SAlex Elder if (ret) 405336be9a76SAlex Elder goto out; 405436be9a76SAlex Elder ret = rbd_obj_request_wait(obj_request); 405536be9a76SAlex Elder if (ret) 405636be9a76SAlex Elder goto out; 405736be9a76SAlex Elder 405836be9a76SAlex Elder ret = obj_request->result; 405936be9a76SAlex Elder if (ret < 0) 406036be9a76SAlex Elder goto out; 406157385b51SAlex Elder 406257385b51SAlex Elder rbd_assert(obj_request->xferred < (u64)INT_MAX); 406357385b51SAlex Elder ret = (int)obj_request->xferred; 4064903bb32eSAlex Elder ceph_copy_from_page_vector(pages, inbound, 0, obj_request->xferred); 406536be9a76SAlex Elder out: 406636be9a76SAlex Elder if (obj_request) 406736be9a76SAlex Elder rbd_obj_request_put(obj_request); 406836be9a76SAlex Elder else 406936be9a76SAlex Elder ceph_release_page_vector(pages, page_count); 407036be9a76SAlex Elder 407136be9a76SAlex Elder return ret; 407236be9a76SAlex Elder } 407336be9a76SAlex Elder 4074ed95b21aSIlya Dryomov /* 4075ed95b21aSIlya Dryomov * lock_rwsem must be held for read 4076ed95b21aSIlya Dryomov */ 4077ed95b21aSIlya Dryomov static void rbd_wait_state_locked(struct rbd_device *rbd_dev) 4078ed95b21aSIlya Dryomov { 4079ed95b21aSIlya Dryomov DEFINE_WAIT(wait); 4080ed95b21aSIlya Dryomov 4081ed95b21aSIlya Dryomov do { 4082ed95b21aSIlya Dryomov /* 4083ed95b21aSIlya Dryomov * Note the use of mod_delayed_work() in rbd_acquire_lock() 4084ed95b21aSIlya Dryomov * and cancel_delayed_work() in wake_requests(). 
4085ed95b21aSIlya Dryomov */ 4086ed95b21aSIlya Dryomov dout("%s rbd_dev %p queueing lock_dwork\n", __func__, rbd_dev); 4087ed95b21aSIlya Dryomov queue_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0); 4088ed95b21aSIlya Dryomov prepare_to_wait_exclusive(&rbd_dev->lock_waitq, &wait, 4089ed95b21aSIlya Dryomov TASK_UNINTERRUPTIBLE); 4090ed95b21aSIlya Dryomov up_read(&rbd_dev->lock_rwsem); 4091ed95b21aSIlya Dryomov schedule(); 4092ed95b21aSIlya Dryomov down_read(&rbd_dev->lock_rwsem); 4093ed95b21aSIlya Dryomov } while (rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED); 4094ed95b21aSIlya Dryomov finish_wait(&rbd_dev->lock_waitq, &wait); 4095ed95b21aSIlya Dryomov } 4096ed95b21aSIlya Dryomov 40977ad18afaSChristoph Hellwig static void rbd_queue_workfn(struct work_struct *work) 4098bc1ecc65SIlya Dryomov { 40997ad18afaSChristoph Hellwig struct request *rq = blk_mq_rq_from_pdu(work); 41007ad18afaSChristoph Hellwig struct rbd_device *rbd_dev = rq->q->queuedata; 4101bc1ecc65SIlya Dryomov struct rbd_img_request *img_request; 41024e752f0aSJosh Durgin struct ceph_snap_context *snapc = NULL; 4103bc1ecc65SIlya Dryomov u64 offset = (u64)blk_rq_pos(rq) << SECTOR_SHIFT; 4104bc1ecc65SIlya Dryomov u64 length = blk_rq_bytes(rq); 41056d2940c8SGuangliang Zhao enum obj_operation_type op_type; 41064e752f0aSJosh Durgin u64 mapping_size; 4107ed95b21aSIlya Dryomov bool must_be_locked = false; 4108bc1ecc65SIlya Dryomov int result; 4109bc1ecc65SIlya Dryomov 41107ad18afaSChristoph Hellwig if (rq->cmd_type != REQ_TYPE_FS) { 41117ad18afaSChristoph Hellwig dout("%s: non-fs request type %d\n", __func__, 41127ad18afaSChristoph Hellwig (int) rq->cmd_type); 41137ad18afaSChristoph Hellwig result = -EIO; 41147ad18afaSChristoph Hellwig goto err; 41157ad18afaSChristoph Hellwig } 41167ad18afaSChristoph Hellwig 4117c2df40dfSMike Christie if (req_op(rq) == REQ_OP_DISCARD) 411890e98c52SGuangliang Zhao op_type = OBJ_OP_DISCARD; 4119c2df40dfSMike Christie else if (req_op(rq) == REQ_OP_WRITE) 
41206d2940c8SGuangliang Zhao op_type = OBJ_OP_WRITE; 41216d2940c8SGuangliang Zhao else 41226d2940c8SGuangliang Zhao op_type = OBJ_OP_READ; 41236d2940c8SGuangliang Zhao 4124bc1ecc65SIlya Dryomov /* Ignore/skip any zero-length requests */ 4125bc1ecc65SIlya Dryomov 4126bc1ecc65SIlya Dryomov if (!length) { 4127bc1ecc65SIlya Dryomov dout("%s: zero-length request\n", __func__); 4128bc1ecc65SIlya Dryomov result = 0; 4129bc1ecc65SIlya Dryomov goto err_rq; 4130bc1ecc65SIlya Dryomov } 4131bc1ecc65SIlya Dryomov 41326d2940c8SGuangliang Zhao /* Only reads are allowed to a read-only device */ 4133bc1ecc65SIlya Dryomov 41346d2940c8SGuangliang Zhao if (op_type != OBJ_OP_READ) { 4135bc1ecc65SIlya Dryomov if (rbd_dev->mapping.read_only) { 4136bc1ecc65SIlya Dryomov result = -EROFS; 4137bc1ecc65SIlya Dryomov goto err_rq; 4138bc1ecc65SIlya Dryomov } 4139bc1ecc65SIlya Dryomov rbd_assert(rbd_dev->spec->snap_id == CEPH_NOSNAP); 4140bc1ecc65SIlya Dryomov } 4141bc1ecc65SIlya Dryomov 4142bc1ecc65SIlya Dryomov /* 4143bc1ecc65SIlya Dryomov * Quit early if the mapped snapshot no longer exists. It's 4144bc1ecc65SIlya Dryomov * still possible the snapshot will have disappeared by the 4145bc1ecc65SIlya Dryomov * time our request arrives at the osd, but there's no sense in 4146bc1ecc65SIlya Dryomov * sending it if we already know. 
4147bc1ecc65SIlya Dryomov */ 4148bc1ecc65SIlya Dryomov if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) { 4149bc1ecc65SIlya Dryomov dout("request for non-existent snapshot"); 4150bc1ecc65SIlya Dryomov rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP); 4151bc1ecc65SIlya Dryomov result = -ENXIO; 4152bc1ecc65SIlya Dryomov goto err_rq; 4153bc1ecc65SIlya Dryomov } 4154bc1ecc65SIlya Dryomov 4155bc1ecc65SIlya Dryomov if (offset && length > U64_MAX - offset + 1) { 4156bc1ecc65SIlya Dryomov rbd_warn(rbd_dev, "bad request range (%llu~%llu)", offset, 4157bc1ecc65SIlya Dryomov length); 4158bc1ecc65SIlya Dryomov result = -EINVAL; 4159bc1ecc65SIlya Dryomov goto err_rq; /* Shouldn't happen */ 4160bc1ecc65SIlya Dryomov } 4161bc1ecc65SIlya Dryomov 41627ad18afaSChristoph Hellwig blk_mq_start_request(rq); 41637ad18afaSChristoph Hellwig 41644e752f0aSJosh Durgin down_read(&rbd_dev->header_rwsem); 41654e752f0aSJosh Durgin mapping_size = rbd_dev->mapping.size; 41666d2940c8SGuangliang Zhao if (op_type != OBJ_OP_READ) { 41674e752f0aSJosh Durgin snapc = rbd_dev->header.snapc; 41684e752f0aSJosh Durgin ceph_get_snap_context(snapc); 4169ed95b21aSIlya Dryomov must_be_locked = rbd_is_lock_supported(rbd_dev); 41704e752f0aSJosh Durgin } 41714e752f0aSJosh Durgin up_read(&rbd_dev->header_rwsem); 41724e752f0aSJosh Durgin 41734e752f0aSJosh Durgin if (offset + length > mapping_size) { 4174bc1ecc65SIlya Dryomov rbd_warn(rbd_dev, "beyond EOD (%llu~%llu > %llu)", offset, 41754e752f0aSJosh Durgin length, mapping_size); 4176bc1ecc65SIlya Dryomov result = -EIO; 4177bc1ecc65SIlya Dryomov goto err_rq; 4178bc1ecc65SIlya Dryomov } 4179bc1ecc65SIlya Dryomov 4180ed95b21aSIlya Dryomov if (must_be_locked) { 4181ed95b21aSIlya Dryomov down_read(&rbd_dev->lock_rwsem); 4182ed95b21aSIlya Dryomov if (rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED) 4183ed95b21aSIlya Dryomov rbd_wait_state_locked(rbd_dev); 4184ed95b21aSIlya Dryomov } 4185ed95b21aSIlya Dryomov 41866d2940c8SGuangliang Zhao img_request = 
rbd_img_request_create(rbd_dev, offset, length, op_type, 41874e752f0aSJosh Durgin snapc); 4188bc1ecc65SIlya Dryomov if (!img_request) { 4189bc1ecc65SIlya Dryomov result = -ENOMEM; 4190ed95b21aSIlya Dryomov goto err_unlock; 4191bc1ecc65SIlya Dryomov } 4192bc1ecc65SIlya Dryomov img_request->rq = rq; 419370b16db8SIlya Dryomov snapc = NULL; /* img_request consumes a ref */ 4194bc1ecc65SIlya Dryomov 419590e98c52SGuangliang Zhao if (op_type == OBJ_OP_DISCARD) 419690e98c52SGuangliang Zhao result = rbd_img_request_fill(img_request, OBJ_REQUEST_NODATA, 419790e98c52SGuangliang Zhao NULL); 419890e98c52SGuangliang Zhao else 419990e98c52SGuangliang Zhao result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO, 420090e98c52SGuangliang Zhao rq->bio); 4201bc1ecc65SIlya Dryomov if (result) 4202bc1ecc65SIlya Dryomov goto err_img_request; 4203bc1ecc65SIlya Dryomov 4204bc1ecc65SIlya Dryomov result = rbd_img_request_submit(img_request); 4205bc1ecc65SIlya Dryomov if (result) 4206bc1ecc65SIlya Dryomov goto err_img_request; 4207bc1ecc65SIlya Dryomov 4208ed95b21aSIlya Dryomov if (must_be_locked) 4209ed95b21aSIlya Dryomov up_read(&rbd_dev->lock_rwsem); 4210bc1ecc65SIlya Dryomov return; 4211bc1ecc65SIlya Dryomov 4212bc1ecc65SIlya Dryomov err_img_request: 4213bc1ecc65SIlya Dryomov rbd_img_request_put(img_request); 4214ed95b21aSIlya Dryomov err_unlock: 4215ed95b21aSIlya Dryomov if (must_be_locked) 4216ed95b21aSIlya Dryomov up_read(&rbd_dev->lock_rwsem); 4217bc1ecc65SIlya Dryomov err_rq: 4218bc1ecc65SIlya Dryomov if (result) 4219bc1ecc65SIlya Dryomov rbd_warn(rbd_dev, "%s %llx at %llx result %d", 42206d2940c8SGuangliang Zhao obj_op_name(op_type), length, offset, result); 42214e752f0aSJosh Durgin ceph_put_snap_context(snapc); 42227ad18afaSChristoph Hellwig err: 42237ad18afaSChristoph Hellwig blk_mq_end_request(rq, result); 4224bc1ecc65SIlya Dryomov } 4225bc1ecc65SIlya Dryomov 42267ad18afaSChristoph Hellwig static int rbd_queue_rq(struct blk_mq_hw_ctx *hctx, 42277ad18afaSChristoph Hellwig const 
struct blk_mq_queue_data *bd) 4228bc1ecc65SIlya Dryomov { 42297ad18afaSChristoph Hellwig struct request *rq = bd->rq; 42307ad18afaSChristoph Hellwig struct work_struct *work = blk_mq_rq_to_pdu(rq); 4231bc1ecc65SIlya Dryomov 42327ad18afaSChristoph Hellwig queue_work(rbd_wq, work); 42337ad18afaSChristoph Hellwig return BLK_MQ_RQ_QUEUE_OK; 4234bf0d5f50SAlex Elder } 4235bf0d5f50SAlex Elder 4236602adf40SYehuda Sadeh static void rbd_free_disk(struct rbd_device *rbd_dev) 4237602adf40SYehuda Sadeh { 4238602adf40SYehuda Sadeh struct gendisk *disk = rbd_dev->disk; 4239602adf40SYehuda Sadeh 4240602adf40SYehuda Sadeh if (!disk) 4241602adf40SYehuda Sadeh return; 4242602adf40SYehuda Sadeh 4243a0cab924SAlex Elder rbd_dev->disk = NULL; 4244a0cab924SAlex Elder if (disk->flags & GENHD_FL_UP) { 4245602adf40SYehuda Sadeh del_gendisk(disk); 4246602adf40SYehuda Sadeh if (disk->queue) 4247602adf40SYehuda Sadeh blk_cleanup_queue(disk->queue); 42487ad18afaSChristoph Hellwig blk_mq_free_tag_set(&rbd_dev->tag_set); 4249a0cab924SAlex Elder } 4250602adf40SYehuda Sadeh put_disk(disk); 4251602adf40SYehuda Sadeh } 4252602adf40SYehuda Sadeh 4253788e2df3SAlex Elder static int rbd_obj_read_sync(struct rbd_device *rbd_dev, 4254788e2df3SAlex Elder const char *object_name, 42557097f8dfSAlex Elder u64 offset, u64 length, void *buf) 4256788e2df3SAlex Elder 4257788e2df3SAlex Elder { 42582169238dSAlex Elder struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 4259788e2df3SAlex Elder struct rbd_obj_request *obj_request; 4260788e2df3SAlex Elder struct page **pages = NULL; 4261788e2df3SAlex Elder u32 page_count; 42621ceae7efSAlex Elder size_t size; 4263788e2df3SAlex Elder int ret; 4264788e2df3SAlex Elder 4265788e2df3SAlex Elder page_count = (u32) calc_pages_for(offset, length); 4266788e2df3SAlex Elder pages = ceph_alloc_page_vector(page_count, GFP_KERNEL); 4267788e2df3SAlex Elder if (IS_ERR(pages)) 4268a8d42056SJan Kara return PTR_ERR(pages); 4269788e2df3SAlex Elder 4270788e2df3SAlex Elder ret = 
-ENOMEM; 4271788e2df3SAlex Elder obj_request = rbd_obj_request_create(object_name, offset, length, 4272788e2df3SAlex Elder OBJ_REQUEST_PAGES); 4273788e2df3SAlex Elder if (!obj_request) 4274788e2df3SAlex Elder goto out; 4275788e2df3SAlex Elder 4276788e2df3SAlex Elder obj_request->pages = pages; 4277788e2df3SAlex Elder obj_request->page_count = page_count; 4278788e2df3SAlex Elder 42796d2940c8SGuangliang Zhao obj_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_READ, 1, 4280deb236b3SIlya Dryomov obj_request); 4281788e2df3SAlex Elder if (!obj_request->osd_req) 4282788e2df3SAlex Elder goto out; 4283788e2df3SAlex Elder 4284c99d2d4aSAlex Elder osd_req_op_extent_init(obj_request->osd_req, 0, CEPH_OSD_OP_READ, 4285c99d2d4aSAlex Elder offset, length, 0, 0); 4286406e2c9fSAlex Elder osd_req_op_extent_osd_data_pages(obj_request->osd_req, 0, 4287a4ce40a9SAlex Elder obj_request->pages, 428844cd188dSAlex Elder obj_request->length, 428944cd188dSAlex Elder obj_request->offset & ~PAGE_MASK, 429044cd188dSAlex Elder false, false); 42919d4df01fSAlex Elder rbd_osd_req_format_read(obj_request); 4292430c28c3SAlex Elder 4293788e2df3SAlex Elder ret = rbd_obj_request_submit(osdc, obj_request); 4294788e2df3SAlex Elder if (ret) 4295788e2df3SAlex Elder goto out; 4296788e2df3SAlex Elder ret = rbd_obj_request_wait(obj_request); 4297788e2df3SAlex Elder if (ret) 4298788e2df3SAlex Elder goto out; 4299788e2df3SAlex Elder 4300788e2df3SAlex Elder ret = obj_request->result; 4301788e2df3SAlex Elder if (ret < 0) 4302788e2df3SAlex Elder goto out; 43031ceae7efSAlex Elder 43041ceae7efSAlex Elder rbd_assert(obj_request->xferred <= (u64) SIZE_MAX); 43051ceae7efSAlex Elder size = (size_t) obj_request->xferred; 4306903bb32eSAlex Elder ceph_copy_from_page_vector(pages, buf, 0, size); 430723ed6e13SAlex Elder rbd_assert(size <= (size_t)INT_MAX); 430823ed6e13SAlex Elder ret = (int)size; 4309788e2df3SAlex Elder out: 4310788e2df3SAlex Elder if (obj_request) 4311788e2df3SAlex Elder rbd_obj_request_put(obj_request); 
4312788e2df3SAlex Elder else 4313788e2df3SAlex Elder ceph_release_page_vector(pages, page_count); 4314788e2df3SAlex Elder 4315788e2df3SAlex Elder return ret; 4316788e2df3SAlex Elder } 4317788e2df3SAlex Elder 4318602adf40SYehuda Sadeh /* 4319662518b1SAlex Elder * Read the complete header for the given rbd device. On successful 4320662518b1SAlex Elder * return, the rbd_dev->header field will contain up-to-date 4321662518b1SAlex Elder * information about the image. 43224156d998SAlex Elder */ 432399a41ebcSAlex Elder static int rbd_dev_v1_header_info(struct rbd_device *rbd_dev) 43244156d998SAlex Elder { 43254156d998SAlex Elder struct rbd_image_header_ondisk *ondisk = NULL; 43264156d998SAlex Elder u32 snap_count = 0; 43274156d998SAlex Elder u64 names_size = 0; 43284156d998SAlex Elder u32 want_count; 43294156d998SAlex Elder int ret; 43304156d998SAlex Elder 43314156d998SAlex Elder /* 43324156d998SAlex Elder * The complete header will include an array of its 64-bit 43334156d998SAlex Elder * snapshot ids, followed by the names of those snapshots as 43344156d998SAlex Elder * a contiguous block of NUL-terminated strings. Note that 43354156d998SAlex Elder * the number of snapshots could change by the time we read 43364156d998SAlex Elder * it in, in which case we re-read it. 
43374156d998SAlex Elder */ 43384156d998SAlex Elder do { 43394156d998SAlex Elder size_t size; 43404156d998SAlex Elder 43414156d998SAlex Elder kfree(ondisk); 43424156d998SAlex Elder 43434156d998SAlex Elder size = sizeof (*ondisk); 43444156d998SAlex Elder size += snap_count * sizeof (struct rbd_image_snap_ondisk); 43454156d998SAlex Elder size += names_size; 43464156d998SAlex Elder ondisk = kmalloc(size, GFP_KERNEL); 43474156d998SAlex Elder if (!ondisk) 4348662518b1SAlex Elder return -ENOMEM; 43494156d998SAlex Elder 4350c41d13a3SIlya Dryomov ret = rbd_obj_read_sync(rbd_dev, rbd_dev->header_oid.name, 43517097f8dfSAlex Elder 0, size, ondisk); 43524156d998SAlex Elder if (ret < 0) 4353662518b1SAlex Elder goto out; 4354c0cd10dbSAlex Elder if ((size_t)ret < size) { 43554156d998SAlex Elder ret = -ENXIO; 435606ecc6cbSAlex Elder rbd_warn(rbd_dev, "short header read (want %zd got %d)", 435706ecc6cbSAlex Elder size, ret); 4358662518b1SAlex Elder goto out; 43594156d998SAlex Elder } 43604156d998SAlex Elder if (!rbd_dev_ondisk_valid(ondisk)) { 43614156d998SAlex Elder ret = -ENXIO; 436206ecc6cbSAlex Elder rbd_warn(rbd_dev, "invalid header"); 4363662518b1SAlex Elder goto out; 43644156d998SAlex Elder } 43654156d998SAlex Elder 43664156d998SAlex Elder names_size = le64_to_cpu(ondisk->snap_names_len); 43674156d998SAlex Elder want_count = snap_count; 43684156d998SAlex Elder snap_count = le32_to_cpu(ondisk->snap_count); 43694156d998SAlex Elder } while (snap_count != want_count); 43704156d998SAlex Elder 4371662518b1SAlex Elder ret = rbd_header_from_disk(rbd_dev, ondisk); 4372662518b1SAlex Elder out: 43734156d998SAlex Elder kfree(ondisk); 43744156d998SAlex Elder 4375dfc5606dSYehuda Sadeh return ret; 4376602adf40SYehuda Sadeh } 4377602adf40SYehuda Sadeh 437815228edeSAlex Elder /* 437915228edeSAlex Elder * Clear the rbd device's EXISTS flag if the snapshot it's mapped to 438015228edeSAlex Elder * has disappeared from the (just updated) snapshot context. 
438115228edeSAlex Elder */ 438215228edeSAlex Elder static void rbd_exists_validate(struct rbd_device *rbd_dev) 438315228edeSAlex Elder { 438415228edeSAlex Elder u64 snap_id; 438515228edeSAlex Elder 438615228edeSAlex Elder if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) 438715228edeSAlex Elder return; 438815228edeSAlex Elder 438915228edeSAlex Elder snap_id = rbd_dev->spec->snap_id; 439015228edeSAlex Elder if (snap_id == CEPH_NOSNAP) 439115228edeSAlex Elder return; 439215228edeSAlex Elder 439315228edeSAlex Elder if (rbd_dev_snap_index(rbd_dev, snap_id) == BAD_SNAP_INDEX) 439415228edeSAlex Elder clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags); 439515228edeSAlex Elder } 439615228edeSAlex Elder 43979875201eSJosh Durgin static void rbd_dev_update_size(struct rbd_device *rbd_dev) 43989875201eSJosh Durgin { 43999875201eSJosh Durgin sector_t size; 44009875201eSJosh Durgin 44019875201eSJosh Durgin /* 4402811c6688SIlya Dryomov * If EXISTS is not set, rbd_dev->disk may be NULL, so don't 4403811c6688SIlya Dryomov * try to update its size. If REMOVING is set, updating size 4404811c6688SIlya Dryomov * is just useless work since the device can't be opened. 
44059875201eSJosh Durgin */ 4406811c6688SIlya Dryomov if (test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags) && 4407811c6688SIlya Dryomov !test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags)) { 44089875201eSJosh Durgin size = (sector_t)rbd_dev->mapping.size / SECTOR_SIZE; 44099875201eSJosh Durgin dout("setting size to %llu sectors", (unsigned long long)size); 44109875201eSJosh Durgin set_capacity(rbd_dev->disk, size); 44119875201eSJosh Durgin revalidate_disk(rbd_dev->disk); 44129875201eSJosh Durgin } 44139875201eSJosh Durgin } 44149875201eSJosh Durgin 4415cc4a38bdSAlex Elder static int rbd_dev_refresh(struct rbd_device *rbd_dev) 44161fe5e993SAlex Elder { 4417e627db08SAlex Elder u64 mapping_size; 44181fe5e993SAlex Elder int ret; 44191fe5e993SAlex Elder 4420cfbf6377SAlex Elder down_write(&rbd_dev->header_rwsem); 44213b5cf2a2SAlex Elder mapping_size = rbd_dev->mapping.size; 4422a720ae09SIlya Dryomov 4423a720ae09SIlya Dryomov ret = rbd_dev_header_info(rbd_dev); 442452bb1f9bSIlya Dryomov if (ret) 442573e39e4dSIlya Dryomov goto out; 442615228edeSAlex Elder 4427e8f59b59SIlya Dryomov /* 4428e8f59b59SIlya Dryomov * If there is a parent, see if it has disappeared due to the 4429e8f59b59SIlya Dryomov * mapped image getting flattened. 
4430e8f59b59SIlya Dryomov */ 4431e8f59b59SIlya Dryomov if (rbd_dev->parent) { 4432e8f59b59SIlya Dryomov ret = rbd_dev_v2_parent_info(rbd_dev); 4433e8f59b59SIlya Dryomov if (ret) 443473e39e4dSIlya Dryomov goto out; 4435e8f59b59SIlya Dryomov } 4436e8f59b59SIlya Dryomov 44375ff1108cSIlya Dryomov if (rbd_dev->spec->snap_id == CEPH_NOSNAP) { 44385ff1108cSIlya Dryomov rbd_dev->mapping.size = rbd_dev->header.image_size; 44395ff1108cSIlya Dryomov } else { 44405ff1108cSIlya Dryomov /* validate mapped snapshot's EXISTS flag */ 444115228edeSAlex Elder rbd_exists_validate(rbd_dev); 44425ff1108cSIlya Dryomov } 44435ff1108cSIlya Dryomov 444473e39e4dSIlya Dryomov out: 4445cfbf6377SAlex Elder up_write(&rbd_dev->header_rwsem); 444673e39e4dSIlya Dryomov if (!ret && mapping_size != rbd_dev->mapping.size) 44479875201eSJosh Durgin rbd_dev_update_size(rbd_dev); 44481fe5e993SAlex Elder 444973e39e4dSIlya Dryomov return ret; 44501fe5e993SAlex Elder } 44511fe5e993SAlex Elder 44527ad18afaSChristoph Hellwig static int rbd_init_request(void *data, struct request *rq, 44537ad18afaSChristoph Hellwig unsigned int hctx_idx, unsigned int request_idx, 44547ad18afaSChristoph Hellwig unsigned int numa_node) 44557ad18afaSChristoph Hellwig { 44567ad18afaSChristoph Hellwig struct work_struct *work = blk_mq_rq_to_pdu(rq); 44577ad18afaSChristoph Hellwig 44587ad18afaSChristoph Hellwig INIT_WORK(work, rbd_queue_workfn); 44597ad18afaSChristoph Hellwig return 0; 44607ad18afaSChristoph Hellwig } 44617ad18afaSChristoph Hellwig 44627ad18afaSChristoph Hellwig static struct blk_mq_ops rbd_mq_ops = { 44637ad18afaSChristoph Hellwig .queue_rq = rbd_queue_rq, 44647ad18afaSChristoph Hellwig .map_queue = blk_mq_map_queue, 44657ad18afaSChristoph Hellwig .init_request = rbd_init_request, 44667ad18afaSChristoph Hellwig }; 44677ad18afaSChristoph Hellwig 4468602adf40SYehuda Sadeh static int rbd_init_disk(struct rbd_device *rbd_dev) 4469602adf40SYehuda Sadeh { 4470602adf40SYehuda Sadeh struct gendisk *disk; 4471602adf40SYehuda 
Sadeh struct request_queue *q; 4472593a9e7bSAlex Elder u64 segment_size; 44737ad18afaSChristoph Hellwig int err; 4474602adf40SYehuda Sadeh 4475602adf40SYehuda Sadeh /* create gendisk info */ 44767e513d43SIlya Dryomov disk = alloc_disk(single_major ? 44777e513d43SIlya Dryomov (1 << RBD_SINGLE_MAJOR_PART_SHIFT) : 44787e513d43SIlya Dryomov RBD_MINORS_PER_MAJOR); 4479602adf40SYehuda Sadeh if (!disk) 44801fcdb8aaSAlex Elder return -ENOMEM; 4481602adf40SYehuda Sadeh 4482f0f8cef5SAlex Elder snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d", 4483de71a297SAlex Elder rbd_dev->dev_id); 4484602adf40SYehuda Sadeh disk->major = rbd_dev->major; 4485dd82fff1SIlya Dryomov disk->first_minor = rbd_dev->minor; 44867e513d43SIlya Dryomov if (single_major) 44877e513d43SIlya Dryomov disk->flags |= GENHD_FL_EXT_DEVT; 4488602adf40SYehuda Sadeh disk->fops = &rbd_bd_ops; 4489602adf40SYehuda Sadeh disk->private_data = rbd_dev; 4490602adf40SYehuda Sadeh 44917ad18afaSChristoph Hellwig memset(&rbd_dev->tag_set, 0, sizeof(rbd_dev->tag_set)); 44927ad18afaSChristoph Hellwig rbd_dev->tag_set.ops = &rbd_mq_ops; 4493b5584180SIlya Dryomov rbd_dev->tag_set.queue_depth = rbd_dev->opts->queue_depth; 44947ad18afaSChristoph Hellwig rbd_dev->tag_set.numa_node = NUMA_NO_NODE; 4495b5584180SIlya Dryomov rbd_dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE; 44967ad18afaSChristoph Hellwig rbd_dev->tag_set.nr_hw_queues = 1; 44977ad18afaSChristoph Hellwig rbd_dev->tag_set.cmd_size = sizeof(struct work_struct); 44987ad18afaSChristoph Hellwig 44997ad18afaSChristoph Hellwig err = blk_mq_alloc_tag_set(&rbd_dev->tag_set); 45007ad18afaSChristoph Hellwig if (err) 4501602adf40SYehuda Sadeh goto out_disk; 4502029bcbd8SJosh Durgin 45037ad18afaSChristoph Hellwig q = blk_mq_init_queue(&rbd_dev->tag_set); 45047ad18afaSChristoph Hellwig if (IS_ERR(q)) { 45057ad18afaSChristoph Hellwig err = PTR_ERR(q); 45067ad18afaSChristoph Hellwig goto out_tag_set; 45077ad18afaSChristoph Hellwig } 
45087ad18afaSChristoph Hellwig 4509d8a2c89cSIlya Dryomov queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q); 4510d8a2c89cSIlya Dryomov /* QUEUE_FLAG_ADD_RANDOM is off by default for blk-mq */ 4511593a9e7bSAlex Elder 4512029bcbd8SJosh Durgin /* set io sizes to object size */ 4513593a9e7bSAlex Elder segment_size = rbd_obj_bytes(&rbd_dev->header); 4514593a9e7bSAlex Elder blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE); 45150d9fde4fSIlya Dryomov q->limits.max_sectors = queue_max_hw_sectors(q); 4516d3834fefSIlya Dryomov blk_queue_max_segments(q, segment_size / SECTOR_SIZE); 4517593a9e7bSAlex Elder blk_queue_max_segment_size(q, segment_size); 4518593a9e7bSAlex Elder blk_queue_io_min(q, segment_size); 4519593a9e7bSAlex Elder blk_queue_io_opt(q, segment_size); 4520029bcbd8SJosh Durgin 452190e98c52SGuangliang Zhao /* enable the discard support */ 452290e98c52SGuangliang Zhao queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q); 452390e98c52SGuangliang Zhao q->limits.discard_granularity = segment_size; 452490e98c52SGuangliang Zhao q->limits.discard_alignment = segment_size; 45252bb4cd5cSJens Axboe blk_queue_max_discard_sectors(q, segment_size / SECTOR_SIZE); 4526b76f8239SJosh Durgin q->limits.discard_zeroes_data = 1; 452790e98c52SGuangliang Zhao 4528bae818eeSRonny Hegewald if (!ceph_test_opt(rbd_dev->rbd_client->client, NOCRC)) 4529bae818eeSRonny Hegewald q->backing_dev_info.capabilities |= BDI_CAP_STABLE_WRITES; 4530bae818eeSRonny Hegewald 4531602adf40SYehuda Sadeh disk->queue = q; 4532602adf40SYehuda Sadeh 4533602adf40SYehuda Sadeh q->queuedata = rbd_dev; 4534602adf40SYehuda Sadeh 4535602adf40SYehuda Sadeh rbd_dev->disk = disk; 4536602adf40SYehuda Sadeh 4537602adf40SYehuda Sadeh return 0; 45387ad18afaSChristoph Hellwig out_tag_set: 45397ad18afaSChristoph Hellwig blk_mq_free_tag_set(&rbd_dev->tag_set); 4540602adf40SYehuda Sadeh out_disk: 4541602adf40SYehuda Sadeh put_disk(disk); 45427ad18afaSChristoph Hellwig return err; 4543602adf40SYehuda Sadeh } 4544602adf40SYehuda Sadeh 
4545dfc5606dSYehuda Sadeh /* 4546dfc5606dSYehuda Sadeh sysfs 4547dfc5606dSYehuda Sadeh */ 4548602adf40SYehuda Sadeh 4549593a9e7bSAlex Elder static struct rbd_device *dev_to_rbd_dev(struct device *dev) 4550593a9e7bSAlex Elder { 4551593a9e7bSAlex Elder return container_of(dev, struct rbd_device, dev); 4552593a9e7bSAlex Elder } 4553593a9e7bSAlex Elder 4554dfc5606dSYehuda Sadeh static ssize_t rbd_size_show(struct device *dev, 4555dfc5606dSYehuda Sadeh struct device_attribute *attr, char *buf) 4556602adf40SYehuda Sadeh { 4557593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 4558dfc5606dSYehuda Sadeh 4559fc71d833SAlex Elder return sprintf(buf, "%llu\n", 4560fc71d833SAlex Elder (unsigned long long)rbd_dev->mapping.size); 4561602adf40SYehuda Sadeh } 4562602adf40SYehuda Sadeh 456334b13184SAlex Elder /* 456434b13184SAlex Elder * Note this shows the features for whatever's mapped, which is not 456534b13184SAlex Elder * necessarily the base image. 456634b13184SAlex Elder */ 456734b13184SAlex Elder static ssize_t rbd_features_show(struct device *dev, 456834b13184SAlex Elder struct device_attribute *attr, char *buf) 456934b13184SAlex Elder { 457034b13184SAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 457134b13184SAlex Elder 457234b13184SAlex Elder return sprintf(buf, "0x%016llx\n", 457334b13184SAlex Elder (unsigned long long)rbd_dev->mapping.features); 457434b13184SAlex Elder } 457534b13184SAlex Elder 4576dfc5606dSYehuda Sadeh static ssize_t rbd_major_show(struct device *dev, 4577dfc5606dSYehuda Sadeh struct device_attribute *attr, char *buf) 4578602adf40SYehuda Sadeh { 4579593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 4580dfc5606dSYehuda Sadeh 4581fc71d833SAlex Elder if (rbd_dev->major) 4582dfc5606dSYehuda Sadeh return sprintf(buf, "%d\n", rbd_dev->major); 4583fc71d833SAlex Elder 4584fc71d833SAlex Elder return sprintf(buf, "(none)\n"); 4585dd82fff1SIlya Dryomov } 4586fc71d833SAlex Elder 4587dd82fff1SIlya Dryomov static 
ssize_t rbd_minor_show(struct device *dev, 4588dd82fff1SIlya Dryomov struct device_attribute *attr, char *buf) 4589dd82fff1SIlya Dryomov { 4590dd82fff1SIlya Dryomov struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 4591dd82fff1SIlya Dryomov 4592dd82fff1SIlya Dryomov return sprintf(buf, "%d\n", rbd_dev->minor); 4593dfc5606dSYehuda Sadeh } 4594dfc5606dSYehuda Sadeh 4595dfc5606dSYehuda Sadeh static ssize_t rbd_client_id_show(struct device *dev, 4596dfc5606dSYehuda Sadeh struct device_attribute *attr, char *buf) 4597dfc5606dSYehuda Sadeh { 4598593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 4599dfc5606dSYehuda Sadeh 46001dbb4399SAlex Elder return sprintf(buf, "client%lld\n", 4601033268a5SIlya Dryomov ceph_client_gid(rbd_dev->rbd_client->client)); 4602dfc5606dSYehuda Sadeh } 4603dfc5606dSYehuda Sadeh 4604dfc5606dSYehuda Sadeh static ssize_t rbd_pool_show(struct device *dev, 4605dfc5606dSYehuda Sadeh struct device_attribute *attr, char *buf) 4606dfc5606dSYehuda Sadeh { 4607593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 4608dfc5606dSYehuda Sadeh 46090d7dbfceSAlex Elder return sprintf(buf, "%s\n", rbd_dev->spec->pool_name); 4610dfc5606dSYehuda Sadeh } 4611dfc5606dSYehuda Sadeh 46129bb2f334SAlex Elder static ssize_t rbd_pool_id_show(struct device *dev, 46139bb2f334SAlex Elder struct device_attribute *attr, char *buf) 46149bb2f334SAlex Elder { 46159bb2f334SAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 46169bb2f334SAlex Elder 46170d7dbfceSAlex Elder return sprintf(buf, "%llu\n", 46180d7dbfceSAlex Elder (unsigned long long) rbd_dev->spec->pool_id); 46199bb2f334SAlex Elder } 46209bb2f334SAlex Elder 4621dfc5606dSYehuda Sadeh static ssize_t rbd_name_show(struct device *dev, 4622dfc5606dSYehuda Sadeh struct device_attribute *attr, char *buf) 4623dfc5606dSYehuda Sadeh { 4624593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 4625dfc5606dSYehuda Sadeh 4626a92ffdf8SAlex Elder if (rbd_dev->spec->image_name) 
46270d7dbfceSAlex Elder return sprintf(buf, "%s\n", rbd_dev->spec->image_name); 4628a92ffdf8SAlex Elder 4629a92ffdf8SAlex Elder return sprintf(buf, "(unknown)\n"); 4630dfc5606dSYehuda Sadeh } 4631dfc5606dSYehuda Sadeh 4632589d30e0SAlex Elder static ssize_t rbd_image_id_show(struct device *dev, 4633589d30e0SAlex Elder struct device_attribute *attr, char *buf) 4634589d30e0SAlex Elder { 4635589d30e0SAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 4636589d30e0SAlex Elder 46370d7dbfceSAlex Elder return sprintf(buf, "%s\n", rbd_dev->spec->image_id); 4638589d30e0SAlex Elder } 4639589d30e0SAlex Elder 464034b13184SAlex Elder /* 464134b13184SAlex Elder * Shows the name of the currently-mapped snapshot (or 464234b13184SAlex Elder * RBD_SNAP_HEAD_NAME for the base image). 464334b13184SAlex Elder */ 4644dfc5606dSYehuda Sadeh static ssize_t rbd_snap_show(struct device *dev, 4645dfc5606dSYehuda Sadeh struct device_attribute *attr, 4646dfc5606dSYehuda Sadeh char *buf) 4647dfc5606dSYehuda Sadeh { 4648593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 4649dfc5606dSYehuda Sadeh 46500d7dbfceSAlex Elder return sprintf(buf, "%s\n", rbd_dev->spec->snap_name); 4651dfc5606dSYehuda Sadeh } 4652dfc5606dSYehuda Sadeh 465386b00e0dSAlex Elder /* 4654ff96128fSIlya Dryomov * For a v2 image, shows the chain of parent images, separated by empty 4655ff96128fSIlya Dryomov * lines. For v1 images or if there is no parent, shows "(no parent 4656ff96128fSIlya Dryomov * image)". 
465786b00e0dSAlex Elder */ 465886b00e0dSAlex Elder static ssize_t rbd_parent_show(struct device *dev, 465986b00e0dSAlex Elder struct device_attribute *attr, 466086b00e0dSAlex Elder char *buf) 466186b00e0dSAlex Elder { 466286b00e0dSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 4663ff96128fSIlya Dryomov ssize_t count = 0; 466486b00e0dSAlex Elder 4665ff96128fSIlya Dryomov if (!rbd_dev->parent) 466686b00e0dSAlex Elder return sprintf(buf, "(no parent image)\n"); 466786b00e0dSAlex Elder 4668ff96128fSIlya Dryomov for ( ; rbd_dev->parent; rbd_dev = rbd_dev->parent) { 4669ff96128fSIlya Dryomov struct rbd_spec *spec = rbd_dev->parent_spec; 467086b00e0dSAlex Elder 4671ff96128fSIlya Dryomov count += sprintf(&buf[count], "%s" 4672ff96128fSIlya Dryomov "pool_id %llu\npool_name %s\n" 4673ff96128fSIlya Dryomov "image_id %s\nimage_name %s\n" 4674ff96128fSIlya Dryomov "snap_id %llu\nsnap_name %s\n" 4675ff96128fSIlya Dryomov "overlap %llu\n", 4676ff96128fSIlya Dryomov !count ? "" : "\n", /* first? 
*/ 4677ff96128fSIlya Dryomov spec->pool_id, spec->pool_name, 4678ff96128fSIlya Dryomov spec->image_id, spec->image_name ?: "(unknown)", 4679ff96128fSIlya Dryomov spec->snap_id, spec->snap_name, 4680ff96128fSIlya Dryomov rbd_dev->parent_overlap); 4681ff96128fSIlya Dryomov } 468286b00e0dSAlex Elder 468386b00e0dSAlex Elder return count; 468486b00e0dSAlex Elder } 468586b00e0dSAlex Elder 4686dfc5606dSYehuda Sadeh static ssize_t rbd_image_refresh(struct device *dev, 4687dfc5606dSYehuda Sadeh struct device_attribute *attr, 4688dfc5606dSYehuda Sadeh const char *buf, 4689dfc5606dSYehuda Sadeh size_t size) 4690dfc5606dSYehuda Sadeh { 4691593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 4692b813623aSAlex Elder int ret; 4693602adf40SYehuda Sadeh 4694cc4a38bdSAlex Elder ret = rbd_dev_refresh(rbd_dev); 4695e627db08SAlex Elder if (ret) 469652bb1f9bSIlya Dryomov return ret; 4697b813623aSAlex Elder 469852bb1f9bSIlya Dryomov return size; 4699dfc5606dSYehuda Sadeh } 4700602adf40SYehuda Sadeh 4701dfc5606dSYehuda Sadeh static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL); 470234b13184SAlex Elder static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL); 4703dfc5606dSYehuda Sadeh static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL); 4704dd82fff1SIlya Dryomov static DEVICE_ATTR(minor, S_IRUGO, rbd_minor_show, NULL); 4705dfc5606dSYehuda Sadeh static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL); 4706dfc5606dSYehuda Sadeh static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL); 47079bb2f334SAlex Elder static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL); 4708dfc5606dSYehuda Sadeh static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL); 4709589d30e0SAlex Elder static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL); 4710dfc5606dSYehuda Sadeh static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh); 4711dfc5606dSYehuda Sadeh static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL); 471286b00e0dSAlex Elder static 
DEVICE_ATTR(parent, S_IRUGO, rbd_parent_show, NULL); 4713dfc5606dSYehuda Sadeh 4714dfc5606dSYehuda Sadeh static struct attribute *rbd_attrs[] = { 4715dfc5606dSYehuda Sadeh &dev_attr_size.attr, 471634b13184SAlex Elder &dev_attr_features.attr, 4717dfc5606dSYehuda Sadeh &dev_attr_major.attr, 4718dd82fff1SIlya Dryomov &dev_attr_minor.attr, 4719dfc5606dSYehuda Sadeh &dev_attr_client_id.attr, 4720dfc5606dSYehuda Sadeh &dev_attr_pool.attr, 47219bb2f334SAlex Elder &dev_attr_pool_id.attr, 4722dfc5606dSYehuda Sadeh &dev_attr_name.attr, 4723589d30e0SAlex Elder &dev_attr_image_id.attr, 4724dfc5606dSYehuda Sadeh &dev_attr_current_snap.attr, 472586b00e0dSAlex Elder &dev_attr_parent.attr, 4726dfc5606dSYehuda Sadeh &dev_attr_refresh.attr, 4727dfc5606dSYehuda Sadeh NULL 4728dfc5606dSYehuda Sadeh }; 4729dfc5606dSYehuda Sadeh 4730dfc5606dSYehuda Sadeh static struct attribute_group rbd_attr_group = { 4731dfc5606dSYehuda Sadeh .attrs = rbd_attrs, 4732dfc5606dSYehuda Sadeh }; 4733dfc5606dSYehuda Sadeh 4734dfc5606dSYehuda Sadeh static const struct attribute_group *rbd_attr_groups[] = { 4735dfc5606dSYehuda Sadeh &rbd_attr_group, 4736dfc5606dSYehuda Sadeh NULL 4737dfc5606dSYehuda Sadeh }; 4738dfc5606dSYehuda Sadeh 47396cac4695SIlya Dryomov static void rbd_dev_release(struct device *dev); 4740dfc5606dSYehuda Sadeh 4741dfc5606dSYehuda Sadeh static struct device_type rbd_device_type = { 4742dfc5606dSYehuda Sadeh .name = "rbd", 4743dfc5606dSYehuda Sadeh .groups = rbd_attr_groups, 47446cac4695SIlya Dryomov .release = rbd_dev_release, 4745dfc5606dSYehuda Sadeh }; 4746dfc5606dSYehuda Sadeh 47478b8fb99cSAlex Elder static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec) 47488b8fb99cSAlex Elder { 47498b8fb99cSAlex Elder kref_get(&spec->kref); 47508b8fb99cSAlex Elder 47518b8fb99cSAlex Elder return spec; 47528b8fb99cSAlex Elder } 47538b8fb99cSAlex Elder 47548b8fb99cSAlex Elder static void rbd_spec_free(struct kref *kref); 47558b8fb99cSAlex Elder static void rbd_spec_put(struct rbd_spec *spec) 
47568b8fb99cSAlex Elder { 47578b8fb99cSAlex Elder if (spec) 47588b8fb99cSAlex Elder kref_put(&spec->kref, rbd_spec_free); 47598b8fb99cSAlex Elder } 47608b8fb99cSAlex Elder 47618b8fb99cSAlex Elder static struct rbd_spec *rbd_spec_alloc(void) 47628b8fb99cSAlex Elder { 47638b8fb99cSAlex Elder struct rbd_spec *spec; 47648b8fb99cSAlex Elder 47658b8fb99cSAlex Elder spec = kzalloc(sizeof (*spec), GFP_KERNEL); 47668b8fb99cSAlex Elder if (!spec) 47678b8fb99cSAlex Elder return NULL; 476804077599SIlya Dryomov 476904077599SIlya Dryomov spec->pool_id = CEPH_NOPOOL; 477004077599SIlya Dryomov spec->snap_id = CEPH_NOSNAP; 47718b8fb99cSAlex Elder kref_init(&spec->kref); 47728b8fb99cSAlex Elder 47738b8fb99cSAlex Elder return spec; 47748b8fb99cSAlex Elder } 47758b8fb99cSAlex Elder 47768b8fb99cSAlex Elder static void rbd_spec_free(struct kref *kref) 47778b8fb99cSAlex Elder { 47788b8fb99cSAlex Elder struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref); 47798b8fb99cSAlex Elder 47808b8fb99cSAlex Elder kfree(spec->pool_name); 47818b8fb99cSAlex Elder kfree(spec->image_id); 47828b8fb99cSAlex Elder kfree(spec->image_name); 47838b8fb99cSAlex Elder kfree(spec->snap_name); 47848b8fb99cSAlex Elder kfree(spec); 47858b8fb99cSAlex Elder } 47868b8fb99cSAlex Elder 47871643dfa4SIlya Dryomov static void rbd_dev_free(struct rbd_device *rbd_dev) 4788dd5ac32dSIlya Dryomov { 478999d16943SIlya Dryomov WARN_ON(rbd_dev->watch_state != RBD_WATCH_STATE_UNREGISTERED); 4790ed95b21aSIlya Dryomov WARN_ON(rbd_dev->lock_state != RBD_LOCK_STATE_UNLOCKED); 479199d16943SIlya Dryomov 4792c41d13a3SIlya Dryomov ceph_oid_destroy(&rbd_dev->header_oid); 47936b6dddbeSIlya Dryomov ceph_oloc_destroy(&rbd_dev->header_oloc); 4794c41d13a3SIlya Dryomov 4795dd5ac32dSIlya Dryomov rbd_put_client(rbd_dev->rbd_client); 4796dd5ac32dSIlya Dryomov rbd_spec_put(rbd_dev->spec); 4797dd5ac32dSIlya Dryomov kfree(rbd_dev->opts); 4798dd5ac32dSIlya Dryomov kfree(rbd_dev); 47991643dfa4SIlya Dryomov } 48001643dfa4SIlya Dryomov 
48011643dfa4SIlya Dryomov static void rbd_dev_release(struct device *dev) 48021643dfa4SIlya Dryomov { 48031643dfa4SIlya Dryomov struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 48041643dfa4SIlya Dryomov bool need_put = !!rbd_dev->opts; 48051643dfa4SIlya Dryomov 48061643dfa4SIlya Dryomov if (need_put) { 48071643dfa4SIlya Dryomov destroy_workqueue(rbd_dev->task_wq); 48081643dfa4SIlya Dryomov ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id); 48091643dfa4SIlya Dryomov } 48101643dfa4SIlya Dryomov 48111643dfa4SIlya Dryomov rbd_dev_free(rbd_dev); 4812dd5ac32dSIlya Dryomov 4813dd5ac32dSIlya Dryomov /* 4814dd5ac32dSIlya Dryomov * This is racy, but way better than putting module outside of 4815dd5ac32dSIlya Dryomov * the release callback. The race window is pretty small, so 4816dd5ac32dSIlya Dryomov * doing something similar to dm (dm-builtin.c) is overkill. 4817dd5ac32dSIlya Dryomov */ 4818dd5ac32dSIlya Dryomov if (need_put) 4819dd5ac32dSIlya Dryomov module_put(THIS_MODULE); 4820dd5ac32dSIlya Dryomov } 4821dd5ac32dSIlya Dryomov 48221643dfa4SIlya Dryomov static struct rbd_device *__rbd_dev_create(struct rbd_client *rbdc, 48231643dfa4SIlya Dryomov struct rbd_spec *spec) 4824c53d5893SAlex Elder { 4825c53d5893SAlex Elder struct rbd_device *rbd_dev; 4826c53d5893SAlex Elder 4827c53d5893SAlex Elder rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL); 4828c53d5893SAlex Elder if (!rbd_dev) 4829c53d5893SAlex Elder return NULL; 4830c53d5893SAlex Elder 4831c53d5893SAlex Elder spin_lock_init(&rbd_dev->lock); 4832c53d5893SAlex Elder INIT_LIST_HEAD(&rbd_dev->node); 4833c53d5893SAlex Elder init_rwsem(&rbd_dev->header_rwsem); 4834c53d5893SAlex Elder 4835c41d13a3SIlya Dryomov ceph_oid_init(&rbd_dev->header_oid); 4836922dab61SIlya Dryomov ceph_oloc_init(&rbd_dev->header_oloc); 4837c41d13a3SIlya Dryomov 483899d16943SIlya Dryomov mutex_init(&rbd_dev->watch_mutex); 483999d16943SIlya Dryomov rbd_dev->watch_state = RBD_WATCH_STATE_UNREGISTERED; 484099d16943SIlya Dryomov 
INIT_DELAYED_WORK(&rbd_dev->watch_dwork, rbd_reregister_watch); 484199d16943SIlya Dryomov 4842ed95b21aSIlya Dryomov init_rwsem(&rbd_dev->lock_rwsem); 4843ed95b21aSIlya Dryomov rbd_dev->lock_state = RBD_LOCK_STATE_UNLOCKED; 4844ed95b21aSIlya Dryomov INIT_WORK(&rbd_dev->acquired_lock_work, rbd_notify_acquired_lock); 4845ed95b21aSIlya Dryomov INIT_WORK(&rbd_dev->released_lock_work, rbd_notify_released_lock); 4846ed95b21aSIlya Dryomov INIT_DELAYED_WORK(&rbd_dev->lock_dwork, rbd_acquire_lock); 4847ed95b21aSIlya Dryomov INIT_WORK(&rbd_dev->unlock_work, rbd_release_lock_work); 4848ed95b21aSIlya Dryomov init_waitqueue_head(&rbd_dev->lock_waitq); 4849ed95b21aSIlya Dryomov 4850dd5ac32dSIlya Dryomov rbd_dev->dev.bus = &rbd_bus_type; 4851dd5ac32dSIlya Dryomov rbd_dev->dev.type = &rbd_device_type; 4852dd5ac32dSIlya Dryomov rbd_dev->dev.parent = &rbd_root_dev; 4853dd5ac32dSIlya Dryomov device_initialize(&rbd_dev->dev); 4854dd5ac32dSIlya Dryomov 4855c53d5893SAlex Elder rbd_dev->rbd_client = rbdc; 4856d147543dSIlya Dryomov rbd_dev->spec = spec; 48570903e875SAlex Elder 48587627151eSYan, Zheng rbd_dev->layout.stripe_unit = 1 << RBD_MAX_OBJ_ORDER; 48597627151eSYan, Zheng rbd_dev->layout.stripe_count = 1; 48607627151eSYan, Zheng rbd_dev->layout.object_size = 1 << RBD_MAX_OBJ_ORDER; 48617627151eSYan, Zheng rbd_dev->layout.pool_id = spec->pool_id; 486230c156d9SYan, Zheng RCU_INIT_POINTER(rbd_dev->layout.pool_ns, NULL); 48630903e875SAlex Elder 48641643dfa4SIlya Dryomov return rbd_dev; 48651643dfa4SIlya Dryomov } 48661643dfa4SIlya Dryomov 4867dd5ac32dSIlya Dryomov /* 48681643dfa4SIlya Dryomov * Create a mapping rbd_dev. 
4869dd5ac32dSIlya Dryomov */ 48701643dfa4SIlya Dryomov static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc, 48711643dfa4SIlya Dryomov struct rbd_spec *spec, 48721643dfa4SIlya Dryomov struct rbd_options *opts) 48731643dfa4SIlya Dryomov { 48741643dfa4SIlya Dryomov struct rbd_device *rbd_dev; 48751643dfa4SIlya Dryomov 48761643dfa4SIlya Dryomov rbd_dev = __rbd_dev_create(rbdc, spec); 48771643dfa4SIlya Dryomov if (!rbd_dev) 48781643dfa4SIlya Dryomov return NULL; 48791643dfa4SIlya Dryomov 48801643dfa4SIlya Dryomov rbd_dev->opts = opts; 48811643dfa4SIlya Dryomov 48821643dfa4SIlya Dryomov /* get an id and fill in device name */ 48831643dfa4SIlya Dryomov rbd_dev->dev_id = ida_simple_get(&rbd_dev_id_ida, 0, 48841643dfa4SIlya Dryomov minor_to_rbd_dev_id(1 << MINORBITS), 48851643dfa4SIlya Dryomov GFP_KERNEL); 48861643dfa4SIlya Dryomov if (rbd_dev->dev_id < 0) 48871643dfa4SIlya Dryomov goto fail_rbd_dev; 48881643dfa4SIlya Dryomov 48891643dfa4SIlya Dryomov sprintf(rbd_dev->name, RBD_DRV_NAME "%d", rbd_dev->dev_id); 48901643dfa4SIlya Dryomov rbd_dev->task_wq = alloc_ordered_workqueue("%s-tasks", WQ_MEM_RECLAIM, 48911643dfa4SIlya Dryomov rbd_dev->name); 48921643dfa4SIlya Dryomov if (!rbd_dev->task_wq) 48931643dfa4SIlya Dryomov goto fail_dev_id; 48941643dfa4SIlya Dryomov 48951643dfa4SIlya Dryomov /* we have a ref from do_rbd_add() */ 4896dd5ac32dSIlya Dryomov __module_get(THIS_MODULE); 4897dd5ac32dSIlya Dryomov 48981643dfa4SIlya Dryomov dout("%s rbd_dev %p dev_id %d\n", __func__, rbd_dev, rbd_dev->dev_id); 4899c53d5893SAlex Elder return rbd_dev; 49001643dfa4SIlya Dryomov 49011643dfa4SIlya Dryomov fail_dev_id: 49021643dfa4SIlya Dryomov ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id); 49031643dfa4SIlya Dryomov fail_rbd_dev: 49041643dfa4SIlya Dryomov rbd_dev_free(rbd_dev); 49051643dfa4SIlya Dryomov return NULL; 4906c53d5893SAlex Elder } 4907c53d5893SAlex Elder 4908c53d5893SAlex Elder static void rbd_dev_destroy(struct rbd_device *rbd_dev) 4909c53d5893SAlex Elder { 
4910dd5ac32dSIlya Dryomov if (rbd_dev) 4911dd5ac32dSIlya Dryomov put_device(&rbd_dev->dev); 4912c53d5893SAlex Elder } 4913c53d5893SAlex Elder 4914dfc5606dSYehuda Sadeh /* 49159d475de5SAlex Elder * Get the size and object order for an image snapshot, or if 49169d475de5SAlex Elder * snap_id is CEPH_NOSNAP, gets this information for the base 49179d475de5SAlex Elder * image. 49189d475de5SAlex Elder */ 49199d475de5SAlex Elder static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id, 49209d475de5SAlex Elder u8 *order, u64 *snap_size) 49219d475de5SAlex Elder { 49229d475de5SAlex Elder __le64 snapid = cpu_to_le64(snap_id); 49239d475de5SAlex Elder int ret; 49249d475de5SAlex Elder struct { 49259d475de5SAlex Elder u8 order; 49269d475de5SAlex Elder __le64 size; 49279d475de5SAlex Elder } __attribute__ ((packed)) size_buf = { 0 }; 49289d475de5SAlex Elder 4929c41d13a3SIlya Dryomov ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name, 49309d475de5SAlex Elder "rbd", "get_size", 49314157976bSAlex Elder &snapid, sizeof (snapid), 4932e2a58ee5SAlex Elder &size_buf, sizeof (size_buf)); 493336be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 49349d475de5SAlex Elder if (ret < 0) 49359d475de5SAlex Elder return ret; 493657385b51SAlex Elder if (ret < sizeof (size_buf)) 493757385b51SAlex Elder return -ERANGE; 49389d475de5SAlex Elder 4939c3545579SJosh Durgin if (order) { 49409d475de5SAlex Elder *order = size_buf.order; 4941c3545579SJosh Durgin dout(" order %u", (unsigned int)*order); 4942c3545579SJosh Durgin } 49439d475de5SAlex Elder *snap_size = le64_to_cpu(size_buf.size); 49449d475de5SAlex Elder 4945c3545579SJosh Durgin dout(" snap_id 0x%016llx snap_size = %llu\n", 4946c3545579SJosh Durgin (unsigned long long)snap_id, 49479d475de5SAlex Elder (unsigned long long)*snap_size); 49489d475de5SAlex Elder 49499d475de5SAlex Elder return 0; 49509d475de5SAlex Elder } 49519d475de5SAlex Elder 49529d475de5SAlex Elder static int 
rbd_dev_v2_image_size(struct rbd_device *rbd_dev) 49539d475de5SAlex Elder { 49549d475de5SAlex Elder return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP, 49559d475de5SAlex Elder &rbd_dev->header.obj_order, 49569d475de5SAlex Elder &rbd_dev->header.image_size); 49579d475de5SAlex Elder } 49589d475de5SAlex Elder 49591e130199SAlex Elder static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev) 49601e130199SAlex Elder { 49611e130199SAlex Elder void *reply_buf; 49621e130199SAlex Elder int ret; 49631e130199SAlex Elder void *p; 49641e130199SAlex Elder 49651e130199SAlex Elder reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL); 49661e130199SAlex Elder if (!reply_buf) 49671e130199SAlex Elder return -ENOMEM; 49681e130199SAlex Elder 4969c41d13a3SIlya Dryomov ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name, 49704157976bSAlex Elder "rbd", "get_object_prefix", NULL, 0, 4971e2a58ee5SAlex Elder reply_buf, RBD_OBJ_PREFIX_LEN_MAX); 497236be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 49731e130199SAlex Elder if (ret < 0) 49741e130199SAlex Elder goto out; 49751e130199SAlex Elder 49761e130199SAlex Elder p = reply_buf; 49771e130199SAlex Elder rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p, 497857385b51SAlex Elder p + ret, NULL, GFP_NOIO); 497957385b51SAlex Elder ret = 0; 49801e130199SAlex Elder 49811e130199SAlex Elder if (IS_ERR(rbd_dev->header.object_prefix)) { 49821e130199SAlex Elder ret = PTR_ERR(rbd_dev->header.object_prefix); 49831e130199SAlex Elder rbd_dev->header.object_prefix = NULL; 49841e130199SAlex Elder } else { 49851e130199SAlex Elder dout(" object_prefix = %s\n", rbd_dev->header.object_prefix); 49861e130199SAlex Elder } 49871e130199SAlex Elder out: 49881e130199SAlex Elder kfree(reply_buf); 49891e130199SAlex Elder 49901e130199SAlex Elder return ret; 49911e130199SAlex Elder } 49921e130199SAlex Elder 4993b1b5402aSAlex Elder static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id, 
4994b1b5402aSAlex Elder u64 *snap_features) 4995b1b5402aSAlex Elder { 4996b1b5402aSAlex Elder __le64 snapid = cpu_to_le64(snap_id); 4997b1b5402aSAlex Elder struct { 4998b1b5402aSAlex Elder __le64 features; 4999b1b5402aSAlex Elder __le64 incompat; 50004157976bSAlex Elder } __attribute__ ((packed)) features_buf = { 0 }; 5001d3767f0fSIlya Dryomov u64 unsup; 5002b1b5402aSAlex Elder int ret; 5003b1b5402aSAlex Elder 5004c41d13a3SIlya Dryomov ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name, 5005b1b5402aSAlex Elder "rbd", "get_features", 50064157976bSAlex Elder &snapid, sizeof (snapid), 5007e2a58ee5SAlex Elder &features_buf, sizeof (features_buf)); 500836be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 5009b1b5402aSAlex Elder if (ret < 0) 5010b1b5402aSAlex Elder return ret; 501157385b51SAlex Elder if (ret < sizeof (features_buf)) 501257385b51SAlex Elder return -ERANGE; 5013d889140cSAlex Elder 5014d3767f0fSIlya Dryomov unsup = le64_to_cpu(features_buf.incompat) & ~RBD_FEATURES_SUPPORTED; 5015d3767f0fSIlya Dryomov if (unsup) { 5016d3767f0fSIlya Dryomov rbd_warn(rbd_dev, "image uses unsupported features: 0x%llx", 5017d3767f0fSIlya Dryomov unsup); 5018b8f5c6edSAlex Elder return -ENXIO; 5019d3767f0fSIlya Dryomov } 5020d889140cSAlex Elder 5021b1b5402aSAlex Elder *snap_features = le64_to_cpu(features_buf.features); 5022b1b5402aSAlex Elder 5023b1b5402aSAlex Elder dout(" snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n", 5024b1b5402aSAlex Elder (unsigned long long)snap_id, 5025b1b5402aSAlex Elder (unsigned long long)*snap_features, 5026b1b5402aSAlex Elder (unsigned long long)le64_to_cpu(features_buf.incompat)); 5027b1b5402aSAlex Elder 5028b1b5402aSAlex Elder return 0; 5029b1b5402aSAlex Elder } 5030b1b5402aSAlex Elder 5031b1b5402aSAlex Elder static int rbd_dev_v2_features(struct rbd_device *rbd_dev) 5032b1b5402aSAlex Elder { 5033b1b5402aSAlex Elder return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP, 5034b1b5402aSAlex Elder 
&rbd_dev->header.features); 5035b1b5402aSAlex Elder } 5036b1b5402aSAlex Elder 503786b00e0dSAlex Elder static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev) 503886b00e0dSAlex Elder { 503986b00e0dSAlex Elder struct rbd_spec *parent_spec; 504086b00e0dSAlex Elder size_t size; 504186b00e0dSAlex Elder void *reply_buf = NULL; 504286b00e0dSAlex Elder __le64 snapid; 504386b00e0dSAlex Elder void *p; 504486b00e0dSAlex Elder void *end; 5045642a2537SAlex Elder u64 pool_id; 504686b00e0dSAlex Elder char *image_id; 50473b5cf2a2SAlex Elder u64 snap_id; 504886b00e0dSAlex Elder u64 overlap; 504986b00e0dSAlex Elder int ret; 505086b00e0dSAlex Elder 505186b00e0dSAlex Elder parent_spec = rbd_spec_alloc(); 505286b00e0dSAlex Elder if (!parent_spec) 505386b00e0dSAlex Elder return -ENOMEM; 505486b00e0dSAlex Elder 505586b00e0dSAlex Elder size = sizeof (__le64) + /* pool_id */ 505686b00e0dSAlex Elder sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX + /* image_id */ 505786b00e0dSAlex Elder sizeof (__le64) + /* snap_id */ 505886b00e0dSAlex Elder sizeof (__le64); /* overlap */ 505986b00e0dSAlex Elder reply_buf = kmalloc(size, GFP_KERNEL); 506086b00e0dSAlex Elder if (!reply_buf) { 506186b00e0dSAlex Elder ret = -ENOMEM; 506286b00e0dSAlex Elder goto out_err; 506386b00e0dSAlex Elder } 506486b00e0dSAlex Elder 50654d9b67cdSIlya Dryomov snapid = cpu_to_le64(rbd_dev->spec->snap_id); 5066c41d13a3SIlya Dryomov ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name, 506786b00e0dSAlex Elder "rbd", "get_parent", 50684157976bSAlex Elder &snapid, sizeof (snapid), 5069e2a58ee5SAlex Elder reply_buf, size); 507036be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 507186b00e0dSAlex Elder if (ret < 0) 507286b00e0dSAlex Elder goto out_err; 507386b00e0dSAlex Elder 507486b00e0dSAlex Elder p = reply_buf; 507557385b51SAlex Elder end = reply_buf + ret; 507657385b51SAlex Elder ret = -ERANGE; 5077642a2537SAlex Elder ceph_decode_64_safe(&p, end, pool_id, out_err); 5078392a9dadSAlex Elder if 
(pool_id == CEPH_NOPOOL) { 5079392a9dadSAlex Elder /* 5080392a9dadSAlex Elder * Either the parent never existed, or we have 5081392a9dadSAlex Elder * record of it but the image got flattened so it no 5082392a9dadSAlex Elder * longer has a parent. When the parent of a 5083392a9dadSAlex Elder * layered image disappears we immediately set the 5084392a9dadSAlex Elder * overlap to 0. The effect of this is that all new 5085392a9dadSAlex Elder * requests will be treated as if the image had no 5086392a9dadSAlex Elder * parent. 5087392a9dadSAlex Elder */ 5088392a9dadSAlex Elder if (rbd_dev->parent_overlap) { 5089392a9dadSAlex Elder rbd_dev->parent_overlap = 0; 5090392a9dadSAlex Elder rbd_dev_parent_put(rbd_dev); 5091392a9dadSAlex Elder pr_info("%s: clone image has been flattened\n", 5092392a9dadSAlex Elder rbd_dev->disk->disk_name); 5093392a9dadSAlex Elder } 5094392a9dadSAlex Elder 509586b00e0dSAlex Elder goto out; /* No parent? No problem. */ 5096392a9dadSAlex Elder } 509786b00e0dSAlex Elder 50980903e875SAlex Elder /* The ceph file layout needs to fit pool id in 32 bits */ 50990903e875SAlex Elder 51000903e875SAlex Elder ret = -EIO; 5101642a2537SAlex Elder if (pool_id > (u64)U32_MAX) { 51029584d508SIlya Dryomov rbd_warn(NULL, "parent pool id too large (%llu > %u)", 5103642a2537SAlex Elder (unsigned long long)pool_id, U32_MAX); 510457385b51SAlex Elder goto out_err; 5105c0cd10dbSAlex Elder } 51060903e875SAlex Elder 5107979ed480SAlex Elder image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL); 510886b00e0dSAlex Elder if (IS_ERR(image_id)) { 510986b00e0dSAlex Elder ret = PTR_ERR(image_id); 511086b00e0dSAlex Elder goto out_err; 511186b00e0dSAlex Elder } 51123b5cf2a2SAlex Elder ceph_decode_64_safe(&p, end, snap_id, out_err); 511386b00e0dSAlex Elder ceph_decode_64_safe(&p, end, overlap, out_err); 511486b00e0dSAlex Elder 51153b5cf2a2SAlex Elder /* 51163b5cf2a2SAlex Elder * The parent won't change (except when the clone is 51173b5cf2a2SAlex Elder * flattened, already 
handled that). So we only need to 51183b5cf2a2SAlex Elder * record the parent spec we have not already done so. 51193b5cf2a2SAlex Elder */ 51203b5cf2a2SAlex Elder if (!rbd_dev->parent_spec) { 51213b5cf2a2SAlex Elder parent_spec->pool_id = pool_id; 51223b5cf2a2SAlex Elder parent_spec->image_id = image_id; 51233b5cf2a2SAlex Elder parent_spec->snap_id = snap_id; 512486b00e0dSAlex Elder rbd_dev->parent_spec = parent_spec; 512586b00e0dSAlex Elder parent_spec = NULL; /* rbd_dev now owns this */ 5126fbba11b3SIlya Dryomov } else { 5127fbba11b3SIlya Dryomov kfree(image_id); 51283b5cf2a2SAlex Elder } 51293b5cf2a2SAlex Elder 51303b5cf2a2SAlex Elder /* 5131cf32bd9cSIlya Dryomov * We always update the parent overlap. If it's zero we issue 5132cf32bd9cSIlya Dryomov * a warning, as we will proceed as if there was no parent. 51333b5cf2a2SAlex Elder */ 51343b5cf2a2SAlex Elder if (!overlap) { 51353b5cf2a2SAlex Elder if (parent_spec) { 5136cf32bd9cSIlya Dryomov /* refresh, careful to warn just once */ 5137cf32bd9cSIlya Dryomov if (rbd_dev->parent_overlap) 5138cf32bd9cSIlya Dryomov rbd_warn(rbd_dev, 5139cf32bd9cSIlya Dryomov "clone now standalone (overlap became 0)"); 514070cf49cfSAlex Elder } else { 5141cf32bd9cSIlya Dryomov /* initial probe */ 5142cf32bd9cSIlya Dryomov rbd_warn(rbd_dev, "clone is standalone (overlap 0)"); 51433b5cf2a2SAlex Elder } 514470cf49cfSAlex Elder } 5145cf32bd9cSIlya Dryomov rbd_dev->parent_overlap = overlap; 5146cf32bd9cSIlya Dryomov 514786b00e0dSAlex Elder out: 514886b00e0dSAlex Elder ret = 0; 514986b00e0dSAlex Elder out_err: 515086b00e0dSAlex Elder kfree(reply_buf); 515186b00e0dSAlex Elder rbd_spec_put(parent_spec); 515286b00e0dSAlex Elder 515386b00e0dSAlex Elder return ret; 515486b00e0dSAlex Elder } 515586b00e0dSAlex Elder 5156cc070d59SAlex Elder static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev) 5157cc070d59SAlex Elder { 5158cc070d59SAlex Elder struct { 5159cc070d59SAlex Elder __le64 stripe_unit; 5160cc070d59SAlex Elder __le64 stripe_count; 
5161cc070d59SAlex Elder } __attribute__ ((packed)) striping_info_buf = { 0 }; 5162cc070d59SAlex Elder size_t size = sizeof (striping_info_buf); 5163cc070d59SAlex Elder void *p; 5164cc070d59SAlex Elder u64 obj_size; 5165cc070d59SAlex Elder u64 stripe_unit; 5166cc070d59SAlex Elder u64 stripe_count; 5167cc070d59SAlex Elder int ret; 5168cc070d59SAlex Elder 5169c41d13a3SIlya Dryomov ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name, 5170cc070d59SAlex Elder "rbd", "get_stripe_unit_count", NULL, 0, 5171e2a58ee5SAlex Elder (char *)&striping_info_buf, size); 5172cc070d59SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 5173cc070d59SAlex Elder if (ret < 0) 5174cc070d59SAlex Elder return ret; 5175cc070d59SAlex Elder if (ret < size) 5176cc070d59SAlex Elder return -ERANGE; 5177cc070d59SAlex Elder 5178cc070d59SAlex Elder /* 5179cc070d59SAlex Elder * We don't actually support the "fancy striping" feature 5180cc070d59SAlex Elder * (STRIPINGV2) yet, but if the striping sizes are the 5181cc070d59SAlex Elder * defaults the behavior is the same as before. So find 5182cc070d59SAlex Elder * out, and only fail if the image has non-default values. 
5183cc070d59SAlex Elder */ 5184cc070d59SAlex Elder ret = -EINVAL; 5185cc070d59SAlex Elder obj_size = (u64)1 << rbd_dev->header.obj_order; 5186cc070d59SAlex Elder p = &striping_info_buf; 5187cc070d59SAlex Elder stripe_unit = ceph_decode_64(&p); 5188cc070d59SAlex Elder if (stripe_unit != obj_size) { 5189cc070d59SAlex Elder rbd_warn(rbd_dev, "unsupported stripe unit " 5190cc070d59SAlex Elder "(got %llu want %llu)", 5191cc070d59SAlex Elder stripe_unit, obj_size); 5192cc070d59SAlex Elder return -EINVAL; 5193cc070d59SAlex Elder } 5194cc070d59SAlex Elder stripe_count = ceph_decode_64(&p); 5195cc070d59SAlex Elder if (stripe_count != 1) { 5196cc070d59SAlex Elder rbd_warn(rbd_dev, "unsupported stripe count " 5197cc070d59SAlex Elder "(got %llu want 1)", stripe_count); 5198cc070d59SAlex Elder return -EINVAL; 5199cc070d59SAlex Elder } 5200500d0c0fSAlex Elder rbd_dev->header.stripe_unit = stripe_unit; 5201500d0c0fSAlex Elder rbd_dev->header.stripe_count = stripe_count; 5202cc070d59SAlex Elder 5203cc070d59SAlex Elder return 0; 5204cc070d59SAlex Elder } 5205cc070d59SAlex Elder 52069e15b77dSAlex Elder static char *rbd_dev_image_name(struct rbd_device *rbd_dev) 52079e15b77dSAlex Elder { 52089e15b77dSAlex Elder size_t image_id_size; 52099e15b77dSAlex Elder char *image_id; 52109e15b77dSAlex Elder void *p; 52119e15b77dSAlex Elder void *end; 52129e15b77dSAlex Elder size_t size; 52139e15b77dSAlex Elder void *reply_buf = NULL; 52149e15b77dSAlex Elder size_t len = 0; 52159e15b77dSAlex Elder char *image_name = NULL; 52169e15b77dSAlex Elder int ret; 52179e15b77dSAlex Elder 52189e15b77dSAlex Elder rbd_assert(!rbd_dev->spec->image_name); 52199e15b77dSAlex Elder 522069e7a02fSAlex Elder len = strlen(rbd_dev->spec->image_id); 522169e7a02fSAlex Elder image_id_size = sizeof (__le32) + len; 52229e15b77dSAlex Elder image_id = kmalloc(image_id_size, GFP_KERNEL); 52239e15b77dSAlex Elder if (!image_id) 52249e15b77dSAlex Elder return NULL; 52259e15b77dSAlex Elder 52269e15b77dSAlex Elder p = image_id; 
52274157976bSAlex Elder end = image_id + image_id_size; 522869e7a02fSAlex Elder ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32)len); 52299e15b77dSAlex Elder 52309e15b77dSAlex Elder size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX; 52319e15b77dSAlex Elder reply_buf = kmalloc(size, GFP_KERNEL); 52329e15b77dSAlex Elder if (!reply_buf) 52339e15b77dSAlex Elder goto out; 52349e15b77dSAlex Elder 523536be9a76SAlex Elder ret = rbd_obj_method_sync(rbd_dev, RBD_DIRECTORY, 52369e15b77dSAlex Elder "rbd", "dir_get_name", 52379e15b77dSAlex Elder image_id, image_id_size, 5238e2a58ee5SAlex Elder reply_buf, size); 52399e15b77dSAlex Elder if (ret < 0) 52409e15b77dSAlex Elder goto out; 52419e15b77dSAlex Elder p = reply_buf; 5242f40eb349SAlex Elder end = reply_buf + ret; 5243f40eb349SAlex Elder 52449e15b77dSAlex Elder image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL); 52459e15b77dSAlex Elder if (IS_ERR(image_name)) 52469e15b77dSAlex Elder image_name = NULL; 52479e15b77dSAlex Elder else 52489e15b77dSAlex Elder dout("%s: name is %s len is %zd\n", __func__, image_name, len); 52499e15b77dSAlex Elder out: 52509e15b77dSAlex Elder kfree(reply_buf); 52519e15b77dSAlex Elder kfree(image_id); 52529e15b77dSAlex Elder 52539e15b77dSAlex Elder return image_name; 52549e15b77dSAlex Elder } 52559e15b77dSAlex Elder 52562ad3d716SAlex Elder static u64 rbd_v1_snap_id_by_name(struct rbd_device *rbd_dev, const char *name) 52572ad3d716SAlex Elder { 52582ad3d716SAlex Elder struct ceph_snap_context *snapc = rbd_dev->header.snapc; 52592ad3d716SAlex Elder const char *snap_name; 52602ad3d716SAlex Elder u32 which = 0; 52612ad3d716SAlex Elder 52622ad3d716SAlex Elder /* Skip over names until we find the one we are looking for */ 52632ad3d716SAlex Elder 52642ad3d716SAlex Elder snap_name = rbd_dev->header.snap_names; 52652ad3d716SAlex Elder while (which < snapc->num_snaps) { 52662ad3d716SAlex Elder if (!strcmp(name, snap_name)) 52672ad3d716SAlex Elder return snapc->snaps[which]; 
52682ad3d716SAlex Elder snap_name += strlen(snap_name) + 1; 52692ad3d716SAlex Elder which++; 52702ad3d716SAlex Elder } 52712ad3d716SAlex Elder return CEPH_NOSNAP; 52722ad3d716SAlex Elder } 52732ad3d716SAlex Elder 52742ad3d716SAlex Elder static u64 rbd_v2_snap_id_by_name(struct rbd_device *rbd_dev, const char *name) 52752ad3d716SAlex Elder { 52762ad3d716SAlex Elder struct ceph_snap_context *snapc = rbd_dev->header.snapc; 52772ad3d716SAlex Elder u32 which; 52782ad3d716SAlex Elder bool found = false; 52792ad3d716SAlex Elder u64 snap_id; 52802ad3d716SAlex Elder 52812ad3d716SAlex Elder for (which = 0; !found && which < snapc->num_snaps; which++) { 52822ad3d716SAlex Elder const char *snap_name; 52832ad3d716SAlex Elder 52842ad3d716SAlex Elder snap_id = snapc->snaps[which]; 52852ad3d716SAlex Elder snap_name = rbd_dev_v2_snap_name(rbd_dev, snap_id); 5286efadc98aSJosh Durgin if (IS_ERR(snap_name)) { 5287efadc98aSJosh Durgin /* ignore no-longer existing snapshots */ 5288efadc98aSJosh Durgin if (PTR_ERR(snap_name) == -ENOENT) 5289efadc98aSJosh Durgin continue; 5290efadc98aSJosh Durgin else 52912ad3d716SAlex Elder break; 5292efadc98aSJosh Durgin } 52932ad3d716SAlex Elder found = !strcmp(name, snap_name); 52942ad3d716SAlex Elder kfree(snap_name); 52952ad3d716SAlex Elder } 52962ad3d716SAlex Elder return found ? snap_id : CEPH_NOSNAP; 52972ad3d716SAlex Elder } 52982ad3d716SAlex Elder 52992ad3d716SAlex Elder /* 53002ad3d716SAlex Elder * Assumes name is never RBD_SNAP_HEAD_NAME; returns CEPH_NOSNAP if 53012ad3d716SAlex Elder * no snapshot by that name is found, or if an error occurs. 
53022ad3d716SAlex Elder */ 53032ad3d716SAlex Elder static u64 rbd_snap_id_by_name(struct rbd_device *rbd_dev, const char *name) 53042ad3d716SAlex Elder { 53052ad3d716SAlex Elder if (rbd_dev->image_format == 1) 53062ad3d716SAlex Elder return rbd_v1_snap_id_by_name(rbd_dev, name); 53072ad3d716SAlex Elder 53082ad3d716SAlex Elder return rbd_v2_snap_id_by_name(rbd_dev, name); 53092ad3d716SAlex Elder } 53102ad3d716SAlex Elder 53119e15b77dSAlex Elder /* 531204077599SIlya Dryomov * An image being mapped will have everything but the snap id. 53139e15b77dSAlex Elder */ 531404077599SIlya Dryomov static int rbd_spec_fill_snap_id(struct rbd_device *rbd_dev) 531504077599SIlya Dryomov { 531604077599SIlya Dryomov struct rbd_spec *spec = rbd_dev->spec; 531704077599SIlya Dryomov 531804077599SIlya Dryomov rbd_assert(spec->pool_id != CEPH_NOPOOL && spec->pool_name); 531904077599SIlya Dryomov rbd_assert(spec->image_id && spec->image_name); 532004077599SIlya Dryomov rbd_assert(spec->snap_name); 532104077599SIlya Dryomov 532204077599SIlya Dryomov if (strcmp(spec->snap_name, RBD_SNAP_HEAD_NAME)) { 532304077599SIlya Dryomov u64 snap_id; 532404077599SIlya Dryomov 532504077599SIlya Dryomov snap_id = rbd_snap_id_by_name(rbd_dev, spec->snap_name); 532604077599SIlya Dryomov if (snap_id == CEPH_NOSNAP) 532704077599SIlya Dryomov return -ENOENT; 532804077599SIlya Dryomov 532904077599SIlya Dryomov spec->snap_id = snap_id; 533004077599SIlya Dryomov } else { 533104077599SIlya Dryomov spec->snap_id = CEPH_NOSNAP; 533204077599SIlya Dryomov } 533304077599SIlya Dryomov 533404077599SIlya Dryomov return 0; 533504077599SIlya Dryomov } 533604077599SIlya Dryomov 533704077599SIlya Dryomov /* 533804077599SIlya Dryomov * A parent image will have all ids but none of the names. 533904077599SIlya Dryomov * 534004077599SIlya Dryomov * All names in an rbd spec are dynamically allocated. It's OK if we 534104077599SIlya Dryomov * can't figure out the name for an image id. 
534204077599SIlya Dryomov */ 534304077599SIlya Dryomov static int rbd_spec_fill_names(struct rbd_device *rbd_dev) 53449e15b77dSAlex Elder { 53452e9f7f1cSAlex Elder struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 53462e9f7f1cSAlex Elder struct rbd_spec *spec = rbd_dev->spec; 53472e9f7f1cSAlex Elder const char *pool_name; 53482e9f7f1cSAlex Elder const char *image_name; 53492e9f7f1cSAlex Elder const char *snap_name; 53509e15b77dSAlex Elder int ret; 53519e15b77dSAlex Elder 535204077599SIlya Dryomov rbd_assert(spec->pool_id != CEPH_NOPOOL); 535304077599SIlya Dryomov rbd_assert(spec->image_id); 535404077599SIlya Dryomov rbd_assert(spec->snap_id != CEPH_NOSNAP); 53559e15b77dSAlex Elder 53562e9f7f1cSAlex Elder /* Get the pool name; we have to make our own copy of this */ 53579e15b77dSAlex Elder 53582e9f7f1cSAlex Elder pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, spec->pool_id); 53592e9f7f1cSAlex Elder if (!pool_name) { 53602e9f7f1cSAlex Elder rbd_warn(rbd_dev, "no pool with id %llu", spec->pool_id); 5361935dc89fSAlex Elder return -EIO; 5362935dc89fSAlex Elder } 53632e9f7f1cSAlex Elder pool_name = kstrdup(pool_name, GFP_KERNEL); 53642e9f7f1cSAlex Elder if (!pool_name) 53659e15b77dSAlex Elder return -ENOMEM; 53669e15b77dSAlex Elder 53679e15b77dSAlex Elder /* Fetch the image name; tolerate failure here */ 53689e15b77dSAlex Elder 53692e9f7f1cSAlex Elder image_name = rbd_dev_image_name(rbd_dev); 53702e9f7f1cSAlex Elder if (!image_name) 537106ecc6cbSAlex Elder rbd_warn(rbd_dev, "unable to get image name"); 53729e15b77dSAlex Elder 537304077599SIlya Dryomov /* Fetch the snapshot name */ 53749e15b77dSAlex Elder 53752e9f7f1cSAlex Elder snap_name = rbd_snap_name(rbd_dev, spec->snap_id); 5376da6a6b63SJosh Durgin if (IS_ERR(snap_name)) { 5377da6a6b63SJosh Durgin ret = PTR_ERR(snap_name); 53789e15b77dSAlex Elder goto out_err; 53792e9f7f1cSAlex Elder } 53802e9f7f1cSAlex Elder 53812e9f7f1cSAlex Elder spec->pool_name = pool_name; 53822e9f7f1cSAlex Elder 
spec->image_name = image_name; 53832e9f7f1cSAlex Elder spec->snap_name = snap_name; 53849e15b77dSAlex Elder 53859e15b77dSAlex Elder return 0; 538604077599SIlya Dryomov 53879e15b77dSAlex Elder out_err: 53882e9f7f1cSAlex Elder kfree(image_name); 53892e9f7f1cSAlex Elder kfree(pool_name); 53909e15b77dSAlex Elder return ret; 53919e15b77dSAlex Elder } 53929e15b77dSAlex Elder 5393cc4a38bdSAlex Elder static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev) 539435d489f9SAlex Elder { 539535d489f9SAlex Elder size_t size; 539635d489f9SAlex Elder int ret; 539735d489f9SAlex Elder void *reply_buf; 539835d489f9SAlex Elder void *p; 539935d489f9SAlex Elder void *end; 540035d489f9SAlex Elder u64 seq; 540135d489f9SAlex Elder u32 snap_count; 540235d489f9SAlex Elder struct ceph_snap_context *snapc; 540335d489f9SAlex Elder u32 i; 540435d489f9SAlex Elder 540535d489f9SAlex Elder /* 540635d489f9SAlex Elder * We'll need room for the seq value (maximum snapshot id), 540735d489f9SAlex Elder * snapshot count, and array of that many snapshot ids. 540835d489f9SAlex Elder * For now we have a fixed upper limit on the number we're 540935d489f9SAlex Elder * prepared to receive. 
541035d489f9SAlex Elder */ 541135d489f9SAlex Elder size = sizeof (__le64) + sizeof (__le32) + 541235d489f9SAlex Elder RBD_MAX_SNAP_COUNT * sizeof (__le64); 541335d489f9SAlex Elder reply_buf = kzalloc(size, GFP_KERNEL); 541435d489f9SAlex Elder if (!reply_buf) 541535d489f9SAlex Elder return -ENOMEM; 541635d489f9SAlex Elder 5417c41d13a3SIlya Dryomov ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name, 54184157976bSAlex Elder "rbd", "get_snapcontext", NULL, 0, 5419e2a58ee5SAlex Elder reply_buf, size); 542036be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 542135d489f9SAlex Elder if (ret < 0) 542235d489f9SAlex Elder goto out; 542335d489f9SAlex Elder 542435d489f9SAlex Elder p = reply_buf; 542557385b51SAlex Elder end = reply_buf + ret; 542657385b51SAlex Elder ret = -ERANGE; 542735d489f9SAlex Elder ceph_decode_64_safe(&p, end, seq, out); 542835d489f9SAlex Elder ceph_decode_32_safe(&p, end, snap_count, out); 542935d489f9SAlex Elder 543035d489f9SAlex Elder /* 543135d489f9SAlex Elder * Make sure the reported number of snapshot ids wouldn't go 543235d489f9SAlex Elder * beyond the end of our buffer. But before checking that, 543335d489f9SAlex Elder * make sure the computed size of the snapshot context we 543435d489f9SAlex Elder * allocate is representable in a size_t. 
543535d489f9SAlex Elder */ 543635d489f9SAlex Elder if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context)) 543735d489f9SAlex Elder / sizeof (u64)) { 543835d489f9SAlex Elder ret = -EINVAL; 543935d489f9SAlex Elder goto out; 544035d489f9SAlex Elder } 544135d489f9SAlex Elder if (!ceph_has_room(&p, end, snap_count * sizeof (__le64))) 544235d489f9SAlex Elder goto out; 5443468521c1SAlex Elder ret = 0; 544435d489f9SAlex Elder 5445812164f8SAlex Elder snapc = ceph_create_snap_context(snap_count, GFP_KERNEL); 544635d489f9SAlex Elder if (!snapc) { 544735d489f9SAlex Elder ret = -ENOMEM; 544835d489f9SAlex Elder goto out; 544935d489f9SAlex Elder } 545035d489f9SAlex Elder snapc->seq = seq; 545135d489f9SAlex Elder for (i = 0; i < snap_count; i++) 545235d489f9SAlex Elder snapc->snaps[i] = ceph_decode_64(&p); 545335d489f9SAlex Elder 545449ece554SAlex Elder ceph_put_snap_context(rbd_dev->header.snapc); 545535d489f9SAlex Elder rbd_dev->header.snapc = snapc; 545635d489f9SAlex Elder 545735d489f9SAlex Elder dout(" snap context seq = %llu, snap_count = %u\n", 545835d489f9SAlex Elder (unsigned long long)seq, (unsigned int)snap_count); 545935d489f9SAlex Elder out: 546035d489f9SAlex Elder kfree(reply_buf); 546135d489f9SAlex Elder 546257385b51SAlex Elder return ret; 546335d489f9SAlex Elder } 546435d489f9SAlex Elder 546554cac61fSAlex Elder static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, 546654cac61fSAlex Elder u64 snap_id) 5467b8b1e2dbSAlex Elder { 5468b8b1e2dbSAlex Elder size_t size; 5469b8b1e2dbSAlex Elder void *reply_buf; 547054cac61fSAlex Elder __le64 snapid; 5471b8b1e2dbSAlex Elder int ret; 5472b8b1e2dbSAlex Elder void *p; 5473b8b1e2dbSAlex Elder void *end; 5474b8b1e2dbSAlex Elder char *snap_name; 5475b8b1e2dbSAlex Elder 5476b8b1e2dbSAlex Elder size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN; 5477b8b1e2dbSAlex Elder reply_buf = kmalloc(size, GFP_KERNEL); 5478b8b1e2dbSAlex Elder if (!reply_buf) 5479b8b1e2dbSAlex Elder return ERR_PTR(-ENOMEM); 5480b8b1e2dbSAlex 
Elder 548154cac61fSAlex Elder snapid = cpu_to_le64(snap_id); 5482c41d13a3SIlya Dryomov ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name, 5483b8b1e2dbSAlex Elder "rbd", "get_snapshot_name", 548454cac61fSAlex Elder &snapid, sizeof (snapid), 5485e2a58ee5SAlex Elder reply_buf, size); 548636be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 5487f40eb349SAlex Elder if (ret < 0) { 5488f40eb349SAlex Elder snap_name = ERR_PTR(ret); 5489b8b1e2dbSAlex Elder goto out; 5490f40eb349SAlex Elder } 5491b8b1e2dbSAlex Elder 5492b8b1e2dbSAlex Elder p = reply_buf; 5493f40eb349SAlex Elder end = reply_buf + ret; 5494e5c35534SAlex Elder snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL); 5495f40eb349SAlex Elder if (IS_ERR(snap_name)) 5496b8b1e2dbSAlex Elder goto out; 5497f40eb349SAlex Elder 5498b8b1e2dbSAlex Elder dout(" snap_id 0x%016llx snap_name = %s\n", 549954cac61fSAlex Elder (unsigned long long)snap_id, snap_name); 5500b8b1e2dbSAlex Elder out: 5501b8b1e2dbSAlex Elder kfree(reply_buf); 5502b8b1e2dbSAlex Elder 5503f40eb349SAlex Elder return snap_name; 5504b8b1e2dbSAlex Elder } 5505b8b1e2dbSAlex Elder 55062df3fac7SAlex Elder static int rbd_dev_v2_header_info(struct rbd_device *rbd_dev) 5507117973fbSAlex Elder { 55082df3fac7SAlex Elder bool first_time = rbd_dev->header.object_prefix == NULL; 5509117973fbSAlex Elder int ret; 5510117973fbSAlex Elder 55111617e40cSJosh Durgin ret = rbd_dev_v2_image_size(rbd_dev); 55121617e40cSJosh Durgin if (ret) 5513cfbf6377SAlex Elder return ret; 55141617e40cSJosh Durgin 55152df3fac7SAlex Elder if (first_time) { 55162df3fac7SAlex Elder ret = rbd_dev_v2_header_onetime(rbd_dev); 55172df3fac7SAlex Elder if (ret) 5518cfbf6377SAlex Elder return ret; 55192df3fac7SAlex Elder } 55202df3fac7SAlex Elder 5521cc4a38bdSAlex Elder ret = rbd_dev_v2_snap_context(rbd_dev); 5522d194cd1dSIlya Dryomov if (ret && first_time) { 5523d194cd1dSIlya Dryomov kfree(rbd_dev->header.object_prefix); 5524d194cd1dSIlya Dryomov 
rbd_dev->header.object_prefix = NULL; 5525d194cd1dSIlya Dryomov } 5526117973fbSAlex Elder 5527117973fbSAlex Elder return ret; 5528117973fbSAlex Elder } 5529117973fbSAlex Elder 5530a720ae09SIlya Dryomov static int rbd_dev_header_info(struct rbd_device *rbd_dev) 5531a720ae09SIlya Dryomov { 5532a720ae09SIlya Dryomov rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); 5533a720ae09SIlya Dryomov 5534a720ae09SIlya Dryomov if (rbd_dev->image_format == 1) 5535a720ae09SIlya Dryomov return rbd_dev_v1_header_info(rbd_dev); 5536a720ae09SIlya Dryomov 5537a720ae09SIlya Dryomov return rbd_dev_v2_header_info(rbd_dev); 5538a720ae09SIlya Dryomov } 5539a720ae09SIlya Dryomov 55401ddbe94eSAlex Elder /* 5541e28fff26SAlex Elder * Skips over white space at *buf, and updates *buf to point to the 5542e28fff26SAlex Elder * first found non-space character (if any). Returns the length of 5543593a9e7bSAlex Elder * the token (string of non-white space characters) found. Note 5544593a9e7bSAlex Elder * that *buf must be terminated with '\0'. 5545e28fff26SAlex Elder */ 5546e28fff26SAlex Elder static inline size_t next_token(const char **buf) 5547e28fff26SAlex Elder { 5548e28fff26SAlex Elder /* 5549e28fff26SAlex Elder * These are the characters that produce nonzero for 5550e28fff26SAlex Elder * isspace() in the "C" and "POSIX" locales. 5551e28fff26SAlex Elder */ 5552e28fff26SAlex Elder const char *spaces = " \f\n\r\t\v"; 5553e28fff26SAlex Elder 5554e28fff26SAlex Elder *buf += strspn(*buf, spaces); /* Find start of token */ 5555e28fff26SAlex Elder 5556e28fff26SAlex Elder return strcspn(*buf, spaces); /* Return token length */ 5557e28fff26SAlex Elder } 5558e28fff26SAlex Elder 5559e28fff26SAlex Elder /* 5560ea3352f4SAlex Elder * Finds the next token in *buf, dynamically allocates a buffer big 5561ea3352f4SAlex Elder * enough to hold a copy of it, and copies the token into the new 5562ea3352f4SAlex Elder * buffer. The copy is guaranteed to be terminated with '\0'. 
Note 5563ea3352f4SAlex Elder * that a duplicate buffer is created even for a zero-length token. 5564ea3352f4SAlex Elder * 5565ea3352f4SAlex Elder * Returns a pointer to the newly-allocated duplicate, or a null 5566ea3352f4SAlex Elder * pointer if memory for the duplicate was not available. If 5567ea3352f4SAlex Elder * the lenp argument is a non-null pointer, the length of the token 5568ea3352f4SAlex Elder * (not including the '\0') is returned in *lenp. 5569ea3352f4SAlex Elder * 5570ea3352f4SAlex Elder * If successful, the *buf pointer will be updated to point beyond 5571ea3352f4SAlex Elder * the end of the found token. 5572ea3352f4SAlex Elder * 5573ea3352f4SAlex Elder * Note: uses GFP_KERNEL for allocation. 5574ea3352f4SAlex Elder */ 5575ea3352f4SAlex Elder static inline char *dup_token(const char **buf, size_t *lenp) 5576ea3352f4SAlex Elder { 5577ea3352f4SAlex Elder char *dup; 5578ea3352f4SAlex Elder size_t len; 5579ea3352f4SAlex Elder 5580ea3352f4SAlex Elder len = next_token(buf); 55814caf35f9SAlex Elder dup = kmemdup(*buf, len + 1, GFP_KERNEL); 5582ea3352f4SAlex Elder if (!dup) 5583ea3352f4SAlex Elder return NULL; 5584ea3352f4SAlex Elder *(dup + len) = '\0'; 5585ea3352f4SAlex Elder *buf += len; 5586ea3352f4SAlex Elder 5587ea3352f4SAlex Elder if (lenp) 5588ea3352f4SAlex Elder *lenp = len; 5589ea3352f4SAlex Elder 5590ea3352f4SAlex Elder return dup; 5591ea3352f4SAlex Elder } 5592ea3352f4SAlex Elder 5593ea3352f4SAlex Elder /* 5594859c31dfSAlex Elder * Parse the options provided for an "rbd add" (i.e., rbd image 5595859c31dfSAlex Elder * mapping) request. These arrive via a write to /sys/bus/rbd/add, 5596859c31dfSAlex Elder * and the data written is passed here via a NUL-terminated buffer. 5597859c31dfSAlex Elder * Returns 0 if successful or an error code otherwise. 
5598d22f76e7SAlex Elder * 5599859c31dfSAlex Elder * The information extracted from these options is recorded in 5600859c31dfSAlex Elder * the other parameters which return dynamically-allocated 5601859c31dfSAlex Elder * structures: 5602859c31dfSAlex Elder * ceph_opts 5603859c31dfSAlex Elder * The address of a pointer that will refer to a ceph options 5604859c31dfSAlex Elder * structure. Caller must release the returned pointer using 5605859c31dfSAlex Elder * ceph_destroy_options() when it is no longer needed. 5606859c31dfSAlex Elder * opts 5607859c31dfSAlex Elder * Address of an rbd options pointer. Fully initialized by 5608859c31dfSAlex Elder * this function; caller must release with kfree(). 5609859c31dfSAlex Elder * rbd_spec 5610859c31dfSAlex Elder * Address of an rbd image specification pointer. Fully 5611859c31dfSAlex Elder * initialized by this function based on parsed options. 5612859c31dfSAlex Elder * Caller must release with rbd_spec_put(). 5613859c31dfSAlex Elder * 5614859c31dfSAlex Elder * The options passed take this form: 5615859c31dfSAlex Elder * <mon_addrs> <options> <pool_name> <image_name> [<snap_name>] 5616859c31dfSAlex Elder * where: 5617859c31dfSAlex Elder * <mon_addrs> 5618859c31dfSAlex Elder * A comma-separated list of one or more monitor addresses. 5619859c31dfSAlex Elder * A monitor address is an ip address, optionally followed 5620859c31dfSAlex Elder * by a port number (separated by a colon). 5621859c31dfSAlex Elder * I.e.: ip1[:port1][,ip2[:port2]...] 5622859c31dfSAlex Elder * <options> 5623859c31dfSAlex Elder * A comma-separated list of ceph and/or rbd options. 5624859c31dfSAlex Elder * <pool_name> 5625859c31dfSAlex Elder * The name of the rados pool containing the rbd image. 5626859c31dfSAlex Elder * <image_name> 5627859c31dfSAlex Elder * The name of the image in that pool to map. 5628859c31dfSAlex Elder * <snap_name> 5629859c31dfSAlex Elder * An optional snapshot name. 
If provided, the mapping will 5630859c31dfSAlex Elder * present data from the image at the time that snapshot was 5631859c31dfSAlex Elder * created. The image head is used if no snapshot id is 5632859c31dfSAlex Elder * provided. Snapshot mappings are always read-only. 5633a725f65eSAlex Elder */ 5634859c31dfSAlex Elder static int rbd_add_parse_args(const char *buf, 5635dc79b113SAlex Elder struct ceph_options **ceph_opts, 5636859c31dfSAlex Elder struct rbd_options **opts, 5637859c31dfSAlex Elder struct rbd_spec **rbd_spec) 5638a725f65eSAlex Elder { 5639e28fff26SAlex Elder size_t len; 5640859c31dfSAlex Elder char *options; 56410ddebc0cSAlex Elder const char *mon_addrs; 5642ecb4dc22SAlex Elder char *snap_name; 56430ddebc0cSAlex Elder size_t mon_addrs_size; 5644859c31dfSAlex Elder struct rbd_spec *spec = NULL; 56454e9afebaSAlex Elder struct rbd_options *rbd_opts = NULL; 5646859c31dfSAlex Elder struct ceph_options *copts; 5647dc79b113SAlex Elder int ret; 5648e28fff26SAlex Elder 5649e28fff26SAlex Elder /* The first four tokens are required */ 5650e28fff26SAlex Elder 56517ef3214aSAlex Elder len = next_token(&buf); 56524fb5d671SAlex Elder if (!len) { 56534fb5d671SAlex Elder rbd_warn(NULL, "no monitor address(es) provided"); 56544fb5d671SAlex Elder return -EINVAL; 56554fb5d671SAlex Elder } 56560ddebc0cSAlex Elder mon_addrs = buf; 5657f28e565aSAlex Elder mon_addrs_size = len + 1; 56587ef3214aSAlex Elder buf += len; 5659a725f65eSAlex Elder 5660dc79b113SAlex Elder ret = -EINVAL; 5661f28e565aSAlex Elder options = dup_token(&buf, NULL); 5662f28e565aSAlex Elder if (!options) 5663dc79b113SAlex Elder return -ENOMEM; 56644fb5d671SAlex Elder if (!*options) { 56654fb5d671SAlex Elder rbd_warn(NULL, "no options provided"); 56664fb5d671SAlex Elder goto out_err; 56674fb5d671SAlex Elder } 5668a725f65eSAlex Elder 5669859c31dfSAlex Elder spec = rbd_spec_alloc(); 5670859c31dfSAlex Elder if (!spec) 5671f28e565aSAlex Elder goto out_mem; 5672859c31dfSAlex Elder 5673859c31dfSAlex Elder 
spec->pool_name = dup_token(&buf, NULL); 5674859c31dfSAlex Elder if (!spec->pool_name) 5675859c31dfSAlex Elder goto out_mem; 56764fb5d671SAlex Elder if (!*spec->pool_name) { 56774fb5d671SAlex Elder rbd_warn(NULL, "no pool name provided"); 56784fb5d671SAlex Elder goto out_err; 56794fb5d671SAlex Elder } 5680e28fff26SAlex Elder 568169e7a02fSAlex Elder spec->image_name = dup_token(&buf, NULL); 5682859c31dfSAlex Elder if (!spec->image_name) 5683f28e565aSAlex Elder goto out_mem; 56844fb5d671SAlex Elder if (!*spec->image_name) { 56854fb5d671SAlex Elder rbd_warn(NULL, "no image name provided"); 56864fb5d671SAlex Elder goto out_err; 56874fb5d671SAlex Elder } 5688e28fff26SAlex Elder 5689f28e565aSAlex Elder /* 5690f28e565aSAlex Elder * Snapshot name is optional; default is to use "-" 5691f28e565aSAlex Elder * (indicating the head/no snapshot). 5692f28e565aSAlex Elder */ 56933feeb894SAlex Elder len = next_token(&buf); 5694820a5f3eSAlex Elder if (!len) { 56953feeb894SAlex Elder buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */ 56963feeb894SAlex Elder len = sizeof (RBD_SNAP_HEAD_NAME) - 1; 5697f28e565aSAlex Elder } else if (len > RBD_MAX_SNAP_NAME_LEN) { 5698dc79b113SAlex Elder ret = -ENAMETOOLONG; 5699f28e565aSAlex Elder goto out_err; 5700849b4260SAlex Elder } 5701ecb4dc22SAlex Elder snap_name = kmemdup(buf, len + 1, GFP_KERNEL); 5702ecb4dc22SAlex Elder if (!snap_name) 5703f28e565aSAlex Elder goto out_mem; 5704ecb4dc22SAlex Elder *(snap_name + len) = '\0'; 5705ecb4dc22SAlex Elder spec->snap_name = snap_name; 5706e5c35534SAlex Elder 57070ddebc0cSAlex Elder /* Initialize all rbd options to the defaults */ 5708e28fff26SAlex Elder 57094e9afebaSAlex Elder rbd_opts = kzalloc(sizeof (*rbd_opts), GFP_KERNEL); 57104e9afebaSAlex Elder if (!rbd_opts) 57114e9afebaSAlex Elder goto out_mem; 57124e9afebaSAlex Elder 57134e9afebaSAlex Elder rbd_opts->read_only = RBD_READ_ONLY_DEFAULT; 5714b5584180SIlya Dryomov rbd_opts->queue_depth = RBD_QUEUE_DEPTH_DEFAULT; 5715d22f76e7SAlex Elder 
5716859c31dfSAlex Elder copts = ceph_parse_options(options, mon_addrs, 57170ddebc0cSAlex Elder mon_addrs + mon_addrs_size - 1, 57184e9afebaSAlex Elder parse_rbd_opts_token, rbd_opts); 5719859c31dfSAlex Elder if (IS_ERR(copts)) { 5720859c31dfSAlex Elder ret = PTR_ERR(copts); 5721dc79b113SAlex Elder goto out_err; 5722dc79b113SAlex Elder } 5723859c31dfSAlex Elder kfree(options); 5724859c31dfSAlex Elder 5725859c31dfSAlex Elder *ceph_opts = copts; 57264e9afebaSAlex Elder *opts = rbd_opts; 5727859c31dfSAlex Elder *rbd_spec = spec; 57280ddebc0cSAlex Elder 5729dc79b113SAlex Elder return 0; 5730f28e565aSAlex Elder out_mem: 5731dc79b113SAlex Elder ret = -ENOMEM; 5732d22f76e7SAlex Elder out_err: 5733859c31dfSAlex Elder kfree(rbd_opts); 5734859c31dfSAlex Elder rbd_spec_put(spec); 5735f28e565aSAlex Elder kfree(options); 5736d22f76e7SAlex Elder 5737dc79b113SAlex Elder return ret; 5738a725f65eSAlex Elder } 5739a725f65eSAlex Elder 5740589d30e0SAlex Elder /* 574130ba1f02SIlya Dryomov * Return pool id (>= 0) or a negative error code. 
574230ba1f02SIlya Dryomov */ 574330ba1f02SIlya Dryomov static int rbd_add_get_pool_id(struct rbd_client *rbdc, const char *pool_name) 574430ba1f02SIlya Dryomov { 5745a319bf56SIlya Dryomov struct ceph_options *opts = rbdc->client->options; 574630ba1f02SIlya Dryomov u64 newest_epoch; 574730ba1f02SIlya Dryomov int tries = 0; 574830ba1f02SIlya Dryomov int ret; 574930ba1f02SIlya Dryomov 575030ba1f02SIlya Dryomov again: 575130ba1f02SIlya Dryomov ret = ceph_pg_poolid_by_name(rbdc->client->osdc.osdmap, pool_name); 575230ba1f02SIlya Dryomov if (ret == -ENOENT && tries++ < 1) { 5753d0b19705SIlya Dryomov ret = ceph_monc_get_version(&rbdc->client->monc, "osdmap", 575430ba1f02SIlya Dryomov &newest_epoch); 575530ba1f02SIlya Dryomov if (ret < 0) 575630ba1f02SIlya Dryomov return ret; 575730ba1f02SIlya Dryomov 575830ba1f02SIlya Dryomov if (rbdc->client->osdc.osdmap->epoch < newest_epoch) { 57597cca78c9SIlya Dryomov ceph_osdc_maybe_request_map(&rbdc->client->osdc); 576030ba1f02SIlya Dryomov (void) ceph_monc_wait_osdmap(&rbdc->client->monc, 5761a319bf56SIlya Dryomov newest_epoch, 5762a319bf56SIlya Dryomov opts->mount_timeout); 576330ba1f02SIlya Dryomov goto again; 576430ba1f02SIlya Dryomov } else { 576530ba1f02SIlya Dryomov /* the osdmap we have is new enough */ 576630ba1f02SIlya Dryomov return -ENOENT; 576730ba1f02SIlya Dryomov } 576830ba1f02SIlya Dryomov } 576930ba1f02SIlya Dryomov 577030ba1f02SIlya Dryomov return ret; 577130ba1f02SIlya Dryomov } 577230ba1f02SIlya Dryomov 577330ba1f02SIlya Dryomov /* 5774589d30e0SAlex Elder * An rbd format 2 image has a unique identifier, distinct from the 5775589d30e0SAlex Elder * name given to it by the user. Internally, that identifier is 5776589d30e0SAlex Elder * what's used to specify the names of objects related to the image. 5777589d30e0SAlex Elder * 5778589d30e0SAlex Elder * A special "rbd id" object is used to map an rbd image name to its 5779589d30e0SAlex Elder * id. 
If that object doesn't exist, then there is no v2 rbd image 5780589d30e0SAlex Elder * with the supplied name. 5781589d30e0SAlex Elder * 5782589d30e0SAlex Elder * This function will record the given rbd_dev's image_id field if 5783589d30e0SAlex Elder * it can be determined, and in that case will return 0. If any 5784589d30e0SAlex Elder * errors occur a negative errno will be returned and the rbd_dev's 5785589d30e0SAlex Elder * image_id field will be unchanged (and should be NULL). 5786589d30e0SAlex Elder */ 5787589d30e0SAlex Elder static int rbd_dev_image_id(struct rbd_device *rbd_dev) 5788589d30e0SAlex Elder { 5789589d30e0SAlex Elder int ret; 5790589d30e0SAlex Elder size_t size; 5791589d30e0SAlex Elder char *object_name; 5792589d30e0SAlex Elder void *response; 5793c0fba368SAlex Elder char *image_id; 57942f82ee54SAlex Elder 5795589d30e0SAlex Elder /* 57962c0d0a10SAlex Elder * When probing a parent image, the image id is already 57972c0d0a10SAlex Elder * known (and the image name likely is not). There's no 5798c0fba368SAlex Elder * need to fetch the image id again in this case. We 5799c0fba368SAlex Elder * do still need to set the image format though. 58002c0d0a10SAlex Elder */ 5801c0fba368SAlex Elder if (rbd_dev->spec->image_id) { 5802c0fba368SAlex Elder rbd_dev->image_format = *rbd_dev->spec->image_id ? 2 : 1; 5803c0fba368SAlex Elder 58042c0d0a10SAlex Elder return 0; 5805c0fba368SAlex Elder } 58062c0d0a10SAlex Elder 58072c0d0a10SAlex Elder /* 5808589d30e0SAlex Elder * First, see if the format 2 image id file exists, and if 5809589d30e0SAlex Elder * so, get the image's persistent id from it. 
5810589d30e0SAlex Elder */ 581169e7a02fSAlex Elder size = sizeof (RBD_ID_PREFIX) + strlen(rbd_dev->spec->image_name); 5812589d30e0SAlex Elder object_name = kmalloc(size, GFP_NOIO); 5813589d30e0SAlex Elder if (!object_name) 5814589d30e0SAlex Elder return -ENOMEM; 58150d7dbfceSAlex Elder sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->spec->image_name); 5816589d30e0SAlex Elder dout("rbd id object name is %s\n", object_name); 5817589d30e0SAlex Elder 5818589d30e0SAlex Elder /* Response will be an encoded string, which includes a length */ 5819589d30e0SAlex Elder 5820589d30e0SAlex Elder size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX; 5821589d30e0SAlex Elder response = kzalloc(size, GFP_NOIO); 5822589d30e0SAlex Elder if (!response) { 5823589d30e0SAlex Elder ret = -ENOMEM; 5824589d30e0SAlex Elder goto out; 5825589d30e0SAlex Elder } 5826589d30e0SAlex Elder 5827c0fba368SAlex Elder /* If it doesn't exist we'll assume it's a format 1 image */ 5828c0fba368SAlex Elder 582936be9a76SAlex Elder ret = rbd_obj_method_sync(rbd_dev, object_name, 58304157976bSAlex Elder "rbd", "get_id", NULL, 0, 5831e2a58ee5SAlex Elder response, RBD_IMAGE_ID_LEN_MAX); 583236be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 5833c0fba368SAlex Elder if (ret == -ENOENT) { 5834c0fba368SAlex Elder image_id = kstrdup("", GFP_KERNEL); 5835c0fba368SAlex Elder ret = image_id ? 
0 : -ENOMEM; 5836c0fba368SAlex Elder if (!ret) 5837c0fba368SAlex Elder rbd_dev->image_format = 1; 58387dd440c9SIlya Dryomov } else if (ret >= 0) { 5839c0fba368SAlex Elder void *p = response; 5840589d30e0SAlex Elder 5841c0fba368SAlex Elder image_id = ceph_extract_encoded_string(&p, p + ret, 5842979ed480SAlex Elder NULL, GFP_NOIO); 5843461f758aSDuan Jiong ret = PTR_ERR_OR_ZERO(image_id); 5844c0fba368SAlex Elder if (!ret) 5845c0fba368SAlex Elder rbd_dev->image_format = 2; 5846c0fba368SAlex Elder } 5847c0fba368SAlex Elder 5848c0fba368SAlex Elder if (!ret) { 5849c0fba368SAlex Elder rbd_dev->spec->image_id = image_id; 5850c0fba368SAlex Elder dout("image_id is %s\n", image_id); 5851589d30e0SAlex Elder } 5852589d30e0SAlex Elder out: 5853589d30e0SAlex Elder kfree(response); 5854589d30e0SAlex Elder kfree(object_name); 5855589d30e0SAlex Elder 5856589d30e0SAlex Elder return ret; 5857589d30e0SAlex Elder } 5858589d30e0SAlex Elder 58593abef3b3SAlex Elder /* 58603abef3b3SAlex Elder * Undo whatever state changes are made by v1 or v2 header info 58613abef3b3SAlex Elder * call. 
58623abef3b3SAlex Elder */ 58636fd48b3bSAlex Elder static void rbd_dev_unprobe(struct rbd_device *rbd_dev) 58646fd48b3bSAlex Elder { 58656fd48b3bSAlex Elder struct rbd_image_header *header; 58666fd48b3bSAlex Elder 5867a2acd00eSAlex Elder rbd_dev_parent_put(rbd_dev); 58686fd48b3bSAlex Elder 58696fd48b3bSAlex Elder /* Free dynamic fields from the header, then zero it out */ 58706fd48b3bSAlex Elder 58716fd48b3bSAlex Elder header = &rbd_dev->header; 5872812164f8SAlex Elder ceph_put_snap_context(header->snapc); 58736fd48b3bSAlex Elder kfree(header->snap_sizes); 58746fd48b3bSAlex Elder kfree(header->snap_names); 58756fd48b3bSAlex Elder kfree(header->object_prefix); 58766fd48b3bSAlex Elder memset(header, 0, sizeof (*header)); 58776fd48b3bSAlex Elder } 58786fd48b3bSAlex Elder 58792df3fac7SAlex Elder static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev) 5880a30b71b9SAlex Elder { 5881a30b71b9SAlex Elder int ret; 5882a30b71b9SAlex Elder 58831e130199SAlex Elder ret = rbd_dev_v2_object_prefix(rbd_dev); 588457385b51SAlex Elder if (ret) 58851e130199SAlex Elder goto out_err; 5886b1b5402aSAlex Elder 58872df3fac7SAlex Elder /* 58882df3fac7SAlex Elder * Get the and check features for the image. Currently the 58892df3fac7SAlex Elder * features are assumed to never change. 
58902df3fac7SAlex Elder */ 5891b1b5402aSAlex Elder ret = rbd_dev_v2_features(rbd_dev); 589257385b51SAlex Elder if (ret) 5893b1b5402aSAlex Elder goto out_err; 589435d489f9SAlex Elder 5895cc070d59SAlex Elder /* If the image supports fancy striping, get its parameters */ 5896cc070d59SAlex Elder 5897cc070d59SAlex Elder if (rbd_dev->header.features & RBD_FEATURE_STRIPINGV2) { 5898cc070d59SAlex Elder ret = rbd_dev_v2_striping_info(rbd_dev); 5899cc070d59SAlex Elder if (ret < 0) 5900cc070d59SAlex Elder goto out_err; 5901cc070d59SAlex Elder } 59022df3fac7SAlex Elder /* No support for crypto and compression type format 2 images */ 5903a30b71b9SAlex Elder 590435152979SAlex Elder return 0; 59059d475de5SAlex Elder out_err: 5906642a2537SAlex Elder rbd_dev->header.features = 0; 59071e130199SAlex Elder kfree(rbd_dev->header.object_prefix); 59081e130199SAlex Elder rbd_dev->header.object_prefix = NULL; 59099d475de5SAlex Elder 59109d475de5SAlex Elder return ret; 5911a30b71b9SAlex Elder } 5912a30b71b9SAlex Elder 59136d69bb53SIlya Dryomov /* 59146d69bb53SIlya Dryomov * @depth is rbd_dev_image_probe() -> rbd_dev_probe_parent() -> 59156d69bb53SIlya Dryomov * rbd_dev_image_probe() recursion depth, which means it's also the 59166d69bb53SIlya Dryomov * length of the already discovered part of the parent chain. 
59176d69bb53SIlya Dryomov */ 59186d69bb53SIlya Dryomov static int rbd_dev_probe_parent(struct rbd_device *rbd_dev, int depth) 591983a06263SAlex Elder { 59202f82ee54SAlex Elder struct rbd_device *parent = NULL; 5921124afba2SAlex Elder int ret; 5922124afba2SAlex Elder 5923124afba2SAlex Elder if (!rbd_dev->parent_spec) 5924124afba2SAlex Elder return 0; 5925124afba2SAlex Elder 59266d69bb53SIlya Dryomov if (++depth > RBD_MAX_PARENT_CHAIN_LEN) { 59276d69bb53SIlya Dryomov pr_info("parent chain is too long (%d)\n", depth); 59286d69bb53SIlya Dryomov ret = -EINVAL; 59296d69bb53SIlya Dryomov goto out_err; 59306d69bb53SIlya Dryomov } 59316d69bb53SIlya Dryomov 59321643dfa4SIlya Dryomov parent = __rbd_dev_create(rbd_dev->rbd_client, rbd_dev->parent_spec); 59331f2c6651SIlya Dryomov if (!parent) { 5934124afba2SAlex Elder ret = -ENOMEM; 5935124afba2SAlex Elder goto out_err; 59361f2c6651SIlya Dryomov } 59371f2c6651SIlya Dryomov 59381f2c6651SIlya Dryomov /* 59391f2c6651SIlya Dryomov * Images related by parent/child relationships always share 59401f2c6651SIlya Dryomov * rbd_client and spec/parent_spec, so bump their refcounts. 
59411f2c6651SIlya Dryomov */ 59421f2c6651SIlya Dryomov __rbd_get_client(rbd_dev->rbd_client); 59431f2c6651SIlya Dryomov rbd_spec_get(rbd_dev->parent_spec); 5944124afba2SAlex Elder 59456d69bb53SIlya Dryomov ret = rbd_dev_image_probe(parent, depth); 5946124afba2SAlex Elder if (ret < 0) 5947124afba2SAlex Elder goto out_err; 59481f2c6651SIlya Dryomov 5949124afba2SAlex Elder rbd_dev->parent = parent; 5950a2acd00eSAlex Elder atomic_set(&rbd_dev->parent_ref, 1); 5951124afba2SAlex Elder return 0; 5952124afba2SAlex Elder 59531f2c6651SIlya Dryomov out_err: 59541f2c6651SIlya Dryomov rbd_dev_unparent(rbd_dev); 59551f2c6651SIlya Dryomov rbd_dev_destroy(parent); 5956124afba2SAlex Elder return ret; 5957124afba2SAlex Elder } 5958124afba2SAlex Elder 5959811c6688SIlya Dryomov /* 5960811c6688SIlya Dryomov * rbd_dev->header_rwsem must be locked for write and will be unlocked 5961811c6688SIlya Dryomov * upon return. 5962811c6688SIlya Dryomov */ 5963200a6a8bSAlex Elder static int rbd_dev_device_setup(struct rbd_device *rbd_dev) 5964124afba2SAlex Elder { 596583a06263SAlex Elder int ret; 596683a06263SAlex Elder 59679b60e70bSIlya Dryomov /* Record our major and minor device numbers. */ 596883a06263SAlex Elder 59699b60e70bSIlya Dryomov if (!single_major) { 597083a06263SAlex Elder ret = register_blkdev(0, rbd_dev->name); 597183a06263SAlex Elder if (ret < 0) 59721643dfa4SIlya Dryomov goto err_out_unlock; 59739b60e70bSIlya Dryomov 597483a06263SAlex Elder rbd_dev->major = ret; 5975dd82fff1SIlya Dryomov rbd_dev->minor = 0; 59769b60e70bSIlya Dryomov } else { 59779b60e70bSIlya Dryomov rbd_dev->major = rbd_major; 59789b60e70bSIlya Dryomov rbd_dev->minor = rbd_dev_id_to_minor(rbd_dev->dev_id); 59799b60e70bSIlya Dryomov } 598083a06263SAlex Elder 598183a06263SAlex Elder /* Set up the blkdev mapping. 
*/ 598283a06263SAlex Elder 598383a06263SAlex Elder ret = rbd_init_disk(rbd_dev); 598483a06263SAlex Elder if (ret) 598583a06263SAlex Elder goto err_out_blkdev; 598683a06263SAlex Elder 5987f35a4deeSAlex Elder ret = rbd_dev_mapping_set(rbd_dev); 598883a06263SAlex Elder if (ret) 598983a06263SAlex Elder goto err_out_disk; 5990bc1ecc65SIlya Dryomov 5991f35a4deeSAlex Elder set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE); 599222001f61SJosh Durgin set_disk_ro(rbd_dev->disk, rbd_dev->mapping.read_only); 5993f35a4deeSAlex Elder 5994dd5ac32dSIlya Dryomov dev_set_name(&rbd_dev->dev, "%d", rbd_dev->dev_id); 5995dd5ac32dSIlya Dryomov ret = device_add(&rbd_dev->dev); 5996f35a4deeSAlex Elder if (ret) 5997f5ee37bdSIlya Dryomov goto err_out_mapping; 599883a06263SAlex Elder 599983a06263SAlex Elder /* Everything's ready. Announce the disk to the world. */ 600083a06263SAlex Elder 6001129b79d4SAlex Elder set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags); 6002811c6688SIlya Dryomov up_write(&rbd_dev->header_rwsem); 600383a06263SAlex Elder 60041643dfa4SIlya Dryomov spin_lock(&rbd_dev_list_lock); 60051643dfa4SIlya Dryomov list_add_tail(&rbd_dev->node, &rbd_dev_list); 60061643dfa4SIlya Dryomov spin_unlock(&rbd_dev_list_lock); 60071643dfa4SIlya Dryomov 6008811c6688SIlya Dryomov add_disk(rbd_dev->disk); 600983a06263SAlex Elder pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name, 601083a06263SAlex Elder (unsigned long long) rbd_dev->mapping.size); 601183a06263SAlex Elder 601283a06263SAlex Elder return ret; 60132f82ee54SAlex Elder 6014f35a4deeSAlex Elder err_out_mapping: 6015f35a4deeSAlex Elder rbd_dev_mapping_clear(rbd_dev); 601683a06263SAlex Elder err_out_disk: 601783a06263SAlex Elder rbd_free_disk(rbd_dev); 601883a06263SAlex Elder err_out_blkdev: 60199b60e70bSIlya Dryomov if (!single_major) 602083a06263SAlex Elder unregister_blkdev(rbd_dev->major, rbd_dev->name); 6021811c6688SIlya Dryomov err_out_unlock: 6022811c6688SIlya Dryomov up_write(&rbd_dev->header_rwsem); 
602383a06263SAlex Elder return ret; 602483a06263SAlex Elder } 602583a06263SAlex Elder 6026332bb12dSAlex Elder static int rbd_dev_header_name(struct rbd_device *rbd_dev) 6027332bb12dSAlex Elder { 6028332bb12dSAlex Elder struct rbd_spec *spec = rbd_dev->spec; 6029c41d13a3SIlya Dryomov int ret; 6030332bb12dSAlex Elder 6031332bb12dSAlex Elder /* Record the header object name for this rbd image. */ 6032332bb12dSAlex Elder 6033332bb12dSAlex Elder rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); 6034332bb12dSAlex Elder 60357627151eSYan, Zheng rbd_dev->header_oloc.pool = rbd_dev->layout.pool_id; 6036332bb12dSAlex Elder if (rbd_dev->image_format == 1) 6037c41d13a3SIlya Dryomov ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s", 6038332bb12dSAlex Elder spec->image_name, RBD_SUFFIX); 6039332bb12dSAlex Elder else 6040c41d13a3SIlya Dryomov ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s", 6041332bb12dSAlex Elder RBD_HEADER_PREFIX, spec->image_id); 6042c41d13a3SIlya Dryomov 6043c41d13a3SIlya Dryomov return ret; 6044332bb12dSAlex Elder } 6045332bb12dSAlex Elder 6046200a6a8bSAlex Elder static void rbd_dev_image_release(struct rbd_device *rbd_dev) 6047200a6a8bSAlex Elder { 60486fd48b3bSAlex Elder rbd_dev_unprobe(rbd_dev); 60496fd48b3bSAlex Elder rbd_dev->image_format = 0; 60506fd48b3bSAlex Elder kfree(rbd_dev->spec->image_id); 60516fd48b3bSAlex Elder rbd_dev->spec->image_id = NULL; 60526fd48b3bSAlex Elder 6053200a6a8bSAlex Elder rbd_dev_destroy(rbd_dev); 6054200a6a8bSAlex Elder } 6055200a6a8bSAlex Elder 6056a30b71b9SAlex Elder /* 6057a30b71b9SAlex Elder * Probe for the existence of the header object for the given rbd 60581f3ef788SAlex Elder * device. If this image is the one being mapped (i.e., not a 60591f3ef788SAlex Elder * parent), initiate a watch on its header object before using that 60601f3ef788SAlex Elder * object to get detailed information about the rbd image. 
6061a30b71b9SAlex Elder */ 60626d69bb53SIlya Dryomov static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth) 6063a30b71b9SAlex Elder { 6064a30b71b9SAlex Elder int ret; 6065a30b71b9SAlex Elder 6066a30b71b9SAlex Elder /* 60673abef3b3SAlex Elder * Get the id from the image id object. Unless there's an 60683abef3b3SAlex Elder * error, rbd_dev->spec->image_id will be filled in with 60693abef3b3SAlex Elder * a dynamically-allocated string, and rbd_dev->image_format 60703abef3b3SAlex Elder * will be set to either 1 or 2. 6071a30b71b9SAlex Elder */ 6072a30b71b9SAlex Elder ret = rbd_dev_image_id(rbd_dev); 6073a30b71b9SAlex Elder if (ret) 6074c0fba368SAlex Elder return ret; 6075c0fba368SAlex Elder 6076332bb12dSAlex Elder ret = rbd_dev_header_name(rbd_dev); 6077332bb12dSAlex Elder if (ret) 6078332bb12dSAlex Elder goto err_out_format; 6079332bb12dSAlex Elder 60806d69bb53SIlya Dryomov if (!depth) { 608199d16943SIlya Dryomov ret = rbd_register_watch(rbd_dev); 60821fe48023SIlya Dryomov if (ret) { 60831fe48023SIlya Dryomov if (ret == -ENOENT) 60841fe48023SIlya Dryomov pr_info("image %s/%s does not exist\n", 60851fe48023SIlya Dryomov rbd_dev->spec->pool_name, 60861fe48023SIlya Dryomov rbd_dev->spec->image_name); 6087c41d13a3SIlya Dryomov goto err_out_format; 60881f3ef788SAlex Elder } 60891fe48023SIlya Dryomov } 6090b644de2bSAlex Elder 6091a720ae09SIlya Dryomov ret = rbd_dev_header_info(rbd_dev); 60925655c4d9SAlex Elder if (ret) 6093b644de2bSAlex Elder goto err_out_watch; 6094a30b71b9SAlex Elder 609504077599SIlya Dryomov /* 609604077599SIlya Dryomov * If this image is the one being mapped, we have pool name and 609704077599SIlya Dryomov * id, image name and id, and snap name - need to fill snap id. 609804077599SIlya Dryomov * Otherwise this is a parent image, identified by pool, image 609904077599SIlya Dryomov * and snap ids - need to fill in names for those ids. 
610004077599SIlya Dryomov */ 61016d69bb53SIlya Dryomov if (!depth) 610204077599SIlya Dryomov ret = rbd_spec_fill_snap_id(rbd_dev); 610304077599SIlya Dryomov else 610404077599SIlya Dryomov ret = rbd_spec_fill_names(rbd_dev); 61051fe48023SIlya Dryomov if (ret) { 61061fe48023SIlya Dryomov if (ret == -ENOENT) 61071fe48023SIlya Dryomov pr_info("snap %s/%s@%s does not exist\n", 61081fe48023SIlya Dryomov rbd_dev->spec->pool_name, 61091fe48023SIlya Dryomov rbd_dev->spec->image_name, 61101fe48023SIlya Dryomov rbd_dev->spec->snap_name); 611133dca39fSAlex Elder goto err_out_probe; 61121fe48023SIlya Dryomov } 61139bb81c9bSAlex Elder 6114e8f59b59SIlya Dryomov if (rbd_dev->header.features & RBD_FEATURE_LAYERING) { 6115e8f59b59SIlya Dryomov ret = rbd_dev_v2_parent_info(rbd_dev); 6116e8f59b59SIlya Dryomov if (ret) 6117e8f59b59SIlya Dryomov goto err_out_probe; 6118e8f59b59SIlya Dryomov 6119e8f59b59SIlya Dryomov /* 6120e8f59b59SIlya Dryomov * Need to warn users if this image is the one being 6121e8f59b59SIlya Dryomov * mapped and has a parent. 
6122e8f59b59SIlya Dryomov */ 61236d69bb53SIlya Dryomov if (!depth && rbd_dev->parent_spec) 6124e8f59b59SIlya Dryomov rbd_warn(rbd_dev, 6125e8f59b59SIlya Dryomov "WARNING: kernel layering is EXPERIMENTAL!"); 6126e8f59b59SIlya Dryomov } 6127e8f59b59SIlya Dryomov 61286d69bb53SIlya Dryomov ret = rbd_dev_probe_parent(rbd_dev, depth); 612930d60ba2SAlex Elder if (ret) 613030d60ba2SAlex Elder goto err_out_probe; 613183a06263SAlex Elder 613230d60ba2SAlex Elder dout("discovered format %u image, header name is %s\n", 6133c41d13a3SIlya Dryomov rbd_dev->image_format, rbd_dev->header_oid.name); 613430d60ba2SAlex Elder return 0; 6135e8f59b59SIlya Dryomov 61366fd48b3bSAlex Elder err_out_probe: 61376fd48b3bSAlex Elder rbd_dev_unprobe(rbd_dev); 6138b644de2bSAlex Elder err_out_watch: 61396d69bb53SIlya Dryomov if (!depth) 614099d16943SIlya Dryomov rbd_unregister_watch(rbd_dev); 6141332bb12dSAlex Elder err_out_format: 6142332bb12dSAlex Elder rbd_dev->image_format = 0; 61435655c4d9SAlex Elder kfree(rbd_dev->spec->image_id); 61445655c4d9SAlex Elder rbd_dev->spec->image_id = NULL; 61455655c4d9SAlex Elder return ret; 614683a06263SAlex Elder } 614783a06263SAlex Elder 61489b60e70bSIlya Dryomov static ssize_t do_rbd_add(struct bus_type *bus, 614959c2be1eSYehuda Sadeh const char *buf, 615059c2be1eSYehuda Sadeh size_t count) 6151602adf40SYehuda Sadeh { 6152cb8627c7SAlex Elder struct rbd_device *rbd_dev = NULL; 6153dc79b113SAlex Elder struct ceph_options *ceph_opts = NULL; 61544e9afebaSAlex Elder struct rbd_options *rbd_opts = NULL; 6155859c31dfSAlex Elder struct rbd_spec *spec = NULL; 61569d3997fdSAlex Elder struct rbd_client *rbdc; 615751344a38SAlex Elder bool read_only; 6158b51c83c2SIlya Dryomov int rc; 6159602adf40SYehuda Sadeh 6160602adf40SYehuda Sadeh if (!try_module_get(THIS_MODULE)) 6161602adf40SYehuda Sadeh return -ENODEV; 6162602adf40SYehuda Sadeh 6163a725f65eSAlex Elder /* parse add command */ 6164859c31dfSAlex Elder rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec); 
6165dc79b113SAlex Elder if (rc < 0) 6166dd5ac32dSIlya Dryomov goto out; 6167a725f65eSAlex Elder 61689d3997fdSAlex Elder rbdc = rbd_get_client(ceph_opts); 61699d3997fdSAlex Elder if (IS_ERR(rbdc)) { 61709d3997fdSAlex Elder rc = PTR_ERR(rbdc); 61710ddebc0cSAlex Elder goto err_out_args; 61729d3997fdSAlex Elder } 6173602adf40SYehuda Sadeh 6174602adf40SYehuda Sadeh /* pick the pool */ 617530ba1f02SIlya Dryomov rc = rbd_add_get_pool_id(rbdc, spec->pool_name); 61761fe48023SIlya Dryomov if (rc < 0) { 61771fe48023SIlya Dryomov if (rc == -ENOENT) 61781fe48023SIlya Dryomov pr_info("pool %s does not exist\n", spec->pool_name); 6179602adf40SYehuda Sadeh goto err_out_client; 61801fe48023SIlya Dryomov } 6181859c31dfSAlex Elder spec->pool_id = (u64)rc; 6182859c31dfSAlex Elder 6183d147543dSIlya Dryomov rbd_dev = rbd_dev_create(rbdc, spec, rbd_opts); 6184b51c83c2SIlya Dryomov if (!rbd_dev) { 6185b51c83c2SIlya Dryomov rc = -ENOMEM; 6186bd4ba655SAlex Elder goto err_out_client; 6187b51c83c2SIlya Dryomov } 6188c53d5893SAlex Elder rbdc = NULL; /* rbd_dev now owns this */ 6189c53d5893SAlex Elder spec = NULL; /* rbd_dev now owns this */ 6190d147543dSIlya Dryomov rbd_opts = NULL; /* rbd_dev now owns this */ 6191602adf40SYehuda Sadeh 6192811c6688SIlya Dryomov down_write(&rbd_dev->header_rwsem); 61936d69bb53SIlya Dryomov rc = rbd_dev_image_probe(rbd_dev, 0); 6194a30b71b9SAlex Elder if (rc < 0) 6195c53d5893SAlex Elder goto err_out_rbd_dev; 619605fd6f6fSAlex Elder 61977ce4eef7SAlex Elder /* If we are mapping a snapshot it must be marked read-only */ 61987ce4eef7SAlex Elder 6199d147543dSIlya Dryomov read_only = rbd_dev->opts->read_only; 62007ce4eef7SAlex Elder if (rbd_dev->spec->snap_id != CEPH_NOSNAP) 62017ce4eef7SAlex Elder read_only = true; 62027ce4eef7SAlex Elder rbd_dev->mapping.read_only = read_only; 62037ce4eef7SAlex Elder 6204b536f69aSAlex Elder rc = rbd_dev_device_setup(rbd_dev); 62053abef3b3SAlex Elder if (rc) { 6206e37180c0SIlya Dryomov /* 620799d16943SIlya Dryomov * 
rbd_unregister_watch() can't be moved into 6208e37180c0SIlya Dryomov * rbd_dev_image_release() without refactoring, see 6209e37180c0SIlya Dryomov * commit 1f3ef78861ac. 6210e37180c0SIlya Dryomov */ 621199d16943SIlya Dryomov rbd_unregister_watch(rbd_dev); 62123abef3b3SAlex Elder rbd_dev_image_release(rbd_dev); 6213dd5ac32dSIlya Dryomov goto out; 62143abef3b3SAlex Elder } 62153abef3b3SAlex Elder 6216dd5ac32dSIlya Dryomov rc = count; 6217dd5ac32dSIlya Dryomov out: 6218dd5ac32dSIlya Dryomov module_put(THIS_MODULE); 6219dd5ac32dSIlya Dryomov return rc; 6220b536f69aSAlex Elder 6221c53d5893SAlex Elder err_out_rbd_dev: 6222811c6688SIlya Dryomov up_write(&rbd_dev->header_rwsem); 6223c53d5893SAlex Elder rbd_dev_destroy(rbd_dev); 6224bd4ba655SAlex Elder err_out_client: 62259d3997fdSAlex Elder rbd_put_client(rbdc); 62260ddebc0cSAlex Elder err_out_args: 6227859c31dfSAlex Elder rbd_spec_put(spec); 6228d147543dSIlya Dryomov kfree(rbd_opts); 6229dd5ac32dSIlya Dryomov goto out; 6230602adf40SYehuda Sadeh } 6231602adf40SYehuda Sadeh 62329b60e70bSIlya Dryomov static ssize_t rbd_add(struct bus_type *bus, 62339b60e70bSIlya Dryomov const char *buf, 62349b60e70bSIlya Dryomov size_t count) 62359b60e70bSIlya Dryomov { 62369b60e70bSIlya Dryomov if (single_major) 62379b60e70bSIlya Dryomov return -EINVAL; 62389b60e70bSIlya Dryomov 62399b60e70bSIlya Dryomov return do_rbd_add(bus, buf, count); 62409b60e70bSIlya Dryomov } 62419b60e70bSIlya Dryomov 62429b60e70bSIlya Dryomov static ssize_t rbd_add_single_major(struct bus_type *bus, 62439b60e70bSIlya Dryomov const char *buf, 62449b60e70bSIlya Dryomov size_t count) 62459b60e70bSIlya Dryomov { 62469b60e70bSIlya Dryomov return do_rbd_add(bus, buf, count); 62479b60e70bSIlya Dryomov } 62489b60e70bSIlya Dryomov 6249dd5ac32dSIlya Dryomov static void rbd_dev_device_release(struct rbd_device *rbd_dev) 6250602adf40SYehuda Sadeh { 6251602adf40SYehuda Sadeh rbd_free_disk(rbd_dev); 62521643dfa4SIlya Dryomov 62531643dfa4SIlya Dryomov 
spin_lock(&rbd_dev_list_lock); 62541643dfa4SIlya Dryomov list_del_init(&rbd_dev->node); 62551643dfa4SIlya Dryomov spin_unlock(&rbd_dev_list_lock); 62561643dfa4SIlya Dryomov 6257200a6a8bSAlex Elder clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags); 6258dd5ac32dSIlya Dryomov device_del(&rbd_dev->dev); 62596d80b130SAlex Elder rbd_dev_mapping_clear(rbd_dev); 62609b60e70bSIlya Dryomov if (!single_major) 6261602adf40SYehuda Sadeh unregister_blkdev(rbd_dev->major, rbd_dev->name); 6262602adf40SYehuda Sadeh } 6263602adf40SYehuda Sadeh 626405a46afdSAlex Elder static void rbd_dev_remove_parent(struct rbd_device *rbd_dev) 626505a46afdSAlex Elder { 6266ad945fc1SAlex Elder while (rbd_dev->parent) { 626705a46afdSAlex Elder struct rbd_device *first = rbd_dev; 626805a46afdSAlex Elder struct rbd_device *second = first->parent; 626905a46afdSAlex Elder struct rbd_device *third; 627005a46afdSAlex Elder 627105a46afdSAlex Elder /* 627205a46afdSAlex Elder * Follow to the parent with no grandparent and 627305a46afdSAlex Elder * remove it. 
627405a46afdSAlex Elder */ 627505a46afdSAlex Elder while (second && (third = second->parent)) { 627605a46afdSAlex Elder first = second; 627705a46afdSAlex Elder second = third; 627805a46afdSAlex Elder } 6279ad945fc1SAlex Elder rbd_assert(second); 62808ad42cd0SAlex Elder rbd_dev_image_release(second); 6281ad945fc1SAlex Elder first->parent = NULL; 6282ad945fc1SAlex Elder first->parent_overlap = 0; 6283ad945fc1SAlex Elder 6284ad945fc1SAlex Elder rbd_assert(first->parent_spec); 628505a46afdSAlex Elder rbd_spec_put(first->parent_spec); 628605a46afdSAlex Elder first->parent_spec = NULL; 628705a46afdSAlex Elder } 628805a46afdSAlex Elder } 628905a46afdSAlex Elder 62909b60e70bSIlya Dryomov static ssize_t do_rbd_remove(struct bus_type *bus, 6291602adf40SYehuda Sadeh const char *buf, 6292602adf40SYehuda Sadeh size_t count) 6293602adf40SYehuda Sadeh { 6294602adf40SYehuda Sadeh struct rbd_device *rbd_dev = NULL; 6295751cc0e3SAlex Elder struct list_head *tmp; 6296751cc0e3SAlex Elder int dev_id; 6297602adf40SYehuda Sadeh unsigned long ul; 629882a442d2SAlex Elder bool already = false; 62990d8189e1SAlex Elder int ret; 6300602adf40SYehuda Sadeh 6301bb8e0e84SJingoo Han ret = kstrtoul(buf, 10, &ul); 63020d8189e1SAlex Elder if (ret) 63030d8189e1SAlex Elder return ret; 6304602adf40SYehuda Sadeh 6305602adf40SYehuda Sadeh /* convert to int; abort if we lost anything in the conversion */ 6306751cc0e3SAlex Elder dev_id = (int)ul; 6307751cc0e3SAlex Elder if (dev_id != ul) 6308602adf40SYehuda Sadeh return -EINVAL; 6309602adf40SYehuda Sadeh 6310602adf40SYehuda Sadeh ret = -ENOENT; 6311751cc0e3SAlex Elder spin_lock(&rbd_dev_list_lock); 6312751cc0e3SAlex Elder list_for_each(tmp, &rbd_dev_list) { 6313751cc0e3SAlex Elder rbd_dev = list_entry(tmp, struct rbd_device, node); 6314751cc0e3SAlex Elder if (rbd_dev->dev_id == dev_id) { 6315751cc0e3SAlex Elder ret = 0; 6316751cc0e3SAlex Elder break; 6317602adf40SYehuda Sadeh } 6318751cc0e3SAlex Elder } 6319751cc0e3SAlex Elder if (!ret) { 6320a14ea269SAlex 
Elder spin_lock_irq(&rbd_dev->lock); 6321b82d167bSAlex Elder if (rbd_dev->open_count) 632242382b70SAlex Elder ret = -EBUSY; 6323b82d167bSAlex Elder else 632482a442d2SAlex Elder already = test_and_set_bit(RBD_DEV_FLAG_REMOVING, 632582a442d2SAlex Elder &rbd_dev->flags); 6326a14ea269SAlex Elder spin_unlock_irq(&rbd_dev->lock); 6327751cc0e3SAlex Elder } 6328751cc0e3SAlex Elder spin_unlock(&rbd_dev_list_lock); 632982a442d2SAlex Elder if (ret < 0 || already) 63301ba0f1e7SAlex Elder return ret; 6331751cc0e3SAlex Elder 6332ed95b21aSIlya Dryomov down_write(&rbd_dev->lock_rwsem); 6333ed95b21aSIlya Dryomov if (__rbd_is_lock_owner(rbd_dev)) 6334ed95b21aSIlya Dryomov rbd_unlock(rbd_dev); 6335ed95b21aSIlya Dryomov up_write(&rbd_dev->lock_rwsem); 633699d16943SIlya Dryomov rbd_unregister_watch(rbd_dev); 6337fca27065SIlya Dryomov 63389875201eSJosh Durgin /* 63399875201eSJosh Durgin * Don't free anything from rbd_dev->disk until after all 63409875201eSJosh Durgin * notifies are completely processed. Otherwise 63419875201eSJosh Durgin * rbd_bus_del_dev() will race with rbd_watch_cb(), resulting 63429875201eSJosh Durgin * in a potential use after free of rbd_dev->disk or rbd_dev. 
63439875201eSJosh Durgin */ 6344dd5ac32dSIlya Dryomov rbd_dev_device_release(rbd_dev); 63458ad42cd0SAlex Elder rbd_dev_image_release(rbd_dev); 6346aafb230eSAlex Elder 63471ba0f1e7SAlex Elder return count; 6348602adf40SYehuda Sadeh } 6349602adf40SYehuda Sadeh 63509b60e70bSIlya Dryomov static ssize_t rbd_remove(struct bus_type *bus, 63519b60e70bSIlya Dryomov const char *buf, 63529b60e70bSIlya Dryomov size_t count) 63539b60e70bSIlya Dryomov { 63549b60e70bSIlya Dryomov if (single_major) 63559b60e70bSIlya Dryomov return -EINVAL; 63569b60e70bSIlya Dryomov 63579b60e70bSIlya Dryomov return do_rbd_remove(bus, buf, count); 63589b60e70bSIlya Dryomov } 63599b60e70bSIlya Dryomov 63609b60e70bSIlya Dryomov static ssize_t rbd_remove_single_major(struct bus_type *bus, 63619b60e70bSIlya Dryomov const char *buf, 63629b60e70bSIlya Dryomov size_t count) 63639b60e70bSIlya Dryomov { 63649b60e70bSIlya Dryomov return do_rbd_remove(bus, buf, count); 63659b60e70bSIlya Dryomov } 63669b60e70bSIlya Dryomov 6367602adf40SYehuda Sadeh /* 6368602adf40SYehuda Sadeh * create control files in sysfs 6369dfc5606dSYehuda Sadeh * /sys/bus/rbd/... 
6370602adf40SYehuda Sadeh */ 6371602adf40SYehuda Sadeh static int rbd_sysfs_init(void) 6372602adf40SYehuda Sadeh { 6373dfc5606dSYehuda Sadeh int ret; 6374602adf40SYehuda Sadeh 6375fed4c143SAlex Elder ret = device_register(&rbd_root_dev); 6376dfc5606dSYehuda Sadeh if (ret < 0) 6377dfc5606dSYehuda Sadeh return ret; 6378602adf40SYehuda Sadeh 6379fed4c143SAlex Elder ret = bus_register(&rbd_bus_type); 6380fed4c143SAlex Elder if (ret < 0) 6381fed4c143SAlex Elder device_unregister(&rbd_root_dev); 6382602adf40SYehuda Sadeh 6383602adf40SYehuda Sadeh return ret; 6384602adf40SYehuda Sadeh } 6385602adf40SYehuda Sadeh 6386602adf40SYehuda Sadeh static void rbd_sysfs_cleanup(void) 6387602adf40SYehuda Sadeh { 6388dfc5606dSYehuda Sadeh bus_unregister(&rbd_bus_type); 6389fed4c143SAlex Elder device_unregister(&rbd_root_dev); 6390602adf40SYehuda Sadeh } 6391602adf40SYehuda Sadeh 63921c2a9dfeSAlex Elder static int rbd_slab_init(void) 63931c2a9dfeSAlex Elder { 63941c2a9dfeSAlex Elder rbd_assert(!rbd_img_request_cache); 639503d94406SGeliang Tang rbd_img_request_cache = KMEM_CACHE(rbd_img_request, 0); 6396868311b1SAlex Elder if (!rbd_img_request_cache) 6397868311b1SAlex Elder return -ENOMEM; 6398868311b1SAlex Elder 6399868311b1SAlex Elder rbd_assert(!rbd_obj_request_cache); 640003d94406SGeliang Tang rbd_obj_request_cache = KMEM_CACHE(rbd_obj_request, 0); 640178c2a44aSAlex Elder if (!rbd_obj_request_cache) 640278c2a44aSAlex Elder goto out_err; 640378c2a44aSAlex Elder 640478c2a44aSAlex Elder rbd_assert(!rbd_segment_name_cache); 640578c2a44aSAlex Elder rbd_segment_name_cache = kmem_cache_create("rbd_segment_name", 64062d0ebc5dSIlya Dryomov CEPH_MAX_OID_NAME_LEN + 1, 1, 0, NULL); 640778c2a44aSAlex Elder if (rbd_segment_name_cache) 64081c2a9dfeSAlex Elder return 0; 640978c2a44aSAlex Elder out_err: 641078c2a44aSAlex Elder kmem_cache_destroy(rbd_obj_request_cache); 641178c2a44aSAlex Elder rbd_obj_request_cache = NULL; 64121c2a9dfeSAlex Elder 6413868311b1SAlex Elder 
kmem_cache_destroy(rbd_img_request_cache); 6414868311b1SAlex Elder rbd_img_request_cache = NULL; 6415868311b1SAlex Elder 64161c2a9dfeSAlex Elder return -ENOMEM; 64171c2a9dfeSAlex Elder } 64181c2a9dfeSAlex Elder 64191c2a9dfeSAlex Elder static void rbd_slab_exit(void) 64201c2a9dfeSAlex Elder { 642178c2a44aSAlex Elder rbd_assert(rbd_segment_name_cache); 642278c2a44aSAlex Elder kmem_cache_destroy(rbd_segment_name_cache); 642378c2a44aSAlex Elder rbd_segment_name_cache = NULL; 642478c2a44aSAlex Elder 6425868311b1SAlex Elder rbd_assert(rbd_obj_request_cache); 6426868311b1SAlex Elder kmem_cache_destroy(rbd_obj_request_cache); 6427868311b1SAlex Elder rbd_obj_request_cache = NULL; 6428868311b1SAlex Elder 64291c2a9dfeSAlex Elder rbd_assert(rbd_img_request_cache); 64301c2a9dfeSAlex Elder kmem_cache_destroy(rbd_img_request_cache); 64311c2a9dfeSAlex Elder rbd_img_request_cache = NULL; 64321c2a9dfeSAlex Elder } 64331c2a9dfeSAlex Elder 6434cc344fa1SAlex Elder static int __init rbd_init(void) 6435602adf40SYehuda Sadeh { 6436602adf40SYehuda Sadeh int rc; 6437602adf40SYehuda Sadeh 64381e32d34cSAlex Elder if (!libceph_compatible(NULL)) { 64391e32d34cSAlex Elder rbd_warn(NULL, "libceph incompatibility (quitting)"); 64401e32d34cSAlex Elder return -EINVAL; 64411e32d34cSAlex Elder } 6442e1b4d96dSIlya Dryomov 64431c2a9dfeSAlex Elder rc = rbd_slab_init(); 6444602adf40SYehuda Sadeh if (rc) 6445602adf40SYehuda Sadeh return rc; 6446e1b4d96dSIlya Dryomov 6447f5ee37bdSIlya Dryomov /* 6448f5ee37bdSIlya Dryomov * The number of active work items is limited by the number of 6449f77303bdSIlya Dryomov * rbd devices * queue depth, so leave @max_active at default. 
6450f5ee37bdSIlya Dryomov */ 6451f5ee37bdSIlya Dryomov rbd_wq = alloc_workqueue(RBD_DRV_NAME, WQ_MEM_RECLAIM, 0); 6452f5ee37bdSIlya Dryomov if (!rbd_wq) { 6453f5ee37bdSIlya Dryomov rc = -ENOMEM; 6454f5ee37bdSIlya Dryomov goto err_out_slab; 6455f5ee37bdSIlya Dryomov } 6456f5ee37bdSIlya Dryomov 64579b60e70bSIlya Dryomov if (single_major) { 64589b60e70bSIlya Dryomov rbd_major = register_blkdev(0, RBD_DRV_NAME); 64599b60e70bSIlya Dryomov if (rbd_major < 0) { 64609b60e70bSIlya Dryomov rc = rbd_major; 6461f5ee37bdSIlya Dryomov goto err_out_wq; 64629b60e70bSIlya Dryomov } 64639b60e70bSIlya Dryomov } 64649b60e70bSIlya Dryomov 64651c2a9dfeSAlex Elder rc = rbd_sysfs_init(); 64661c2a9dfeSAlex Elder if (rc) 64679b60e70bSIlya Dryomov goto err_out_blkdev; 64681c2a9dfeSAlex Elder 64699b60e70bSIlya Dryomov if (single_major) 64709b60e70bSIlya Dryomov pr_info("loaded (major %d)\n", rbd_major); 64719b60e70bSIlya Dryomov else 6472e1b4d96dSIlya Dryomov pr_info("loaded\n"); 64739b60e70bSIlya Dryomov 6474e1b4d96dSIlya Dryomov return 0; 6475e1b4d96dSIlya Dryomov 64769b60e70bSIlya Dryomov err_out_blkdev: 64779b60e70bSIlya Dryomov if (single_major) 64789b60e70bSIlya Dryomov unregister_blkdev(rbd_major, RBD_DRV_NAME); 6479f5ee37bdSIlya Dryomov err_out_wq: 6480f5ee37bdSIlya Dryomov destroy_workqueue(rbd_wq); 6481e1b4d96dSIlya Dryomov err_out_slab: 6482e1b4d96dSIlya Dryomov rbd_slab_exit(); 64831c2a9dfeSAlex Elder return rc; 6484602adf40SYehuda Sadeh } 6485602adf40SYehuda Sadeh 6486cc344fa1SAlex Elder static void __exit rbd_exit(void) 6487602adf40SYehuda Sadeh { 6488ffe312cfSIlya Dryomov ida_destroy(&rbd_dev_id_ida); 6489602adf40SYehuda Sadeh rbd_sysfs_cleanup(); 64909b60e70bSIlya Dryomov if (single_major) 64919b60e70bSIlya Dryomov unregister_blkdev(rbd_major, RBD_DRV_NAME); 6492f5ee37bdSIlya Dryomov destroy_workqueue(rbd_wq); 64931c2a9dfeSAlex Elder rbd_slab_exit(); 6494602adf40SYehuda Sadeh } 6495602adf40SYehuda Sadeh 6496602adf40SYehuda Sadeh module_init(rbd_init); 6497602adf40SYehuda Sadeh 
module_exit(rbd_exit);

/* Module metadata: authors, description and license. */
MODULE_AUTHOR("Alex Elder <elder@inktank.com>");
MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
/* following authorship retained from original osdblk.c */
MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");

MODULE_DESCRIPTION("RADOS Block Device (RBD) driver");
MODULE_LICENSE("GPL");