1e2a58ee5SAlex Elder 2602adf40SYehuda Sadeh /* 3602adf40SYehuda Sadeh rbd.c -- Export ceph rados objects as a Linux block device 4602adf40SYehuda Sadeh 5602adf40SYehuda Sadeh 6602adf40SYehuda Sadeh based on drivers/block/osdblk.c: 7602adf40SYehuda Sadeh 8602adf40SYehuda Sadeh Copyright 2009 Red Hat, Inc. 9602adf40SYehuda Sadeh 10602adf40SYehuda Sadeh This program is free software; you can redistribute it and/or modify 11602adf40SYehuda Sadeh it under the terms of the GNU General Public License as published by 12602adf40SYehuda Sadeh the Free Software Foundation. 13602adf40SYehuda Sadeh 14602adf40SYehuda Sadeh This program is distributed in the hope that it will be useful, 15602adf40SYehuda Sadeh but WITHOUT ANY WARRANTY; without even the implied warranty of 16602adf40SYehuda Sadeh MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17602adf40SYehuda Sadeh GNU General Public License for more details. 18602adf40SYehuda Sadeh 19602adf40SYehuda Sadeh You should have received a copy of the GNU General Public License 20602adf40SYehuda Sadeh along with this program; see the file COPYING. If not, write to 21602adf40SYehuda Sadeh the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. 22602adf40SYehuda Sadeh 23602adf40SYehuda Sadeh 24602adf40SYehuda Sadeh 25dfc5606dSYehuda Sadeh For usage instructions, please refer to: 26602adf40SYehuda Sadeh 27dfc5606dSYehuda Sadeh Documentation/ABI/testing/sysfs-bus-rbd 28602adf40SYehuda Sadeh 29602adf40SYehuda Sadeh */ 30602adf40SYehuda Sadeh 31602adf40SYehuda Sadeh #include <linux/ceph/libceph.h> 32602adf40SYehuda Sadeh #include <linux/ceph/osd_client.h> 33602adf40SYehuda Sadeh #include <linux/ceph/mon_client.h> 34602adf40SYehuda Sadeh #include <linux/ceph/decode.h> 3559c2be1eSYehuda Sadeh #include <linux/parser.h> 3630d1cff8SAlex Elder #include <linux/bsearch.h> 37602adf40SYehuda Sadeh 38602adf40SYehuda Sadeh #include <linux/kernel.h> 39602adf40SYehuda Sadeh #include <linux/device.h> 40602adf40SYehuda Sadeh #include <linux/module.h> 417ad18afaSChristoph Hellwig #include <linux/blk-mq.h> 42602adf40SYehuda Sadeh #include <linux/fs.h> 43602adf40SYehuda Sadeh #include <linux/blkdev.h> 441c2a9dfeSAlex Elder #include <linux/slab.h> 45f8a22fc2SIlya Dryomov #include <linux/idr.h> 46bc1ecc65SIlya Dryomov #include <linux/workqueue.h> 47602adf40SYehuda Sadeh 48602adf40SYehuda Sadeh #include "rbd_types.h" 49602adf40SYehuda Sadeh 50aafb230eSAlex Elder #define RBD_DEBUG /* Activate rbd_assert() calls */ 51aafb230eSAlex Elder 52593a9e7bSAlex Elder /* 53593a9e7bSAlex Elder * The basic unit of block I/O is a sector. It is interpreted in a 54593a9e7bSAlex Elder * number of contexts in Linux (blk, bio, genhd), but the default is 55593a9e7bSAlex Elder * universally 512 bytes. These symbols are just slightly more 56593a9e7bSAlex Elder * meaningful than the bare numbers they represent. 57593a9e7bSAlex Elder */ 58593a9e7bSAlex Elder #define SECTOR_SHIFT 9 59593a9e7bSAlex Elder #define SECTOR_SIZE (1ULL << SECTOR_SHIFT) 60593a9e7bSAlex Elder 61a2acd00eSAlex Elder /* 62a2acd00eSAlex Elder * Increment the given counter and return its updated value. 63a2acd00eSAlex Elder * If the counter is already 0 it will not be incremented. 64a2acd00eSAlex Elder * If the counter is already at its maximum value returns 65a2acd00eSAlex Elder * -EINVAL without updating it. 66a2acd00eSAlex Elder */ 67a2acd00eSAlex Elder static int atomic_inc_return_safe(atomic_t *v) 68a2acd00eSAlex Elder { 69a2acd00eSAlex Elder unsigned int counter; 70a2acd00eSAlex Elder 71a2acd00eSAlex Elder counter = (unsigned int)__atomic_add_unless(v, 1, 0); 72a2acd00eSAlex Elder if (counter <= (unsigned int)INT_MAX) 73a2acd00eSAlex Elder return (int)counter; 74a2acd00eSAlex Elder 75a2acd00eSAlex Elder atomic_dec(v); 76a2acd00eSAlex Elder 77a2acd00eSAlex Elder return -EINVAL; 78a2acd00eSAlex Elder } 79a2acd00eSAlex Elder 80a2acd00eSAlex Elder /* Decrement the counter. Return the resulting value, or -EINVAL */ 81a2acd00eSAlex Elder static int atomic_dec_return_safe(atomic_t *v) 82a2acd00eSAlex Elder { 83a2acd00eSAlex Elder int counter; 84a2acd00eSAlex Elder 85a2acd00eSAlex Elder counter = atomic_dec_return(v); 86a2acd00eSAlex Elder if (counter >= 0) 87a2acd00eSAlex Elder return counter; 88a2acd00eSAlex Elder 89a2acd00eSAlex Elder atomic_inc(v); 90a2acd00eSAlex Elder 91a2acd00eSAlex Elder return -EINVAL; 92a2acd00eSAlex Elder } 93a2acd00eSAlex Elder 94f0f8cef5SAlex Elder #define RBD_DRV_NAME "rbd" 95602adf40SYehuda Sadeh 967e513d43SIlya Dryomov #define RBD_MINORS_PER_MAJOR 256 977e513d43SIlya Dryomov #define RBD_SINGLE_MAJOR_PART_SHIFT 4 98602adf40SYehuda Sadeh 99d4b125e9SAlex Elder #define RBD_SNAP_DEV_NAME_PREFIX "snap_" 100d4b125e9SAlex Elder #define RBD_MAX_SNAP_NAME_LEN \ 101d4b125e9SAlex Elder (NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1)) 102d4b125e9SAlex Elder 10335d489f9SAlex Elder #define RBD_MAX_SNAP_COUNT 510 /* allows max snapc to fit in 4KB */ 104602adf40SYehuda Sadeh 105602adf40SYehuda Sadeh #define RBD_SNAP_HEAD_NAME "-" 106602adf40SYehuda Sadeh 1079682fc6dSAlex Elder #define BAD_SNAP_INDEX U32_MAX /* invalid index into snap array */ 1089682fc6dSAlex Elder 1099e15b77dSAlex Elder /* This allows a single page to hold an image name sent by OSD */ 1109e15b77dSAlex Elder #define RBD_IMAGE_NAME_LEN_MAX (PAGE_SIZE - sizeof (__le32) - 1) 111589d30e0SAlex Elder #define RBD_IMAGE_ID_LEN_MAX 64 1129e15b77dSAlex Elder 1131e130199SAlex Elder #define RBD_OBJ_PREFIX_LEN_MAX 64 114589d30e0SAlex Elder 115d889140cSAlex Elder /* Feature bits */ 116d889140cSAlex Elder 1175cbf6f12SAlex Elder #define RBD_FEATURE_LAYERING (1<<0) 1185cbf6f12SAlex Elder #define RBD_FEATURE_STRIPINGV2 (1<<1) 1195cbf6f12SAlex Elder #define RBD_FEATURES_ALL \ 1205cbf6f12SAlex Elder (RBD_FEATURE_LAYERING | RBD_FEATURE_STRIPINGV2) 121d889140cSAlex Elder 122d889140cSAlex Elder /* Features supported by this (client software) implementation. */ 123d889140cSAlex Elder 124770eba6eSAlex Elder #define RBD_FEATURES_SUPPORTED (RBD_FEATURES_ALL) 125d889140cSAlex Elder 12681a89793SAlex Elder /* 12781a89793SAlex Elder * An RBD device name will be "rbd#", where the "rbd" comes from 12881a89793SAlex Elder * RBD_DRV_NAME above, and # is a unique integer identifier. 12981a89793SAlex Elder * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big 13081a89793SAlex Elder * enough to hold all possible device names. 13181a89793SAlex Elder */ 132602adf40SYehuda Sadeh #define DEV_NAME_LEN 32 13381a89793SAlex Elder #define MAX_INT_FORMAT_WIDTH ((5 * sizeof (int)) / 2 + 1) 134602adf40SYehuda Sadeh 135602adf40SYehuda Sadeh /* 136602adf40SYehuda Sadeh * block device image metadata (in-memory version) 137602adf40SYehuda Sadeh */ 138602adf40SYehuda Sadeh struct rbd_image_header { 139f35a4deeSAlex Elder /* These six fields never change for a given rbd image */ 140849b4260SAlex Elder char *object_prefix; 141602adf40SYehuda Sadeh __u8 obj_order; 142602adf40SYehuda Sadeh __u8 crypt_type; 143602adf40SYehuda Sadeh __u8 comp_type; 144f35a4deeSAlex Elder u64 stripe_unit; 145f35a4deeSAlex Elder u64 stripe_count; 146f35a4deeSAlex Elder u64 features; /* Might be changeable someday? */ 147602adf40SYehuda Sadeh 148f84344f3SAlex Elder /* The remaining fields need to be updated occasionally */ 149f84344f3SAlex Elder u64 image_size; 150f84344f3SAlex Elder struct ceph_snap_context *snapc; 151f35a4deeSAlex Elder char *snap_names; /* format 1 only */ 152f35a4deeSAlex Elder u64 *snap_sizes; /* format 1 only */ 15359c2be1eSYehuda Sadeh }; 15459c2be1eSYehuda Sadeh 1550d7dbfceSAlex Elder /* 1560d7dbfceSAlex Elder * An rbd image specification. 1570d7dbfceSAlex Elder * 1580d7dbfceSAlex Elder * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely 159c66c6e0cSAlex Elder * identify an image. Each rbd_dev structure includes a pointer to 160c66c6e0cSAlex Elder * an rbd_spec structure that encapsulates this identity. 161c66c6e0cSAlex Elder * 162c66c6e0cSAlex Elder * Each of the id's in an rbd_spec has an associated name. For a 163c66c6e0cSAlex Elder * user-mapped image, the names are supplied and the id's associated 164c66c6e0cSAlex Elder * with them are looked up. For a layered image, a parent image is 165c66c6e0cSAlex Elder * defined by the tuple, and the names are looked up. 166c66c6e0cSAlex Elder * 167c66c6e0cSAlex Elder * An rbd_dev structure contains a parent_spec pointer which is 168c66c6e0cSAlex Elder * non-null if the image it represents is a child in a layered 169c66c6e0cSAlex Elder * image. This pointer will refer to the rbd_spec structure used 170c66c6e0cSAlex Elder * by the parent rbd_dev for its own identity (i.e., the structure 171c66c6e0cSAlex Elder * is shared between the parent and child). 172c66c6e0cSAlex Elder * 173c66c6e0cSAlex Elder * Since these structures are populated once, during the discovery 174c66c6e0cSAlex Elder * phase of image construction, they are effectively immutable so 175c66c6e0cSAlex Elder * we make no effort to synchronize access to them. 176c66c6e0cSAlex Elder * 177c66c6e0cSAlex Elder * Note that code herein does not assume the image name is known (it 178c66c6e0cSAlex Elder * could be a null pointer). 1790d7dbfceSAlex Elder */ 1800d7dbfceSAlex Elder struct rbd_spec { 1810d7dbfceSAlex Elder u64 pool_id; 182ecb4dc22SAlex Elder const char *pool_name; 1830d7dbfceSAlex Elder 184ecb4dc22SAlex Elder const char *image_id; 185ecb4dc22SAlex Elder const char *image_name; 1860d7dbfceSAlex Elder 1870d7dbfceSAlex Elder u64 snap_id; 188ecb4dc22SAlex Elder const char *snap_name; 1890d7dbfceSAlex Elder 1900d7dbfceSAlex Elder struct kref kref; 1910d7dbfceSAlex Elder }; 1920d7dbfceSAlex Elder 193602adf40SYehuda Sadeh /* 194f0f8cef5SAlex Elder * an instance of the client. multiple devices may share an rbd client. 195602adf40SYehuda Sadeh */ 196602adf40SYehuda Sadeh struct rbd_client { 197602adf40SYehuda Sadeh struct ceph_client *client; 198602adf40SYehuda Sadeh struct kref kref; 199602adf40SYehuda Sadeh struct list_head node; 200602adf40SYehuda Sadeh }; 201602adf40SYehuda Sadeh 202bf0d5f50SAlex Elder struct rbd_img_request; 203bf0d5f50SAlex Elder typedef void (*rbd_img_callback_t)(struct rbd_img_request *); 204bf0d5f50SAlex Elder 205bf0d5f50SAlex Elder #define BAD_WHICH U32_MAX /* Good which or bad which, which? */ 206bf0d5f50SAlex Elder 207bf0d5f50SAlex Elder struct rbd_obj_request; 208bf0d5f50SAlex Elder typedef void (*rbd_obj_callback_t)(struct rbd_obj_request *); 209bf0d5f50SAlex Elder 2109969ebc5SAlex Elder enum obj_request_type { 2119969ebc5SAlex Elder OBJ_REQUEST_NODATA, OBJ_REQUEST_BIO, OBJ_REQUEST_PAGES 2129969ebc5SAlex Elder }; 213bf0d5f50SAlex Elder 2146d2940c8SGuangliang Zhao enum obj_operation_type { 2156d2940c8SGuangliang Zhao OBJ_OP_WRITE, 2166d2940c8SGuangliang Zhao OBJ_OP_READ, 21790e98c52SGuangliang Zhao OBJ_OP_DISCARD, 2186d2940c8SGuangliang Zhao }; 2196d2940c8SGuangliang Zhao 220926f9b3fSAlex Elder enum obj_req_flags { 221926f9b3fSAlex Elder OBJ_REQ_DONE, /* completion flag: not done = 0, done = 1 */ 2226365d33aSAlex Elder OBJ_REQ_IMG_DATA, /* object usage: standalone = 0, image = 1 */ 2235679c59fSAlex Elder OBJ_REQ_KNOWN, /* EXISTS flag valid: no = 0, yes = 1 */ 2245679c59fSAlex Elder OBJ_REQ_EXISTS, /* target exists: no = 0, yes = 1 */ 225926f9b3fSAlex Elder }; 226926f9b3fSAlex Elder 227bf0d5f50SAlex Elder struct rbd_obj_request { 228bf0d5f50SAlex Elder const char *object_name; 229bf0d5f50SAlex Elder u64 offset; /* object start byte */ 230bf0d5f50SAlex Elder u64 length; /* bytes from offset */ 231926f9b3fSAlex Elder unsigned long flags; 232bf0d5f50SAlex Elder 233c5b5ef6cSAlex Elder /* 234c5b5ef6cSAlex Elder * An object request associated with an image will have its 235c5b5ef6cSAlex Elder * img_data flag set; a standalone object request will not. 236c5b5ef6cSAlex Elder * 237c5b5ef6cSAlex Elder * A standalone object request will have which == BAD_WHICH 238c5b5ef6cSAlex Elder * and a null obj_request pointer. 239c5b5ef6cSAlex Elder * 240c5b5ef6cSAlex Elder * An object request initiated in support of a layered image 241c5b5ef6cSAlex Elder * object (to check for its existence before a write) will 242c5b5ef6cSAlex Elder * have which == BAD_WHICH and a non-null obj_request pointer. 243c5b5ef6cSAlex Elder * 244c5b5ef6cSAlex Elder * Finally, an object request for rbd image data will have 245c5b5ef6cSAlex Elder * which != BAD_WHICH, and will have a non-null img_request 246c5b5ef6cSAlex Elder * pointer. The value of which will be in the range 247c5b5ef6cSAlex Elder * 0..(img_request->obj_request_count-1). 248c5b5ef6cSAlex Elder */ 249c5b5ef6cSAlex Elder union { 250c5b5ef6cSAlex Elder struct rbd_obj_request *obj_request; /* STAT op */ 251c5b5ef6cSAlex Elder struct { 252bf0d5f50SAlex Elder struct rbd_img_request *img_request; 253c5b5ef6cSAlex Elder u64 img_offset; 254c5b5ef6cSAlex Elder /* links for img_request->obj_requests list */ 255c5b5ef6cSAlex Elder struct list_head links; 256c5b5ef6cSAlex Elder }; 257c5b5ef6cSAlex Elder }; 258bf0d5f50SAlex Elder u32 which; /* posn image request list */ 259bf0d5f50SAlex Elder 260bf0d5f50SAlex Elder enum obj_request_type type; 261788e2df3SAlex Elder union { 262bf0d5f50SAlex Elder struct bio *bio_list; 263788e2df3SAlex Elder struct { 264788e2df3SAlex Elder struct page **pages; 265788e2df3SAlex Elder u32 page_count; 266788e2df3SAlex Elder }; 267788e2df3SAlex Elder }; 2680eefd470SAlex Elder struct page **copyup_pages; 269ebda6408SAlex Elder u32 copyup_page_count; 270bf0d5f50SAlex Elder 271bf0d5f50SAlex Elder struct ceph_osd_request *osd_req; 272bf0d5f50SAlex Elder 273bf0d5f50SAlex Elder u64 xferred; /* bytes transferred */ 2741b83bef2SSage Weil int result; 275bf0d5f50SAlex Elder 276bf0d5f50SAlex Elder rbd_obj_callback_t callback; 277788e2df3SAlex Elder struct completion completion; 278bf0d5f50SAlex Elder 279bf0d5f50SAlex Elder struct kref kref; 280bf0d5f50SAlex Elder }; 281bf0d5f50SAlex Elder 2820c425248SAlex Elder enum img_req_flags { 2839849e986SAlex Elder IMG_REQ_WRITE, /* I/O direction: read = 0, write = 1 */ 2849849e986SAlex Elder IMG_REQ_CHILD, /* initiator: block = 0, child image = 1 */ 285d0b2e944SAlex Elder IMG_REQ_LAYERED, /* ENOENT handling: normal = 0, layered = 1 */ 28690e98c52SGuangliang Zhao IMG_REQ_DISCARD, /* discard: normal = 0, discard request = 1 */ 2870c425248SAlex Elder }; 2880c425248SAlex Elder 289bf0d5f50SAlex Elder struct rbd_img_request { 290bf0d5f50SAlex Elder struct rbd_device *rbd_dev; 291bf0d5f50SAlex Elder u64 offset; /* starting image byte offset */ 292bf0d5f50SAlex Elder u64 length; /* byte count from offset */ 2930c425248SAlex Elder unsigned long flags; 294bf0d5f50SAlex Elder union { 295bf0d5f50SAlex Elder u64 snap_id; /* for reads */ 2969849e986SAlex Elder struct ceph_snap_context *snapc; /* for writes */ 2979849e986SAlex Elder }; 2989849e986SAlex Elder union { 2999849e986SAlex Elder struct request *rq; /* block request */ 3009849e986SAlex Elder struct rbd_obj_request *obj_request; /* obj req initiator */ 301bf0d5f50SAlex Elder }; 3023d7efd18SAlex Elder struct page **copyup_pages; 303ebda6408SAlex Elder u32 copyup_page_count; 304bf0d5f50SAlex Elder spinlock_t completion_lock;/* protects next_completion */ 305bf0d5f50SAlex Elder u32 next_completion; 306bf0d5f50SAlex Elder rbd_img_callback_t callback; 30755f27e09SAlex Elder u64 xferred;/* aggregate bytes transferred */ 308a5a337d4SAlex Elder int result; /* first nonzero obj_request result */ 309bf0d5f50SAlex Elder 310bf0d5f50SAlex Elder u32 obj_request_count; 311bf0d5f50SAlex Elder struct list_head obj_requests; /* rbd_obj_request structs */ 312bf0d5f50SAlex Elder 313bf0d5f50SAlex Elder struct kref kref; 314bf0d5f50SAlex Elder }; 315bf0d5f50SAlex Elder 316bf0d5f50SAlex Elder #define for_each_obj_request(ireq, oreq) \ 317ef06f4d3SAlex Elder list_for_each_entry(oreq, &(ireq)->obj_requests, links) 318bf0d5f50SAlex Elder #define for_each_obj_request_from(ireq, oreq) \ 319ef06f4d3SAlex Elder list_for_each_entry_from(oreq, &(ireq)->obj_requests, links) 320bf0d5f50SAlex Elder #define for_each_obj_request_safe(ireq, oreq, n) \ 321ef06f4d3SAlex Elder list_for_each_entry_safe_reverse(oreq, n, &(ireq)->obj_requests, links) 322bf0d5f50SAlex Elder 323f84344f3SAlex Elder struct rbd_mapping { 32499c1f08fSAlex Elder u64 size; 32534b13184SAlex Elder u64 features; 326f84344f3SAlex Elder bool read_only; 327f84344f3SAlex Elder }; 328f84344f3SAlex Elder 329602adf40SYehuda Sadeh /* 330602adf40SYehuda Sadeh * a single device 331602adf40SYehuda Sadeh */ 332602adf40SYehuda Sadeh struct rbd_device { 333de71a297SAlex Elder int dev_id; /* blkdev unique id */ 334602adf40SYehuda Sadeh 335602adf40SYehuda Sadeh int major; /* blkdev assigned major */ 336dd82fff1SIlya Dryomov int minor; 337602adf40SYehuda Sadeh struct gendisk *disk; /* blkdev's gendisk and rq */ 338602adf40SYehuda Sadeh 339a30b71b9SAlex Elder u32 image_format; /* Either 1 or 2 */ 340602adf40SYehuda Sadeh struct rbd_client *rbd_client; 341602adf40SYehuda Sadeh 342602adf40SYehuda Sadeh char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */ 343602adf40SYehuda Sadeh 344b82d167bSAlex Elder spinlock_t lock; /* queue, flags, open_count */ 345602adf40SYehuda Sadeh 346602adf40SYehuda Sadeh struct rbd_image_header header; 347b82d167bSAlex Elder unsigned long flags; /* possibly lock protected */ 3480d7dbfceSAlex Elder struct rbd_spec *spec; 349602adf40SYehuda Sadeh 3500d7dbfceSAlex Elder char *header_name; 351971f839aSAlex Elder 3520903e875SAlex Elder struct ceph_file_layout layout; 3530903e875SAlex Elder 35459c2be1eSYehuda Sadeh struct ceph_osd_event *watch_event; 355975241afSAlex Elder struct rbd_obj_request *watch_request; 35659c2be1eSYehuda Sadeh 35786b00e0dSAlex Elder struct rbd_spec *parent_spec; 35886b00e0dSAlex Elder u64 parent_overlap; 359a2acd00eSAlex Elder atomic_t parent_ref; 3602f82ee54SAlex Elder struct rbd_device *parent; 36186b00e0dSAlex Elder 3627ad18afaSChristoph Hellwig /* Block layer tags. */ 3637ad18afaSChristoph Hellwig struct blk_mq_tag_set tag_set; 3647ad18afaSChristoph Hellwig 365c666601aSJosh Durgin /* protects updating the header */ 366c666601aSJosh Durgin struct rw_semaphore header_rwsem; 367f84344f3SAlex Elder 368f84344f3SAlex Elder struct rbd_mapping mapping; 369602adf40SYehuda Sadeh 370602adf40SYehuda Sadeh struct list_head node; 371dfc5606dSYehuda Sadeh 372dfc5606dSYehuda Sadeh /* sysfs related */ 373dfc5606dSYehuda Sadeh struct device dev; 374b82d167bSAlex Elder unsigned long open_count; /* protected by lock */ 375dfc5606dSYehuda Sadeh }; 376dfc5606dSYehuda Sadeh 377b82d167bSAlex Elder /* 378b82d167bSAlex Elder * Flag bits for rbd_dev->flags. If atomicity is required, 379b82d167bSAlex Elder * rbd_dev->lock is used to protect access. 380b82d167bSAlex Elder * 381b82d167bSAlex Elder * Currently, only the "removing" flag (which is coupled with the 382b82d167bSAlex Elder * "open_count" field) requires atomic access. 383b82d167bSAlex Elder */ 3846d292906SAlex Elder enum rbd_dev_flags { 3856d292906SAlex Elder RBD_DEV_FLAG_EXISTS, /* mapped snapshot has not been deleted */ 386b82d167bSAlex Elder RBD_DEV_FLAG_REMOVING, /* this mapping is being removed */ 3876d292906SAlex Elder }; 3886d292906SAlex Elder 389cfbf6377SAlex Elder static DEFINE_MUTEX(client_mutex); /* Serialize client creation */ 390e124a82fSAlex Elder 391602adf40SYehuda Sadeh static LIST_HEAD(rbd_dev_list); /* devices */ 392e124a82fSAlex Elder static DEFINE_SPINLOCK(rbd_dev_list_lock); 393e124a82fSAlex Elder 394602adf40SYehuda Sadeh static LIST_HEAD(rbd_client_list); /* clients */ 395432b8587SAlex Elder static DEFINE_SPINLOCK(rbd_client_list_lock); 396602adf40SYehuda Sadeh 39778c2a44aSAlex Elder /* Slab caches for frequently-allocated structures */ 39878c2a44aSAlex Elder 3991c2a9dfeSAlex Elder static struct kmem_cache *rbd_img_request_cache; 400868311b1SAlex Elder static struct kmem_cache *rbd_obj_request_cache; 40178c2a44aSAlex Elder static struct kmem_cache *rbd_segment_name_cache; 4021c2a9dfeSAlex Elder 4039b60e70bSIlya Dryomov static int rbd_major; 404f8a22fc2SIlya Dryomov static DEFINE_IDA(rbd_dev_id_ida); 405f8a22fc2SIlya Dryomov 406f5ee37bdSIlya Dryomov static struct workqueue_struct *rbd_wq; 407f5ee37bdSIlya Dryomov 4089b60e70bSIlya Dryomov /* 4099b60e70bSIlya Dryomov * Default to false for now, as single-major requires >= 0.75 version of 4109b60e70bSIlya Dryomov * userspace rbd utility. 4119b60e70bSIlya Dryomov */ 4129b60e70bSIlya Dryomov static bool single_major = false; 4139b60e70bSIlya Dryomov module_param(single_major, bool, S_IRUGO); 4149b60e70bSIlya Dryomov MODULE_PARM_DESC(single_major, "Use a single major number for all rbd devices (default: false)"); 4159b60e70bSIlya Dryomov 4163d7efd18SAlex Elder static int rbd_img_request_submit(struct rbd_img_request *img_request); 4173d7efd18SAlex Elder 418200a6a8bSAlex Elder static void rbd_dev_device_release(struct device *dev); 419dfc5606dSYehuda Sadeh 420f0f8cef5SAlex Elder static ssize_t rbd_add(struct bus_type *bus, const char *buf, 421f0f8cef5SAlex Elder size_t count); 422f0f8cef5SAlex Elder static ssize_t rbd_remove(struct bus_type *bus, const char *buf, 423f0f8cef5SAlex Elder size_t count); 4249b60e70bSIlya Dryomov static ssize_t rbd_add_single_major(struct bus_type *bus, const char *buf, 4259b60e70bSIlya Dryomov size_t count); 4269b60e70bSIlya Dryomov static ssize_t rbd_remove_single_major(struct bus_type *bus, const char *buf, 4279b60e70bSIlya Dryomov size_t count); 4281f3ef788SAlex Elder static int rbd_dev_image_probe(struct rbd_device *rbd_dev, bool mapping); 429a2acd00eSAlex Elder static void rbd_spec_put(struct rbd_spec *spec); 430f0f8cef5SAlex Elder 4319b60e70bSIlya Dryomov static int rbd_dev_id_to_minor(int dev_id) 4329b60e70bSIlya Dryomov { 4337e513d43SIlya Dryomov return dev_id << RBD_SINGLE_MAJOR_PART_SHIFT; 4349b60e70bSIlya Dryomov } 4359b60e70bSIlya Dryomov 4369b60e70bSIlya Dryomov static int minor_to_rbd_dev_id(int minor) 4379b60e70bSIlya Dryomov { 4387e513d43SIlya Dryomov return minor >> RBD_SINGLE_MAJOR_PART_SHIFT; 4399b60e70bSIlya Dryomov } 4409b60e70bSIlya Dryomov 441b15a21ddSGreg Kroah-Hartman static BUS_ATTR(add, S_IWUSR, NULL, rbd_add); 442b15a21ddSGreg Kroah-Hartman static BUS_ATTR(remove, S_IWUSR, NULL, rbd_remove); 4439b60e70bSIlya Dryomov static BUS_ATTR(add_single_major, S_IWUSR, NULL, rbd_add_single_major); 4449b60e70bSIlya Dryomov static BUS_ATTR(remove_single_major, S_IWUSR, NULL, rbd_remove_single_major); 445b15a21ddSGreg Kroah-Hartman 446b15a21ddSGreg Kroah-Hartman static struct attribute *rbd_bus_attrs[] = { 447b15a21ddSGreg Kroah-Hartman &bus_attr_add.attr, 448b15a21ddSGreg Kroah-Hartman &bus_attr_remove.attr, 4499b60e70bSIlya Dryomov &bus_attr_add_single_major.attr, 4509b60e70bSIlya Dryomov &bus_attr_remove_single_major.attr, 451b15a21ddSGreg Kroah-Hartman NULL, 452f0f8cef5SAlex Elder }; 45392c76dc0SIlya Dryomov 45492c76dc0SIlya Dryomov static umode_t rbd_bus_is_visible(struct kobject *kobj, 45592c76dc0SIlya Dryomov struct attribute *attr, int index) 45692c76dc0SIlya Dryomov { 4579b60e70bSIlya Dryomov if (!single_major && 4589b60e70bSIlya Dryomov (attr == &bus_attr_add_single_major.attr || 4599b60e70bSIlya Dryomov attr == &bus_attr_remove_single_major.attr)) 4609b60e70bSIlya Dryomov return 0; 4619b60e70bSIlya Dryomov 46292c76dc0SIlya Dryomov return attr->mode; 46392c76dc0SIlya Dryomov } 46492c76dc0SIlya Dryomov 46592c76dc0SIlya Dryomov static const struct attribute_group rbd_bus_group = { 46692c76dc0SIlya Dryomov .attrs = rbd_bus_attrs, 46792c76dc0SIlya Dryomov .is_visible = rbd_bus_is_visible, 46892c76dc0SIlya Dryomov }; 46992c76dc0SIlya Dryomov __ATTRIBUTE_GROUPS(rbd_bus); 470f0f8cef5SAlex Elder 471f0f8cef5SAlex Elder static struct bus_type rbd_bus_type = { 472f0f8cef5SAlex Elder .name = "rbd", 473b15a21ddSGreg Kroah-Hartman .bus_groups = rbd_bus_groups, 474f0f8cef5SAlex Elder }; 475f0f8cef5SAlex Elder 476f0f8cef5SAlex Elder static void rbd_root_dev_release(struct device *dev) 477f0f8cef5SAlex Elder { 478f0f8cef5SAlex Elder } 479f0f8cef5SAlex Elder 480f0f8cef5SAlex Elder static struct device rbd_root_dev = { 481f0f8cef5SAlex Elder .init_name = "rbd", 482f0f8cef5SAlex Elder .release = rbd_root_dev_release, 483f0f8cef5SAlex Elder }; 484f0f8cef5SAlex Elder 48506ecc6cbSAlex Elder static __printf(2, 3) 48606ecc6cbSAlex Elder void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...) 48706ecc6cbSAlex Elder { 48806ecc6cbSAlex Elder struct va_format vaf; 48906ecc6cbSAlex Elder va_list args; 49006ecc6cbSAlex Elder 49106ecc6cbSAlex Elder va_start(args, fmt); 49206ecc6cbSAlex Elder vaf.fmt = fmt; 49306ecc6cbSAlex Elder vaf.va = &args; 49406ecc6cbSAlex Elder 49506ecc6cbSAlex Elder if (!rbd_dev) 49606ecc6cbSAlex Elder printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf); 49706ecc6cbSAlex Elder else if (rbd_dev->disk) 49806ecc6cbSAlex Elder printk(KERN_WARNING "%s: %s: %pV\n", 49906ecc6cbSAlex Elder RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf); 50006ecc6cbSAlex Elder else if (rbd_dev->spec && rbd_dev->spec->image_name) 50106ecc6cbSAlex Elder printk(KERN_WARNING "%s: image %s: %pV\n", 50206ecc6cbSAlex Elder RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf); 50306ecc6cbSAlex Elder else if (rbd_dev->spec && rbd_dev->spec->image_id) 50406ecc6cbSAlex Elder printk(KERN_WARNING "%s: id %s: %pV\n", 50506ecc6cbSAlex Elder RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf); 50606ecc6cbSAlex Elder else /* punt */ 50706ecc6cbSAlex Elder printk(KERN_WARNING "%s: rbd_dev %p: %pV\n", 50806ecc6cbSAlex Elder RBD_DRV_NAME, rbd_dev, &vaf); 50906ecc6cbSAlex Elder va_end(args); 51006ecc6cbSAlex Elder } 51106ecc6cbSAlex Elder 512aafb230eSAlex Elder #ifdef RBD_DEBUG 513aafb230eSAlex Elder #define rbd_assert(expr) \ 514aafb230eSAlex Elder if (unlikely(!(expr))) { \ 515aafb230eSAlex Elder printk(KERN_ERR "\nAssertion failure in %s() " \ 516aafb230eSAlex Elder "at line %d:\n\n" \ 517aafb230eSAlex Elder "\trbd_assert(%s);\n\n", \ 518aafb230eSAlex Elder __func__, __LINE__, #expr); \ 519aafb230eSAlex Elder BUG(); \ 520aafb230eSAlex Elder } 521aafb230eSAlex Elder #else /* !RBD_DEBUG */ 522aafb230eSAlex Elder # define rbd_assert(expr) ((void) 0) 523aafb230eSAlex Elder #endif /* !RBD_DEBUG */ 524dfc5606dSYehuda Sadeh 525b454e36dSAlex Elder static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request); 52605a46afdSAlex Elder static void rbd_img_parent_read(struct rbd_obj_request *obj_request); 52705a46afdSAlex Elder static void rbd_dev_remove_parent(struct rbd_device *rbd_dev); 5288b3e1a56SAlex Elder 529cc4a38bdSAlex Elder static int rbd_dev_refresh(struct rbd_device *rbd_dev); 5302df3fac7SAlex Elder static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev); 531a720ae09SIlya Dryomov static int rbd_dev_header_info(struct rbd_device *rbd_dev); 532e8f59b59SIlya Dryomov static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev); 53354cac61fSAlex Elder static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, 53454cac61fSAlex Elder u64 snap_id); 5352ad3d716SAlex Elder static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id, 5362ad3d716SAlex Elder u8 *order, u64 *snap_size); 5372ad3d716SAlex Elder static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id, 5382ad3d716SAlex Elder u64 *snap_features); 5392ad3d716SAlex Elder static u64 rbd_snap_id_by_name(struct rbd_device *rbd_dev, const char *name); 54059c2be1eSYehuda Sadeh 541602adf40SYehuda Sadeh static int rbd_open(struct block_device *bdev, fmode_t mode) 542602adf40SYehuda Sadeh { 543f0f8cef5SAlex Elder struct rbd_device *rbd_dev = bdev->bd_disk->private_data; 544b82d167bSAlex Elder bool removing = false; 545602adf40SYehuda Sadeh 546f84344f3SAlex Elder if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only) 547602adf40SYehuda Sadeh return -EROFS; 548602adf40SYehuda Sadeh 549a14ea269SAlex Elder spin_lock_irq(&rbd_dev->lock); 550b82d167bSAlex Elder if (test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags)) 551b82d167bSAlex Elder removing = true; 552b82d167bSAlex Elder else 553b82d167bSAlex Elder rbd_dev->open_count++; 554a14ea269SAlex Elder spin_unlock_irq(&rbd_dev->lock); 555b82d167bSAlex Elder if (removing) 556b82d167bSAlex Elder return -ENOENT; 557b82d167bSAlex Elder 558c3e946ceSAlex Elder (void) get_device(&rbd_dev->dev); 559340c7a2bSAlex Elder 560602adf40SYehuda Sadeh return 0; 561602adf40SYehuda Sadeh } 562602adf40SYehuda Sadeh 563db2a144bSAl Viro static void rbd_release(struct gendisk *disk, fmode_t mode) 564dfc5606dSYehuda Sadeh { 565dfc5606dSYehuda Sadeh struct rbd_device *rbd_dev = disk->private_data; 566b82d167bSAlex Elder unsigned long open_count_before; 567b82d167bSAlex Elder 568a14ea269SAlex Elder spin_lock_irq(&rbd_dev->lock); 569b82d167bSAlex Elder open_count_before = rbd_dev->open_count--; 570a14ea269SAlex Elder spin_unlock_irq(&rbd_dev->lock); 571b82d167bSAlex Elder rbd_assert(open_count_before > 0); 572dfc5606dSYehuda Sadeh 573c3e946ceSAlex Elder put_device(&rbd_dev->dev); 574dfc5606dSYehuda Sadeh } 575dfc5606dSYehuda Sadeh 576131fd9f6SGuangliang Zhao static int rbd_ioctl_set_ro(struct rbd_device *rbd_dev, unsigned long arg) 577131fd9f6SGuangliang Zhao { 57877f33c03SJosh Durgin int ret = 0; 579131fd9f6SGuangliang Zhao int val; 580131fd9f6SGuangliang Zhao bool ro; 58177f33c03SJosh Durgin bool ro_changed = false; 582131fd9f6SGuangliang Zhao 58377f33c03SJosh Durgin /* get_user() may sleep, so call it before taking rbd_dev->lock */ 584131fd9f6SGuangliang Zhao if (get_user(val, (int __user *)(arg))) 585131fd9f6SGuangliang Zhao return -EFAULT; 586131fd9f6SGuangliang Zhao 587131fd9f6SGuangliang Zhao ro = val ? true : false; 588131fd9f6SGuangliang Zhao /* Snapshot doesn't allow to write*/ 589131fd9f6SGuangliang Zhao if (rbd_dev->spec->snap_id != CEPH_NOSNAP && !ro) 590131fd9f6SGuangliang Zhao return -EROFS; 591131fd9f6SGuangliang Zhao 59277f33c03SJosh Durgin spin_lock_irq(&rbd_dev->lock); 59377f33c03SJosh Durgin /* prevent others open this device */ 59477f33c03SJosh Durgin if (rbd_dev->open_count > 1) { 59577f33c03SJosh Durgin ret = -EBUSY; 59677f33c03SJosh Durgin goto out; 597131fd9f6SGuangliang Zhao } 598131fd9f6SGuangliang Zhao 59977f33c03SJosh Durgin if (rbd_dev->mapping.read_only != ro) { 60077f33c03SJosh Durgin rbd_dev->mapping.read_only = ro; 60177f33c03SJosh Durgin ro_changed = true; 60277f33c03SJosh Durgin } 60377f33c03SJosh Durgin 60477f33c03SJosh Durgin out: 60577f33c03SJosh Durgin spin_unlock_irq(&rbd_dev->lock); 60677f33c03SJosh Durgin /* set_disk_ro() may sleep, so call it after releasing rbd_dev->lock */ 60777f33c03SJosh Durgin if (ret == 0 && ro_changed) 60877f33c03SJosh Durgin set_disk_ro(rbd_dev->disk, ro ? 1 : 0); 60977f33c03SJosh Durgin 61077f33c03SJosh Durgin return ret; 611131fd9f6SGuangliang Zhao } 612131fd9f6SGuangliang Zhao 613131fd9f6SGuangliang Zhao static int rbd_ioctl(struct block_device *bdev, fmode_t mode, 614131fd9f6SGuangliang Zhao unsigned int cmd, unsigned long arg) 615131fd9f6SGuangliang Zhao { 616131fd9f6SGuangliang Zhao struct rbd_device *rbd_dev = bdev->bd_disk->private_data; 617131fd9f6SGuangliang Zhao int ret = 0; 618131fd9f6SGuangliang Zhao 619131fd9f6SGuangliang Zhao switch (cmd) { 620131fd9f6SGuangliang Zhao case BLKROSET: 621131fd9f6SGuangliang Zhao ret = rbd_ioctl_set_ro(rbd_dev, arg); 622131fd9f6SGuangliang Zhao break; 623131fd9f6SGuangliang Zhao default: 624131fd9f6SGuangliang Zhao ret = -ENOTTY; 625131fd9f6SGuangliang Zhao } 626131fd9f6SGuangliang Zhao 627131fd9f6SGuangliang Zhao return ret; 628131fd9f6SGuangliang Zhao } 629131fd9f6SGuangliang Zhao 630131fd9f6SGuangliang Zhao #ifdef CONFIG_COMPAT 631131fd9f6SGuangliang Zhao static int rbd_compat_ioctl(struct block_device *bdev, fmode_t mode, 632131fd9f6SGuangliang Zhao unsigned int cmd, unsigned long arg) 633131fd9f6SGuangliang Zhao { 634131fd9f6SGuangliang Zhao return rbd_ioctl(bdev, mode, cmd, arg); 635131fd9f6SGuangliang Zhao } 636131fd9f6SGuangliang Zhao #endif /* CONFIG_COMPAT */ 637131fd9f6SGuangliang Zhao 638602adf40SYehuda Sadeh static const struct block_device_operations rbd_bd_ops = { 639602adf40SYehuda Sadeh .owner = THIS_MODULE, 640602adf40SYehuda Sadeh .open = rbd_open, 641dfc5606dSYehuda Sadeh .release = rbd_release, 642131fd9f6SGuangliang Zhao .ioctl = rbd_ioctl, 643131fd9f6SGuangliang Zhao #ifdef CONFIG_COMPAT 644131fd9f6SGuangliang Zhao .compat_ioctl = rbd_compat_ioctl, 645131fd9f6SGuangliang Zhao #endif 646602adf40SYehuda Sadeh }; 647602adf40SYehuda Sadeh 648602adf40SYehuda Sadeh /* 6497262cfcaSAlex Elder * Initialize an rbd client instance. Success or not, this function 650cfbf6377SAlex Elder * consumes ceph_opts. Caller holds client_mutex. 651602adf40SYehuda Sadeh */ 652f8c38929SAlex Elder static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts) 653602adf40SYehuda Sadeh { 654602adf40SYehuda Sadeh struct rbd_client *rbdc; 655602adf40SYehuda Sadeh int ret = -ENOMEM; 656602adf40SYehuda Sadeh 65737206ee5SAlex Elder dout("%s:\n", __func__); 658602adf40SYehuda Sadeh rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL); 659602adf40SYehuda Sadeh if (!rbdc) 660602adf40SYehuda Sadeh goto out_opt; 661602adf40SYehuda Sadeh 662602adf40SYehuda Sadeh kref_init(&rbdc->kref); 663602adf40SYehuda Sadeh INIT_LIST_HEAD(&rbdc->node); 664602adf40SYehuda Sadeh 66543ae4701SAlex Elder rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0); 666602adf40SYehuda Sadeh if (IS_ERR(rbdc->client)) 66708f75463SAlex Elder goto out_rbdc; 66843ae4701SAlex Elder ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */ 669602adf40SYehuda Sadeh 670602adf40SYehuda Sadeh ret = ceph_open_session(rbdc->client); 671602adf40SYehuda Sadeh if (ret < 0) 67208f75463SAlex Elder goto out_client; 673602adf40SYehuda Sadeh 674432b8587SAlex Elder spin_lock(&rbd_client_list_lock); 675602adf40SYehuda Sadeh list_add_tail(&rbdc->node, &rbd_client_list); 676432b8587SAlex Elder spin_unlock(&rbd_client_list_lock); 677602adf40SYehuda Sadeh 67837206ee5SAlex Elder dout("%s: rbdc %p\n", __func__, rbdc); 679bc534d86SAlex Elder 680602adf40SYehuda Sadeh return rbdc; 68108f75463SAlex Elder out_client: 682602adf40SYehuda Sadeh ceph_destroy_client(rbdc->client); 68308f75463SAlex Elder out_rbdc: 684602adf40SYehuda Sadeh kfree(rbdc); 685602adf40SYehuda Sadeh out_opt: 68643ae4701SAlex Elder if (ceph_opts) 68743ae4701SAlex Elder ceph_destroy_options(ceph_opts); 68837206ee5SAlex Elder dout("%s: error %d\n", __func__, ret); 68937206ee5SAlex Elder 69028f259b7SVasiliy Kulikov return ERR_PTR(ret); 691602adf40SYehuda Sadeh } 692602adf40SYehuda Sadeh 6932f82ee54SAlex Elder static struct rbd_client *__rbd_get_client(struct rbd_client *rbdc) 6942f82ee54SAlex Elder { 6952f82ee54SAlex Elder kref_get(&rbdc->kref); 6962f82ee54SAlex Elder 6972f82ee54SAlex Elder return rbdc; 6982f82ee54SAlex Elder } 6992f82ee54SAlex Elder 700602adf40SYehuda Sadeh /* 7011f7ba331SAlex Elder * Find a ceph client with specific addr and configuration. If 7021f7ba331SAlex Elder * found, bump its reference count. 703602adf40SYehuda Sadeh */ 7041f7ba331SAlex Elder static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts) 705602adf40SYehuda Sadeh { 706602adf40SYehuda Sadeh struct rbd_client *client_node; 7071f7ba331SAlex Elder bool found = false; 708602adf40SYehuda Sadeh 70943ae4701SAlex Elder if (ceph_opts->flags & CEPH_OPT_NOSHARE) 710602adf40SYehuda Sadeh return NULL; 711602adf40SYehuda Sadeh 7121f7ba331SAlex Elder spin_lock(&rbd_client_list_lock); 7131f7ba331SAlex Elder list_for_each_entry(client_node, &rbd_client_list, node) { 7141f7ba331SAlex Elder if (!ceph_compare_options(ceph_opts, client_node->client)) { 7152f82ee54SAlex Elder __rbd_get_client(client_node); 7162f82ee54SAlex Elder 7171f7ba331SAlex Elder found = true; 7181f7ba331SAlex Elder break; 7191f7ba331SAlex Elder } 7201f7ba331SAlex Elder } 7211f7ba331SAlex Elder spin_unlock(&rbd_client_list_lock); 7221f7ba331SAlex Elder 7231f7ba331SAlex Elder return found ? client_node : NULL; 724602adf40SYehuda Sadeh } 725602adf40SYehuda Sadeh 726602adf40SYehuda Sadeh /* 72759c2be1eSYehuda Sadeh * mount options 72859c2be1eSYehuda Sadeh */ 72959c2be1eSYehuda Sadeh enum { 73059c2be1eSYehuda Sadeh Opt_last_int, 73159c2be1eSYehuda Sadeh /* int args above */ 73259c2be1eSYehuda Sadeh Opt_last_string, 73359c2be1eSYehuda Sadeh /* string args above */ 734cc0538b6SAlex Elder Opt_read_only, 735cc0538b6SAlex Elder Opt_read_write, 736cc0538b6SAlex Elder /* Boolean args above */ 737cc0538b6SAlex Elder Opt_last_bool, 73859c2be1eSYehuda Sadeh }; 73959c2be1eSYehuda Sadeh 74043ae4701SAlex Elder static match_table_t rbd_opts_tokens = { 74159c2be1eSYehuda Sadeh /* int args above */ 74259c2be1eSYehuda Sadeh /* string args above */ 743be466c1cSAlex Elder {Opt_read_only, "read_only"}, 744cc0538b6SAlex Elder {Opt_read_only, "ro"}, /* Alternate spelling */ 745cc0538b6SAlex Elder {Opt_read_write, "read_write"}, 746cc0538b6SAlex Elder {Opt_read_write, "rw"}, /* Alternate spelling */ 747cc0538b6SAlex Elder /* Boolean args above */ 74859c2be1eSYehuda Sadeh {-1, NULL} 74959c2be1eSYehuda Sadeh }; 75059c2be1eSYehuda Sadeh 75198571b5aSAlex Elder struct rbd_options { 75298571b5aSAlex Elder bool read_only; 75398571b5aSAlex Elder }; 75498571b5aSAlex Elder 75598571b5aSAlex Elder #define RBD_READ_ONLY_DEFAULT false 75698571b5aSAlex Elder 75759c2be1eSYehuda Sadeh static int parse_rbd_opts_token(char *c, void *private) 75859c2be1eSYehuda Sadeh { 75943ae4701SAlex Elder struct rbd_options *rbd_opts = private; 76059c2be1eSYehuda Sadeh substring_t argstr[MAX_OPT_ARGS]; 76159c2be1eSYehuda Sadeh int token, intval, ret; 76259c2be1eSYehuda Sadeh 76343ae4701SAlex Elder token = match_token(c, rbd_opts_tokens, argstr); 76459c2be1eSYehuda Sadeh if (token < 0) 76559c2be1eSYehuda Sadeh return -EINVAL; 76659c2be1eSYehuda Sadeh 76759c2be1eSYehuda Sadeh if (token < Opt_last_int) { 76859c2be1eSYehuda Sadeh ret = match_int(&argstr[0], &intval); 76959c2be1eSYehuda Sadeh if (ret < 0) { 77059c2be1eSYehuda Sadeh pr_err("bad mount option arg (not int) " 77159c2be1eSYehuda Sadeh "at '%s'\n", c); 77259c2be1eSYehuda Sadeh return ret; 77359c2be1eSYehuda Sadeh } 77459c2be1eSYehuda Sadeh dout("got int token %d val %d\n", token, intval); 77559c2be1eSYehuda Sadeh } else if (token > Opt_last_int && token < Opt_last_string) { 77659c2be1eSYehuda Sadeh dout("got string token %d val %s\n", token, 77759c2be1eSYehuda Sadeh argstr[0].from); 778cc0538b6SAlex Elder } else if (token > Opt_last_string && token < Opt_last_bool) { 779cc0538b6SAlex Elder dout("got Boolean token %d\n", token); 78059c2be1eSYehuda Sadeh } else { 78159c2be1eSYehuda Sadeh dout("got token %d\n", token); 78259c2be1eSYehuda Sadeh } 78359c2be1eSYehuda Sadeh 78459c2be1eSYehuda Sadeh switch (token) { 785cc0538b6SAlex Elder case Opt_read_only: 786cc0538b6SAlex Elder rbd_opts->read_only = true; 787cc0538b6SAlex Elder break; 788cc0538b6SAlex Elder case Opt_read_write: 789cc0538b6SAlex Elder rbd_opts->read_only = false; 790cc0538b6SAlex Elder break; 79159c2be1eSYehuda Sadeh default: 792aafb230eSAlex Elder rbd_assert(false); 793aafb230eSAlex Elder break; 79459c2be1eSYehuda Sadeh } 79559c2be1eSYehuda Sadeh return 0; 79659c2be1eSYehuda Sadeh } 79759c2be1eSYehuda Sadeh 7986d2940c8SGuangliang Zhao static char* obj_op_name(enum obj_operation_type op_type) 7996d2940c8SGuangliang Zhao { 8006d2940c8SGuangliang Zhao switch (op_type) { 8016d2940c8SGuangliang Zhao case OBJ_OP_READ: 8026d2940c8SGuangliang Zhao return "read"; 8036d2940c8SGuangliang Zhao case OBJ_OP_WRITE: 8046d2940c8SGuangliang Zhao return "write"; 80590e98c52SGuangliang Zhao case OBJ_OP_DISCARD: 80690e98c52SGuangliang Zhao return "discard"; 8076d2940c8SGuangliang Zhao default: 8086d2940c8SGuangliang Zhao return "???"; 8096d2940c8SGuangliang Zhao } 8106d2940c8SGuangliang Zhao } 8116d2940c8SGuangliang Zhao 81259c2be1eSYehuda Sadeh /* 813602adf40SYehuda Sadeh * Get a ceph client with specific addr and configuration, if one does 8147262cfcaSAlex Elder * not exist create it. Either way, ceph_opts is consumed by this 8157262cfcaSAlex Elder * function. 816602adf40SYehuda Sadeh */ 8179d3997fdSAlex Elder static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts) 818602adf40SYehuda Sadeh { 819f8c38929SAlex Elder struct rbd_client *rbdc; 82059c2be1eSYehuda Sadeh 821cfbf6377SAlex Elder mutex_lock_nested(&client_mutex, SINGLE_DEPTH_NESTING); 8221f7ba331SAlex Elder rbdc = rbd_client_find(ceph_opts); 8239d3997fdSAlex Elder if (rbdc) /* using an existing client */ 82443ae4701SAlex Elder ceph_destroy_options(ceph_opts); 8259d3997fdSAlex Elder else 826f8c38929SAlex Elder rbdc = rbd_client_create(ceph_opts); 827cfbf6377SAlex Elder mutex_unlock(&client_mutex); 828d720bcb0SAlex Elder 8299d3997fdSAlex Elder return rbdc; 830602adf40SYehuda Sadeh } 831602adf40SYehuda Sadeh 832602adf40SYehuda Sadeh /* 833602adf40SYehuda Sadeh * Destroy ceph client 834d23a4b3fSAlex Elder * 835432b8587SAlex Elder * Caller must hold rbd_client_list_lock. 836602adf40SYehuda Sadeh */ 837602adf40SYehuda Sadeh static void rbd_client_release(struct kref *kref) 838602adf40SYehuda Sadeh { 839602adf40SYehuda Sadeh struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref); 840602adf40SYehuda Sadeh 84137206ee5SAlex Elder dout("%s: rbdc %p\n", __func__, rbdc); 842cd9d9f5dSAlex Elder spin_lock(&rbd_client_list_lock); 843602adf40SYehuda Sadeh list_del(&rbdc->node); 844cd9d9f5dSAlex Elder spin_unlock(&rbd_client_list_lock); 845602adf40SYehuda Sadeh 846602adf40SYehuda Sadeh ceph_destroy_client(rbdc->client); 847602adf40SYehuda Sadeh kfree(rbdc); 848602adf40SYehuda Sadeh } 849602adf40SYehuda Sadeh 850602adf40SYehuda Sadeh /* 851602adf40SYehuda Sadeh * Drop reference to ceph client node. If it's not referenced anymore, release 852602adf40SYehuda Sadeh * it. 853602adf40SYehuda Sadeh */ 8549d3997fdSAlex Elder static void rbd_put_client(struct rbd_client *rbdc) 855602adf40SYehuda Sadeh { 856c53d5893SAlex Elder if (rbdc) 8579d3997fdSAlex Elder kref_put(&rbdc->kref, rbd_client_release); 858602adf40SYehuda Sadeh } 859602adf40SYehuda Sadeh 860a30b71b9SAlex Elder static bool rbd_image_format_valid(u32 image_format) 861a30b71b9SAlex Elder { 862a30b71b9SAlex Elder return image_format == 1 || image_format == 2; 863a30b71b9SAlex Elder } 864a30b71b9SAlex Elder 8658e94af8eSAlex Elder static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk) 8668e94af8eSAlex Elder { 867103a150fSAlex Elder size_t size; 868103a150fSAlex Elder u32 snap_count; 869103a150fSAlex Elder 870103a150fSAlex Elder /* The header has to start with the magic rbd header text */ 871103a150fSAlex Elder if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT))) 872103a150fSAlex Elder return false; 873103a150fSAlex Elder 874db2388b6SAlex Elder /* The bio layer requires at least sector-sized I/O */ 875db2388b6SAlex Elder 876db2388b6SAlex Elder if (ondisk->options.order < SECTOR_SHIFT) 877db2388b6SAlex Elder return false; 878db2388b6SAlex Elder 879db2388b6SAlex Elder /* If we use u64 in a few spots we may be able to loosen this */ 880db2388b6SAlex Elder 881db2388b6SAlex Elder if (ondisk->options.order > 8 * sizeof (int) - 1) 882db2388b6SAlex Elder return false; 883db2388b6SAlex Elder 884103a150fSAlex Elder /* 885103a150fSAlex Elder * The size of a snapshot header has to fit in a size_t, and 886103a150fSAlex Elder * that limits the number of snapshots. 887103a150fSAlex Elder */ 888103a150fSAlex Elder snap_count = le32_to_cpu(ondisk->snap_count); 889103a150fSAlex Elder size = SIZE_MAX - sizeof (struct ceph_snap_context); 890103a150fSAlex Elder if (snap_count > size / sizeof (__le64)) 891103a150fSAlex Elder return false; 892103a150fSAlex Elder 893103a150fSAlex Elder /* 894103a150fSAlex Elder * Not only that, but the size of the entire the snapshot 895103a150fSAlex Elder * header must also be representable in a size_t. 896103a150fSAlex Elder */ 897103a150fSAlex Elder size -= snap_count * sizeof (__le64); 898103a150fSAlex Elder if ((u64) size < le64_to_cpu(ondisk->snap_names_len)) 899103a150fSAlex Elder return false; 900103a150fSAlex Elder 901103a150fSAlex Elder return true; 9028e94af8eSAlex Elder } 9038e94af8eSAlex Elder 904602adf40SYehuda Sadeh /* 905bb23e37aSAlex Elder * Fill an rbd image header with information from the given format 1 906bb23e37aSAlex Elder * on-disk header. 907602adf40SYehuda Sadeh */ 908662518b1SAlex Elder static int rbd_header_from_disk(struct rbd_device *rbd_dev, 9094156d998SAlex Elder struct rbd_image_header_ondisk *ondisk) 910602adf40SYehuda Sadeh { 911662518b1SAlex Elder struct rbd_image_header *header = &rbd_dev->header; 912bb23e37aSAlex Elder bool first_time = header->object_prefix == NULL; 913bb23e37aSAlex Elder struct ceph_snap_context *snapc; 914bb23e37aSAlex Elder char *object_prefix = NULL; 915bb23e37aSAlex Elder char *snap_names = NULL; 916bb23e37aSAlex Elder u64 *snap_sizes = NULL; 917ccece235SAlex Elder u32 snap_count; 918d2bb24e5SAlex Elder size_t size; 919bb23e37aSAlex Elder int ret = -ENOMEM; 920621901d6SAlex Elder u32 i; 921602adf40SYehuda Sadeh 922bb23e37aSAlex Elder /* Allocate this now to avoid having to handle failure below */ 923103a150fSAlex Elder 924bb23e37aSAlex Elder if (first_time) { 925bb23e37aSAlex Elder size_t len; 926bb23e37aSAlex Elder 927bb23e37aSAlex Elder len = strnlen(ondisk->object_prefix, 928bb23e37aSAlex Elder sizeof (ondisk->object_prefix)); 929bb23e37aSAlex Elder object_prefix = kmalloc(len + 1, GFP_KERNEL); 930bb23e37aSAlex Elder if (!object_prefix) 931602adf40SYehuda Sadeh return -ENOMEM; 932bb23e37aSAlex Elder memcpy(object_prefix, ondisk->object_prefix, len); 933bb23e37aSAlex Elder object_prefix[len] = '\0'; 934bb23e37aSAlex Elder } 93500f1f36fSAlex Elder 936bb23e37aSAlex Elder /* Allocate the snapshot context and fill it in */ 937d2bb24e5SAlex Elder 938602adf40SYehuda Sadeh snap_count = le32_to_cpu(ondisk->snap_count); 939bb23e37aSAlex Elder snapc = ceph_create_snap_context(snap_count, GFP_KERNEL); 940bb23e37aSAlex Elder if (!snapc) 941bb23e37aSAlex Elder goto out_err; 942bb23e37aSAlex Elder snapc->seq = le64_to_cpu(ondisk->snap_seq); 943602adf40SYehuda Sadeh if (snap_count) { 944bb23e37aSAlex Elder struct rbd_image_snap_ondisk *snaps; 945f785cc1dSAlex Elder u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len); 946f785cc1dSAlex Elder 947bb23e37aSAlex Elder /* We'll keep a copy of the snapshot names... */ 948621901d6SAlex Elder 949f785cc1dSAlex Elder if (snap_names_len > (u64)SIZE_MAX) 950bb23e37aSAlex Elder goto out_2big; 951bb23e37aSAlex Elder snap_names = kmalloc(snap_names_len, GFP_KERNEL); 952bb23e37aSAlex Elder if (!snap_names) 953602adf40SYehuda Sadeh goto out_err; 954bb23e37aSAlex Elder 955bb23e37aSAlex Elder /* ...as well as the array of their sizes. */ 956bb23e37aSAlex Elder 957bb23e37aSAlex Elder size = snap_count * sizeof (*header->snap_sizes); 958bb23e37aSAlex Elder snap_sizes = kmalloc(size, GFP_KERNEL); 959bb23e37aSAlex Elder if (!snap_sizes) 960bb23e37aSAlex Elder goto out_err; 961bb23e37aSAlex Elder 962f785cc1dSAlex Elder /* 963bb23e37aSAlex Elder * Copy the names, and fill in each snapshot's id 964bb23e37aSAlex Elder * and size. 965bb23e37aSAlex Elder * 96699a41ebcSAlex Elder * Note that rbd_dev_v1_header_info() guarantees the 967bb23e37aSAlex Elder * ondisk buffer we're working with has 968f785cc1dSAlex Elder * snap_names_len bytes beyond the end of the 969f785cc1dSAlex Elder * snapshot id array, this memcpy() is safe. 970f785cc1dSAlex Elder */ 971bb23e37aSAlex Elder memcpy(snap_names, &ondisk->snaps[snap_count], snap_names_len); 972bb23e37aSAlex Elder snaps = ondisk->snaps; 973bb23e37aSAlex Elder for (i = 0; i < snap_count; i++) { 974bb23e37aSAlex Elder snapc->snaps[i] = le64_to_cpu(snaps[i].id); 975bb23e37aSAlex Elder snap_sizes[i] = le64_to_cpu(snaps[i].image_size); 976bb23e37aSAlex Elder } 977602adf40SYehuda Sadeh } 978849b4260SAlex Elder 979bb23e37aSAlex Elder /* We won't fail any more, fill in the header */ 980bb23e37aSAlex Elder 981bb23e37aSAlex Elder if (first_time) { 982bb23e37aSAlex Elder header->object_prefix = object_prefix; 983602adf40SYehuda Sadeh header->obj_order = ondisk->options.order; 984602adf40SYehuda Sadeh header->crypt_type = ondisk->options.crypt_type; 985602adf40SYehuda Sadeh header->comp_type = ondisk->options.comp_type; 986bb23e37aSAlex Elder /* The rest aren't used for format 1 images */ 987bb23e37aSAlex Elder header->stripe_unit = 0; 988bb23e37aSAlex Elder header->stripe_count = 0; 989bb23e37aSAlex Elder header->features = 0; 990662518b1SAlex Elder } else { 991662518b1SAlex Elder ceph_put_snap_context(header->snapc); 992662518b1SAlex Elder kfree(header->snap_names); 993662518b1SAlex Elder kfree(header->snap_sizes); 994bb23e37aSAlex Elder } 9956a52325fSAlex Elder 996bb23e37aSAlex Elder /* The remaining fields always get updated (when we refresh) */ 997621901d6SAlex Elder 998f84344f3SAlex Elder header->image_size = le64_to_cpu(ondisk->image_size); 999bb23e37aSAlex Elder header->snapc = snapc; 1000bb23e37aSAlex Elder header->snap_names = snap_names; 1001bb23e37aSAlex Elder header->snap_sizes = snap_sizes; 1002468521c1SAlex Elder 1003602adf40SYehuda Sadeh return 0; 1004bb23e37aSAlex Elder out_2big: 1005bb23e37aSAlex Elder ret = -EIO; 10066a52325fSAlex Elder out_err: 1007bb23e37aSAlex Elder kfree(snap_sizes); 1008bb23e37aSAlex Elder kfree(snap_names); 1009bb23e37aSAlex Elder ceph_put_snap_context(snapc); 1010bb23e37aSAlex Elder kfree(object_prefix); 1011ccece235SAlex Elder 1012bb23e37aSAlex Elder return ret; 1013602adf40SYehuda Sadeh } 1014602adf40SYehuda Sadeh 10159682fc6dSAlex Elder static const char *_rbd_dev_v1_snap_name(struct rbd_device *rbd_dev, u32 which) 10169682fc6dSAlex Elder { 10179682fc6dSAlex Elder const char *snap_name; 10189682fc6dSAlex Elder 10199682fc6dSAlex Elder rbd_assert(which < rbd_dev->header.snapc->num_snaps); 10209682fc6dSAlex Elder 10219682fc6dSAlex Elder /* Skip over names until we find the one we are looking for */ 10229682fc6dSAlex Elder 10239682fc6dSAlex Elder snap_name = rbd_dev->header.snap_names; 10249682fc6dSAlex Elder while (which--) 10259682fc6dSAlex Elder snap_name += strlen(snap_name) + 1; 10269682fc6dSAlex Elder 10279682fc6dSAlex Elder return kstrdup(snap_name, GFP_KERNEL); 10289682fc6dSAlex Elder } 10299682fc6dSAlex Elder 103030d1cff8SAlex Elder /* 103130d1cff8SAlex Elder * Snapshot id comparison function for use with qsort()/bsearch(). 103230d1cff8SAlex Elder * Note that result is for snapshots in *descending* order. 103330d1cff8SAlex Elder */ 103430d1cff8SAlex Elder static int snapid_compare_reverse(const void *s1, const void *s2) 103530d1cff8SAlex Elder { 103630d1cff8SAlex Elder u64 snap_id1 = *(u64 *)s1; 103730d1cff8SAlex Elder u64 snap_id2 = *(u64 *)s2; 103830d1cff8SAlex Elder 103930d1cff8SAlex Elder if (snap_id1 < snap_id2) 104030d1cff8SAlex Elder return 1; 104130d1cff8SAlex Elder return snap_id1 == snap_id2 ? 0 : -1; 104230d1cff8SAlex Elder } 104330d1cff8SAlex Elder 104430d1cff8SAlex Elder /* 104530d1cff8SAlex Elder * Search a snapshot context to see if the given snapshot id is 104630d1cff8SAlex Elder * present. 104730d1cff8SAlex Elder * 104830d1cff8SAlex Elder * Returns the position of the snapshot id in the array if it's found, 104930d1cff8SAlex Elder * or BAD_SNAP_INDEX otherwise. 105030d1cff8SAlex Elder * 105130d1cff8SAlex Elder * Note: The snapshot array is in kept sorted (by the osd) in 105230d1cff8SAlex Elder * reverse order, highest snapshot id first. 105330d1cff8SAlex Elder */ 10549682fc6dSAlex Elder static u32 rbd_dev_snap_index(struct rbd_device *rbd_dev, u64 snap_id) 10559682fc6dSAlex Elder { 10569682fc6dSAlex Elder struct ceph_snap_context *snapc = rbd_dev->header.snapc; 105730d1cff8SAlex Elder u64 *found; 10589682fc6dSAlex Elder 105930d1cff8SAlex Elder found = bsearch(&snap_id, &snapc->snaps, snapc->num_snaps, 106030d1cff8SAlex Elder sizeof (snap_id), snapid_compare_reverse); 10619682fc6dSAlex Elder 106230d1cff8SAlex Elder return found ? (u32)(found - &snapc->snaps[0]) : BAD_SNAP_INDEX; 10639682fc6dSAlex Elder } 10649682fc6dSAlex Elder 10652ad3d716SAlex Elder static const char *rbd_dev_v1_snap_name(struct rbd_device *rbd_dev, 10662ad3d716SAlex Elder u64 snap_id) 106754cac61fSAlex Elder { 106854cac61fSAlex Elder u32 which; 1069da6a6b63SJosh Durgin const char *snap_name; 107054cac61fSAlex Elder 107154cac61fSAlex Elder which = rbd_dev_snap_index(rbd_dev, snap_id); 107254cac61fSAlex Elder if (which == BAD_SNAP_INDEX) 1073da6a6b63SJosh Durgin return ERR_PTR(-ENOENT); 107454cac61fSAlex Elder 1075da6a6b63SJosh Durgin snap_name = _rbd_dev_v1_snap_name(rbd_dev, which); 1076da6a6b63SJosh Durgin return snap_name ? snap_name : ERR_PTR(-ENOMEM); 107754cac61fSAlex Elder } 107854cac61fSAlex Elder 10799e15b77dSAlex Elder static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id) 10809e15b77dSAlex Elder { 10819e15b77dSAlex Elder if (snap_id == CEPH_NOSNAP) 10829e15b77dSAlex Elder return RBD_SNAP_HEAD_NAME; 10839e15b77dSAlex Elder 108454cac61fSAlex Elder rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); 108554cac61fSAlex Elder if (rbd_dev->image_format == 1) 108654cac61fSAlex Elder return rbd_dev_v1_snap_name(rbd_dev, snap_id); 10879e15b77dSAlex Elder 108854cac61fSAlex Elder return rbd_dev_v2_snap_name(rbd_dev, snap_id); 10899e15b77dSAlex Elder } 10909e15b77dSAlex Elder 10912ad3d716SAlex Elder static int rbd_snap_size(struct rbd_device *rbd_dev, u64 snap_id, 10922ad3d716SAlex Elder u64 *snap_size) 1093602adf40SYehuda Sadeh { 10942ad3d716SAlex Elder rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); 10952ad3d716SAlex Elder if (snap_id == CEPH_NOSNAP) { 10962ad3d716SAlex Elder *snap_size = rbd_dev->header.image_size; 10972ad3d716SAlex Elder } else if (rbd_dev->image_format == 1) { 10982ad3d716SAlex Elder u32 which; 109900f1f36fSAlex Elder 11002ad3d716SAlex Elder which = rbd_dev_snap_index(rbd_dev, snap_id); 11012ad3d716SAlex Elder if (which == BAD_SNAP_INDEX) 11022ad3d716SAlex Elder return -ENOENT; 110300f1f36fSAlex Elder 11042ad3d716SAlex Elder *snap_size = rbd_dev->header.snap_sizes[which]; 11052ad3d716SAlex Elder } else { 11062ad3d716SAlex Elder u64 size = 0; 11072ad3d716SAlex Elder int ret; 11082ad3d716SAlex Elder 11092ad3d716SAlex Elder ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, NULL, &size); 11102ad3d716SAlex Elder if (ret) 11112ad3d716SAlex Elder return ret; 11122ad3d716SAlex Elder 11132ad3d716SAlex Elder *snap_size = size; 11142ad3d716SAlex Elder } 11152ad3d716SAlex Elder return 0; 11162ad3d716SAlex Elder } 11172ad3d716SAlex Elder 11182ad3d716SAlex Elder static int rbd_snap_features(struct rbd_device *rbd_dev, u64 snap_id, 11192ad3d716SAlex Elder u64 *snap_features) 11202ad3d716SAlex Elder { 11212ad3d716SAlex Elder rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); 11222ad3d716SAlex Elder if (snap_id == CEPH_NOSNAP) { 11232ad3d716SAlex Elder *snap_features = rbd_dev->header.features; 11242ad3d716SAlex Elder } else if (rbd_dev->image_format == 1) { 11252ad3d716SAlex Elder *snap_features = 0; /* No features for format 1 */ 11262ad3d716SAlex Elder } else { 11272ad3d716SAlex Elder u64 features = 0; 11282ad3d716SAlex Elder int ret; 11292ad3d716SAlex Elder 11302ad3d716SAlex Elder ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, &features); 11312ad3d716SAlex Elder if (ret) 11322ad3d716SAlex Elder return ret; 11332ad3d716SAlex Elder 11342ad3d716SAlex Elder *snap_features = features; 11352ad3d716SAlex Elder } 11362ad3d716SAlex Elder return 0; 113700f1f36fSAlex Elder } 1138602adf40SYehuda Sadeh 1139d1cf5788SAlex Elder static int rbd_dev_mapping_set(struct rbd_device *rbd_dev) 1140602adf40SYehuda Sadeh { 11418f4b7d98SAlex Elder u64 snap_id = rbd_dev->spec->snap_id; 11422ad3d716SAlex Elder u64 size = 0; 11432ad3d716SAlex Elder u64 features = 0; 11442ad3d716SAlex Elder int ret; 11458b0241f8SAlex Elder 11462ad3d716SAlex Elder ret = rbd_snap_size(rbd_dev, snap_id, &size); 11472ad3d716SAlex Elder if (ret) 11482ad3d716SAlex Elder return ret; 11492ad3d716SAlex Elder ret = rbd_snap_features(rbd_dev, snap_id, &features); 11502ad3d716SAlex Elder if (ret) 11512ad3d716SAlex Elder return ret; 11522ad3d716SAlex Elder 11532ad3d716SAlex Elder rbd_dev->mapping.size = size; 11542ad3d716SAlex Elder rbd_dev->mapping.features = features; 11552ad3d716SAlex Elder 11568b0241f8SAlex Elder return 0; 1157602adf40SYehuda Sadeh } 1158602adf40SYehuda Sadeh 1159d1cf5788SAlex Elder static void rbd_dev_mapping_clear(struct rbd_device *rbd_dev) 1160d1cf5788SAlex Elder { 1161d1cf5788SAlex Elder rbd_dev->mapping.size = 0; 1162d1cf5788SAlex Elder rbd_dev->mapping.features = 0; 1163200a6a8bSAlex Elder } 1164200a6a8bSAlex Elder 11657d5079aaSHimangi Saraogi static void rbd_segment_name_free(const char *name) 11667d5079aaSHimangi Saraogi { 11677d5079aaSHimangi Saraogi /* The explicit cast here is needed to drop the const qualifier */ 11687d5079aaSHimangi Saraogi 11697d5079aaSHimangi Saraogi kmem_cache_free(rbd_segment_name_cache, (void *)name); 11707d5079aaSHimangi Saraogi } 11717d5079aaSHimangi Saraogi 117298571b5aSAlex Elder static const char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset) 1173602adf40SYehuda Sadeh { 117465ccfe21SAlex Elder char *name; 117565ccfe21SAlex Elder u64 segment; 117665ccfe21SAlex Elder int ret; 11773a96d5cdSJosh Durgin char *name_format; 1178602adf40SYehuda Sadeh 117978c2a44aSAlex Elder name = kmem_cache_alloc(rbd_segment_name_cache, GFP_NOIO); 118065ccfe21SAlex Elder if (!name) 118165ccfe21SAlex Elder return NULL; 118265ccfe21SAlex Elder segment = offset >> rbd_dev->header.obj_order; 11833a96d5cdSJosh Durgin name_format = "%s.%012llx"; 11843a96d5cdSJosh Durgin if (rbd_dev->image_format == 2) 11853a96d5cdSJosh Durgin name_format = "%s.%016llx"; 11862d0ebc5dSIlya Dryomov ret = snprintf(name, CEPH_MAX_OID_NAME_LEN + 1, name_format, 118765ccfe21SAlex Elder rbd_dev->header.object_prefix, segment); 11882d0ebc5dSIlya Dryomov if (ret < 0 || ret > CEPH_MAX_OID_NAME_LEN) { 118965ccfe21SAlex Elder pr_err("error formatting segment name for #%llu (%d)\n", 119065ccfe21SAlex Elder segment, ret); 11917d5079aaSHimangi Saraogi rbd_segment_name_free(name); 119265ccfe21SAlex Elder name = NULL; 119365ccfe21SAlex Elder } 1194602adf40SYehuda Sadeh 119565ccfe21SAlex Elder return name; 119665ccfe21SAlex Elder } 1197602adf40SYehuda Sadeh 119865ccfe21SAlex Elder static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset) 119965ccfe21SAlex Elder { 120065ccfe21SAlex Elder u64 segment_size = (u64) 1 << rbd_dev->header.obj_order; 1201602adf40SYehuda Sadeh 120265ccfe21SAlex Elder return offset & (segment_size - 1); 120365ccfe21SAlex Elder } 120465ccfe21SAlex Elder 120565ccfe21SAlex Elder static u64 rbd_segment_length(struct rbd_device *rbd_dev, 120665ccfe21SAlex Elder u64 offset, u64 length) 120765ccfe21SAlex Elder { 120865ccfe21SAlex Elder u64 segment_size = (u64) 1 << rbd_dev->header.obj_order; 120965ccfe21SAlex Elder 121065ccfe21SAlex Elder offset &= segment_size - 1; 121165ccfe21SAlex Elder 1212aafb230eSAlex Elder rbd_assert(length <= U64_MAX - offset); 121365ccfe21SAlex Elder if (offset + length > segment_size) 121465ccfe21SAlex Elder length = segment_size - offset; 121565ccfe21SAlex Elder 121665ccfe21SAlex Elder return length; 1217602adf40SYehuda Sadeh } 1218602adf40SYehuda Sadeh 1219602adf40SYehuda Sadeh /* 1220029bcbd8SJosh Durgin * returns the size of an object in the image 1221029bcbd8SJosh Durgin */ 1222029bcbd8SJosh Durgin static u64 rbd_obj_bytes(struct rbd_image_header *header) 1223029bcbd8SJosh Durgin { 1224029bcbd8SJosh Durgin return 1 << header->obj_order; 1225029bcbd8SJosh Durgin } 1226029bcbd8SJosh Durgin 1227029bcbd8SJosh Durgin /* 1228602adf40SYehuda Sadeh * bio helpers 1229602adf40SYehuda Sadeh */ 1230602adf40SYehuda Sadeh 1231602adf40SYehuda Sadeh static void bio_chain_put(struct bio *chain) 1232602adf40SYehuda Sadeh { 1233602adf40SYehuda Sadeh struct bio *tmp; 1234602adf40SYehuda Sadeh 1235602adf40SYehuda Sadeh while (chain) { 1236602adf40SYehuda Sadeh tmp = chain; 1237602adf40SYehuda Sadeh chain = chain->bi_next; 1238602adf40SYehuda Sadeh bio_put(tmp); 1239602adf40SYehuda Sadeh } 1240602adf40SYehuda Sadeh } 1241602adf40SYehuda Sadeh 1242602adf40SYehuda Sadeh /* 1243602adf40SYehuda Sadeh * zeros a bio chain, starting at specific offset 1244602adf40SYehuda Sadeh */ 1245602adf40SYehuda Sadeh static void zero_bio_chain(struct bio *chain, int start_ofs) 1246602adf40SYehuda Sadeh { 12477988613bSKent Overstreet struct bio_vec bv; 12487988613bSKent Overstreet struct bvec_iter iter; 1249602adf40SYehuda Sadeh unsigned long flags; 1250602adf40SYehuda Sadeh void *buf; 1251602adf40SYehuda Sadeh int pos = 0; 1252602adf40SYehuda Sadeh 1253602adf40SYehuda Sadeh while (chain) { 12547988613bSKent Overstreet bio_for_each_segment(bv, chain, iter) { 12557988613bSKent Overstreet if (pos + bv.bv_len > start_ofs) { 1256602adf40SYehuda Sadeh int remainder = max(start_ofs - pos, 0); 12577988613bSKent Overstreet buf = bvec_kmap_irq(&bv, &flags); 1258602adf40SYehuda Sadeh memset(buf + remainder, 0, 12597988613bSKent Overstreet bv.bv_len - remainder); 12607988613bSKent Overstreet flush_dcache_page(bv.bv_page); 126185b5aaa6SDan Carpenter bvec_kunmap_irq(buf, &flags); 1262602adf40SYehuda Sadeh } 12637988613bSKent Overstreet pos += bv.bv_len; 1264602adf40SYehuda Sadeh } 1265602adf40SYehuda Sadeh 1266602adf40SYehuda Sadeh chain = chain->bi_next; 1267602adf40SYehuda Sadeh } 1268602adf40SYehuda Sadeh } 1269602adf40SYehuda Sadeh 1270602adf40SYehuda Sadeh /* 1271b9434c5bSAlex Elder * similar to zero_bio_chain(), zeros data defined by a page array, 1272b9434c5bSAlex Elder * starting at the given byte offset from the start of the array and 1273b9434c5bSAlex Elder * continuing up to the given end offset. The pages array is 1274b9434c5bSAlex Elder * assumed to be big enough to hold all bytes up to the end. 1275b9434c5bSAlex Elder */ 1276b9434c5bSAlex Elder static void zero_pages(struct page **pages, u64 offset, u64 end) 1277b9434c5bSAlex Elder { 1278b9434c5bSAlex Elder struct page **page = &pages[offset >> PAGE_SHIFT]; 1279b9434c5bSAlex Elder 1280b9434c5bSAlex Elder rbd_assert(end > offset); 1281b9434c5bSAlex Elder rbd_assert(end - offset <= (u64)SIZE_MAX); 1282b9434c5bSAlex Elder while (offset < end) { 1283b9434c5bSAlex Elder size_t page_offset; 1284b9434c5bSAlex Elder size_t length; 1285b9434c5bSAlex Elder unsigned long flags; 1286b9434c5bSAlex Elder void *kaddr; 1287b9434c5bSAlex Elder 1288491205a8SGeert Uytterhoeven page_offset = offset & ~PAGE_MASK; 1289491205a8SGeert Uytterhoeven length = min_t(size_t, PAGE_SIZE - page_offset, end - offset); 1290b9434c5bSAlex Elder local_irq_save(flags); 1291b9434c5bSAlex Elder kaddr = kmap_atomic(*page); 1292b9434c5bSAlex Elder memset(kaddr + page_offset, 0, length); 1293e2156054SAlex Elder flush_dcache_page(*page); 1294b9434c5bSAlex Elder kunmap_atomic(kaddr); 1295b9434c5bSAlex Elder local_irq_restore(flags); 1296b9434c5bSAlex Elder 1297b9434c5bSAlex Elder offset += length; 1298b9434c5bSAlex Elder page++; 1299b9434c5bSAlex Elder } 1300b9434c5bSAlex Elder } 1301b9434c5bSAlex Elder 1302b9434c5bSAlex Elder /* 1303f7760dadSAlex Elder * Clone a portion of a bio, starting at the given byte offset 1304f7760dadSAlex Elder * and continuing for the number of bytes indicated. 1305602adf40SYehuda Sadeh */ 1306f7760dadSAlex Elder static struct bio *bio_clone_range(struct bio *bio_src, 1307f7760dadSAlex Elder unsigned int offset, 1308f7760dadSAlex Elder unsigned int len, 1309f7760dadSAlex Elder gfp_t gfpmask) 1310602adf40SYehuda Sadeh { 1311f7760dadSAlex Elder struct bio *bio; 1312602adf40SYehuda Sadeh 13135341a627SKent Overstreet bio = bio_clone(bio_src, gfpmask); 1314f7760dadSAlex Elder if (!bio) 1315f7760dadSAlex Elder return NULL; /* ENOMEM */ 1316f7760dadSAlex Elder 13175341a627SKent Overstreet bio_advance(bio, offset); 13184f024f37SKent Overstreet bio->bi_iter.bi_size = len; 1319602adf40SYehuda Sadeh 1320f7760dadSAlex Elder return bio; 1321602adf40SYehuda Sadeh } 1322602adf40SYehuda Sadeh 1323f7760dadSAlex Elder /* 1324f7760dadSAlex Elder * Clone a portion of a bio chain, starting at the given byte offset 1325f7760dadSAlex Elder * into the first bio in the source chain and continuing for the 1326f7760dadSAlex Elder * number of bytes indicated. The result is another bio chain of 1327f7760dadSAlex Elder * exactly the given length, or a null pointer on error. 1328f7760dadSAlex Elder * 1329f7760dadSAlex Elder * The bio_src and offset parameters are both in-out. On entry they 1330f7760dadSAlex Elder * refer to the first source bio and the offset into that bio where 1331f7760dadSAlex Elder * the start of data to be cloned is located. 1332f7760dadSAlex Elder * 1333f7760dadSAlex Elder * On return, bio_src is updated to refer to the bio in the source 1334f7760dadSAlex Elder * chain that contains first un-cloned byte, and *offset will 1335f7760dadSAlex Elder * contain the offset of that byte within that bio. 1336f7760dadSAlex Elder */ 1337f7760dadSAlex Elder static struct bio *bio_chain_clone_range(struct bio **bio_src, 1338f7760dadSAlex Elder unsigned int *offset, 1339f7760dadSAlex Elder unsigned int len, 1340f7760dadSAlex Elder gfp_t gfpmask) 1341f7760dadSAlex Elder { 1342f7760dadSAlex Elder struct bio *bi = *bio_src; 1343f7760dadSAlex Elder unsigned int off = *offset; 1344f7760dadSAlex Elder struct bio *chain = NULL; 1345f7760dadSAlex Elder struct bio **end; 1346602adf40SYehuda Sadeh 1347f7760dadSAlex Elder /* Build up a chain of clone bios up to the limit */ 1348602adf40SYehuda Sadeh 13494f024f37SKent Overstreet if (!bi || off >= bi->bi_iter.bi_size || !len) 1350f7760dadSAlex Elder return NULL; /* Nothing to clone */ 1351602adf40SYehuda Sadeh 1352f7760dadSAlex Elder end = &chain; 1353f7760dadSAlex Elder while (len) { 1354f7760dadSAlex Elder unsigned int bi_size; 1355f7760dadSAlex Elder struct bio *bio; 1356f7760dadSAlex Elder 1357f5400b7aSAlex Elder if (!bi) { 1358f5400b7aSAlex Elder rbd_warn(NULL, "bio_chain exhausted with %u left", len); 1359f7760dadSAlex Elder goto out_err; /* EINVAL; ran out of bio's */ 1360f5400b7aSAlex Elder } 13614f024f37SKent Overstreet bi_size = min_t(unsigned int, bi->bi_iter.bi_size - off, len); 1362f7760dadSAlex Elder bio = bio_clone_range(bi, off, bi_size, gfpmask); 1363f7760dadSAlex Elder if (!bio) 1364f7760dadSAlex Elder goto out_err; /* ENOMEM */ 1365f7760dadSAlex Elder 1366f7760dadSAlex Elder *end = bio; 1367f7760dadSAlex Elder end = &bio->bi_next; 1368f7760dadSAlex Elder 1369f7760dadSAlex Elder off += bi_size; 13704f024f37SKent Overstreet if (off == bi->bi_iter.bi_size) { 1371f7760dadSAlex Elder bi = bi->bi_next; 1372f7760dadSAlex Elder off = 0; 1373f7760dadSAlex Elder } 1374f7760dadSAlex Elder len -= bi_size; 1375f7760dadSAlex Elder } 1376f7760dadSAlex Elder *bio_src = bi; 1377f7760dadSAlex Elder *offset = off; 1378f7760dadSAlex Elder 1379f7760dadSAlex Elder return chain; 1380f7760dadSAlex Elder out_err: 1381f7760dadSAlex Elder bio_chain_put(chain); 1382f7760dadSAlex Elder 1383602adf40SYehuda Sadeh return NULL; 1384602adf40SYehuda Sadeh } 1385602adf40SYehuda Sadeh 1386926f9b3fSAlex Elder /* 1387926f9b3fSAlex Elder * The default/initial value for all object request flags is 0. For 1388926f9b3fSAlex Elder * each flag, once its value is set to 1 it is never reset to 0 1389926f9b3fSAlex Elder * again. 1390926f9b3fSAlex Elder */ 13916365d33aSAlex Elder static void obj_request_img_data_set(struct rbd_obj_request *obj_request) 13926365d33aSAlex Elder { 13936365d33aSAlex Elder if (test_and_set_bit(OBJ_REQ_IMG_DATA, &obj_request->flags)) { 13946365d33aSAlex Elder struct rbd_device *rbd_dev; 13956365d33aSAlex Elder 139657acbaa7SAlex Elder rbd_dev = obj_request->img_request->rbd_dev; 13979584d508SIlya Dryomov rbd_warn(rbd_dev, "obj_request %p already marked img_data", 13986365d33aSAlex Elder obj_request); 13996365d33aSAlex Elder } 14006365d33aSAlex Elder } 14016365d33aSAlex Elder 14026365d33aSAlex Elder static bool obj_request_img_data_test(struct rbd_obj_request *obj_request) 14036365d33aSAlex Elder { 14046365d33aSAlex Elder smp_mb(); 14056365d33aSAlex Elder return test_bit(OBJ_REQ_IMG_DATA, &obj_request->flags) != 0; 14066365d33aSAlex Elder } 14076365d33aSAlex Elder 140857acbaa7SAlex Elder static void obj_request_done_set(struct rbd_obj_request *obj_request) 140957acbaa7SAlex Elder { 141057acbaa7SAlex Elder if (test_and_set_bit(OBJ_REQ_DONE, &obj_request->flags)) { 141157acbaa7SAlex Elder struct rbd_device *rbd_dev = NULL; 141257acbaa7SAlex Elder 141357acbaa7SAlex Elder if (obj_request_img_data_test(obj_request)) 141457acbaa7SAlex Elder rbd_dev = obj_request->img_request->rbd_dev; 14159584d508SIlya Dryomov rbd_warn(rbd_dev, "obj_request %p already marked done", 141657acbaa7SAlex Elder obj_request); 141757acbaa7SAlex Elder } 141857acbaa7SAlex Elder } 141957acbaa7SAlex Elder 142057acbaa7SAlex Elder static bool obj_request_done_test(struct rbd_obj_request *obj_request) 142157acbaa7SAlex Elder { 142257acbaa7SAlex Elder smp_mb(); 142357acbaa7SAlex Elder return test_bit(OBJ_REQ_DONE, &obj_request->flags) != 0; 142457acbaa7SAlex Elder } 142557acbaa7SAlex Elder 14265679c59fSAlex Elder /* 14275679c59fSAlex Elder * This sets the KNOWN flag after (possibly) setting the EXISTS 14285679c59fSAlex Elder * flag. The latter is set based on the "exists" value provided. 14295679c59fSAlex Elder * 14305679c59fSAlex Elder * Note that for our purposes once an object exists it never goes 14315679c59fSAlex Elder * away again. It's possible that the response from two existence 14325679c59fSAlex Elder * checks are separated by the creation of the target object, and 14335679c59fSAlex Elder * the first ("doesn't exist") response arrives *after* the second 14345679c59fSAlex Elder * ("does exist"). In that case we ignore the second one. 14355679c59fSAlex Elder */ 14365679c59fSAlex Elder static void obj_request_existence_set(struct rbd_obj_request *obj_request, 14375679c59fSAlex Elder bool exists) 14385679c59fSAlex Elder { 14395679c59fSAlex Elder if (exists) 14405679c59fSAlex Elder set_bit(OBJ_REQ_EXISTS, &obj_request->flags); 14415679c59fSAlex Elder set_bit(OBJ_REQ_KNOWN, &obj_request->flags); 14425679c59fSAlex Elder smp_mb(); 14435679c59fSAlex Elder } 14445679c59fSAlex Elder 14455679c59fSAlex Elder static bool obj_request_known_test(struct rbd_obj_request *obj_request) 14465679c59fSAlex Elder { 14475679c59fSAlex Elder smp_mb(); 14485679c59fSAlex Elder return test_bit(OBJ_REQ_KNOWN, &obj_request->flags) != 0; 14495679c59fSAlex Elder } 14505679c59fSAlex Elder 14515679c59fSAlex Elder static bool obj_request_exists_test(struct rbd_obj_request *obj_request) 14525679c59fSAlex Elder { 14535679c59fSAlex Elder smp_mb(); 14545679c59fSAlex Elder return test_bit(OBJ_REQ_EXISTS, &obj_request->flags) != 0; 14555679c59fSAlex Elder } 14565679c59fSAlex Elder 14579638556aSIlya Dryomov static bool obj_request_overlaps_parent(struct rbd_obj_request *obj_request) 14589638556aSIlya Dryomov { 14599638556aSIlya Dryomov struct rbd_device *rbd_dev = obj_request->img_request->rbd_dev; 14609638556aSIlya Dryomov 14619638556aSIlya Dryomov return obj_request->img_offset < 14629638556aSIlya Dryomov round_up(rbd_dev->parent_overlap, rbd_obj_bytes(&rbd_dev->header)); 14639638556aSIlya Dryomov } 14649638556aSIlya Dryomov 1465bf0d5f50SAlex Elder static void rbd_obj_request_get(struct rbd_obj_request *obj_request) 1466bf0d5f50SAlex Elder { 146737206ee5SAlex Elder dout("%s: obj %p (was %d)\n", __func__, obj_request, 146837206ee5SAlex Elder atomic_read(&obj_request->kref.refcount)); 1469bf0d5f50SAlex Elder kref_get(&obj_request->kref); 1470bf0d5f50SAlex Elder } 1471bf0d5f50SAlex Elder 1472bf0d5f50SAlex Elder static void rbd_obj_request_destroy(struct kref *kref); 1473bf0d5f50SAlex Elder static void rbd_obj_request_put(struct rbd_obj_request *obj_request) 1474bf0d5f50SAlex Elder { 1475bf0d5f50SAlex Elder rbd_assert(obj_request != NULL); 147637206ee5SAlex Elder dout("%s: obj %p (was %d)\n", __func__, obj_request, 147737206ee5SAlex Elder atomic_read(&obj_request->kref.refcount)); 1478bf0d5f50SAlex Elder kref_put(&obj_request->kref, rbd_obj_request_destroy); 1479bf0d5f50SAlex Elder } 1480bf0d5f50SAlex Elder 14810f2d5be7SAlex Elder static void rbd_img_request_get(struct rbd_img_request *img_request) 14820f2d5be7SAlex Elder { 14830f2d5be7SAlex Elder dout("%s: img %p (was %d)\n", __func__, img_request, 14840f2d5be7SAlex Elder atomic_read(&img_request->kref.refcount)); 14850f2d5be7SAlex Elder kref_get(&img_request->kref); 14860f2d5be7SAlex Elder } 14870f2d5be7SAlex Elder 1488e93f3152SAlex Elder static bool img_request_child_test(struct rbd_img_request *img_request); 1489e93f3152SAlex Elder static void rbd_parent_request_destroy(struct kref *kref); 1490bf0d5f50SAlex Elder static void rbd_img_request_destroy(struct kref *kref); 1491bf0d5f50SAlex Elder static void rbd_img_request_put(struct rbd_img_request *img_request) 1492bf0d5f50SAlex Elder { 1493bf0d5f50SAlex Elder rbd_assert(img_request != NULL); 149437206ee5SAlex Elder dout("%s: img %p (was %d)\n", __func__, img_request, 149537206ee5SAlex Elder atomic_read(&img_request->kref.refcount)); 1496e93f3152SAlex Elder if (img_request_child_test(img_request)) 1497e93f3152SAlex Elder kref_put(&img_request->kref, rbd_parent_request_destroy); 1498e93f3152SAlex Elder else 1499bf0d5f50SAlex Elder kref_put(&img_request->kref, rbd_img_request_destroy); 1500bf0d5f50SAlex Elder } 1501bf0d5f50SAlex Elder 1502bf0d5f50SAlex Elder static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request, 1503bf0d5f50SAlex Elder struct rbd_obj_request *obj_request) 1504bf0d5f50SAlex Elder { 150525dcf954SAlex Elder rbd_assert(obj_request->img_request == NULL); 150625dcf954SAlex Elder 1507b155e86cSAlex Elder /* Image request now owns object's original reference */ 1508bf0d5f50SAlex Elder obj_request->img_request = img_request; 150925dcf954SAlex Elder obj_request->which = img_request->obj_request_count; 15106365d33aSAlex Elder rbd_assert(!obj_request_img_data_test(obj_request)); 15116365d33aSAlex Elder obj_request_img_data_set(obj_request); 1512bf0d5f50SAlex Elder rbd_assert(obj_request->which != BAD_WHICH); 151325dcf954SAlex Elder img_request->obj_request_count++; 151425dcf954SAlex Elder list_add_tail(&obj_request->links, &img_request->obj_requests); 151537206ee5SAlex Elder dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request, 151637206ee5SAlex Elder obj_request->which); 1517bf0d5f50SAlex Elder } 1518bf0d5f50SAlex Elder 1519bf0d5f50SAlex Elder static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request, 1520bf0d5f50SAlex Elder struct rbd_obj_request *obj_request) 1521bf0d5f50SAlex Elder { 1522bf0d5f50SAlex Elder rbd_assert(obj_request->which != BAD_WHICH); 152325dcf954SAlex Elder 152437206ee5SAlex Elder dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request, 152537206ee5SAlex Elder obj_request->which); 1526bf0d5f50SAlex Elder list_del(&obj_request->links); 152725dcf954SAlex Elder rbd_assert(img_request->obj_request_count > 0); 152825dcf954SAlex Elder img_request->obj_request_count--; 152925dcf954SAlex Elder rbd_assert(obj_request->which == img_request->obj_request_count); 153025dcf954SAlex Elder obj_request->which = BAD_WHICH; 15316365d33aSAlex Elder rbd_assert(obj_request_img_data_test(obj_request)); 1532bf0d5f50SAlex Elder rbd_assert(obj_request->img_request == img_request); 1533bf0d5f50SAlex Elder obj_request->img_request = NULL; 153425dcf954SAlex Elder obj_request->callback = NULL; 1535bf0d5f50SAlex Elder rbd_obj_request_put(obj_request); 1536bf0d5f50SAlex Elder } 1537bf0d5f50SAlex Elder 1538bf0d5f50SAlex Elder static bool obj_request_type_valid(enum obj_request_type type) 1539bf0d5f50SAlex Elder { 1540bf0d5f50SAlex Elder switch (type) { 15419969ebc5SAlex Elder case OBJ_REQUEST_NODATA: 1542bf0d5f50SAlex Elder case OBJ_REQUEST_BIO: 1543788e2df3SAlex Elder case OBJ_REQUEST_PAGES: 1544bf0d5f50SAlex Elder return true; 1545bf0d5f50SAlex Elder default: 1546bf0d5f50SAlex Elder return false; 1547bf0d5f50SAlex Elder } 1548bf0d5f50SAlex Elder } 1549bf0d5f50SAlex Elder 1550bf0d5f50SAlex Elder static int rbd_obj_request_submit(struct ceph_osd_client *osdc, 1551bf0d5f50SAlex Elder struct rbd_obj_request *obj_request) 1552bf0d5f50SAlex Elder { 155371c20a06SIlya Dryomov dout("%s %p\n", __func__, obj_request); 1554bf0d5f50SAlex Elder return ceph_osdc_start_request(osdc, obj_request->osd_req, false); 1555bf0d5f50SAlex Elder } 1556bf0d5f50SAlex Elder 155771c20a06SIlya Dryomov static void rbd_obj_request_end(struct rbd_obj_request *obj_request) 155871c20a06SIlya Dryomov { 155971c20a06SIlya Dryomov dout("%s %p\n", __func__, obj_request); 156071c20a06SIlya Dryomov ceph_osdc_cancel_request(obj_request->osd_req); 156171c20a06SIlya Dryomov } 156271c20a06SIlya Dryomov 156371c20a06SIlya Dryomov /* 156471c20a06SIlya Dryomov * Wait for an object request to complete. If interrupted, cancel the 156571c20a06SIlya Dryomov * underlying osd request. 15662894e1d7SIlya Dryomov * 15672894e1d7SIlya Dryomov * @timeout: in jiffies, 0 means "wait forever" 156871c20a06SIlya Dryomov */ 15692894e1d7SIlya Dryomov static int __rbd_obj_request_wait(struct rbd_obj_request *obj_request, 15702894e1d7SIlya Dryomov unsigned long timeout) 157171c20a06SIlya Dryomov { 15722894e1d7SIlya Dryomov long ret; 157371c20a06SIlya Dryomov 157471c20a06SIlya Dryomov dout("%s %p\n", __func__, obj_request); 15752894e1d7SIlya Dryomov ret = wait_for_completion_interruptible_timeout( 15762894e1d7SIlya Dryomov &obj_request->completion, 15772894e1d7SIlya Dryomov ceph_timeout_jiffies(timeout)); 15782894e1d7SIlya Dryomov if (ret <= 0) { 15792894e1d7SIlya Dryomov if (ret == 0) 15802894e1d7SIlya Dryomov ret = -ETIMEDOUT; 158171c20a06SIlya Dryomov rbd_obj_request_end(obj_request); 15822894e1d7SIlya Dryomov } else { 15832894e1d7SIlya Dryomov ret = 0; 15842894e1d7SIlya Dryomov } 15852894e1d7SIlya Dryomov 15862894e1d7SIlya Dryomov dout("%s %p ret %d\n", __func__, obj_request, (int)ret); 158771c20a06SIlya Dryomov return ret; 158871c20a06SIlya Dryomov } 158971c20a06SIlya Dryomov 15902894e1d7SIlya Dryomov static int rbd_obj_request_wait(struct rbd_obj_request *obj_request) 15912894e1d7SIlya Dryomov { 15922894e1d7SIlya Dryomov return __rbd_obj_request_wait(obj_request, 0); 15932894e1d7SIlya Dryomov } 15942894e1d7SIlya Dryomov 15952894e1d7SIlya Dryomov static int rbd_obj_request_wait_timeout(struct rbd_obj_request *obj_request, 15962894e1d7SIlya Dryomov unsigned long timeout) 15972894e1d7SIlya Dryomov { 15982894e1d7SIlya Dryomov return __rbd_obj_request_wait(obj_request, timeout); 159971c20a06SIlya Dryomov } 160071c20a06SIlya Dryomov 1601bf0d5f50SAlex Elder static void rbd_img_request_complete(struct rbd_img_request *img_request) 1602bf0d5f50SAlex Elder { 160355f27e09SAlex Elder 160437206ee5SAlex Elder dout("%s: img %p\n", __func__, img_request); 160555f27e09SAlex Elder 160655f27e09SAlex Elder /* 160755f27e09SAlex Elder * If no error occurred, compute the aggregate transfer 160855f27e09SAlex Elder * count for the image request. We could instead use 160955f27e09SAlex Elder * atomic64_cmpxchg() to update it as each object request 161055f27e09SAlex Elder * completes; not clear which way is better off hand. 161155f27e09SAlex Elder */ 161255f27e09SAlex Elder if (!img_request->result) { 161355f27e09SAlex Elder struct rbd_obj_request *obj_request; 161455f27e09SAlex Elder u64 xferred = 0; 161555f27e09SAlex Elder 161655f27e09SAlex Elder for_each_obj_request(img_request, obj_request) 161755f27e09SAlex Elder xferred += obj_request->xferred; 161855f27e09SAlex Elder img_request->xferred = xferred; 161955f27e09SAlex Elder } 162055f27e09SAlex Elder 1621bf0d5f50SAlex Elder if (img_request->callback) 1622bf0d5f50SAlex Elder img_request->callback(img_request); 1623bf0d5f50SAlex Elder else 1624bf0d5f50SAlex Elder rbd_img_request_put(img_request); 1625bf0d5f50SAlex Elder } 1626bf0d5f50SAlex Elder 16270c425248SAlex Elder /* 16280c425248SAlex Elder * The default/initial value for all image request flags is 0. Each 16290c425248SAlex Elder * is conditionally set to 1 at image request initialization time 16300c425248SAlex Elder * and currently never change thereafter. 16310c425248SAlex Elder */ 16320c425248SAlex Elder static void img_request_write_set(struct rbd_img_request *img_request) 16330c425248SAlex Elder { 16340c425248SAlex Elder set_bit(IMG_REQ_WRITE, &img_request->flags); 16350c425248SAlex Elder smp_mb(); 16360c425248SAlex Elder } 16370c425248SAlex Elder 16380c425248SAlex Elder static bool img_request_write_test(struct rbd_img_request *img_request) 16390c425248SAlex Elder { 16400c425248SAlex Elder smp_mb(); 16410c425248SAlex Elder return test_bit(IMG_REQ_WRITE, &img_request->flags) != 0; 16420c425248SAlex Elder } 16430c425248SAlex Elder 164490e98c52SGuangliang Zhao /* 164590e98c52SGuangliang Zhao * Set the discard flag when the img_request is an discard request 164690e98c52SGuangliang Zhao */ 164790e98c52SGuangliang Zhao static void img_request_discard_set(struct rbd_img_request *img_request) 164890e98c52SGuangliang Zhao { 164990e98c52SGuangliang Zhao set_bit(IMG_REQ_DISCARD, &img_request->flags); 165090e98c52SGuangliang Zhao smp_mb(); 165190e98c52SGuangliang Zhao } 165290e98c52SGuangliang Zhao 165390e98c52SGuangliang Zhao static bool img_request_discard_test(struct rbd_img_request *img_request) 165490e98c52SGuangliang Zhao { 165590e98c52SGuangliang Zhao smp_mb(); 165690e98c52SGuangliang Zhao return test_bit(IMG_REQ_DISCARD, &img_request->flags) != 0; 165790e98c52SGuangliang Zhao } 165890e98c52SGuangliang Zhao 16599849e986SAlex Elder static void img_request_child_set(struct rbd_img_request *img_request) 16609849e986SAlex Elder { 16619849e986SAlex Elder set_bit(IMG_REQ_CHILD, &img_request->flags); 16629849e986SAlex Elder smp_mb(); 16639849e986SAlex Elder } 16649849e986SAlex Elder 1665e93f3152SAlex Elder static void img_request_child_clear(struct rbd_img_request *img_request) 1666e93f3152SAlex Elder { 1667e93f3152SAlex Elder clear_bit(IMG_REQ_CHILD, &img_request->flags); 1668e93f3152SAlex Elder smp_mb(); 1669e93f3152SAlex Elder } 1670e93f3152SAlex Elder 16719849e986SAlex Elder static bool img_request_child_test(struct rbd_img_request *img_request) 16729849e986SAlex Elder { 16739849e986SAlex Elder smp_mb(); 16749849e986SAlex Elder return test_bit(IMG_REQ_CHILD, &img_request->flags) != 0; 16759849e986SAlex Elder } 16769849e986SAlex Elder 1677d0b2e944SAlex Elder static void img_request_layered_set(struct rbd_img_request *img_request) 1678d0b2e944SAlex Elder { 1679d0b2e944SAlex Elder set_bit(IMG_REQ_LAYERED, &img_request->flags); 1680d0b2e944SAlex Elder smp_mb(); 1681d0b2e944SAlex Elder } 1682d0b2e944SAlex Elder 1683a2acd00eSAlex Elder static void img_request_layered_clear(struct rbd_img_request *img_request) 1684a2acd00eSAlex Elder { 1685a2acd00eSAlex Elder clear_bit(IMG_REQ_LAYERED, &img_request->flags); 1686a2acd00eSAlex Elder smp_mb(); 1687a2acd00eSAlex Elder } 1688a2acd00eSAlex Elder 1689d0b2e944SAlex Elder static bool img_request_layered_test(struct rbd_img_request *img_request) 1690d0b2e944SAlex Elder { 1691d0b2e944SAlex Elder smp_mb(); 1692d0b2e944SAlex Elder return test_bit(IMG_REQ_LAYERED, &img_request->flags) != 0; 1693d0b2e944SAlex Elder } 1694d0b2e944SAlex Elder 16953b434a2aSJosh Durgin static enum obj_operation_type 16963b434a2aSJosh Durgin rbd_img_request_op_type(struct rbd_img_request *img_request) 16973b434a2aSJosh Durgin { 16983b434a2aSJosh Durgin if (img_request_write_test(img_request)) 16993b434a2aSJosh Durgin return OBJ_OP_WRITE; 17003b434a2aSJosh Durgin else if (img_request_discard_test(img_request)) 17013b434a2aSJosh Durgin return OBJ_OP_DISCARD; 17023b434a2aSJosh Durgin else 17033b434a2aSJosh Durgin return OBJ_OP_READ; 17043b434a2aSJosh Durgin } 17053b434a2aSJosh Durgin 17066e2a4505SAlex Elder static void 17076e2a4505SAlex Elder rbd_img_obj_request_read_callback(struct rbd_obj_request *obj_request) 17086e2a4505SAlex Elder { 1709b9434c5bSAlex Elder u64 xferred = obj_request->xferred; 1710b9434c5bSAlex Elder u64 length = obj_request->length; 1711b9434c5bSAlex Elder 17126e2a4505SAlex Elder dout("%s: obj %p img %p result %d %llu/%llu\n", __func__, 17136e2a4505SAlex Elder obj_request, obj_request->img_request, obj_request->result, 1714b9434c5bSAlex Elder xferred, length); 17156e2a4505SAlex Elder /* 171617c1cc1dSJosh Durgin * ENOENT means a hole in the image. We zero-fill the entire 171717c1cc1dSJosh Durgin * length of the request. A short read also implies zero-fill 171817c1cc1dSJosh Durgin * to the end of the request. An error requires the whole 171917c1cc1dSJosh Durgin * length of the request to be reported finished with an error 172017c1cc1dSJosh Durgin * to the block layer. In each case we update the xferred 172117c1cc1dSJosh Durgin * count to indicate the whole request was satisfied. 17226e2a4505SAlex Elder */ 1723b9434c5bSAlex Elder rbd_assert(obj_request->type != OBJ_REQUEST_NODATA); 17246e2a4505SAlex Elder if (obj_request->result == -ENOENT) { 1725b9434c5bSAlex Elder if (obj_request->type == OBJ_REQUEST_BIO) 17266e2a4505SAlex Elder zero_bio_chain(obj_request->bio_list, 0); 1727b9434c5bSAlex Elder else 1728b9434c5bSAlex Elder zero_pages(obj_request->pages, 0, length); 17296e2a4505SAlex Elder obj_request->result = 0; 1730b9434c5bSAlex Elder } else if (xferred < length && !obj_request->result) { 1731b9434c5bSAlex Elder if (obj_request->type == OBJ_REQUEST_BIO) 1732b9434c5bSAlex Elder zero_bio_chain(obj_request->bio_list, xferred); 1733b9434c5bSAlex Elder else 1734b9434c5bSAlex Elder zero_pages(obj_request->pages, xferred, length); 17356e2a4505SAlex Elder } 173617c1cc1dSJosh Durgin obj_request->xferred = length; 17376e2a4505SAlex Elder obj_request_done_set(obj_request); 17386e2a4505SAlex Elder } 17396e2a4505SAlex Elder 1740bf0d5f50SAlex Elder static void rbd_obj_request_complete(struct rbd_obj_request *obj_request) 1741bf0d5f50SAlex Elder { 174237206ee5SAlex Elder dout("%s: obj %p cb %p\n", __func__, obj_request, 174337206ee5SAlex Elder obj_request->callback); 1744bf0d5f50SAlex Elder if (obj_request->callback) 1745bf0d5f50SAlex Elder obj_request->callback(obj_request); 1746788e2df3SAlex Elder else 1747788e2df3SAlex Elder complete_all(&obj_request->completion); 1748bf0d5f50SAlex Elder } 1749bf0d5f50SAlex Elder 1750c47f9371SAlex Elder static void rbd_osd_trivial_callback(struct rbd_obj_request *obj_request) 175139bf2c5dSAlex Elder { 175239bf2c5dSAlex Elder dout("%s: obj %p\n", __func__, obj_request); 175339bf2c5dSAlex Elder obj_request_done_set(obj_request); 175439bf2c5dSAlex Elder } 175539bf2c5dSAlex Elder 1756c47f9371SAlex Elder static void rbd_osd_read_callback(struct rbd_obj_request *obj_request) 1757bf0d5f50SAlex Elder { 175857acbaa7SAlex Elder struct rbd_img_request *img_request = NULL; 1759a9e8ba2cSAlex Elder struct rbd_device *rbd_dev = NULL; 176057acbaa7SAlex Elder bool layered = false; 176157acbaa7SAlex Elder 176257acbaa7SAlex Elder if (obj_request_img_data_test(obj_request)) { 176357acbaa7SAlex Elder img_request = obj_request->img_request; 176457acbaa7SAlex Elder layered = img_request && img_request_layered_test(img_request); 1765a9e8ba2cSAlex Elder rbd_dev = img_request->rbd_dev; 176657acbaa7SAlex Elder } 17678b3e1a56SAlex Elder 17688b3e1a56SAlex Elder dout("%s: obj %p img %p result %d %llu/%llu\n", __func__, 17698b3e1a56SAlex Elder obj_request, img_request, obj_request->result, 17708b3e1a56SAlex Elder obj_request->xferred, obj_request->length); 1771a9e8ba2cSAlex Elder if (layered && obj_request->result == -ENOENT && 1772a9e8ba2cSAlex Elder obj_request->img_offset < rbd_dev->parent_overlap) 17738b3e1a56SAlex Elder rbd_img_parent_read(obj_request); 17748b3e1a56SAlex Elder else if (img_request) 17756e2a4505SAlex Elder rbd_img_obj_request_read_callback(obj_request); 17766e2a4505SAlex Elder else 177707741308SAlex Elder obj_request_done_set(obj_request); 1778bf0d5f50SAlex Elder } 1779bf0d5f50SAlex Elder 1780c47f9371SAlex Elder static void rbd_osd_write_callback(struct rbd_obj_request *obj_request) 1781bf0d5f50SAlex Elder { 17821b83bef2SSage Weil dout("%s: obj %p result %d %llu\n", __func__, obj_request, 17831b83bef2SSage Weil obj_request->result, obj_request->length); 17841b83bef2SSage Weil /* 17858b3e1a56SAlex Elder * There is no such thing as a successful short write. Set 17868b3e1a56SAlex Elder * it to our originally-requested length. 17871b83bef2SSage Weil */ 17881b83bef2SSage Weil obj_request->xferred = obj_request->length; 178907741308SAlex Elder obj_request_done_set(obj_request); 1790bf0d5f50SAlex Elder } 1791bf0d5f50SAlex Elder 179290e98c52SGuangliang Zhao static void rbd_osd_discard_callback(struct rbd_obj_request *obj_request) 179390e98c52SGuangliang Zhao { 179490e98c52SGuangliang Zhao dout("%s: obj %p result %d %llu\n", __func__, obj_request, 179590e98c52SGuangliang Zhao obj_request->result, obj_request->length); 179690e98c52SGuangliang Zhao /* 179790e98c52SGuangliang Zhao * There is no such thing as a successful short discard. Set 179890e98c52SGuangliang Zhao * it to our originally-requested length. 179990e98c52SGuangliang Zhao */ 180090e98c52SGuangliang Zhao obj_request->xferred = obj_request->length; 1801d0265de7SJosh Durgin /* discarding a non-existent object is not a problem */ 1802d0265de7SJosh Durgin if (obj_request->result == -ENOENT) 1803d0265de7SJosh Durgin obj_request->result = 0; 180490e98c52SGuangliang Zhao obj_request_done_set(obj_request); 180590e98c52SGuangliang Zhao } 180690e98c52SGuangliang Zhao 1807fbfab539SAlex Elder /* 1808fbfab539SAlex Elder * For a simple stat call there's nothing to do. We'll do more if 1809fbfab539SAlex Elder * this is part of a write sequence for a layered image. 1810fbfab539SAlex Elder */ 1811c47f9371SAlex Elder static void rbd_osd_stat_callback(struct rbd_obj_request *obj_request) 1812fbfab539SAlex Elder { 181337206ee5SAlex Elder dout("%s: obj %p\n", __func__, obj_request); 1814fbfab539SAlex Elder obj_request_done_set(obj_request); 1815fbfab539SAlex Elder } 1816fbfab539SAlex Elder 1817bf0d5f50SAlex Elder static void rbd_osd_req_callback(struct ceph_osd_request *osd_req, 1818bf0d5f50SAlex Elder struct ceph_msg *msg) 1819bf0d5f50SAlex Elder { 1820bf0d5f50SAlex Elder struct rbd_obj_request *obj_request = osd_req->r_priv; 1821bf0d5f50SAlex Elder u16 opcode; 1822bf0d5f50SAlex Elder 182337206ee5SAlex Elder dout("%s: osd_req %p msg %p\n", __func__, osd_req, msg); 1824bf0d5f50SAlex Elder rbd_assert(osd_req == obj_request->osd_req); 182557acbaa7SAlex Elder if (obj_request_img_data_test(obj_request)) { 182657acbaa7SAlex Elder rbd_assert(obj_request->img_request); 182757acbaa7SAlex Elder rbd_assert(obj_request->which != BAD_WHICH); 182857acbaa7SAlex Elder } else { 182957acbaa7SAlex Elder rbd_assert(obj_request->which == BAD_WHICH); 183057acbaa7SAlex Elder } 1831bf0d5f50SAlex Elder 18321b83bef2SSage Weil if (osd_req->r_result < 0) 18331b83bef2SSage Weil obj_request->result = osd_req->r_result; 1834bf0d5f50SAlex Elder 18357cc69d42SIlya Dryomov rbd_assert(osd_req->r_num_ops <= CEPH_OSD_MAX_OP); 1836bf0d5f50SAlex Elder 1837c47f9371SAlex Elder /* 1838c47f9371SAlex Elder * We support a 64-bit length, but ultimately it has to be 18397ad18afaSChristoph Hellwig * passed to the block layer, which just supports a 32-bit 18407ad18afaSChristoph Hellwig * length field. 1841c47f9371SAlex Elder */ 18421b83bef2SSage Weil obj_request->xferred = osd_req->r_reply_op_len[0]; 1843c47f9371SAlex Elder rbd_assert(obj_request->xferred < (u64)UINT_MAX); 18440ccd5926SIlya Dryomov 184579528734SAlex Elder opcode = osd_req->r_ops[0].op; 1846bf0d5f50SAlex Elder switch (opcode) { 1847bf0d5f50SAlex Elder case CEPH_OSD_OP_READ: 1848c47f9371SAlex Elder rbd_osd_read_callback(obj_request); 1849bf0d5f50SAlex Elder break; 18500ccd5926SIlya Dryomov case CEPH_OSD_OP_SETALLOCHINT: 18510ccd5926SIlya Dryomov rbd_assert(osd_req->r_ops[1].op == CEPH_OSD_OP_WRITE); 18520ccd5926SIlya Dryomov /* fall through */ 1853bf0d5f50SAlex Elder case CEPH_OSD_OP_WRITE: 1854c47f9371SAlex Elder rbd_osd_write_callback(obj_request); 1855bf0d5f50SAlex Elder break; 1856fbfab539SAlex Elder case CEPH_OSD_OP_STAT: 1857c47f9371SAlex Elder rbd_osd_stat_callback(obj_request); 1858fbfab539SAlex Elder break; 185990e98c52SGuangliang Zhao case CEPH_OSD_OP_DELETE: 186090e98c52SGuangliang Zhao case CEPH_OSD_OP_TRUNCATE: 186190e98c52SGuangliang Zhao case CEPH_OSD_OP_ZERO: 186290e98c52SGuangliang Zhao rbd_osd_discard_callback(obj_request); 186390e98c52SGuangliang Zhao break; 186436be9a76SAlex Elder case CEPH_OSD_OP_CALL: 1865b8d70035SAlex Elder case CEPH_OSD_OP_NOTIFY_ACK: 18669969ebc5SAlex Elder case CEPH_OSD_OP_WATCH: 1867c47f9371SAlex Elder rbd_osd_trivial_callback(obj_request); 18689969ebc5SAlex Elder break; 1869bf0d5f50SAlex Elder default: 18709584d508SIlya Dryomov rbd_warn(NULL, "%s: unsupported op %hu", 1871bf0d5f50SAlex Elder obj_request->object_name, (unsigned short) opcode); 1872bf0d5f50SAlex Elder break; 1873bf0d5f50SAlex Elder } 1874bf0d5f50SAlex Elder 187507741308SAlex Elder if (obj_request_done_test(obj_request)) 1876bf0d5f50SAlex Elder rbd_obj_request_complete(obj_request); 1877bf0d5f50SAlex Elder } 1878bf0d5f50SAlex Elder 18799d4df01fSAlex Elder static void rbd_osd_req_format_read(struct rbd_obj_request *obj_request) 1880430c28c3SAlex Elder { 1881430c28c3SAlex Elder struct rbd_img_request *img_request = obj_request->img_request; 18828c042b0dSAlex Elder struct ceph_osd_request *osd_req = obj_request->osd_req; 18839d4df01fSAlex Elder u64 snap_id; 1884430c28c3SAlex Elder 18858c042b0dSAlex Elder rbd_assert(osd_req != NULL); 1886430c28c3SAlex Elder 18879d4df01fSAlex Elder snap_id = img_request ? img_request->snap_id : CEPH_NOSNAP; 18888c042b0dSAlex Elder ceph_osdc_build_request(osd_req, obj_request->offset, 18899d4df01fSAlex Elder NULL, snap_id, NULL); 18909d4df01fSAlex Elder } 18919d4df01fSAlex Elder 18929d4df01fSAlex Elder static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request) 18939d4df01fSAlex Elder { 18949d4df01fSAlex Elder struct rbd_img_request *img_request = obj_request->img_request; 18959d4df01fSAlex Elder struct ceph_osd_request *osd_req = obj_request->osd_req; 18969d4df01fSAlex Elder struct ceph_snap_context *snapc; 18979d4df01fSAlex Elder struct timespec mtime = CURRENT_TIME; 18989d4df01fSAlex Elder 18999d4df01fSAlex Elder rbd_assert(osd_req != NULL); 19009d4df01fSAlex Elder 19019d4df01fSAlex Elder snapc = img_request ? img_request->snapc : NULL; 19029d4df01fSAlex Elder ceph_osdc_build_request(osd_req, obj_request->offset, 19039d4df01fSAlex Elder snapc, CEPH_NOSNAP, &mtime); 1904430c28c3SAlex Elder } 1905430c28c3SAlex Elder 19060ccd5926SIlya Dryomov /* 19070ccd5926SIlya Dryomov * Create an osd request. A read request has one osd op (read). 19080ccd5926SIlya Dryomov * A write request has either one (watch) or two (hint+write) osd ops. 19090ccd5926SIlya Dryomov * (All rbd data writes are prefixed with an allocation hint op, but 19100ccd5926SIlya Dryomov * technically osd watch is a write request, hence this distinction.) 19110ccd5926SIlya Dryomov */ 1912bf0d5f50SAlex Elder static struct ceph_osd_request *rbd_osd_req_create( 1913bf0d5f50SAlex Elder struct rbd_device *rbd_dev, 19146d2940c8SGuangliang Zhao enum obj_operation_type op_type, 1915deb236b3SIlya Dryomov unsigned int num_ops, 1916430c28c3SAlex Elder struct rbd_obj_request *obj_request) 1917bf0d5f50SAlex Elder { 1918bf0d5f50SAlex Elder struct ceph_snap_context *snapc = NULL; 1919bf0d5f50SAlex Elder struct ceph_osd_client *osdc; 1920bf0d5f50SAlex Elder struct ceph_osd_request *osd_req; 1921bf0d5f50SAlex Elder 192290e98c52SGuangliang Zhao if (obj_request_img_data_test(obj_request) && 192390e98c52SGuangliang Zhao (op_type == OBJ_OP_DISCARD || op_type == OBJ_OP_WRITE)) { 19246365d33aSAlex Elder struct rbd_img_request *img_request = obj_request->img_request; 192590e98c52SGuangliang Zhao if (op_type == OBJ_OP_WRITE) { 19266d2940c8SGuangliang Zhao rbd_assert(img_request_write_test(img_request)); 192790e98c52SGuangliang Zhao } else { 192890e98c52SGuangliang Zhao rbd_assert(img_request_discard_test(img_request)); 192990e98c52SGuangliang Zhao } 1930bf0d5f50SAlex Elder snapc = img_request->snapc; 1931bf0d5f50SAlex Elder } 1932bf0d5f50SAlex Elder 19336d2940c8SGuangliang Zhao rbd_assert(num_ops == 1 || ((op_type == OBJ_OP_WRITE) && num_ops == 2)); 1934deb236b3SIlya Dryomov 1935deb236b3SIlya Dryomov /* Allocate and initialize the request, for the num_ops ops */ 1936bf0d5f50SAlex Elder 1937bf0d5f50SAlex Elder osdc = &rbd_dev->rbd_client->client->osdc; 1938deb236b3SIlya Dryomov osd_req = ceph_osdc_alloc_request(osdc, snapc, num_ops, false, 1939deb236b3SIlya Dryomov GFP_ATOMIC); 1940bf0d5f50SAlex Elder if (!osd_req) 1941bf0d5f50SAlex Elder return NULL; /* ENOMEM */ 1942bf0d5f50SAlex Elder 194390e98c52SGuangliang Zhao if (op_type == OBJ_OP_WRITE || op_type == OBJ_OP_DISCARD) 1944bf0d5f50SAlex Elder osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK; 1945430c28c3SAlex Elder else 1946bf0d5f50SAlex Elder osd_req->r_flags = CEPH_OSD_FLAG_READ; 1947bf0d5f50SAlex Elder 1948bf0d5f50SAlex Elder osd_req->r_callback = rbd_osd_req_callback; 1949bf0d5f50SAlex Elder osd_req->r_priv = obj_request; 1950bf0d5f50SAlex Elder 19513c972c95SIlya Dryomov osd_req->r_base_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout); 19523c972c95SIlya Dryomov ceph_oid_set_name(&osd_req->r_base_oid, obj_request->object_name); 1953bf0d5f50SAlex Elder 1954bf0d5f50SAlex Elder return osd_req; 1955bf0d5f50SAlex Elder } 1956bf0d5f50SAlex Elder 19570eefd470SAlex Elder /* 1958d3246fb0SJosh Durgin * Create a copyup osd request based on the information in the object 1959d3246fb0SJosh Durgin * request supplied. A copyup request has two or three osd ops, a 1960d3246fb0SJosh Durgin * copyup method call, potentially a hint op, and a write or truncate 1961d3246fb0SJosh Durgin * or zero op. 19620eefd470SAlex Elder */ 19630eefd470SAlex Elder static struct ceph_osd_request * 19640eefd470SAlex Elder rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request) 19650eefd470SAlex Elder { 19660eefd470SAlex Elder struct rbd_img_request *img_request; 19670eefd470SAlex Elder struct ceph_snap_context *snapc; 19680eefd470SAlex Elder struct rbd_device *rbd_dev; 19690eefd470SAlex Elder struct ceph_osd_client *osdc; 19700eefd470SAlex Elder struct ceph_osd_request *osd_req; 1971d3246fb0SJosh Durgin int num_osd_ops = 3; 19720eefd470SAlex Elder 19730eefd470SAlex Elder rbd_assert(obj_request_img_data_test(obj_request)); 19740eefd470SAlex Elder img_request = obj_request->img_request; 19750eefd470SAlex Elder rbd_assert(img_request); 1976d3246fb0SJosh Durgin rbd_assert(img_request_write_test(img_request) || 1977d3246fb0SJosh Durgin img_request_discard_test(img_request)); 19780eefd470SAlex Elder 1979d3246fb0SJosh Durgin if (img_request_discard_test(img_request)) 1980d3246fb0SJosh Durgin num_osd_ops = 2; 1981d3246fb0SJosh Durgin 1982d3246fb0SJosh Durgin /* Allocate and initialize the request, for all the ops */ 19830eefd470SAlex Elder 19840eefd470SAlex Elder snapc = img_request->snapc; 19850eefd470SAlex Elder rbd_dev = img_request->rbd_dev; 19860eefd470SAlex Elder osdc = &rbd_dev->rbd_client->client->osdc; 1987d3246fb0SJosh Durgin osd_req = ceph_osdc_alloc_request(osdc, snapc, num_osd_ops, 1988d3246fb0SJosh Durgin false, GFP_ATOMIC); 19890eefd470SAlex Elder if (!osd_req) 19900eefd470SAlex Elder return NULL; /* ENOMEM */ 19910eefd470SAlex Elder 19920eefd470SAlex Elder osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK; 19930eefd470SAlex Elder osd_req->r_callback = rbd_osd_req_callback; 19940eefd470SAlex Elder osd_req->r_priv = obj_request; 19950eefd470SAlex Elder 19963c972c95SIlya Dryomov osd_req->r_base_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout); 19973c972c95SIlya Dryomov ceph_oid_set_name(&osd_req->r_base_oid, obj_request->object_name); 19980eefd470SAlex Elder 19990eefd470SAlex Elder return osd_req; 20000eefd470SAlex Elder } 20010eefd470SAlex Elder 20020eefd470SAlex Elder 2003bf0d5f50SAlex Elder static void rbd_osd_req_destroy(struct ceph_osd_request *osd_req) 2004bf0d5f50SAlex Elder { 2005bf0d5f50SAlex Elder ceph_osdc_put_request(osd_req); 2006bf0d5f50SAlex Elder } 2007bf0d5f50SAlex Elder 2008bf0d5f50SAlex Elder /* object_name is assumed to be a non-null pointer and NUL-terminated */ 2009bf0d5f50SAlex Elder 2010bf0d5f50SAlex Elder static struct rbd_obj_request *rbd_obj_request_create(const char *object_name, 2011bf0d5f50SAlex Elder u64 offset, u64 length, 2012bf0d5f50SAlex Elder enum obj_request_type type) 2013bf0d5f50SAlex Elder { 2014bf0d5f50SAlex Elder struct rbd_obj_request *obj_request; 2015bf0d5f50SAlex Elder size_t size; 2016bf0d5f50SAlex Elder char *name; 2017bf0d5f50SAlex Elder 2018bf0d5f50SAlex Elder rbd_assert(obj_request_type_valid(type)); 2019bf0d5f50SAlex Elder 2020bf0d5f50SAlex Elder size = strlen(object_name) + 1; 2021f907ad55SAlex Elder name = kmalloc(size, GFP_KERNEL); 2022f907ad55SAlex Elder if (!name) 2023bf0d5f50SAlex Elder return NULL; 2024bf0d5f50SAlex Elder 2025868311b1SAlex Elder obj_request = kmem_cache_zalloc(rbd_obj_request_cache, GFP_KERNEL); 2026f907ad55SAlex Elder if (!obj_request) { 2027f907ad55SAlex Elder kfree(name); 2028f907ad55SAlex Elder return NULL; 2029f907ad55SAlex Elder } 2030f907ad55SAlex Elder 2031bf0d5f50SAlex Elder obj_request->object_name = memcpy(name, object_name, size); 2032bf0d5f50SAlex Elder obj_request->offset = offset; 2033bf0d5f50SAlex Elder obj_request->length = length; 2034926f9b3fSAlex Elder obj_request->flags = 0; 2035bf0d5f50SAlex Elder obj_request->which = BAD_WHICH; 2036bf0d5f50SAlex Elder obj_request->type = type; 2037bf0d5f50SAlex Elder INIT_LIST_HEAD(&obj_request->links); 2038788e2df3SAlex Elder init_completion(&obj_request->completion); 2039bf0d5f50SAlex Elder kref_init(&obj_request->kref); 2040bf0d5f50SAlex Elder 204137206ee5SAlex Elder dout("%s: \"%s\" %llu/%llu %d -> obj %p\n", __func__, object_name, 204237206ee5SAlex Elder offset, length, (int)type, obj_request); 204337206ee5SAlex Elder 2044bf0d5f50SAlex Elder return obj_request; 2045bf0d5f50SAlex Elder } 2046bf0d5f50SAlex Elder 2047bf0d5f50SAlex Elder static void rbd_obj_request_destroy(struct kref *kref) 2048bf0d5f50SAlex Elder { 2049bf0d5f50SAlex Elder struct rbd_obj_request *obj_request; 2050bf0d5f50SAlex Elder 2051bf0d5f50SAlex Elder obj_request = container_of(kref, struct rbd_obj_request, kref); 2052bf0d5f50SAlex Elder 205337206ee5SAlex Elder dout("%s: obj %p\n", __func__, obj_request); 205437206ee5SAlex Elder 2055bf0d5f50SAlex Elder rbd_assert(obj_request->img_request == NULL); 2056bf0d5f50SAlex Elder rbd_assert(obj_request->which == BAD_WHICH); 2057bf0d5f50SAlex Elder 2058bf0d5f50SAlex Elder if (obj_request->osd_req) 2059bf0d5f50SAlex Elder rbd_osd_req_destroy(obj_request->osd_req); 2060bf0d5f50SAlex Elder 2061bf0d5f50SAlex Elder rbd_assert(obj_request_type_valid(obj_request->type)); 2062bf0d5f50SAlex Elder switch (obj_request->type) { 20639969ebc5SAlex Elder case OBJ_REQUEST_NODATA: 20649969ebc5SAlex Elder break; /* Nothing to do */ 2065bf0d5f50SAlex Elder case OBJ_REQUEST_BIO: 2066bf0d5f50SAlex Elder if (obj_request->bio_list) 2067bf0d5f50SAlex Elder bio_chain_put(obj_request->bio_list); 2068bf0d5f50SAlex Elder break; 2069788e2df3SAlex Elder case OBJ_REQUEST_PAGES: 2070788e2df3SAlex Elder if (obj_request->pages) 2071788e2df3SAlex Elder ceph_release_page_vector(obj_request->pages, 2072788e2df3SAlex Elder obj_request->page_count); 2073788e2df3SAlex Elder break; 2074bf0d5f50SAlex Elder } 2075bf0d5f50SAlex Elder 2076f907ad55SAlex Elder kfree(obj_request->object_name); 2077868311b1SAlex Elder obj_request->object_name = NULL; 2078868311b1SAlex Elder kmem_cache_free(rbd_obj_request_cache, obj_request); 2079bf0d5f50SAlex Elder } 2080bf0d5f50SAlex Elder 2081fb65d228SAlex Elder /* It's OK to call this for a device with no parent */ 2082fb65d228SAlex Elder 2083fb65d228SAlex Elder static void rbd_spec_put(struct rbd_spec *spec); 2084fb65d228SAlex Elder static void rbd_dev_unparent(struct rbd_device *rbd_dev) 2085fb65d228SAlex Elder { 2086fb65d228SAlex Elder rbd_dev_remove_parent(rbd_dev); 2087fb65d228SAlex Elder rbd_spec_put(rbd_dev->parent_spec); 2088fb65d228SAlex Elder rbd_dev->parent_spec = NULL; 2089fb65d228SAlex Elder rbd_dev->parent_overlap = 0; 2090fb65d228SAlex Elder } 2091fb65d228SAlex Elder 2092bf0d5f50SAlex Elder /* 2093a2acd00eSAlex Elder * Parent image reference counting is used to determine when an 2094a2acd00eSAlex Elder * image's parent fields can be safely torn down--after there are no 2095a2acd00eSAlex Elder * more in-flight requests to the parent image. When the last 2096a2acd00eSAlex Elder * reference is dropped, cleaning them up is safe. 2097a2acd00eSAlex Elder */ 2098a2acd00eSAlex Elder static void rbd_dev_parent_put(struct rbd_device *rbd_dev) 2099a2acd00eSAlex Elder { 2100a2acd00eSAlex Elder int counter; 2101a2acd00eSAlex Elder 2102a2acd00eSAlex Elder if (!rbd_dev->parent_spec) 2103a2acd00eSAlex Elder return; 2104a2acd00eSAlex Elder 2105a2acd00eSAlex Elder counter = atomic_dec_return_safe(&rbd_dev->parent_ref); 2106a2acd00eSAlex Elder if (counter > 0) 2107a2acd00eSAlex Elder return; 2108a2acd00eSAlex Elder 2109a2acd00eSAlex Elder /* Last reference; clean up parent data structures */ 2110a2acd00eSAlex Elder 2111a2acd00eSAlex Elder if (!counter) 2112a2acd00eSAlex Elder rbd_dev_unparent(rbd_dev); 2113a2acd00eSAlex Elder else 21149584d508SIlya Dryomov rbd_warn(rbd_dev, "parent reference underflow"); 2115a2acd00eSAlex Elder } 2116a2acd00eSAlex Elder 2117a2acd00eSAlex Elder /* 2118a2acd00eSAlex Elder * If an image has a non-zero parent overlap, get a reference to its 2119a2acd00eSAlex Elder * parent. 2120a2acd00eSAlex Elder * 2121a2acd00eSAlex Elder * Returns true if the rbd device has a parent with a non-zero 2122a2acd00eSAlex Elder * overlap and a reference for it was successfully taken, or 2123a2acd00eSAlex Elder * false otherwise. 2124a2acd00eSAlex Elder */ 2125a2acd00eSAlex Elder static bool rbd_dev_parent_get(struct rbd_device *rbd_dev) 2126a2acd00eSAlex Elder { 2127ae43e9d0SIlya Dryomov int counter = 0; 2128a2acd00eSAlex Elder 2129a2acd00eSAlex Elder if (!rbd_dev->parent_spec) 2130a2acd00eSAlex Elder return false; 2131a2acd00eSAlex Elder 2132ae43e9d0SIlya Dryomov down_read(&rbd_dev->header_rwsem); 2133ae43e9d0SIlya Dryomov if (rbd_dev->parent_overlap) 2134a2acd00eSAlex Elder counter = atomic_inc_return_safe(&rbd_dev->parent_ref); 2135ae43e9d0SIlya Dryomov up_read(&rbd_dev->header_rwsem); 2136a2acd00eSAlex Elder 2137a2acd00eSAlex Elder if (counter < 0) 21389584d508SIlya Dryomov rbd_warn(rbd_dev, "parent reference overflow"); 2139a2acd00eSAlex Elder 2140ae43e9d0SIlya Dryomov return counter > 0; 2141a2acd00eSAlex Elder } 2142a2acd00eSAlex Elder 2143bf0d5f50SAlex Elder /* 2144bf0d5f50SAlex Elder * Caller is responsible for filling in the list of object requests 2145bf0d5f50SAlex Elder * that comprises the image request, and the Linux request pointer 2146bf0d5f50SAlex Elder * (if there is one). 2147bf0d5f50SAlex Elder */ 2148cc344fa1SAlex Elder static struct rbd_img_request *rbd_img_request_create( 2149cc344fa1SAlex Elder struct rbd_device *rbd_dev, 2150bf0d5f50SAlex Elder u64 offset, u64 length, 21516d2940c8SGuangliang Zhao enum obj_operation_type op_type, 21524e752f0aSJosh Durgin struct ceph_snap_context *snapc) 2153bf0d5f50SAlex Elder { 2154bf0d5f50SAlex Elder struct rbd_img_request *img_request; 2155bf0d5f50SAlex Elder 21567a716aacSIlya Dryomov img_request = kmem_cache_alloc(rbd_img_request_cache, GFP_NOIO); 2157bf0d5f50SAlex Elder if (!img_request) 2158bf0d5f50SAlex Elder return NULL; 2159bf0d5f50SAlex Elder 2160bf0d5f50SAlex Elder img_request->rq = NULL; 2161bf0d5f50SAlex Elder img_request->rbd_dev = rbd_dev; 2162bf0d5f50SAlex Elder img_request->offset = offset; 2163bf0d5f50SAlex Elder img_request->length = length; 21640c425248SAlex Elder img_request->flags = 0; 216590e98c52SGuangliang Zhao if (op_type == OBJ_OP_DISCARD) { 216690e98c52SGuangliang Zhao img_request_discard_set(img_request); 216790e98c52SGuangliang Zhao img_request->snapc = snapc; 216890e98c52SGuangliang Zhao } else if (op_type == OBJ_OP_WRITE) { 21690c425248SAlex Elder img_request_write_set(img_request); 21704e752f0aSJosh Durgin img_request->snapc = snapc; 21710c425248SAlex Elder } else { 2172bf0d5f50SAlex Elder img_request->snap_id = rbd_dev->spec->snap_id; 21730c425248SAlex Elder } 2174a2acd00eSAlex Elder if (rbd_dev_parent_get(rbd_dev)) 2175d0b2e944SAlex Elder img_request_layered_set(img_request); 2176bf0d5f50SAlex Elder spin_lock_init(&img_request->completion_lock); 2177bf0d5f50SAlex Elder img_request->next_completion = 0; 2178bf0d5f50SAlex Elder img_request->callback = NULL; 2179a5a337d4SAlex Elder img_request->result = 0; 2180bf0d5f50SAlex Elder img_request->obj_request_count = 0; 2181bf0d5f50SAlex Elder INIT_LIST_HEAD(&img_request->obj_requests); 2182bf0d5f50SAlex Elder kref_init(&img_request->kref); 2183bf0d5f50SAlex Elder 218437206ee5SAlex Elder dout("%s: rbd_dev %p %s %llu/%llu -> img %p\n", __func__, rbd_dev, 21856d2940c8SGuangliang Zhao obj_op_name(op_type), offset, length, img_request); 218637206ee5SAlex Elder 2187bf0d5f50SAlex Elder return img_request; 2188bf0d5f50SAlex Elder } 2189bf0d5f50SAlex Elder 2190bf0d5f50SAlex Elder static void rbd_img_request_destroy(struct kref *kref) 2191bf0d5f50SAlex Elder { 2192bf0d5f50SAlex Elder struct rbd_img_request *img_request; 2193bf0d5f50SAlex Elder struct rbd_obj_request *obj_request; 2194bf0d5f50SAlex Elder struct rbd_obj_request *next_obj_request; 2195bf0d5f50SAlex Elder 2196bf0d5f50SAlex Elder img_request = container_of(kref, struct rbd_img_request, kref); 2197bf0d5f50SAlex Elder 219837206ee5SAlex Elder dout("%s: img %p\n", __func__, img_request); 219937206ee5SAlex Elder 2200bf0d5f50SAlex Elder for_each_obj_request_safe(img_request, obj_request, next_obj_request) 2201bf0d5f50SAlex Elder rbd_img_obj_request_del(img_request, obj_request); 220225dcf954SAlex Elder rbd_assert(img_request->obj_request_count == 0); 2203bf0d5f50SAlex Elder 2204a2acd00eSAlex Elder if (img_request_layered_test(img_request)) { 2205a2acd00eSAlex Elder img_request_layered_clear(img_request); 2206a2acd00eSAlex Elder rbd_dev_parent_put(img_request->rbd_dev); 2207a2acd00eSAlex Elder } 2208a2acd00eSAlex Elder 2209bef95455SJosh Durgin if (img_request_write_test(img_request) || 2210bef95455SJosh Durgin img_request_discard_test(img_request)) 2211812164f8SAlex Elder ceph_put_snap_context(img_request->snapc); 2212bf0d5f50SAlex Elder 22131c2a9dfeSAlex Elder kmem_cache_free(rbd_img_request_cache, img_request); 2214bf0d5f50SAlex Elder } 2215bf0d5f50SAlex Elder 2216e93f3152SAlex Elder static struct rbd_img_request *rbd_parent_request_create( 2217e93f3152SAlex Elder struct rbd_obj_request *obj_request, 2218e93f3152SAlex Elder u64 img_offset, u64 length) 2219e93f3152SAlex Elder { 2220e93f3152SAlex Elder struct rbd_img_request *parent_request; 2221e93f3152SAlex Elder struct rbd_device *rbd_dev; 2222e93f3152SAlex Elder 2223e93f3152SAlex Elder rbd_assert(obj_request->img_request); 2224e93f3152SAlex Elder rbd_dev = obj_request->img_request->rbd_dev; 2225e93f3152SAlex Elder 22264e752f0aSJosh Durgin parent_request = rbd_img_request_create(rbd_dev->parent, img_offset, 22276d2940c8SGuangliang Zhao length, OBJ_OP_READ, NULL); 2228e93f3152SAlex Elder if (!parent_request) 2229e93f3152SAlex Elder return NULL; 2230e93f3152SAlex Elder 2231e93f3152SAlex Elder img_request_child_set(parent_request); 2232e93f3152SAlex Elder rbd_obj_request_get(obj_request); 2233e93f3152SAlex Elder parent_request->obj_request = obj_request; 2234e93f3152SAlex Elder 2235e93f3152SAlex Elder return parent_request; 2236e93f3152SAlex Elder } 2237e93f3152SAlex Elder 2238e93f3152SAlex Elder static void rbd_parent_request_destroy(struct kref *kref) 2239e93f3152SAlex Elder { 2240e93f3152SAlex Elder struct rbd_img_request *parent_request; 2241e93f3152SAlex Elder struct rbd_obj_request *orig_request; 2242e93f3152SAlex Elder 2243e93f3152SAlex Elder parent_request = container_of(kref, struct rbd_img_request, kref); 2244e93f3152SAlex Elder orig_request = parent_request->obj_request; 2245e93f3152SAlex Elder 2246e93f3152SAlex Elder parent_request->obj_request = NULL; 2247e93f3152SAlex Elder rbd_obj_request_put(orig_request); 2248e93f3152SAlex Elder img_request_child_clear(parent_request); 2249e93f3152SAlex Elder 2250e93f3152SAlex Elder rbd_img_request_destroy(kref); 2251e93f3152SAlex Elder } 2252e93f3152SAlex Elder 22531217857fSAlex Elder static bool rbd_img_obj_end_request(struct rbd_obj_request *obj_request) 22541217857fSAlex Elder { 22556365d33aSAlex Elder struct rbd_img_request *img_request; 22561217857fSAlex Elder unsigned int xferred; 22571217857fSAlex Elder int result; 22588b3e1a56SAlex Elder bool more; 22591217857fSAlex Elder 22606365d33aSAlex Elder rbd_assert(obj_request_img_data_test(obj_request)); 22616365d33aSAlex Elder img_request = obj_request->img_request; 22626365d33aSAlex Elder 22631217857fSAlex Elder rbd_assert(obj_request->xferred <= (u64)UINT_MAX); 22641217857fSAlex Elder xferred = (unsigned int)obj_request->xferred; 22651217857fSAlex Elder result = obj_request->result; 22661217857fSAlex Elder if (result) { 22671217857fSAlex Elder struct rbd_device *rbd_dev = img_request->rbd_dev; 22686d2940c8SGuangliang Zhao enum obj_operation_type op_type; 22696d2940c8SGuangliang Zhao 227090e98c52SGuangliang Zhao if (img_request_discard_test(img_request)) 227190e98c52SGuangliang Zhao op_type = OBJ_OP_DISCARD; 227290e98c52SGuangliang Zhao else if (img_request_write_test(img_request)) 227390e98c52SGuangliang Zhao op_type = OBJ_OP_WRITE; 227490e98c52SGuangliang Zhao else 227590e98c52SGuangliang Zhao op_type = OBJ_OP_READ; 22761217857fSAlex Elder 22779584d508SIlya Dryomov rbd_warn(rbd_dev, "%s %llx at %llx (%llx)", 22786d2940c8SGuangliang Zhao obj_op_name(op_type), obj_request->length, 22796d2940c8SGuangliang Zhao obj_request->img_offset, obj_request->offset); 22809584d508SIlya Dryomov rbd_warn(rbd_dev, " result %d xferred %x", 22811217857fSAlex Elder result, xferred); 22821217857fSAlex Elder if (!img_request->result) 22831217857fSAlex Elder img_request->result = result; 2284082a75daSIlya Dryomov /* 2285082a75daSIlya Dryomov * Need to end I/O on the entire obj_request worth of 2286082a75daSIlya Dryomov * bytes in case of error. 2287082a75daSIlya Dryomov */ 2288082a75daSIlya Dryomov xferred = obj_request->length; 22891217857fSAlex Elder } 22901217857fSAlex Elder 2291f1a4739fSAlex Elder /* Image object requests don't own their page array */ 2292f1a4739fSAlex Elder 2293f1a4739fSAlex Elder if (obj_request->type == OBJ_REQUEST_PAGES) { 2294f1a4739fSAlex Elder obj_request->pages = NULL; 2295f1a4739fSAlex Elder obj_request->page_count = 0; 2296f1a4739fSAlex Elder } 2297f1a4739fSAlex Elder 22988b3e1a56SAlex Elder if (img_request_child_test(img_request)) { 22998b3e1a56SAlex Elder rbd_assert(img_request->obj_request != NULL); 23008b3e1a56SAlex Elder more = obj_request->which < img_request->obj_request_count - 1; 23018b3e1a56SAlex Elder } else { 23028b3e1a56SAlex Elder rbd_assert(img_request->rq != NULL); 23037ad18afaSChristoph Hellwig 23047ad18afaSChristoph Hellwig more = blk_update_request(img_request->rq, result, xferred); 23057ad18afaSChristoph Hellwig if (!more) 23067ad18afaSChristoph Hellwig __blk_mq_end_request(img_request->rq, result); 23078b3e1a56SAlex Elder } 23088b3e1a56SAlex Elder 23098b3e1a56SAlex Elder return more; 23101217857fSAlex Elder } 23111217857fSAlex Elder 23122169238dSAlex Elder static void rbd_img_obj_callback(struct rbd_obj_request *obj_request) 23132169238dSAlex Elder { 23142169238dSAlex Elder struct rbd_img_request *img_request; 23152169238dSAlex Elder u32 which = obj_request->which; 23162169238dSAlex Elder bool more = true; 23172169238dSAlex Elder 23186365d33aSAlex Elder rbd_assert(obj_request_img_data_test(obj_request)); 23192169238dSAlex Elder img_request = obj_request->img_request; 23202169238dSAlex Elder 23212169238dSAlex Elder dout("%s: img %p obj %p\n", __func__, img_request, obj_request); 23222169238dSAlex Elder rbd_assert(img_request != NULL); 23232169238dSAlex Elder rbd_assert(img_request->obj_request_count > 0); 23242169238dSAlex Elder rbd_assert(which != BAD_WHICH); 23252169238dSAlex Elder rbd_assert(which < img_request->obj_request_count); 23262169238dSAlex Elder 23272169238dSAlex Elder spin_lock_irq(&img_request->completion_lock); 23282169238dSAlex Elder if (which != img_request->next_completion) 23292169238dSAlex Elder goto out; 23302169238dSAlex Elder 23312169238dSAlex Elder for_each_obj_request_from(img_request, obj_request) { 23322169238dSAlex Elder rbd_assert(more); 23332169238dSAlex Elder rbd_assert(which < img_request->obj_request_count); 23342169238dSAlex Elder 23352169238dSAlex Elder if (!obj_request_done_test(obj_request)) 23362169238dSAlex Elder break; 23371217857fSAlex Elder more = rbd_img_obj_end_request(obj_request); 23382169238dSAlex Elder which++; 23392169238dSAlex Elder } 23402169238dSAlex Elder 23412169238dSAlex Elder rbd_assert(more ^ (which == img_request->obj_request_count)); 23422169238dSAlex Elder img_request->next_completion = which; 23432169238dSAlex Elder out: 23442169238dSAlex Elder spin_unlock_irq(&img_request->completion_lock); 23450f2d5be7SAlex Elder rbd_img_request_put(img_request); 23462169238dSAlex Elder 23472169238dSAlex Elder if (!more) 23482169238dSAlex Elder rbd_img_request_complete(img_request); 23492169238dSAlex Elder } 23502169238dSAlex Elder 2351f1a4739fSAlex Elder /* 23523b434a2aSJosh Durgin * Add individual osd ops to the given ceph_osd_request and prepare 23533b434a2aSJosh Durgin * them for submission. num_ops is the current number of 23543b434a2aSJosh Durgin * osd operations already to the object request. 23553b434a2aSJosh Durgin */ 23563b434a2aSJosh Durgin static void rbd_img_obj_request_fill(struct rbd_obj_request *obj_request, 23573b434a2aSJosh Durgin struct ceph_osd_request *osd_request, 23583b434a2aSJosh Durgin enum obj_operation_type op_type, 23593b434a2aSJosh Durgin unsigned int num_ops) 23603b434a2aSJosh Durgin { 23613b434a2aSJosh Durgin struct rbd_img_request *img_request = obj_request->img_request; 23623b434a2aSJosh Durgin struct rbd_device *rbd_dev = img_request->rbd_dev; 23633b434a2aSJosh Durgin u64 object_size = rbd_obj_bytes(&rbd_dev->header); 23643b434a2aSJosh Durgin u64 offset = obj_request->offset; 23653b434a2aSJosh Durgin u64 length = obj_request->length; 23663b434a2aSJosh Durgin u64 img_end; 23673b434a2aSJosh Durgin u16 opcode; 23683b434a2aSJosh Durgin 23693b434a2aSJosh Durgin if (op_type == OBJ_OP_DISCARD) { 2370d3246fb0SJosh Durgin if (!offset && length == object_size && 2371d3246fb0SJosh Durgin (!img_request_layered_test(img_request) || 2372d3246fb0SJosh Durgin !obj_request_overlaps_parent(obj_request))) { 23733b434a2aSJosh Durgin opcode = CEPH_OSD_OP_DELETE; 23743b434a2aSJosh Durgin } else if ((offset + length == object_size)) { 23753b434a2aSJosh Durgin opcode = CEPH_OSD_OP_TRUNCATE; 23763b434a2aSJosh Durgin } else { 23773b434a2aSJosh Durgin down_read(&rbd_dev->header_rwsem); 23783b434a2aSJosh Durgin img_end = rbd_dev->header.image_size; 23793b434a2aSJosh Durgin up_read(&rbd_dev->header_rwsem); 23803b434a2aSJosh Durgin 23813b434a2aSJosh Durgin if (obj_request->img_offset + length == img_end) 23823b434a2aSJosh Durgin opcode = CEPH_OSD_OP_TRUNCATE; 23833b434a2aSJosh Durgin else 23843b434a2aSJosh Durgin opcode = CEPH_OSD_OP_ZERO; 23853b434a2aSJosh Durgin } 23863b434a2aSJosh Durgin } else if (op_type == OBJ_OP_WRITE) { 23873b434a2aSJosh Durgin opcode = CEPH_OSD_OP_WRITE; 23883b434a2aSJosh Durgin osd_req_op_alloc_hint_init(osd_request, num_ops, 23893b434a2aSJosh Durgin object_size, object_size); 23903b434a2aSJosh Durgin num_ops++; 23913b434a2aSJosh Durgin } else { 23923b434a2aSJosh Durgin opcode = CEPH_OSD_OP_READ; 23933b434a2aSJosh Durgin } 23943b434a2aSJosh Durgin 23957e868b6eSIlya Dryomov if (opcode == CEPH_OSD_OP_DELETE) 2396144cba14SYan, Zheng osd_req_op_init(osd_request, num_ops, opcode, 0); 23977e868b6eSIlya Dryomov else 23987e868b6eSIlya Dryomov osd_req_op_extent_init(osd_request, num_ops, opcode, 23997e868b6eSIlya Dryomov offset, length, 0, 0); 24007e868b6eSIlya Dryomov 24013b434a2aSJosh Durgin if (obj_request->type == OBJ_REQUEST_BIO) 24023b434a2aSJosh Durgin osd_req_op_extent_osd_data_bio(osd_request, num_ops, 24033b434a2aSJosh Durgin obj_request->bio_list, length); 24043b434a2aSJosh Durgin else if (obj_request->type == OBJ_REQUEST_PAGES) 24053b434a2aSJosh Durgin osd_req_op_extent_osd_data_pages(osd_request, num_ops, 24063b434a2aSJosh Durgin obj_request->pages, length, 24073b434a2aSJosh Durgin offset & ~PAGE_MASK, false, false); 24083b434a2aSJosh Durgin 24093b434a2aSJosh Durgin /* Discards are also writes */ 24103b434a2aSJosh Durgin if (op_type == OBJ_OP_WRITE || op_type == OBJ_OP_DISCARD) 24113b434a2aSJosh Durgin rbd_osd_req_format_write(obj_request); 24123b434a2aSJosh Durgin else 24133b434a2aSJosh Durgin rbd_osd_req_format_read(obj_request); 24143b434a2aSJosh Durgin } 24153b434a2aSJosh Durgin 24163b434a2aSJosh Durgin /* 2417f1a4739fSAlex Elder * Split up an image request into one or more object requests, each 2418f1a4739fSAlex Elder * to a different object. The "type" parameter indicates whether 2419f1a4739fSAlex Elder * "data_desc" is the pointer to the head of a list of bio 2420f1a4739fSAlex Elder * structures, or the base of a page array. In either case this 2421f1a4739fSAlex Elder * function assumes data_desc describes memory sufficient to hold 2422f1a4739fSAlex Elder * all data described by the image request. 2423f1a4739fSAlex Elder */ 2424f1a4739fSAlex Elder static int rbd_img_request_fill(struct rbd_img_request *img_request, 2425f1a4739fSAlex Elder enum obj_request_type type, 2426f1a4739fSAlex Elder void *data_desc) 2427bf0d5f50SAlex Elder { 2428bf0d5f50SAlex Elder struct rbd_device *rbd_dev = img_request->rbd_dev; 2429bf0d5f50SAlex Elder struct rbd_obj_request *obj_request = NULL; 2430bf0d5f50SAlex Elder struct rbd_obj_request *next_obj_request; 2431a158073cSJingoo Han struct bio *bio_list = NULL; 2432f1a4739fSAlex Elder unsigned int bio_offset = 0; 2433a158073cSJingoo Han struct page **pages = NULL; 24346d2940c8SGuangliang Zhao enum obj_operation_type op_type; 24357da22d29SAlex Elder u64 img_offset; 2436bf0d5f50SAlex Elder u64 resid; 2437bf0d5f50SAlex Elder 2438f1a4739fSAlex Elder dout("%s: img %p type %d data_desc %p\n", __func__, img_request, 2439f1a4739fSAlex Elder (int)type, data_desc); 244037206ee5SAlex Elder 24417da22d29SAlex Elder img_offset = img_request->offset; 2442bf0d5f50SAlex Elder resid = img_request->length; 24434dda41d3SAlex Elder rbd_assert(resid > 0); 24443b434a2aSJosh Durgin op_type = rbd_img_request_op_type(img_request); 2445f1a4739fSAlex Elder 2446f1a4739fSAlex Elder if (type == OBJ_REQUEST_BIO) { 2447f1a4739fSAlex Elder bio_list = data_desc; 24484f024f37SKent Overstreet rbd_assert(img_offset == 24494f024f37SKent Overstreet bio_list->bi_iter.bi_sector << SECTOR_SHIFT); 245090e98c52SGuangliang Zhao } else if (type == OBJ_REQUEST_PAGES) { 2451f1a4739fSAlex Elder pages = data_desc; 2452f1a4739fSAlex Elder } 2453f1a4739fSAlex Elder 2454bf0d5f50SAlex Elder while (resid) { 24552fa12320SAlex Elder struct ceph_osd_request *osd_req; 2456bf0d5f50SAlex Elder const char *object_name; 2457bf0d5f50SAlex Elder u64 offset; 2458bf0d5f50SAlex Elder u64 length; 2459bf0d5f50SAlex Elder 24607da22d29SAlex Elder object_name = rbd_segment_name(rbd_dev, img_offset); 2461bf0d5f50SAlex Elder if (!object_name) 2462bf0d5f50SAlex Elder goto out_unwind; 24637da22d29SAlex Elder offset = rbd_segment_offset(rbd_dev, img_offset); 24647da22d29SAlex Elder length = rbd_segment_length(rbd_dev, img_offset, resid); 2465bf0d5f50SAlex Elder obj_request = rbd_obj_request_create(object_name, 2466f1a4739fSAlex Elder offset, length, type); 246778c2a44aSAlex Elder /* object request has its own copy of the object name */ 246878c2a44aSAlex Elder rbd_segment_name_free(object_name); 2469bf0d5f50SAlex Elder if (!obj_request) 2470bf0d5f50SAlex Elder goto out_unwind; 247162054da6SIlya Dryomov 247203507db6SJosh Durgin /* 247303507db6SJosh Durgin * set obj_request->img_request before creating the 247403507db6SJosh Durgin * osd_request so that it gets the right snapc 247503507db6SJosh Durgin */ 247603507db6SJosh Durgin rbd_img_obj_request_add(img_request, obj_request); 2477bf0d5f50SAlex Elder 2478f1a4739fSAlex Elder if (type == OBJ_REQUEST_BIO) { 2479f1a4739fSAlex Elder unsigned int clone_size; 2480f1a4739fSAlex Elder 2481bf0d5f50SAlex Elder rbd_assert(length <= (u64)UINT_MAX); 2482bf0d5f50SAlex Elder clone_size = (unsigned int)length; 2483f1a4739fSAlex Elder obj_request->bio_list = 2484f1a4739fSAlex Elder bio_chain_clone_range(&bio_list, 2485f1a4739fSAlex Elder &bio_offset, 2486f1a4739fSAlex Elder clone_size, 2487bf0d5f50SAlex Elder GFP_ATOMIC); 2488bf0d5f50SAlex Elder if (!obj_request->bio_list) 248962054da6SIlya Dryomov goto out_unwind; 249090e98c52SGuangliang Zhao } else if (type == OBJ_REQUEST_PAGES) { 2491f1a4739fSAlex Elder unsigned int page_count; 2492f1a4739fSAlex Elder 2493f1a4739fSAlex Elder obj_request->pages = pages; 2494f1a4739fSAlex Elder page_count = (u32)calc_pages_for(offset, length); 2495f1a4739fSAlex Elder obj_request->page_count = page_count; 2496f1a4739fSAlex Elder if ((offset + length) & ~PAGE_MASK) 2497f1a4739fSAlex Elder page_count--; /* more on last page */ 2498f1a4739fSAlex Elder pages += page_count; 2499f1a4739fSAlex Elder } 2500bf0d5f50SAlex Elder 25016d2940c8SGuangliang Zhao osd_req = rbd_osd_req_create(rbd_dev, op_type, 25026d2940c8SGuangliang Zhao (op_type == OBJ_OP_WRITE) ? 2 : 1, 25032fa12320SAlex Elder obj_request); 25042fa12320SAlex Elder if (!osd_req) 250562054da6SIlya Dryomov goto out_unwind; 25063b434a2aSJosh Durgin 25072fa12320SAlex Elder obj_request->osd_req = osd_req; 25082169238dSAlex Elder obj_request->callback = rbd_img_obj_callback; 25097da22d29SAlex Elder obj_request->img_offset = img_offset; 2510bf0d5f50SAlex Elder 25113b434a2aSJosh Durgin rbd_img_obj_request_fill(obj_request, osd_req, op_type, 0); 25123b434a2aSJosh Durgin 25133b434a2aSJosh Durgin rbd_img_request_get(img_request); 25143b434a2aSJosh Durgin 25157da22d29SAlex Elder img_offset += length; 2516bf0d5f50SAlex Elder resid -= length; 2517bf0d5f50SAlex Elder } 2518bf0d5f50SAlex Elder 2519bf0d5f50SAlex Elder return 0; 2520bf0d5f50SAlex Elder 2521bf0d5f50SAlex Elder out_unwind: 2522bf0d5f50SAlex Elder for_each_obj_request_safe(img_request, obj_request, next_obj_request) 252342dd037cSIlya Dryomov rbd_img_obj_request_del(img_request, obj_request); 2524bf0d5f50SAlex Elder 2525bf0d5f50SAlex Elder return -ENOMEM; 2526bf0d5f50SAlex Elder } 2527bf0d5f50SAlex Elder 25283d7efd18SAlex Elder static void 25290eefd470SAlex Elder rbd_img_obj_copyup_callback(struct rbd_obj_request *obj_request) 25300eefd470SAlex Elder { 25310eefd470SAlex Elder struct rbd_img_request *img_request; 25320eefd470SAlex Elder struct rbd_device *rbd_dev; 2533ebda6408SAlex Elder struct page **pages; 25340eefd470SAlex Elder u32 page_count; 25350eefd470SAlex Elder 2536d3246fb0SJosh Durgin rbd_assert(obj_request->type == OBJ_REQUEST_BIO || 2537d3246fb0SJosh Durgin obj_request->type == OBJ_REQUEST_NODATA); 25380eefd470SAlex Elder rbd_assert(obj_request_img_data_test(obj_request)); 25390eefd470SAlex Elder img_request = obj_request->img_request; 25400eefd470SAlex Elder rbd_assert(img_request); 25410eefd470SAlex Elder 25420eefd470SAlex Elder rbd_dev = img_request->rbd_dev; 25430eefd470SAlex Elder rbd_assert(rbd_dev); 25440eefd470SAlex Elder 2545ebda6408SAlex Elder pages = obj_request->copyup_pages; 2546ebda6408SAlex Elder rbd_assert(pages != NULL); 25470eefd470SAlex Elder obj_request->copyup_pages = NULL; 2548ebda6408SAlex Elder page_count = obj_request->copyup_page_count; 2549ebda6408SAlex Elder rbd_assert(page_count); 2550ebda6408SAlex Elder obj_request->copyup_page_count = 0; 2551ebda6408SAlex Elder ceph_release_page_vector(pages, page_count); 25520eefd470SAlex Elder 25530eefd470SAlex Elder /* 25540eefd470SAlex Elder * We want the transfer count to reflect the size of the 25550eefd470SAlex Elder * original write request. There is no such thing as a 25560eefd470SAlex Elder * successful short write, so if the request was successful 25570eefd470SAlex Elder * we can just set it to the originally-requested length. 25580eefd470SAlex Elder */ 25590eefd470SAlex Elder if (!obj_request->result) 25600eefd470SAlex Elder obj_request->xferred = obj_request->length; 25610eefd470SAlex Elder 25620eefd470SAlex Elder /* Finish up with the normal image object callback */ 25630eefd470SAlex Elder 25640eefd470SAlex Elder rbd_img_obj_callback(obj_request); 25650eefd470SAlex Elder } 25660eefd470SAlex Elder 25670eefd470SAlex Elder static void 25683d7efd18SAlex Elder rbd_img_obj_parent_read_full_callback(struct rbd_img_request *img_request) 25693d7efd18SAlex Elder { 25703d7efd18SAlex Elder struct rbd_obj_request *orig_request; 25710eefd470SAlex Elder struct ceph_osd_request *osd_req; 25720eefd470SAlex Elder struct ceph_osd_client *osdc; 25730eefd470SAlex Elder struct rbd_device *rbd_dev; 25743d7efd18SAlex Elder struct page **pages; 2575d3246fb0SJosh Durgin enum obj_operation_type op_type; 2576ebda6408SAlex Elder u32 page_count; 2577bbea1c1aSAlex Elder int img_result; 2578ebda6408SAlex Elder u64 parent_length; 25793d7efd18SAlex Elder 25803d7efd18SAlex Elder rbd_assert(img_request_child_test(img_request)); 25813d7efd18SAlex Elder 25823d7efd18SAlex Elder /* First get what we need from the image request */ 25833d7efd18SAlex Elder 25843d7efd18SAlex Elder pages = img_request->copyup_pages; 25853d7efd18SAlex Elder rbd_assert(pages != NULL); 25863d7efd18SAlex Elder img_request->copyup_pages = NULL; 2587ebda6408SAlex Elder page_count = img_request->copyup_page_count; 2588ebda6408SAlex Elder rbd_assert(page_count); 2589ebda6408SAlex Elder img_request->copyup_page_count = 0; 25903d7efd18SAlex Elder 25913d7efd18SAlex Elder orig_request = img_request->obj_request; 25923d7efd18SAlex Elder rbd_assert(orig_request != NULL); 2593b91f09f1SAlex Elder rbd_assert(obj_request_type_valid(orig_request->type)); 2594bbea1c1aSAlex Elder img_result = img_request->result; 2595ebda6408SAlex Elder parent_length = img_request->length; 2596ebda6408SAlex Elder rbd_assert(parent_length == img_request->xferred); 25973d7efd18SAlex Elder rbd_img_request_put(img_request); 25983d7efd18SAlex Elder 259991c6febbSAlex Elder rbd_assert(orig_request->img_request); 260091c6febbSAlex Elder rbd_dev = orig_request->img_request->rbd_dev; 26013d7efd18SAlex Elder rbd_assert(rbd_dev); 26023d7efd18SAlex Elder 2603bbea1c1aSAlex Elder /* 2604bbea1c1aSAlex Elder * If the overlap has become 0 (most likely because the 2605bbea1c1aSAlex Elder * image has been flattened) we need to free the pages 2606bbea1c1aSAlex Elder * and re-submit the original write request. 2607bbea1c1aSAlex Elder */ 2608bbea1c1aSAlex Elder if (!rbd_dev->parent_overlap) { 2609bbea1c1aSAlex Elder struct ceph_osd_client *osdc; 2610bbea1c1aSAlex Elder 2611bbea1c1aSAlex Elder ceph_release_page_vector(pages, page_count); 2612bbea1c1aSAlex Elder osdc = &rbd_dev->rbd_client->client->osdc; 2613bbea1c1aSAlex Elder img_result = rbd_obj_request_submit(osdc, orig_request); 2614bbea1c1aSAlex Elder if (!img_result) 2615bbea1c1aSAlex Elder return; 2616bbea1c1aSAlex Elder } 2617bbea1c1aSAlex Elder 2618bbea1c1aSAlex Elder if (img_result) 26190eefd470SAlex Elder goto out_err; 26203d7efd18SAlex Elder 26218785b1d4SAlex Elder /* 26228785b1d4SAlex Elder * The original osd request is of no use to use any more. 26230ccd5926SIlya Dryomov * We need a new one that can hold the three ops in a copyup 26248785b1d4SAlex Elder * request. Allocate the new copyup osd request for the 26258785b1d4SAlex Elder * original request, and release the old one. 26268785b1d4SAlex Elder */ 2627bbea1c1aSAlex Elder img_result = -ENOMEM; 26280eefd470SAlex Elder osd_req = rbd_osd_req_create_copyup(orig_request); 26290eefd470SAlex Elder if (!osd_req) 26300eefd470SAlex Elder goto out_err; 26318785b1d4SAlex Elder rbd_osd_req_destroy(orig_request->osd_req); 26320eefd470SAlex Elder orig_request->osd_req = osd_req; 26330eefd470SAlex Elder orig_request->copyup_pages = pages; 2634ebda6408SAlex Elder orig_request->copyup_page_count = page_count; 26353d7efd18SAlex Elder 26360eefd470SAlex Elder /* Initialize the copyup op */ 26370eefd470SAlex Elder 26380eefd470SAlex Elder osd_req_op_cls_init(osd_req, 0, CEPH_OSD_OP_CALL, "rbd", "copyup"); 2639ebda6408SAlex Elder osd_req_op_cls_request_data_pages(osd_req, 0, pages, parent_length, 0, 26400eefd470SAlex Elder false, false); 26410eefd470SAlex Elder 2642d3246fb0SJosh Durgin /* Add the other op(s) */ 26430ccd5926SIlya Dryomov 2644d3246fb0SJosh Durgin op_type = rbd_img_request_op_type(orig_request->img_request); 2645d3246fb0SJosh Durgin rbd_img_obj_request_fill(orig_request, osd_req, op_type, 1); 26460eefd470SAlex Elder 26470eefd470SAlex Elder /* All set, send it off. */ 26480eefd470SAlex Elder 26490eefd470SAlex Elder orig_request->callback = rbd_img_obj_copyup_callback; 26500eefd470SAlex Elder osdc = &rbd_dev->rbd_client->client->osdc; 2651bbea1c1aSAlex Elder img_result = rbd_obj_request_submit(osdc, orig_request); 2652bbea1c1aSAlex Elder if (!img_result) 26530eefd470SAlex Elder return; 26540eefd470SAlex Elder out_err: 26550eefd470SAlex Elder /* Record the error code and complete the request */ 26560eefd470SAlex Elder 2657bbea1c1aSAlex Elder orig_request->result = img_result; 26580eefd470SAlex Elder orig_request->xferred = 0; 26593d7efd18SAlex Elder obj_request_done_set(orig_request); 26603d7efd18SAlex Elder rbd_obj_request_complete(orig_request); 26613d7efd18SAlex Elder } 26623d7efd18SAlex Elder 26633d7efd18SAlex Elder /* 26643d7efd18SAlex Elder * Read from the parent image the range of data that covers the 26653d7efd18SAlex Elder * entire target of the given object request. This is used for 26663d7efd18SAlex Elder * satisfying a layered image write request when the target of an 26673d7efd18SAlex Elder * object request from the image request does not exist. 26683d7efd18SAlex Elder * 26693d7efd18SAlex Elder * A page array big enough to hold the returned data is allocated 26703d7efd18SAlex Elder * and supplied to rbd_img_request_fill() as the "data descriptor." 26713d7efd18SAlex Elder * When the read completes, this page array will be transferred to 26723d7efd18SAlex Elder * the original object request for the copyup operation. 26733d7efd18SAlex Elder * 26743d7efd18SAlex Elder * If an error occurs, record it as the result of the original 26753d7efd18SAlex Elder * object request and mark it done so it gets completed. 26763d7efd18SAlex Elder */ 26773d7efd18SAlex Elder static int rbd_img_obj_parent_read_full(struct rbd_obj_request *obj_request) 26783d7efd18SAlex Elder { 26793d7efd18SAlex Elder struct rbd_img_request *img_request = NULL; 26803d7efd18SAlex Elder struct rbd_img_request *parent_request = NULL; 26813d7efd18SAlex Elder struct rbd_device *rbd_dev; 26823d7efd18SAlex Elder u64 img_offset; 26833d7efd18SAlex Elder u64 length; 26843d7efd18SAlex Elder struct page **pages = NULL; 26853d7efd18SAlex Elder u32 page_count; 26863d7efd18SAlex Elder int result; 26873d7efd18SAlex Elder 26883d7efd18SAlex Elder rbd_assert(obj_request_img_data_test(obj_request)); 2689b91f09f1SAlex Elder rbd_assert(obj_request_type_valid(obj_request->type)); 26903d7efd18SAlex Elder 26913d7efd18SAlex Elder img_request = obj_request->img_request; 26923d7efd18SAlex Elder rbd_assert(img_request != NULL); 26933d7efd18SAlex Elder rbd_dev = img_request->rbd_dev; 26943d7efd18SAlex Elder rbd_assert(rbd_dev->parent != NULL); 26953d7efd18SAlex Elder 26963d7efd18SAlex Elder /* 26973d7efd18SAlex Elder * Determine the byte range covered by the object in the 26983d7efd18SAlex Elder * child image to which the original request was to be sent. 26993d7efd18SAlex Elder */ 27003d7efd18SAlex Elder img_offset = obj_request->img_offset - obj_request->offset; 27013d7efd18SAlex Elder length = (u64)1 << rbd_dev->header.obj_order; 27023d7efd18SAlex Elder 27033d7efd18SAlex Elder /* 2704a9e8ba2cSAlex Elder * There is no defined parent data beyond the parent 2705a9e8ba2cSAlex Elder * overlap, so limit what we read at that boundary if 2706a9e8ba2cSAlex Elder * necessary. 2707a9e8ba2cSAlex Elder */ 2708a9e8ba2cSAlex Elder if (img_offset + length > rbd_dev->parent_overlap) { 2709a9e8ba2cSAlex Elder rbd_assert(img_offset < rbd_dev->parent_overlap); 2710a9e8ba2cSAlex Elder length = rbd_dev->parent_overlap - img_offset; 2711a9e8ba2cSAlex Elder } 2712a9e8ba2cSAlex Elder 2713a9e8ba2cSAlex Elder /* 27143d7efd18SAlex Elder * Allocate a page array big enough to receive the data read 27153d7efd18SAlex Elder * from the parent. 27163d7efd18SAlex Elder */ 27173d7efd18SAlex Elder page_count = (u32)calc_pages_for(0, length); 27183d7efd18SAlex Elder pages = ceph_alloc_page_vector(page_count, GFP_KERNEL); 27193d7efd18SAlex Elder if (IS_ERR(pages)) { 27203d7efd18SAlex Elder result = PTR_ERR(pages); 27213d7efd18SAlex Elder pages = NULL; 27223d7efd18SAlex Elder goto out_err; 27233d7efd18SAlex Elder } 27243d7efd18SAlex Elder 27253d7efd18SAlex Elder result = -ENOMEM; 2726e93f3152SAlex Elder parent_request = rbd_parent_request_create(obj_request, 2727e93f3152SAlex Elder img_offset, length); 27283d7efd18SAlex Elder if (!parent_request) 27293d7efd18SAlex Elder goto out_err; 27303d7efd18SAlex Elder 27313d7efd18SAlex Elder result = rbd_img_request_fill(parent_request, OBJ_REQUEST_PAGES, pages); 27323d7efd18SAlex Elder if (result) 27333d7efd18SAlex Elder goto out_err; 27343d7efd18SAlex Elder parent_request->copyup_pages = pages; 2735ebda6408SAlex Elder parent_request->copyup_page_count = page_count; 27363d7efd18SAlex Elder 27373d7efd18SAlex Elder parent_request->callback = rbd_img_obj_parent_read_full_callback; 27383d7efd18SAlex Elder result = rbd_img_request_submit(parent_request); 27393d7efd18SAlex Elder if (!result) 27403d7efd18SAlex Elder return 0; 27413d7efd18SAlex Elder 27423d7efd18SAlex Elder parent_request->copyup_pages = NULL; 2743ebda6408SAlex Elder parent_request->copyup_page_count = 0; 27443d7efd18SAlex Elder parent_request->obj_request = NULL; 27453d7efd18SAlex Elder rbd_obj_request_put(obj_request); 27463d7efd18SAlex Elder out_err: 27473d7efd18SAlex Elder if (pages) 27483d7efd18SAlex Elder ceph_release_page_vector(pages, page_count); 27493d7efd18SAlex Elder if (parent_request) 27503d7efd18SAlex Elder rbd_img_request_put(parent_request); 27513d7efd18SAlex Elder obj_request->result = result; 27523d7efd18SAlex Elder obj_request->xferred = 0; 27533d7efd18SAlex Elder obj_request_done_set(obj_request); 27543d7efd18SAlex Elder 27553d7efd18SAlex Elder return result; 27563d7efd18SAlex Elder } 27573d7efd18SAlex Elder 2758c5b5ef6cSAlex Elder static void rbd_img_obj_exists_callback(struct rbd_obj_request *obj_request) 2759c5b5ef6cSAlex Elder { 2760c5b5ef6cSAlex Elder struct rbd_obj_request *orig_request; 2761638f5abeSAlex Elder struct rbd_device *rbd_dev; 2762c5b5ef6cSAlex Elder int result; 2763c5b5ef6cSAlex Elder 2764c5b5ef6cSAlex Elder rbd_assert(!obj_request_img_data_test(obj_request)); 2765c5b5ef6cSAlex Elder 2766c5b5ef6cSAlex Elder /* 2767c5b5ef6cSAlex Elder * All we need from the object request is the original 2768c5b5ef6cSAlex Elder * request and the result of the STAT op. Grab those, then 2769c5b5ef6cSAlex Elder * we're done with the request. 2770c5b5ef6cSAlex Elder */ 2771c5b5ef6cSAlex Elder orig_request = obj_request->obj_request; 2772c5b5ef6cSAlex Elder obj_request->obj_request = NULL; 2773912c317dSAlex Elder rbd_obj_request_put(orig_request); 2774c5b5ef6cSAlex Elder rbd_assert(orig_request); 2775c5b5ef6cSAlex Elder rbd_assert(orig_request->img_request); 2776c5b5ef6cSAlex Elder 2777c5b5ef6cSAlex Elder result = obj_request->result; 2778c5b5ef6cSAlex Elder obj_request->result = 0; 2779c5b5ef6cSAlex Elder 2780c5b5ef6cSAlex Elder dout("%s: obj %p for obj %p result %d %llu/%llu\n", __func__, 2781c5b5ef6cSAlex Elder obj_request, orig_request, result, 2782c5b5ef6cSAlex Elder obj_request->xferred, obj_request->length); 2783c5b5ef6cSAlex Elder rbd_obj_request_put(obj_request); 2784c5b5ef6cSAlex Elder 2785638f5abeSAlex Elder /* 2786638f5abeSAlex Elder * If the overlap has become 0 (most likely because the 2787638f5abeSAlex Elder * image has been flattened) we need to free the pages 2788638f5abeSAlex Elder * and re-submit the original write request. 2789638f5abeSAlex Elder */ 2790638f5abeSAlex Elder rbd_dev = orig_request->img_request->rbd_dev; 2791638f5abeSAlex Elder if (!rbd_dev->parent_overlap) { 2792638f5abeSAlex Elder struct ceph_osd_client *osdc; 2793638f5abeSAlex Elder 2794638f5abeSAlex Elder osdc = &rbd_dev->rbd_client->client->osdc; 2795638f5abeSAlex Elder result = rbd_obj_request_submit(osdc, orig_request); 2796638f5abeSAlex Elder if (!result) 2797638f5abeSAlex Elder return; 2798638f5abeSAlex Elder } 2799c5b5ef6cSAlex Elder 2800c5b5ef6cSAlex Elder /* 2801c5b5ef6cSAlex Elder * Our only purpose here is to determine whether the object 2802c5b5ef6cSAlex Elder * exists, and we don't want to treat the non-existence as 2803c5b5ef6cSAlex Elder * an error. If something else comes back, transfer the 2804c5b5ef6cSAlex Elder * error to the original request and complete it now. 2805c5b5ef6cSAlex Elder */ 2806c5b5ef6cSAlex Elder if (!result) { 2807c5b5ef6cSAlex Elder obj_request_existence_set(orig_request, true); 2808c5b5ef6cSAlex Elder } else if (result == -ENOENT) { 2809c5b5ef6cSAlex Elder obj_request_existence_set(orig_request, false); 2810c5b5ef6cSAlex Elder } else if (result) { 2811c5b5ef6cSAlex Elder orig_request->result = result; 28123d7efd18SAlex Elder goto out; 2813c5b5ef6cSAlex Elder } 2814c5b5ef6cSAlex Elder 2815c5b5ef6cSAlex Elder /* 2816c5b5ef6cSAlex Elder * Resubmit the original request now that we have recorded 2817c5b5ef6cSAlex Elder * whether the target object exists. 2818c5b5ef6cSAlex Elder */ 2819b454e36dSAlex Elder orig_request->result = rbd_img_obj_request_submit(orig_request); 28203d7efd18SAlex Elder out: 2821c5b5ef6cSAlex Elder if (orig_request->result) 2822c5b5ef6cSAlex Elder rbd_obj_request_complete(orig_request); 2823c5b5ef6cSAlex Elder } 2824c5b5ef6cSAlex Elder 2825c5b5ef6cSAlex Elder static int rbd_img_obj_exists_submit(struct rbd_obj_request *obj_request) 2826c5b5ef6cSAlex Elder { 2827c5b5ef6cSAlex Elder struct rbd_obj_request *stat_request; 2828c5b5ef6cSAlex Elder struct rbd_device *rbd_dev; 2829c5b5ef6cSAlex Elder struct ceph_osd_client *osdc; 2830c5b5ef6cSAlex Elder struct page **pages = NULL; 2831c5b5ef6cSAlex Elder u32 page_count; 2832c5b5ef6cSAlex Elder size_t size; 2833c5b5ef6cSAlex Elder int ret; 2834c5b5ef6cSAlex Elder 2835c5b5ef6cSAlex Elder /* 2836c5b5ef6cSAlex Elder * The response data for a STAT call consists of: 2837c5b5ef6cSAlex Elder * le64 length; 2838c5b5ef6cSAlex Elder * struct { 2839c5b5ef6cSAlex Elder * le32 tv_sec; 2840c5b5ef6cSAlex Elder * le32 tv_nsec; 2841c5b5ef6cSAlex Elder * } mtime; 2842c5b5ef6cSAlex Elder */ 2843c5b5ef6cSAlex Elder size = sizeof (__le64) + sizeof (__le32) + sizeof (__le32); 2844c5b5ef6cSAlex Elder page_count = (u32)calc_pages_for(0, size); 2845c5b5ef6cSAlex Elder pages = ceph_alloc_page_vector(page_count, GFP_KERNEL); 2846c5b5ef6cSAlex Elder if (IS_ERR(pages)) 2847c5b5ef6cSAlex Elder return PTR_ERR(pages); 2848c5b5ef6cSAlex Elder 2849c5b5ef6cSAlex Elder ret = -ENOMEM; 2850c5b5ef6cSAlex Elder stat_request = rbd_obj_request_create(obj_request->object_name, 0, 0, 2851c5b5ef6cSAlex Elder OBJ_REQUEST_PAGES); 2852c5b5ef6cSAlex Elder if (!stat_request) 2853c5b5ef6cSAlex Elder goto out; 2854c5b5ef6cSAlex Elder 2855c5b5ef6cSAlex Elder rbd_obj_request_get(obj_request); 2856c5b5ef6cSAlex Elder stat_request->obj_request = obj_request; 2857c5b5ef6cSAlex Elder stat_request->pages = pages; 2858c5b5ef6cSAlex Elder stat_request->page_count = page_count; 2859c5b5ef6cSAlex Elder 2860c5b5ef6cSAlex Elder rbd_assert(obj_request->img_request); 2861c5b5ef6cSAlex Elder rbd_dev = obj_request->img_request->rbd_dev; 28626d2940c8SGuangliang Zhao stat_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_READ, 1, 2863c5b5ef6cSAlex Elder stat_request); 2864c5b5ef6cSAlex Elder if (!stat_request->osd_req) 2865c5b5ef6cSAlex Elder goto out; 2866c5b5ef6cSAlex Elder stat_request->callback = rbd_img_obj_exists_callback; 2867c5b5ef6cSAlex Elder 2868144cba14SYan, Zheng osd_req_op_init(stat_request->osd_req, 0, CEPH_OSD_OP_STAT, 0); 2869c5b5ef6cSAlex Elder osd_req_op_raw_data_in_pages(stat_request->osd_req, 0, pages, size, 0, 2870c5b5ef6cSAlex Elder false, false); 28719d4df01fSAlex Elder rbd_osd_req_format_read(stat_request); 2872c5b5ef6cSAlex Elder 2873c5b5ef6cSAlex Elder osdc = &rbd_dev->rbd_client->client->osdc; 2874c5b5ef6cSAlex Elder ret = rbd_obj_request_submit(osdc, stat_request); 2875c5b5ef6cSAlex Elder out: 2876c5b5ef6cSAlex Elder if (ret) 2877c5b5ef6cSAlex Elder rbd_obj_request_put(obj_request); 2878c5b5ef6cSAlex Elder 2879c5b5ef6cSAlex Elder return ret; 2880c5b5ef6cSAlex Elder } 2881c5b5ef6cSAlex Elder 288270d045f6SIlya Dryomov static bool img_obj_request_simple(struct rbd_obj_request *obj_request) 2883b454e36dSAlex Elder { 2884b454e36dSAlex Elder struct rbd_img_request *img_request; 2885a9e8ba2cSAlex Elder struct rbd_device *rbd_dev; 2886b454e36dSAlex Elder 2887b454e36dSAlex Elder rbd_assert(obj_request_img_data_test(obj_request)); 2888b454e36dSAlex Elder 2889b454e36dSAlex Elder img_request = obj_request->img_request; 2890b454e36dSAlex Elder rbd_assert(img_request); 2891a9e8ba2cSAlex Elder rbd_dev = img_request->rbd_dev; 2892b454e36dSAlex Elder 289370d045f6SIlya Dryomov /* Reads */ 28941c220881SJosh Durgin if (!img_request_write_test(img_request) && 28951c220881SJosh Durgin !img_request_discard_test(img_request)) 289670d045f6SIlya Dryomov return true; 2897b454e36dSAlex Elder 289870d045f6SIlya Dryomov /* Non-layered writes */ 289970d045f6SIlya Dryomov if (!img_request_layered_test(img_request)) 290070d045f6SIlya Dryomov return true; 290170d045f6SIlya Dryomov 290270d045f6SIlya Dryomov /* 290370d045f6SIlya Dryomov * Layered writes outside of the parent overlap range don't 290470d045f6SIlya Dryomov * share any data with the parent. 290570d045f6SIlya Dryomov */ 290670d045f6SIlya Dryomov if (!obj_request_overlaps_parent(obj_request)) 290770d045f6SIlya Dryomov return true; 290870d045f6SIlya Dryomov 290970d045f6SIlya Dryomov /* 2910c622d226SGuangliang Zhao * Entire-object layered writes - we will overwrite whatever 2911c622d226SGuangliang Zhao * parent data there is anyway. 2912c622d226SGuangliang Zhao */ 2913c622d226SGuangliang Zhao if (!obj_request->offset && 2914c622d226SGuangliang Zhao obj_request->length == rbd_obj_bytes(&rbd_dev->header)) 2915c622d226SGuangliang Zhao return true; 2916c622d226SGuangliang Zhao 2917c622d226SGuangliang Zhao /* 291870d045f6SIlya Dryomov * If the object is known to already exist, its parent data has 291970d045f6SIlya Dryomov * already been copied. 292070d045f6SIlya Dryomov */ 292170d045f6SIlya Dryomov if (obj_request_known_test(obj_request) && 292270d045f6SIlya Dryomov obj_request_exists_test(obj_request)) 292370d045f6SIlya Dryomov return true; 292470d045f6SIlya Dryomov 292570d045f6SIlya Dryomov return false; 292670d045f6SIlya Dryomov } 292770d045f6SIlya Dryomov 292870d045f6SIlya Dryomov static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request) 292970d045f6SIlya Dryomov { 293070d045f6SIlya Dryomov if (img_obj_request_simple(obj_request)) { 2931b454e36dSAlex Elder struct rbd_device *rbd_dev; 2932b454e36dSAlex Elder struct ceph_osd_client *osdc; 2933b454e36dSAlex Elder 2934b454e36dSAlex Elder rbd_dev = obj_request->img_request->rbd_dev; 2935b454e36dSAlex Elder osdc = &rbd_dev->rbd_client->client->osdc; 2936b454e36dSAlex Elder 2937b454e36dSAlex Elder return rbd_obj_request_submit(osdc, obj_request); 2938b454e36dSAlex Elder } 2939b454e36dSAlex Elder 2940b454e36dSAlex Elder /* 29413d7efd18SAlex Elder * It's a layered write. The target object might exist but 29423d7efd18SAlex Elder * we may not know that yet. If we know it doesn't exist, 29433d7efd18SAlex Elder * start by reading the data for the full target object from 29443d7efd18SAlex Elder * the parent so we can use it for a copyup to the target. 2945b454e36dSAlex Elder */ 294670d045f6SIlya Dryomov if (obj_request_known_test(obj_request)) 29473d7efd18SAlex Elder return rbd_img_obj_parent_read_full(obj_request); 29483d7efd18SAlex Elder 29493d7efd18SAlex Elder /* We don't know whether the target exists. Go find out. */ 2950b454e36dSAlex Elder 2951b454e36dSAlex Elder return rbd_img_obj_exists_submit(obj_request); 2952b454e36dSAlex Elder } 2953b454e36dSAlex Elder 2954bf0d5f50SAlex Elder static int rbd_img_request_submit(struct rbd_img_request *img_request) 2955bf0d5f50SAlex Elder { 2956bf0d5f50SAlex Elder struct rbd_obj_request *obj_request; 295746faeed4SAlex Elder struct rbd_obj_request *next_obj_request; 2958bf0d5f50SAlex Elder 295937206ee5SAlex Elder dout("%s: img %p\n", __func__, img_request); 296046faeed4SAlex Elder for_each_obj_request_safe(img_request, obj_request, next_obj_request) { 2961bf0d5f50SAlex Elder int ret; 2962bf0d5f50SAlex Elder 2963b454e36dSAlex Elder ret = rbd_img_obj_request_submit(obj_request); 2964bf0d5f50SAlex Elder if (ret) 2965bf0d5f50SAlex Elder return ret; 2966bf0d5f50SAlex Elder } 2967bf0d5f50SAlex Elder 2968bf0d5f50SAlex Elder return 0; 2969bf0d5f50SAlex Elder } 2970bf0d5f50SAlex Elder 29718b3e1a56SAlex Elder static void rbd_img_parent_read_callback(struct rbd_img_request *img_request) 29728b3e1a56SAlex Elder { 29738b3e1a56SAlex Elder struct rbd_obj_request *obj_request; 2974a9e8ba2cSAlex Elder struct rbd_device *rbd_dev; 2975a9e8ba2cSAlex Elder u64 obj_end; 297602c74fbaSAlex Elder u64 img_xferred; 297702c74fbaSAlex Elder int img_result; 29788b3e1a56SAlex Elder 29798b3e1a56SAlex Elder rbd_assert(img_request_child_test(img_request)); 29808b3e1a56SAlex Elder 298102c74fbaSAlex Elder /* First get what we need from the image request and release it */ 298202c74fbaSAlex Elder 29838b3e1a56SAlex Elder obj_request = img_request->obj_request; 298402c74fbaSAlex Elder img_xferred = img_request->xferred; 298502c74fbaSAlex Elder img_result = img_request->result; 298602c74fbaSAlex Elder rbd_img_request_put(img_request); 298702c74fbaSAlex Elder 298802c74fbaSAlex Elder /* 298902c74fbaSAlex Elder * If the overlap has become 0 (most likely because the 299002c74fbaSAlex Elder * image has been flattened) we need to re-submit the 299102c74fbaSAlex Elder * original request. 299202c74fbaSAlex Elder */ 2993a9e8ba2cSAlex Elder rbd_assert(obj_request); 2994a9e8ba2cSAlex Elder rbd_assert(obj_request->img_request); 299502c74fbaSAlex Elder rbd_dev = obj_request->img_request->rbd_dev; 299602c74fbaSAlex Elder if (!rbd_dev->parent_overlap) { 299702c74fbaSAlex Elder struct ceph_osd_client *osdc; 29988b3e1a56SAlex Elder 299902c74fbaSAlex Elder osdc = &rbd_dev->rbd_client->client->osdc; 300002c74fbaSAlex Elder img_result = rbd_obj_request_submit(osdc, obj_request); 300102c74fbaSAlex Elder if (!img_result) 300202c74fbaSAlex Elder return; 300302c74fbaSAlex Elder } 300402c74fbaSAlex Elder 300502c74fbaSAlex Elder obj_request->result = img_result; 3006a9e8ba2cSAlex Elder if (obj_request->result) 3007a9e8ba2cSAlex Elder goto out; 3008a9e8ba2cSAlex Elder 3009a9e8ba2cSAlex Elder /* 3010a9e8ba2cSAlex Elder * We need to zero anything beyond the parent overlap 3011a9e8ba2cSAlex Elder * boundary. Since rbd_img_obj_request_read_callback() 3012a9e8ba2cSAlex Elder * will zero anything beyond the end of a short read, an 3013a9e8ba2cSAlex Elder * easy way to do this is to pretend the data from the 3014a9e8ba2cSAlex Elder * parent came up short--ending at the overlap boundary. 3015a9e8ba2cSAlex Elder */ 3016a9e8ba2cSAlex Elder rbd_assert(obj_request->img_offset < U64_MAX - obj_request->length); 3017a9e8ba2cSAlex Elder obj_end = obj_request->img_offset + obj_request->length; 3018a9e8ba2cSAlex Elder if (obj_end > rbd_dev->parent_overlap) { 3019a9e8ba2cSAlex Elder u64 xferred = 0; 3020a9e8ba2cSAlex Elder 3021a9e8ba2cSAlex Elder if (obj_request->img_offset < rbd_dev->parent_overlap) 3022a9e8ba2cSAlex Elder xferred = rbd_dev->parent_overlap - 3023a9e8ba2cSAlex Elder obj_request->img_offset; 3024a9e8ba2cSAlex Elder 302502c74fbaSAlex Elder obj_request->xferred = min(img_xferred, xferred); 3026a9e8ba2cSAlex Elder } else { 302702c74fbaSAlex Elder obj_request->xferred = img_xferred; 3028a9e8ba2cSAlex Elder } 3029a9e8ba2cSAlex Elder out: 30308b3e1a56SAlex Elder rbd_img_obj_request_read_callback(obj_request); 30318b3e1a56SAlex Elder rbd_obj_request_complete(obj_request); 30328b3e1a56SAlex Elder } 30338b3e1a56SAlex Elder 30348b3e1a56SAlex Elder static void rbd_img_parent_read(struct rbd_obj_request *obj_request) 30358b3e1a56SAlex Elder { 30368b3e1a56SAlex Elder struct rbd_img_request *img_request; 30378b3e1a56SAlex Elder int result; 30388b3e1a56SAlex Elder 30398b3e1a56SAlex Elder rbd_assert(obj_request_img_data_test(obj_request)); 30408b3e1a56SAlex Elder rbd_assert(obj_request->img_request != NULL); 30418b3e1a56SAlex Elder rbd_assert(obj_request->result == (s32) -ENOENT); 30425b2ab72dSAlex Elder rbd_assert(obj_request_type_valid(obj_request->type)); 30438b3e1a56SAlex Elder 30448b3e1a56SAlex Elder /* rbd_read_finish(obj_request, obj_request->length); */ 3045e93f3152SAlex Elder img_request = rbd_parent_request_create(obj_request, 30468b3e1a56SAlex Elder obj_request->img_offset, 3047e93f3152SAlex Elder obj_request->length); 30488b3e1a56SAlex Elder result = -ENOMEM; 30498b3e1a56SAlex Elder if (!img_request) 30508b3e1a56SAlex Elder goto out_err; 30518b3e1a56SAlex Elder 30525b2ab72dSAlex Elder if (obj_request->type == OBJ_REQUEST_BIO) 3053f1a4739fSAlex Elder result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO, 3054f1a4739fSAlex Elder obj_request->bio_list); 30555b2ab72dSAlex Elder else 30565b2ab72dSAlex Elder result = rbd_img_request_fill(img_request, OBJ_REQUEST_PAGES, 30575b2ab72dSAlex Elder obj_request->pages); 30588b3e1a56SAlex Elder if (result) 30598b3e1a56SAlex Elder goto out_err; 30608b3e1a56SAlex Elder 30618b3e1a56SAlex Elder img_request->callback = rbd_img_parent_read_callback; 30628b3e1a56SAlex Elder result = rbd_img_request_submit(img_request); 30638b3e1a56SAlex Elder if (result) 30648b3e1a56SAlex Elder goto out_err; 30658b3e1a56SAlex Elder 30668b3e1a56SAlex Elder return; 30678b3e1a56SAlex Elder out_err: 30688b3e1a56SAlex Elder if (img_request) 30698b3e1a56SAlex Elder rbd_img_request_put(img_request); 30708b3e1a56SAlex Elder obj_request->result = result; 30718b3e1a56SAlex Elder obj_request->xferred = 0; 30728b3e1a56SAlex Elder obj_request_done_set(obj_request); 30738b3e1a56SAlex Elder } 30748b3e1a56SAlex Elder 307520e0af67SJosh Durgin static int rbd_obj_notify_ack_sync(struct rbd_device *rbd_dev, u64 notify_id) 3076b8d70035SAlex Elder { 3077b8d70035SAlex Elder struct rbd_obj_request *obj_request; 30782169238dSAlex Elder struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 3079b8d70035SAlex Elder int ret; 3080b8d70035SAlex Elder 3081b8d70035SAlex Elder obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0, 3082b8d70035SAlex Elder OBJ_REQUEST_NODATA); 3083b8d70035SAlex Elder if (!obj_request) 3084b8d70035SAlex Elder return -ENOMEM; 3085b8d70035SAlex Elder 3086b8d70035SAlex Elder ret = -ENOMEM; 30876d2940c8SGuangliang Zhao obj_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_READ, 1, 3088deb236b3SIlya Dryomov obj_request); 3089b8d70035SAlex Elder if (!obj_request->osd_req) 3090b8d70035SAlex Elder goto out; 3091b8d70035SAlex Elder 3092c99d2d4aSAlex Elder osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_NOTIFY_ACK, 3093cc4a38bdSAlex Elder notify_id, 0, 0); 30949d4df01fSAlex Elder rbd_osd_req_format_read(obj_request); 3095430c28c3SAlex Elder 3096b8d70035SAlex Elder ret = rbd_obj_request_submit(osdc, obj_request); 3097cf81b60eSAlex Elder if (ret) 309820e0af67SJosh Durgin goto out; 309920e0af67SJosh Durgin ret = rbd_obj_request_wait(obj_request); 310020e0af67SJosh Durgin out: 3101b8d70035SAlex Elder rbd_obj_request_put(obj_request); 3102b8d70035SAlex Elder 3103b8d70035SAlex Elder return ret; 3104b8d70035SAlex Elder } 3105b8d70035SAlex Elder 3106b8d70035SAlex Elder static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data) 3107b8d70035SAlex Elder { 3108b8d70035SAlex Elder struct rbd_device *rbd_dev = (struct rbd_device *)data; 3109e627db08SAlex Elder int ret; 3110b8d70035SAlex Elder 3111b8d70035SAlex Elder if (!rbd_dev) 3112b8d70035SAlex Elder return; 3113b8d70035SAlex Elder 311437206ee5SAlex Elder dout("%s: \"%s\" notify_id %llu opcode %u\n", __func__, 3115b8d70035SAlex Elder rbd_dev->header_name, (unsigned long long)notify_id, 3116b8d70035SAlex Elder (unsigned int)opcode); 311752bb1f9bSIlya Dryomov 311852bb1f9bSIlya Dryomov /* 311952bb1f9bSIlya Dryomov * Until adequate refresh error handling is in place, there is 312052bb1f9bSIlya Dryomov * not much we can do here, except warn. 312152bb1f9bSIlya Dryomov * 312252bb1f9bSIlya Dryomov * See http://tracker.ceph.com/issues/5040 312352bb1f9bSIlya Dryomov */ 3124e627db08SAlex Elder ret = rbd_dev_refresh(rbd_dev); 3125e627db08SAlex Elder if (ret) 31269584d508SIlya Dryomov rbd_warn(rbd_dev, "refresh failed: %d", ret); 3127b8d70035SAlex Elder 312852bb1f9bSIlya Dryomov ret = rbd_obj_notify_ack_sync(rbd_dev, notify_id); 312952bb1f9bSIlya Dryomov if (ret) 31309584d508SIlya Dryomov rbd_warn(rbd_dev, "notify_ack ret %d", ret); 3131b8d70035SAlex Elder } 3132b8d70035SAlex Elder 31339969ebc5SAlex Elder /* 3134bb040aa0SIlya Dryomov * Send a (un)watch request and wait for the ack. Return a request 3135bb040aa0SIlya Dryomov * with a ref held on success or error. 3136bb040aa0SIlya Dryomov */ 3137bb040aa0SIlya Dryomov static struct rbd_obj_request *rbd_obj_watch_request_helper( 3138bb040aa0SIlya Dryomov struct rbd_device *rbd_dev, 3139bb040aa0SIlya Dryomov bool watch) 3140bb040aa0SIlya Dryomov { 3141bb040aa0SIlya Dryomov struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 31422894e1d7SIlya Dryomov struct ceph_options *opts = osdc->client->options; 3143bb040aa0SIlya Dryomov struct rbd_obj_request *obj_request; 3144bb040aa0SIlya Dryomov int ret; 3145bb040aa0SIlya Dryomov 3146bb040aa0SIlya Dryomov obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0, 3147bb040aa0SIlya Dryomov OBJ_REQUEST_NODATA); 3148bb040aa0SIlya Dryomov if (!obj_request) 3149bb040aa0SIlya Dryomov return ERR_PTR(-ENOMEM); 3150bb040aa0SIlya Dryomov 31516d2940c8SGuangliang Zhao obj_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_WRITE, 1, 3152bb040aa0SIlya Dryomov obj_request); 3153bb040aa0SIlya Dryomov if (!obj_request->osd_req) { 3154bb040aa0SIlya Dryomov ret = -ENOMEM; 3155bb040aa0SIlya Dryomov goto out; 3156bb040aa0SIlya Dryomov } 3157bb040aa0SIlya Dryomov 3158bb040aa0SIlya Dryomov osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_WATCH, 3159bb040aa0SIlya Dryomov rbd_dev->watch_event->cookie, 0, watch); 3160bb040aa0SIlya Dryomov rbd_osd_req_format_write(obj_request); 3161bb040aa0SIlya Dryomov 3162bb040aa0SIlya Dryomov if (watch) 3163bb040aa0SIlya Dryomov ceph_osdc_set_request_linger(osdc, obj_request->osd_req); 3164bb040aa0SIlya Dryomov 3165bb040aa0SIlya Dryomov ret = rbd_obj_request_submit(osdc, obj_request); 3166bb040aa0SIlya Dryomov if (ret) 3167bb040aa0SIlya Dryomov goto out; 3168bb040aa0SIlya Dryomov 31692894e1d7SIlya Dryomov ret = rbd_obj_request_wait_timeout(obj_request, opts->mount_timeout); 3170bb040aa0SIlya Dryomov if (ret) 3171bb040aa0SIlya Dryomov goto out; 3172bb040aa0SIlya Dryomov 3173bb040aa0SIlya Dryomov ret = obj_request->result; 3174bb040aa0SIlya Dryomov if (ret) { 3175bb040aa0SIlya Dryomov if (watch) 3176bb040aa0SIlya Dryomov rbd_obj_request_end(obj_request); 3177bb040aa0SIlya Dryomov goto out; 3178bb040aa0SIlya Dryomov } 3179bb040aa0SIlya Dryomov 3180bb040aa0SIlya Dryomov return obj_request; 3181bb040aa0SIlya Dryomov 3182bb040aa0SIlya Dryomov out: 3183bb040aa0SIlya Dryomov rbd_obj_request_put(obj_request); 3184bb040aa0SIlya Dryomov return ERR_PTR(ret); 3185bb040aa0SIlya Dryomov } 3186bb040aa0SIlya Dryomov 3187bb040aa0SIlya Dryomov /* 3188b30a01f2SIlya Dryomov * Initiate a watch request, synchronously. 31899969ebc5SAlex Elder */ 3190b30a01f2SIlya Dryomov static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev) 31919969ebc5SAlex Elder { 31929969ebc5SAlex Elder struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 31939969ebc5SAlex Elder struct rbd_obj_request *obj_request; 31949969ebc5SAlex Elder int ret; 31959969ebc5SAlex Elder 3196b30a01f2SIlya Dryomov rbd_assert(!rbd_dev->watch_event); 3197b30a01f2SIlya Dryomov rbd_assert(!rbd_dev->watch_request); 31989969ebc5SAlex Elder 31993c663bbdSAlex Elder ret = ceph_osdc_create_event(osdc, rbd_watch_cb, rbd_dev, 32009969ebc5SAlex Elder &rbd_dev->watch_event); 32019969ebc5SAlex Elder if (ret < 0) 32029969ebc5SAlex Elder return ret; 32039969ebc5SAlex Elder 320476756a51SIlya Dryomov obj_request = rbd_obj_watch_request_helper(rbd_dev, true); 320576756a51SIlya Dryomov if (IS_ERR(obj_request)) { 320676756a51SIlya Dryomov ceph_osdc_cancel_event(rbd_dev->watch_event); 320776756a51SIlya Dryomov rbd_dev->watch_event = NULL; 320876756a51SIlya Dryomov return PTR_ERR(obj_request); 3209b30a01f2SIlya Dryomov } 32109969ebc5SAlex Elder 32118eb87565SAlex Elder /* 32128eb87565SAlex Elder * A watch request is set to linger, so the underlying osd 32138eb87565SAlex Elder * request won't go away until we unregister it. We retain 32148eb87565SAlex Elder * a pointer to the object request during that time (in 321576756a51SIlya Dryomov * rbd_dev->watch_request), so we'll keep a reference to it. 321676756a51SIlya Dryomov * We'll drop that reference after we've unregistered it in 321776756a51SIlya Dryomov * rbd_dev_header_unwatch_sync(). 32188eb87565SAlex Elder */ 32198eb87565SAlex Elder rbd_dev->watch_request = obj_request; 32208eb87565SAlex Elder 32218eb87565SAlex Elder return 0; 32229969ebc5SAlex Elder } 32239969ebc5SAlex Elder 3224b30a01f2SIlya Dryomov /* 3225b30a01f2SIlya Dryomov * Tear down a watch request, synchronously. 3226b30a01f2SIlya Dryomov */ 322776756a51SIlya Dryomov static void rbd_dev_header_unwatch_sync(struct rbd_device *rbd_dev) 3228fca27065SIlya Dryomov { 3229b30a01f2SIlya Dryomov struct rbd_obj_request *obj_request; 3230b30a01f2SIlya Dryomov 3231b30a01f2SIlya Dryomov rbd_assert(rbd_dev->watch_event); 3232b30a01f2SIlya Dryomov rbd_assert(rbd_dev->watch_request); 3233b30a01f2SIlya Dryomov 323476756a51SIlya Dryomov rbd_obj_request_end(rbd_dev->watch_request); 3235b30a01f2SIlya Dryomov rbd_obj_request_put(rbd_dev->watch_request); 3236b30a01f2SIlya Dryomov rbd_dev->watch_request = NULL; 3237b30a01f2SIlya Dryomov 323876756a51SIlya Dryomov obj_request = rbd_obj_watch_request_helper(rbd_dev, false); 323976756a51SIlya Dryomov if (!IS_ERR(obj_request)) 3240b30a01f2SIlya Dryomov rbd_obj_request_put(obj_request); 324176756a51SIlya Dryomov else 324276756a51SIlya Dryomov rbd_warn(rbd_dev, "unable to tear down watch request (%ld)", 324376756a51SIlya Dryomov PTR_ERR(obj_request)); 324476756a51SIlya Dryomov 3245b30a01f2SIlya Dryomov ceph_osdc_cancel_event(rbd_dev->watch_event); 3246b30a01f2SIlya Dryomov rbd_dev->watch_event = NULL; 3247fca27065SIlya Dryomov } 3248fca27065SIlya Dryomov 324936be9a76SAlex Elder /* 3250f40eb349SAlex Elder * Synchronous osd object method call. Returns the number of bytes 3251f40eb349SAlex Elder * returned in the outbound buffer, or a negative error code. 325236be9a76SAlex Elder */ 325336be9a76SAlex Elder static int rbd_obj_method_sync(struct rbd_device *rbd_dev, 325436be9a76SAlex Elder const char *object_name, 325536be9a76SAlex Elder const char *class_name, 325636be9a76SAlex Elder const char *method_name, 32574157976bSAlex Elder const void *outbound, 325836be9a76SAlex Elder size_t outbound_size, 32594157976bSAlex Elder void *inbound, 3260e2a58ee5SAlex Elder size_t inbound_size) 326136be9a76SAlex Elder { 32622169238dSAlex Elder struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 326336be9a76SAlex Elder struct rbd_obj_request *obj_request; 326436be9a76SAlex Elder struct page **pages; 326536be9a76SAlex Elder u32 page_count; 326636be9a76SAlex Elder int ret; 326736be9a76SAlex Elder 326836be9a76SAlex Elder /* 32696010a451SAlex Elder * Method calls are ultimately read operations. The result 32706010a451SAlex Elder * should placed into the inbound buffer provided. They 32716010a451SAlex Elder * also supply outbound data--parameters for the object 32726010a451SAlex Elder * method. Currently if this is present it will be a 32736010a451SAlex Elder * snapshot id. 327436be9a76SAlex Elder */ 327536be9a76SAlex Elder page_count = (u32)calc_pages_for(0, inbound_size); 327636be9a76SAlex Elder pages = ceph_alloc_page_vector(page_count, GFP_KERNEL); 327736be9a76SAlex Elder if (IS_ERR(pages)) 327836be9a76SAlex Elder return PTR_ERR(pages); 327936be9a76SAlex Elder 328036be9a76SAlex Elder ret = -ENOMEM; 32816010a451SAlex Elder obj_request = rbd_obj_request_create(object_name, 0, inbound_size, 328236be9a76SAlex Elder OBJ_REQUEST_PAGES); 328336be9a76SAlex Elder if (!obj_request) 328436be9a76SAlex Elder goto out; 328536be9a76SAlex Elder 328636be9a76SAlex Elder obj_request->pages = pages; 328736be9a76SAlex Elder obj_request->page_count = page_count; 328836be9a76SAlex Elder 32896d2940c8SGuangliang Zhao obj_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_READ, 1, 3290deb236b3SIlya Dryomov obj_request); 329136be9a76SAlex Elder if (!obj_request->osd_req) 329236be9a76SAlex Elder goto out; 329336be9a76SAlex Elder 3294c99d2d4aSAlex Elder osd_req_op_cls_init(obj_request->osd_req, 0, CEPH_OSD_OP_CALL, 329504017e29SAlex Elder class_name, method_name); 329604017e29SAlex Elder if (outbound_size) { 329704017e29SAlex Elder struct ceph_pagelist *pagelist; 329804017e29SAlex Elder 329904017e29SAlex Elder pagelist = kmalloc(sizeof (*pagelist), GFP_NOFS); 330004017e29SAlex Elder if (!pagelist) 330104017e29SAlex Elder goto out; 330204017e29SAlex Elder 330304017e29SAlex Elder ceph_pagelist_init(pagelist); 330404017e29SAlex Elder ceph_pagelist_append(pagelist, outbound, outbound_size); 330504017e29SAlex Elder osd_req_op_cls_request_data_pagelist(obj_request->osd_req, 0, 330604017e29SAlex Elder pagelist); 330704017e29SAlex Elder } 3308a4ce40a9SAlex Elder osd_req_op_cls_response_data_pages(obj_request->osd_req, 0, 3309a4ce40a9SAlex Elder obj_request->pages, inbound_size, 331044cd188dSAlex Elder 0, false, false); 33119d4df01fSAlex Elder rbd_osd_req_format_read(obj_request); 3312430c28c3SAlex Elder 331336be9a76SAlex Elder ret = rbd_obj_request_submit(osdc, obj_request); 331436be9a76SAlex Elder if (ret) 331536be9a76SAlex Elder goto out; 331636be9a76SAlex Elder ret = rbd_obj_request_wait(obj_request); 331736be9a76SAlex Elder if (ret) 331836be9a76SAlex Elder goto out; 331936be9a76SAlex Elder 332036be9a76SAlex Elder ret = obj_request->result; 332136be9a76SAlex Elder if (ret < 0) 332236be9a76SAlex Elder goto out; 332357385b51SAlex Elder 332457385b51SAlex Elder rbd_assert(obj_request->xferred < (u64)INT_MAX); 332557385b51SAlex Elder ret = (int)obj_request->xferred; 3326903bb32eSAlex Elder ceph_copy_from_page_vector(pages, inbound, 0, obj_request->xferred); 332736be9a76SAlex Elder out: 332836be9a76SAlex Elder if (obj_request) 332936be9a76SAlex Elder rbd_obj_request_put(obj_request); 333036be9a76SAlex Elder else 333136be9a76SAlex Elder ceph_release_page_vector(pages, page_count); 333236be9a76SAlex Elder 333336be9a76SAlex Elder return ret; 333436be9a76SAlex Elder } 333536be9a76SAlex Elder 33367ad18afaSChristoph Hellwig static void rbd_queue_workfn(struct work_struct *work) 3337bc1ecc65SIlya Dryomov { 33387ad18afaSChristoph Hellwig struct request *rq = blk_mq_rq_from_pdu(work); 33397ad18afaSChristoph Hellwig struct rbd_device *rbd_dev = rq->q->queuedata; 3340bc1ecc65SIlya Dryomov struct rbd_img_request *img_request; 33414e752f0aSJosh Durgin struct ceph_snap_context *snapc = NULL; 3342bc1ecc65SIlya Dryomov u64 offset = (u64)blk_rq_pos(rq) << SECTOR_SHIFT; 3343bc1ecc65SIlya Dryomov u64 length = blk_rq_bytes(rq); 33446d2940c8SGuangliang Zhao enum obj_operation_type op_type; 33454e752f0aSJosh Durgin u64 mapping_size; 3346bc1ecc65SIlya Dryomov int result; 3347bc1ecc65SIlya Dryomov 33487ad18afaSChristoph Hellwig if (rq->cmd_type != REQ_TYPE_FS) { 33497ad18afaSChristoph Hellwig dout("%s: non-fs request type %d\n", __func__, 33507ad18afaSChristoph Hellwig (int) rq->cmd_type); 33517ad18afaSChristoph Hellwig result = -EIO; 33527ad18afaSChristoph Hellwig goto err; 33537ad18afaSChristoph Hellwig } 33547ad18afaSChristoph Hellwig 335590e98c52SGuangliang Zhao if (rq->cmd_flags & REQ_DISCARD) 335690e98c52SGuangliang Zhao op_type = OBJ_OP_DISCARD; 335790e98c52SGuangliang Zhao else if (rq->cmd_flags & REQ_WRITE) 33586d2940c8SGuangliang Zhao op_type = OBJ_OP_WRITE; 33596d2940c8SGuangliang Zhao else 33606d2940c8SGuangliang Zhao op_type = OBJ_OP_READ; 33616d2940c8SGuangliang Zhao 3362bc1ecc65SIlya Dryomov /* Ignore/skip any zero-length requests */ 3363bc1ecc65SIlya Dryomov 3364bc1ecc65SIlya Dryomov if (!length) { 3365bc1ecc65SIlya Dryomov dout("%s: zero-length request\n", __func__); 3366bc1ecc65SIlya Dryomov result = 0; 3367bc1ecc65SIlya Dryomov goto err_rq; 3368bc1ecc65SIlya Dryomov } 3369bc1ecc65SIlya Dryomov 33706d2940c8SGuangliang Zhao /* Only reads are allowed to a read-only device */ 3371bc1ecc65SIlya Dryomov 33726d2940c8SGuangliang Zhao if (op_type != OBJ_OP_READ) { 3373bc1ecc65SIlya Dryomov if (rbd_dev->mapping.read_only) { 3374bc1ecc65SIlya Dryomov result = -EROFS; 3375bc1ecc65SIlya Dryomov goto err_rq; 3376bc1ecc65SIlya Dryomov } 3377bc1ecc65SIlya Dryomov rbd_assert(rbd_dev->spec->snap_id == CEPH_NOSNAP); 3378bc1ecc65SIlya Dryomov } 3379bc1ecc65SIlya Dryomov 3380bc1ecc65SIlya Dryomov /* 3381bc1ecc65SIlya Dryomov * Quit early if the mapped snapshot no longer exists. It's 3382bc1ecc65SIlya Dryomov * still possible the snapshot will have disappeared by the 3383bc1ecc65SIlya Dryomov * time our request arrives at the osd, but there's no sense in 3384bc1ecc65SIlya Dryomov * sending it if we already know. 3385bc1ecc65SIlya Dryomov */ 3386bc1ecc65SIlya Dryomov if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) { 3387bc1ecc65SIlya Dryomov dout("request for non-existent snapshot"); 3388bc1ecc65SIlya Dryomov rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP); 3389bc1ecc65SIlya Dryomov result = -ENXIO; 3390bc1ecc65SIlya Dryomov goto err_rq; 3391bc1ecc65SIlya Dryomov } 3392bc1ecc65SIlya Dryomov 3393bc1ecc65SIlya Dryomov if (offset && length > U64_MAX - offset + 1) { 3394bc1ecc65SIlya Dryomov rbd_warn(rbd_dev, "bad request range (%llu~%llu)", offset, 3395bc1ecc65SIlya Dryomov length); 3396bc1ecc65SIlya Dryomov result = -EINVAL; 3397bc1ecc65SIlya Dryomov goto err_rq; /* Shouldn't happen */ 3398bc1ecc65SIlya Dryomov } 3399bc1ecc65SIlya Dryomov 34007ad18afaSChristoph Hellwig blk_mq_start_request(rq); 34017ad18afaSChristoph Hellwig 34024e752f0aSJosh Durgin down_read(&rbd_dev->header_rwsem); 34034e752f0aSJosh Durgin mapping_size = rbd_dev->mapping.size; 34046d2940c8SGuangliang Zhao if (op_type != OBJ_OP_READ) { 34054e752f0aSJosh Durgin snapc = rbd_dev->header.snapc; 34064e752f0aSJosh Durgin ceph_get_snap_context(snapc); 34074e752f0aSJosh Durgin } 34084e752f0aSJosh Durgin up_read(&rbd_dev->header_rwsem); 34094e752f0aSJosh Durgin 34104e752f0aSJosh Durgin if (offset + length > mapping_size) { 3411bc1ecc65SIlya Dryomov rbd_warn(rbd_dev, "beyond EOD (%llu~%llu > %llu)", offset, 34124e752f0aSJosh Durgin length, mapping_size); 3413bc1ecc65SIlya Dryomov result = -EIO; 3414bc1ecc65SIlya Dryomov goto err_rq; 3415bc1ecc65SIlya Dryomov } 3416bc1ecc65SIlya Dryomov 34176d2940c8SGuangliang Zhao img_request = rbd_img_request_create(rbd_dev, offset, length, op_type, 34184e752f0aSJosh Durgin snapc); 3419bc1ecc65SIlya Dryomov if (!img_request) { 3420bc1ecc65SIlya Dryomov result = -ENOMEM; 3421bc1ecc65SIlya Dryomov goto err_rq; 3422bc1ecc65SIlya Dryomov } 3423bc1ecc65SIlya Dryomov img_request->rq = rq; 3424bc1ecc65SIlya Dryomov 342590e98c52SGuangliang Zhao if (op_type == OBJ_OP_DISCARD) 342690e98c52SGuangliang Zhao result = rbd_img_request_fill(img_request, OBJ_REQUEST_NODATA, 342790e98c52SGuangliang Zhao NULL); 342890e98c52SGuangliang Zhao else 342990e98c52SGuangliang Zhao result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO, 343090e98c52SGuangliang Zhao rq->bio); 3431bc1ecc65SIlya Dryomov if (result) 3432bc1ecc65SIlya Dryomov goto err_img_request; 3433bc1ecc65SIlya Dryomov 3434bc1ecc65SIlya Dryomov result = rbd_img_request_submit(img_request); 3435bc1ecc65SIlya Dryomov if (result) 3436bc1ecc65SIlya Dryomov goto err_img_request; 3437bc1ecc65SIlya Dryomov 3438bc1ecc65SIlya Dryomov return; 3439bc1ecc65SIlya Dryomov 3440bc1ecc65SIlya Dryomov err_img_request: 3441bc1ecc65SIlya Dryomov rbd_img_request_put(img_request); 3442bc1ecc65SIlya Dryomov err_rq: 3443bc1ecc65SIlya Dryomov if (result) 3444bc1ecc65SIlya Dryomov rbd_warn(rbd_dev, "%s %llx at %llx result %d", 34456d2940c8SGuangliang Zhao obj_op_name(op_type), length, offset, result); 34464e752f0aSJosh Durgin ceph_put_snap_context(snapc); 34477ad18afaSChristoph Hellwig err: 34487ad18afaSChristoph Hellwig blk_mq_end_request(rq, result); 3449bc1ecc65SIlya Dryomov } 3450bc1ecc65SIlya Dryomov 34517ad18afaSChristoph Hellwig static int rbd_queue_rq(struct blk_mq_hw_ctx *hctx, 34527ad18afaSChristoph Hellwig const struct blk_mq_queue_data *bd) 3453bc1ecc65SIlya Dryomov { 34547ad18afaSChristoph Hellwig struct request *rq = bd->rq; 34557ad18afaSChristoph Hellwig struct work_struct *work = blk_mq_rq_to_pdu(rq); 3456bc1ecc65SIlya Dryomov 34577ad18afaSChristoph Hellwig queue_work(rbd_wq, work); 34587ad18afaSChristoph Hellwig return BLK_MQ_RQ_QUEUE_OK; 3459bf0d5f50SAlex Elder } 3460bf0d5f50SAlex Elder 3461602adf40SYehuda Sadeh /* 3462602adf40SYehuda Sadeh * a queue callback. Makes sure that we don't create a bio that spans across 3463602adf40SYehuda Sadeh * multiple osd objects. One exception would be with a single page bios, 3464f7760dadSAlex Elder * which we handle later at bio_chain_clone_range() 3465602adf40SYehuda Sadeh */ 3466602adf40SYehuda Sadeh static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd, 3467602adf40SYehuda Sadeh struct bio_vec *bvec) 3468602adf40SYehuda Sadeh { 3469602adf40SYehuda Sadeh struct rbd_device *rbd_dev = q->queuedata; 3470e5cfeed2SAlex Elder sector_t sector_offset; 3471e5cfeed2SAlex Elder sector_t sectors_per_obj; 3472e5cfeed2SAlex Elder sector_t obj_sector_offset; 3473e5cfeed2SAlex Elder int ret; 3474602adf40SYehuda Sadeh 3475e5cfeed2SAlex Elder /* 3476e5cfeed2SAlex Elder * Find how far into its rbd object the partition-relative 3477e5cfeed2SAlex Elder * bio start sector is to offset relative to the enclosing 3478e5cfeed2SAlex Elder * device. 3479e5cfeed2SAlex Elder */ 3480e5cfeed2SAlex Elder sector_offset = get_start_sect(bmd->bi_bdev) + bmd->bi_sector; 3481e5cfeed2SAlex Elder sectors_per_obj = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT); 3482e5cfeed2SAlex Elder obj_sector_offset = sector_offset & (sectors_per_obj - 1); 3483593a9e7bSAlex Elder 3484e5cfeed2SAlex Elder /* 3485e5cfeed2SAlex Elder * Compute the number of bytes from that offset to the end 3486e5cfeed2SAlex Elder * of the object. Account for what's already used by the bio. 3487e5cfeed2SAlex Elder */ 3488e5cfeed2SAlex Elder ret = (int) (sectors_per_obj - obj_sector_offset) << SECTOR_SHIFT; 3489e5cfeed2SAlex Elder if (ret > bmd->bi_size) 3490e5cfeed2SAlex Elder ret -= bmd->bi_size; 3491e5cfeed2SAlex Elder else 3492e5cfeed2SAlex Elder ret = 0; 3493e5cfeed2SAlex Elder 3494e5cfeed2SAlex Elder /* 3495e5cfeed2SAlex Elder * Don't send back more than was asked for. And if the bio 3496e5cfeed2SAlex Elder * was empty, let the whole thing through because: "Note 3497e5cfeed2SAlex Elder * that a block device *must* allow a single page to be 3498e5cfeed2SAlex Elder * added to an empty bio." 3499e5cfeed2SAlex Elder */ 3500e5cfeed2SAlex Elder rbd_assert(bvec->bv_len <= PAGE_SIZE); 3501e5cfeed2SAlex Elder if (ret > (int) bvec->bv_len || !bmd->bi_size) 3502e5cfeed2SAlex Elder ret = (int) bvec->bv_len; 3503e5cfeed2SAlex Elder 3504e5cfeed2SAlex Elder return ret; 3505602adf40SYehuda Sadeh } 3506602adf40SYehuda Sadeh 3507602adf40SYehuda Sadeh static void rbd_free_disk(struct rbd_device *rbd_dev) 3508602adf40SYehuda Sadeh { 3509602adf40SYehuda Sadeh struct gendisk *disk = rbd_dev->disk; 3510602adf40SYehuda Sadeh 3511602adf40SYehuda Sadeh if (!disk) 3512602adf40SYehuda Sadeh return; 3513602adf40SYehuda Sadeh 3514a0cab924SAlex Elder rbd_dev->disk = NULL; 3515a0cab924SAlex Elder if (disk->flags & GENHD_FL_UP) { 3516602adf40SYehuda Sadeh del_gendisk(disk); 3517602adf40SYehuda Sadeh if (disk->queue) 3518602adf40SYehuda Sadeh blk_cleanup_queue(disk->queue); 35197ad18afaSChristoph Hellwig blk_mq_free_tag_set(&rbd_dev->tag_set); 3520a0cab924SAlex Elder } 3521602adf40SYehuda Sadeh put_disk(disk); 3522602adf40SYehuda Sadeh } 3523602adf40SYehuda Sadeh 3524788e2df3SAlex Elder static int rbd_obj_read_sync(struct rbd_device *rbd_dev, 3525788e2df3SAlex Elder const char *object_name, 35267097f8dfSAlex Elder u64 offset, u64 length, void *buf) 3527788e2df3SAlex Elder 3528788e2df3SAlex Elder { 35292169238dSAlex Elder struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 3530788e2df3SAlex Elder struct rbd_obj_request *obj_request; 3531788e2df3SAlex Elder struct page **pages = NULL; 3532788e2df3SAlex Elder u32 page_count; 35331ceae7efSAlex Elder size_t size; 3534788e2df3SAlex Elder int ret; 3535788e2df3SAlex Elder 3536788e2df3SAlex Elder page_count = (u32) calc_pages_for(offset, length); 3537788e2df3SAlex Elder pages = ceph_alloc_page_vector(page_count, GFP_KERNEL); 3538788e2df3SAlex Elder if (IS_ERR(pages)) 3539a8d42056SJan Kara return PTR_ERR(pages); 3540788e2df3SAlex Elder 3541788e2df3SAlex Elder ret = -ENOMEM; 3542788e2df3SAlex Elder obj_request = rbd_obj_request_create(object_name, offset, length, 3543788e2df3SAlex Elder OBJ_REQUEST_PAGES); 3544788e2df3SAlex Elder if (!obj_request) 3545788e2df3SAlex Elder goto out; 3546788e2df3SAlex Elder 3547788e2df3SAlex Elder obj_request->pages = pages; 3548788e2df3SAlex Elder obj_request->page_count = page_count; 3549788e2df3SAlex Elder 35506d2940c8SGuangliang Zhao obj_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_READ, 1, 3551deb236b3SIlya Dryomov obj_request); 3552788e2df3SAlex Elder if (!obj_request->osd_req) 3553788e2df3SAlex Elder goto out; 3554788e2df3SAlex Elder 3555c99d2d4aSAlex Elder osd_req_op_extent_init(obj_request->osd_req, 0, CEPH_OSD_OP_READ, 3556c99d2d4aSAlex Elder offset, length, 0, 0); 3557406e2c9fSAlex Elder osd_req_op_extent_osd_data_pages(obj_request->osd_req, 0, 3558a4ce40a9SAlex Elder obj_request->pages, 355944cd188dSAlex Elder obj_request->length, 356044cd188dSAlex Elder obj_request->offset & ~PAGE_MASK, 356144cd188dSAlex Elder false, false); 35629d4df01fSAlex Elder rbd_osd_req_format_read(obj_request); 3563430c28c3SAlex Elder 3564788e2df3SAlex Elder ret = rbd_obj_request_submit(osdc, obj_request); 3565788e2df3SAlex Elder if (ret) 3566788e2df3SAlex Elder goto out; 3567788e2df3SAlex Elder ret = rbd_obj_request_wait(obj_request); 3568788e2df3SAlex Elder if (ret) 3569788e2df3SAlex Elder goto out; 3570788e2df3SAlex Elder 3571788e2df3SAlex Elder ret = obj_request->result; 3572788e2df3SAlex Elder if (ret < 0) 3573788e2df3SAlex Elder goto out; 35741ceae7efSAlex Elder 35751ceae7efSAlex Elder rbd_assert(obj_request->xferred <= (u64) SIZE_MAX); 35761ceae7efSAlex Elder size = (size_t) obj_request->xferred; 3577903bb32eSAlex Elder ceph_copy_from_page_vector(pages, buf, 0, size); 357823ed6e13SAlex Elder rbd_assert(size <= (size_t)INT_MAX); 357923ed6e13SAlex Elder ret = (int)size; 3580788e2df3SAlex Elder out: 3581788e2df3SAlex Elder if (obj_request) 3582788e2df3SAlex Elder rbd_obj_request_put(obj_request); 3583788e2df3SAlex Elder else 3584788e2df3SAlex Elder ceph_release_page_vector(pages, page_count); 3585788e2df3SAlex Elder 3586788e2df3SAlex Elder return ret; 3587788e2df3SAlex Elder } 3588788e2df3SAlex Elder 3589602adf40SYehuda Sadeh /* 3590662518b1SAlex Elder * Read the complete header for the given rbd device. On successful 3591662518b1SAlex Elder * return, the rbd_dev->header field will contain up-to-date 3592662518b1SAlex Elder * information about the image. 35934156d998SAlex Elder */ 359499a41ebcSAlex Elder static int rbd_dev_v1_header_info(struct rbd_device *rbd_dev) 35954156d998SAlex Elder { 35964156d998SAlex Elder struct rbd_image_header_ondisk *ondisk = NULL; 35974156d998SAlex Elder u32 snap_count = 0; 35984156d998SAlex Elder u64 names_size = 0; 35994156d998SAlex Elder u32 want_count; 36004156d998SAlex Elder int ret; 36014156d998SAlex Elder 36024156d998SAlex Elder /* 36034156d998SAlex Elder * The complete header will include an array of its 64-bit 36044156d998SAlex Elder * snapshot ids, followed by the names of those snapshots as 36054156d998SAlex Elder * a contiguous block of NUL-terminated strings. Note that 36064156d998SAlex Elder * the number of snapshots could change by the time we read 36074156d998SAlex Elder * it in, in which case we re-read it. 36084156d998SAlex Elder */ 36094156d998SAlex Elder do { 36104156d998SAlex Elder size_t size; 36114156d998SAlex Elder 36124156d998SAlex Elder kfree(ondisk); 36134156d998SAlex Elder 36144156d998SAlex Elder size = sizeof (*ondisk); 36154156d998SAlex Elder size += snap_count * sizeof (struct rbd_image_snap_ondisk); 36164156d998SAlex Elder size += names_size; 36174156d998SAlex Elder ondisk = kmalloc(size, GFP_KERNEL); 36184156d998SAlex Elder if (!ondisk) 3619662518b1SAlex Elder return -ENOMEM; 36204156d998SAlex Elder 3621788e2df3SAlex Elder ret = rbd_obj_read_sync(rbd_dev, rbd_dev->header_name, 36227097f8dfSAlex Elder 0, size, ondisk); 36234156d998SAlex Elder if (ret < 0) 3624662518b1SAlex Elder goto out; 3625c0cd10dbSAlex Elder if ((size_t)ret < size) { 36264156d998SAlex Elder ret = -ENXIO; 362706ecc6cbSAlex Elder rbd_warn(rbd_dev, "short header read (want %zd got %d)", 362806ecc6cbSAlex Elder size, ret); 3629662518b1SAlex Elder goto out; 36304156d998SAlex Elder } 36314156d998SAlex Elder if (!rbd_dev_ondisk_valid(ondisk)) { 36324156d998SAlex Elder ret = -ENXIO; 363306ecc6cbSAlex Elder rbd_warn(rbd_dev, "invalid header"); 3634662518b1SAlex Elder goto out; 36354156d998SAlex Elder } 36364156d998SAlex Elder 36374156d998SAlex Elder names_size = le64_to_cpu(ondisk->snap_names_len); 36384156d998SAlex Elder want_count = snap_count; 36394156d998SAlex Elder snap_count = le32_to_cpu(ondisk->snap_count); 36404156d998SAlex Elder } while (snap_count != want_count); 36414156d998SAlex Elder 3642662518b1SAlex Elder ret = rbd_header_from_disk(rbd_dev, ondisk); 3643662518b1SAlex Elder out: 36444156d998SAlex Elder kfree(ondisk); 36454156d998SAlex Elder 3646dfc5606dSYehuda Sadeh return ret; 3647602adf40SYehuda Sadeh } 3648602adf40SYehuda Sadeh 364915228edeSAlex Elder /* 365015228edeSAlex Elder * Clear the rbd device's EXISTS flag if the snapshot it's mapped to 365115228edeSAlex Elder * has disappeared from the (just updated) snapshot context. 365215228edeSAlex Elder */ 365315228edeSAlex Elder static void rbd_exists_validate(struct rbd_device *rbd_dev) 365415228edeSAlex Elder { 365515228edeSAlex Elder u64 snap_id; 365615228edeSAlex Elder 365715228edeSAlex Elder if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) 365815228edeSAlex Elder return; 365915228edeSAlex Elder 366015228edeSAlex Elder snap_id = rbd_dev->spec->snap_id; 366115228edeSAlex Elder if (snap_id == CEPH_NOSNAP) 366215228edeSAlex Elder return; 366315228edeSAlex Elder 366415228edeSAlex Elder if (rbd_dev_snap_index(rbd_dev, snap_id) == BAD_SNAP_INDEX) 366515228edeSAlex Elder clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags); 366615228edeSAlex Elder } 366715228edeSAlex Elder 36689875201eSJosh Durgin static void rbd_dev_update_size(struct rbd_device *rbd_dev) 36699875201eSJosh Durgin { 36709875201eSJosh Durgin sector_t size; 36719875201eSJosh Durgin bool removing; 36729875201eSJosh Durgin 36739875201eSJosh Durgin /* 36749875201eSJosh Durgin * Don't hold the lock while doing disk operations, 36759875201eSJosh Durgin * or lock ordering will conflict with the bdev mutex via: 36769875201eSJosh Durgin * rbd_add() -> blkdev_get() -> rbd_open() 36779875201eSJosh Durgin */ 36789875201eSJosh Durgin spin_lock_irq(&rbd_dev->lock); 36799875201eSJosh Durgin removing = test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags); 36809875201eSJosh Durgin spin_unlock_irq(&rbd_dev->lock); 36819875201eSJosh Durgin /* 36829875201eSJosh Durgin * If the device is being removed, rbd_dev->disk has 36839875201eSJosh Durgin * been destroyed, so don't try to update its size 36849875201eSJosh Durgin */ 36859875201eSJosh Durgin if (!removing) { 36869875201eSJosh Durgin size = (sector_t)rbd_dev->mapping.size / SECTOR_SIZE; 36879875201eSJosh Durgin dout("setting size to %llu sectors", (unsigned long long)size); 36889875201eSJosh Durgin set_capacity(rbd_dev->disk, size); 36899875201eSJosh Durgin revalidate_disk(rbd_dev->disk); 36909875201eSJosh Durgin } 36919875201eSJosh Durgin } 36929875201eSJosh Durgin 3693cc4a38bdSAlex Elder static int rbd_dev_refresh(struct rbd_device *rbd_dev) 36941fe5e993SAlex Elder { 3695e627db08SAlex Elder u64 mapping_size; 36961fe5e993SAlex Elder int ret; 36971fe5e993SAlex Elder 3698cfbf6377SAlex Elder down_write(&rbd_dev->header_rwsem); 36993b5cf2a2SAlex Elder mapping_size = rbd_dev->mapping.size; 3700a720ae09SIlya Dryomov 3701a720ae09SIlya Dryomov ret = rbd_dev_header_info(rbd_dev); 370252bb1f9bSIlya Dryomov if (ret) 370373e39e4dSIlya Dryomov goto out; 370415228edeSAlex Elder 3705e8f59b59SIlya Dryomov /* 3706e8f59b59SIlya Dryomov * If there is a parent, see if it has disappeared due to the 3707e8f59b59SIlya Dryomov * mapped image getting flattened. 3708e8f59b59SIlya Dryomov */ 3709e8f59b59SIlya Dryomov if (rbd_dev->parent) { 3710e8f59b59SIlya Dryomov ret = rbd_dev_v2_parent_info(rbd_dev); 3711e8f59b59SIlya Dryomov if (ret) 371273e39e4dSIlya Dryomov goto out; 3713e8f59b59SIlya Dryomov } 3714e8f59b59SIlya Dryomov 37155ff1108cSIlya Dryomov if (rbd_dev->spec->snap_id == CEPH_NOSNAP) { 37165ff1108cSIlya Dryomov rbd_dev->mapping.size = rbd_dev->header.image_size; 37175ff1108cSIlya Dryomov } else { 37185ff1108cSIlya Dryomov /* validate mapped snapshot's EXISTS flag */ 371915228edeSAlex Elder rbd_exists_validate(rbd_dev); 37205ff1108cSIlya Dryomov } 37215ff1108cSIlya Dryomov 372273e39e4dSIlya Dryomov out: 3723cfbf6377SAlex Elder up_write(&rbd_dev->header_rwsem); 372473e39e4dSIlya Dryomov if (!ret && mapping_size != rbd_dev->mapping.size) 37259875201eSJosh Durgin rbd_dev_update_size(rbd_dev); 37261fe5e993SAlex Elder 372773e39e4dSIlya Dryomov return ret; 37281fe5e993SAlex Elder } 37291fe5e993SAlex Elder 37307ad18afaSChristoph Hellwig static int rbd_init_request(void *data, struct request *rq, 37317ad18afaSChristoph Hellwig unsigned int hctx_idx, unsigned int request_idx, 37327ad18afaSChristoph Hellwig unsigned int numa_node) 37337ad18afaSChristoph Hellwig { 37347ad18afaSChristoph Hellwig struct work_struct *work = blk_mq_rq_to_pdu(rq); 37357ad18afaSChristoph Hellwig 37367ad18afaSChristoph Hellwig INIT_WORK(work, rbd_queue_workfn); 37377ad18afaSChristoph Hellwig return 0; 37387ad18afaSChristoph Hellwig } 37397ad18afaSChristoph Hellwig 37407ad18afaSChristoph Hellwig static struct blk_mq_ops rbd_mq_ops = { 37417ad18afaSChristoph Hellwig .queue_rq = rbd_queue_rq, 37427ad18afaSChristoph Hellwig .map_queue = blk_mq_map_queue, 37437ad18afaSChristoph Hellwig .init_request = rbd_init_request, 37447ad18afaSChristoph Hellwig }; 37457ad18afaSChristoph Hellwig 3746602adf40SYehuda Sadeh static int rbd_init_disk(struct rbd_device *rbd_dev) 3747602adf40SYehuda Sadeh { 3748602adf40SYehuda Sadeh struct gendisk *disk; 3749602adf40SYehuda Sadeh struct request_queue *q; 3750593a9e7bSAlex Elder u64 segment_size; 37517ad18afaSChristoph Hellwig int err; 3752602adf40SYehuda Sadeh 3753602adf40SYehuda Sadeh /* create gendisk info */ 37547e513d43SIlya Dryomov disk = alloc_disk(single_major ? 37557e513d43SIlya Dryomov (1 << RBD_SINGLE_MAJOR_PART_SHIFT) : 37567e513d43SIlya Dryomov RBD_MINORS_PER_MAJOR); 3757602adf40SYehuda Sadeh if (!disk) 37581fcdb8aaSAlex Elder return -ENOMEM; 3759602adf40SYehuda Sadeh 3760f0f8cef5SAlex Elder snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d", 3761de71a297SAlex Elder rbd_dev->dev_id); 3762602adf40SYehuda Sadeh disk->major = rbd_dev->major; 3763dd82fff1SIlya Dryomov disk->first_minor = rbd_dev->minor; 37647e513d43SIlya Dryomov if (single_major) 37657e513d43SIlya Dryomov disk->flags |= GENHD_FL_EXT_DEVT; 3766602adf40SYehuda Sadeh disk->fops = &rbd_bd_ops; 3767602adf40SYehuda Sadeh disk->private_data = rbd_dev; 3768602adf40SYehuda Sadeh 37697ad18afaSChristoph Hellwig memset(&rbd_dev->tag_set, 0, sizeof(rbd_dev->tag_set)); 37707ad18afaSChristoph Hellwig rbd_dev->tag_set.ops = &rbd_mq_ops; 37717ad18afaSChristoph Hellwig rbd_dev->tag_set.queue_depth = BLKDEV_MAX_RQ; 37727ad18afaSChristoph Hellwig rbd_dev->tag_set.numa_node = NUMA_NO_NODE; 37737ad18afaSChristoph Hellwig rbd_dev->tag_set.flags = 37747ad18afaSChristoph Hellwig BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE; 37757ad18afaSChristoph Hellwig rbd_dev->tag_set.nr_hw_queues = 1; 37767ad18afaSChristoph Hellwig rbd_dev->tag_set.cmd_size = sizeof(struct work_struct); 37777ad18afaSChristoph Hellwig 37787ad18afaSChristoph Hellwig err = blk_mq_alloc_tag_set(&rbd_dev->tag_set); 37797ad18afaSChristoph Hellwig if (err) 3780602adf40SYehuda Sadeh goto out_disk; 3781029bcbd8SJosh Durgin 37827ad18afaSChristoph Hellwig q = blk_mq_init_queue(&rbd_dev->tag_set); 37837ad18afaSChristoph Hellwig if (IS_ERR(q)) { 37847ad18afaSChristoph Hellwig err = PTR_ERR(q); 37857ad18afaSChristoph Hellwig goto out_tag_set; 37867ad18afaSChristoph Hellwig } 37877ad18afaSChristoph Hellwig 3788d8a2c89cSIlya Dryomov queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q); 3789d8a2c89cSIlya Dryomov /* QUEUE_FLAG_ADD_RANDOM is off by default for blk-mq */ 3790593a9e7bSAlex Elder 3791029bcbd8SJosh Durgin /* set io sizes to object size */ 3792593a9e7bSAlex Elder segment_size = rbd_obj_bytes(&rbd_dev->header); 3793593a9e7bSAlex Elder blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE); 3794593a9e7bSAlex Elder blk_queue_max_segment_size(q, segment_size); 3795593a9e7bSAlex Elder blk_queue_io_min(q, segment_size); 3796593a9e7bSAlex Elder blk_queue_io_opt(q, segment_size); 3797029bcbd8SJosh Durgin 379890e98c52SGuangliang Zhao /* enable the discard support */ 379990e98c52SGuangliang Zhao queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q); 380090e98c52SGuangliang Zhao q->limits.discard_granularity = segment_size; 380190e98c52SGuangliang Zhao q->limits.discard_alignment = segment_size; 3802b76f8239SJosh Durgin q->limits.max_discard_sectors = segment_size / SECTOR_SIZE; 3803b76f8239SJosh Durgin q->limits.discard_zeroes_data = 1; 380490e98c52SGuangliang Zhao 3805602adf40SYehuda Sadeh blk_queue_merge_bvec(q, rbd_merge_bvec); 3806602adf40SYehuda Sadeh disk->queue = q; 3807602adf40SYehuda Sadeh 3808602adf40SYehuda Sadeh q->queuedata = rbd_dev; 3809602adf40SYehuda Sadeh 3810602adf40SYehuda Sadeh rbd_dev->disk = disk; 3811602adf40SYehuda Sadeh 3812602adf40SYehuda Sadeh return 0; 38137ad18afaSChristoph Hellwig out_tag_set: 38147ad18afaSChristoph Hellwig blk_mq_free_tag_set(&rbd_dev->tag_set); 3815602adf40SYehuda Sadeh out_disk: 3816602adf40SYehuda Sadeh put_disk(disk); 38177ad18afaSChristoph Hellwig return err; 3818602adf40SYehuda Sadeh } 3819602adf40SYehuda Sadeh 3820dfc5606dSYehuda Sadeh /* 3821dfc5606dSYehuda Sadeh sysfs 3822dfc5606dSYehuda Sadeh */ 3823602adf40SYehuda Sadeh 3824593a9e7bSAlex Elder static struct rbd_device *dev_to_rbd_dev(struct device *dev) 3825593a9e7bSAlex Elder { 3826593a9e7bSAlex Elder return container_of(dev, struct rbd_device, dev); 3827593a9e7bSAlex Elder } 3828593a9e7bSAlex Elder 3829dfc5606dSYehuda Sadeh static ssize_t rbd_size_show(struct device *dev, 3830dfc5606dSYehuda Sadeh struct device_attribute *attr, char *buf) 3831602adf40SYehuda Sadeh { 3832593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 3833dfc5606dSYehuda Sadeh 3834fc71d833SAlex Elder return sprintf(buf, "%llu\n", 3835fc71d833SAlex Elder (unsigned long long)rbd_dev->mapping.size); 3836602adf40SYehuda Sadeh } 3837602adf40SYehuda Sadeh 383834b13184SAlex Elder /* 383934b13184SAlex Elder * Note this shows the features for whatever's mapped, which is not 384034b13184SAlex Elder * necessarily the base image. 384134b13184SAlex Elder */ 384234b13184SAlex Elder static ssize_t rbd_features_show(struct device *dev, 384334b13184SAlex Elder struct device_attribute *attr, char *buf) 384434b13184SAlex Elder { 384534b13184SAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 384634b13184SAlex Elder 384734b13184SAlex Elder return sprintf(buf, "0x%016llx\n", 384834b13184SAlex Elder (unsigned long long)rbd_dev->mapping.features); 384934b13184SAlex Elder } 385034b13184SAlex Elder 3851dfc5606dSYehuda Sadeh static ssize_t rbd_major_show(struct device *dev, 3852dfc5606dSYehuda Sadeh struct device_attribute *attr, char *buf) 3853602adf40SYehuda Sadeh { 3854593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 3855dfc5606dSYehuda Sadeh 3856fc71d833SAlex Elder if (rbd_dev->major) 3857dfc5606dSYehuda Sadeh return sprintf(buf, "%d\n", rbd_dev->major); 3858fc71d833SAlex Elder 3859fc71d833SAlex Elder return sprintf(buf, "(none)\n"); 3860dd82fff1SIlya Dryomov } 3861fc71d833SAlex Elder 3862dd82fff1SIlya Dryomov static ssize_t rbd_minor_show(struct device *dev, 3863dd82fff1SIlya Dryomov struct device_attribute *attr, char *buf) 3864dd82fff1SIlya Dryomov { 3865dd82fff1SIlya Dryomov struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 3866dd82fff1SIlya Dryomov 3867dd82fff1SIlya Dryomov return sprintf(buf, "%d\n", rbd_dev->minor); 3868dfc5606dSYehuda Sadeh } 3869dfc5606dSYehuda Sadeh 3870dfc5606dSYehuda Sadeh static ssize_t rbd_client_id_show(struct device *dev, 3871dfc5606dSYehuda Sadeh struct device_attribute *attr, char *buf) 3872dfc5606dSYehuda Sadeh { 3873593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 3874dfc5606dSYehuda Sadeh 38751dbb4399SAlex Elder return sprintf(buf, "client%lld\n", 38761dbb4399SAlex Elder ceph_client_id(rbd_dev->rbd_client->client)); 3877dfc5606dSYehuda Sadeh } 3878dfc5606dSYehuda Sadeh 3879dfc5606dSYehuda Sadeh static ssize_t rbd_pool_show(struct device *dev, 3880dfc5606dSYehuda Sadeh struct device_attribute *attr, char *buf) 3881dfc5606dSYehuda Sadeh { 3882593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 3883dfc5606dSYehuda Sadeh 38840d7dbfceSAlex Elder return sprintf(buf, "%s\n", rbd_dev->spec->pool_name); 3885dfc5606dSYehuda Sadeh } 3886dfc5606dSYehuda Sadeh 38879bb2f334SAlex Elder static ssize_t rbd_pool_id_show(struct device *dev, 38889bb2f334SAlex Elder struct device_attribute *attr, char *buf) 38899bb2f334SAlex Elder { 38909bb2f334SAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 38919bb2f334SAlex Elder 38920d7dbfceSAlex Elder return sprintf(buf, "%llu\n", 38930d7dbfceSAlex Elder (unsigned long long) rbd_dev->spec->pool_id); 38949bb2f334SAlex Elder } 38959bb2f334SAlex Elder 3896dfc5606dSYehuda Sadeh static ssize_t rbd_name_show(struct device *dev, 3897dfc5606dSYehuda Sadeh struct device_attribute *attr, char *buf) 3898dfc5606dSYehuda Sadeh { 3899593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 3900dfc5606dSYehuda Sadeh 3901a92ffdf8SAlex Elder if (rbd_dev->spec->image_name) 39020d7dbfceSAlex Elder return sprintf(buf, "%s\n", rbd_dev->spec->image_name); 3903a92ffdf8SAlex Elder 3904a92ffdf8SAlex Elder return sprintf(buf, "(unknown)\n"); 3905dfc5606dSYehuda Sadeh } 3906dfc5606dSYehuda Sadeh 3907589d30e0SAlex Elder static ssize_t rbd_image_id_show(struct device *dev, 3908589d30e0SAlex Elder struct device_attribute *attr, char *buf) 3909589d30e0SAlex Elder { 3910589d30e0SAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 3911589d30e0SAlex Elder 39120d7dbfceSAlex Elder return sprintf(buf, "%s\n", rbd_dev->spec->image_id); 3913589d30e0SAlex Elder } 3914589d30e0SAlex Elder 391534b13184SAlex Elder /* 391634b13184SAlex Elder * Shows the name of the currently-mapped snapshot (or 391734b13184SAlex Elder * RBD_SNAP_HEAD_NAME for the base image). 391834b13184SAlex Elder */ 3919dfc5606dSYehuda Sadeh static ssize_t rbd_snap_show(struct device *dev, 3920dfc5606dSYehuda Sadeh struct device_attribute *attr, 3921dfc5606dSYehuda Sadeh char *buf) 3922dfc5606dSYehuda Sadeh { 3923593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 3924dfc5606dSYehuda Sadeh 39250d7dbfceSAlex Elder return sprintf(buf, "%s\n", rbd_dev->spec->snap_name); 3926dfc5606dSYehuda Sadeh } 3927dfc5606dSYehuda Sadeh 392886b00e0dSAlex Elder /* 3929ff96128fSIlya Dryomov * For a v2 image, shows the chain of parent images, separated by empty 3930ff96128fSIlya Dryomov * lines. For v1 images or if there is no parent, shows "(no parent 3931ff96128fSIlya Dryomov * image)". 393286b00e0dSAlex Elder */ 393386b00e0dSAlex Elder static ssize_t rbd_parent_show(struct device *dev, 393486b00e0dSAlex Elder struct device_attribute *attr, 393586b00e0dSAlex Elder char *buf) 393686b00e0dSAlex Elder { 393786b00e0dSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 3938ff96128fSIlya Dryomov ssize_t count = 0; 393986b00e0dSAlex Elder 3940ff96128fSIlya Dryomov if (!rbd_dev->parent) 394186b00e0dSAlex Elder return sprintf(buf, "(no parent image)\n"); 394286b00e0dSAlex Elder 3943ff96128fSIlya Dryomov for ( ; rbd_dev->parent; rbd_dev = rbd_dev->parent) { 3944ff96128fSIlya Dryomov struct rbd_spec *spec = rbd_dev->parent_spec; 394586b00e0dSAlex Elder 3946ff96128fSIlya Dryomov count += sprintf(&buf[count], "%s" 3947ff96128fSIlya Dryomov "pool_id %llu\npool_name %s\n" 3948ff96128fSIlya Dryomov "image_id %s\nimage_name %s\n" 3949ff96128fSIlya Dryomov "snap_id %llu\nsnap_name %s\n" 3950ff96128fSIlya Dryomov "overlap %llu\n", 3951ff96128fSIlya Dryomov !count ? "" : "\n", /* first? */ 3952ff96128fSIlya Dryomov spec->pool_id, spec->pool_name, 3953ff96128fSIlya Dryomov spec->image_id, spec->image_name ?: "(unknown)", 3954ff96128fSIlya Dryomov spec->snap_id, spec->snap_name, 3955ff96128fSIlya Dryomov rbd_dev->parent_overlap); 3956ff96128fSIlya Dryomov } 395786b00e0dSAlex Elder 395886b00e0dSAlex Elder return count; 395986b00e0dSAlex Elder } 396086b00e0dSAlex Elder 3961dfc5606dSYehuda Sadeh static ssize_t rbd_image_refresh(struct device *dev, 3962dfc5606dSYehuda Sadeh struct device_attribute *attr, 3963dfc5606dSYehuda Sadeh const char *buf, 3964dfc5606dSYehuda Sadeh size_t size) 3965dfc5606dSYehuda Sadeh { 3966593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 3967b813623aSAlex Elder int ret; 3968602adf40SYehuda Sadeh 3969cc4a38bdSAlex Elder ret = rbd_dev_refresh(rbd_dev); 3970e627db08SAlex Elder if (ret) 397152bb1f9bSIlya Dryomov return ret; 3972b813623aSAlex Elder 397352bb1f9bSIlya Dryomov return size; 3974dfc5606dSYehuda Sadeh } 3975602adf40SYehuda Sadeh 3976dfc5606dSYehuda Sadeh static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL); 397734b13184SAlex Elder static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL); 3978dfc5606dSYehuda Sadeh static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL); 3979dd82fff1SIlya Dryomov static DEVICE_ATTR(minor, S_IRUGO, rbd_minor_show, NULL); 3980dfc5606dSYehuda Sadeh static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL); 3981dfc5606dSYehuda Sadeh static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL); 39829bb2f334SAlex Elder static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL); 3983dfc5606dSYehuda Sadeh static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL); 3984589d30e0SAlex Elder static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL); 3985dfc5606dSYehuda Sadeh static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh); 3986dfc5606dSYehuda Sadeh static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL); 398786b00e0dSAlex Elder static DEVICE_ATTR(parent, S_IRUGO, rbd_parent_show, NULL); 3988dfc5606dSYehuda Sadeh 3989dfc5606dSYehuda Sadeh static struct attribute *rbd_attrs[] = { 3990dfc5606dSYehuda Sadeh &dev_attr_size.attr, 399134b13184SAlex Elder &dev_attr_features.attr, 3992dfc5606dSYehuda Sadeh &dev_attr_major.attr, 3993dd82fff1SIlya Dryomov &dev_attr_minor.attr, 3994dfc5606dSYehuda Sadeh &dev_attr_client_id.attr, 3995dfc5606dSYehuda Sadeh &dev_attr_pool.attr, 39969bb2f334SAlex Elder &dev_attr_pool_id.attr, 3997dfc5606dSYehuda Sadeh &dev_attr_name.attr, 3998589d30e0SAlex Elder &dev_attr_image_id.attr, 3999dfc5606dSYehuda Sadeh &dev_attr_current_snap.attr, 400086b00e0dSAlex Elder &dev_attr_parent.attr, 4001dfc5606dSYehuda Sadeh &dev_attr_refresh.attr, 4002dfc5606dSYehuda Sadeh NULL 4003dfc5606dSYehuda Sadeh }; 4004dfc5606dSYehuda Sadeh 4005dfc5606dSYehuda Sadeh static struct attribute_group rbd_attr_group = { 4006dfc5606dSYehuda Sadeh .attrs = rbd_attrs, 4007dfc5606dSYehuda Sadeh }; 4008dfc5606dSYehuda Sadeh 4009dfc5606dSYehuda Sadeh static const struct attribute_group *rbd_attr_groups[] = { 4010dfc5606dSYehuda Sadeh &rbd_attr_group, 4011dfc5606dSYehuda Sadeh NULL 4012dfc5606dSYehuda Sadeh }; 4013dfc5606dSYehuda Sadeh 4014dfc5606dSYehuda Sadeh static void rbd_sysfs_dev_release(struct device *dev) 4015dfc5606dSYehuda Sadeh { 4016dfc5606dSYehuda Sadeh } 4017dfc5606dSYehuda Sadeh 4018dfc5606dSYehuda Sadeh static struct device_type rbd_device_type = { 4019dfc5606dSYehuda Sadeh .name = "rbd", 4020dfc5606dSYehuda Sadeh .groups = rbd_attr_groups, 4021dfc5606dSYehuda Sadeh .release = rbd_sysfs_dev_release, 4022dfc5606dSYehuda Sadeh }; 4023dfc5606dSYehuda Sadeh 40248b8fb99cSAlex Elder static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec) 40258b8fb99cSAlex Elder { 40268b8fb99cSAlex Elder kref_get(&spec->kref); 40278b8fb99cSAlex Elder 40288b8fb99cSAlex Elder return spec; 40298b8fb99cSAlex Elder } 40308b8fb99cSAlex Elder 40318b8fb99cSAlex Elder static void rbd_spec_free(struct kref *kref); 40328b8fb99cSAlex Elder static void rbd_spec_put(struct rbd_spec *spec) 40338b8fb99cSAlex Elder { 40348b8fb99cSAlex Elder if (spec) 40358b8fb99cSAlex Elder kref_put(&spec->kref, rbd_spec_free); 40368b8fb99cSAlex Elder } 40378b8fb99cSAlex Elder 40388b8fb99cSAlex Elder static struct rbd_spec *rbd_spec_alloc(void) 40398b8fb99cSAlex Elder { 40408b8fb99cSAlex Elder struct rbd_spec *spec; 40418b8fb99cSAlex Elder 40428b8fb99cSAlex Elder spec = kzalloc(sizeof (*spec), GFP_KERNEL); 40438b8fb99cSAlex Elder if (!spec) 40448b8fb99cSAlex Elder return NULL; 404504077599SIlya Dryomov 404604077599SIlya Dryomov spec->pool_id = CEPH_NOPOOL; 404704077599SIlya Dryomov spec->snap_id = CEPH_NOSNAP; 40488b8fb99cSAlex Elder kref_init(&spec->kref); 40498b8fb99cSAlex Elder 40508b8fb99cSAlex Elder return spec; 40518b8fb99cSAlex Elder } 40528b8fb99cSAlex Elder 40538b8fb99cSAlex Elder static void rbd_spec_free(struct kref *kref) 40548b8fb99cSAlex Elder { 40558b8fb99cSAlex Elder struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref); 40568b8fb99cSAlex Elder 40578b8fb99cSAlex Elder kfree(spec->pool_name); 40588b8fb99cSAlex Elder kfree(spec->image_id); 40598b8fb99cSAlex Elder kfree(spec->image_name); 40608b8fb99cSAlex Elder kfree(spec->snap_name); 40618b8fb99cSAlex Elder kfree(spec); 40628b8fb99cSAlex Elder } 40638b8fb99cSAlex Elder 4064cc344fa1SAlex Elder static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc, 4065c53d5893SAlex Elder struct rbd_spec *spec) 4066c53d5893SAlex Elder { 4067c53d5893SAlex Elder struct rbd_device *rbd_dev; 4068c53d5893SAlex Elder 4069c53d5893SAlex Elder rbd_dev = kzalloc(sizeof (*rbd_dev), GFP_KERNEL); 4070c53d5893SAlex Elder if (!rbd_dev) 4071c53d5893SAlex Elder return NULL; 4072c53d5893SAlex Elder 4073c53d5893SAlex Elder spin_lock_init(&rbd_dev->lock); 40746d292906SAlex Elder rbd_dev->flags = 0; 4075a2acd00eSAlex Elder atomic_set(&rbd_dev->parent_ref, 0); 4076c53d5893SAlex Elder INIT_LIST_HEAD(&rbd_dev->node); 4077c53d5893SAlex Elder init_rwsem(&rbd_dev->header_rwsem); 4078c53d5893SAlex Elder 4079c53d5893SAlex Elder rbd_dev->spec = spec; 4080c53d5893SAlex Elder rbd_dev->rbd_client = rbdc; 4081c53d5893SAlex Elder 40820903e875SAlex Elder /* Initialize the layout used for all rbd requests */ 40830903e875SAlex Elder 40840903e875SAlex Elder rbd_dev->layout.fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER); 40850903e875SAlex Elder rbd_dev->layout.fl_stripe_count = cpu_to_le32(1); 40860903e875SAlex Elder rbd_dev->layout.fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER); 40870903e875SAlex Elder rbd_dev->layout.fl_pg_pool = cpu_to_le32((u32) spec->pool_id); 40880903e875SAlex Elder 4089c53d5893SAlex Elder return rbd_dev; 4090c53d5893SAlex Elder } 4091c53d5893SAlex Elder 4092c53d5893SAlex Elder static void rbd_dev_destroy(struct rbd_device *rbd_dev) 4093c53d5893SAlex Elder { 4094c53d5893SAlex Elder rbd_put_client(rbd_dev->rbd_client); 4095c53d5893SAlex Elder rbd_spec_put(rbd_dev->spec); 4096c53d5893SAlex Elder kfree(rbd_dev); 4097c53d5893SAlex Elder } 4098c53d5893SAlex Elder 4099dfc5606dSYehuda Sadeh /* 41009d475de5SAlex Elder * Get the size and object order for an image snapshot, or if 41019d475de5SAlex Elder * snap_id is CEPH_NOSNAP, gets this information for the base 41029d475de5SAlex Elder * image. 41039d475de5SAlex Elder */ 41049d475de5SAlex Elder static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id, 41059d475de5SAlex Elder u8 *order, u64 *snap_size) 41069d475de5SAlex Elder { 41079d475de5SAlex Elder __le64 snapid = cpu_to_le64(snap_id); 41089d475de5SAlex Elder int ret; 41099d475de5SAlex Elder struct { 41109d475de5SAlex Elder u8 order; 41119d475de5SAlex Elder __le64 size; 41129d475de5SAlex Elder } __attribute__ ((packed)) size_buf = { 0 }; 41139d475de5SAlex Elder 411436be9a76SAlex Elder ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, 41159d475de5SAlex Elder "rbd", "get_size", 41164157976bSAlex Elder &snapid, sizeof (snapid), 4117e2a58ee5SAlex Elder &size_buf, sizeof (size_buf)); 411836be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 41199d475de5SAlex Elder if (ret < 0) 41209d475de5SAlex Elder return ret; 412157385b51SAlex Elder if (ret < sizeof (size_buf)) 412257385b51SAlex Elder return -ERANGE; 41239d475de5SAlex Elder 4124c3545579SJosh Durgin if (order) { 41259d475de5SAlex Elder *order = size_buf.order; 4126c3545579SJosh Durgin dout(" order %u", (unsigned int)*order); 4127c3545579SJosh Durgin } 41289d475de5SAlex Elder *snap_size = le64_to_cpu(size_buf.size); 41299d475de5SAlex Elder 4130c3545579SJosh Durgin dout(" snap_id 0x%016llx snap_size = %llu\n", 4131c3545579SJosh Durgin (unsigned long long)snap_id, 41329d475de5SAlex Elder (unsigned long long)*snap_size); 41339d475de5SAlex Elder 41349d475de5SAlex Elder return 0; 41359d475de5SAlex Elder } 41369d475de5SAlex Elder 41379d475de5SAlex Elder static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev) 41389d475de5SAlex Elder { 41399d475de5SAlex Elder return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP, 41409d475de5SAlex Elder &rbd_dev->header.obj_order, 41419d475de5SAlex Elder &rbd_dev->header.image_size); 41429d475de5SAlex Elder } 41439d475de5SAlex Elder 41441e130199SAlex Elder static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev) 41451e130199SAlex Elder { 41461e130199SAlex Elder void *reply_buf; 41471e130199SAlex Elder int ret; 41481e130199SAlex Elder void *p; 41491e130199SAlex Elder 41501e130199SAlex Elder reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL); 41511e130199SAlex Elder if (!reply_buf) 41521e130199SAlex Elder return -ENOMEM; 41531e130199SAlex Elder 415436be9a76SAlex Elder ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, 41554157976bSAlex Elder "rbd", "get_object_prefix", NULL, 0, 4156e2a58ee5SAlex Elder reply_buf, RBD_OBJ_PREFIX_LEN_MAX); 415736be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 41581e130199SAlex Elder if (ret < 0) 41591e130199SAlex Elder goto out; 41601e130199SAlex Elder 41611e130199SAlex Elder p = reply_buf; 41621e130199SAlex Elder rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p, 416357385b51SAlex Elder p + ret, NULL, GFP_NOIO); 416457385b51SAlex Elder ret = 0; 41651e130199SAlex Elder 41661e130199SAlex Elder if (IS_ERR(rbd_dev->header.object_prefix)) { 41671e130199SAlex Elder ret = PTR_ERR(rbd_dev->header.object_prefix); 41681e130199SAlex Elder rbd_dev->header.object_prefix = NULL; 41691e130199SAlex Elder } else { 41701e130199SAlex Elder dout(" object_prefix = %s\n", rbd_dev->header.object_prefix); 41711e130199SAlex Elder } 41721e130199SAlex Elder out: 41731e130199SAlex Elder kfree(reply_buf); 41741e130199SAlex Elder 41751e130199SAlex Elder return ret; 41761e130199SAlex Elder } 41771e130199SAlex Elder 4178b1b5402aSAlex Elder static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id, 4179b1b5402aSAlex Elder u64 *snap_features) 4180b1b5402aSAlex Elder { 4181b1b5402aSAlex Elder __le64 snapid = cpu_to_le64(snap_id); 4182b1b5402aSAlex Elder struct { 4183b1b5402aSAlex Elder __le64 features; 4184b1b5402aSAlex Elder __le64 incompat; 41854157976bSAlex Elder } __attribute__ ((packed)) features_buf = { 0 }; 4186d889140cSAlex Elder u64 incompat; 4187b1b5402aSAlex Elder int ret; 4188b1b5402aSAlex Elder 418936be9a76SAlex Elder ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, 4190b1b5402aSAlex Elder "rbd", "get_features", 41914157976bSAlex Elder &snapid, sizeof (snapid), 4192e2a58ee5SAlex Elder &features_buf, sizeof (features_buf)); 419336be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 4194b1b5402aSAlex Elder if (ret < 0) 4195b1b5402aSAlex Elder return ret; 419657385b51SAlex Elder if (ret < sizeof (features_buf)) 419757385b51SAlex Elder return -ERANGE; 4198d889140cSAlex Elder 4199d889140cSAlex Elder incompat = le64_to_cpu(features_buf.incompat); 42005cbf6f12SAlex Elder if (incompat & ~RBD_FEATURES_SUPPORTED) 4201b8f5c6edSAlex Elder return -ENXIO; 4202d889140cSAlex Elder 4203b1b5402aSAlex Elder *snap_features = le64_to_cpu(features_buf.features); 4204b1b5402aSAlex Elder 4205b1b5402aSAlex Elder dout(" snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n", 4206b1b5402aSAlex Elder (unsigned long long)snap_id, 4207b1b5402aSAlex Elder (unsigned long long)*snap_features, 4208b1b5402aSAlex Elder (unsigned long long)le64_to_cpu(features_buf.incompat)); 4209b1b5402aSAlex Elder 4210b1b5402aSAlex Elder return 0; 4211b1b5402aSAlex Elder } 4212b1b5402aSAlex Elder 4213b1b5402aSAlex Elder static int rbd_dev_v2_features(struct rbd_device *rbd_dev) 4214b1b5402aSAlex Elder { 4215b1b5402aSAlex Elder return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP, 4216b1b5402aSAlex Elder &rbd_dev->header.features); 4217b1b5402aSAlex Elder } 4218b1b5402aSAlex Elder 421986b00e0dSAlex Elder static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev) 422086b00e0dSAlex Elder { 422186b00e0dSAlex Elder struct rbd_spec *parent_spec; 422286b00e0dSAlex Elder size_t size; 422386b00e0dSAlex Elder void *reply_buf = NULL; 422486b00e0dSAlex Elder __le64 snapid; 422586b00e0dSAlex Elder void *p; 422686b00e0dSAlex Elder void *end; 4227642a2537SAlex Elder u64 pool_id; 422886b00e0dSAlex Elder char *image_id; 42293b5cf2a2SAlex Elder u64 snap_id; 423086b00e0dSAlex Elder u64 overlap; 423186b00e0dSAlex Elder int ret; 423286b00e0dSAlex Elder 423386b00e0dSAlex Elder parent_spec = rbd_spec_alloc(); 423486b00e0dSAlex Elder if (!parent_spec) 423586b00e0dSAlex Elder return -ENOMEM; 423686b00e0dSAlex Elder 423786b00e0dSAlex Elder size = sizeof (__le64) + /* pool_id */ 423886b00e0dSAlex Elder sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX + /* image_id */ 423986b00e0dSAlex Elder sizeof (__le64) + /* snap_id */ 424086b00e0dSAlex Elder sizeof (__le64); /* overlap */ 424186b00e0dSAlex Elder reply_buf = kmalloc(size, GFP_KERNEL); 424286b00e0dSAlex Elder if (!reply_buf) { 424386b00e0dSAlex Elder ret = -ENOMEM; 424486b00e0dSAlex Elder goto out_err; 424586b00e0dSAlex Elder } 424686b00e0dSAlex Elder 42474d9b67cdSIlya Dryomov snapid = cpu_to_le64(rbd_dev->spec->snap_id); 424836be9a76SAlex Elder ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, 424986b00e0dSAlex Elder "rbd", "get_parent", 42504157976bSAlex Elder &snapid, sizeof (snapid), 4251e2a58ee5SAlex Elder reply_buf, size); 425236be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 425386b00e0dSAlex Elder if (ret < 0) 425486b00e0dSAlex Elder goto out_err; 425586b00e0dSAlex Elder 425686b00e0dSAlex Elder p = reply_buf; 425757385b51SAlex Elder end = reply_buf + ret; 425857385b51SAlex Elder ret = -ERANGE; 4259642a2537SAlex Elder ceph_decode_64_safe(&p, end, pool_id, out_err); 4260392a9dadSAlex Elder if (pool_id == CEPH_NOPOOL) { 4261392a9dadSAlex Elder /* 4262392a9dadSAlex Elder * Either the parent never existed, or we have 4263392a9dadSAlex Elder * record of it but the image got flattened so it no 4264392a9dadSAlex Elder * longer has a parent. When the parent of a 4265392a9dadSAlex Elder * layered image disappears we immediately set the 4266392a9dadSAlex Elder * overlap to 0. The effect of this is that all new 4267392a9dadSAlex Elder * requests will be treated as if the image had no 4268392a9dadSAlex Elder * parent. 4269392a9dadSAlex Elder */ 4270392a9dadSAlex Elder if (rbd_dev->parent_overlap) { 4271392a9dadSAlex Elder rbd_dev->parent_overlap = 0; 4272392a9dadSAlex Elder rbd_dev_parent_put(rbd_dev); 4273392a9dadSAlex Elder pr_info("%s: clone image has been flattened\n", 4274392a9dadSAlex Elder rbd_dev->disk->disk_name); 4275392a9dadSAlex Elder } 4276392a9dadSAlex Elder 427786b00e0dSAlex Elder goto out; /* No parent? No problem. */ 4278392a9dadSAlex Elder } 427986b00e0dSAlex Elder 42800903e875SAlex Elder /* The ceph file layout needs to fit pool id in 32 bits */ 42810903e875SAlex Elder 42820903e875SAlex Elder ret = -EIO; 4283642a2537SAlex Elder if (pool_id > (u64)U32_MAX) { 42849584d508SIlya Dryomov rbd_warn(NULL, "parent pool id too large (%llu > %u)", 4285642a2537SAlex Elder (unsigned long long)pool_id, U32_MAX); 428657385b51SAlex Elder goto out_err; 4287c0cd10dbSAlex Elder } 42880903e875SAlex Elder 4289979ed480SAlex Elder image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL); 429086b00e0dSAlex Elder if (IS_ERR(image_id)) { 429186b00e0dSAlex Elder ret = PTR_ERR(image_id); 429286b00e0dSAlex Elder goto out_err; 429386b00e0dSAlex Elder } 42943b5cf2a2SAlex Elder ceph_decode_64_safe(&p, end, snap_id, out_err); 429586b00e0dSAlex Elder ceph_decode_64_safe(&p, end, overlap, out_err); 429686b00e0dSAlex Elder 42973b5cf2a2SAlex Elder /* 42983b5cf2a2SAlex Elder * The parent won't change (except when the clone is 42993b5cf2a2SAlex Elder * flattened, already handled that). So we only need to 43003b5cf2a2SAlex Elder * record the parent spec we have not already done so. 43013b5cf2a2SAlex Elder */ 43023b5cf2a2SAlex Elder if (!rbd_dev->parent_spec) { 43033b5cf2a2SAlex Elder parent_spec->pool_id = pool_id; 43043b5cf2a2SAlex Elder parent_spec->image_id = image_id; 43053b5cf2a2SAlex Elder parent_spec->snap_id = snap_id; 430686b00e0dSAlex Elder rbd_dev->parent_spec = parent_spec; 430786b00e0dSAlex Elder parent_spec = NULL; /* rbd_dev now owns this */ 4308fbba11b3SIlya Dryomov } else { 4309fbba11b3SIlya Dryomov kfree(image_id); 43103b5cf2a2SAlex Elder } 43113b5cf2a2SAlex Elder 43123b5cf2a2SAlex Elder /* 4313cf32bd9cSIlya Dryomov * We always update the parent overlap. If it's zero we issue 4314cf32bd9cSIlya Dryomov * a warning, as we will proceed as if there was no parent. 43153b5cf2a2SAlex Elder */ 43163b5cf2a2SAlex Elder if (!overlap) { 43173b5cf2a2SAlex Elder if (parent_spec) { 4318cf32bd9cSIlya Dryomov /* refresh, careful to warn just once */ 4319cf32bd9cSIlya Dryomov if (rbd_dev->parent_overlap) 4320cf32bd9cSIlya Dryomov rbd_warn(rbd_dev, 4321cf32bd9cSIlya Dryomov "clone now standalone (overlap became 0)"); 432270cf49cfSAlex Elder } else { 4323cf32bd9cSIlya Dryomov /* initial probe */ 4324cf32bd9cSIlya Dryomov rbd_warn(rbd_dev, "clone is standalone (overlap 0)"); 43253b5cf2a2SAlex Elder } 432670cf49cfSAlex Elder } 4327cf32bd9cSIlya Dryomov rbd_dev->parent_overlap = overlap; 4328cf32bd9cSIlya Dryomov 432986b00e0dSAlex Elder out: 433086b00e0dSAlex Elder ret = 0; 433186b00e0dSAlex Elder out_err: 433286b00e0dSAlex Elder kfree(reply_buf); 433386b00e0dSAlex Elder rbd_spec_put(parent_spec); 433486b00e0dSAlex Elder 433586b00e0dSAlex Elder return ret; 433686b00e0dSAlex Elder } 433786b00e0dSAlex Elder 4338cc070d59SAlex Elder static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev) 4339cc070d59SAlex Elder { 4340cc070d59SAlex Elder struct { 4341cc070d59SAlex Elder __le64 stripe_unit; 4342cc070d59SAlex Elder __le64 stripe_count; 4343cc070d59SAlex Elder } __attribute__ ((packed)) striping_info_buf = { 0 }; 4344cc070d59SAlex Elder size_t size = sizeof (striping_info_buf); 4345cc070d59SAlex Elder void *p; 4346cc070d59SAlex Elder u64 obj_size; 4347cc070d59SAlex Elder u64 stripe_unit; 4348cc070d59SAlex Elder u64 stripe_count; 4349cc070d59SAlex Elder int ret; 4350cc070d59SAlex Elder 4351cc070d59SAlex Elder ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, 4352cc070d59SAlex Elder "rbd", "get_stripe_unit_count", NULL, 0, 4353e2a58ee5SAlex Elder (char *)&striping_info_buf, size); 4354cc070d59SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 4355cc070d59SAlex Elder if (ret < 0) 4356cc070d59SAlex Elder return ret; 4357cc070d59SAlex Elder if (ret < size) 4358cc070d59SAlex Elder return -ERANGE; 4359cc070d59SAlex Elder 4360cc070d59SAlex Elder /* 4361cc070d59SAlex Elder * We don't actually support the "fancy striping" feature 4362cc070d59SAlex Elder * (STRIPINGV2) yet, but if the striping sizes are the 4363cc070d59SAlex Elder * defaults the behavior is the same as before. So find 4364cc070d59SAlex Elder * out, and only fail if the image has non-default values. 4365cc070d59SAlex Elder */ 4366cc070d59SAlex Elder ret = -EINVAL; 4367cc070d59SAlex Elder obj_size = (u64)1 << rbd_dev->header.obj_order; 4368cc070d59SAlex Elder p = &striping_info_buf; 4369cc070d59SAlex Elder stripe_unit = ceph_decode_64(&p); 4370cc070d59SAlex Elder if (stripe_unit != obj_size) { 4371cc070d59SAlex Elder rbd_warn(rbd_dev, "unsupported stripe unit " 4372cc070d59SAlex Elder "(got %llu want %llu)", 4373cc070d59SAlex Elder stripe_unit, obj_size); 4374cc070d59SAlex Elder return -EINVAL; 4375cc070d59SAlex Elder } 4376cc070d59SAlex Elder stripe_count = ceph_decode_64(&p); 4377cc070d59SAlex Elder if (stripe_count != 1) { 4378cc070d59SAlex Elder rbd_warn(rbd_dev, "unsupported stripe count " 4379cc070d59SAlex Elder "(got %llu want 1)", stripe_count); 4380cc070d59SAlex Elder return -EINVAL; 4381cc070d59SAlex Elder } 4382500d0c0fSAlex Elder rbd_dev->header.stripe_unit = stripe_unit; 4383500d0c0fSAlex Elder rbd_dev->header.stripe_count = stripe_count; 4384cc070d59SAlex Elder 4385cc070d59SAlex Elder return 0; 4386cc070d59SAlex Elder } 4387cc070d59SAlex Elder 43889e15b77dSAlex Elder static char *rbd_dev_image_name(struct rbd_device *rbd_dev) 43899e15b77dSAlex Elder { 43909e15b77dSAlex Elder size_t image_id_size; 43919e15b77dSAlex Elder char *image_id; 43929e15b77dSAlex Elder void *p; 43939e15b77dSAlex Elder void *end; 43949e15b77dSAlex Elder size_t size; 43959e15b77dSAlex Elder void *reply_buf = NULL; 43969e15b77dSAlex Elder size_t len = 0; 43979e15b77dSAlex Elder char *image_name = NULL; 43989e15b77dSAlex Elder int ret; 43999e15b77dSAlex Elder 44009e15b77dSAlex Elder rbd_assert(!rbd_dev->spec->image_name); 44019e15b77dSAlex Elder 440269e7a02fSAlex Elder len = strlen(rbd_dev->spec->image_id); 440369e7a02fSAlex Elder image_id_size = sizeof (__le32) + len; 44049e15b77dSAlex Elder image_id = kmalloc(image_id_size, GFP_KERNEL); 44059e15b77dSAlex Elder if (!image_id) 44069e15b77dSAlex Elder return NULL; 44079e15b77dSAlex Elder 44089e15b77dSAlex Elder p = image_id; 44094157976bSAlex Elder end = image_id + image_id_size; 441069e7a02fSAlex Elder ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32)len); 44119e15b77dSAlex Elder 44129e15b77dSAlex Elder size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX; 44139e15b77dSAlex Elder reply_buf = kmalloc(size, GFP_KERNEL); 44149e15b77dSAlex Elder if (!reply_buf) 44159e15b77dSAlex Elder goto out; 44169e15b77dSAlex Elder 441736be9a76SAlex Elder ret = rbd_obj_method_sync(rbd_dev, RBD_DIRECTORY, 44189e15b77dSAlex Elder "rbd", "dir_get_name", 44199e15b77dSAlex Elder image_id, image_id_size, 4420e2a58ee5SAlex Elder reply_buf, size); 44219e15b77dSAlex Elder if (ret < 0) 44229e15b77dSAlex Elder goto out; 44239e15b77dSAlex Elder p = reply_buf; 4424f40eb349SAlex Elder end = reply_buf + ret; 4425f40eb349SAlex Elder 44269e15b77dSAlex Elder image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL); 44279e15b77dSAlex Elder if (IS_ERR(image_name)) 44289e15b77dSAlex Elder image_name = NULL; 44299e15b77dSAlex Elder else 44309e15b77dSAlex Elder dout("%s: name is %s len is %zd\n", __func__, image_name, len); 44319e15b77dSAlex Elder out: 44329e15b77dSAlex Elder kfree(reply_buf); 44339e15b77dSAlex Elder kfree(image_id); 44349e15b77dSAlex Elder 44359e15b77dSAlex Elder return image_name; 44369e15b77dSAlex Elder } 44379e15b77dSAlex Elder 44382ad3d716SAlex Elder static u64 rbd_v1_snap_id_by_name(struct rbd_device *rbd_dev, const char *name) 44392ad3d716SAlex Elder { 44402ad3d716SAlex Elder struct ceph_snap_context *snapc = rbd_dev->header.snapc; 44412ad3d716SAlex Elder const char *snap_name; 44422ad3d716SAlex Elder u32 which = 0; 44432ad3d716SAlex Elder 44442ad3d716SAlex Elder /* Skip over names until we find the one we are looking for */ 44452ad3d716SAlex Elder 44462ad3d716SAlex Elder snap_name = rbd_dev->header.snap_names; 44472ad3d716SAlex Elder while (which < snapc->num_snaps) { 44482ad3d716SAlex Elder if (!strcmp(name, snap_name)) 44492ad3d716SAlex Elder return snapc->snaps[which]; 44502ad3d716SAlex Elder snap_name += strlen(snap_name) + 1; 44512ad3d716SAlex Elder which++; 44522ad3d716SAlex Elder } 44532ad3d716SAlex Elder return CEPH_NOSNAP; 44542ad3d716SAlex Elder } 44552ad3d716SAlex Elder 44562ad3d716SAlex Elder static u64 rbd_v2_snap_id_by_name(struct rbd_device *rbd_dev, const char *name) 44572ad3d716SAlex Elder { 44582ad3d716SAlex Elder struct ceph_snap_context *snapc = rbd_dev->header.snapc; 44592ad3d716SAlex Elder u32 which; 44602ad3d716SAlex Elder bool found = false; 44612ad3d716SAlex Elder u64 snap_id; 44622ad3d716SAlex Elder 44632ad3d716SAlex Elder for (which = 0; !found && which < snapc->num_snaps; which++) { 44642ad3d716SAlex Elder const char *snap_name; 44652ad3d716SAlex Elder 44662ad3d716SAlex Elder snap_id = snapc->snaps[which]; 44672ad3d716SAlex Elder snap_name = rbd_dev_v2_snap_name(rbd_dev, snap_id); 4468efadc98aSJosh Durgin if (IS_ERR(snap_name)) { 4469efadc98aSJosh Durgin /* ignore no-longer existing snapshots */ 4470efadc98aSJosh Durgin if (PTR_ERR(snap_name) == -ENOENT) 4471efadc98aSJosh Durgin continue; 4472efadc98aSJosh Durgin else 44732ad3d716SAlex Elder break; 4474efadc98aSJosh Durgin } 44752ad3d716SAlex Elder found = !strcmp(name, snap_name); 44762ad3d716SAlex Elder kfree(snap_name); 44772ad3d716SAlex Elder } 44782ad3d716SAlex Elder return found ? snap_id : CEPH_NOSNAP; 44792ad3d716SAlex Elder } 44802ad3d716SAlex Elder 44812ad3d716SAlex Elder /* 44822ad3d716SAlex Elder * Assumes name is never RBD_SNAP_HEAD_NAME; returns CEPH_NOSNAP if 44832ad3d716SAlex Elder * no snapshot by that name is found, or if an error occurs. 44842ad3d716SAlex Elder */ 44852ad3d716SAlex Elder static u64 rbd_snap_id_by_name(struct rbd_device *rbd_dev, const char *name) 44862ad3d716SAlex Elder { 44872ad3d716SAlex Elder if (rbd_dev->image_format == 1) 44882ad3d716SAlex Elder return rbd_v1_snap_id_by_name(rbd_dev, name); 44892ad3d716SAlex Elder 44902ad3d716SAlex Elder return rbd_v2_snap_id_by_name(rbd_dev, name); 44912ad3d716SAlex Elder } 44922ad3d716SAlex Elder 44939e15b77dSAlex Elder /* 449404077599SIlya Dryomov * An image being mapped will have everything but the snap id. 44959e15b77dSAlex Elder */ 449604077599SIlya Dryomov static int rbd_spec_fill_snap_id(struct rbd_device *rbd_dev) 449704077599SIlya Dryomov { 449804077599SIlya Dryomov struct rbd_spec *spec = rbd_dev->spec; 449904077599SIlya Dryomov 450004077599SIlya Dryomov rbd_assert(spec->pool_id != CEPH_NOPOOL && spec->pool_name); 450104077599SIlya Dryomov rbd_assert(spec->image_id && spec->image_name); 450204077599SIlya Dryomov rbd_assert(spec->snap_name); 450304077599SIlya Dryomov 450404077599SIlya Dryomov if (strcmp(spec->snap_name, RBD_SNAP_HEAD_NAME)) { 450504077599SIlya Dryomov u64 snap_id; 450604077599SIlya Dryomov 450704077599SIlya Dryomov snap_id = rbd_snap_id_by_name(rbd_dev, spec->snap_name); 450804077599SIlya Dryomov if (snap_id == CEPH_NOSNAP) 450904077599SIlya Dryomov return -ENOENT; 451004077599SIlya Dryomov 451104077599SIlya Dryomov spec->snap_id = snap_id; 451204077599SIlya Dryomov } else { 451304077599SIlya Dryomov spec->snap_id = CEPH_NOSNAP; 451404077599SIlya Dryomov } 451504077599SIlya Dryomov 451604077599SIlya Dryomov return 0; 451704077599SIlya Dryomov } 451804077599SIlya Dryomov 451904077599SIlya Dryomov /* 452004077599SIlya Dryomov * A parent image will have all ids but none of the names. 452104077599SIlya Dryomov * 452204077599SIlya Dryomov * All names in an rbd spec are dynamically allocated. It's OK if we 452304077599SIlya Dryomov * can't figure out the name for an image id. 452404077599SIlya Dryomov */ 452504077599SIlya Dryomov static int rbd_spec_fill_names(struct rbd_device *rbd_dev) 45269e15b77dSAlex Elder { 45272e9f7f1cSAlex Elder struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 45282e9f7f1cSAlex Elder struct rbd_spec *spec = rbd_dev->spec; 45292e9f7f1cSAlex Elder const char *pool_name; 45302e9f7f1cSAlex Elder const char *image_name; 45312e9f7f1cSAlex Elder const char *snap_name; 45329e15b77dSAlex Elder int ret; 45339e15b77dSAlex Elder 453404077599SIlya Dryomov rbd_assert(spec->pool_id != CEPH_NOPOOL); 453504077599SIlya Dryomov rbd_assert(spec->image_id); 453604077599SIlya Dryomov rbd_assert(spec->snap_id != CEPH_NOSNAP); 45379e15b77dSAlex Elder 45382e9f7f1cSAlex Elder /* Get the pool name; we have to make our own copy of this */ 45399e15b77dSAlex Elder 45402e9f7f1cSAlex Elder pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, spec->pool_id); 45412e9f7f1cSAlex Elder if (!pool_name) { 45422e9f7f1cSAlex Elder rbd_warn(rbd_dev, "no pool with id %llu", spec->pool_id); 4543935dc89fSAlex Elder return -EIO; 4544935dc89fSAlex Elder } 45452e9f7f1cSAlex Elder pool_name = kstrdup(pool_name, GFP_KERNEL); 45462e9f7f1cSAlex Elder if (!pool_name) 45479e15b77dSAlex Elder return -ENOMEM; 45489e15b77dSAlex Elder 45499e15b77dSAlex Elder /* Fetch the image name; tolerate failure here */ 45509e15b77dSAlex Elder 45512e9f7f1cSAlex Elder image_name = rbd_dev_image_name(rbd_dev); 45522e9f7f1cSAlex Elder if (!image_name) 455306ecc6cbSAlex Elder rbd_warn(rbd_dev, "unable to get image name"); 45549e15b77dSAlex Elder 455504077599SIlya Dryomov /* Fetch the snapshot name */ 45569e15b77dSAlex Elder 45572e9f7f1cSAlex Elder snap_name = rbd_snap_name(rbd_dev, spec->snap_id); 4558da6a6b63SJosh Durgin if (IS_ERR(snap_name)) { 4559da6a6b63SJosh Durgin ret = PTR_ERR(snap_name); 45609e15b77dSAlex Elder goto out_err; 45612e9f7f1cSAlex Elder } 45622e9f7f1cSAlex Elder 45632e9f7f1cSAlex Elder spec->pool_name = pool_name; 45642e9f7f1cSAlex Elder spec->image_name = image_name; 45652e9f7f1cSAlex Elder spec->snap_name = snap_name; 45669e15b77dSAlex Elder 45679e15b77dSAlex Elder return 0; 456804077599SIlya Dryomov 45699e15b77dSAlex Elder out_err: 45702e9f7f1cSAlex Elder kfree(image_name); 45712e9f7f1cSAlex Elder kfree(pool_name); 45729e15b77dSAlex Elder return ret; 45739e15b77dSAlex Elder } 45749e15b77dSAlex Elder 4575cc4a38bdSAlex Elder static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev) 457635d489f9SAlex Elder { 457735d489f9SAlex Elder size_t size; 457835d489f9SAlex Elder int ret; 457935d489f9SAlex Elder void *reply_buf; 458035d489f9SAlex Elder void *p; 458135d489f9SAlex Elder void *end; 458235d489f9SAlex Elder u64 seq; 458335d489f9SAlex Elder u32 snap_count; 458435d489f9SAlex Elder struct ceph_snap_context *snapc; 458535d489f9SAlex Elder u32 i; 458635d489f9SAlex Elder 458735d489f9SAlex Elder /* 458835d489f9SAlex Elder * We'll need room for the seq value (maximum snapshot id), 458935d489f9SAlex Elder * snapshot count, and array of that many snapshot ids. 459035d489f9SAlex Elder * For now we have a fixed upper limit on the number we're 459135d489f9SAlex Elder * prepared to receive. 459235d489f9SAlex Elder */ 459335d489f9SAlex Elder size = sizeof (__le64) + sizeof (__le32) + 459435d489f9SAlex Elder RBD_MAX_SNAP_COUNT * sizeof (__le64); 459535d489f9SAlex Elder reply_buf = kzalloc(size, GFP_KERNEL); 459635d489f9SAlex Elder if (!reply_buf) 459735d489f9SAlex Elder return -ENOMEM; 459835d489f9SAlex Elder 459936be9a76SAlex Elder ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, 46004157976bSAlex Elder "rbd", "get_snapcontext", NULL, 0, 4601e2a58ee5SAlex Elder reply_buf, size); 460236be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 460335d489f9SAlex Elder if (ret < 0) 460435d489f9SAlex Elder goto out; 460535d489f9SAlex Elder 460635d489f9SAlex Elder p = reply_buf; 460757385b51SAlex Elder end = reply_buf + ret; 460857385b51SAlex Elder ret = -ERANGE; 460935d489f9SAlex Elder ceph_decode_64_safe(&p, end, seq, out); 461035d489f9SAlex Elder ceph_decode_32_safe(&p, end, snap_count, out); 461135d489f9SAlex Elder 461235d489f9SAlex Elder /* 461335d489f9SAlex Elder * Make sure the reported number of snapshot ids wouldn't go 461435d489f9SAlex Elder * beyond the end of our buffer. But before checking that, 461535d489f9SAlex Elder * make sure the computed size of the snapshot context we 461635d489f9SAlex Elder * allocate is representable in a size_t. 461735d489f9SAlex Elder */ 461835d489f9SAlex Elder if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context)) 461935d489f9SAlex Elder / sizeof (u64)) { 462035d489f9SAlex Elder ret = -EINVAL; 462135d489f9SAlex Elder goto out; 462235d489f9SAlex Elder } 462335d489f9SAlex Elder if (!ceph_has_room(&p, end, snap_count * sizeof (__le64))) 462435d489f9SAlex Elder goto out; 4625468521c1SAlex Elder ret = 0; 462635d489f9SAlex Elder 4627812164f8SAlex Elder snapc = ceph_create_snap_context(snap_count, GFP_KERNEL); 462835d489f9SAlex Elder if (!snapc) { 462935d489f9SAlex Elder ret = -ENOMEM; 463035d489f9SAlex Elder goto out; 463135d489f9SAlex Elder } 463235d489f9SAlex Elder snapc->seq = seq; 463335d489f9SAlex Elder for (i = 0; i < snap_count; i++) 463435d489f9SAlex Elder snapc->snaps[i] = ceph_decode_64(&p); 463535d489f9SAlex Elder 463649ece554SAlex Elder ceph_put_snap_context(rbd_dev->header.snapc); 463735d489f9SAlex Elder rbd_dev->header.snapc = snapc; 463835d489f9SAlex Elder 463935d489f9SAlex Elder dout(" snap context seq = %llu, snap_count = %u\n", 464035d489f9SAlex Elder (unsigned long long)seq, (unsigned int)snap_count); 464135d489f9SAlex Elder out: 464235d489f9SAlex Elder kfree(reply_buf); 464335d489f9SAlex Elder 464457385b51SAlex Elder return ret; 464535d489f9SAlex Elder } 464635d489f9SAlex Elder 464754cac61fSAlex Elder static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, 464854cac61fSAlex Elder u64 snap_id) 4649b8b1e2dbSAlex Elder { 4650b8b1e2dbSAlex Elder size_t size; 4651b8b1e2dbSAlex Elder void *reply_buf; 465254cac61fSAlex Elder __le64 snapid; 4653b8b1e2dbSAlex Elder int ret; 4654b8b1e2dbSAlex Elder void *p; 4655b8b1e2dbSAlex Elder void *end; 4656b8b1e2dbSAlex Elder char *snap_name; 4657b8b1e2dbSAlex Elder 4658b8b1e2dbSAlex Elder size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN; 4659b8b1e2dbSAlex Elder reply_buf = kmalloc(size, GFP_KERNEL); 4660b8b1e2dbSAlex Elder if (!reply_buf) 4661b8b1e2dbSAlex Elder return ERR_PTR(-ENOMEM); 4662b8b1e2dbSAlex Elder 466354cac61fSAlex Elder snapid = cpu_to_le64(snap_id); 466436be9a76SAlex Elder ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, 4665b8b1e2dbSAlex Elder "rbd", "get_snapshot_name", 466654cac61fSAlex Elder &snapid, sizeof (snapid), 4667e2a58ee5SAlex Elder reply_buf, size); 466836be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 4669f40eb349SAlex Elder if (ret < 0) { 4670f40eb349SAlex Elder snap_name = ERR_PTR(ret); 4671b8b1e2dbSAlex Elder goto out; 4672f40eb349SAlex Elder } 4673b8b1e2dbSAlex Elder 4674b8b1e2dbSAlex Elder p = reply_buf; 4675f40eb349SAlex Elder end = reply_buf + ret; 4676e5c35534SAlex Elder snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL); 4677f40eb349SAlex Elder if (IS_ERR(snap_name)) 4678b8b1e2dbSAlex Elder goto out; 4679f40eb349SAlex Elder 4680b8b1e2dbSAlex Elder dout(" snap_id 0x%016llx snap_name = %s\n", 468154cac61fSAlex Elder (unsigned long long)snap_id, snap_name); 4682b8b1e2dbSAlex Elder out: 4683b8b1e2dbSAlex Elder kfree(reply_buf); 4684b8b1e2dbSAlex Elder 4685f40eb349SAlex Elder return snap_name; 4686b8b1e2dbSAlex Elder } 4687b8b1e2dbSAlex Elder 46882df3fac7SAlex Elder static int rbd_dev_v2_header_info(struct rbd_device *rbd_dev) 4689117973fbSAlex Elder { 46902df3fac7SAlex Elder bool first_time = rbd_dev->header.object_prefix == NULL; 4691117973fbSAlex Elder int ret; 4692117973fbSAlex Elder 46931617e40cSJosh Durgin ret = rbd_dev_v2_image_size(rbd_dev); 46941617e40cSJosh Durgin if (ret) 4695cfbf6377SAlex Elder return ret; 46961617e40cSJosh Durgin 46972df3fac7SAlex Elder if (first_time) { 46982df3fac7SAlex Elder ret = rbd_dev_v2_header_onetime(rbd_dev); 46992df3fac7SAlex Elder if (ret) 4700cfbf6377SAlex Elder return ret; 47012df3fac7SAlex Elder } 47022df3fac7SAlex Elder 4703cc4a38bdSAlex Elder ret = rbd_dev_v2_snap_context(rbd_dev); 4704117973fbSAlex Elder dout("rbd_dev_v2_snap_context returned %d\n", ret); 4705117973fbSAlex Elder 4706117973fbSAlex Elder return ret; 4707117973fbSAlex Elder } 4708117973fbSAlex Elder 4709a720ae09SIlya Dryomov static int rbd_dev_header_info(struct rbd_device *rbd_dev) 4710a720ae09SIlya Dryomov { 4711a720ae09SIlya Dryomov rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); 4712a720ae09SIlya Dryomov 4713a720ae09SIlya Dryomov if (rbd_dev->image_format == 1) 4714a720ae09SIlya Dryomov return rbd_dev_v1_header_info(rbd_dev); 4715a720ae09SIlya Dryomov 4716a720ae09SIlya Dryomov return rbd_dev_v2_header_info(rbd_dev); 4717a720ae09SIlya Dryomov } 4718a720ae09SIlya Dryomov 4719dfc5606dSYehuda Sadeh static int rbd_bus_add_dev(struct rbd_device *rbd_dev) 4720dfc5606dSYehuda Sadeh { 4721dfc5606dSYehuda Sadeh struct device *dev; 4722cd789ab9SAlex Elder int ret; 4723dfc5606dSYehuda Sadeh 4724cd789ab9SAlex Elder dev = &rbd_dev->dev; 4725dfc5606dSYehuda Sadeh dev->bus = &rbd_bus_type; 4726dfc5606dSYehuda Sadeh dev->type = &rbd_device_type; 4727dfc5606dSYehuda Sadeh dev->parent = &rbd_root_dev; 4728200a6a8bSAlex Elder dev->release = rbd_dev_device_release; 4729de71a297SAlex Elder dev_set_name(dev, "%d", rbd_dev->dev_id); 4730dfc5606dSYehuda Sadeh ret = device_register(dev); 4731dfc5606dSYehuda Sadeh 4732dfc5606dSYehuda Sadeh return ret; 4733602adf40SYehuda Sadeh } 4734602adf40SYehuda Sadeh 4735dfc5606dSYehuda Sadeh static void rbd_bus_del_dev(struct rbd_device *rbd_dev) 4736dfc5606dSYehuda Sadeh { 4737dfc5606dSYehuda Sadeh device_unregister(&rbd_dev->dev); 4738dfc5606dSYehuda Sadeh } 4739dfc5606dSYehuda Sadeh 47401ddbe94eSAlex Elder /* 4741499afd5bSAlex Elder * Get a unique rbd identifier for the given new rbd_dev, and add 4742f8a22fc2SIlya Dryomov * the rbd_dev to the global list. 47431ddbe94eSAlex Elder */ 4744f8a22fc2SIlya Dryomov static int rbd_dev_id_get(struct rbd_device *rbd_dev) 4745b7f23c36SAlex Elder { 4746f8a22fc2SIlya Dryomov int new_dev_id; 4747f8a22fc2SIlya Dryomov 47489b60e70bSIlya Dryomov new_dev_id = ida_simple_get(&rbd_dev_id_ida, 47499b60e70bSIlya Dryomov 0, minor_to_rbd_dev_id(1 << MINORBITS), 47509b60e70bSIlya Dryomov GFP_KERNEL); 4751f8a22fc2SIlya Dryomov if (new_dev_id < 0) 4752f8a22fc2SIlya Dryomov return new_dev_id; 4753f8a22fc2SIlya Dryomov 4754f8a22fc2SIlya Dryomov rbd_dev->dev_id = new_dev_id; 4755499afd5bSAlex Elder 4756499afd5bSAlex Elder spin_lock(&rbd_dev_list_lock); 4757499afd5bSAlex Elder list_add_tail(&rbd_dev->node, &rbd_dev_list); 4758499afd5bSAlex Elder spin_unlock(&rbd_dev_list_lock); 4759f8a22fc2SIlya Dryomov 476070eebd20SIlya Dryomov dout("rbd_dev %p given dev id %d\n", rbd_dev, rbd_dev->dev_id); 4761f8a22fc2SIlya Dryomov 4762f8a22fc2SIlya Dryomov return 0; 4763b7f23c36SAlex Elder } 4764b7f23c36SAlex Elder 47651ddbe94eSAlex Elder /* 4766499afd5bSAlex Elder * Remove an rbd_dev from the global list, and record that its 4767499afd5bSAlex Elder * identifier is no longer in use. 47681ddbe94eSAlex Elder */ 4769e2839308SAlex Elder static void rbd_dev_id_put(struct rbd_device *rbd_dev) 47701ddbe94eSAlex Elder { 4771499afd5bSAlex Elder spin_lock(&rbd_dev_list_lock); 4772499afd5bSAlex Elder list_del_init(&rbd_dev->node); 4773499afd5bSAlex Elder spin_unlock(&rbd_dev_list_lock); 47741ddbe94eSAlex Elder 4775f8a22fc2SIlya Dryomov ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id); 4776f8a22fc2SIlya Dryomov 4777f8a22fc2SIlya Dryomov dout("rbd_dev %p released dev id %d\n", rbd_dev, rbd_dev->dev_id); 4778b7f23c36SAlex Elder } 4779b7f23c36SAlex Elder 4780a725f65eSAlex Elder /* 4781e28fff26SAlex Elder * Skips over white space at *buf, and updates *buf to point to the 4782e28fff26SAlex Elder * first found non-space character (if any). Returns the length of 4783593a9e7bSAlex Elder * the token (string of non-white space characters) found. Note 4784593a9e7bSAlex Elder * that *buf must be terminated with '\0'. 4785e28fff26SAlex Elder */ 4786e28fff26SAlex Elder static inline size_t next_token(const char **buf) 4787e28fff26SAlex Elder { 4788e28fff26SAlex Elder /* 4789e28fff26SAlex Elder * These are the characters that produce nonzero for 4790e28fff26SAlex Elder * isspace() in the "C" and "POSIX" locales. 4791e28fff26SAlex Elder */ 4792e28fff26SAlex Elder const char *spaces = " \f\n\r\t\v"; 4793e28fff26SAlex Elder 4794e28fff26SAlex Elder *buf += strspn(*buf, spaces); /* Find start of token */ 4795e28fff26SAlex Elder 4796e28fff26SAlex Elder return strcspn(*buf, spaces); /* Return token length */ 4797e28fff26SAlex Elder } 4798e28fff26SAlex Elder 4799e28fff26SAlex Elder /* 4800ea3352f4SAlex Elder * Finds the next token in *buf, dynamically allocates a buffer big 4801ea3352f4SAlex Elder * enough to hold a copy of it, and copies the token into the new 4802ea3352f4SAlex Elder * buffer. The copy is guaranteed to be terminated with '\0'. Note 4803ea3352f4SAlex Elder * that a duplicate buffer is created even for a zero-length token. 4804ea3352f4SAlex Elder * 4805ea3352f4SAlex Elder * Returns a pointer to the newly-allocated duplicate, or a null 4806ea3352f4SAlex Elder * pointer if memory for the duplicate was not available. If 4807ea3352f4SAlex Elder * the lenp argument is a non-null pointer, the length of the token 4808ea3352f4SAlex Elder * (not including the '\0') is returned in *lenp. 4809ea3352f4SAlex Elder * 4810ea3352f4SAlex Elder * If successful, the *buf pointer will be updated to point beyond 4811ea3352f4SAlex Elder * the end of the found token. 4812ea3352f4SAlex Elder * 4813ea3352f4SAlex Elder * Note: uses GFP_KERNEL for allocation. 4814ea3352f4SAlex Elder */ 4815ea3352f4SAlex Elder static inline char *dup_token(const char **buf, size_t *lenp) 4816ea3352f4SAlex Elder { 4817ea3352f4SAlex Elder char *dup; 4818ea3352f4SAlex Elder size_t len; 4819ea3352f4SAlex Elder 4820ea3352f4SAlex Elder len = next_token(buf); 48214caf35f9SAlex Elder dup = kmemdup(*buf, len + 1, GFP_KERNEL); 4822ea3352f4SAlex Elder if (!dup) 4823ea3352f4SAlex Elder return NULL; 4824ea3352f4SAlex Elder *(dup + len) = '\0'; 4825ea3352f4SAlex Elder *buf += len; 4826ea3352f4SAlex Elder 4827ea3352f4SAlex Elder if (lenp) 4828ea3352f4SAlex Elder *lenp = len; 4829ea3352f4SAlex Elder 4830ea3352f4SAlex Elder return dup; 4831ea3352f4SAlex Elder } 4832ea3352f4SAlex Elder 4833ea3352f4SAlex Elder /* 4834859c31dfSAlex Elder * Parse the options provided for an "rbd add" (i.e., rbd image 4835859c31dfSAlex Elder * mapping) request. These arrive via a write to /sys/bus/rbd/add, 4836859c31dfSAlex Elder * and the data written is passed here via a NUL-terminated buffer. 4837859c31dfSAlex Elder * Returns 0 if successful or an error code otherwise. 4838d22f76e7SAlex Elder * 4839859c31dfSAlex Elder * The information extracted from these options is recorded in 4840859c31dfSAlex Elder * the other parameters which return dynamically-allocated 4841859c31dfSAlex Elder * structures: 4842859c31dfSAlex Elder * ceph_opts 4843859c31dfSAlex Elder * The address of a pointer that will refer to a ceph options 4844859c31dfSAlex Elder * structure. Caller must release the returned pointer using 4845859c31dfSAlex Elder * ceph_destroy_options() when it is no longer needed. 4846859c31dfSAlex Elder * rbd_opts 4847859c31dfSAlex Elder * Address of an rbd options pointer. Fully initialized by 4848859c31dfSAlex Elder * this function; caller must release with kfree(). 4849859c31dfSAlex Elder * spec 4850859c31dfSAlex Elder * Address of an rbd image specification pointer. Fully 4851859c31dfSAlex Elder * initialized by this function based on parsed options. 4852859c31dfSAlex Elder * Caller must release with rbd_spec_put(). 4853859c31dfSAlex Elder * 4854859c31dfSAlex Elder * The options passed take this form: 4855859c31dfSAlex Elder * <mon_addrs> <options> <pool_name> <image_name> [<snap_id>] 4856859c31dfSAlex Elder * where: 4857859c31dfSAlex Elder * <mon_addrs> 4858859c31dfSAlex Elder * A comma-separated list of one or more monitor addresses. 4859859c31dfSAlex Elder * A monitor address is an ip address, optionally followed 4860859c31dfSAlex Elder * by a port number (separated by a colon). 4861859c31dfSAlex Elder * I.e.: ip1[:port1][,ip2[:port2]...] 4862859c31dfSAlex Elder * <options> 4863859c31dfSAlex Elder * A comma-separated list of ceph and/or rbd options. 4864859c31dfSAlex Elder * <pool_name> 4865859c31dfSAlex Elder * The name of the rados pool containing the rbd image. 4866859c31dfSAlex Elder * <image_name> 4867859c31dfSAlex Elder * The name of the image in that pool to map. 4868859c31dfSAlex Elder * <snap_id> 4869859c31dfSAlex Elder * An optional snapshot id. If provided, the mapping will 4870859c31dfSAlex Elder * present data from the image at the time that snapshot was 4871859c31dfSAlex Elder * created. The image head is used if no snapshot id is 4872859c31dfSAlex Elder * provided. Snapshot mappings are always read-only. 4873a725f65eSAlex Elder */ 4874859c31dfSAlex Elder static int rbd_add_parse_args(const char *buf, 4875dc79b113SAlex Elder struct ceph_options **ceph_opts, 4876859c31dfSAlex Elder struct rbd_options **opts, 4877859c31dfSAlex Elder struct rbd_spec **rbd_spec) 4878a725f65eSAlex Elder { 4879e28fff26SAlex Elder size_t len; 4880859c31dfSAlex Elder char *options; 48810ddebc0cSAlex Elder const char *mon_addrs; 4882ecb4dc22SAlex Elder char *snap_name; 48830ddebc0cSAlex Elder size_t mon_addrs_size; 4884859c31dfSAlex Elder struct rbd_spec *spec = NULL; 48854e9afebaSAlex Elder struct rbd_options *rbd_opts = NULL; 4886859c31dfSAlex Elder struct ceph_options *copts; 4887dc79b113SAlex Elder int ret; 4888e28fff26SAlex Elder 4889e28fff26SAlex Elder /* The first four tokens are required */ 4890e28fff26SAlex Elder 48917ef3214aSAlex Elder len = next_token(&buf); 48924fb5d671SAlex Elder if (!len) { 48934fb5d671SAlex Elder rbd_warn(NULL, "no monitor address(es) provided"); 48944fb5d671SAlex Elder return -EINVAL; 48954fb5d671SAlex Elder } 48960ddebc0cSAlex Elder mon_addrs = buf; 4897f28e565aSAlex Elder mon_addrs_size = len + 1; 48987ef3214aSAlex Elder buf += len; 4899a725f65eSAlex Elder 4900dc79b113SAlex Elder ret = -EINVAL; 4901f28e565aSAlex Elder options = dup_token(&buf, NULL); 4902f28e565aSAlex Elder if (!options) 4903dc79b113SAlex Elder return -ENOMEM; 49044fb5d671SAlex Elder if (!*options) { 49054fb5d671SAlex Elder rbd_warn(NULL, "no options provided"); 49064fb5d671SAlex Elder goto out_err; 49074fb5d671SAlex Elder } 4908a725f65eSAlex Elder 4909859c31dfSAlex Elder spec = rbd_spec_alloc(); 4910859c31dfSAlex Elder if (!spec) 4911f28e565aSAlex Elder goto out_mem; 4912859c31dfSAlex Elder 4913859c31dfSAlex Elder spec->pool_name = dup_token(&buf, NULL); 4914859c31dfSAlex Elder if (!spec->pool_name) 4915859c31dfSAlex Elder goto out_mem; 49164fb5d671SAlex Elder if (!*spec->pool_name) { 49174fb5d671SAlex Elder rbd_warn(NULL, "no pool name provided"); 49184fb5d671SAlex Elder goto out_err; 49194fb5d671SAlex Elder } 4920e28fff26SAlex Elder 492169e7a02fSAlex Elder spec->image_name = dup_token(&buf, NULL); 4922859c31dfSAlex Elder if (!spec->image_name) 4923f28e565aSAlex Elder goto out_mem; 49244fb5d671SAlex Elder if (!*spec->image_name) { 49254fb5d671SAlex Elder rbd_warn(NULL, "no image name provided"); 49264fb5d671SAlex Elder goto out_err; 49274fb5d671SAlex Elder } 4928e28fff26SAlex Elder 4929f28e565aSAlex Elder /* 4930f28e565aSAlex Elder * Snapshot name is optional; default is to use "-" 4931f28e565aSAlex Elder * (indicating the head/no snapshot). 4932f28e565aSAlex Elder */ 49333feeb894SAlex Elder len = next_token(&buf); 4934820a5f3eSAlex Elder if (!len) { 49353feeb894SAlex Elder buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */ 49363feeb894SAlex Elder len = sizeof (RBD_SNAP_HEAD_NAME) - 1; 4937f28e565aSAlex Elder } else if (len > RBD_MAX_SNAP_NAME_LEN) { 4938dc79b113SAlex Elder ret = -ENAMETOOLONG; 4939f28e565aSAlex Elder goto out_err; 4940849b4260SAlex Elder } 4941ecb4dc22SAlex Elder snap_name = kmemdup(buf, len + 1, GFP_KERNEL); 4942ecb4dc22SAlex Elder if (!snap_name) 4943f28e565aSAlex Elder goto out_mem; 4944ecb4dc22SAlex Elder *(snap_name + len) = '\0'; 4945ecb4dc22SAlex Elder spec->snap_name = snap_name; 4946e5c35534SAlex Elder 49470ddebc0cSAlex Elder /* Initialize all rbd options to the defaults */ 4948e28fff26SAlex Elder 49494e9afebaSAlex Elder rbd_opts = kzalloc(sizeof (*rbd_opts), GFP_KERNEL); 49504e9afebaSAlex Elder if (!rbd_opts) 49514e9afebaSAlex Elder goto out_mem; 49524e9afebaSAlex Elder 49534e9afebaSAlex Elder rbd_opts->read_only = RBD_READ_ONLY_DEFAULT; 4954d22f76e7SAlex Elder 4955859c31dfSAlex Elder copts = ceph_parse_options(options, mon_addrs, 49560ddebc0cSAlex Elder mon_addrs + mon_addrs_size - 1, 49574e9afebaSAlex Elder parse_rbd_opts_token, rbd_opts); 4958859c31dfSAlex Elder if (IS_ERR(copts)) { 4959859c31dfSAlex Elder ret = PTR_ERR(copts); 4960dc79b113SAlex Elder goto out_err; 4961dc79b113SAlex Elder } 4962859c31dfSAlex Elder kfree(options); 4963859c31dfSAlex Elder 4964859c31dfSAlex Elder *ceph_opts = copts; 49654e9afebaSAlex Elder *opts = rbd_opts; 4966859c31dfSAlex Elder *rbd_spec = spec; 49670ddebc0cSAlex Elder 4968dc79b113SAlex Elder return 0; 4969f28e565aSAlex Elder out_mem: 4970dc79b113SAlex Elder ret = -ENOMEM; 4971d22f76e7SAlex Elder out_err: 4972859c31dfSAlex Elder kfree(rbd_opts); 4973859c31dfSAlex Elder rbd_spec_put(spec); 4974f28e565aSAlex Elder kfree(options); 4975d22f76e7SAlex Elder 4976dc79b113SAlex Elder return ret; 4977a725f65eSAlex Elder } 4978a725f65eSAlex Elder 4979589d30e0SAlex Elder /* 498030ba1f02SIlya Dryomov * Return pool id (>= 0) or a negative error code. 498130ba1f02SIlya Dryomov */ 498230ba1f02SIlya Dryomov static int rbd_add_get_pool_id(struct rbd_client *rbdc, const char *pool_name) 498330ba1f02SIlya Dryomov { 4984a319bf56SIlya Dryomov struct ceph_options *opts = rbdc->client->options; 498530ba1f02SIlya Dryomov u64 newest_epoch; 498630ba1f02SIlya Dryomov int tries = 0; 498730ba1f02SIlya Dryomov int ret; 498830ba1f02SIlya Dryomov 498930ba1f02SIlya Dryomov again: 499030ba1f02SIlya Dryomov ret = ceph_pg_poolid_by_name(rbdc->client->osdc.osdmap, pool_name); 499130ba1f02SIlya Dryomov if (ret == -ENOENT && tries++ < 1) { 499230ba1f02SIlya Dryomov ret = ceph_monc_do_get_version(&rbdc->client->monc, "osdmap", 499330ba1f02SIlya Dryomov &newest_epoch); 499430ba1f02SIlya Dryomov if (ret < 0) 499530ba1f02SIlya Dryomov return ret; 499630ba1f02SIlya Dryomov 499730ba1f02SIlya Dryomov if (rbdc->client->osdc.osdmap->epoch < newest_epoch) { 499830ba1f02SIlya Dryomov ceph_monc_request_next_osdmap(&rbdc->client->monc); 499930ba1f02SIlya Dryomov (void) ceph_monc_wait_osdmap(&rbdc->client->monc, 5000a319bf56SIlya Dryomov newest_epoch, 5001a319bf56SIlya Dryomov opts->mount_timeout); 500230ba1f02SIlya Dryomov goto again; 500330ba1f02SIlya Dryomov } else { 500430ba1f02SIlya Dryomov /* the osdmap we have is new enough */ 500530ba1f02SIlya Dryomov return -ENOENT; 500630ba1f02SIlya Dryomov } 500730ba1f02SIlya Dryomov } 500830ba1f02SIlya Dryomov 500930ba1f02SIlya Dryomov return ret; 501030ba1f02SIlya Dryomov } 501130ba1f02SIlya Dryomov 501230ba1f02SIlya Dryomov /* 5013589d30e0SAlex Elder * An rbd format 2 image has a unique identifier, distinct from the 5014589d30e0SAlex Elder * name given to it by the user. Internally, that identifier is 5015589d30e0SAlex Elder * what's used to specify the names of objects related to the image. 5016589d30e0SAlex Elder * 5017589d30e0SAlex Elder * A special "rbd id" object is used to map an rbd image name to its 5018589d30e0SAlex Elder * id. If that object doesn't exist, then there is no v2 rbd image 5019589d30e0SAlex Elder * with the supplied name. 5020589d30e0SAlex Elder * 5021589d30e0SAlex Elder * This function will record the given rbd_dev's image_id field if 5022589d30e0SAlex Elder * it can be determined, and in that case will return 0. If any 5023589d30e0SAlex Elder * errors occur a negative errno will be returned and the rbd_dev's 5024589d30e0SAlex Elder * image_id field will be unchanged (and should be NULL). 5025589d30e0SAlex Elder */ 5026589d30e0SAlex Elder static int rbd_dev_image_id(struct rbd_device *rbd_dev) 5027589d30e0SAlex Elder { 5028589d30e0SAlex Elder int ret; 5029589d30e0SAlex Elder size_t size; 5030589d30e0SAlex Elder char *object_name; 5031589d30e0SAlex Elder void *response; 5032c0fba368SAlex Elder char *image_id; 50332f82ee54SAlex Elder 5034589d30e0SAlex Elder /* 50352c0d0a10SAlex Elder * When probing a parent image, the image id is already 50362c0d0a10SAlex Elder * known (and the image name likely is not). There's no 5037c0fba368SAlex Elder * need to fetch the image id again in this case. We 5038c0fba368SAlex Elder * do still need to set the image format though. 50392c0d0a10SAlex Elder */ 5040c0fba368SAlex Elder if (rbd_dev->spec->image_id) { 5041c0fba368SAlex Elder rbd_dev->image_format = *rbd_dev->spec->image_id ? 2 : 1; 5042c0fba368SAlex Elder 50432c0d0a10SAlex Elder return 0; 5044c0fba368SAlex Elder } 50452c0d0a10SAlex Elder 50462c0d0a10SAlex Elder /* 5047589d30e0SAlex Elder * First, see if the format 2 image id file exists, and if 5048589d30e0SAlex Elder * so, get the image's persistent id from it. 5049589d30e0SAlex Elder */ 505069e7a02fSAlex Elder size = sizeof (RBD_ID_PREFIX) + strlen(rbd_dev->spec->image_name); 5051589d30e0SAlex Elder object_name = kmalloc(size, GFP_NOIO); 5052589d30e0SAlex Elder if (!object_name) 5053589d30e0SAlex Elder return -ENOMEM; 50540d7dbfceSAlex Elder sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->spec->image_name); 5055589d30e0SAlex Elder dout("rbd id object name is %s\n", object_name); 5056589d30e0SAlex Elder 5057589d30e0SAlex Elder /* Response will be an encoded string, which includes a length */ 5058589d30e0SAlex Elder 5059589d30e0SAlex Elder size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX; 5060589d30e0SAlex Elder response = kzalloc(size, GFP_NOIO); 5061589d30e0SAlex Elder if (!response) { 5062589d30e0SAlex Elder ret = -ENOMEM; 5063589d30e0SAlex Elder goto out; 5064589d30e0SAlex Elder } 5065589d30e0SAlex Elder 5066c0fba368SAlex Elder /* If it doesn't exist we'll assume it's a format 1 image */ 5067c0fba368SAlex Elder 506836be9a76SAlex Elder ret = rbd_obj_method_sync(rbd_dev, object_name, 50694157976bSAlex Elder "rbd", "get_id", NULL, 0, 5070e2a58ee5SAlex Elder response, RBD_IMAGE_ID_LEN_MAX); 507136be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 5072c0fba368SAlex Elder if (ret == -ENOENT) { 5073c0fba368SAlex Elder image_id = kstrdup("", GFP_KERNEL); 5074c0fba368SAlex Elder ret = image_id ? 0 : -ENOMEM; 5075c0fba368SAlex Elder if (!ret) 5076c0fba368SAlex Elder rbd_dev->image_format = 1; 50777dd440c9SIlya Dryomov } else if (ret >= 0) { 5078c0fba368SAlex Elder void *p = response; 5079589d30e0SAlex Elder 5080c0fba368SAlex Elder image_id = ceph_extract_encoded_string(&p, p + ret, 5081979ed480SAlex Elder NULL, GFP_NOIO); 5082461f758aSDuan Jiong ret = PTR_ERR_OR_ZERO(image_id); 5083c0fba368SAlex Elder if (!ret) 5084c0fba368SAlex Elder rbd_dev->image_format = 2; 5085c0fba368SAlex Elder } 5086c0fba368SAlex Elder 5087c0fba368SAlex Elder if (!ret) { 5088c0fba368SAlex Elder rbd_dev->spec->image_id = image_id; 5089c0fba368SAlex Elder dout("image_id is %s\n", image_id); 5090589d30e0SAlex Elder } 5091589d30e0SAlex Elder out: 5092589d30e0SAlex Elder kfree(response); 5093589d30e0SAlex Elder kfree(object_name); 5094589d30e0SAlex Elder 5095589d30e0SAlex Elder return ret; 5096589d30e0SAlex Elder } 5097589d30e0SAlex Elder 50983abef3b3SAlex Elder /* 50993abef3b3SAlex Elder * Undo whatever state changes are made by v1 or v2 header info 51003abef3b3SAlex Elder * call. 51013abef3b3SAlex Elder */ 51026fd48b3bSAlex Elder static void rbd_dev_unprobe(struct rbd_device *rbd_dev) 51036fd48b3bSAlex Elder { 51046fd48b3bSAlex Elder struct rbd_image_header *header; 51056fd48b3bSAlex Elder 5106a2acd00eSAlex Elder rbd_dev_parent_put(rbd_dev); 51076fd48b3bSAlex Elder 51086fd48b3bSAlex Elder /* Free dynamic fields from the header, then zero it out */ 51096fd48b3bSAlex Elder 51106fd48b3bSAlex Elder header = &rbd_dev->header; 5111812164f8SAlex Elder ceph_put_snap_context(header->snapc); 51126fd48b3bSAlex Elder kfree(header->snap_sizes); 51136fd48b3bSAlex Elder kfree(header->snap_names); 51146fd48b3bSAlex Elder kfree(header->object_prefix); 51156fd48b3bSAlex Elder memset(header, 0, sizeof (*header)); 51166fd48b3bSAlex Elder } 51176fd48b3bSAlex Elder 51182df3fac7SAlex Elder static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev) 5119a30b71b9SAlex Elder { 5120a30b71b9SAlex Elder int ret; 5121a30b71b9SAlex Elder 51221e130199SAlex Elder ret = rbd_dev_v2_object_prefix(rbd_dev); 512357385b51SAlex Elder if (ret) 51241e130199SAlex Elder goto out_err; 5125b1b5402aSAlex Elder 51262df3fac7SAlex Elder /* 51272df3fac7SAlex Elder * Get the and check features for the image. Currently the 51282df3fac7SAlex Elder * features are assumed to never change. 51292df3fac7SAlex Elder */ 5130b1b5402aSAlex Elder ret = rbd_dev_v2_features(rbd_dev); 513157385b51SAlex Elder if (ret) 5132b1b5402aSAlex Elder goto out_err; 513335d489f9SAlex Elder 5134cc070d59SAlex Elder /* If the image supports fancy striping, get its parameters */ 5135cc070d59SAlex Elder 5136cc070d59SAlex Elder if (rbd_dev->header.features & RBD_FEATURE_STRIPINGV2) { 5137cc070d59SAlex Elder ret = rbd_dev_v2_striping_info(rbd_dev); 5138cc070d59SAlex Elder if (ret < 0) 5139cc070d59SAlex Elder goto out_err; 5140cc070d59SAlex Elder } 51412df3fac7SAlex Elder /* No support for crypto and compression type format 2 images */ 5142a30b71b9SAlex Elder 514335152979SAlex Elder return 0; 51449d475de5SAlex Elder out_err: 5145642a2537SAlex Elder rbd_dev->header.features = 0; 51461e130199SAlex Elder kfree(rbd_dev->header.object_prefix); 51471e130199SAlex Elder rbd_dev->header.object_prefix = NULL; 51489d475de5SAlex Elder 51499d475de5SAlex Elder return ret; 5150a30b71b9SAlex Elder } 5151a30b71b9SAlex Elder 5152124afba2SAlex Elder static int rbd_dev_probe_parent(struct rbd_device *rbd_dev) 515383a06263SAlex Elder { 51542f82ee54SAlex Elder struct rbd_device *parent = NULL; 5155124afba2SAlex Elder struct rbd_spec *parent_spec; 5156124afba2SAlex Elder struct rbd_client *rbdc; 5157124afba2SAlex Elder int ret; 5158124afba2SAlex Elder 5159124afba2SAlex Elder if (!rbd_dev->parent_spec) 5160124afba2SAlex Elder return 0; 5161124afba2SAlex Elder /* 5162124afba2SAlex Elder * We need to pass a reference to the client and the parent 5163124afba2SAlex Elder * spec when creating the parent rbd_dev. Images related by 5164124afba2SAlex Elder * parent/child relationships always share both. 5165124afba2SAlex Elder */ 5166124afba2SAlex Elder parent_spec = rbd_spec_get(rbd_dev->parent_spec); 5167124afba2SAlex Elder rbdc = __rbd_get_client(rbd_dev->rbd_client); 5168124afba2SAlex Elder 5169124afba2SAlex Elder ret = -ENOMEM; 5170124afba2SAlex Elder parent = rbd_dev_create(rbdc, parent_spec); 5171124afba2SAlex Elder if (!parent) 5172124afba2SAlex Elder goto out_err; 5173124afba2SAlex Elder 51741f3ef788SAlex Elder ret = rbd_dev_image_probe(parent, false); 5175124afba2SAlex Elder if (ret < 0) 5176124afba2SAlex Elder goto out_err; 5177124afba2SAlex Elder rbd_dev->parent = parent; 5178a2acd00eSAlex Elder atomic_set(&rbd_dev->parent_ref, 1); 5179124afba2SAlex Elder 5180124afba2SAlex Elder return 0; 5181124afba2SAlex Elder out_err: 5182124afba2SAlex Elder if (parent) { 5183fb65d228SAlex Elder rbd_dev_unparent(rbd_dev); 5184124afba2SAlex Elder kfree(rbd_dev->header_name); 5185124afba2SAlex Elder rbd_dev_destroy(parent); 5186124afba2SAlex Elder } else { 5187124afba2SAlex Elder rbd_put_client(rbdc); 5188124afba2SAlex Elder rbd_spec_put(parent_spec); 5189124afba2SAlex Elder } 5190124afba2SAlex Elder 5191124afba2SAlex Elder return ret; 5192124afba2SAlex Elder } 5193124afba2SAlex Elder 5194200a6a8bSAlex Elder static int rbd_dev_device_setup(struct rbd_device *rbd_dev) 5195124afba2SAlex Elder { 519683a06263SAlex Elder int ret; 519783a06263SAlex Elder 5198f8a22fc2SIlya Dryomov /* Get an id and fill in device name. */ 519983a06263SAlex Elder 5200f8a22fc2SIlya Dryomov ret = rbd_dev_id_get(rbd_dev); 5201f8a22fc2SIlya Dryomov if (ret) 5202f8a22fc2SIlya Dryomov return ret; 5203f8a22fc2SIlya Dryomov 520483a06263SAlex Elder BUILD_BUG_ON(DEV_NAME_LEN 520583a06263SAlex Elder < sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH); 520683a06263SAlex Elder sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id); 520783a06263SAlex Elder 52089b60e70bSIlya Dryomov /* Record our major and minor device numbers. */ 520983a06263SAlex Elder 52109b60e70bSIlya Dryomov if (!single_major) { 521183a06263SAlex Elder ret = register_blkdev(0, rbd_dev->name); 521283a06263SAlex Elder if (ret < 0) 521383a06263SAlex Elder goto err_out_id; 52149b60e70bSIlya Dryomov 521583a06263SAlex Elder rbd_dev->major = ret; 5216dd82fff1SIlya Dryomov rbd_dev->minor = 0; 52179b60e70bSIlya Dryomov } else { 52189b60e70bSIlya Dryomov rbd_dev->major = rbd_major; 52199b60e70bSIlya Dryomov rbd_dev->minor = rbd_dev_id_to_minor(rbd_dev->dev_id); 52209b60e70bSIlya Dryomov } 522183a06263SAlex Elder 522283a06263SAlex Elder /* Set up the blkdev mapping. */ 522383a06263SAlex Elder 522483a06263SAlex Elder ret = rbd_init_disk(rbd_dev); 522583a06263SAlex Elder if (ret) 522683a06263SAlex Elder goto err_out_blkdev; 522783a06263SAlex Elder 5228f35a4deeSAlex Elder ret = rbd_dev_mapping_set(rbd_dev); 522983a06263SAlex Elder if (ret) 523083a06263SAlex Elder goto err_out_disk; 5231bc1ecc65SIlya Dryomov 5232f35a4deeSAlex Elder set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE); 523322001f61SJosh Durgin set_disk_ro(rbd_dev->disk, rbd_dev->mapping.read_only); 5234f35a4deeSAlex Elder 5235f35a4deeSAlex Elder ret = rbd_bus_add_dev(rbd_dev); 5236f35a4deeSAlex Elder if (ret) 5237f5ee37bdSIlya Dryomov goto err_out_mapping; 523883a06263SAlex Elder 523983a06263SAlex Elder /* Everything's ready. Announce the disk to the world. */ 524083a06263SAlex Elder 5241129b79d4SAlex Elder set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags); 524283a06263SAlex Elder add_disk(rbd_dev->disk); 524383a06263SAlex Elder 524483a06263SAlex Elder pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name, 524583a06263SAlex Elder (unsigned long long) rbd_dev->mapping.size); 524683a06263SAlex Elder 524783a06263SAlex Elder return ret; 52482f82ee54SAlex Elder 5249f35a4deeSAlex Elder err_out_mapping: 5250f35a4deeSAlex Elder rbd_dev_mapping_clear(rbd_dev); 525183a06263SAlex Elder err_out_disk: 525283a06263SAlex Elder rbd_free_disk(rbd_dev); 525383a06263SAlex Elder err_out_blkdev: 52549b60e70bSIlya Dryomov if (!single_major) 525583a06263SAlex Elder unregister_blkdev(rbd_dev->major, rbd_dev->name); 525683a06263SAlex Elder err_out_id: 525783a06263SAlex Elder rbd_dev_id_put(rbd_dev); 5258d1cf5788SAlex Elder rbd_dev_mapping_clear(rbd_dev); 525983a06263SAlex Elder 526083a06263SAlex Elder return ret; 526183a06263SAlex Elder } 526283a06263SAlex Elder 5263332bb12dSAlex Elder static int rbd_dev_header_name(struct rbd_device *rbd_dev) 5264332bb12dSAlex Elder { 5265332bb12dSAlex Elder struct rbd_spec *spec = rbd_dev->spec; 5266332bb12dSAlex Elder size_t size; 5267332bb12dSAlex Elder 5268332bb12dSAlex Elder /* Record the header object name for this rbd image. */ 5269332bb12dSAlex Elder 5270332bb12dSAlex Elder rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); 5271332bb12dSAlex Elder 5272332bb12dSAlex Elder if (rbd_dev->image_format == 1) 5273332bb12dSAlex Elder size = strlen(spec->image_name) + sizeof (RBD_SUFFIX); 5274332bb12dSAlex Elder else 5275332bb12dSAlex Elder size = sizeof (RBD_HEADER_PREFIX) + strlen(spec->image_id); 5276332bb12dSAlex Elder 5277332bb12dSAlex Elder rbd_dev->header_name = kmalloc(size, GFP_KERNEL); 5278332bb12dSAlex Elder if (!rbd_dev->header_name) 5279332bb12dSAlex Elder return -ENOMEM; 5280332bb12dSAlex Elder 5281332bb12dSAlex Elder if (rbd_dev->image_format == 1) 5282332bb12dSAlex Elder sprintf(rbd_dev->header_name, "%s%s", 5283332bb12dSAlex Elder spec->image_name, RBD_SUFFIX); 5284332bb12dSAlex Elder else 5285332bb12dSAlex Elder sprintf(rbd_dev->header_name, "%s%s", 5286332bb12dSAlex Elder RBD_HEADER_PREFIX, spec->image_id); 5287332bb12dSAlex Elder return 0; 5288332bb12dSAlex Elder } 5289332bb12dSAlex Elder 5290200a6a8bSAlex Elder static void rbd_dev_image_release(struct rbd_device *rbd_dev) 5291200a6a8bSAlex Elder { 52926fd48b3bSAlex Elder rbd_dev_unprobe(rbd_dev); 5293200a6a8bSAlex Elder kfree(rbd_dev->header_name); 52946fd48b3bSAlex Elder rbd_dev->header_name = NULL; 52956fd48b3bSAlex Elder rbd_dev->image_format = 0; 52966fd48b3bSAlex Elder kfree(rbd_dev->spec->image_id); 52976fd48b3bSAlex Elder rbd_dev->spec->image_id = NULL; 52986fd48b3bSAlex Elder 5299200a6a8bSAlex Elder rbd_dev_destroy(rbd_dev); 5300200a6a8bSAlex Elder } 5301200a6a8bSAlex Elder 5302a30b71b9SAlex Elder /* 5303a30b71b9SAlex Elder * Probe for the existence of the header object for the given rbd 53041f3ef788SAlex Elder * device. If this image is the one being mapped (i.e., not a 53051f3ef788SAlex Elder * parent), initiate a watch on its header object before using that 53061f3ef788SAlex Elder * object to get detailed information about the rbd image. 5307a30b71b9SAlex Elder */ 53081f3ef788SAlex Elder static int rbd_dev_image_probe(struct rbd_device *rbd_dev, bool mapping) 5309a30b71b9SAlex Elder { 5310a30b71b9SAlex Elder int ret; 5311a30b71b9SAlex Elder 5312a30b71b9SAlex Elder /* 53133abef3b3SAlex Elder * Get the id from the image id object. Unless there's an 53143abef3b3SAlex Elder * error, rbd_dev->spec->image_id will be filled in with 53153abef3b3SAlex Elder * a dynamically-allocated string, and rbd_dev->image_format 53163abef3b3SAlex Elder * will be set to either 1 or 2. 5317a30b71b9SAlex Elder */ 5318a30b71b9SAlex Elder ret = rbd_dev_image_id(rbd_dev); 5319a30b71b9SAlex Elder if (ret) 5320c0fba368SAlex Elder return ret; 5321c0fba368SAlex Elder 5322332bb12dSAlex Elder ret = rbd_dev_header_name(rbd_dev); 5323332bb12dSAlex Elder if (ret) 5324332bb12dSAlex Elder goto err_out_format; 5325332bb12dSAlex Elder 53261f3ef788SAlex Elder if (mapping) { 5327fca27065SIlya Dryomov ret = rbd_dev_header_watch_sync(rbd_dev); 53281fe48023SIlya Dryomov if (ret) { 53291fe48023SIlya Dryomov if (ret == -ENOENT) 53301fe48023SIlya Dryomov pr_info("image %s/%s does not exist\n", 53311fe48023SIlya Dryomov rbd_dev->spec->pool_name, 53321fe48023SIlya Dryomov rbd_dev->spec->image_name); 5333b644de2bSAlex Elder goto out_header_name; 53341f3ef788SAlex Elder } 53351fe48023SIlya Dryomov } 5336b644de2bSAlex Elder 5337a720ae09SIlya Dryomov ret = rbd_dev_header_info(rbd_dev); 53385655c4d9SAlex Elder if (ret) 5339b644de2bSAlex Elder goto err_out_watch; 5340a30b71b9SAlex Elder 534104077599SIlya Dryomov /* 534204077599SIlya Dryomov * If this image is the one being mapped, we have pool name and 534304077599SIlya Dryomov * id, image name and id, and snap name - need to fill snap id. 534404077599SIlya Dryomov * Otherwise this is a parent image, identified by pool, image 534504077599SIlya Dryomov * and snap ids - need to fill in names for those ids. 534604077599SIlya Dryomov */ 534704077599SIlya Dryomov if (mapping) 534804077599SIlya Dryomov ret = rbd_spec_fill_snap_id(rbd_dev); 534904077599SIlya Dryomov else 535004077599SIlya Dryomov ret = rbd_spec_fill_names(rbd_dev); 53511fe48023SIlya Dryomov if (ret) { 53521fe48023SIlya Dryomov if (ret == -ENOENT) 53531fe48023SIlya Dryomov pr_info("snap %s/%s@%s does not exist\n", 53541fe48023SIlya Dryomov rbd_dev->spec->pool_name, 53551fe48023SIlya Dryomov rbd_dev->spec->image_name, 53561fe48023SIlya Dryomov rbd_dev->spec->snap_name); 535733dca39fSAlex Elder goto err_out_probe; 53581fe48023SIlya Dryomov } 53599bb81c9bSAlex Elder 5360e8f59b59SIlya Dryomov if (rbd_dev->header.features & RBD_FEATURE_LAYERING) { 5361e8f59b59SIlya Dryomov ret = rbd_dev_v2_parent_info(rbd_dev); 5362e8f59b59SIlya Dryomov if (ret) 5363e8f59b59SIlya Dryomov goto err_out_probe; 5364e8f59b59SIlya Dryomov 5365e8f59b59SIlya Dryomov /* 5366e8f59b59SIlya Dryomov * Need to warn users if this image is the one being 5367e8f59b59SIlya Dryomov * mapped and has a parent. 5368e8f59b59SIlya Dryomov */ 5369e8f59b59SIlya Dryomov if (mapping && rbd_dev->parent_spec) 5370e8f59b59SIlya Dryomov rbd_warn(rbd_dev, 5371e8f59b59SIlya Dryomov "WARNING: kernel layering is EXPERIMENTAL!"); 5372e8f59b59SIlya Dryomov } 5373e8f59b59SIlya Dryomov 53749bb81c9bSAlex Elder ret = rbd_dev_probe_parent(rbd_dev); 537530d60ba2SAlex Elder if (ret) 537630d60ba2SAlex Elder goto err_out_probe; 537783a06263SAlex Elder 537830d60ba2SAlex Elder dout("discovered format %u image, header name is %s\n", 537930d60ba2SAlex Elder rbd_dev->image_format, rbd_dev->header_name); 538030d60ba2SAlex Elder return 0; 5381e8f59b59SIlya Dryomov 53826fd48b3bSAlex Elder err_out_probe: 53836fd48b3bSAlex Elder rbd_dev_unprobe(rbd_dev); 5384b644de2bSAlex Elder err_out_watch: 5385fca27065SIlya Dryomov if (mapping) 5386fca27065SIlya Dryomov rbd_dev_header_unwatch_sync(rbd_dev); 5387332bb12dSAlex Elder out_header_name: 5388332bb12dSAlex Elder kfree(rbd_dev->header_name); 5389332bb12dSAlex Elder rbd_dev->header_name = NULL; 5390332bb12dSAlex Elder err_out_format: 5391332bb12dSAlex Elder rbd_dev->image_format = 0; 53925655c4d9SAlex Elder kfree(rbd_dev->spec->image_id); 53935655c4d9SAlex Elder rbd_dev->spec->image_id = NULL; 53945655c4d9SAlex Elder return ret; 539583a06263SAlex Elder } 539683a06263SAlex Elder 53979b60e70bSIlya Dryomov static ssize_t do_rbd_add(struct bus_type *bus, 539859c2be1eSYehuda Sadeh const char *buf, 539959c2be1eSYehuda Sadeh size_t count) 5400602adf40SYehuda Sadeh { 5401cb8627c7SAlex Elder struct rbd_device *rbd_dev = NULL; 5402dc79b113SAlex Elder struct ceph_options *ceph_opts = NULL; 54034e9afebaSAlex Elder struct rbd_options *rbd_opts = NULL; 5404859c31dfSAlex Elder struct rbd_spec *spec = NULL; 54059d3997fdSAlex Elder struct rbd_client *rbdc; 540651344a38SAlex Elder bool read_only; 540727cc2594SAlex Elder int rc = -ENOMEM; 5408602adf40SYehuda Sadeh 5409602adf40SYehuda Sadeh if (!try_module_get(THIS_MODULE)) 5410602adf40SYehuda Sadeh return -ENODEV; 5411602adf40SYehuda Sadeh 5412a725f65eSAlex Elder /* parse add command */ 5413859c31dfSAlex Elder rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec); 5414dc79b113SAlex Elder if (rc < 0) 5415bd4ba655SAlex Elder goto err_out_module; 541651344a38SAlex Elder read_only = rbd_opts->read_only; 541751344a38SAlex Elder kfree(rbd_opts); 541851344a38SAlex Elder rbd_opts = NULL; /* done with this */ 5419a725f65eSAlex Elder 54209d3997fdSAlex Elder rbdc = rbd_get_client(ceph_opts); 54219d3997fdSAlex Elder if (IS_ERR(rbdc)) { 54229d3997fdSAlex Elder rc = PTR_ERR(rbdc); 54230ddebc0cSAlex Elder goto err_out_args; 54249d3997fdSAlex Elder } 5425602adf40SYehuda Sadeh 5426602adf40SYehuda Sadeh /* pick the pool */ 542730ba1f02SIlya Dryomov rc = rbd_add_get_pool_id(rbdc, spec->pool_name); 54281fe48023SIlya Dryomov if (rc < 0) { 54291fe48023SIlya Dryomov if (rc == -ENOENT) 54301fe48023SIlya Dryomov pr_info("pool %s does not exist\n", spec->pool_name); 5431602adf40SYehuda Sadeh goto err_out_client; 54321fe48023SIlya Dryomov } 5433859c31dfSAlex Elder spec->pool_id = (u64)rc; 5434859c31dfSAlex Elder 54350903e875SAlex Elder /* The ceph file layout needs to fit pool id in 32 bits */ 54360903e875SAlex Elder 5437c0cd10dbSAlex Elder if (spec->pool_id > (u64)U32_MAX) { 54389584d508SIlya Dryomov rbd_warn(NULL, "pool id too large (%llu > %u)", 5439c0cd10dbSAlex Elder (unsigned long long)spec->pool_id, U32_MAX); 54400903e875SAlex Elder rc = -EIO; 54410903e875SAlex Elder goto err_out_client; 54420903e875SAlex Elder } 54430903e875SAlex Elder 5444c53d5893SAlex Elder rbd_dev = rbd_dev_create(rbdc, spec); 5445bd4ba655SAlex Elder if (!rbd_dev) 5446bd4ba655SAlex Elder goto err_out_client; 5447c53d5893SAlex Elder rbdc = NULL; /* rbd_dev now owns this */ 5448c53d5893SAlex Elder spec = NULL; /* rbd_dev now owns this */ 5449602adf40SYehuda Sadeh 54501f3ef788SAlex Elder rc = rbd_dev_image_probe(rbd_dev, true); 5451a30b71b9SAlex Elder if (rc < 0) 5452c53d5893SAlex Elder goto err_out_rbd_dev; 545305fd6f6fSAlex Elder 54547ce4eef7SAlex Elder /* If we are mapping a snapshot it must be marked read-only */ 54557ce4eef7SAlex Elder 54567ce4eef7SAlex Elder if (rbd_dev->spec->snap_id != CEPH_NOSNAP) 54577ce4eef7SAlex Elder read_only = true; 54587ce4eef7SAlex Elder rbd_dev->mapping.read_only = read_only; 54597ce4eef7SAlex Elder 5460b536f69aSAlex Elder rc = rbd_dev_device_setup(rbd_dev); 54613abef3b3SAlex Elder if (rc) { 5462e37180c0SIlya Dryomov /* 5463e37180c0SIlya Dryomov * rbd_dev_header_unwatch_sync() can't be moved into 5464e37180c0SIlya Dryomov * rbd_dev_image_release() without refactoring, see 5465e37180c0SIlya Dryomov * commit 1f3ef78861ac. 5466e37180c0SIlya Dryomov */ 5467e37180c0SIlya Dryomov rbd_dev_header_unwatch_sync(rbd_dev); 54683abef3b3SAlex Elder rbd_dev_image_release(rbd_dev); 54693abef3b3SAlex Elder goto err_out_module; 54703abef3b3SAlex Elder } 54713abef3b3SAlex Elder 5472602adf40SYehuda Sadeh return count; 5473b536f69aSAlex Elder 5474c53d5893SAlex Elder err_out_rbd_dev: 5475c53d5893SAlex Elder rbd_dev_destroy(rbd_dev); 5476bd4ba655SAlex Elder err_out_client: 54779d3997fdSAlex Elder rbd_put_client(rbdc); 54780ddebc0cSAlex Elder err_out_args: 5479859c31dfSAlex Elder rbd_spec_put(spec); 5480bd4ba655SAlex Elder err_out_module: 5481bd4ba655SAlex Elder module_put(THIS_MODULE); 548227cc2594SAlex Elder 5483602adf40SYehuda Sadeh dout("Error adding device %s\n", buf); 548427cc2594SAlex Elder 548527cc2594SAlex Elder return (ssize_t)rc; 5486602adf40SYehuda Sadeh } 5487602adf40SYehuda Sadeh 54889b60e70bSIlya Dryomov static ssize_t rbd_add(struct bus_type *bus, 54899b60e70bSIlya Dryomov const char *buf, 54909b60e70bSIlya Dryomov size_t count) 54919b60e70bSIlya Dryomov { 54929b60e70bSIlya Dryomov if (single_major) 54939b60e70bSIlya Dryomov return -EINVAL; 54949b60e70bSIlya Dryomov 54959b60e70bSIlya Dryomov return do_rbd_add(bus, buf, count); 54969b60e70bSIlya Dryomov } 54979b60e70bSIlya Dryomov 54989b60e70bSIlya Dryomov static ssize_t rbd_add_single_major(struct bus_type *bus, 54999b60e70bSIlya Dryomov const char *buf, 55009b60e70bSIlya Dryomov size_t count) 55019b60e70bSIlya Dryomov { 55029b60e70bSIlya Dryomov return do_rbd_add(bus, buf, count); 55039b60e70bSIlya Dryomov } 55049b60e70bSIlya Dryomov 5505200a6a8bSAlex Elder static void rbd_dev_device_release(struct device *dev) 5506602adf40SYehuda Sadeh { 5507593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 5508602adf40SYehuda Sadeh 5509602adf40SYehuda Sadeh rbd_free_disk(rbd_dev); 5510200a6a8bSAlex Elder clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags); 55116d80b130SAlex Elder rbd_dev_mapping_clear(rbd_dev); 55129b60e70bSIlya Dryomov if (!single_major) 5513602adf40SYehuda Sadeh unregister_blkdev(rbd_dev->major, rbd_dev->name); 5514e2839308SAlex Elder rbd_dev_id_put(rbd_dev); 5515d1cf5788SAlex Elder rbd_dev_mapping_clear(rbd_dev); 5516602adf40SYehuda Sadeh } 5517602adf40SYehuda Sadeh 551805a46afdSAlex Elder static void rbd_dev_remove_parent(struct rbd_device *rbd_dev) 551905a46afdSAlex Elder { 5520ad945fc1SAlex Elder while (rbd_dev->parent) { 552105a46afdSAlex Elder struct rbd_device *first = rbd_dev; 552205a46afdSAlex Elder struct rbd_device *second = first->parent; 552305a46afdSAlex Elder struct rbd_device *third; 552405a46afdSAlex Elder 552505a46afdSAlex Elder /* 552605a46afdSAlex Elder * Follow to the parent with no grandparent and 552705a46afdSAlex Elder * remove it. 552805a46afdSAlex Elder */ 552905a46afdSAlex Elder while (second && (third = second->parent)) { 553005a46afdSAlex Elder first = second; 553105a46afdSAlex Elder second = third; 553205a46afdSAlex Elder } 5533ad945fc1SAlex Elder rbd_assert(second); 55348ad42cd0SAlex Elder rbd_dev_image_release(second); 5535ad945fc1SAlex Elder first->parent = NULL; 5536ad945fc1SAlex Elder first->parent_overlap = 0; 5537ad945fc1SAlex Elder 5538ad945fc1SAlex Elder rbd_assert(first->parent_spec); 553905a46afdSAlex Elder rbd_spec_put(first->parent_spec); 554005a46afdSAlex Elder first->parent_spec = NULL; 554105a46afdSAlex Elder } 554205a46afdSAlex Elder } 554305a46afdSAlex Elder 55449b60e70bSIlya Dryomov static ssize_t do_rbd_remove(struct bus_type *bus, 5545602adf40SYehuda Sadeh const char *buf, 5546602adf40SYehuda Sadeh size_t count) 5547602adf40SYehuda Sadeh { 5548602adf40SYehuda Sadeh struct rbd_device *rbd_dev = NULL; 5549751cc0e3SAlex Elder struct list_head *tmp; 5550751cc0e3SAlex Elder int dev_id; 5551602adf40SYehuda Sadeh unsigned long ul; 555282a442d2SAlex Elder bool already = false; 55530d8189e1SAlex Elder int ret; 5554602adf40SYehuda Sadeh 5555bb8e0e84SJingoo Han ret = kstrtoul(buf, 10, &ul); 55560d8189e1SAlex Elder if (ret) 55570d8189e1SAlex Elder return ret; 5558602adf40SYehuda Sadeh 5559602adf40SYehuda Sadeh /* convert to int; abort if we lost anything in the conversion */ 5560751cc0e3SAlex Elder dev_id = (int)ul; 5561751cc0e3SAlex Elder if (dev_id != ul) 5562602adf40SYehuda Sadeh return -EINVAL; 5563602adf40SYehuda Sadeh 5564602adf40SYehuda Sadeh ret = -ENOENT; 5565751cc0e3SAlex Elder spin_lock(&rbd_dev_list_lock); 5566751cc0e3SAlex Elder list_for_each(tmp, &rbd_dev_list) { 5567751cc0e3SAlex Elder rbd_dev = list_entry(tmp, struct rbd_device, node); 5568751cc0e3SAlex Elder if (rbd_dev->dev_id == dev_id) { 5569751cc0e3SAlex Elder ret = 0; 5570751cc0e3SAlex Elder break; 5571602adf40SYehuda Sadeh } 5572751cc0e3SAlex Elder } 5573751cc0e3SAlex Elder if (!ret) { 5574a14ea269SAlex Elder spin_lock_irq(&rbd_dev->lock); 5575b82d167bSAlex Elder if (rbd_dev->open_count) 557642382b70SAlex Elder ret = -EBUSY; 5577b82d167bSAlex Elder else 557882a442d2SAlex Elder already = test_and_set_bit(RBD_DEV_FLAG_REMOVING, 557982a442d2SAlex Elder &rbd_dev->flags); 5580a14ea269SAlex Elder spin_unlock_irq(&rbd_dev->lock); 5581751cc0e3SAlex Elder } 5582751cc0e3SAlex Elder spin_unlock(&rbd_dev_list_lock); 558382a442d2SAlex Elder if (ret < 0 || already) 55841ba0f1e7SAlex Elder return ret; 5585751cc0e3SAlex Elder 5586fca27065SIlya Dryomov rbd_dev_header_unwatch_sync(rbd_dev); 55879abc5990SJosh Durgin /* 55889abc5990SJosh Durgin * flush remaining watch callbacks - these must be complete 55899abc5990SJosh Durgin * before the osd_client is shutdown 55909abc5990SJosh Durgin */ 55919abc5990SJosh Durgin dout("%s: flushing notifies", __func__); 55929abc5990SJosh Durgin ceph_osdc_flush_notifies(&rbd_dev->rbd_client->client->osdc); 5593fca27065SIlya Dryomov 55949875201eSJosh Durgin /* 55959875201eSJosh Durgin * Don't free anything from rbd_dev->disk until after all 55969875201eSJosh Durgin * notifies are completely processed. Otherwise 55979875201eSJosh Durgin * rbd_bus_del_dev() will race with rbd_watch_cb(), resulting 55989875201eSJosh Durgin * in a potential use after free of rbd_dev->disk or rbd_dev. 55999875201eSJosh Durgin */ 56009875201eSJosh Durgin rbd_bus_del_dev(rbd_dev); 56018ad42cd0SAlex Elder rbd_dev_image_release(rbd_dev); 560279ab7558SAlex Elder module_put(THIS_MODULE); 5603aafb230eSAlex Elder 56041ba0f1e7SAlex Elder return count; 5605602adf40SYehuda Sadeh } 5606602adf40SYehuda Sadeh 56079b60e70bSIlya Dryomov static ssize_t rbd_remove(struct bus_type *bus, 56089b60e70bSIlya Dryomov const char *buf, 56099b60e70bSIlya Dryomov size_t count) 56109b60e70bSIlya Dryomov { 56119b60e70bSIlya Dryomov if (single_major) 56129b60e70bSIlya Dryomov return -EINVAL; 56139b60e70bSIlya Dryomov 56149b60e70bSIlya Dryomov return do_rbd_remove(bus, buf, count); 56159b60e70bSIlya Dryomov } 56169b60e70bSIlya Dryomov 56179b60e70bSIlya Dryomov static ssize_t rbd_remove_single_major(struct bus_type *bus, 56189b60e70bSIlya Dryomov const char *buf, 56199b60e70bSIlya Dryomov size_t count) 56209b60e70bSIlya Dryomov { 56219b60e70bSIlya Dryomov return do_rbd_remove(bus, buf, count); 56229b60e70bSIlya Dryomov } 56239b60e70bSIlya Dryomov 5624602adf40SYehuda Sadeh /* 5625602adf40SYehuda Sadeh * create control files in sysfs 5626dfc5606dSYehuda Sadeh * /sys/bus/rbd/... 5627602adf40SYehuda Sadeh */ 5628602adf40SYehuda Sadeh static int rbd_sysfs_init(void) 5629602adf40SYehuda Sadeh { 5630dfc5606dSYehuda Sadeh int ret; 5631602adf40SYehuda Sadeh 5632fed4c143SAlex Elder ret = device_register(&rbd_root_dev); 5633dfc5606dSYehuda Sadeh if (ret < 0) 5634dfc5606dSYehuda Sadeh return ret; 5635602adf40SYehuda Sadeh 5636fed4c143SAlex Elder ret = bus_register(&rbd_bus_type); 5637fed4c143SAlex Elder if (ret < 0) 5638fed4c143SAlex Elder device_unregister(&rbd_root_dev); 5639602adf40SYehuda Sadeh 5640602adf40SYehuda Sadeh return ret; 5641602adf40SYehuda Sadeh } 5642602adf40SYehuda Sadeh 5643602adf40SYehuda Sadeh static void rbd_sysfs_cleanup(void) 5644602adf40SYehuda Sadeh { 5645dfc5606dSYehuda Sadeh bus_unregister(&rbd_bus_type); 5646fed4c143SAlex Elder device_unregister(&rbd_root_dev); 5647602adf40SYehuda Sadeh } 5648602adf40SYehuda Sadeh 56491c2a9dfeSAlex Elder static int rbd_slab_init(void) 56501c2a9dfeSAlex Elder { 56511c2a9dfeSAlex Elder rbd_assert(!rbd_img_request_cache); 56521c2a9dfeSAlex Elder rbd_img_request_cache = kmem_cache_create("rbd_img_request", 56531c2a9dfeSAlex Elder sizeof (struct rbd_img_request), 56541c2a9dfeSAlex Elder __alignof__(struct rbd_img_request), 56551c2a9dfeSAlex Elder 0, NULL); 5656868311b1SAlex Elder if (!rbd_img_request_cache) 5657868311b1SAlex Elder return -ENOMEM; 5658868311b1SAlex Elder 5659868311b1SAlex Elder rbd_assert(!rbd_obj_request_cache); 5660868311b1SAlex Elder rbd_obj_request_cache = kmem_cache_create("rbd_obj_request", 5661868311b1SAlex Elder sizeof (struct rbd_obj_request), 5662868311b1SAlex Elder __alignof__(struct rbd_obj_request), 5663868311b1SAlex Elder 0, NULL); 566478c2a44aSAlex Elder if (!rbd_obj_request_cache) 566578c2a44aSAlex Elder goto out_err; 566678c2a44aSAlex Elder 566778c2a44aSAlex Elder rbd_assert(!rbd_segment_name_cache); 566878c2a44aSAlex Elder rbd_segment_name_cache = kmem_cache_create("rbd_segment_name", 56692d0ebc5dSIlya Dryomov CEPH_MAX_OID_NAME_LEN + 1, 1, 0, NULL); 567078c2a44aSAlex Elder if (rbd_segment_name_cache) 56711c2a9dfeSAlex Elder return 0; 567278c2a44aSAlex Elder out_err: 567378c2a44aSAlex Elder if (rbd_obj_request_cache) { 567478c2a44aSAlex Elder kmem_cache_destroy(rbd_obj_request_cache); 567578c2a44aSAlex Elder rbd_obj_request_cache = NULL; 567678c2a44aSAlex Elder } 56771c2a9dfeSAlex Elder 5678868311b1SAlex Elder kmem_cache_destroy(rbd_img_request_cache); 5679868311b1SAlex Elder rbd_img_request_cache = NULL; 5680868311b1SAlex Elder 56811c2a9dfeSAlex Elder return -ENOMEM; 56821c2a9dfeSAlex Elder } 56831c2a9dfeSAlex Elder 56841c2a9dfeSAlex Elder static void rbd_slab_exit(void) 56851c2a9dfeSAlex Elder { 568678c2a44aSAlex Elder rbd_assert(rbd_segment_name_cache); 568778c2a44aSAlex Elder kmem_cache_destroy(rbd_segment_name_cache); 568878c2a44aSAlex Elder rbd_segment_name_cache = NULL; 568978c2a44aSAlex Elder 5690868311b1SAlex Elder rbd_assert(rbd_obj_request_cache); 5691868311b1SAlex Elder kmem_cache_destroy(rbd_obj_request_cache); 5692868311b1SAlex Elder rbd_obj_request_cache = NULL; 5693868311b1SAlex Elder 56941c2a9dfeSAlex Elder rbd_assert(rbd_img_request_cache); 56951c2a9dfeSAlex Elder kmem_cache_destroy(rbd_img_request_cache); 56961c2a9dfeSAlex Elder rbd_img_request_cache = NULL; 56971c2a9dfeSAlex Elder } 56981c2a9dfeSAlex Elder 5699cc344fa1SAlex Elder static int __init rbd_init(void) 5700602adf40SYehuda Sadeh { 5701602adf40SYehuda Sadeh int rc; 5702602adf40SYehuda Sadeh 57031e32d34cSAlex Elder if (!libceph_compatible(NULL)) { 57041e32d34cSAlex Elder rbd_warn(NULL, "libceph incompatibility (quitting)"); 57051e32d34cSAlex Elder return -EINVAL; 57061e32d34cSAlex Elder } 5707e1b4d96dSIlya Dryomov 57081c2a9dfeSAlex Elder rc = rbd_slab_init(); 5709602adf40SYehuda Sadeh if (rc) 5710602adf40SYehuda Sadeh return rc; 5711e1b4d96dSIlya Dryomov 5712f5ee37bdSIlya Dryomov /* 5713f5ee37bdSIlya Dryomov * The number of active work items is limited by the number of 5714f77303bdSIlya Dryomov * rbd devices * queue depth, so leave @max_active at default. 5715f5ee37bdSIlya Dryomov */ 5716f5ee37bdSIlya Dryomov rbd_wq = alloc_workqueue(RBD_DRV_NAME, WQ_MEM_RECLAIM, 0); 5717f5ee37bdSIlya Dryomov if (!rbd_wq) { 5718f5ee37bdSIlya Dryomov rc = -ENOMEM; 5719f5ee37bdSIlya Dryomov goto err_out_slab; 5720f5ee37bdSIlya Dryomov } 5721f5ee37bdSIlya Dryomov 57229b60e70bSIlya Dryomov if (single_major) { 57239b60e70bSIlya Dryomov rbd_major = register_blkdev(0, RBD_DRV_NAME); 57249b60e70bSIlya Dryomov if (rbd_major < 0) { 57259b60e70bSIlya Dryomov rc = rbd_major; 5726f5ee37bdSIlya Dryomov goto err_out_wq; 57279b60e70bSIlya Dryomov } 57289b60e70bSIlya Dryomov } 57299b60e70bSIlya Dryomov 57301c2a9dfeSAlex Elder rc = rbd_sysfs_init(); 57311c2a9dfeSAlex Elder if (rc) 57329b60e70bSIlya Dryomov goto err_out_blkdev; 57331c2a9dfeSAlex Elder 57349b60e70bSIlya Dryomov if (single_major) 57359b60e70bSIlya Dryomov pr_info("loaded (major %d)\n", rbd_major); 57369b60e70bSIlya Dryomov else 5737e1b4d96dSIlya Dryomov pr_info("loaded\n"); 57389b60e70bSIlya Dryomov 5739e1b4d96dSIlya Dryomov return 0; 5740e1b4d96dSIlya Dryomov 57419b60e70bSIlya Dryomov err_out_blkdev: 57429b60e70bSIlya Dryomov if (single_major) 57439b60e70bSIlya Dryomov unregister_blkdev(rbd_major, RBD_DRV_NAME); 5744f5ee37bdSIlya Dryomov err_out_wq: 5745f5ee37bdSIlya Dryomov destroy_workqueue(rbd_wq); 5746e1b4d96dSIlya Dryomov err_out_slab: 5747e1b4d96dSIlya Dryomov rbd_slab_exit(); 57481c2a9dfeSAlex Elder return rc; 5749602adf40SYehuda Sadeh } 5750602adf40SYehuda Sadeh 5751cc344fa1SAlex Elder static void __exit rbd_exit(void) 5752602adf40SYehuda Sadeh { 5753ffe312cfSIlya Dryomov ida_destroy(&rbd_dev_id_ida); 5754602adf40SYehuda Sadeh rbd_sysfs_cleanup(); 57559b60e70bSIlya Dryomov if (single_major) 57569b60e70bSIlya Dryomov unregister_blkdev(rbd_major, RBD_DRV_NAME); 5757f5ee37bdSIlya Dryomov destroy_workqueue(rbd_wq); 57581c2a9dfeSAlex Elder rbd_slab_exit(); 5759602adf40SYehuda Sadeh } 5760602adf40SYehuda Sadeh 5761602adf40SYehuda Sadeh module_init(rbd_init); 5762602adf40SYehuda Sadeh module_exit(rbd_exit); 5763602adf40SYehuda Sadeh 5764d552c619SAlex Elder MODULE_AUTHOR("Alex Elder <elder@inktank.com>"); 5765602adf40SYehuda Sadeh MODULE_AUTHOR("Sage Weil <sage@newdream.net>"); 5766602adf40SYehuda Sadeh MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>"); 5767602adf40SYehuda Sadeh /* following authorship retained from original osdblk.c */ 5768602adf40SYehuda Sadeh MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>"); 5769602adf40SYehuda Sadeh 577090da258bSIlya Dryomov MODULE_DESCRIPTION("RADOS Block Device (RBD) driver"); 5771602adf40SYehuda Sadeh MODULE_LICENSE("GPL"); 5772