1e2a58ee5SAlex Elder 2602adf40SYehuda Sadeh /* 3602adf40SYehuda Sadeh rbd.c -- Export ceph rados objects as a Linux block device 4602adf40SYehuda Sadeh 5602adf40SYehuda Sadeh 6602adf40SYehuda Sadeh based on drivers/block/osdblk.c: 7602adf40SYehuda Sadeh 8602adf40SYehuda Sadeh Copyright 2009 Red Hat, Inc. 9602adf40SYehuda Sadeh 10602adf40SYehuda Sadeh This program is free software; you can redistribute it and/or modify 11602adf40SYehuda Sadeh it under the terms of the GNU General Public License as published by 12602adf40SYehuda Sadeh the Free Software Foundation. 13602adf40SYehuda Sadeh 14602adf40SYehuda Sadeh This program is distributed in the hope that it will be useful, 15602adf40SYehuda Sadeh but WITHOUT ANY WARRANTY; without even the implied warranty of 16602adf40SYehuda Sadeh MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17602adf40SYehuda Sadeh GNU General Public License for more details. 18602adf40SYehuda Sadeh 19602adf40SYehuda Sadeh You should have received a copy of the GNU General Public License 20602adf40SYehuda Sadeh along with this program; see the file COPYING. If not, write to 21602adf40SYehuda Sadeh the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. 
22602adf40SYehuda Sadeh 23602adf40SYehuda Sadeh 24602adf40SYehuda Sadeh 25dfc5606dSYehuda Sadeh For usage instructions, please refer to: 26602adf40SYehuda Sadeh 27dfc5606dSYehuda Sadeh Documentation/ABI/testing/sysfs-bus-rbd 28602adf40SYehuda Sadeh 29602adf40SYehuda Sadeh */ 30602adf40SYehuda Sadeh 31602adf40SYehuda Sadeh #include <linux/ceph/libceph.h> 32602adf40SYehuda Sadeh #include <linux/ceph/osd_client.h> 33602adf40SYehuda Sadeh #include <linux/ceph/mon_client.h> 34602adf40SYehuda Sadeh #include <linux/ceph/decode.h> 3559c2be1eSYehuda Sadeh #include <linux/parser.h> 3630d1cff8SAlex Elder #include <linux/bsearch.h> 37602adf40SYehuda Sadeh 38602adf40SYehuda Sadeh #include <linux/kernel.h> 39602adf40SYehuda Sadeh #include <linux/device.h> 40602adf40SYehuda Sadeh #include <linux/module.h> 417ad18afaSChristoph Hellwig #include <linux/blk-mq.h> 42602adf40SYehuda Sadeh #include <linux/fs.h> 43602adf40SYehuda Sadeh #include <linux/blkdev.h> 441c2a9dfeSAlex Elder #include <linux/slab.h> 45f8a22fc2SIlya Dryomov #include <linux/idr.h> 46bc1ecc65SIlya Dryomov #include <linux/workqueue.h> 47602adf40SYehuda Sadeh 48602adf40SYehuda Sadeh #include "rbd_types.h" 49602adf40SYehuda Sadeh 50aafb230eSAlex Elder #define RBD_DEBUG /* Activate rbd_assert() calls */ 51aafb230eSAlex Elder 52593a9e7bSAlex Elder /* 53593a9e7bSAlex Elder * The basic unit of block I/O is a sector. It is interpreted in a 54593a9e7bSAlex Elder * number of contexts in Linux (blk, bio, genhd), but the default is 55593a9e7bSAlex Elder * universally 512 bytes. These symbols are just slightly more 56593a9e7bSAlex Elder * meaningful than the bare numbers they represent. 57593a9e7bSAlex Elder */ 58593a9e7bSAlex Elder #define SECTOR_SHIFT 9 59593a9e7bSAlex Elder #define SECTOR_SIZE (1ULL << SECTOR_SHIFT) 60593a9e7bSAlex Elder 61a2acd00eSAlex Elder /* 62a2acd00eSAlex Elder * Increment the given counter and return its updated value. 63a2acd00eSAlex Elder * If the counter is already 0 it will not be incremented. 
64a2acd00eSAlex Elder * If the counter is already at its maximum value returns 65a2acd00eSAlex Elder * -EINVAL without updating it. 66a2acd00eSAlex Elder */ 67a2acd00eSAlex Elder static int atomic_inc_return_safe(atomic_t *v) 68a2acd00eSAlex Elder { 69a2acd00eSAlex Elder unsigned int counter; 70a2acd00eSAlex Elder 71a2acd00eSAlex Elder counter = (unsigned int)__atomic_add_unless(v, 1, 0); 72a2acd00eSAlex Elder if (counter <= (unsigned int)INT_MAX) 73a2acd00eSAlex Elder return (int)counter; 74a2acd00eSAlex Elder 75a2acd00eSAlex Elder atomic_dec(v); 76a2acd00eSAlex Elder 77a2acd00eSAlex Elder return -EINVAL; 78a2acd00eSAlex Elder } 79a2acd00eSAlex Elder 80a2acd00eSAlex Elder /* Decrement the counter. Return the resulting value, or -EINVAL */ 81a2acd00eSAlex Elder static int atomic_dec_return_safe(atomic_t *v) 82a2acd00eSAlex Elder { 83a2acd00eSAlex Elder int counter; 84a2acd00eSAlex Elder 85a2acd00eSAlex Elder counter = atomic_dec_return(v); 86a2acd00eSAlex Elder if (counter >= 0) 87a2acd00eSAlex Elder return counter; 88a2acd00eSAlex Elder 89a2acd00eSAlex Elder atomic_inc(v); 90a2acd00eSAlex Elder 91a2acd00eSAlex Elder return -EINVAL; 92a2acd00eSAlex Elder } 93a2acd00eSAlex Elder 94f0f8cef5SAlex Elder #define RBD_DRV_NAME "rbd" 95602adf40SYehuda Sadeh 967e513d43SIlya Dryomov #define RBD_MINORS_PER_MAJOR 256 977e513d43SIlya Dryomov #define RBD_SINGLE_MAJOR_PART_SHIFT 4 98602adf40SYehuda Sadeh 996d69bb53SIlya Dryomov #define RBD_MAX_PARENT_CHAIN_LEN 16 1006d69bb53SIlya Dryomov 101d4b125e9SAlex Elder #define RBD_SNAP_DEV_NAME_PREFIX "snap_" 102d4b125e9SAlex Elder #define RBD_MAX_SNAP_NAME_LEN \ 103d4b125e9SAlex Elder (NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1)) 104d4b125e9SAlex Elder 10535d489f9SAlex Elder #define RBD_MAX_SNAP_COUNT 510 /* allows max snapc to fit in 4KB */ 106602adf40SYehuda Sadeh 107602adf40SYehuda Sadeh #define RBD_SNAP_HEAD_NAME "-" 108602adf40SYehuda Sadeh 1099682fc6dSAlex Elder #define BAD_SNAP_INDEX U32_MAX /* invalid index into snap array 
*/ 1109682fc6dSAlex Elder 1119e15b77dSAlex Elder /* This allows a single page to hold an image name sent by OSD */ 1129e15b77dSAlex Elder #define RBD_IMAGE_NAME_LEN_MAX (PAGE_SIZE - sizeof (__le32) - 1) 113589d30e0SAlex Elder #define RBD_IMAGE_ID_LEN_MAX 64 1149e15b77dSAlex Elder 1151e130199SAlex Elder #define RBD_OBJ_PREFIX_LEN_MAX 64 116589d30e0SAlex Elder 117d889140cSAlex Elder /* Feature bits */ 118d889140cSAlex Elder 1195cbf6f12SAlex Elder #define RBD_FEATURE_LAYERING (1<<0) 1205cbf6f12SAlex Elder #define RBD_FEATURE_STRIPINGV2 (1<<1) 1215cbf6f12SAlex Elder #define RBD_FEATURES_ALL \ 1225cbf6f12SAlex Elder (RBD_FEATURE_LAYERING | RBD_FEATURE_STRIPINGV2) 123d889140cSAlex Elder 124d889140cSAlex Elder /* Features supported by this (client software) implementation. */ 125d889140cSAlex Elder 126770eba6eSAlex Elder #define RBD_FEATURES_SUPPORTED (RBD_FEATURES_ALL) 127d889140cSAlex Elder 12881a89793SAlex Elder /* 12981a89793SAlex Elder * An RBD device name will be "rbd#", where the "rbd" comes from 13081a89793SAlex Elder * RBD_DRV_NAME above, and # is a unique integer identifier. 13181a89793SAlex Elder */ 132602adf40SYehuda Sadeh #define DEV_NAME_LEN 32 133602adf40SYehuda Sadeh 134602adf40SYehuda Sadeh /* 135602adf40SYehuda Sadeh * block device image metadata (in-memory version) 136602adf40SYehuda Sadeh */ 137602adf40SYehuda Sadeh struct rbd_image_header { 138f35a4deeSAlex Elder /* These six fields never change for a given rbd image */ 139849b4260SAlex Elder char *object_prefix; 140602adf40SYehuda Sadeh __u8 obj_order; 141602adf40SYehuda Sadeh __u8 crypt_type; 142602adf40SYehuda Sadeh __u8 comp_type; 143f35a4deeSAlex Elder u64 stripe_unit; 144f35a4deeSAlex Elder u64 stripe_count; 145f35a4deeSAlex Elder u64 features; /* Might be changeable someday? 
*/ 146602adf40SYehuda Sadeh 147f84344f3SAlex Elder /* The remaining fields need to be updated occasionally */ 148f84344f3SAlex Elder u64 image_size; 149f84344f3SAlex Elder struct ceph_snap_context *snapc; 150f35a4deeSAlex Elder char *snap_names; /* format 1 only */ 151f35a4deeSAlex Elder u64 *snap_sizes; /* format 1 only */ 15259c2be1eSYehuda Sadeh }; 15359c2be1eSYehuda Sadeh 1540d7dbfceSAlex Elder /* 1550d7dbfceSAlex Elder * An rbd image specification. 1560d7dbfceSAlex Elder * 1570d7dbfceSAlex Elder * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely 158c66c6e0cSAlex Elder * identify an image. Each rbd_dev structure includes a pointer to 159c66c6e0cSAlex Elder * an rbd_spec structure that encapsulates this identity. 160c66c6e0cSAlex Elder * 161c66c6e0cSAlex Elder * Each of the id's in an rbd_spec has an associated name. For a 162c66c6e0cSAlex Elder * user-mapped image, the names are supplied and the id's associated 163c66c6e0cSAlex Elder * with them are looked up. For a layered image, a parent image is 164c66c6e0cSAlex Elder * defined by the tuple, and the names are looked up. 165c66c6e0cSAlex Elder * 166c66c6e0cSAlex Elder * An rbd_dev structure contains a parent_spec pointer which is 167c66c6e0cSAlex Elder * non-null if the image it represents is a child in a layered 168c66c6e0cSAlex Elder * image. This pointer will refer to the rbd_spec structure used 169c66c6e0cSAlex Elder * by the parent rbd_dev for its own identity (i.e., the structure 170c66c6e0cSAlex Elder * is shared between the parent and child). 171c66c6e0cSAlex Elder * 172c66c6e0cSAlex Elder * Since these structures are populated once, during the discovery 173c66c6e0cSAlex Elder * phase of image construction, they are effectively immutable so 174c66c6e0cSAlex Elder * we make no effort to synchronize access to them. 175c66c6e0cSAlex Elder * 176c66c6e0cSAlex Elder * Note that code herein does not assume the image name is known (it 177c66c6e0cSAlex Elder * could be a null pointer). 
1780d7dbfceSAlex Elder */ 1790d7dbfceSAlex Elder struct rbd_spec { 1800d7dbfceSAlex Elder u64 pool_id; 181ecb4dc22SAlex Elder const char *pool_name; 1820d7dbfceSAlex Elder 183ecb4dc22SAlex Elder const char *image_id; 184ecb4dc22SAlex Elder const char *image_name; 1850d7dbfceSAlex Elder 1860d7dbfceSAlex Elder u64 snap_id; 187ecb4dc22SAlex Elder const char *snap_name; 1880d7dbfceSAlex Elder 1890d7dbfceSAlex Elder struct kref kref; 1900d7dbfceSAlex Elder }; 1910d7dbfceSAlex Elder 192602adf40SYehuda Sadeh /* 193f0f8cef5SAlex Elder * an instance of the client. multiple devices may share an rbd client. 194602adf40SYehuda Sadeh */ 195602adf40SYehuda Sadeh struct rbd_client { 196602adf40SYehuda Sadeh struct ceph_client *client; 197602adf40SYehuda Sadeh struct kref kref; 198602adf40SYehuda Sadeh struct list_head node; 199602adf40SYehuda Sadeh }; 200602adf40SYehuda Sadeh 201bf0d5f50SAlex Elder struct rbd_img_request; 202bf0d5f50SAlex Elder typedef void (*rbd_img_callback_t)(struct rbd_img_request *); 203bf0d5f50SAlex Elder 204bf0d5f50SAlex Elder #define BAD_WHICH U32_MAX /* Good which or bad which, which? 
*/ 205bf0d5f50SAlex Elder 206bf0d5f50SAlex Elder struct rbd_obj_request; 207bf0d5f50SAlex Elder typedef void (*rbd_obj_callback_t)(struct rbd_obj_request *); 208bf0d5f50SAlex Elder 2099969ebc5SAlex Elder enum obj_request_type { 2109969ebc5SAlex Elder OBJ_REQUEST_NODATA, OBJ_REQUEST_BIO, OBJ_REQUEST_PAGES 2119969ebc5SAlex Elder }; 212bf0d5f50SAlex Elder 2136d2940c8SGuangliang Zhao enum obj_operation_type { 2146d2940c8SGuangliang Zhao OBJ_OP_WRITE, 2156d2940c8SGuangliang Zhao OBJ_OP_READ, 21690e98c52SGuangliang Zhao OBJ_OP_DISCARD, 2176d2940c8SGuangliang Zhao }; 2186d2940c8SGuangliang Zhao 219926f9b3fSAlex Elder enum obj_req_flags { 220926f9b3fSAlex Elder OBJ_REQ_DONE, /* completion flag: not done = 0, done = 1 */ 2216365d33aSAlex Elder OBJ_REQ_IMG_DATA, /* object usage: standalone = 0, image = 1 */ 2225679c59fSAlex Elder OBJ_REQ_KNOWN, /* EXISTS flag valid: no = 0, yes = 1 */ 2235679c59fSAlex Elder OBJ_REQ_EXISTS, /* target exists: no = 0, yes = 1 */ 224926f9b3fSAlex Elder }; 225926f9b3fSAlex Elder 226bf0d5f50SAlex Elder struct rbd_obj_request { 227bf0d5f50SAlex Elder const char *object_name; 228bf0d5f50SAlex Elder u64 offset; /* object start byte */ 229bf0d5f50SAlex Elder u64 length; /* bytes from offset */ 230926f9b3fSAlex Elder unsigned long flags; 231bf0d5f50SAlex Elder 232c5b5ef6cSAlex Elder /* 233c5b5ef6cSAlex Elder * An object request associated with an image will have its 234c5b5ef6cSAlex Elder * img_data flag set; a standalone object request will not. 235c5b5ef6cSAlex Elder * 236c5b5ef6cSAlex Elder * A standalone object request will have which == BAD_WHICH 237c5b5ef6cSAlex Elder * and a null obj_request pointer. 238c5b5ef6cSAlex Elder * 239c5b5ef6cSAlex Elder * An object request initiated in support of a layered image 240c5b5ef6cSAlex Elder * object (to check for its existence before a write) will 241c5b5ef6cSAlex Elder * have which == BAD_WHICH and a non-null obj_request pointer. 
242c5b5ef6cSAlex Elder * 243c5b5ef6cSAlex Elder * Finally, an object request for rbd image data will have 244c5b5ef6cSAlex Elder * which != BAD_WHICH, and will have a non-null img_request 245c5b5ef6cSAlex Elder * pointer. The value of which will be in the range 246c5b5ef6cSAlex Elder * 0..(img_request->obj_request_count-1). 247c5b5ef6cSAlex Elder */ 248c5b5ef6cSAlex Elder union { 249c5b5ef6cSAlex Elder struct rbd_obj_request *obj_request; /* STAT op */ 250c5b5ef6cSAlex Elder struct { 251bf0d5f50SAlex Elder struct rbd_img_request *img_request; 252c5b5ef6cSAlex Elder u64 img_offset; 253c5b5ef6cSAlex Elder /* links for img_request->obj_requests list */ 254c5b5ef6cSAlex Elder struct list_head links; 255c5b5ef6cSAlex Elder }; 256c5b5ef6cSAlex Elder }; 257bf0d5f50SAlex Elder u32 which; /* posn image request list */ 258bf0d5f50SAlex Elder 259bf0d5f50SAlex Elder enum obj_request_type type; 260788e2df3SAlex Elder union { 261bf0d5f50SAlex Elder struct bio *bio_list; 262788e2df3SAlex Elder struct { 263788e2df3SAlex Elder struct page **pages; 264788e2df3SAlex Elder u32 page_count; 265788e2df3SAlex Elder }; 266788e2df3SAlex Elder }; 2670eefd470SAlex Elder struct page **copyup_pages; 268ebda6408SAlex Elder u32 copyup_page_count; 269bf0d5f50SAlex Elder 270bf0d5f50SAlex Elder struct ceph_osd_request *osd_req; 271bf0d5f50SAlex Elder 272bf0d5f50SAlex Elder u64 xferred; /* bytes transferred */ 2731b83bef2SSage Weil int result; 274bf0d5f50SAlex Elder 275bf0d5f50SAlex Elder rbd_obj_callback_t callback; 276788e2df3SAlex Elder struct completion completion; 277bf0d5f50SAlex Elder 278bf0d5f50SAlex Elder struct kref kref; 279bf0d5f50SAlex Elder }; 280bf0d5f50SAlex Elder 2810c425248SAlex Elder enum img_req_flags { 2829849e986SAlex Elder IMG_REQ_WRITE, /* I/O direction: read = 0, write = 1 */ 2839849e986SAlex Elder IMG_REQ_CHILD, /* initiator: block = 0, child image = 1 */ 284d0b2e944SAlex Elder IMG_REQ_LAYERED, /* ENOENT handling: normal = 0, layered = 1 */ 28590e98c52SGuangliang Zhao 
IMG_REQ_DISCARD, /* discard: normal = 0, discard request = 1 */ 2860c425248SAlex Elder }; 2870c425248SAlex Elder 288bf0d5f50SAlex Elder struct rbd_img_request { 289bf0d5f50SAlex Elder struct rbd_device *rbd_dev; 290bf0d5f50SAlex Elder u64 offset; /* starting image byte offset */ 291bf0d5f50SAlex Elder u64 length; /* byte count from offset */ 2920c425248SAlex Elder unsigned long flags; 293bf0d5f50SAlex Elder union { 294bf0d5f50SAlex Elder u64 snap_id; /* for reads */ 2959849e986SAlex Elder struct ceph_snap_context *snapc; /* for writes */ 2969849e986SAlex Elder }; 2979849e986SAlex Elder union { 2989849e986SAlex Elder struct request *rq; /* block request */ 2999849e986SAlex Elder struct rbd_obj_request *obj_request; /* obj req initiator */ 300bf0d5f50SAlex Elder }; 3013d7efd18SAlex Elder struct page **copyup_pages; 302ebda6408SAlex Elder u32 copyup_page_count; 303bf0d5f50SAlex Elder spinlock_t completion_lock;/* protects next_completion */ 304bf0d5f50SAlex Elder u32 next_completion; 305bf0d5f50SAlex Elder rbd_img_callback_t callback; 30655f27e09SAlex Elder u64 xferred;/* aggregate bytes transferred */ 307a5a337d4SAlex Elder int result; /* first nonzero obj_request result */ 308bf0d5f50SAlex Elder 309bf0d5f50SAlex Elder u32 obj_request_count; 310bf0d5f50SAlex Elder struct list_head obj_requests; /* rbd_obj_request structs */ 311bf0d5f50SAlex Elder 312bf0d5f50SAlex Elder struct kref kref; 313bf0d5f50SAlex Elder }; 314bf0d5f50SAlex Elder 315bf0d5f50SAlex Elder #define for_each_obj_request(ireq, oreq) \ 316ef06f4d3SAlex Elder list_for_each_entry(oreq, &(ireq)->obj_requests, links) 317bf0d5f50SAlex Elder #define for_each_obj_request_from(ireq, oreq) \ 318ef06f4d3SAlex Elder list_for_each_entry_from(oreq, &(ireq)->obj_requests, links) 319bf0d5f50SAlex Elder #define for_each_obj_request_safe(ireq, oreq, n) \ 320ef06f4d3SAlex Elder list_for_each_entry_safe_reverse(oreq, n, &(ireq)->obj_requests, links) 321bf0d5f50SAlex Elder 322f84344f3SAlex Elder struct rbd_mapping { 
32399c1f08fSAlex Elder u64 size; 32434b13184SAlex Elder u64 features; 325f84344f3SAlex Elder bool read_only; 326f84344f3SAlex Elder }; 327f84344f3SAlex Elder 328602adf40SYehuda Sadeh /* 329602adf40SYehuda Sadeh * a single device 330602adf40SYehuda Sadeh */ 331602adf40SYehuda Sadeh struct rbd_device { 332de71a297SAlex Elder int dev_id; /* blkdev unique id */ 333602adf40SYehuda Sadeh 334602adf40SYehuda Sadeh int major; /* blkdev assigned major */ 335dd82fff1SIlya Dryomov int minor; 336602adf40SYehuda Sadeh struct gendisk *disk; /* blkdev's gendisk and rq */ 337602adf40SYehuda Sadeh 338a30b71b9SAlex Elder u32 image_format; /* Either 1 or 2 */ 339602adf40SYehuda Sadeh struct rbd_client *rbd_client; 340602adf40SYehuda Sadeh 341602adf40SYehuda Sadeh char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */ 342602adf40SYehuda Sadeh 343b82d167bSAlex Elder spinlock_t lock; /* queue, flags, open_count */ 344602adf40SYehuda Sadeh 345602adf40SYehuda Sadeh struct rbd_image_header header; 346b82d167bSAlex Elder unsigned long flags; /* possibly lock protected */ 3470d7dbfceSAlex Elder struct rbd_spec *spec; 348d147543dSIlya Dryomov struct rbd_options *opts; 349602adf40SYehuda Sadeh 350c41d13a3SIlya Dryomov struct ceph_object_id header_oid; 351922dab61SIlya Dryomov struct ceph_object_locator header_oloc; 352971f839aSAlex Elder 3531643dfa4SIlya Dryomov struct ceph_file_layout layout; /* used for all rbd requests */ 3540903e875SAlex Elder 355922dab61SIlya Dryomov struct ceph_osd_linger_request *watch_handle; 35659c2be1eSYehuda Sadeh 3571643dfa4SIlya Dryomov struct workqueue_struct *task_wq; 3581643dfa4SIlya Dryomov 35986b00e0dSAlex Elder struct rbd_spec *parent_spec; 36086b00e0dSAlex Elder u64 parent_overlap; 361a2acd00eSAlex Elder atomic_t parent_ref; 3622f82ee54SAlex Elder struct rbd_device *parent; 36386b00e0dSAlex Elder 3647ad18afaSChristoph Hellwig /* Block layer tags. 
*/ 3657ad18afaSChristoph Hellwig struct blk_mq_tag_set tag_set; 3667ad18afaSChristoph Hellwig 367c666601aSJosh Durgin /* protects updating the header */ 368c666601aSJosh Durgin struct rw_semaphore header_rwsem; 369f84344f3SAlex Elder 370f84344f3SAlex Elder struct rbd_mapping mapping; 371602adf40SYehuda Sadeh 372602adf40SYehuda Sadeh struct list_head node; 373dfc5606dSYehuda Sadeh 374dfc5606dSYehuda Sadeh /* sysfs related */ 375dfc5606dSYehuda Sadeh struct device dev; 376b82d167bSAlex Elder unsigned long open_count; /* protected by lock */ 377dfc5606dSYehuda Sadeh }; 378dfc5606dSYehuda Sadeh 379b82d167bSAlex Elder /* 380b82d167bSAlex Elder * Flag bits for rbd_dev->flags. If atomicity is required, 381b82d167bSAlex Elder * rbd_dev->lock is used to protect access. 382b82d167bSAlex Elder * 383b82d167bSAlex Elder * Currently, only the "removing" flag (which is coupled with the 384b82d167bSAlex Elder * "open_count" field) requires atomic access. 385b82d167bSAlex Elder */ 3866d292906SAlex Elder enum rbd_dev_flags { 3876d292906SAlex Elder RBD_DEV_FLAG_EXISTS, /* mapped snapshot has not been deleted */ 388b82d167bSAlex Elder RBD_DEV_FLAG_REMOVING, /* this mapping is being removed */ 3896d292906SAlex Elder }; 3906d292906SAlex Elder 391cfbf6377SAlex Elder static DEFINE_MUTEX(client_mutex); /* Serialize client creation */ 392e124a82fSAlex Elder 393602adf40SYehuda Sadeh static LIST_HEAD(rbd_dev_list); /* devices */ 394e124a82fSAlex Elder static DEFINE_SPINLOCK(rbd_dev_list_lock); 395e124a82fSAlex Elder 396602adf40SYehuda Sadeh static LIST_HEAD(rbd_client_list); /* clients */ 397432b8587SAlex Elder static DEFINE_SPINLOCK(rbd_client_list_lock); 398602adf40SYehuda Sadeh 39978c2a44aSAlex Elder /* Slab caches for frequently-allocated structures */ 40078c2a44aSAlex Elder 4011c2a9dfeSAlex Elder static struct kmem_cache *rbd_img_request_cache; 402868311b1SAlex Elder static struct kmem_cache *rbd_obj_request_cache; 40378c2a44aSAlex Elder static struct kmem_cache *rbd_segment_name_cache; 
4041c2a9dfeSAlex Elder 4059b60e70bSIlya Dryomov static int rbd_major; 406f8a22fc2SIlya Dryomov static DEFINE_IDA(rbd_dev_id_ida); 407f8a22fc2SIlya Dryomov 408f5ee37bdSIlya Dryomov static struct workqueue_struct *rbd_wq; 409f5ee37bdSIlya Dryomov 4109b60e70bSIlya Dryomov /* 4119b60e70bSIlya Dryomov * Default to false for now, as single-major requires >= 0.75 version of 4129b60e70bSIlya Dryomov * userspace rbd utility. 4139b60e70bSIlya Dryomov */ 4149b60e70bSIlya Dryomov static bool single_major = false; 4159b60e70bSIlya Dryomov module_param(single_major, bool, S_IRUGO); 4169b60e70bSIlya Dryomov MODULE_PARM_DESC(single_major, "Use a single major number for all rbd devices (default: false)"); 4179b60e70bSIlya Dryomov 4183d7efd18SAlex Elder static int rbd_img_request_submit(struct rbd_img_request *img_request); 4193d7efd18SAlex Elder 420f0f8cef5SAlex Elder static ssize_t rbd_add(struct bus_type *bus, const char *buf, 421f0f8cef5SAlex Elder size_t count); 422f0f8cef5SAlex Elder static ssize_t rbd_remove(struct bus_type *bus, const char *buf, 423f0f8cef5SAlex Elder size_t count); 4249b60e70bSIlya Dryomov static ssize_t rbd_add_single_major(struct bus_type *bus, const char *buf, 4259b60e70bSIlya Dryomov size_t count); 4269b60e70bSIlya Dryomov static ssize_t rbd_remove_single_major(struct bus_type *bus, const char *buf, 4279b60e70bSIlya Dryomov size_t count); 4286d69bb53SIlya Dryomov static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth); 429a2acd00eSAlex Elder static void rbd_spec_put(struct rbd_spec *spec); 430f0f8cef5SAlex Elder 4319b60e70bSIlya Dryomov static int rbd_dev_id_to_minor(int dev_id) 4329b60e70bSIlya Dryomov { 4337e513d43SIlya Dryomov return dev_id << RBD_SINGLE_MAJOR_PART_SHIFT; 4349b60e70bSIlya Dryomov } 4359b60e70bSIlya Dryomov 4369b60e70bSIlya Dryomov static int minor_to_rbd_dev_id(int minor) 4379b60e70bSIlya Dryomov { 4387e513d43SIlya Dryomov return minor >> RBD_SINGLE_MAJOR_PART_SHIFT; 4399b60e70bSIlya Dryomov } 4409b60e70bSIlya Dryomov 
441b15a21ddSGreg Kroah-Hartman static BUS_ATTR(add, S_IWUSR, NULL, rbd_add); 442b15a21ddSGreg Kroah-Hartman static BUS_ATTR(remove, S_IWUSR, NULL, rbd_remove); 4439b60e70bSIlya Dryomov static BUS_ATTR(add_single_major, S_IWUSR, NULL, rbd_add_single_major); 4449b60e70bSIlya Dryomov static BUS_ATTR(remove_single_major, S_IWUSR, NULL, rbd_remove_single_major); 445b15a21ddSGreg Kroah-Hartman 446b15a21ddSGreg Kroah-Hartman static struct attribute *rbd_bus_attrs[] = { 447b15a21ddSGreg Kroah-Hartman &bus_attr_add.attr, 448b15a21ddSGreg Kroah-Hartman &bus_attr_remove.attr, 4499b60e70bSIlya Dryomov &bus_attr_add_single_major.attr, 4509b60e70bSIlya Dryomov &bus_attr_remove_single_major.attr, 451b15a21ddSGreg Kroah-Hartman NULL, 452f0f8cef5SAlex Elder }; 45392c76dc0SIlya Dryomov 45492c76dc0SIlya Dryomov static umode_t rbd_bus_is_visible(struct kobject *kobj, 45592c76dc0SIlya Dryomov struct attribute *attr, int index) 45692c76dc0SIlya Dryomov { 4579b60e70bSIlya Dryomov if (!single_major && 4589b60e70bSIlya Dryomov (attr == &bus_attr_add_single_major.attr || 4599b60e70bSIlya Dryomov attr == &bus_attr_remove_single_major.attr)) 4609b60e70bSIlya Dryomov return 0; 4619b60e70bSIlya Dryomov 46292c76dc0SIlya Dryomov return attr->mode; 46392c76dc0SIlya Dryomov } 46492c76dc0SIlya Dryomov 46592c76dc0SIlya Dryomov static const struct attribute_group rbd_bus_group = { 46692c76dc0SIlya Dryomov .attrs = rbd_bus_attrs, 46792c76dc0SIlya Dryomov .is_visible = rbd_bus_is_visible, 46892c76dc0SIlya Dryomov }; 46992c76dc0SIlya Dryomov __ATTRIBUTE_GROUPS(rbd_bus); 470f0f8cef5SAlex Elder 471f0f8cef5SAlex Elder static struct bus_type rbd_bus_type = { 472f0f8cef5SAlex Elder .name = "rbd", 473b15a21ddSGreg Kroah-Hartman .bus_groups = rbd_bus_groups, 474f0f8cef5SAlex Elder }; 475f0f8cef5SAlex Elder 476f0f8cef5SAlex Elder static void rbd_root_dev_release(struct device *dev) 477f0f8cef5SAlex Elder { 478f0f8cef5SAlex Elder } 479f0f8cef5SAlex Elder 480f0f8cef5SAlex Elder static struct device rbd_root_dev = 
{ 481f0f8cef5SAlex Elder .init_name = "rbd", 482f0f8cef5SAlex Elder .release = rbd_root_dev_release, 483f0f8cef5SAlex Elder }; 484f0f8cef5SAlex Elder 48506ecc6cbSAlex Elder static __printf(2, 3) 48606ecc6cbSAlex Elder void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...) 48706ecc6cbSAlex Elder { 48806ecc6cbSAlex Elder struct va_format vaf; 48906ecc6cbSAlex Elder va_list args; 49006ecc6cbSAlex Elder 49106ecc6cbSAlex Elder va_start(args, fmt); 49206ecc6cbSAlex Elder vaf.fmt = fmt; 49306ecc6cbSAlex Elder vaf.va = &args; 49406ecc6cbSAlex Elder 49506ecc6cbSAlex Elder if (!rbd_dev) 49606ecc6cbSAlex Elder printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf); 49706ecc6cbSAlex Elder else if (rbd_dev->disk) 49806ecc6cbSAlex Elder printk(KERN_WARNING "%s: %s: %pV\n", 49906ecc6cbSAlex Elder RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf); 50006ecc6cbSAlex Elder else if (rbd_dev->spec && rbd_dev->spec->image_name) 50106ecc6cbSAlex Elder printk(KERN_WARNING "%s: image %s: %pV\n", 50206ecc6cbSAlex Elder RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf); 50306ecc6cbSAlex Elder else if (rbd_dev->spec && rbd_dev->spec->image_id) 50406ecc6cbSAlex Elder printk(KERN_WARNING "%s: id %s: %pV\n", 50506ecc6cbSAlex Elder RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf); 50606ecc6cbSAlex Elder else /* punt */ 50706ecc6cbSAlex Elder printk(KERN_WARNING "%s: rbd_dev %p: %pV\n", 50806ecc6cbSAlex Elder RBD_DRV_NAME, rbd_dev, &vaf); 50906ecc6cbSAlex Elder va_end(args); 51006ecc6cbSAlex Elder } 51106ecc6cbSAlex Elder 512aafb230eSAlex Elder #ifdef RBD_DEBUG 513aafb230eSAlex Elder #define rbd_assert(expr) \ 514aafb230eSAlex Elder if (unlikely(!(expr))) { \ 515aafb230eSAlex Elder printk(KERN_ERR "\nAssertion failure in %s() " \ 516aafb230eSAlex Elder "at line %d:\n\n" \ 517aafb230eSAlex Elder "\trbd_assert(%s);\n\n", \ 518aafb230eSAlex Elder __func__, __LINE__, #expr); \ 519aafb230eSAlex Elder BUG(); \ 520aafb230eSAlex Elder } 521aafb230eSAlex Elder #else /* !RBD_DEBUG */ 522aafb230eSAlex Elder # 
define rbd_assert(expr) ((void) 0) 523aafb230eSAlex Elder #endif /* !RBD_DEBUG */ 524dfc5606dSYehuda Sadeh 5252761713dSIlya Dryomov static void rbd_osd_copyup_callback(struct rbd_obj_request *obj_request); 526b454e36dSAlex Elder static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request); 52705a46afdSAlex Elder static void rbd_img_parent_read(struct rbd_obj_request *obj_request); 52805a46afdSAlex Elder static void rbd_dev_remove_parent(struct rbd_device *rbd_dev); 5298b3e1a56SAlex Elder 530cc4a38bdSAlex Elder static int rbd_dev_refresh(struct rbd_device *rbd_dev); 5312df3fac7SAlex Elder static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev); 532a720ae09SIlya Dryomov static int rbd_dev_header_info(struct rbd_device *rbd_dev); 533e8f59b59SIlya Dryomov static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev); 53454cac61fSAlex Elder static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, 53554cac61fSAlex Elder u64 snap_id); 5362ad3d716SAlex Elder static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id, 5372ad3d716SAlex Elder u8 *order, u64 *snap_size); 5382ad3d716SAlex Elder static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id, 5392ad3d716SAlex Elder u64 *snap_features); 54059c2be1eSYehuda Sadeh 541602adf40SYehuda Sadeh static int rbd_open(struct block_device *bdev, fmode_t mode) 542602adf40SYehuda Sadeh { 543f0f8cef5SAlex Elder struct rbd_device *rbd_dev = bdev->bd_disk->private_data; 544b82d167bSAlex Elder bool removing = false; 545602adf40SYehuda Sadeh 546f84344f3SAlex Elder if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only) 547602adf40SYehuda Sadeh return -EROFS; 548602adf40SYehuda Sadeh 549a14ea269SAlex Elder spin_lock_irq(&rbd_dev->lock); 550b82d167bSAlex Elder if (test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags)) 551b82d167bSAlex Elder removing = true; 552b82d167bSAlex Elder else 553b82d167bSAlex Elder rbd_dev->open_count++; 554a14ea269SAlex Elder 
spin_unlock_irq(&rbd_dev->lock); 555b82d167bSAlex Elder if (removing) 556b82d167bSAlex Elder return -ENOENT; 557b82d167bSAlex Elder 558c3e946ceSAlex Elder (void) get_device(&rbd_dev->dev); 559340c7a2bSAlex Elder 560602adf40SYehuda Sadeh return 0; 561602adf40SYehuda Sadeh } 562602adf40SYehuda Sadeh 563db2a144bSAl Viro static void rbd_release(struct gendisk *disk, fmode_t mode) 564dfc5606dSYehuda Sadeh { 565dfc5606dSYehuda Sadeh struct rbd_device *rbd_dev = disk->private_data; 566b82d167bSAlex Elder unsigned long open_count_before; 567b82d167bSAlex Elder 568a14ea269SAlex Elder spin_lock_irq(&rbd_dev->lock); 569b82d167bSAlex Elder open_count_before = rbd_dev->open_count--; 570a14ea269SAlex Elder spin_unlock_irq(&rbd_dev->lock); 571b82d167bSAlex Elder rbd_assert(open_count_before > 0); 572dfc5606dSYehuda Sadeh 573c3e946ceSAlex Elder put_device(&rbd_dev->dev); 574dfc5606dSYehuda Sadeh } 575dfc5606dSYehuda Sadeh 576131fd9f6SGuangliang Zhao static int rbd_ioctl_set_ro(struct rbd_device *rbd_dev, unsigned long arg) 577131fd9f6SGuangliang Zhao { 57877f33c03SJosh Durgin int ret = 0; 579131fd9f6SGuangliang Zhao int val; 580131fd9f6SGuangliang Zhao bool ro; 58177f33c03SJosh Durgin bool ro_changed = false; 582131fd9f6SGuangliang Zhao 58377f33c03SJosh Durgin /* get_user() may sleep, so call it before taking rbd_dev->lock */ 584131fd9f6SGuangliang Zhao if (get_user(val, (int __user *)(arg))) 585131fd9f6SGuangliang Zhao return -EFAULT; 586131fd9f6SGuangliang Zhao 587131fd9f6SGuangliang Zhao ro = val ? 
true : false; 588131fd9f6SGuangliang Zhao /* Snapshot doesn't allow to write*/ 589131fd9f6SGuangliang Zhao if (rbd_dev->spec->snap_id != CEPH_NOSNAP && !ro) 590131fd9f6SGuangliang Zhao return -EROFS; 591131fd9f6SGuangliang Zhao 59277f33c03SJosh Durgin spin_lock_irq(&rbd_dev->lock); 59377f33c03SJosh Durgin /* prevent others open this device */ 59477f33c03SJosh Durgin if (rbd_dev->open_count > 1) { 59577f33c03SJosh Durgin ret = -EBUSY; 59677f33c03SJosh Durgin goto out; 597131fd9f6SGuangliang Zhao } 598131fd9f6SGuangliang Zhao 59977f33c03SJosh Durgin if (rbd_dev->mapping.read_only != ro) { 60077f33c03SJosh Durgin rbd_dev->mapping.read_only = ro; 60177f33c03SJosh Durgin ro_changed = true; 60277f33c03SJosh Durgin } 60377f33c03SJosh Durgin 60477f33c03SJosh Durgin out: 60577f33c03SJosh Durgin spin_unlock_irq(&rbd_dev->lock); 60677f33c03SJosh Durgin /* set_disk_ro() may sleep, so call it after releasing rbd_dev->lock */ 60777f33c03SJosh Durgin if (ret == 0 && ro_changed) 60877f33c03SJosh Durgin set_disk_ro(rbd_dev->disk, ro ? 
1 : 0); 60977f33c03SJosh Durgin 61077f33c03SJosh Durgin return ret; 611131fd9f6SGuangliang Zhao } 612131fd9f6SGuangliang Zhao 613131fd9f6SGuangliang Zhao static int rbd_ioctl(struct block_device *bdev, fmode_t mode, 614131fd9f6SGuangliang Zhao unsigned int cmd, unsigned long arg) 615131fd9f6SGuangliang Zhao { 616131fd9f6SGuangliang Zhao struct rbd_device *rbd_dev = bdev->bd_disk->private_data; 617131fd9f6SGuangliang Zhao int ret = 0; 618131fd9f6SGuangliang Zhao 619131fd9f6SGuangliang Zhao switch (cmd) { 620131fd9f6SGuangliang Zhao case BLKROSET: 621131fd9f6SGuangliang Zhao ret = rbd_ioctl_set_ro(rbd_dev, arg); 622131fd9f6SGuangliang Zhao break; 623131fd9f6SGuangliang Zhao default: 624131fd9f6SGuangliang Zhao ret = -ENOTTY; 625131fd9f6SGuangliang Zhao } 626131fd9f6SGuangliang Zhao 627131fd9f6SGuangliang Zhao return ret; 628131fd9f6SGuangliang Zhao } 629131fd9f6SGuangliang Zhao 630131fd9f6SGuangliang Zhao #ifdef CONFIG_COMPAT 631131fd9f6SGuangliang Zhao static int rbd_compat_ioctl(struct block_device *bdev, fmode_t mode, 632131fd9f6SGuangliang Zhao unsigned int cmd, unsigned long arg) 633131fd9f6SGuangliang Zhao { 634131fd9f6SGuangliang Zhao return rbd_ioctl(bdev, mode, cmd, arg); 635131fd9f6SGuangliang Zhao } 636131fd9f6SGuangliang Zhao #endif /* CONFIG_COMPAT */ 637131fd9f6SGuangliang Zhao 638602adf40SYehuda Sadeh static const struct block_device_operations rbd_bd_ops = { 639602adf40SYehuda Sadeh .owner = THIS_MODULE, 640602adf40SYehuda Sadeh .open = rbd_open, 641dfc5606dSYehuda Sadeh .release = rbd_release, 642131fd9f6SGuangliang Zhao .ioctl = rbd_ioctl, 643131fd9f6SGuangliang Zhao #ifdef CONFIG_COMPAT 644131fd9f6SGuangliang Zhao .compat_ioctl = rbd_compat_ioctl, 645131fd9f6SGuangliang Zhao #endif 646602adf40SYehuda Sadeh }; 647602adf40SYehuda Sadeh 648602adf40SYehuda Sadeh /* 6497262cfcaSAlex Elder * Initialize an rbd client instance. Success or not, this function 650cfbf6377SAlex Elder * consumes ceph_opts. Caller holds client_mutex. 
651602adf40SYehuda Sadeh */ 652f8c38929SAlex Elder static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts) 653602adf40SYehuda Sadeh { 654602adf40SYehuda Sadeh struct rbd_client *rbdc; 655602adf40SYehuda Sadeh int ret = -ENOMEM; 656602adf40SYehuda Sadeh 65737206ee5SAlex Elder dout("%s:\n", __func__); 658602adf40SYehuda Sadeh rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL); 659602adf40SYehuda Sadeh if (!rbdc) 660602adf40SYehuda Sadeh goto out_opt; 661602adf40SYehuda Sadeh 662602adf40SYehuda Sadeh kref_init(&rbdc->kref); 663602adf40SYehuda Sadeh INIT_LIST_HEAD(&rbdc->node); 664602adf40SYehuda Sadeh 66543ae4701SAlex Elder rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0); 666602adf40SYehuda Sadeh if (IS_ERR(rbdc->client)) 66708f75463SAlex Elder goto out_rbdc; 66843ae4701SAlex Elder ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */ 669602adf40SYehuda Sadeh 670602adf40SYehuda Sadeh ret = ceph_open_session(rbdc->client); 671602adf40SYehuda Sadeh if (ret < 0) 67208f75463SAlex Elder goto out_client; 673602adf40SYehuda Sadeh 674432b8587SAlex Elder spin_lock(&rbd_client_list_lock); 675602adf40SYehuda Sadeh list_add_tail(&rbdc->node, &rbd_client_list); 676432b8587SAlex Elder spin_unlock(&rbd_client_list_lock); 677602adf40SYehuda Sadeh 67837206ee5SAlex Elder dout("%s: rbdc %p\n", __func__, rbdc); 679bc534d86SAlex Elder 680602adf40SYehuda Sadeh return rbdc; 68108f75463SAlex Elder out_client: 682602adf40SYehuda Sadeh ceph_destroy_client(rbdc->client); 68308f75463SAlex Elder out_rbdc: 684602adf40SYehuda Sadeh kfree(rbdc); 685602adf40SYehuda Sadeh out_opt: 68643ae4701SAlex Elder if (ceph_opts) 68743ae4701SAlex Elder ceph_destroy_options(ceph_opts); 68837206ee5SAlex Elder dout("%s: error %d\n", __func__, ret); 68937206ee5SAlex Elder 69028f259b7SVasiliy Kulikov return ERR_PTR(ret); 691602adf40SYehuda Sadeh } 692602adf40SYehuda Sadeh 6932f82ee54SAlex Elder static struct rbd_client *__rbd_get_client(struct rbd_client *rbdc) 
6942f82ee54SAlex Elder { 6952f82ee54SAlex Elder kref_get(&rbdc->kref); 6962f82ee54SAlex Elder 6972f82ee54SAlex Elder return rbdc; 6982f82ee54SAlex Elder } 6992f82ee54SAlex Elder 700602adf40SYehuda Sadeh /* 7011f7ba331SAlex Elder * Find a ceph client with specific addr and configuration. If 7021f7ba331SAlex Elder * found, bump its reference count. 703602adf40SYehuda Sadeh */ 7041f7ba331SAlex Elder static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts) 705602adf40SYehuda Sadeh { 706602adf40SYehuda Sadeh struct rbd_client *client_node; 7071f7ba331SAlex Elder bool found = false; 708602adf40SYehuda Sadeh 70943ae4701SAlex Elder if (ceph_opts->flags & CEPH_OPT_NOSHARE) 710602adf40SYehuda Sadeh return NULL; 711602adf40SYehuda Sadeh 7121f7ba331SAlex Elder spin_lock(&rbd_client_list_lock); 7131f7ba331SAlex Elder list_for_each_entry(client_node, &rbd_client_list, node) { 7141f7ba331SAlex Elder if (!ceph_compare_options(ceph_opts, client_node->client)) { 7152f82ee54SAlex Elder __rbd_get_client(client_node); 7162f82ee54SAlex Elder 7171f7ba331SAlex Elder found = true; 7181f7ba331SAlex Elder break; 7191f7ba331SAlex Elder } 7201f7ba331SAlex Elder } 7211f7ba331SAlex Elder spin_unlock(&rbd_client_list_lock); 7221f7ba331SAlex Elder 7231f7ba331SAlex Elder return found ? 
client_node : NULL; 724602adf40SYehuda Sadeh } 725602adf40SYehuda Sadeh 726602adf40SYehuda Sadeh /* 727210c104cSIlya Dryomov * (Per device) rbd map options 72859c2be1eSYehuda Sadeh */ 72959c2be1eSYehuda Sadeh enum { 730b5584180SIlya Dryomov Opt_queue_depth, 73159c2be1eSYehuda Sadeh Opt_last_int, 73259c2be1eSYehuda Sadeh /* int args above */ 73359c2be1eSYehuda Sadeh Opt_last_string, 73459c2be1eSYehuda Sadeh /* string args above */ 735cc0538b6SAlex Elder Opt_read_only, 736cc0538b6SAlex Elder Opt_read_write, 737210c104cSIlya Dryomov Opt_err 73859c2be1eSYehuda Sadeh }; 73959c2be1eSYehuda Sadeh 74043ae4701SAlex Elder static match_table_t rbd_opts_tokens = { 741b5584180SIlya Dryomov {Opt_queue_depth, "queue_depth=%d"}, 74259c2be1eSYehuda Sadeh /* int args above */ 74359c2be1eSYehuda Sadeh /* string args above */ 744be466c1cSAlex Elder {Opt_read_only, "read_only"}, 745cc0538b6SAlex Elder {Opt_read_only, "ro"}, /* Alternate spelling */ 746cc0538b6SAlex Elder {Opt_read_write, "read_write"}, 747cc0538b6SAlex Elder {Opt_read_write, "rw"}, /* Alternate spelling */ 748210c104cSIlya Dryomov {Opt_err, NULL} 74959c2be1eSYehuda Sadeh }; 75059c2be1eSYehuda Sadeh 75198571b5aSAlex Elder struct rbd_options { 752b5584180SIlya Dryomov int queue_depth; 75398571b5aSAlex Elder bool read_only; 75498571b5aSAlex Elder }; 75598571b5aSAlex Elder 756b5584180SIlya Dryomov #define RBD_QUEUE_DEPTH_DEFAULT BLKDEV_MAX_RQ 75798571b5aSAlex Elder #define RBD_READ_ONLY_DEFAULT false 75898571b5aSAlex Elder 75959c2be1eSYehuda Sadeh static int parse_rbd_opts_token(char *c, void *private) 76059c2be1eSYehuda Sadeh { 76143ae4701SAlex Elder struct rbd_options *rbd_opts = private; 76259c2be1eSYehuda Sadeh substring_t argstr[MAX_OPT_ARGS]; 76359c2be1eSYehuda Sadeh int token, intval, ret; 76459c2be1eSYehuda Sadeh 76543ae4701SAlex Elder token = match_token(c, rbd_opts_tokens, argstr); 76659c2be1eSYehuda Sadeh if (token < Opt_last_int) { 76759c2be1eSYehuda Sadeh ret = match_int(&argstr[0], &intval); 
76859c2be1eSYehuda Sadeh if (ret < 0) { 769210c104cSIlya Dryomov pr_err("bad mount option arg (not int) at '%s'\n", c); 77059c2be1eSYehuda Sadeh return ret; 77159c2be1eSYehuda Sadeh } 77259c2be1eSYehuda Sadeh dout("got int token %d val %d\n", token, intval); 77359c2be1eSYehuda Sadeh } else if (token > Opt_last_int && token < Opt_last_string) { 774210c104cSIlya Dryomov dout("got string token %d val %s\n", token, argstr[0].from); 77559c2be1eSYehuda Sadeh } else { 77659c2be1eSYehuda Sadeh dout("got token %d\n", token); 77759c2be1eSYehuda Sadeh } 77859c2be1eSYehuda Sadeh 77959c2be1eSYehuda Sadeh switch (token) { 780b5584180SIlya Dryomov case Opt_queue_depth: 781b5584180SIlya Dryomov if (intval < 1) { 782b5584180SIlya Dryomov pr_err("queue_depth out of range\n"); 783b5584180SIlya Dryomov return -EINVAL; 784b5584180SIlya Dryomov } 785b5584180SIlya Dryomov rbd_opts->queue_depth = intval; 786b5584180SIlya Dryomov break; 787cc0538b6SAlex Elder case Opt_read_only: 788cc0538b6SAlex Elder rbd_opts->read_only = true; 789cc0538b6SAlex Elder break; 790cc0538b6SAlex Elder case Opt_read_write: 791cc0538b6SAlex Elder rbd_opts->read_only = false; 792cc0538b6SAlex Elder break; 79359c2be1eSYehuda Sadeh default: 794210c104cSIlya Dryomov /* libceph prints "bad option" msg */ 795210c104cSIlya Dryomov return -EINVAL; 79659c2be1eSYehuda Sadeh } 797210c104cSIlya Dryomov 79859c2be1eSYehuda Sadeh return 0; 79959c2be1eSYehuda Sadeh } 80059c2be1eSYehuda Sadeh 8016d2940c8SGuangliang Zhao static char* obj_op_name(enum obj_operation_type op_type) 8026d2940c8SGuangliang Zhao { 8036d2940c8SGuangliang Zhao switch (op_type) { 8046d2940c8SGuangliang Zhao case OBJ_OP_READ: 8056d2940c8SGuangliang Zhao return "read"; 8066d2940c8SGuangliang Zhao case OBJ_OP_WRITE: 8076d2940c8SGuangliang Zhao return "write"; 80890e98c52SGuangliang Zhao case OBJ_OP_DISCARD: 80990e98c52SGuangliang Zhao return "discard"; 8106d2940c8SGuangliang Zhao default: 8116d2940c8SGuangliang Zhao return "???"; 8126d2940c8SGuangliang Zhao } 
8136d2940c8SGuangliang Zhao } 8146d2940c8SGuangliang Zhao 81559c2be1eSYehuda Sadeh /* 816602adf40SYehuda Sadeh * Get a ceph client with specific addr and configuration, if one does 8177262cfcaSAlex Elder * not exist create it. Either way, ceph_opts is consumed by this 8187262cfcaSAlex Elder * function. 819602adf40SYehuda Sadeh */ 8209d3997fdSAlex Elder static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts) 821602adf40SYehuda Sadeh { 822f8c38929SAlex Elder struct rbd_client *rbdc; 82359c2be1eSYehuda Sadeh 824cfbf6377SAlex Elder mutex_lock_nested(&client_mutex, SINGLE_DEPTH_NESTING); 8251f7ba331SAlex Elder rbdc = rbd_client_find(ceph_opts); 8269d3997fdSAlex Elder if (rbdc) /* using an existing client */ 82743ae4701SAlex Elder ceph_destroy_options(ceph_opts); 8289d3997fdSAlex Elder else 829f8c38929SAlex Elder rbdc = rbd_client_create(ceph_opts); 830cfbf6377SAlex Elder mutex_unlock(&client_mutex); 831d720bcb0SAlex Elder 8329d3997fdSAlex Elder return rbdc; 833602adf40SYehuda Sadeh } 834602adf40SYehuda Sadeh 835602adf40SYehuda Sadeh /* 836602adf40SYehuda Sadeh * Destroy ceph client 837d23a4b3fSAlex Elder * 838432b8587SAlex Elder * Caller must hold rbd_client_list_lock. 839602adf40SYehuda Sadeh */ 840602adf40SYehuda Sadeh static void rbd_client_release(struct kref *kref) 841602adf40SYehuda Sadeh { 842602adf40SYehuda Sadeh struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref); 843602adf40SYehuda Sadeh 84437206ee5SAlex Elder dout("%s: rbdc %p\n", __func__, rbdc); 845cd9d9f5dSAlex Elder spin_lock(&rbd_client_list_lock); 846602adf40SYehuda Sadeh list_del(&rbdc->node); 847cd9d9f5dSAlex Elder spin_unlock(&rbd_client_list_lock); 848602adf40SYehuda Sadeh 849602adf40SYehuda Sadeh ceph_destroy_client(rbdc->client); 850602adf40SYehuda Sadeh kfree(rbdc); 851602adf40SYehuda Sadeh } 852602adf40SYehuda Sadeh 853602adf40SYehuda Sadeh /* 854602adf40SYehuda Sadeh * Drop reference to ceph client node. 
If it's not referenced anymore, release 855602adf40SYehuda Sadeh * it. 856602adf40SYehuda Sadeh */ 8579d3997fdSAlex Elder static void rbd_put_client(struct rbd_client *rbdc) 858602adf40SYehuda Sadeh { 859c53d5893SAlex Elder if (rbdc) 8609d3997fdSAlex Elder kref_put(&rbdc->kref, rbd_client_release); 861602adf40SYehuda Sadeh } 862602adf40SYehuda Sadeh 863a30b71b9SAlex Elder static bool rbd_image_format_valid(u32 image_format) 864a30b71b9SAlex Elder { 865a30b71b9SAlex Elder return image_format == 1 || image_format == 2; 866a30b71b9SAlex Elder } 867a30b71b9SAlex Elder 8688e94af8eSAlex Elder static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk) 8698e94af8eSAlex Elder { 870103a150fSAlex Elder size_t size; 871103a150fSAlex Elder u32 snap_count; 872103a150fSAlex Elder 873103a150fSAlex Elder /* The header has to start with the magic rbd header text */ 874103a150fSAlex Elder if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT))) 875103a150fSAlex Elder return false; 876103a150fSAlex Elder 877db2388b6SAlex Elder /* The bio layer requires at least sector-sized I/O */ 878db2388b6SAlex Elder 879db2388b6SAlex Elder if (ondisk->options.order < SECTOR_SHIFT) 880db2388b6SAlex Elder return false; 881db2388b6SAlex Elder 882db2388b6SAlex Elder /* If we use u64 in a few spots we may be able to loosen this */ 883db2388b6SAlex Elder 884db2388b6SAlex Elder if (ondisk->options.order > 8 * sizeof (int) - 1) 885db2388b6SAlex Elder return false; 886db2388b6SAlex Elder 887103a150fSAlex Elder /* 888103a150fSAlex Elder * The size of a snapshot header has to fit in a size_t, and 889103a150fSAlex Elder * that limits the number of snapshots. 
890103a150fSAlex Elder */ 891103a150fSAlex Elder snap_count = le32_to_cpu(ondisk->snap_count); 892103a150fSAlex Elder size = SIZE_MAX - sizeof (struct ceph_snap_context); 893103a150fSAlex Elder if (snap_count > size / sizeof (__le64)) 894103a150fSAlex Elder return false; 895103a150fSAlex Elder 896103a150fSAlex Elder /* 897103a150fSAlex Elder * Not only that, but the size of the entire the snapshot 898103a150fSAlex Elder * header must also be representable in a size_t. 899103a150fSAlex Elder */ 900103a150fSAlex Elder size -= snap_count * sizeof (__le64); 901103a150fSAlex Elder if ((u64) size < le64_to_cpu(ondisk->snap_names_len)) 902103a150fSAlex Elder return false; 903103a150fSAlex Elder 904103a150fSAlex Elder return true; 9058e94af8eSAlex Elder } 9068e94af8eSAlex Elder 907602adf40SYehuda Sadeh /* 908bb23e37aSAlex Elder * Fill an rbd image header with information from the given format 1 909bb23e37aSAlex Elder * on-disk header. 910602adf40SYehuda Sadeh */ 911662518b1SAlex Elder static int rbd_header_from_disk(struct rbd_device *rbd_dev, 9124156d998SAlex Elder struct rbd_image_header_ondisk *ondisk) 913602adf40SYehuda Sadeh { 914662518b1SAlex Elder struct rbd_image_header *header = &rbd_dev->header; 915bb23e37aSAlex Elder bool first_time = header->object_prefix == NULL; 916bb23e37aSAlex Elder struct ceph_snap_context *snapc; 917bb23e37aSAlex Elder char *object_prefix = NULL; 918bb23e37aSAlex Elder char *snap_names = NULL; 919bb23e37aSAlex Elder u64 *snap_sizes = NULL; 920ccece235SAlex Elder u32 snap_count; 921d2bb24e5SAlex Elder size_t size; 922bb23e37aSAlex Elder int ret = -ENOMEM; 923621901d6SAlex Elder u32 i; 924602adf40SYehuda Sadeh 925bb23e37aSAlex Elder /* Allocate this now to avoid having to handle failure below */ 926103a150fSAlex Elder 927bb23e37aSAlex Elder if (first_time) { 928bb23e37aSAlex Elder size_t len; 929bb23e37aSAlex Elder 930bb23e37aSAlex Elder len = strnlen(ondisk->object_prefix, 931bb23e37aSAlex Elder sizeof (ondisk->object_prefix)); 
932bb23e37aSAlex Elder object_prefix = kmalloc(len + 1, GFP_KERNEL); 933bb23e37aSAlex Elder if (!object_prefix) 934602adf40SYehuda Sadeh return -ENOMEM; 935bb23e37aSAlex Elder memcpy(object_prefix, ondisk->object_prefix, len); 936bb23e37aSAlex Elder object_prefix[len] = '\0'; 937bb23e37aSAlex Elder } 93800f1f36fSAlex Elder 939bb23e37aSAlex Elder /* Allocate the snapshot context and fill it in */ 940d2bb24e5SAlex Elder 941602adf40SYehuda Sadeh snap_count = le32_to_cpu(ondisk->snap_count); 942bb23e37aSAlex Elder snapc = ceph_create_snap_context(snap_count, GFP_KERNEL); 943bb23e37aSAlex Elder if (!snapc) 944bb23e37aSAlex Elder goto out_err; 945bb23e37aSAlex Elder snapc->seq = le64_to_cpu(ondisk->snap_seq); 946602adf40SYehuda Sadeh if (snap_count) { 947bb23e37aSAlex Elder struct rbd_image_snap_ondisk *snaps; 948f785cc1dSAlex Elder u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len); 949f785cc1dSAlex Elder 950bb23e37aSAlex Elder /* We'll keep a copy of the snapshot names... */ 951621901d6SAlex Elder 952f785cc1dSAlex Elder if (snap_names_len > (u64)SIZE_MAX) 953bb23e37aSAlex Elder goto out_2big; 954bb23e37aSAlex Elder snap_names = kmalloc(snap_names_len, GFP_KERNEL); 955bb23e37aSAlex Elder if (!snap_names) 956602adf40SYehuda Sadeh goto out_err; 957bb23e37aSAlex Elder 958bb23e37aSAlex Elder /* ...as well as the array of their sizes. */ 959bb23e37aSAlex Elder 960bb23e37aSAlex Elder size = snap_count * sizeof (*header->snap_sizes); 961bb23e37aSAlex Elder snap_sizes = kmalloc(size, GFP_KERNEL); 962bb23e37aSAlex Elder if (!snap_sizes) 963bb23e37aSAlex Elder goto out_err; 964bb23e37aSAlex Elder 965f785cc1dSAlex Elder /* 966bb23e37aSAlex Elder * Copy the names, and fill in each snapshot's id 967bb23e37aSAlex Elder * and size. 
968bb23e37aSAlex Elder * 96999a41ebcSAlex Elder * Note that rbd_dev_v1_header_info() guarantees the 970bb23e37aSAlex Elder * ondisk buffer we're working with has 971f785cc1dSAlex Elder * snap_names_len bytes beyond the end of the 972f785cc1dSAlex Elder * snapshot id array, this memcpy() is safe. 973f785cc1dSAlex Elder */ 974bb23e37aSAlex Elder memcpy(snap_names, &ondisk->snaps[snap_count], snap_names_len); 975bb23e37aSAlex Elder snaps = ondisk->snaps; 976bb23e37aSAlex Elder for (i = 0; i < snap_count; i++) { 977bb23e37aSAlex Elder snapc->snaps[i] = le64_to_cpu(snaps[i].id); 978bb23e37aSAlex Elder snap_sizes[i] = le64_to_cpu(snaps[i].image_size); 979bb23e37aSAlex Elder } 980602adf40SYehuda Sadeh } 981849b4260SAlex Elder 982bb23e37aSAlex Elder /* We won't fail any more, fill in the header */ 983bb23e37aSAlex Elder 984bb23e37aSAlex Elder if (first_time) { 985bb23e37aSAlex Elder header->object_prefix = object_prefix; 986602adf40SYehuda Sadeh header->obj_order = ondisk->options.order; 987602adf40SYehuda Sadeh header->crypt_type = ondisk->options.crypt_type; 988602adf40SYehuda Sadeh header->comp_type = ondisk->options.comp_type; 989bb23e37aSAlex Elder /* The rest aren't used for format 1 images */ 990bb23e37aSAlex Elder header->stripe_unit = 0; 991bb23e37aSAlex Elder header->stripe_count = 0; 992bb23e37aSAlex Elder header->features = 0; 993662518b1SAlex Elder } else { 994662518b1SAlex Elder ceph_put_snap_context(header->snapc); 995662518b1SAlex Elder kfree(header->snap_names); 996662518b1SAlex Elder kfree(header->snap_sizes); 997bb23e37aSAlex Elder } 9986a52325fSAlex Elder 999bb23e37aSAlex Elder /* The remaining fields always get updated (when we refresh) */ 1000621901d6SAlex Elder 1001f84344f3SAlex Elder header->image_size = le64_to_cpu(ondisk->image_size); 1002bb23e37aSAlex Elder header->snapc = snapc; 1003bb23e37aSAlex Elder header->snap_names = snap_names; 1004bb23e37aSAlex Elder header->snap_sizes = snap_sizes; 1005468521c1SAlex Elder 1006602adf40SYehuda Sadeh 
return 0; 1007bb23e37aSAlex Elder out_2big: 1008bb23e37aSAlex Elder ret = -EIO; 10096a52325fSAlex Elder out_err: 1010bb23e37aSAlex Elder kfree(snap_sizes); 1011bb23e37aSAlex Elder kfree(snap_names); 1012bb23e37aSAlex Elder ceph_put_snap_context(snapc); 1013bb23e37aSAlex Elder kfree(object_prefix); 1014ccece235SAlex Elder 1015bb23e37aSAlex Elder return ret; 1016602adf40SYehuda Sadeh } 1017602adf40SYehuda Sadeh 10189682fc6dSAlex Elder static const char *_rbd_dev_v1_snap_name(struct rbd_device *rbd_dev, u32 which) 10199682fc6dSAlex Elder { 10209682fc6dSAlex Elder const char *snap_name; 10219682fc6dSAlex Elder 10229682fc6dSAlex Elder rbd_assert(which < rbd_dev->header.snapc->num_snaps); 10239682fc6dSAlex Elder 10249682fc6dSAlex Elder /* Skip over names until we find the one we are looking for */ 10259682fc6dSAlex Elder 10269682fc6dSAlex Elder snap_name = rbd_dev->header.snap_names; 10279682fc6dSAlex Elder while (which--) 10289682fc6dSAlex Elder snap_name += strlen(snap_name) + 1; 10299682fc6dSAlex Elder 10309682fc6dSAlex Elder return kstrdup(snap_name, GFP_KERNEL); 10319682fc6dSAlex Elder } 10329682fc6dSAlex Elder 103330d1cff8SAlex Elder /* 103430d1cff8SAlex Elder * Snapshot id comparison function for use with qsort()/bsearch(). 103530d1cff8SAlex Elder * Note that result is for snapshots in *descending* order. 103630d1cff8SAlex Elder */ 103730d1cff8SAlex Elder static int snapid_compare_reverse(const void *s1, const void *s2) 103830d1cff8SAlex Elder { 103930d1cff8SAlex Elder u64 snap_id1 = *(u64 *)s1; 104030d1cff8SAlex Elder u64 snap_id2 = *(u64 *)s2; 104130d1cff8SAlex Elder 104230d1cff8SAlex Elder if (snap_id1 < snap_id2) 104330d1cff8SAlex Elder return 1; 104430d1cff8SAlex Elder return snap_id1 == snap_id2 ? 0 : -1; 104530d1cff8SAlex Elder } 104630d1cff8SAlex Elder 104730d1cff8SAlex Elder /* 104830d1cff8SAlex Elder * Search a snapshot context to see if the given snapshot id is 104930d1cff8SAlex Elder * present. 
105030d1cff8SAlex Elder * 105130d1cff8SAlex Elder * Returns the position of the snapshot id in the array if it's found, 105230d1cff8SAlex Elder * or BAD_SNAP_INDEX otherwise. 105330d1cff8SAlex Elder * 105430d1cff8SAlex Elder * Note: The snapshot array is in kept sorted (by the osd) in 105530d1cff8SAlex Elder * reverse order, highest snapshot id first. 105630d1cff8SAlex Elder */ 10579682fc6dSAlex Elder static u32 rbd_dev_snap_index(struct rbd_device *rbd_dev, u64 snap_id) 10589682fc6dSAlex Elder { 10599682fc6dSAlex Elder struct ceph_snap_context *snapc = rbd_dev->header.snapc; 106030d1cff8SAlex Elder u64 *found; 10619682fc6dSAlex Elder 106230d1cff8SAlex Elder found = bsearch(&snap_id, &snapc->snaps, snapc->num_snaps, 106330d1cff8SAlex Elder sizeof (snap_id), snapid_compare_reverse); 10649682fc6dSAlex Elder 106530d1cff8SAlex Elder return found ? (u32)(found - &snapc->snaps[0]) : BAD_SNAP_INDEX; 10669682fc6dSAlex Elder } 10679682fc6dSAlex Elder 10682ad3d716SAlex Elder static const char *rbd_dev_v1_snap_name(struct rbd_device *rbd_dev, 10692ad3d716SAlex Elder u64 snap_id) 107054cac61fSAlex Elder { 107154cac61fSAlex Elder u32 which; 1072da6a6b63SJosh Durgin const char *snap_name; 107354cac61fSAlex Elder 107454cac61fSAlex Elder which = rbd_dev_snap_index(rbd_dev, snap_id); 107554cac61fSAlex Elder if (which == BAD_SNAP_INDEX) 1076da6a6b63SJosh Durgin return ERR_PTR(-ENOENT); 107754cac61fSAlex Elder 1078da6a6b63SJosh Durgin snap_name = _rbd_dev_v1_snap_name(rbd_dev, which); 1079da6a6b63SJosh Durgin return snap_name ? 
snap_name : ERR_PTR(-ENOMEM); 108054cac61fSAlex Elder } 108154cac61fSAlex Elder 10829e15b77dSAlex Elder static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id) 10839e15b77dSAlex Elder { 10849e15b77dSAlex Elder if (snap_id == CEPH_NOSNAP) 10859e15b77dSAlex Elder return RBD_SNAP_HEAD_NAME; 10869e15b77dSAlex Elder 108754cac61fSAlex Elder rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); 108854cac61fSAlex Elder if (rbd_dev->image_format == 1) 108954cac61fSAlex Elder return rbd_dev_v1_snap_name(rbd_dev, snap_id); 10909e15b77dSAlex Elder 109154cac61fSAlex Elder return rbd_dev_v2_snap_name(rbd_dev, snap_id); 10929e15b77dSAlex Elder } 10939e15b77dSAlex Elder 10942ad3d716SAlex Elder static int rbd_snap_size(struct rbd_device *rbd_dev, u64 snap_id, 10952ad3d716SAlex Elder u64 *snap_size) 1096602adf40SYehuda Sadeh { 10972ad3d716SAlex Elder rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); 10982ad3d716SAlex Elder if (snap_id == CEPH_NOSNAP) { 10992ad3d716SAlex Elder *snap_size = rbd_dev->header.image_size; 11002ad3d716SAlex Elder } else if (rbd_dev->image_format == 1) { 11012ad3d716SAlex Elder u32 which; 110200f1f36fSAlex Elder 11032ad3d716SAlex Elder which = rbd_dev_snap_index(rbd_dev, snap_id); 11042ad3d716SAlex Elder if (which == BAD_SNAP_INDEX) 11052ad3d716SAlex Elder return -ENOENT; 110600f1f36fSAlex Elder 11072ad3d716SAlex Elder *snap_size = rbd_dev->header.snap_sizes[which]; 11082ad3d716SAlex Elder } else { 11092ad3d716SAlex Elder u64 size = 0; 11102ad3d716SAlex Elder int ret; 11112ad3d716SAlex Elder 11122ad3d716SAlex Elder ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, NULL, &size); 11132ad3d716SAlex Elder if (ret) 11142ad3d716SAlex Elder return ret; 11152ad3d716SAlex Elder 11162ad3d716SAlex Elder *snap_size = size; 11172ad3d716SAlex Elder } 11182ad3d716SAlex Elder return 0; 11192ad3d716SAlex Elder } 11202ad3d716SAlex Elder 11212ad3d716SAlex Elder static int rbd_snap_features(struct rbd_device *rbd_dev, u64 snap_id, 11222ad3d716SAlex 
Elder u64 *snap_features) 11232ad3d716SAlex Elder { 11242ad3d716SAlex Elder rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); 11252ad3d716SAlex Elder if (snap_id == CEPH_NOSNAP) { 11262ad3d716SAlex Elder *snap_features = rbd_dev->header.features; 11272ad3d716SAlex Elder } else if (rbd_dev->image_format == 1) { 11282ad3d716SAlex Elder *snap_features = 0; /* No features for format 1 */ 11292ad3d716SAlex Elder } else { 11302ad3d716SAlex Elder u64 features = 0; 11312ad3d716SAlex Elder int ret; 11322ad3d716SAlex Elder 11332ad3d716SAlex Elder ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, &features); 11342ad3d716SAlex Elder if (ret) 11352ad3d716SAlex Elder return ret; 11362ad3d716SAlex Elder 11372ad3d716SAlex Elder *snap_features = features; 11382ad3d716SAlex Elder } 11392ad3d716SAlex Elder return 0; 114000f1f36fSAlex Elder } 1141602adf40SYehuda Sadeh 1142d1cf5788SAlex Elder static int rbd_dev_mapping_set(struct rbd_device *rbd_dev) 1143602adf40SYehuda Sadeh { 11448f4b7d98SAlex Elder u64 snap_id = rbd_dev->spec->snap_id; 11452ad3d716SAlex Elder u64 size = 0; 11462ad3d716SAlex Elder u64 features = 0; 11472ad3d716SAlex Elder int ret; 11488b0241f8SAlex Elder 11492ad3d716SAlex Elder ret = rbd_snap_size(rbd_dev, snap_id, &size); 11502ad3d716SAlex Elder if (ret) 11512ad3d716SAlex Elder return ret; 11522ad3d716SAlex Elder ret = rbd_snap_features(rbd_dev, snap_id, &features); 11532ad3d716SAlex Elder if (ret) 11542ad3d716SAlex Elder return ret; 11552ad3d716SAlex Elder 11562ad3d716SAlex Elder rbd_dev->mapping.size = size; 11572ad3d716SAlex Elder rbd_dev->mapping.features = features; 11582ad3d716SAlex Elder 11598b0241f8SAlex Elder return 0; 1160602adf40SYehuda Sadeh } 1161602adf40SYehuda Sadeh 1162d1cf5788SAlex Elder static void rbd_dev_mapping_clear(struct rbd_device *rbd_dev) 1163d1cf5788SAlex Elder { 1164d1cf5788SAlex Elder rbd_dev->mapping.size = 0; 1165d1cf5788SAlex Elder rbd_dev->mapping.features = 0; 1166200a6a8bSAlex Elder } 1167200a6a8bSAlex Elder 
11687d5079aaSHimangi Saraogi static void rbd_segment_name_free(const char *name) 11697d5079aaSHimangi Saraogi { 11707d5079aaSHimangi Saraogi /* The explicit cast here is needed to drop the const qualifier */ 11717d5079aaSHimangi Saraogi 11727d5079aaSHimangi Saraogi kmem_cache_free(rbd_segment_name_cache, (void *)name); 11737d5079aaSHimangi Saraogi } 11747d5079aaSHimangi Saraogi 117598571b5aSAlex Elder static const char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset) 1176602adf40SYehuda Sadeh { 117765ccfe21SAlex Elder char *name; 117865ccfe21SAlex Elder u64 segment; 117965ccfe21SAlex Elder int ret; 11803a96d5cdSJosh Durgin char *name_format; 1181602adf40SYehuda Sadeh 118278c2a44aSAlex Elder name = kmem_cache_alloc(rbd_segment_name_cache, GFP_NOIO); 118365ccfe21SAlex Elder if (!name) 118465ccfe21SAlex Elder return NULL; 118565ccfe21SAlex Elder segment = offset >> rbd_dev->header.obj_order; 11863a96d5cdSJosh Durgin name_format = "%s.%012llx"; 11873a96d5cdSJosh Durgin if (rbd_dev->image_format == 2) 11883a96d5cdSJosh Durgin name_format = "%s.%016llx"; 11892d0ebc5dSIlya Dryomov ret = snprintf(name, CEPH_MAX_OID_NAME_LEN + 1, name_format, 119065ccfe21SAlex Elder rbd_dev->header.object_prefix, segment); 11912d0ebc5dSIlya Dryomov if (ret < 0 || ret > CEPH_MAX_OID_NAME_LEN) { 119265ccfe21SAlex Elder pr_err("error formatting segment name for #%llu (%d)\n", 119365ccfe21SAlex Elder segment, ret); 11947d5079aaSHimangi Saraogi rbd_segment_name_free(name); 119565ccfe21SAlex Elder name = NULL; 119665ccfe21SAlex Elder } 1197602adf40SYehuda Sadeh 119865ccfe21SAlex Elder return name; 119965ccfe21SAlex Elder } 1200602adf40SYehuda Sadeh 120165ccfe21SAlex Elder static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset) 120265ccfe21SAlex Elder { 120365ccfe21SAlex Elder u64 segment_size = (u64) 1 << rbd_dev->header.obj_order; 1204602adf40SYehuda Sadeh 120565ccfe21SAlex Elder return offset & (segment_size - 1); 120665ccfe21SAlex Elder } 120765ccfe21SAlex Elder 
120865ccfe21SAlex Elder static u64 rbd_segment_length(struct rbd_device *rbd_dev, 120965ccfe21SAlex Elder u64 offset, u64 length) 121065ccfe21SAlex Elder { 121165ccfe21SAlex Elder u64 segment_size = (u64) 1 << rbd_dev->header.obj_order; 121265ccfe21SAlex Elder 121365ccfe21SAlex Elder offset &= segment_size - 1; 121465ccfe21SAlex Elder 1215aafb230eSAlex Elder rbd_assert(length <= U64_MAX - offset); 121665ccfe21SAlex Elder if (offset + length > segment_size) 121765ccfe21SAlex Elder length = segment_size - offset; 121865ccfe21SAlex Elder 121965ccfe21SAlex Elder return length; 1220602adf40SYehuda Sadeh } 1221602adf40SYehuda Sadeh 1222602adf40SYehuda Sadeh /* 1223029bcbd8SJosh Durgin * returns the size of an object in the image 1224029bcbd8SJosh Durgin */ 1225029bcbd8SJosh Durgin static u64 rbd_obj_bytes(struct rbd_image_header *header) 1226029bcbd8SJosh Durgin { 1227029bcbd8SJosh Durgin return 1 << header->obj_order; 1228029bcbd8SJosh Durgin } 1229029bcbd8SJosh Durgin 1230029bcbd8SJosh Durgin /* 1231602adf40SYehuda Sadeh * bio helpers 1232602adf40SYehuda Sadeh */ 1233602adf40SYehuda Sadeh 1234602adf40SYehuda Sadeh static void bio_chain_put(struct bio *chain) 1235602adf40SYehuda Sadeh { 1236602adf40SYehuda Sadeh struct bio *tmp; 1237602adf40SYehuda Sadeh 1238602adf40SYehuda Sadeh while (chain) { 1239602adf40SYehuda Sadeh tmp = chain; 1240602adf40SYehuda Sadeh chain = chain->bi_next; 1241602adf40SYehuda Sadeh bio_put(tmp); 1242602adf40SYehuda Sadeh } 1243602adf40SYehuda Sadeh } 1244602adf40SYehuda Sadeh 1245602adf40SYehuda Sadeh /* 1246602adf40SYehuda Sadeh * zeros a bio chain, starting at specific offset 1247602adf40SYehuda Sadeh */ 1248602adf40SYehuda Sadeh static void zero_bio_chain(struct bio *chain, int start_ofs) 1249602adf40SYehuda Sadeh { 12507988613bSKent Overstreet struct bio_vec bv; 12517988613bSKent Overstreet struct bvec_iter iter; 1252602adf40SYehuda Sadeh unsigned long flags; 1253602adf40SYehuda Sadeh void *buf; 1254602adf40SYehuda Sadeh int pos = 0; 
1255602adf40SYehuda Sadeh 1256602adf40SYehuda Sadeh while (chain) { 12577988613bSKent Overstreet bio_for_each_segment(bv, chain, iter) { 12587988613bSKent Overstreet if (pos + bv.bv_len > start_ofs) { 1259602adf40SYehuda Sadeh int remainder = max(start_ofs - pos, 0); 12607988613bSKent Overstreet buf = bvec_kmap_irq(&bv, &flags); 1261602adf40SYehuda Sadeh memset(buf + remainder, 0, 12627988613bSKent Overstreet bv.bv_len - remainder); 12637988613bSKent Overstreet flush_dcache_page(bv.bv_page); 126485b5aaa6SDan Carpenter bvec_kunmap_irq(buf, &flags); 1265602adf40SYehuda Sadeh } 12667988613bSKent Overstreet pos += bv.bv_len; 1267602adf40SYehuda Sadeh } 1268602adf40SYehuda Sadeh 1269602adf40SYehuda Sadeh chain = chain->bi_next; 1270602adf40SYehuda Sadeh } 1271602adf40SYehuda Sadeh } 1272602adf40SYehuda Sadeh 1273602adf40SYehuda Sadeh /* 1274b9434c5bSAlex Elder * similar to zero_bio_chain(), zeros data defined by a page array, 1275b9434c5bSAlex Elder * starting at the given byte offset from the start of the array and 1276b9434c5bSAlex Elder * continuing up to the given end offset. The pages array is 1277b9434c5bSAlex Elder * assumed to be big enough to hold all bytes up to the end. 
1278b9434c5bSAlex Elder */ 1279b9434c5bSAlex Elder static void zero_pages(struct page **pages, u64 offset, u64 end) 1280b9434c5bSAlex Elder { 1281b9434c5bSAlex Elder struct page **page = &pages[offset >> PAGE_SHIFT]; 1282b9434c5bSAlex Elder 1283b9434c5bSAlex Elder rbd_assert(end > offset); 1284b9434c5bSAlex Elder rbd_assert(end - offset <= (u64)SIZE_MAX); 1285b9434c5bSAlex Elder while (offset < end) { 1286b9434c5bSAlex Elder size_t page_offset; 1287b9434c5bSAlex Elder size_t length; 1288b9434c5bSAlex Elder unsigned long flags; 1289b9434c5bSAlex Elder void *kaddr; 1290b9434c5bSAlex Elder 1291491205a8SGeert Uytterhoeven page_offset = offset & ~PAGE_MASK; 1292491205a8SGeert Uytterhoeven length = min_t(size_t, PAGE_SIZE - page_offset, end - offset); 1293b9434c5bSAlex Elder local_irq_save(flags); 1294b9434c5bSAlex Elder kaddr = kmap_atomic(*page); 1295b9434c5bSAlex Elder memset(kaddr + page_offset, 0, length); 1296e2156054SAlex Elder flush_dcache_page(*page); 1297b9434c5bSAlex Elder kunmap_atomic(kaddr); 1298b9434c5bSAlex Elder local_irq_restore(flags); 1299b9434c5bSAlex Elder 1300b9434c5bSAlex Elder offset += length; 1301b9434c5bSAlex Elder page++; 1302b9434c5bSAlex Elder } 1303b9434c5bSAlex Elder } 1304b9434c5bSAlex Elder 1305b9434c5bSAlex Elder /* 1306f7760dadSAlex Elder * Clone a portion of a bio, starting at the given byte offset 1307f7760dadSAlex Elder * and continuing for the number of bytes indicated. 
1308602adf40SYehuda Sadeh */ 1309f7760dadSAlex Elder static struct bio *bio_clone_range(struct bio *bio_src, 1310f7760dadSAlex Elder unsigned int offset, 1311f7760dadSAlex Elder unsigned int len, 1312f7760dadSAlex Elder gfp_t gfpmask) 1313602adf40SYehuda Sadeh { 1314f7760dadSAlex Elder struct bio *bio; 1315602adf40SYehuda Sadeh 13165341a627SKent Overstreet bio = bio_clone(bio_src, gfpmask); 1317f7760dadSAlex Elder if (!bio) 1318f7760dadSAlex Elder return NULL; /* ENOMEM */ 1319f7760dadSAlex Elder 13205341a627SKent Overstreet bio_advance(bio, offset); 13214f024f37SKent Overstreet bio->bi_iter.bi_size = len; 1322602adf40SYehuda Sadeh 1323f7760dadSAlex Elder return bio; 1324602adf40SYehuda Sadeh } 1325602adf40SYehuda Sadeh 1326f7760dadSAlex Elder /* 1327f7760dadSAlex Elder * Clone a portion of a bio chain, starting at the given byte offset 1328f7760dadSAlex Elder * into the first bio in the source chain and continuing for the 1329f7760dadSAlex Elder * number of bytes indicated. The result is another bio chain of 1330f7760dadSAlex Elder * exactly the given length, or a null pointer on error. 1331f7760dadSAlex Elder * 1332f7760dadSAlex Elder * The bio_src and offset parameters are both in-out. On entry they 1333f7760dadSAlex Elder * refer to the first source bio and the offset into that bio where 1334f7760dadSAlex Elder * the start of data to be cloned is located. 1335f7760dadSAlex Elder * 1336f7760dadSAlex Elder * On return, bio_src is updated to refer to the bio in the source 1337f7760dadSAlex Elder * chain that contains first un-cloned byte, and *offset will 1338f7760dadSAlex Elder * contain the offset of that byte within that bio. 
1339f7760dadSAlex Elder */ 1340f7760dadSAlex Elder static struct bio *bio_chain_clone_range(struct bio **bio_src, 1341f7760dadSAlex Elder unsigned int *offset, 1342f7760dadSAlex Elder unsigned int len, 1343f7760dadSAlex Elder gfp_t gfpmask) 1344f7760dadSAlex Elder { 1345f7760dadSAlex Elder struct bio *bi = *bio_src; 1346f7760dadSAlex Elder unsigned int off = *offset; 1347f7760dadSAlex Elder struct bio *chain = NULL; 1348f7760dadSAlex Elder struct bio **end; 1349602adf40SYehuda Sadeh 1350f7760dadSAlex Elder /* Build up a chain of clone bios up to the limit */ 1351602adf40SYehuda Sadeh 13524f024f37SKent Overstreet if (!bi || off >= bi->bi_iter.bi_size || !len) 1353f7760dadSAlex Elder return NULL; /* Nothing to clone */ 1354602adf40SYehuda Sadeh 1355f7760dadSAlex Elder end = &chain; 1356f7760dadSAlex Elder while (len) { 1357f7760dadSAlex Elder unsigned int bi_size; 1358f7760dadSAlex Elder struct bio *bio; 1359f7760dadSAlex Elder 1360f5400b7aSAlex Elder if (!bi) { 1361f5400b7aSAlex Elder rbd_warn(NULL, "bio_chain exhausted with %u left", len); 1362f7760dadSAlex Elder goto out_err; /* EINVAL; ran out of bio's */ 1363f5400b7aSAlex Elder } 13644f024f37SKent Overstreet bi_size = min_t(unsigned int, bi->bi_iter.bi_size - off, len); 1365f7760dadSAlex Elder bio = bio_clone_range(bi, off, bi_size, gfpmask); 1366f7760dadSAlex Elder if (!bio) 1367f7760dadSAlex Elder goto out_err; /* ENOMEM */ 1368f7760dadSAlex Elder 1369f7760dadSAlex Elder *end = bio; 1370f7760dadSAlex Elder end = &bio->bi_next; 1371f7760dadSAlex Elder 1372f7760dadSAlex Elder off += bi_size; 13734f024f37SKent Overstreet if (off == bi->bi_iter.bi_size) { 1374f7760dadSAlex Elder bi = bi->bi_next; 1375f7760dadSAlex Elder off = 0; 1376f7760dadSAlex Elder } 1377f7760dadSAlex Elder len -= bi_size; 1378f7760dadSAlex Elder } 1379f7760dadSAlex Elder *bio_src = bi; 1380f7760dadSAlex Elder *offset = off; 1381f7760dadSAlex Elder 1382f7760dadSAlex Elder return chain; 1383f7760dadSAlex Elder out_err: 1384f7760dadSAlex Elder 
bio_chain_put(chain); 1385f7760dadSAlex Elder 1386602adf40SYehuda Sadeh return NULL; 1387602adf40SYehuda Sadeh } 1388602adf40SYehuda Sadeh 1389926f9b3fSAlex Elder /* 1390926f9b3fSAlex Elder * The default/initial value for all object request flags is 0. For 1391926f9b3fSAlex Elder * each flag, once its value is set to 1 it is never reset to 0 1392926f9b3fSAlex Elder * again. 1393926f9b3fSAlex Elder */ 13946365d33aSAlex Elder static void obj_request_img_data_set(struct rbd_obj_request *obj_request) 13956365d33aSAlex Elder { 13966365d33aSAlex Elder if (test_and_set_bit(OBJ_REQ_IMG_DATA, &obj_request->flags)) { 13976365d33aSAlex Elder struct rbd_device *rbd_dev; 13986365d33aSAlex Elder 139957acbaa7SAlex Elder rbd_dev = obj_request->img_request->rbd_dev; 14009584d508SIlya Dryomov rbd_warn(rbd_dev, "obj_request %p already marked img_data", 14016365d33aSAlex Elder obj_request); 14026365d33aSAlex Elder } 14036365d33aSAlex Elder } 14046365d33aSAlex Elder 14056365d33aSAlex Elder static bool obj_request_img_data_test(struct rbd_obj_request *obj_request) 14066365d33aSAlex Elder { 14076365d33aSAlex Elder smp_mb(); 14086365d33aSAlex Elder return test_bit(OBJ_REQ_IMG_DATA, &obj_request->flags) != 0; 14096365d33aSAlex Elder } 14106365d33aSAlex Elder 141157acbaa7SAlex Elder static void obj_request_done_set(struct rbd_obj_request *obj_request) 141257acbaa7SAlex Elder { 141357acbaa7SAlex Elder if (test_and_set_bit(OBJ_REQ_DONE, &obj_request->flags)) { 141457acbaa7SAlex Elder struct rbd_device *rbd_dev = NULL; 141557acbaa7SAlex Elder 141657acbaa7SAlex Elder if (obj_request_img_data_test(obj_request)) 141757acbaa7SAlex Elder rbd_dev = obj_request->img_request->rbd_dev; 14189584d508SIlya Dryomov rbd_warn(rbd_dev, "obj_request %p already marked done", 141957acbaa7SAlex Elder obj_request); 142057acbaa7SAlex Elder } 142157acbaa7SAlex Elder } 142257acbaa7SAlex Elder 142357acbaa7SAlex Elder static bool obj_request_done_test(struct rbd_obj_request *obj_request) 142457acbaa7SAlex Elder { 
142557acbaa7SAlex Elder smp_mb(); 142657acbaa7SAlex Elder return test_bit(OBJ_REQ_DONE, &obj_request->flags) != 0; 142757acbaa7SAlex Elder } 142857acbaa7SAlex Elder 14295679c59fSAlex Elder /* 14305679c59fSAlex Elder * This sets the KNOWN flag after (possibly) setting the EXISTS 14315679c59fSAlex Elder * flag. The latter is set based on the "exists" value provided. 14325679c59fSAlex Elder * 14335679c59fSAlex Elder * Note that for our purposes once an object exists it never goes 14345679c59fSAlex Elder * away again. It's possible that the response from two existence 14355679c59fSAlex Elder * checks are separated by the creation of the target object, and 14365679c59fSAlex Elder * the first ("doesn't exist") response arrives *after* the second 14375679c59fSAlex Elder * ("does exist"). In that case we ignore the second one. 14385679c59fSAlex Elder */ 14395679c59fSAlex Elder static void obj_request_existence_set(struct rbd_obj_request *obj_request, 14405679c59fSAlex Elder bool exists) 14415679c59fSAlex Elder { 14425679c59fSAlex Elder if (exists) 14435679c59fSAlex Elder set_bit(OBJ_REQ_EXISTS, &obj_request->flags); 14445679c59fSAlex Elder set_bit(OBJ_REQ_KNOWN, &obj_request->flags); 14455679c59fSAlex Elder smp_mb(); 14465679c59fSAlex Elder } 14475679c59fSAlex Elder 14485679c59fSAlex Elder static bool obj_request_known_test(struct rbd_obj_request *obj_request) 14495679c59fSAlex Elder { 14505679c59fSAlex Elder smp_mb(); 14515679c59fSAlex Elder return test_bit(OBJ_REQ_KNOWN, &obj_request->flags) != 0; 14525679c59fSAlex Elder } 14535679c59fSAlex Elder 14545679c59fSAlex Elder static bool obj_request_exists_test(struct rbd_obj_request *obj_request) 14555679c59fSAlex Elder { 14565679c59fSAlex Elder smp_mb(); 14575679c59fSAlex Elder return test_bit(OBJ_REQ_EXISTS, &obj_request->flags) != 0; 14585679c59fSAlex Elder } 14595679c59fSAlex Elder 14609638556aSIlya Dryomov static bool obj_request_overlaps_parent(struct rbd_obj_request *obj_request) 14619638556aSIlya Dryomov { 
14629638556aSIlya Dryomov struct rbd_device *rbd_dev = obj_request->img_request->rbd_dev; 14639638556aSIlya Dryomov 14649638556aSIlya Dryomov return obj_request->img_offset < 14659638556aSIlya Dryomov round_up(rbd_dev->parent_overlap, rbd_obj_bytes(&rbd_dev->header)); 14669638556aSIlya Dryomov } 14679638556aSIlya Dryomov 1468bf0d5f50SAlex Elder static void rbd_obj_request_get(struct rbd_obj_request *obj_request) 1469bf0d5f50SAlex Elder { 147037206ee5SAlex Elder dout("%s: obj %p (was %d)\n", __func__, obj_request, 147137206ee5SAlex Elder atomic_read(&obj_request->kref.refcount)); 1472bf0d5f50SAlex Elder kref_get(&obj_request->kref); 1473bf0d5f50SAlex Elder } 1474bf0d5f50SAlex Elder 1475bf0d5f50SAlex Elder static void rbd_obj_request_destroy(struct kref *kref); 1476bf0d5f50SAlex Elder static void rbd_obj_request_put(struct rbd_obj_request *obj_request) 1477bf0d5f50SAlex Elder { 1478bf0d5f50SAlex Elder rbd_assert(obj_request != NULL); 147937206ee5SAlex Elder dout("%s: obj %p (was %d)\n", __func__, obj_request, 148037206ee5SAlex Elder atomic_read(&obj_request->kref.refcount)); 1481bf0d5f50SAlex Elder kref_put(&obj_request->kref, rbd_obj_request_destroy); 1482bf0d5f50SAlex Elder } 1483bf0d5f50SAlex Elder 14840f2d5be7SAlex Elder static void rbd_img_request_get(struct rbd_img_request *img_request) 14850f2d5be7SAlex Elder { 14860f2d5be7SAlex Elder dout("%s: img %p (was %d)\n", __func__, img_request, 14870f2d5be7SAlex Elder atomic_read(&img_request->kref.refcount)); 14880f2d5be7SAlex Elder kref_get(&img_request->kref); 14890f2d5be7SAlex Elder } 14900f2d5be7SAlex Elder 1491e93f3152SAlex Elder static bool img_request_child_test(struct rbd_img_request *img_request); 1492e93f3152SAlex Elder static void rbd_parent_request_destroy(struct kref *kref); 1493bf0d5f50SAlex Elder static void rbd_img_request_destroy(struct kref *kref); 1494bf0d5f50SAlex Elder static void rbd_img_request_put(struct rbd_img_request *img_request) 1495bf0d5f50SAlex Elder { 1496bf0d5f50SAlex Elder 
rbd_assert(img_request != NULL); 149737206ee5SAlex Elder dout("%s: img %p (was %d)\n", __func__, img_request, 149837206ee5SAlex Elder atomic_read(&img_request->kref.refcount)); 1499e93f3152SAlex Elder if (img_request_child_test(img_request)) 1500e93f3152SAlex Elder kref_put(&img_request->kref, rbd_parent_request_destroy); 1501e93f3152SAlex Elder else 1502bf0d5f50SAlex Elder kref_put(&img_request->kref, rbd_img_request_destroy); 1503bf0d5f50SAlex Elder } 1504bf0d5f50SAlex Elder 1505bf0d5f50SAlex Elder static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request, 1506bf0d5f50SAlex Elder struct rbd_obj_request *obj_request) 1507bf0d5f50SAlex Elder { 150825dcf954SAlex Elder rbd_assert(obj_request->img_request == NULL); 150925dcf954SAlex Elder 1510b155e86cSAlex Elder /* Image request now owns object's original reference */ 1511bf0d5f50SAlex Elder obj_request->img_request = img_request; 151225dcf954SAlex Elder obj_request->which = img_request->obj_request_count; 15136365d33aSAlex Elder rbd_assert(!obj_request_img_data_test(obj_request)); 15146365d33aSAlex Elder obj_request_img_data_set(obj_request); 1515bf0d5f50SAlex Elder rbd_assert(obj_request->which != BAD_WHICH); 151625dcf954SAlex Elder img_request->obj_request_count++; 151725dcf954SAlex Elder list_add_tail(&obj_request->links, &img_request->obj_requests); 151837206ee5SAlex Elder dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request, 151937206ee5SAlex Elder obj_request->which); 1520bf0d5f50SAlex Elder } 1521bf0d5f50SAlex Elder 1522bf0d5f50SAlex Elder static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request, 1523bf0d5f50SAlex Elder struct rbd_obj_request *obj_request) 1524bf0d5f50SAlex Elder { 1525bf0d5f50SAlex Elder rbd_assert(obj_request->which != BAD_WHICH); 152625dcf954SAlex Elder 152737206ee5SAlex Elder dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request, 152837206ee5SAlex Elder obj_request->which); 1529bf0d5f50SAlex Elder 
list_del(&obj_request->links); 153025dcf954SAlex Elder rbd_assert(img_request->obj_request_count > 0); 153125dcf954SAlex Elder img_request->obj_request_count--; 153225dcf954SAlex Elder rbd_assert(obj_request->which == img_request->obj_request_count); 153325dcf954SAlex Elder obj_request->which = BAD_WHICH; 15346365d33aSAlex Elder rbd_assert(obj_request_img_data_test(obj_request)); 1535bf0d5f50SAlex Elder rbd_assert(obj_request->img_request == img_request); 1536bf0d5f50SAlex Elder obj_request->img_request = NULL; 153725dcf954SAlex Elder obj_request->callback = NULL; 1538bf0d5f50SAlex Elder rbd_obj_request_put(obj_request); 1539bf0d5f50SAlex Elder } 1540bf0d5f50SAlex Elder 1541bf0d5f50SAlex Elder static bool obj_request_type_valid(enum obj_request_type type) 1542bf0d5f50SAlex Elder { 1543bf0d5f50SAlex Elder switch (type) { 15449969ebc5SAlex Elder case OBJ_REQUEST_NODATA: 1545bf0d5f50SAlex Elder case OBJ_REQUEST_BIO: 1546788e2df3SAlex Elder case OBJ_REQUEST_PAGES: 1547bf0d5f50SAlex Elder return true; 1548bf0d5f50SAlex Elder default: 1549bf0d5f50SAlex Elder return false; 1550bf0d5f50SAlex Elder } 1551bf0d5f50SAlex Elder } 1552bf0d5f50SAlex Elder 1553bf0d5f50SAlex Elder static int rbd_obj_request_submit(struct ceph_osd_client *osdc, 1554bf0d5f50SAlex Elder struct rbd_obj_request *obj_request) 1555bf0d5f50SAlex Elder { 155671c20a06SIlya Dryomov dout("%s %p\n", __func__, obj_request); 1557bf0d5f50SAlex Elder return ceph_osdc_start_request(osdc, obj_request->osd_req, false); 1558bf0d5f50SAlex Elder } 1559bf0d5f50SAlex Elder 156071c20a06SIlya Dryomov static void rbd_obj_request_end(struct rbd_obj_request *obj_request) 156171c20a06SIlya Dryomov { 156271c20a06SIlya Dryomov dout("%s %p\n", __func__, obj_request); 156371c20a06SIlya Dryomov ceph_osdc_cancel_request(obj_request->osd_req); 156471c20a06SIlya Dryomov } 156571c20a06SIlya Dryomov 156671c20a06SIlya Dryomov /* 156771c20a06SIlya Dryomov * Wait for an object request to complete. 
If interrupted, cancel the 156871c20a06SIlya Dryomov * underlying osd request. 15692894e1d7SIlya Dryomov * 15702894e1d7SIlya Dryomov * @timeout: in jiffies, 0 means "wait forever" 157171c20a06SIlya Dryomov */ 15722894e1d7SIlya Dryomov static int __rbd_obj_request_wait(struct rbd_obj_request *obj_request, 15732894e1d7SIlya Dryomov unsigned long timeout) 157471c20a06SIlya Dryomov { 15752894e1d7SIlya Dryomov long ret; 157671c20a06SIlya Dryomov 157771c20a06SIlya Dryomov dout("%s %p\n", __func__, obj_request); 15782894e1d7SIlya Dryomov ret = wait_for_completion_interruptible_timeout( 15792894e1d7SIlya Dryomov &obj_request->completion, 15802894e1d7SIlya Dryomov ceph_timeout_jiffies(timeout)); 15812894e1d7SIlya Dryomov if (ret <= 0) { 15822894e1d7SIlya Dryomov if (ret == 0) 15832894e1d7SIlya Dryomov ret = -ETIMEDOUT; 158471c20a06SIlya Dryomov rbd_obj_request_end(obj_request); 15852894e1d7SIlya Dryomov } else { 15862894e1d7SIlya Dryomov ret = 0; 15872894e1d7SIlya Dryomov } 15882894e1d7SIlya Dryomov 15892894e1d7SIlya Dryomov dout("%s %p ret %d\n", __func__, obj_request, (int)ret); 159071c20a06SIlya Dryomov return ret; 159171c20a06SIlya Dryomov } 159271c20a06SIlya Dryomov 15932894e1d7SIlya Dryomov static int rbd_obj_request_wait(struct rbd_obj_request *obj_request) 15942894e1d7SIlya Dryomov { 15952894e1d7SIlya Dryomov return __rbd_obj_request_wait(obj_request, 0); 15962894e1d7SIlya Dryomov } 15972894e1d7SIlya Dryomov 1598bf0d5f50SAlex Elder static void rbd_img_request_complete(struct rbd_img_request *img_request) 1599bf0d5f50SAlex Elder { 160055f27e09SAlex Elder 160137206ee5SAlex Elder dout("%s: img %p\n", __func__, img_request); 160255f27e09SAlex Elder 160355f27e09SAlex Elder /* 160455f27e09SAlex Elder * If no error occurred, compute the aggregate transfer 160555f27e09SAlex Elder * count for the image request. 
We could instead use 160655f27e09SAlex Elder * atomic64_cmpxchg() to update it as each object request 160755f27e09SAlex Elder * completes; not clear which way is better off hand. 160855f27e09SAlex Elder */ 160955f27e09SAlex Elder if (!img_request->result) { 161055f27e09SAlex Elder struct rbd_obj_request *obj_request; 161155f27e09SAlex Elder u64 xferred = 0; 161255f27e09SAlex Elder 161355f27e09SAlex Elder for_each_obj_request(img_request, obj_request) 161455f27e09SAlex Elder xferred += obj_request->xferred; 161555f27e09SAlex Elder img_request->xferred = xferred; 161655f27e09SAlex Elder } 161755f27e09SAlex Elder 1618bf0d5f50SAlex Elder if (img_request->callback) 1619bf0d5f50SAlex Elder img_request->callback(img_request); 1620bf0d5f50SAlex Elder else 1621bf0d5f50SAlex Elder rbd_img_request_put(img_request); 1622bf0d5f50SAlex Elder } 1623bf0d5f50SAlex Elder 16240c425248SAlex Elder /* 16250c425248SAlex Elder * The default/initial value for all image request flags is 0. Each 16260c425248SAlex Elder * is conditionally set to 1 at image request initialization time 16270c425248SAlex Elder * and currently never change thereafter. 
16280c425248SAlex Elder */ 16290c425248SAlex Elder static void img_request_write_set(struct rbd_img_request *img_request) 16300c425248SAlex Elder { 16310c425248SAlex Elder set_bit(IMG_REQ_WRITE, &img_request->flags); 16320c425248SAlex Elder smp_mb(); 16330c425248SAlex Elder } 16340c425248SAlex Elder 16350c425248SAlex Elder static bool img_request_write_test(struct rbd_img_request *img_request) 16360c425248SAlex Elder { 16370c425248SAlex Elder smp_mb(); 16380c425248SAlex Elder return test_bit(IMG_REQ_WRITE, &img_request->flags) != 0; 16390c425248SAlex Elder } 16400c425248SAlex Elder 164190e98c52SGuangliang Zhao /* 164290e98c52SGuangliang Zhao * Set the discard flag when the img_request is an discard request 164390e98c52SGuangliang Zhao */ 164490e98c52SGuangliang Zhao static void img_request_discard_set(struct rbd_img_request *img_request) 164590e98c52SGuangliang Zhao { 164690e98c52SGuangliang Zhao set_bit(IMG_REQ_DISCARD, &img_request->flags); 164790e98c52SGuangliang Zhao smp_mb(); 164890e98c52SGuangliang Zhao } 164990e98c52SGuangliang Zhao 165090e98c52SGuangliang Zhao static bool img_request_discard_test(struct rbd_img_request *img_request) 165190e98c52SGuangliang Zhao { 165290e98c52SGuangliang Zhao smp_mb(); 165390e98c52SGuangliang Zhao return test_bit(IMG_REQ_DISCARD, &img_request->flags) != 0; 165490e98c52SGuangliang Zhao } 165590e98c52SGuangliang Zhao 16569849e986SAlex Elder static void img_request_child_set(struct rbd_img_request *img_request) 16579849e986SAlex Elder { 16589849e986SAlex Elder set_bit(IMG_REQ_CHILD, &img_request->flags); 16599849e986SAlex Elder smp_mb(); 16609849e986SAlex Elder } 16619849e986SAlex Elder 1662e93f3152SAlex Elder static void img_request_child_clear(struct rbd_img_request *img_request) 1663e93f3152SAlex Elder { 1664e93f3152SAlex Elder clear_bit(IMG_REQ_CHILD, &img_request->flags); 1665e93f3152SAlex Elder smp_mb(); 1666e93f3152SAlex Elder } 1667e93f3152SAlex Elder 16689849e986SAlex Elder static bool img_request_child_test(struct 
rbd_img_request *img_request) 16699849e986SAlex Elder { 16709849e986SAlex Elder smp_mb(); 16719849e986SAlex Elder return test_bit(IMG_REQ_CHILD, &img_request->flags) != 0; 16729849e986SAlex Elder } 16739849e986SAlex Elder 1674d0b2e944SAlex Elder static void img_request_layered_set(struct rbd_img_request *img_request) 1675d0b2e944SAlex Elder { 1676d0b2e944SAlex Elder set_bit(IMG_REQ_LAYERED, &img_request->flags); 1677d0b2e944SAlex Elder smp_mb(); 1678d0b2e944SAlex Elder } 1679d0b2e944SAlex Elder 1680a2acd00eSAlex Elder static void img_request_layered_clear(struct rbd_img_request *img_request) 1681a2acd00eSAlex Elder { 1682a2acd00eSAlex Elder clear_bit(IMG_REQ_LAYERED, &img_request->flags); 1683a2acd00eSAlex Elder smp_mb(); 1684a2acd00eSAlex Elder } 1685a2acd00eSAlex Elder 1686d0b2e944SAlex Elder static bool img_request_layered_test(struct rbd_img_request *img_request) 1687d0b2e944SAlex Elder { 1688d0b2e944SAlex Elder smp_mb(); 1689d0b2e944SAlex Elder return test_bit(IMG_REQ_LAYERED, &img_request->flags) != 0; 1690d0b2e944SAlex Elder } 1691d0b2e944SAlex Elder 16923b434a2aSJosh Durgin static enum obj_operation_type 16933b434a2aSJosh Durgin rbd_img_request_op_type(struct rbd_img_request *img_request) 16943b434a2aSJosh Durgin { 16953b434a2aSJosh Durgin if (img_request_write_test(img_request)) 16963b434a2aSJosh Durgin return OBJ_OP_WRITE; 16973b434a2aSJosh Durgin else if (img_request_discard_test(img_request)) 16983b434a2aSJosh Durgin return OBJ_OP_DISCARD; 16993b434a2aSJosh Durgin else 17003b434a2aSJosh Durgin return OBJ_OP_READ; 17013b434a2aSJosh Durgin } 17023b434a2aSJosh Durgin 17036e2a4505SAlex Elder static void 17046e2a4505SAlex Elder rbd_img_obj_request_read_callback(struct rbd_obj_request *obj_request) 17056e2a4505SAlex Elder { 1706b9434c5bSAlex Elder u64 xferred = obj_request->xferred; 1707b9434c5bSAlex Elder u64 length = obj_request->length; 1708b9434c5bSAlex Elder 17096e2a4505SAlex Elder dout("%s: obj %p img %p result %d %llu/%llu\n", __func__, 
17106e2a4505SAlex Elder obj_request, obj_request->img_request, obj_request->result, 1711b9434c5bSAlex Elder xferred, length); 17126e2a4505SAlex Elder /* 171317c1cc1dSJosh Durgin * ENOENT means a hole in the image. We zero-fill the entire 171417c1cc1dSJosh Durgin * length of the request. A short read also implies zero-fill 171517c1cc1dSJosh Durgin * to the end of the request. An error requires the whole 171617c1cc1dSJosh Durgin * length of the request to be reported finished with an error 171717c1cc1dSJosh Durgin * to the block layer. In each case we update the xferred 171817c1cc1dSJosh Durgin * count to indicate the whole request was satisfied. 17196e2a4505SAlex Elder */ 1720b9434c5bSAlex Elder rbd_assert(obj_request->type != OBJ_REQUEST_NODATA); 17216e2a4505SAlex Elder if (obj_request->result == -ENOENT) { 1722b9434c5bSAlex Elder if (obj_request->type == OBJ_REQUEST_BIO) 17236e2a4505SAlex Elder zero_bio_chain(obj_request->bio_list, 0); 1724b9434c5bSAlex Elder else 1725b9434c5bSAlex Elder zero_pages(obj_request->pages, 0, length); 17266e2a4505SAlex Elder obj_request->result = 0; 1727b9434c5bSAlex Elder } else if (xferred < length && !obj_request->result) { 1728b9434c5bSAlex Elder if (obj_request->type == OBJ_REQUEST_BIO) 1729b9434c5bSAlex Elder zero_bio_chain(obj_request->bio_list, xferred); 1730b9434c5bSAlex Elder else 1731b9434c5bSAlex Elder zero_pages(obj_request->pages, xferred, length); 17326e2a4505SAlex Elder } 173317c1cc1dSJosh Durgin obj_request->xferred = length; 17346e2a4505SAlex Elder obj_request_done_set(obj_request); 17356e2a4505SAlex Elder } 17366e2a4505SAlex Elder 1737bf0d5f50SAlex Elder static void rbd_obj_request_complete(struct rbd_obj_request *obj_request) 1738bf0d5f50SAlex Elder { 173937206ee5SAlex Elder dout("%s: obj %p cb %p\n", __func__, obj_request, 174037206ee5SAlex Elder obj_request->callback); 1741bf0d5f50SAlex Elder if (obj_request->callback) 1742bf0d5f50SAlex Elder obj_request->callback(obj_request); 1743788e2df3SAlex Elder else 
1744788e2df3SAlex Elder complete_all(&obj_request->completion); 1745bf0d5f50SAlex Elder } 1746bf0d5f50SAlex Elder 1747c47f9371SAlex Elder static void rbd_osd_read_callback(struct rbd_obj_request *obj_request) 1748bf0d5f50SAlex Elder { 174957acbaa7SAlex Elder struct rbd_img_request *img_request = NULL; 1750a9e8ba2cSAlex Elder struct rbd_device *rbd_dev = NULL; 175157acbaa7SAlex Elder bool layered = false; 175257acbaa7SAlex Elder 175357acbaa7SAlex Elder if (obj_request_img_data_test(obj_request)) { 175457acbaa7SAlex Elder img_request = obj_request->img_request; 175557acbaa7SAlex Elder layered = img_request && img_request_layered_test(img_request); 1756a9e8ba2cSAlex Elder rbd_dev = img_request->rbd_dev; 175757acbaa7SAlex Elder } 17588b3e1a56SAlex Elder 17598b3e1a56SAlex Elder dout("%s: obj %p img %p result %d %llu/%llu\n", __func__, 17608b3e1a56SAlex Elder obj_request, img_request, obj_request->result, 17618b3e1a56SAlex Elder obj_request->xferred, obj_request->length); 1762a9e8ba2cSAlex Elder if (layered && obj_request->result == -ENOENT && 1763a9e8ba2cSAlex Elder obj_request->img_offset < rbd_dev->parent_overlap) 17648b3e1a56SAlex Elder rbd_img_parent_read(obj_request); 17658b3e1a56SAlex Elder else if (img_request) 17666e2a4505SAlex Elder rbd_img_obj_request_read_callback(obj_request); 17676e2a4505SAlex Elder else 176807741308SAlex Elder obj_request_done_set(obj_request); 1769bf0d5f50SAlex Elder } 1770bf0d5f50SAlex Elder 1771c47f9371SAlex Elder static void rbd_osd_write_callback(struct rbd_obj_request *obj_request) 1772bf0d5f50SAlex Elder { 17731b83bef2SSage Weil dout("%s: obj %p result %d %llu\n", __func__, obj_request, 17741b83bef2SSage Weil obj_request->result, obj_request->length); 17751b83bef2SSage Weil /* 17768b3e1a56SAlex Elder * There is no such thing as a successful short write. Set 17778b3e1a56SAlex Elder * it to our originally-requested length. 
17781b83bef2SSage Weil */ 17791b83bef2SSage Weil obj_request->xferred = obj_request->length; 178007741308SAlex Elder obj_request_done_set(obj_request); 1781bf0d5f50SAlex Elder } 1782bf0d5f50SAlex Elder 178390e98c52SGuangliang Zhao static void rbd_osd_discard_callback(struct rbd_obj_request *obj_request) 178490e98c52SGuangliang Zhao { 178590e98c52SGuangliang Zhao dout("%s: obj %p result %d %llu\n", __func__, obj_request, 178690e98c52SGuangliang Zhao obj_request->result, obj_request->length); 178790e98c52SGuangliang Zhao /* 178890e98c52SGuangliang Zhao * There is no such thing as a successful short discard. Set 178990e98c52SGuangliang Zhao * it to our originally-requested length. 179090e98c52SGuangliang Zhao */ 179190e98c52SGuangliang Zhao obj_request->xferred = obj_request->length; 1792d0265de7SJosh Durgin /* discarding a non-existent object is not a problem */ 1793d0265de7SJosh Durgin if (obj_request->result == -ENOENT) 1794d0265de7SJosh Durgin obj_request->result = 0; 179590e98c52SGuangliang Zhao obj_request_done_set(obj_request); 179690e98c52SGuangliang Zhao } 179790e98c52SGuangliang Zhao 1798fbfab539SAlex Elder /* 1799fbfab539SAlex Elder * For a simple stat call there's nothing to do. We'll do more if 1800fbfab539SAlex Elder * this is part of a write sequence for a layered image. 
1801fbfab539SAlex Elder */ 1802c47f9371SAlex Elder static void rbd_osd_stat_callback(struct rbd_obj_request *obj_request) 1803fbfab539SAlex Elder { 180437206ee5SAlex Elder dout("%s: obj %p\n", __func__, obj_request); 1805fbfab539SAlex Elder obj_request_done_set(obj_request); 1806fbfab539SAlex Elder } 1807fbfab539SAlex Elder 18082761713dSIlya Dryomov static void rbd_osd_call_callback(struct rbd_obj_request *obj_request) 18092761713dSIlya Dryomov { 18102761713dSIlya Dryomov dout("%s: obj %p\n", __func__, obj_request); 18112761713dSIlya Dryomov 18122761713dSIlya Dryomov if (obj_request_img_data_test(obj_request)) 18132761713dSIlya Dryomov rbd_osd_copyup_callback(obj_request); 18142761713dSIlya Dryomov else 18152761713dSIlya Dryomov obj_request_done_set(obj_request); 18162761713dSIlya Dryomov } 18172761713dSIlya Dryomov 181885e084feSIlya Dryomov static void rbd_osd_req_callback(struct ceph_osd_request *osd_req) 1819bf0d5f50SAlex Elder { 1820bf0d5f50SAlex Elder struct rbd_obj_request *obj_request = osd_req->r_priv; 1821bf0d5f50SAlex Elder u16 opcode; 1822bf0d5f50SAlex Elder 182385e084feSIlya Dryomov dout("%s: osd_req %p\n", __func__, osd_req); 1824bf0d5f50SAlex Elder rbd_assert(osd_req == obj_request->osd_req); 182557acbaa7SAlex Elder if (obj_request_img_data_test(obj_request)) { 182657acbaa7SAlex Elder rbd_assert(obj_request->img_request); 182757acbaa7SAlex Elder rbd_assert(obj_request->which != BAD_WHICH); 182857acbaa7SAlex Elder } else { 182957acbaa7SAlex Elder rbd_assert(obj_request->which == BAD_WHICH); 183057acbaa7SAlex Elder } 1831bf0d5f50SAlex Elder 18321b83bef2SSage Weil if (osd_req->r_result < 0) 18331b83bef2SSage Weil obj_request->result = osd_req->r_result; 1834bf0d5f50SAlex Elder 1835c47f9371SAlex Elder /* 1836c47f9371SAlex Elder * We support a 64-bit length, but ultimately it has to be 18377ad18afaSChristoph Hellwig * passed to the block layer, which just supports a 32-bit 18387ad18afaSChristoph Hellwig * length field. 
1839c47f9371SAlex Elder */ 18407665d85bSYan, Zheng obj_request->xferred = osd_req->r_ops[0].outdata_len; 1841c47f9371SAlex Elder rbd_assert(obj_request->xferred < (u64)UINT_MAX); 18420ccd5926SIlya Dryomov 184379528734SAlex Elder opcode = osd_req->r_ops[0].op; 1844bf0d5f50SAlex Elder switch (opcode) { 1845bf0d5f50SAlex Elder case CEPH_OSD_OP_READ: 1846c47f9371SAlex Elder rbd_osd_read_callback(obj_request); 1847bf0d5f50SAlex Elder break; 18480ccd5926SIlya Dryomov case CEPH_OSD_OP_SETALLOCHINT: 1849e30b7577SIlya Dryomov rbd_assert(osd_req->r_ops[1].op == CEPH_OSD_OP_WRITE || 1850e30b7577SIlya Dryomov osd_req->r_ops[1].op == CEPH_OSD_OP_WRITEFULL); 18510ccd5926SIlya Dryomov /* fall through */ 1852bf0d5f50SAlex Elder case CEPH_OSD_OP_WRITE: 1853e30b7577SIlya Dryomov case CEPH_OSD_OP_WRITEFULL: 1854c47f9371SAlex Elder rbd_osd_write_callback(obj_request); 1855bf0d5f50SAlex Elder break; 1856fbfab539SAlex Elder case CEPH_OSD_OP_STAT: 1857c47f9371SAlex Elder rbd_osd_stat_callback(obj_request); 1858fbfab539SAlex Elder break; 185990e98c52SGuangliang Zhao case CEPH_OSD_OP_DELETE: 186090e98c52SGuangliang Zhao case CEPH_OSD_OP_TRUNCATE: 186190e98c52SGuangliang Zhao case CEPH_OSD_OP_ZERO: 186290e98c52SGuangliang Zhao rbd_osd_discard_callback(obj_request); 186390e98c52SGuangliang Zhao break; 186436be9a76SAlex Elder case CEPH_OSD_OP_CALL: 18652761713dSIlya Dryomov rbd_osd_call_callback(obj_request); 18662761713dSIlya Dryomov break; 1867bf0d5f50SAlex Elder default: 18689584d508SIlya Dryomov rbd_warn(NULL, "%s: unsupported op %hu", 1869bf0d5f50SAlex Elder obj_request->object_name, (unsigned short) opcode); 1870bf0d5f50SAlex Elder break; 1871bf0d5f50SAlex Elder } 1872bf0d5f50SAlex Elder 187307741308SAlex Elder if (obj_request_done_test(obj_request)) 1874bf0d5f50SAlex Elder rbd_obj_request_complete(obj_request); 1875bf0d5f50SAlex Elder } 1876bf0d5f50SAlex Elder 18779d4df01fSAlex Elder static void rbd_osd_req_format_read(struct rbd_obj_request *obj_request) 1878430c28c3SAlex Elder { 
1879430c28c3SAlex Elder struct rbd_img_request *img_request = obj_request->img_request; 18808c042b0dSAlex Elder struct ceph_osd_request *osd_req = obj_request->osd_req; 1881430c28c3SAlex Elder 1882bb873b53SIlya Dryomov if (img_request) 1883bb873b53SIlya Dryomov osd_req->r_snapid = img_request->snap_id; 18849d4df01fSAlex Elder } 18859d4df01fSAlex Elder 18869d4df01fSAlex Elder static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request) 18879d4df01fSAlex Elder { 18889d4df01fSAlex Elder struct ceph_osd_request *osd_req = obj_request->osd_req; 18899d4df01fSAlex Elder 1890bb873b53SIlya Dryomov osd_req->r_mtime = CURRENT_TIME; 1891bb873b53SIlya Dryomov osd_req->r_data_offset = obj_request->offset; 1892430c28c3SAlex Elder } 1893430c28c3SAlex Elder 18940ccd5926SIlya Dryomov /* 18950ccd5926SIlya Dryomov * Create an osd request. A read request has one osd op (read). 18960ccd5926SIlya Dryomov * A write request has either one (watch) or two (hint+write) osd ops. 18970ccd5926SIlya Dryomov * (All rbd data writes are prefixed with an allocation hint op, but 18980ccd5926SIlya Dryomov * technically osd watch is a write request, hence this distinction.) 
18990ccd5926SIlya Dryomov */ 1900bf0d5f50SAlex Elder static struct ceph_osd_request *rbd_osd_req_create( 1901bf0d5f50SAlex Elder struct rbd_device *rbd_dev, 19026d2940c8SGuangliang Zhao enum obj_operation_type op_type, 1903deb236b3SIlya Dryomov unsigned int num_ops, 1904430c28c3SAlex Elder struct rbd_obj_request *obj_request) 1905bf0d5f50SAlex Elder { 1906bf0d5f50SAlex Elder struct ceph_snap_context *snapc = NULL; 1907bf0d5f50SAlex Elder struct ceph_osd_client *osdc; 1908bf0d5f50SAlex Elder struct ceph_osd_request *osd_req; 1909bf0d5f50SAlex Elder 191090e98c52SGuangliang Zhao if (obj_request_img_data_test(obj_request) && 191190e98c52SGuangliang Zhao (op_type == OBJ_OP_DISCARD || op_type == OBJ_OP_WRITE)) { 19126365d33aSAlex Elder struct rbd_img_request *img_request = obj_request->img_request; 191390e98c52SGuangliang Zhao if (op_type == OBJ_OP_WRITE) { 19146d2940c8SGuangliang Zhao rbd_assert(img_request_write_test(img_request)); 191590e98c52SGuangliang Zhao } else { 191690e98c52SGuangliang Zhao rbd_assert(img_request_discard_test(img_request)); 191790e98c52SGuangliang Zhao } 1918bf0d5f50SAlex Elder snapc = img_request->snapc; 1919bf0d5f50SAlex Elder } 1920bf0d5f50SAlex Elder 19216d2940c8SGuangliang Zhao rbd_assert(num_ops == 1 || ((op_type == OBJ_OP_WRITE) && num_ops == 2)); 1922deb236b3SIlya Dryomov 1923deb236b3SIlya Dryomov /* Allocate and initialize the request, for the num_ops ops */ 1924bf0d5f50SAlex Elder 1925bf0d5f50SAlex Elder osdc = &rbd_dev->rbd_client->client->osdc; 1926deb236b3SIlya Dryomov osd_req = ceph_osdc_alloc_request(osdc, snapc, num_ops, false, 19272224d879SDavid Disseldorp GFP_NOIO); 1928bf0d5f50SAlex Elder if (!osd_req) 192913d1ad16SIlya Dryomov goto fail; 1930bf0d5f50SAlex Elder 193190e98c52SGuangliang Zhao if (op_type == OBJ_OP_WRITE || op_type == OBJ_OP_DISCARD) 1932bf0d5f50SAlex Elder osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK; 1933430c28c3SAlex Elder else 1934bf0d5f50SAlex Elder osd_req->r_flags = CEPH_OSD_FLAG_READ; 
1935bf0d5f50SAlex Elder 1936bf0d5f50SAlex Elder osd_req->r_callback = rbd_osd_req_callback; 1937bf0d5f50SAlex Elder osd_req->r_priv = obj_request; 1938bf0d5f50SAlex Elder 19397627151eSYan, Zheng osd_req->r_base_oloc.pool = rbd_dev->layout.pool_id; 1940d30291b9SIlya Dryomov if (ceph_oid_aprintf(&osd_req->r_base_oid, GFP_NOIO, "%s", 1941d30291b9SIlya Dryomov obj_request->object_name)) 1942d30291b9SIlya Dryomov goto fail; 1943bf0d5f50SAlex Elder 194413d1ad16SIlya Dryomov if (ceph_osdc_alloc_messages(osd_req, GFP_NOIO)) 194513d1ad16SIlya Dryomov goto fail; 194613d1ad16SIlya Dryomov 1947bf0d5f50SAlex Elder return osd_req; 194813d1ad16SIlya Dryomov 194913d1ad16SIlya Dryomov fail: 195013d1ad16SIlya Dryomov ceph_osdc_put_request(osd_req); 195113d1ad16SIlya Dryomov return NULL; 1952bf0d5f50SAlex Elder } 1953bf0d5f50SAlex Elder 19540eefd470SAlex Elder /* 1955d3246fb0SJosh Durgin * Create a copyup osd request based on the information in the object 1956d3246fb0SJosh Durgin * request supplied. A copyup request has two or three osd ops, a 1957d3246fb0SJosh Durgin * copyup method call, potentially a hint op, and a write or truncate 1958d3246fb0SJosh Durgin * or zero op. 
19590eefd470SAlex Elder */ 19600eefd470SAlex Elder static struct ceph_osd_request * 19610eefd470SAlex Elder rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request) 19620eefd470SAlex Elder { 19630eefd470SAlex Elder struct rbd_img_request *img_request; 19640eefd470SAlex Elder struct ceph_snap_context *snapc; 19650eefd470SAlex Elder struct rbd_device *rbd_dev; 19660eefd470SAlex Elder struct ceph_osd_client *osdc; 19670eefd470SAlex Elder struct ceph_osd_request *osd_req; 1968d3246fb0SJosh Durgin int num_osd_ops = 3; 19690eefd470SAlex Elder 19700eefd470SAlex Elder rbd_assert(obj_request_img_data_test(obj_request)); 19710eefd470SAlex Elder img_request = obj_request->img_request; 19720eefd470SAlex Elder rbd_assert(img_request); 1973d3246fb0SJosh Durgin rbd_assert(img_request_write_test(img_request) || 1974d3246fb0SJosh Durgin img_request_discard_test(img_request)); 19750eefd470SAlex Elder 1976d3246fb0SJosh Durgin if (img_request_discard_test(img_request)) 1977d3246fb0SJosh Durgin num_osd_ops = 2; 1978d3246fb0SJosh Durgin 1979d3246fb0SJosh Durgin /* Allocate and initialize the request, for all the ops */ 19800eefd470SAlex Elder 19810eefd470SAlex Elder snapc = img_request->snapc; 19820eefd470SAlex Elder rbd_dev = img_request->rbd_dev; 19830eefd470SAlex Elder osdc = &rbd_dev->rbd_client->client->osdc; 1984d3246fb0SJosh Durgin osd_req = ceph_osdc_alloc_request(osdc, snapc, num_osd_ops, 19852224d879SDavid Disseldorp false, GFP_NOIO); 19860eefd470SAlex Elder if (!osd_req) 198713d1ad16SIlya Dryomov goto fail; 19880eefd470SAlex Elder 19890eefd470SAlex Elder osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK; 19900eefd470SAlex Elder osd_req->r_callback = rbd_osd_req_callback; 19910eefd470SAlex Elder osd_req->r_priv = obj_request; 19920eefd470SAlex Elder 19937627151eSYan, Zheng osd_req->r_base_oloc.pool = rbd_dev->layout.pool_id; 1994d30291b9SIlya Dryomov if (ceph_oid_aprintf(&osd_req->r_base_oid, GFP_NOIO, "%s", 1995d30291b9SIlya Dryomov 
obj_request->object_name)) 1996d30291b9SIlya Dryomov goto fail; 19970eefd470SAlex Elder 199813d1ad16SIlya Dryomov if (ceph_osdc_alloc_messages(osd_req, GFP_NOIO)) 199913d1ad16SIlya Dryomov goto fail; 200013d1ad16SIlya Dryomov 20010eefd470SAlex Elder return osd_req; 200213d1ad16SIlya Dryomov 200313d1ad16SIlya Dryomov fail: 200413d1ad16SIlya Dryomov ceph_osdc_put_request(osd_req); 200513d1ad16SIlya Dryomov return NULL; 20060eefd470SAlex Elder } 20070eefd470SAlex Elder 20080eefd470SAlex Elder 2009bf0d5f50SAlex Elder static void rbd_osd_req_destroy(struct ceph_osd_request *osd_req) 2010bf0d5f50SAlex Elder { 2011bf0d5f50SAlex Elder ceph_osdc_put_request(osd_req); 2012bf0d5f50SAlex Elder } 2013bf0d5f50SAlex Elder 2014bf0d5f50SAlex Elder /* object_name is assumed to be a non-null pointer and NUL-terminated */ 2015bf0d5f50SAlex Elder 2016bf0d5f50SAlex Elder static struct rbd_obj_request *rbd_obj_request_create(const char *object_name, 2017bf0d5f50SAlex Elder u64 offset, u64 length, 2018bf0d5f50SAlex Elder enum obj_request_type type) 2019bf0d5f50SAlex Elder { 2020bf0d5f50SAlex Elder struct rbd_obj_request *obj_request; 2021bf0d5f50SAlex Elder size_t size; 2022bf0d5f50SAlex Elder char *name; 2023bf0d5f50SAlex Elder 2024bf0d5f50SAlex Elder rbd_assert(obj_request_type_valid(type)); 2025bf0d5f50SAlex Elder 2026bf0d5f50SAlex Elder size = strlen(object_name) + 1; 20275a60e876SIlya Dryomov name = kmalloc(size, GFP_NOIO); 2028f907ad55SAlex Elder if (!name) 2029bf0d5f50SAlex Elder return NULL; 2030bf0d5f50SAlex Elder 20315a60e876SIlya Dryomov obj_request = kmem_cache_zalloc(rbd_obj_request_cache, GFP_NOIO); 2032f907ad55SAlex Elder if (!obj_request) { 2033f907ad55SAlex Elder kfree(name); 2034f907ad55SAlex Elder return NULL; 2035f907ad55SAlex Elder } 2036f907ad55SAlex Elder 2037bf0d5f50SAlex Elder obj_request->object_name = memcpy(name, object_name, size); 2038bf0d5f50SAlex Elder obj_request->offset = offset; 2039bf0d5f50SAlex Elder obj_request->length = length; 2040926f9b3fSAlex 
Elder obj_request->flags = 0; 2041bf0d5f50SAlex Elder obj_request->which = BAD_WHICH; 2042bf0d5f50SAlex Elder obj_request->type = type; 2043bf0d5f50SAlex Elder INIT_LIST_HEAD(&obj_request->links); 2044788e2df3SAlex Elder init_completion(&obj_request->completion); 2045bf0d5f50SAlex Elder kref_init(&obj_request->kref); 2046bf0d5f50SAlex Elder 204737206ee5SAlex Elder dout("%s: \"%s\" %llu/%llu %d -> obj %p\n", __func__, object_name, 204837206ee5SAlex Elder offset, length, (int)type, obj_request); 204937206ee5SAlex Elder 2050bf0d5f50SAlex Elder return obj_request; 2051bf0d5f50SAlex Elder } 2052bf0d5f50SAlex Elder 2053bf0d5f50SAlex Elder static void rbd_obj_request_destroy(struct kref *kref) 2054bf0d5f50SAlex Elder { 2055bf0d5f50SAlex Elder struct rbd_obj_request *obj_request; 2056bf0d5f50SAlex Elder 2057bf0d5f50SAlex Elder obj_request = container_of(kref, struct rbd_obj_request, kref); 2058bf0d5f50SAlex Elder 205937206ee5SAlex Elder dout("%s: obj %p\n", __func__, obj_request); 206037206ee5SAlex Elder 2061bf0d5f50SAlex Elder rbd_assert(obj_request->img_request == NULL); 2062bf0d5f50SAlex Elder rbd_assert(obj_request->which == BAD_WHICH); 2063bf0d5f50SAlex Elder 2064bf0d5f50SAlex Elder if (obj_request->osd_req) 2065bf0d5f50SAlex Elder rbd_osd_req_destroy(obj_request->osd_req); 2066bf0d5f50SAlex Elder 2067bf0d5f50SAlex Elder rbd_assert(obj_request_type_valid(obj_request->type)); 2068bf0d5f50SAlex Elder switch (obj_request->type) { 20699969ebc5SAlex Elder case OBJ_REQUEST_NODATA: 20709969ebc5SAlex Elder break; /* Nothing to do */ 2071bf0d5f50SAlex Elder case OBJ_REQUEST_BIO: 2072bf0d5f50SAlex Elder if (obj_request->bio_list) 2073bf0d5f50SAlex Elder bio_chain_put(obj_request->bio_list); 2074bf0d5f50SAlex Elder break; 2075788e2df3SAlex Elder case OBJ_REQUEST_PAGES: 2076788e2df3SAlex Elder if (obj_request->pages) 2077788e2df3SAlex Elder ceph_release_page_vector(obj_request->pages, 2078788e2df3SAlex Elder obj_request->page_count); 2079788e2df3SAlex Elder break; 
2080bf0d5f50SAlex Elder } 2081bf0d5f50SAlex Elder 2082f907ad55SAlex Elder kfree(obj_request->object_name); 2083868311b1SAlex Elder obj_request->object_name = NULL; 2084868311b1SAlex Elder kmem_cache_free(rbd_obj_request_cache, obj_request); 2085bf0d5f50SAlex Elder } 2086bf0d5f50SAlex Elder 2087fb65d228SAlex Elder /* It's OK to call this for a device with no parent */ 2088fb65d228SAlex Elder 2089fb65d228SAlex Elder static void rbd_spec_put(struct rbd_spec *spec); 2090fb65d228SAlex Elder static void rbd_dev_unparent(struct rbd_device *rbd_dev) 2091fb65d228SAlex Elder { 2092fb65d228SAlex Elder rbd_dev_remove_parent(rbd_dev); 2093fb65d228SAlex Elder rbd_spec_put(rbd_dev->parent_spec); 2094fb65d228SAlex Elder rbd_dev->parent_spec = NULL; 2095fb65d228SAlex Elder rbd_dev->parent_overlap = 0; 2096fb65d228SAlex Elder } 2097fb65d228SAlex Elder 2098bf0d5f50SAlex Elder /* 2099a2acd00eSAlex Elder * Parent image reference counting is used to determine when an 2100a2acd00eSAlex Elder * image's parent fields can be safely torn down--after there are no 2101a2acd00eSAlex Elder * more in-flight requests to the parent image. When the last 2102a2acd00eSAlex Elder * reference is dropped, cleaning them up is safe. 
2103a2acd00eSAlex Elder */ 2104a2acd00eSAlex Elder static void rbd_dev_parent_put(struct rbd_device *rbd_dev) 2105a2acd00eSAlex Elder { 2106a2acd00eSAlex Elder int counter; 2107a2acd00eSAlex Elder 2108a2acd00eSAlex Elder if (!rbd_dev->parent_spec) 2109a2acd00eSAlex Elder return; 2110a2acd00eSAlex Elder 2111a2acd00eSAlex Elder counter = atomic_dec_return_safe(&rbd_dev->parent_ref); 2112a2acd00eSAlex Elder if (counter > 0) 2113a2acd00eSAlex Elder return; 2114a2acd00eSAlex Elder 2115a2acd00eSAlex Elder /* Last reference; clean up parent data structures */ 2116a2acd00eSAlex Elder 2117a2acd00eSAlex Elder if (!counter) 2118a2acd00eSAlex Elder rbd_dev_unparent(rbd_dev); 2119a2acd00eSAlex Elder else 21209584d508SIlya Dryomov rbd_warn(rbd_dev, "parent reference underflow"); 2121a2acd00eSAlex Elder } 2122a2acd00eSAlex Elder 2123a2acd00eSAlex Elder /* 2124a2acd00eSAlex Elder * If an image has a non-zero parent overlap, get a reference to its 2125a2acd00eSAlex Elder * parent. 2126a2acd00eSAlex Elder * 2127a2acd00eSAlex Elder * Returns true if the rbd device has a parent with a non-zero 2128a2acd00eSAlex Elder * overlap and a reference for it was successfully taken, or 2129a2acd00eSAlex Elder * false otherwise. 
2130a2acd00eSAlex Elder */ 2131a2acd00eSAlex Elder static bool rbd_dev_parent_get(struct rbd_device *rbd_dev) 2132a2acd00eSAlex Elder { 2133ae43e9d0SIlya Dryomov int counter = 0; 2134a2acd00eSAlex Elder 2135a2acd00eSAlex Elder if (!rbd_dev->parent_spec) 2136a2acd00eSAlex Elder return false; 2137a2acd00eSAlex Elder 2138ae43e9d0SIlya Dryomov down_read(&rbd_dev->header_rwsem); 2139ae43e9d0SIlya Dryomov if (rbd_dev->parent_overlap) 2140a2acd00eSAlex Elder counter = atomic_inc_return_safe(&rbd_dev->parent_ref); 2141ae43e9d0SIlya Dryomov up_read(&rbd_dev->header_rwsem); 2142a2acd00eSAlex Elder 2143a2acd00eSAlex Elder if (counter < 0) 21449584d508SIlya Dryomov rbd_warn(rbd_dev, "parent reference overflow"); 2145a2acd00eSAlex Elder 2146ae43e9d0SIlya Dryomov return counter > 0; 2147a2acd00eSAlex Elder } 2148a2acd00eSAlex Elder 2149bf0d5f50SAlex Elder /* 2150bf0d5f50SAlex Elder * Caller is responsible for filling in the list of object requests 2151bf0d5f50SAlex Elder * that comprises the image request, and the Linux request pointer 2152bf0d5f50SAlex Elder * (if there is one). 
2153bf0d5f50SAlex Elder */ 2154cc344fa1SAlex Elder static struct rbd_img_request *rbd_img_request_create( 2155cc344fa1SAlex Elder struct rbd_device *rbd_dev, 2156bf0d5f50SAlex Elder u64 offset, u64 length, 21576d2940c8SGuangliang Zhao enum obj_operation_type op_type, 21584e752f0aSJosh Durgin struct ceph_snap_context *snapc) 2159bf0d5f50SAlex Elder { 2160bf0d5f50SAlex Elder struct rbd_img_request *img_request; 2161bf0d5f50SAlex Elder 21627a716aacSIlya Dryomov img_request = kmem_cache_alloc(rbd_img_request_cache, GFP_NOIO); 2163bf0d5f50SAlex Elder if (!img_request) 2164bf0d5f50SAlex Elder return NULL; 2165bf0d5f50SAlex Elder 2166bf0d5f50SAlex Elder img_request->rq = NULL; 2167bf0d5f50SAlex Elder img_request->rbd_dev = rbd_dev; 2168bf0d5f50SAlex Elder img_request->offset = offset; 2169bf0d5f50SAlex Elder img_request->length = length; 21700c425248SAlex Elder img_request->flags = 0; 217190e98c52SGuangliang Zhao if (op_type == OBJ_OP_DISCARD) { 217290e98c52SGuangliang Zhao img_request_discard_set(img_request); 217390e98c52SGuangliang Zhao img_request->snapc = snapc; 217490e98c52SGuangliang Zhao } else if (op_type == OBJ_OP_WRITE) { 21750c425248SAlex Elder img_request_write_set(img_request); 21764e752f0aSJosh Durgin img_request->snapc = snapc; 21770c425248SAlex Elder } else { 2178bf0d5f50SAlex Elder img_request->snap_id = rbd_dev->spec->snap_id; 21790c425248SAlex Elder } 2180a2acd00eSAlex Elder if (rbd_dev_parent_get(rbd_dev)) 2181d0b2e944SAlex Elder img_request_layered_set(img_request); 2182bf0d5f50SAlex Elder spin_lock_init(&img_request->completion_lock); 2183bf0d5f50SAlex Elder img_request->next_completion = 0; 2184bf0d5f50SAlex Elder img_request->callback = NULL; 2185a5a337d4SAlex Elder img_request->result = 0; 2186bf0d5f50SAlex Elder img_request->obj_request_count = 0; 2187bf0d5f50SAlex Elder INIT_LIST_HEAD(&img_request->obj_requests); 2188bf0d5f50SAlex Elder kref_init(&img_request->kref); 2189bf0d5f50SAlex Elder 219037206ee5SAlex Elder dout("%s: rbd_dev %p %s 
%llu/%llu -> img %p\n", __func__, rbd_dev, 21916d2940c8SGuangliang Zhao obj_op_name(op_type), offset, length, img_request); 219237206ee5SAlex Elder 2193bf0d5f50SAlex Elder return img_request; 2194bf0d5f50SAlex Elder } 2195bf0d5f50SAlex Elder 2196bf0d5f50SAlex Elder static void rbd_img_request_destroy(struct kref *kref) 2197bf0d5f50SAlex Elder { 2198bf0d5f50SAlex Elder struct rbd_img_request *img_request; 2199bf0d5f50SAlex Elder struct rbd_obj_request *obj_request; 2200bf0d5f50SAlex Elder struct rbd_obj_request *next_obj_request; 2201bf0d5f50SAlex Elder 2202bf0d5f50SAlex Elder img_request = container_of(kref, struct rbd_img_request, kref); 2203bf0d5f50SAlex Elder 220437206ee5SAlex Elder dout("%s: img %p\n", __func__, img_request); 220537206ee5SAlex Elder 2206bf0d5f50SAlex Elder for_each_obj_request_safe(img_request, obj_request, next_obj_request) 2207bf0d5f50SAlex Elder rbd_img_obj_request_del(img_request, obj_request); 220825dcf954SAlex Elder rbd_assert(img_request->obj_request_count == 0); 2209bf0d5f50SAlex Elder 2210a2acd00eSAlex Elder if (img_request_layered_test(img_request)) { 2211a2acd00eSAlex Elder img_request_layered_clear(img_request); 2212a2acd00eSAlex Elder rbd_dev_parent_put(img_request->rbd_dev); 2213a2acd00eSAlex Elder } 2214a2acd00eSAlex Elder 2215bef95455SJosh Durgin if (img_request_write_test(img_request) || 2216bef95455SJosh Durgin img_request_discard_test(img_request)) 2217812164f8SAlex Elder ceph_put_snap_context(img_request->snapc); 2218bf0d5f50SAlex Elder 22191c2a9dfeSAlex Elder kmem_cache_free(rbd_img_request_cache, img_request); 2220bf0d5f50SAlex Elder } 2221bf0d5f50SAlex Elder 2222e93f3152SAlex Elder static struct rbd_img_request *rbd_parent_request_create( 2223e93f3152SAlex Elder struct rbd_obj_request *obj_request, 2224e93f3152SAlex Elder u64 img_offset, u64 length) 2225e93f3152SAlex Elder { 2226e93f3152SAlex Elder struct rbd_img_request *parent_request; 2227e93f3152SAlex Elder struct rbd_device *rbd_dev; 2228e93f3152SAlex Elder 
2229e93f3152SAlex Elder rbd_assert(obj_request->img_request); 2230e93f3152SAlex Elder rbd_dev = obj_request->img_request->rbd_dev; 2231e93f3152SAlex Elder 22324e752f0aSJosh Durgin parent_request = rbd_img_request_create(rbd_dev->parent, img_offset, 22336d2940c8SGuangliang Zhao length, OBJ_OP_READ, NULL); 2234e93f3152SAlex Elder if (!parent_request) 2235e93f3152SAlex Elder return NULL; 2236e93f3152SAlex Elder 2237e93f3152SAlex Elder img_request_child_set(parent_request); 2238e93f3152SAlex Elder rbd_obj_request_get(obj_request); 2239e93f3152SAlex Elder parent_request->obj_request = obj_request; 2240e93f3152SAlex Elder 2241e93f3152SAlex Elder return parent_request; 2242e93f3152SAlex Elder } 2243e93f3152SAlex Elder 2244e93f3152SAlex Elder static void rbd_parent_request_destroy(struct kref *kref) 2245e93f3152SAlex Elder { 2246e93f3152SAlex Elder struct rbd_img_request *parent_request; 2247e93f3152SAlex Elder struct rbd_obj_request *orig_request; 2248e93f3152SAlex Elder 2249e93f3152SAlex Elder parent_request = container_of(kref, struct rbd_img_request, kref); 2250e93f3152SAlex Elder orig_request = parent_request->obj_request; 2251e93f3152SAlex Elder 2252e93f3152SAlex Elder parent_request->obj_request = NULL; 2253e93f3152SAlex Elder rbd_obj_request_put(orig_request); 2254e93f3152SAlex Elder img_request_child_clear(parent_request); 2255e93f3152SAlex Elder 2256e93f3152SAlex Elder rbd_img_request_destroy(kref); 2257e93f3152SAlex Elder } 2258e93f3152SAlex Elder 22591217857fSAlex Elder static bool rbd_img_obj_end_request(struct rbd_obj_request *obj_request) 22601217857fSAlex Elder { 22616365d33aSAlex Elder struct rbd_img_request *img_request; 22621217857fSAlex Elder unsigned int xferred; 22631217857fSAlex Elder int result; 22648b3e1a56SAlex Elder bool more; 22651217857fSAlex Elder 22666365d33aSAlex Elder rbd_assert(obj_request_img_data_test(obj_request)); 22676365d33aSAlex Elder img_request = obj_request->img_request; 22686365d33aSAlex Elder 22691217857fSAlex Elder 
rbd_assert(obj_request->xferred <= (u64)UINT_MAX); 22701217857fSAlex Elder xferred = (unsigned int)obj_request->xferred; 22711217857fSAlex Elder result = obj_request->result; 22721217857fSAlex Elder if (result) { 22731217857fSAlex Elder struct rbd_device *rbd_dev = img_request->rbd_dev; 22746d2940c8SGuangliang Zhao enum obj_operation_type op_type; 22756d2940c8SGuangliang Zhao 227690e98c52SGuangliang Zhao if (img_request_discard_test(img_request)) 227790e98c52SGuangliang Zhao op_type = OBJ_OP_DISCARD; 227890e98c52SGuangliang Zhao else if (img_request_write_test(img_request)) 227990e98c52SGuangliang Zhao op_type = OBJ_OP_WRITE; 228090e98c52SGuangliang Zhao else 228190e98c52SGuangliang Zhao op_type = OBJ_OP_READ; 22821217857fSAlex Elder 22839584d508SIlya Dryomov rbd_warn(rbd_dev, "%s %llx at %llx (%llx)", 22846d2940c8SGuangliang Zhao obj_op_name(op_type), obj_request->length, 22856d2940c8SGuangliang Zhao obj_request->img_offset, obj_request->offset); 22869584d508SIlya Dryomov rbd_warn(rbd_dev, " result %d xferred %x", 22871217857fSAlex Elder result, xferred); 22881217857fSAlex Elder if (!img_request->result) 22891217857fSAlex Elder img_request->result = result; 2290082a75daSIlya Dryomov /* 2291082a75daSIlya Dryomov * Need to end I/O on the entire obj_request worth of 2292082a75daSIlya Dryomov * bytes in case of error. 
2293082a75daSIlya Dryomov */ 2294082a75daSIlya Dryomov xferred = obj_request->length; 22951217857fSAlex Elder } 22961217857fSAlex Elder 2297f1a4739fSAlex Elder /* Image object requests don't own their page array */ 2298f1a4739fSAlex Elder 2299f1a4739fSAlex Elder if (obj_request->type == OBJ_REQUEST_PAGES) { 2300f1a4739fSAlex Elder obj_request->pages = NULL; 2301f1a4739fSAlex Elder obj_request->page_count = 0; 2302f1a4739fSAlex Elder } 2303f1a4739fSAlex Elder 23048b3e1a56SAlex Elder if (img_request_child_test(img_request)) { 23058b3e1a56SAlex Elder rbd_assert(img_request->obj_request != NULL); 23068b3e1a56SAlex Elder more = obj_request->which < img_request->obj_request_count - 1; 23078b3e1a56SAlex Elder } else { 23088b3e1a56SAlex Elder rbd_assert(img_request->rq != NULL); 23097ad18afaSChristoph Hellwig 23107ad18afaSChristoph Hellwig more = blk_update_request(img_request->rq, result, xferred); 23117ad18afaSChristoph Hellwig if (!more) 23127ad18afaSChristoph Hellwig __blk_mq_end_request(img_request->rq, result); 23138b3e1a56SAlex Elder } 23148b3e1a56SAlex Elder 23158b3e1a56SAlex Elder return more; 23161217857fSAlex Elder } 23171217857fSAlex Elder 23182169238dSAlex Elder static void rbd_img_obj_callback(struct rbd_obj_request *obj_request) 23192169238dSAlex Elder { 23202169238dSAlex Elder struct rbd_img_request *img_request; 23212169238dSAlex Elder u32 which = obj_request->which; 23222169238dSAlex Elder bool more = true; 23232169238dSAlex Elder 23246365d33aSAlex Elder rbd_assert(obj_request_img_data_test(obj_request)); 23252169238dSAlex Elder img_request = obj_request->img_request; 23262169238dSAlex Elder 23272169238dSAlex Elder dout("%s: img %p obj %p\n", __func__, img_request, obj_request); 23282169238dSAlex Elder rbd_assert(img_request != NULL); 23292169238dSAlex Elder rbd_assert(img_request->obj_request_count > 0); 23302169238dSAlex Elder rbd_assert(which != BAD_WHICH); 23312169238dSAlex Elder rbd_assert(which < img_request->obj_request_count); 23322169238dSAlex 
Elder 23332169238dSAlex Elder spin_lock_irq(&img_request->completion_lock); 23342169238dSAlex Elder if (which != img_request->next_completion) 23352169238dSAlex Elder goto out; 23362169238dSAlex Elder 23372169238dSAlex Elder for_each_obj_request_from(img_request, obj_request) { 23382169238dSAlex Elder rbd_assert(more); 23392169238dSAlex Elder rbd_assert(which < img_request->obj_request_count); 23402169238dSAlex Elder 23412169238dSAlex Elder if (!obj_request_done_test(obj_request)) 23422169238dSAlex Elder break; 23431217857fSAlex Elder more = rbd_img_obj_end_request(obj_request); 23442169238dSAlex Elder which++; 23452169238dSAlex Elder } 23462169238dSAlex Elder 23472169238dSAlex Elder rbd_assert(more ^ (which == img_request->obj_request_count)); 23482169238dSAlex Elder img_request->next_completion = which; 23492169238dSAlex Elder out: 23502169238dSAlex Elder spin_unlock_irq(&img_request->completion_lock); 23510f2d5be7SAlex Elder rbd_img_request_put(img_request); 23522169238dSAlex Elder 23532169238dSAlex Elder if (!more) 23542169238dSAlex Elder rbd_img_request_complete(img_request); 23552169238dSAlex Elder } 23562169238dSAlex Elder 2357f1a4739fSAlex Elder /* 23583b434a2aSJosh Durgin * Add individual osd ops to the given ceph_osd_request and prepare 23593b434a2aSJosh Durgin * them for submission. num_ops is the current number of 23603b434a2aSJosh Durgin * osd operations already to the object request. 
23613b434a2aSJosh Durgin */ 23623b434a2aSJosh Durgin static void rbd_img_obj_request_fill(struct rbd_obj_request *obj_request, 23633b434a2aSJosh Durgin struct ceph_osd_request *osd_request, 23643b434a2aSJosh Durgin enum obj_operation_type op_type, 23653b434a2aSJosh Durgin unsigned int num_ops) 23663b434a2aSJosh Durgin { 23673b434a2aSJosh Durgin struct rbd_img_request *img_request = obj_request->img_request; 23683b434a2aSJosh Durgin struct rbd_device *rbd_dev = img_request->rbd_dev; 23693b434a2aSJosh Durgin u64 object_size = rbd_obj_bytes(&rbd_dev->header); 23703b434a2aSJosh Durgin u64 offset = obj_request->offset; 23713b434a2aSJosh Durgin u64 length = obj_request->length; 23723b434a2aSJosh Durgin u64 img_end; 23733b434a2aSJosh Durgin u16 opcode; 23743b434a2aSJosh Durgin 23753b434a2aSJosh Durgin if (op_type == OBJ_OP_DISCARD) { 2376d3246fb0SJosh Durgin if (!offset && length == object_size && 2377d3246fb0SJosh Durgin (!img_request_layered_test(img_request) || 2378d3246fb0SJosh Durgin !obj_request_overlaps_parent(obj_request))) { 23793b434a2aSJosh Durgin opcode = CEPH_OSD_OP_DELETE; 23803b434a2aSJosh Durgin } else if ((offset + length == object_size)) { 23813b434a2aSJosh Durgin opcode = CEPH_OSD_OP_TRUNCATE; 23823b434a2aSJosh Durgin } else { 23833b434a2aSJosh Durgin down_read(&rbd_dev->header_rwsem); 23843b434a2aSJosh Durgin img_end = rbd_dev->header.image_size; 23853b434a2aSJosh Durgin up_read(&rbd_dev->header_rwsem); 23863b434a2aSJosh Durgin 23873b434a2aSJosh Durgin if (obj_request->img_offset + length == img_end) 23883b434a2aSJosh Durgin opcode = CEPH_OSD_OP_TRUNCATE; 23893b434a2aSJosh Durgin else 23903b434a2aSJosh Durgin opcode = CEPH_OSD_OP_ZERO; 23913b434a2aSJosh Durgin } 23923b434a2aSJosh Durgin } else if (op_type == OBJ_OP_WRITE) { 2393e30b7577SIlya Dryomov if (!offset && length == object_size) 2394e30b7577SIlya Dryomov opcode = CEPH_OSD_OP_WRITEFULL; 2395e30b7577SIlya Dryomov else 23963b434a2aSJosh Durgin opcode = CEPH_OSD_OP_WRITE; 23973b434a2aSJosh Durgin 
osd_req_op_alloc_hint_init(osd_request, num_ops, 23983b434a2aSJosh Durgin object_size, object_size); 23993b434a2aSJosh Durgin num_ops++; 24003b434a2aSJosh Durgin } else { 24013b434a2aSJosh Durgin opcode = CEPH_OSD_OP_READ; 24023b434a2aSJosh Durgin } 24033b434a2aSJosh Durgin 24047e868b6eSIlya Dryomov if (opcode == CEPH_OSD_OP_DELETE) 2405144cba14SYan, Zheng osd_req_op_init(osd_request, num_ops, opcode, 0); 24067e868b6eSIlya Dryomov else 24077e868b6eSIlya Dryomov osd_req_op_extent_init(osd_request, num_ops, opcode, 24087e868b6eSIlya Dryomov offset, length, 0, 0); 24097e868b6eSIlya Dryomov 24103b434a2aSJosh Durgin if (obj_request->type == OBJ_REQUEST_BIO) 24113b434a2aSJosh Durgin osd_req_op_extent_osd_data_bio(osd_request, num_ops, 24123b434a2aSJosh Durgin obj_request->bio_list, length); 24133b434a2aSJosh Durgin else if (obj_request->type == OBJ_REQUEST_PAGES) 24143b434a2aSJosh Durgin osd_req_op_extent_osd_data_pages(osd_request, num_ops, 24153b434a2aSJosh Durgin obj_request->pages, length, 24163b434a2aSJosh Durgin offset & ~PAGE_MASK, false, false); 24173b434a2aSJosh Durgin 24183b434a2aSJosh Durgin /* Discards are also writes */ 24193b434a2aSJosh Durgin if (op_type == OBJ_OP_WRITE || op_type == OBJ_OP_DISCARD) 24203b434a2aSJosh Durgin rbd_osd_req_format_write(obj_request); 24213b434a2aSJosh Durgin else 24223b434a2aSJosh Durgin rbd_osd_req_format_read(obj_request); 24233b434a2aSJosh Durgin } 24243b434a2aSJosh Durgin 24253b434a2aSJosh Durgin /* 2426f1a4739fSAlex Elder * Split up an image request into one or more object requests, each 2427f1a4739fSAlex Elder * to a different object. The "type" parameter indicates whether 2428f1a4739fSAlex Elder * "data_desc" is the pointer to the head of a list of bio 2429f1a4739fSAlex Elder * structures, or the base of a page array. In either case this 2430f1a4739fSAlex Elder * function assumes data_desc describes memory sufficient to hold 2431f1a4739fSAlex Elder * all data described by the image request. 
2432f1a4739fSAlex Elder */ 2433f1a4739fSAlex Elder static int rbd_img_request_fill(struct rbd_img_request *img_request, 2434f1a4739fSAlex Elder enum obj_request_type type, 2435f1a4739fSAlex Elder void *data_desc) 2436bf0d5f50SAlex Elder { 2437bf0d5f50SAlex Elder struct rbd_device *rbd_dev = img_request->rbd_dev; 2438bf0d5f50SAlex Elder struct rbd_obj_request *obj_request = NULL; 2439bf0d5f50SAlex Elder struct rbd_obj_request *next_obj_request; 2440a158073cSJingoo Han struct bio *bio_list = NULL; 2441f1a4739fSAlex Elder unsigned int bio_offset = 0; 2442a158073cSJingoo Han struct page **pages = NULL; 24436d2940c8SGuangliang Zhao enum obj_operation_type op_type; 24447da22d29SAlex Elder u64 img_offset; 2445bf0d5f50SAlex Elder u64 resid; 2446bf0d5f50SAlex Elder 2447f1a4739fSAlex Elder dout("%s: img %p type %d data_desc %p\n", __func__, img_request, 2448f1a4739fSAlex Elder (int)type, data_desc); 244937206ee5SAlex Elder 24507da22d29SAlex Elder img_offset = img_request->offset; 2451bf0d5f50SAlex Elder resid = img_request->length; 24524dda41d3SAlex Elder rbd_assert(resid > 0); 24533b434a2aSJosh Durgin op_type = rbd_img_request_op_type(img_request); 2454f1a4739fSAlex Elder 2455f1a4739fSAlex Elder if (type == OBJ_REQUEST_BIO) { 2456f1a4739fSAlex Elder bio_list = data_desc; 24574f024f37SKent Overstreet rbd_assert(img_offset == 24584f024f37SKent Overstreet bio_list->bi_iter.bi_sector << SECTOR_SHIFT); 245990e98c52SGuangliang Zhao } else if (type == OBJ_REQUEST_PAGES) { 2460f1a4739fSAlex Elder pages = data_desc; 2461f1a4739fSAlex Elder } 2462f1a4739fSAlex Elder 2463bf0d5f50SAlex Elder while (resid) { 24642fa12320SAlex Elder struct ceph_osd_request *osd_req; 2465bf0d5f50SAlex Elder const char *object_name; 2466bf0d5f50SAlex Elder u64 offset; 2467bf0d5f50SAlex Elder u64 length; 2468bf0d5f50SAlex Elder 24697da22d29SAlex Elder object_name = rbd_segment_name(rbd_dev, img_offset); 2470bf0d5f50SAlex Elder if (!object_name) 2471bf0d5f50SAlex Elder goto out_unwind; 24727da22d29SAlex 
Elder offset = rbd_segment_offset(rbd_dev, img_offset); 24737da22d29SAlex Elder length = rbd_segment_length(rbd_dev, img_offset, resid); 2474bf0d5f50SAlex Elder obj_request = rbd_obj_request_create(object_name, 2475f1a4739fSAlex Elder offset, length, type); 247678c2a44aSAlex Elder /* object request has its own copy of the object name */ 247778c2a44aSAlex Elder rbd_segment_name_free(object_name); 2478bf0d5f50SAlex Elder if (!obj_request) 2479bf0d5f50SAlex Elder goto out_unwind; 248062054da6SIlya Dryomov 248103507db6SJosh Durgin /* 248203507db6SJosh Durgin * set obj_request->img_request before creating the 248303507db6SJosh Durgin * osd_request so that it gets the right snapc 248403507db6SJosh Durgin */ 248503507db6SJosh Durgin rbd_img_obj_request_add(img_request, obj_request); 2486bf0d5f50SAlex Elder 2487f1a4739fSAlex Elder if (type == OBJ_REQUEST_BIO) { 2488f1a4739fSAlex Elder unsigned int clone_size; 2489f1a4739fSAlex Elder 2490bf0d5f50SAlex Elder rbd_assert(length <= (u64)UINT_MAX); 2491bf0d5f50SAlex Elder clone_size = (unsigned int)length; 2492f1a4739fSAlex Elder obj_request->bio_list = 2493f1a4739fSAlex Elder bio_chain_clone_range(&bio_list, 2494f1a4739fSAlex Elder &bio_offset, 2495f1a4739fSAlex Elder clone_size, 24962224d879SDavid Disseldorp GFP_NOIO); 2497bf0d5f50SAlex Elder if (!obj_request->bio_list) 249862054da6SIlya Dryomov goto out_unwind; 249990e98c52SGuangliang Zhao } else if (type == OBJ_REQUEST_PAGES) { 2500f1a4739fSAlex Elder unsigned int page_count; 2501f1a4739fSAlex Elder 2502f1a4739fSAlex Elder obj_request->pages = pages; 2503f1a4739fSAlex Elder page_count = (u32)calc_pages_for(offset, length); 2504f1a4739fSAlex Elder obj_request->page_count = page_count; 2505f1a4739fSAlex Elder if ((offset + length) & ~PAGE_MASK) 2506f1a4739fSAlex Elder page_count--; /* more on last page */ 2507f1a4739fSAlex Elder pages += page_count; 2508f1a4739fSAlex Elder } 2509bf0d5f50SAlex Elder 25106d2940c8SGuangliang Zhao osd_req = rbd_osd_req_create(rbd_dev, op_type, 
25116d2940c8SGuangliang Zhao (op_type == OBJ_OP_WRITE) ? 2 : 1, 25122fa12320SAlex Elder obj_request); 25132fa12320SAlex Elder if (!osd_req) 251462054da6SIlya Dryomov goto out_unwind; 25153b434a2aSJosh Durgin 25162fa12320SAlex Elder obj_request->osd_req = osd_req; 25172169238dSAlex Elder obj_request->callback = rbd_img_obj_callback; 25187da22d29SAlex Elder obj_request->img_offset = img_offset; 2519bf0d5f50SAlex Elder 25203b434a2aSJosh Durgin rbd_img_obj_request_fill(obj_request, osd_req, op_type, 0); 25213b434a2aSJosh Durgin 25223b434a2aSJosh Durgin rbd_img_request_get(img_request); 25233b434a2aSJosh Durgin 25247da22d29SAlex Elder img_offset += length; 2525bf0d5f50SAlex Elder resid -= length; 2526bf0d5f50SAlex Elder } 2527bf0d5f50SAlex Elder 2528bf0d5f50SAlex Elder return 0; 2529bf0d5f50SAlex Elder 2530bf0d5f50SAlex Elder out_unwind: 2531bf0d5f50SAlex Elder for_each_obj_request_safe(img_request, obj_request, next_obj_request) 253242dd037cSIlya Dryomov rbd_img_obj_request_del(img_request, obj_request); 2533bf0d5f50SAlex Elder 2534bf0d5f50SAlex Elder return -ENOMEM; 2535bf0d5f50SAlex Elder } 2536bf0d5f50SAlex Elder 25373d7efd18SAlex Elder static void 25382761713dSIlya Dryomov rbd_osd_copyup_callback(struct rbd_obj_request *obj_request) 25390eefd470SAlex Elder { 25400eefd470SAlex Elder struct rbd_img_request *img_request; 25410eefd470SAlex Elder struct rbd_device *rbd_dev; 2542ebda6408SAlex Elder struct page **pages; 25430eefd470SAlex Elder u32 page_count; 25440eefd470SAlex Elder 25452761713dSIlya Dryomov dout("%s: obj %p\n", __func__, obj_request); 25462761713dSIlya Dryomov 2547d3246fb0SJosh Durgin rbd_assert(obj_request->type == OBJ_REQUEST_BIO || 2548d3246fb0SJosh Durgin obj_request->type == OBJ_REQUEST_NODATA); 25490eefd470SAlex Elder rbd_assert(obj_request_img_data_test(obj_request)); 25500eefd470SAlex Elder img_request = obj_request->img_request; 25510eefd470SAlex Elder rbd_assert(img_request); 25520eefd470SAlex Elder 25530eefd470SAlex Elder rbd_dev = 
img_request->rbd_dev; 25540eefd470SAlex Elder rbd_assert(rbd_dev); 25550eefd470SAlex Elder 2556ebda6408SAlex Elder pages = obj_request->copyup_pages; 2557ebda6408SAlex Elder rbd_assert(pages != NULL); 25580eefd470SAlex Elder obj_request->copyup_pages = NULL; 2559ebda6408SAlex Elder page_count = obj_request->copyup_page_count; 2560ebda6408SAlex Elder rbd_assert(page_count); 2561ebda6408SAlex Elder obj_request->copyup_page_count = 0; 2562ebda6408SAlex Elder ceph_release_page_vector(pages, page_count); 25630eefd470SAlex Elder 25640eefd470SAlex Elder /* 25650eefd470SAlex Elder * We want the transfer count to reflect the size of the 25660eefd470SAlex Elder * original write request. There is no such thing as a 25670eefd470SAlex Elder * successful short write, so if the request was successful 25680eefd470SAlex Elder * we can just set it to the originally-requested length. 25690eefd470SAlex Elder */ 25700eefd470SAlex Elder if (!obj_request->result) 25710eefd470SAlex Elder obj_request->xferred = obj_request->length; 25720eefd470SAlex Elder 25732761713dSIlya Dryomov obj_request_done_set(obj_request); 25740eefd470SAlex Elder } 25750eefd470SAlex Elder 25760eefd470SAlex Elder static void 25773d7efd18SAlex Elder rbd_img_obj_parent_read_full_callback(struct rbd_img_request *img_request) 25783d7efd18SAlex Elder { 25793d7efd18SAlex Elder struct rbd_obj_request *orig_request; 25800eefd470SAlex Elder struct ceph_osd_request *osd_req; 25810eefd470SAlex Elder struct ceph_osd_client *osdc; 25820eefd470SAlex Elder struct rbd_device *rbd_dev; 25833d7efd18SAlex Elder struct page **pages; 2584d3246fb0SJosh Durgin enum obj_operation_type op_type; 2585ebda6408SAlex Elder u32 page_count; 2586bbea1c1aSAlex Elder int img_result; 2587ebda6408SAlex Elder u64 parent_length; 25883d7efd18SAlex Elder 25893d7efd18SAlex Elder rbd_assert(img_request_child_test(img_request)); 25903d7efd18SAlex Elder 25913d7efd18SAlex Elder /* First get what we need from the image request */ 25923d7efd18SAlex Elder 
25933d7efd18SAlex Elder pages = img_request->copyup_pages; 25943d7efd18SAlex Elder rbd_assert(pages != NULL); 25953d7efd18SAlex Elder img_request->copyup_pages = NULL; 2596ebda6408SAlex Elder page_count = img_request->copyup_page_count; 2597ebda6408SAlex Elder rbd_assert(page_count); 2598ebda6408SAlex Elder img_request->copyup_page_count = 0; 25993d7efd18SAlex Elder 26003d7efd18SAlex Elder orig_request = img_request->obj_request; 26013d7efd18SAlex Elder rbd_assert(orig_request != NULL); 2602b91f09f1SAlex Elder rbd_assert(obj_request_type_valid(orig_request->type)); 2603bbea1c1aSAlex Elder img_result = img_request->result; 2604ebda6408SAlex Elder parent_length = img_request->length; 2605ebda6408SAlex Elder rbd_assert(parent_length == img_request->xferred); 26063d7efd18SAlex Elder rbd_img_request_put(img_request); 26073d7efd18SAlex Elder 260891c6febbSAlex Elder rbd_assert(orig_request->img_request); 260991c6febbSAlex Elder rbd_dev = orig_request->img_request->rbd_dev; 26103d7efd18SAlex Elder rbd_assert(rbd_dev); 26113d7efd18SAlex Elder 2612bbea1c1aSAlex Elder /* 2613bbea1c1aSAlex Elder * If the overlap has become 0 (most likely because the 2614bbea1c1aSAlex Elder * image has been flattened) we need to free the pages 2615bbea1c1aSAlex Elder * and re-submit the original write request. 
2616bbea1c1aSAlex Elder */ 2617bbea1c1aSAlex Elder if (!rbd_dev->parent_overlap) { /* NOTE(review): the declaration below shadows the function-scope osdc declared with the other locals */ 2618bbea1c1aSAlex Elder struct ceph_osd_client *osdc; 2619bbea1c1aSAlex Elder 2620bbea1c1aSAlex Elder ceph_release_page_vector(pages, page_count); 2621bbea1c1aSAlex Elder osdc = &rbd_dev->rbd_client->client->osdc; 2622bbea1c1aSAlex Elder img_result = rbd_obj_request_submit(osdc, orig_request); 2623bbea1c1aSAlex Elder if (!img_result) 2624bbea1c1aSAlex Elder return; 2625bbea1c1aSAlex Elder } 2626bbea1c1aSAlex Elder /* NOTE(review): when the parent read itself failed (overlap still non-zero), pages is still held by this function and the out_err path does not call ceph_release_page_vector() -- this appears to leak the page vector; the same applies to the allocation-failure goto below. Confirm. */ 2627bbea1c1aSAlex Elder if (img_result) 26280eefd470SAlex Elder goto out_err; 26293d7efd18SAlex Elder 26308785b1d4SAlex Elder /* 26318785b1d4SAlex Elder * The original osd request is of no use to us any more. 26320ccd5926SIlya Dryomov * We need a new one that can hold the three ops in a copyup 26338785b1d4SAlex Elder * request. Allocate the new copyup osd request for the 26348785b1d4SAlex Elder * original request, and release the old one. 26358785b1d4SAlex Elder */ 2636bbea1c1aSAlex Elder img_result = -ENOMEM; 26370eefd470SAlex Elder osd_req = rbd_osd_req_create_copyup(orig_request); 26380eefd470SAlex Elder if (!osd_req) 26390eefd470SAlex Elder goto out_err; 26408785b1d4SAlex Elder rbd_osd_req_destroy(orig_request->osd_req); 26410eefd470SAlex Elder orig_request->osd_req = osd_req; /* From here on the page vector is tracked by orig_request */ 26420eefd470SAlex Elder orig_request->copyup_pages = pages; 2643ebda6408SAlex Elder orig_request->copyup_page_count = page_count; 26443d7efd18SAlex Elder 26450eefd470SAlex Elder /* Initialize the copyup op */ 26460eefd470SAlex Elder 26470eefd470SAlex Elder osd_req_op_cls_init(osd_req, 0, CEPH_OSD_OP_CALL, "rbd", "copyup"); 2648ebda6408SAlex Elder osd_req_op_cls_request_data_pages(osd_req, 0, pages, parent_length, 0, 26490eefd470SAlex Elder false, false); 26500eefd470SAlex Elder 2651d3246fb0SJosh Durgin /* Add the other op(s) */ 26520ccd5926SIlya Dryomov 2653d3246fb0SJosh Durgin op_type = rbd_img_request_op_type(orig_request->img_request); 2654d3246fb0SJosh Durgin
rbd_img_obj_request_fill(orig_request, osd_req, op_type, 1); 26550eefd470SAlex Elder 26560eefd470SAlex Elder /* All set, send it off. */ 26570eefd470SAlex Elder 26580eefd470SAlex Elder osdc = &rbd_dev->rbd_client->client->osdc; 2659bbea1c1aSAlex Elder img_result = rbd_obj_request_submit(osdc, orig_request); 2660bbea1c1aSAlex Elder if (!img_result) 26610eefd470SAlex Elder return; 26620eefd470SAlex Elder out_err: 26630eefd470SAlex Elder /* Record the error code and complete the request */ 26640eefd470SAlex Elder 2665bbea1c1aSAlex Elder orig_request->result = img_result; 26660eefd470SAlex Elder orig_request->xferred = 0; 26673d7efd18SAlex Elder obj_request_done_set(orig_request); 26683d7efd18SAlex Elder rbd_obj_request_complete(orig_request); 26693d7efd18SAlex Elder } 26703d7efd18SAlex Elder 26713d7efd18SAlex Elder /* 26723d7efd18SAlex Elder * Read from the parent image the range of data that covers the 26733d7efd18SAlex Elder * entire target of the given object request. This is used for 26743d7efd18SAlex Elder * satisfying a layered image write request when the target of an 26753d7efd18SAlex Elder * object request from the image request does not exist. 26763d7efd18SAlex Elder * 26773d7efd18SAlex Elder * A page array big enough to hold the returned data is allocated 26783d7efd18SAlex Elder * and supplied to rbd_img_request_fill() as the "data descriptor." 26793d7efd18SAlex Elder * When the read completes, this page array will be transferred to 26803d7efd18SAlex Elder * the original object request for the copyup operation. 26813d7efd18SAlex Elder * 26823d7efd18SAlex Elder * If an error occurs, record it as the result of the original 26833d7efd18SAlex Elder * object request and mark it done so it gets completed. 
26843d7efd18SAlex Elder */ 26853d7efd18SAlex Elder static int rbd_img_obj_parent_read_full(struct rbd_obj_request *obj_request) 26863d7efd18SAlex Elder { 26873d7efd18SAlex Elder struct rbd_img_request *img_request = NULL; 26883d7efd18SAlex Elder struct rbd_img_request *parent_request = NULL; 26893d7efd18SAlex Elder struct rbd_device *rbd_dev; 26903d7efd18SAlex Elder u64 img_offset; 26913d7efd18SAlex Elder u64 length; 26923d7efd18SAlex Elder struct page **pages = NULL; 26933d7efd18SAlex Elder u32 page_count; 26943d7efd18SAlex Elder int result; 26953d7efd18SAlex Elder 26963d7efd18SAlex Elder rbd_assert(obj_request_img_data_test(obj_request)); 2697b91f09f1SAlex Elder rbd_assert(obj_request_type_valid(obj_request->type)); 26983d7efd18SAlex Elder 26993d7efd18SAlex Elder img_request = obj_request->img_request; 27003d7efd18SAlex Elder rbd_assert(img_request != NULL); 27013d7efd18SAlex Elder rbd_dev = img_request->rbd_dev; 27023d7efd18SAlex Elder rbd_assert(rbd_dev->parent != NULL); 27033d7efd18SAlex Elder 27043d7efd18SAlex Elder /* 27053d7efd18SAlex Elder * Determine the byte range covered by the object in the 27063d7efd18SAlex Elder * child image to which the original request was to be sent. 27073d7efd18SAlex Elder */ 27083d7efd18SAlex Elder img_offset = obj_request->img_offset - obj_request->offset; 27093d7efd18SAlex Elder length = (u64)1 << rbd_dev->header.obj_order; 27103d7efd18SAlex Elder 27113d7efd18SAlex Elder /* 2712a9e8ba2cSAlex Elder * There is no defined parent data beyond the parent 2713a9e8ba2cSAlex Elder * overlap, so limit what we read at that boundary if 2714a9e8ba2cSAlex Elder * necessary. 
2715a9e8ba2cSAlex Elder */ 2716a9e8ba2cSAlex Elder if (img_offset + length > rbd_dev->parent_overlap) { 2717a9e8ba2cSAlex Elder rbd_assert(img_offset < rbd_dev->parent_overlap); 2718a9e8ba2cSAlex Elder length = rbd_dev->parent_overlap - img_offset; 2719a9e8ba2cSAlex Elder } 2720a9e8ba2cSAlex Elder 2721a9e8ba2cSAlex Elder /* 27223d7efd18SAlex Elder * Allocate a page array big enough to receive the data read 27233d7efd18SAlex Elder * from the parent. 27243d7efd18SAlex Elder */ 27253d7efd18SAlex Elder page_count = (u32)calc_pages_for(0, length); 27263d7efd18SAlex Elder pages = ceph_alloc_page_vector(page_count, GFP_KERNEL); 27273d7efd18SAlex Elder if (IS_ERR(pages)) { 27283d7efd18SAlex Elder result = PTR_ERR(pages); 27293d7efd18SAlex Elder pages = NULL; 27303d7efd18SAlex Elder goto out_err; 27313d7efd18SAlex Elder } 27323d7efd18SAlex Elder 27333d7efd18SAlex Elder result = -ENOMEM; 2734e93f3152SAlex Elder parent_request = rbd_parent_request_create(obj_request, 2735e93f3152SAlex Elder img_offset, length); 27363d7efd18SAlex Elder if (!parent_request) 27373d7efd18SAlex Elder goto out_err; 27383d7efd18SAlex Elder 27393d7efd18SAlex Elder result = rbd_img_request_fill(parent_request, OBJ_REQUEST_PAGES, pages); 27403d7efd18SAlex Elder if (result) 27413d7efd18SAlex Elder goto out_err; 27423d7efd18SAlex Elder parent_request->copyup_pages = pages; 2743ebda6408SAlex Elder parent_request->copyup_page_count = page_count; 27443d7efd18SAlex Elder 27453d7efd18SAlex Elder parent_request->callback = rbd_img_obj_parent_read_full_callback; 27463d7efd18SAlex Elder result = rbd_img_request_submit(parent_request); 27473d7efd18SAlex Elder if (!result) 27483d7efd18SAlex Elder return 0; 27493d7efd18SAlex Elder 27503d7efd18SAlex Elder parent_request->copyup_pages = NULL; 2751ebda6408SAlex Elder parent_request->copyup_page_count = 0; 27523d7efd18SAlex Elder parent_request->obj_request = NULL; 27533d7efd18SAlex Elder rbd_obj_request_put(obj_request); 27543d7efd18SAlex Elder out_err: 
27553d7efd18SAlex Elder if (pages) 27563d7efd18SAlex Elder ceph_release_page_vector(pages, page_count); 27573d7efd18SAlex Elder if (parent_request) 27583d7efd18SAlex Elder rbd_img_request_put(parent_request); 27593d7efd18SAlex Elder obj_request->result = result; 27603d7efd18SAlex Elder obj_request->xferred = 0; 27613d7efd18SAlex Elder obj_request_done_set(obj_request); 27623d7efd18SAlex Elder 27633d7efd18SAlex Elder return result; 27643d7efd18SAlex Elder } 27653d7efd18SAlex Elder 2766c5b5ef6cSAlex Elder static void rbd_img_obj_exists_callback(struct rbd_obj_request *obj_request) 2767c5b5ef6cSAlex Elder { 2768c5b5ef6cSAlex Elder struct rbd_obj_request *orig_request; 2769638f5abeSAlex Elder struct rbd_device *rbd_dev; 2770c5b5ef6cSAlex Elder int result; 2771c5b5ef6cSAlex Elder 2772c5b5ef6cSAlex Elder rbd_assert(!obj_request_img_data_test(obj_request)); 2773c5b5ef6cSAlex Elder 2774c5b5ef6cSAlex Elder /* 2775c5b5ef6cSAlex Elder * All we need from the object request is the original 2776c5b5ef6cSAlex Elder * request and the result of the STAT op. Grab those, then 2777c5b5ef6cSAlex Elder * we're done with the request. 
2778c5b5ef6cSAlex Elder */ 2779c5b5ef6cSAlex Elder orig_request = obj_request->obj_request; 2780c5b5ef6cSAlex Elder obj_request->obj_request = NULL; 2781912c317dSAlex Elder rbd_obj_request_put(orig_request); /* NOTE(review): this NULL check runs only after orig_request has already been passed to rbd_obj_request_put() above */ 2782c5b5ef6cSAlex Elder rbd_assert(orig_request); 2783c5b5ef6cSAlex Elder rbd_assert(orig_request->img_request); 2784c5b5ef6cSAlex Elder 2785c5b5ef6cSAlex Elder result = obj_request->result; 2786c5b5ef6cSAlex Elder obj_request->result = 0; 2787c5b5ef6cSAlex Elder 2788c5b5ef6cSAlex Elder dout("%s: obj %p for obj %p result %d %llu/%llu\n", __func__, 2789c5b5ef6cSAlex Elder obj_request, orig_request, result, 2790c5b5ef6cSAlex Elder obj_request->xferred, obj_request->length); 2791c5b5ef6cSAlex Elder rbd_obj_request_put(obj_request); 2792c5b5ef6cSAlex Elder 2793638f5abeSAlex Elder /* 2794638f5abeSAlex Elder * If the overlap has become 0 (most likely because the 2795638f5abeSAlex Elder * image has been flattened) we need to re-submit the 2796638f5abeSAlex Elder * original write request. 2797638f5abeSAlex Elder */ 2798638f5abeSAlex Elder rbd_dev = orig_request->img_request->rbd_dev; 2799638f5abeSAlex Elder if (!rbd_dev->parent_overlap) { 2800638f5abeSAlex Elder struct ceph_osd_client *osdc; 2801638f5abeSAlex Elder 2802638f5abeSAlex Elder osdc = &rbd_dev->rbd_client->client->osdc; 2803638f5abeSAlex Elder result = rbd_obj_request_submit(osdc, orig_request); 2804638f5abeSAlex Elder if (!result) 2805638f5abeSAlex Elder return; /* NOTE(review): if the submit failed, its error code replaces the STAT result and falls through to the existence checks that follow */ 2806638f5abeSAlex Elder } 2807c5b5ef6cSAlex Elder 2808c5b5ef6cSAlex Elder /* 2809c5b5ef6cSAlex Elder * Our only purpose here is to determine whether the object 2810c5b5ef6cSAlex Elder * exists, and we don't want to treat the non-existence as 2811c5b5ef6cSAlex Elder * an error. If something else comes back, transfer the 2812c5b5ef6cSAlex Elder * error to the original request and complete it now.
2813c5b5ef6cSAlex Elder */ 2814c5b5ef6cSAlex Elder if (!result) { 2815c5b5ef6cSAlex Elder obj_request_existence_set(orig_request, true); 2816c5b5ef6cSAlex Elder } else if (result == -ENOENT) { 2817c5b5ef6cSAlex Elder obj_request_existence_set(orig_request, false); 2818c5b5ef6cSAlex Elder } else if (result) { 2819c5b5ef6cSAlex Elder orig_request->result = result; 28203d7efd18SAlex Elder goto out; 2821c5b5ef6cSAlex Elder } 2822c5b5ef6cSAlex Elder 2823c5b5ef6cSAlex Elder /* 2824c5b5ef6cSAlex Elder * Resubmit the original request now that we have recorded 2825c5b5ef6cSAlex Elder * whether the target object exists. 2826c5b5ef6cSAlex Elder */ 2827b454e36dSAlex Elder orig_request->result = rbd_img_obj_request_submit(orig_request); 28283d7efd18SAlex Elder out: 2829c5b5ef6cSAlex Elder if (orig_request->result) 2830c5b5ef6cSAlex Elder rbd_obj_request_complete(orig_request); 2831c5b5ef6cSAlex Elder } 2832c5b5ef6cSAlex Elder /* Submit a STAT request for the object named by obj_request, with rbd_img_obj_exists_callback() installed as its callback. Returns the result of rbd_obj_request_submit(), or a negative error code if setting up the request fails. */ 2833c5b5ef6cSAlex Elder static int rbd_img_obj_exists_submit(struct rbd_obj_request *obj_request) 2834c5b5ef6cSAlex Elder { 2835c5b5ef6cSAlex Elder struct rbd_obj_request *stat_request; 2836c5b5ef6cSAlex Elder struct rbd_device *rbd_dev; 2837c5b5ef6cSAlex Elder struct ceph_osd_client *osdc; 2838c5b5ef6cSAlex Elder struct page **pages = NULL; 2839c5b5ef6cSAlex Elder u32 page_count; 2840c5b5ef6cSAlex Elder size_t size; 2841c5b5ef6cSAlex Elder int ret; 2842c5b5ef6cSAlex Elder 2843c5b5ef6cSAlex Elder /* 2844c5b5ef6cSAlex Elder * The response data for a STAT call consists of: 2845c5b5ef6cSAlex Elder * le64 length; 2846c5b5ef6cSAlex Elder * struct { 2847c5b5ef6cSAlex Elder * le32 tv_sec; 2848c5b5ef6cSAlex Elder * le32 tv_nsec; 2849c5b5ef6cSAlex Elder * } mtime; 2850c5b5ef6cSAlex Elder */ 2851c5b5ef6cSAlex Elder size = sizeof (__le64) + sizeof (__le32) + sizeof (__le32); 2852c5b5ef6cSAlex Elder page_count = (u32)calc_pages_for(0, size); 2853c5b5ef6cSAlex Elder pages = ceph_alloc_page_vector(page_count, GFP_KERNEL); 2854c5b5ef6cSAlex Elder if (IS_ERR(pages))
2855c5b5ef6cSAlex Elder return PTR_ERR(pages); 2856c5b5ef6cSAlex Elder 2857c5b5ef6cSAlex Elder ret = -ENOMEM; 2858c5b5ef6cSAlex Elder stat_request = rbd_obj_request_create(obj_request->object_name, 0, 0, 2859c5b5ef6cSAlex Elder OBJ_REQUEST_PAGES); /* NOTE(review): taking the out path from here puts obj_request although rbd_obj_request_get() has not been called yet, and the page vector allocated above is not released */ 2860c5b5ef6cSAlex Elder if (!stat_request) 2861c5b5ef6cSAlex Elder goto out; 2862c5b5ef6cSAlex Elder /* The stat request holds a reference on the original request and takes over the page vector */ 2863c5b5ef6cSAlex Elder rbd_obj_request_get(obj_request); 2864c5b5ef6cSAlex Elder stat_request->obj_request = obj_request; 2865c5b5ef6cSAlex Elder stat_request->pages = pages; 2866c5b5ef6cSAlex Elder stat_request->page_count = page_count; 2867c5b5ef6cSAlex Elder 2868c5b5ef6cSAlex Elder rbd_assert(obj_request->img_request); 2869c5b5ef6cSAlex Elder rbd_dev = obj_request->img_request->rbd_dev; 28706d2940c8SGuangliang Zhao stat_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_READ, 1, 2871c5b5ef6cSAlex Elder stat_request); /* NOTE(review): on this failure stat_request is not put anywhere in this function -- looks like it is leaked; confirm */ 2872c5b5ef6cSAlex Elder if (!stat_request->osd_req) 2873c5b5ef6cSAlex Elder goto out; 2874c5b5ef6cSAlex Elder stat_request->callback = rbd_img_obj_exists_callback; 2875c5b5ef6cSAlex Elder 2876144cba14SYan, Zheng osd_req_op_init(stat_request->osd_req, 0, CEPH_OSD_OP_STAT, 0); 2877c5b5ef6cSAlex Elder osd_req_op_raw_data_in_pages(stat_request->osd_req, 0, pages, size, 0, 2878c5b5ef6cSAlex Elder false, false); 28799d4df01fSAlex Elder rbd_osd_req_format_read(stat_request); 2880c5b5ef6cSAlex Elder 2881c5b5ef6cSAlex Elder osdc = &rbd_dev->rbd_client->client->osdc; 2882c5b5ef6cSAlex Elder ret = rbd_obj_request_submit(osdc, stat_request); 2883c5b5ef6cSAlex Elder out: /* On any failure, drop a reference on the original request */ 2884c5b5ef6cSAlex Elder if (ret) 2885c5b5ef6cSAlex Elder rbd_obj_request_put(obj_request); 2886c5b5ef6cSAlex Elder 2887c5b5ef6cSAlex Elder return ret; 2888c5b5ef6cSAlex Elder } 2889c5b5ef6cSAlex Elder 289070d045f6SIlya Dryomov static bool img_obj_request_simple(struct rbd_obj_request *obj_request) 2891b454e36dSAlex Elder { 2892b454e36dSAlex Elder struct rbd_img_request *img_request; 2893a9e8ba2cSAlex Elder struct rbd_device *rbd_dev;
2894b454e36dSAlex Elder 2895b454e36dSAlex Elder rbd_assert(obj_request_img_data_test(obj_request)); 2896b454e36dSAlex Elder 2897b454e36dSAlex Elder img_request = obj_request->img_request; 2898b454e36dSAlex Elder rbd_assert(img_request); 2899a9e8ba2cSAlex Elder rbd_dev = img_request->rbd_dev; 2900b454e36dSAlex Elder 290170d045f6SIlya Dryomov /* Reads */ 29021c220881SJosh Durgin if (!img_request_write_test(img_request) && 29031c220881SJosh Durgin !img_request_discard_test(img_request)) 290470d045f6SIlya Dryomov return true; 2905b454e36dSAlex Elder 290670d045f6SIlya Dryomov /* Non-layered writes */ 290770d045f6SIlya Dryomov if (!img_request_layered_test(img_request)) 290870d045f6SIlya Dryomov return true; 290970d045f6SIlya Dryomov 291070d045f6SIlya Dryomov /* 291170d045f6SIlya Dryomov * Layered writes outside of the parent overlap range don't 291270d045f6SIlya Dryomov * share any data with the parent. 291370d045f6SIlya Dryomov */ 291470d045f6SIlya Dryomov if (!obj_request_overlaps_parent(obj_request)) 291570d045f6SIlya Dryomov return true; 291670d045f6SIlya Dryomov 291770d045f6SIlya Dryomov /* 2918c622d226SGuangliang Zhao * Entire-object layered writes - we will overwrite whatever 2919c622d226SGuangliang Zhao * parent data there is anyway. 2920c622d226SGuangliang Zhao */ 2921c622d226SGuangliang Zhao if (!obj_request->offset && 2922c622d226SGuangliang Zhao obj_request->length == rbd_obj_bytes(&rbd_dev->header)) 2923c622d226SGuangliang Zhao return true; 2924c622d226SGuangliang Zhao 2925c622d226SGuangliang Zhao /* 292670d045f6SIlya Dryomov * If the object is known to already exist, its parent data has 292770d045f6SIlya Dryomov * already been copied. 
292870d045f6SIlya Dryomov */ 292970d045f6SIlya Dryomov if (obj_request_known_test(obj_request) && 293070d045f6SIlya Dryomov obj_request_exists_test(obj_request)) 293170d045f6SIlya Dryomov return true; 293270d045f6SIlya Dryomov 293370d045f6SIlya Dryomov return false; 293470d045f6SIlya Dryomov } 293570d045f6SIlya Dryomov 293670d045f6SIlya Dryomov static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request) 293770d045f6SIlya Dryomov { 293870d045f6SIlya Dryomov if (img_obj_request_simple(obj_request)) { 2939b454e36dSAlex Elder struct rbd_device *rbd_dev; 2940b454e36dSAlex Elder struct ceph_osd_client *osdc; 2941b454e36dSAlex Elder 2942b454e36dSAlex Elder rbd_dev = obj_request->img_request->rbd_dev; 2943b454e36dSAlex Elder osdc = &rbd_dev->rbd_client->client->osdc; 2944b454e36dSAlex Elder 2945b454e36dSAlex Elder return rbd_obj_request_submit(osdc, obj_request); 2946b454e36dSAlex Elder } 2947b454e36dSAlex Elder 2948b454e36dSAlex Elder /* 29493d7efd18SAlex Elder * It's a layered write. The target object might exist but 29503d7efd18SAlex Elder * we may not know that yet. If we know it doesn't exist, 29513d7efd18SAlex Elder * start by reading the data for the full target object from 29523d7efd18SAlex Elder * the parent so we can use it for a copyup to the target. 2953b454e36dSAlex Elder */ 295470d045f6SIlya Dryomov if (obj_request_known_test(obj_request)) 29553d7efd18SAlex Elder return rbd_img_obj_parent_read_full(obj_request); 29563d7efd18SAlex Elder 29573d7efd18SAlex Elder /* We don't know whether the target exists. Go find out. 
*/ 2958b454e36dSAlex Elder 2959b454e36dSAlex Elder return rbd_img_obj_exists_submit(obj_request); 2960b454e36dSAlex Elder } 2961b454e36dSAlex Elder 2962bf0d5f50SAlex Elder static int rbd_img_request_submit(struct rbd_img_request *img_request) 2963bf0d5f50SAlex Elder { 2964bf0d5f50SAlex Elder struct rbd_obj_request *obj_request; 296546faeed4SAlex Elder struct rbd_obj_request *next_obj_request; 2966663ae2ccSIlya Dryomov int ret = 0; 2967bf0d5f50SAlex Elder 296837206ee5SAlex Elder dout("%s: img %p\n", __func__, img_request); 2969bf0d5f50SAlex Elder 2970663ae2ccSIlya Dryomov rbd_img_request_get(img_request); 2971663ae2ccSIlya Dryomov for_each_obj_request_safe(img_request, obj_request, next_obj_request) { 2972b454e36dSAlex Elder ret = rbd_img_obj_request_submit(obj_request); 2973bf0d5f50SAlex Elder if (ret) 2974663ae2ccSIlya Dryomov goto out_put_ireq; 2975bf0d5f50SAlex Elder } 2976bf0d5f50SAlex Elder 2977663ae2ccSIlya Dryomov out_put_ireq: 2978663ae2ccSIlya Dryomov rbd_img_request_put(img_request); 2979663ae2ccSIlya Dryomov return ret; 2980bf0d5f50SAlex Elder } 2981bf0d5f50SAlex Elder 29828b3e1a56SAlex Elder static void rbd_img_parent_read_callback(struct rbd_img_request *img_request) 29838b3e1a56SAlex Elder { 29848b3e1a56SAlex Elder struct rbd_obj_request *obj_request; 2985a9e8ba2cSAlex Elder struct rbd_device *rbd_dev; 2986a9e8ba2cSAlex Elder u64 obj_end; 298702c74fbaSAlex Elder u64 img_xferred; 298802c74fbaSAlex Elder int img_result; 29898b3e1a56SAlex Elder 29908b3e1a56SAlex Elder rbd_assert(img_request_child_test(img_request)); 29918b3e1a56SAlex Elder 299202c74fbaSAlex Elder /* First get what we need from the image request and release it */ 299302c74fbaSAlex Elder 29948b3e1a56SAlex Elder obj_request = img_request->obj_request; 299502c74fbaSAlex Elder img_xferred = img_request->xferred; 299602c74fbaSAlex Elder img_result = img_request->result; 299702c74fbaSAlex Elder rbd_img_request_put(img_request); 299802c74fbaSAlex Elder 299902c74fbaSAlex Elder /* 
300002c74fbaSAlex Elder * If the overlap has become 0 (most likely because the 300102c74fbaSAlex Elder * image has been flattened) we need to re-submit the 300202c74fbaSAlex Elder * original request. 300302c74fbaSAlex Elder */ 3004a9e8ba2cSAlex Elder rbd_assert(obj_request); 3005a9e8ba2cSAlex Elder rbd_assert(obj_request->img_request); 300602c74fbaSAlex Elder rbd_dev = obj_request->img_request->rbd_dev; 300702c74fbaSAlex Elder if (!rbd_dev->parent_overlap) { 300802c74fbaSAlex Elder struct ceph_osd_client *osdc; 30098b3e1a56SAlex Elder 301002c74fbaSAlex Elder osdc = &rbd_dev->rbd_client->client->osdc; 301102c74fbaSAlex Elder img_result = rbd_obj_request_submit(osdc, obj_request); 301202c74fbaSAlex Elder if (!img_result) 301302c74fbaSAlex Elder return; 301402c74fbaSAlex Elder } 301502c74fbaSAlex Elder 301602c74fbaSAlex Elder obj_request->result = img_result; 3017a9e8ba2cSAlex Elder if (obj_request->result) 3018a9e8ba2cSAlex Elder goto out; 3019a9e8ba2cSAlex Elder 3020a9e8ba2cSAlex Elder /* 3021a9e8ba2cSAlex Elder * We need to zero anything beyond the parent overlap 3022a9e8ba2cSAlex Elder * boundary. Since rbd_img_obj_request_read_callback() 3023a9e8ba2cSAlex Elder * will zero anything beyond the end of a short read, an 3024a9e8ba2cSAlex Elder * easy way to do this is to pretend the data from the 3025a9e8ba2cSAlex Elder * parent came up short--ending at the overlap boundary. 
3026a9e8ba2cSAlex Elder */ 3027a9e8ba2cSAlex Elder rbd_assert(obj_request->img_offset < U64_MAX - obj_request->length); 3028a9e8ba2cSAlex Elder obj_end = obj_request->img_offset + obj_request->length; 3029a9e8ba2cSAlex Elder if (obj_end > rbd_dev->parent_overlap) { 3030a9e8ba2cSAlex Elder u64 xferred = 0; 3031a9e8ba2cSAlex Elder 3032a9e8ba2cSAlex Elder if (obj_request->img_offset < rbd_dev->parent_overlap) 3033a9e8ba2cSAlex Elder xferred = rbd_dev->parent_overlap - 3034a9e8ba2cSAlex Elder obj_request->img_offset; 3035a9e8ba2cSAlex Elder 303602c74fbaSAlex Elder obj_request->xferred = min(img_xferred, xferred); 3037a9e8ba2cSAlex Elder } else { 303802c74fbaSAlex Elder obj_request->xferred = img_xferred; 3039a9e8ba2cSAlex Elder } 3040a9e8ba2cSAlex Elder out: 30418b3e1a56SAlex Elder rbd_img_obj_request_read_callback(obj_request); 30428b3e1a56SAlex Elder rbd_obj_request_complete(obj_request); 30438b3e1a56SAlex Elder } 30448b3e1a56SAlex Elder 30458b3e1a56SAlex Elder static void rbd_img_parent_read(struct rbd_obj_request *obj_request) 30468b3e1a56SAlex Elder { 30478b3e1a56SAlex Elder struct rbd_img_request *img_request; 30488b3e1a56SAlex Elder int result; 30498b3e1a56SAlex Elder 30508b3e1a56SAlex Elder rbd_assert(obj_request_img_data_test(obj_request)); 30518b3e1a56SAlex Elder rbd_assert(obj_request->img_request != NULL); 30528b3e1a56SAlex Elder rbd_assert(obj_request->result == (s32) -ENOENT); 30535b2ab72dSAlex Elder rbd_assert(obj_request_type_valid(obj_request->type)); 30548b3e1a56SAlex Elder 30558b3e1a56SAlex Elder /* rbd_read_finish(obj_request, obj_request->length); */ 3056e93f3152SAlex Elder img_request = rbd_parent_request_create(obj_request, 30578b3e1a56SAlex Elder obj_request->img_offset, 3058e93f3152SAlex Elder obj_request->length); 30598b3e1a56SAlex Elder result = -ENOMEM; 30608b3e1a56SAlex Elder if (!img_request) 30618b3e1a56SAlex Elder goto out_err; 30628b3e1a56SAlex Elder 30635b2ab72dSAlex Elder if (obj_request->type == OBJ_REQUEST_BIO) 3064f1a4739fSAlex 
Elder result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO, 3065f1a4739fSAlex Elder obj_request->bio_list); 30665b2ab72dSAlex Elder else 30675b2ab72dSAlex Elder result = rbd_img_request_fill(img_request, OBJ_REQUEST_PAGES, 30685b2ab72dSAlex Elder obj_request->pages); 30698b3e1a56SAlex Elder if (result) 30708b3e1a56SAlex Elder goto out_err; 30718b3e1a56SAlex Elder 30728b3e1a56SAlex Elder img_request->callback = rbd_img_parent_read_callback; 30738b3e1a56SAlex Elder result = rbd_img_request_submit(img_request); 30748b3e1a56SAlex Elder if (result) 30758b3e1a56SAlex Elder goto out_err; 30768b3e1a56SAlex Elder 30778b3e1a56SAlex Elder return; 30788b3e1a56SAlex Elder out_err: 30798b3e1a56SAlex Elder if (img_request) 30808b3e1a56SAlex Elder rbd_img_request_put(img_request); 30818b3e1a56SAlex Elder obj_request->result = result; 30828b3e1a56SAlex Elder obj_request->xferred = 0; 30838b3e1a56SAlex Elder obj_request_done_set(obj_request); 30848b3e1a56SAlex Elder } 30858b3e1a56SAlex Elder 3086922dab61SIlya Dryomov static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev); 3087922dab61SIlya Dryomov static void __rbd_dev_header_unwatch_sync(struct rbd_device *rbd_dev); 3088922dab61SIlya Dryomov 3089922dab61SIlya Dryomov static void rbd_watch_cb(void *arg, u64 notify_id, u64 cookie, 3090922dab61SIlya Dryomov u64 notifier_id, void *data, size_t data_len) 3091b8d70035SAlex Elder { 3092922dab61SIlya Dryomov struct rbd_device *rbd_dev = arg; 30932169238dSAlex Elder struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 3094b8d70035SAlex Elder int ret; 3095b8d70035SAlex Elder 3096922dab61SIlya Dryomov dout("%s rbd_dev %p cookie %llu notify_id %llu\n", __func__, rbd_dev, 3097922dab61SIlya Dryomov cookie, notify_id); 309852bb1f9bSIlya Dryomov 309952bb1f9bSIlya Dryomov /* 310052bb1f9bSIlya Dryomov * Until adequate refresh error handling is in place, there is 310152bb1f9bSIlya Dryomov * not much we can do here, except warn. 
310252bb1f9bSIlya Dryomov * 310352bb1f9bSIlya Dryomov * See http://tracker.ceph.com/issues/5040 310452bb1f9bSIlya Dryomov */ 3105e627db08SAlex Elder ret = rbd_dev_refresh(rbd_dev); 3106e627db08SAlex Elder if (ret) 31079584d508SIlya Dryomov rbd_warn(rbd_dev, "refresh failed: %d", ret); 3108b8d70035SAlex Elder 3109922dab61SIlya Dryomov ret = ceph_osdc_notify_ack(osdc, &rbd_dev->header_oid, 3110922dab61SIlya Dryomov &rbd_dev->header_oloc, notify_id, cookie, 3111922dab61SIlya Dryomov NULL, 0); 311252bb1f9bSIlya Dryomov if (ret) 31139584d508SIlya Dryomov rbd_warn(rbd_dev, "notify_ack ret %d", ret); 3114b8d70035SAlex Elder } 3115b8d70035SAlex Elder 3116922dab61SIlya Dryomov static void rbd_watch_errcb(void *arg, u64 cookie, int err) 3117bb040aa0SIlya Dryomov { 3118922dab61SIlya Dryomov struct rbd_device *rbd_dev = arg; 3119bb040aa0SIlya Dryomov int ret; 3120bb040aa0SIlya Dryomov 3121922dab61SIlya Dryomov rbd_warn(rbd_dev, "encountered watch error: %d", err); 3122bb040aa0SIlya Dryomov 3123922dab61SIlya Dryomov __rbd_dev_header_unwatch_sync(rbd_dev); 3124bb040aa0SIlya Dryomov 3125922dab61SIlya Dryomov ret = rbd_dev_header_watch_sync(rbd_dev); 3126bb040aa0SIlya Dryomov if (ret) { 3127922dab61SIlya Dryomov rbd_warn(rbd_dev, "failed to reregister watch: %d", ret); 3128922dab61SIlya Dryomov return; 3129bb040aa0SIlya Dryomov } 3130bb040aa0SIlya Dryomov 3131922dab61SIlya Dryomov ret = rbd_dev_refresh(rbd_dev); 3132922dab61SIlya Dryomov if (ret) 3133922dab61SIlya Dryomov rbd_warn(rbd_dev, "reregisteration refresh failed: %d", ret); 3134bb040aa0SIlya Dryomov } 3135bb040aa0SIlya Dryomov 3136bb040aa0SIlya Dryomov /* 3137b30a01f2SIlya Dryomov * Initiate a watch request, synchronously. 
31389969ebc5SAlex Elder */ 3139b30a01f2SIlya Dryomov static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev) 31409969ebc5SAlex Elder { 31419969ebc5SAlex Elder struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 3142922dab61SIlya Dryomov struct ceph_osd_linger_request *handle; 31439969ebc5SAlex Elder 3144922dab61SIlya Dryomov rbd_assert(!rbd_dev->watch_handle); 31459969ebc5SAlex Elder 3146922dab61SIlya Dryomov handle = ceph_osdc_watch(osdc, &rbd_dev->header_oid, 3147922dab61SIlya Dryomov &rbd_dev->header_oloc, rbd_watch_cb, 3148922dab61SIlya Dryomov rbd_watch_errcb, rbd_dev); 3149922dab61SIlya Dryomov if (IS_ERR(handle)) 3150922dab61SIlya Dryomov return PTR_ERR(handle); 31519969ebc5SAlex Elder 3152922dab61SIlya Dryomov rbd_dev->watch_handle = handle; 31538eb87565SAlex Elder return 0; 31549969ebc5SAlex Elder } 31559969ebc5SAlex Elder 3156c525f036SIlya Dryomov static void __rbd_dev_header_unwatch_sync(struct rbd_device *rbd_dev) 3157fca27065SIlya Dryomov { 3158922dab61SIlya Dryomov struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 3159922dab61SIlya Dryomov int ret; 3160b30a01f2SIlya Dryomov 3161922dab61SIlya Dryomov if (!rbd_dev->watch_handle) 3162922dab61SIlya Dryomov return; 3163b30a01f2SIlya Dryomov 3164922dab61SIlya Dryomov ret = ceph_osdc_unwatch(osdc, rbd_dev->watch_handle); 3165922dab61SIlya Dryomov if (ret) 3166922dab61SIlya Dryomov rbd_warn(rbd_dev, "failed to unwatch: %d", ret); 3167b30a01f2SIlya Dryomov 3168922dab61SIlya Dryomov rbd_dev->watch_handle = NULL; 3169c525f036SIlya Dryomov } 3170c525f036SIlya Dryomov 3171c525f036SIlya Dryomov /* 3172c525f036SIlya Dryomov * Tear down a watch request, synchronously. 
3173c525f036SIlya Dryomov */ 3174c525f036SIlya Dryomov static void rbd_dev_header_unwatch_sync(struct rbd_device *rbd_dev) 3175c525f036SIlya Dryomov { 3176c525f036SIlya Dryomov __rbd_dev_header_unwatch_sync(rbd_dev); 3177811c6688SIlya Dryomov 3178811c6688SIlya Dryomov dout("%s flushing notifies\n", __func__); 3179811c6688SIlya Dryomov ceph_osdc_flush_notifies(&rbd_dev->rbd_client->client->osdc); 3180fca27065SIlya Dryomov } 3181fca27065SIlya Dryomov 318236be9a76SAlex Elder /* 3183f40eb349SAlex Elder * Synchronous osd object method call. Returns the number of bytes 3184f40eb349SAlex Elder * returned in the inbound buffer, or a negative error code. Any outbound data is sent as the method's request payload; the response is copied into inbound. 318536be9a76SAlex Elder */ 318636be9a76SAlex Elder static int rbd_obj_method_sync(struct rbd_device *rbd_dev, 318736be9a76SAlex Elder const char *object_name, 318836be9a76SAlex Elder const char *class_name, 318936be9a76SAlex Elder const char *method_name, 31904157976bSAlex Elder const void *outbound, 319136be9a76SAlex Elder size_t outbound_size, 31924157976bSAlex Elder void *inbound, 3193e2a58ee5SAlex Elder size_t inbound_size) 319436be9a76SAlex Elder { 31952169238dSAlex Elder struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 319636be9a76SAlex Elder struct rbd_obj_request *obj_request; 319736be9a76SAlex Elder struct page **pages; 319836be9a76SAlex Elder u32 page_count; 319936be9a76SAlex Elder int ret; 320036be9a76SAlex Elder 320136be9a76SAlex Elder /* 32026010a451SAlex Elder * Method calls are ultimately read operations. The result 32036010a451SAlex Elder * should be placed into the inbound buffer provided. They 32046010a451SAlex Elder * also supply outbound data--parameters for the object 32056010a451SAlex Elder * method. Currently if this is present it will be a 32066010a451SAlex Elder * snapshot id.
320736be9a76SAlex Elder */ 320836be9a76SAlex Elder page_count = (u32)calc_pages_for(0, inbound_size); 320936be9a76SAlex Elder pages = ceph_alloc_page_vector(page_count, GFP_KERNEL); 321036be9a76SAlex Elder if (IS_ERR(pages)) 321136be9a76SAlex Elder return PTR_ERR(pages); 321236be9a76SAlex Elder 321336be9a76SAlex Elder ret = -ENOMEM; 32146010a451SAlex Elder obj_request = rbd_obj_request_create(object_name, 0, inbound_size, 321536be9a76SAlex Elder OBJ_REQUEST_PAGES); 321636be9a76SAlex Elder if (!obj_request) 321736be9a76SAlex Elder goto out; 321836be9a76SAlex Elder /* The response page vector is attached to the object request from here on */ 321936be9a76SAlex Elder obj_request->pages = pages; 322036be9a76SAlex Elder obj_request->page_count = page_count; 322136be9a76SAlex Elder 32226d2940c8SGuangliang Zhao obj_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_READ, 1, 3223deb236b3SIlya Dryomov obj_request); 322436be9a76SAlex Elder if (!obj_request->osd_req) 322536be9a76SAlex Elder goto out; 322636be9a76SAlex Elder 3227c99d2d4aSAlex Elder osd_req_op_cls_init(obj_request->osd_req, 0, CEPH_OSD_OP_CALL, 322804017e29SAlex Elder class_name, method_name); 322904017e29SAlex Elder if (outbound_size) { 323004017e29SAlex Elder struct ceph_pagelist *pagelist; 323104017e29SAlex Elder 323204017e29SAlex Elder pagelist = kmalloc(sizeof (*pagelist), GFP_NOFS); 323304017e29SAlex Elder if (!pagelist) 323404017e29SAlex Elder goto out; 323504017e29SAlex Elder 323604017e29SAlex Elder ceph_pagelist_init(pagelist); /* NOTE(review): the return value of ceph_pagelist_append() is not checked */ 323704017e29SAlex Elder ceph_pagelist_append(pagelist, outbound, outbound_size); 323804017e29SAlex Elder osd_req_op_cls_request_data_pagelist(obj_request->osd_req, 0, 323904017e29SAlex Elder pagelist); 324004017e29SAlex Elder } 3241a4ce40a9SAlex Elder osd_req_op_cls_response_data_pages(obj_request->osd_req, 0, 3242a4ce40a9SAlex Elder obj_request->pages, inbound_size, 324344cd188dSAlex Elder 0, false, false); 32449d4df01fSAlex Elder rbd_osd_req_format_read(obj_request); 3245430c28c3SAlex Elder 324636be9a76SAlex Elder ret = rbd_obj_request_submit(osdc,
obj_request); 324736be9a76SAlex Elder if (ret) 324836be9a76SAlex Elder goto out; 324936be9a76SAlex Elder ret = rbd_obj_request_wait(obj_request); 325036be9a76SAlex Elder if (ret) 325136be9a76SAlex Elder goto out; 325236be9a76SAlex Elder 325336be9a76SAlex Elder ret = obj_request->result; 325436be9a76SAlex Elder if (ret < 0) 325536be9a76SAlex Elder goto out; 325657385b51SAlex Elder 325757385b51SAlex Elder rbd_assert(obj_request->xferred < (u64)INT_MAX); 325857385b51SAlex Elder ret = (int)obj_request->xferred; 3259903bb32eSAlex Elder ceph_copy_from_page_vector(pages, inbound, 0, obj_request->xferred); 326036be9a76SAlex Elder out: 326136be9a76SAlex Elder if (obj_request) 326236be9a76SAlex Elder rbd_obj_request_put(obj_request); 326336be9a76SAlex Elder else 326436be9a76SAlex Elder ceph_release_page_vector(pages, page_count); 326536be9a76SAlex Elder 326636be9a76SAlex Elder return ret; 326736be9a76SAlex Elder } 326836be9a76SAlex Elder 32697ad18afaSChristoph Hellwig static void rbd_queue_workfn(struct work_struct *work) 3270bc1ecc65SIlya Dryomov { 32717ad18afaSChristoph Hellwig struct request *rq = blk_mq_rq_from_pdu(work); 32727ad18afaSChristoph Hellwig struct rbd_device *rbd_dev = rq->q->queuedata; 3273bc1ecc65SIlya Dryomov struct rbd_img_request *img_request; 32744e752f0aSJosh Durgin struct ceph_snap_context *snapc = NULL; 3275bc1ecc65SIlya Dryomov u64 offset = (u64)blk_rq_pos(rq) << SECTOR_SHIFT; 3276bc1ecc65SIlya Dryomov u64 length = blk_rq_bytes(rq); 32776d2940c8SGuangliang Zhao enum obj_operation_type op_type; 32784e752f0aSJosh Durgin u64 mapping_size; 3279bc1ecc65SIlya Dryomov int result; 3280bc1ecc65SIlya Dryomov 32817ad18afaSChristoph Hellwig if (rq->cmd_type != REQ_TYPE_FS) { 32827ad18afaSChristoph Hellwig dout("%s: non-fs request type %d\n", __func__, 32837ad18afaSChristoph Hellwig (int) rq->cmd_type); 32847ad18afaSChristoph Hellwig result = -EIO; 32857ad18afaSChristoph Hellwig goto err; 32867ad18afaSChristoph Hellwig } 32877ad18afaSChristoph Hellwig 
3288c2df40dfSMike Christie if (req_op(rq) == REQ_OP_DISCARD) 328990e98c52SGuangliang Zhao op_type = OBJ_OP_DISCARD; 3290c2df40dfSMike Christie else if (req_op(rq) == REQ_OP_WRITE) 32916d2940c8SGuangliang Zhao op_type = OBJ_OP_WRITE; 32926d2940c8SGuangliang Zhao else 32936d2940c8SGuangliang Zhao op_type = OBJ_OP_READ; 32946d2940c8SGuangliang Zhao 3295bc1ecc65SIlya Dryomov /* Ignore/skip any zero-length requests */ 3296bc1ecc65SIlya Dryomov 3297bc1ecc65SIlya Dryomov if (!length) { 3298bc1ecc65SIlya Dryomov dout("%s: zero-length request\n", __func__); 3299bc1ecc65SIlya Dryomov result = 0; 3300bc1ecc65SIlya Dryomov goto err_rq; 3301bc1ecc65SIlya Dryomov } 3302bc1ecc65SIlya Dryomov 33036d2940c8SGuangliang Zhao /* Only reads are allowed to a read-only device */ 3304bc1ecc65SIlya Dryomov 33056d2940c8SGuangliang Zhao if (op_type != OBJ_OP_READ) { 3306bc1ecc65SIlya Dryomov if (rbd_dev->mapping.read_only) { 3307bc1ecc65SIlya Dryomov result = -EROFS; 3308bc1ecc65SIlya Dryomov goto err_rq; 3309bc1ecc65SIlya Dryomov } 3310bc1ecc65SIlya Dryomov rbd_assert(rbd_dev->spec->snap_id == CEPH_NOSNAP); 3311bc1ecc65SIlya Dryomov } 3312bc1ecc65SIlya Dryomov 3313bc1ecc65SIlya Dryomov /* 3314bc1ecc65SIlya Dryomov * Quit early if the mapped snapshot no longer exists. It's 3315bc1ecc65SIlya Dryomov * still possible the snapshot will have disappeared by the 3316bc1ecc65SIlya Dryomov * time our request arrives at the osd, but there's no sense in 3317bc1ecc65SIlya Dryomov * sending it if we already know. 
3318bc1ecc65SIlya Dryomov */ 3319bc1ecc65SIlya Dryomov if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) { 3320bc1ecc65SIlya Dryomov dout("request for non-existent snapshot"); 3321bc1ecc65SIlya Dryomov rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP); 3322bc1ecc65SIlya Dryomov result = -ENXIO; 3323bc1ecc65SIlya Dryomov goto err_rq; 3324bc1ecc65SIlya Dryomov } 3325bc1ecc65SIlya Dryomov 3326bc1ecc65SIlya Dryomov if (offset && length > U64_MAX - offset + 1) { 3327bc1ecc65SIlya Dryomov rbd_warn(rbd_dev, "bad request range (%llu~%llu)", offset, 3328bc1ecc65SIlya Dryomov length); 3329bc1ecc65SIlya Dryomov result = -EINVAL; 3330bc1ecc65SIlya Dryomov goto err_rq; /* Shouldn't happen */ 3331bc1ecc65SIlya Dryomov } 3332bc1ecc65SIlya Dryomov 33337ad18afaSChristoph Hellwig blk_mq_start_request(rq); 33347ad18afaSChristoph Hellwig 33354e752f0aSJosh Durgin down_read(&rbd_dev->header_rwsem); 33364e752f0aSJosh Durgin mapping_size = rbd_dev->mapping.size; 33376d2940c8SGuangliang Zhao if (op_type != OBJ_OP_READ) { 33384e752f0aSJosh Durgin snapc = rbd_dev->header.snapc; 33394e752f0aSJosh Durgin ceph_get_snap_context(snapc); 33404e752f0aSJosh Durgin } 33414e752f0aSJosh Durgin up_read(&rbd_dev->header_rwsem); 33424e752f0aSJosh Durgin 33434e752f0aSJosh Durgin if (offset + length > mapping_size) { 3344bc1ecc65SIlya Dryomov rbd_warn(rbd_dev, "beyond EOD (%llu~%llu > %llu)", offset, 33454e752f0aSJosh Durgin length, mapping_size); 3346bc1ecc65SIlya Dryomov result = -EIO; 3347bc1ecc65SIlya Dryomov goto err_rq; 3348bc1ecc65SIlya Dryomov } 3349bc1ecc65SIlya Dryomov 33506d2940c8SGuangliang Zhao img_request = rbd_img_request_create(rbd_dev, offset, length, op_type, 33514e752f0aSJosh Durgin snapc); 3352bc1ecc65SIlya Dryomov if (!img_request) { 3353bc1ecc65SIlya Dryomov result = -ENOMEM; 3354bc1ecc65SIlya Dryomov goto err_rq; 3355bc1ecc65SIlya Dryomov } 3356bc1ecc65SIlya Dryomov img_request->rq = rq; 335770b16db8SIlya Dryomov snapc = NULL; /* img_request consumes a ref */ 3358bc1ecc65SIlya 
Dryomov 335990e98c52SGuangliang Zhao if (op_type == OBJ_OP_DISCARD) 336090e98c52SGuangliang Zhao result = rbd_img_request_fill(img_request, OBJ_REQUEST_NODATA, 336190e98c52SGuangliang Zhao NULL); 336290e98c52SGuangliang Zhao else 336390e98c52SGuangliang Zhao result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO, 336490e98c52SGuangliang Zhao rq->bio); 3365bc1ecc65SIlya Dryomov if (result) 3366bc1ecc65SIlya Dryomov goto err_img_request; 3367bc1ecc65SIlya Dryomov 3368bc1ecc65SIlya Dryomov result = rbd_img_request_submit(img_request); 3369bc1ecc65SIlya Dryomov if (result) 3370bc1ecc65SIlya Dryomov goto err_img_request; 3371bc1ecc65SIlya Dryomov 3372bc1ecc65SIlya Dryomov return; 3373bc1ecc65SIlya Dryomov 3374bc1ecc65SIlya Dryomov err_img_request: 3375bc1ecc65SIlya Dryomov rbd_img_request_put(img_request); 3376bc1ecc65SIlya Dryomov err_rq: 3377bc1ecc65SIlya Dryomov if (result) 3378bc1ecc65SIlya Dryomov rbd_warn(rbd_dev, "%s %llx at %llx result %d", 33796d2940c8SGuangliang Zhao obj_op_name(op_type), length, offset, result); 33804e752f0aSJosh Durgin ceph_put_snap_context(snapc); 33817ad18afaSChristoph Hellwig err: 33827ad18afaSChristoph Hellwig blk_mq_end_request(rq, result); 3383bc1ecc65SIlya Dryomov } 3384bc1ecc65SIlya Dryomov 33857ad18afaSChristoph Hellwig static int rbd_queue_rq(struct blk_mq_hw_ctx *hctx, 33867ad18afaSChristoph Hellwig const struct blk_mq_queue_data *bd) 3387bc1ecc65SIlya Dryomov { 33887ad18afaSChristoph Hellwig struct request *rq = bd->rq; 33897ad18afaSChristoph Hellwig struct work_struct *work = blk_mq_rq_to_pdu(rq); 3390bc1ecc65SIlya Dryomov 33917ad18afaSChristoph Hellwig queue_work(rbd_wq, work); 33927ad18afaSChristoph Hellwig return BLK_MQ_RQ_QUEUE_OK; 3393bf0d5f50SAlex Elder } 3394bf0d5f50SAlex Elder 3395602adf40SYehuda Sadeh static void rbd_free_disk(struct rbd_device *rbd_dev) 3396602adf40SYehuda Sadeh { 3397602adf40SYehuda Sadeh struct gendisk *disk = rbd_dev->disk; 3398602adf40SYehuda Sadeh 3399602adf40SYehuda Sadeh if (!disk) 
3400602adf40SYehuda Sadeh return; 3401602adf40SYehuda Sadeh 3402a0cab924SAlex Elder rbd_dev->disk = NULL; 3403a0cab924SAlex Elder if (disk->flags & GENHD_FL_UP) { 3404602adf40SYehuda Sadeh del_gendisk(disk); 3405602adf40SYehuda Sadeh if (disk->queue) 3406602adf40SYehuda Sadeh blk_cleanup_queue(disk->queue); 34077ad18afaSChristoph Hellwig blk_mq_free_tag_set(&rbd_dev->tag_set); 3408a0cab924SAlex Elder } 3409602adf40SYehuda Sadeh put_disk(disk); 3410602adf40SYehuda Sadeh } 3411602adf40SYehuda Sadeh 3412788e2df3SAlex Elder static int rbd_obj_read_sync(struct rbd_device *rbd_dev, 3413788e2df3SAlex Elder const char *object_name, 34147097f8dfSAlex Elder u64 offset, u64 length, void *buf) 3415788e2df3SAlex Elder 3416788e2df3SAlex Elder { 34172169238dSAlex Elder struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 3418788e2df3SAlex Elder struct rbd_obj_request *obj_request; 3419788e2df3SAlex Elder struct page **pages = NULL; 3420788e2df3SAlex Elder u32 page_count; 34211ceae7efSAlex Elder size_t size; 3422788e2df3SAlex Elder int ret; 3423788e2df3SAlex Elder 3424788e2df3SAlex Elder page_count = (u32) calc_pages_for(offset, length); 3425788e2df3SAlex Elder pages = ceph_alloc_page_vector(page_count, GFP_KERNEL); 3426788e2df3SAlex Elder if (IS_ERR(pages)) 3427a8d42056SJan Kara return PTR_ERR(pages); 3428788e2df3SAlex Elder 3429788e2df3SAlex Elder ret = -ENOMEM; 3430788e2df3SAlex Elder obj_request = rbd_obj_request_create(object_name, offset, length, 3431788e2df3SAlex Elder OBJ_REQUEST_PAGES); 3432788e2df3SAlex Elder if (!obj_request) 3433788e2df3SAlex Elder goto out; 3434788e2df3SAlex Elder 3435788e2df3SAlex Elder obj_request->pages = pages; 3436788e2df3SAlex Elder obj_request->page_count = page_count; 3437788e2df3SAlex Elder 34386d2940c8SGuangliang Zhao obj_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_READ, 1, 3439deb236b3SIlya Dryomov obj_request); 3440788e2df3SAlex Elder if (!obj_request->osd_req) 3441788e2df3SAlex Elder goto out; 3442788e2df3SAlex 
Elder 3443c99d2d4aSAlex Elder osd_req_op_extent_init(obj_request->osd_req, 0, CEPH_OSD_OP_READ, 3444c99d2d4aSAlex Elder offset, length, 0, 0); 3445406e2c9fSAlex Elder osd_req_op_extent_osd_data_pages(obj_request->osd_req, 0, 3446a4ce40a9SAlex Elder obj_request->pages, 344744cd188dSAlex Elder obj_request->length, 344844cd188dSAlex Elder obj_request->offset & ~PAGE_MASK, 344944cd188dSAlex Elder false, false); 34509d4df01fSAlex Elder rbd_osd_req_format_read(obj_request); 3451430c28c3SAlex Elder 3452788e2df3SAlex Elder ret = rbd_obj_request_submit(osdc, obj_request); 3453788e2df3SAlex Elder if (ret) 3454788e2df3SAlex Elder goto out; 3455788e2df3SAlex Elder ret = rbd_obj_request_wait(obj_request); 3456788e2df3SAlex Elder if (ret) 3457788e2df3SAlex Elder goto out; 3458788e2df3SAlex Elder 3459788e2df3SAlex Elder ret = obj_request->result; 3460788e2df3SAlex Elder if (ret < 0) 3461788e2df3SAlex Elder goto out; 34621ceae7efSAlex Elder 34631ceae7efSAlex Elder rbd_assert(obj_request->xferred <= (u64) SIZE_MAX); 34641ceae7efSAlex Elder size = (size_t) obj_request->xferred; 3465903bb32eSAlex Elder ceph_copy_from_page_vector(pages, buf, 0, size); 346623ed6e13SAlex Elder rbd_assert(size <= (size_t)INT_MAX); 346723ed6e13SAlex Elder ret = (int)size; 3468788e2df3SAlex Elder out: 3469788e2df3SAlex Elder if (obj_request) 3470788e2df3SAlex Elder rbd_obj_request_put(obj_request); 3471788e2df3SAlex Elder else 3472788e2df3SAlex Elder ceph_release_page_vector(pages, page_count); 3473788e2df3SAlex Elder 3474788e2df3SAlex Elder return ret; 3475788e2df3SAlex Elder } 3476788e2df3SAlex Elder 3477602adf40SYehuda Sadeh /* 3478662518b1SAlex Elder * Read the complete header for the given rbd device. On successful 3479662518b1SAlex Elder * return, the rbd_dev->header field will contain up-to-date 3480662518b1SAlex Elder * information about the image. 
34814156d998SAlex Elder */ 348299a41ebcSAlex Elder static int rbd_dev_v1_header_info(struct rbd_device *rbd_dev) 34834156d998SAlex Elder { 34844156d998SAlex Elder struct rbd_image_header_ondisk *ondisk = NULL; 34854156d998SAlex Elder u32 snap_count = 0; 34864156d998SAlex Elder u64 names_size = 0; 34874156d998SAlex Elder u32 want_count; 34884156d998SAlex Elder int ret; 34894156d998SAlex Elder 34904156d998SAlex Elder /* 34914156d998SAlex Elder * The complete header will include an array of its 64-bit 34924156d998SAlex Elder * snapshot ids, followed by the names of those snapshots as 34934156d998SAlex Elder * a contiguous block of NUL-terminated strings. Note that 34944156d998SAlex Elder * the number of snapshots could change by the time we read 34954156d998SAlex Elder * it in, in which case we re-read it. 34964156d998SAlex Elder */ 34974156d998SAlex Elder do { 34984156d998SAlex Elder size_t size; 34994156d998SAlex Elder 35004156d998SAlex Elder kfree(ondisk); 35014156d998SAlex Elder 35024156d998SAlex Elder size = sizeof (*ondisk); 35034156d998SAlex Elder size += snap_count * sizeof (struct rbd_image_snap_ondisk); 35044156d998SAlex Elder size += names_size; 35054156d998SAlex Elder ondisk = kmalloc(size, GFP_KERNEL); 35064156d998SAlex Elder if (!ondisk) 3507662518b1SAlex Elder return -ENOMEM; 35084156d998SAlex Elder 3509c41d13a3SIlya Dryomov ret = rbd_obj_read_sync(rbd_dev, rbd_dev->header_oid.name, 35107097f8dfSAlex Elder 0, size, ondisk); 35114156d998SAlex Elder if (ret < 0) 3512662518b1SAlex Elder goto out; 3513c0cd10dbSAlex Elder if ((size_t)ret < size) { 35144156d998SAlex Elder ret = -ENXIO; 351506ecc6cbSAlex Elder rbd_warn(rbd_dev, "short header read (want %zd got %d)", 351606ecc6cbSAlex Elder size, ret); 3517662518b1SAlex Elder goto out; 35184156d998SAlex Elder } 35194156d998SAlex Elder if (!rbd_dev_ondisk_valid(ondisk)) { 35204156d998SAlex Elder ret = -ENXIO; 352106ecc6cbSAlex Elder rbd_warn(rbd_dev, "invalid header"); 3522662518b1SAlex Elder goto out; 
35234156d998SAlex Elder } 35244156d998SAlex Elder 35254156d998SAlex Elder names_size = le64_to_cpu(ondisk->snap_names_len); 35264156d998SAlex Elder want_count = snap_count; 35274156d998SAlex Elder snap_count = le32_to_cpu(ondisk->snap_count); 35284156d998SAlex Elder } while (snap_count != want_count); 35294156d998SAlex Elder 3530662518b1SAlex Elder ret = rbd_header_from_disk(rbd_dev, ondisk); 3531662518b1SAlex Elder out: 35324156d998SAlex Elder kfree(ondisk); 35334156d998SAlex Elder 3534dfc5606dSYehuda Sadeh return ret; 3535602adf40SYehuda Sadeh } 3536602adf40SYehuda Sadeh 353715228edeSAlex Elder /* 353815228edeSAlex Elder * Clear the rbd device's EXISTS flag if the snapshot it's mapped to 353915228edeSAlex Elder * has disappeared from the (just updated) snapshot context. 354015228edeSAlex Elder */ 354115228edeSAlex Elder static void rbd_exists_validate(struct rbd_device *rbd_dev) 354215228edeSAlex Elder { 354315228edeSAlex Elder u64 snap_id; 354415228edeSAlex Elder 354515228edeSAlex Elder if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) 354615228edeSAlex Elder return; 354715228edeSAlex Elder 354815228edeSAlex Elder snap_id = rbd_dev->spec->snap_id; 354915228edeSAlex Elder if (snap_id == CEPH_NOSNAP) 355015228edeSAlex Elder return; 355115228edeSAlex Elder 355215228edeSAlex Elder if (rbd_dev_snap_index(rbd_dev, snap_id) == BAD_SNAP_INDEX) 355315228edeSAlex Elder clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags); 355415228edeSAlex Elder } 355515228edeSAlex Elder 35569875201eSJosh Durgin static void rbd_dev_update_size(struct rbd_device *rbd_dev) 35579875201eSJosh Durgin { 35589875201eSJosh Durgin sector_t size; 35599875201eSJosh Durgin 35609875201eSJosh Durgin /* 3561811c6688SIlya Dryomov * If EXISTS is not set, rbd_dev->disk may be NULL, so don't 3562811c6688SIlya Dryomov * try to update its size. If REMOVING is set, updating size 3563811c6688SIlya Dryomov * is just useless work since the device can't be opened. 
35649875201eSJosh Durgin */ 3565811c6688SIlya Dryomov if (test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags) && 3566811c6688SIlya Dryomov !test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags)) { 35679875201eSJosh Durgin size = (sector_t)rbd_dev->mapping.size / SECTOR_SIZE; 35689875201eSJosh Durgin dout("setting size to %llu sectors", (unsigned long long)size); 35699875201eSJosh Durgin set_capacity(rbd_dev->disk, size); 35709875201eSJosh Durgin revalidate_disk(rbd_dev->disk); 35719875201eSJosh Durgin } 35729875201eSJosh Durgin } 35739875201eSJosh Durgin 3574cc4a38bdSAlex Elder static int rbd_dev_refresh(struct rbd_device *rbd_dev) 35751fe5e993SAlex Elder { 3576e627db08SAlex Elder u64 mapping_size; 35771fe5e993SAlex Elder int ret; 35781fe5e993SAlex Elder 3579cfbf6377SAlex Elder down_write(&rbd_dev->header_rwsem); 35803b5cf2a2SAlex Elder mapping_size = rbd_dev->mapping.size; 3581a720ae09SIlya Dryomov 3582a720ae09SIlya Dryomov ret = rbd_dev_header_info(rbd_dev); 358352bb1f9bSIlya Dryomov if (ret) 358473e39e4dSIlya Dryomov goto out; 358515228edeSAlex Elder 3586e8f59b59SIlya Dryomov /* 3587e8f59b59SIlya Dryomov * If there is a parent, see if it has disappeared due to the 3588e8f59b59SIlya Dryomov * mapped image getting flattened. 
3589e8f59b59SIlya Dryomov */ 3590e8f59b59SIlya Dryomov if (rbd_dev->parent) { 3591e8f59b59SIlya Dryomov ret = rbd_dev_v2_parent_info(rbd_dev); 3592e8f59b59SIlya Dryomov if (ret) 359373e39e4dSIlya Dryomov goto out; 3594e8f59b59SIlya Dryomov } 3595e8f59b59SIlya Dryomov 35965ff1108cSIlya Dryomov if (rbd_dev->spec->snap_id == CEPH_NOSNAP) { 35975ff1108cSIlya Dryomov rbd_dev->mapping.size = rbd_dev->header.image_size; 35985ff1108cSIlya Dryomov } else { 35995ff1108cSIlya Dryomov /* validate mapped snapshot's EXISTS flag */ 360015228edeSAlex Elder rbd_exists_validate(rbd_dev); 36015ff1108cSIlya Dryomov } 36025ff1108cSIlya Dryomov 360373e39e4dSIlya Dryomov out: 3604cfbf6377SAlex Elder up_write(&rbd_dev->header_rwsem); 360573e39e4dSIlya Dryomov if (!ret && mapping_size != rbd_dev->mapping.size) 36069875201eSJosh Durgin rbd_dev_update_size(rbd_dev); 36071fe5e993SAlex Elder 360873e39e4dSIlya Dryomov return ret; 36091fe5e993SAlex Elder } 36101fe5e993SAlex Elder 36117ad18afaSChristoph Hellwig static int rbd_init_request(void *data, struct request *rq, 36127ad18afaSChristoph Hellwig unsigned int hctx_idx, unsigned int request_idx, 36137ad18afaSChristoph Hellwig unsigned int numa_node) 36147ad18afaSChristoph Hellwig { 36157ad18afaSChristoph Hellwig struct work_struct *work = blk_mq_rq_to_pdu(rq); 36167ad18afaSChristoph Hellwig 36177ad18afaSChristoph Hellwig INIT_WORK(work, rbd_queue_workfn); 36187ad18afaSChristoph Hellwig return 0; 36197ad18afaSChristoph Hellwig } 36207ad18afaSChristoph Hellwig 36217ad18afaSChristoph Hellwig static struct blk_mq_ops rbd_mq_ops = { 36227ad18afaSChristoph Hellwig .queue_rq = rbd_queue_rq, 36237ad18afaSChristoph Hellwig .map_queue = blk_mq_map_queue, 36247ad18afaSChristoph Hellwig .init_request = rbd_init_request, 36257ad18afaSChristoph Hellwig }; 36267ad18afaSChristoph Hellwig 3627602adf40SYehuda Sadeh static int rbd_init_disk(struct rbd_device *rbd_dev) 3628602adf40SYehuda Sadeh { 3629602adf40SYehuda Sadeh struct gendisk *disk; 3630602adf40SYehuda 
Sadeh struct request_queue *q; 3631593a9e7bSAlex Elder u64 segment_size; 36327ad18afaSChristoph Hellwig int err; 3633602adf40SYehuda Sadeh 3634602adf40SYehuda Sadeh /* create gendisk info */ 36357e513d43SIlya Dryomov disk = alloc_disk(single_major ? 36367e513d43SIlya Dryomov (1 << RBD_SINGLE_MAJOR_PART_SHIFT) : 36377e513d43SIlya Dryomov RBD_MINORS_PER_MAJOR); 3638602adf40SYehuda Sadeh if (!disk) 36391fcdb8aaSAlex Elder return -ENOMEM; 3640602adf40SYehuda Sadeh 3641f0f8cef5SAlex Elder snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d", 3642de71a297SAlex Elder rbd_dev->dev_id); 3643602adf40SYehuda Sadeh disk->major = rbd_dev->major; 3644dd82fff1SIlya Dryomov disk->first_minor = rbd_dev->minor; 36457e513d43SIlya Dryomov if (single_major) 36467e513d43SIlya Dryomov disk->flags |= GENHD_FL_EXT_DEVT; 3647602adf40SYehuda Sadeh disk->fops = &rbd_bd_ops; 3648602adf40SYehuda Sadeh disk->private_data = rbd_dev; 3649602adf40SYehuda Sadeh 36507ad18afaSChristoph Hellwig memset(&rbd_dev->tag_set, 0, sizeof(rbd_dev->tag_set)); 36517ad18afaSChristoph Hellwig rbd_dev->tag_set.ops = &rbd_mq_ops; 3652b5584180SIlya Dryomov rbd_dev->tag_set.queue_depth = rbd_dev->opts->queue_depth; 36537ad18afaSChristoph Hellwig rbd_dev->tag_set.numa_node = NUMA_NO_NODE; 3654b5584180SIlya Dryomov rbd_dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE; 36557ad18afaSChristoph Hellwig rbd_dev->tag_set.nr_hw_queues = 1; 36567ad18afaSChristoph Hellwig rbd_dev->tag_set.cmd_size = sizeof(struct work_struct); 36577ad18afaSChristoph Hellwig 36587ad18afaSChristoph Hellwig err = blk_mq_alloc_tag_set(&rbd_dev->tag_set); 36597ad18afaSChristoph Hellwig if (err) 3660602adf40SYehuda Sadeh goto out_disk; 3661029bcbd8SJosh Durgin 36627ad18afaSChristoph Hellwig q = blk_mq_init_queue(&rbd_dev->tag_set); 36637ad18afaSChristoph Hellwig if (IS_ERR(q)) { 36647ad18afaSChristoph Hellwig err = PTR_ERR(q); 36657ad18afaSChristoph Hellwig goto out_tag_set; 36667ad18afaSChristoph Hellwig } 
36677ad18afaSChristoph Hellwig 3668d8a2c89cSIlya Dryomov queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q); 3669d8a2c89cSIlya Dryomov /* QUEUE_FLAG_ADD_RANDOM is off by default for blk-mq */ 3670593a9e7bSAlex Elder 3671029bcbd8SJosh Durgin /* set io sizes to object size */ 3672593a9e7bSAlex Elder segment_size = rbd_obj_bytes(&rbd_dev->header); 3673593a9e7bSAlex Elder blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE); 36740d9fde4fSIlya Dryomov q->limits.max_sectors = queue_max_hw_sectors(q); 3675d3834fefSIlya Dryomov blk_queue_max_segments(q, segment_size / SECTOR_SIZE); 3676593a9e7bSAlex Elder blk_queue_max_segment_size(q, segment_size); 3677593a9e7bSAlex Elder blk_queue_io_min(q, segment_size); 3678593a9e7bSAlex Elder blk_queue_io_opt(q, segment_size); 3679029bcbd8SJosh Durgin 368090e98c52SGuangliang Zhao /* enable the discard support */ 368190e98c52SGuangliang Zhao queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q); 368290e98c52SGuangliang Zhao q->limits.discard_granularity = segment_size; 368390e98c52SGuangliang Zhao q->limits.discard_alignment = segment_size; 36842bb4cd5cSJens Axboe blk_queue_max_discard_sectors(q, segment_size / SECTOR_SIZE); 3685b76f8239SJosh Durgin q->limits.discard_zeroes_data = 1; 368690e98c52SGuangliang Zhao 3687bae818eeSRonny Hegewald if (!ceph_test_opt(rbd_dev->rbd_client->client, NOCRC)) 3688bae818eeSRonny Hegewald q->backing_dev_info.capabilities |= BDI_CAP_STABLE_WRITES; 3689bae818eeSRonny Hegewald 3690602adf40SYehuda Sadeh disk->queue = q; 3691602adf40SYehuda Sadeh 3692602adf40SYehuda Sadeh q->queuedata = rbd_dev; 3693602adf40SYehuda Sadeh 3694602adf40SYehuda Sadeh rbd_dev->disk = disk; 3695602adf40SYehuda Sadeh 3696602adf40SYehuda Sadeh return 0; 36977ad18afaSChristoph Hellwig out_tag_set: 36987ad18afaSChristoph Hellwig blk_mq_free_tag_set(&rbd_dev->tag_set); 3699602adf40SYehuda Sadeh out_disk: 3700602adf40SYehuda Sadeh put_disk(disk); 37017ad18afaSChristoph Hellwig return err; 3702602adf40SYehuda Sadeh } 3703602adf40SYehuda Sadeh 
3704dfc5606dSYehuda Sadeh /* 3705dfc5606dSYehuda Sadeh sysfs 3706dfc5606dSYehuda Sadeh */ 3707602adf40SYehuda Sadeh 3708593a9e7bSAlex Elder static struct rbd_device *dev_to_rbd_dev(struct device *dev) 3709593a9e7bSAlex Elder { 3710593a9e7bSAlex Elder return container_of(dev, struct rbd_device, dev); 3711593a9e7bSAlex Elder } 3712593a9e7bSAlex Elder 3713dfc5606dSYehuda Sadeh static ssize_t rbd_size_show(struct device *dev, 3714dfc5606dSYehuda Sadeh struct device_attribute *attr, char *buf) 3715602adf40SYehuda Sadeh { 3716593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 3717dfc5606dSYehuda Sadeh 3718fc71d833SAlex Elder return sprintf(buf, "%llu\n", 3719fc71d833SAlex Elder (unsigned long long)rbd_dev->mapping.size); 3720602adf40SYehuda Sadeh } 3721602adf40SYehuda Sadeh 372234b13184SAlex Elder /* 372334b13184SAlex Elder * Note this shows the features for whatever's mapped, which is not 372434b13184SAlex Elder * necessarily the base image. 372534b13184SAlex Elder */ 372634b13184SAlex Elder static ssize_t rbd_features_show(struct device *dev, 372734b13184SAlex Elder struct device_attribute *attr, char *buf) 372834b13184SAlex Elder { 372934b13184SAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 373034b13184SAlex Elder 373134b13184SAlex Elder return sprintf(buf, "0x%016llx\n", 373234b13184SAlex Elder (unsigned long long)rbd_dev->mapping.features); 373334b13184SAlex Elder } 373434b13184SAlex Elder 3735dfc5606dSYehuda Sadeh static ssize_t rbd_major_show(struct device *dev, 3736dfc5606dSYehuda Sadeh struct device_attribute *attr, char *buf) 3737602adf40SYehuda Sadeh { 3738593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 3739dfc5606dSYehuda Sadeh 3740fc71d833SAlex Elder if (rbd_dev->major) 3741dfc5606dSYehuda Sadeh return sprintf(buf, "%d\n", rbd_dev->major); 3742fc71d833SAlex Elder 3743fc71d833SAlex Elder return sprintf(buf, "(none)\n"); 3744dd82fff1SIlya Dryomov } 3745fc71d833SAlex Elder 3746dd82fff1SIlya Dryomov static 
ssize_t rbd_minor_show(struct device *dev, 3747dd82fff1SIlya Dryomov struct device_attribute *attr, char *buf) 3748dd82fff1SIlya Dryomov { 3749dd82fff1SIlya Dryomov struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 3750dd82fff1SIlya Dryomov 3751dd82fff1SIlya Dryomov return sprintf(buf, "%d\n", rbd_dev->minor); 3752dfc5606dSYehuda Sadeh } 3753dfc5606dSYehuda Sadeh 3754dfc5606dSYehuda Sadeh static ssize_t rbd_client_id_show(struct device *dev, 3755dfc5606dSYehuda Sadeh struct device_attribute *attr, char *buf) 3756dfc5606dSYehuda Sadeh { 3757593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 3758dfc5606dSYehuda Sadeh 37591dbb4399SAlex Elder return sprintf(buf, "client%lld\n", 3760033268a5SIlya Dryomov ceph_client_gid(rbd_dev->rbd_client->client)); 3761dfc5606dSYehuda Sadeh } 3762dfc5606dSYehuda Sadeh 3763dfc5606dSYehuda Sadeh static ssize_t rbd_pool_show(struct device *dev, 3764dfc5606dSYehuda Sadeh struct device_attribute *attr, char *buf) 3765dfc5606dSYehuda Sadeh { 3766593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 3767dfc5606dSYehuda Sadeh 37680d7dbfceSAlex Elder return sprintf(buf, "%s\n", rbd_dev->spec->pool_name); 3769dfc5606dSYehuda Sadeh } 3770dfc5606dSYehuda Sadeh 37719bb2f334SAlex Elder static ssize_t rbd_pool_id_show(struct device *dev, 37729bb2f334SAlex Elder struct device_attribute *attr, char *buf) 37739bb2f334SAlex Elder { 37749bb2f334SAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 37759bb2f334SAlex Elder 37760d7dbfceSAlex Elder return sprintf(buf, "%llu\n", 37770d7dbfceSAlex Elder (unsigned long long) rbd_dev->spec->pool_id); 37789bb2f334SAlex Elder } 37799bb2f334SAlex Elder 3780dfc5606dSYehuda Sadeh static ssize_t rbd_name_show(struct device *dev, 3781dfc5606dSYehuda Sadeh struct device_attribute *attr, char *buf) 3782dfc5606dSYehuda Sadeh { 3783593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 3784dfc5606dSYehuda Sadeh 3785a92ffdf8SAlex Elder if (rbd_dev->spec->image_name) 
37860d7dbfceSAlex Elder return sprintf(buf, "%s\n", rbd_dev->spec->image_name); 3787a92ffdf8SAlex Elder 3788a92ffdf8SAlex Elder return sprintf(buf, "(unknown)\n"); 3789dfc5606dSYehuda Sadeh } 3790dfc5606dSYehuda Sadeh 3791589d30e0SAlex Elder static ssize_t rbd_image_id_show(struct device *dev, 3792589d30e0SAlex Elder struct device_attribute *attr, char *buf) 3793589d30e0SAlex Elder { 3794589d30e0SAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 3795589d30e0SAlex Elder 37960d7dbfceSAlex Elder return sprintf(buf, "%s\n", rbd_dev->spec->image_id); 3797589d30e0SAlex Elder } 3798589d30e0SAlex Elder 379934b13184SAlex Elder /* 380034b13184SAlex Elder * Shows the name of the currently-mapped snapshot (or 380134b13184SAlex Elder * RBD_SNAP_HEAD_NAME for the base image). 380234b13184SAlex Elder */ 3803dfc5606dSYehuda Sadeh static ssize_t rbd_snap_show(struct device *dev, 3804dfc5606dSYehuda Sadeh struct device_attribute *attr, 3805dfc5606dSYehuda Sadeh char *buf) 3806dfc5606dSYehuda Sadeh { 3807593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 3808dfc5606dSYehuda Sadeh 38090d7dbfceSAlex Elder return sprintf(buf, "%s\n", rbd_dev->spec->snap_name); 3810dfc5606dSYehuda Sadeh } 3811dfc5606dSYehuda Sadeh 381286b00e0dSAlex Elder /* 3813ff96128fSIlya Dryomov * For a v2 image, shows the chain of parent images, separated by empty 3814ff96128fSIlya Dryomov * lines. For v1 images or if there is no parent, shows "(no parent 3815ff96128fSIlya Dryomov * image)". 
381686b00e0dSAlex Elder */ 381786b00e0dSAlex Elder static ssize_t rbd_parent_show(struct device *dev, 381886b00e0dSAlex Elder struct device_attribute *attr, 381986b00e0dSAlex Elder char *buf) 382086b00e0dSAlex Elder { 382186b00e0dSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 3822ff96128fSIlya Dryomov ssize_t count = 0; 382386b00e0dSAlex Elder 3824ff96128fSIlya Dryomov if (!rbd_dev->parent) 382586b00e0dSAlex Elder return sprintf(buf, "(no parent image)\n"); 382686b00e0dSAlex Elder 3827ff96128fSIlya Dryomov for ( ; rbd_dev->parent; rbd_dev = rbd_dev->parent) { 3828ff96128fSIlya Dryomov struct rbd_spec *spec = rbd_dev->parent_spec; 382986b00e0dSAlex Elder 3830ff96128fSIlya Dryomov count += sprintf(&buf[count], "%s" 3831ff96128fSIlya Dryomov "pool_id %llu\npool_name %s\n" 3832ff96128fSIlya Dryomov "image_id %s\nimage_name %s\n" 3833ff96128fSIlya Dryomov "snap_id %llu\nsnap_name %s\n" 3834ff96128fSIlya Dryomov "overlap %llu\n", 3835ff96128fSIlya Dryomov !count ? "" : "\n", /* first? 
*/ 3836ff96128fSIlya Dryomov spec->pool_id, spec->pool_name, 3837ff96128fSIlya Dryomov spec->image_id, spec->image_name ?: "(unknown)", 3838ff96128fSIlya Dryomov spec->snap_id, spec->snap_name, 3839ff96128fSIlya Dryomov rbd_dev->parent_overlap); 3840ff96128fSIlya Dryomov } 384186b00e0dSAlex Elder 384286b00e0dSAlex Elder return count; 384386b00e0dSAlex Elder } 384486b00e0dSAlex Elder 3845dfc5606dSYehuda Sadeh static ssize_t rbd_image_refresh(struct device *dev, 3846dfc5606dSYehuda Sadeh struct device_attribute *attr, 3847dfc5606dSYehuda Sadeh const char *buf, 3848dfc5606dSYehuda Sadeh size_t size) 3849dfc5606dSYehuda Sadeh { 3850593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 3851b813623aSAlex Elder int ret; 3852602adf40SYehuda Sadeh 3853cc4a38bdSAlex Elder ret = rbd_dev_refresh(rbd_dev); 3854e627db08SAlex Elder if (ret) 385552bb1f9bSIlya Dryomov return ret; 3856b813623aSAlex Elder 385752bb1f9bSIlya Dryomov return size; 3858dfc5606dSYehuda Sadeh } 3859602adf40SYehuda Sadeh 3860dfc5606dSYehuda Sadeh static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL); 386134b13184SAlex Elder static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL); 3862dfc5606dSYehuda Sadeh static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL); 3863dd82fff1SIlya Dryomov static DEVICE_ATTR(minor, S_IRUGO, rbd_minor_show, NULL); 3864dfc5606dSYehuda Sadeh static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL); 3865dfc5606dSYehuda Sadeh static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL); 38669bb2f334SAlex Elder static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL); 3867dfc5606dSYehuda Sadeh static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL); 3868589d30e0SAlex Elder static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL); 3869dfc5606dSYehuda Sadeh static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh); 3870dfc5606dSYehuda Sadeh static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL); 387186b00e0dSAlex Elder static 
DEVICE_ATTR(parent, S_IRUGO, rbd_parent_show, NULL); 3872dfc5606dSYehuda Sadeh 3873dfc5606dSYehuda Sadeh static struct attribute *rbd_attrs[] = { 3874dfc5606dSYehuda Sadeh &dev_attr_size.attr, 387534b13184SAlex Elder &dev_attr_features.attr, 3876dfc5606dSYehuda Sadeh &dev_attr_major.attr, 3877dd82fff1SIlya Dryomov &dev_attr_minor.attr, 3878dfc5606dSYehuda Sadeh &dev_attr_client_id.attr, 3879dfc5606dSYehuda Sadeh &dev_attr_pool.attr, 38809bb2f334SAlex Elder &dev_attr_pool_id.attr, 3881dfc5606dSYehuda Sadeh &dev_attr_name.attr, 3882589d30e0SAlex Elder &dev_attr_image_id.attr, 3883dfc5606dSYehuda Sadeh &dev_attr_current_snap.attr, 388486b00e0dSAlex Elder &dev_attr_parent.attr, 3885dfc5606dSYehuda Sadeh &dev_attr_refresh.attr, 3886dfc5606dSYehuda Sadeh NULL 3887dfc5606dSYehuda Sadeh }; 3888dfc5606dSYehuda Sadeh 3889dfc5606dSYehuda Sadeh static struct attribute_group rbd_attr_group = { 3890dfc5606dSYehuda Sadeh .attrs = rbd_attrs, 3891dfc5606dSYehuda Sadeh }; 3892dfc5606dSYehuda Sadeh 3893dfc5606dSYehuda Sadeh static const struct attribute_group *rbd_attr_groups[] = { 3894dfc5606dSYehuda Sadeh &rbd_attr_group, 3895dfc5606dSYehuda Sadeh NULL 3896dfc5606dSYehuda Sadeh }; 3897dfc5606dSYehuda Sadeh 38986cac4695SIlya Dryomov static void rbd_dev_release(struct device *dev); 3899dfc5606dSYehuda Sadeh 3900dfc5606dSYehuda Sadeh static struct device_type rbd_device_type = { 3901dfc5606dSYehuda Sadeh .name = "rbd", 3902dfc5606dSYehuda Sadeh .groups = rbd_attr_groups, 39036cac4695SIlya Dryomov .release = rbd_dev_release, 3904dfc5606dSYehuda Sadeh }; 3905dfc5606dSYehuda Sadeh 39068b8fb99cSAlex Elder static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec) 39078b8fb99cSAlex Elder { 39088b8fb99cSAlex Elder kref_get(&spec->kref); 39098b8fb99cSAlex Elder 39108b8fb99cSAlex Elder return spec; 39118b8fb99cSAlex Elder } 39128b8fb99cSAlex Elder 39138b8fb99cSAlex Elder static void rbd_spec_free(struct kref *kref); 39148b8fb99cSAlex Elder static void rbd_spec_put(struct rbd_spec *spec) 
39158b8fb99cSAlex Elder { 39168b8fb99cSAlex Elder if (spec) 39178b8fb99cSAlex Elder kref_put(&spec->kref, rbd_spec_free); 39188b8fb99cSAlex Elder } 39198b8fb99cSAlex Elder 39208b8fb99cSAlex Elder static struct rbd_spec *rbd_spec_alloc(void) 39218b8fb99cSAlex Elder { 39228b8fb99cSAlex Elder struct rbd_spec *spec; 39238b8fb99cSAlex Elder 39248b8fb99cSAlex Elder spec = kzalloc(sizeof (*spec), GFP_KERNEL); 39258b8fb99cSAlex Elder if (!spec) 39268b8fb99cSAlex Elder return NULL; 392704077599SIlya Dryomov 392804077599SIlya Dryomov spec->pool_id = CEPH_NOPOOL; 392904077599SIlya Dryomov spec->snap_id = CEPH_NOSNAP; 39308b8fb99cSAlex Elder kref_init(&spec->kref); 39318b8fb99cSAlex Elder 39328b8fb99cSAlex Elder return spec; 39338b8fb99cSAlex Elder } 39348b8fb99cSAlex Elder 39358b8fb99cSAlex Elder static void rbd_spec_free(struct kref *kref) 39368b8fb99cSAlex Elder { 39378b8fb99cSAlex Elder struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref); 39388b8fb99cSAlex Elder 39398b8fb99cSAlex Elder kfree(spec->pool_name); 39408b8fb99cSAlex Elder kfree(spec->image_id); 39418b8fb99cSAlex Elder kfree(spec->image_name); 39428b8fb99cSAlex Elder kfree(spec->snap_name); 39438b8fb99cSAlex Elder kfree(spec); 39448b8fb99cSAlex Elder } 39458b8fb99cSAlex Elder 39461643dfa4SIlya Dryomov static void rbd_dev_free(struct rbd_device *rbd_dev) 3947dd5ac32dSIlya Dryomov { 3948c41d13a3SIlya Dryomov ceph_oid_destroy(&rbd_dev->header_oid); 39496b6dddbeSIlya Dryomov ceph_oloc_destroy(&rbd_dev->header_oloc); 3950c41d13a3SIlya Dryomov 3951dd5ac32dSIlya Dryomov rbd_put_client(rbd_dev->rbd_client); 3952dd5ac32dSIlya Dryomov rbd_spec_put(rbd_dev->spec); 3953dd5ac32dSIlya Dryomov kfree(rbd_dev->opts); 3954dd5ac32dSIlya Dryomov kfree(rbd_dev); 39551643dfa4SIlya Dryomov } 39561643dfa4SIlya Dryomov 39571643dfa4SIlya Dryomov static void rbd_dev_release(struct device *dev) 39581643dfa4SIlya Dryomov { 39591643dfa4SIlya Dryomov struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 39601643dfa4SIlya Dryomov bool 
need_put = !!rbd_dev->opts; 39611643dfa4SIlya Dryomov 39621643dfa4SIlya Dryomov if (need_put) { 39631643dfa4SIlya Dryomov destroy_workqueue(rbd_dev->task_wq); 39641643dfa4SIlya Dryomov ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id); 39651643dfa4SIlya Dryomov } 39661643dfa4SIlya Dryomov 39671643dfa4SIlya Dryomov rbd_dev_free(rbd_dev); 3968dd5ac32dSIlya Dryomov 3969dd5ac32dSIlya Dryomov /* 3970dd5ac32dSIlya Dryomov * This is racy, but way better than putting module outside of 3971dd5ac32dSIlya Dryomov * the release callback. The race window is pretty small, so 3972dd5ac32dSIlya Dryomov * doing something similar to dm (dm-builtin.c) is overkill. 3973dd5ac32dSIlya Dryomov */ 3974dd5ac32dSIlya Dryomov if (need_put) 3975dd5ac32dSIlya Dryomov module_put(THIS_MODULE); 3976dd5ac32dSIlya Dryomov } 3977dd5ac32dSIlya Dryomov 39781643dfa4SIlya Dryomov static struct rbd_device *__rbd_dev_create(struct rbd_client *rbdc, 39791643dfa4SIlya Dryomov struct rbd_spec *spec) 3980c53d5893SAlex Elder { 3981c53d5893SAlex Elder struct rbd_device *rbd_dev; 3982c53d5893SAlex Elder 3983c53d5893SAlex Elder rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL); 3984c53d5893SAlex Elder if (!rbd_dev) 3985c53d5893SAlex Elder return NULL; 3986c53d5893SAlex Elder 3987c53d5893SAlex Elder spin_lock_init(&rbd_dev->lock); 3988c53d5893SAlex Elder INIT_LIST_HEAD(&rbd_dev->node); 3989c53d5893SAlex Elder init_rwsem(&rbd_dev->header_rwsem); 3990c53d5893SAlex Elder 3991c41d13a3SIlya Dryomov ceph_oid_init(&rbd_dev->header_oid); 3992922dab61SIlya Dryomov ceph_oloc_init(&rbd_dev->header_oloc); 3993c41d13a3SIlya Dryomov 3994dd5ac32dSIlya Dryomov rbd_dev->dev.bus = &rbd_bus_type; 3995dd5ac32dSIlya Dryomov rbd_dev->dev.type = &rbd_device_type; 3996dd5ac32dSIlya Dryomov rbd_dev->dev.parent = &rbd_root_dev; 3997dd5ac32dSIlya Dryomov device_initialize(&rbd_dev->dev); 3998dd5ac32dSIlya Dryomov 3999c53d5893SAlex Elder rbd_dev->rbd_client = rbdc; 4000d147543dSIlya Dryomov rbd_dev->spec = spec; 40010903e875SAlex Elder 
40027627151eSYan, Zheng rbd_dev->layout.stripe_unit = 1 << RBD_MAX_OBJ_ORDER; 40037627151eSYan, Zheng rbd_dev->layout.stripe_count = 1; 40047627151eSYan, Zheng rbd_dev->layout.object_size = 1 << RBD_MAX_OBJ_ORDER; 40057627151eSYan, Zheng rbd_dev->layout.pool_id = spec->pool_id; 400630c156d9SYan, Zheng RCU_INIT_POINTER(rbd_dev->layout.pool_ns, NULL); 40070903e875SAlex Elder 40081643dfa4SIlya Dryomov return rbd_dev; 40091643dfa4SIlya Dryomov } 40101643dfa4SIlya Dryomov 4011dd5ac32dSIlya Dryomov /* 40121643dfa4SIlya Dryomov * Create a mapping rbd_dev. 4013dd5ac32dSIlya Dryomov */ 40141643dfa4SIlya Dryomov static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc, 40151643dfa4SIlya Dryomov struct rbd_spec *spec, 40161643dfa4SIlya Dryomov struct rbd_options *opts) 40171643dfa4SIlya Dryomov { 40181643dfa4SIlya Dryomov struct rbd_device *rbd_dev; 40191643dfa4SIlya Dryomov 40201643dfa4SIlya Dryomov rbd_dev = __rbd_dev_create(rbdc, spec); 40211643dfa4SIlya Dryomov if (!rbd_dev) 40221643dfa4SIlya Dryomov return NULL; 40231643dfa4SIlya Dryomov 40241643dfa4SIlya Dryomov rbd_dev->opts = opts; 40251643dfa4SIlya Dryomov 40261643dfa4SIlya Dryomov /* get an id and fill in device name */ 40271643dfa4SIlya Dryomov rbd_dev->dev_id = ida_simple_get(&rbd_dev_id_ida, 0, 40281643dfa4SIlya Dryomov minor_to_rbd_dev_id(1 << MINORBITS), 40291643dfa4SIlya Dryomov GFP_KERNEL); 40301643dfa4SIlya Dryomov if (rbd_dev->dev_id < 0) 40311643dfa4SIlya Dryomov goto fail_rbd_dev; 40321643dfa4SIlya Dryomov 40331643dfa4SIlya Dryomov sprintf(rbd_dev->name, RBD_DRV_NAME "%d", rbd_dev->dev_id); 40341643dfa4SIlya Dryomov rbd_dev->task_wq = alloc_ordered_workqueue("%s-tasks", WQ_MEM_RECLAIM, 40351643dfa4SIlya Dryomov rbd_dev->name); 40361643dfa4SIlya Dryomov if (!rbd_dev->task_wq) 40371643dfa4SIlya Dryomov goto fail_dev_id; 40381643dfa4SIlya Dryomov 40391643dfa4SIlya Dryomov /* we have a ref from do_rbd_add() */ 4040dd5ac32dSIlya Dryomov __module_get(THIS_MODULE); 4041dd5ac32dSIlya Dryomov 
40421643dfa4SIlya Dryomov dout("%s rbd_dev %p dev_id %d\n", __func__, rbd_dev, rbd_dev->dev_id); 4043c53d5893SAlex Elder return rbd_dev; 40441643dfa4SIlya Dryomov 40451643dfa4SIlya Dryomov fail_dev_id: 40461643dfa4SIlya Dryomov ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id); 40471643dfa4SIlya Dryomov fail_rbd_dev: 40481643dfa4SIlya Dryomov rbd_dev_free(rbd_dev); 40491643dfa4SIlya Dryomov return NULL; 4050c53d5893SAlex Elder } 4051c53d5893SAlex Elder 4052c53d5893SAlex Elder static void rbd_dev_destroy(struct rbd_device *rbd_dev) 4053c53d5893SAlex Elder { 4054dd5ac32dSIlya Dryomov if (rbd_dev) 4055dd5ac32dSIlya Dryomov put_device(&rbd_dev->dev); 4056c53d5893SAlex Elder } 4057c53d5893SAlex Elder 4058dfc5606dSYehuda Sadeh /* 40599d475de5SAlex Elder * Get the size and object order for an image snapshot, or if 40609d475de5SAlex Elder * snap_id is CEPH_NOSNAP, gets this information for the base 40619d475de5SAlex Elder * image. 40629d475de5SAlex Elder */ 40639d475de5SAlex Elder static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id, 40649d475de5SAlex Elder u8 *order, u64 *snap_size) 40659d475de5SAlex Elder { 40669d475de5SAlex Elder __le64 snapid = cpu_to_le64(snap_id); 40679d475de5SAlex Elder int ret; 40689d475de5SAlex Elder struct { 40699d475de5SAlex Elder u8 order; 40709d475de5SAlex Elder __le64 size; 40719d475de5SAlex Elder } __attribute__ ((packed)) size_buf = { 0 }; 40729d475de5SAlex Elder 4073c41d13a3SIlya Dryomov ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name, 40749d475de5SAlex Elder "rbd", "get_size", 40754157976bSAlex Elder &snapid, sizeof (snapid), 4076e2a58ee5SAlex Elder &size_buf, sizeof (size_buf)); 407736be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 40789d475de5SAlex Elder if (ret < 0) 40799d475de5SAlex Elder return ret; 408057385b51SAlex Elder if (ret < sizeof (size_buf)) 408157385b51SAlex Elder return -ERANGE; 40829d475de5SAlex Elder 4083c3545579SJosh Durgin if (order) { 40849d475de5SAlex 
Elder *order = size_buf.order; 4085c3545579SJosh Durgin dout(" order %u", (unsigned int)*order); 4086c3545579SJosh Durgin } 40879d475de5SAlex Elder *snap_size = le64_to_cpu(size_buf.size); 40889d475de5SAlex Elder 4089c3545579SJosh Durgin dout(" snap_id 0x%016llx snap_size = %llu\n", 4090c3545579SJosh Durgin (unsigned long long)snap_id, 40919d475de5SAlex Elder (unsigned long long)*snap_size); 40929d475de5SAlex Elder 40939d475de5SAlex Elder return 0; 40949d475de5SAlex Elder } 40959d475de5SAlex Elder 40969d475de5SAlex Elder static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev) 40979d475de5SAlex Elder { 40989d475de5SAlex Elder return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP, 40999d475de5SAlex Elder &rbd_dev->header.obj_order, 41009d475de5SAlex Elder &rbd_dev->header.image_size); 41019d475de5SAlex Elder } 41029d475de5SAlex Elder 41031e130199SAlex Elder static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev) 41041e130199SAlex Elder { 41051e130199SAlex Elder void *reply_buf; 41061e130199SAlex Elder int ret; 41071e130199SAlex Elder void *p; 41081e130199SAlex Elder 41091e130199SAlex Elder reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL); 41101e130199SAlex Elder if (!reply_buf) 41111e130199SAlex Elder return -ENOMEM; 41121e130199SAlex Elder 4113c41d13a3SIlya Dryomov ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name, 41144157976bSAlex Elder "rbd", "get_object_prefix", NULL, 0, 4115e2a58ee5SAlex Elder reply_buf, RBD_OBJ_PREFIX_LEN_MAX); 411636be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 41171e130199SAlex Elder if (ret < 0) 41181e130199SAlex Elder goto out; 41191e130199SAlex Elder 41201e130199SAlex Elder p = reply_buf; 41211e130199SAlex Elder rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p, 412257385b51SAlex Elder p + ret, NULL, GFP_NOIO); 412357385b51SAlex Elder ret = 0; 41241e130199SAlex Elder 41251e130199SAlex Elder if (IS_ERR(rbd_dev->header.object_prefix)) { 41261e130199SAlex Elder ret = 
PTR_ERR(rbd_dev->header.object_prefix); 41271e130199SAlex Elder rbd_dev->header.object_prefix = NULL; 41281e130199SAlex Elder } else { 41291e130199SAlex Elder dout(" object_prefix = %s\n", rbd_dev->header.object_prefix); 41301e130199SAlex Elder } 41311e130199SAlex Elder out: 41321e130199SAlex Elder kfree(reply_buf); 41331e130199SAlex Elder 41341e130199SAlex Elder return ret; 41351e130199SAlex Elder } 41361e130199SAlex Elder 4137b1b5402aSAlex Elder static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id, 4138b1b5402aSAlex Elder u64 *snap_features) 4139b1b5402aSAlex Elder { 4140b1b5402aSAlex Elder __le64 snapid = cpu_to_le64(snap_id); 4141b1b5402aSAlex Elder struct { 4142b1b5402aSAlex Elder __le64 features; 4143b1b5402aSAlex Elder __le64 incompat; 41444157976bSAlex Elder } __attribute__ ((packed)) features_buf = { 0 }; 4145d3767f0fSIlya Dryomov u64 unsup; 4146b1b5402aSAlex Elder int ret; 4147b1b5402aSAlex Elder 4148c41d13a3SIlya Dryomov ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name, 4149b1b5402aSAlex Elder "rbd", "get_features", 41504157976bSAlex Elder &snapid, sizeof (snapid), 4151e2a58ee5SAlex Elder &features_buf, sizeof (features_buf)); 415236be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 4153b1b5402aSAlex Elder if (ret < 0) 4154b1b5402aSAlex Elder return ret; 415557385b51SAlex Elder if (ret < sizeof (features_buf)) 415657385b51SAlex Elder return -ERANGE; 4157d889140cSAlex Elder 4158d3767f0fSIlya Dryomov unsup = le64_to_cpu(features_buf.incompat) & ~RBD_FEATURES_SUPPORTED; 4159d3767f0fSIlya Dryomov if (unsup) { 4160d3767f0fSIlya Dryomov rbd_warn(rbd_dev, "image uses unsupported features: 0x%llx", 4161d3767f0fSIlya Dryomov unsup); 4162b8f5c6edSAlex Elder return -ENXIO; 4163d3767f0fSIlya Dryomov } 4164d889140cSAlex Elder 4165b1b5402aSAlex Elder *snap_features = le64_to_cpu(features_buf.features); 4166b1b5402aSAlex Elder 4167b1b5402aSAlex Elder dout(" snap_id 0x%016llx features = 0x%016llx incompat = 
0x%016llx\n", 4168b1b5402aSAlex Elder (unsigned long long)snap_id, 4169b1b5402aSAlex Elder (unsigned long long)*snap_features, 4170b1b5402aSAlex Elder (unsigned long long)le64_to_cpu(features_buf.incompat)); 4171b1b5402aSAlex Elder 4172b1b5402aSAlex Elder return 0; 4173b1b5402aSAlex Elder } 4174b1b5402aSAlex Elder 4175b1b5402aSAlex Elder static int rbd_dev_v2_features(struct rbd_device *rbd_dev) 4176b1b5402aSAlex Elder { 4177b1b5402aSAlex Elder return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP, 4178b1b5402aSAlex Elder &rbd_dev->header.features); 4179b1b5402aSAlex Elder } 4180b1b5402aSAlex Elder 418186b00e0dSAlex Elder static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev) 418286b00e0dSAlex Elder { 418386b00e0dSAlex Elder struct rbd_spec *parent_spec; 418486b00e0dSAlex Elder size_t size; 418586b00e0dSAlex Elder void *reply_buf = NULL; 418686b00e0dSAlex Elder __le64 snapid; 418786b00e0dSAlex Elder void *p; 418886b00e0dSAlex Elder void *end; 4189642a2537SAlex Elder u64 pool_id; 419086b00e0dSAlex Elder char *image_id; 41913b5cf2a2SAlex Elder u64 snap_id; 419286b00e0dSAlex Elder u64 overlap; 419386b00e0dSAlex Elder int ret; 419486b00e0dSAlex Elder 419586b00e0dSAlex Elder parent_spec = rbd_spec_alloc(); 419686b00e0dSAlex Elder if (!parent_spec) 419786b00e0dSAlex Elder return -ENOMEM; 419886b00e0dSAlex Elder 419986b00e0dSAlex Elder size = sizeof (__le64) + /* pool_id */ 420086b00e0dSAlex Elder sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX + /* image_id */ 420186b00e0dSAlex Elder sizeof (__le64) + /* snap_id */ 420286b00e0dSAlex Elder sizeof (__le64); /* overlap */ 420386b00e0dSAlex Elder reply_buf = kmalloc(size, GFP_KERNEL); 420486b00e0dSAlex Elder if (!reply_buf) { 420586b00e0dSAlex Elder ret = -ENOMEM; 420686b00e0dSAlex Elder goto out_err; 420786b00e0dSAlex Elder } 420886b00e0dSAlex Elder 42094d9b67cdSIlya Dryomov snapid = cpu_to_le64(rbd_dev->spec->snap_id); 4210c41d13a3SIlya Dryomov ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name, 421186b00e0dSAlex 
Elder "rbd", "get_parent", 42124157976bSAlex Elder &snapid, sizeof (snapid), 4213e2a58ee5SAlex Elder reply_buf, size); 421436be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 421586b00e0dSAlex Elder if (ret < 0) 421686b00e0dSAlex Elder goto out_err; 421786b00e0dSAlex Elder 421886b00e0dSAlex Elder p = reply_buf; 421957385b51SAlex Elder end = reply_buf + ret; 422057385b51SAlex Elder ret = -ERANGE; 4221642a2537SAlex Elder ceph_decode_64_safe(&p, end, pool_id, out_err); 4222392a9dadSAlex Elder if (pool_id == CEPH_NOPOOL) { 4223392a9dadSAlex Elder /* 4224392a9dadSAlex Elder * Either the parent never existed, or we have 4225392a9dadSAlex Elder * record of it but the image got flattened so it no 4226392a9dadSAlex Elder * longer has a parent. When the parent of a 4227392a9dadSAlex Elder * layered image disappears we immediately set the 4228392a9dadSAlex Elder * overlap to 0. The effect of this is that all new 4229392a9dadSAlex Elder * requests will be treated as if the image had no 4230392a9dadSAlex Elder * parent. 4231392a9dadSAlex Elder */ 4232392a9dadSAlex Elder if (rbd_dev->parent_overlap) { 4233392a9dadSAlex Elder rbd_dev->parent_overlap = 0; 4234392a9dadSAlex Elder rbd_dev_parent_put(rbd_dev); 4235392a9dadSAlex Elder pr_info("%s: clone image has been flattened\n", 4236392a9dadSAlex Elder rbd_dev->disk->disk_name); 4237392a9dadSAlex Elder } 4238392a9dadSAlex Elder 423986b00e0dSAlex Elder goto out; /* No parent? No problem. 
*/ 4240392a9dadSAlex Elder } 424186b00e0dSAlex Elder 42420903e875SAlex Elder /* The ceph file layout needs to fit pool id in 32 bits */ 42430903e875SAlex Elder 42440903e875SAlex Elder ret = -EIO; 4245642a2537SAlex Elder if (pool_id > (u64)U32_MAX) { 42469584d508SIlya Dryomov rbd_warn(NULL, "parent pool id too large (%llu > %u)", 4247642a2537SAlex Elder (unsigned long long)pool_id, U32_MAX); 424857385b51SAlex Elder goto out_err; 4249c0cd10dbSAlex Elder } 42500903e875SAlex Elder 4251979ed480SAlex Elder image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL); 425286b00e0dSAlex Elder if (IS_ERR(image_id)) { 425386b00e0dSAlex Elder ret = PTR_ERR(image_id); 425486b00e0dSAlex Elder goto out_err; 425586b00e0dSAlex Elder } 42563b5cf2a2SAlex Elder ceph_decode_64_safe(&p, end, snap_id, out_err); 425786b00e0dSAlex Elder ceph_decode_64_safe(&p, end, overlap, out_err); 425886b00e0dSAlex Elder 42593b5cf2a2SAlex Elder /* 42603b5cf2a2SAlex Elder * The parent won't change (except when the clone is 42613b5cf2a2SAlex Elder * flattened, already handled that). So we only need to 42623b5cf2a2SAlex Elder * record the parent spec we have not already done so. 42633b5cf2a2SAlex Elder */ 42643b5cf2a2SAlex Elder if (!rbd_dev->parent_spec) { 42653b5cf2a2SAlex Elder parent_spec->pool_id = pool_id; 42663b5cf2a2SAlex Elder parent_spec->image_id = image_id; 42673b5cf2a2SAlex Elder parent_spec->snap_id = snap_id; 426886b00e0dSAlex Elder rbd_dev->parent_spec = parent_spec; 426986b00e0dSAlex Elder parent_spec = NULL; /* rbd_dev now owns this */ 4270fbba11b3SIlya Dryomov } else { 4271fbba11b3SIlya Dryomov kfree(image_id); 42723b5cf2a2SAlex Elder } 42733b5cf2a2SAlex Elder 42743b5cf2a2SAlex Elder /* 4275cf32bd9cSIlya Dryomov * We always update the parent overlap. If it's zero we issue 4276cf32bd9cSIlya Dryomov * a warning, as we will proceed as if there was no parent. 
42773b5cf2a2SAlex Elder */ 42783b5cf2a2SAlex Elder if (!overlap) { 42793b5cf2a2SAlex Elder if (parent_spec) { 4280cf32bd9cSIlya Dryomov /* refresh, careful to warn just once */ 4281cf32bd9cSIlya Dryomov if (rbd_dev->parent_overlap) 4282cf32bd9cSIlya Dryomov rbd_warn(rbd_dev, 4283cf32bd9cSIlya Dryomov "clone now standalone (overlap became 0)"); 428470cf49cfSAlex Elder } else { 4285cf32bd9cSIlya Dryomov /* initial probe */ 4286cf32bd9cSIlya Dryomov rbd_warn(rbd_dev, "clone is standalone (overlap 0)"); 42873b5cf2a2SAlex Elder } 428870cf49cfSAlex Elder } 4289cf32bd9cSIlya Dryomov rbd_dev->parent_overlap = overlap; 4290cf32bd9cSIlya Dryomov 429186b00e0dSAlex Elder out: 429286b00e0dSAlex Elder ret = 0; 429386b00e0dSAlex Elder out_err: 429486b00e0dSAlex Elder kfree(reply_buf); 429586b00e0dSAlex Elder rbd_spec_put(parent_spec); 429686b00e0dSAlex Elder 429786b00e0dSAlex Elder return ret; 429886b00e0dSAlex Elder } 429986b00e0dSAlex Elder 4300cc070d59SAlex Elder static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev) 4301cc070d59SAlex Elder { 4302cc070d59SAlex Elder struct { 4303cc070d59SAlex Elder __le64 stripe_unit; 4304cc070d59SAlex Elder __le64 stripe_count; 4305cc070d59SAlex Elder } __attribute__ ((packed)) striping_info_buf = { 0 }; 4306cc070d59SAlex Elder size_t size = sizeof (striping_info_buf); 4307cc070d59SAlex Elder void *p; 4308cc070d59SAlex Elder u64 obj_size; 4309cc070d59SAlex Elder u64 stripe_unit; 4310cc070d59SAlex Elder u64 stripe_count; 4311cc070d59SAlex Elder int ret; 4312cc070d59SAlex Elder 4313c41d13a3SIlya Dryomov ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name, 4314cc070d59SAlex Elder "rbd", "get_stripe_unit_count", NULL, 0, 4315e2a58ee5SAlex Elder (char *)&striping_info_buf, size); 4316cc070d59SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 4317cc070d59SAlex Elder if (ret < 0) 4318cc070d59SAlex Elder return ret; 4319cc070d59SAlex Elder if (ret < size) 4320cc070d59SAlex Elder return -ERANGE; 
4321cc070d59SAlex Elder 4322cc070d59SAlex Elder /* 4323cc070d59SAlex Elder * We don't actually support the "fancy striping" feature 4324cc070d59SAlex Elder * (STRIPINGV2) yet, but if the striping sizes are the 4325cc070d59SAlex Elder * defaults the behavior is the same as before. So find 4326cc070d59SAlex Elder * out, and only fail if the image has non-default values. 4327cc070d59SAlex Elder */ 4328cc070d59SAlex Elder ret = -EINVAL; 4329cc070d59SAlex Elder obj_size = (u64)1 << rbd_dev->header.obj_order; 4330cc070d59SAlex Elder p = &striping_info_buf; 4331cc070d59SAlex Elder stripe_unit = ceph_decode_64(&p); 4332cc070d59SAlex Elder if (stripe_unit != obj_size) { 4333cc070d59SAlex Elder rbd_warn(rbd_dev, "unsupported stripe unit " 4334cc070d59SAlex Elder "(got %llu want %llu)", 4335cc070d59SAlex Elder stripe_unit, obj_size); 4336cc070d59SAlex Elder return -EINVAL; 4337cc070d59SAlex Elder } 4338cc070d59SAlex Elder stripe_count = ceph_decode_64(&p); 4339cc070d59SAlex Elder if (stripe_count != 1) { 4340cc070d59SAlex Elder rbd_warn(rbd_dev, "unsupported stripe count " 4341cc070d59SAlex Elder "(got %llu want 1)", stripe_count); 4342cc070d59SAlex Elder return -EINVAL; 4343cc070d59SAlex Elder } 4344500d0c0fSAlex Elder rbd_dev->header.stripe_unit = stripe_unit; 4345500d0c0fSAlex Elder rbd_dev->header.stripe_count = stripe_count; 4346cc070d59SAlex Elder 4347cc070d59SAlex Elder return 0; 4348cc070d59SAlex Elder } 4349cc070d59SAlex Elder 43509e15b77dSAlex Elder static char *rbd_dev_image_name(struct rbd_device *rbd_dev) 43519e15b77dSAlex Elder { 43529e15b77dSAlex Elder size_t image_id_size; 43539e15b77dSAlex Elder char *image_id; 43549e15b77dSAlex Elder void *p; 43559e15b77dSAlex Elder void *end; 43569e15b77dSAlex Elder size_t size; 43579e15b77dSAlex Elder void *reply_buf = NULL; 43589e15b77dSAlex Elder size_t len = 0; 43599e15b77dSAlex Elder char *image_name = NULL; 43609e15b77dSAlex Elder int ret; 43619e15b77dSAlex Elder 43629e15b77dSAlex Elder 
rbd_assert(!rbd_dev->spec->image_name); 43639e15b77dSAlex Elder 436469e7a02fSAlex Elder len = strlen(rbd_dev->spec->image_id); 436569e7a02fSAlex Elder image_id_size = sizeof (__le32) + len; 43669e15b77dSAlex Elder image_id = kmalloc(image_id_size, GFP_KERNEL); 43679e15b77dSAlex Elder if (!image_id) 43689e15b77dSAlex Elder return NULL; 43699e15b77dSAlex Elder 43709e15b77dSAlex Elder p = image_id; 43714157976bSAlex Elder end = image_id + image_id_size; 437269e7a02fSAlex Elder ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32)len); 43739e15b77dSAlex Elder 43749e15b77dSAlex Elder size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX; 43759e15b77dSAlex Elder reply_buf = kmalloc(size, GFP_KERNEL); 43769e15b77dSAlex Elder if (!reply_buf) 43779e15b77dSAlex Elder goto out; 43789e15b77dSAlex Elder 437936be9a76SAlex Elder ret = rbd_obj_method_sync(rbd_dev, RBD_DIRECTORY, 43809e15b77dSAlex Elder "rbd", "dir_get_name", 43819e15b77dSAlex Elder image_id, image_id_size, 4382e2a58ee5SAlex Elder reply_buf, size); 43839e15b77dSAlex Elder if (ret < 0) 43849e15b77dSAlex Elder goto out; 43859e15b77dSAlex Elder p = reply_buf; 4386f40eb349SAlex Elder end = reply_buf + ret; 4387f40eb349SAlex Elder 43889e15b77dSAlex Elder image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL); 43899e15b77dSAlex Elder if (IS_ERR(image_name)) 43909e15b77dSAlex Elder image_name = NULL; 43919e15b77dSAlex Elder else 43929e15b77dSAlex Elder dout("%s: name is %s len is %zd\n", __func__, image_name, len); 43939e15b77dSAlex Elder out: 43949e15b77dSAlex Elder kfree(reply_buf); 43959e15b77dSAlex Elder kfree(image_id); 43969e15b77dSAlex Elder 43979e15b77dSAlex Elder return image_name; 43989e15b77dSAlex Elder } 43999e15b77dSAlex Elder 44002ad3d716SAlex Elder static u64 rbd_v1_snap_id_by_name(struct rbd_device *rbd_dev, const char *name) 44012ad3d716SAlex Elder { 44022ad3d716SAlex Elder struct ceph_snap_context *snapc = rbd_dev->header.snapc; 44032ad3d716SAlex Elder const char *snap_name; 44042ad3d716SAlex 
Elder u32 which = 0; 44052ad3d716SAlex Elder 44062ad3d716SAlex Elder /* Skip over names until we find the one we are looking for */ 44072ad3d716SAlex Elder 44082ad3d716SAlex Elder snap_name = rbd_dev->header.snap_names; 44092ad3d716SAlex Elder while (which < snapc->num_snaps) { 44102ad3d716SAlex Elder if (!strcmp(name, snap_name)) 44112ad3d716SAlex Elder return snapc->snaps[which]; 44122ad3d716SAlex Elder snap_name += strlen(snap_name) + 1; 44132ad3d716SAlex Elder which++; 44142ad3d716SAlex Elder } 44152ad3d716SAlex Elder return CEPH_NOSNAP; 44162ad3d716SAlex Elder } 44172ad3d716SAlex Elder 44182ad3d716SAlex Elder static u64 rbd_v2_snap_id_by_name(struct rbd_device *rbd_dev, const char *name) 44192ad3d716SAlex Elder { 44202ad3d716SAlex Elder struct ceph_snap_context *snapc = rbd_dev->header.snapc; 44212ad3d716SAlex Elder u32 which; 44222ad3d716SAlex Elder bool found = false; 44232ad3d716SAlex Elder u64 snap_id; 44242ad3d716SAlex Elder 44252ad3d716SAlex Elder for (which = 0; !found && which < snapc->num_snaps; which++) { 44262ad3d716SAlex Elder const char *snap_name; 44272ad3d716SAlex Elder 44282ad3d716SAlex Elder snap_id = snapc->snaps[which]; 44292ad3d716SAlex Elder snap_name = rbd_dev_v2_snap_name(rbd_dev, snap_id); 4430efadc98aSJosh Durgin if (IS_ERR(snap_name)) { 4431efadc98aSJosh Durgin /* ignore no-longer existing snapshots */ 4432efadc98aSJosh Durgin if (PTR_ERR(snap_name) == -ENOENT) 4433efadc98aSJosh Durgin continue; 4434efadc98aSJosh Durgin else 44352ad3d716SAlex Elder break; 4436efadc98aSJosh Durgin } 44372ad3d716SAlex Elder found = !strcmp(name, snap_name); 44382ad3d716SAlex Elder kfree(snap_name); 44392ad3d716SAlex Elder } 44402ad3d716SAlex Elder return found ? snap_id : CEPH_NOSNAP; 44412ad3d716SAlex Elder } 44422ad3d716SAlex Elder 44432ad3d716SAlex Elder /* 44442ad3d716SAlex Elder * Assumes name is never RBD_SNAP_HEAD_NAME; returns CEPH_NOSNAP if 44452ad3d716SAlex Elder * no snapshot by that name is found, or if an error occurs. 
44462ad3d716SAlex Elder */ 44472ad3d716SAlex Elder static u64 rbd_snap_id_by_name(struct rbd_device *rbd_dev, const char *name) 44482ad3d716SAlex Elder { 44492ad3d716SAlex Elder if (rbd_dev->image_format == 1) 44502ad3d716SAlex Elder return rbd_v1_snap_id_by_name(rbd_dev, name); 44512ad3d716SAlex Elder 44522ad3d716SAlex Elder return rbd_v2_snap_id_by_name(rbd_dev, name); 44532ad3d716SAlex Elder } 44542ad3d716SAlex Elder 44559e15b77dSAlex Elder /* 445604077599SIlya Dryomov * An image being mapped will have everything but the snap id. 44579e15b77dSAlex Elder */ 445804077599SIlya Dryomov static int rbd_spec_fill_snap_id(struct rbd_device *rbd_dev) 445904077599SIlya Dryomov { 446004077599SIlya Dryomov struct rbd_spec *spec = rbd_dev->spec; 446104077599SIlya Dryomov 446204077599SIlya Dryomov rbd_assert(spec->pool_id != CEPH_NOPOOL && spec->pool_name); 446304077599SIlya Dryomov rbd_assert(spec->image_id && spec->image_name); 446404077599SIlya Dryomov rbd_assert(spec->snap_name); 446504077599SIlya Dryomov 446604077599SIlya Dryomov if (strcmp(spec->snap_name, RBD_SNAP_HEAD_NAME)) { 446704077599SIlya Dryomov u64 snap_id; 446804077599SIlya Dryomov 446904077599SIlya Dryomov snap_id = rbd_snap_id_by_name(rbd_dev, spec->snap_name); 447004077599SIlya Dryomov if (snap_id == CEPH_NOSNAP) 447104077599SIlya Dryomov return -ENOENT; 447204077599SIlya Dryomov 447304077599SIlya Dryomov spec->snap_id = snap_id; 447404077599SIlya Dryomov } else { 447504077599SIlya Dryomov spec->snap_id = CEPH_NOSNAP; 447604077599SIlya Dryomov } 447704077599SIlya Dryomov 447804077599SIlya Dryomov return 0; 447904077599SIlya Dryomov } 448004077599SIlya Dryomov 448104077599SIlya Dryomov /* 448204077599SIlya Dryomov * A parent image will have all ids but none of the names. 448304077599SIlya Dryomov * 448404077599SIlya Dryomov * All names in an rbd spec are dynamically allocated. It's OK if we 448504077599SIlya Dryomov * can't figure out the name for an image id. 
448604077599SIlya Dryomov */ 448704077599SIlya Dryomov static int rbd_spec_fill_names(struct rbd_device *rbd_dev) 44889e15b77dSAlex Elder { 44892e9f7f1cSAlex Elder struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 44902e9f7f1cSAlex Elder struct rbd_spec *spec = rbd_dev->spec; 44912e9f7f1cSAlex Elder const char *pool_name; 44922e9f7f1cSAlex Elder const char *image_name; 44932e9f7f1cSAlex Elder const char *snap_name; 44949e15b77dSAlex Elder int ret; 44959e15b77dSAlex Elder 449604077599SIlya Dryomov rbd_assert(spec->pool_id != CEPH_NOPOOL); 449704077599SIlya Dryomov rbd_assert(spec->image_id); 449804077599SIlya Dryomov rbd_assert(spec->snap_id != CEPH_NOSNAP); 44999e15b77dSAlex Elder 45002e9f7f1cSAlex Elder /* Get the pool name; we have to make our own copy of this */ 45019e15b77dSAlex Elder 45022e9f7f1cSAlex Elder pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, spec->pool_id); 45032e9f7f1cSAlex Elder if (!pool_name) { 45042e9f7f1cSAlex Elder rbd_warn(rbd_dev, "no pool with id %llu", spec->pool_id); 4505935dc89fSAlex Elder return -EIO; 4506935dc89fSAlex Elder } 45072e9f7f1cSAlex Elder pool_name = kstrdup(pool_name, GFP_KERNEL); 45082e9f7f1cSAlex Elder if (!pool_name) 45099e15b77dSAlex Elder return -ENOMEM; 45109e15b77dSAlex Elder 45119e15b77dSAlex Elder /* Fetch the image name; tolerate failure here */ 45129e15b77dSAlex Elder 45132e9f7f1cSAlex Elder image_name = rbd_dev_image_name(rbd_dev); 45142e9f7f1cSAlex Elder if (!image_name) 451506ecc6cbSAlex Elder rbd_warn(rbd_dev, "unable to get image name"); 45169e15b77dSAlex Elder 451704077599SIlya Dryomov /* Fetch the snapshot name */ 45189e15b77dSAlex Elder 45192e9f7f1cSAlex Elder snap_name = rbd_snap_name(rbd_dev, spec->snap_id); 4520da6a6b63SJosh Durgin if (IS_ERR(snap_name)) { 4521da6a6b63SJosh Durgin ret = PTR_ERR(snap_name); 45229e15b77dSAlex Elder goto out_err; 45232e9f7f1cSAlex Elder } 45242e9f7f1cSAlex Elder 45252e9f7f1cSAlex Elder spec->pool_name = pool_name; 45262e9f7f1cSAlex Elder 
spec->image_name = image_name; 45272e9f7f1cSAlex Elder spec->snap_name = snap_name; 45289e15b77dSAlex Elder 45299e15b77dSAlex Elder return 0; 453004077599SIlya Dryomov 45319e15b77dSAlex Elder out_err: 45322e9f7f1cSAlex Elder kfree(image_name); 45332e9f7f1cSAlex Elder kfree(pool_name); 45349e15b77dSAlex Elder return ret; 45359e15b77dSAlex Elder } 45369e15b77dSAlex Elder 4537cc4a38bdSAlex Elder static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev) 453835d489f9SAlex Elder { 453935d489f9SAlex Elder size_t size; 454035d489f9SAlex Elder int ret; 454135d489f9SAlex Elder void *reply_buf; 454235d489f9SAlex Elder void *p; 454335d489f9SAlex Elder void *end; 454435d489f9SAlex Elder u64 seq; 454535d489f9SAlex Elder u32 snap_count; 454635d489f9SAlex Elder struct ceph_snap_context *snapc; 454735d489f9SAlex Elder u32 i; 454835d489f9SAlex Elder 454935d489f9SAlex Elder /* 455035d489f9SAlex Elder * We'll need room for the seq value (maximum snapshot id), 455135d489f9SAlex Elder * snapshot count, and array of that many snapshot ids. 455235d489f9SAlex Elder * For now we have a fixed upper limit on the number we're 455335d489f9SAlex Elder * prepared to receive. 
455435d489f9SAlex Elder */ 455535d489f9SAlex Elder size = sizeof (__le64) + sizeof (__le32) + 455635d489f9SAlex Elder RBD_MAX_SNAP_COUNT * sizeof (__le64); 455735d489f9SAlex Elder reply_buf = kzalloc(size, GFP_KERNEL); 455835d489f9SAlex Elder if (!reply_buf) 455935d489f9SAlex Elder return -ENOMEM; 456035d489f9SAlex Elder 4561c41d13a3SIlya Dryomov ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name, 45624157976bSAlex Elder "rbd", "get_snapcontext", NULL, 0, 4563e2a58ee5SAlex Elder reply_buf, size); 456436be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 456535d489f9SAlex Elder if (ret < 0) 456635d489f9SAlex Elder goto out; 456735d489f9SAlex Elder 456835d489f9SAlex Elder p = reply_buf; 456957385b51SAlex Elder end = reply_buf + ret; 457057385b51SAlex Elder ret = -ERANGE; 457135d489f9SAlex Elder ceph_decode_64_safe(&p, end, seq, out); 457235d489f9SAlex Elder ceph_decode_32_safe(&p, end, snap_count, out); 457335d489f9SAlex Elder 457435d489f9SAlex Elder /* 457535d489f9SAlex Elder * Make sure the reported number of snapshot ids wouldn't go 457635d489f9SAlex Elder * beyond the end of our buffer. But before checking that, 457735d489f9SAlex Elder * make sure the computed size of the snapshot context we 457835d489f9SAlex Elder * allocate is representable in a size_t. 
457935d489f9SAlex Elder */ 458035d489f9SAlex Elder if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context)) 458135d489f9SAlex Elder / sizeof (u64)) { 458235d489f9SAlex Elder ret = -EINVAL; 458335d489f9SAlex Elder goto out; 458435d489f9SAlex Elder } 458535d489f9SAlex Elder if (!ceph_has_room(&p, end, snap_count * sizeof (__le64))) 458635d489f9SAlex Elder goto out; 4587468521c1SAlex Elder ret = 0; 458835d489f9SAlex Elder 4589812164f8SAlex Elder snapc = ceph_create_snap_context(snap_count, GFP_KERNEL); 459035d489f9SAlex Elder if (!snapc) { 459135d489f9SAlex Elder ret = -ENOMEM; 459235d489f9SAlex Elder goto out; 459335d489f9SAlex Elder } 459435d489f9SAlex Elder snapc->seq = seq; 459535d489f9SAlex Elder for (i = 0; i < snap_count; i++) 459635d489f9SAlex Elder snapc->snaps[i] = ceph_decode_64(&p); 459735d489f9SAlex Elder 459849ece554SAlex Elder ceph_put_snap_context(rbd_dev->header.snapc); 459935d489f9SAlex Elder rbd_dev->header.snapc = snapc; 460035d489f9SAlex Elder 460135d489f9SAlex Elder dout(" snap context seq = %llu, snap_count = %u\n", 460235d489f9SAlex Elder (unsigned long long)seq, (unsigned int)snap_count); 460335d489f9SAlex Elder out: 460435d489f9SAlex Elder kfree(reply_buf); 460535d489f9SAlex Elder 460657385b51SAlex Elder return ret; 460735d489f9SAlex Elder } 460835d489f9SAlex Elder 460954cac61fSAlex Elder static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, 461054cac61fSAlex Elder u64 snap_id) 4611b8b1e2dbSAlex Elder { 4612b8b1e2dbSAlex Elder size_t size; 4613b8b1e2dbSAlex Elder void *reply_buf; 461454cac61fSAlex Elder __le64 snapid; 4615b8b1e2dbSAlex Elder int ret; 4616b8b1e2dbSAlex Elder void *p; 4617b8b1e2dbSAlex Elder void *end; 4618b8b1e2dbSAlex Elder char *snap_name; 4619b8b1e2dbSAlex Elder 4620b8b1e2dbSAlex Elder size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN; 4621b8b1e2dbSAlex Elder reply_buf = kmalloc(size, GFP_KERNEL); 4622b8b1e2dbSAlex Elder if (!reply_buf) 4623b8b1e2dbSAlex Elder return ERR_PTR(-ENOMEM); 4624b8b1e2dbSAlex 
Elder 462554cac61fSAlex Elder snapid = cpu_to_le64(snap_id); 4626c41d13a3SIlya Dryomov ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name, 4627b8b1e2dbSAlex Elder "rbd", "get_snapshot_name", 462854cac61fSAlex Elder &snapid, sizeof (snapid), 4629e2a58ee5SAlex Elder reply_buf, size); 463036be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 4631f40eb349SAlex Elder if (ret < 0) { 4632f40eb349SAlex Elder snap_name = ERR_PTR(ret); 4633b8b1e2dbSAlex Elder goto out; 4634f40eb349SAlex Elder } 4635b8b1e2dbSAlex Elder 4636b8b1e2dbSAlex Elder p = reply_buf; 4637f40eb349SAlex Elder end = reply_buf + ret; 4638e5c35534SAlex Elder snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL); 4639f40eb349SAlex Elder if (IS_ERR(snap_name)) 4640b8b1e2dbSAlex Elder goto out; 4641f40eb349SAlex Elder 4642b8b1e2dbSAlex Elder dout(" snap_id 0x%016llx snap_name = %s\n", 464354cac61fSAlex Elder (unsigned long long)snap_id, snap_name); 4644b8b1e2dbSAlex Elder out: 4645b8b1e2dbSAlex Elder kfree(reply_buf); 4646b8b1e2dbSAlex Elder 4647f40eb349SAlex Elder return snap_name; 4648b8b1e2dbSAlex Elder } 4649b8b1e2dbSAlex Elder 46502df3fac7SAlex Elder static int rbd_dev_v2_header_info(struct rbd_device *rbd_dev) 4651117973fbSAlex Elder { 46522df3fac7SAlex Elder bool first_time = rbd_dev->header.object_prefix == NULL; 4653117973fbSAlex Elder int ret; 4654117973fbSAlex Elder 46551617e40cSJosh Durgin ret = rbd_dev_v2_image_size(rbd_dev); 46561617e40cSJosh Durgin if (ret) 4657cfbf6377SAlex Elder return ret; 46581617e40cSJosh Durgin 46592df3fac7SAlex Elder if (first_time) { 46602df3fac7SAlex Elder ret = rbd_dev_v2_header_onetime(rbd_dev); 46612df3fac7SAlex Elder if (ret) 4662cfbf6377SAlex Elder return ret; 46632df3fac7SAlex Elder } 46642df3fac7SAlex Elder 4665cc4a38bdSAlex Elder ret = rbd_dev_v2_snap_context(rbd_dev); 4666d194cd1dSIlya Dryomov if (ret && first_time) { 4667d194cd1dSIlya Dryomov kfree(rbd_dev->header.object_prefix); 4668d194cd1dSIlya Dryomov 
rbd_dev->header.object_prefix = NULL; 4669d194cd1dSIlya Dryomov } 4670117973fbSAlex Elder 4671117973fbSAlex Elder return ret; 4672117973fbSAlex Elder } 4673117973fbSAlex Elder 4674a720ae09SIlya Dryomov static int rbd_dev_header_info(struct rbd_device *rbd_dev) 4675a720ae09SIlya Dryomov { 4676a720ae09SIlya Dryomov rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); 4677a720ae09SIlya Dryomov 4678a720ae09SIlya Dryomov if (rbd_dev->image_format == 1) 4679a720ae09SIlya Dryomov return rbd_dev_v1_header_info(rbd_dev); 4680a720ae09SIlya Dryomov 4681a720ae09SIlya Dryomov return rbd_dev_v2_header_info(rbd_dev); 4682a720ae09SIlya Dryomov } 4683a720ae09SIlya Dryomov 46841ddbe94eSAlex Elder /* 4685e28fff26SAlex Elder * Skips over white space at *buf, and updates *buf to point to the 4686e28fff26SAlex Elder * first found non-space character (if any). Returns the length of 4687593a9e7bSAlex Elder * the token (string of non-white space characters) found. Note 4688593a9e7bSAlex Elder * that *buf must be terminated with '\0'. 4689e28fff26SAlex Elder */ 4690e28fff26SAlex Elder static inline size_t next_token(const char **buf) 4691e28fff26SAlex Elder { 4692e28fff26SAlex Elder /* 4693e28fff26SAlex Elder * These are the characters that produce nonzero for 4694e28fff26SAlex Elder * isspace() in the "C" and "POSIX" locales. 4695e28fff26SAlex Elder */ 4696e28fff26SAlex Elder const char *spaces = " \f\n\r\t\v"; 4697e28fff26SAlex Elder 4698e28fff26SAlex Elder *buf += strspn(*buf, spaces); /* Find start of token */ 4699e28fff26SAlex Elder 4700e28fff26SAlex Elder return strcspn(*buf, spaces); /* Return token length */ 4701e28fff26SAlex Elder } 4702e28fff26SAlex Elder 4703e28fff26SAlex Elder /* 4704ea3352f4SAlex Elder * Finds the next token in *buf, dynamically allocates a buffer big 4705ea3352f4SAlex Elder * enough to hold a copy of it, and copies the token into the new 4706ea3352f4SAlex Elder * buffer. The copy is guaranteed to be terminated with '\0'. 
Note 4707ea3352f4SAlex Elder * that a duplicate buffer is created even for a zero-length token. 4708ea3352f4SAlex Elder * 4709ea3352f4SAlex Elder * Returns a pointer to the newly-allocated duplicate, or a null 4710ea3352f4SAlex Elder * pointer if memory for the duplicate was not available. If 4711ea3352f4SAlex Elder * the lenp argument is a non-null pointer, the length of the token 4712ea3352f4SAlex Elder * (not including the '\0') is returned in *lenp. 4713ea3352f4SAlex Elder * 4714ea3352f4SAlex Elder * If successful, the *buf pointer will be updated to point beyond 4715ea3352f4SAlex Elder * the end of the found token. 4716ea3352f4SAlex Elder * 4717ea3352f4SAlex Elder * Note: uses GFP_KERNEL for allocation. 4718ea3352f4SAlex Elder */ 4719ea3352f4SAlex Elder static inline char *dup_token(const char **buf, size_t *lenp) 4720ea3352f4SAlex Elder { 4721ea3352f4SAlex Elder char *dup; 4722ea3352f4SAlex Elder size_t len; 4723ea3352f4SAlex Elder 4724ea3352f4SAlex Elder len = next_token(buf); 47254caf35f9SAlex Elder dup = kmemdup(*buf, len + 1, GFP_KERNEL); 4726ea3352f4SAlex Elder if (!dup) 4727ea3352f4SAlex Elder return NULL; 4728ea3352f4SAlex Elder *(dup + len) = '\0'; 4729ea3352f4SAlex Elder *buf += len; 4730ea3352f4SAlex Elder 4731ea3352f4SAlex Elder if (lenp) 4732ea3352f4SAlex Elder *lenp = len; 4733ea3352f4SAlex Elder 4734ea3352f4SAlex Elder return dup; 4735ea3352f4SAlex Elder } 4736ea3352f4SAlex Elder 4737ea3352f4SAlex Elder /* 4738859c31dfSAlex Elder * Parse the options provided for an "rbd add" (i.e., rbd image 4739859c31dfSAlex Elder * mapping) request. These arrive via a write to /sys/bus/rbd/add, 4740859c31dfSAlex Elder * and the data written is passed here via a NUL-terminated buffer. 4741859c31dfSAlex Elder * Returns 0 if successful or an error code otherwise. 
4742d22f76e7SAlex Elder * 4743859c31dfSAlex Elder * The information extracted from these options is recorded in 4744859c31dfSAlex Elder * the other parameters which return dynamically-allocated 4745859c31dfSAlex Elder * structures: 4746859c31dfSAlex Elder * ceph_opts 4747859c31dfSAlex Elder * The address of a pointer that will refer to a ceph options 4748859c31dfSAlex Elder * structure. Caller must release the returned pointer using 4749859c31dfSAlex Elder * ceph_destroy_options() when it is no longer needed. 4750859c31dfSAlex Elder * rbd_opts 4751859c31dfSAlex Elder * Address of an rbd options pointer. Fully initialized by 4752859c31dfSAlex Elder * this function; caller must release with kfree(). 4753859c31dfSAlex Elder * spec 4754859c31dfSAlex Elder * Address of an rbd image specification pointer. Fully 4755859c31dfSAlex Elder * initialized by this function based on parsed options. 4756859c31dfSAlex Elder * Caller must release with rbd_spec_put(). 4757859c31dfSAlex Elder * 4758859c31dfSAlex Elder * The options passed take this form: 4759859c31dfSAlex Elder * <mon_addrs> <options> <pool_name> <image_name> [<snap_id>] 4760859c31dfSAlex Elder * where: 4761859c31dfSAlex Elder * <mon_addrs> 4762859c31dfSAlex Elder * A comma-separated list of one or more monitor addresses. 4763859c31dfSAlex Elder * A monitor address is an ip address, optionally followed 4764859c31dfSAlex Elder * by a port number (separated by a colon). 4765859c31dfSAlex Elder * I.e.: ip1[:port1][,ip2[:port2]...] 4766859c31dfSAlex Elder * <options> 4767859c31dfSAlex Elder * A comma-separated list of ceph and/or rbd options. 4768859c31dfSAlex Elder * <pool_name> 4769859c31dfSAlex Elder * The name of the rados pool containing the rbd image. 4770859c31dfSAlex Elder * <image_name> 4771859c31dfSAlex Elder * The name of the image in that pool to map. 4772859c31dfSAlex Elder * <snap_id> 4773859c31dfSAlex Elder * An optional snapshot id. 
If provided, the mapping will 4774859c31dfSAlex Elder * present data from the image at the time that snapshot was 4775859c31dfSAlex Elder * created. The image head is used if no snapshot id is 4776859c31dfSAlex Elder * provided. Snapshot mappings are always read-only. 4777a725f65eSAlex Elder */ 4778859c31dfSAlex Elder static int rbd_add_parse_args(const char *buf, 4779dc79b113SAlex Elder struct ceph_options **ceph_opts, 4780859c31dfSAlex Elder struct rbd_options **opts, 4781859c31dfSAlex Elder struct rbd_spec **rbd_spec) 4782a725f65eSAlex Elder { 4783e28fff26SAlex Elder size_t len; 4784859c31dfSAlex Elder char *options; 47850ddebc0cSAlex Elder const char *mon_addrs; 4786ecb4dc22SAlex Elder char *snap_name; 47870ddebc0cSAlex Elder size_t mon_addrs_size; 4788859c31dfSAlex Elder struct rbd_spec *spec = NULL; 47894e9afebaSAlex Elder struct rbd_options *rbd_opts = NULL; 4790859c31dfSAlex Elder struct ceph_options *copts; 4791dc79b113SAlex Elder int ret; 4792e28fff26SAlex Elder 4793e28fff26SAlex Elder /* The first four tokens are required */ 4794e28fff26SAlex Elder 47957ef3214aSAlex Elder len = next_token(&buf); 47964fb5d671SAlex Elder if (!len) { 47974fb5d671SAlex Elder rbd_warn(NULL, "no monitor address(es) provided"); 47984fb5d671SAlex Elder return -EINVAL; 47994fb5d671SAlex Elder } 48000ddebc0cSAlex Elder mon_addrs = buf; 4801f28e565aSAlex Elder mon_addrs_size = len + 1; 48027ef3214aSAlex Elder buf += len; 4803a725f65eSAlex Elder 4804dc79b113SAlex Elder ret = -EINVAL; 4805f28e565aSAlex Elder options = dup_token(&buf, NULL); 4806f28e565aSAlex Elder if (!options) 4807dc79b113SAlex Elder return -ENOMEM; 48084fb5d671SAlex Elder if (!*options) { 48094fb5d671SAlex Elder rbd_warn(NULL, "no options provided"); 48104fb5d671SAlex Elder goto out_err; 48114fb5d671SAlex Elder } 4812a725f65eSAlex Elder 4813859c31dfSAlex Elder spec = rbd_spec_alloc(); 4814859c31dfSAlex Elder if (!spec) 4815f28e565aSAlex Elder goto out_mem; 4816859c31dfSAlex Elder 4817859c31dfSAlex Elder 
spec->pool_name = dup_token(&buf, NULL); 4818859c31dfSAlex Elder if (!spec->pool_name) 4819859c31dfSAlex Elder goto out_mem; 48204fb5d671SAlex Elder if (!*spec->pool_name) { 48214fb5d671SAlex Elder rbd_warn(NULL, "no pool name provided"); 48224fb5d671SAlex Elder goto out_err; 48234fb5d671SAlex Elder } 4824e28fff26SAlex Elder 482569e7a02fSAlex Elder spec->image_name = dup_token(&buf, NULL); 4826859c31dfSAlex Elder if (!spec->image_name) 4827f28e565aSAlex Elder goto out_mem; 48284fb5d671SAlex Elder if (!*spec->image_name) { 48294fb5d671SAlex Elder rbd_warn(NULL, "no image name provided"); 48304fb5d671SAlex Elder goto out_err; 48314fb5d671SAlex Elder } 4832e28fff26SAlex Elder 4833f28e565aSAlex Elder /* 4834f28e565aSAlex Elder * Snapshot name is optional; default is to use "-" 4835f28e565aSAlex Elder * (indicating the head/no snapshot). 4836f28e565aSAlex Elder */ 48373feeb894SAlex Elder len = next_token(&buf); 4838820a5f3eSAlex Elder if (!len) { 48393feeb894SAlex Elder buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */ 48403feeb894SAlex Elder len = sizeof (RBD_SNAP_HEAD_NAME) - 1; 4841f28e565aSAlex Elder } else if (len > RBD_MAX_SNAP_NAME_LEN) { 4842dc79b113SAlex Elder ret = -ENAMETOOLONG; 4843f28e565aSAlex Elder goto out_err; 4844849b4260SAlex Elder } 4845ecb4dc22SAlex Elder snap_name = kmemdup(buf, len + 1, GFP_KERNEL); 4846ecb4dc22SAlex Elder if (!snap_name) 4847f28e565aSAlex Elder goto out_mem; 4848ecb4dc22SAlex Elder *(snap_name + len) = '\0'; 4849ecb4dc22SAlex Elder spec->snap_name = snap_name; 4850e5c35534SAlex Elder 48510ddebc0cSAlex Elder /* Initialize all rbd options to the defaults */ 4852e28fff26SAlex Elder 48534e9afebaSAlex Elder rbd_opts = kzalloc(sizeof (*rbd_opts), GFP_KERNEL); 48544e9afebaSAlex Elder if (!rbd_opts) 48554e9afebaSAlex Elder goto out_mem; 48564e9afebaSAlex Elder 48574e9afebaSAlex Elder rbd_opts->read_only = RBD_READ_ONLY_DEFAULT; 4858b5584180SIlya Dryomov rbd_opts->queue_depth = RBD_QUEUE_DEPTH_DEFAULT; 4859d22f76e7SAlex Elder 
4860859c31dfSAlex Elder copts = ceph_parse_options(options, mon_addrs, 48610ddebc0cSAlex Elder mon_addrs + mon_addrs_size - 1, 48624e9afebaSAlex Elder parse_rbd_opts_token, rbd_opts); 4863859c31dfSAlex Elder if (IS_ERR(copts)) { 4864859c31dfSAlex Elder ret = PTR_ERR(copts); 4865dc79b113SAlex Elder goto out_err; 4866dc79b113SAlex Elder } 4867859c31dfSAlex Elder kfree(options); 4868859c31dfSAlex Elder 4869859c31dfSAlex Elder *ceph_opts = copts; 48704e9afebaSAlex Elder *opts = rbd_opts; 4871859c31dfSAlex Elder *rbd_spec = spec; 48720ddebc0cSAlex Elder 4873dc79b113SAlex Elder return 0; 4874f28e565aSAlex Elder out_mem: 4875dc79b113SAlex Elder ret = -ENOMEM; 4876d22f76e7SAlex Elder out_err: 4877859c31dfSAlex Elder kfree(rbd_opts); 4878859c31dfSAlex Elder rbd_spec_put(spec); 4879f28e565aSAlex Elder kfree(options); 4880d22f76e7SAlex Elder 4881dc79b113SAlex Elder return ret; 4882a725f65eSAlex Elder } 4883a725f65eSAlex Elder 4884589d30e0SAlex Elder /* 488530ba1f02SIlya Dryomov * Return pool id (>= 0) or a negative error code. 
488630ba1f02SIlya Dryomov */ 488730ba1f02SIlya Dryomov static int rbd_add_get_pool_id(struct rbd_client *rbdc, const char *pool_name) 488830ba1f02SIlya Dryomov { 4889a319bf56SIlya Dryomov struct ceph_options *opts = rbdc->client->options; 489030ba1f02SIlya Dryomov u64 newest_epoch; 489130ba1f02SIlya Dryomov int tries = 0; 489230ba1f02SIlya Dryomov int ret; 489330ba1f02SIlya Dryomov 489430ba1f02SIlya Dryomov again: 489530ba1f02SIlya Dryomov ret = ceph_pg_poolid_by_name(rbdc->client->osdc.osdmap, pool_name); 489630ba1f02SIlya Dryomov if (ret == -ENOENT && tries++ < 1) { 4897d0b19705SIlya Dryomov ret = ceph_monc_get_version(&rbdc->client->monc, "osdmap", 489830ba1f02SIlya Dryomov &newest_epoch); 489930ba1f02SIlya Dryomov if (ret < 0) 490030ba1f02SIlya Dryomov return ret; 490130ba1f02SIlya Dryomov 490230ba1f02SIlya Dryomov if (rbdc->client->osdc.osdmap->epoch < newest_epoch) { 49037cca78c9SIlya Dryomov ceph_osdc_maybe_request_map(&rbdc->client->osdc); 490430ba1f02SIlya Dryomov (void) ceph_monc_wait_osdmap(&rbdc->client->monc, 4905a319bf56SIlya Dryomov newest_epoch, 4906a319bf56SIlya Dryomov opts->mount_timeout); 490730ba1f02SIlya Dryomov goto again; 490830ba1f02SIlya Dryomov } else { 490930ba1f02SIlya Dryomov /* the osdmap we have is new enough */ 491030ba1f02SIlya Dryomov return -ENOENT; 491130ba1f02SIlya Dryomov } 491230ba1f02SIlya Dryomov } 491330ba1f02SIlya Dryomov 491430ba1f02SIlya Dryomov return ret; 491530ba1f02SIlya Dryomov } 491630ba1f02SIlya Dryomov 491730ba1f02SIlya Dryomov /* 4918589d30e0SAlex Elder * An rbd format 2 image has a unique identifier, distinct from the 4919589d30e0SAlex Elder * name given to it by the user. Internally, that identifier is 4920589d30e0SAlex Elder * what's used to specify the names of objects related to the image. 4921589d30e0SAlex Elder * 4922589d30e0SAlex Elder * A special "rbd id" object is used to map an rbd image name to its 4923589d30e0SAlex Elder * id. 
If that object doesn't exist, then there is no v2 rbd image 4924589d30e0SAlex Elder * with the supplied name. 4925589d30e0SAlex Elder * 4926589d30e0SAlex Elder * This function will record the given rbd_dev's image_id field if 4927589d30e0SAlex Elder * it can be determined, and in that case will return 0. If any 4928589d30e0SAlex Elder * errors occur a negative errno will be returned and the rbd_dev's 4929589d30e0SAlex Elder * image_id field will be unchanged (and should be NULL). 4930589d30e0SAlex Elder */ 4931589d30e0SAlex Elder static int rbd_dev_image_id(struct rbd_device *rbd_dev) 4932589d30e0SAlex Elder { 4933589d30e0SAlex Elder int ret; 4934589d30e0SAlex Elder size_t size; 4935589d30e0SAlex Elder char *object_name; 4936589d30e0SAlex Elder void *response; 4937c0fba368SAlex Elder char *image_id; 49382f82ee54SAlex Elder 4939589d30e0SAlex Elder /* 49402c0d0a10SAlex Elder * When probing a parent image, the image id is already 49412c0d0a10SAlex Elder * known (and the image name likely is not). There's no 4942c0fba368SAlex Elder * need to fetch the image id again in this case. We 4943c0fba368SAlex Elder * do still need to set the image format though. 49442c0d0a10SAlex Elder */ 4945c0fba368SAlex Elder if (rbd_dev->spec->image_id) { 4946c0fba368SAlex Elder rbd_dev->image_format = *rbd_dev->spec->image_id ? 2 : 1; 4947c0fba368SAlex Elder 49482c0d0a10SAlex Elder return 0; 4949c0fba368SAlex Elder } 49502c0d0a10SAlex Elder 49512c0d0a10SAlex Elder /* 4952589d30e0SAlex Elder * First, see if the format 2 image id file exists, and if 4953589d30e0SAlex Elder * so, get the image's persistent id from it. 
4954589d30e0SAlex Elder */ 495569e7a02fSAlex Elder size = sizeof (RBD_ID_PREFIX) + strlen(rbd_dev->spec->image_name); 4956589d30e0SAlex Elder object_name = kmalloc(size, GFP_NOIO); 4957589d30e0SAlex Elder if (!object_name) 4958589d30e0SAlex Elder return -ENOMEM; 49590d7dbfceSAlex Elder sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->spec->image_name); 4960589d30e0SAlex Elder dout("rbd id object name is %s\n", object_name); 4961589d30e0SAlex Elder 4962589d30e0SAlex Elder /* Response will be an encoded string, which includes a length */ 4963589d30e0SAlex Elder 4964589d30e0SAlex Elder size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX; 4965589d30e0SAlex Elder response = kzalloc(size, GFP_NOIO); 4966589d30e0SAlex Elder if (!response) { 4967589d30e0SAlex Elder ret = -ENOMEM; 4968589d30e0SAlex Elder goto out; 4969589d30e0SAlex Elder } 4970589d30e0SAlex Elder 4971c0fba368SAlex Elder /* If it doesn't exist we'll assume it's a format 1 image */ 4972c0fba368SAlex Elder 497336be9a76SAlex Elder ret = rbd_obj_method_sync(rbd_dev, object_name, 49744157976bSAlex Elder "rbd", "get_id", NULL, 0, 4975e2a58ee5SAlex Elder response, RBD_IMAGE_ID_LEN_MAX); 497636be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 4977c0fba368SAlex Elder if (ret == -ENOENT) { 4978c0fba368SAlex Elder image_id = kstrdup("", GFP_KERNEL); 4979c0fba368SAlex Elder ret = image_id ? 
0 : -ENOMEM; 4980c0fba368SAlex Elder if (!ret) 4981c0fba368SAlex Elder rbd_dev->image_format = 1; 49827dd440c9SIlya Dryomov } else if (ret >= 0) { 4983c0fba368SAlex Elder void *p = response; 4984589d30e0SAlex Elder 4985c0fba368SAlex Elder image_id = ceph_extract_encoded_string(&p, p + ret, 4986979ed480SAlex Elder NULL, GFP_NOIO); 4987461f758aSDuan Jiong ret = PTR_ERR_OR_ZERO(image_id); 4988c0fba368SAlex Elder if (!ret) 4989c0fba368SAlex Elder rbd_dev->image_format = 2; 4990c0fba368SAlex Elder } 4991c0fba368SAlex Elder 4992c0fba368SAlex Elder if (!ret) { 4993c0fba368SAlex Elder rbd_dev->spec->image_id = image_id; 4994c0fba368SAlex Elder dout("image_id is %s\n", image_id); 4995589d30e0SAlex Elder } 4996589d30e0SAlex Elder out: 4997589d30e0SAlex Elder kfree(response); 4998589d30e0SAlex Elder kfree(object_name); 4999589d30e0SAlex Elder 5000589d30e0SAlex Elder return ret; 5001589d30e0SAlex Elder } 5002589d30e0SAlex Elder 50033abef3b3SAlex Elder /* 50043abef3b3SAlex Elder * Undo whatever state changes are made by v1 or v2 header info 50053abef3b3SAlex Elder * call. 
50063abef3b3SAlex Elder */ 50076fd48b3bSAlex Elder static void rbd_dev_unprobe(struct rbd_device *rbd_dev) 50086fd48b3bSAlex Elder { 50096fd48b3bSAlex Elder struct rbd_image_header *header; 50106fd48b3bSAlex Elder 5011a2acd00eSAlex Elder rbd_dev_parent_put(rbd_dev); 50126fd48b3bSAlex Elder 50136fd48b3bSAlex Elder /* Free dynamic fields from the header, then zero it out */ 50146fd48b3bSAlex Elder 50156fd48b3bSAlex Elder header = &rbd_dev->header; 5016812164f8SAlex Elder ceph_put_snap_context(header->snapc); 50176fd48b3bSAlex Elder kfree(header->snap_sizes); 50186fd48b3bSAlex Elder kfree(header->snap_names); 50196fd48b3bSAlex Elder kfree(header->object_prefix); 50206fd48b3bSAlex Elder memset(header, 0, sizeof (*header)); 50216fd48b3bSAlex Elder } 50226fd48b3bSAlex Elder 50232df3fac7SAlex Elder static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev) 5024a30b71b9SAlex Elder { 5025a30b71b9SAlex Elder int ret; 5026a30b71b9SAlex Elder 50271e130199SAlex Elder ret = rbd_dev_v2_object_prefix(rbd_dev); 502857385b51SAlex Elder if (ret) 50291e130199SAlex Elder goto out_err; 5030b1b5402aSAlex Elder 50312df3fac7SAlex Elder /* 50322df3fac7SAlex Elder * Get the and check features for the image. Currently the 50332df3fac7SAlex Elder * features are assumed to never change. 
50342df3fac7SAlex Elder */ 5035b1b5402aSAlex Elder ret = rbd_dev_v2_features(rbd_dev); 503657385b51SAlex Elder if (ret) 5037b1b5402aSAlex Elder goto out_err; 503835d489f9SAlex Elder 5039cc070d59SAlex Elder /* If the image supports fancy striping, get its parameters */ 5040cc070d59SAlex Elder 5041cc070d59SAlex Elder if (rbd_dev->header.features & RBD_FEATURE_STRIPINGV2) { 5042cc070d59SAlex Elder ret = rbd_dev_v2_striping_info(rbd_dev); 5043cc070d59SAlex Elder if (ret < 0) 5044cc070d59SAlex Elder goto out_err; 5045cc070d59SAlex Elder } 50462df3fac7SAlex Elder /* No support for crypto and compression type format 2 images */ 5047a30b71b9SAlex Elder 504835152979SAlex Elder return 0; 50499d475de5SAlex Elder out_err: 5050642a2537SAlex Elder rbd_dev->header.features = 0; 50511e130199SAlex Elder kfree(rbd_dev->header.object_prefix); 50521e130199SAlex Elder rbd_dev->header.object_prefix = NULL; 50539d475de5SAlex Elder 50549d475de5SAlex Elder return ret; 5055a30b71b9SAlex Elder } 5056a30b71b9SAlex Elder 50576d69bb53SIlya Dryomov /* 50586d69bb53SIlya Dryomov * @depth is rbd_dev_image_probe() -> rbd_dev_probe_parent() -> 50596d69bb53SIlya Dryomov * rbd_dev_image_probe() recursion depth, which means it's also the 50606d69bb53SIlya Dryomov * length of the already discovered part of the parent chain. 
50616d69bb53SIlya Dryomov */ 50626d69bb53SIlya Dryomov static int rbd_dev_probe_parent(struct rbd_device *rbd_dev, int depth) 506383a06263SAlex Elder { 50642f82ee54SAlex Elder struct rbd_device *parent = NULL; 5065124afba2SAlex Elder int ret; 5066124afba2SAlex Elder 5067124afba2SAlex Elder if (!rbd_dev->parent_spec) 5068124afba2SAlex Elder return 0; 5069124afba2SAlex Elder 50706d69bb53SIlya Dryomov if (++depth > RBD_MAX_PARENT_CHAIN_LEN) { 50716d69bb53SIlya Dryomov pr_info("parent chain is too long (%d)\n", depth); 50726d69bb53SIlya Dryomov ret = -EINVAL; 50736d69bb53SIlya Dryomov goto out_err; 50746d69bb53SIlya Dryomov } 50756d69bb53SIlya Dryomov 50761643dfa4SIlya Dryomov parent = __rbd_dev_create(rbd_dev->rbd_client, rbd_dev->parent_spec); 50771f2c6651SIlya Dryomov if (!parent) { 5078124afba2SAlex Elder ret = -ENOMEM; 5079124afba2SAlex Elder goto out_err; 50801f2c6651SIlya Dryomov } 50811f2c6651SIlya Dryomov 50821f2c6651SIlya Dryomov /* 50831f2c6651SIlya Dryomov * Images related by parent/child relationships always share 50841f2c6651SIlya Dryomov * rbd_client and spec/parent_spec, so bump their refcounts. 
50851f2c6651SIlya Dryomov */ 50861f2c6651SIlya Dryomov __rbd_get_client(rbd_dev->rbd_client); 50871f2c6651SIlya Dryomov rbd_spec_get(rbd_dev->parent_spec); 5088124afba2SAlex Elder 50896d69bb53SIlya Dryomov ret = rbd_dev_image_probe(parent, depth); 5090124afba2SAlex Elder if (ret < 0) 5091124afba2SAlex Elder goto out_err; 50921f2c6651SIlya Dryomov 5093124afba2SAlex Elder rbd_dev->parent = parent; 5094a2acd00eSAlex Elder atomic_set(&rbd_dev->parent_ref, 1); 5095124afba2SAlex Elder return 0; 5096124afba2SAlex Elder 50971f2c6651SIlya Dryomov out_err: 50981f2c6651SIlya Dryomov rbd_dev_unparent(rbd_dev); 50991f2c6651SIlya Dryomov rbd_dev_destroy(parent); 5100124afba2SAlex Elder return ret; 5101124afba2SAlex Elder } 5102124afba2SAlex Elder 5103811c6688SIlya Dryomov /* 5104811c6688SIlya Dryomov * rbd_dev->header_rwsem must be locked for write and will be unlocked 5105811c6688SIlya Dryomov * upon return. 5106811c6688SIlya Dryomov */ 5107200a6a8bSAlex Elder static int rbd_dev_device_setup(struct rbd_device *rbd_dev) 5108124afba2SAlex Elder { 510983a06263SAlex Elder int ret; 511083a06263SAlex Elder 51119b60e70bSIlya Dryomov /* Record our major and minor device numbers. */ 511283a06263SAlex Elder 51139b60e70bSIlya Dryomov if (!single_major) { 511483a06263SAlex Elder ret = register_blkdev(0, rbd_dev->name); 511583a06263SAlex Elder if (ret < 0) 51161643dfa4SIlya Dryomov goto err_out_unlock; 51179b60e70bSIlya Dryomov 511883a06263SAlex Elder rbd_dev->major = ret; 5119dd82fff1SIlya Dryomov rbd_dev->minor = 0; 51209b60e70bSIlya Dryomov } else { 51219b60e70bSIlya Dryomov rbd_dev->major = rbd_major; 51229b60e70bSIlya Dryomov rbd_dev->minor = rbd_dev_id_to_minor(rbd_dev->dev_id); 51239b60e70bSIlya Dryomov } 512483a06263SAlex Elder 512583a06263SAlex Elder /* Set up the blkdev mapping. 
*/ 512683a06263SAlex Elder 512783a06263SAlex Elder ret = rbd_init_disk(rbd_dev); 512883a06263SAlex Elder if (ret) 512983a06263SAlex Elder goto err_out_blkdev; 513083a06263SAlex Elder 5131f35a4deeSAlex Elder ret = rbd_dev_mapping_set(rbd_dev); 513283a06263SAlex Elder if (ret) 513383a06263SAlex Elder goto err_out_disk; 5134bc1ecc65SIlya Dryomov 5135f35a4deeSAlex Elder set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE); 513622001f61SJosh Durgin set_disk_ro(rbd_dev->disk, rbd_dev->mapping.read_only); 5137f35a4deeSAlex Elder 5138dd5ac32dSIlya Dryomov dev_set_name(&rbd_dev->dev, "%d", rbd_dev->dev_id); 5139dd5ac32dSIlya Dryomov ret = device_add(&rbd_dev->dev); 5140f35a4deeSAlex Elder if (ret) 5141f5ee37bdSIlya Dryomov goto err_out_mapping; 514283a06263SAlex Elder 514383a06263SAlex Elder /* Everything's ready. Announce the disk to the world. */ 514483a06263SAlex Elder 5145129b79d4SAlex Elder set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags); 5146811c6688SIlya Dryomov up_write(&rbd_dev->header_rwsem); 514783a06263SAlex Elder 51481643dfa4SIlya Dryomov spin_lock(&rbd_dev_list_lock); 51491643dfa4SIlya Dryomov list_add_tail(&rbd_dev->node, &rbd_dev_list); 51501643dfa4SIlya Dryomov spin_unlock(&rbd_dev_list_lock); 51511643dfa4SIlya Dryomov 5152811c6688SIlya Dryomov add_disk(rbd_dev->disk); 515383a06263SAlex Elder pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name, 515483a06263SAlex Elder (unsigned long long) rbd_dev->mapping.size); 515583a06263SAlex Elder 515683a06263SAlex Elder return ret; 51572f82ee54SAlex Elder 5158f35a4deeSAlex Elder err_out_mapping: 5159f35a4deeSAlex Elder rbd_dev_mapping_clear(rbd_dev); 516083a06263SAlex Elder err_out_disk: 516183a06263SAlex Elder rbd_free_disk(rbd_dev); 516283a06263SAlex Elder err_out_blkdev: 51639b60e70bSIlya Dryomov if (!single_major) 516483a06263SAlex Elder unregister_blkdev(rbd_dev->major, rbd_dev->name); 5165811c6688SIlya Dryomov err_out_unlock: 5166811c6688SIlya Dryomov up_write(&rbd_dev->header_rwsem); 
516783a06263SAlex Elder return ret; 516883a06263SAlex Elder } 516983a06263SAlex Elder 5170332bb12dSAlex Elder static int rbd_dev_header_name(struct rbd_device *rbd_dev) 5171332bb12dSAlex Elder { 5172332bb12dSAlex Elder struct rbd_spec *spec = rbd_dev->spec; 5173c41d13a3SIlya Dryomov int ret; 5174332bb12dSAlex Elder 5175332bb12dSAlex Elder /* Record the header object name for this rbd image. */ 5176332bb12dSAlex Elder 5177332bb12dSAlex Elder rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); 5178332bb12dSAlex Elder 51797627151eSYan, Zheng rbd_dev->header_oloc.pool = rbd_dev->layout.pool_id; 5180332bb12dSAlex Elder if (rbd_dev->image_format == 1) 5181c41d13a3SIlya Dryomov ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s", 5182332bb12dSAlex Elder spec->image_name, RBD_SUFFIX); 5183332bb12dSAlex Elder else 5184c41d13a3SIlya Dryomov ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s", 5185332bb12dSAlex Elder RBD_HEADER_PREFIX, spec->image_id); 5186c41d13a3SIlya Dryomov 5187c41d13a3SIlya Dryomov return ret; 5188332bb12dSAlex Elder } 5189332bb12dSAlex Elder 5190200a6a8bSAlex Elder static void rbd_dev_image_release(struct rbd_device *rbd_dev) 5191200a6a8bSAlex Elder { 51926fd48b3bSAlex Elder rbd_dev_unprobe(rbd_dev); 51936fd48b3bSAlex Elder rbd_dev->image_format = 0; 51946fd48b3bSAlex Elder kfree(rbd_dev->spec->image_id); 51956fd48b3bSAlex Elder rbd_dev->spec->image_id = NULL; 51966fd48b3bSAlex Elder 5197200a6a8bSAlex Elder rbd_dev_destroy(rbd_dev); 5198200a6a8bSAlex Elder } 5199200a6a8bSAlex Elder 5200a30b71b9SAlex Elder /* 5201a30b71b9SAlex Elder * Probe for the existence of the header object for the given rbd 52021f3ef788SAlex Elder * device. If this image is the one being mapped (i.e., not a 52031f3ef788SAlex Elder * parent), initiate a watch on its header object before using that 52041f3ef788SAlex Elder * object to get detailed information about the rbd image. 
5205a30b71b9SAlex Elder */ 52066d69bb53SIlya Dryomov static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth) 5207a30b71b9SAlex Elder { 5208a30b71b9SAlex Elder int ret; 5209a30b71b9SAlex Elder 5210a30b71b9SAlex Elder /* 52113abef3b3SAlex Elder * Get the id from the image id object. Unless there's an 52123abef3b3SAlex Elder * error, rbd_dev->spec->image_id will be filled in with 52133abef3b3SAlex Elder * a dynamically-allocated string, and rbd_dev->image_format 52143abef3b3SAlex Elder * will be set to either 1 or 2. 5215a30b71b9SAlex Elder */ 5216a30b71b9SAlex Elder ret = rbd_dev_image_id(rbd_dev); 5217a30b71b9SAlex Elder if (ret) 5218c0fba368SAlex Elder return ret; 5219c0fba368SAlex Elder 5220332bb12dSAlex Elder ret = rbd_dev_header_name(rbd_dev); 5221332bb12dSAlex Elder if (ret) 5222332bb12dSAlex Elder goto err_out_format; 5223332bb12dSAlex Elder 52246d69bb53SIlya Dryomov if (!depth) { 5225fca27065SIlya Dryomov ret = rbd_dev_header_watch_sync(rbd_dev); 52261fe48023SIlya Dryomov if (ret) { 52271fe48023SIlya Dryomov if (ret == -ENOENT) 52281fe48023SIlya Dryomov pr_info("image %s/%s does not exist\n", 52291fe48023SIlya Dryomov rbd_dev->spec->pool_name, 52301fe48023SIlya Dryomov rbd_dev->spec->image_name); 5231c41d13a3SIlya Dryomov goto err_out_format; 52321f3ef788SAlex Elder } 52331fe48023SIlya Dryomov } 5234b644de2bSAlex Elder 5235a720ae09SIlya Dryomov ret = rbd_dev_header_info(rbd_dev); 52365655c4d9SAlex Elder if (ret) 5237b644de2bSAlex Elder goto err_out_watch; 5238a30b71b9SAlex Elder 523904077599SIlya Dryomov /* 524004077599SIlya Dryomov * If this image is the one being mapped, we have pool name and 524104077599SIlya Dryomov * id, image name and id, and snap name - need to fill snap id. 524204077599SIlya Dryomov * Otherwise this is a parent image, identified by pool, image 524304077599SIlya Dryomov * and snap ids - need to fill in names for those ids. 
524404077599SIlya Dryomov */ 52456d69bb53SIlya Dryomov if (!depth) 524604077599SIlya Dryomov ret = rbd_spec_fill_snap_id(rbd_dev); 524704077599SIlya Dryomov else 524804077599SIlya Dryomov ret = rbd_spec_fill_names(rbd_dev); 52491fe48023SIlya Dryomov if (ret) { 52501fe48023SIlya Dryomov if (ret == -ENOENT) 52511fe48023SIlya Dryomov pr_info("snap %s/%s@%s does not exist\n", 52521fe48023SIlya Dryomov rbd_dev->spec->pool_name, 52531fe48023SIlya Dryomov rbd_dev->spec->image_name, 52541fe48023SIlya Dryomov rbd_dev->spec->snap_name); 525533dca39fSAlex Elder goto err_out_probe; 52561fe48023SIlya Dryomov } 52579bb81c9bSAlex Elder 5258e8f59b59SIlya Dryomov if (rbd_dev->header.features & RBD_FEATURE_LAYERING) { 5259e8f59b59SIlya Dryomov ret = rbd_dev_v2_parent_info(rbd_dev); 5260e8f59b59SIlya Dryomov if (ret) 5261e8f59b59SIlya Dryomov goto err_out_probe; 5262e8f59b59SIlya Dryomov 5263e8f59b59SIlya Dryomov /* 5264e8f59b59SIlya Dryomov * Need to warn users if this image is the one being 5265e8f59b59SIlya Dryomov * mapped and has a parent. 
5266e8f59b59SIlya Dryomov */ 52676d69bb53SIlya Dryomov if (!depth && rbd_dev->parent_spec) 5268e8f59b59SIlya Dryomov rbd_warn(rbd_dev, 5269e8f59b59SIlya Dryomov "WARNING: kernel layering is EXPERIMENTAL!"); 5270e8f59b59SIlya Dryomov } 5271e8f59b59SIlya Dryomov 52726d69bb53SIlya Dryomov ret = rbd_dev_probe_parent(rbd_dev, depth); 527330d60ba2SAlex Elder if (ret) 527430d60ba2SAlex Elder goto err_out_probe; 527583a06263SAlex Elder 527630d60ba2SAlex Elder dout("discovered format %u image, header name is %s\n", 5277c41d13a3SIlya Dryomov rbd_dev->image_format, rbd_dev->header_oid.name); 527830d60ba2SAlex Elder return 0; 5279e8f59b59SIlya Dryomov 52806fd48b3bSAlex Elder err_out_probe: 52816fd48b3bSAlex Elder rbd_dev_unprobe(rbd_dev); 5282b644de2bSAlex Elder err_out_watch: 52836d69bb53SIlya Dryomov if (!depth) 5284fca27065SIlya Dryomov rbd_dev_header_unwatch_sync(rbd_dev); 5285332bb12dSAlex Elder err_out_format: 5286332bb12dSAlex Elder rbd_dev->image_format = 0; 52875655c4d9SAlex Elder kfree(rbd_dev->spec->image_id); 52885655c4d9SAlex Elder rbd_dev->spec->image_id = NULL; 52895655c4d9SAlex Elder return ret; 529083a06263SAlex Elder } 529183a06263SAlex Elder

/*
 * Map a new rbd image described by the text in @buf.
 *
 * The steps are: parse the arguments, get a client for the parsed ceph
 * options, resolve the pool name to an id, create the rbd_device, probe
 * the image (depth 0) and finally set up the device.  A reference on
 * this module is held for the duration of the call.
 *
 * @bus is not used.  Presumably this is the store handler behind the
 * bus "add" attributes; the attribute definitions are outside this view.
 *
 * Returns @count on success or a negative errno on failure.
 */
static ssize_t do_rbd_add(struct bus_type *bus,
			  const char *buf,
			  size_t count)
{
	struct rbd_device *rbd_dev = NULL;
	struct ceph_options *ceph_opts = NULL;
	struct rbd_options *rbd_opts = NULL;
	struct rbd_spec *spec = NULL;
	struct rbd_client *rbdc;
	bool read_only;
	int rc;

	if (!try_module_get(THIS_MODULE))
		return -ENODEV;

	/* parse add command */
	rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec);
	if (rc < 0)
		goto out;

	/*
	 * NOTE(review): ceph_opts is never released in this function,
	 * so rbd_get_client() presumably takes ownership of it on both
	 * success and failure -- confirm against its definition.
	 */
	rbdc = rbd_get_client(ceph_opts);
	if (IS_ERR(rbdc)) {
		rc = PTR_ERR(rbdc);
		goto err_out_args;
	}

	/* pick the pool */
	rc = rbd_add_get_pool_id(rbdc, spec->pool_name);
	if (rc < 0) {
		if (rc == -ENOENT)
			pr_info("pool %s does not exist\n", spec->pool_name);
		goto err_out_client;
	}
	spec->pool_id = (u64)rc;

	rbd_dev = rbd_dev_create(rbdc, spec, rbd_opts);
	if (!rbd_dev) {
		rc = -ENOMEM;
		goto err_out_client;
	}
	/*
	 * Clear the local pointers so that the error labels below, which
	 * put/free them unconditionally, do not release them a second time.
	 */
	rbdc = NULL;		/* rbd_dev now owns this */
	spec = NULL;		/* rbd_dev now owns this */
	rbd_opts = NULL;	/* rbd_dev now owns this */

	down_write(&rbd_dev->header_rwsem);
	rc = rbd_dev_image_probe(rbd_dev, 0);
	if (rc < 0)
		goto err_out_rbd_dev;

	/* If we are mapping a snapshot it must be marked read-only */

	read_only = rbd_dev->opts->read_only;
	if (rbd_dev->spec->snap_id != CEPH_NOSNAP)
		read_only = true;
	rbd_dev->mapping.read_only = read_only;

	/*
	 * NOTE(review): header_rwsem is still held for write here and is
	 * not released anywhere below on this path; presumably
	 * rbd_dev_device_setup() drops it on both success and failure --
	 * confirm against its definition.
	 */
	rc = rbd_dev_device_setup(rbd_dev);
	if (rc) {
		/*
		 * rbd_dev_header_unwatch_sync() can't be moved into
		 * rbd_dev_image_release() without refactoring, see
		 * commit 1f3ef78861ac.
		 */
		rbd_dev_header_unwatch_sync(rbd_dev);
		rbd_dev_image_release(rbd_dev);
		goto out;
	}

	rc = count;
out:
	module_put(THIS_MODULE);
	return rc;

err_out_rbd_dev:
	up_write(&rbd_dev->header_rwsem);
	rbd_dev_destroy(rbd_dev);
err_out_client:
	rbd_put_client(rbdc);
err_out_args:
	rbd_spec_put(spec);
	kfree(rbd_opts);
	goto out;
}

/*
 * "add" entry point that is refused with -EINVAL when the single_major
 * option is set; otherwise it forwards to do_rbd_add().
 */
static ssize_t rbd_add(struct bus_type *bus,
		       const char *buf,
		       size_t count)
{
	if (single_major)
		return -EINVAL;

	return do_rbd_add(bus, buf, count);
}

/*
 * "add" entry point that forwards to do_rbd_add() unconditionally.
 */
static ssize_t rbd_add_single_major(struct bus_type *bus,
				    const char *buf,
				    size_t count)
{
	return do_rbd_add(bus, buf, count);
}

/*
 * Tear down the device-level state of @rbd_dev: free the disk, unlink
 * the device from rbd_dev_list (under rbd_dev_list_lock), clear its
 * EXISTS flag, delete the struct device and clear the mapping.  When
 * single_major is not set, the device's own block major is
 * unregistered as well.
 */
static void rbd_dev_device_release(struct rbd_device *rbd_dev)
{
	rbd_free_disk(rbd_dev);

	spin_lock(&rbd_dev_list_lock);
	list_del_init(&rbd_dev->node);
	spin_unlock(&rbd_dev_list_lock);

	clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
	device_del(&rbd_dev->dev);
	rbd_dev_mapping_clear(rbd_dev);
	if (!single_major)
		unregister_blkdev(rbd_dev->major, rbd_dev->name);
}

/*
 * Release every ancestor of @rbd_dev, deepest first.
 *
 * Each pass of the outer loop walks the ->parent chain to the last
 * device (the one without a parent of its own), releases it with
 * rbd_dev_image_release(), and then clears the parent pointer, the
 * parent overlap and the parent_spec reference held by its child.
 * The loop ends once @rbd_dev itself has no parent left.
 */
static void rbd_dev_remove_parent(struct rbd_device *rbd_dev)
{
	while (rbd_dev->parent) {
		struct rbd_device *first = rbd_dev;
		struct rbd_device *second = first->parent;
		struct rbd_device *third;

		/*
		 * Follow to the parent with no grandparent and
		 * remove it.
		 */
		while (second && (third = second->parent)) {
			first = second;
			second = third;
		}
		rbd_assert(second);
		rbd_dev_image_release(second);
		first->parent = NULL;
		first->parent_overlap = 0;

		rbd_assert(first->parent_spec);
		rbd_spec_put(first->parent_spec);
		first->parent_spec = NULL;
	}
}

/*
 * Unmap the rbd device whose id is given, in decimal, in @buf.
 *
 * @bus is not used.
 *
 * Returns @count on success, or a negative errno:
 *   the kstrtoul() error  if @buf does not parse as an unsigned decimal,
 *   -EINVAL               if the value does not fit in a non-negative int,
 *   -ENOENT               if no device on rbd_dev_list has that id,
 *   -EBUSY                if the device's open_count is non-zero,
 *   -EINPROGRESS          if RBD_DEV_FLAG_REMOVING was already set, i.e.
 *                         another removal of this device has started.
 */
static ssize_t do_rbd_remove(struct bus_type *bus,
			     const char *buf,
			     size_t count)
{
	struct rbd_device *rbd_dev = NULL;
	struct list_head *tmp;
	int dev_id;
	unsigned long ul;
	int ret;

	ret = kstrtoul(buf, 10, &ul);
	if (ret)
		return ret;

	/*
	 * Range-check before narrowing.  Casting first and comparing the
	 * int back against the unsigned long does not work: the int is
	 * converted back to unsigned long for the comparison, so values
	 * that wrap to a negative int can still compare equal.
	 */
	if (ul > INT_MAX)
		return -EINVAL;
	dev_id = (int)ul;

	ret = -ENOENT;
	spin_lock(&rbd_dev_list_lock);
	list_for_each(tmp, &rbd_dev_list) {
		rbd_dev = list_entry(tmp, struct rbd_device, node);
		if (rbd_dev->dev_id == dev_id) {
			ret = 0;
			break;
		}
	}
	/* rbd_dev is only meaningful below when ret == 0 (a match was found) */
	if (!ret) {
		spin_lock_irq(&rbd_dev->lock);
		if (rbd_dev->open_count)
			ret = -EBUSY;
		else if (test_and_set_bit(RBD_DEV_FLAG_REMOVING,
					  &rbd_dev->flags))
			/*
			 * Someone else is already removing this device.
			 * Report an error rather than 0: a store routine
			 * returning 0 tells the writer that no bytes were
			 * consumed, which invites it to retry the write.
			 */
			ret = -EINPROGRESS;
		spin_unlock_irq(&rbd_dev->lock);
	}
	spin_unlock(&rbd_dev_list_lock);
	if (ret)
		return ret;

	rbd_dev_header_unwatch_sync(rbd_dev);

	/*
	 * Don't free anything from rbd_dev->disk until after all
	 * notifies are completely processed. Otherwise
	 * rbd_bus_del_dev() will race with rbd_watch_cb(), resulting
	 * in a potential use after free of rbd_dev->disk or rbd_dev.
	 */
	rbd_dev_device_release(rbd_dev);
	rbd_dev_image_release(rbd_dev);

	return count;
}

/*
 * "remove" entry point that is refused with -EINVAL when the
 * single_major option is set; otherwise it forwards to do_rbd_remove().
 */
static ssize_t rbd_remove(struct bus_type *bus,
			  const char *buf,
			  size_t count)
{
	if (single_major)
		return -EINVAL;

	return do_rbd_remove(bus, buf, count);
}

/*
 * "remove" entry point that forwards to do_rbd_remove() unconditionally.
 */
static ssize_t rbd_remove_single_major(struct bus_type *bus,
				       const char *buf,
				       size_t count)
{
	return do_rbd_remove(bus, buf, count);
}

/*
 * create control files in sysfs
 * /sys/bus/rbd/...
 *
 * Registers rbd_root_dev and then rbd_bus_type.  If the bus cannot be
 * registered the root device is unregistered again.  Returns 0 on
 * success or the negative error from the failing registration.
 */
static int rbd_sysfs_init(void)
{
	int ret;

	ret = device_register(&rbd_root_dev);
	if (ret < 0)
		return ret;

	ret = bus_register(&rbd_bus_type);
	if (ret < 0)
		device_unregister(&rbd_root_dev);

	return ret;
}

/*
 * Undo rbd_sysfs_init(): unregister the bus, then the root device.
 */
static void rbd_sysfs_cleanup(void)
{
	bus_unregister(&rbd_bus_type);
	device_unregister(&rbd_root_dev);
}

/*
 * Create the three slab caches used by the driver: image requests,
 * object requests and segment names.  On any failure the caches that
 * may already exist are destroyed and their pointers reset to NULL.
 *
 * Returns 0 on success or -ENOMEM.
 */
static int rbd_slab_init(void)
{
	rbd_assert(!rbd_img_request_cache);
	rbd_img_request_cache = KMEM_CACHE(rbd_img_request, 0);
	if (!rbd_img_request_cache)
		return -ENOMEM;

	rbd_assert(!rbd_obj_request_cache);
	rbd_obj_request_cache = KMEM_CACHE(rbd_obj_request, 0);
	if (!rbd_obj_request_cache)
		goto out_err;

	rbd_assert(!rbd_segment_name_cache);
	rbd_segment_name_cache = kmem_cache_create("rbd_segment_name",
					CEPH_MAX_OID_NAME_LEN + 1, 1, 0, NULL);
	if (rbd_segment_name_cache)
		return 0;
out_err:
	/* rbd_obj_request_cache may still be NULL when we get here */
	kmem_cache_destroy(rbd_obj_request_cache);
	rbd_obj_request_cache = NULL;

	kmem_cache_destroy(rbd_img_request_cache);
	rbd_img_request_cache = NULL;

	return -ENOMEM;
}

/*
 * Destroy the slab caches in the reverse order of their creation in
 * rbd_slab_init() and reset the cache pointers to NULL.  All three
 * caches are asserted to exist.
 */
static void rbd_slab_exit(void)
{
	rbd_assert(rbd_segment_name_cache);
	kmem_cache_destroy(rbd_segment_name_cache);
	rbd_segment_name_cache = NULL;

	rbd_assert(rbd_obj_request_cache);
	kmem_cache_destroy(rbd_obj_request_cache);
	rbd_obj_request_cache = NULL;

	rbd_assert(rbd_img_request_cache);
	kmem_cache_destroy(rbd_img_request_cache);
	rbd_img_request_cache = NULL;
}

/*
 * Module entry point.  After the libceph compatibility check it sets
 * up, in order: the slab caches, the rbd workqueue, the shared block
 * major (only when single_major is set) and the sysfs entries.  A
 * failure at any step unwinds the earlier steps in reverse order.
 *
 * Returns 0 on success or a negative errno.
 */
static int __init rbd_init(void)
{
	int rc;

	if (!libceph_compatible(NULL)) {
		rbd_warn(NULL, "libceph incompatibility (quitting)");
		return -EINVAL;
	}

	rc = rbd_slab_init();
	if (rc)
		return rc;

	/*
	 * The number of active work items is limited by the number of
	 * rbd devices * queue depth, so leave @max_active at default.
	 */
	rbd_wq = alloc_workqueue(RBD_DRV_NAME, WQ_MEM_RECLAIM, 0);
	if (!rbd_wq) {
		rc = -ENOMEM;
		goto err_out_slab;
	}

	if (single_major) {
		rbd_major = register_blkdev(0, RBD_DRV_NAME);
		if (rbd_major < 0) {
			rc = rbd_major;
			goto err_out_wq;
		}
	}

	rc = rbd_sysfs_init();
	if (rc)
		goto err_out_blkdev;

	if (single_major)
		pr_info("loaded (major %d)\n", rbd_major);
	else
		pr_info("loaded\n");

	return 0;

err_out_blkdev:
	if (single_major)
		unregister_blkdev(rbd_major, RBD_DRV_NAME);
err_out_wq:
	destroy_workqueue(rbd_wq);
err_out_slab:
	rbd_slab_exit();
	return rc;
}

/*
 * Module exit point: destroy the device id allocator, then release
 * what rbd_init() set up, in reverse order (sysfs entries, the shared
 * block major when single_major is set, the workqueue, the slab caches).
 */
static void __exit rbd_exit(void)
{
	ida_destroy(&rbd_dev_id_ida);
	rbd_sysfs_cleanup();
	if (single_major)
		unregister_blkdev(rbd_major, RBD_DRV_NAME);
	destroy_workqueue(rbd_wq);
	rbd_slab_exit();
}

module_init(rbd_init);
module_exit(rbd_exit);

MODULE_AUTHOR("Alex Elder <elder@inktank.com>");
MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
/* following authorship retained from original osdblk.c */
MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");

MODULE_DESCRIPTION("RADOS Block Device (RBD) driver");
MODULE_LICENSE("GPL");