1e2a58ee5SAlex Elder 2602adf40SYehuda Sadeh /* 3602adf40SYehuda Sadeh rbd.c -- Export ceph rados objects as a Linux block device 4602adf40SYehuda Sadeh 5602adf40SYehuda Sadeh 6602adf40SYehuda Sadeh based on drivers/block/osdblk.c: 7602adf40SYehuda Sadeh 8602adf40SYehuda Sadeh Copyright 2009 Red Hat, Inc. 9602adf40SYehuda Sadeh 10602adf40SYehuda Sadeh This program is free software; you can redistribute it and/or modify 11602adf40SYehuda Sadeh it under the terms of the GNU General Public License as published by 12602adf40SYehuda Sadeh the Free Software Foundation. 13602adf40SYehuda Sadeh 14602adf40SYehuda Sadeh This program is distributed in the hope that it will be useful, 15602adf40SYehuda Sadeh but WITHOUT ANY WARRANTY; without even the implied warranty of 16602adf40SYehuda Sadeh MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17602adf40SYehuda Sadeh GNU General Public License for more details. 18602adf40SYehuda Sadeh 19602adf40SYehuda Sadeh You should have received a copy of the GNU General Public License 20602adf40SYehuda Sadeh along with this program; see the file COPYING. If not, write to 21602adf40SYehuda Sadeh the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. 
22602adf40SYehuda Sadeh 23602adf40SYehuda Sadeh 24602adf40SYehuda Sadeh 25dfc5606dSYehuda Sadeh For usage instructions, please refer to: 26602adf40SYehuda Sadeh 27dfc5606dSYehuda Sadeh Documentation/ABI/testing/sysfs-bus-rbd 28602adf40SYehuda Sadeh 29602adf40SYehuda Sadeh */ 30602adf40SYehuda Sadeh 31602adf40SYehuda Sadeh #include <linux/ceph/libceph.h> 32602adf40SYehuda Sadeh #include <linux/ceph/osd_client.h> 33602adf40SYehuda Sadeh #include <linux/ceph/mon_client.h> 34602adf40SYehuda Sadeh #include <linux/ceph/decode.h> 3559c2be1eSYehuda Sadeh #include <linux/parser.h> 3630d1cff8SAlex Elder #include <linux/bsearch.h> 37602adf40SYehuda Sadeh 38602adf40SYehuda Sadeh #include <linux/kernel.h> 39602adf40SYehuda Sadeh #include <linux/device.h> 40602adf40SYehuda Sadeh #include <linux/module.h> 417ad18afaSChristoph Hellwig #include <linux/blk-mq.h> 42602adf40SYehuda Sadeh #include <linux/fs.h> 43602adf40SYehuda Sadeh #include <linux/blkdev.h> 441c2a9dfeSAlex Elder #include <linux/slab.h> 45f8a22fc2SIlya Dryomov #include <linux/idr.h> 46bc1ecc65SIlya Dryomov #include <linux/workqueue.h> 47602adf40SYehuda Sadeh 48602adf40SYehuda Sadeh #include "rbd_types.h" 49602adf40SYehuda Sadeh 50aafb230eSAlex Elder #define RBD_DEBUG /* Activate rbd_assert() calls */ 51aafb230eSAlex Elder 52593a9e7bSAlex Elder /* 53593a9e7bSAlex Elder * The basic unit of block I/O is a sector. It is interpreted in a 54593a9e7bSAlex Elder * number of contexts in Linux (blk, bio, genhd), but the default is 55593a9e7bSAlex Elder * universally 512 bytes. These symbols are just slightly more 56593a9e7bSAlex Elder * meaningful than the bare numbers they represent. 57593a9e7bSAlex Elder */ 58593a9e7bSAlex Elder #define SECTOR_SHIFT 9 59593a9e7bSAlex Elder #define SECTOR_SIZE (1ULL << SECTOR_SHIFT) 60593a9e7bSAlex Elder 61a2acd00eSAlex Elder /* 62a2acd00eSAlex Elder * Increment the given counter and return its updated value. 63a2acd00eSAlex Elder * If the counter is already 0 it will not be incremented. 
64a2acd00eSAlex Elder * If the counter is already at its maximum value returns 65a2acd00eSAlex Elder * -EINVAL without updating it. 66a2acd00eSAlex Elder */ 67a2acd00eSAlex Elder static int atomic_inc_return_safe(atomic_t *v) 68a2acd00eSAlex Elder { 69a2acd00eSAlex Elder unsigned int counter; 70a2acd00eSAlex Elder 71a2acd00eSAlex Elder counter = (unsigned int)__atomic_add_unless(v, 1, 0); 72a2acd00eSAlex Elder if (counter <= (unsigned int)INT_MAX) 73a2acd00eSAlex Elder return (int)counter; 74a2acd00eSAlex Elder 75a2acd00eSAlex Elder atomic_dec(v); 76a2acd00eSAlex Elder 77a2acd00eSAlex Elder return -EINVAL; 78a2acd00eSAlex Elder } 79a2acd00eSAlex Elder 80a2acd00eSAlex Elder /* Decrement the counter. Return the resulting value, or -EINVAL */ 81a2acd00eSAlex Elder static int atomic_dec_return_safe(atomic_t *v) 82a2acd00eSAlex Elder { 83a2acd00eSAlex Elder int counter; 84a2acd00eSAlex Elder 85a2acd00eSAlex Elder counter = atomic_dec_return(v); 86a2acd00eSAlex Elder if (counter >= 0) 87a2acd00eSAlex Elder return counter; 88a2acd00eSAlex Elder 89a2acd00eSAlex Elder atomic_inc(v); 90a2acd00eSAlex Elder 91a2acd00eSAlex Elder return -EINVAL; 92a2acd00eSAlex Elder } 93a2acd00eSAlex Elder 94f0f8cef5SAlex Elder #define RBD_DRV_NAME "rbd" 95602adf40SYehuda Sadeh 967e513d43SIlya Dryomov #define RBD_MINORS_PER_MAJOR 256 977e513d43SIlya Dryomov #define RBD_SINGLE_MAJOR_PART_SHIFT 4 98602adf40SYehuda Sadeh 996d69bb53SIlya Dryomov #define RBD_MAX_PARENT_CHAIN_LEN 16 1006d69bb53SIlya Dryomov 101d4b125e9SAlex Elder #define RBD_SNAP_DEV_NAME_PREFIX "snap_" 102d4b125e9SAlex Elder #define RBD_MAX_SNAP_NAME_LEN \ 103d4b125e9SAlex Elder (NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1)) 104d4b125e9SAlex Elder 10535d489f9SAlex Elder #define RBD_MAX_SNAP_COUNT 510 /* allows max snapc to fit in 4KB */ 106602adf40SYehuda Sadeh 107602adf40SYehuda Sadeh #define RBD_SNAP_HEAD_NAME "-" 108602adf40SYehuda Sadeh 1099682fc6dSAlex Elder #define BAD_SNAP_INDEX U32_MAX /* invalid index into snap array 
*/ 1109682fc6dSAlex Elder 1119e15b77dSAlex Elder /* This allows a single page to hold an image name sent by OSD */ 1129e15b77dSAlex Elder #define RBD_IMAGE_NAME_LEN_MAX (PAGE_SIZE - sizeof (__le32) - 1) 113589d30e0SAlex Elder #define RBD_IMAGE_ID_LEN_MAX 64 1149e15b77dSAlex Elder 1151e130199SAlex Elder #define RBD_OBJ_PREFIX_LEN_MAX 64 116589d30e0SAlex Elder 117d889140cSAlex Elder /* Feature bits */ 118d889140cSAlex Elder 1195cbf6f12SAlex Elder #define RBD_FEATURE_LAYERING (1<<0) 1205cbf6f12SAlex Elder #define RBD_FEATURE_STRIPINGV2 (1<<1) 1215cbf6f12SAlex Elder #define RBD_FEATURES_ALL \ 1225cbf6f12SAlex Elder (RBD_FEATURE_LAYERING | RBD_FEATURE_STRIPINGV2) 123d889140cSAlex Elder 124d889140cSAlex Elder /* Features supported by this (client software) implementation. */ 125d889140cSAlex Elder 126770eba6eSAlex Elder #define RBD_FEATURES_SUPPORTED (RBD_FEATURES_ALL) 127d889140cSAlex Elder 12881a89793SAlex Elder /* 12981a89793SAlex Elder * An RBD device name will be "rbd#", where the "rbd" comes from 13081a89793SAlex Elder * RBD_DRV_NAME above, and # is a unique integer identifier. 13181a89793SAlex Elder * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big 13281a89793SAlex Elder * enough to hold all possible device names. 
13381a89793SAlex Elder */ 134602adf40SYehuda Sadeh #define DEV_NAME_LEN 32 13581a89793SAlex Elder #define MAX_INT_FORMAT_WIDTH ((5 * sizeof (int)) / 2 + 1) 136602adf40SYehuda Sadeh 137602adf40SYehuda Sadeh /* 138602adf40SYehuda Sadeh * block device image metadata (in-memory version) 139602adf40SYehuda Sadeh */ 140602adf40SYehuda Sadeh struct rbd_image_header { 141f35a4deeSAlex Elder /* These six fields never change for a given rbd image */ 142849b4260SAlex Elder char *object_prefix; 143602adf40SYehuda Sadeh __u8 obj_order; 144602adf40SYehuda Sadeh __u8 crypt_type; 145602adf40SYehuda Sadeh __u8 comp_type; 146f35a4deeSAlex Elder u64 stripe_unit; 147f35a4deeSAlex Elder u64 stripe_count; 148f35a4deeSAlex Elder u64 features; /* Might be changeable someday? */ 149602adf40SYehuda Sadeh 150f84344f3SAlex Elder /* The remaining fields need to be updated occasionally */ 151f84344f3SAlex Elder u64 image_size; 152f84344f3SAlex Elder struct ceph_snap_context *snapc; 153f35a4deeSAlex Elder char *snap_names; /* format 1 only */ 154f35a4deeSAlex Elder u64 *snap_sizes; /* format 1 only */ 15559c2be1eSYehuda Sadeh }; 15659c2be1eSYehuda Sadeh 1570d7dbfceSAlex Elder /* 1580d7dbfceSAlex Elder * An rbd image specification. 1590d7dbfceSAlex Elder * 1600d7dbfceSAlex Elder * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely 161c66c6e0cSAlex Elder * identify an image. Each rbd_dev structure includes a pointer to 162c66c6e0cSAlex Elder * an rbd_spec structure that encapsulates this identity. 163c66c6e0cSAlex Elder * 164c66c6e0cSAlex Elder * Each of the id's in an rbd_spec has an associated name. For a 165c66c6e0cSAlex Elder * user-mapped image, the names are supplied and the id's associated 166c66c6e0cSAlex Elder * with them are looked up. For a layered image, a parent image is 167c66c6e0cSAlex Elder * defined by the tuple, and the names are looked up. 
168c66c6e0cSAlex Elder * 169c66c6e0cSAlex Elder * An rbd_dev structure contains a parent_spec pointer which is 170c66c6e0cSAlex Elder * non-null if the image it represents is a child in a layered 171c66c6e0cSAlex Elder * image. This pointer will refer to the rbd_spec structure used 172c66c6e0cSAlex Elder * by the parent rbd_dev for its own identity (i.e., the structure 173c66c6e0cSAlex Elder * is shared between the parent and child). 174c66c6e0cSAlex Elder * 175c66c6e0cSAlex Elder * Since these structures are populated once, during the discovery 176c66c6e0cSAlex Elder * phase of image construction, they are effectively immutable so 177c66c6e0cSAlex Elder * we make no effort to synchronize access to them. 178c66c6e0cSAlex Elder * 179c66c6e0cSAlex Elder * Note that code herein does not assume the image name is known (it 180c66c6e0cSAlex Elder * could be a null pointer). 1810d7dbfceSAlex Elder */ 1820d7dbfceSAlex Elder struct rbd_spec { 1830d7dbfceSAlex Elder u64 pool_id; 184ecb4dc22SAlex Elder const char *pool_name; 1850d7dbfceSAlex Elder 186ecb4dc22SAlex Elder const char *image_id; 187ecb4dc22SAlex Elder const char *image_name; 1880d7dbfceSAlex Elder 1890d7dbfceSAlex Elder u64 snap_id; 190ecb4dc22SAlex Elder const char *snap_name; 1910d7dbfceSAlex Elder 1920d7dbfceSAlex Elder struct kref kref; 1930d7dbfceSAlex Elder }; 1940d7dbfceSAlex Elder 195602adf40SYehuda Sadeh /* 196f0f8cef5SAlex Elder * an instance of the client. multiple devices may share an rbd client. 197602adf40SYehuda Sadeh */ 198602adf40SYehuda Sadeh struct rbd_client { 199602adf40SYehuda Sadeh struct ceph_client *client; 200602adf40SYehuda Sadeh struct kref kref; 201602adf40SYehuda Sadeh struct list_head node; 202602adf40SYehuda Sadeh }; 203602adf40SYehuda Sadeh 204bf0d5f50SAlex Elder struct rbd_img_request; 205bf0d5f50SAlex Elder typedef void (*rbd_img_callback_t)(struct rbd_img_request *); 206bf0d5f50SAlex Elder 207bf0d5f50SAlex Elder #define BAD_WHICH U32_MAX /* Good which or bad which, which? 
*/ 208bf0d5f50SAlex Elder 209bf0d5f50SAlex Elder struct rbd_obj_request; 210bf0d5f50SAlex Elder typedef void (*rbd_obj_callback_t)(struct rbd_obj_request *); 211bf0d5f50SAlex Elder 2129969ebc5SAlex Elder enum obj_request_type { 2139969ebc5SAlex Elder OBJ_REQUEST_NODATA, OBJ_REQUEST_BIO, OBJ_REQUEST_PAGES 2149969ebc5SAlex Elder }; 215bf0d5f50SAlex Elder 2166d2940c8SGuangliang Zhao enum obj_operation_type { 2176d2940c8SGuangliang Zhao OBJ_OP_WRITE, 2186d2940c8SGuangliang Zhao OBJ_OP_READ, 21990e98c52SGuangliang Zhao OBJ_OP_DISCARD, 2206d2940c8SGuangliang Zhao }; 2216d2940c8SGuangliang Zhao 222926f9b3fSAlex Elder enum obj_req_flags { 223926f9b3fSAlex Elder OBJ_REQ_DONE, /* completion flag: not done = 0, done = 1 */ 2246365d33aSAlex Elder OBJ_REQ_IMG_DATA, /* object usage: standalone = 0, image = 1 */ 2255679c59fSAlex Elder OBJ_REQ_KNOWN, /* EXISTS flag valid: no = 0, yes = 1 */ 2265679c59fSAlex Elder OBJ_REQ_EXISTS, /* target exists: no = 0, yes = 1 */ 227926f9b3fSAlex Elder }; 228926f9b3fSAlex Elder 229bf0d5f50SAlex Elder struct rbd_obj_request { 230bf0d5f50SAlex Elder const char *object_name; 231bf0d5f50SAlex Elder u64 offset; /* object start byte */ 232bf0d5f50SAlex Elder u64 length; /* bytes from offset */ 233926f9b3fSAlex Elder unsigned long flags; 234bf0d5f50SAlex Elder 235c5b5ef6cSAlex Elder /* 236c5b5ef6cSAlex Elder * An object request associated with an image will have its 237c5b5ef6cSAlex Elder * img_data flag set; a standalone object request will not. 238c5b5ef6cSAlex Elder * 239c5b5ef6cSAlex Elder * A standalone object request will have which == BAD_WHICH 240c5b5ef6cSAlex Elder * and a null obj_request pointer. 241c5b5ef6cSAlex Elder * 242c5b5ef6cSAlex Elder * An object request initiated in support of a layered image 243c5b5ef6cSAlex Elder * object (to check for its existence before a write) will 244c5b5ef6cSAlex Elder * have which == BAD_WHICH and a non-null obj_request pointer. 
245c5b5ef6cSAlex Elder * 246c5b5ef6cSAlex Elder * Finally, an object request for rbd image data will have 247c5b5ef6cSAlex Elder * which != BAD_WHICH, and will have a non-null img_request 248c5b5ef6cSAlex Elder * pointer. The value of which will be in the range 249c5b5ef6cSAlex Elder * 0..(img_request->obj_request_count-1). 250c5b5ef6cSAlex Elder */ 251c5b5ef6cSAlex Elder union { 252c5b5ef6cSAlex Elder struct rbd_obj_request *obj_request; /* STAT op */ 253c5b5ef6cSAlex Elder struct { 254bf0d5f50SAlex Elder struct rbd_img_request *img_request; 255c5b5ef6cSAlex Elder u64 img_offset; 256c5b5ef6cSAlex Elder /* links for img_request->obj_requests list */ 257c5b5ef6cSAlex Elder struct list_head links; 258c5b5ef6cSAlex Elder }; 259c5b5ef6cSAlex Elder }; 260bf0d5f50SAlex Elder u32 which; /* posn image request list */ 261bf0d5f50SAlex Elder 262bf0d5f50SAlex Elder enum obj_request_type type; 263788e2df3SAlex Elder union { 264bf0d5f50SAlex Elder struct bio *bio_list; 265788e2df3SAlex Elder struct { 266788e2df3SAlex Elder struct page **pages; 267788e2df3SAlex Elder u32 page_count; 268788e2df3SAlex Elder }; 269788e2df3SAlex Elder }; 2700eefd470SAlex Elder struct page **copyup_pages; 271ebda6408SAlex Elder u32 copyup_page_count; 272bf0d5f50SAlex Elder 273bf0d5f50SAlex Elder struct ceph_osd_request *osd_req; 274bf0d5f50SAlex Elder 275bf0d5f50SAlex Elder u64 xferred; /* bytes transferred */ 2761b83bef2SSage Weil int result; 277bf0d5f50SAlex Elder 278bf0d5f50SAlex Elder rbd_obj_callback_t callback; 279788e2df3SAlex Elder struct completion completion; 280bf0d5f50SAlex Elder 281bf0d5f50SAlex Elder struct kref kref; 282bf0d5f50SAlex Elder }; 283bf0d5f50SAlex Elder 2840c425248SAlex Elder enum img_req_flags { 2859849e986SAlex Elder IMG_REQ_WRITE, /* I/O direction: read = 0, write = 1 */ 2869849e986SAlex Elder IMG_REQ_CHILD, /* initiator: block = 0, child image = 1 */ 287d0b2e944SAlex Elder IMG_REQ_LAYERED, /* ENOENT handling: normal = 0, layered = 1 */ 28890e98c52SGuangliang Zhao 
IMG_REQ_DISCARD, /* discard: normal = 0, discard request = 1 */ 2890c425248SAlex Elder }; 2900c425248SAlex Elder 291bf0d5f50SAlex Elder struct rbd_img_request { 292bf0d5f50SAlex Elder struct rbd_device *rbd_dev; 293bf0d5f50SAlex Elder u64 offset; /* starting image byte offset */ 294bf0d5f50SAlex Elder u64 length; /* byte count from offset */ 2950c425248SAlex Elder unsigned long flags; 296bf0d5f50SAlex Elder union { 297bf0d5f50SAlex Elder u64 snap_id; /* for reads */ 2989849e986SAlex Elder struct ceph_snap_context *snapc; /* for writes */ 2999849e986SAlex Elder }; 3009849e986SAlex Elder union { 3019849e986SAlex Elder struct request *rq; /* block request */ 3029849e986SAlex Elder struct rbd_obj_request *obj_request; /* obj req initiator */ 303bf0d5f50SAlex Elder }; 3043d7efd18SAlex Elder struct page **copyup_pages; 305ebda6408SAlex Elder u32 copyup_page_count; 306bf0d5f50SAlex Elder spinlock_t completion_lock;/* protects next_completion */ 307bf0d5f50SAlex Elder u32 next_completion; 308bf0d5f50SAlex Elder rbd_img_callback_t callback; 30955f27e09SAlex Elder u64 xferred;/* aggregate bytes transferred */ 310a5a337d4SAlex Elder int result; /* first nonzero obj_request result */ 311bf0d5f50SAlex Elder 312bf0d5f50SAlex Elder u32 obj_request_count; 313bf0d5f50SAlex Elder struct list_head obj_requests; /* rbd_obj_request structs */ 314bf0d5f50SAlex Elder 315bf0d5f50SAlex Elder struct kref kref; 316bf0d5f50SAlex Elder }; 317bf0d5f50SAlex Elder 318bf0d5f50SAlex Elder #define for_each_obj_request(ireq, oreq) \ 319ef06f4d3SAlex Elder list_for_each_entry(oreq, &(ireq)->obj_requests, links) 320bf0d5f50SAlex Elder #define for_each_obj_request_from(ireq, oreq) \ 321ef06f4d3SAlex Elder list_for_each_entry_from(oreq, &(ireq)->obj_requests, links) 322bf0d5f50SAlex Elder #define for_each_obj_request_safe(ireq, oreq, n) \ 323ef06f4d3SAlex Elder list_for_each_entry_safe_reverse(oreq, n, &(ireq)->obj_requests, links) 324bf0d5f50SAlex Elder 325f84344f3SAlex Elder struct rbd_mapping { 
32699c1f08fSAlex Elder u64 size; 32734b13184SAlex Elder u64 features; 328f84344f3SAlex Elder bool read_only; 329f84344f3SAlex Elder }; 330f84344f3SAlex Elder 331602adf40SYehuda Sadeh /* 332602adf40SYehuda Sadeh * a single device 333602adf40SYehuda Sadeh */ 334602adf40SYehuda Sadeh struct rbd_device { 335de71a297SAlex Elder int dev_id; /* blkdev unique id */ 336602adf40SYehuda Sadeh 337602adf40SYehuda Sadeh int major; /* blkdev assigned major */ 338dd82fff1SIlya Dryomov int minor; 339602adf40SYehuda Sadeh struct gendisk *disk; /* blkdev's gendisk and rq */ 340602adf40SYehuda Sadeh 341a30b71b9SAlex Elder u32 image_format; /* Either 1 or 2 */ 342602adf40SYehuda Sadeh struct rbd_client *rbd_client; 343602adf40SYehuda Sadeh 344602adf40SYehuda Sadeh char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */ 345602adf40SYehuda Sadeh 346b82d167bSAlex Elder spinlock_t lock; /* queue, flags, open_count */ 347602adf40SYehuda Sadeh 348602adf40SYehuda Sadeh struct rbd_image_header header; 349b82d167bSAlex Elder unsigned long flags; /* possibly lock protected */ 3500d7dbfceSAlex Elder struct rbd_spec *spec; 351d147543dSIlya Dryomov struct rbd_options *opts; 352602adf40SYehuda Sadeh 3530d7dbfceSAlex Elder char *header_name; 354971f839aSAlex Elder 3550903e875SAlex Elder struct ceph_file_layout layout; 3560903e875SAlex Elder 35759c2be1eSYehuda Sadeh struct ceph_osd_event *watch_event; 358975241afSAlex Elder struct rbd_obj_request *watch_request; 35959c2be1eSYehuda Sadeh 36086b00e0dSAlex Elder struct rbd_spec *parent_spec; 36186b00e0dSAlex Elder u64 parent_overlap; 362a2acd00eSAlex Elder atomic_t parent_ref; 3632f82ee54SAlex Elder struct rbd_device *parent; 36486b00e0dSAlex Elder 3657ad18afaSChristoph Hellwig /* Block layer tags. 
*/ 3667ad18afaSChristoph Hellwig struct blk_mq_tag_set tag_set; 3677ad18afaSChristoph Hellwig 368c666601aSJosh Durgin /* protects updating the header */ 369c666601aSJosh Durgin struct rw_semaphore header_rwsem; 370f84344f3SAlex Elder 371f84344f3SAlex Elder struct rbd_mapping mapping; 372602adf40SYehuda Sadeh 373602adf40SYehuda Sadeh struct list_head node; 374dfc5606dSYehuda Sadeh 375dfc5606dSYehuda Sadeh /* sysfs related */ 376dfc5606dSYehuda Sadeh struct device dev; 377b82d167bSAlex Elder unsigned long open_count; /* protected by lock */ 378dfc5606dSYehuda Sadeh }; 379dfc5606dSYehuda Sadeh 380b82d167bSAlex Elder /* 381b82d167bSAlex Elder * Flag bits for rbd_dev->flags. If atomicity is required, 382b82d167bSAlex Elder * rbd_dev->lock is used to protect access. 383b82d167bSAlex Elder * 384b82d167bSAlex Elder * Currently, only the "removing" flag (which is coupled with the 385b82d167bSAlex Elder * "open_count" field) requires atomic access. 386b82d167bSAlex Elder */ 3876d292906SAlex Elder enum rbd_dev_flags { 3886d292906SAlex Elder RBD_DEV_FLAG_EXISTS, /* mapped snapshot has not been deleted */ 389b82d167bSAlex Elder RBD_DEV_FLAG_REMOVING, /* this mapping is being removed */ 3906d292906SAlex Elder }; 3916d292906SAlex Elder 392cfbf6377SAlex Elder static DEFINE_MUTEX(client_mutex); /* Serialize client creation */ 393e124a82fSAlex Elder 394602adf40SYehuda Sadeh static LIST_HEAD(rbd_dev_list); /* devices */ 395e124a82fSAlex Elder static DEFINE_SPINLOCK(rbd_dev_list_lock); 396e124a82fSAlex Elder 397602adf40SYehuda Sadeh static LIST_HEAD(rbd_client_list); /* clients */ 398432b8587SAlex Elder static DEFINE_SPINLOCK(rbd_client_list_lock); 399602adf40SYehuda Sadeh 40078c2a44aSAlex Elder /* Slab caches for frequently-allocated structures */ 40178c2a44aSAlex Elder 4021c2a9dfeSAlex Elder static struct kmem_cache *rbd_img_request_cache; 403868311b1SAlex Elder static struct kmem_cache *rbd_obj_request_cache; 40478c2a44aSAlex Elder static struct kmem_cache *rbd_segment_name_cache; 
4051c2a9dfeSAlex Elder 4069b60e70bSIlya Dryomov static int rbd_major; 407f8a22fc2SIlya Dryomov static DEFINE_IDA(rbd_dev_id_ida); 408f8a22fc2SIlya Dryomov 409f5ee37bdSIlya Dryomov static struct workqueue_struct *rbd_wq; 410f5ee37bdSIlya Dryomov 4119b60e70bSIlya Dryomov /* 4129b60e70bSIlya Dryomov * Default to false for now, as single-major requires >= 0.75 version of 4139b60e70bSIlya Dryomov * userspace rbd utility. 4149b60e70bSIlya Dryomov */ 4159b60e70bSIlya Dryomov static bool single_major = false; 4169b60e70bSIlya Dryomov module_param(single_major, bool, S_IRUGO); 4179b60e70bSIlya Dryomov MODULE_PARM_DESC(single_major, "Use a single major number for all rbd devices (default: false)"); 4189b60e70bSIlya Dryomov 4193d7efd18SAlex Elder static int rbd_img_request_submit(struct rbd_img_request *img_request); 4203d7efd18SAlex Elder 421200a6a8bSAlex Elder static void rbd_dev_device_release(struct device *dev); 422dfc5606dSYehuda Sadeh 423f0f8cef5SAlex Elder static ssize_t rbd_add(struct bus_type *bus, const char *buf, 424f0f8cef5SAlex Elder size_t count); 425f0f8cef5SAlex Elder static ssize_t rbd_remove(struct bus_type *bus, const char *buf, 426f0f8cef5SAlex Elder size_t count); 4279b60e70bSIlya Dryomov static ssize_t rbd_add_single_major(struct bus_type *bus, const char *buf, 4289b60e70bSIlya Dryomov size_t count); 4299b60e70bSIlya Dryomov static ssize_t rbd_remove_single_major(struct bus_type *bus, const char *buf, 4309b60e70bSIlya Dryomov size_t count); 4316d69bb53SIlya Dryomov static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth); 432a2acd00eSAlex Elder static void rbd_spec_put(struct rbd_spec *spec); 433f0f8cef5SAlex Elder 4349b60e70bSIlya Dryomov static int rbd_dev_id_to_minor(int dev_id) 4359b60e70bSIlya Dryomov { 4367e513d43SIlya Dryomov return dev_id << RBD_SINGLE_MAJOR_PART_SHIFT; 4379b60e70bSIlya Dryomov } 4389b60e70bSIlya Dryomov 4399b60e70bSIlya Dryomov static int minor_to_rbd_dev_id(int minor) 4409b60e70bSIlya Dryomov { 4417e513d43SIlya 
Dryomov return minor >> RBD_SINGLE_MAJOR_PART_SHIFT; 4429b60e70bSIlya Dryomov } 4439b60e70bSIlya Dryomov 444b15a21ddSGreg Kroah-Hartman static BUS_ATTR(add, S_IWUSR, NULL, rbd_add); 445b15a21ddSGreg Kroah-Hartman static BUS_ATTR(remove, S_IWUSR, NULL, rbd_remove); 4469b60e70bSIlya Dryomov static BUS_ATTR(add_single_major, S_IWUSR, NULL, rbd_add_single_major); 4479b60e70bSIlya Dryomov static BUS_ATTR(remove_single_major, S_IWUSR, NULL, rbd_remove_single_major); 448b15a21ddSGreg Kroah-Hartman 449b15a21ddSGreg Kroah-Hartman static struct attribute *rbd_bus_attrs[] = { 450b15a21ddSGreg Kroah-Hartman &bus_attr_add.attr, 451b15a21ddSGreg Kroah-Hartman &bus_attr_remove.attr, 4529b60e70bSIlya Dryomov &bus_attr_add_single_major.attr, 4539b60e70bSIlya Dryomov &bus_attr_remove_single_major.attr, 454b15a21ddSGreg Kroah-Hartman NULL, 455f0f8cef5SAlex Elder }; 45692c76dc0SIlya Dryomov 45792c76dc0SIlya Dryomov static umode_t rbd_bus_is_visible(struct kobject *kobj, 45892c76dc0SIlya Dryomov struct attribute *attr, int index) 45992c76dc0SIlya Dryomov { 4609b60e70bSIlya Dryomov if (!single_major && 4619b60e70bSIlya Dryomov (attr == &bus_attr_add_single_major.attr || 4629b60e70bSIlya Dryomov attr == &bus_attr_remove_single_major.attr)) 4639b60e70bSIlya Dryomov return 0; 4649b60e70bSIlya Dryomov 46592c76dc0SIlya Dryomov return attr->mode; 46692c76dc0SIlya Dryomov } 46792c76dc0SIlya Dryomov 46892c76dc0SIlya Dryomov static const struct attribute_group rbd_bus_group = { 46992c76dc0SIlya Dryomov .attrs = rbd_bus_attrs, 47092c76dc0SIlya Dryomov .is_visible = rbd_bus_is_visible, 47192c76dc0SIlya Dryomov }; 47292c76dc0SIlya Dryomov __ATTRIBUTE_GROUPS(rbd_bus); 473f0f8cef5SAlex Elder 474f0f8cef5SAlex Elder static struct bus_type rbd_bus_type = { 475f0f8cef5SAlex Elder .name = "rbd", 476b15a21ddSGreg Kroah-Hartman .bus_groups = rbd_bus_groups, 477f0f8cef5SAlex Elder }; 478f0f8cef5SAlex Elder 479f0f8cef5SAlex Elder static void rbd_root_dev_release(struct device *dev) 480f0f8cef5SAlex Elder { 
481f0f8cef5SAlex Elder } 482f0f8cef5SAlex Elder 483f0f8cef5SAlex Elder static struct device rbd_root_dev = { 484f0f8cef5SAlex Elder .init_name = "rbd", 485f0f8cef5SAlex Elder .release = rbd_root_dev_release, 486f0f8cef5SAlex Elder }; 487f0f8cef5SAlex Elder 48806ecc6cbSAlex Elder static __printf(2, 3) 48906ecc6cbSAlex Elder void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...) 49006ecc6cbSAlex Elder { 49106ecc6cbSAlex Elder struct va_format vaf; 49206ecc6cbSAlex Elder va_list args; 49306ecc6cbSAlex Elder 49406ecc6cbSAlex Elder va_start(args, fmt); 49506ecc6cbSAlex Elder vaf.fmt = fmt; 49606ecc6cbSAlex Elder vaf.va = &args; 49706ecc6cbSAlex Elder 49806ecc6cbSAlex Elder if (!rbd_dev) 49906ecc6cbSAlex Elder printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf); 50006ecc6cbSAlex Elder else if (rbd_dev->disk) 50106ecc6cbSAlex Elder printk(KERN_WARNING "%s: %s: %pV\n", 50206ecc6cbSAlex Elder RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf); 50306ecc6cbSAlex Elder else if (rbd_dev->spec && rbd_dev->spec->image_name) 50406ecc6cbSAlex Elder printk(KERN_WARNING "%s: image %s: %pV\n", 50506ecc6cbSAlex Elder RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf); 50606ecc6cbSAlex Elder else if (rbd_dev->spec && rbd_dev->spec->image_id) 50706ecc6cbSAlex Elder printk(KERN_WARNING "%s: id %s: %pV\n", 50806ecc6cbSAlex Elder RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf); 50906ecc6cbSAlex Elder else /* punt */ 51006ecc6cbSAlex Elder printk(KERN_WARNING "%s: rbd_dev %p: %pV\n", 51106ecc6cbSAlex Elder RBD_DRV_NAME, rbd_dev, &vaf); 51206ecc6cbSAlex Elder va_end(args); 51306ecc6cbSAlex Elder } 51406ecc6cbSAlex Elder 515aafb230eSAlex Elder #ifdef RBD_DEBUG 516aafb230eSAlex Elder #define rbd_assert(expr) \ 517aafb230eSAlex Elder if (unlikely(!(expr))) { \ 518aafb230eSAlex Elder printk(KERN_ERR "\nAssertion failure in %s() " \ 519aafb230eSAlex Elder "at line %d:\n\n" \ 520aafb230eSAlex Elder "\trbd_assert(%s);\n\n", \ 521aafb230eSAlex Elder __func__, __LINE__, #expr); \ 522aafb230eSAlex Elder 
BUG(); \ 523aafb230eSAlex Elder } 524aafb230eSAlex Elder #else /* !RBD_DEBUG */ 525aafb230eSAlex Elder # define rbd_assert(expr) ((void) 0) 526aafb230eSAlex Elder #endif /* !RBD_DEBUG */ 527dfc5606dSYehuda Sadeh 5282761713dSIlya Dryomov static void rbd_osd_copyup_callback(struct rbd_obj_request *obj_request); 529b454e36dSAlex Elder static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request); 53005a46afdSAlex Elder static void rbd_img_parent_read(struct rbd_obj_request *obj_request); 53105a46afdSAlex Elder static void rbd_dev_remove_parent(struct rbd_device *rbd_dev); 5328b3e1a56SAlex Elder 533cc4a38bdSAlex Elder static int rbd_dev_refresh(struct rbd_device *rbd_dev); 5342df3fac7SAlex Elder static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev); 535a720ae09SIlya Dryomov static int rbd_dev_header_info(struct rbd_device *rbd_dev); 536e8f59b59SIlya Dryomov static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev); 53754cac61fSAlex Elder static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, 53854cac61fSAlex Elder u64 snap_id); 5392ad3d716SAlex Elder static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id, 5402ad3d716SAlex Elder u8 *order, u64 *snap_size); 5412ad3d716SAlex Elder static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id, 5422ad3d716SAlex Elder u64 *snap_features); 5432ad3d716SAlex Elder static u64 rbd_snap_id_by_name(struct rbd_device *rbd_dev, const char *name); 54459c2be1eSYehuda Sadeh 545602adf40SYehuda Sadeh static int rbd_open(struct block_device *bdev, fmode_t mode) 546602adf40SYehuda Sadeh { 547f0f8cef5SAlex Elder struct rbd_device *rbd_dev = bdev->bd_disk->private_data; 548b82d167bSAlex Elder bool removing = false; 549602adf40SYehuda Sadeh 550f84344f3SAlex Elder if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only) 551602adf40SYehuda Sadeh return -EROFS; 552602adf40SYehuda Sadeh 553a14ea269SAlex Elder spin_lock_irq(&rbd_dev->lock); 554b82d167bSAlex Elder if 
(test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags)) 555b82d167bSAlex Elder removing = true; 556b82d167bSAlex Elder else 557b82d167bSAlex Elder rbd_dev->open_count++; 558a14ea269SAlex Elder spin_unlock_irq(&rbd_dev->lock); 559b82d167bSAlex Elder if (removing) 560b82d167bSAlex Elder return -ENOENT; 561b82d167bSAlex Elder 562c3e946ceSAlex Elder (void) get_device(&rbd_dev->dev); 563340c7a2bSAlex Elder 564602adf40SYehuda Sadeh return 0; 565602adf40SYehuda Sadeh } 566602adf40SYehuda Sadeh 567db2a144bSAl Viro static void rbd_release(struct gendisk *disk, fmode_t mode) 568dfc5606dSYehuda Sadeh { 569dfc5606dSYehuda Sadeh struct rbd_device *rbd_dev = disk->private_data; 570b82d167bSAlex Elder unsigned long open_count_before; 571b82d167bSAlex Elder 572a14ea269SAlex Elder spin_lock_irq(&rbd_dev->lock); 573b82d167bSAlex Elder open_count_before = rbd_dev->open_count--; 574a14ea269SAlex Elder spin_unlock_irq(&rbd_dev->lock); 575b82d167bSAlex Elder rbd_assert(open_count_before > 0); 576dfc5606dSYehuda Sadeh 577c3e946ceSAlex Elder put_device(&rbd_dev->dev); 578dfc5606dSYehuda Sadeh } 579dfc5606dSYehuda Sadeh 580131fd9f6SGuangliang Zhao static int rbd_ioctl_set_ro(struct rbd_device *rbd_dev, unsigned long arg) 581131fd9f6SGuangliang Zhao { 58277f33c03SJosh Durgin int ret = 0; 583131fd9f6SGuangliang Zhao int val; 584131fd9f6SGuangliang Zhao bool ro; 58577f33c03SJosh Durgin bool ro_changed = false; 586131fd9f6SGuangliang Zhao 58777f33c03SJosh Durgin /* get_user() may sleep, so call it before taking rbd_dev->lock */ 588131fd9f6SGuangliang Zhao if (get_user(val, (int __user *)(arg))) 589131fd9f6SGuangliang Zhao return -EFAULT; 590131fd9f6SGuangliang Zhao 591131fd9f6SGuangliang Zhao ro = val ? 
true : false; 592131fd9f6SGuangliang Zhao /* Snapshot doesn't allow to write*/ 593131fd9f6SGuangliang Zhao if (rbd_dev->spec->snap_id != CEPH_NOSNAP && !ro) 594131fd9f6SGuangliang Zhao return -EROFS; 595131fd9f6SGuangliang Zhao 59677f33c03SJosh Durgin spin_lock_irq(&rbd_dev->lock); 59777f33c03SJosh Durgin /* prevent others open this device */ 59877f33c03SJosh Durgin if (rbd_dev->open_count > 1) { 59977f33c03SJosh Durgin ret = -EBUSY; 60077f33c03SJosh Durgin goto out; 601131fd9f6SGuangliang Zhao } 602131fd9f6SGuangliang Zhao 60377f33c03SJosh Durgin if (rbd_dev->mapping.read_only != ro) { 60477f33c03SJosh Durgin rbd_dev->mapping.read_only = ro; 60577f33c03SJosh Durgin ro_changed = true; 60677f33c03SJosh Durgin } 60777f33c03SJosh Durgin 60877f33c03SJosh Durgin out: 60977f33c03SJosh Durgin spin_unlock_irq(&rbd_dev->lock); 61077f33c03SJosh Durgin /* set_disk_ro() may sleep, so call it after releasing rbd_dev->lock */ 61177f33c03SJosh Durgin if (ret == 0 && ro_changed) 61277f33c03SJosh Durgin set_disk_ro(rbd_dev->disk, ro ? 
1 : 0); 61377f33c03SJosh Durgin 61477f33c03SJosh Durgin return ret; 615131fd9f6SGuangliang Zhao } 616131fd9f6SGuangliang Zhao 617131fd9f6SGuangliang Zhao static int rbd_ioctl(struct block_device *bdev, fmode_t mode, 618131fd9f6SGuangliang Zhao unsigned int cmd, unsigned long arg) 619131fd9f6SGuangliang Zhao { 620131fd9f6SGuangliang Zhao struct rbd_device *rbd_dev = bdev->bd_disk->private_data; 621131fd9f6SGuangliang Zhao int ret = 0; 622131fd9f6SGuangliang Zhao 623131fd9f6SGuangliang Zhao switch (cmd) { 624131fd9f6SGuangliang Zhao case BLKROSET: 625131fd9f6SGuangliang Zhao ret = rbd_ioctl_set_ro(rbd_dev, arg); 626131fd9f6SGuangliang Zhao break; 627131fd9f6SGuangliang Zhao default: 628131fd9f6SGuangliang Zhao ret = -ENOTTY; 629131fd9f6SGuangliang Zhao } 630131fd9f6SGuangliang Zhao 631131fd9f6SGuangliang Zhao return ret; 632131fd9f6SGuangliang Zhao } 633131fd9f6SGuangliang Zhao 634131fd9f6SGuangliang Zhao #ifdef CONFIG_COMPAT 635131fd9f6SGuangliang Zhao static int rbd_compat_ioctl(struct block_device *bdev, fmode_t mode, 636131fd9f6SGuangliang Zhao unsigned int cmd, unsigned long arg) 637131fd9f6SGuangliang Zhao { 638131fd9f6SGuangliang Zhao return rbd_ioctl(bdev, mode, cmd, arg); 639131fd9f6SGuangliang Zhao } 640131fd9f6SGuangliang Zhao #endif /* CONFIG_COMPAT */ 641131fd9f6SGuangliang Zhao 642602adf40SYehuda Sadeh static const struct block_device_operations rbd_bd_ops = { 643602adf40SYehuda Sadeh .owner = THIS_MODULE, 644602adf40SYehuda Sadeh .open = rbd_open, 645dfc5606dSYehuda Sadeh .release = rbd_release, 646131fd9f6SGuangliang Zhao .ioctl = rbd_ioctl, 647131fd9f6SGuangliang Zhao #ifdef CONFIG_COMPAT 648131fd9f6SGuangliang Zhao .compat_ioctl = rbd_compat_ioctl, 649131fd9f6SGuangliang Zhao #endif 650602adf40SYehuda Sadeh }; 651602adf40SYehuda Sadeh 652602adf40SYehuda Sadeh /* 6537262cfcaSAlex Elder * Initialize an rbd client instance. Success or not, this function 654cfbf6377SAlex Elder * consumes ceph_opts. Caller holds client_mutex. 
655602adf40SYehuda Sadeh */ 656f8c38929SAlex Elder static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts) 657602adf40SYehuda Sadeh { 658602adf40SYehuda Sadeh struct rbd_client *rbdc; 659602adf40SYehuda Sadeh int ret = -ENOMEM; 660602adf40SYehuda Sadeh 66137206ee5SAlex Elder dout("%s:\n", __func__); 662602adf40SYehuda Sadeh rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL); 663602adf40SYehuda Sadeh if (!rbdc) 664602adf40SYehuda Sadeh goto out_opt; 665602adf40SYehuda Sadeh 666602adf40SYehuda Sadeh kref_init(&rbdc->kref); 667602adf40SYehuda Sadeh INIT_LIST_HEAD(&rbdc->node); 668602adf40SYehuda Sadeh 66943ae4701SAlex Elder rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0); 670602adf40SYehuda Sadeh if (IS_ERR(rbdc->client)) 67108f75463SAlex Elder goto out_rbdc; 67243ae4701SAlex Elder ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */ 673602adf40SYehuda Sadeh 674602adf40SYehuda Sadeh ret = ceph_open_session(rbdc->client); 675602adf40SYehuda Sadeh if (ret < 0) 67608f75463SAlex Elder goto out_client; 677602adf40SYehuda Sadeh 678432b8587SAlex Elder spin_lock(&rbd_client_list_lock); 679602adf40SYehuda Sadeh list_add_tail(&rbdc->node, &rbd_client_list); 680432b8587SAlex Elder spin_unlock(&rbd_client_list_lock); 681602adf40SYehuda Sadeh 68237206ee5SAlex Elder dout("%s: rbdc %p\n", __func__, rbdc); 683bc534d86SAlex Elder 684602adf40SYehuda Sadeh return rbdc; 68508f75463SAlex Elder out_client: 686602adf40SYehuda Sadeh ceph_destroy_client(rbdc->client); 68708f75463SAlex Elder out_rbdc: 688602adf40SYehuda Sadeh kfree(rbdc); 689602adf40SYehuda Sadeh out_opt: 69043ae4701SAlex Elder if (ceph_opts) 69143ae4701SAlex Elder ceph_destroy_options(ceph_opts); 69237206ee5SAlex Elder dout("%s: error %d\n", __func__, ret); 69337206ee5SAlex Elder 69428f259b7SVasiliy Kulikov return ERR_PTR(ret); 695602adf40SYehuda Sadeh } 696602adf40SYehuda Sadeh 6972f82ee54SAlex Elder static struct rbd_client *__rbd_get_client(struct rbd_client *rbdc) 
6982f82ee54SAlex Elder { 6992f82ee54SAlex Elder kref_get(&rbdc->kref); 7002f82ee54SAlex Elder 7012f82ee54SAlex Elder return rbdc; 7022f82ee54SAlex Elder } 7032f82ee54SAlex Elder 704602adf40SYehuda Sadeh /* 7051f7ba331SAlex Elder * Find a ceph client with specific addr and configuration. If 7061f7ba331SAlex Elder * found, bump its reference count. 707602adf40SYehuda Sadeh */ 7081f7ba331SAlex Elder static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts) 709602adf40SYehuda Sadeh { 710602adf40SYehuda Sadeh struct rbd_client *client_node; 7111f7ba331SAlex Elder bool found = false; 712602adf40SYehuda Sadeh 71343ae4701SAlex Elder if (ceph_opts->flags & CEPH_OPT_NOSHARE) 714602adf40SYehuda Sadeh return NULL; 715602adf40SYehuda Sadeh 7161f7ba331SAlex Elder spin_lock(&rbd_client_list_lock); 7171f7ba331SAlex Elder list_for_each_entry(client_node, &rbd_client_list, node) { 7181f7ba331SAlex Elder if (!ceph_compare_options(ceph_opts, client_node->client)) { 7192f82ee54SAlex Elder __rbd_get_client(client_node); 7202f82ee54SAlex Elder 7211f7ba331SAlex Elder found = true; 7221f7ba331SAlex Elder break; 7231f7ba331SAlex Elder } 7241f7ba331SAlex Elder } 7251f7ba331SAlex Elder spin_unlock(&rbd_client_list_lock); 7261f7ba331SAlex Elder 7271f7ba331SAlex Elder return found ? 
client_node : NULL; 728602adf40SYehuda Sadeh } 729602adf40SYehuda Sadeh 730602adf40SYehuda Sadeh /* 731210c104cSIlya Dryomov * (Per device) rbd map options 73259c2be1eSYehuda Sadeh */ 73359c2be1eSYehuda Sadeh enum { 734b5584180SIlya Dryomov Opt_queue_depth, 73559c2be1eSYehuda Sadeh Opt_last_int, 73659c2be1eSYehuda Sadeh /* int args above */ 73759c2be1eSYehuda Sadeh Opt_last_string, 73859c2be1eSYehuda Sadeh /* string args above */ 739cc0538b6SAlex Elder Opt_read_only, 740cc0538b6SAlex Elder Opt_read_write, 741210c104cSIlya Dryomov Opt_err 74259c2be1eSYehuda Sadeh }; 74359c2be1eSYehuda Sadeh 74443ae4701SAlex Elder static match_table_t rbd_opts_tokens = { 745b5584180SIlya Dryomov {Opt_queue_depth, "queue_depth=%d"}, 74659c2be1eSYehuda Sadeh /* int args above */ 74759c2be1eSYehuda Sadeh /* string args above */ 748be466c1cSAlex Elder {Opt_read_only, "read_only"}, 749cc0538b6SAlex Elder {Opt_read_only, "ro"}, /* Alternate spelling */ 750cc0538b6SAlex Elder {Opt_read_write, "read_write"}, 751cc0538b6SAlex Elder {Opt_read_write, "rw"}, /* Alternate spelling */ 752210c104cSIlya Dryomov {Opt_err, NULL} 75359c2be1eSYehuda Sadeh }; 75459c2be1eSYehuda Sadeh 75598571b5aSAlex Elder struct rbd_options { 756b5584180SIlya Dryomov int queue_depth; 75798571b5aSAlex Elder bool read_only; 75898571b5aSAlex Elder }; 75998571b5aSAlex Elder 760b5584180SIlya Dryomov #define RBD_QUEUE_DEPTH_DEFAULT BLKDEV_MAX_RQ 76198571b5aSAlex Elder #define RBD_READ_ONLY_DEFAULT false 76298571b5aSAlex Elder 76359c2be1eSYehuda Sadeh static int parse_rbd_opts_token(char *c, void *private) 76459c2be1eSYehuda Sadeh { 76543ae4701SAlex Elder struct rbd_options *rbd_opts = private; 76659c2be1eSYehuda Sadeh substring_t argstr[MAX_OPT_ARGS]; 76759c2be1eSYehuda Sadeh int token, intval, ret; 76859c2be1eSYehuda Sadeh 76943ae4701SAlex Elder token = match_token(c, rbd_opts_tokens, argstr); 77059c2be1eSYehuda Sadeh if (token < Opt_last_int) { 77159c2be1eSYehuda Sadeh ret = match_int(&argstr[0], &intval); 
77259c2be1eSYehuda Sadeh if (ret < 0) { 773210c104cSIlya Dryomov pr_err("bad mount option arg (not int) at '%s'\n", c); 77459c2be1eSYehuda Sadeh return ret; 77559c2be1eSYehuda Sadeh } 77659c2be1eSYehuda Sadeh dout("got int token %d val %d\n", token, intval); 77759c2be1eSYehuda Sadeh } else if (token > Opt_last_int && token < Opt_last_string) { 778210c104cSIlya Dryomov dout("got string token %d val %s\n", token, argstr[0].from); 77959c2be1eSYehuda Sadeh } else { 78059c2be1eSYehuda Sadeh dout("got token %d\n", token); 78159c2be1eSYehuda Sadeh } 78259c2be1eSYehuda Sadeh 78359c2be1eSYehuda Sadeh switch (token) { 784b5584180SIlya Dryomov case Opt_queue_depth: 785b5584180SIlya Dryomov if (intval < 1) { 786b5584180SIlya Dryomov pr_err("queue_depth out of range\n"); 787b5584180SIlya Dryomov return -EINVAL; 788b5584180SIlya Dryomov } 789b5584180SIlya Dryomov rbd_opts->queue_depth = intval; 790b5584180SIlya Dryomov break; 791cc0538b6SAlex Elder case Opt_read_only: 792cc0538b6SAlex Elder rbd_opts->read_only = true; 793cc0538b6SAlex Elder break; 794cc0538b6SAlex Elder case Opt_read_write: 795cc0538b6SAlex Elder rbd_opts->read_only = false; 796cc0538b6SAlex Elder break; 79759c2be1eSYehuda Sadeh default: 798210c104cSIlya Dryomov /* libceph prints "bad option" msg */ 799210c104cSIlya Dryomov return -EINVAL; 80059c2be1eSYehuda Sadeh } 801210c104cSIlya Dryomov 80259c2be1eSYehuda Sadeh return 0; 80359c2be1eSYehuda Sadeh } 80459c2be1eSYehuda Sadeh 8056d2940c8SGuangliang Zhao static char* obj_op_name(enum obj_operation_type op_type) 8066d2940c8SGuangliang Zhao { 8076d2940c8SGuangliang Zhao switch (op_type) { 8086d2940c8SGuangliang Zhao case OBJ_OP_READ: 8096d2940c8SGuangliang Zhao return "read"; 8106d2940c8SGuangliang Zhao case OBJ_OP_WRITE: 8116d2940c8SGuangliang Zhao return "write"; 81290e98c52SGuangliang Zhao case OBJ_OP_DISCARD: 81390e98c52SGuangliang Zhao return "discard"; 8146d2940c8SGuangliang Zhao default: 8156d2940c8SGuangliang Zhao return "???"; 8166d2940c8SGuangliang Zhao } 
8176d2940c8SGuangliang Zhao } 8186d2940c8SGuangliang Zhao 81959c2be1eSYehuda Sadeh /* 820602adf40SYehuda Sadeh * Get a ceph client with specific addr and configuration, if one does 8217262cfcaSAlex Elder * not exist create it. Either way, ceph_opts is consumed by this 8227262cfcaSAlex Elder * function. 823602adf40SYehuda Sadeh */ 8249d3997fdSAlex Elder static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts) 825602adf40SYehuda Sadeh { 826f8c38929SAlex Elder struct rbd_client *rbdc; 82759c2be1eSYehuda Sadeh 828cfbf6377SAlex Elder mutex_lock_nested(&client_mutex, SINGLE_DEPTH_NESTING); 8291f7ba331SAlex Elder rbdc = rbd_client_find(ceph_opts); 8309d3997fdSAlex Elder if (rbdc) /* using an existing client */ 83143ae4701SAlex Elder ceph_destroy_options(ceph_opts); 8329d3997fdSAlex Elder else 833f8c38929SAlex Elder rbdc = rbd_client_create(ceph_opts); 834cfbf6377SAlex Elder mutex_unlock(&client_mutex); 835d720bcb0SAlex Elder 8369d3997fdSAlex Elder return rbdc; 837602adf40SYehuda Sadeh } 838602adf40SYehuda Sadeh 839602adf40SYehuda Sadeh /* 840602adf40SYehuda Sadeh * Destroy ceph client 841d23a4b3fSAlex Elder * 842432b8587SAlex Elder * Caller must hold rbd_client_list_lock. 843602adf40SYehuda Sadeh */ 844602adf40SYehuda Sadeh static void rbd_client_release(struct kref *kref) 845602adf40SYehuda Sadeh { 846602adf40SYehuda Sadeh struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref); 847602adf40SYehuda Sadeh 84837206ee5SAlex Elder dout("%s: rbdc %p\n", __func__, rbdc); 849cd9d9f5dSAlex Elder spin_lock(&rbd_client_list_lock); 850602adf40SYehuda Sadeh list_del(&rbdc->node); 851cd9d9f5dSAlex Elder spin_unlock(&rbd_client_list_lock); 852602adf40SYehuda Sadeh 853602adf40SYehuda Sadeh ceph_destroy_client(rbdc->client); 854602adf40SYehuda Sadeh kfree(rbdc); 855602adf40SYehuda Sadeh } 856602adf40SYehuda Sadeh 857602adf40SYehuda Sadeh /* 858602adf40SYehuda Sadeh * Drop reference to ceph client node. 
If it's not referenced anymore, release 859602adf40SYehuda Sadeh * it. 860602adf40SYehuda Sadeh */ 8619d3997fdSAlex Elder static void rbd_put_client(struct rbd_client *rbdc) 862602adf40SYehuda Sadeh { 863c53d5893SAlex Elder if (rbdc) 8649d3997fdSAlex Elder kref_put(&rbdc->kref, rbd_client_release); 865602adf40SYehuda Sadeh } 866602adf40SYehuda Sadeh 867a30b71b9SAlex Elder static bool rbd_image_format_valid(u32 image_format) 868a30b71b9SAlex Elder { 869a30b71b9SAlex Elder return image_format == 1 || image_format == 2; 870a30b71b9SAlex Elder } 871a30b71b9SAlex Elder 8728e94af8eSAlex Elder static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk) 8738e94af8eSAlex Elder { 874103a150fSAlex Elder size_t size; 875103a150fSAlex Elder u32 snap_count; 876103a150fSAlex Elder 877103a150fSAlex Elder /* The header has to start with the magic rbd header text */ 878103a150fSAlex Elder if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT))) 879103a150fSAlex Elder return false; 880103a150fSAlex Elder 881db2388b6SAlex Elder /* The bio layer requires at least sector-sized I/O */ 882db2388b6SAlex Elder 883db2388b6SAlex Elder if (ondisk->options.order < SECTOR_SHIFT) 884db2388b6SAlex Elder return false; 885db2388b6SAlex Elder 886db2388b6SAlex Elder /* If we use u64 in a few spots we may be able to loosen this */ 887db2388b6SAlex Elder 888db2388b6SAlex Elder if (ondisk->options.order > 8 * sizeof (int) - 1) 889db2388b6SAlex Elder return false; 890db2388b6SAlex Elder 891103a150fSAlex Elder /* 892103a150fSAlex Elder * The size of a snapshot header has to fit in a size_t, and 893103a150fSAlex Elder * that limits the number of snapshots. 
894103a150fSAlex Elder */ 895103a150fSAlex Elder snap_count = le32_to_cpu(ondisk->snap_count); 896103a150fSAlex Elder size = SIZE_MAX - sizeof (struct ceph_snap_context); 897103a150fSAlex Elder if (snap_count > size / sizeof (__le64)) 898103a150fSAlex Elder return false; 899103a150fSAlex Elder 900103a150fSAlex Elder /* 901103a150fSAlex Elder * Not only that, but the size of the entire the snapshot 902103a150fSAlex Elder * header must also be representable in a size_t. 903103a150fSAlex Elder */ 904103a150fSAlex Elder size -= snap_count * sizeof (__le64); 905103a150fSAlex Elder if ((u64) size < le64_to_cpu(ondisk->snap_names_len)) 906103a150fSAlex Elder return false; 907103a150fSAlex Elder 908103a150fSAlex Elder return true; 9098e94af8eSAlex Elder } 9108e94af8eSAlex Elder 911602adf40SYehuda Sadeh /* 912bb23e37aSAlex Elder * Fill an rbd image header with information from the given format 1 913bb23e37aSAlex Elder * on-disk header. 914602adf40SYehuda Sadeh */ 915662518b1SAlex Elder static int rbd_header_from_disk(struct rbd_device *rbd_dev, 9164156d998SAlex Elder struct rbd_image_header_ondisk *ondisk) 917602adf40SYehuda Sadeh { 918662518b1SAlex Elder struct rbd_image_header *header = &rbd_dev->header; 919bb23e37aSAlex Elder bool first_time = header->object_prefix == NULL; 920bb23e37aSAlex Elder struct ceph_snap_context *snapc; 921bb23e37aSAlex Elder char *object_prefix = NULL; 922bb23e37aSAlex Elder char *snap_names = NULL; 923bb23e37aSAlex Elder u64 *snap_sizes = NULL; 924ccece235SAlex Elder u32 snap_count; 925d2bb24e5SAlex Elder size_t size; 926bb23e37aSAlex Elder int ret = -ENOMEM; 927621901d6SAlex Elder u32 i; 928602adf40SYehuda Sadeh 929bb23e37aSAlex Elder /* Allocate this now to avoid having to handle failure below */ 930103a150fSAlex Elder 931bb23e37aSAlex Elder if (first_time) { 932bb23e37aSAlex Elder size_t len; 933bb23e37aSAlex Elder 934bb23e37aSAlex Elder len = strnlen(ondisk->object_prefix, 935bb23e37aSAlex Elder sizeof (ondisk->object_prefix)); 
936bb23e37aSAlex Elder object_prefix = kmalloc(len + 1, GFP_KERNEL); 937bb23e37aSAlex Elder if (!object_prefix) 938602adf40SYehuda Sadeh return -ENOMEM; 939bb23e37aSAlex Elder memcpy(object_prefix, ondisk->object_prefix, len); 940bb23e37aSAlex Elder object_prefix[len] = '\0'; 941bb23e37aSAlex Elder } 94200f1f36fSAlex Elder 943bb23e37aSAlex Elder /* Allocate the snapshot context and fill it in */ 944d2bb24e5SAlex Elder 945602adf40SYehuda Sadeh snap_count = le32_to_cpu(ondisk->snap_count); 946bb23e37aSAlex Elder snapc = ceph_create_snap_context(snap_count, GFP_KERNEL); 947bb23e37aSAlex Elder if (!snapc) 948bb23e37aSAlex Elder goto out_err; 949bb23e37aSAlex Elder snapc->seq = le64_to_cpu(ondisk->snap_seq); 950602adf40SYehuda Sadeh if (snap_count) { 951bb23e37aSAlex Elder struct rbd_image_snap_ondisk *snaps; 952f785cc1dSAlex Elder u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len); 953f785cc1dSAlex Elder 954bb23e37aSAlex Elder /* We'll keep a copy of the snapshot names... */ 955621901d6SAlex Elder 956f785cc1dSAlex Elder if (snap_names_len > (u64)SIZE_MAX) 957bb23e37aSAlex Elder goto out_2big; 958bb23e37aSAlex Elder snap_names = kmalloc(snap_names_len, GFP_KERNEL); 959bb23e37aSAlex Elder if (!snap_names) 960602adf40SYehuda Sadeh goto out_err; 961bb23e37aSAlex Elder 962bb23e37aSAlex Elder /* ...as well as the array of their sizes. */ 963bb23e37aSAlex Elder 964bb23e37aSAlex Elder size = snap_count * sizeof (*header->snap_sizes); 965bb23e37aSAlex Elder snap_sizes = kmalloc(size, GFP_KERNEL); 966bb23e37aSAlex Elder if (!snap_sizes) 967bb23e37aSAlex Elder goto out_err; 968bb23e37aSAlex Elder 969f785cc1dSAlex Elder /* 970bb23e37aSAlex Elder * Copy the names, and fill in each snapshot's id 971bb23e37aSAlex Elder * and size. 
972bb23e37aSAlex Elder * 97399a41ebcSAlex Elder * Note that rbd_dev_v1_header_info() guarantees the 974bb23e37aSAlex Elder * ondisk buffer we're working with has 975f785cc1dSAlex Elder * snap_names_len bytes beyond the end of the 976f785cc1dSAlex Elder * snapshot id array, this memcpy() is safe. 977f785cc1dSAlex Elder */ 978bb23e37aSAlex Elder memcpy(snap_names, &ondisk->snaps[snap_count], snap_names_len); 979bb23e37aSAlex Elder snaps = ondisk->snaps; 980bb23e37aSAlex Elder for (i = 0; i < snap_count; i++) { 981bb23e37aSAlex Elder snapc->snaps[i] = le64_to_cpu(snaps[i].id); 982bb23e37aSAlex Elder snap_sizes[i] = le64_to_cpu(snaps[i].image_size); 983bb23e37aSAlex Elder } 984602adf40SYehuda Sadeh } 985849b4260SAlex Elder 986bb23e37aSAlex Elder /* We won't fail any more, fill in the header */ 987bb23e37aSAlex Elder 988bb23e37aSAlex Elder if (first_time) { 989bb23e37aSAlex Elder header->object_prefix = object_prefix; 990602adf40SYehuda Sadeh header->obj_order = ondisk->options.order; 991602adf40SYehuda Sadeh header->crypt_type = ondisk->options.crypt_type; 992602adf40SYehuda Sadeh header->comp_type = ondisk->options.comp_type; 993bb23e37aSAlex Elder /* The rest aren't used for format 1 images */ 994bb23e37aSAlex Elder header->stripe_unit = 0; 995bb23e37aSAlex Elder header->stripe_count = 0; 996bb23e37aSAlex Elder header->features = 0; 997662518b1SAlex Elder } else { 998662518b1SAlex Elder ceph_put_snap_context(header->snapc); 999662518b1SAlex Elder kfree(header->snap_names); 1000662518b1SAlex Elder kfree(header->snap_sizes); 1001bb23e37aSAlex Elder } 10026a52325fSAlex Elder 1003bb23e37aSAlex Elder /* The remaining fields always get updated (when we refresh) */ 1004621901d6SAlex Elder 1005f84344f3SAlex Elder header->image_size = le64_to_cpu(ondisk->image_size); 1006bb23e37aSAlex Elder header->snapc = snapc; 1007bb23e37aSAlex Elder header->snap_names = snap_names; 1008bb23e37aSAlex Elder header->snap_sizes = snap_sizes; 1009468521c1SAlex Elder 1010602adf40SYehuda Sadeh 
return 0; 1011bb23e37aSAlex Elder out_2big: 1012bb23e37aSAlex Elder ret = -EIO; 10136a52325fSAlex Elder out_err: 1014bb23e37aSAlex Elder kfree(snap_sizes); 1015bb23e37aSAlex Elder kfree(snap_names); 1016bb23e37aSAlex Elder ceph_put_snap_context(snapc); 1017bb23e37aSAlex Elder kfree(object_prefix); 1018ccece235SAlex Elder 1019bb23e37aSAlex Elder return ret; 1020602adf40SYehuda Sadeh } 1021602adf40SYehuda Sadeh 10229682fc6dSAlex Elder static const char *_rbd_dev_v1_snap_name(struct rbd_device *rbd_dev, u32 which) 10239682fc6dSAlex Elder { 10249682fc6dSAlex Elder const char *snap_name; 10259682fc6dSAlex Elder 10269682fc6dSAlex Elder rbd_assert(which < rbd_dev->header.snapc->num_snaps); 10279682fc6dSAlex Elder 10289682fc6dSAlex Elder /* Skip over names until we find the one we are looking for */ 10299682fc6dSAlex Elder 10309682fc6dSAlex Elder snap_name = rbd_dev->header.snap_names; 10319682fc6dSAlex Elder while (which--) 10329682fc6dSAlex Elder snap_name += strlen(snap_name) + 1; 10339682fc6dSAlex Elder 10349682fc6dSAlex Elder return kstrdup(snap_name, GFP_KERNEL); 10359682fc6dSAlex Elder } 10369682fc6dSAlex Elder 103730d1cff8SAlex Elder /* 103830d1cff8SAlex Elder * Snapshot id comparison function for use with qsort()/bsearch(). 103930d1cff8SAlex Elder * Note that result is for snapshots in *descending* order. 104030d1cff8SAlex Elder */ 104130d1cff8SAlex Elder static int snapid_compare_reverse(const void *s1, const void *s2) 104230d1cff8SAlex Elder { 104330d1cff8SAlex Elder u64 snap_id1 = *(u64 *)s1; 104430d1cff8SAlex Elder u64 snap_id2 = *(u64 *)s2; 104530d1cff8SAlex Elder 104630d1cff8SAlex Elder if (snap_id1 < snap_id2) 104730d1cff8SAlex Elder return 1; 104830d1cff8SAlex Elder return snap_id1 == snap_id2 ? 0 : -1; 104930d1cff8SAlex Elder } 105030d1cff8SAlex Elder 105130d1cff8SAlex Elder /* 105230d1cff8SAlex Elder * Search a snapshot context to see if the given snapshot id is 105330d1cff8SAlex Elder * present. 
105430d1cff8SAlex Elder * 105530d1cff8SAlex Elder * Returns the position of the snapshot id in the array if it's found, 105630d1cff8SAlex Elder * or BAD_SNAP_INDEX otherwise. 105730d1cff8SAlex Elder * 105830d1cff8SAlex Elder * Note: The snapshot array is in kept sorted (by the osd) in 105930d1cff8SAlex Elder * reverse order, highest snapshot id first. 106030d1cff8SAlex Elder */ 10619682fc6dSAlex Elder static u32 rbd_dev_snap_index(struct rbd_device *rbd_dev, u64 snap_id) 10629682fc6dSAlex Elder { 10639682fc6dSAlex Elder struct ceph_snap_context *snapc = rbd_dev->header.snapc; 106430d1cff8SAlex Elder u64 *found; 10659682fc6dSAlex Elder 106630d1cff8SAlex Elder found = bsearch(&snap_id, &snapc->snaps, snapc->num_snaps, 106730d1cff8SAlex Elder sizeof (snap_id), snapid_compare_reverse); 10689682fc6dSAlex Elder 106930d1cff8SAlex Elder return found ? (u32)(found - &snapc->snaps[0]) : BAD_SNAP_INDEX; 10709682fc6dSAlex Elder } 10719682fc6dSAlex Elder 10722ad3d716SAlex Elder static const char *rbd_dev_v1_snap_name(struct rbd_device *rbd_dev, 10732ad3d716SAlex Elder u64 snap_id) 107454cac61fSAlex Elder { 107554cac61fSAlex Elder u32 which; 1076da6a6b63SJosh Durgin const char *snap_name; 107754cac61fSAlex Elder 107854cac61fSAlex Elder which = rbd_dev_snap_index(rbd_dev, snap_id); 107954cac61fSAlex Elder if (which == BAD_SNAP_INDEX) 1080da6a6b63SJosh Durgin return ERR_PTR(-ENOENT); 108154cac61fSAlex Elder 1082da6a6b63SJosh Durgin snap_name = _rbd_dev_v1_snap_name(rbd_dev, which); 1083da6a6b63SJosh Durgin return snap_name ? 
snap_name : ERR_PTR(-ENOMEM); 108454cac61fSAlex Elder } 108554cac61fSAlex Elder 10869e15b77dSAlex Elder static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id) 10879e15b77dSAlex Elder { 10889e15b77dSAlex Elder if (snap_id == CEPH_NOSNAP) 10899e15b77dSAlex Elder return RBD_SNAP_HEAD_NAME; 10909e15b77dSAlex Elder 109154cac61fSAlex Elder rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); 109254cac61fSAlex Elder if (rbd_dev->image_format == 1) 109354cac61fSAlex Elder return rbd_dev_v1_snap_name(rbd_dev, snap_id); 10949e15b77dSAlex Elder 109554cac61fSAlex Elder return rbd_dev_v2_snap_name(rbd_dev, snap_id); 10969e15b77dSAlex Elder } 10979e15b77dSAlex Elder 10982ad3d716SAlex Elder static int rbd_snap_size(struct rbd_device *rbd_dev, u64 snap_id, 10992ad3d716SAlex Elder u64 *snap_size) 1100602adf40SYehuda Sadeh { 11012ad3d716SAlex Elder rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); 11022ad3d716SAlex Elder if (snap_id == CEPH_NOSNAP) { 11032ad3d716SAlex Elder *snap_size = rbd_dev->header.image_size; 11042ad3d716SAlex Elder } else if (rbd_dev->image_format == 1) { 11052ad3d716SAlex Elder u32 which; 110600f1f36fSAlex Elder 11072ad3d716SAlex Elder which = rbd_dev_snap_index(rbd_dev, snap_id); 11082ad3d716SAlex Elder if (which == BAD_SNAP_INDEX) 11092ad3d716SAlex Elder return -ENOENT; 111000f1f36fSAlex Elder 11112ad3d716SAlex Elder *snap_size = rbd_dev->header.snap_sizes[which]; 11122ad3d716SAlex Elder } else { 11132ad3d716SAlex Elder u64 size = 0; 11142ad3d716SAlex Elder int ret; 11152ad3d716SAlex Elder 11162ad3d716SAlex Elder ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, NULL, &size); 11172ad3d716SAlex Elder if (ret) 11182ad3d716SAlex Elder return ret; 11192ad3d716SAlex Elder 11202ad3d716SAlex Elder *snap_size = size; 11212ad3d716SAlex Elder } 11222ad3d716SAlex Elder return 0; 11232ad3d716SAlex Elder } 11242ad3d716SAlex Elder 11252ad3d716SAlex Elder static int rbd_snap_features(struct rbd_device *rbd_dev, u64 snap_id, 11262ad3d716SAlex 
Elder u64 *snap_features) 11272ad3d716SAlex Elder { 11282ad3d716SAlex Elder rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); 11292ad3d716SAlex Elder if (snap_id == CEPH_NOSNAP) { 11302ad3d716SAlex Elder *snap_features = rbd_dev->header.features; 11312ad3d716SAlex Elder } else if (rbd_dev->image_format == 1) { 11322ad3d716SAlex Elder *snap_features = 0; /* No features for format 1 */ 11332ad3d716SAlex Elder } else { 11342ad3d716SAlex Elder u64 features = 0; 11352ad3d716SAlex Elder int ret; 11362ad3d716SAlex Elder 11372ad3d716SAlex Elder ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, &features); 11382ad3d716SAlex Elder if (ret) 11392ad3d716SAlex Elder return ret; 11402ad3d716SAlex Elder 11412ad3d716SAlex Elder *snap_features = features; 11422ad3d716SAlex Elder } 11432ad3d716SAlex Elder return 0; 114400f1f36fSAlex Elder } 1145602adf40SYehuda Sadeh 1146d1cf5788SAlex Elder static int rbd_dev_mapping_set(struct rbd_device *rbd_dev) 1147602adf40SYehuda Sadeh { 11488f4b7d98SAlex Elder u64 snap_id = rbd_dev->spec->snap_id; 11492ad3d716SAlex Elder u64 size = 0; 11502ad3d716SAlex Elder u64 features = 0; 11512ad3d716SAlex Elder int ret; 11528b0241f8SAlex Elder 11532ad3d716SAlex Elder ret = rbd_snap_size(rbd_dev, snap_id, &size); 11542ad3d716SAlex Elder if (ret) 11552ad3d716SAlex Elder return ret; 11562ad3d716SAlex Elder ret = rbd_snap_features(rbd_dev, snap_id, &features); 11572ad3d716SAlex Elder if (ret) 11582ad3d716SAlex Elder return ret; 11592ad3d716SAlex Elder 11602ad3d716SAlex Elder rbd_dev->mapping.size = size; 11612ad3d716SAlex Elder rbd_dev->mapping.features = features; 11622ad3d716SAlex Elder 11638b0241f8SAlex Elder return 0; 1164602adf40SYehuda Sadeh } 1165602adf40SYehuda Sadeh 1166d1cf5788SAlex Elder static void rbd_dev_mapping_clear(struct rbd_device *rbd_dev) 1167d1cf5788SAlex Elder { 1168d1cf5788SAlex Elder rbd_dev->mapping.size = 0; 1169d1cf5788SAlex Elder rbd_dev->mapping.features = 0; 1170200a6a8bSAlex Elder } 1171200a6a8bSAlex Elder 
11727d5079aaSHimangi Saraogi static void rbd_segment_name_free(const char *name) 11737d5079aaSHimangi Saraogi { 11747d5079aaSHimangi Saraogi /* The explicit cast here is needed to drop the const qualifier */ 11757d5079aaSHimangi Saraogi 11767d5079aaSHimangi Saraogi kmem_cache_free(rbd_segment_name_cache, (void *)name); 11777d5079aaSHimangi Saraogi } 11787d5079aaSHimangi Saraogi 117998571b5aSAlex Elder static const char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset) 1180602adf40SYehuda Sadeh { 118165ccfe21SAlex Elder char *name; 118265ccfe21SAlex Elder u64 segment; 118365ccfe21SAlex Elder int ret; 11843a96d5cdSJosh Durgin char *name_format; 1185602adf40SYehuda Sadeh 118678c2a44aSAlex Elder name = kmem_cache_alloc(rbd_segment_name_cache, GFP_NOIO); 118765ccfe21SAlex Elder if (!name) 118865ccfe21SAlex Elder return NULL; 118965ccfe21SAlex Elder segment = offset >> rbd_dev->header.obj_order; 11903a96d5cdSJosh Durgin name_format = "%s.%012llx"; 11913a96d5cdSJosh Durgin if (rbd_dev->image_format == 2) 11923a96d5cdSJosh Durgin name_format = "%s.%016llx"; 11932d0ebc5dSIlya Dryomov ret = snprintf(name, CEPH_MAX_OID_NAME_LEN + 1, name_format, 119465ccfe21SAlex Elder rbd_dev->header.object_prefix, segment); 11952d0ebc5dSIlya Dryomov if (ret < 0 || ret > CEPH_MAX_OID_NAME_LEN) { 119665ccfe21SAlex Elder pr_err("error formatting segment name for #%llu (%d)\n", 119765ccfe21SAlex Elder segment, ret); 11987d5079aaSHimangi Saraogi rbd_segment_name_free(name); 119965ccfe21SAlex Elder name = NULL; 120065ccfe21SAlex Elder } 1201602adf40SYehuda Sadeh 120265ccfe21SAlex Elder return name; 120365ccfe21SAlex Elder } 1204602adf40SYehuda Sadeh 120565ccfe21SAlex Elder static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset) 120665ccfe21SAlex Elder { 120765ccfe21SAlex Elder u64 segment_size = (u64) 1 << rbd_dev->header.obj_order; 1208602adf40SYehuda Sadeh 120965ccfe21SAlex Elder return offset & (segment_size - 1); 121065ccfe21SAlex Elder } 121165ccfe21SAlex Elder 
121265ccfe21SAlex Elder static u64 rbd_segment_length(struct rbd_device *rbd_dev, 121365ccfe21SAlex Elder u64 offset, u64 length) 121465ccfe21SAlex Elder { 121565ccfe21SAlex Elder u64 segment_size = (u64) 1 << rbd_dev->header.obj_order; 121665ccfe21SAlex Elder 121765ccfe21SAlex Elder offset &= segment_size - 1; 121865ccfe21SAlex Elder 1219aafb230eSAlex Elder rbd_assert(length <= U64_MAX - offset); 122065ccfe21SAlex Elder if (offset + length > segment_size) 122165ccfe21SAlex Elder length = segment_size - offset; 122265ccfe21SAlex Elder 122365ccfe21SAlex Elder return length; 1224602adf40SYehuda Sadeh } 1225602adf40SYehuda Sadeh 1226602adf40SYehuda Sadeh /* 1227029bcbd8SJosh Durgin * returns the size of an object in the image 1228029bcbd8SJosh Durgin */ 1229029bcbd8SJosh Durgin static u64 rbd_obj_bytes(struct rbd_image_header *header) 1230029bcbd8SJosh Durgin { 1231029bcbd8SJosh Durgin return 1 << header->obj_order; 1232029bcbd8SJosh Durgin } 1233029bcbd8SJosh Durgin 1234029bcbd8SJosh Durgin /* 1235602adf40SYehuda Sadeh * bio helpers 1236602adf40SYehuda Sadeh */ 1237602adf40SYehuda Sadeh 1238602adf40SYehuda Sadeh static void bio_chain_put(struct bio *chain) 1239602adf40SYehuda Sadeh { 1240602adf40SYehuda Sadeh struct bio *tmp; 1241602adf40SYehuda Sadeh 1242602adf40SYehuda Sadeh while (chain) { 1243602adf40SYehuda Sadeh tmp = chain; 1244602adf40SYehuda Sadeh chain = chain->bi_next; 1245602adf40SYehuda Sadeh bio_put(tmp); 1246602adf40SYehuda Sadeh } 1247602adf40SYehuda Sadeh } 1248602adf40SYehuda Sadeh 1249602adf40SYehuda Sadeh /* 1250602adf40SYehuda Sadeh * zeros a bio chain, starting at specific offset 1251602adf40SYehuda Sadeh */ 1252602adf40SYehuda Sadeh static void zero_bio_chain(struct bio *chain, int start_ofs) 1253602adf40SYehuda Sadeh { 12547988613bSKent Overstreet struct bio_vec bv; 12557988613bSKent Overstreet struct bvec_iter iter; 1256602adf40SYehuda Sadeh unsigned long flags; 1257602adf40SYehuda Sadeh void *buf; 1258602adf40SYehuda Sadeh int pos = 0; 
1259602adf40SYehuda Sadeh 1260602adf40SYehuda Sadeh while (chain) { 12617988613bSKent Overstreet bio_for_each_segment(bv, chain, iter) { 12627988613bSKent Overstreet if (pos + bv.bv_len > start_ofs) { 1263602adf40SYehuda Sadeh int remainder = max(start_ofs - pos, 0); 12647988613bSKent Overstreet buf = bvec_kmap_irq(&bv, &flags); 1265602adf40SYehuda Sadeh memset(buf + remainder, 0, 12667988613bSKent Overstreet bv.bv_len - remainder); 12677988613bSKent Overstreet flush_dcache_page(bv.bv_page); 126885b5aaa6SDan Carpenter bvec_kunmap_irq(buf, &flags); 1269602adf40SYehuda Sadeh } 12707988613bSKent Overstreet pos += bv.bv_len; 1271602adf40SYehuda Sadeh } 1272602adf40SYehuda Sadeh 1273602adf40SYehuda Sadeh chain = chain->bi_next; 1274602adf40SYehuda Sadeh } 1275602adf40SYehuda Sadeh } 1276602adf40SYehuda Sadeh 1277602adf40SYehuda Sadeh /* 1278b9434c5bSAlex Elder * similar to zero_bio_chain(), zeros data defined by a page array, 1279b9434c5bSAlex Elder * starting at the given byte offset from the start of the array and 1280b9434c5bSAlex Elder * continuing up to the given end offset. The pages array is 1281b9434c5bSAlex Elder * assumed to be big enough to hold all bytes up to the end. 
1282b9434c5bSAlex Elder */ 1283b9434c5bSAlex Elder static void zero_pages(struct page **pages, u64 offset, u64 end) 1284b9434c5bSAlex Elder { 1285b9434c5bSAlex Elder struct page **page = &pages[offset >> PAGE_SHIFT]; 1286b9434c5bSAlex Elder 1287b9434c5bSAlex Elder rbd_assert(end > offset); 1288b9434c5bSAlex Elder rbd_assert(end - offset <= (u64)SIZE_MAX); 1289b9434c5bSAlex Elder while (offset < end) { 1290b9434c5bSAlex Elder size_t page_offset; 1291b9434c5bSAlex Elder size_t length; 1292b9434c5bSAlex Elder unsigned long flags; 1293b9434c5bSAlex Elder void *kaddr; 1294b9434c5bSAlex Elder 1295491205a8SGeert Uytterhoeven page_offset = offset & ~PAGE_MASK; 1296491205a8SGeert Uytterhoeven length = min_t(size_t, PAGE_SIZE - page_offset, end - offset); 1297b9434c5bSAlex Elder local_irq_save(flags); 1298b9434c5bSAlex Elder kaddr = kmap_atomic(*page); 1299b9434c5bSAlex Elder memset(kaddr + page_offset, 0, length); 1300e2156054SAlex Elder flush_dcache_page(*page); 1301b9434c5bSAlex Elder kunmap_atomic(kaddr); 1302b9434c5bSAlex Elder local_irq_restore(flags); 1303b9434c5bSAlex Elder 1304b9434c5bSAlex Elder offset += length; 1305b9434c5bSAlex Elder page++; 1306b9434c5bSAlex Elder } 1307b9434c5bSAlex Elder } 1308b9434c5bSAlex Elder 1309b9434c5bSAlex Elder /* 1310f7760dadSAlex Elder * Clone a portion of a bio, starting at the given byte offset 1311f7760dadSAlex Elder * and continuing for the number of bytes indicated. 
1312602adf40SYehuda Sadeh */ 1313f7760dadSAlex Elder static struct bio *bio_clone_range(struct bio *bio_src, 1314f7760dadSAlex Elder unsigned int offset, 1315f7760dadSAlex Elder unsigned int len, 1316f7760dadSAlex Elder gfp_t gfpmask) 1317602adf40SYehuda Sadeh { 1318f7760dadSAlex Elder struct bio *bio; 1319602adf40SYehuda Sadeh 13205341a627SKent Overstreet bio = bio_clone(bio_src, gfpmask); 1321f7760dadSAlex Elder if (!bio) 1322f7760dadSAlex Elder return NULL; /* ENOMEM */ 1323f7760dadSAlex Elder 13245341a627SKent Overstreet bio_advance(bio, offset); 13254f024f37SKent Overstreet bio->bi_iter.bi_size = len; 1326602adf40SYehuda Sadeh 1327f7760dadSAlex Elder return bio; 1328602adf40SYehuda Sadeh } 1329602adf40SYehuda Sadeh 1330f7760dadSAlex Elder /* 1331f7760dadSAlex Elder * Clone a portion of a bio chain, starting at the given byte offset 1332f7760dadSAlex Elder * into the first bio in the source chain and continuing for the 1333f7760dadSAlex Elder * number of bytes indicated. The result is another bio chain of 1334f7760dadSAlex Elder * exactly the given length, or a null pointer on error. 1335f7760dadSAlex Elder * 1336f7760dadSAlex Elder * The bio_src and offset parameters are both in-out. On entry they 1337f7760dadSAlex Elder * refer to the first source bio and the offset into that bio where 1338f7760dadSAlex Elder * the start of data to be cloned is located. 1339f7760dadSAlex Elder * 1340f7760dadSAlex Elder * On return, bio_src is updated to refer to the bio in the source 1341f7760dadSAlex Elder * chain that contains first un-cloned byte, and *offset will 1342f7760dadSAlex Elder * contain the offset of that byte within that bio. 
1343f7760dadSAlex Elder */ 1344f7760dadSAlex Elder static struct bio *bio_chain_clone_range(struct bio **bio_src, 1345f7760dadSAlex Elder unsigned int *offset, 1346f7760dadSAlex Elder unsigned int len, 1347f7760dadSAlex Elder gfp_t gfpmask) 1348f7760dadSAlex Elder { 1349f7760dadSAlex Elder struct bio *bi = *bio_src; 1350f7760dadSAlex Elder unsigned int off = *offset; 1351f7760dadSAlex Elder struct bio *chain = NULL; 1352f7760dadSAlex Elder struct bio **end; 1353602adf40SYehuda Sadeh 1354f7760dadSAlex Elder /* Build up a chain of clone bios up to the limit */ 1355602adf40SYehuda Sadeh 13564f024f37SKent Overstreet if (!bi || off >= bi->bi_iter.bi_size || !len) 1357f7760dadSAlex Elder return NULL; /* Nothing to clone */ 1358602adf40SYehuda Sadeh 1359f7760dadSAlex Elder end = &chain; 1360f7760dadSAlex Elder while (len) { 1361f7760dadSAlex Elder unsigned int bi_size; 1362f7760dadSAlex Elder struct bio *bio; 1363f7760dadSAlex Elder 1364f5400b7aSAlex Elder if (!bi) { 1365f5400b7aSAlex Elder rbd_warn(NULL, "bio_chain exhausted with %u left", len); 1366f7760dadSAlex Elder goto out_err; /* EINVAL; ran out of bio's */ 1367f5400b7aSAlex Elder } 13684f024f37SKent Overstreet bi_size = min_t(unsigned int, bi->bi_iter.bi_size - off, len); 1369f7760dadSAlex Elder bio = bio_clone_range(bi, off, bi_size, gfpmask); 1370f7760dadSAlex Elder if (!bio) 1371f7760dadSAlex Elder goto out_err; /* ENOMEM */ 1372f7760dadSAlex Elder 1373f7760dadSAlex Elder *end = bio; 1374f7760dadSAlex Elder end = &bio->bi_next; 1375f7760dadSAlex Elder 1376f7760dadSAlex Elder off += bi_size; 13774f024f37SKent Overstreet if (off == bi->bi_iter.bi_size) { 1378f7760dadSAlex Elder bi = bi->bi_next; 1379f7760dadSAlex Elder off = 0; 1380f7760dadSAlex Elder } 1381f7760dadSAlex Elder len -= bi_size; 1382f7760dadSAlex Elder } 1383f7760dadSAlex Elder *bio_src = bi; 1384f7760dadSAlex Elder *offset = off; 1385f7760dadSAlex Elder 1386f7760dadSAlex Elder return chain; 1387f7760dadSAlex Elder out_err: 1388f7760dadSAlex Elder 
bio_chain_put(chain); 1389f7760dadSAlex Elder 1390602adf40SYehuda Sadeh return NULL; 1391602adf40SYehuda Sadeh } 1392602adf40SYehuda Sadeh 1393926f9b3fSAlex Elder /* 1394926f9b3fSAlex Elder * The default/initial value for all object request flags is 0. For 1395926f9b3fSAlex Elder * each flag, once its value is set to 1 it is never reset to 0 1396926f9b3fSAlex Elder * again. 1397926f9b3fSAlex Elder */ 13986365d33aSAlex Elder static void obj_request_img_data_set(struct rbd_obj_request *obj_request) 13996365d33aSAlex Elder { 14006365d33aSAlex Elder if (test_and_set_bit(OBJ_REQ_IMG_DATA, &obj_request->flags)) { 14016365d33aSAlex Elder struct rbd_device *rbd_dev; 14026365d33aSAlex Elder 140357acbaa7SAlex Elder rbd_dev = obj_request->img_request->rbd_dev; 14049584d508SIlya Dryomov rbd_warn(rbd_dev, "obj_request %p already marked img_data", 14056365d33aSAlex Elder obj_request); 14066365d33aSAlex Elder } 14076365d33aSAlex Elder } 14086365d33aSAlex Elder 14096365d33aSAlex Elder static bool obj_request_img_data_test(struct rbd_obj_request *obj_request) 14106365d33aSAlex Elder { 14116365d33aSAlex Elder smp_mb(); 14126365d33aSAlex Elder return test_bit(OBJ_REQ_IMG_DATA, &obj_request->flags) != 0; 14136365d33aSAlex Elder } 14146365d33aSAlex Elder 141557acbaa7SAlex Elder static void obj_request_done_set(struct rbd_obj_request *obj_request) 141657acbaa7SAlex Elder { 141757acbaa7SAlex Elder if (test_and_set_bit(OBJ_REQ_DONE, &obj_request->flags)) { 141857acbaa7SAlex Elder struct rbd_device *rbd_dev = NULL; 141957acbaa7SAlex Elder 142057acbaa7SAlex Elder if (obj_request_img_data_test(obj_request)) 142157acbaa7SAlex Elder rbd_dev = obj_request->img_request->rbd_dev; 14229584d508SIlya Dryomov rbd_warn(rbd_dev, "obj_request %p already marked done", 142357acbaa7SAlex Elder obj_request); 142457acbaa7SAlex Elder } 142557acbaa7SAlex Elder } 142657acbaa7SAlex Elder 142757acbaa7SAlex Elder static bool obj_request_done_test(struct rbd_obj_request *obj_request) 142857acbaa7SAlex Elder { 
142957acbaa7SAlex Elder smp_mb(); 143057acbaa7SAlex Elder return test_bit(OBJ_REQ_DONE, &obj_request->flags) != 0; 143157acbaa7SAlex Elder } 143257acbaa7SAlex Elder 14335679c59fSAlex Elder /* 14345679c59fSAlex Elder * This sets the KNOWN flag after (possibly) setting the EXISTS 14355679c59fSAlex Elder * flag. The latter is set based on the "exists" value provided. 14365679c59fSAlex Elder * 14375679c59fSAlex Elder * Note that for our purposes once an object exists it never goes 14385679c59fSAlex Elder * away again. It's possible that the response from two existence 14395679c59fSAlex Elder * checks are separated by the creation of the target object, and 14405679c59fSAlex Elder * the first ("doesn't exist") response arrives *after* the second 14415679c59fSAlex Elder * ("does exist"). In that case we ignore the second one. 14425679c59fSAlex Elder */ 14435679c59fSAlex Elder static void obj_request_existence_set(struct rbd_obj_request *obj_request, 14445679c59fSAlex Elder bool exists) 14455679c59fSAlex Elder { 14465679c59fSAlex Elder if (exists) 14475679c59fSAlex Elder set_bit(OBJ_REQ_EXISTS, &obj_request->flags); 14485679c59fSAlex Elder set_bit(OBJ_REQ_KNOWN, &obj_request->flags); 14495679c59fSAlex Elder smp_mb(); 14505679c59fSAlex Elder } 14515679c59fSAlex Elder 14525679c59fSAlex Elder static bool obj_request_known_test(struct rbd_obj_request *obj_request) 14535679c59fSAlex Elder { 14545679c59fSAlex Elder smp_mb(); 14555679c59fSAlex Elder return test_bit(OBJ_REQ_KNOWN, &obj_request->flags) != 0; 14565679c59fSAlex Elder } 14575679c59fSAlex Elder 14585679c59fSAlex Elder static bool obj_request_exists_test(struct rbd_obj_request *obj_request) 14595679c59fSAlex Elder { 14605679c59fSAlex Elder smp_mb(); 14615679c59fSAlex Elder return test_bit(OBJ_REQ_EXISTS, &obj_request->flags) != 0; 14625679c59fSAlex Elder } 14635679c59fSAlex Elder 14649638556aSIlya Dryomov static bool obj_request_overlaps_parent(struct rbd_obj_request *obj_request) 14659638556aSIlya Dryomov { 
14669638556aSIlya Dryomov struct rbd_device *rbd_dev = obj_request->img_request->rbd_dev; 14679638556aSIlya Dryomov 14689638556aSIlya Dryomov return obj_request->img_offset < 14699638556aSIlya Dryomov round_up(rbd_dev->parent_overlap, rbd_obj_bytes(&rbd_dev->header)); 14709638556aSIlya Dryomov } 14719638556aSIlya Dryomov 1472bf0d5f50SAlex Elder static void rbd_obj_request_get(struct rbd_obj_request *obj_request) 1473bf0d5f50SAlex Elder { 147437206ee5SAlex Elder dout("%s: obj %p (was %d)\n", __func__, obj_request, 147537206ee5SAlex Elder atomic_read(&obj_request->kref.refcount)); 1476bf0d5f50SAlex Elder kref_get(&obj_request->kref); 1477bf0d5f50SAlex Elder } 1478bf0d5f50SAlex Elder 1479bf0d5f50SAlex Elder static void rbd_obj_request_destroy(struct kref *kref); 1480bf0d5f50SAlex Elder static void rbd_obj_request_put(struct rbd_obj_request *obj_request) 1481bf0d5f50SAlex Elder { 1482bf0d5f50SAlex Elder rbd_assert(obj_request != NULL); 148337206ee5SAlex Elder dout("%s: obj %p (was %d)\n", __func__, obj_request, 148437206ee5SAlex Elder atomic_read(&obj_request->kref.refcount)); 1485bf0d5f50SAlex Elder kref_put(&obj_request->kref, rbd_obj_request_destroy); 1486bf0d5f50SAlex Elder } 1487bf0d5f50SAlex Elder 14880f2d5be7SAlex Elder static void rbd_img_request_get(struct rbd_img_request *img_request) 14890f2d5be7SAlex Elder { 14900f2d5be7SAlex Elder dout("%s: img %p (was %d)\n", __func__, img_request, 14910f2d5be7SAlex Elder atomic_read(&img_request->kref.refcount)); 14920f2d5be7SAlex Elder kref_get(&img_request->kref); 14930f2d5be7SAlex Elder } 14940f2d5be7SAlex Elder 1495e93f3152SAlex Elder static bool img_request_child_test(struct rbd_img_request *img_request); 1496e93f3152SAlex Elder static void rbd_parent_request_destroy(struct kref *kref); 1497bf0d5f50SAlex Elder static void rbd_img_request_destroy(struct kref *kref); 1498bf0d5f50SAlex Elder static void rbd_img_request_put(struct rbd_img_request *img_request) 1499bf0d5f50SAlex Elder { 1500bf0d5f50SAlex Elder 
rbd_assert(img_request != NULL); 150137206ee5SAlex Elder dout("%s: img %p (was %d)\n", __func__, img_request, 150237206ee5SAlex Elder atomic_read(&img_request->kref.refcount)); 1503e93f3152SAlex Elder if (img_request_child_test(img_request)) 1504e93f3152SAlex Elder kref_put(&img_request->kref, rbd_parent_request_destroy); 1505e93f3152SAlex Elder else 1506bf0d5f50SAlex Elder kref_put(&img_request->kref, rbd_img_request_destroy); 1507bf0d5f50SAlex Elder } 1508bf0d5f50SAlex Elder 1509bf0d5f50SAlex Elder static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request, 1510bf0d5f50SAlex Elder struct rbd_obj_request *obj_request) 1511bf0d5f50SAlex Elder { 151225dcf954SAlex Elder rbd_assert(obj_request->img_request == NULL); 151325dcf954SAlex Elder 1514b155e86cSAlex Elder /* Image request now owns object's original reference */ 1515bf0d5f50SAlex Elder obj_request->img_request = img_request; 151625dcf954SAlex Elder obj_request->which = img_request->obj_request_count; 15176365d33aSAlex Elder rbd_assert(!obj_request_img_data_test(obj_request)); 15186365d33aSAlex Elder obj_request_img_data_set(obj_request); 1519bf0d5f50SAlex Elder rbd_assert(obj_request->which != BAD_WHICH); 152025dcf954SAlex Elder img_request->obj_request_count++; 152125dcf954SAlex Elder list_add_tail(&obj_request->links, &img_request->obj_requests); 152237206ee5SAlex Elder dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request, 152337206ee5SAlex Elder obj_request->which); 1524bf0d5f50SAlex Elder } 1525bf0d5f50SAlex Elder 1526bf0d5f50SAlex Elder static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request, 1527bf0d5f50SAlex Elder struct rbd_obj_request *obj_request) 1528bf0d5f50SAlex Elder { 1529bf0d5f50SAlex Elder rbd_assert(obj_request->which != BAD_WHICH); 153025dcf954SAlex Elder 153137206ee5SAlex Elder dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request, 153237206ee5SAlex Elder obj_request->which); 1533bf0d5f50SAlex Elder 
list_del(&obj_request->links); 153425dcf954SAlex Elder rbd_assert(img_request->obj_request_count > 0); 153525dcf954SAlex Elder img_request->obj_request_count--; 153625dcf954SAlex Elder rbd_assert(obj_request->which == img_request->obj_request_count); 153725dcf954SAlex Elder obj_request->which = BAD_WHICH; 15386365d33aSAlex Elder rbd_assert(obj_request_img_data_test(obj_request)); 1539bf0d5f50SAlex Elder rbd_assert(obj_request->img_request == img_request); 1540bf0d5f50SAlex Elder obj_request->img_request = NULL; 154125dcf954SAlex Elder obj_request->callback = NULL; 1542bf0d5f50SAlex Elder rbd_obj_request_put(obj_request); 1543bf0d5f50SAlex Elder } 1544bf0d5f50SAlex Elder 1545bf0d5f50SAlex Elder static bool obj_request_type_valid(enum obj_request_type type) 1546bf0d5f50SAlex Elder { 1547bf0d5f50SAlex Elder switch (type) { 15489969ebc5SAlex Elder case OBJ_REQUEST_NODATA: 1549bf0d5f50SAlex Elder case OBJ_REQUEST_BIO: 1550788e2df3SAlex Elder case OBJ_REQUEST_PAGES: 1551bf0d5f50SAlex Elder return true; 1552bf0d5f50SAlex Elder default: 1553bf0d5f50SAlex Elder return false; 1554bf0d5f50SAlex Elder } 1555bf0d5f50SAlex Elder } 1556bf0d5f50SAlex Elder 1557bf0d5f50SAlex Elder static int rbd_obj_request_submit(struct ceph_osd_client *osdc, 1558bf0d5f50SAlex Elder struct rbd_obj_request *obj_request) 1559bf0d5f50SAlex Elder { 156071c20a06SIlya Dryomov dout("%s %p\n", __func__, obj_request); 1561bf0d5f50SAlex Elder return ceph_osdc_start_request(osdc, obj_request->osd_req, false); 1562bf0d5f50SAlex Elder } 1563bf0d5f50SAlex Elder 156471c20a06SIlya Dryomov static void rbd_obj_request_end(struct rbd_obj_request *obj_request) 156571c20a06SIlya Dryomov { 156671c20a06SIlya Dryomov dout("%s %p\n", __func__, obj_request); 156771c20a06SIlya Dryomov ceph_osdc_cancel_request(obj_request->osd_req); 156871c20a06SIlya Dryomov } 156971c20a06SIlya Dryomov 157071c20a06SIlya Dryomov /* 157171c20a06SIlya Dryomov * Wait for an object request to complete. 
If interrupted, cancel the 157271c20a06SIlya Dryomov * underlying osd request. 15732894e1d7SIlya Dryomov * 15742894e1d7SIlya Dryomov * @timeout: in jiffies, 0 means "wait forever" 157571c20a06SIlya Dryomov */ 15762894e1d7SIlya Dryomov static int __rbd_obj_request_wait(struct rbd_obj_request *obj_request, 15772894e1d7SIlya Dryomov unsigned long timeout) 157871c20a06SIlya Dryomov { 15792894e1d7SIlya Dryomov long ret; 158071c20a06SIlya Dryomov 158171c20a06SIlya Dryomov dout("%s %p\n", __func__, obj_request); 15822894e1d7SIlya Dryomov ret = wait_for_completion_interruptible_timeout( 15832894e1d7SIlya Dryomov &obj_request->completion, 15842894e1d7SIlya Dryomov ceph_timeout_jiffies(timeout)); 15852894e1d7SIlya Dryomov if (ret <= 0) { 15862894e1d7SIlya Dryomov if (ret == 0) 15872894e1d7SIlya Dryomov ret = -ETIMEDOUT; 158871c20a06SIlya Dryomov rbd_obj_request_end(obj_request); 15892894e1d7SIlya Dryomov } else { 15902894e1d7SIlya Dryomov ret = 0; 15912894e1d7SIlya Dryomov } 15922894e1d7SIlya Dryomov 15932894e1d7SIlya Dryomov dout("%s %p ret %d\n", __func__, obj_request, (int)ret); 159471c20a06SIlya Dryomov return ret; 159571c20a06SIlya Dryomov } 159671c20a06SIlya Dryomov 15972894e1d7SIlya Dryomov static int rbd_obj_request_wait(struct rbd_obj_request *obj_request) 15982894e1d7SIlya Dryomov { 15992894e1d7SIlya Dryomov return __rbd_obj_request_wait(obj_request, 0); 16002894e1d7SIlya Dryomov } 16012894e1d7SIlya Dryomov 16022894e1d7SIlya Dryomov static int rbd_obj_request_wait_timeout(struct rbd_obj_request *obj_request, 16032894e1d7SIlya Dryomov unsigned long timeout) 16042894e1d7SIlya Dryomov { 16052894e1d7SIlya Dryomov return __rbd_obj_request_wait(obj_request, timeout); 160671c20a06SIlya Dryomov } 160771c20a06SIlya Dryomov 1608bf0d5f50SAlex Elder static void rbd_img_request_complete(struct rbd_img_request *img_request) 1609bf0d5f50SAlex Elder { 161055f27e09SAlex Elder 161137206ee5SAlex Elder dout("%s: img %p\n", __func__, img_request); 161255f27e09SAlex Elder 
161355f27e09SAlex Elder /* 161455f27e09SAlex Elder * If no error occurred, compute the aggregate transfer 161555f27e09SAlex Elder * count for the image request. We could instead use 161655f27e09SAlex Elder * atomic64_cmpxchg() to update it as each object request 161755f27e09SAlex Elder * completes; not clear which way is better off hand. 161855f27e09SAlex Elder */ 161955f27e09SAlex Elder if (!img_request->result) { 162055f27e09SAlex Elder struct rbd_obj_request *obj_request; 162155f27e09SAlex Elder u64 xferred = 0; 162255f27e09SAlex Elder 162355f27e09SAlex Elder for_each_obj_request(img_request, obj_request) 162455f27e09SAlex Elder xferred += obj_request->xferred; 162555f27e09SAlex Elder img_request->xferred = xferred; 162655f27e09SAlex Elder } 162755f27e09SAlex Elder 1628bf0d5f50SAlex Elder if (img_request->callback) 1629bf0d5f50SAlex Elder img_request->callback(img_request); 1630bf0d5f50SAlex Elder else 1631bf0d5f50SAlex Elder rbd_img_request_put(img_request); 1632bf0d5f50SAlex Elder } 1633bf0d5f50SAlex Elder 16340c425248SAlex Elder /* 16350c425248SAlex Elder * The default/initial value for all image request flags is 0. Each 16360c425248SAlex Elder * is conditionally set to 1 at image request initialization time 16370c425248SAlex Elder * and currently never change thereafter. 
16380c425248SAlex Elder */ 16390c425248SAlex Elder static void img_request_write_set(struct rbd_img_request *img_request) 16400c425248SAlex Elder { 16410c425248SAlex Elder set_bit(IMG_REQ_WRITE, &img_request->flags); 16420c425248SAlex Elder smp_mb(); 16430c425248SAlex Elder } 16440c425248SAlex Elder 16450c425248SAlex Elder static bool img_request_write_test(struct rbd_img_request *img_request) 16460c425248SAlex Elder { 16470c425248SAlex Elder smp_mb(); 16480c425248SAlex Elder return test_bit(IMG_REQ_WRITE, &img_request->flags) != 0; 16490c425248SAlex Elder } 16500c425248SAlex Elder 165190e98c52SGuangliang Zhao /* 165290e98c52SGuangliang Zhao * Set the discard flag when the img_request is an discard request 165390e98c52SGuangliang Zhao */ 165490e98c52SGuangliang Zhao static void img_request_discard_set(struct rbd_img_request *img_request) 165590e98c52SGuangliang Zhao { 165690e98c52SGuangliang Zhao set_bit(IMG_REQ_DISCARD, &img_request->flags); 165790e98c52SGuangliang Zhao smp_mb(); 165890e98c52SGuangliang Zhao } 165990e98c52SGuangliang Zhao 166090e98c52SGuangliang Zhao static bool img_request_discard_test(struct rbd_img_request *img_request) 166190e98c52SGuangliang Zhao { 166290e98c52SGuangliang Zhao smp_mb(); 166390e98c52SGuangliang Zhao return test_bit(IMG_REQ_DISCARD, &img_request->flags) != 0; 166490e98c52SGuangliang Zhao } 166590e98c52SGuangliang Zhao 16669849e986SAlex Elder static void img_request_child_set(struct rbd_img_request *img_request) 16679849e986SAlex Elder { 16689849e986SAlex Elder set_bit(IMG_REQ_CHILD, &img_request->flags); 16699849e986SAlex Elder smp_mb(); 16709849e986SAlex Elder } 16719849e986SAlex Elder 1672e93f3152SAlex Elder static void img_request_child_clear(struct rbd_img_request *img_request) 1673e93f3152SAlex Elder { 1674e93f3152SAlex Elder clear_bit(IMG_REQ_CHILD, &img_request->flags); 1675e93f3152SAlex Elder smp_mb(); 1676e93f3152SAlex Elder } 1677e93f3152SAlex Elder 16789849e986SAlex Elder static bool img_request_child_test(struct 
rbd_img_request *img_request) 16799849e986SAlex Elder { 16809849e986SAlex Elder smp_mb(); 16819849e986SAlex Elder return test_bit(IMG_REQ_CHILD, &img_request->flags) != 0; 16829849e986SAlex Elder } 16839849e986SAlex Elder 1684d0b2e944SAlex Elder static void img_request_layered_set(struct rbd_img_request *img_request) 1685d0b2e944SAlex Elder { 1686d0b2e944SAlex Elder set_bit(IMG_REQ_LAYERED, &img_request->flags); 1687d0b2e944SAlex Elder smp_mb(); 1688d0b2e944SAlex Elder } 1689d0b2e944SAlex Elder 1690a2acd00eSAlex Elder static void img_request_layered_clear(struct rbd_img_request *img_request) 1691a2acd00eSAlex Elder { 1692a2acd00eSAlex Elder clear_bit(IMG_REQ_LAYERED, &img_request->flags); 1693a2acd00eSAlex Elder smp_mb(); 1694a2acd00eSAlex Elder } 1695a2acd00eSAlex Elder 1696d0b2e944SAlex Elder static bool img_request_layered_test(struct rbd_img_request *img_request) 1697d0b2e944SAlex Elder { 1698d0b2e944SAlex Elder smp_mb(); 1699d0b2e944SAlex Elder return test_bit(IMG_REQ_LAYERED, &img_request->flags) != 0; 1700d0b2e944SAlex Elder } 1701d0b2e944SAlex Elder 17023b434a2aSJosh Durgin static enum obj_operation_type 17033b434a2aSJosh Durgin rbd_img_request_op_type(struct rbd_img_request *img_request) 17043b434a2aSJosh Durgin { 17053b434a2aSJosh Durgin if (img_request_write_test(img_request)) 17063b434a2aSJosh Durgin return OBJ_OP_WRITE; 17073b434a2aSJosh Durgin else if (img_request_discard_test(img_request)) 17083b434a2aSJosh Durgin return OBJ_OP_DISCARD; 17093b434a2aSJosh Durgin else 17103b434a2aSJosh Durgin return OBJ_OP_READ; 17113b434a2aSJosh Durgin } 17123b434a2aSJosh Durgin 17136e2a4505SAlex Elder static void 17146e2a4505SAlex Elder rbd_img_obj_request_read_callback(struct rbd_obj_request *obj_request) 17156e2a4505SAlex Elder { 1716b9434c5bSAlex Elder u64 xferred = obj_request->xferred; 1717b9434c5bSAlex Elder u64 length = obj_request->length; 1718b9434c5bSAlex Elder 17196e2a4505SAlex Elder dout("%s: obj %p img %p result %d %llu/%llu\n", __func__, 
17206e2a4505SAlex Elder obj_request, obj_request->img_request, obj_request->result, 1721b9434c5bSAlex Elder xferred, length); 17226e2a4505SAlex Elder /* 172317c1cc1dSJosh Durgin * ENOENT means a hole in the image. We zero-fill the entire 172417c1cc1dSJosh Durgin * length of the request. A short read also implies zero-fill 172517c1cc1dSJosh Durgin * to the end of the request. An error requires the whole 172617c1cc1dSJosh Durgin * length of the request to be reported finished with an error 172717c1cc1dSJosh Durgin * to the block layer. In each case we update the xferred 172817c1cc1dSJosh Durgin * count to indicate the whole request was satisfied. 17296e2a4505SAlex Elder */ 1730b9434c5bSAlex Elder rbd_assert(obj_request->type != OBJ_REQUEST_NODATA); 17316e2a4505SAlex Elder if (obj_request->result == -ENOENT) { 1732b9434c5bSAlex Elder if (obj_request->type == OBJ_REQUEST_BIO) 17336e2a4505SAlex Elder zero_bio_chain(obj_request->bio_list, 0); 1734b9434c5bSAlex Elder else 1735b9434c5bSAlex Elder zero_pages(obj_request->pages, 0, length); 17366e2a4505SAlex Elder obj_request->result = 0; 1737b9434c5bSAlex Elder } else if (xferred < length && !obj_request->result) { 1738b9434c5bSAlex Elder if (obj_request->type == OBJ_REQUEST_BIO) 1739b9434c5bSAlex Elder zero_bio_chain(obj_request->bio_list, xferred); 1740b9434c5bSAlex Elder else 1741b9434c5bSAlex Elder zero_pages(obj_request->pages, xferred, length); 17426e2a4505SAlex Elder } 174317c1cc1dSJosh Durgin obj_request->xferred = length; 17446e2a4505SAlex Elder obj_request_done_set(obj_request); 17456e2a4505SAlex Elder } 17466e2a4505SAlex Elder 1747bf0d5f50SAlex Elder static void rbd_obj_request_complete(struct rbd_obj_request *obj_request) 1748bf0d5f50SAlex Elder { 174937206ee5SAlex Elder dout("%s: obj %p cb %p\n", __func__, obj_request, 175037206ee5SAlex Elder obj_request->callback); 1751bf0d5f50SAlex Elder if (obj_request->callback) 1752bf0d5f50SAlex Elder obj_request->callback(obj_request); 1753788e2df3SAlex Elder else 
1754788e2df3SAlex Elder complete_all(&obj_request->completion); 1755bf0d5f50SAlex Elder } 1756bf0d5f50SAlex Elder 1757c47f9371SAlex Elder static void rbd_osd_trivial_callback(struct rbd_obj_request *obj_request) 175839bf2c5dSAlex Elder { 175939bf2c5dSAlex Elder dout("%s: obj %p\n", __func__, obj_request); 176039bf2c5dSAlex Elder obj_request_done_set(obj_request); 176139bf2c5dSAlex Elder } 176239bf2c5dSAlex Elder 1763c47f9371SAlex Elder static void rbd_osd_read_callback(struct rbd_obj_request *obj_request) 1764bf0d5f50SAlex Elder { 176557acbaa7SAlex Elder struct rbd_img_request *img_request = NULL; 1766a9e8ba2cSAlex Elder struct rbd_device *rbd_dev = NULL; 176757acbaa7SAlex Elder bool layered = false; 176857acbaa7SAlex Elder 176957acbaa7SAlex Elder if (obj_request_img_data_test(obj_request)) { 177057acbaa7SAlex Elder img_request = obj_request->img_request; 177157acbaa7SAlex Elder layered = img_request && img_request_layered_test(img_request); 1772a9e8ba2cSAlex Elder rbd_dev = img_request->rbd_dev; 177357acbaa7SAlex Elder } 17748b3e1a56SAlex Elder 17758b3e1a56SAlex Elder dout("%s: obj %p img %p result %d %llu/%llu\n", __func__, 17768b3e1a56SAlex Elder obj_request, img_request, obj_request->result, 17778b3e1a56SAlex Elder obj_request->xferred, obj_request->length); 1778a9e8ba2cSAlex Elder if (layered && obj_request->result == -ENOENT && 1779a9e8ba2cSAlex Elder obj_request->img_offset < rbd_dev->parent_overlap) 17808b3e1a56SAlex Elder rbd_img_parent_read(obj_request); 17818b3e1a56SAlex Elder else if (img_request) 17826e2a4505SAlex Elder rbd_img_obj_request_read_callback(obj_request); 17836e2a4505SAlex Elder else 178407741308SAlex Elder obj_request_done_set(obj_request); 1785bf0d5f50SAlex Elder } 1786bf0d5f50SAlex Elder 1787c47f9371SAlex Elder static void rbd_osd_write_callback(struct rbd_obj_request *obj_request) 1788bf0d5f50SAlex Elder { 17891b83bef2SSage Weil dout("%s: obj %p result %d %llu\n", __func__, obj_request, 17901b83bef2SSage Weil obj_request->result, 
obj_request->length); 17911b83bef2SSage Weil /* 17928b3e1a56SAlex Elder * There is no such thing as a successful short write. Set 17938b3e1a56SAlex Elder * it to our originally-requested length. 17941b83bef2SSage Weil */ 17951b83bef2SSage Weil obj_request->xferred = obj_request->length; 179607741308SAlex Elder obj_request_done_set(obj_request); 1797bf0d5f50SAlex Elder } 1798bf0d5f50SAlex Elder 179990e98c52SGuangliang Zhao static void rbd_osd_discard_callback(struct rbd_obj_request *obj_request) 180090e98c52SGuangliang Zhao { 180190e98c52SGuangliang Zhao dout("%s: obj %p result %d %llu\n", __func__, obj_request, 180290e98c52SGuangliang Zhao obj_request->result, obj_request->length); 180390e98c52SGuangliang Zhao /* 180490e98c52SGuangliang Zhao * There is no such thing as a successful short discard. Set 180590e98c52SGuangliang Zhao * it to our originally-requested length. 180690e98c52SGuangliang Zhao */ 180790e98c52SGuangliang Zhao obj_request->xferred = obj_request->length; 1808d0265de7SJosh Durgin /* discarding a non-existent object is not a problem */ 1809d0265de7SJosh Durgin if (obj_request->result == -ENOENT) 1810d0265de7SJosh Durgin obj_request->result = 0; 181190e98c52SGuangliang Zhao obj_request_done_set(obj_request); 181290e98c52SGuangliang Zhao } 181390e98c52SGuangliang Zhao 1814fbfab539SAlex Elder /* 1815fbfab539SAlex Elder * For a simple stat call there's nothing to do. We'll do more if 1816fbfab539SAlex Elder * this is part of a write sequence for a layered image. 
1817fbfab539SAlex Elder */ 1818c47f9371SAlex Elder static void rbd_osd_stat_callback(struct rbd_obj_request *obj_request) 1819fbfab539SAlex Elder { 182037206ee5SAlex Elder dout("%s: obj %p\n", __func__, obj_request); 1821fbfab539SAlex Elder obj_request_done_set(obj_request); 1822fbfab539SAlex Elder } 1823fbfab539SAlex Elder 18242761713dSIlya Dryomov static void rbd_osd_call_callback(struct rbd_obj_request *obj_request) 18252761713dSIlya Dryomov { 18262761713dSIlya Dryomov dout("%s: obj %p\n", __func__, obj_request); 18272761713dSIlya Dryomov 18282761713dSIlya Dryomov if (obj_request_img_data_test(obj_request)) 18292761713dSIlya Dryomov rbd_osd_copyup_callback(obj_request); 18302761713dSIlya Dryomov else 18312761713dSIlya Dryomov obj_request_done_set(obj_request); 18322761713dSIlya Dryomov } 18332761713dSIlya Dryomov 1834bf0d5f50SAlex Elder static void rbd_osd_req_callback(struct ceph_osd_request *osd_req, 1835bf0d5f50SAlex Elder struct ceph_msg *msg) 1836bf0d5f50SAlex Elder { 1837bf0d5f50SAlex Elder struct rbd_obj_request *obj_request = osd_req->r_priv; 1838bf0d5f50SAlex Elder u16 opcode; 1839bf0d5f50SAlex Elder 184037206ee5SAlex Elder dout("%s: osd_req %p msg %p\n", __func__, osd_req, msg); 1841bf0d5f50SAlex Elder rbd_assert(osd_req == obj_request->osd_req); 184257acbaa7SAlex Elder if (obj_request_img_data_test(obj_request)) { 184357acbaa7SAlex Elder rbd_assert(obj_request->img_request); 184457acbaa7SAlex Elder rbd_assert(obj_request->which != BAD_WHICH); 184557acbaa7SAlex Elder } else { 184657acbaa7SAlex Elder rbd_assert(obj_request->which == BAD_WHICH); 184757acbaa7SAlex Elder } 1848bf0d5f50SAlex Elder 18491b83bef2SSage Weil if (osd_req->r_result < 0) 18501b83bef2SSage Weil obj_request->result = osd_req->r_result; 1851bf0d5f50SAlex Elder 18527cc69d42SIlya Dryomov rbd_assert(osd_req->r_num_ops <= CEPH_OSD_MAX_OP); 1853bf0d5f50SAlex Elder 1854c47f9371SAlex Elder /* 1855c47f9371SAlex Elder * We support a 64-bit length, but ultimately it has to be 
18567ad18afaSChristoph Hellwig * passed to the block layer, which just supports a 32-bit 18577ad18afaSChristoph Hellwig * length field. 1858c47f9371SAlex Elder */ 18591b83bef2SSage Weil obj_request->xferred = osd_req->r_reply_op_len[0]; 1860c47f9371SAlex Elder rbd_assert(obj_request->xferred < (u64)UINT_MAX); 18610ccd5926SIlya Dryomov 186279528734SAlex Elder opcode = osd_req->r_ops[0].op; 1863bf0d5f50SAlex Elder switch (opcode) { 1864bf0d5f50SAlex Elder case CEPH_OSD_OP_READ: 1865c47f9371SAlex Elder rbd_osd_read_callback(obj_request); 1866bf0d5f50SAlex Elder break; 18670ccd5926SIlya Dryomov case CEPH_OSD_OP_SETALLOCHINT: 1868e30b7577SIlya Dryomov rbd_assert(osd_req->r_ops[1].op == CEPH_OSD_OP_WRITE || 1869e30b7577SIlya Dryomov osd_req->r_ops[1].op == CEPH_OSD_OP_WRITEFULL); 18700ccd5926SIlya Dryomov /* fall through */ 1871bf0d5f50SAlex Elder case CEPH_OSD_OP_WRITE: 1872e30b7577SIlya Dryomov case CEPH_OSD_OP_WRITEFULL: 1873c47f9371SAlex Elder rbd_osd_write_callback(obj_request); 1874bf0d5f50SAlex Elder break; 1875fbfab539SAlex Elder case CEPH_OSD_OP_STAT: 1876c47f9371SAlex Elder rbd_osd_stat_callback(obj_request); 1877fbfab539SAlex Elder break; 187890e98c52SGuangliang Zhao case CEPH_OSD_OP_DELETE: 187990e98c52SGuangliang Zhao case CEPH_OSD_OP_TRUNCATE: 188090e98c52SGuangliang Zhao case CEPH_OSD_OP_ZERO: 188190e98c52SGuangliang Zhao rbd_osd_discard_callback(obj_request); 188290e98c52SGuangliang Zhao break; 188336be9a76SAlex Elder case CEPH_OSD_OP_CALL: 18842761713dSIlya Dryomov rbd_osd_call_callback(obj_request); 18852761713dSIlya Dryomov break; 1886b8d70035SAlex Elder case CEPH_OSD_OP_NOTIFY_ACK: 18879969ebc5SAlex Elder case CEPH_OSD_OP_WATCH: 1888c47f9371SAlex Elder rbd_osd_trivial_callback(obj_request); 18899969ebc5SAlex Elder break; 1890bf0d5f50SAlex Elder default: 18919584d508SIlya Dryomov rbd_warn(NULL, "%s: unsupported op %hu", 1892bf0d5f50SAlex Elder obj_request->object_name, (unsigned short) opcode); 1893bf0d5f50SAlex Elder break; 1894bf0d5f50SAlex Elder } 
1895bf0d5f50SAlex Elder 189607741308SAlex Elder if (obj_request_done_test(obj_request)) 1897bf0d5f50SAlex Elder rbd_obj_request_complete(obj_request); 1898bf0d5f50SAlex Elder } 1899bf0d5f50SAlex Elder 19009d4df01fSAlex Elder static void rbd_osd_req_format_read(struct rbd_obj_request *obj_request) 1901430c28c3SAlex Elder { 1902430c28c3SAlex Elder struct rbd_img_request *img_request = obj_request->img_request; 19038c042b0dSAlex Elder struct ceph_osd_request *osd_req = obj_request->osd_req; 19049d4df01fSAlex Elder u64 snap_id; 1905430c28c3SAlex Elder 19068c042b0dSAlex Elder rbd_assert(osd_req != NULL); 1907430c28c3SAlex Elder 19089d4df01fSAlex Elder snap_id = img_request ? img_request->snap_id : CEPH_NOSNAP; 19098c042b0dSAlex Elder ceph_osdc_build_request(osd_req, obj_request->offset, 19109d4df01fSAlex Elder NULL, snap_id, NULL); 19119d4df01fSAlex Elder } 19129d4df01fSAlex Elder 19139d4df01fSAlex Elder static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request) 19149d4df01fSAlex Elder { 19159d4df01fSAlex Elder struct rbd_img_request *img_request = obj_request->img_request; 19169d4df01fSAlex Elder struct ceph_osd_request *osd_req = obj_request->osd_req; 19179d4df01fSAlex Elder struct ceph_snap_context *snapc; 19189d4df01fSAlex Elder struct timespec mtime = CURRENT_TIME; 19199d4df01fSAlex Elder 19209d4df01fSAlex Elder rbd_assert(osd_req != NULL); 19219d4df01fSAlex Elder 19229d4df01fSAlex Elder snapc = img_request ? img_request->snapc : NULL; 19239d4df01fSAlex Elder ceph_osdc_build_request(osd_req, obj_request->offset, 19249d4df01fSAlex Elder snapc, CEPH_NOSNAP, &mtime); 1925430c28c3SAlex Elder } 1926430c28c3SAlex Elder 19270ccd5926SIlya Dryomov /* 19280ccd5926SIlya Dryomov * Create an osd request. A read request has one osd op (read). 19290ccd5926SIlya Dryomov * A write request has either one (watch) or two (hint+write) osd ops. 
19300ccd5926SIlya Dryomov * (All rbd data writes are prefixed with an allocation hint op, but 19310ccd5926SIlya Dryomov * technically osd watch is a write request, hence this distinction.) 19320ccd5926SIlya Dryomov */ 1933bf0d5f50SAlex Elder static struct ceph_osd_request *rbd_osd_req_create( 1934bf0d5f50SAlex Elder struct rbd_device *rbd_dev, 19356d2940c8SGuangliang Zhao enum obj_operation_type op_type, 1936deb236b3SIlya Dryomov unsigned int num_ops, 1937430c28c3SAlex Elder struct rbd_obj_request *obj_request) 1938bf0d5f50SAlex Elder { 1939bf0d5f50SAlex Elder struct ceph_snap_context *snapc = NULL; 1940bf0d5f50SAlex Elder struct ceph_osd_client *osdc; 1941bf0d5f50SAlex Elder struct ceph_osd_request *osd_req; 1942bf0d5f50SAlex Elder 194390e98c52SGuangliang Zhao if (obj_request_img_data_test(obj_request) && 194490e98c52SGuangliang Zhao (op_type == OBJ_OP_DISCARD || op_type == OBJ_OP_WRITE)) { 19456365d33aSAlex Elder struct rbd_img_request *img_request = obj_request->img_request; 194690e98c52SGuangliang Zhao if (op_type == OBJ_OP_WRITE) { 19476d2940c8SGuangliang Zhao rbd_assert(img_request_write_test(img_request)); 194890e98c52SGuangliang Zhao } else { 194990e98c52SGuangliang Zhao rbd_assert(img_request_discard_test(img_request)); 195090e98c52SGuangliang Zhao } 1951bf0d5f50SAlex Elder snapc = img_request->snapc; 1952bf0d5f50SAlex Elder } 1953bf0d5f50SAlex Elder 19546d2940c8SGuangliang Zhao rbd_assert(num_ops == 1 || ((op_type == OBJ_OP_WRITE) && num_ops == 2)); 1955deb236b3SIlya Dryomov 1956deb236b3SIlya Dryomov /* Allocate and initialize the request, for the num_ops ops */ 1957bf0d5f50SAlex Elder 1958bf0d5f50SAlex Elder osdc = &rbd_dev->rbd_client->client->osdc; 1959deb236b3SIlya Dryomov osd_req = ceph_osdc_alloc_request(osdc, snapc, num_ops, false, 1960deb236b3SIlya Dryomov GFP_ATOMIC); 1961bf0d5f50SAlex Elder if (!osd_req) 1962bf0d5f50SAlex Elder return NULL; /* ENOMEM */ 1963bf0d5f50SAlex Elder 196490e98c52SGuangliang Zhao if (op_type == OBJ_OP_WRITE || op_type 
== OBJ_OP_DISCARD) 1965bf0d5f50SAlex Elder osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK; 1966430c28c3SAlex Elder else 1967bf0d5f50SAlex Elder osd_req->r_flags = CEPH_OSD_FLAG_READ; 1968bf0d5f50SAlex Elder 1969bf0d5f50SAlex Elder osd_req->r_callback = rbd_osd_req_callback; 1970bf0d5f50SAlex Elder osd_req->r_priv = obj_request; 1971bf0d5f50SAlex Elder 19723c972c95SIlya Dryomov osd_req->r_base_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout); 19733c972c95SIlya Dryomov ceph_oid_set_name(&osd_req->r_base_oid, obj_request->object_name); 1974bf0d5f50SAlex Elder 1975bf0d5f50SAlex Elder return osd_req; 1976bf0d5f50SAlex Elder } 1977bf0d5f50SAlex Elder 19780eefd470SAlex Elder /* 1979d3246fb0SJosh Durgin * Create a copyup osd request based on the information in the object 1980d3246fb0SJosh Durgin * request supplied. A copyup request has two or three osd ops, a 1981d3246fb0SJosh Durgin * copyup method call, potentially a hint op, and a write or truncate 1982d3246fb0SJosh Durgin * or zero op. 
19830eefd470SAlex Elder */ 19840eefd470SAlex Elder static struct ceph_osd_request * 19850eefd470SAlex Elder rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request) 19860eefd470SAlex Elder { 19870eefd470SAlex Elder struct rbd_img_request *img_request; 19880eefd470SAlex Elder struct ceph_snap_context *snapc; 19890eefd470SAlex Elder struct rbd_device *rbd_dev; 19900eefd470SAlex Elder struct ceph_osd_client *osdc; 19910eefd470SAlex Elder struct ceph_osd_request *osd_req; 1992d3246fb0SJosh Durgin int num_osd_ops = 3; 19930eefd470SAlex Elder 19940eefd470SAlex Elder rbd_assert(obj_request_img_data_test(obj_request)); 19950eefd470SAlex Elder img_request = obj_request->img_request; 19960eefd470SAlex Elder rbd_assert(img_request); 1997d3246fb0SJosh Durgin rbd_assert(img_request_write_test(img_request) || 1998d3246fb0SJosh Durgin img_request_discard_test(img_request)); 19990eefd470SAlex Elder 2000d3246fb0SJosh Durgin if (img_request_discard_test(img_request)) 2001d3246fb0SJosh Durgin num_osd_ops = 2; 2002d3246fb0SJosh Durgin 2003d3246fb0SJosh Durgin /* Allocate and initialize the request, for all the ops */ 20040eefd470SAlex Elder 20050eefd470SAlex Elder snapc = img_request->snapc; 20060eefd470SAlex Elder rbd_dev = img_request->rbd_dev; 20070eefd470SAlex Elder osdc = &rbd_dev->rbd_client->client->osdc; 2008d3246fb0SJosh Durgin osd_req = ceph_osdc_alloc_request(osdc, snapc, num_osd_ops, 2009d3246fb0SJosh Durgin false, GFP_ATOMIC); 20100eefd470SAlex Elder if (!osd_req) 20110eefd470SAlex Elder return NULL; /* ENOMEM */ 20120eefd470SAlex Elder 20130eefd470SAlex Elder osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK; 20140eefd470SAlex Elder osd_req->r_callback = rbd_osd_req_callback; 20150eefd470SAlex Elder osd_req->r_priv = obj_request; 20160eefd470SAlex Elder 20173c972c95SIlya Dryomov osd_req->r_base_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout); 20183c972c95SIlya Dryomov ceph_oid_set_name(&osd_req->r_base_oid, obj_request->object_name); 
20190eefd470SAlex Elder 20200eefd470SAlex Elder return osd_req; 20210eefd470SAlex Elder } 20220eefd470SAlex Elder 20230eefd470SAlex Elder 2024bf0d5f50SAlex Elder static void rbd_osd_req_destroy(struct ceph_osd_request *osd_req) 2025bf0d5f50SAlex Elder { 2026bf0d5f50SAlex Elder ceph_osdc_put_request(osd_req); 2027bf0d5f50SAlex Elder } 2028bf0d5f50SAlex Elder 2029bf0d5f50SAlex Elder /* object_name is assumed to be a non-null pointer and NUL-terminated */ 2030bf0d5f50SAlex Elder 2031bf0d5f50SAlex Elder static struct rbd_obj_request *rbd_obj_request_create(const char *object_name, 2032bf0d5f50SAlex Elder u64 offset, u64 length, 2033bf0d5f50SAlex Elder enum obj_request_type type) 2034bf0d5f50SAlex Elder { 2035bf0d5f50SAlex Elder struct rbd_obj_request *obj_request; 2036bf0d5f50SAlex Elder size_t size; 2037bf0d5f50SAlex Elder char *name; 2038bf0d5f50SAlex Elder 2039bf0d5f50SAlex Elder rbd_assert(obj_request_type_valid(type)); 2040bf0d5f50SAlex Elder 2041bf0d5f50SAlex Elder size = strlen(object_name) + 1; 20425a60e876SIlya Dryomov name = kmalloc(size, GFP_NOIO); 2043f907ad55SAlex Elder if (!name) 2044bf0d5f50SAlex Elder return NULL; 2045bf0d5f50SAlex Elder 20465a60e876SIlya Dryomov obj_request = kmem_cache_zalloc(rbd_obj_request_cache, GFP_NOIO); 2047f907ad55SAlex Elder if (!obj_request) { 2048f907ad55SAlex Elder kfree(name); 2049f907ad55SAlex Elder return NULL; 2050f907ad55SAlex Elder } 2051f907ad55SAlex Elder 2052bf0d5f50SAlex Elder obj_request->object_name = memcpy(name, object_name, size); 2053bf0d5f50SAlex Elder obj_request->offset = offset; 2054bf0d5f50SAlex Elder obj_request->length = length; 2055926f9b3fSAlex Elder obj_request->flags = 0; 2056bf0d5f50SAlex Elder obj_request->which = BAD_WHICH; 2057bf0d5f50SAlex Elder obj_request->type = type; 2058bf0d5f50SAlex Elder INIT_LIST_HEAD(&obj_request->links); 2059788e2df3SAlex Elder init_completion(&obj_request->completion); 2060bf0d5f50SAlex Elder kref_init(&obj_request->kref); 2061bf0d5f50SAlex Elder 206237206ee5SAlex 
Elder dout("%s: \"%s\" %llu/%llu %d -> obj %p\n", __func__, object_name, 206337206ee5SAlex Elder offset, length, (int)type, obj_request); 206437206ee5SAlex Elder 2065bf0d5f50SAlex Elder return obj_request; 2066bf0d5f50SAlex Elder } 2067bf0d5f50SAlex Elder 2068bf0d5f50SAlex Elder static void rbd_obj_request_destroy(struct kref *kref) 2069bf0d5f50SAlex Elder { 2070bf0d5f50SAlex Elder struct rbd_obj_request *obj_request; 2071bf0d5f50SAlex Elder 2072bf0d5f50SAlex Elder obj_request = container_of(kref, struct rbd_obj_request, kref); 2073bf0d5f50SAlex Elder 207437206ee5SAlex Elder dout("%s: obj %p\n", __func__, obj_request); 207537206ee5SAlex Elder 2076bf0d5f50SAlex Elder rbd_assert(obj_request->img_request == NULL); 2077bf0d5f50SAlex Elder rbd_assert(obj_request->which == BAD_WHICH); 2078bf0d5f50SAlex Elder 2079bf0d5f50SAlex Elder if (obj_request->osd_req) 2080bf0d5f50SAlex Elder rbd_osd_req_destroy(obj_request->osd_req); 2081bf0d5f50SAlex Elder 2082bf0d5f50SAlex Elder rbd_assert(obj_request_type_valid(obj_request->type)); 2083bf0d5f50SAlex Elder switch (obj_request->type) { 20849969ebc5SAlex Elder case OBJ_REQUEST_NODATA: 20859969ebc5SAlex Elder break; /* Nothing to do */ 2086bf0d5f50SAlex Elder case OBJ_REQUEST_BIO: 2087bf0d5f50SAlex Elder if (obj_request->bio_list) 2088bf0d5f50SAlex Elder bio_chain_put(obj_request->bio_list); 2089bf0d5f50SAlex Elder break; 2090788e2df3SAlex Elder case OBJ_REQUEST_PAGES: 2091788e2df3SAlex Elder if (obj_request->pages) 2092788e2df3SAlex Elder ceph_release_page_vector(obj_request->pages, 2093788e2df3SAlex Elder obj_request->page_count); 2094788e2df3SAlex Elder break; 2095bf0d5f50SAlex Elder } 2096bf0d5f50SAlex Elder 2097f907ad55SAlex Elder kfree(obj_request->object_name); 2098868311b1SAlex Elder obj_request->object_name = NULL; 2099868311b1SAlex Elder kmem_cache_free(rbd_obj_request_cache, obj_request); 2100bf0d5f50SAlex Elder } 2101bf0d5f50SAlex Elder 2102fb65d228SAlex Elder /* It's OK to call this for a device with no parent */ 
2103fb65d228SAlex Elder 2104fb65d228SAlex Elder static void rbd_spec_put(struct rbd_spec *spec); 2105fb65d228SAlex Elder static void rbd_dev_unparent(struct rbd_device *rbd_dev) 2106fb65d228SAlex Elder { 2107fb65d228SAlex Elder rbd_dev_remove_parent(rbd_dev); 2108fb65d228SAlex Elder rbd_spec_put(rbd_dev->parent_spec); 2109fb65d228SAlex Elder rbd_dev->parent_spec = NULL; 2110fb65d228SAlex Elder rbd_dev->parent_overlap = 0; 2111fb65d228SAlex Elder } 2112fb65d228SAlex Elder 2113bf0d5f50SAlex Elder /* 2114a2acd00eSAlex Elder * Parent image reference counting is used to determine when an 2115a2acd00eSAlex Elder * image's parent fields can be safely torn down--after there are no 2116a2acd00eSAlex Elder * more in-flight requests to the parent image. When the last 2117a2acd00eSAlex Elder * reference is dropped, cleaning them up is safe. 2118a2acd00eSAlex Elder */ 2119a2acd00eSAlex Elder static void rbd_dev_parent_put(struct rbd_device *rbd_dev) 2120a2acd00eSAlex Elder { 2121a2acd00eSAlex Elder int counter; 2122a2acd00eSAlex Elder 2123a2acd00eSAlex Elder if (!rbd_dev->parent_spec) 2124a2acd00eSAlex Elder return; 2125a2acd00eSAlex Elder 2126a2acd00eSAlex Elder counter = atomic_dec_return_safe(&rbd_dev->parent_ref); 2127a2acd00eSAlex Elder if (counter > 0) 2128a2acd00eSAlex Elder return; 2129a2acd00eSAlex Elder 2130a2acd00eSAlex Elder /* Last reference; clean up parent data structures */ 2131a2acd00eSAlex Elder 2132a2acd00eSAlex Elder if (!counter) 2133a2acd00eSAlex Elder rbd_dev_unparent(rbd_dev); 2134a2acd00eSAlex Elder else 21359584d508SIlya Dryomov rbd_warn(rbd_dev, "parent reference underflow"); 2136a2acd00eSAlex Elder } 2137a2acd00eSAlex Elder 2138a2acd00eSAlex Elder /* 2139a2acd00eSAlex Elder * If an image has a non-zero parent overlap, get a reference to its 2140a2acd00eSAlex Elder * parent. 
2141a2acd00eSAlex Elder * 2142a2acd00eSAlex Elder * Returns true if the rbd device has a parent with a non-zero 2143a2acd00eSAlex Elder * overlap and a reference for it was successfully taken, or 2144a2acd00eSAlex Elder * false otherwise. 2145a2acd00eSAlex Elder */ 2146a2acd00eSAlex Elder static bool rbd_dev_parent_get(struct rbd_device *rbd_dev) 2147a2acd00eSAlex Elder { 2148ae43e9d0SIlya Dryomov int counter = 0; 2149a2acd00eSAlex Elder 2150a2acd00eSAlex Elder if (!rbd_dev->parent_spec) 2151a2acd00eSAlex Elder return false; 2152a2acd00eSAlex Elder 2153ae43e9d0SIlya Dryomov down_read(&rbd_dev->header_rwsem); 2154ae43e9d0SIlya Dryomov if (rbd_dev->parent_overlap) 2155a2acd00eSAlex Elder counter = atomic_inc_return_safe(&rbd_dev->parent_ref); 2156ae43e9d0SIlya Dryomov up_read(&rbd_dev->header_rwsem); 2157a2acd00eSAlex Elder 2158a2acd00eSAlex Elder if (counter < 0) 21599584d508SIlya Dryomov rbd_warn(rbd_dev, "parent reference overflow"); 2160a2acd00eSAlex Elder 2161ae43e9d0SIlya Dryomov return counter > 0; 2162a2acd00eSAlex Elder } 2163a2acd00eSAlex Elder 2164bf0d5f50SAlex Elder /* 2165bf0d5f50SAlex Elder * Caller is responsible for filling in the list of object requests 2166bf0d5f50SAlex Elder * that comprises the image request, and the Linux request pointer 2167bf0d5f50SAlex Elder * (if there is one). 
2168bf0d5f50SAlex Elder */ 2169cc344fa1SAlex Elder static struct rbd_img_request *rbd_img_request_create( 2170cc344fa1SAlex Elder struct rbd_device *rbd_dev, 2171bf0d5f50SAlex Elder u64 offset, u64 length, 21726d2940c8SGuangliang Zhao enum obj_operation_type op_type, 21734e752f0aSJosh Durgin struct ceph_snap_context *snapc) 2174bf0d5f50SAlex Elder { 2175bf0d5f50SAlex Elder struct rbd_img_request *img_request; 2176bf0d5f50SAlex Elder 21777a716aacSIlya Dryomov img_request = kmem_cache_alloc(rbd_img_request_cache, GFP_NOIO); 2178bf0d5f50SAlex Elder if (!img_request) 2179bf0d5f50SAlex Elder return NULL; 2180bf0d5f50SAlex Elder 2181bf0d5f50SAlex Elder img_request->rq = NULL; 2182bf0d5f50SAlex Elder img_request->rbd_dev = rbd_dev; 2183bf0d5f50SAlex Elder img_request->offset = offset; 2184bf0d5f50SAlex Elder img_request->length = length; 21850c425248SAlex Elder img_request->flags = 0; 218690e98c52SGuangliang Zhao if (op_type == OBJ_OP_DISCARD) { 218790e98c52SGuangliang Zhao img_request_discard_set(img_request); 218890e98c52SGuangliang Zhao img_request->snapc = snapc; 218990e98c52SGuangliang Zhao } else if (op_type == OBJ_OP_WRITE) { 21900c425248SAlex Elder img_request_write_set(img_request); 21914e752f0aSJosh Durgin img_request->snapc = snapc; 21920c425248SAlex Elder } else { 2193bf0d5f50SAlex Elder img_request->snap_id = rbd_dev->spec->snap_id; 21940c425248SAlex Elder } 2195a2acd00eSAlex Elder if (rbd_dev_parent_get(rbd_dev)) 2196d0b2e944SAlex Elder img_request_layered_set(img_request); 2197bf0d5f50SAlex Elder spin_lock_init(&img_request->completion_lock); 2198bf0d5f50SAlex Elder img_request->next_completion = 0; 2199bf0d5f50SAlex Elder img_request->callback = NULL; 2200a5a337d4SAlex Elder img_request->result = 0; 2201bf0d5f50SAlex Elder img_request->obj_request_count = 0; 2202bf0d5f50SAlex Elder INIT_LIST_HEAD(&img_request->obj_requests); 2203bf0d5f50SAlex Elder kref_init(&img_request->kref); 2204bf0d5f50SAlex Elder 220537206ee5SAlex Elder dout("%s: rbd_dev %p %s 
%llu/%llu -> img %p\n", __func__, rbd_dev, 22066d2940c8SGuangliang Zhao obj_op_name(op_type), offset, length, img_request); 220737206ee5SAlex Elder 2208bf0d5f50SAlex Elder return img_request; 2209bf0d5f50SAlex Elder } 2210bf0d5f50SAlex Elder 2211bf0d5f50SAlex Elder static void rbd_img_request_destroy(struct kref *kref) 2212bf0d5f50SAlex Elder { 2213bf0d5f50SAlex Elder struct rbd_img_request *img_request; 2214bf0d5f50SAlex Elder struct rbd_obj_request *obj_request; 2215bf0d5f50SAlex Elder struct rbd_obj_request *next_obj_request; 2216bf0d5f50SAlex Elder 2217bf0d5f50SAlex Elder img_request = container_of(kref, struct rbd_img_request, kref); 2218bf0d5f50SAlex Elder 221937206ee5SAlex Elder dout("%s: img %p\n", __func__, img_request); 222037206ee5SAlex Elder 2221bf0d5f50SAlex Elder for_each_obj_request_safe(img_request, obj_request, next_obj_request) 2222bf0d5f50SAlex Elder rbd_img_obj_request_del(img_request, obj_request); 222325dcf954SAlex Elder rbd_assert(img_request->obj_request_count == 0); 2224bf0d5f50SAlex Elder 2225a2acd00eSAlex Elder if (img_request_layered_test(img_request)) { 2226a2acd00eSAlex Elder img_request_layered_clear(img_request); 2227a2acd00eSAlex Elder rbd_dev_parent_put(img_request->rbd_dev); 2228a2acd00eSAlex Elder } 2229a2acd00eSAlex Elder 2230bef95455SJosh Durgin if (img_request_write_test(img_request) || 2231bef95455SJosh Durgin img_request_discard_test(img_request)) 2232812164f8SAlex Elder ceph_put_snap_context(img_request->snapc); 2233bf0d5f50SAlex Elder 22341c2a9dfeSAlex Elder kmem_cache_free(rbd_img_request_cache, img_request); 2235bf0d5f50SAlex Elder } 2236bf0d5f50SAlex Elder 2237e93f3152SAlex Elder static struct rbd_img_request *rbd_parent_request_create( 2238e93f3152SAlex Elder struct rbd_obj_request *obj_request, 2239e93f3152SAlex Elder u64 img_offset, u64 length) 2240e93f3152SAlex Elder { 2241e93f3152SAlex Elder struct rbd_img_request *parent_request; 2242e93f3152SAlex Elder struct rbd_device *rbd_dev; 2243e93f3152SAlex Elder 
2244e93f3152SAlex Elder rbd_assert(obj_request->img_request); 2245e93f3152SAlex Elder rbd_dev = obj_request->img_request->rbd_dev; 2246e93f3152SAlex Elder 22474e752f0aSJosh Durgin parent_request = rbd_img_request_create(rbd_dev->parent, img_offset, 22486d2940c8SGuangliang Zhao length, OBJ_OP_READ, NULL); 2249e93f3152SAlex Elder if (!parent_request) 2250e93f3152SAlex Elder return NULL; 2251e93f3152SAlex Elder 2252e93f3152SAlex Elder img_request_child_set(parent_request); 2253e93f3152SAlex Elder rbd_obj_request_get(obj_request); 2254e93f3152SAlex Elder parent_request->obj_request = obj_request; 2255e93f3152SAlex Elder 2256e93f3152SAlex Elder return parent_request; 2257e93f3152SAlex Elder } 2258e93f3152SAlex Elder 2259e93f3152SAlex Elder static void rbd_parent_request_destroy(struct kref *kref) 2260e93f3152SAlex Elder { 2261e93f3152SAlex Elder struct rbd_img_request *parent_request; 2262e93f3152SAlex Elder struct rbd_obj_request *orig_request; 2263e93f3152SAlex Elder 2264e93f3152SAlex Elder parent_request = container_of(kref, struct rbd_img_request, kref); 2265e93f3152SAlex Elder orig_request = parent_request->obj_request; 2266e93f3152SAlex Elder 2267e93f3152SAlex Elder parent_request->obj_request = NULL; 2268e93f3152SAlex Elder rbd_obj_request_put(orig_request); 2269e93f3152SAlex Elder img_request_child_clear(parent_request); 2270e93f3152SAlex Elder 2271e93f3152SAlex Elder rbd_img_request_destroy(kref); 2272e93f3152SAlex Elder } 2273e93f3152SAlex Elder 22741217857fSAlex Elder static bool rbd_img_obj_end_request(struct rbd_obj_request *obj_request) 22751217857fSAlex Elder { 22766365d33aSAlex Elder struct rbd_img_request *img_request; 22771217857fSAlex Elder unsigned int xferred; 22781217857fSAlex Elder int result; 22798b3e1a56SAlex Elder bool more; 22801217857fSAlex Elder 22816365d33aSAlex Elder rbd_assert(obj_request_img_data_test(obj_request)); 22826365d33aSAlex Elder img_request = obj_request->img_request; 22836365d33aSAlex Elder 22841217857fSAlex Elder 
rbd_assert(obj_request->xferred <= (u64)UINT_MAX); 22851217857fSAlex Elder xferred = (unsigned int)obj_request->xferred; 22861217857fSAlex Elder result = obj_request->result; 22871217857fSAlex Elder if (result) { 22881217857fSAlex Elder struct rbd_device *rbd_dev = img_request->rbd_dev; 22896d2940c8SGuangliang Zhao enum obj_operation_type op_type; 22906d2940c8SGuangliang Zhao 229190e98c52SGuangliang Zhao if (img_request_discard_test(img_request)) 229290e98c52SGuangliang Zhao op_type = OBJ_OP_DISCARD; 229390e98c52SGuangliang Zhao else if (img_request_write_test(img_request)) 229490e98c52SGuangliang Zhao op_type = OBJ_OP_WRITE; 229590e98c52SGuangliang Zhao else 229690e98c52SGuangliang Zhao op_type = OBJ_OP_READ; 22971217857fSAlex Elder 22989584d508SIlya Dryomov rbd_warn(rbd_dev, "%s %llx at %llx (%llx)", 22996d2940c8SGuangliang Zhao obj_op_name(op_type), obj_request->length, 23006d2940c8SGuangliang Zhao obj_request->img_offset, obj_request->offset); 23019584d508SIlya Dryomov rbd_warn(rbd_dev, " result %d xferred %x", 23021217857fSAlex Elder result, xferred); 23031217857fSAlex Elder if (!img_request->result) 23041217857fSAlex Elder img_request->result = result; 2305082a75daSIlya Dryomov /* 2306082a75daSIlya Dryomov * Need to end I/O on the entire obj_request worth of 2307082a75daSIlya Dryomov * bytes in case of error. 
2308082a75daSIlya Dryomov */ 2309082a75daSIlya Dryomov xferred = obj_request->length; 23101217857fSAlex Elder } 23111217857fSAlex Elder 2312f1a4739fSAlex Elder /* Image object requests don't own their page array */ 2313f1a4739fSAlex Elder 2314f1a4739fSAlex Elder if (obj_request->type == OBJ_REQUEST_PAGES) { 2315f1a4739fSAlex Elder obj_request->pages = NULL; 2316f1a4739fSAlex Elder obj_request->page_count = 0; 2317f1a4739fSAlex Elder } 2318f1a4739fSAlex Elder 23198b3e1a56SAlex Elder if (img_request_child_test(img_request)) { 23208b3e1a56SAlex Elder rbd_assert(img_request->obj_request != NULL); 23218b3e1a56SAlex Elder more = obj_request->which < img_request->obj_request_count - 1; 23228b3e1a56SAlex Elder } else { 23238b3e1a56SAlex Elder rbd_assert(img_request->rq != NULL); 23247ad18afaSChristoph Hellwig 23257ad18afaSChristoph Hellwig more = blk_update_request(img_request->rq, result, xferred); 23267ad18afaSChristoph Hellwig if (!more) 23277ad18afaSChristoph Hellwig __blk_mq_end_request(img_request->rq, result); 23288b3e1a56SAlex Elder } 23298b3e1a56SAlex Elder 23308b3e1a56SAlex Elder return more; 23311217857fSAlex Elder } 23321217857fSAlex Elder 23332169238dSAlex Elder static void rbd_img_obj_callback(struct rbd_obj_request *obj_request) 23342169238dSAlex Elder { 23352169238dSAlex Elder struct rbd_img_request *img_request; 23362169238dSAlex Elder u32 which = obj_request->which; 23372169238dSAlex Elder bool more = true; 23382169238dSAlex Elder 23396365d33aSAlex Elder rbd_assert(obj_request_img_data_test(obj_request)); 23402169238dSAlex Elder img_request = obj_request->img_request; 23412169238dSAlex Elder 23422169238dSAlex Elder dout("%s: img %p obj %p\n", __func__, img_request, obj_request); 23432169238dSAlex Elder rbd_assert(img_request != NULL); 23442169238dSAlex Elder rbd_assert(img_request->obj_request_count > 0); 23452169238dSAlex Elder rbd_assert(which != BAD_WHICH); 23462169238dSAlex Elder rbd_assert(which < img_request->obj_request_count); 23472169238dSAlex 
Elder 23482169238dSAlex Elder spin_lock_irq(&img_request->completion_lock); 23492169238dSAlex Elder if (which != img_request->next_completion) 23502169238dSAlex Elder goto out; 23512169238dSAlex Elder 23522169238dSAlex Elder for_each_obj_request_from(img_request, obj_request) { 23532169238dSAlex Elder rbd_assert(more); 23542169238dSAlex Elder rbd_assert(which < img_request->obj_request_count); 23552169238dSAlex Elder 23562169238dSAlex Elder if (!obj_request_done_test(obj_request)) 23572169238dSAlex Elder break; 23581217857fSAlex Elder more = rbd_img_obj_end_request(obj_request); 23592169238dSAlex Elder which++; 23602169238dSAlex Elder } 23612169238dSAlex Elder 23622169238dSAlex Elder rbd_assert(more ^ (which == img_request->obj_request_count)); 23632169238dSAlex Elder img_request->next_completion = which; 23642169238dSAlex Elder out: 23652169238dSAlex Elder spin_unlock_irq(&img_request->completion_lock); 23660f2d5be7SAlex Elder rbd_img_request_put(img_request); 23672169238dSAlex Elder 23682169238dSAlex Elder if (!more) 23692169238dSAlex Elder rbd_img_request_complete(img_request); 23702169238dSAlex Elder } 23712169238dSAlex Elder 2372f1a4739fSAlex Elder /* 23733b434a2aSJosh Durgin * Add individual osd ops to the given ceph_osd_request and prepare 23743b434a2aSJosh Durgin * them for submission. num_ops is the current number of 23753b434a2aSJosh Durgin * osd operations already to the object request. 
23763b434a2aSJosh Durgin */ 23773b434a2aSJosh Durgin static void rbd_img_obj_request_fill(struct rbd_obj_request *obj_request, 23783b434a2aSJosh Durgin struct ceph_osd_request *osd_request, 23793b434a2aSJosh Durgin enum obj_operation_type op_type, 23803b434a2aSJosh Durgin unsigned int num_ops) 23813b434a2aSJosh Durgin { 23823b434a2aSJosh Durgin struct rbd_img_request *img_request = obj_request->img_request; 23833b434a2aSJosh Durgin struct rbd_device *rbd_dev = img_request->rbd_dev; 23843b434a2aSJosh Durgin u64 object_size = rbd_obj_bytes(&rbd_dev->header); 23853b434a2aSJosh Durgin u64 offset = obj_request->offset; 23863b434a2aSJosh Durgin u64 length = obj_request->length; 23873b434a2aSJosh Durgin u64 img_end; 23883b434a2aSJosh Durgin u16 opcode; 23893b434a2aSJosh Durgin 23903b434a2aSJosh Durgin if (op_type == OBJ_OP_DISCARD) { 2391d3246fb0SJosh Durgin if (!offset && length == object_size && 2392d3246fb0SJosh Durgin (!img_request_layered_test(img_request) || 2393d3246fb0SJosh Durgin !obj_request_overlaps_parent(obj_request))) { 23943b434a2aSJosh Durgin opcode = CEPH_OSD_OP_DELETE; 23953b434a2aSJosh Durgin } else if ((offset + length == object_size)) { 23963b434a2aSJosh Durgin opcode = CEPH_OSD_OP_TRUNCATE; 23973b434a2aSJosh Durgin } else { 23983b434a2aSJosh Durgin down_read(&rbd_dev->header_rwsem); 23993b434a2aSJosh Durgin img_end = rbd_dev->header.image_size; 24003b434a2aSJosh Durgin up_read(&rbd_dev->header_rwsem); 24013b434a2aSJosh Durgin 24023b434a2aSJosh Durgin if (obj_request->img_offset + length == img_end) 24033b434a2aSJosh Durgin opcode = CEPH_OSD_OP_TRUNCATE; 24043b434a2aSJosh Durgin else 24053b434a2aSJosh Durgin opcode = CEPH_OSD_OP_ZERO; 24063b434a2aSJosh Durgin } 24073b434a2aSJosh Durgin } else if (op_type == OBJ_OP_WRITE) { 2408e30b7577SIlya Dryomov if (!offset && length == object_size) 2409e30b7577SIlya Dryomov opcode = CEPH_OSD_OP_WRITEFULL; 2410e30b7577SIlya Dryomov else 24113b434a2aSJosh Durgin opcode = CEPH_OSD_OP_WRITE; 24123b434a2aSJosh Durgin 
osd_req_op_alloc_hint_init(osd_request, num_ops, 24133b434a2aSJosh Durgin object_size, object_size); 24143b434a2aSJosh Durgin num_ops++; 24153b434a2aSJosh Durgin } else { 24163b434a2aSJosh Durgin opcode = CEPH_OSD_OP_READ; 24173b434a2aSJosh Durgin } 24183b434a2aSJosh Durgin 24197e868b6eSIlya Dryomov if (opcode == CEPH_OSD_OP_DELETE) 2420144cba14SYan, Zheng osd_req_op_init(osd_request, num_ops, opcode, 0); 24217e868b6eSIlya Dryomov else 24227e868b6eSIlya Dryomov osd_req_op_extent_init(osd_request, num_ops, opcode, 24237e868b6eSIlya Dryomov offset, length, 0, 0); 24247e868b6eSIlya Dryomov 24253b434a2aSJosh Durgin if (obj_request->type == OBJ_REQUEST_BIO) 24263b434a2aSJosh Durgin osd_req_op_extent_osd_data_bio(osd_request, num_ops, 24273b434a2aSJosh Durgin obj_request->bio_list, length); 24283b434a2aSJosh Durgin else if (obj_request->type == OBJ_REQUEST_PAGES) 24293b434a2aSJosh Durgin osd_req_op_extent_osd_data_pages(osd_request, num_ops, 24303b434a2aSJosh Durgin obj_request->pages, length, 24313b434a2aSJosh Durgin offset & ~PAGE_MASK, false, false); 24323b434a2aSJosh Durgin 24333b434a2aSJosh Durgin /* Discards are also writes */ 24343b434a2aSJosh Durgin if (op_type == OBJ_OP_WRITE || op_type == OBJ_OP_DISCARD) 24353b434a2aSJosh Durgin rbd_osd_req_format_write(obj_request); 24363b434a2aSJosh Durgin else 24373b434a2aSJosh Durgin rbd_osd_req_format_read(obj_request); 24383b434a2aSJosh Durgin } 24393b434a2aSJosh Durgin 24403b434a2aSJosh Durgin /* 2441f1a4739fSAlex Elder * Split up an image request into one or more object requests, each 2442f1a4739fSAlex Elder * to a different object. The "type" parameter indicates whether 2443f1a4739fSAlex Elder * "data_desc" is the pointer to the head of a list of bio 2444f1a4739fSAlex Elder * structures, or the base of a page array. In either case this 2445f1a4739fSAlex Elder * function assumes data_desc describes memory sufficient to hold 2446f1a4739fSAlex Elder * all data described by the image request. 
2447f1a4739fSAlex Elder */ 2448f1a4739fSAlex Elder static int rbd_img_request_fill(struct rbd_img_request *img_request, 2449f1a4739fSAlex Elder enum obj_request_type type, 2450f1a4739fSAlex Elder void *data_desc) 2451bf0d5f50SAlex Elder { 2452bf0d5f50SAlex Elder struct rbd_device *rbd_dev = img_request->rbd_dev; 2453bf0d5f50SAlex Elder struct rbd_obj_request *obj_request = NULL; 2454bf0d5f50SAlex Elder struct rbd_obj_request *next_obj_request; 2455a158073cSJingoo Han struct bio *bio_list = NULL; 2456f1a4739fSAlex Elder unsigned int bio_offset = 0; 2457a158073cSJingoo Han struct page **pages = NULL; 24586d2940c8SGuangliang Zhao enum obj_operation_type op_type; 24597da22d29SAlex Elder u64 img_offset; 2460bf0d5f50SAlex Elder u64 resid; 2461bf0d5f50SAlex Elder 2462f1a4739fSAlex Elder dout("%s: img %p type %d data_desc %p\n", __func__, img_request, 2463f1a4739fSAlex Elder (int)type, data_desc); 246437206ee5SAlex Elder 24657da22d29SAlex Elder img_offset = img_request->offset; 2466bf0d5f50SAlex Elder resid = img_request->length; 24674dda41d3SAlex Elder rbd_assert(resid > 0); 24683b434a2aSJosh Durgin op_type = rbd_img_request_op_type(img_request); 2469f1a4739fSAlex Elder 2470f1a4739fSAlex Elder if (type == OBJ_REQUEST_BIO) { 2471f1a4739fSAlex Elder bio_list = data_desc; 24724f024f37SKent Overstreet rbd_assert(img_offset == 24734f024f37SKent Overstreet bio_list->bi_iter.bi_sector << SECTOR_SHIFT); 247490e98c52SGuangliang Zhao } else if (type == OBJ_REQUEST_PAGES) { 2475f1a4739fSAlex Elder pages = data_desc; 2476f1a4739fSAlex Elder } 2477f1a4739fSAlex Elder 2478bf0d5f50SAlex Elder while (resid) { 24792fa12320SAlex Elder struct ceph_osd_request *osd_req; 2480bf0d5f50SAlex Elder const char *object_name; 2481bf0d5f50SAlex Elder u64 offset; 2482bf0d5f50SAlex Elder u64 length; 2483bf0d5f50SAlex Elder 24847da22d29SAlex Elder object_name = rbd_segment_name(rbd_dev, img_offset); 2485bf0d5f50SAlex Elder if (!object_name) 2486bf0d5f50SAlex Elder goto out_unwind; 24877da22d29SAlex 
Elder offset = rbd_segment_offset(rbd_dev, img_offset); 24887da22d29SAlex Elder length = rbd_segment_length(rbd_dev, img_offset, resid); 2489bf0d5f50SAlex Elder obj_request = rbd_obj_request_create(object_name, 2490f1a4739fSAlex Elder offset, length, type); 249178c2a44aSAlex Elder /* object request has its own copy of the object name */ 249278c2a44aSAlex Elder rbd_segment_name_free(object_name); 2493bf0d5f50SAlex Elder if (!obj_request) 2494bf0d5f50SAlex Elder goto out_unwind; 249562054da6SIlya Dryomov 249603507db6SJosh Durgin /* 249703507db6SJosh Durgin * set obj_request->img_request before creating the 249803507db6SJosh Durgin * osd_request so that it gets the right snapc 249903507db6SJosh Durgin */ 250003507db6SJosh Durgin rbd_img_obj_request_add(img_request, obj_request); 2501bf0d5f50SAlex Elder 2502f1a4739fSAlex Elder if (type == OBJ_REQUEST_BIO) { 2503f1a4739fSAlex Elder unsigned int clone_size; 2504f1a4739fSAlex Elder 2505bf0d5f50SAlex Elder rbd_assert(length <= (u64)UINT_MAX); 2506bf0d5f50SAlex Elder clone_size = (unsigned int)length; 2507f1a4739fSAlex Elder obj_request->bio_list = 2508f1a4739fSAlex Elder bio_chain_clone_range(&bio_list, 2509f1a4739fSAlex Elder &bio_offset, 2510f1a4739fSAlex Elder clone_size, 2511bf0d5f50SAlex Elder GFP_ATOMIC); 2512bf0d5f50SAlex Elder if (!obj_request->bio_list) 251362054da6SIlya Dryomov goto out_unwind; 251490e98c52SGuangliang Zhao } else if (type == OBJ_REQUEST_PAGES) { 2515f1a4739fSAlex Elder unsigned int page_count; 2516f1a4739fSAlex Elder 2517f1a4739fSAlex Elder obj_request->pages = pages; 2518f1a4739fSAlex Elder page_count = (u32)calc_pages_for(offset, length); 2519f1a4739fSAlex Elder obj_request->page_count = page_count; 2520f1a4739fSAlex Elder if ((offset + length) & ~PAGE_MASK) 2521f1a4739fSAlex Elder page_count--; /* more on last page */ 2522f1a4739fSAlex Elder pages += page_count; 2523f1a4739fSAlex Elder } 2524bf0d5f50SAlex Elder 25256d2940c8SGuangliang Zhao osd_req = rbd_osd_req_create(rbd_dev, op_type, 
25266d2940c8SGuangliang Zhao (op_type == OBJ_OP_WRITE) ? 2 : 1, 25272fa12320SAlex Elder obj_request); 25282fa12320SAlex Elder if (!osd_req) 252962054da6SIlya Dryomov goto out_unwind; 25303b434a2aSJosh Durgin 25312fa12320SAlex Elder obj_request->osd_req = osd_req; 25322169238dSAlex Elder obj_request->callback = rbd_img_obj_callback; 25337da22d29SAlex Elder obj_request->img_offset = img_offset; 2534bf0d5f50SAlex Elder 25353b434a2aSJosh Durgin rbd_img_obj_request_fill(obj_request, osd_req, op_type, 0); 25363b434a2aSJosh Durgin 25373b434a2aSJosh Durgin rbd_img_request_get(img_request); 25383b434a2aSJosh Durgin 25397da22d29SAlex Elder img_offset += length; 2540bf0d5f50SAlex Elder resid -= length; 2541bf0d5f50SAlex Elder } 2542bf0d5f50SAlex Elder 2543bf0d5f50SAlex Elder return 0; 2544bf0d5f50SAlex Elder 2545bf0d5f50SAlex Elder out_unwind: 2546bf0d5f50SAlex Elder for_each_obj_request_safe(img_request, obj_request, next_obj_request) 254742dd037cSIlya Dryomov rbd_img_obj_request_del(img_request, obj_request); 2548bf0d5f50SAlex Elder 2549bf0d5f50SAlex Elder return -ENOMEM; 2550bf0d5f50SAlex Elder } 2551bf0d5f50SAlex Elder 25523d7efd18SAlex Elder static void 25532761713dSIlya Dryomov rbd_osd_copyup_callback(struct rbd_obj_request *obj_request) 25540eefd470SAlex Elder { 25550eefd470SAlex Elder struct rbd_img_request *img_request; 25560eefd470SAlex Elder struct rbd_device *rbd_dev; 2557ebda6408SAlex Elder struct page **pages; 25580eefd470SAlex Elder u32 page_count; 25590eefd470SAlex Elder 25602761713dSIlya Dryomov dout("%s: obj %p\n", __func__, obj_request); 25612761713dSIlya Dryomov 2562d3246fb0SJosh Durgin rbd_assert(obj_request->type == OBJ_REQUEST_BIO || 2563d3246fb0SJosh Durgin obj_request->type == OBJ_REQUEST_NODATA); 25640eefd470SAlex Elder rbd_assert(obj_request_img_data_test(obj_request)); 25650eefd470SAlex Elder img_request = obj_request->img_request; 25660eefd470SAlex Elder rbd_assert(img_request); 25670eefd470SAlex Elder 25680eefd470SAlex Elder rbd_dev = 
img_request->rbd_dev; 25690eefd470SAlex Elder rbd_assert(rbd_dev); 25700eefd470SAlex Elder 2571ebda6408SAlex Elder pages = obj_request->copyup_pages; 2572ebda6408SAlex Elder rbd_assert(pages != NULL); 25730eefd470SAlex Elder obj_request->copyup_pages = NULL; 2574ebda6408SAlex Elder page_count = obj_request->copyup_page_count; 2575ebda6408SAlex Elder rbd_assert(page_count); 2576ebda6408SAlex Elder obj_request->copyup_page_count = 0; 2577ebda6408SAlex Elder ceph_release_page_vector(pages, page_count); 25780eefd470SAlex Elder 25790eefd470SAlex Elder /* 25800eefd470SAlex Elder * We want the transfer count to reflect the size of the 25810eefd470SAlex Elder * original write request. There is no such thing as a 25820eefd470SAlex Elder * successful short write, so if the request was successful 25830eefd470SAlex Elder * we can just set it to the originally-requested length. 25840eefd470SAlex Elder */ 25850eefd470SAlex Elder if (!obj_request->result) 25860eefd470SAlex Elder obj_request->xferred = obj_request->length; 25870eefd470SAlex Elder 25882761713dSIlya Dryomov obj_request_done_set(obj_request); 25890eefd470SAlex Elder } 25900eefd470SAlex Elder 25910eefd470SAlex Elder static void 25923d7efd18SAlex Elder rbd_img_obj_parent_read_full_callback(struct rbd_img_request *img_request) 25933d7efd18SAlex Elder { 25943d7efd18SAlex Elder struct rbd_obj_request *orig_request; 25950eefd470SAlex Elder struct ceph_osd_request *osd_req; 25960eefd470SAlex Elder struct ceph_osd_client *osdc; 25970eefd470SAlex Elder struct rbd_device *rbd_dev; 25983d7efd18SAlex Elder struct page **pages; 2599d3246fb0SJosh Durgin enum obj_operation_type op_type; 2600ebda6408SAlex Elder u32 page_count; 2601bbea1c1aSAlex Elder int img_result; 2602ebda6408SAlex Elder u64 parent_length; 26033d7efd18SAlex Elder 26043d7efd18SAlex Elder rbd_assert(img_request_child_test(img_request)); 26053d7efd18SAlex Elder 26063d7efd18SAlex Elder /* First get what we need from the image request */ 26073d7efd18SAlex Elder 
26083d7efd18SAlex Elder pages = img_request->copyup_pages; 26093d7efd18SAlex Elder rbd_assert(pages != NULL); 26103d7efd18SAlex Elder img_request->copyup_pages = NULL; 2611ebda6408SAlex Elder page_count = img_request->copyup_page_count; 2612ebda6408SAlex Elder rbd_assert(page_count); 2613ebda6408SAlex Elder img_request->copyup_page_count = 0; 26143d7efd18SAlex Elder 26153d7efd18SAlex Elder orig_request = img_request->obj_request; 26163d7efd18SAlex Elder rbd_assert(orig_request != NULL); 2617b91f09f1SAlex Elder rbd_assert(obj_request_type_valid(orig_request->type)); 2618bbea1c1aSAlex Elder img_result = img_request->result; 2619ebda6408SAlex Elder parent_length = img_request->length; 2620ebda6408SAlex Elder rbd_assert(parent_length == img_request->xferred); 26213d7efd18SAlex Elder rbd_img_request_put(img_request); 26223d7efd18SAlex Elder 262391c6febbSAlex Elder rbd_assert(orig_request->img_request); 262491c6febbSAlex Elder rbd_dev = orig_request->img_request->rbd_dev; 26253d7efd18SAlex Elder rbd_assert(rbd_dev); 26263d7efd18SAlex Elder 2627bbea1c1aSAlex Elder /* 2628bbea1c1aSAlex Elder * If the overlap has become 0 (most likely because the 2629bbea1c1aSAlex Elder * image has been flattened) we need to free the pages 2630bbea1c1aSAlex Elder * and re-submit the original write request. 
2631bbea1c1aSAlex Elder */ 2632bbea1c1aSAlex Elder if (!rbd_dev->parent_overlap) { 2633bbea1c1aSAlex Elder struct ceph_osd_client *osdc; 2634bbea1c1aSAlex Elder 2635bbea1c1aSAlex Elder ceph_release_page_vector(pages, page_count); 2636bbea1c1aSAlex Elder osdc = &rbd_dev->rbd_client->client->osdc; 2637bbea1c1aSAlex Elder img_result = rbd_obj_request_submit(osdc, orig_request); 2638bbea1c1aSAlex Elder if (!img_result) 2639bbea1c1aSAlex Elder return; 2640bbea1c1aSAlex Elder } 2641bbea1c1aSAlex Elder 2642bbea1c1aSAlex Elder if (img_result) 26430eefd470SAlex Elder goto out_err; 26443d7efd18SAlex Elder 26458785b1d4SAlex Elder /* 26468785b1d4SAlex Elder * The original osd request is of no use to use any more. 26470ccd5926SIlya Dryomov * We need a new one that can hold the three ops in a copyup 26488785b1d4SAlex Elder * request. Allocate the new copyup osd request for the 26498785b1d4SAlex Elder * original request, and release the old one. 26508785b1d4SAlex Elder */ 2651bbea1c1aSAlex Elder img_result = -ENOMEM; 26520eefd470SAlex Elder osd_req = rbd_osd_req_create_copyup(orig_request); 26530eefd470SAlex Elder if (!osd_req) 26540eefd470SAlex Elder goto out_err; 26558785b1d4SAlex Elder rbd_osd_req_destroy(orig_request->osd_req); 26560eefd470SAlex Elder orig_request->osd_req = osd_req; 26570eefd470SAlex Elder orig_request->copyup_pages = pages; 2658ebda6408SAlex Elder orig_request->copyup_page_count = page_count; 26593d7efd18SAlex Elder 26600eefd470SAlex Elder /* Initialize the copyup op */ 26610eefd470SAlex Elder 26620eefd470SAlex Elder osd_req_op_cls_init(osd_req, 0, CEPH_OSD_OP_CALL, "rbd", "copyup"); 2663ebda6408SAlex Elder osd_req_op_cls_request_data_pages(osd_req, 0, pages, parent_length, 0, 26640eefd470SAlex Elder false, false); 26650eefd470SAlex Elder 2666d3246fb0SJosh Durgin /* Add the other op(s) */ 26670ccd5926SIlya Dryomov 2668d3246fb0SJosh Durgin op_type = rbd_img_request_op_type(orig_request->img_request); 2669d3246fb0SJosh Durgin 
rbd_img_obj_request_fill(orig_request, osd_req, op_type, 1); 26700eefd470SAlex Elder 26710eefd470SAlex Elder /* All set, send it off. */ 26720eefd470SAlex Elder 26730eefd470SAlex Elder osdc = &rbd_dev->rbd_client->client->osdc; 2674bbea1c1aSAlex Elder img_result = rbd_obj_request_submit(osdc, orig_request); 2675bbea1c1aSAlex Elder if (!img_result) 26760eefd470SAlex Elder return; 26770eefd470SAlex Elder out_err: 26780eefd470SAlex Elder /* Record the error code and complete the request */ 26790eefd470SAlex Elder 2680bbea1c1aSAlex Elder orig_request->result = img_result; 26810eefd470SAlex Elder orig_request->xferred = 0; 26823d7efd18SAlex Elder obj_request_done_set(orig_request); 26833d7efd18SAlex Elder rbd_obj_request_complete(orig_request); 26843d7efd18SAlex Elder } 26853d7efd18SAlex Elder 26863d7efd18SAlex Elder /* 26873d7efd18SAlex Elder * Read from the parent image the range of data that covers the 26883d7efd18SAlex Elder * entire target of the given object request. This is used for 26893d7efd18SAlex Elder * satisfying a layered image write request when the target of an 26903d7efd18SAlex Elder * object request from the image request does not exist. 26913d7efd18SAlex Elder * 26923d7efd18SAlex Elder * A page array big enough to hold the returned data is allocated 26933d7efd18SAlex Elder * and supplied to rbd_img_request_fill() as the "data descriptor." 26943d7efd18SAlex Elder * When the read completes, this page array will be transferred to 26953d7efd18SAlex Elder * the original object request for the copyup operation. 26963d7efd18SAlex Elder * 26973d7efd18SAlex Elder * If an error occurs, record it as the result of the original 26983d7efd18SAlex Elder * object request and mark it done so it gets completed. 
26993d7efd18SAlex Elder */ 27003d7efd18SAlex Elder static int rbd_img_obj_parent_read_full(struct rbd_obj_request *obj_request) 27013d7efd18SAlex Elder { 27023d7efd18SAlex Elder struct rbd_img_request *img_request = NULL; 27033d7efd18SAlex Elder struct rbd_img_request *parent_request = NULL; 27043d7efd18SAlex Elder struct rbd_device *rbd_dev; 27053d7efd18SAlex Elder u64 img_offset; 27063d7efd18SAlex Elder u64 length; 27073d7efd18SAlex Elder struct page **pages = NULL; 27083d7efd18SAlex Elder u32 page_count; 27093d7efd18SAlex Elder int result; 27103d7efd18SAlex Elder 27113d7efd18SAlex Elder rbd_assert(obj_request_img_data_test(obj_request)); 2712b91f09f1SAlex Elder rbd_assert(obj_request_type_valid(obj_request->type)); 27133d7efd18SAlex Elder 27143d7efd18SAlex Elder img_request = obj_request->img_request; 27153d7efd18SAlex Elder rbd_assert(img_request != NULL); 27163d7efd18SAlex Elder rbd_dev = img_request->rbd_dev; 27173d7efd18SAlex Elder rbd_assert(rbd_dev->parent != NULL); 27183d7efd18SAlex Elder 27193d7efd18SAlex Elder /* 27203d7efd18SAlex Elder * Determine the byte range covered by the object in the 27213d7efd18SAlex Elder * child image to which the original request was to be sent. 27223d7efd18SAlex Elder */ 27233d7efd18SAlex Elder img_offset = obj_request->img_offset - obj_request->offset; 27243d7efd18SAlex Elder length = (u64)1 << rbd_dev->header.obj_order; 27253d7efd18SAlex Elder 27263d7efd18SAlex Elder /* 2727a9e8ba2cSAlex Elder * There is no defined parent data beyond the parent 2728a9e8ba2cSAlex Elder * overlap, so limit what we read at that boundary if 2729a9e8ba2cSAlex Elder * necessary. 
2730a9e8ba2cSAlex Elder */ 2731a9e8ba2cSAlex Elder if (img_offset + length > rbd_dev->parent_overlap) { 2732a9e8ba2cSAlex Elder rbd_assert(img_offset < rbd_dev->parent_overlap); 2733a9e8ba2cSAlex Elder length = rbd_dev->parent_overlap - img_offset; 2734a9e8ba2cSAlex Elder } 2735a9e8ba2cSAlex Elder 2736a9e8ba2cSAlex Elder /* 27373d7efd18SAlex Elder * Allocate a page array big enough to receive the data read 27383d7efd18SAlex Elder * from the parent. 27393d7efd18SAlex Elder */ 27403d7efd18SAlex Elder page_count = (u32)calc_pages_for(0, length); 27413d7efd18SAlex Elder pages = ceph_alloc_page_vector(page_count, GFP_KERNEL); 27423d7efd18SAlex Elder if (IS_ERR(pages)) { 27433d7efd18SAlex Elder result = PTR_ERR(pages); 27443d7efd18SAlex Elder pages = NULL; 27453d7efd18SAlex Elder goto out_err; 27463d7efd18SAlex Elder } 27473d7efd18SAlex Elder 27483d7efd18SAlex Elder result = -ENOMEM; 2749e93f3152SAlex Elder parent_request = rbd_parent_request_create(obj_request, 2750e93f3152SAlex Elder img_offset, length); 27513d7efd18SAlex Elder if (!parent_request) 27523d7efd18SAlex Elder goto out_err; 27533d7efd18SAlex Elder 27543d7efd18SAlex Elder result = rbd_img_request_fill(parent_request, OBJ_REQUEST_PAGES, pages); 27553d7efd18SAlex Elder if (result) 27563d7efd18SAlex Elder goto out_err; 27573d7efd18SAlex Elder parent_request->copyup_pages = pages; 2758ebda6408SAlex Elder parent_request->copyup_page_count = page_count; 27593d7efd18SAlex Elder 27603d7efd18SAlex Elder parent_request->callback = rbd_img_obj_parent_read_full_callback; 27613d7efd18SAlex Elder result = rbd_img_request_submit(parent_request); 27623d7efd18SAlex Elder if (!result) 27633d7efd18SAlex Elder return 0; 27643d7efd18SAlex Elder 27653d7efd18SAlex Elder parent_request->copyup_pages = NULL; 2766ebda6408SAlex Elder parent_request->copyup_page_count = 0; 27673d7efd18SAlex Elder parent_request->obj_request = NULL; 27683d7efd18SAlex Elder rbd_obj_request_put(obj_request); 27693d7efd18SAlex Elder out_err: 
27703d7efd18SAlex Elder if (pages) 27713d7efd18SAlex Elder ceph_release_page_vector(pages, page_count); 27723d7efd18SAlex Elder if (parent_request) 27733d7efd18SAlex Elder rbd_img_request_put(parent_request); 27743d7efd18SAlex Elder obj_request->result = result; 27753d7efd18SAlex Elder obj_request->xferred = 0; 27763d7efd18SAlex Elder obj_request_done_set(obj_request); 27773d7efd18SAlex Elder 27783d7efd18SAlex Elder return result; 27793d7efd18SAlex Elder } 27803d7efd18SAlex Elder 2781c5b5ef6cSAlex Elder static void rbd_img_obj_exists_callback(struct rbd_obj_request *obj_request) 2782c5b5ef6cSAlex Elder { 2783c5b5ef6cSAlex Elder struct rbd_obj_request *orig_request; 2784638f5abeSAlex Elder struct rbd_device *rbd_dev; 2785c5b5ef6cSAlex Elder int result; 2786c5b5ef6cSAlex Elder 2787c5b5ef6cSAlex Elder rbd_assert(!obj_request_img_data_test(obj_request)); 2788c5b5ef6cSAlex Elder 2789c5b5ef6cSAlex Elder /* 2790c5b5ef6cSAlex Elder * All we need from the object request is the original 2791c5b5ef6cSAlex Elder * request and the result of the STAT op. Grab those, then 2792c5b5ef6cSAlex Elder * we're done with the request. 
2793c5b5ef6cSAlex Elder */ 2794c5b5ef6cSAlex Elder orig_request = obj_request->obj_request; 2795c5b5ef6cSAlex Elder obj_request->obj_request = NULL; 2796912c317dSAlex Elder rbd_obj_request_put(orig_request); 2797c5b5ef6cSAlex Elder rbd_assert(orig_request); 2798c5b5ef6cSAlex Elder rbd_assert(orig_request->img_request); 2799c5b5ef6cSAlex Elder 2800c5b5ef6cSAlex Elder result = obj_request->result; 2801c5b5ef6cSAlex Elder obj_request->result = 0; 2802c5b5ef6cSAlex Elder 2803c5b5ef6cSAlex Elder dout("%s: obj %p for obj %p result %d %llu/%llu\n", __func__, 2804c5b5ef6cSAlex Elder obj_request, orig_request, result, 2805c5b5ef6cSAlex Elder obj_request->xferred, obj_request->length); 2806c5b5ef6cSAlex Elder rbd_obj_request_put(obj_request); 2807c5b5ef6cSAlex Elder 2808638f5abeSAlex Elder /* 2809638f5abeSAlex Elder * If the overlap has become 0 (most likely because the 2810638f5abeSAlex Elder * image has been flattened) we need to free the pages 2811638f5abeSAlex Elder * and re-submit the original write request. 2812638f5abeSAlex Elder */ 2813638f5abeSAlex Elder rbd_dev = orig_request->img_request->rbd_dev; 2814638f5abeSAlex Elder if (!rbd_dev->parent_overlap) { 2815638f5abeSAlex Elder struct ceph_osd_client *osdc; 2816638f5abeSAlex Elder 2817638f5abeSAlex Elder osdc = &rbd_dev->rbd_client->client->osdc; 2818638f5abeSAlex Elder result = rbd_obj_request_submit(osdc, orig_request); 2819638f5abeSAlex Elder if (!result) 2820638f5abeSAlex Elder return; 2821638f5abeSAlex Elder } 2822c5b5ef6cSAlex Elder 2823c5b5ef6cSAlex Elder /* 2824c5b5ef6cSAlex Elder * Our only purpose here is to determine whether the object 2825c5b5ef6cSAlex Elder * exists, and we don't want to treat the non-existence as 2826c5b5ef6cSAlex Elder * an error. If something else comes back, transfer the 2827c5b5ef6cSAlex Elder * error to the original request and complete it now. 
2828c5b5ef6cSAlex Elder */ 2829c5b5ef6cSAlex Elder if (!result) { 2830c5b5ef6cSAlex Elder obj_request_existence_set(orig_request, true); 2831c5b5ef6cSAlex Elder } else if (result == -ENOENT) { 2832c5b5ef6cSAlex Elder obj_request_existence_set(orig_request, false); 2833c5b5ef6cSAlex Elder } else if (result) { 2834c5b5ef6cSAlex Elder orig_request->result = result; 28353d7efd18SAlex Elder goto out; 2836c5b5ef6cSAlex Elder } 2837c5b5ef6cSAlex Elder 2838c5b5ef6cSAlex Elder /* 2839c5b5ef6cSAlex Elder * Resubmit the original request now that we have recorded 2840c5b5ef6cSAlex Elder * whether the target object exists. 2841c5b5ef6cSAlex Elder */ 2842b454e36dSAlex Elder orig_request->result = rbd_img_obj_request_submit(orig_request); 28433d7efd18SAlex Elder out: 2844c5b5ef6cSAlex Elder if (orig_request->result) 2845c5b5ef6cSAlex Elder rbd_obj_request_complete(orig_request); 2846c5b5ef6cSAlex Elder } 2847c5b5ef6cSAlex Elder 2848c5b5ef6cSAlex Elder static int rbd_img_obj_exists_submit(struct rbd_obj_request *obj_request) 2849c5b5ef6cSAlex Elder { 2850c5b5ef6cSAlex Elder struct rbd_obj_request *stat_request; 2851c5b5ef6cSAlex Elder struct rbd_device *rbd_dev; 2852c5b5ef6cSAlex Elder struct ceph_osd_client *osdc; 2853c5b5ef6cSAlex Elder struct page **pages = NULL; 2854c5b5ef6cSAlex Elder u32 page_count; 2855c5b5ef6cSAlex Elder size_t size; 2856c5b5ef6cSAlex Elder int ret; 2857c5b5ef6cSAlex Elder 2858c5b5ef6cSAlex Elder /* 2859c5b5ef6cSAlex Elder * The response data for a STAT call consists of: 2860c5b5ef6cSAlex Elder * le64 length; 2861c5b5ef6cSAlex Elder * struct { 2862c5b5ef6cSAlex Elder * le32 tv_sec; 2863c5b5ef6cSAlex Elder * le32 tv_nsec; 2864c5b5ef6cSAlex Elder * } mtime; 2865c5b5ef6cSAlex Elder */ 2866c5b5ef6cSAlex Elder size = sizeof (__le64) + sizeof (__le32) + sizeof (__le32); 2867c5b5ef6cSAlex Elder page_count = (u32)calc_pages_for(0, size); 2868c5b5ef6cSAlex Elder pages = ceph_alloc_page_vector(page_count, GFP_KERNEL); 2869c5b5ef6cSAlex Elder if (IS_ERR(pages)) 
2870c5b5ef6cSAlex Elder return PTR_ERR(pages); 2871c5b5ef6cSAlex Elder 2872c5b5ef6cSAlex Elder ret = -ENOMEM; 2873c5b5ef6cSAlex Elder stat_request = rbd_obj_request_create(obj_request->object_name, 0, 0, 2874c5b5ef6cSAlex Elder OBJ_REQUEST_PAGES); 2875c5b5ef6cSAlex Elder if (!stat_request) 2876c5b5ef6cSAlex Elder goto out; 2877c5b5ef6cSAlex Elder 2878c5b5ef6cSAlex Elder rbd_obj_request_get(obj_request); 2879c5b5ef6cSAlex Elder stat_request->obj_request = obj_request; 2880c5b5ef6cSAlex Elder stat_request->pages = pages; 2881c5b5ef6cSAlex Elder stat_request->page_count = page_count; 2882c5b5ef6cSAlex Elder 2883c5b5ef6cSAlex Elder rbd_assert(obj_request->img_request); 2884c5b5ef6cSAlex Elder rbd_dev = obj_request->img_request->rbd_dev; 28856d2940c8SGuangliang Zhao stat_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_READ, 1, 2886c5b5ef6cSAlex Elder stat_request); 2887c5b5ef6cSAlex Elder if (!stat_request->osd_req) 2888c5b5ef6cSAlex Elder goto out; 2889c5b5ef6cSAlex Elder stat_request->callback = rbd_img_obj_exists_callback; 2890c5b5ef6cSAlex Elder 2891144cba14SYan, Zheng osd_req_op_init(stat_request->osd_req, 0, CEPH_OSD_OP_STAT, 0); 2892c5b5ef6cSAlex Elder osd_req_op_raw_data_in_pages(stat_request->osd_req, 0, pages, size, 0, 2893c5b5ef6cSAlex Elder false, false); 28949d4df01fSAlex Elder rbd_osd_req_format_read(stat_request); 2895c5b5ef6cSAlex Elder 2896c5b5ef6cSAlex Elder osdc = &rbd_dev->rbd_client->client->osdc; 2897c5b5ef6cSAlex Elder ret = rbd_obj_request_submit(osdc, stat_request); 2898c5b5ef6cSAlex Elder out: 2899c5b5ef6cSAlex Elder if (ret) 2900c5b5ef6cSAlex Elder rbd_obj_request_put(obj_request); 2901c5b5ef6cSAlex Elder 2902c5b5ef6cSAlex Elder return ret; 2903c5b5ef6cSAlex Elder } 2904c5b5ef6cSAlex Elder 290570d045f6SIlya Dryomov static bool img_obj_request_simple(struct rbd_obj_request *obj_request) 2906b454e36dSAlex Elder { 2907b454e36dSAlex Elder struct rbd_img_request *img_request; 2908a9e8ba2cSAlex Elder struct rbd_device *rbd_dev; 
2909b454e36dSAlex Elder 2910b454e36dSAlex Elder rbd_assert(obj_request_img_data_test(obj_request)); 2911b454e36dSAlex Elder 2912b454e36dSAlex Elder img_request = obj_request->img_request; 2913b454e36dSAlex Elder rbd_assert(img_request); 2914a9e8ba2cSAlex Elder rbd_dev = img_request->rbd_dev; 2915b454e36dSAlex Elder 291670d045f6SIlya Dryomov /* Reads */ 29171c220881SJosh Durgin if (!img_request_write_test(img_request) && 29181c220881SJosh Durgin !img_request_discard_test(img_request)) 291970d045f6SIlya Dryomov return true; 2920b454e36dSAlex Elder 292170d045f6SIlya Dryomov /* Non-layered writes */ 292270d045f6SIlya Dryomov if (!img_request_layered_test(img_request)) 292370d045f6SIlya Dryomov return true; 292470d045f6SIlya Dryomov 292570d045f6SIlya Dryomov /* 292670d045f6SIlya Dryomov * Layered writes outside of the parent overlap range don't 292770d045f6SIlya Dryomov * share any data with the parent. 292870d045f6SIlya Dryomov */ 292970d045f6SIlya Dryomov if (!obj_request_overlaps_parent(obj_request)) 293070d045f6SIlya Dryomov return true; 293170d045f6SIlya Dryomov 293270d045f6SIlya Dryomov /* 2933c622d226SGuangliang Zhao * Entire-object layered writes - we will overwrite whatever 2934c622d226SGuangliang Zhao * parent data there is anyway. 2935c622d226SGuangliang Zhao */ 2936c622d226SGuangliang Zhao if (!obj_request->offset && 2937c622d226SGuangliang Zhao obj_request->length == rbd_obj_bytes(&rbd_dev->header)) 2938c622d226SGuangliang Zhao return true; 2939c622d226SGuangliang Zhao 2940c622d226SGuangliang Zhao /* 294170d045f6SIlya Dryomov * If the object is known to already exist, its parent data has 294270d045f6SIlya Dryomov * already been copied. 
294370d045f6SIlya Dryomov */ 294470d045f6SIlya Dryomov if (obj_request_known_test(obj_request) && 294570d045f6SIlya Dryomov obj_request_exists_test(obj_request)) 294670d045f6SIlya Dryomov return true; 294770d045f6SIlya Dryomov 294870d045f6SIlya Dryomov return false; 294970d045f6SIlya Dryomov } 295070d045f6SIlya Dryomov 295170d045f6SIlya Dryomov static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request) 295270d045f6SIlya Dryomov { 295370d045f6SIlya Dryomov if (img_obj_request_simple(obj_request)) { 2954b454e36dSAlex Elder struct rbd_device *rbd_dev; 2955b454e36dSAlex Elder struct ceph_osd_client *osdc; 2956b454e36dSAlex Elder 2957b454e36dSAlex Elder rbd_dev = obj_request->img_request->rbd_dev; 2958b454e36dSAlex Elder osdc = &rbd_dev->rbd_client->client->osdc; 2959b454e36dSAlex Elder 2960b454e36dSAlex Elder return rbd_obj_request_submit(osdc, obj_request); 2961b454e36dSAlex Elder } 2962b454e36dSAlex Elder 2963b454e36dSAlex Elder /* 29643d7efd18SAlex Elder * It's a layered write. The target object might exist but 29653d7efd18SAlex Elder * we may not know that yet. If we know it doesn't exist, 29663d7efd18SAlex Elder * start by reading the data for the full target object from 29673d7efd18SAlex Elder * the parent so we can use it for a copyup to the target. 2968b454e36dSAlex Elder */ 296970d045f6SIlya Dryomov if (obj_request_known_test(obj_request)) 29703d7efd18SAlex Elder return rbd_img_obj_parent_read_full(obj_request); 29713d7efd18SAlex Elder 29723d7efd18SAlex Elder /* We don't know whether the target exists. Go find out. 
*/ 2973b454e36dSAlex Elder 2974b454e36dSAlex Elder return rbd_img_obj_exists_submit(obj_request); 2975b454e36dSAlex Elder } 2976b454e36dSAlex Elder 2977bf0d5f50SAlex Elder static int rbd_img_request_submit(struct rbd_img_request *img_request) 2978bf0d5f50SAlex Elder { 2979bf0d5f50SAlex Elder struct rbd_obj_request *obj_request; 298046faeed4SAlex Elder struct rbd_obj_request *next_obj_request; 2981bf0d5f50SAlex Elder 298237206ee5SAlex Elder dout("%s: img %p\n", __func__, img_request); 298346faeed4SAlex Elder for_each_obj_request_safe(img_request, obj_request, next_obj_request) { 2984bf0d5f50SAlex Elder int ret; 2985bf0d5f50SAlex Elder 2986b454e36dSAlex Elder ret = rbd_img_obj_request_submit(obj_request); 2987bf0d5f50SAlex Elder if (ret) 2988bf0d5f50SAlex Elder return ret; 2989bf0d5f50SAlex Elder } 2990bf0d5f50SAlex Elder 2991bf0d5f50SAlex Elder return 0; 2992bf0d5f50SAlex Elder } 2993bf0d5f50SAlex Elder 29948b3e1a56SAlex Elder static void rbd_img_parent_read_callback(struct rbd_img_request *img_request) 29958b3e1a56SAlex Elder { 29968b3e1a56SAlex Elder struct rbd_obj_request *obj_request; 2997a9e8ba2cSAlex Elder struct rbd_device *rbd_dev; 2998a9e8ba2cSAlex Elder u64 obj_end; 299902c74fbaSAlex Elder u64 img_xferred; 300002c74fbaSAlex Elder int img_result; 30018b3e1a56SAlex Elder 30028b3e1a56SAlex Elder rbd_assert(img_request_child_test(img_request)); 30038b3e1a56SAlex Elder 300402c74fbaSAlex Elder /* First get what we need from the image request and release it */ 300502c74fbaSAlex Elder 30068b3e1a56SAlex Elder obj_request = img_request->obj_request; 300702c74fbaSAlex Elder img_xferred = img_request->xferred; 300802c74fbaSAlex Elder img_result = img_request->result; 300902c74fbaSAlex Elder rbd_img_request_put(img_request); 301002c74fbaSAlex Elder 301102c74fbaSAlex Elder /* 301202c74fbaSAlex Elder * If the overlap has become 0 (most likely because the 301302c74fbaSAlex Elder * image has been flattened) we need to re-submit the 301402c74fbaSAlex Elder * original 
request. 301502c74fbaSAlex Elder */ 3016a9e8ba2cSAlex Elder rbd_assert(obj_request); 3017a9e8ba2cSAlex Elder rbd_assert(obj_request->img_request); 301802c74fbaSAlex Elder rbd_dev = obj_request->img_request->rbd_dev; 301902c74fbaSAlex Elder if (!rbd_dev->parent_overlap) { 302002c74fbaSAlex Elder struct ceph_osd_client *osdc; 30218b3e1a56SAlex Elder 302202c74fbaSAlex Elder osdc = &rbd_dev->rbd_client->client->osdc; 302302c74fbaSAlex Elder img_result = rbd_obj_request_submit(osdc, obj_request); 302402c74fbaSAlex Elder if (!img_result) 302502c74fbaSAlex Elder return; 302602c74fbaSAlex Elder } 302702c74fbaSAlex Elder 302802c74fbaSAlex Elder obj_request->result = img_result; 3029a9e8ba2cSAlex Elder if (obj_request->result) 3030a9e8ba2cSAlex Elder goto out; 3031a9e8ba2cSAlex Elder 3032a9e8ba2cSAlex Elder /* 3033a9e8ba2cSAlex Elder * We need to zero anything beyond the parent overlap 3034a9e8ba2cSAlex Elder * boundary. Since rbd_img_obj_request_read_callback() 3035a9e8ba2cSAlex Elder * will zero anything beyond the end of a short read, an 3036a9e8ba2cSAlex Elder * easy way to do this is to pretend the data from the 3037a9e8ba2cSAlex Elder * parent came up short--ending at the overlap boundary. 
3038a9e8ba2cSAlex Elder */ 3039a9e8ba2cSAlex Elder rbd_assert(obj_request->img_offset < U64_MAX - obj_request->length); 3040a9e8ba2cSAlex Elder obj_end = obj_request->img_offset + obj_request->length; 3041a9e8ba2cSAlex Elder if (obj_end > rbd_dev->parent_overlap) { 3042a9e8ba2cSAlex Elder u64 xferred = 0; 3043a9e8ba2cSAlex Elder 3044a9e8ba2cSAlex Elder if (obj_request->img_offset < rbd_dev->parent_overlap) 3045a9e8ba2cSAlex Elder xferred = rbd_dev->parent_overlap - 3046a9e8ba2cSAlex Elder obj_request->img_offset; 3047a9e8ba2cSAlex Elder 304802c74fbaSAlex Elder obj_request->xferred = min(img_xferred, xferred); 3049a9e8ba2cSAlex Elder } else { 305002c74fbaSAlex Elder obj_request->xferred = img_xferred; 3051a9e8ba2cSAlex Elder } 3052a9e8ba2cSAlex Elder out: 30538b3e1a56SAlex Elder rbd_img_obj_request_read_callback(obj_request); 30548b3e1a56SAlex Elder rbd_obj_request_complete(obj_request); 30558b3e1a56SAlex Elder } 30568b3e1a56SAlex Elder 30578b3e1a56SAlex Elder static void rbd_img_parent_read(struct rbd_obj_request *obj_request) 30588b3e1a56SAlex Elder { 30598b3e1a56SAlex Elder struct rbd_img_request *img_request; 30608b3e1a56SAlex Elder int result; 30618b3e1a56SAlex Elder 30628b3e1a56SAlex Elder rbd_assert(obj_request_img_data_test(obj_request)); 30638b3e1a56SAlex Elder rbd_assert(obj_request->img_request != NULL); 30648b3e1a56SAlex Elder rbd_assert(obj_request->result == (s32) -ENOENT); 30655b2ab72dSAlex Elder rbd_assert(obj_request_type_valid(obj_request->type)); 30668b3e1a56SAlex Elder 30678b3e1a56SAlex Elder /* rbd_read_finish(obj_request, obj_request->length); */ 3068e93f3152SAlex Elder img_request = rbd_parent_request_create(obj_request, 30698b3e1a56SAlex Elder obj_request->img_offset, 3070e93f3152SAlex Elder obj_request->length); 30718b3e1a56SAlex Elder result = -ENOMEM; 30728b3e1a56SAlex Elder if (!img_request) 30738b3e1a56SAlex Elder goto out_err; 30748b3e1a56SAlex Elder 30755b2ab72dSAlex Elder if (obj_request->type == OBJ_REQUEST_BIO) 3076f1a4739fSAlex 
Elder result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO, 3077f1a4739fSAlex Elder obj_request->bio_list); 30785b2ab72dSAlex Elder else 30795b2ab72dSAlex Elder result = rbd_img_request_fill(img_request, OBJ_REQUEST_PAGES, 30805b2ab72dSAlex Elder obj_request->pages); 30818b3e1a56SAlex Elder if (result) 30828b3e1a56SAlex Elder goto out_err; 30838b3e1a56SAlex Elder 30848b3e1a56SAlex Elder img_request->callback = rbd_img_parent_read_callback; 30858b3e1a56SAlex Elder result = rbd_img_request_submit(img_request); 30868b3e1a56SAlex Elder if (result) 30878b3e1a56SAlex Elder goto out_err; 30888b3e1a56SAlex Elder 30898b3e1a56SAlex Elder return; 30908b3e1a56SAlex Elder out_err: 30918b3e1a56SAlex Elder if (img_request) 30928b3e1a56SAlex Elder rbd_img_request_put(img_request); 30938b3e1a56SAlex Elder obj_request->result = result; 30948b3e1a56SAlex Elder obj_request->xferred = 0; 30958b3e1a56SAlex Elder obj_request_done_set(obj_request); 30968b3e1a56SAlex Elder } 30978b3e1a56SAlex Elder 309820e0af67SJosh Durgin static int rbd_obj_notify_ack_sync(struct rbd_device *rbd_dev, u64 notify_id) 3099b8d70035SAlex Elder { 3100b8d70035SAlex Elder struct rbd_obj_request *obj_request; 31012169238dSAlex Elder struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 3102b8d70035SAlex Elder int ret; 3103b8d70035SAlex Elder 3104b8d70035SAlex Elder obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0, 3105b8d70035SAlex Elder OBJ_REQUEST_NODATA); 3106b8d70035SAlex Elder if (!obj_request) 3107b8d70035SAlex Elder return -ENOMEM; 3108b8d70035SAlex Elder 3109b8d70035SAlex Elder ret = -ENOMEM; 31106d2940c8SGuangliang Zhao obj_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_READ, 1, 3111deb236b3SIlya Dryomov obj_request); 3112b8d70035SAlex Elder if (!obj_request->osd_req) 3113b8d70035SAlex Elder goto out; 3114b8d70035SAlex Elder 3115c99d2d4aSAlex Elder osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_NOTIFY_ACK, 3116cc4a38bdSAlex Elder notify_id, 0, 0); 
31179d4df01fSAlex Elder rbd_osd_req_format_read(obj_request); 3118430c28c3SAlex Elder 3119b8d70035SAlex Elder ret = rbd_obj_request_submit(osdc, obj_request); 3120cf81b60eSAlex Elder if (ret) 312120e0af67SJosh Durgin goto out; 312220e0af67SJosh Durgin ret = rbd_obj_request_wait(obj_request); 312320e0af67SJosh Durgin out: 3124b8d70035SAlex Elder rbd_obj_request_put(obj_request); 3125b8d70035SAlex Elder 3126b8d70035SAlex Elder return ret; 3127b8d70035SAlex Elder } 3128b8d70035SAlex Elder 3129b8d70035SAlex Elder static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data) 3130b8d70035SAlex Elder { 3131b8d70035SAlex Elder struct rbd_device *rbd_dev = (struct rbd_device *)data; 3132e627db08SAlex Elder int ret; 3133b8d70035SAlex Elder 3134b8d70035SAlex Elder if (!rbd_dev) 3135b8d70035SAlex Elder return; 3136b8d70035SAlex Elder 313737206ee5SAlex Elder dout("%s: \"%s\" notify_id %llu opcode %u\n", __func__, 3138b8d70035SAlex Elder rbd_dev->header_name, (unsigned long long)notify_id, 3139b8d70035SAlex Elder (unsigned int)opcode); 314052bb1f9bSIlya Dryomov 314152bb1f9bSIlya Dryomov /* 314252bb1f9bSIlya Dryomov * Until adequate refresh error handling is in place, there is 314352bb1f9bSIlya Dryomov * not much we can do here, except warn. 314452bb1f9bSIlya Dryomov * 314552bb1f9bSIlya Dryomov * See http://tracker.ceph.com/issues/5040 314652bb1f9bSIlya Dryomov */ 3147e627db08SAlex Elder ret = rbd_dev_refresh(rbd_dev); 3148e627db08SAlex Elder if (ret) 31499584d508SIlya Dryomov rbd_warn(rbd_dev, "refresh failed: %d", ret); 3150b8d70035SAlex Elder 315152bb1f9bSIlya Dryomov ret = rbd_obj_notify_ack_sync(rbd_dev, notify_id); 315252bb1f9bSIlya Dryomov if (ret) 31539584d508SIlya Dryomov rbd_warn(rbd_dev, "notify_ack ret %d", ret); 3154b8d70035SAlex Elder } 3155b8d70035SAlex Elder 31569969ebc5SAlex Elder /* 3157bb040aa0SIlya Dryomov * Send a (un)watch request and wait for the ack. Return a request 3158bb040aa0SIlya Dryomov * with a ref held on success or error. 
3159bb040aa0SIlya Dryomov */ 3160bb040aa0SIlya Dryomov static struct rbd_obj_request *rbd_obj_watch_request_helper( 3161bb040aa0SIlya Dryomov struct rbd_device *rbd_dev, 3162bb040aa0SIlya Dryomov bool watch) 3163bb040aa0SIlya Dryomov { 3164bb040aa0SIlya Dryomov struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 31652894e1d7SIlya Dryomov struct ceph_options *opts = osdc->client->options; 3166bb040aa0SIlya Dryomov struct rbd_obj_request *obj_request; 3167bb040aa0SIlya Dryomov int ret; 3168bb040aa0SIlya Dryomov 3169bb040aa0SIlya Dryomov obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0, 3170bb040aa0SIlya Dryomov OBJ_REQUEST_NODATA); 3171bb040aa0SIlya Dryomov if (!obj_request) 3172bb040aa0SIlya Dryomov return ERR_PTR(-ENOMEM); 3173bb040aa0SIlya Dryomov 31746d2940c8SGuangliang Zhao obj_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_WRITE, 1, 3175bb040aa0SIlya Dryomov obj_request); 3176bb040aa0SIlya Dryomov if (!obj_request->osd_req) { 3177bb040aa0SIlya Dryomov ret = -ENOMEM; 3178bb040aa0SIlya Dryomov goto out; 3179bb040aa0SIlya Dryomov } 3180bb040aa0SIlya Dryomov 3181bb040aa0SIlya Dryomov osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_WATCH, 3182bb040aa0SIlya Dryomov rbd_dev->watch_event->cookie, 0, watch); 3183bb040aa0SIlya Dryomov rbd_osd_req_format_write(obj_request); 3184bb040aa0SIlya Dryomov 3185bb040aa0SIlya Dryomov if (watch) 3186bb040aa0SIlya Dryomov ceph_osdc_set_request_linger(osdc, obj_request->osd_req); 3187bb040aa0SIlya Dryomov 3188bb040aa0SIlya Dryomov ret = rbd_obj_request_submit(osdc, obj_request); 3189bb040aa0SIlya Dryomov if (ret) 3190bb040aa0SIlya Dryomov goto out; 3191bb040aa0SIlya Dryomov 31922894e1d7SIlya Dryomov ret = rbd_obj_request_wait_timeout(obj_request, opts->mount_timeout); 3193bb040aa0SIlya Dryomov if (ret) 3194bb040aa0SIlya Dryomov goto out; 3195bb040aa0SIlya Dryomov 3196bb040aa0SIlya Dryomov ret = obj_request->result; 3197bb040aa0SIlya Dryomov if (ret) { 3198bb040aa0SIlya Dryomov if 
(watch) 3199bb040aa0SIlya Dryomov rbd_obj_request_end(obj_request); 3200bb040aa0SIlya Dryomov goto out; 3201bb040aa0SIlya Dryomov } 3202bb040aa0SIlya Dryomov 3203bb040aa0SIlya Dryomov return obj_request; 3204bb040aa0SIlya Dryomov 3205bb040aa0SIlya Dryomov out: 3206bb040aa0SIlya Dryomov rbd_obj_request_put(obj_request); 3207bb040aa0SIlya Dryomov return ERR_PTR(ret); 3208bb040aa0SIlya Dryomov } 3209bb040aa0SIlya Dryomov 3210bb040aa0SIlya Dryomov /* 3211b30a01f2SIlya Dryomov * Initiate a watch request, synchronously. 32129969ebc5SAlex Elder */ 3213b30a01f2SIlya Dryomov static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev) 32149969ebc5SAlex Elder { 32159969ebc5SAlex Elder struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 32169969ebc5SAlex Elder struct rbd_obj_request *obj_request; 32179969ebc5SAlex Elder int ret; 32189969ebc5SAlex Elder 3219b30a01f2SIlya Dryomov rbd_assert(!rbd_dev->watch_event); 3220b30a01f2SIlya Dryomov rbd_assert(!rbd_dev->watch_request); 32219969ebc5SAlex Elder 32223c663bbdSAlex Elder ret = ceph_osdc_create_event(osdc, rbd_watch_cb, rbd_dev, 32239969ebc5SAlex Elder &rbd_dev->watch_event); 32249969ebc5SAlex Elder if (ret < 0) 32259969ebc5SAlex Elder return ret; 32269969ebc5SAlex Elder 322776756a51SIlya Dryomov obj_request = rbd_obj_watch_request_helper(rbd_dev, true); 322876756a51SIlya Dryomov if (IS_ERR(obj_request)) { 322976756a51SIlya Dryomov ceph_osdc_cancel_event(rbd_dev->watch_event); 323076756a51SIlya Dryomov rbd_dev->watch_event = NULL; 323176756a51SIlya Dryomov return PTR_ERR(obj_request); 3232b30a01f2SIlya Dryomov } 32339969ebc5SAlex Elder 32348eb87565SAlex Elder /* 32358eb87565SAlex Elder * A watch request is set to linger, so the underlying osd 32368eb87565SAlex Elder * request won't go away until we unregister it. We retain 32378eb87565SAlex Elder * a pointer to the object request during that time (in 323876756a51SIlya Dryomov * rbd_dev->watch_request), so we'll keep a reference to it. 
323976756a51SIlya Dryomov * We'll drop that reference after we've unregistered it in 324076756a51SIlya Dryomov * rbd_dev_header_unwatch_sync(). 32418eb87565SAlex Elder */ 32428eb87565SAlex Elder rbd_dev->watch_request = obj_request; 32438eb87565SAlex Elder 32448eb87565SAlex Elder return 0; 32459969ebc5SAlex Elder } 32469969ebc5SAlex Elder 3247b30a01f2SIlya Dryomov /* 3248b30a01f2SIlya Dryomov * Tear down a watch request, synchronously. 3249b30a01f2SIlya Dryomov */ 325076756a51SIlya Dryomov static void rbd_dev_header_unwatch_sync(struct rbd_device *rbd_dev) 3251fca27065SIlya Dryomov { 3252b30a01f2SIlya Dryomov struct rbd_obj_request *obj_request; 3253b30a01f2SIlya Dryomov 3254b30a01f2SIlya Dryomov rbd_assert(rbd_dev->watch_event); 3255b30a01f2SIlya Dryomov rbd_assert(rbd_dev->watch_request); 3256b30a01f2SIlya Dryomov 325776756a51SIlya Dryomov rbd_obj_request_end(rbd_dev->watch_request); 3258b30a01f2SIlya Dryomov rbd_obj_request_put(rbd_dev->watch_request); 3259b30a01f2SIlya Dryomov rbd_dev->watch_request = NULL; 3260b30a01f2SIlya Dryomov 326176756a51SIlya Dryomov obj_request = rbd_obj_watch_request_helper(rbd_dev, false); 326276756a51SIlya Dryomov if (!IS_ERR(obj_request)) 3263b30a01f2SIlya Dryomov rbd_obj_request_put(obj_request); 326476756a51SIlya Dryomov else 326576756a51SIlya Dryomov rbd_warn(rbd_dev, "unable to tear down watch request (%ld)", 326676756a51SIlya Dryomov PTR_ERR(obj_request)); 326776756a51SIlya Dryomov 3268b30a01f2SIlya Dryomov ceph_osdc_cancel_event(rbd_dev->watch_event); 3269b30a01f2SIlya Dryomov rbd_dev->watch_event = NULL; 3270fca27065SIlya Dryomov } 3271fca27065SIlya Dryomov 327236be9a76SAlex Elder /* 3273f40eb349SAlex Elder * Synchronous osd object method call. Returns the number of bytes 3274f40eb349SAlex Elder * returned in the outbound buffer, or a negative error code. 
327536be9a76SAlex Elder */ 327636be9a76SAlex Elder static int rbd_obj_method_sync(struct rbd_device *rbd_dev, 327736be9a76SAlex Elder const char *object_name, 327836be9a76SAlex Elder const char *class_name, 327936be9a76SAlex Elder const char *method_name, 32804157976bSAlex Elder const void *outbound, 328136be9a76SAlex Elder size_t outbound_size, 32824157976bSAlex Elder void *inbound, 3283e2a58ee5SAlex Elder size_t inbound_size) 328436be9a76SAlex Elder { 32852169238dSAlex Elder struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 328636be9a76SAlex Elder struct rbd_obj_request *obj_request; 328736be9a76SAlex Elder struct page **pages; 328836be9a76SAlex Elder u32 page_count; 328936be9a76SAlex Elder int ret; 329036be9a76SAlex Elder 329136be9a76SAlex Elder /* 32926010a451SAlex Elder * Method calls are ultimately read operations. The result 32936010a451SAlex Elder * should placed into the inbound buffer provided. They 32946010a451SAlex Elder * also supply outbound data--parameters for the object 32956010a451SAlex Elder * method. Currently if this is present it will be a 32966010a451SAlex Elder * snapshot id. 
329736be9a76SAlex Elder */ 329836be9a76SAlex Elder page_count = (u32)calc_pages_for(0, inbound_size); 329936be9a76SAlex Elder pages = ceph_alloc_page_vector(page_count, GFP_KERNEL); 330036be9a76SAlex Elder if (IS_ERR(pages)) 330136be9a76SAlex Elder return PTR_ERR(pages); 330236be9a76SAlex Elder 330336be9a76SAlex Elder ret = -ENOMEM; 33046010a451SAlex Elder obj_request = rbd_obj_request_create(object_name, 0, inbound_size, 330536be9a76SAlex Elder OBJ_REQUEST_PAGES); 330636be9a76SAlex Elder if (!obj_request) 330736be9a76SAlex Elder goto out; 330836be9a76SAlex Elder 330936be9a76SAlex Elder obj_request->pages = pages; 331036be9a76SAlex Elder obj_request->page_count = page_count; 331136be9a76SAlex Elder 33126d2940c8SGuangliang Zhao obj_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_READ, 1, 3313deb236b3SIlya Dryomov obj_request); 331436be9a76SAlex Elder if (!obj_request->osd_req) 331536be9a76SAlex Elder goto out; 331636be9a76SAlex Elder 3317c99d2d4aSAlex Elder osd_req_op_cls_init(obj_request->osd_req, 0, CEPH_OSD_OP_CALL, 331804017e29SAlex Elder class_name, method_name); 331904017e29SAlex Elder if (outbound_size) { 332004017e29SAlex Elder struct ceph_pagelist *pagelist; 332104017e29SAlex Elder 332204017e29SAlex Elder pagelist = kmalloc(sizeof (*pagelist), GFP_NOFS); 332304017e29SAlex Elder if (!pagelist) 332404017e29SAlex Elder goto out; 332504017e29SAlex Elder 332604017e29SAlex Elder ceph_pagelist_init(pagelist); 332704017e29SAlex Elder ceph_pagelist_append(pagelist, outbound, outbound_size); 332804017e29SAlex Elder osd_req_op_cls_request_data_pagelist(obj_request->osd_req, 0, 332904017e29SAlex Elder pagelist); 333004017e29SAlex Elder } 3331a4ce40a9SAlex Elder osd_req_op_cls_response_data_pages(obj_request->osd_req, 0, 3332a4ce40a9SAlex Elder obj_request->pages, inbound_size, 333344cd188dSAlex Elder 0, false, false); 33349d4df01fSAlex Elder rbd_osd_req_format_read(obj_request); 3335430c28c3SAlex Elder 333636be9a76SAlex Elder ret = rbd_obj_request_submit(osdc, 
obj_request); 333736be9a76SAlex Elder if (ret) 333836be9a76SAlex Elder goto out; 333936be9a76SAlex Elder ret = rbd_obj_request_wait(obj_request); 334036be9a76SAlex Elder if (ret) 334136be9a76SAlex Elder goto out; 334236be9a76SAlex Elder 334336be9a76SAlex Elder ret = obj_request->result; 334436be9a76SAlex Elder if (ret < 0) 334536be9a76SAlex Elder goto out; 334657385b51SAlex Elder 334757385b51SAlex Elder rbd_assert(obj_request->xferred < (u64)INT_MAX); 334857385b51SAlex Elder ret = (int)obj_request->xferred; 3349903bb32eSAlex Elder ceph_copy_from_page_vector(pages, inbound, 0, obj_request->xferred); 335036be9a76SAlex Elder out: 335136be9a76SAlex Elder if (obj_request) 335236be9a76SAlex Elder rbd_obj_request_put(obj_request); 335336be9a76SAlex Elder else 335436be9a76SAlex Elder ceph_release_page_vector(pages, page_count); 335536be9a76SAlex Elder 335636be9a76SAlex Elder return ret; 335736be9a76SAlex Elder } 335836be9a76SAlex Elder 33597ad18afaSChristoph Hellwig static void rbd_queue_workfn(struct work_struct *work) 3360bc1ecc65SIlya Dryomov { 33617ad18afaSChristoph Hellwig struct request *rq = blk_mq_rq_from_pdu(work); 33627ad18afaSChristoph Hellwig struct rbd_device *rbd_dev = rq->q->queuedata; 3363bc1ecc65SIlya Dryomov struct rbd_img_request *img_request; 33644e752f0aSJosh Durgin struct ceph_snap_context *snapc = NULL; 3365bc1ecc65SIlya Dryomov u64 offset = (u64)blk_rq_pos(rq) << SECTOR_SHIFT; 3366bc1ecc65SIlya Dryomov u64 length = blk_rq_bytes(rq); 33676d2940c8SGuangliang Zhao enum obj_operation_type op_type; 33684e752f0aSJosh Durgin u64 mapping_size; 3369bc1ecc65SIlya Dryomov int result; 3370bc1ecc65SIlya Dryomov 33717ad18afaSChristoph Hellwig if (rq->cmd_type != REQ_TYPE_FS) { 33727ad18afaSChristoph Hellwig dout("%s: non-fs request type %d\n", __func__, 33737ad18afaSChristoph Hellwig (int) rq->cmd_type); 33747ad18afaSChristoph Hellwig result = -EIO; 33757ad18afaSChristoph Hellwig goto err; 33767ad18afaSChristoph Hellwig } 33777ad18afaSChristoph Hellwig 
337890e98c52SGuangliang Zhao if (rq->cmd_flags & REQ_DISCARD) 337990e98c52SGuangliang Zhao op_type = OBJ_OP_DISCARD; 338090e98c52SGuangliang Zhao else if (rq->cmd_flags & REQ_WRITE) 33816d2940c8SGuangliang Zhao op_type = OBJ_OP_WRITE; 33826d2940c8SGuangliang Zhao else 33836d2940c8SGuangliang Zhao op_type = OBJ_OP_READ; 33846d2940c8SGuangliang Zhao 3385bc1ecc65SIlya Dryomov /* Ignore/skip any zero-length requests */ 3386bc1ecc65SIlya Dryomov 3387bc1ecc65SIlya Dryomov if (!length) { 3388bc1ecc65SIlya Dryomov dout("%s: zero-length request\n", __func__); 3389bc1ecc65SIlya Dryomov result = 0; 3390bc1ecc65SIlya Dryomov goto err_rq; 3391bc1ecc65SIlya Dryomov } 3392bc1ecc65SIlya Dryomov 33936d2940c8SGuangliang Zhao /* Only reads are allowed to a read-only device */ 3394bc1ecc65SIlya Dryomov 33956d2940c8SGuangliang Zhao if (op_type != OBJ_OP_READ) { 3396bc1ecc65SIlya Dryomov if (rbd_dev->mapping.read_only) { 3397bc1ecc65SIlya Dryomov result = -EROFS; 3398bc1ecc65SIlya Dryomov goto err_rq; 3399bc1ecc65SIlya Dryomov } 3400bc1ecc65SIlya Dryomov rbd_assert(rbd_dev->spec->snap_id == CEPH_NOSNAP); 3401bc1ecc65SIlya Dryomov } 3402bc1ecc65SIlya Dryomov 3403bc1ecc65SIlya Dryomov /* 3404bc1ecc65SIlya Dryomov * Quit early if the mapped snapshot no longer exists. It's 3405bc1ecc65SIlya Dryomov * still possible the snapshot will have disappeared by the 3406bc1ecc65SIlya Dryomov * time our request arrives at the osd, but there's no sense in 3407bc1ecc65SIlya Dryomov * sending it if we already know. 
3408bc1ecc65SIlya Dryomov */ 3409bc1ecc65SIlya Dryomov if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) { 3410bc1ecc65SIlya Dryomov dout("request for non-existent snapshot"); 3411bc1ecc65SIlya Dryomov rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP); 3412bc1ecc65SIlya Dryomov result = -ENXIO; 3413bc1ecc65SIlya Dryomov goto err_rq; 3414bc1ecc65SIlya Dryomov } 3415bc1ecc65SIlya Dryomov 3416bc1ecc65SIlya Dryomov if (offset && length > U64_MAX - offset + 1) { 3417bc1ecc65SIlya Dryomov rbd_warn(rbd_dev, "bad request range (%llu~%llu)", offset, 3418bc1ecc65SIlya Dryomov length); 3419bc1ecc65SIlya Dryomov result = -EINVAL; 3420bc1ecc65SIlya Dryomov goto err_rq; /* Shouldn't happen */ 3421bc1ecc65SIlya Dryomov } 3422bc1ecc65SIlya Dryomov 34237ad18afaSChristoph Hellwig blk_mq_start_request(rq); 34247ad18afaSChristoph Hellwig 34254e752f0aSJosh Durgin down_read(&rbd_dev->header_rwsem); 34264e752f0aSJosh Durgin mapping_size = rbd_dev->mapping.size; 34276d2940c8SGuangliang Zhao if (op_type != OBJ_OP_READ) { 34284e752f0aSJosh Durgin snapc = rbd_dev->header.snapc; 34294e752f0aSJosh Durgin ceph_get_snap_context(snapc); 34304e752f0aSJosh Durgin } 34314e752f0aSJosh Durgin up_read(&rbd_dev->header_rwsem); 34324e752f0aSJosh Durgin 34334e752f0aSJosh Durgin if (offset + length > mapping_size) { 3434bc1ecc65SIlya Dryomov rbd_warn(rbd_dev, "beyond EOD (%llu~%llu > %llu)", offset, 34354e752f0aSJosh Durgin length, mapping_size); 3436bc1ecc65SIlya Dryomov result = -EIO; 3437bc1ecc65SIlya Dryomov goto err_rq; 3438bc1ecc65SIlya Dryomov } 3439bc1ecc65SIlya Dryomov 34406d2940c8SGuangliang Zhao img_request = rbd_img_request_create(rbd_dev, offset, length, op_type, 34414e752f0aSJosh Durgin snapc); 3442bc1ecc65SIlya Dryomov if (!img_request) { 3443bc1ecc65SIlya Dryomov result = -ENOMEM; 3444bc1ecc65SIlya Dryomov goto err_rq; 3445bc1ecc65SIlya Dryomov } 3446bc1ecc65SIlya Dryomov img_request->rq = rq; 3447bc1ecc65SIlya Dryomov 344890e98c52SGuangliang Zhao if (op_type == OBJ_OP_DISCARD) 
344990e98c52SGuangliang Zhao result = rbd_img_request_fill(img_request, OBJ_REQUEST_NODATA, 345090e98c52SGuangliang Zhao NULL); 345190e98c52SGuangliang Zhao else 345290e98c52SGuangliang Zhao result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO, 345390e98c52SGuangliang Zhao rq->bio); 3454bc1ecc65SIlya Dryomov if (result) 3455bc1ecc65SIlya Dryomov goto err_img_request; 3456bc1ecc65SIlya Dryomov 3457bc1ecc65SIlya Dryomov result = rbd_img_request_submit(img_request); 3458bc1ecc65SIlya Dryomov if (result) 3459bc1ecc65SIlya Dryomov goto err_img_request; 3460bc1ecc65SIlya Dryomov 3461bc1ecc65SIlya Dryomov return; 3462bc1ecc65SIlya Dryomov 3463bc1ecc65SIlya Dryomov err_img_request: 3464bc1ecc65SIlya Dryomov rbd_img_request_put(img_request); 3465bc1ecc65SIlya Dryomov err_rq: 3466bc1ecc65SIlya Dryomov if (result) 3467bc1ecc65SIlya Dryomov rbd_warn(rbd_dev, "%s %llx at %llx result %d", 34686d2940c8SGuangliang Zhao obj_op_name(op_type), length, offset, result); 34694e752f0aSJosh Durgin ceph_put_snap_context(snapc); 34707ad18afaSChristoph Hellwig err: 34717ad18afaSChristoph Hellwig blk_mq_end_request(rq, result); 3472bc1ecc65SIlya Dryomov } 3473bc1ecc65SIlya Dryomov 34747ad18afaSChristoph Hellwig static int rbd_queue_rq(struct blk_mq_hw_ctx *hctx, 34757ad18afaSChristoph Hellwig const struct blk_mq_queue_data *bd) 3476bc1ecc65SIlya Dryomov { 34777ad18afaSChristoph Hellwig struct request *rq = bd->rq; 34787ad18afaSChristoph Hellwig struct work_struct *work = blk_mq_rq_to_pdu(rq); 3479bc1ecc65SIlya Dryomov 34807ad18afaSChristoph Hellwig queue_work(rbd_wq, work); 34817ad18afaSChristoph Hellwig return BLK_MQ_RQ_QUEUE_OK; 3482bf0d5f50SAlex Elder } 3483bf0d5f50SAlex Elder 3484602adf40SYehuda Sadeh static void rbd_free_disk(struct rbd_device *rbd_dev) 3485602adf40SYehuda Sadeh { 3486602adf40SYehuda Sadeh struct gendisk *disk = rbd_dev->disk; 3487602adf40SYehuda Sadeh 3488602adf40SYehuda Sadeh if (!disk) 3489602adf40SYehuda Sadeh return; 3490602adf40SYehuda Sadeh 3491a0cab924SAlex 
Elder rbd_dev->disk = NULL; 3492a0cab924SAlex Elder if (disk->flags & GENHD_FL_UP) { 3493602adf40SYehuda Sadeh del_gendisk(disk); 3494602adf40SYehuda Sadeh if (disk->queue) 3495602adf40SYehuda Sadeh blk_cleanup_queue(disk->queue); 34967ad18afaSChristoph Hellwig blk_mq_free_tag_set(&rbd_dev->tag_set); 3497a0cab924SAlex Elder } 3498602adf40SYehuda Sadeh put_disk(disk); 3499602adf40SYehuda Sadeh } 3500602adf40SYehuda Sadeh 3501788e2df3SAlex Elder static int rbd_obj_read_sync(struct rbd_device *rbd_dev, 3502788e2df3SAlex Elder const char *object_name, 35037097f8dfSAlex Elder u64 offset, u64 length, void *buf) 3504788e2df3SAlex Elder 3505788e2df3SAlex Elder { 35062169238dSAlex Elder struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 3507788e2df3SAlex Elder struct rbd_obj_request *obj_request; 3508788e2df3SAlex Elder struct page **pages = NULL; 3509788e2df3SAlex Elder u32 page_count; 35101ceae7efSAlex Elder size_t size; 3511788e2df3SAlex Elder int ret; 3512788e2df3SAlex Elder 3513788e2df3SAlex Elder page_count = (u32) calc_pages_for(offset, length); 3514788e2df3SAlex Elder pages = ceph_alloc_page_vector(page_count, GFP_KERNEL); 3515788e2df3SAlex Elder if (IS_ERR(pages)) 3516a8d42056SJan Kara return PTR_ERR(pages); 3517788e2df3SAlex Elder 3518788e2df3SAlex Elder ret = -ENOMEM; 3519788e2df3SAlex Elder obj_request = rbd_obj_request_create(object_name, offset, length, 3520788e2df3SAlex Elder OBJ_REQUEST_PAGES); 3521788e2df3SAlex Elder if (!obj_request) 3522788e2df3SAlex Elder goto out; 3523788e2df3SAlex Elder 3524788e2df3SAlex Elder obj_request->pages = pages; 3525788e2df3SAlex Elder obj_request->page_count = page_count; 3526788e2df3SAlex Elder 35276d2940c8SGuangliang Zhao obj_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_READ, 1, 3528deb236b3SIlya Dryomov obj_request); 3529788e2df3SAlex Elder if (!obj_request->osd_req) 3530788e2df3SAlex Elder goto out; 3531788e2df3SAlex Elder 3532c99d2d4aSAlex Elder osd_req_op_extent_init(obj_request->osd_req, 0, 
CEPH_OSD_OP_READ, 3533c99d2d4aSAlex Elder offset, length, 0, 0); 3534406e2c9fSAlex Elder osd_req_op_extent_osd_data_pages(obj_request->osd_req, 0, 3535a4ce40a9SAlex Elder obj_request->pages, 353644cd188dSAlex Elder obj_request->length, 353744cd188dSAlex Elder obj_request->offset & ~PAGE_MASK, 353844cd188dSAlex Elder false, false); 35399d4df01fSAlex Elder rbd_osd_req_format_read(obj_request); 3540430c28c3SAlex Elder 3541788e2df3SAlex Elder ret = rbd_obj_request_submit(osdc, obj_request); 3542788e2df3SAlex Elder if (ret) 3543788e2df3SAlex Elder goto out; 3544788e2df3SAlex Elder ret = rbd_obj_request_wait(obj_request); 3545788e2df3SAlex Elder if (ret) 3546788e2df3SAlex Elder goto out; 3547788e2df3SAlex Elder 3548788e2df3SAlex Elder ret = obj_request->result; 3549788e2df3SAlex Elder if (ret < 0) 3550788e2df3SAlex Elder goto out; 35511ceae7efSAlex Elder 35521ceae7efSAlex Elder rbd_assert(obj_request->xferred <= (u64) SIZE_MAX); 35531ceae7efSAlex Elder size = (size_t) obj_request->xferred; 3554903bb32eSAlex Elder ceph_copy_from_page_vector(pages, buf, 0, size); 355523ed6e13SAlex Elder rbd_assert(size <= (size_t)INT_MAX); 355623ed6e13SAlex Elder ret = (int)size; 3557788e2df3SAlex Elder out: 3558788e2df3SAlex Elder if (obj_request) 3559788e2df3SAlex Elder rbd_obj_request_put(obj_request); 3560788e2df3SAlex Elder else 3561788e2df3SAlex Elder ceph_release_page_vector(pages, page_count); 3562788e2df3SAlex Elder 3563788e2df3SAlex Elder return ret; 3564788e2df3SAlex Elder } 3565788e2df3SAlex Elder 3566602adf40SYehuda Sadeh /* 3567662518b1SAlex Elder * Read the complete header for the given rbd device. On successful 3568662518b1SAlex Elder * return, the rbd_dev->header field will contain up-to-date 3569662518b1SAlex Elder * information about the image. 
35704156d998SAlex Elder */ 357199a41ebcSAlex Elder static int rbd_dev_v1_header_info(struct rbd_device *rbd_dev) 35724156d998SAlex Elder { 35734156d998SAlex Elder struct rbd_image_header_ondisk *ondisk = NULL; 35744156d998SAlex Elder u32 snap_count = 0; 35754156d998SAlex Elder u64 names_size = 0; 35764156d998SAlex Elder u32 want_count; 35774156d998SAlex Elder int ret; 35784156d998SAlex Elder 35794156d998SAlex Elder /* 35804156d998SAlex Elder * The complete header will include an array of its 64-bit 35814156d998SAlex Elder * snapshot ids, followed by the names of those snapshots as 35824156d998SAlex Elder * a contiguous block of NUL-terminated strings. Note that 35834156d998SAlex Elder * the number of snapshots could change by the time we read 35844156d998SAlex Elder * it in, in which case we re-read it. 35854156d998SAlex Elder */ 35864156d998SAlex Elder do { 35874156d998SAlex Elder size_t size; 35884156d998SAlex Elder 35894156d998SAlex Elder kfree(ondisk); 35904156d998SAlex Elder 35914156d998SAlex Elder size = sizeof (*ondisk); 35924156d998SAlex Elder size += snap_count * sizeof (struct rbd_image_snap_ondisk); 35934156d998SAlex Elder size += names_size; 35944156d998SAlex Elder ondisk = kmalloc(size, GFP_KERNEL); 35954156d998SAlex Elder if (!ondisk) 3596662518b1SAlex Elder return -ENOMEM; 35974156d998SAlex Elder 3598788e2df3SAlex Elder ret = rbd_obj_read_sync(rbd_dev, rbd_dev->header_name, 35997097f8dfSAlex Elder 0, size, ondisk); 36004156d998SAlex Elder if (ret < 0) 3601662518b1SAlex Elder goto out; 3602c0cd10dbSAlex Elder if ((size_t)ret < size) { 36034156d998SAlex Elder ret = -ENXIO; 360406ecc6cbSAlex Elder rbd_warn(rbd_dev, "short header read (want %zd got %d)", 360506ecc6cbSAlex Elder size, ret); 3606662518b1SAlex Elder goto out; 36074156d998SAlex Elder } 36084156d998SAlex Elder if (!rbd_dev_ondisk_valid(ondisk)) { 36094156d998SAlex Elder ret = -ENXIO; 361006ecc6cbSAlex Elder rbd_warn(rbd_dev, "invalid header"); 3611662518b1SAlex Elder goto out; 
36124156d998SAlex Elder } 36134156d998SAlex Elder 36144156d998SAlex Elder names_size = le64_to_cpu(ondisk->snap_names_len); 36154156d998SAlex Elder want_count = snap_count; 36164156d998SAlex Elder snap_count = le32_to_cpu(ondisk->snap_count); 36174156d998SAlex Elder } while (snap_count != want_count); 36184156d998SAlex Elder 3619662518b1SAlex Elder ret = rbd_header_from_disk(rbd_dev, ondisk); 3620662518b1SAlex Elder out: 36214156d998SAlex Elder kfree(ondisk); 36224156d998SAlex Elder 3623dfc5606dSYehuda Sadeh return ret; 3624602adf40SYehuda Sadeh } 3625602adf40SYehuda Sadeh 362615228edeSAlex Elder /* 362715228edeSAlex Elder * Clear the rbd device's EXISTS flag if the snapshot it's mapped to 362815228edeSAlex Elder * has disappeared from the (just updated) snapshot context. 362915228edeSAlex Elder */ 363015228edeSAlex Elder static void rbd_exists_validate(struct rbd_device *rbd_dev) 363115228edeSAlex Elder { 363215228edeSAlex Elder u64 snap_id; 363315228edeSAlex Elder 363415228edeSAlex Elder if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) 363515228edeSAlex Elder return; 363615228edeSAlex Elder 363715228edeSAlex Elder snap_id = rbd_dev->spec->snap_id; 363815228edeSAlex Elder if (snap_id == CEPH_NOSNAP) 363915228edeSAlex Elder return; 364015228edeSAlex Elder 364115228edeSAlex Elder if (rbd_dev_snap_index(rbd_dev, snap_id) == BAD_SNAP_INDEX) 364215228edeSAlex Elder clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags); 364315228edeSAlex Elder } 364415228edeSAlex Elder 36459875201eSJosh Durgin static void rbd_dev_update_size(struct rbd_device *rbd_dev) 36469875201eSJosh Durgin { 36479875201eSJosh Durgin sector_t size; 36489875201eSJosh Durgin bool removing; 36499875201eSJosh Durgin 36509875201eSJosh Durgin /* 36519875201eSJosh Durgin * Don't hold the lock while doing disk operations, 36529875201eSJosh Durgin * or lock ordering will conflict with the bdev mutex via: 36539875201eSJosh Durgin * rbd_add() -> blkdev_get() -> rbd_open() 36549875201eSJosh Durgin */ 
36559875201eSJosh Durgin spin_lock_irq(&rbd_dev->lock); 36569875201eSJosh Durgin removing = test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags); 36579875201eSJosh Durgin spin_unlock_irq(&rbd_dev->lock); 36589875201eSJosh Durgin /* 36599875201eSJosh Durgin * If the device is being removed, rbd_dev->disk has 36609875201eSJosh Durgin * been destroyed, so don't try to update its size 36619875201eSJosh Durgin */ 36629875201eSJosh Durgin if (!removing) { 36639875201eSJosh Durgin size = (sector_t)rbd_dev->mapping.size / SECTOR_SIZE; 36649875201eSJosh Durgin dout("setting size to %llu sectors", (unsigned long long)size); 36659875201eSJosh Durgin set_capacity(rbd_dev->disk, size); 36669875201eSJosh Durgin revalidate_disk(rbd_dev->disk); 36679875201eSJosh Durgin } 36689875201eSJosh Durgin } 36699875201eSJosh Durgin 3670cc4a38bdSAlex Elder static int rbd_dev_refresh(struct rbd_device *rbd_dev) 36711fe5e993SAlex Elder { 3672e627db08SAlex Elder u64 mapping_size; 36731fe5e993SAlex Elder int ret; 36741fe5e993SAlex Elder 3675cfbf6377SAlex Elder down_write(&rbd_dev->header_rwsem); 36763b5cf2a2SAlex Elder mapping_size = rbd_dev->mapping.size; 3677a720ae09SIlya Dryomov 3678a720ae09SIlya Dryomov ret = rbd_dev_header_info(rbd_dev); 367952bb1f9bSIlya Dryomov if (ret) 368073e39e4dSIlya Dryomov goto out; 368115228edeSAlex Elder 3682e8f59b59SIlya Dryomov /* 3683e8f59b59SIlya Dryomov * If there is a parent, see if it has disappeared due to the 3684e8f59b59SIlya Dryomov * mapped image getting flattened. 
3685e8f59b59SIlya Dryomov */ 3686e8f59b59SIlya Dryomov if (rbd_dev->parent) { 3687e8f59b59SIlya Dryomov ret = rbd_dev_v2_parent_info(rbd_dev); 3688e8f59b59SIlya Dryomov if (ret) 368973e39e4dSIlya Dryomov goto out; 3690e8f59b59SIlya Dryomov } 3691e8f59b59SIlya Dryomov 36925ff1108cSIlya Dryomov if (rbd_dev->spec->snap_id == CEPH_NOSNAP) { 36935ff1108cSIlya Dryomov rbd_dev->mapping.size = rbd_dev->header.image_size; 36945ff1108cSIlya Dryomov } else { 36955ff1108cSIlya Dryomov /* validate mapped snapshot's EXISTS flag */ 369615228edeSAlex Elder rbd_exists_validate(rbd_dev); 36975ff1108cSIlya Dryomov } 36985ff1108cSIlya Dryomov 369973e39e4dSIlya Dryomov out: 3700cfbf6377SAlex Elder up_write(&rbd_dev->header_rwsem); 370173e39e4dSIlya Dryomov if (!ret && mapping_size != rbd_dev->mapping.size) 37029875201eSJosh Durgin rbd_dev_update_size(rbd_dev); 37031fe5e993SAlex Elder 370473e39e4dSIlya Dryomov return ret; 37051fe5e993SAlex Elder } 37061fe5e993SAlex Elder 37077ad18afaSChristoph Hellwig static int rbd_init_request(void *data, struct request *rq, 37087ad18afaSChristoph Hellwig unsigned int hctx_idx, unsigned int request_idx, 37097ad18afaSChristoph Hellwig unsigned int numa_node) 37107ad18afaSChristoph Hellwig { 37117ad18afaSChristoph Hellwig struct work_struct *work = blk_mq_rq_to_pdu(rq); 37127ad18afaSChristoph Hellwig 37137ad18afaSChristoph Hellwig INIT_WORK(work, rbd_queue_workfn); 37147ad18afaSChristoph Hellwig return 0; 37157ad18afaSChristoph Hellwig } 37167ad18afaSChristoph Hellwig 37177ad18afaSChristoph Hellwig static struct blk_mq_ops rbd_mq_ops = { 37187ad18afaSChristoph Hellwig .queue_rq = rbd_queue_rq, 37197ad18afaSChristoph Hellwig .map_queue = blk_mq_map_queue, 37207ad18afaSChristoph Hellwig .init_request = rbd_init_request, 37217ad18afaSChristoph Hellwig }; 37227ad18afaSChristoph Hellwig 3723602adf40SYehuda Sadeh static int rbd_init_disk(struct rbd_device *rbd_dev) 3724602adf40SYehuda Sadeh { 3725602adf40SYehuda Sadeh struct gendisk *disk; 3726602adf40SYehuda 
Sadeh struct request_queue *q; 3727593a9e7bSAlex Elder u64 segment_size; 37287ad18afaSChristoph Hellwig int err; 3729602adf40SYehuda Sadeh 3730602adf40SYehuda Sadeh /* create gendisk info */ 37317e513d43SIlya Dryomov disk = alloc_disk(single_major ? 37327e513d43SIlya Dryomov (1 << RBD_SINGLE_MAJOR_PART_SHIFT) : 37337e513d43SIlya Dryomov RBD_MINORS_PER_MAJOR); 3734602adf40SYehuda Sadeh if (!disk) 37351fcdb8aaSAlex Elder return -ENOMEM; 3736602adf40SYehuda Sadeh 3737f0f8cef5SAlex Elder snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d", 3738de71a297SAlex Elder rbd_dev->dev_id); 3739602adf40SYehuda Sadeh disk->major = rbd_dev->major; 3740dd82fff1SIlya Dryomov disk->first_minor = rbd_dev->minor; 37417e513d43SIlya Dryomov if (single_major) 37427e513d43SIlya Dryomov disk->flags |= GENHD_FL_EXT_DEVT; 3743602adf40SYehuda Sadeh disk->fops = &rbd_bd_ops; 3744602adf40SYehuda Sadeh disk->private_data = rbd_dev; 3745602adf40SYehuda Sadeh 37467ad18afaSChristoph Hellwig memset(&rbd_dev->tag_set, 0, sizeof(rbd_dev->tag_set)); 37477ad18afaSChristoph Hellwig rbd_dev->tag_set.ops = &rbd_mq_ops; 3748b5584180SIlya Dryomov rbd_dev->tag_set.queue_depth = rbd_dev->opts->queue_depth; 37497ad18afaSChristoph Hellwig rbd_dev->tag_set.numa_node = NUMA_NO_NODE; 3750b5584180SIlya Dryomov rbd_dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE; 37517ad18afaSChristoph Hellwig rbd_dev->tag_set.nr_hw_queues = 1; 37527ad18afaSChristoph Hellwig rbd_dev->tag_set.cmd_size = sizeof(struct work_struct); 37537ad18afaSChristoph Hellwig 37547ad18afaSChristoph Hellwig err = blk_mq_alloc_tag_set(&rbd_dev->tag_set); 37557ad18afaSChristoph Hellwig if (err) 3756602adf40SYehuda Sadeh goto out_disk; 3757029bcbd8SJosh Durgin 37587ad18afaSChristoph Hellwig q = blk_mq_init_queue(&rbd_dev->tag_set); 37597ad18afaSChristoph Hellwig if (IS_ERR(q)) { 37607ad18afaSChristoph Hellwig err = PTR_ERR(q); 37617ad18afaSChristoph Hellwig goto out_tag_set; 37627ad18afaSChristoph Hellwig } 
37637ad18afaSChristoph Hellwig 3764d8a2c89cSIlya Dryomov queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q); 3765d8a2c89cSIlya Dryomov /* QUEUE_FLAG_ADD_RANDOM is off by default for blk-mq */ 3766593a9e7bSAlex Elder 3767029bcbd8SJosh Durgin /* set io sizes to object size */ 3768593a9e7bSAlex Elder segment_size = rbd_obj_bytes(&rbd_dev->header); 3769593a9e7bSAlex Elder blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE); 37700d9fde4fSIlya Dryomov q->limits.max_sectors = queue_max_hw_sectors(q); 3771d3834fefSIlya Dryomov blk_queue_max_segments(q, segment_size / SECTOR_SIZE); 3772593a9e7bSAlex Elder blk_queue_max_segment_size(q, segment_size); 3773593a9e7bSAlex Elder blk_queue_io_min(q, segment_size); 3774593a9e7bSAlex Elder blk_queue_io_opt(q, segment_size); 3775029bcbd8SJosh Durgin 377690e98c52SGuangliang Zhao /* enable the discard support */ 377790e98c52SGuangliang Zhao queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q); 377890e98c52SGuangliang Zhao q->limits.discard_granularity = segment_size; 377990e98c52SGuangliang Zhao q->limits.discard_alignment = segment_size; 37802bb4cd5cSJens Axboe blk_queue_max_discard_sectors(q, segment_size / SECTOR_SIZE); 3781b76f8239SJosh Durgin q->limits.discard_zeroes_data = 1; 378290e98c52SGuangliang Zhao 3783bae818eeSRonny Hegewald if (!ceph_test_opt(rbd_dev->rbd_client->client, NOCRC)) 3784bae818eeSRonny Hegewald q->backing_dev_info.capabilities |= BDI_CAP_STABLE_WRITES; 3785bae818eeSRonny Hegewald 3786602adf40SYehuda Sadeh disk->queue = q; 3787602adf40SYehuda Sadeh 3788602adf40SYehuda Sadeh q->queuedata = rbd_dev; 3789602adf40SYehuda Sadeh 3790602adf40SYehuda Sadeh rbd_dev->disk = disk; 3791602adf40SYehuda Sadeh 3792602adf40SYehuda Sadeh return 0; 37937ad18afaSChristoph Hellwig out_tag_set: 37947ad18afaSChristoph Hellwig blk_mq_free_tag_set(&rbd_dev->tag_set); 3795602adf40SYehuda Sadeh out_disk: 3796602adf40SYehuda Sadeh put_disk(disk); 37977ad18afaSChristoph Hellwig return err; 3798602adf40SYehuda Sadeh } 3799602adf40SYehuda Sadeh 
3800dfc5606dSYehuda Sadeh /* 3801dfc5606dSYehuda Sadeh sysfs 3802dfc5606dSYehuda Sadeh */ 3803602adf40SYehuda Sadeh 3804593a9e7bSAlex Elder static struct rbd_device *dev_to_rbd_dev(struct device *dev) 3805593a9e7bSAlex Elder { 3806593a9e7bSAlex Elder return container_of(dev, struct rbd_device, dev); 3807593a9e7bSAlex Elder } 3808593a9e7bSAlex Elder 3809dfc5606dSYehuda Sadeh static ssize_t rbd_size_show(struct device *dev, 3810dfc5606dSYehuda Sadeh struct device_attribute *attr, char *buf) 3811602adf40SYehuda Sadeh { 3812593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 3813dfc5606dSYehuda Sadeh 3814fc71d833SAlex Elder return sprintf(buf, "%llu\n", 3815fc71d833SAlex Elder (unsigned long long)rbd_dev->mapping.size); 3816602adf40SYehuda Sadeh } 3817602adf40SYehuda Sadeh 381834b13184SAlex Elder /* 381934b13184SAlex Elder * Note this shows the features for whatever's mapped, which is not 382034b13184SAlex Elder * necessarily the base image. 382134b13184SAlex Elder */ 382234b13184SAlex Elder static ssize_t rbd_features_show(struct device *dev, 382334b13184SAlex Elder struct device_attribute *attr, char *buf) 382434b13184SAlex Elder { 382534b13184SAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 382634b13184SAlex Elder 382734b13184SAlex Elder return sprintf(buf, "0x%016llx\n", 382834b13184SAlex Elder (unsigned long long)rbd_dev->mapping.features); 382934b13184SAlex Elder } 383034b13184SAlex Elder 3831dfc5606dSYehuda Sadeh static ssize_t rbd_major_show(struct device *dev, 3832dfc5606dSYehuda Sadeh struct device_attribute *attr, char *buf) 3833602adf40SYehuda Sadeh { 3834593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 3835dfc5606dSYehuda Sadeh 3836fc71d833SAlex Elder if (rbd_dev->major) 3837dfc5606dSYehuda Sadeh return sprintf(buf, "%d\n", rbd_dev->major); 3838fc71d833SAlex Elder 3839fc71d833SAlex Elder return sprintf(buf, "(none)\n"); 3840dd82fff1SIlya Dryomov } 3841fc71d833SAlex Elder 3842dd82fff1SIlya Dryomov static 
ssize_t rbd_minor_show(struct device *dev, 3843dd82fff1SIlya Dryomov struct device_attribute *attr, char *buf) 3844dd82fff1SIlya Dryomov { 3845dd82fff1SIlya Dryomov struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 3846dd82fff1SIlya Dryomov 3847dd82fff1SIlya Dryomov return sprintf(buf, "%d\n", rbd_dev->minor); 3848dfc5606dSYehuda Sadeh } 3849dfc5606dSYehuda Sadeh 3850dfc5606dSYehuda Sadeh static ssize_t rbd_client_id_show(struct device *dev, 3851dfc5606dSYehuda Sadeh struct device_attribute *attr, char *buf) 3852dfc5606dSYehuda Sadeh { 3853593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 3854dfc5606dSYehuda Sadeh 38551dbb4399SAlex Elder return sprintf(buf, "client%lld\n", 38561dbb4399SAlex Elder ceph_client_id(rbd_dev->rbd_client->client)); 3857dfc5606dSYehuda Sadeh } 3858dfc5606dSYehuda Sadeh 3859dfc5606dSYehuda Sadeh static ssize_t rbd_pool_show(struct device *dev, 3860dfc5606dSYehuda Sadeh struct device_attribute *attr, char *buf) 3861dfc5606dSYehuda Sadeh { 3862593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 3863dfc5606dSYehuda Sadeh 38640d7dbfceSAlex Elder return sprintf(buf, "%s\n", rbd_dev->spec->pool_name); 3865dfc5606dSYehuda Sadeh } 3866dfc5606dSYehuda Sadeh 38679bb2f334SAlex Elder static ssize_t rbd_pool_id_show(struct device *dev, 38689bb2f334SAlex Elder struct device_attribute *attr, char *buf) 38699bb2f334SAlex Elder { 38709bb2f334SAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 38719bb2f334SAlex Elder 38720d7dbfceSAlex Elder return sprintf(buf, "%llu\n", 38730d7dbfceSAlex Elder (unsigned long long) rbd_dev->spec->pool_id); 38749bb2f334SAlex Elder } 38759bb2f334SAlex Elder 3876dfc5606dSYehuda Sadeh static ssize_t rbd_name_show(struct device *dev, 3877dfc5606dSYehuda Sadeh struct device_attribute *attr, char *buf) 3878dfc5606dSYehuda Sadeh { 3879593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 3880dfc5606dSYehuda Sadeh 3881a92ffdf8SAlex Elder if (rbd_dev->spec->image_name) 
38820d7dbfceSAlex Elder return sprintf(buf, "%s\n", rbd_dev->spec->image_name); 3883a92ffdf8SAlex Elder 3884a92ffdf8SAlex Elder return sprintf(buf, "(unknown)\n"); 3885dfc5606dSYehuda Sadeh } 3886dfc5606dSYehuda Sadeh 3887589d30e0SAlex Elder static ssize_t rbd_image_id_show(struct device *dev, 3888589d30e0SAlex Elder struct device_attribute *attr, char *buf) 3889589d30e0SAlex Elder { 3890589d30e0SAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 3891589d30e0SAlex Elder 38920d7dbfceSAlex Elder return sprintf(buf, "%s\n", rbd_dev->spec->image_id); 3893589d30e0SAlex Elder } 3894589d30e0SAlex Elder 389534b13184SAlex Elder /* 389634b13184SAlex Elder * Shows the name of the currently-mapped snapshot (or 389734b13184SAlex Elder * RBD_SNAP_HEAD_NAME for the base image). 389834b13184SAlex Elder */ 3899dfc5606dSYehuda Sadeh static ssize_t rbd_snap_show(struct device *dev, 3900dfc5606dSYehuda Sadeh struct device_attribute *attr, 3901dfc5606dSYehuda Sadeh char *buf) 3902dfc5606dSYehuda Sadeh { 3903593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 3904dfc5606dSYehuda Sadeh 39050d7dbfceSAlex Elder return sprintf(buf, "%s\n", rbd_dev->spec->snap_name); 3906dfc5606dSYehuda Sadeh } 3907dfc5606dSYehuda Sadeh 390886b00e0dSAlex Elder /* 3909ff96128fSIlya Dryomov * For a v2 image, shows the chain of parent images, separated by empty 3910ff96128fSIlya Dryomov * lines. For v1 images or if there is no parent, shows "(no parent 3911ff96128fSIlya Dryomov * image)". 
391286b00e0dSAlex Elder */ 391386b00e0dSAlex Elder static ssize_t rbd_parent_show(struct device *dev, 391486b00e0dSAlex Elder struct device_attribute *attr, 391586b00e0dSAlex Elder char *buf) 391686b00e0dSAlex Elder { 391786b00e0dSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 3918ff96128fSIlya Dryomov ssize_t count = 0; 391986b00e0dSAlex Elder 3920ff96128fSIlya Dryomov if (!rbd_dev->parent) 392186b00e0dSAlex Elder return sprintf(buf, "(no parent image)\n"); 392286b00e0dSAlex Elder 3923ff96128fSIlya Dryomov for ( ; rbd_dev->parent; rbd_dev = rbd_dev->parent) { 3924ff96128fSIlya Dryomov struct rbd_spec *spec = rbd_dev->parent_spec; 392586b00e0dSAlex Elder 3926ff96128fSIlya Dryomov count += sprintf(&buf[count], "%s" 3927ff96128fSIlya Dryomov "pool_id %llu\npool_name %s\n" 3928ff96128fSIlya Dryomov "image_id %s\nimage_name %s\n" 3929ff96128fSIlya Dryomov "snap_id %llu\nsnap_name %s\n" 3930ff96128fSIlya Dryomov "overlap %llu\n", 3931ff96128fSIlya Dryomov !count ? "" : "\n", /* first? 
*/ 3932ff96128fSIlya Dryomov spec->pool_id, spec->pool_name, 3933ff96128fSIlya Dryomov spec->image_id, spec->image_name ?: "(unknown)", 3934ff96128fSIlya Dryomov spec->snap_id, spec->snap_name, 3935ff96128fSIlya Dryomov rbd_dev->parent_overlap); 3936ff96128fSIlya Dryomov } 393786b00e0dSAlex Elder 393886b00e0dSAlex Elder return count; 393986b00e0dSAlex Elder } 394086b00e0dSAlex Elder 3941dfc5606dSYehuda Sadeh static ssize_t rbd_image_refresh(struct device *dev, 3942dfc5606dSYehuda Sadeh struct device_attribute *attr, 3943dfc5606dSYehuda Sadeh const char *buf, 3944dfc5606dSYehuda Sadeh size_t size) 3945dfc5606dSYehuda Sadeh { 3946593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 3947b813623aSAlex Elder int ret; 3948602adf40SYehuda Sadeh 3949cc4a38bdSAlex Elder ret = rbd_dev_refresh(rbd_dev); 3950e627db08SAlex Elder if (ret) 395152bb1f9bSIlya Dryomov return ret; 3952b813623aSAlex Elder 395352bb1f9bSIlya Dryomov return size; 3954dfc5606dSYehuda Sadeh } 3955602adf40SYehuda Sadeh 3956dfc5606dSYehuda Sadeh static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL); 395734b13184SAlex Elder static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL); 3958dfc5606dSYehuda Sadeh static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL); 3959dd82fff1SIlya Dryomov static DEVICE_ATTR(minor, S_IRUGO, rbd_minor_show, NULL); 3960dfc5606dSYehuda Sadeh static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL); 3961dfc5606dSYehuda Sadeh static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL); 39629bb2f334SAlex Elder static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL); 3963dfc5606dSYehuda Sadeh static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL); 3964589d30e0SAlex Elder static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL); 3965dfc5606dSYehuda Sadeh static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh); 3966dfc5606dSYehuda Sadeh static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL); 396786b00e0dSAlex Elder static 
DEVICE_ATTR(parent, S_IRUGO, rbd_parent_show, NULL); 3968dfc5606dSYehuda Sadeh 3969dfc5606dSYehuda Sadeh static struct attribute *rbd_attrs[] = { 3970dfc5606dSYehuda Sadeh &dev_attr_size.attr, 397134b13184SAlex Elder &dev_attr_features.attr, 3972dfc5606dSYehuda Sadeh &dev_attr_major.attr, 3973dd82fff1SIlya Dryomov &dev_attr_minor.attr, 3974dfc5606dSYehuda Sadeh &dev_attr_client_id.attr, 3975dfc5606dSYehuda Sadeh &dev_attr_pool.attr, 39769bb2f334SAlex Elder &dev_attr_pool_id.attr, 3977dfc5606dSYehuda Sadeh &dev_attr_name.attr, 3978589d30e0SAlex Elder &dev_attr_image_id.attr, 3979dfc5606dSYehuda Sadeh &dev_attr_current_snap.attr, 398086b00e0dSAlex Elder &dev_attr_parent.attr, 3981dfc5606dSYehuda Sadeh &dev_attr_refresh.attr, 3982dfc5606dSYehuda Sadeh NULL 3983dfc5606dSYehuda Sadeh }; 3984dfc5606dSYehuda Sadeh 3985dfc5606dSYehuda Sadeh static struct attribute_group rbd_attr_group = { 3986dfc5606dSYehuda Sadeh .attrs = rbd_attrs, 3987dfc5606dSYehuda Sadeh }; 3988dfc5606dSYehuda Sadeh 3989dfc5606dSYehuda Sadeh static const struct attribute_group *rbd_attr_groups[] = { 3990dfc5606dSYehuda Sadeh &rbd_attr_group, 3991dfc5606dSYehuda Sadeh NULL 3992dfc5606dSYehuda Sadeh }; 3993dfc5606dSYehuda Sadeh 3994dfc5606dSYehuda Sadeh static void rbd_sysfs_dev_release(struct device *dev) 3995dfc5606dSYehuda Sadeh { 3996dfc5606dSYehuda Sadeh } 3997dfc5606dSYehuda Sadeh 3998dfc5606dSYehuda Sadeh static struct device_type rbd_device_type = { 3999dfc5606dSYehuda Sadeh .name = "rbd", 4000dfc5606dSYehuda Sadeh .groups = rbd_attr_groups, 4001dfc5606dSYehuda Sadeh .release = rbd_sysfs_dev_release, 4002dfc5606dSYehuda Sadeh }; 4003dfc5606dSYehuda Sadeh 40048b8fb99cSAlex Elder static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec) 40058b8fb99cSAlex Elder { 40068b8fb99cSAlex Elder kref_get(&spec->kref); 40078b8fb99cSAlex Elder 40088b8fb99cSAlex Elder return spec; 40098b8fb99cSAlex Elder } 40108b8fb99cSAlex Elder 40118b8fb99cSAlex Elder static void rbd_spec_free(struct kref *kref); 
40128b8fb99cSAlex Elder static void rbd_spec_put(struct rbd_spec *spec) 40138b8fb99cSAlex Elder { 40148b8fb99cSAlex Elder if (spec) 40158b8fb99cSAlex Elder kref_put(&spec->kref, rbd_spec_free); 40168b8fb99cSAlex Elder } 40178b8fb99cSAlex Elder 40188b8fb99cSAlex Elder static struct rbd_spec *rbd_spec_alloc(void) 40198b8fb99cSAlex Elder { 40208b8fb99cSAlex Elder struct rbd_spec *spec; 40218b8fb99cSAlex Elder 40228b8fb99cSAlex Elder spec = kzalloc(sizeof (*spec), GFP_KERNEL); 40238b8fb99cSAlex Elder if (!spec) 40248b8fb99cSAlex Elder return NULL; 402504077599SIlya Dryomov 402604077599SIlya Dryomov spec->pool_id = CEPH_NOPOOL; 402704077599SIlya Dryomov spec->snap_id = CEPH_NOSNAP; 40288b8fb99cSAlex Elder kref_init(&spec->kref); 40298b8fb99cSAlex Elder 40308b8fb99cSAlex Elder return spec; 40318b8fb99cSAlex Elder } 40328b8fb99cSAlex Elder 40338b8fb99cSAlex Elder static void rbd_spec_free(struct kref *kref) 40348b8fb99cSAlex Elder { 40358b8fb99cSAlex Elder struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref); 40368b8fb99cSAlex Elder 40378b8fb99cSAlex Elder kfree(spec->pool_name); 40388b8fb99cSAlex Elder kfree(spec->image_id); 40398b8fb99cSAlex Elder kfree(spec->image_name); 40408b8fb99cSAlex Elder kfree(spec->snap_name); 40418b8fb99cSAlex Elder kfree(spec); 40428b8fb99cSAlex Elder } 40438b8fb99cSAlex Elder 4044cc344fa1SAlex Elder static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc, 4045d147543dSIlya Dryomov struct rbd_spec *spec, 4046d147543dSIlya Dryomov struct rbd_options *opts) 4047c53d5893SAlex Elder { 4048c53d5893SAlex Elder struct rbd_device *rbd_dev; 4049c53d5893SAlex Elder 4050c53d5893SAlex Elder rbd_dev = kzalloc(sizeof (*rbd_dev), GFP_KERNEL); 4051c53d5893SAlex Elder if (!rbd_dev) 4052c53d5893SAlex Elder return NULL; 4053c53d5893SAlex Elder 4054c53d5893SAlex Elder spin_lock_init(&rbd_dev->lock); 40556d292906SAlex Elder rbd_dev->flags = 0; 4056a2acd00eSAlex Elder atomic_set(&rbd_dev->parent_ref, 0); 4057c53d5893SAlex Elder 
INIT_LIST_HEAD(&rbd_dev->node); 4058c53d5893SAlex Elder init_rwsem(&rbd_dev->header_rwsem); 4059c53d5893SAlex Elder 4060c53d5893SAlex Elder rbd_dev->rbd_client = rbdc; 4061d147543dSIlya Dryomov rbd_dev->spec = spec; 4062d147543dSIlya Dryomov rbd_dev->opts = opts; 4063c53d5893SAlex Elder 40640903e875SAlex Elder /* Initialize the layout used for all rbd requests */ 40650903e875SAlex Elder 40660903e875SAlex Elder rbd_dev->layout.fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER); 40670903e875SAlex Elder rbd_dev->layout.fl_stripe_count = cpu_to_le32(1); 40680903e875SAlex Elder rbd_dev->layout.fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER); 40690903e875SAlex Elder rbd_dev->layout.fl_pg_pool = cpu_to_le32((u32) spec->pool_id); 40700903e875SAlex Elder 4071c53d5893SAlex Elder return rbd_dev; 4072c53d5893SAlex Elder } 4073c53d5893SAlex Elder 4074c53d5893SAlex Elder static void rbd_dev_destroy(struct rbd_device *rbd_dev) 4075c53d5893SAlex Elder { 4076c53d5893SAlex Elder rbd_put_client(rbd_dev->rbd_client); 4077c53d5893SAlex Elder rbd_spec_put(rbd_dev->spec); 4078d147543dSIlya Dryomov kfree(rbd_dev->opts); 4079c53d5893SAlex Elder kfree(rbd_dev); 4080c53d5893SAlex Elder } 4081c53d5893SAlex Elder 4082dfc5606dSYehuda Sadeh /* 40839d475de5SAlex Elder * Get the size and object order for an image snapshot, or if 40849d475de5SAlex Elder * snap_id is CEPH_NOSNAP, gets this information for the base 40859d475de5SAlex Elder * image. 
40869d475de5SAlex Elder */ 40879d475de5SAlex Elder static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id, 40889d475de5SAlex Elder u8 *order, u64 *snap_size) 40899d475de5SAlex Elder { 40909d475de5SAlex Elder __le64 snapid = cpu_to_le64(snap_id); 40919d475de5SAlex Elder int ret; 40929d475de5SAlex Elder struct { 40939d475de5SAlex Elder u8 order; 40949d475de5SAlex Elder __le64 size; 40959d475de5SAlex Elder } __attribute__ ((packed)) size_buf = { 0 }; 40969d475de5SAlex Elder 409736be9a76SAlex Elder ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, 40989d475de5SAlex Elder "rbd", "get_size", 40994157976bSAlex Elder &snapid, sizeof (snapid), 4100e2a58ee5SAlex Elder &size_buf, sizeof (size_buf)); 410136be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 41029d475de5SAlex Elder if (ret < 0) 41039d475de5SAlex Elder return ret; 410457385b51SAlex Elder if (ret < sizeof (size_buf)) 410557385b51SAlex Elder return -ERANGE; 41069d475de5SAlex Elder 4107c3545579SJosh Durgin if (order) { 41089d475de5SAlex Elder *order = size_buf.order; 4109c3545579SJosh Durgin dout(" order %u", (unsigned int)*order); 4110c3545579SJosh Durgin } 41119d475de5SAlex Elder *snap_size = le64_to_cpu(size_buf.size); 41129d475de5SAlex Elder 4113c3545579SJosh Durgin dout(" snap_id 0x%016llx snap_size = %llu\n", 4114c3545579SJosh Durgin (unsigned long long)snap_id, 41159d475de5SAlex Elder (unsigned long long)*snap_size); 41169d475de5SAlex Elder 41179d475de5SAlex Elder return 0; 41189d475de5SAlex Elder } 41199d475de5SAlex Elder 41209d475de5SAlex Elder static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev) 41219d475de5SAlex Elder { 41229d475de5SAlex Elder return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP, 41239d475de5SAlex Elder &rbd_dev->header.obj_order, 41249d475de5SAlex Elder &rbd_dev->header.image_size); 41259d475de5SAlex Elder } 41269d475de5SAlex Elder 41271e130199SAlex Elder static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev) 
41281e130199SAlex Elder { 41291e130199SAlex Elder void *reply_buf; 41301e130199SAlex Elder int ret; 41311e130199SAlex Elder void *p; 41321e130199SAlex Elder 41331e130199SAlex Elder reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL); 41341e130199SAlex Elder if (!reply_buf) 41351e130199SAlex Elder return -ENOMEM; 41361e130199SAlex Elder 413736be9a76SAlex Elder ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, 41384157976bSAlex Elder "rbd", "get_object_prefix", NULL, 0, 4139e2a58ee5SAlex Elder reply_buf, RBD_OBJ_PREFIX_LEN_MAX); 414036be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 41411e130199SAlex Elder if (ret < 0) 41421e130199SAlex Elder goto out; 41431e130199SAlex Elder 41441e130199SAlex Elder p = reply_buf; 41451e130199SAlex Elder rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p, 414657385b51SAlex Elder p + ret, NULL, GFP_NOIO); 414757385b51SAlex Elder ret = 0; 41481e130199SAlex Elder 41491e130199SAlex Elder if (IS_ERR(rbd_dev->header.object_prefix)) { 41501e130199SAlex Elder ret = PTR_ERR(rbd_dev->header.object_prefix); 41511e130199SAlex Elder rbd_dev->header.object_prefix = NULL; 41521e130199SAlex Elder } else { 41531e130199SAlex Elder dout(" object_prefix = %s\n", rbd_dev->header.object_prefix); 41541e130199SAlex Elder } 41551e130199SAlex Elder out: 41561e130199SAlex Elder kfree(reply_buf); 41571e130199SAlex Elder 41581e130199SAlex Elder return ret; 41591e130199SAlex Elder } 41601e130199SAlex Elder 4161b1b5402aSAlex Elder static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id, 4162b1b5402aSAlex Elder u64 *snap_features) 4163b1b5402aSAlex Elder { 4164b1b5402aSAlex Elder __le64 snapid = cpu_to_le64(snap_id); 4165b1b5402aSAlex Elder struct { 4166b1b5402aSAlex Elder __le64 features; 4167b1b5402aSAlex Elder __le64 incompat; 41684157976bSAlex Elder } __attribute__ ((packed)) features_buf = { 0 }; 4169d889140cSAlex Elder u64 incompat; 4170b1b5402aSAlex Elder int ret; 4171b1b5402aSAlex 
Elder 417236be9a76SAlex Elder ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, 4173b1b5402aSAlex Elder "rbd", "get_features", 41744157976bSAlex Elder &snapid, sizeof (snapid), 4175e2a58ee5SAlex Elder &features_buf, sizeof (features_buf)); 417636be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 4177b1b5402aSAlex Elder if (ret < 0) 4178b1b5402aSAlex Elder return ret; 417957385b51SAlex Elder if (ret < sizeof (features_buf)) 418057385b51SAlex Elder return -ERANGE; 4181d889140cSAlex Elder 4182d889140cSAlex Elder incompat = le64_to_cpu(features_buf.incompat); 41835cbf6f12SAlex Elder if (incompat & ~RBD_FEATURES_SUPPORTED) 4184b8f5c6edSAlex Elder return -ENXIO; 4185d889140cSAlex Elder 4186b1b5402aSAlex Elder *snap_features = le64_to_cpu(features_buf.features); 4187b1b5402aSAlex Elder 4188b1b5402aSAlex Elder dout(" snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n", 4189b1b5402aSAlex Elder (unsigned long long)snap_id, 4190b1b5402aSAlex Elder (unsigned long long)*snap_features, 4191b1b5402aSAlex Elder (unsigned long long)le64_to_cpu(features_buf.incompat)); 4192b1b5402aSAlex Elder 4193b1b5402aSAlex Elder return 0; 4194b1b5402aSAlex Elder } 4195b1b5402aSAlex Elder 4196b1b5402aSAlex Elder static int rbd_dev_v2_features(struct rbd_device *rbd_dev) 4197b1b5402aSAlex Elder { 4198b1b5402aSAlex Elder return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP, 4199b1b5402aSAlex Elder &rbd_dev->header.features); 4200b1b5402aSAlex Elder } 4201b1b5402aSAlex Elder 420286b00e0dSAlex Elder static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev) 420386b00e0dSAlex Elder { 420486b00e0dSAlex Elder struct rbd_spec *parent_spec; 420586b00e0dSAlex Elder size_t size; 420686b00e0dSAlex Elder void *reply_buf = NULL; 420786b00e0dSAlex Elder __le64 snapid; 420886b00e0dSAlex Elder void *p; 420986b00e0dSAlex Elder void *end; 4210642a2537SAlex Elder u64 pool_id; 421186b00e0dSAlex Elder char *image_id; 42123b5cf2a2SAlex Elder u64 snap_id; 
421386b00e0dSAlex Elder u64 overlap; 421486b00e0dSAlex Elder int ret; 421586b00e0dSAlex Elder 421686b00e0dSAlex Elder parent_spec = rbd_spec_alloc(); 421786b00e0dSAlex Elder if (!parent_spec) 421886b00e0dSAlex Elder return -ENOMEM; 421986b00e0dSAlex Elder 422086b00e0dSAlex Elder size = sizeof (__le64) + /* pool_id */ 422186b00e0dSAlex Elder sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX + /* image_id */ 422286b00e0dSAlex Elder sizeof (__le64) + /* snap_id */ 422386b00e0dSAlex Elder sizeof (__le64); /* overlap */ 422486b00e0dSAlex Elder reply_buf = kmalloc(size, GFP_KERNEL); 422586b00e0dSAlex Elder if (!reply_buf) { 422686b00e0dSAlex Elder ret = -ENOMEM; 422786b00e0dSAlex Elder goto out_err; 422886b00e0dSAlex Elder } 422986b00e0dSAlex Elder 42304d9b67cdSIlya Dryomov snapid = cpu_to_le64(rbd_dev->spec->snap_id); 423136be9a76SAlex Elder ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, 423286b00e0dSAlex Elder "rbd", "get_parent", 42334157976bSAlex Elder &snapid, sizeof (snapid), 4234e2a58ee5SAlex Elder reply_buf, size); 423536be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 423686b00e0dSAlex Elder if (ret < 0) 423786b00e0dSAlex Elder goto out_err; 423886b00e0dSAlex Elder 423986b00e0dSAlex Elder p = reply_buf; 424057385b51SAlex Elder end = reply_buf + ret; 424157385b51SAlex Elder ret = -ERANGE; 4242642a2537SAlex Elder ceph_decode_64_safe(&p, end, pool_id, out_err); 4243392a9dadSAlex Elder if (pool_id == CEPH_NOPOOL) { 4244392a9dadSAlex Elder /* 4245392a9dadSAlex Elder * Either the parent never existed, or we have 4246392a9dadSAlex Elder * record of it but the image got flattened so it no 4247392a9dadSAlex Elder * longer has a parent. When the parent of a 4248392a9dadSAlex Elder * layered image disappears we immediately set the 4249392a9dadSAlex Elder * overlap to 0. The effect of this is that all new 4250392a9dadSAlex Elder * requests will be treated as if the image had no 4251392a9dadSAlex Elder * parent. 
4252392a9dadSAlex Elder */ 4253392a9dadSAlex Elder if (rbd_dev->parent_overlap) { 4254392a9dadSAlex Elder rbd_dev->parent_overlap = 0; 4255392a9dadSAlex Elder rbd_dev_parent_put(rbd_dev); 4256392a9dadSAlex Elder pr_info("%s: clone image has been flattened\n", 4257392a9dadSAlex Elder rbd_dev->disk->disk_name); 4258392a9dadSAlex Elder } 4259392a9dadSAlex Elder 426086b00e0dSAlex Elder goto out; /* No parent? No problem. */ 4261392a9dadSAlex Elder } 426286b00e0dSAlex Elder 42630903e875SAlex Elder /* The ceph file layout needs to fit pool id in 32 bits */ 42640903e875SAlex Elder 42650903e875SAlex Elder ret = -EIO; 4266642a2537SAlex Elder if (pool_id > (u64)U32_MAX) { 42679584d508SIlya Dryomov rbd_warn(NULL, "parent pool id too large (%llu > %u)", 4268642a2537SAlex Elder (unsigned long long)pool_id, U32_MAX); 426957385b51SAlex Elder goto out_err; 4270c0cd10dbSAlex Elder } 42710903e875SAlex Elder 4272979ed480SAlex Elder image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL); 427386b00e0dSAlex Elder if (IS_ERR(image_id)) { 427486b00e0dSAlex Elder ret = PTR_ERR(image_id); 427586b00e0dSAlex Elder goto out_err; 427686b00e0dSAlex Elder } 42773b5cf2a2SAlex Elder ceph_decode_64_safe(&p, end, snap_id, out_err); 427886b00e0dSAlex Elder ceph_decode_64_safe(&p, end, overlap, out_err); 427986b00e0dSAlex Elder 42803b5cf2a2SAlex Elder /* 42813b5cf2a2SAlex Elder * The parent won't change (except when the clone is 42823b5cf2a2SAlex Elder * flattened, already handled that). So we only need to 42833b5cf2a2SAlex Elder * record the parent spec we have not already done so. 
42843b5cf2a2SAlex Elder */ 42853b5cf2a2SAlex Elder if (!rbd_dev->parent_spec) { 42863b5cf2a2SAlex Elder parent_spec->pool_id = pool_id; 42873b5cf2a2SAlex Elder parent_spec->image_id = image_id; 42883b5cf2a2SAlex Elder parent_spec->snap_id = snap_id; 428986b00e0dSAlex Elder rbd_dev->parent_spec = parent_spec; 429086b00e0dSAlex Elder parent_spec = NULL; /* rbd_dev now owns this */ 4291fbba11b3SIlya Dryomov } else { 4292fbba11b3SIlya Dryomov kfree(image_id); 42933b5cf2a2SAlex Elder } 42943b5cf2a2SAlex Elder 42953b5cf2a2SAlex Elder /* 4296cf32bd9cSIlya Dryomov * We always update the parent overlap. If it's zero we issue 4297cf32bd9cSIlya Dryomov * a warning, as we will proceed as if there was no parent. 42983b5cf2a2SAlex Elder */ 42993b5cf2a2SAlex Elder if (!overlap) { 43003b5cf2a2SAlex Elder if (parent_spec) { 4301cf32bd9cSIlya Dryomov /* refresh, careful to warn just once */ 4302cf32bd9cSIlya Dryomov if (rbd_dev->parent_overlap) 4303cf32bd9cSIlya Dryomov rbd_warn(rbd_dev, 4304cf32bd9cSIlya Dryomov "clone now standalone (overlap became 0)"); 430570cf49cfSAlex Elder } else { 4306cf32bd9cSIlya Dryomov /* initial probe */ 4307cf32bd9cSIlya Dryomov rbd_warn(rbd_dev, "clone is standalone (overlap 0)"); 43083b5cf2a2SAlex Elder } 430970cf49cfSAlex Elder } 4310cf32bd9cSIlya Dryomov rbd_dev->parent_overlap = overlap; 4311cf32bd9cSIlya Dryomov 431286b00e0dSAlex Elder out: 431386b00e0dSAlex Elder ret = 0; 431486b00e0dSAlex Elder out_err: 431586b00e0dSAlex Elder kfree(reply_buf); 431686b00e0dSAlex Elder rbd_spec_put(parent_spec); 431786b00e0dSAlex Elder 431886b00e0dSAlex Elder return ret; 431986b00e0dSAlex Elder } 432086b00e0dSAlex Elder 4321cc070d59SAlex Elder static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev) 4322cc070d59SAlex Elder { 4323cc070d59SAlex Elder struct { 4324cc070d59SAlex Elder __le64 stripe_unit; 4325cc070d59SAlex Elder __le64 stripe_count; 4326cc070d59SAlex Elder } __attribute__ ((packed)) striping_info_buf = { 0 }; 4327cc070d59SAlex Elder size_t 
size = sizeof (striping_info_buf); 4328cc070d59SAlex Elder void *p; 4329cc070d59SAlex Elder u64 obj_size; 4330cc070d59SAlex Elder u64 stripe_unit; 4331cc070d59SAlex Elder u64 stripe_count; 4332cc070d59SAlex Elder int ret; 4333cc070d59SAlex Elder 4334cc070d59SAlex Elder ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, 4335cc070d59SAlex Elder "rbd", "get_stripe_unit_count", NULL, 0, 4336e2a58ee5SAlex Elder (char *)&striping_info_buf, size); 4337cc070d59SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 4338cc070d59SAlex Elder if (ret < 0) 4339cc070d59SAlex Elder return ret; 4340cc070d59SAlex Elder if (ret < size) 4341cc070d59SAlex Elder return -ERANGE; 4342cc070d59SAlex Elder 4343cc070d59SAlex Elder /* 4344cc070d59SAlex Elder * We don't actually support the "fancy striping" feature 4345cc070d59SAlex Elder * (STRIPINGV2) yet, but if the striping sizes are the 4346cc070d59SAlex Elder * defaults the behavior is the same as before. So find 4347cc070d59SAlex Elder * out, and only fail if the image has non-default values. 
4348cc070d59SAlex Elder */ 4349cc070d59SAlex Elder ret = -EINVAL; 4350cc070d59SAlex Elder obj_size = (u64)1 << rbd_dev->header.obj_order; 4351cc070d59SAlex Elder p = &striping_info_buf; 4352cc070d59SAlex Elder stripe_unit = ceph_decode_64(&p); 4353cc070d59SAlex Elder if (stripe_unit != obj_size) { 4354cc070d59SAlex Elder rbd_warn(rbd_dev, "unsupported stripe unit " 4355cc070d59SAlex Elder "(got %llu want %llu)", 4356cc070d59SAlex Elder stripe_unit, obj_size); 4357cc070d59SAlex Elder return -EINVAL; 4358cc070d59SAlex Elder } 4359cc070d59SAlex Elder stripe_count = ceph_decode_64(&p); 4360cc070d59SAlex Elder if (stripe_count != 1) { 4361cc070d59SAlex Elder rbd_warn(rbd_dev, "unsupported stripe count " 4362cc070d59SAlex Elder "(got %llu want 1)", stripe_count); 4363cc070d59SAlex Elder return -EINVAL; 4364cc070d59SAlex Elder } 4365500d0c0fSAlex Elder rbd_dev->header.stripe_unit = stripe_unit; 4366500d0c0fSAlex Elder rbd_dev->header.stripe_count = stripe_count; 4367cc070d59SAlex Elder 4368cc070d59SAlex Elder return 0; 4369cc070d59SAlex Elder } 4370cc070d59SAlex Elder 43719e15b77dSAlex Elder static char *rbd_dev_image_name(struct rbd_device *rbd_dev) 43729e15b77dSAlex Elder { 43739e15b77dSAlex Elder size_t image_id_size; 43749e15b77dSAlex Elder char *image_id; 43759e15b77dSAlex Elder void *p; 43769e15b77dSAlex Elder void *end; 43779e15b77dSAlex Elder size_t size; 43789e15b77dSAlex Elder void *reply_buf = NULL; 43799e15b77dSAlex Elder size_t len = 0; 43809e15b77dSAlex Elder char *image_name = NULL; 43819e15b77dSAlex Elder int ret; 43829e15b77dSAlex Elder 43839e15b77dSAlex Elder rbd_assert(!rbd_dev->spec->image_name); 43849e15b77dSAlex Elder 438569e7a02fSAlex Elder len = strlen(rbd_dev->spec->image_id); 438669e7a02fSAlex Elder image_id_size = sizeof (__le32) + len; 43879e15b77dSAlex Elder image_id = kmalloc(image_id_size, GFP_KERNEL); 43889e15b77dSAlex Elder if (!image_id) 43899e15b77dSAlex Elder return NULL; 43909e15b77dSAlex Elder 43919e15b77dSAlex Elder p = image_id; 
43924157976bSAlex Elder end = image_id + image_id_size; 439369e7a02fSAlex Elder ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32)len); 43949e15b77dSAlex Elder 43959e15b77dSAlex Elder size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX; 43969e15b77dSAlex Elder reply_buf = kmalloc(size, GFP_KERNEL); 43979e15b77dSAlex Elder if (!reply_buf) 43989e15b77dSAlex Elder goto out; 43999e15b77dSAlex Elder 440036be9a76SAlex Elder ret = rbd_obj_method_sync(rbd_dev, RBD_DIRECTORY, 44019e15b77dSAlex Elder "rbd", "dir_get_name", 44029e15b77dSAlex Elder image_id, image_id_size, 4403e2a58ee5SAlex Elder reply_buf, size); 44049e15b77dSAlex Elder if (ret < 0) 44059e15b77dSAlex Elder goto out; 44069e15b77dSAlex Elder p = reply_buf; 4407f40eb349SAlex Elder end = reply_buf + ret; 4408f40eb349SAlex Elder 44099e15b77dSAlex Elder image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL); 44109e15b77dSAlex Elder if (IS_ERR(image_name)) 44119e15b77dSAlex Elder image_name = NULL; 44129e15b77dSAlex Elder else 44139e15b77dSAlex Elder dout("%s: name is %s len is %zd\n", __func__, image_name, len); 44149e15b77dSAlex Elder out: 44159e15b77dSAlex Elder kfree(reply_buf); 44169e15b77dSAlex Elder kfree(image_id); 44179e15b77dSAlex Elder 44189e15b77dSAlex Elder return image_name; 44199e15b77dSAlex Elder } 44209e15b77dSAlex Elder 44212ad3d716SAlex Elder static u64 rbd_v1_snap_id_by_name(struct rbd_device *rbd_dev, const char *name) 44222ad3d716SAlex Elder { 44232ad3d716SAlex Elder struct ceph_snap_context *snapc = rbd_dev->header.snapc; 44242ad3d716SAlex Elder const char *snap_name; 44252ad3d716SAlex Elder u32 which = 0; 44262ad3d716SAlex Elder 44272ad3d716SAlex Elder /* Skip over names until we find the one we are looking for */ 44282ad3d716SAlex Elder 44292ad3d716SAlex Elder snap_name = rbd_dev->header.snap_names; 44302ad3d716SAlex Elder while (which < snapc->num_snaps) { 44312ad3d716SAlex Elder if (!strcmp(name, snap_name)) 44322ad3d716SAlex Elder return snapc->snaps[which]; 
44332ad3d716SAlex Elder snap_name += strlen(snap_name) + 1; 44342ad3d716SAlex Elder which++; 44352ad3d716SAlex Elder } 44362ad3d716SAlex Elder return CEPH_NOSNAP; 44372ad3d716SAlex Elder } 44382ad3d716SAlex Elder 44392ad3d716SAlex Elder static u64 rbd_v2_snap_id_by_name(struct rbd_device *rbd_dev, const char *name) 44402ad3d716SAlex Elder { 44412ad3d716SAlex Elder struct ceph_snap_context *snapc = rbd_dev->header.snapc; 44422ad3d716SAlex Elder u32 which; 44432ad3d716SAlex Elder bool found = false; 44442ad3d716SAlex Elder u64 snap_id; 44452ad3d716SAlex Elder 44462ad3d716SAlex Elder for (which = 0; !found && which < snapc->num_snaps; which++) { 44472ad3d716SAlex Elder const char *snap_name; 44482ad3d716SAlex Elder 44492ad3d716SAlex Elder snap_id = snapc->snaps[which]; 44502ad3d716SAlex Elder snap_name = rbd_dev_v2_snap_name(rbd_dev, snap_id); 4451efadc98aSJosh Durgin if (IS_ERR(snap_name)) { 4452efadc98aSJosh Durgin /* ignore no-longer existing snapshots */ 4453efadc98aSJosh Durgin if (PTR_ERR(snap_name) == -ENOENT) 4454efadc98aSJosh Durgin continue; 4455efadc98aSJosh Durgin else 44562ad3d716SAlex Elder break; 4457efadc98aSJosh Durgin } 44582ad3d716SAlex Elder found = !strcmp(name, snap_name); 44592ad3d716SAlex Elder kfree(snap_name); 44602ad3d716SAlex Elder } 44612ad3d716SAlex Elder return found ? snap_id : CEPH_NOSNAP; 44622ad3d716SAlex Elder } 44632ad3d716SAlex Elder 44642ad3d716SAlex Elder /* 44652ad3d716SAlex Elder * Assumes name is never RBD_SNAP_HEAD_NAME; returns CEPH_NOSNAP if 44662ad3d716SAlex Elder * no snapshot by that name is found, or if an error occurs. 
44672ad3d716SAlex Elder */ 44682ad3d716SAlex Elder static u64 rbd_snap_id_by_name(struct rbd_device *rbd_dev, const char *name) 44692ad3d716SAlex Elder { 44702ad3d716SAlex Elder if (rbd_dev->image_format == 1) 44712ad3d716SAlex Elder return rbd_v1_snap_id_by_name(rbd_dev, name); 44722ad3d716SAlex Elder 44732ad3d716SAlex Elder return rbd_v2_snap_id_by_name(rbd_dev, name); 44742ad3d716SAlex Elder } 44752ad3d716SAlex Elder 44769e15b77dSAlex Elder /* 447704077599SIlya Dryomov * An image being mapped will have everything but the snap id. 44789e15b77dSAlex Elder */ 447904077599SIlya Dryomov static int rbd_spec_fill_snap_id(struct rbd_device *rbd_dev) 448004077599SIlya Dryomov { 448104077599SIlya Dryomov struct rbd_spec *spec = rbd_dev->spec; 448204077599SIlya Dryomov 448304077599SIlya Dryomov rbd_assert(spec->pool_id != CEPH_NOPOOL && spec->pool_name); 448404077599SIlya Dryomov rbd_assert(spec->image_id && spec->image_name); 448504077599SIlya Dryomov rbd_assert(spec->snap_name); 448604077599SIlya Dryomov 448704077599SIlya Dryomov if (strcmp(spec->snap_name, RBD_SNAP_HEAD_NAME)) { 448804077599SIlya Dryomov u64 snap_id; 448904077599SIlya Dryomov 449004077599SIlya Dryomov snap_id = rbd_snap_id_by_name(rbd_dev, spec->snap_name); 449104077599SIlya Dryomov if (snap_id == CEPH_NOSNAP) 449204077599SIlya Dryomov return -ENOENT; 449304077599SIlya Dryomov 449404077599SIlya Dryomov spec->snap_id = snap_id; 449504077599SIlya Dryomov } else { 449604077599SIlya Dryomov spec->snap_id = CEPH_NOSNAP; 449704077599SIlya Dryomov } 449804077599SIlya Dryomov 449904077599SIlya Dryomov return 0; 450004077599SIlya Dryomov } 450104077599SIlya Dryomov 450204077599SIlya Dryomov /* 450304077599SIlya Dryomov * A parent image will have all ids but none of the names. 450404077599SIlya Dryomov * 450504077599SIlya Dryomov * All names in an rbd spec are dynamically allocated. It's OK if we 450604077599SIlya Dryomov * can't figure out the name for an image id. 
450704077599SIlya Dryomov */ 450804077599SIlya Dryomov static int rbd_spec_fill_names(struct rbd_device *rbd_dev) 45099e15b77dSAlex Elder { 45102e9f7f1cSAlex Elder struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 45112e9f7f1cSAlex Elder struct rbd_spec *spec = rbd_dev->spec; 45122e9f7f1cSAlex Elder const char *pool_name; 45132e9f7f1cSAlex Elder const char *image_name; 45142e9f7f1cSAlex Elder const char *snap_name; 45159e15b77dSAlex Elder int ret; 45169e15b77dSAlex Elder 451704077599SIlya Dryomov rbd_assert(spec->pool_id != CEPH_NOPOOL); 451804077599SIlya Dryomov rbd_assert(spec->image_id); 451904077599SIlya Dryomov rbd_assert(spec->snap_id != CEPH_NOSNAP); 45209e15b77dSAlex Elder 45212e9f7f1cSAlex Elder /* Get the pool name; we have to make our own copy of this */ 45229e15b77dSAlex Elder 45232e9f7f1cSAlex Elder pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, spec->pool_id); 45242e9f7f1cSAlex Elder if (!pool_name) { 45252e9f7f1cSAlex Elder rbd_warn(rbd_dev, "no pool with id %llu", spec->pool_id); 4526935dc89fSAlex Elder return -EIO; 4527935dc89fSAlex Elder } 45282e9f7f1cSAlex Elder pool_name = kstrdup(pool_name, GFP_KERNEL); 45292e9f7f1cSAlex Elder if (!pool_name) 45309e15b77dSAlex Elder return -ENOMEM; 45319e15b77dSAlex Elder 45329e15b77dSAlex Elder /* Fetch the image name; tolerate failure here */ 45339e15b77dSAlex Elder 45342e9f7f1cSAlex Elder image_name = rbd_dev_image_name(rbd_dev); 45352e9f7f1cSAlex Elder if (!image_name) 453606ecc6cbSAlex Elder rbd_warn(rbd_dev, "unable to get image name"); 45379e15b77dSAlex Elder 453804077599SIlya Dryomov /* Fetch the snapshot name */ 45399e15b77dSAlex Elder 45402e9f7f1cSAlex Elder snap_name = rbd_snap_name(rbd_dev, spec->snap_id); 4541da6a6b63SJosh Durgin if (IS_ERR(snap_name)) { 4542da6a6b63SJosh Durgin ret = PTR_ERR(snap_name); 45439e15b77dSAlex Elder goto out_err; 45442e9f7f1cSAlex Elder } 45452e9f7f1cSAlex Elder 45462e9f7f1cSAlex Elder spec->pool_name = pool_name; 45472e9f7f1cSAlex Elder 
spec->image_name = image_name; 45482e9f7f1cSAlex Elder spec->snap_name = snap_name; 45499e15b77dSAlex Elder 45509e15b77dSAlex Elder return 0; 455104077599SIlya Dryomov 45529e15b77dSAlex Elder out_err: 45532e9f7f1cSAlex Elder kfree(image_name); 45542e9f7f1cSAlex Elder kfree(pool_name); 45559e15b77dSAlex Elder return ret; 45569e15b77dSAlex Elder } 45579e15b77dSAlex Elder 4558cc4a38bdSAlex Elder static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev) 455935d489f9SAlex Elder { 456035d489f9SAlex Elder size_t size; 456135d489f9SAlex Elder int ret; 456235d489f9SAlex Elder void *reply_buf; 456335d489f9SAlex Elder void *p; 456435d489f9SAlex Elder void *end; 456535d489f9SAlex Elder u64 seq; 456635d489f9SAlex Elder u32 snap_count; 456735d489f9SAlex Elder struct ceph_snap_context *snapc; 456835d489f9SAlex Elder u32 i; 456935d489f9SAlex Elder 457035d489f9SAlex Elder /* 457135d489f9SAlex Elder * We'll need room for the seq value (maximum snapshot id), 457235d489f9SAlex Elder * snapshot count, and array of that many snapshot ids. 457335d489f9SAlex Elder * For now we have a fixed upper limit on the number we're 457435d489f9SAlex Elder * prepared to receive. 
457535d489f9SAlex Elder */ 457635d489f9SAlex Elder size = sizeof (__le64) + sizeof (__le32) + 457735d489f9SAlex Elder RBD_MAX_SNAP_COUNT * sizeof (__le64); 457835d489f9SAlex Elder reply_buf = kzalloc(size, GFP_KERNEL); 457935d489f9SAlex Elder if (!reply_buf) 458035d489f9SAlex Elder return -ENOMEM; 458135d489f9SAlex Elder 458236be9a76SAlex Elder ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, 45834157976bSAlex Elder "rbd", "get_snapcontext", NULL, 0, 4584e2a58ee5SAlex Elder reply_buf, size); 458536be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 458635d489f9SAlex Elder if (ret < 0) 458735d489f9SAlex Elder goto out; 458835d489f9SAlex Elder 458935d489f9SAlex Elder p = reply_buf; 459057385b51SAlex Elder end = reply_buf + ret; 459157385b51SAlex Elder ret = -ERANGE; 459235d489f9SAlex Elder ceph_decode_64_safe(&p, end, seq, out); 459335d489f9SAlex Elder ceph_decode_32_safe(&p, end, snap_count, out); 459435d489f9SAlex Elder 459535d489f9SAlex Elder /* 459635d489f9SAlex Elder * Make sure the reported number of snapshot ids wouldn't go 459735d489f9SAlex Elder * beyond the end of our buffer. But before checking that, 459835d489f9SAlex Elder * make sure the computed size of the snapshot context we 459935d489f9SAlex Elder * allocate is representable in a size_t. 
460035d489f9SAlex Elder */ 460135d489f9SAlex Elder if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context)) 460235d489f9SAlex Elder / sizeof (u64)) { 460335d489f9SAlex Elder ret = -EINVAL; 460435d489f9SAlex Elder goto out; 460535d489f9SAlex Elder } 460635d489f9SAlex Elder if (!ceph_has_room(&p, end, snap_count * sizeof (__le64))) 460735d489f9SAlex Elder goto out; 4608468521c1SAlex Elder ret = 0; 460935d489f9SAlex Elder 4610812164f8SAlex Elder snapc = ceph_create_snap_context(snap_count, GFP_KERNEL); 461135d489f9SAlex Elder if (!snapc) { 461235d489f9SAlex Elder ret = -ENOMEM; 461335d489f9SAlex Elder goto out; 461435d489f9SAlex Elder } 461535d489f9SAlex Elder snapc->seq = seq; 461635d489f9SAlex Elder for (i = 0; i < snap_count; i++) 461735d489f9SAlex Elder snapc->snaps[i] = ceph_decode_64(&p); 461835d489f9SAlex Elder 461949ece554SAlex Elder ceph_put_snap_context(rbd_dev->header.snapc); 462035d489f9SAlex Elder rbd_dev->header.snapc = snapc; 462135d489f9SAlex Elder 462235d489f9SAlex Elder dout(" snap context seq = %llu, snap_count = %u\n", 462335d489f9SAlex Elder (unsigned long long)seq, (unsigned int)snap_count); 462435d489f9SAlex Elder out: 462535d489f9SAlex Elder kfree(reply_buf); 462635d489f9SAlex Elder 462757385b51SAlex Elder return ret; 462835d489f9SAlex Elder } 462935d489f9SAlex Elder 463054cac61fSAlex Elder static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, 463154cac61fSAlex Elder u64 snap_id) 4632b8b1e2dbSAlex Elder { 4633b8b1e2dbSAlex Elder size_t size; 4634b8b1e2dbSAlex Elder void *reply_buf; 463554cac61fSAlex Elder __le64 snapid; 4636b8b1e2dbSAlex Elder int ret; 4637b8b1e2dbSAlex Elder void *p; 4638b8b1e2dbSAlex Elder void *end; 4639b8b1e2dbSAlex Elder char *snap_name; 4640b8b1e2dbSAlex Elder 4641b8b1e2dbSAlex Elder size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN; 4642b8b1e2dbSAlex Elder reply_buf = kmalloc(size, GFP_KERNEL); 4643b8b1e2dbSAlex Elder if (!reply_buf) 4644b8b1e2dbSAlex Elder return ERR_PTR(-ENOMEM); 4645b8b1e2dbSAlex 
Elder 464654cac61fSAlex Elder snapid = cpu_to_le64(snap_id); 464736be9a76SAlex Elder ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, 4648b8b1e2dbSAlex Elder "rbd", "get_snapshot_name", 464954cac61fSAlex Elder &snapid, sizeof (snapid), 4650e2a58ee5SAlex Elder reply_buf, size); 465136be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 4652f40eb349SAlex Elder if (ret < 0) { 4653f40eb349SAlex Elder snap_name = ERR_PTR(ret); 4654b8b1e2dbSAlex Elder goto out; 4655f40eb349SAlex Elder } 4656b8b1e2dbSAlex Elder 4657b8b1e2dbSAlex Elder p = reply_buf; 4658f40eb349SAlex Elder end = reply_buf + ret; 4659e5c35534SAlex Elder snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL); 4660f40eb349SAlex Elder if (IS_ERR(snap_name)) 4661b8b1e2dbSAlex Elder goto out; 4662f40eb349SAlex Elder 4663b8b1e2dbSAlex Elder dout(" snap_id 0x%016llx snap_name = %s\n", 466454cac61fSAlex Elder (unsigned long long)snap_id, snap_name); 4665b8b1e2dbSAlex Elder out: 4666b8b1e2dbSAlex Elder kfree(reply_buf); 4667b8b1e2dbSAlex Elder 4668f40eb349SAlex Elder return snap_name; 4669b8b1e2dbSAlex Elder } 4670b8b1e2dbSAlex Elder 46712df3fac7SAlex Elder static int rbd_dev_v2_header_info(struct rbd_device *rbd_dev) 4672117973fbSAlex Elder { 46732df3fac7SAlex Elder bool first_time = rbd_dev->header.object_prefix == NULL; 4674117973fbSAlex Elder int ret; 4675117973fbSAlex Elder 46761617e40cSJosh Durgin ret = rbd_dev_v2_image_size(rbd_dev); 46771617e40cSJosh Durgin if (ret) 4678cfbf6377SAlex Elder return ret; 46791617e40cSJosh Durgin 46802df3fac7SAlex Elder if (first_time) { 46812df3fac7SAlex Elder ret = rbd_dev_v2_header_onetime(rbd_dev); 46822df3fac7SAlex Elder if (ret) 4683cfbf6377SAlex Elder return ret; 46842df3fac7SAlex Elder } 46852df3fac7SAlex Elder 4686cc4a38bdSAlex Elder ret = rbd_dev_v2_snap_context(rbd_dev); 4687d194cd1dSIlya Dryomov if (ret && first_time) { 4688d194cd1dSIlya Dryomov kfree(rbd_dev->header.object_prefix); 4689d194cd1dSIlya Dryomov 
rbd_dev->header.object_prefix = NULL; 4690d194cd1dSIlya Dryomov } 4691117973fbSAlex Elder 4692117973fbSAlex Elder return ret; 4693117973fbSAlex Elder } 4694117973fbSAlex Elder 4695a720ae09SIlya Dryomov static int rbd_dev_header_info(struct rbd_device *rbd_dev) 4696a720ae09SIlya Dryomov { 4697a720ae09SIlya Dryomov rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); 4698a720ae09SIlya Dryomov 4699a720ae09SIlya Dryomov if (rbd_dev->image_format == 1) 4700a720ae09SIlya Dryomov return rbd_dev_v1_header_info(rbd_dev); 4701a720ae09SIlya Dryomov 4702a720ae09SIlya Dryomov return rbd_dev_v2_header_info(rbd_dev); 4703a720ae09SIlya Dryomov } 4704a720ae09SIlya Dryomov 4705dfc5606dSYehuda Sadeh static int rbd_bus_add_dev(struct rbd_device *rbd_dev) 4706dfc5606dSYehuda Sadeh { 4707dfc5606dSYehuda Sadeh struct device *dev; 4708cd789ab9SAlex Elder int ret; 4709dfc5606dSYehuda Sadeh 4710cd789ab9SAlex Elder dev = &rbd_dev->dev; 4711dfc5606dSYehuda Sadeh dev->bus = &rbd_bus_type; 4712dfc5606dSYehuda Sadeh dev->type = &rbd_device_type; 4713dfc5606dSYehuda Sadeh dev->parent = &rbd_root_dev; 4714200a6a8bSAlex Elder dev->release = rbd_dev_device_release; 4715de71a297SAlex Elder dev_set_name(dev, "%d", rbd_dev->dev_id); 4716dfc5606dSYehuda Sadeh ret = device_register(dev); 4717dfc5606dSYehuda Sadeh 4718dfc5606dSYehuda Sadeh return ret; 4719602adf40SYehuda Sadeh } 4720602adf40SYehuda Sadeh 4721dfc5606dSYehuda Sadeh static void rbd_bus_del_dev(struct rbd_device *rbd_dev) 4722dfc5606dSYehuda Sadeh { 4723dfc5606dSYehuda Sadeh device_unregister(&rbd_dev->dev); 4724dfc5606dSYehuda Sadeh } 4725dfc5606dSYehuda Sadeh 47261ddbe94eSAlex Elder /* 4727499afd5bSAlex Elder * Get a unique rbd identifier for the given new rbd_dev, and add 4728f8a22fc2SIlya Dryomov * the rbd_dev to the global list. 
47291ddbe94eSAlex Elder */ 4730f8a22fc2SIlya Dryomov static int rbd_dev_id_get(struct rbd_device *rbd_dev) 4731b7f23c36SAlex Elder { 4732f8a22fc2SIlya Dryomov int new_dev_id; 4733f8a22fc2SIlya Dryomov 47349b60e70bSIlya Dryomov new_dev_id = ida_simple_get(&rbd_dev_id_ida, 47359b60e70bSIlya Dryomov 0, minor_to_rbd_dev_id(1 << MINORBITS), 47369b60e70bSIlya Dryomov GFP_KERNEL); 4737f8a22fc2SIlya Dryomov if (new_dev_id < 0) 4738f8a22fc2SIlya Dryomov return new_dev_id; 4739f8a22fc2SIlya Dryomov 4740f8a22fc2SIlya Dryomov rbd_dev->dev_id = new_dev_id; 4741499afd5bSAlex Elder 4742499afd5bSAlex Elder spin_lock(&rbd_dev_list_lock); 4743499afd5bSAlex Elder list_add_tail(&rbd_dev->node, &rbd_dev_list); 4744499afd5bSAlex Elder spin_unlock(&rbd_dev_list_lock); 4745f8a22fc2SIlya Dryomov 474670eebd20SIlya Dryomov dout("rbd_dev %p given dev id %d\n", rbd_dev, rbd_dev->dev_id); 4747f8a22fc2SIlya Dryomov 4748f8a22fc2SIlya Dryomov return 0; 4749b7f23c36SAlex Elder } 4750b7f23c36SAlex Elder 47511ddbe94eSAlex Elder /* 4752499afd5bSAlex Elder * Remove an rbd_dev from the global list, and record that its 4753499afd5bSAlex Elder * identifier is no longer in use. 47541ddbe94eSAlex Elder */ 4755e2839308SAlex Elder static void rbd_dev_id_put(struct rbd_device *rbd_dev) 47561ddbe94eSAlex Elder { 4757499afd5bSAlex Elder spin_lock(&rbd_dev_list_lock); 4758499afd5bSAlex Elder list_del_init(&rbd_dev->node); 4759499afd5bSAlex Elder spin_unlock(&rbd_dev_list_lock); 47601ddbe94eSAlex Elder 4761f8a22fc2SIlya Dryomov ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id); 4762f8a22fc2SIlya Dryomov 4763f8a22fc2SIlya Dryomov dout("rbd_dev %p released dev id %d\n", rbd_dev, rbd_dev->dev_id); 4764b7f23c36SAlex Elder } 4765b7f23c36SAlex Elder 4766a725f65eSAlex Elder /* 4767e28fff26SAlex Elder * Skips over white space at *buf, and updates *buf to point to the 4768e28fff26SAlex Elder * first found non-space character (if any). 
/*
 * Advance *buf past any leading white space so that it points at the
 * first non-space character (or at the terminating '\0' if there is
 * none), and return the length of the token -- the run of non-white
 * space characters -- that starts there.
 *
 * *buf must be a '\0'-terminated string.
 */
static inline size_t next_token(const char **buf)
{
	/*
	 * The characters for which isspace() is nonzero in the "C"
	 * and "POSIX" locales.
	 */
	static const char whitespace[] = " \f\n\r\t\v";
	const char *token = *buf + strspn(*buf, whitespace);

	*buf = token;			/* Start of the token */

	return strcspn(token, whitespace);	/* Its length */
}
4800ea3352f4SAlex Elder */ 4801ea3352f4SAlex Elder static inline char *dup_token(const char **buf, size_t *lenp) 4802ea3352f4SAlex Elder { 4803ea3352f4SAlex Elder char *dup; 4804ea3352f4SAlex Elder size_t len; 4805ea3352f4SAlex Elder 4806ea3352f4SAlex Elder len = next_token(buf); 48074caf35f9SAlex Elder dup = kmemdup(*buf, len + 1, GFP_KERNEL); 4808ea3352f4SAlex Elder if (!dup) 4809ea3352f4SAlex Elder return NULL; 4810ea3352f4SAlex Elder *(dup + len) = '\0'; 4811ea3352f4SAlex Elder *buf += len; 4812ea3352f4SAlex Elder 4813ea3352f4SAlex Elder if (lenp) 4814ea3352f4SAlex Elder *lenp = len; 4815ea3352f4SAlex Elder 4816ea3352f4SAlex Elder return dup; 4817ea3352f4SAlex Elder } 4818ea3352f4SAlex Elder 4819ea3352f4SAlex Elder /* 4820859c31dfSAlex Elder * Parse the options provided for an "rbd add" (i.e., rbd image 4821859c31dfSAlex Elder * mapping) request. These arrive via a write to /sys/bus/rbd/add, 4822859c31dfSAlex Elder * and the data written is passed here via a NUL-terminated buffer. 4823859c31dfSAlex Elder * Returns 0 if successful or an error code otherwise. 4824d22f76e7SAlex Elder * 4825859c31dfSAlex Elder * The information extracted from these options is recorded in 4826859c31dfSAlex Elder * the other parameters which return dynamically-allocated 4827859c31dfSAlex Elder * structures: 4828859c31dfSAlex Elder * ceph_opts 4829859c31dfSAlex Elder * The address of a pointer that will refer to a ceph options 4830859c31dfSAlex Elder * structure. Caller must release the returned pointer using 4831859c31dfSAlex Elder * ceph_destroy_options() when it is no longer needed. 4832859c31dfSAlex Elder * rbd_opts 4833859c31dfSAlex Elder * Address of an rbd options pointer. Fully initialized by 4834859c31dfSAlex Elder * this function; caller must release with kfree(). 4835859c31dfSAlex Elder * spec 4836859c31dfSAlex Elder * Address of an rbd image specification pointer. Fully 4837859c31dfSAlex Elder * initialized by this function based on parsed options. 
4838859c31dfSAlex Elder * Caller must release with rbd_spec_put(). 4839859c31dfSAlex Elder * 4840859c31dfSAlex Elder * The options passed take this form: 4841859c31dfSAlex Elder * <mon_addrs> <options> <pool_name> <image_name> [<snap_id>] 4842859c31dfSAlex Elder * where: 4843859c31dfSAlex Elder * <mon_addrs> 4844859c31dfSAlex Elder * A comma-separated list of one or more monitor addresses. 4845859c31dfSAlex Elder * A monitor address is an ip address, optionally followed 4846859c31dfSAlex Elder * by a port number (separated by a colon). 4847859c31dfSAlex Elder * I.e.: ip1[:port1][,ip2[:port2]...] 4848859c31dfSAlex Elder * <options> 4849859c31dfSAlex Elder * A comma-separated list of ceph and/or rbd options. 4850859c31dfSAlex Elder * <pool_name> 4851859c31dfSAlex Elder * The name of the rados pool containing the rbd image. 4852859c31dfSAlex Elder * <image_name> 4853859c31dfSAlex Elder * The name of the image in that pool to map. 4854859c31dfSAlex Elder * <snap_id> 4855859c31dfSAlex Elder * An optional snapshot id. If provided, the mapping will 4856859c31dfSAlex Elder * present data from the image at the time that snapshot was 4857859c31dfSAlex Elder * created. The image head is used if no snapshot id is 4858859c31dfSAlex Elder * provided. Snapshot mappings are always read-only. 
4859a725f65eSAlex Elder */ 4860859c31dfSAlex Elder static int rbd_add_parse_args(const char *buf, 4861dc79b113SAlex Elder struct ceph_options **ceph_opts, 4862859c31dfSAlex Elder struct rbd_options **opts, 4863859c31dfSAlex Elder struct rbd_spec **rbd_spec) 4864a725f65eSAlex Elder { 4865e28fff26SAlex Elder size_t len; 4866859c31dfSAlex Elder char *options; 48670ddebc0cSAlex Elder const char *mon_addrs; 4868ecb4dc22SAlex Elder char *snap_name; 48690ddebc0cSAlex Elder size_t mon_addrs_size; 4870859c31dfSAlex Elder struct rbd_spec *spec = NULL; 48714e9afebaSAlex Elder struct rbd_options *rbd_opts = NULL; 4872859c31dfSAlex Elder struct ceph_options *copts; 4873dc79b113SAlex Elder int ret; 4874e28fff26SAlex Elder 4875e28fff26SAlex Elder /* The first four tokens are required */ 4876e28fff26SAlex Elder 48777ef3214aSAlex Elder len = next_token(&buf); 48784fb5d671SAlex Elder if (!len) { 48794fb5d671SAlex Elder rbd_warn(NULL, "no monitor address(es) provided"); 48804fb5d671SAlex Elder return -EINVAL; 48814fb5d671SAlex Elder } 48820ddebc0cSAlex Elder mon_addrs = buf; 4883f28e565aSAlex Elder mon_addrs_size = len + 1; 48847ef3214aSAlex Elder buf += len; 4885a725f65eSAlex Elder 4886dc79b113SAlex Elder ret = -EINVAL; 4887f28e565aSAlex Elder options = dup_token(&buf, NULL); 4888f28e565aSAlex Elder if (!options) 4889dc79b113SAlex Elder return -ENOMEM; 48904fb5d671SAlex Elder if (!*options) { 48914fb5d671SAlex Elder rbd_warn(NULL, "no options provided"); 48924fb5d671SAlex Elder goto out_err; 48934fb5d671SAlex Elder } 4894a725f65eSAlex Elder 4895859c31dfSAlex Elder spec = rbd_spec_alloc(); 4896859c31dfSAlex Elder if (!spec) 4897f28e565aSAlex Elder goto out_mem; 4898859c31dfSAlex Elder 4899859c31dfSAlex Elder spec->pool_name = dup_token(&buf, NULL); 4900859c31dfSAlex Elder if (!spec->pool_name) 4901859c31dfSAlex Elder goto out_mem; 49024fb5d671SAlex Elder if (!*spec->pool_name) { 49034fb5d671SAlex Elder rbd_warn(NULL, "no pool name provided"); 49044fb5d671SAlex Elder goto out_err; 
49054fb5d671SAlex Elder } 4906e28fff26SAlex Elder 490769e7a02fSAlex Elder spec->image_name = dup_token(&buf, NULL); 4908859c31dfSAlex Elder if (!spec->image_name) 4909f28e565aSAlex Elder goto out_mem; 49104fb5d671SAlex Elder if (!*spec->image_name) { 49114fb5d671SAlex Elder rbd_warn(NULL, "no image name provided"); 49124fb5d671SAlex Elder goto out_err; 49134fb5d671SAlex Elder } 4914e28fff26SAlex Elder 4915f28e565aSAlex Elder /* 4916f28e565aSAlex Elder * Snapshot name is optional; default is to use "-" 4917f28e565aSAlex Elder * (indicating the head/no snapshot). 4918f28e565aSAlex Elder */ 49193feeb894SAlex Elder len = next_token(&buf); 4920820a5f3eSAlex Elder if (!len) { 49213feeb894SAlex Elder buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */ 49223feeb894SAlex Elder len = sizeof (RBD_SNAP_HEAD_NAME) - 1; 4923f28e565aSAlex Elder } else if (len > RBD_MAX_SNAP_NAME_LEN) { 4924dc79b113SAlex Elder ret = -ENAMETOOLONG; 4925f28e565aSAlex Elder goto out_err; 4926849b4260SAlex Elder } 4927ecb4dc22SAlex Elder snap_name = kmemdup(buf, len + 1, GFP_KERNEL); 4928ecb4dc22SAlex Elder if (!snap_name) 4929f28e565aSAlex Elder goto out_mem; 4930ecb4dc22SAlex Elder *(snap_name + len) = '\0'; 4931ecb4dc22SAlex Elder spec->snap_name = snap_name; 4932e5c35534SAlex Elder 49330ddebc0cSAlex Elder /* Initialize all rbd options to the defaults */ 4934e28fff26SAlex Elder 49354e9afebaSAlex Elder rbd_opts = kzalloc(sizeof (*rbd_opts), GFP_KERNEL); 49364e9afebaSAlex Elder if (!rbd_opts) 49374e9afebaSAlex Elder goto out_mem; 49384e9afebaSAlex Elder 49394e9afebaSAlex Elder rbd_opts->read_only = RBD_READ_ONLY_DEFAULT; 4940b5584180SIlya Dryomov rbd_opts->queue_depth = RBD_QUEUE_DEPTH_DEFAULT; 4941d22f76e7SAlex Elder 4942859c31dfSAlex Elder copts = ceph_parse_options(options, mon_addrs, 49430ddebc0cSAlex Elder mon_addrs + mon_addrs_size - 1, 49444e9afebaSAlex Elder parse_rbd_opts_token, rbd_opts); 4945859c31dfSAlex Elder if (IS_ERR(copts)) { 4946859c31dfSAlex Elder ret = PTR_ERR(copts); 
4947dc79b113SAlex Elder goto out_err; 4948dc79b113SAlex Elder } 4949859c31dfSAlex Elder kfree(options); 4950859c31dfSAlex Elder 4951859c31dfSAlex Elder *ceph_opts = copts; 49524e9afebaSAlex Elder *opts = rbd_opts; 4953859c31dfSAlex Elder *rbd_spec = spec; 49540ddebc0cSAlex Elder 4955dc79b113SAlex Elder return 0; 4956f28e565aSAlex Elder out_mem: 4957dc79b113SAlex Elder ret = -ENOMEM; 4958d22f76e7SAlex Elder out_err: 4959859c31dfSAlex Elder kfree(rbd_opts); 4960859c31dfSAlex Elder rbd_spec_put(spec); 4961f28e565aSAlex Elder kfree(options); 4962d22f76e7SAlex Elder 4963dc79b113SAlex Elder return ret; 4964a725f65eSAlex Elder } 4965a725f65eSAlex Elder 4966589d30e0SAlex Elder /* 496730ba1f02SIlya Dryomov * Return pool id (>= 0) or a negative error code. 496830ba1f02SIlya Dryomov */ 496930ba1f02SIlya Dryomov static int rbd_add_get_pool_id(struct rbd_client *rbdc, const char *pool_name) 497030ba1f02SIlya Dryomov { 4971a319bf56SIlya Dryomov struct ceph_options *opts = rbdc->client->options; 497230ba1f02SIlya Dryomov u64 newest_epoch; 497330ba1f02SIlya Dryomov int tries = 0; 497430ba1f02SIlya Dryomov int ret; 497530ba1f02SIlya Dryomov 497630ba1f02SIlya Dryomov again: 497730ba1f02SIlya Dryomov ret = ceph_pg_poolid_by_name(rbdc->client->osdc.osdmap, pool_name); 497830ba1f02SIlya Dryomov if (ret == -ENOENT && tries++ < 1) { 497930ba1f02SIlya Dryomov ret = ceph_monc_do_get_version(&rbdc->client->monc, "osdmap", 498030ba1f02SIlya Dryomov &newest_epoch); 498130ba1f02SIlya Dryomov if (ret < 0) 498230ba1f02SIlya Dryomov return ret; 498330ba1f02SIlya Dryomov 498430ba1f02SIlya Dryomov if (rbdc->client->osdc.osdmap->epoch < newest_epoch) { 498530ba1f02SIlya Dryomov ceph_monc_request_next_osdmap(&rbdc->client->monc); 498630ba1f02SIlya Dryomov (void) ceph_monc_wait_osdmap(&rbdc->client->monc, 4987a319bf56SIlya Dryomov newest_epoch, 4988a319bf56SIlya Dryomov opts->mount_timeout); 498930ba1f02SIlya Dryomov goto again; 499030ba1f02SIlya Dryomov } else { 499130ba1f02SIlya Dryomov /* the 
osdmap we have is new enough */ 499230ba1f02SIlya Dryomov return -ENOENT; 499330ba1f02SIlya Dryomov } 499430ba1f02SIlya Dryomov } 499530ba1f02SIlya Dryomov 499630ba1f02SIlya Dryomov return ret; 499730ba1f02SIlya Dryomov } 499830ba1f02SIlya Dryomov 499930ba1f02SIlya Dryomov /* 5000589d30e0SAlex Elder * An rbd format 2 image has a unique identifier, distinct from the 5001589d30e0SAlex Elder * name given to it by the user. Internally, that identifier is 5002589d30e0SAlex Elder * what's used to specify the names of objects related to the image. 5003589d30e0SAlex Elder * 5004589d30e0SAlex Elder * A special "rbd id" object is used to map an rbd image name to its 5005589d30e0SAlex Elder * id. If that object doesn't exist, then there is no v2 rbd image 5006589d30e0SAlex Elder * with the supplied name. 5007589d30e0SAlex Elder * 5008589d30e0SAlex Elder * This function will record the given rbd_dev's image_id field if 5009589d30e0SAlex Elder * it can be determined, and in that case will return 0. If any 5010589d30e0SAlex Elder * errors occur a negative errno will be returned and the rbd_dev's 5011589d30e0SAlex Elder * image_id field will be unchanged (and should be NULL). 5012589d30e0SAlex Elder */ 5013589d30e0SAlex Elder static int rbd_dev_image_id(struct rbd_device *rbd_dev) 5014589d30e0SAlex Elder { 5015589d30e0SAlex Elder int ret; 5016589d30e0SAlex Elder size_t size; 5017589d30e0SAlex Elder char *object_name; 5018589d30e0SAlex Elder void *response; 5019c0fba368SAlex Elder char *image_id; 50202f82ee54SAlex Elder 5021589d30e0SAlex Elder /* 50222c0d0a10SAlex Elder * When probing a parent image, the image id is already 50232c0d0a10SAlex Elder * known (and the image name likely is not). There's no 5024c0fba368SAlex Elder * need to fetch the image id again in this case. We 5025c0fba368SAlex Elder * do still need to set the image format though. 
50262c0d0a10SAlex Elder */ 5027c0fba368SAlex Elder if (rbd_dev->spec->image_id) { 5028c0fba368SAlex Elder rbd_dev->image_format = *rbd_dev->spec->image_id ? 2 : 1; 5029c0fba368SAlex Elder 50302c0d0a10SAlex Elder return 0; 5031c0fba368SAlex Elder } 50322c0d0a10SAlex Elder 50332c0d0a10SAlex Elder /* 5034589d30e0SAlex Elder * First, see if the format 2 image id file exists, and if 5035589d30e0SAlex Elder * so, get the image's persistent id from it. 5036589d30e0SAlex Elder */ 503769e7a02fSAlex Elder size = sizeof (RBD_ID_PREFIX) + strlen(rbd_dev->spec->image_name); 5038589d30e0SAlex Elder object_name = kmalloc(size, GFP_NOIO); 5039589d30e0SAlex Elder if (!object_name) 5040589d30e0SAlex Elder return -ENOMEM; 50410d7dbfceSAlex Elder sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->spec->image_name); 5042589d30e0SAlex Elder dout("rbd id object name is %s\n", object_name); 5043589d30e0SAlex Elder 5044589d30e0SAlex Elder /* Response will be an encoded string, which includes a length */ 5045589d30e0SAlex Elder 5046589d30e0SAlex Elder size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX; 5047589d30e0SAlex Elder response = kzalloc(size, GFP_NOIO); 5048589d30e0SAlex Elder if (!response) { 5049589d30e0SAlex Elder ret = -ENOMEM; 5050589d30e0SAlex Elder goto out; 5051589d30e0SAlex Elder } 5052589d30e0SAlex Elder 5053c0fba368SAlex Elder /* If it doesn't exist we'll assume it's a format 1 image */ 5054c0fba368SAlex Elder 505536be9a76SAlex Elder ret = rbd_obj_method_sync(rbd_dev, object_name, 50564157976bSAlex Elder "rbd", "get_id", NULL, 0, 5057e2a58ee5SAlex Elder response, RBD_IMAGE_ID_LEN_MAX); 505836be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 5059c0fba368SAlex Elder if (ret == -ENOENT) { 5060c0fba368SAlex Elder image_id = kstrdup("", GFP_KERNEL); 5061c0fba368SAlex Elder ret = image_id ? 
0 : -ENOMEM; 5062c0fba368SAlex Elder if (!ret) 5063c0fba368SAlex Elder rbd_dev->image_format = 1; 50647dd440c9SIlya Dryomov } else if (ret >= 0) { 5065c0fba368SAlex Elder void *p = response; 5066589d30e0SAlex Elder 5067c0fba368SAlex Elder image_id = ceph_extract_encoded_string(&p, p + ret, 5068979ed480SAlex Elder NULL, GFP_NOIO); 5069461f758aSDuan Jiong ret = PTR_ERR_OR_ZERO(image_id); 5070c0fba368SAlex Elder if (!ret) 5071c0fba368SAlex Elder rbd_dev->image_format = 2; 5072c0fba368SAlex Elder } 5073c0fba368SAlex Elder 5074c0fba368SAlex Elder if (!ret) { 5075c0fba368SAlex Elder rbd_dev->spec->image_id = image_id; 5076c0fba368SAlex Elder dout("image_id is %s\n", image_id); 5077589d30e0SAlex Elder } 5078589d30e0SAlex Elder out: 5079589d30e0SAlex Elder kfree(response); 5080589d30e0SAlex Elder kfree(object_name); 5081589d30e0SAlex Elder 5082589d30e0SAlex Elder return ret; 5083589d30e0SAlex Elder } 5084589d30e0SAlex Elder 50853abef3b3SAlex Elder /* 50863abef3b3SAlex Elder * Undo whatever state changes are made by v1 or v2 header info 50873abef3b3SAlex Elder * call. 
50883abef3b3SAlex Elder */ 50896fd48b3bSAlex Elder static void rbd_dev_unprobe(struct rbd_device *rbd_dev) 50906fd48b3bSAlex Elder { 50916fd48b3bSAlex Elder struct rbd_image_header *header; 50926fd48b3bSAlex Elder 5093a2acd00eSAlex Elder rbd_dev_parent_put(rbd_dev); 50946fd48b3bSAlex Elder 50956fd48b3bSAlex Elder /* Free dynamic fields from the header, then zero it out */ 50966fd48b3bSAlex Elder 50976fd48b3bSAlex Elder header = &rbd_dev->header; 5098812164f8SAlex Elder ceph_put_snap_context(header->snapc); 50996fd48b3bSAlex Elder kfree(header->snap_sizes); 51006fd48b3bSAlex Elder kfree(header->snap_names); 51016fd48b3bSAlex Elder kfree(header->object_prefix); 51026fd48b3bSAlex Elder memset(header, 0, sizeof (*header)); 51036fd48b3bSAlex Elder } 51046fd48b3bSAlex Elder 51052df3fac7SAlex Elder static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev) 5106a30b71b9SAlex Elder { 5107a30b71b9SAlex Elder int ret; 5108a30b71b9SAlex Elder 51091e130199SAlex Elder ret = rbd_dev_v2_object_prefix(rbd_dev); 511057385b51SAlex Elder if (ret) 51111e130199SAlex Elder goto out_err; 5112b1b5402aSAlex Elder 51132df3fac7SAlex Elder /* 51142df3fac7SAlex Elder * Get the and check features for the image. Currently the 51152df3fac7SAlex Elder * features are assumed to never change. 
51162df3fac7SAlex Elder */ 5117b1b5402aSAlex Elder ret = rbd_dev_v2_features(rbd_dev); 511857385b51SAlex Elder if (ret) 5119b1b5402aSAlex Elder goto out_err; 512035d489f9SAlex Elder 5121cc070d59SAlex Elder /* If the image supports fancy striping, get its parameters */ 5122cc070d59SAlex Elder 5123cc070d59SAlex Elder if (rbd_dev->header.features & RBD_FEATURE_STRIPINGV2) { 5124cc070d59SAlex Elder ret = rbd_dev_v2_striping_info(rbd_dev); 5125cc070d59SAlex Elder if (ret < 0) 5126cc070d59SAlex Elder goto out_err; 5127cc070d59SAlex Elder } 51282df3fac7SAlex Elder /* No support for crypto and compression type format 2 images */ 5129a30b71b9SAlex Elder 513035152979SAlex Elder return 0; 51319d475de5SAlex Elder out_err: 5132642a2537SAlex Elder rbd_dev->header.features = 0; 51331e130199SAlex Elder kfree(rbd_dev->header.object_prefix); 51341e130199SAlex Elder rbd_dev->header.object_prefix = NULL; 51359d475de5SAlex Elder 51369d475de5SAlex Elder return ret; 5137a30b71b9SAlex Elder } 5138a30b71b9SAlex Elder 51396d69bb53SIlya Dryomov /* 51406d69bb53SIlya Dryomov * @depth is rbd_dev_image_probe() -> rbd_dev_probe_parent() -> 51416d69bb53SIlya Dryomov * rbd_dev_image_probe() recursion depth, which means it's also the 51426d69bb53SIlya Dryomov * length of the already discovered part of the parent chain. 
51436d69bb53SIlya Dryomov */ 51446d69bb53SIlya Dryomov static int rbd_dev_probe_parent(struct rbd_device *rbd_dev, int depth) 514583a06263SAlex Elder { 51462f82ee54SAlex Elder struct rbd_device *parent = NULL; 5147124afba2SAlex Elder int ret; 5148124afba2SAlex Elder 5149124afba2SAlex Elder if (!rbd_dev->parent_spec) 5150124afba2SAlex Elder return 0; 5151124afba2SAlex Elder 51526d69bb53SIlya Dryomov if (++depth > RBD_MAX_PARENT_CHAIN_LEN) { 51536d69bb53SIlya Dryomov pr_info("parent chain is too long (%d)\n", depth); 51546d69bb53SIlya Dryomov ret = -EINVAL; 51556d69bb53SIlya Dryomov goto out_err; 51566d69bb53SIlya Dryomov } 51576d69bb53SIlya Dryomov 51581f2c6651SIlya Dryomov parent = rbd_dev_create(rbd_dev->rbd_client, rbd_dev->parent_spec, 51591f2c6651SIlya Dryomov NULL); 51601f2c6651SIlya Dryomov if (!parent) { 5161124afba2SAlex Elder ret = -ENOMEM; 5162124afba2SAlex Elder goto out_err; 51631f2c6651SIlya Dryomov } 51641f2c6651SIlya Dryomov 51651f2c6651SIlya Dryomov /* 51661f2c6651SIlya Dryomov * Images related by parent/child relationships always share 51671f2c6651SIlya Dryomov * rbd_client and spec/parent_spec, so bump their refcounts. 
51681f2c6651SIlya Dryomov */ 51691f2c6651SIlya Dryomov __rbd_get_client(rbd_dev->rbd_client); 51701f2c6651SIlya Dryomov rbd_spec_get(rbd_dev->parent_spec); 5171124afba2SAlex Elder 51726d69bb53SIlya Dryomov ret = rbd_dev_image_probe(parent, depth); 5173124afba2SAlex Elder if (ret < 0) 5174124afba2SAlex Elder goto out_err; 51751f2c6651SIlya Dryomov 5176124afba2SAlex Elder rbd_dev->parent = parent; 5177a2acd00eSAlex Elder atomic_set(&rbd_dev->parent_ref, 1); 5178124afba2SAlex Elder return 0; 5179124afba2SAlex Elder 51801f2c6651SIlya Dryomov out_err: 51811f2c6651SIlya Dryomov rbd_dev_unparent(rbd_dev); 51821f2c6651SIlya Dryomov if (parent) 51831f2c6651SIlya Dryomov rbd_dev_destroy(parent); 5184124afba2SAlex Elder return ret; 5185124afba2SAlex Elder } 5186124afba2SAlex Elder 5187200a6a8bSAlex Elder static int rbd_dev_device_setup(struct rbd_device *rbd_dev) 5188124afba2SAlex Elder { 518983a06263SAlex Elder int ret; 519083a06263SAlex Elder 5191f8a22fc2SIlya Dryomov /* Get an id and fill in device name. */ 519283a06263SAlex Elder 5193f8a22fc2SIlya Dryomov ret = rbd_dev_id_get(rbd_dev); 5194f8a22fc2SIlya Dryomov if (ret) 5195f8a22fc2SIlya Dryomov return ret; 5196f8a22fc2SIlya Dryomov 519783a06263SAlex Elder BUILD_BUG_ON(DEV_NAME_LEN 519883a06263SAlex Elder < sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH); 519983a06263SAlex Elder sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id); 520083a06263SAlex Elder 52019b60e70bSIlya Dryomov /* Record our major and minor device numbers. 
*/ 520283a06263SAlex Elder 52039b60e70bSIlya Dryomov if (!single_major) { 520483a06263SAlex Elder ret = register_blkdev(0, rbd_dev->name); 520583a06263SAlex Elder if (ret < 0) 520683a06263SAlex Elder goto err_out_id; 52079b60e70bSIlya Dryomov 520883a06263SAlex Elder rbd_dev->major = ret; 5209dd82fff1SIlya Dryomov rbd_dev->minor = 0; 52109b60e70bSIlya Dryomov } else { 52119b60e70bSIlya Dryomov rbd_dev->major = rbd_major; 52129b60e70bSIlya Dryomov rbd_dev->minor = rbd_dev_id_to_minor(rbd_dev->dev_id); 52139b60e70bSIlya Dryomov } 521483a06263SAlex Elder 521583a06263SAlex Elder /* Set up the blkdev mapping. */ 521683a06263SAlex Elder 521783a06263SAlex Elder ret = rbd_init_disk(rbd_dev); 521883a06263SAlex Elder if (ret) 521983a06263SAlex Elder goto err_out_blkdev; 522083a06263SAlex Elder 5221f35a4deeSAlex Elder ret = rbd_dev_mapping_set(rbd_dev); 522283a06263SAlex Elder if (ret) 522383a06263SAlex Elder goto err_out_disk; 5224bc1ecc65SIlya Dryomov 5225f35a4deeSAlex Elder set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE); 522622001f61SJosh Durgin set_disk_ro(rbd_dev->disk, rbd_dev->mapping.read_only); 5227f35a4deeSAlex Elder 5228f35a4deeSAlex Elder ret = rbd_bus_add_dev(rbd_dev); 5229f35a4deeSAlex Elder if (ret) 5230f5ee37bdSIlya Dryomov goto err_out_mapping; 523183a06263SAlex Elder 523283a06263SAlex Elder /* Everything's ready. Announce the disk to the world. 
*/ 523383a06263SAlex Elder 5234129b79d4SAlex Elder set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags); 523583a06263SAlex Elder add_disk(rbd_dev->disk); 523683a06263SAlex Elder 523783a06263SAlex Elder pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name, 523883a06263SAlex Elder (unsigned long long) rbd_dev->mapping.size); 523983a06263SAlex Elder 524083a06263SAlex Elder return ret; 52412f82ee54SAlex Elder 5242f35a4deeSAlex Elder err_out_mapping: 5243f35a4deeSAlex Elder rbd_dev_mapping_clear(rbd_dev); 524483a06263SAlex Elder err_out_disk: 524583a06263SAlex Elder rbd_free_disk(rbd_dev); 524683a06263SAlex Elder err_out_blkdev: 52479b60e70bSIlya Dryomov if (!single_major) 524883a06263SAlex Elder unregister_blkdev(rbd_dev->major, rbd_dev->name); 524983a06263SAlex Elder err_out_id: 525083a06263SAlex Elder rbd_dev_id_put(rbd_dev); 5251d1cf5788SAlex Elder rbd_dev_mapping_clear(rbd_dev); 525283a06263SAlex Elder 525383a06263SAlex Elder return ret; 525483a06263SAlex Elder } 525583a06263SAlex Elder 5256332bb12dSAlex Elder static int rbd_dev_header_name(struct rbd_device *rbd_dev) 5257332bb12dSAlex Elder { 5258332bb12dSAlex Elder struct rbd_spec *spec = rbd_dev->spec; 5259332bb12dSAlex Elder size_t size; 5260332bb12dSAlex Elder 5261332bb12dSAlex Elder /* Record the header object name for this rbd image. 
*/ 5262332bb12dSAlex Elder 5263332bb12dSAlex Elder rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); 5264332bb12dSAlex Elder 5265332bb12dSAlex Elder if (rbd_dev->image_format == 1) 5266332bb12dSAlex Elder size = strlen(spec->image_name) + sizeof (RBD_SUFFIX); 5267332bb12dSAlex Elder else 5268332bb12dSAlex Elder size = sizeof (RBD_HEADER_PREFIX) + strlen(spec->image_id); 5269332bb12dSAlex Elder 5270332bb12dSAlex Elder rbd_dev->header_name = kmalloc(size, GFP_KERNEL); 5271332bb12dSAlex Elder if (!rbd_dev->header_name) 5272332bb12dSAlex Elder return -ENOMEM; 5273332bb12dSAlex Elder 5274332bb12dSAlex Elder if (rbd_dev->image_format == 1) 5275332bb12dSAlex Elder sprintf(rbd_dev->header_name, "%s%s", 5276332bb12dSAlex Elder spec->image_name, RBD_SUFFIX); 5277332bb12dSAlex Elder else 5278332bb12dSAlex Elder sprintf(rbd_dev->header_name, "%s%s", 5279332bb12dSAlex Elder RBD_HEADER_PREFIX, spec->image_id); 5280332bb12dSAlex Elder return 0; 5281332bb12dSAlex Elder } 5282332bb12dSAlex Elder 5283200a6a8bSAlex Elder static void rbd_dev_image_release(struct rbd_device *rbd_dev) 5284200a6a8bSAlex Elder { 52856fd48b3bSAlex Elder rbd_dev_unprobe(rbd_dev); 5286200a6a8bSAlex Elder kfree(rbd_dev->header_name); 52876fd48b3bSAlex Elder rbd_dev->header_name = NULL; 52886fd48b3bSAlex Elder rbd_dev->image_format = 0; 52896fd48b3bSAlex Elder kfree(rbd_dev->spec->image_id); 52906fd48b3bSAlex Elder rbd_dev->spec->image_id = NULL; 52916fd48b3bSAlex Elder 5292200a6a8bSAlex Elder rbd_dev_destroy(rbd_dev); 5293200a6a8bSAlex Elder } 5294200a6a8bSAlex Elder 5295a30b71b9SAlex Elder /* 5296a30b71b9SAlex Elder * Probe for the existence of the header object for the given rbd 52971f3ef788SAlex Elder * device. If this image is the one being mapped (i.e., not a 52981f3ef788SAlex Elder * parent), initiate a watch on its header object before using that 52991f3ef788SAlex Elder * object to get detailed information about the rbd image. 
5300a30b71b9SAlex Elder */ 53016d69bb53SIlya Dryomov static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth) 5302a30b71b9SAlex Elder { 5303a30b71b9SAlex Elder int ret; 5304a30b71b9SAlex Elder 5305a30b71b9SAlex Elder /* 53063abef3b3SAlex Elder * Get the id from the image id object. Unless there's an 53073abef3b3SAlex Elder * error, rbd_dev->spec->image_id will be filled in with 53083abef3b3SAlex Elder * a dynamically-allocated string, and rbd_dev->image_format 53093abef3b3SAlex Elder * will be set to either 1 or 2. 5310a30b71b9SAlex Elder */ 5311a30b71b9SAlex Elder ret = rbd_dev_image_id(rbd_dev); 5312a30b71b9SAlex Elder if (ret) 5313c0fba368SAlex Elder return ret; 5314c0fba368SAlex Elder 5315332bb12dSAlex Elder ret = rbd_dev_header_name(rbd_dev); 5316332bb12dSAlex Elder if (ret) 5317332bb12dSAlex Elder goto err_out_format; 5318332bb12dSAlex Elder 53196d69bb53SIlya Dryomov if (!depth) { 5320fca27065SIlya Dryomov ret = rbd_dev_header_watch_sync(rbd_dev); 53211fe48023SIlya Dryomov if (ret) { 53221fe48023SIlya Dryomov if (ret == -ENOENT) 53231fe48023SIlya Dryomov pr_info("image %s/%s does not exist\n", 53241fe48023SIlya Dryomov rbd_dev->spec->pool_name, 53251fe48023SIlya Dryomov rbd_dev->spec->image_name); 5326b644de2bSAlex Elder goto out_header_name; 53271f3ef788SAlex Elder } 53281fe48023SIlya Dryomov } 5329b644de2bSAlex Elder 5330a720ae09SIlya Dryomov ret = rbd_dev_header_info(rbd_dev); 53315655c4d9SAlex Elder if (ret) 5332b644de2bSAlex Elder goto err_out_watch; 5333a30b71b9SAlex Elder 533404077599SIlya Dryomov /* 533504077599SIlya Dryomov * If this image is the one being mapped, we have pool name and 533604077599SIlya Dryomov * id, image name and id, and snap name - need to fill snap id. 533704077599SIlya Dryomov * Otherwise this is a parent image, identified by pool, image 533804077599SIlya Dryomov * and snap ids - need to fill in names for those ids. 
533904077599SIlya Dryomov */ 53406d69bb53SIlya Dryomov if (!depth) 534104077599SIlya Dryomov ret = rbd_spec_fill_snap_id(rbd_dev); 534204077599SIlya Dryomov else 534304077599SIlya Dryomov ret = rbd_spec_fill_names(rbd_dev); 53441fe48023SIlya Dryomov if (ret) { 53451fe48023SIlya Dryomov if (ret == -ENOENT) 53461fe48023SIlya Dryomov pr_info("snap %s/%s@%s does not exist\n", 53471fe48023SIlya Dryomov rbd_dev->spec->pool_name, 53481fe48023SIlya Dryomov rbd_dev->spec->image_name, 53491fe48023SIlya Dryomov rbd_dev->spec->snap_name); 535033dca39fSAlex Elder goto err_out_probe; 53511fe48023SIlya Dryomov } 53529bb81c9bSAlex Elder 5353e8f59b59SIlya Dryomov if (rbd_dev->header.features & RBD_FEATURE_LAYERING) { 5354e8f59b59SIlya Dryomov ret = rbd_dev_v2_parent_info(rbd_dev); 5355e8f59b59SIlya Dryomov if (ret) 5356e8f59b59SIlya Dryomov goto err_out_probe; 5357e8f59b59SIlya Dryomov 5358e8f59b59SIlya Dryomov /* 5359e8f59b59SIlya Dryomov * Need to warn users if this image is the one being 5360e8f59b59SIlya Dryomov * mapped and has a parent. 
5361e8f59b59SIlya Dryomov */ 53626d69bb53SIlya Dryomov if (!depth && rbd_dev->parent_spec) 5363e8f59b59SIlya Dryomov rbd_warn(rbd_dev, 5364e8f59b59SIlya Dryomov "WARNING: kernel layering is EXPERIMENTAL!"); 5365e8f59b59SIlya Dryomov } 5366e8f59b59SIlya Dryomov 53676d69bb53SIlya Dryomov ret = rbd_dev_probe_parent(rbd_dev, depth); 536830d60ba2SAlex Elder if (ret) 536930d60ba2SAlex Elder goto err_out_probe; 537083a06263SAlex Elder 537130d60ba2SAlex Elder dout("discovered format %u image, header name is %s\n", 537230d60ba2SAlex Elder rbd_dev->image_format, rbd_dev->header_name); 537330d60ba2SAlex Elder return 0; 5374e8f59b59SIlya Dryomov 53756fd48b3bSAlex Elder err_out_probe: 53766fd48b3bSAlex Elder rbd_dev_unprobe(rbd_dev); 5377b644de2bSAlex Elder err_out_watch: 53786d69bb53SIlya Dryomov if (!depth) 5379fca27065SIlya Dryomov rbd_dev_header_unwatch_sync(rbd_dev); 5380332bb12dSAlex Elder out_header_name: 5381332bb12dSAlex Elder kfree(rbd_dev->header_name); 5382332bb12dSAlex Elder rbd_dev->header_name = NULL; 5383332bb12dSAlex Elder err_out_format: 5384332bb12dSAlex Elder rbd_dev->image_format = 0; 53855655c4d9SAlex Elder kfree(rbd_dev->spec->image_id); 53865655c4d9SAlex Elder rbd_dev->spec->image_id = NULL; 53875655c4d9SAlex Elder return ret; 538883a06263SAlex Elder } 538983a06263SAlex Elder 53909b60e70bSIlya Dryomov static ssize_t do_rbd_add(struct bus_type *bus, 539159c2be1eSYehuda Sadeh const char *buf, 539259c2be1eSYehuda Sadeh size_t count) 5393602adf40SYehuda Sadeh { 5394cb8627c7SAlex Elder struct rbd_device *rbd_dev = NULL; 5395dc79b113SAlex Elder struct ceph_options *ceph_opts = NULL; 53964e9afebaSAlex Elder struct rbd_options *rbd_opts = NULL; 5397859c31dfSAlex Elder struct rbd_spec *spec = NULL; 53989d3997fdSAlex Elder struct rbd_client *rbdc; 539951344a38SAlex Elder bool read_only; 540027cc2594SAlex Elder int rc = -ENOMEM; 5401602adf40SYehuda Sadeh 5402602adf40SYehuda Sadeh if (!try_module_get(THIS_MODULE)) 5403602adf40SYehuda Sadeh return -ENODEV; 
5404602adf40SYehuda Sadeh 5405a725f65eSAlex Elder /* parse add command */ 5406859c31dfSAlex Elder rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec); 5407dc79b113SAlex Elder if (rc < 0) 5408bd4ba655SAlex Elder goto err_out_module; 5409a725f65eSAlex Elder 54109d3997fdSAlex Elder rbdc = rbd_get_client(ceph_opts); 54119d3997fdSAlex Elder if (IS_ERR(rbdc)) { 54129d3997fdSAlex Elder rc = PTR_ERR(rbdc); 54130ddebc0cSAlex Elder goto err_out_args; 54149d3997fdSAlex Elder } 5415602adf40SYehuda Sadeh 5416602adf40SYehuda Sadeh /* pick the pool */ 541730ba1f02SIlya Dryomov rc = rbd_add_get_pool_id(rbdc, spec->pool_name); 54181fe48023SIlya Dryomov if (rc < 0) { 54191fe48023SIlya Dryomov if (rc == -ENOENT) 54201fe48023SIlya Dryomov pr_info("pool %s does not exist\n", spec->pool_name); 5421602adf40SYehuda Sadeh goto err_out_client; 54221fe48023SIlya Dryomov } 5423859c31dfSAlex Elder spec->pool_id = (u64)rc; 5424859c31dfSAlex Elder 54250903e875SAlex Elder /* The ceph file layout needs to fit pool id in 32 bits */ 54260903e875SAlex Elder 5427c0cd10dbSAlex Elder if (spec->pool_id > (u64)U32_MAX) { 54289584d508SIlya Dryomov rbd_warn(NULL, "pool id too large (%llu > %u)", 5429c0cd10dbSAlex Elder (unsigned long long)spec->pool_id, U32_MAX); 54300903e875SAlex Elder rc = -EIO; 54310903e875SAlex Elder goto err_out_client; 54320903e875SAlex Elder } 54330903e875SAlex Elder 5434d147543dSIlya Dryomov rbd_dev = rbd_dev_create(rbdc, spec, rbd_opts); 5435bd4ba655SAlex Elder if (!rbd_dev) 5436bd4ba655SAlex Elder goto err_out_client; 5437c53d5893SAlex Elder rbdc = NULL; /* rbd_dev now owns this */ 5438c53d5893SAlex Elder spec = NULL; /* rbd_dev now owns this */ 5439d147543dSIlya Dryomov rbd_opts = NULL; /* rbd_dev now owns this */ 5440602adf40SYehuda Sadeh 54416d69bb53SIlya Dryomov rc = rbd_dev_image_probe(rbd_dev, 0); 5442a30b71b9SAlex Elder if (rc < 0) 5443c53d5893SAlex Elder goto err_out_rbd_dev; 544405fd6f6fSAlex Elder 54457ce4eef7SAlex Elder /* If we are mapping a snapshot it must be 
marked read-only */ 54467ce4eef7SAlex Elder 5447d147543dSIlya Dryomov read_only = rbd_dev->opts->read_only; 54487ce4eef7SAlex Elder if (rbd_dev->spec->snap_id != CEPH_NOSNAP) 54497ce4eef7SAlex Elder read_only = true; 54507ce4eef7SAlex Elder rbd_dev->mapping.read_only = read_only; 54517ce4eef7SAlex Elder 5452b536f69aSAlex Elder rc = rbd_dev_device_setup(rbd_dev); 54533abef3b3SAlex Elder if (rc) { 5454e37180c0SIlya Dryomov /* 5455e37180c0SIlya Dryomov * rbd_dev_header_unwatch_sync() can't be moved into 5456e37180c0SIlya Dryomov * rbd_dev_image_release() without refactoring, see 5457e37180c0SIlya Dryomov * commit 1f3ef78861ac. 5458e37180c0SIlya Dryomov */ 5459e37180c0SIlya Dryomov rbd_dev_header_unwatch_sync(rbd_dev); 54603abef3b3SAlex Elder rbd_dev_image_release(rbd_dev); 54613abef3b3SAlex Elder goto err_out_module; 54623abef3b3SAlex Elder } 54633abef3b3SAlex Elder 5464602adf40SYehuda Sadeh return count; 5465b536f69aSAlex Elder 5466c53d5893SAlex Elder err_out_rbd_dev: 5467c53d5893SAlex Elder rbd_dev_destroy(rbd_dev); 5468bd4ba655SAlex Elder err_out_client: 54699d3997fdSAlex Elder rbd_put_client(rbdc); 54700ddebc0cSAlex Elder err_out_args: 5471859c31dfSAlex Elder rbd_spec_put(spec); 5472d147543dSIlya Dryomov kfree(rbd_opts); 5473bd4ba655SAlex Elder err_out_module: 5474bd4ba655SAlex Elder module_put(THIS_MODULE); 547527cc2594SAlex Elder 5476602adf40SYehuda Sadeh dout("Error adding device %s\n", buf); 547727cc2594SAlex Elder 547827cc2594SAlex Elder return (ssize_t)rc; 5479602adf40SYehuda Sadeh } 5480602adf40SYehuda Sadeh 54819b60e70bSIlya Dryomov static ssize_t rbd_add(struct bus_type *bus, 54829b60e70bSIlya Dryomov const char *buf, 54839b60e70bSIlya Dryomov size_t count) 54849b60e70bSIlya Dryomov { 54859b60e70bSIlya Dryomov if (single_major) 54869b60e70bSIlya Dryomov return -EINVAL; 54879b60e70bSIlya Dryomov 54889b60e70bSIlya Dryomov return do_rbd_add(bus, buf, count); 54899b60e70bSIlya Dryomov } 54909b60e70bSIlya Dryomov 54919b60e70bSIlya Dryomov static ssize_t 
rbd_add_single_major(struct bus_type *bus, 54929b60e70bSIlya Dryomov const char *buf, 54939b60e70bSIlya Dryomov size_t count) 54949b60e70bSIlya Dryomov { 54959b60e70bSIlya Dryomov return do_rbd_add(bus, buf, count); 54969b60e70bSIlya Dryomov } 54979b60e70bSIlya Dryomov 5498200a6a8bSAlex Elder static void rbd_dev_device_release(struct device *dev) 5499602adf40SYehuda Sadeh { 5500593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 5501602adf40SYehuda Sadeh 5502602adf40SYehuda Sadeh rbd_free_disk(rbd_dev); 5503200a6a8bSAlex Elder clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags); 55046d80b130SAlex Elder rbd_dev_mapping_clear(rbd_dev); 55059b60e70bSIlya Dryomov if (!single_major) 5506602adf40SYehuda Sadeh unregister_blkdev(rbd_dev->major, rbd_dev->name); 5507e2839308SAlex Elder rbd_dev_id_put(rbd_dev); 5508d1cf5788SAlex Elder rbd_dev_mapping_clear(rbd_dev); 5509602adf40SYehuda Sadeh } 5510602adf40SYehuda Sadeh 551105a46afdSAlex Elder static void rbd_dev_remove_parent(struct rbd_device *rbd_dev) 551205a46afdSAlex Elder { 5513ad945fc1SAlex Elder while (rbd_dev->parent) { 551405a46afdSAlex Elder struct rbd_device *first = rbd_dev; 551505a46afdSAlex Elder struct rbd_device *second = first->parent; 551605a46afdSAlex Elder struct rbd_device *third; 551705a46afdSAlex Elder 551805a46afdSAlex Elder /* 551905a46afdSAlex Elder * Follow to the parent with no grandparent and 552005a46afdSAlex Elder * remove it. 
552105a46afdSAlex Elder */ 552205a46afdSAlex Elder while (second && (third = second->parent)) { 552305a46afdSAlex Elder first = second; 552405a46afdSAlex Elder second = third; 552505a46afdSAlex Elder } 5526ad945fc1SAlex Elder rbd_assert(second); 55278ad42cd0SAlex Elder rbd_dev_image_release(second); 5528ad945fc1SAlex Elder first->parent = NULL; 5529ad945fc1SAlex Elder first->parent_overlap = 0; 5530ad945fc1SAlex Elder 5531ad945fc1SAlex Elder rbd_assert(first->parent_spec); 553205a46afdSAlex Elder rbd_spec_put(first->parent_spec); 553305a46afdSAlex Elder first->parent_spec = NULL; 553405a46afdSAlex Elder } 553505a46afdSAlex Elder } 553605a46afdSAlex Elder 55379b60e70bSIlya Dryomov static ssize_t do_rbd_remove(struct bus_type *bus, 5538602adf40SYehuda Sadeh const char *buf, 5539602adf40SYehuda Sadeh size_t count) 5540602adf40SYehuda Sadeh { 5541602adf40SYehuda Sadeh struct rbd_device *rbd_dev = NULL; 5542751cc0e3SAlex Elder struct list_head *tmp; 5543751cc0e3SAlex Elder int dev_id; 5544602adf40SYehuda Sadeh unsigned long ul; 554582a442d2SAlex Elder bool already = false; 55460d8189e1SAlex Elder int ret; 5547602adf40SYehuda Sadeh 5548bb8e0e84SJingoo Han ret = kstrtoul(buf, 10, &ul); 55490d8189e1SAlex Elder if (ret) 55500d8189e1SAlex Elder return ret; 5551602adf40SYehuda Sadeh 5552602adf40SYehuda Sadeh /* convert to int; abort if we lost anything in the conversion */ 5553751cc0e3SAlex Elder dev_id = (int)ul; 5554751cc0e3SAlex Elder if (dev_id != ul) 5555602adf40SYehuda Sadeh return -EINVAL; 5556602adf40SYehuda Sadeh 5557602adf40SYehuda Sadeh ret = -ENOENT; 5558751cc0e3SAlex Elder spin_lock(&rbd_dev_list_lock); 5559751cc0e3SAlex Elder list_for_each(tmp, &rbd_dev_list) { 5560751cc0e3SAlex Elder rbd_dev = list_entry(tmp, struct rbd_device, node); 5561751cc0e3SAlex Elder if (rbd_dev->dev_id == dev_id) { 5562751cc0e3SAlex Elder ret = 0; 5563751cc0e3SAlex Elder break; 5564602adf40SYehuda Sadeh } 5565751cc0e3SAlex Elder } 5566751cc0e3SAlex Elder if (!ret) { 5567a14ea269SAlex 
Elder spin_lock_irq(&rbd_dev->lock); 5568b82d167bSAlex Elder if (rbd_dev->open_count) 556942382b70SAlex Elder ret = -EBUSY; 5570b82d167bSAlex Elder else 557182a442d2SAlex Elder already = test_and_set_bit(RBD_DEV_FLAG_REMOVING, 557282a442d2SAlex Elder &rbd_dev->flags); 5573a14ea269SAlex Elder spin_unlock_irq(&rbd_dev->lock); 5574751cc0e3SAlex Elder } 5575751cc0e3SAlex Elder spin_unlock(&rbd_dev_list_lock); 557682a442d2SAlex Elder if (ret < 0 || already) 55771ba0f1e7SAlex Elder return ret; 5578751cc0e3SAlex Elder 5579fca27065SIlya Dryomov rbd_dev_header_unwatch_sync(rbd_dev); 55809abc5990SJosh Durgin /* 55819abc5990SJosh Durgin * flush remaining watch callbacks - these must be complete 55829abc5990SJosh Durgin * before the osd_client is shutdown 55839abc5990SJosh Durgin */ 55849abc5990SJosh Durgin dout("%s: flushing notifies", __func__); 55859abc5990SJosh Durgin ceph_osdc_flush_notifies(&rbd_dev->rbd_client->client->osdc); 5586fca27065SIlya Dryomov 55879875201eSJosh Durgin /* 55889875201eSJosh Durgin * Don't free anything from rbd_dev->disk until after all 55899875201eSJosh Durgin * notifies are completely processed. Otherwise 55909875201eSJosh Durgin * rbd_bus_del_dev() will race with rbd_watch_cb(), resulting 55919875201eSJosh Durgin * in a potential use after free of rbd_dev->disk or rbd_dev. 
55929875201eSJosh Durgin */ 55939875201eSJosh Durgin rbd_bus_del_dev(rbd_dev); 55948ad42cd0SAlex Elder rbd_dev_image_release(rbd_dev); 559579ab7558SAlex Elder module_put(THIS_MODULE); 5596aafb230eSAlex Elder 55971ba0f1e7SAlex Elder return count; 5598602adf40SYehuda Sadeh } 5599602adf40SYehuda Sadeh 56009b60e70bSIlya Dryomov static ssize_t rbd_remove(struct bus_type *bus, 56019b60e70bSIlya Dryomov const char *buf, 56029b60e70bSIlya Dryomov size_t count) 56039b60e70bSIlya Dryomov { 56049b60e70bSIlya Dryomov if (single_major) 56059b60e70bSIlya Dryomov return -EINVAL; 56069b60e70bSIlya Dryomov 56079b60e70bSIlya Dryomov return do_rbd_remove(bus, buf, count); 56089b60e70bSIlya Dryomov } 56099b60e70bSIlya Dryomov 56109b60e70bSIlya Dryomov static ssize_t rbd_remove_single_major(struct bus_type *bus, 56119b60e70bSIlya Dryomov const char *buf, 56129b60e70bSIlya Dryomov size_t count) 56139b60e70bSIlya Dryomov { 56149b60e70bSIlya Dryomov return do_rbd_remove(bus, buf, count); 56159b60e70bSIlya Dryomov } 56169b60e70bSIlya Dryomov 5617602adf40SYehuda Sadeh /* 5618602adf40SYehuda Sadeh * create control files in sysfs 5619dfc5606dSYehuda Sadeh * /sys/bus/rbd/... 
5620602adf40SYehuda Sadeh */ 5621602adf40SYehuda Sadeh static int rbd_sysfs_init(void) 5622602adf40SYehuda Sadeh { 5623dfc5606dSYehuda Sadeh int ret; 5624602adf40SYehuda Sadeh 5625fed4c143SAlex Elder ret = device_register(&rbd_root_dev); 5626dfc5606dSYehuda Sadeh if (ret < 0) 5627dfc5606dSYehuda Sadeh return ret; 5628602adf40SYehuda Sadeh 5629fed4c143SAlex Elder ret = bus_register(&rbd_bus_type); 5630fed4c143SAlex Elder if (ret < 0) 5631fed4c143SAlex Elder device_unregister(&rbd_root_dev); 5632602adf40SYehuda Sadeh 5633602adf40SYehuda Sadeh return ret; 5634602adf40SYehuda Sadeh } 5635602adf40SYehuda Sadeh 5636602adf40SYehuda Sadeh static void rbd_sysfs_cleanup(void) 5637602adf40SYehuda Sadeh { 5638dfc5606dSYehuda Sadeh bus_unregister(&rbd_bus_type); 5639fed4c143SAlex Elder device_unregister(&rbd_root_dev); 5640602adf40SYehuda Sadeh } 5641602adf40SYehuda Sadeh 56421c2a9dfeSAlex Elder static int rbd_slab_init(void) 56431c2a9dfeSAlex Elder { 56441c2a9dfeSAlex Elder rbd_assert(!rbd_img_request_cache); 56451c2a9dfeSAlex Elder rbd_img_request_cache = kmem_cache_create("rbd_img_request", 56461c2a9dfeSAlex Elder sizeof (struct rbd_img_request), 56471c2a9dfeSAlex Elder __alignof__(struct rbd_img_request), 56481c2a9dfeSAlex Elder 0, NULL); 5649868311b1SAlex Elder if (!rbd_img_request_cache) 5650868311b1SAlex Elder return -ENOMEM; 5651868311b1SAlex Elder 5652868311b1SAlex Elder rbd_assert(!rbd_obj_request_cache); 5653868311b1SAlex Elder rbd_obj_request_cache = kmem_cache_create("rbd_obj_request", 5654868311b1SAlex Elder sizeof (struct rbd_obj_request), 5655868311b1SAlex Elder __alignof__(struct rbd_obj_request), 5656868311b1SAlex Elder 0, NULL); 565778c2a44aSAlex Elder if (!rbd_obj_request_cache) 565878c2a44aSAlex Elder goto out_err; 565978c2a44aSAlex Elder 566078c2a44aSAlex Elder rbd_assert(!rbd_segment_name_cache); 566178c2a44aSAlex Elder rbd_segment_name_cache = kmem_cache_create("rbd_segment_name", 56622d0ebc5dSIlya Dryomov CEPH_MAX_OID_NAME_LEN + 1, 1, 0, NULL); 
566378c2a44aSAlex Elder if (rbd_segment_name_cache) 56641c2a9dfeSAlex Elder return 0; 566578c2a44aSAlex Elder out_err: 566678c2a44aSAlex Elder if (rbd_obj_request_cache) { 566778c2a44aSAlex Elder kmem_cache_destroy(rbd_obj_request_cache); 566878c2a44aSAlex Elder rbd_obj_request_cache = NULL; 566978c2a44aSAlex Elder } 56701c2a9dfeSAlex Elder 5671868311b1SAlex Elder kmem_cache_destroy(rbd_img_request_cache); 5672868311b1SAlex Elder rbd_img_request_cache = NULL; 5673868311b1SAlex Elder 56741c2a9dfeSAlex Elder return -ENOMEM; 56751c2a9dfeSAlex Elder } 56761c2a9dfeSAlex Elder 56771c2a9dfeSAlex Elder static void rbd_slab_exit(void) 56781c2a9dfeSAlex Elder { 567978c2a44aSAlex Elder rbd_assert(rbd_segment_name_cache); 568078c2a44aSAlex Elder kmem_cache_destroy(rbd_segment_name_cache); 568178c2a44aSAlex Elder rbd_segment_name_cache = NULL; 568278c2a44aSAlex Elder 5683868311b1SAlex Elder rbd_assert(rbd_obj_request_cache); 5684868311b1SAlex Elder kmem_cache_destroy(rbd_obj_request_cache); 5685868311b1SAlex Elder rbd_obj_request_cache = NULL; 5686868311b1SAlex Elder 56871c2a9dfeSAlex Elder rbd_assert(rbd_img_request_cache); 56881c2a9dfeSAlex Elder kmem_cache_destroy(rbd_img_request_cache); 56891c2a9dfeSAlex Elder rbd_img_request_cache = NULL; 56901c2a9dfeSAlex Elder } 56911c2a9dfeSAlex Elder 5692cc344fa1SAlex Elder static int __init rbd_init(void) 5693602adf40SYehuda Sadeh { 5694602adf40SYehuda Sadeh int rc; 5695602adf40SYehuda Sadeh 56961e32d34cSAlex Elder if (!libceph_compatible(NULL)) { 56971e32d34cSAlex Elder rbd_warn(NULL, "libceph incompatibility (quitting)"); 56981e32d34cSAlex Elder return -EINVAL; 56991e32d34cSAlex Elder } 5700e1b4d96dSIlya Dryomov 57011c2a9dfeSAlex Elder rc = rbd_slab_init(); 5702602adf40SYehuda Sadeh if (rc) 5703602adf40SYehuda Sadeh return rc; 5704e1b4d96dSIlya Dryomov 5705f5ee37bdSIlya Dryomov /* 5706f5ee37bdSIlya Dryomov * The number of active work items is limited by the number of 5707f77303bdSIlya Dryomov * rbd devices * queue depth, so leave 
@max_active at default. 5708f5ee37bdSIlya Dryomov */ 5709f5ee37bdSIlya Dryomov rbd_wq = alloc_workqueue(RBD_DRV_NAME, WQ_MEM_RECLAIM, 0); 5710f5ee37bdSIlya Dryomov if (!rbd_wq) { 5711f5ee37bdSIlya Dryomov rc = -ENOMEM; 5712f5ee37bdSIlya Dryomov goto err_out_slab; 5713f5ee37bdSIlya Dryomov } 5714f5ee37bdSIlya Dryomov 57159b60e70bSIlya Dryomov if (single_major) { 57169b60e70bSIlya Dryomov rbd_major = register_blkdev(0, RBD_DRV_NAME); 57179b60e70bSIlya Dryomov if (rbd_major < 0) { 57189b60e70bSIlya Dryomov rc = rbd_major; 5719f5ee37bdSIlya Dryomov goto err_out_wq; 57209b60e70bSIlya Dryomov } 57219b60e70bSIlya Dryomov } 57229b60e70bSIlya Dryomov 57231c2a9dfeSAlex Elder rc = rbd_sysfs_init(); 57241c2a9dfeSAlex Elder if (rc) 57259b60e70bSIlya Dryomov goto err_out_blkdev; 57261c2a9dfeSAlex Elder 57279b60e70bSIlya Dryomov if (single_major) 57289b60e70bSIlya Dryomov pr_info("loaded (major %d)\n", rbd_major); 57299b60e70bSIlya Dryomov else 5730e1b4d96dSIlya Dryomov pr_info("loaded\n"); 57319b60e70bSIlya Dryomov 5732e1b4d96dSIlya Dryomov return 0; 5733e1b4d96dSIlya Dryomov 57349b60e70bSIlya Dryomov err_out_blkdev: 57359b60e70bSIlya Dryomov if (single_major) 57369b60e70bSIlya Dryomov unregister_blkdev(rbd_major, RBD_DRV_NAME); 5737f5ee37bdSIlya Dryomov err_out_wq: 5738f5ee37bdSIlya Dryomov destroy_workqueue(rbd_wq); 5739e1b4d96dSIlya Dryomov err_out_slab: 5740e1b4d96dSIlya Dryomov rbd_slab_exit(); 57411c2a9dfeSAlex Elder return rc; 5742602adf40SYehuda Sadeh } 5743602adf40SYehuda Sadeh 5744cc344fa1SAlex Elder static void __exit rbd_exit(void) 5745602adf40SYehuda Sadeh { 5746ffe312cfSIlya Dryomov ida_destroy(&rbd_dev_id_ida); 5747602adf40SYehuda Sadeh rbd_sysfs_cleanup(); 57489b60e70bSIlya Dryomov if (single_major) 57499b60e70bSIlya Dryomov unregister_blkdev(rbd_major, RBD_DRV_NAME); 5750f5ee37bdSIlya Dryomov destroy_workqueue(rbd_wq); 57511c2a9dfeSAlex Elder rbd_slab_exit(); 5752602adf40SYehuda Sadeh } 5753602adf40SYehuda Sadeh 5754602adf40SYehuda Sadeh module_init(rbd_init); 
/* Register rbd_exit() as the handler run when the module is removed. */
module_exit(rbd_exit);

/* Module metadata: authors, description and license. */
MODULE_AUTHOR("Alex Elder <elder@inktank.com>");
MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
/* following authorship retained from original osdblk.c */
MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");

MODULE_DESCRIPTION("RADOS Block Device (RBD) driver");
MODULE_LICENSE("GPL");