1e2a58ee5SAlex Elder 2602adf40SYehuda Sadeh /* 3602adf40SYehuda Sadeh rbd.c -- Export ceph rados objects as a Linux block device 4602adf40SYehuda Sadeh 5602adf40SYehuda Sadeh 6602adf40SYehuda Sadeh based on drivers/block/osdblk.c: 7602adf40SYehuda Sadeh 8602adf40SYehuda Sadeh Copyright 2009 Red Hat, Inc. 9602adf40SYehuda Sadeh 10602adf40SYehuda Sadeh This program is free software; you can redistribute it and/or modify 11602adf40SYehuda Sadeh it under the terms of the GNU General Public License as published by 12602adf40SYehuda Sadeh the Free Software Foundation. 13602adf40SYehuda Sadeh 14602adf40SYehuda Sadeh This program is distributed in the hope that it will be useful, 15602adf40SYehuda Sadeh but WITHOUT ANY WARRANTY; without even the implied warranty of 16602adf40SYehuda Sadeh MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17602adf40SYehuda Sadeh GNU General Public License for more details. 18602adf40SYehuda Sadeh 19602adf40SYehuda Sadeh You should have received a copy of the GNU General Public License 20602adf40SYehuda Sadeh along with this program; see the file COPYING. If not, write to 21602adf40SYehuda Sadeh the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. 
22602adf40SYehuda Sadeh 23602adf40SYehuda Sadeh 24602adf40SYehuda Sadeh 25dfc5606dSYehuda Sadeh For usage instructions, please refer to: 26602adf40SYehuda Sadeh 27dfc5606dSYehuda Sadeh Documentation/ABI/testing/sysfs-bus-rbd 28602adf40SYehuda Sadeh 29602adf40SYehuda Sadeh */ 30602adf40SYehuda Sadeh 31602adf40SYehuda Sadeh #include <linux/ceph/libceph.h> 32602adf40SYehuda Sadeh #include <linux/ceph/osd_client.h> 33602adf40SYehuda Sadeh #include <linux/ceph/mon_client.h> 34602adf40SYehuda Sadeh #include <linux/ceph/decode.h> 3559c2be1eSYehuda Sadeh #include <linux/parser.h> 3630d1cff8SAlex Elder #include <linux/bsearch.h> 37602adf40SYehuda Sadeh 38602adf40SYehuda Sadeh #include <linux/kernel.h> 39602adf40SYehuda Sadeh #include <linux/device.h> 40602adf40SYehuda Sadeh #include <linux/module.h> 417ad18afaSChristoph Hellwig #include <linux/blk-mq.h> 42602adf40SYehuda Sadeh #include <linux/fs.h> 43602adf40SYehuda Sadeh #include <linux/blkdev.h> 441c2a9dfeSAlex Elder #include <linux/slab.h> 45f8a22fc2SIlya Dryomov #include <linux/idr.h> 46bc1ecc65SIlya Dryomov #include <linux/workqueue.h> 47602adf40SYehuda Sadeh 48602adf40SYehuda Sadeh #include "rbd_types.h" 49602adf40SYehuda Sadeh 50aafb230eSAlex Elder #define RBD_DEBUG /* Activate rbd_assert() calls */ 51aafb230eSAlex Elder 52593a9e7bSAlex Elder /* 53593a9e7bSAlex Elder * The basic unit of block I/O is a sector. It is interpreted in a 54593a9e7bSAlex Elder * number of contexts in Linux (blk, bio, genhd), but the default is 55593a9e7bSAlex Elder * universally 512 bytes. These symbols are just slightly more 56593a9e7bSAlex Elder * meaningful than the bare numbers they represent. 57593a9e7bSAlex Elder */ 58593a9e7bSAlex Elder #define SECTOR_SHIFT 9 59593a9e7bSAlex Elder #define SECTOR_SIZE (1ULL << SECTOR_SHIFT) 60593a9e7bSAlex Elder 61a2acd00eSAlex Elder /* 62a2acd00eSAlex Elder * Increment the given counter and return its updated value. 63a2acd00eSAlex Elder * If the counter is already 0 it will not be incremented. 
64a2acd00eSAlex Elder * If the counter is already at its maximum value returns 65a2acd00eSAlex Elder * -EINVAL without updating it. 66a2acd00eSAlex Elder */ 67a2acd00eSAlex Elder static int atomic_inc_return_safe(atomic_t *v) 68a2acd00eSAlex Elder { 69a2acd00eSAlex Elder unsigned int counter; 70a2acd00eSAlex Elder 71a2acd00eSAlex Elder counter = (unsigned int)__atomic_add_unless(v, 1, 0); 72a2acd00eSAlex Elder if (counter <= (unsigned int)INT_MAX) 73a2acd00eSAlex Elder return (int)counter; 74a2acd00eSAlex Elder 75a2acd00eSAlex Elder atomic_dec(v); 76a2acd00eSAlex Elder 77a2acd00eSAlex Elder return -EINVAL; 78a2acd00eSAlex Elder } 79a2acd00eSAlex Elder 80a2acd00eSAlex Elder /* Decrement the counter. Return the resulting value, or -EINVAL */ 81a2acd00eSAlex Elder static int atomic_dec_return_safe(atomic_t *v) 82a2acd00eSAlex Elder { 83a2acd00eSAlex Elder int counter; 84a2acd00eSAlex Elder 85a2acd00eSAlex Elder counter = atomic_dec_return(v); 86a2acd00eSAlex Elder if (counter >= 0) 87a2acd00eSAlex Elder return counter; 88a2acd00eSAlex Elder 89a2acd00eSAlex Elder atomic_inc(v); 90a2acd00eSAlex Elder 91a2acd00eSAlex Elder return -EINVAL; 92a2acd00eSAlex Elder } 93a2acd00eSAlex Elder 94f0f8cef5SAlex Elder #define RBD_DRV_NAME "rbd" 95602adf40SYehuda Sadeh 967e513d43SIlya Dryomov #define RBD_MINORS_PER_MAJOR 256 977e513d43SIlya Dryomov #define RBD_SINGLE_MAJOR_PART_SHIFT 4 98602adf40SYehuda Sadeh 996d69bb53SIlya Dryomov #define RBD_MAX_PARENT_CHAIN_LEN 16 1006d69bb53SIlya Dryomov 101d4b125e9SAlex Elder #define RBD_SNAP_DEV_NAME_PREFIX "snap_" 102d4b125e9SAlex Elder #define RBD_MAX_SNAP_NAME_LEN \ 103d4b125e9SAlex Elder (NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1)) 104d4b125e9SAlex Elder 10535d489f9SAlex Elder #define RBD_MAX_SNAP_COUNT 510 /* allows max snapc to fit in 4KB */ 106602adf40SYehuda Sadeh 107602adf40SYehuda Sadeh #define RBD_SNAP_HEAD_NAME "-" 108602adf40SYehuda Sadeh 1099682fc6dSAlex Elder #define BAD_SNAP_INDEX U32_MAX /* invalid index into snap array 
*/ 1109682fc6dSAlex Elder 1119e15b77dSAlex Elder /* This allows a single page to hold an image name sent by OSD */ 1129e15b77dSAlex Elder #define RBD_IMAGE_NAME_LEN_MAX (PAGE_SIZE - sizeof (__le32) - 1) 113589d30e0SAlex Elder #define RBD_IMAGE_ID_LEN_MAX 64 1149e15b77dSAlex Elder 1151e130199SAlex Elder #define RBD_OBJ_PREFIX_LEN_MAX 64 116589d30e0SAlex Elder 117d889140cSAlex Elder /* Feature bits */ 118d889140cSAlex Elder 1195cbf6f12SAlex Elder #define RBD_FEATURE_LAYERING (1<<0) 1205cbf6f12SAlex Elder #define RBD_FEATURE_STRIPINGV2 (1<<1) 1215cbf6f12SAlex Elder #define RBD_FEATURES_ALL \ 1225cbf6f12SAlex Elder (RBD_FEATURE_LAYERING | RBD_FEATURE_STRIPINGV2) 123d889140cSAlex Elder 124d889140cSAlex Elder /* Features supported by this (client software) implementation. */ 125d889140cSAlex Elder 126770eba6eSAlex Elder #define RBD_FEATURES_SUPPORTED (RBD_FEATURES_ALL) 127d889140cSAlex Elder 12881a89793SAlex Elder /* 12981a89793SAlex Elder * An RBD device name will be "rbd#", where the "rbd" comes from 13081a89793SAlex Elder * RBD_DRV_NAME above, and # is a unique integer identifier. 13181a89793SAlex Elder * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big 13281a89793SAlex Elder * enough to hold all possible device names. 
13381a89793SAlex Elder */ 134602adf40SYehuda Sadeh #define DEV_NAME_LEN 32 13581a89793SAlex Elder #define MAX_INT_FORMAT_WIDTH ((5 * sizeof (int)) / 2 + 1) 136602adf40SYehuda Sadeh 137602adf40SYehuda Sadeh /* 138602adf40SYehuda Sadeh * block device image metadata (in-memory version) 139602adf40SYehuda Sadeh */ 140602adf40SYehuda Sadeh struct rbd_image_header { 141f35a4deeSAlex Elder /* These six fields never change for a given rbd image */ 142849b4260SAlex Elder char *object_prefix; 143602adf40SYehuda Sadeh __u8 obj_order; 144602adf40SYehuda Sadeh __u8 crypt_type; 145602adf40SYehuda Sadeh __u8 comp_type; 146f35a4deeSAlex Elder u64 stripe_unit; 147f35a4deeSAlex Elder u64 stripe_count; 148f35a4deeSAlex Elder u64 features; /* Might be changeable someday? */ 149602adf40SYehuda Sadeh 150f84344f3SAlex Elder /* The remaining fields need to be updated occasionally */ 151f84344f3SAlex Elder u64 image_size; 152f84344f3SAlex Elder struct ceph_snap_context *snapc; 153f35a4deeSAlex Elder char *snap_names; /* format 1 only */ 154f35a4deeSAlex Elder u64 *snap_sizes; /* format 1 only */ 15559c2be1eSYehuda Sadeh }; 15659c2be1eSYehuda Sadeh 1570d7dbfceSAlex Elder /* 1580d7dbfceSAlex Elder * An rbd image specification. 1590d7dbfceSAlex Elder * 1600d7dbfceSAlex Elder * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely 161c66c6e0cSAlex Elder * identify an image. Each rbd_dev structure includes a pointer to 162c66c6e0cSAlex Elder * an rbd_spec structure that encapsulates this identity. 163c66c6e0cSAlex Elder * 164c66c6e0cSAlex Elder * Each of the id's in an rbd_spec has an associated name. For a 165c66c6e0cSAlex Elder * user-mapped image, the names are supplied and the id's associated 166c66c6e0cSAlex Elder * with them are looked up. For a layered image, a parent image is 167c66c6e0cSAlex Elder * defined by the tuple, and the names are looked up. 
168c66c6e0cSAlex Elder * 169c66c6e0cSAlex Elder * An rbd_dev structure contains a parent_spec pointer which is 170c66c6e0cSAlex Elder * non-null if the image it represents is a child in a layered 171c66c6e0cSAlex Elder * image. This pointer will refer to the rbd_spec structure used 172c66c6e0cSAlex Elder * by the parent rbd_dev for its own identity (i.e., the structure 173c66c6e0cSAlex Elder * is shared between the parent and child). 174c66c6e0cSAlex Elder * 175c66c6e0cSAlex Elder * Since these structures are populated once, during the discovery 176c66c6e0cSAlex Elder * phase of image construction, they are effectively immutable so 177c66c6e0cSAlex Elder * we make no effort to synchronize access to them. 178c66c6e0cSAlex Elder * 179c66c6e0cSAlex Elder * Note that code herein does not assume the image name is known (it 180c66c6e0cSAlex Elder * could be a null pointer). 1810d7dbfceSAlex Elder */ 1820d7dbfceSAlex Elder struct rbd_spec { 1830d7dbfceSAlex Elder u64 pool_id; 184ecb4dc22SAlex Elder const char *pool_name; 1850d7dbfceSAlex Elder 186ecb4dc22SAlex Elder const char *image_id; 187ecb4dc22SAlex Elder const char *image_name; 1880d7dbfceSAlex Elder 1890d7dbfceSAlex Elder u64 snap_id; 190ecb4dc22SAlex Elder const char *snap_name; 1910d7dbfceSAlex Elder 1920d7dbfceSAlex Elder struct kref kref; 1930d7dbfceSAlex Elder }; 1940d7dbfceSAlex Elder 195602adf40SYehuda Sadeh /* 196f0f8cef5SAlex Elder * an instance of the client. multiple devices may share an rbd client. 197602adf40SYehuda Sadeh */ 198602adf40SYehuda Sadeh struct rbd_client { 199602adf40SYehuda Sadeh struct ceph_client *client; 200602adf40SYehuda Sadeh struct kref kref; 201602adf40SYehuda Sadeh struct list_head node; 202602adf40SYehuda Sadeh }; 203602adf40SYehuda Sadeh 204bf0d5f50SAlex Elder struct rbd_img_request; 205bf0d5f50SAlex Elder typedef void (*rbd_img_callback_t)(struct rbd_img_request *); 206bf0d5f50SAlex Elder 207bf0d5f50SAlex Elder #define BAD_WHICH U32_MAX /* Good which or bad which, which? 
*/ 208bf0d5f50SAlex Elder 209bf0d5f50SAlex Elder struct rbd_obj_request; 210bf0d5f50SAlex Elder typedef void (*rbd_obj_callback_t)(struct rbd_obj_request *); 211bf0d5f50SAlex Elder 2129969ebc5SAlex Elder enum obj_request_type { 2139969ebc5SAlex Elder OBJ_REQUEST_NODATA, OBJ_REQUEST_BIO, OBJ_REQUEST_PAGES 2149969ebc5SAlex Elder }; 215bf0d5f50SAlex Elder 2166d2940c8SGuangliang Zhao enum obj_operation_type { 2176d2940c8SGuangliang Zhao OBJ_OP_WRITE, 2186d2940c8SGuangliang Zhao OBJ_OP_READ, 21990e98c52SGuangliang Zhao OBJ_OP_DISCARD, 2206d2940c8SGuangliang Zhao }; 2216d2940c8SGuangliang Zhao 222926f9b3fSAlex Elder enum obj_req_flags { 223926f9b3fSAlex Elder OBJ_REQ_DONE, /* completion flag: not done = 0, done = 1 */ 2246365d33aSAlex Elder OBJ_REQ_IMG_DATA, /* object usage: standalone = 0, image = 1 */ 2255679c59fSAlex Elder OBJ_REQ_KNOWN, /* EXISTS flag valid: no = 0, yes = 1 */ 2265679c59fSAlex Elder OBJ_REQ_EXISTS, /* target exists: no = 0, yes = 1 */ 227926f9b3fSAlex Elder }; 228926f9b3fSAlex Elder 229bf0d5f50SAlex Elder struct rbd_obj_request { 230bf0d5f50SAlex Elder const char *object_name; 231bf0d5f50SAlex Elder u64 offset; /* object start byte */ 232bf0d5f50SAlex Elder u64 length; /* bytes from offset */ 233926f9b3fSAlex Elder unsigned long flags; 234bf0d5f50SAlex Elder 235c5b5ef6cSAlex Elder /* 236c5b5ef6cSAlex Elder * An object request associated with an image will have its 237c5b5ef6cSAlex Elder * img_data flag set; a standalone object request will not. 238c5b5ef6cSAlex Elder * 239c5b5ef6cSAlex Elder * A standalone object request will have which == BAD_WHICH 240c5b5ef6cSAlex Elder * and a null obj_request pointer. 241c5b5ef6cSAlex Elder * 242c5b5ef6cSAlex Elder * An object request initiated in support of a layered image 243c5b5ef6cSAlex Elder * object (to check for its existence before a write) will 244c5b5ef6cSAlex Elder * have which == BAD_WHICH and a non-null obj_request pointer. 
245c5b5ef6cSAlex Elder * 246c5b5ef6cSAlex Elder * Finally, an object request for rbd image data will have 247c5b5ef6cSAlex Elder * which != BAD_WHICH, and will have a non-null img_request 248c5b5ef6cSAlex Elder * pointer. The value of which will be in the range 249c5b5ef6cSAlex Elder * 0..(img_request->obj_request_count-1). 250c5b5ef6cSAlex Elder */ 251c5b5ef6cSAlex Elder union { 252c5b5ef6cSAlex Elder struct rbd_obj_request *obj_request; /* STAT op */ 253c5b5ef6cSAlex Elder struct { 254bf0d5f50SAlex Elder struct rbd_img_request *img_request; 255c5b5ef6cSAlex Elder u64 img_offset; 256c5b5ef6cSAlex Elder /* links for img_request->obj_requests list */ 257c5b5ef6cSAlex Elder struct list_head links; 258c5b5ef6cSAlex Elder }; 259c5b5ef6cSAlex Elder }; 260bf0d5f50SAlex Elder u32 which; /* posn image request list */ 261bf0d5f50SAlex Elder 262bf0d5f50SAlex Elder enum obj_request_type type; 263788e2df3SAlex Elder union { 264bf0d5f50SAlex Elder struct bio *bio_list; 265788e2df3SAlex Elder struct { 266788e2df3SAlex Elder struct page **pages; 267788e2df3SAlex Elder u32 page_count; 268788e2df3SAlex Elder }; 269788e2df3SAlex Elder }; 2700eefd470SAlex Elder struct page **copyup_pages; 271ebda6408SAlex Elder u32 copyup_page_count; 272bf0d5f50SAlex Elder 273bf0d5f50SAlex Elder struct ceph_osd_request *osd_req; 274bf0d5f50SAlex Elder 275bf0d5f50SAlex Elder u64 xferred; /* bytes transferred */ 2761b83bef2SSage Weil int result; 277bf0d5f50SAlex Elder 278bf0d5f50SAlex Elder rbd_obj_callback_t callback; 279788e2df3SAlex Elder struct completion completion; 280bf0d5f50SAlex Elder 281bf0d5f50SAlex Elder struct kref kref; 282bf0d5f50SAlex Elder }; 283bf0d5f50SAlex Elder 2840c425248SAlex Elder enum img_req_flags { 2859849e986SAlex Elder IMG_REQ_WRITE, /* I/O direction: read = 0, write = 1 */ 2869849e986SAlex Elder IMG_REQ_CHILD, /* initiator: block = 0, child image = 1 */ 287d0b2e944SAlex Elder IMG_REQ_LAYERED, /* ENOENT handling: normal = 0, layered = 1 */ 28890e98c52SGuangliang Zhao 
IMG_REQ_DISCARD, /* discard: normal = 0, discard request = 1 */ 2890c425248SAlex Elder }; 2900c425248SAlex Elder 291bf0d5f50SAlex Elder struct rbd_img_request { 292bf0d5f50SAlex Elder struct rbd_device *rbd_dev; 293bf0d5f50SAlex Elder u64 offset; /* starting image byte offset */ 294bf0d5f50SAlex Elder u64 length; /* byte count from offset */ 2950c425248SAlex Elder unsigned long flags; 296bf0d5f50SAlex Elder union { 297bf0d5f50SAlex Elder u64 snap_id; /* for reads */ 2989849e986SAlex Elder struct ceph_snap_context *snapc; /* for writes */ 2999849e986SAlex Elder }; 3009849e986SAlex Elder union { 3019849e986SAlex Elder struct request *rq; /* block request */ 3029849e986SAlex Elder struct rbd_obj_request *obj_request; /* obj req initiator */ 303bf0d5f50SAlex Elder }; 3043d7efd18SAlex Elder struct page **copyup_pages; 305ebda6408SAlex Elder u32 copyup_page_count; 306bf0d5f50SAlex Elder spinlock_t completion_lock;/* protects next_completion */ 307bf0d5f50SAlex Elder u32 next_completion; 308bf0d5f50SAlex Elder rbd_img_callback_t callback; 30955f27e09SAlex Elder u64 xferred;/* aggregate bytes transferred */ 310a5a337d4SAlex Elder int result; /* first nonzero obj_request result */ 311bf0d5f50SAlex Elder 312bf0d5f50SAlex Elder u32 obj_request_count; 313bf0d5f50SAlex Elder struct list_head obj_requests; /* rbd_obj_request structs */ 314bf0d5f50SAlex Elder 315bf0d5f50SAlex Elder struct kref kref; 316bf0d5f50SAlex Elder }; 317bf0d5f50SAlex Elder 318bf0d5f50SAlex Elder #define for_each_obj_request(ireq, oreq) \ 319ef06f4d3SAlex Elder list_for_each_entry(oreq, &(ireq)->obj_requests, links) 320bf0d5f50SAlex Elder #define for_each_obj_request_from(ireq, oreq) \ 321ef06f4d3SAlex Elder list_for_each_entry_from(oreq, &(ireq)->obj_requests, links) 322bf0d5f50SAlex Elder #define for_each_obj_request_safe(ireq, oreq, n) \ 323ef06f4d3SAlex Elder list_for_each_entry_safe_reverse(oreq, n, &(ireq)->obj_requests, links) 324bf0d5f50SAlex Elder 325f84344f3SAlex Elder struct rbd_mapping { 
32699c1f08fSAlex Elder u64 size; 32734b13184SAlex Elder u64 features; 328f84344f3SAlex Elder bool read_only; 329f84344f3SAlex Elder }; 330f84344f3SAlex Elder 331602adf40SYehuda Sadeh /* 332602adf40SYehuda Sadeh * a single device 333602adf40SYehuda Sadeh */ 334602adf40SYehuda Sadeh struct rbd_device { 335de71a297SAlex Elder int dev_id; /* blkdev unique id */ 336602adf40SYehuda Sadeh 337602adf40SYehuda Sadeh int major; /* blkdev assigned major */ 338dd82fff1SIlya Dryomov int minor; 339602adf40SYehuda Sadeh struct gendisk *disk; /* blkdev's gendisk and rq */ 340602adf40SYehuda Sadeh 341a30b71b9SAlex Elder u32 image_format; /* Either 1 or 2 */ 342602adf40SYehuda Sadeh struct rbd_client *rbd_client; 343602adf40SYehuda Sadeh 344602adf40SYehuda Sadeh char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */ 345602adf40SYehuda Sadeh 346b82d167bSAlex Elder spinlock_t lock; /* queue, flags, open_count */ 347602adf40SYehuda Sadeh 348602adf40SYehuda Sadeh struct rbd_image_header header; 349b82d167bSAlex Elder unsigned long flags; /* possibly lock protected */ 3500d7dbfceSAlex Elder struct rbd_spec *spec; 351d147543dSIlya Dryomov struct rbd_options *opts; 352602adf40SYehuda Sadeh 3530d7dbfceSAlex Elder char *header_name; 354971f839aSAlex Elder 3550903e875SAlex Elder struct ceph_file_layout layout; 3560903e875SAlex Elder 35759c2be1eSYehuda Sadeh struct ceph_osd_event *watch_event; 358975241afSAlex Elder struct rbd_obj_request *watch_request; 35959c2be1eSYehuda Sadeh 36086b00e0dSAlex Elder struct rbd_spec *parent_spec; 36186b00e0dSAlex Elder u64 parent_overlap; 362a2acd00eSAlex Elder atomic_t parent_ref; 3632f82ee54SAlex Elder struct rbd_device *parent; 36486b00e0dSAlex Elder 3657ad18afaSChristoph Hellwig /* Block layer tags. 
*/ 3667ad18afaSChristoph Hellwig struct blk_mq_tag_set tag_set; 3677ad18afaSChristoph Hellwig 368c666601aSJosh Durgin /* protects updating the header */ 369c666601aSJosh Durgin struct rw_semaphore header_rwsem; 370f84344f3SAlex Elder 371f84344f3SAlex Elder struct rbd_mapping mapping; 372602adf40SYehuda Sadeh 373602adf40SYehuda Sadeh struct list_head node; 374dfc5606dSYehuda Sadeh 375dfc5606dSYehuda Sadeh /* sysfs related */ 376dfc5606dSYehuda Sadeh struct device dev; 377b82d167bSAlex Elder unsigned long open_count; /* protected by lock */ 378dfc5606dSYehuda Sadeh }; 379dfc5606dSYehuda Sadeh 380b82d167bSAlex Elder /* 381b82d167bSAlex Elder * Flag bits for rbd_dev->flags. If atomicity is required, 382b82d167bSAlex Elder * rbd_dev->lock is used to protect access. 383b82d167bSAlex Elder * 384b82d167bSAlex Elder * Currently, only the "removing" flag (which is coupled with the 385b82d167bSAlex Elder * "open_count" field) requires atomic access. 386b82d167bSAlex Elder */ 3876d292906SAlex Elder enum rbd_dev_flags { 3886d292906SAlex Elder RBD_DEV_FLAG_EXISTS, /* mapped snapshot has not been deleted */ 389b82d167bSAlex Elder RBD_DEV_FLAG_REMOVING, /* this mapping is being removed */ 3906d292906SAlex Elder }; 3916d292906SAlex Elder 392cfbf6377SAlex Elder static DEFINE_MUTEX(client_mutex); /* Serialize client creation */ 393e124a82fSAlex Elder 394602adf40SYehuda Sadeh static LIST_HEAD(rbd_dev_list); /* devices */ 395e124a82fSAlex Elder static DEFINE_SPINLOCK(rbd_dev_list_lock); 396e124a82fSAlex Elder 397602adf40SYehuda Sadeh static LIST_HEAD(rbd_client_list); /* clients */ 398432b8587SAlex Elder static DEFINE_SPINLOCK(rbd_client_list_lock); 399602adf40SYehuda Sadeh 40078c2a44aSAlex Elder /* Slab caches for frequently-allocated structures */ 40178c2a44aSAlex Elder 4021c2a9dfeSAlex Elder static struct kmem_cache *rbd_img_request_cache; 403868311b1SAlex Elder static struct kmem_cache *rbd_obj_request_cache; 40478c2a44aSAlex Elder static struct kmem_cache *rbd_segment_name_cache; 
4051c2a9dfeSAlex Elder 4069b60e70bSIlya Dryomov static int rbd_major; 407f8a22fc2SIlya Dryomov static DEFINE_IDA(rbd_dev_id_ida); 408f8a22fc2SIlya Dryomov 409f5ee37bdSIlya Dryomov static struct workqueue_struct *rbd_wq; 410f5ee37bdSIlya Dryomov 4119b60e70bSIlya Dryomov /* 4129b60e70bSIlya Dryomov * Default to false for now, as single-major requires >= 0.75 version of 4139b60e70bSIlya Dryomov * userspace rbd utility. 4149b60e70bSIlya Dryomov */ 4159b60e70bSIlya Dryomov static bool single_major = false; 4169b60e70bSIlya Dryomov module_param(single_major, bool, S_IRUGO); 4179b60e70bSIlya Dryomov MODULE_PARM_DESC(single_major, "Use a single major number for all rbd devices (default: false)"); 4189b60e70bSIlya Dryomov 4193d7efd18SAlex Elder static int rbd_img_request_submit(struct rbd_img_request *img_request); 4203d7efd18SAlex Elder 421f0f8cef5SAlex Elder static ssize_t rbd_add(struct bus_type *bus, const char *buf, 422f0f8cef5SAlex Elder size_t count); 423f0f8cef5SAlex Elder static ssize_t rbd_remove(struct bus_type *bus, const char *buf, 424f0f8cef5SAlex Elder size_t count); 4259b60e70bSIlya Dryomov static ssize_t rbd_add_single_major(struct bus_type *bus, const char *buf, 4269b60e70bSIlya Dryomov size_t count); 4279b60e70bSIlya Dryomov static ssize_t rbd_remove_single_major(struct bus_type *bus, const char *buf, 4289b60e70bSIlya Dryomov size_t count); 4296d69bb53SIlya Dryomov static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth); 430a2acd00eSAlex Elder static void rbd_spec_put(struct rbd_spec *spec); 431f0f8cef5SAlex Elder 4329b60e70bSIlya Dryomov static int rbd_dev_id_to_minor(int dev_id) 4339b60e70bSIlya Dryomov { 4347e513d43SIlya Dryomov return dev_id << RBD_SINGLE_MAJOR_PART_SHIFT; 4359b60e70bSIlya Dryomov } 4369b60e70bSIlya Dryomov 4379b60e70bSIlya Dryomov static int minor_to_rbd_dev_id(int minor) 4389b60e70bSIlya Dryomov { 4397e513d43SIlya Dryomov return minor >> RBD_SINGLE_MAJOR_PART_SHIFT; 4409b60e70bSIlya Dryomov } 4419b60e70bSIlya Dryomov 
442b15a21ddSGreg Kroah-Hartman static BUS_ATTR(add, S_IWUSR, NULL, rbd_add); 443b15a21ddSGreg Kroah-Hartman static BUS_ATTR(remove, S_IWUSR, NULL, rbd_remove); 4449b60e70bSIlya Dryomov static BUS_ATTR(add_single_major, S_IWUSR, NULL, rbd_add_single_major); 4459b60e70bSIlya Dryomov static BUS_ATTR(remove_single_major, S_IWUSR, NULL, rbd_remove_single_major); 446b15a21ddSGreg Kroah-Hartman 447b15a21ddSGreg Kroah-Hartman static struct attribute *rbd_bus_attrs[] = { 448b15a21ddSGreg Kroah-Hartman &bus_attr_add.attr, 449b15a21ddSGreg Kroah-Hartman &bus_attr_remove.attr, 4509b60e70bSIlya Dryomov &bus_attr_add_single_major.attr, 4519b60e70bSIlya Dryomov &bus_attr_remove_single_major.attr, 452b15a21ddSGreg Kroah-Hartman NULL, 453f0f8cef5SAlex Elder }; 45492c76dc0SIlya Dryomov 45592c76dc0SIlya Dryomov static umode_t rbd_bus_is_visible(struct kobject *kobj, 45692c76dc0SIlya Dryomov struct attribute *attr, int index) 45792c76dc0SIlya Dryomov { 4589b60e70bSIlya Dryomov if (!single_major && 4599b60e70bSIlya Dryomov (attr == &bus_attr_add_single_major.attr || 4609b60e70bSIlya Dryomov attr == &bus_attr_remove_single_major.attr)) 4619b60e70bSIlya Dryomov return 0; 4629b60e70bSIlya Dryomov 46392c76dc0SIlya Dryomov return attr->mode; 46492c76dc0SIlya Dryomov } 46592c76dc0SIlya Dryomov 46692c76dc0SIlya Dryomov static const struct attribute_group rbd_bus_group = { 46792c76dc0SIlya Dryomov .attrs = rbd_bus_attrs, 46892c76dc0SIlya Dryomov .is_visible = rbd_bus_is_visible, 46992c76dc0SIlya Dryomov }; 47092c76dc0SIlya Dryomov __ATTRIBUTE_GROUPS(rbd_bus); 471f0f8cef5SAlex Elder 472f0f8cef5SAlex Elder static struct bus_type rbd_bus_type = { 473f0f8cef5SAlex Elder .name = "rbd", 474b15a21ddSGreg Kroah-Hartman .bus_groups = rbd_bus_groups, 475f0f8cef5SAlex Elder }; 476f0f8cef5SAlex Elder 477f0f8cef5SAlex Elder static void rbd_root_dev_release(struct device *dev) 478f0f8cef5SAlex Elder { 479f0f8cef5SAlex Elder } 480f0f8cef5SAlex Elder 481f0f8cef5SAlex Elder static struct device rbd_root_dev = 
{ 482f0f8cef5SAlex Elder .init_name = "rbd", 483f0f8cef5SAlex Elder .release = rbd_root_dev_release, 484f0f8cef5SAlex Elder }; 485f0f8cef5SAlex Elder 48606ecc6cbSAlex Elder static __printf(2, 3) 48706ecc6cbSAlex Elder void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...) 48806ecc6cbSAlex Elder { 48906ecc6cbSAlex Elder struct va_format vaf; 49006ecc6cbSAlex Elder va_list args; 49106ecc6cbSAlex Elder 49206ecc6cbSAlex Elder va_start(args, fmt); 49306ecc6cbSAlex Elder vaf.fmt = fmt; 49406ecc6cbSAlex Elder vaf.va = &args; 49506ecc6cbSAlex Elder 49606ecc6cbSAlex Elder if (!rbd_dev) 49706ecc6cbSAlex Elder printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf); 49806ecc6cbSAlex Elder else if (rbd_dev->disk) 49906ecc6cbSAlex Elder printk(KERN_WARNING "%s: %s: %pV\n", 50006ecc6cbSAlex Elder RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf); 50106ecc6cbSAlex Elder else if (rbd_dev->spec && rbd_dev->spec->image_name) 50206ecc6cbSAlex Elder printk(KERN_WARNING "%s: image %s: %pV\n", 50306ecc6cbSAlex Elder RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf); 50406ecc6cbSAlex Elder else if (rbd_dev->spec && rbd_dev->spec->image_id) 50506ecc6cbSAlex Elder printk(KERN_WARNING "%s: id %s: %pV\n", 50606ecc6cbSAlex Elder RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf); 50706ecc6cbSAlex Elder else /* punt */ 50806ecc6cbSAlex Elder printk(KERN_WARNING "%s: rbd_dev %p: %pV\n", 50906ecc6cbSAlex Elder RBD_DRV_NAME, rbd_dev, &vaf); 51006ecc6cbSAlex Elder va_end(args); 51106ecc6cbSAlex Elder } 51206ecc6cbSAlex Elder 513aafb230eSAlex Elder #ifdef RBD_DEBUG 514aafb230eSAlex Elder #define rbd_assert(expr) \ 515aafb230eSAlex Elder if (unlikely(!(expr))) { \ 516aafb230eSAlex Elder printk(KERN_ERR "\nAssertion failure in %s() " \ 517aafb230eSAlex Elder "at line %d:\n\n" \ 518aafb230eSAlex Elder "\trbd_assert(%s);\n\n", \ 519aafb230eSAlex Elder __func__, __LINE__, #expr); \ 520aafb230eSAlex Elder BUG(); \ 521aafb230eSAlex Elder } 522aafb230eSAlex Elder #else /* !RBD_DEBUG */ 523aafb230eSAlex Elder # 
define rbd_assert(expr) ((void) 0) 524aafb230eSAlex Elder #endif /* !RBD_DEBUG */ 525dfc5606dSYehuda Sadeh 5262761713dSIlya Dryomov static void rbd_osd_copyup_callback(struct rbd_obj_request *obj_request); 527b454e36dSAlex Elder static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request); 52805a46afdSAlex Elder static void rbd_img_parent_read(struct rbd_obj_request *obj_request); 52905a46afdSAlex Elder static void rbd_dev_remove_parent(struct rbd_device *rbd_dev); 5308b3e1a56SAlex Elder 531cc4a38bdSAlex Elder static int rbd_dev_refresh(struct rbd_device *rbd_dev); 5322df3fac7SAlex Elder static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev); 533a720ae09SIlya Dryomov static int rbd_dev_header_info(struct rbd_device *rbd_dev); 534e8f59b59SIlya Dryomov static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev); 53554cac61fSAlex Elder static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, 53654cac61fSAlex Elder u64 snap_id); 5372ad3d716SAlex Elder static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id, 5382ad3d716SAlex Elder u8 *order, u64 *snap_size); 5392ad3d716SAlex Elder static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id, 5402ad3d716SAlex Elder u64 *snap_features); 5412ad3d716SAlex Elder static u64 rbd_snap_id_by_name(struct rbd_device *rbd_dev, const char *name); 54259c2be1eSYehuda Sadeh 543602adf40SYehuda Sadeh static int rbd_open(struct block_device *bdev, fmode_t mode) 544602adf40SYehuda Sadeh { 545f0f8cef5SAlex Elder struct rbd_device *rbd_dev = bdev->bd_disk->private_data; 546b82d167bSAlex Elder bool removing = false; 547602adf40SYehuda Sadeh 548f84344f3SAlex Elder if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only) 549602adf40SYehuda Sadeh return -EROFS; 550602adf40SYehuda Sadeh 551a14ea269SAlex Elder spin_lock_irq(&rbd_dev->lock); 552b82d167bSAlex Elder if (test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags)) 553b82d167bSAlex Elder removing = true; 554b82d167bSAlex Elder 
else 555b82d167bSAlex Elder rbd_dev->open_count++; 556a14ea269SAlex Elder spin_unlock_irq(&rbd_dev->lock); 557b82d167bSAlex Elder if (removing) 558b82d167bSAlex Elder return -ENOENT; 559b82d167bSAlex Elder 560c3e946ceSAlex Elder (void) get_device(&rbd_dev->dev); 561340c7a2bSAlex Elder 562602adf40SYehuda Sadeh return 0; 563602adf40SYehuda Sadeh } 564602adf40SYehuda Sadeh 565db2a144bSAl Viro static void rbd_release(struct gendisk *disk, fmode_t mode) 566dfc5606dSYehuda Sadeh { 567dfc5606dSYehuda Sadeh struct rbd_device *rbd_dev = disk->private_data; 568b82d167bSAlex Elder unsigned long open_count_before; 569b82d167bSAlex Elder 570a14ea269SAlex Elder spin_lock_irq(&rbd_dev->lock); 571b82d167bSAlex Elder open_count_before = rbd_dev->open_count--; 572a14ea269SAlex Elder spin_unlock_irq(&rbd_dev->lock); 573b82d167bSAlex Elder rbd_assert(open_count_before > 0); 574dfc5606dSYehuda Sadeh 575c3e946ceSAlex Elder put_device(&rbd_dev->dev); 576dfc5606dSYehuda Sadeh } 577dfc5606dSYehuda Sadeh 578131fd9f6SGuangliang Zhao static int rbd_ioctl_set_ro(struct rbd_device *rbd_dev, unsigned long arg) 579131fd9f6SGuangliang Zhao { 58077f33c03SJosh Durgin int ret = 0; 581131fd9f6SGuangliang Zhao int val; 582131fd9f6SGuangliang Zhao bool ro; 58377f33c03SJosh Durgin bool ro_changed = false; 584131fd9f6SGuangliang Zhao 58577f33c03SJosh Durgin /* get_user() may sleep, so call it before taking rbd_dev->lock */ 586131fd9f6SGuangliang Zhao if (get_user(val, (int __user *)(arg))) 587131fd9f6SGuangliang Zhao return -EFAULT; 588131fd9f6SGuangliang Zhao 589131fd9f6SGuangliang Zhao ro = val ? 
true : false; 590131fd9f6SGuangliang Zhao /* Snapshot doesn't allow to write*/ 591131fd9f6SGuangliang Zhao if (rbd_dev->spec->snap_id != CEPH_NOSNAP && !ro) 592131fd9f6SGuangliang Zhao return -EROFS; 593131fd9f6SGuangliang Zhao 59477f33c03SJosh Durgin spin_lock_irq(&rbd_dev->lock); 59577f33c03SJosh Durgin /* prevent others open this device */ 59677f33c03SJosh Durgin if (rbd_dev->open_count > 1) { 59777f33c03SJosh Durgin ret = -EBUSY; 59877f33c03SJosh Durgin goto out; 599131fd9f6SGuangliang Zhao } 600131fd9f6SGuangliang Zhao 60177f33c03SJosh Durgin if (rbd_dev->mapping.read_only != ro) { 60277f33c03SJosh Durgin rbd_dev->mapping.read_only = ro; 60377f33c03SJosh Durgin ro_changed = true; 60477f33c03SJosh Durgin } 60577f33c03SJosh Durgin 60677f33c03SJosh Durgin out: 60777f33c03SJosh Durgin spin_unlock_irq(&rbd_dev->lock); 60877f33c03SJosh Durgin /* set_disk_ro() may sleep, so call it after releasing rbd_dev->lock */ 60977f33c03SJosh Durgin if (ret == 0 && ro_changed) 61077f33c03SJosh Durgin set_disk_ro(rbd_dev->disk, ro ? 
1 : 0); 61177f33c03SJosh Durgin 61277f33c03SJosh Durgin return ret; 613131fd9f6SGuangliang Zhao } 614131fd9f6SGuangliang Zhao 615131fd9f6SGuangliang Zhao static int rbd_ioctl(struct block_device *bdev, fmode_t mode, 616131fd9f6SGuangliang Zhao unsigned int cmd, unsigned long arg) 617131fd9f6SGuangliang Zhao { 618131fd9f6SGuangliang Zhao struct rbd_device *rbd_dev = bdev->bd_disk->private_data; 619131fd9f6SGuangliang Zhao int ret = 0; 620131fd9f6SGuangliang Zhao 621131fd9f6SGuangliang Zhao switch (cmd) { 622131fd9f6SGuangliang Zhao case BLKROSET: 623131fd9f6SGuangliang Zhao ret = rbd_ioctl_set_ro(rbd_dev, arg); 624131fd9f6SGuangliang Zhao break; 625131fd9f6SGuangliang Zhao default: 626131fd9f6SGuangliang Zhao ret = -ENOTTY; 627131fd9f6SGuangliang Zhao } 628131fd9f6SGuangliang Zhao 629131fd9f6SGuangliang Zhao return ret; 630131fd9f6SGuangliang Zhao } 631131fd9f6SGuangliang Zhao 632131fd9f6SGuangliang Zhao #ifdef CONFIG_COMPAT 633131fd9f6SGuangliang Zhao static int rbd_compat_ioctl(struct block_device *bdev, fmode_t mode, 634131fd9f6SGuangliang Zhao unsigned int cmd, unsigned long arg) 635131fd9f6SGuangliang Zhao { 636131fd9f6SGuangliang Zhao return rbd_ioctl(bdev, mode, cmd, arg); 637131fd9f6SGuangliang Zhao } 638131fd9f6SGuangliang Zhao #endif /* CONFIG_COMPAT */ 639131fd9f6SGuangliang Zhao 640602adf40SYehuda Sadeh static const struct block_device_operations rbd_bd_ops = { 641602adf40SYehuda Sadeh .owner = THIS_MODULE, 642602adf40SYehuda Sadeh .open = rbd_open, 643dfc5606dSYehuda Sadeh .release = rbd_release, 644131fd9f6SGuangliang Zhao .ioctl = rbd_ioctl, 645131fd9f6SGuangliang Zhao #ifdef CONFIG_COMPAT 646131fd9f6SGuangliang Zhao .compat_ioctl = rbd_compat_ioctl, 647131fd9f6SGuangliang Zhao #endif 648602adf40SYehuda Sadeh }; 649602adf40SYehuda Sadeh 650602adf40SYehuda Sadeh /* 6517262cfcaSAlex Elder * Initialize an rbd client instance. Success or not, this function 652cfbf6377SAlex Elder * consumes ceph_opts. Caller holds client_mutex. 
653602adf40SYehuda Sadeh */ 654f8c38929SAlex Elder static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts) 655602adf40SYehuda Sadeh { 656602adf40SYehuda Sadeh struct rbd_client *rbdc; 657602adf40SYehuda Sadeh int ret = -ENOMEM; 658602adf40SYehuda Sadeh 65937206ee5SAlex Elder dout("%s:\n", __func__); 660602adf40SYehuda Sadeh rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL); 661602adf40SYehuda Sadeh if (!rbdc) 662602adf40SYehuda Sadeh goto out_opt; 663602adf40SYehuda Sadeh 664602adf40SYehuda Sadeh kref_init(&rbdc->kref); 665602adf40SYehuda Sadeh INIT_LIST_HEAD(&rbdc->node); 666602adf40SYehuda Sadeh 66743ae4701SAlex Elder rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0); 668602adf40SYehuda Sadeh if (IS_ERR(rbdc->client)) 66908f75463SAlex Elder goto out_rbdc; 67043ae4701SAlex Elder ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */ 671602adf40SYehuda Sadeh 672602adf40SYehuda Sadeh ret = ceph_open_session(rbdc->client); 673602adf40SYehuda Sadeh if (ret < 0) 67408f75463SAlex Elder goto out_client; 675602adf40SYehuda Sadeh 676432b8587SAlex Elder spin_lock(&rbd_client_list_lock); 677602adf40SYehuda Sadeh list_add_tail(&rbdc->node, &rbd_client_list); 678432b8587SAlex Elder spin_unlock(&rbd_client_list_lock); 679602adf40SYehuda Sadeh 68037206ee5SAlex Elder dout("%s: rbdc %p\n", __func__, rbdc); 681bc534d86SAlex Elder 682602adf40SYehuda Sadeh return rbdc; 68308f75463SAlex Elder out_client: 684602adf40SYehuda Sadeh ceph_destroy_client(rbdc->client); 68508f75463SAlex Elder out_rbdc: 686602adf40SYehuda Sadeh kfree(rbdc); 687602adf40SYehuda Sadeh out_opt: 68843ae4701SAlex Elder if (ceph_opts) 68943ae4701SAlex Elder ceph_destroy_options(ceph_opts); 69037206ee5SAlex Elder dout("%s: error %d\n", __func__, ret); 69137206ee5SAlex Elder 69228f259b7SVasiliy Kulikov return ERR_PTR(ret); 693602adf40SYehuda Sadeh } 694602adf40SYehuda Sadeh 6952f82ee54SAlex Elder static struct rbd_client *__rbd_get_client(struct rbd_client *rbdc) 
6962f82ee54SAlex Elder { 6972f82ee54SAlex Elder kref_get(&rbdc->kref); 6982f82ee54SAlex Elder 6992f82ee54SAlex Elder return rbdc; 7002f82ee54SAlex Elder } 7012f82ee54SAlex Elder 702602adf40SYehuda Sadeh /* 7031f7ba331SAlex Elder * Find a ceph client with specific addr and configuration. If 7041f7ba331SAlex Elder * found, bump its reference count. 705602adf40SYehuda Sadeh */ 7061f7ba331SAlex Elder static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts) 707602adf40SYehuda Sadeh { 708602adf40SYehuda Sadeh struct rbd_client *client_node; 7091f7ba331SAlex Elder bool found = false; 710602adf40SYehuda Sadeh 71143ae4701SAlex Elder if (ceph_opts->flags & CEPH_OPT_NOSHARE) 712602adf40SYehuda Sadeh return NULL; 713602adf40SYehuda Sadeh 7141f7ba331SAlex Elder spin_lock(&rbd_client_list_lock); 7151f7ba331SAlex Elder list_for_each_entry(client_node, &rbd_client_list, node) { 7161f7ba331SAlex Elder if (!ceph_compare_options(ceph_opts, client_node->client)) { 7172f82ee54SAlex Elder __rbd_get_client(client_node); 7182f82ee54SAlex Elder 7191f7ba331SAlex Elder found = true; 7201f7ba331SAlex Elder break; 7211f7ba331SAlex Elder } 7221f7ba331SAlex Elder } 7231f7ba331SAlex Elder spin_unlock(&rbd_client_list_lock); 7241f7ba331SAlex Elder 7251f7ba331SAlex Elder return found ? 
client_node : NULL; 726602adf40SYehuda Sadeh } 727602adf40SYehuda Sadeh 728602adf40SYehuda Sadeh /* 729210c104cSIlya Dryomov * (Per device) rbd map options 73059c2be1eSYehuda Sadeh */ 73159c2be1eSYehuda Sadeh enum { 732b5584180SIlya Dryomov Opt_queue_depth, 73359c2be1eSYehuda Sadeh Opt_last_int, 73459c2be1eSYehuda Sadeh /* int args above */ 73559c2be1eSYehuda Sadeh Opt_last_string, 73659c2be1eSYehuda Sadeh /* string args above */ 737cc0538b6SAlex Elder Opt_read_only, 738cc0538b6SAlex Elder Opt_read_write, 739210c104cSIlya Dryomov Opt_err 74059c2be1eSYehuda Sadeh }; 74159c2be1eSYehuda Sadeh 74243ae4701SAlex Elder static match_table_t rbd_opts_tokens = { 743b5584180SIlya Dryomov {Opt_queue_depth, "queue_depth=%d"}, 74459c2be1eSYehuda Sadeh /* int args above */ 74559c2be1eSYehuda Sadeh /* string args above */ 746be466c1cSAlex Elder {Opt_read_only, "read_only"}, 747cc0538b6SAlex Elder {Opt_read_only, "ro"}, /* Alternate spelling */ 748cc0538b6SAlex Elder {Opt_read_write, "read_write"}, 749cc0538b6SAlex Elder {Opt_read_write, "rw"}, /* Alternate spelling */ 750210c104cSIlya Dryomov {Opt_err, NULL} 75159c2be1eSYehuda Sadeh }; 75259c2be1eSYehuda Sadeh 75398571b5aSAlex Elder struct rbd_options { 754b5584180SIlya Dryomov int queue_depth; 75598571b5aSAlex Elder bool read_only; 75698571b5aSAlex Elder }; 75798571b5aSAlex Elder 758b5584180SIlya Dryomov #define RBD_QUEUE_DEPTH_DEFAULT BLKDEV_MAX_RQ 75998571b5aSAlex Elder #define RBD_READ_ONLY_DEFAULT false 76098571b5aSAlex Elder 76159c2be1eSYehuda Sadeh static int parse_rbd_opts_token(char *c, void *private) 76259c2be1eSYehuda Sadeh { 76343ae4701SAlex Elder struct rbd_options *rbd_opts = private; 76459c2be1eSYehuda Sadeh substring_t argstr[MAX_OPT_ARGS]; 76559c2be1eSYehuda Sadeh int token, intval, ret; 76659c2be1eSYehuda Sadeh 76743ae4701SAlex Elder token = match_token(c, rbd_opts_tokens, argstr); 76859c2be1eSYehuda Sadeh if (token < Opt_last_int) { 76959c2be1eSYehuda Sadeh ret = match_int(&argstr[0], &intval); 
77059c2be1eSYehuda Sadeh if (ret < 0) { 771210c104cSIlya Dryomov pr_err("bad mount option arg (not int) at '%s'\n", c); 77259c2be1eSYehuda Sadeh return ret; 77359c2be1eSYehuda Sadeh } 77459c2be1eSYehuda Sadeh dout("got int token %d val %d\n", token, intval); 77559c2be1eSYehuda Sadeh } else if (token > Opt_last_int && token < Opt_last_string) { 776210c104cSIlya Dryomov dout("got string token %d val %s\n", token, argstr[0].from); 77759c2be1eSYehuda Sadeh } else { 77859c2be1eSYehuda Sadeh dout("got token %d\n", token); 77959c2be1eSYehuda Sadeh } 78059c2be1eSYehuda Sadeh 78159c2be1eSYehuda Sadeh switch (token) { 782b5584180SIlya Dryomov case Opt_queue_depth: 783b5584180SIlya Dryomov if (intval < 1) { 784b5584180SIlya Dryomov pr_err("queue_depth out of range\n"); 785b5584180SIlya Dryomov return -EINVAL; 786b5584180SIlya Dryomov } 787b5584180SIlya Dryomov rbd_opts->queue_depth = intval; 788b5584180SIlya Dryomov break; 789cc0538b6SAlex Elder case Opt_read_only: 790cc0538b6SAlex Elder rbd_opts->read_only = true; 791cc0538b6SAlex Elder break; 792cc0538b6SAlex Elder case Opt_read_write: 793cc0538b6SAlex Elder rbd_opts->read_only = false; 794cc0538b6SAlex Elder break; 79559c2be1eSYehuda Sadeh default: 796210c104cSIlya Dryomov /* libceph prints "bad option" msg */ 797210c104cSIlya Dryomov return -EINVAL; 79859c2be1eSYehuda Sadeh } 799210c104cSIlya Dryomov 80059c2be1eSYehuda Sadeh return 0; 80159c2be1eSYehuda Sadeh } 80259c2be1eSYehuda Sadeh 8036d2940c8SGuangliang Zhao static char* obj_op_name(enum obj_operation_type op_type) 8046d2940c8SGuangliang Zhao { 8056d2940c8SGuangliang Zhao switch (op_type) { 8066d2940c8SGuangliang Zhao case OBJ_OP_READ: 8076d2940c8SGuangliang Zhao return "read"; 8086d2940c8SGuangliang Zhao case OBJ_OP_WRITE: 8096d2940c8SGuangliang Zhao return "write"; 81090e98c52SGuangliang Zhao case OBJ_OP_DISCARD: 81190e98c52SGuangliang Zhao return "discard"; 8126d2940c8SGuangliang Zhao default: 8136d2940c8SGuangliang Zhao return "???"; 8146d2940c8SGuangliang Zhao } 
8156d2940c8SGuangliang Zhao } 8166d2940c8SGuangliang Zhao 81759c2be1eSYehuda Sadeh /* 818602adf40SYehuda Sadeh * Get a ceph client with specific addr and configuration, if one does 8197262cfcaSAlex Elder * not exist create it. Either way, ceph_opts is consumed by this 8207262cfcaSAlex Elder * function. 821602adf40SYehuda Sadeh */ 8229d3997fdSAlex Elder static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts) 823602adf40SYehuda Sadeh { 824f8c38929SAlex Elder struct rbd_client *rbdc; 82559c2be1eSYehuda Sadeh 826cfbf6377SAlex Elder mutex_lock_nested(&client_mutex, SINGLE_DEPTH_NESTING); 8271f7ba331SAlex Elder rbdc = rbd_client_find(ceph_opts); 8289d3997fdSAlex Elder if (rbdc) /* using an existing client */ 82943ae4701SAlex Elder ceph_destroy_options(ceph_opts); 8309d3997fdSAlex Elder else 831f8c38929SAlex Elder rbdc = rbd_client_create(ceph_opts); 832cfbf6377SAlex Elder mutex_unlock(&client_mutex); 833d720bcb0SAlex Elder 8349d3997fdSAlex Elder return rbdc; 835602adf40SYehuda Sadeh } 836602adf40SYehuda Sadeh 837602adf40SYehuda Sadeh /* 838602adf40SYehuda Sadeh * Destroy ceph client 839d23a4b3fSAlex Elder * 840432b8587SAlex Elder * Caller must hold rbd_client_list_lock. 841602adf40SYehuda Sadeh */ 842602adf40SYehuda Sadeh static void rbd_client_release(struct kref *kref) 843602adf40SYehuda Sadeh { 844602adf40SYehuda Sadeh struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref); 845602adf40SYehuda Sadeh 84637206ee5SAlex Elder dout("%s: rbdc %p\n", __func__, rbdc); 847cd9d9f5dSAlex Elder spin_lock(&rbd_client_list_lock); 848602adf40SYehuda Sadeh list_del(&rbdc->node); 849cd9d9f5dSAlex Elder spin_unlock(&rbd_client_list_lock); 850602adf40SYehuda Sadeh 851602adf40SYehuda Sadeh ceph_destroy_client(rbdc->client); 852602adf40SYehuda Sadeh kfree(rbdc); 853602adf40SYehuda Sadeh } 854602adf40SYehuda Sadeh 855602adf40SYehuda Sadeh /* 856602adf40SYehuda Sadeh * Drop reference to ceph client node. 
If it's not referenced anymore, release 857602adf40SYehuda Sadeh * it. 858602adf40SYehuda Sadeh */ 8599d3997fdSAlex Elder static void rbd_put_client(struct rbd_client *rbdc) 860602adf40SYehuda Sadeh { 861c53d5893SAlex Elder if (rbdc) 8629d3997fdSAlex Elder kref_put(&rbdc->kref, rbd_client_release); 863602adf40SYehuda Sadeh } 864602adf40SYehuda Sadeh 865a30b71b9SAlex Elder static bool rbd_image_format_valid(u32 image_format) 866a30b71b9SAlex Elder { 867a30b71b9SAlex Elder return image_format == 1 || image_format == 2; 868a30b71b9SAlex Elder } 869a30b71b9SAlex Elder 8708e94af8eSAlex Elder static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk) 8718e94af8eSAlex Elder { 872103a150fSAlex Elder size_t size; 873103a150fSAlex Elder u32 snap_count; 874103a150fSAlex Elder 875103a150fSAlex Elder /* The header has to start with the magic rbd header text */ 876103a150fSAlex Elder if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT))) 877103a150fSAlex Elder return false; 878103a150fSAlex Elder 879db2388b6SAlex Elder /* The bio layer requires at least sector-sized I/O */ 880db2388b6SAlex Elder 881db2388b6SAlex Elder if (ondisk->options.order < SECTOR_SHIFT) 882db2388b6SAlex Elder return false; 883db2388b6SAlex Elder 884db2388b6SAlex Elder /* If we use u64 in a few spots we may be able to loosen this */ 885db2388b6SAlex Elder 886db2388b6SAlex Elder if (ondisk->options.order > 8 * sizeof (int) - 1) 887db2388b6SAlex Elder return false; 888db2388b6SAlex Elder 889103a150fSAlex Elder /* 890103a150fSAlex Elder * The size of a snapshot header has to fit in a size_t, and 891103a150fSAlex Elder * that limits the number of snapshots. 
892103a150fSAlex Elder */ 893103a150fSAlex Elder snap_count = le32_to_cpu(ondisk->snap_count); 894103a150fSAlex Elder size = SIZE_MAX - sizeof (struct ceph_snap_context); 895103a150fSAlex Elder if (snap_count > size / sizeof (__le64)) 896103a150fSAlex Elder return false; 897103a150fSAlex Elder 898103a150fSAlex Elder /* 899103a150fSAlex Elder * Not only that, but the size of the entire the snapshot 900103a150fSAlex Elder * header must also be representable in a size_t. 901103a150fSAlex Elder */ 902103a150fSAlex Elder size -= snap_count * sizeof (__le64); 903103a150fSAlex Elder if ((u64) size < le64_to_cpu(ondisk->snap_names_len)) 904103a150fSAlex Elder return false; 905103a150fSAlex Elder 906103a150fSAlex Elder return true; 9078e94af8eSAlex Elder } 9088e94af8eSAlex Elder 909602adf40SYehuda Sadeh /* 910bb23e37aSAlex Elder * Fill an rbd image header with information from the given format 1 911bb23e37aSAlex Elder * on-disk header. 912602adf40SYehuda Sadeh */ 913662518b1SAlex Elder static int rbd_header_from_disk(struct rbd_device *rbd_dev, 9144156d998SAlex Elder struct rbd_image_header_ondisk *ondisk) 915602adf40SYehuda Sadeh { 916662518b1SAlex Elder struct rbd_image_header *header = &rbd_dev->header; 917bb23e37aSAlex Elder bool first_time = header->object_prefix == NULL; 918bb23e37aSAlex Elder struct ceph_snap_context *snapc; 919bb23e37aSAlex Elder char *object_prefix = NULL; 920bb23e37aSAlex Elder char *snap_names = NULL; 921bb23e37aSAlex Elder u64 *snap_sizes = NULL; 922ccece235SAlex Elder u32 snap_count; 923d2bb24e5SAlex Elder size_t size; 924bb23e37aSAlex Elder int ret = -ENOMEM; 925621901d6SAlex Elder u32 i; 926602adf40SYehuda Sadeh 927bb23e37aSAlex Elder /* Allocate this now to avoid having to handle failure below */ 928103a150fSAlex Elder 929bb23e37aSAlex Elder if (first_time) { 930bb23e37aSAlex Elder size_t len; 931bb23e37aSAlex Elder 932bb23e37aSAlex Elder len = strnlen(ondisk->object_prefix, 933bb23e37aSAlex Elder sizeof (ondisk->object_prefix)); 
934bb23e37aSAlex Elder object_prefix = kmalloc(len + 1, GFP_KERNEL); 935bb23e37aSAlex Elder if (!object_prefix) 936602adf40SYehuda Sadeh return -ENOMEM; 937bb23e37aSAlex Elder memcpy(object_prefix, ondisk->object_prefix, len); 938bb23e37aSAlex Elder object_prefix[len] = '\0'; 939bb23e37aSAlex Elder } 94000f1f36fSAlex Elder 941bb23e37aSAlex Elder /* Allocate the snapshot context and fill it in */ 942d2bb24e5SAlex Elder 943602adf40SYehuda Sadeh snap_count = le32_to_cpu(ondisk->snap_count); 944bb23e37aSAlex Elder snapc = ceph_create_snap_context(snap_count, GFP_KERNEL); 945bb23e37aSAlex Elder if (!snapc) 946bb23e37aSAlex Elder goto out_err; 947bb23e37aSAlex Elder snapc->seq = le64_to_cpu(ondisk->snap_seq); 948602adf40SYehuda Sadeh if (snap_count) { 949bb23e37aSAlex Elder struct rbd_image_snap_ondisk *snaps; 950f785cc1dSAlex Elder u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len); 951f785cc1dSAlex Elder 952bb23e37aSAlex Elder /* We'll keep a copy of the snapshot names... */ 953621901d6SAlex Elder 954f785cc1dSAlex Elder if (snap_names_len > (u64)SIZE_MAX) 955bb23e37aSAlex Elder goto out_2big; 956bb23e37aSAlex Elder snap_names = kmalloc(snap_names_len, GFP_KERNEL); 957bb23e37aSAlex Elder if (!snap_names) 958602adf40SYehuda Sadeh goto out_err; 959bb23e37aSAlex Elder 960bb23e37aSAlex Elder /* ...as well as the array of their sizes. */ 961bb23e37aSAlex Elder 962bb23e37aSAlex Elder size = snap_count * sizeof (*header->snap_sizes); 963bb23e37aSAlex Elder snap_sizes = kmalloc(size, GFP_KERNEL); 964bb23e37aSAlex Elder if (!snap_sizes) 965bb23e37aSAlex Elder goto out_err; 966bb23e37aSAlex Elder 967f785cc1dSAlex Elder /* 968bb23e37aSAlex Elder * Copy the names, and fill in each snapshot's id 969bb23e37aSAlex Elder * and size. 
970bb23e37aSAlex Elder * 97199a41ebcSAlex Elder * Note that rbd_dev_v1_header_info() guarantees the 972bb23e37aSAlex Elder * ondisk buffer we're working with has 973f785cc1dSAlex Elder * snap_names_len bytes beyond the end of the 974f785cc1dSAlex Elder * snapshot id array, this memcpy() is safe. 975f785cc1dSAlex Elder */ 976bb23e37aSAlex Elder memcpy(snap_names, &ondisk->snaps[snap_count], snap_names_len); 977bb23e37aSAlex Elder snaps = ondisk->snaps; 978bb23e37aSAlex Elder for (i = 0; i < snap_count; i++) { 979bb23e37aSAlex Elder snapc->snaps[i] = le64_to_cpu(snaps[i].id); 980bb23e37aSAlex Elder snap_sizes[i] = le64_to_cpu(snaps[i].image_size); 981bb23e37aSAlex Elder } 982602adf40SYehuda Sadeh } 983849b4260SAlex Elder 984bb23e37aSAlex Elder /* We won't fail any more, fill in the header */ 985bb23e37aSAlex Elder 986bb23e37aSAlex Elder if (first_time) { 987bb23e37aSAlex Elder header->object_prefix = object_prefix; 988602adf40SYehuda Sadeh header->obj_order = ondisk->options.order; 989602adf40SYehuda Sadeh header->crypt_type = ondisk->options.crypt_type; 990602adf40SYehuda Sadeh header->comp_type = ondisk->options.comp_type; 991bb23e37aSAlex Elder /* The rest aren't used for format 1 images */ 992bb23e37aSAlex Elder header->stripe_unit = 0; 993bb23e37aSAlex Elder header->stripe_count = 0; 994bb23e37aSAlex Elder header->features = 0; 995662518b1SAlex Elder } else { 996662518b1SAlex Elder ceph_put_snap_context(header->snapc); 997662518b1SAlex Elder kfree(header->snap_names); 998662518b1SAlex Elder kfree(header->snap_sizes); 999bb23e37aSAlex Elder } 10006a52325fSAlex Elder 1001bb23e37aSAlex Elder /* The remaining fields always get updated (when we refresh) */ 1002621901d6SAlex Elder 1003f84344f3SAlex Elder header->image_size = le64_to_cpu(ondisk->image_size); 1004bb23e37aSAlex Elder header->snapc = snapc; 1005bb23e37aSAlex Elder header->snap_names = snap_names; 1006bb23e37aSAlex Elder header->snap_sizes = snap_sizes; 1007468521c1SAlex Elder 1008602adf40SYehuda Sadeh 
return 0; 1009bb23e37aSAlex Elder out_2big: 1010bb23e37aSAlex Elder ret = -EIO; 10116a52325fSAlex Elder out_err: 1012bb23e37aSAlex Elder kfree(snap_sizes); 1013bb23e37aSAlex Elder kfree(snap_names); 1014bb23e37aSAlex Elder ceph_put_snap_context(snapc); 1015bb23e37aSAlex Elder kfree(object_prefix); 1016ccece235SAlex Elder 1017bb23e37aSAlex Elder return ret; 1018602adf40SYehuda Sadeh } 1019602adf40SYehuda Sadeh 10209682fc6dSAlex Elder static const char *_rbd_dev_v1_snap_name(struct rbd_device *rbd_dev, u32 which) 10219682fc6dSAlex Elder { 10229682fc6dSAlex Elder const char *snap_name; 10239682fc6dSAlex Elder 10249682fc6dSAlex Elder rbd_assert(which < rbd_dev->header.snapc->num_snaps); 10259682fc6dSAlex Elder 10269682fc6dSAlex Elder /* Skip over names until we find the one we are looking for */ 10279682fc6dSAlex Elder 10289682fc6dSAlex Elder snap_name = rbd_dev->header.snap_names; 10299682fc6dSAlex Elder while (which--) 10309682fc6dSAlex Elder snap_name += strlen(snap_name) + 1; 10319682fc6dSAlex Elder 10329682fc6dSAlex Elder return kstrdup(snap_name, GFP_KERNEL); 10339682fc6dSAlex Elder } 10349682fc6dSAlex Elder 103530d1cff8SAlex Elder /* 103630d1cff8SAlex Elder * Snapshot id comparison function for use with qsort()/bsearch(). 103730d1cff8SAlex Elder * Note that result is for snapshots in *descending* order. 103830d1cff8SAlex Elder */ 103930d1cff8SAlex Elder static int snapid_compare_reverse(const void *s1, const void *s2) 104030d1cff8SAlex Elder { 104130d1cff8SAlex Elder u64 snap_id1 = *(u64 *)s1; 104230d1cff8SAlex Elder u64 snap_id2 = *(u64 *)s2; 104330d1cff8SAlex Elder 104430d1cff8SAlex Elder if (snap_id1 < snap_id2) 104530d1cff8SAlex Elder return 1; 104630d1cff8SAlex Elder return snap_id1 == snap_id2 ? 0 : -1; 104730d1cff8SAlex Elder } 104830d1cff8SAlex Elder 104930d1cff8SAlex Elder /* 105030d1cff8SAlex Elder * Search a snapshot context to see if the given snapshot id is 105130d1cff8SAlex Elder * present. 
105230d1cff8SAlex Elder * 105330d1cff8SAlex Elder * Returns the position of the snapshot id in the array if it's found, 105430d1cff8SAlex Elder * or BAD_SNAP_INDEX otherwise. 105530d1cff8SAlex Elder * 105630d1cff8SAlex Elder * Note: The snapshot array is in kept sorted (by the osd) in 105730d1cff8SAlex Elder * reverse order, highest snapshot id first. 105830d1cff8SAlex Elder */ 10599682fc6dSAlex Elder static u32 rbd_dev_snap_index(struct rbd_device *rbd_dev, u64 snap_id) 10609682fc6dSAlex Elder { 10619682fc6dSAlex Elder struct ceph_snap_context *snapc = rbd_dev->header.snapc; 106230d1cff8SAlex Elder u64 *found; 10639682fc6dSAlex Elder 106430d1cff8SAlex Elder found = bsearch(&snap_id, &snapc->snaps, snapc->num_snaps, 106530d1cff8SAlex Elder sizeof (snap_id), snapid_compare_reverse); 10669682fc6dSAlex Elder 106730d1cff8SAlex Elder return found ? (u32)(found - &snapc->snaps[0]) : BAD_SNAP_INDEX; 10689682fc6dSAlex Elder } 10699682fc6dSAlex Elder 10702ad3d716SAlex Elder static const char *rbd_dev_v1_snap_name(struct rbd_device *rbd_dev, 10712ad3d716SAlex Elder u64 snap_id) 107254cac61fSAlex Elder { 107354cac61fSAlex Elder u32 which; 1074da6a6b63SJosh Durgin const char *snap_name; 107554cac61fSAlex Elder 107654cac61fSAlex Elder which = rbd_dev_snap_index(rbd_dev, snap_id); 107754cac61fSAlex Elder if (which == BAD_SNAP_INDEX) 1078da6a6b63SJosh Durgin return ERR_PTR(-ENOENT); 107954cac61fSAlex Elder 1080da6a6b63SJosh Durgin snap_name = _rbd_dev_v1_snap_name(rbd_dev, which); 1081da6a6b63SJosh Durgin return snap_name ? 
snap_name : ERR_PTR(-ENOMEM); 108254cac61fSAlex Elder } 108354cac61fSAlex Elder 10849e15b77dSAlex Elder static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id) 10859e15b77dSAlex Elder { 10869e15b77dSAlex Elder if (snap_id == CEPH_NOSNAP) 10879e15b77dSAlex Elder return RBD_SNAP_HEAD_NAME; 10889e15b77dSAlex Elder 108954cac61fSAlex Elder rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); 109054cac61fSAlex Elder if (rbd_dev->image_format == 1) 109154cac61fSAlex Elder return rbd_dev_v1_snap_name(rbd_dev, snap_id); 10929e15b77dSAlex Elder 109354cac61fSAlex Elder return rbd_dev_v2_snap_name(rbd_dev, snap_id); 10949e15b77dSAlex Elder } 10959e15b77dSAlex Elder 10962ad3d716SAlex Elder static int rbd_snap_size(struct rbd_device *rbd_dev, u64 snap_id, 10972ad3d716SAlex Elder u64 *snap_size) 1098602adf40SYehuda Sadeh { 10992ad3d716SAlex Elder rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); 11002ad3d716SAlex Elder if (snap_id == CEPH_NOSNAP) { 11012ad3d716SAlex Elder *snap_size = rbd_dev->header.image_size; 11022ad3d716SAlex Elder } else if (rbd_dev->image_format == 1) { 11032ad3d716SAlex Elder u32 which; 110400f1f36fSAlex Elder 11052ad3d716SAlex Elder which = rbd_dev_snap_index(rbd_dev, snap_id); 11062ad3d716SAlex Elder if (which == BAD_SNAP_INDEX) 11072ad3d716SAlex Elder return -ENOENT; 110800f1f36fSAlex Elder 11092ad3d716SAlex Elder *snap_size = rbd_dev->header.snap_sizes[which]; 11102ad3d716SAlex Elder } else { 11112ad3d716SAlex Elder u64 size = 0; 11122ad3d716SAlex Elder int ret; 11132ad3d716SAlex Elder 11142ad3d716SAlex Elder ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, NULL, &size); 11152ad3d716SAlex Elder if (ret) 11162ad3d716SAlex Elder return ret; 11172ad3d716SAlex Elder 11182ad3d716SAlex Elder *snap_size = size; 11192ad3d716SAlex Elder } 11202ad3d716SAlex Elder return 0; 11212ad3d716SAlex Elder } 11222ad3d716SAlex Elder 11232ad3d716SAlex Elder static int rbd_snap_features(struct rbd_device *rbd_dev, u64 snap_id, 11242ad3d716SAlex 
Elder u64 *snap_features) 11252ad3d716SAlex Elder { 11262ad3d716SAlex Elder rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); 11272ad3d716SAlex Elder if (snap_id == CEPH_NOSNAP) { 11282ad3d716SAlex Elder *snap_features = rbd_dev->header.features; 11292ad3d716SAlex Elder } else if (rbd_dev->image_format == 1) { 11302ad3d716SAlex Elder *snap_features = 0; /* No features for format 1 */ 11312ad3d716SAlex Elder } else { 11322ad3d716SAlex Elder u64 features = 0; 11332ad3d716SAlex Elder int ret; 11342ad3d716SAlex Elder 11352ad3d716SAlex Elder ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, &features); 11362ad3d716SAlex Elder if (ret) 11372ad3d716SAlex Elder return ret; 11382ad3d716SAlex Elder 11392ad3d716SAlex Elder *snap_features = features; 11402ad3d716SAlex Elder } 11412ad3d716SAlex Elder return 0; 114200f1f36fSAlex Elder } 1143602adf40SYehuda Sadeh 1144d1cf5788SAlex Elder static int rbd_dev_mapping_set(struct rbd_device *rbd_dev) 1145602adf40SYehuda Sadeh { 11468f4b7d98SAlex Elder u64 snap_id = rbd_dev->spec->snap_id; 11472ad3d716SAlex Elder u64 size = 0; 11482ad3d716SAlex Elder u64 features = 0; 11492ad3d716SAlex Elder int ret; 11508b0241f8SAlex Elder 11512ad3d716SAlex Elder ret = rbd_snap_size(rbd_dev, snap_id, &size); 11522ad3d716SAlex Elder if (ret) 11532ad3d716SAlex Elder return ret; 11542ad3d716SAlex Elder ret = rbd_snap_features(rbd_dev, snap_id, &features); 11552ad3d716SAlex Elder if (ret) 11562ad3d716SAlex Elder return ret; 11572ad3d716SAlex Elder 11582ad3d716SAlex Elder rbd_dev->mapping.size = size; 11592ad3d716SAlex Elder rbd_dev->mapping.features = features; 11602ad3d716SAlex Elder 11618b0241f8SAlex Elder return 0; 1162602adf40SYehuda Sadeh } 1163602adf40SYehuda Sadeh 1164d1cf5788SAlex Elder static void rbd_dev_mapping_clear(struct rbd_device *rbd_dev) 1165d1cf5788SAlex Elder { 1166d1cf5788SAlex Elder rbd_dev->mapping.size = 0; 1167d1cf5788SAlex Elder rbd_dev->mapping.features = 0; 1168200a6a8bSAlex Elder } 1169200a6a8bSAlex Elder 
11707d5079aaSHimangi Saraogi static void rbd_segment_name_free(const char *name) 11717d5079aaSHimangi Saraogi { 11727d5079aaSHimangi Saraogi /* The explicit cast here is needed to drop the const qualifier */ 11737d5079aaSHimangi Saraogi 11747d5079aaSHimangi Saraogi kmem_cache_free(rbd_segment_name_cache, (void *)name); 11757d5079aaSHimangi Saraogi } 11767d5079aaSHimangi Saraogi 117798571b5aSAlex Elder static const char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset) 1178602adf40SYehuda Sadeh { 117965ccfe21SAlex Elder char *name; 118065ccfe21SAlex Elder u64 segment; 118165ccfe21SAlex Elder int ret; 11823a96d5cdSJosh Durgin char *name_format; 1183602adf40SYehuda Sadeh 118478c2a44aSAlex Elder name = kmem_cache_alloc(rbd_segment_name_cache, GFP_NOIO); 118565ccfe21SAlex Elder if (!name) 118665ccfe21SAlex Elder return NULL; 118765ccfe21SAlex Elder segment = offset >> rbd_dev->header.obj_order; 11883a96d5cdSJosh Durgin name_format = "%s.%012llx"; 11893a96d5cdSJosh Durgin if (rbd_dev->image_format == 2) 11903a96d5cdSJosh Durgin name_format = "%s.%016llx"; 11912d0ebc5dSIlya Dryomov ret = snprintf(name, CEPH_MAX_OID_NAME_LEN + 1, name_format, 119265ccfe21SAlex Elder rbd_dev->header.object_prefix, segment); 11932d0ebc5dSIlya Dryomov if (ret < 0 || ret > CEPH_MAX_OID_NAME_LEN) { 119465ccfe21SAlex Elder pr_err("error formatting segment name for #%llu (%d)\n", 119565ccfe21SAlex Elder segment, ret); 11967d5079aaSHimangi Saraogi rbd_segment_name_free(name); 119765ccfe21SAlex Elder name = NULL; 119865ccfe21SAlex Elder } 1199602adf40SYehuda Sadeh 120065ccfe21SAlex Elder return name; 120165ccfe21SAlex Elder } 1202602adf40SYehuda Sadeh 120365ccfe21SAlex Elder static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset) 120465ccfe21SAlex Elder { 120565ccfe21SAlex Elder u64 segment_size = (u64) 1 << rbd_dev->header.obj_order; 1206602adf40SYehuda Sadeh 120765ccfe21SAlex Elder return offset & (segment_size - 1); 120865ccfe21SAlex Elder } 120965ccfe21SAlex Elder 
121065ccfe21SAlex Elder static u64 rbd_segment_length(struct rbd_device *rbd_dev, 121165ccfe21SAlex Elder u64 offset, u64 length) 121265ccfe21SAlex Elder { 121365ccfe21SAlex Elder u64 segment_size = (u64) 1 << rbd_dev->header.obj_order; 121465ccfe21SAlex Elder 121565ccfe21SAlex Elder offset &= segment_size - 1; 121665ccfe21SAlex Elder 1217aafb230eSAlex Elder rbd_assert(length <= U64_MAX - offset); 121865ccfe21SAlex Elder if (offset + length > segment_size) 121965ccfe21SAlex Elder length = segment_size - offset; 122065ccfe21SAlex Elder 122165ccfe21SAlex Elder return length; 1222602adf40SYehuda Sadeh } 1223602adf40SYehuda Sadeh 1224602adf40SYehuda Sadeh /* 1225029bcbd8SJosh Durgin * returns the size of an object in the image 1226029bcbd8SJosh Durgin */ 1227029bcbd8SJosh Durgin static u64 rbd_obj_bytes(struct rbd_image_header *header) 1228029bcbd8SJosh Durgin { 1229029bcbd8SJosh Durgin return 1 << header->obj_order; 1230029bcbd8SJosh Durgin } 1231029bcbd8SJosh Durgin 1232029bcbd8SJosh Durgin /* 1233602adf40SYehuda Sadeh * bio helpers 1234602adf40SYehuda Sadeh */ 1235602adf40SYehuda Sadeh 1236602adf40SYehuda Sadeh static void bio_chain_put(struct bio *chain) 1237602adf40SYehuda Sadeh { 1238602adf40SYehuda Sadeh struct bio *tmp; 1239602adf40SYehuda Sadeh 1240602adf40SYehuda Sadeh while (chain) { 1241602adf40SYehuda Sadeh tmp = chain; 1242602adf40SYehuda Sadeh chain = chain->bi_next; 1243602adf40SYehuda Sadeh bio_put(tmp); 1244602adf40SYehuda Sadeh } 1245602adf40SYehuda Sadeh } 1246602adf40SYehuda Sadeh 1247602adf40SYehuda Sadeh /* 1248602adf40SYehuda Sadeh * zeros a bio chain, starting at specific offset 1249602adf40SYehuda Sadeh */ 1250602adf40SYehuda Sadeh static void zero_bio_chain(struct bio *chain, int start_ofs) 1251602adf40SYehuda Sadeh { 12527988613bSKent Overstreet struct bio_vec bv; 12537988613bSKent Overstreet struct bvec_iter iter; 1254602adf40SYehuda Sadeh unsigned long flags; 1255602adf40SYehuda Sadeh void *buf; 1256602adf40SYehuda Sadeh int pos = 0; 
1257602adf40SYehuda Sadeh 1258602adf40SYehuda Sadeh while (chain) { 12597988613bSKent Overstreet bio_for_each_segment(bv, chain, iter) { 12607988613bSKent Overstreet if (pos + bv.bv_len > start_ofs) { 1261602adf40SYehuda Sadeh int remainder = max(start_ofs - pos, 0); 12627988613bSKent Overstreet buf = bvec_kmap_irq(&bv, &flags); 1263602adf40SYehuda Sadeh memset(buf + remainder, 0, 12647988613bSKent Overstreet bv.bv_len - remainder); 12657988613bSKent Overstreet flush_dcache_page(bv.bv_page); 126685b5aaa6SDan Carpenter bvec_kunmap_irq(buf, &flags); 1267602adf40SYehuda Sadeh } 12687988613bSKent Overstreet pos += bv.bv_len; 1269602adf40SYehuda Sadeh } 1270602adf40SYehuda Sadeh 1271602adf40SYehuda Sadeh chain = chain->bi_next; 1272602adf40SYehuda Sadeh } 1273602adf40SYehuda Sadeh } 1274602adf40SYehuda Sadeh 1275602adf40SYehuda Sadeh /* 1276b9434c5bSAlex Elder * similar to zero_bio_chain(), zeros data defined by a page array, 1277b9434c5bSAlex Elder * starting at the given byte offset from the start of the array and 1278b9434c5bSAlex Elder * continuing up to the given end offset. The pages array is 1279b9434c5bSAlex Elder * assumed to be big enough to hold all bytes up to the end. 
1280b9434c5bSAlex Elder */ 1281b9434c5bSAlex Elder static void zero_pages(struct page **pages, u64 offset, u64 end) 1282b9434c5bSAlex Elder { 1283b9434c5bSAlex Elder struct page **page = &pages[offset >> PAGE_SHIFT]; 1284b9434c5bSAlex Elder 1285b9434c5bSAlex Elder rbd_assert(end > offset); 1286b9434c5bSAlex Elder rbd_assert(end - offset <= (u64)SIZE_MAX); 1287b9434c5bSAlex Elder while (offset < end) { 1288b9434c5bSAlex Elder size_t page_offset; 1289b9434c5bSAlex Elder size_t length; 1290b9434c5bSAlex Elder unsigned long flags; 1291b9434c5bSAlex Elder void *kaddr; 1292b9434c5bSAlex Elder 1293491205a8SGeert Uytterhoeven page_offset = offset & ~PAGE_MASK; 1294491205a8SGeert Uytterhoeven length = min_t(size_t, PAGE_SIZE - page_offset, end - offset); 1295b9434c5bSAlex Elder local_irq_save(flags); 1296b9434c5bSAlex Elder kaddr = kmap_atomic(*page); 1297b9434c5bSAlex Elder memset(kaddr + page_offset, 0, length); 1298e2156054SAlex Elder flush_dcache_page(*page); 1299b9434c5bSAlex Elder kunmap_atomic(kaddr); 1300b9434c5bSAlex Elder local_irq_restore(flags); 1301b9434c5bSAlex Elder 1302b9434c5bSAlex Elder offset += length; 1303b9434c5bSAlex Elder page++; 1304b9434c5bSAlex Elder } 1305b9434c5bSAlex Elder } 1306b9434c5bSAlex Elder 1307b9434c5bSAlex Elder /* 1308f7760dadSAlex Elder * Clone a portion of a bio, starting at the given byte offset 1309f7760dadSAlex Elder * and continuing for the number of bytes indicated. 
1310602adf40SYehuda Sadeh */ 1311f7760dadSAlex Elder static struct bio *bio_clone_range(struct bio *bio_src, 1312f7760dadSAlex Elder unsigned int offset, 1313f7760dadSAlex Elder unsigned int len, 1314f7760dadSAlex Elder gfp_t gfpmask) 1315602adf40SYehuda Sadeh { 1316f7760dadSAlex Elder struct bio *bio; 1317602adf40SYehuda Sadeh 13185341a627SKent Overstreet bio = bio_clone(bio_src, gfpmask); 1319f7760dadSAlex Elder if (!bio) 1320f7760dadSAlex Elder return NULL; /* ENOMEM */ 1321f7760dadSAlex Elder 13225341a627SKent Overstreet bio_advance(bio, offset); 13234f024f37SKent Overstreet bio->bi_iter.bi_size = len; 1324602adf40SYehuda Sadeh 1325f7760dadSAlex Elder return bio; 1326602adf40SYehuda Sadeh } 1327602adf40SYehuda Sadeh 1328f7760dadSAlex Elder /* 1329f7760dadSAlex Elder * Clone a portion of a bio chain, starting at the given byte offset 1330f7760dadSAlex Elder * into the first bio in the source chain and continuing for the 1331f7760dadSAlex Elder * number of bytes indicated. The result is another bio chain of 1332f7760dadSAlex Elder * exactly the given length, or a null pointer on error. 1333f7760dadSAlex Elder * 1334f7760dadSAlex Elder * The bio_src and offset parameters are both in-out. On entry they 1335f7760dadSAlex Elder * refer to the first source bio and the offset into that bio where 1336f7760dadSAlex Elder * the start of data to be cloned is located. 1337f7760dadSAlex Elder * 1338f7760dadSAlex Elder * On return, bio_src is updated to refer to the bio in the source 1339f7760dadSAlex Elder * chain that contains first un-cloned byte, and *offset will 1340f7760dadSAlex Elder * contain the offset of that byte within that bio. 
1341f7760dadSAlex Elder */ 1342f7760dadSAlex Elder static struct bio *bio_chain_clone_range(struct bio **bio_src, 1343f7760dadSAlex Elder unsigned int *offset, 1344f7760dadSAlex Elder unsigned int len, 1345f7760dadSAlex Elder gfp_t gfpmask) 1346f7760dadSAlex Elder { 1347f7760dadSAlex Elder struct bio *bi = *bio_src; 1348f7760dadSAlex Elder unsigned int off = *offset; 1349f7760dadSAlex Elder struct bio *chain = NULL; 1350f7760dadSAlex Elder struct bio **end; 1351602adf40SYehuda Sadeh 1352f7760dadSAlex Elder /* Build up a chain of clone bios up to the limit */ 1353602adf40SYehuda Sadeh 13544f024f37SKent Overstreet if (!bi || off >= bi->bi_iter.bi_size || !len) 1355f7760dadSAlex Elder return NULL; /* Nothing to clone */ 1356602adf40SYehuda Sadeh 1357f7760dadSAlex Elder end = &chain; 1358f7760dadSAlex Elder while (len) { 1359f7760dadSAlex Elder unsigned int bi_size; 1360f7760dadSAlex Elder struct bio *bio; 1361f7760dadSAlex Elder 1362f5400b7aSAlex Elder if (!bi) { 1363f5400b7aSAlex Elder rbd_warn(NULL, "bio_chain exhausted with %u left", len); 1364f7760dadSAlex Elder goto out_err; /* EINVAL; ran out of bio's */ 1365f5400b7aSAlex Elder } 13664f024f37SKent Overstreet bi_size = min_t(unsigned int, bi->bi_iter.bi_size - off, len); 1367f7760dadSAlex Elder bio = bio_clone_range(bi, off, bi_size, gfpmask); 1368f7760dadSAlex Elder if (!bio) 1369f7760dadSAlex Elder goto out_err; /* ENOMEM */ 1370f7760dadSAlex Elder 1371f7760dadSAlex Elder *end = bio; 1372f7760dadSAlex Elder end = &bio->bi_next; 1373f7760dadSAlex Elder 1374f7760dadSAlex Elder off += bi_size; 13754f024f37SKent Overstreet if (off == bi->bi_iter.bi_size) { 1376f7760dadSAlex Elder bi = bi->bi_next; 1377f7760dadSAlex Elder off = 0; 1378f7760dadSAlex Elder } 1379f7760dadSAlex Elder len -= bi_size; 1380f7760dadSAlex Elder } 1381f7760dadSAlex Elder *bio_src = bi; 1382f7760dadSAlex Elder *offset = off; 1383f7760dadSAlex Elder 1384f7760dadSAlex Elder return chain; 1385f7760dadSAlex Elder out_err: 1386f7760dadSAlex Elder 
bio_chain_put(chain); 1387f7760dadSAlex Elder 1388602adf40SYehuda Sadeh return NULL; 1389602adf40SYehuda Sadeh } 1390602adf40SYehuda Sadeh 1391926f9b3fSAlex Elder /* 1392926f9b3fSAlex Elder * The default/initial value for all object request flags is 0. For 1393926f9b3fSAlex Elder * each flag, once its value is set to 1 it is never reset to 0 1394926f9b3fSAlex Elder * again. 1395926f9b3fSAlex Elder */ 13966365d33aSAlex Elder static void obj_request_img_data_set(struct rbd_obj_request *obj_request) 13976365d33aSAlex Elder { 13986365d33aSAlex Elder if (test_and_set_bit(OBJ_REQ_IMG_DATA, &obj_request->flags)) { 13996365d33aSAlex Elder struct rbd_device *rbd_dev; 14006365d33aSAlex Elder 140157acbaa7SAlex Elder rbd_dev = obj_request->img_request->rbd_dev; 14029584d508SIlya Dryomov rbd_warn(rbd_dev, "obj_request %p already marked img_data", 14036365d33aSAlex Elder obj_request); 14046365d33aSAlex Elder } 14056365d33aSAlex Elder } 14066365d33aSAlex Elder 14076365d33aSAlex Elder static bool obj_request_img_data_test(struct rbd_obj_request *obj_request) 14086365d33aSAlex Elder { 14096365d33aSAlex Elder smp_mb(); 14106365d33aSAlex Elder return test_bit(OBJ_REQ_IMG_DATA, &obj_request->flags) != 0; 14116365d33aSAlex Elder } 14126365d33aSAlex Elder 141357acbaa7SAlex Elder static void obj_request_done_set(struct rbd_obj_request *obj_request) 141457acbaa7SAlex Elder { 141557acbaa7SAlex Elder if (test_and_set_bit(OBJ_REQ_DONE, &obj_request->flags)) { 141657acbaa7SAlex Elder struct rbd_device *rbd_dev = NULL; 141757acbaa7SAlex Elder 141857acbaa7SAlex Elder if (obj_request_img_data_test(obj_request)) 141957acbaa7SAlex Elder rbd_dev = obj_request->img_request->rbd_dev; 14209584d508SIlya Dryomov rbd_warn(rbd_dev, "obj_request %p already marked done", 142157acbaa7SAlex Elder obj_request); 142257acbaa7SAlex Elder } 142357acbaa7SAlex Elder } 142457acbaa7SAlex Elder 142557acbaa7SAlex Elder static bool obj_request_done_test(struct rbd_obj_request *obj_request) 142657acbaa7SAlex Elder { 
142757acbaa7SAlex Elder smp_mb(); 142857acbaa7SAlex Elder return test_bit(OBJ_REQ_DONE, &obj_request->flags) != 0; 142957acbaa7SAlex Elder } 143057acbaa7SAlex Elder 14315679c59fSAlex Elder /* 14325679c59fSAlex Elder * This sets the KNOWN flag after (possibly) setting the EXISTS 14335679c59fSAlex Elder * flag. The latter is set based on the "exists" value provided. 14345679c59fSAlex Elder * 14355679c59fSAlex Elder * Note that for our purposes once an object exists it never goes 14365679c59fSAlex Elder * away again. It's possible that the response from two existence 14375679c59fSAlex Elder * checks are separated by the creation of the target object, and 14385679c59fSAlex Elder * the first ("doesn't exist") response arrives *after* the second 14395679c59fSAlex Elder * ("does exist"). In that case we ignore the second one. 14405679c59fSAlex Elder */ 14415679c59fSAlex Elder static void obj_request_existence_set(struct rbd_obj_request *obj_request, 14425679c59fSAlex Elder bool exists) 14435679c59fSAlex Elder { 14445679c59fSAlex Elder if (exists) 14455679c59fSAlex Elder set_bit(OBJ_REQ_EXISTS, &obj_request->flags); 14465679c59fSAlex Elder set_bit(OBJ_REQ_KNOWN, &obj_request->flags); 14475679c59fSAlex Elder smp_mb(); 14485679c59fSAlex Elder } 14495679c59fSAlex Elder 14505679c59fSAlex Elder static bool obj_request_known_test(struct rbd_obj_request *obj_request) 14515679c59fSAlex Elder { 14525679c59fSAlex Elder smp_mb(); 14535679c59fSAlex Elder return test_bit(OBJ_REQ_KNOWN, &obj_request->flags) != 0; 14545679c59fSAlex Elder } 14555679c59fSAlex Elder 14565679c59fSAlex Elder static bool obj_request_exists_test(struct rbd_obj_request *obj_request) 14575679c59fSAlex Elder { 14585679c59fSAlex Elder smp_mb(); 14595679c59fSAlex Elder return test_bit(OBJ_REQ_EXISTS, &obj_request->flags) != 0; 14605679c59fSAlex Elder } 14615679c59fSAlex Elder 14629638556aSIlya Dryomov static bool obj_request_overlaps_parent(struct rbd_obj_request *obj_request) 14639638556aSIlya Dryomov { 
14649638556aSIlya Dryomov struct rbd_device *rbd_dev = obj_request->img_request->rbd_dev; 14659638556aSIlya Dryomov 14669638556aSIlya Dryomov return obj_request->img_offset < 14679638556aSIlya Dryomov round_up(rbd_dev->parent_overlap, rbd_obj_bytes(&rbd_dev->header)); 14689638556aSIlya Dryomov } 14699638556aSIlya Dryomov 1470bf0d5f50SAlex Elder static void rbd_obj_request_get(struct rbd_obj_request *obj_request) 1471bf0d5f50SAlex Elder { 147237206ee5SAlex Elder dout("%s: obj %p (was %d)\n", __func__, obj_request, 147337206ee5SAlex Elder atomic_read(&obj_request->kref.refcount)); 1474bf0d5f50SAlex Elder kref_get(&obj_request->kref); 1475bf0d5f50SAlex Elder } 1476bf0d5f50SAlex Elder 1477bf0d5f50SAlex Elder static void rbd_obj_request_destroy(struct kref *kref); 1478bf0d5f50SAlex Elder static void rbd_obj_request_put(struct rbd_obj_request *obj_request) 1479bf0d5f50SAlex Elder { 1480bf0d5f50SAlex Elder rbd_assert(obj_request != NULL); 148137206ee5SAlex Elder dout("%s: obj %p (was %d)\n", __func__, obj_request, 148237206ee5SAlex Elder atomic_read(&obj_request->kref.refcount)); 1483bf0d5f50SAlex Elder kref_put(&obj_request->kref, rbd_obj_request_destroy); 1484bf0d5f50SAlex Elder } 1485bf0d5f50SAlex Elder 14860f2d5be7SAlex Elder static void rbd_img_request_get(struct rbd_img_request *img_request) 14870f2d5be7SAlex Elder { 14880f2d5be7SAlex Elder dout("%s: img %p (was %d)\n", __func__, img_request, 14890f2d5be7SAlex Elder atomic_read(&img_request->kref.refcount)); 14900f2d5be7SAlex Elder kref_get(&img_request->kref); 14910f2d5be7SAlex Elder } 14920f2d5be7SAlex Elder 1493e93f3152SAlex Elder static bool img_request_child_test(struct rbd_img_request *img_request); 1494e93f3152SAlex Elder static void rbd_parent_request_destroy(struct kref *kref); 1495bf0d5f50SAlex Elder static void rbd_img_request_destroy(struct kref *kref); 1496bf0d5f50SAlex Elder static void rbd_img_request_put(struct rbd_img_request *img_request) 1497bf0d5f50SAlex Elder { 1498bf0d5f50SAlex Elder 
rbd_assert(img_request != NULL); 149937206ee5SAlex Elder dout("%s: img %p (was %d)\n", __func__, img_request, 150037206ee5SAlex Elder atomic_read(&img_request->kref.refcount)); 1501e93f3152SAlex Elder if (img_request_child_test(img_request)) 1502e93f3152SAlex Elder kref_put(&img_request->kref, rbd_parent_request_destroy); 1503e93f3152SAlex Elder else 1504bf0d5f50SAlex Elder kref_put(&img_request->kref, rbd_img_request_destroy); 1505bf0d5f50SAlex Elder } 1506bf0d5f50SAlex Elder 1507bf0d5f50SAlex Elder static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request, 1508bf0d5f50SAlex Elder struct rbd_obj_request *obj_request) 1509bf0d5f50SAlex Elder { 151025dcf954SAlex Elder rbd_assert(obj_request->img_request == NULL); 151125dcf954SAlex Elder 1512b155e86cSAlex Elder /* Image request now owns object's original reference */ 1513bf0d5f50SAlex Elder obj_request->img_request = img_request; 151425dcf954SAlex Elder obj_request->which = img_request->obj_request_count; 15156365d33aSAlex Elder rbd_assert(!obj_request_img_data_test(obj_request)); 15166365d33aSAlex Elder obj_request_img_data_set(obj_request); 1517bf0d5f50SAlex Elder rbd_assert(obj_request->which != BAD_WHICH); 151825dcf954SAlex Elder img_request->obj_request_count++; 151925dcf954SAlex Elder list_add_tail(&obj_request->links, &img_request->obj_requests); 152037206ee5SAlex Elder dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request, 152137206ee5SAlex Elder obj_request->which); 1522bf0d5f50SAlex Elder } 1523bf0d5f50SAlex Elder 1524bf0d5f50SAlex Elder static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request, 1525bf0d5f50SAlex Elder struct rbd_obj_request *obj_request) 1526bf0d5f50SAlex Elder { 1527bf0d5f50SAlex Elder rbd_assert(obj_request->which != BAD_WHICH); 152825dcf954SAlex Elder 152937206ee5SAlex Elder dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request, 153037206ee5SAlex Elder obj_request->which); 1531bf0d5f50SAlex Elder 
list_del(&obj_request->links); 153225dcf954SAlex Elder rbd_assert(img_request->obj_request_count > 0); 153325dcf954SAlex Elder img_request->obj_request_count--; 153425dcf954SAlex Elder rbd_assert(obj_request->which == img_request->obj_request_count); 153525dcf954SAlex Elder obj_request->which = BAD_WHICH; 15366365d33aSAlex Elder rbd_assert(obj_request_img_data_test(obj_request)); 1537bf0d5f50SAlex Elder rbd_assert(obj_request->img_request == img_request); 1538bf0d5f50SAlex Elder obj_request->img_request = NULL; 153925dcf954SAlex Elder obj_request->callback = NULL; 1540bf0d5f50SAlex Elder rbd_obj_request_put(obj_request); 1541bf0d5f50SAlex Elder } 1542bf0d5f50SAlex Elder 1543bf0d5f50SAlex Elder static bool obj_request_type_valid(enum obj_request_type type) 1544bf0d5f50SAlex Elder { 1545bf0d5f50SAlex Elder switch (type) { 15469969ebc5SAlex Elder case OBJ_REQUEST_NODATA: 1547bf0d5f50SAlex Elder case OBJ_REQUEST_BIO: 1548788e2df3SAlex Elder case OBJ_REQUEST_PAGES: 1549bf0d5f50SAlex Elder return true; 1550bf0d5f50SAlex Elder default: 1551bf0d5f50SAlex Elder return false; 1552bf0d5f50SAlex Elder } 1553bf0d5f50SAlex Elder } 1554bf0d5f50SAlex Elder 1555bf0d5f50SAlex Elder static int rbd_obj_request_submit(struct ceph_osd_client *osdc, 1556bf0d5f50SAlex Elder struct rbd_obj_request *obj_request) 1557bf0d5f50SAlex Elder { 155871c20a06SIlya Dryomov dout("%s %p\n", __func__, obj_request); 1559bf0d5f50SAlex Elder return ceph_osdc_start_request(osdc, obj_request->osd_req, false); 1560bf0d5f50SAlex Elder } 1561bf0d5f50SAlex Elder 156271c20a06SIlya Dryomov static void rbd_obj_request_end(struct rbd_obj_request *obj_request) 156371c20a06SIlya Dryomov { 156471c20a06SIlya Dryomov dout("%s %p\n", __func__, obj_request); 156571c20a06SIlya Dryomov ceph_osdc_cancel_request(obj_request->osd_req); 156671c20a06SIlya Dryomov } 156771c20a06SIlya Dryomov 156871c20a06SIlya Dryomov /* 156971c20a06SIlya Dryomov * Wait for an object request to complete. 
If interrupted, cancel the 157071c20a06SIlya Dryomov * underlying osd request. 15712894e1d7SIlya Dryomov * 15722894e1d7SIlya Dryomov * @timeout: in jiffies, 0 means "wait forever" 157371c20a06SIlya Dryomov */ 15742894e1d7SIlya Dryomov static int __rbd_obj_request_wait(struct rbd_obj_request *obj_request, 15752894e1d7SIlya Dryomov unsigned long timeout) 157671c20a06SIlya Dryomov { 15772894e1d7SIlya Dryomov long ret; 157871c20a06SIlya Dryomov 157971c20a06SIlya Dryomov dout("%s %p\n", __func__, obj_request); 15802894e1d7SIlya Dryomov ret = wait_for_completion_interruptible_timeout( 15812894e1d7SIlya Dryomov &obj_request->completion, 15822894e1d7SIlya Dryomov ceph_timeout_jiffies(timeout)); 15832894e1d7SIlya Dryomov if (ret <= 0) { 15842894e1d7SIlya Dryomov if (ret == 0) 15852894e1d7SIlya Dryomov ret = -ETIMEDOUT; 158671c20a06SIlya Dryomov rbd_obj_request_end(obj_request); 15872894e1d7SIlya Dryomov } else { 15882894e1d7SIlya Dryomov ret = 0; 15892894e1d7SIlya Dryomov } 15902894e1d7SIlya Dryomov 15912894e1d7SIlya Dryomov dout("%s %p ret %d\n", __func__, obj_request, (int)ret); 159271c20a06SIlya Dryomov return ret; 159371c20a06SIlya Dryomov } 159471c20a06SIlya Dryomov 15952894e1d7SIlya Dryomov static int rbd_obj_request_wait(struct rbd_obj_request *obj_request) 15962894e1d7SIlya Dryomov { 15972894e1d7SIlya Dryomov return __rbd_obj_request_wait(obj_request, 0); 15982894e1d7SIlya Dryomov } 15992894e1d7SIlya Dryomov 16002894e1d7SIlya Dryomov static int rbd_obj_request_wait_timeout(struct rbd_obj_request *obj_request, 16012894e1d7SIlya Dryomov unsigned long timeout) 16022894e1d7SIlya Dryomov { 16032894e1d7SIlya Dryomov return __rbd_obj_request_wait(obj_request, timeout); 160471c20a06SIlya Dryomov } 160571c20a06SIlya Dryomov 1606bf0d5f50SAlex Elder static void rbd_img_request_complete(struct rbd_img_request *img_request) 1607bf0d5f50SAlex Elder { 160855f27e09SAlex Elder 160937206ee5SAlex Elder dout("%s: img %p\n", __func__, img_request); 161055f27e09SAlex Elder 
161155f27e09SAlex Elder /* 161255f27e09SAlex Elder * If no error occurred, compute the aggregate transfer 161355f27e09SAlex Elder * count for the image request. We could instead use 161455f27e09SAlex Elder * atomic64_cmpxchg() to update it as each object request 161555f27e09SAlex Elder * completes; not clear which way is better off hand. 161655f27e09SAlex Elder */ 161755f27e09SAlex Elder if (!img_request->result) { 161855f27e09SAlex Elder struct rbd_obj_request *obj_request; 161955f27e09SAlex Elder u64 xferred = 0; 162055f27e09SAlex Elder 162155f27e09SAlex Elder for_each_obj_request(img_request, obj_request) 162255f27e09SAlex Elder xferred += obj_request->xferred; 162355f27e09SAlex Elder img_request->xferred = xferred; 162455f27e09SAlex Elder } 162555f27e09SAlex Elder 1626bf0d5f50SAlex Elder if (img_request->callback) 1627bf0d5f50SAlex Elder img_request->callback(img_request); 1628bf0d5f50SAlex Elder else 1629bf0d5f50SAlex Elder rbd_img_request_put(img_request); 1630bf0d5f50SAlex Elder } 1631bf0d5f50SAlex Elder 16320c425248SAlex Elder /* 16330c425248SAlex Elder * The default/initial value for all image request flags is 0. Each 16340c425248SAlex Elder * is conditionally set to 1 at image request initialization time 16350c425248SAlex Elder * and currently never change thereafter. 
16360c425248SAlex Elder */ 16370c425248SAlex Elder static void img_request_write_set(struct rbd_img_request *img_request) 16380c425248SAlex Elder { 16390c425248SAlex Elder set_bit(IMG_REQ_WRITE, &img_request->flags); 16400c425248SAlex Elder smp_mb(); 16410c425248SAlex Elder } 16420c425248SAlex Elder 16430c425248SAlex Elder static bool img_request_write_test(struct rbd_img_request *img_request) 16440c425248SAlex Elder { 16450c425248SAlex Elder smp_mb(); 16460c425248SAlex Elder return test_bit(IMG_REQ_WRITE, &img_request->flags) != 0; 16470c425248SAlex Elder } 16480c425248SAlex Elder 164990e98c52SGuangliang Zhao /* 165090e98c52SGuangliang Zhao * Set the discard flag when the img_request is an discard request 165190e98c52SGuangliang Zhao */ 165290e98c52SGuangliang Zhao static void img_request_discard_set(struct rbd_img_request *img_request) 165390e98c52SGuangliang Zhao { 165490e98c52SGuangliang Zhao set_bit(IMG_REQ_DISCARD, &img_request->flags); 165590e98c52SGuangliang Zhao smp_mb(); 165690e98c52SGuangliang Zhao } 165790e98c52SGuangliang Zhao 165890e98c52SGuangliang Zhao static bool img_request_discard_test(struct rbd_img_request *img_request) 165990e98c52SGuangliang Zhao { 166090e98c52SGuangliang Zhao smp_mb(); 166190e98c52SGuangliang Zhao return test_bit(IMG_REQ_DISCARD, &img_request->flags) != 0; 166290e98c52SGuangliang Zhao } 166390e98c52SGuangliang Zhao 16649849e986SAlex Elder static void img_request_child_set(struct rbd_img_request *img_request) 16659849e986SAlex Elder { 16669849e986SAlex Elder set_bit(IMG_REQ_CHILD, &img_request->flags); 16679849e986SAlex Elder smp_mb(); 16689849e986SAlex Elder } 16699849e986SAlex Elder 1670e93f3152SAlex Elder static void img_request_child_clear(struct rbd_img_request *img_request) 1671e93f3152SAlex Elder { 1672e93f3152SAlex Elder clear_bit(IMG_REQ_CHILD, &img_request->flags); 1673e93f3152SAlex Elder smp_mb(); 1674e93f3152SAlex Elder } 1675e93f3152SAlex Elder 16769849e986SAlex Elder static bool img_request_child_test(struct 
rbd_img_request *img_request) 16779849e986SAlex Elder { 16789849e986SAlex Elder smp_mb(); 16799849e986SAlex Elder return test_bit(IMG_REQ_CHILD, &img_request->flags) != 0; 16809849e986SAlex Elder } 16819849e986SAlex Elder 1682d0b2e944SAlex Elder static void img_request_layered_set(struct rbd_img_request *img_request) 1683d0b2e944SAlex Elder { 1684d0b2e944SAlex Elder set_bit(IMG_REQ_LAYERED, &img_request->flags); 1685d0b2e944SAlex Elder smp_mb(); 1686d0b2e944SAlex Elder } 1687d0b2e944SAlex Elder 1688a2acd00eSAlex Elder static void img_request_layered_clear(struct rbd_img_request *img_request) 1689a2acd00eSAlex Elder { 1690a2acd00eSAlex Elder clear_bit(IMG_REQ_LAYERED, &img_request->flags); 1691a2acd00eSAlex Elder smp_mb(); 1692a2acd00eSAlex Elder } 1693a2acd00eSAlex Elder 1694d0b2e944SAlex Elder static bool img_request_layered_test(struct rbd_img_request *img_request) 1695d0b2e944SAlex Elder { 1696d0b2e944SAlex Elder smp_mb(); 1697d0b2e944SAlex Elder return test_bit(IMG_REQ_LAYERED, &img_request->flags) != 0; 1698d0b2e944SAlex Elder } 1699d0b2e944SAlex Elder 17003b434a2aSJosh Durgin static enum obj_operation_type 17013b434a2aSJosh Durgin rbd_img_request_op_type(struct rbd_img_request *img_request) 17023b434a2aSJosh Durgin { 17033b434a2aSJosh Durgin if (img_request_write_test(img_request)) 17043b434a2aSJosh Durgin return OBJ_OP_WRITE; 17053b434a2aSJosh Durgin else if (img_request_discard_test(img_request)) 17063b434a2aSJosh Durgin return OBJ_OP_DISCARD; 17073b434a2aSJosh Durgin else 17083b434a2aSJosh Durgin return OBJ_OP_READ; 17093b434a2aSJosh Durgin } 17103b434a2aSJosh Durgin 17116e2a4505SAlex Elder static void 17126e2a4505SAlex Elder rbd_img_obj_request_read_callback(struct rbd_obj_request *obj_request) 17136e2a4505SAlex Elder { 1714b9434c5bSAlex Elder u64 xferred = obj_request->xferred; 1715b9434c5bSAlex Elder u64 length = obj_request->length; 1716b9434c5bSAlex Elder 17176e2a4505SAlex Elder dout("%s: obj %p img %p result %d %llu/%llu\n", __func__, 
17186e2a4505SAlex Elder obj_request, obj_request->img_request, obj_request->result, 1719b9434c5bSAlex Elder xferred, length); 17206e2a4505SAlex Elder /* 172117c1cc1dSJosh Durgin * ENOENT means a hole in the image. We zero-fill the entire 172217c1cc1dSJosh Durgin * length of the request. A short read also implies zero-fill 172317c1cc1dSJosh Durgin * to the end of the request. An error requires the whole 172417c1cc1dSJosh Durgin * length of the request to be reported finished with an error 172517c1cc1dSJosh Durgin * to the block layer. In each case we update the xferred 172617c1cc1dSJosh Durgin * count to indicate the whole request was satisfied. 17276e2a4505SAlex Elder */ 1728b9434c5bSAlex Elder rbd_assert(obj_request->type != OBJ_REQUEST_NODATA); 17296e2a4505SAlex Elder if (obj_request->result == -ENOENT) { 1730b9434c5bSAlex Elder if (obj_request->type == OBJ_REQUEST_BIO) 17316e2a4505SAlex Elder zero_bio_chain(obj_request->bio_list, 0); 1732b9434c5bSAlex Elder else 1733b9434c5bSAlex Elder zero_pages(obj_request->pages, 0, length); 17346e2a4505SAlex Elder obj_request->result = 0; 1735b9434c5bSAlex Elder } else if (xferred < length && !obj_request->result) { 1736b9434c5bSAlex Elder if (obj_request->type == OBJ_REQUEST_BIO) 1737b9434c5bSAlex Elder zero_bio_chain(obj_request->bio_list, xferred); 1738b9434c5bSAlex Elder else 1739b9434c5bSAlex Elder zero_pages(obj_request->pages, xferred, length); 17406e2a4505SAlex Elder } 174117c1cc1dSJosh Durgin obj_request->xferred = length; 17426e2a4505SAlex Elder obj_request_done_set(obj_request); 17436e2a4505SAlex Elder } 17446e2a4505SAlex Elder 1745bf0d5f50SAlex Elder static void rbd_obj_request_complete(struct rbd_obj_request *obj_request) 1746bf0d5f50SAlex Elder { 174737206ee5SAlex Elder dout("%s: obj %p cb %p\n", __func__, obj_request, 174837206ee5SAlex Elder obj_request->callback); 1749bf0d5f50SAlex Elder if (obj_request->callback) 1750bf0d5f50SAlex Elder obj_request->callback(obj_request); 1751788e2df3SAlex Elder else 
1752788e2df3SAlex Elder complete_all(&obj_request->completion); 1753bf0d5f50SAlex Elder } 1754bf0d5f50SAlex Elder 1755c47f9371SAlex Elder static void rbd_osd_trivial_callback(struct rbd_obj_request *obj_request) 175639bf2c5dSAlex Elder { 175739bf2c5dSAlex Elder dout("%s: obj %p\n", __func__, obj_request); 175839bf2c5dSAlex Elder obj_request_done_set(obj_request); 175939bf2c5dSAlex Elder } 176039bf2c5dSAlex Elder 1761c47f9371SAlex Elder static void rbd_osd_read_callback(struct rbd_obj_request *obj_request) 1762bf0d5f50SAlex Elder { 176357acbaa7SAlex Elder struct rbd_img_request *img_request = NULL; 1764a9e8ba2cSAlex Elder struct rbd_device *rbd_dev = NULL; 176557acbaa7SAlex Elder bool layered = false; 176657acbaa7SAlex Elder 176757acbaa7SAlex Elder if (obj_request_img_data_test(obj_request)) { 176857acbaa7SAlex Elder img_request = obj_request->img_request; 176957acbaa7SAlex Elder layered = img_request && img_request_layered_test(img_request); 1770a9e8ba2cSAlex Elder rbd_dev = img_request->rbd_dev; 177157acbaa7SAlex Elder } 17728b3e1a56SAlex Elder 17738b3e1a56SAlex Elder dout("%s: obj %p img %p result %d %llu/%llu\n", __func__, 17748b3e1a56SAlex Elder obj_request, img_request, obj_request->result, 17758b3e1a56SAlex Elder obj_request->xferred, obj_request->length); 1776a9e8ba2cSAlex Elder if (layered && obj_request->result == -ENOENT && 1777a9e8ba2cSAlex Elder obj_request->img_offset < rbd_dev->parent_overlap) 17788b3e1a56SAlex Elder rbd_img_parent_read(obj_request); 17798b3e1a56SAlex Elder else if (img_request) 17806e2a4505SAlex Elder rbd_img_obj_request_read_callback(obj_request); 17816e2a4505SAlex Elder else 178207741308SAlex Elder obj_request_done_set(obj_request); 1783bf0d5f50SAlex Elder } 1784bf0d5f50SAlex Elder 1785c47f9371SAlex Elder static void rbd_osd_write_callback(struct rbd_obj_request *obj_request) 1786bf0d5f50SAlex Elder { 17871b83bef2SSage Weil dout("%s: obj %p result %d %llu\n", __func__, obj_request, 17881b83bef2SSage Weil obj_request->result, 
obj_request->length); 17891b83bef2SSage Weil /* 17908b3e1a56SAlex Elder * There is no such thing as a successful short write. Set 17918b3e1a56SAlex Elder * it to our originally-requested length. 17921b83bef2SSage Weil */ 17931b83bef2SSage Weil obj_request->xferred = obj_request->length; 179407741308SAlex Elder obj_request_done_set(obj_request); 1795bf0d5f50SAlex Elder } 1796bf0d5f50SAlex Elder 179790e98c52SGuangliang Zhao static void rbd_osd_discard_callback(struct rbd_obj_request *obj_request) 179890e98c52SGuangliang Zhao { 179990e98c52SGuangliang Zhao dout("%s: obj %p result %d %llu\n", __func__, obj_request, 180090e98c52SGuangliang Zhao obj_request->result, obj_request->length); 180190e98c52SGuangliang Zhao /* 180290e98c52SGuangliang Zhao * There is no such thing as a successful short discard. Set 180390e98c52SGuangliang Zhao * it to our originally-requested length. 180490e98c52SGuangliang Zhao */ 180590e98c52SGuangliang Zhao obj_request->xferred = obj_request->length; 1806d0265de7SJosh Durgin /* discarding a non-existent object is not a problem */ 1807d0265de7SJosh Durgin if (obj_request->result == -ENOENT) 1808d0265de7SJosh Durgin obj_request->result = 0; 180990e98c52SGuangliang Zhao obj_request_done_set(obj_request); 181090e98c52SGuangliang Zhao } 181190e98c52SGuangliang Zhao 1812fbfab539SAlex Elder /* 1813fbfab539SAlex Elder * For a simple stat call there's nothing to do. We'll do more if 1814fbfab539SAlex Elder * this is part of a write sequence for a layered image. 
1815fbfab539SAlex Elder */ 1816c47f9371SAlex Elder static void rbd_osd_stat_callback(struct rbd_obj_request *obj_request) 1817fbfab539SAlex Elder { 181837206ee5SAlex Elder dout("%s: obj %p\n", __func__, obj_request); 1819fbfab539SAlex Elder obj_request_done_set(obj_request); 1820fbfab539SAlex Elder } 1821fbfab539SAlex Elder 18222761713dSIlya Dryomov static void rbd_osd_call_callback(struct rbd_obj_request *obj_request) 18232761713dSIlya Dryomov { 18242761713dSIlya Dryomov dout("%s: obj %p\n", __func__, obj_request); 18252761713dSIlya Dryomov 18262761713dSIlya Dryomov if (obj_request_img_data_test(obj_request)) 18272761713dSIlya Dryomov rbd_osd_copyup_callback(obj_request); 18282761713dSIlya Dryomov else 18292761713dSIlya Dryomov obj_request_done_set(obj_request); 18302761713dSIlya Dryomov } 18312761713dSIlya Dryomov 1832bf0d5f50SAlex Elder static void rbd_osd_req_callback(struct ceph_osd_request *osd_req, 1833bf0d5f50SAlex Elder struct ceph_msg *msg) 1834bf0d5f50SAlex Elder { 1835bf0d5f50SAlex Elder struct rbd_obj_request *obj_request = osd_req->r_priv; 1836bf0d5f50SAlex Elder u16 opcode; 1837bf0d5f50SAlex Elder 183837206ee5SAlex Elder dout("%s: osd_req %p msg %p\n", __func__, osd_req, msg); 1839bf0d5f50SAlex Elder rbd_assert(osd_req == obj_request->osd_req); 184057acbaa7SAlex Elder if (obj_request_img_data_test(obj_request)) { 184157acbaa7SAlex Elder rbd_assert(obj_request->img_request); 184257acbaa7SAlex Elder rbd_assert(obj_request->which != BAD_WHICH); 184357acbaa7SAlex Elder } else { 184457acbaa7SAlex Elder rbd_assert(obj_request->which == BAD_WHICH); 184557acbaa7SAlex Elder } 1846bf0d5f50SAlex Elder 18471b83bef2SSage Weil if (osd_req->r_result < 0) 18481b83bef2SSage Weil obj_request->result = osd_req->r_result; 1849bf0d5f50SAlex Elder 18507cc69d42SIlya Dryomov rbd_assert(osd_req->r_num_ops <= CEPH_OSD_MAX_OP); 1851bf0d5f50SAlex Elder 1852c47f9371SAlex Elder /* 1853c47f9371SAlex Elder * We support a 64-bit length, but ultimately it has to be 
18547ad18afaSChristoph Hellwig * passed to the block layer, which just supports a 32-bit 18557ad18afaSChristoph Hellwig * length field. 1856c47f9371SAlex Elder */ 18571b83bef2SSage Weil obj_request->xferred = osd_req->r_reply_op_len[0]; 1858c47f9371SAlex Elder rbd_assert(obj_request->xferred < (u64)UINT_MAX); 18590ccd5926SIlya Dryomov 186079528734SAlex Elder opcode = osd_req->r_ops[0].op; 1861bf0d5f50SAlex Elder switch (opcode) { 1862bf0d5f50SAlex Elder case CEPH_OSD_OP_READ: 1863c47f9371SAlex Elder rbd_osd_read_callback(obj_request); 1864bf0d5f50SAlex Elder break; 18650ccd5926SIlya Dryomov case CEPH_OSD_OP_SETALLOCHINT: 1866e30b7577SIlya Dryomov rbd_assert(osd_req->r_ops[1].op == CEPH_OSD_OP_WRITE || 1867e30b7577SIlya Dryomov osd_req->r_ops[1].op == CEPH_OSD_OP_WRITEFULL); 18680ccd5926SIlya Dryomov /* fall through */ 1869bf0d5f50SAlex Elder case CEPH_OSD_OP_WRITE: 1870e30b7577SIlya Dryomov case CEPH_OSD_OP_WRITEFULL: 1871c47f9371SAlex Elder rbd_osd_write_callback(obj_request); 1872bf0d5f50SAlex Elder break; 1873fbfab539SAlex Elder case CEPH_OSD_OP_STAT: 1874c47f9371SAlex Elder rbd_osd_stat_callback(obj_request); 1875fbfab539SAlex Elder break; 187690e98c52SGuangliang Zhao case CEPH_OSD_OP_DELETE: 187790e98c52SGuangliang Zhao case CEPH_OSD_OP_TRUNCATE: 187890e98c52SGuangliang Zhao case CEPH_OSD_OP_ZERO: 187990e98c52SGuangliang Zhao rbd_osd_discard_callback(obj_request); 188090e98c52SGuangliang Zhao break; 188136be9a76SAlex Elder case CEPH_OSD_OP_CALL: 18822761713dSIlya Dryomov rbd_osd_call_callback(obj_request); 18832761713dSIlya Dryomov break; 1884b8d70035SAlex Elder case CEPH_OSD_OP_NOTIFY_ACK: 18859969ebc5SAlex Elder case CEPH_OSD_OP_WATCH: 1886c47f9371SAlex Elder rbd_osd_trivial_callback(obj_request); 18879969ebc5SAlex Elder break; 1888bf0d5f50SAlex Elder default: 18899584d508SIlya Dryomov rbd_warn(NULL, "%s: unsupported op %hu", 1890bf0d5f50SAlex Elder obj_request->object_name, (unsigned short) opcode); 1891bf0d5f50SAlex Elder break; 1892bf0d5f50SAlex Elder } 
1893bf0d5f50SAlex Elder 189407741308SAlex Elder if (obj_request_done_test(obj_request)) 1895bf0d5f50SAlex Elder rbd_obj_request_complete(obj_request); 1896bf0d5f50SAlex Elder } 1897bf0d5f50SAlex Elder 18989d4df01fSAlex Elder static void rbd_osd_req_format_read(struct rbd_obj_request *obj_request) 1899430c28c3SAlex Elder { 1900430c28c3SAlex Elder struct rbd_img_request *img_request = obj_request->img_request; 19018c042b0dSAlex Elder struct ceph_osd_request *osd_req = obj_request->osd_req; 19029d4df01fSAlex Elder u64 snap_id; 1903430c28c3SAlex Elder 19048c042b0dSAlex Elder rbd_assert(osd_req != NULL); 1905430c28c3SAlex Elder 19069d4df01fSAlex Elder snap_id = img_request ? img_request->snap_id : CEPH_NOSNAP; 19078c042b0dSAlex Elder ceph_osdc_build_request(osd_req, obj_request->offset, 19089d4df01fSAlex Elder NULL, snap_id, NULL); 19099d4df01fSAlex Elder } 19109d4df01fSAlex Elder 19119d4df01fSAlex Elder static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request) 19129d4df01fSAlex Elder { 19139d4df01fSAlex Elder struct rbd_img_request *img_request = obj_request->img_request; 19149d4df01fSAlex Elder struct ceph_osd_request *osd_req = obj_request->osd_req; 19159d4df01fSAlex Elder struct ceph_snap_context *snapc; 19169d4df01fSAlex Elder struct timespec mtime = CURRENT_TIME; 19179d4df01fSAlex Elder 19189d4df01fSAlex Elder rbd_assert(osd_req != NULL); 19199d4df01fSAlex Elder 19209d4df01fSAlex Elder snapc = img_request ? img_request->snapc : NULL; 19219d4df01fSAlex Elder ceph_osdc_build_request(osd_req, obj_request->offset, 19229d4df01fSAlex Elder snapc, CEPH_NOSNAP, &mtime); 1923430c28c3SAlex Elder } 1924430c28c3SAlex Elder 19250ccd5926SIlya Dryomov /* 19260ccd5926SIlya Dryomov * Create an osd request. A read request has one osd op (read). 19270ccd5926SIlya Dryomov * A write request has either one (watch) or two (hint+write) osd ops. 
19280ccd5926SIlya Dryomov * (All rbd data writes are prefixed with an allocation hint op, but 19290ccd5926SIlya Dryomov * technically osd watch is a write request, hence this distinction.) 19300ccd5926SIlya Dryomov */ 1931bf0d5f50SAlex Elder static struct ceph_osd_request *rbd_osd_req_create( 1932bf0d5f50SAlex Elder struct rbd_device *rbd_dev, 19336d2940c8SGuangliang Zhao enum obj_operation_type op_type, 1934deb236b3SIlya Dryomov unsigned int num_ops, 1935430c28c3SAlex Elder struct rbd_obj_request *obj_request) 1936bf0d5f50SAlex Elder { 1937bf0d5f50SAlex Elder struct ceph_snap_context *snapc = NULL; 1938bf0d5f50SAlex Elder struct ceph_osd_client *osdc; 1939bf0d5f50SAlex Elder struct ceph_osd_request *osd_req; 1940bf0d5f50SAlex Elder 194190e98c52SGuangliang Zhao if (obj_request_img_data_test(obj_request) && 194290e98c52SGuangliang Zhao (op_type == OBJ_OP_DISCARD || op_type == OBJ_OP_WRITE)) { 19436365d33aSAlex Elder struct rbd_img_request *img_request = obj_request->img_request; 194490e98c52SGuangliang Zhao if (op_type == OBJ_OP_WRITE) { 19456d2940c8SGuangliang Zhao rbd_assert(img_request_write_test(img_request)); 194690e98c52SGuangliang Zhao } else { 194790e98c52SGuangliang Zhao rbd_assert(img_request_discard_test(img_request)); 194890e98c52SGuangliang Zhao } 1949bf0d5f50SAlex Elder snapc = img_request->snapc; 1950bf0d5f50SAlex Elder } 1951bf0d5f50SAlex Elder 19526d2940c8SGuangliang Zhao rbd_assert(num_ops == 1 || ((op_type == OBJ_OP_WRITE) && num_ops == 2)); 1953deb236b3SIlya Dryomov 1954deb236b3SIlya Dryomov /* Allocate and initialize the request, for the num_ops ops */ 1955bf0d5f50SAlex Elder 1956bf0d5f50SAlex Elder osdc = &rbd_dev->rbd_client->client->osdc; 1957deb236b3SIlya Dryomov osd_req = ceph_osdc_alloc_request(osdc, snapc, num_ops, false, 1958deb236b3SIlya Dryomov GFP_ATOMIC); 1959bf0d5f50SAlex Elder if (!osd_req) 1960bf0d5f50SAlex Elder return NULL; /* ENOMEM */ 1961bf0d5f50SAlex Elder 196290e98c52SGuangliang Zhao if (op_type == OBJ_OP_WRITE || op_type 
== OBJ_OP_DISCARD) 1963bf0d5f50SAlex Elder osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK; 1964430c28c3SAlex Elder else 1965bf0d5f50SAlex Elder osd_req->r_flags = CEPH_OSD_FLAG_READ; 1966bf0d5f50SAlex Elder 1967bf0d5f50SAlex Elder osd_req->r_callback = rbd_osd_req_callback; 1968bf0d5f50SAlex Elder osd_req->r_priv = obj_request; 1969bf0d5f50SAlex Elder 19703c972c95SIlya Dryomov osd_req->r_base_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout); 19713c972c95SIlya Dryomov ceph_oid_set_name(&osd_req->r_base_oid, obj_request->object_name); 1972bf0d5f50SAlex Elder 1973bf0d5f50SAlex Elder return osd_req; 1974bf0d5f50SAlex Elder } 1975bf0d5f50SAlex Elder 19760eefd470SAlex Elder /* 1977d3246fb0SJosh Durgin * Create a copyup osd request based on the information in the object 1978d3246fb0SJosh Durgin * request supplied. A copyup request has two or three osd ops, a 1979d3246fb0SJosh Durgin * copyup method call, potentially a hint op, and a write or truncate 1980d3246fb0SJosh Durgin * or zero op. 
19810eefd470SAlex Elder */ 19820eefd470SAlex Elder static struct ceph_osd_request * 19830eefd470SAlex Elder rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request) 19840eefd470SAlex Elder { 19850eefd470SAlex Elder struct rbd_img_request *img_request; 19860eefd470SAlex Elder struct ceph_snap_context *snapc; 19870eefd470SAlex Elder struct rbd_device *rbd_dev; 19880eefd470SAlex Elder struct ceph_osd_client *osdc; 19890eefd470SAlex Elder struct ceph_osd_request *osd_req; 1990d3246fb0SJosh Durgin int num_osd_ops = 3; 19910eefd470SAlex Elder 19920eefd470SAlex Elder rbd_assert(obj_request_img_data_test(obj_request)); 19930eefd470SAlex Elder img_request = obj_request->img_request; 19940eefd470SAlex Elder rbd_assert(img_request); 1995d3246fb0SJosh Durgin rbd_assert(img_request_write_test(img_request) || 1996d3246fb0SJosh Durgin img_request_discard_test(img_request)); 19970eefd470SAlex Elder 1998d3246fb0SJosh Durgin if (img_request_discard_test(img_request)) 1999d3246fb0SJosh Durgin num_osd_ops = 2; 2000d3246fb0SJosh Durgin 2001d3246fb0SJosh Durgin /* Allocate and initialize the request, for all the ops */ 20020eefd470SAlex Elder 20030eefd470SAlex Elder snapc = img_request->snapc; 20040eefd470SAlex Elder rbd_dev = img_request->rbd_dev; 20050eefd470SAlex Elder osdc = &rbd_dev->rbd_client->client->osdc; 2006d3246fb0SJosh Durgin osd_req = ceph_osdc_alloc_request(osdc, snapc, num_osd_ops, 2007d3246fb0SJosh Durgin false, GFP_ATOMIC); 20080eefd470SAlex Elder if (!osd_req) 20090eefd470SAlex Elder return NULL; /* ENOMEM */ 20100eefd470SAlex Elder 20110eefd470SAlex Elder osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK; 20120eefd470SAlex Elder osd_req->r_callback = rbd_osd_req_callback; 20130eefd470SAlex Elder osd_req->r_priv = obj_request; 20140eefd470SAlex Elder 20153c972c95SIlya Dryomov osd_req->r_base_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout); 20163c972c95SIlya Dryomov ceph_oid_set_name(&osd_req->r_base_oid, obj_request->object_name); 
20170eefd470SAlex Elder 20180eefd470SAlex Elder return osd_req; 20190eefd470SAlex Elder } 20200eefd470SAlex Elder 20210eefd470SAlex Elder 2022bf0d5f50SAlex Elder static void rbd_osd_req_destroy(struct ceph_osd_request *osd_req) 2023bf0d5f50SAlex Elder { 2024bf0d5f50SAlex Elder ceph_osdc_put_request(osd_req); 2025bf0d5f50SAlex Elder } 2026bf0d5f50SAlex Elder 2027bf0d5f50SAlex Elder /* object_name is assumed to be a non-null pointer and NUL-terminated */ 2028bf0d5f50SAlex Elder 2029bf0d5f50SAlex Elder static struct rbd_obj_request *rbd_obj_request_create(const char *object_name, 2030bf0d5f50SAlex Elder u64 offset, u64 length, 2031bf0d5f50SAlex Elder enum obj_request_type type) 2032bf0d5f50SAlex Elder { 2033bf0d5f50SAlex Elder struct rbd_obj_request *obj_request; 2034bf0d5f50SAlex Elder size_t size; 2035bf0d5f50SAlex Elder char *name; 2036bf0d5f50SAlex Elder 2037bf0d5f50SAlex Elder rbd_assert(obj_request_type_valid(type)); 2038bf0d5f50SAlex Elder 2039bf0d5f50SAlex Elder size = strlen(object_name) + 1; 20405a60e876SIlya Dryomov name = kmalloc(size, GFP_NOIO); 2041f907ad55SAlex Elder if (!name) 2042bf0d5f50SAlex Elder return NULL; 2043bf0d5f50SAlex Elder 20445a60e876SIlya Dryomov obj_request = kmem_cache_zalloc(rbd_obj_request_cache, GFP_NOIO); 2045f907ad55SAlex Elder if (!obj_request) { 2046f907ad55SAlex Elder kfree(name); 2047f907ad55SAlex Elder return NULL; 2048f907ad55SAlex Elder } 2049f907ad55SAlex Elder 2050bf0d5f50SAlex Elder obj_request->object_name = memcpy(name, object_name, size); 2051bf0d5f50SAlex Elder obj_request->offset = offset; 2052bf0d5f50SAlex Elder obj_request->length = length; 2053926f9b3fSAlex Elder obj_request->flags = 0; 2054bf0d5f50SAlex Elder obj_request->which = BAD_WHICH; 2055bf0d5f50SAlex Elder obj_request->type = type; 2056bf0d5f50SAlex Elder INIT_LIST_HEAD(&obj_request->links); 2057788e2df3SAlex Elder init_completion(&obj_request->completion); 2058bf0d5f50SAlex Elder kref_init(&obj_request->kref); 2059bf0d5f50SAlex Elder 206037206ee5SAlex 
Elder dout("%s: \"%s\" %llu/%llu %d -> obj %p\n", __func__, object_name, 206137206ee5SAlex Elder offset, length, (int)type, obj_request); 206237206ee5SAlex Elder 2063bf0d5f50SAlex Elder return obj_request; 2064bf0d5f50SAlex Elder } 2065bf0d5f50SAlex Elder 2066bf0d5f50SAlex Elder static void rbd_obj_request_destroy(struct kref *kref) 2067bf0d5f50SAlex Elder { 2068bf0d5f50SAlex Elder struct rbd_obj_request *obj_request; 2069bf0d5f50SAlex Elder 2070bf0d5f50SAlex Elder obj_request = container_of(kref, struct rbd_obj_request, kref); 2071bf0d5f50SAlex Elder 207237206ee5SAlex Elder dout("%s: obj %p\n", __func__, obj_request); 207337206ee5SAlex Elder 2074bf0d5f50SAlex Elder rbd_assert(obj_request->img_request == NULL); 2075bf0d5f50SAlex Elder rbd_assert(obj_request->which == BAD_WHICH); 2076bf0d5f50SAlex Elder 2077bf0d5f50SAlex Elder if (obj_request->osd_req) 2078bf0d5f50SAlex Elder rbd_osd_req_destroy(obj_request->osd_req); 2079bf0d5f50SAlex Elder 2080bf0d5f50SAlex Elder rbd_assert(obj_request_type_valid(obj_request->type)); 2081bf0d5f50SAlex Elder switch (obj_request->type) { 20829969ebc5SAlex Elder case OBJ_REQUEST_NODATA: 20839969ebc5SAlex Elder break; /* Nothing to do */ 2084bf0d5f50SAlex Elder case OBJ_REQUEST_BIO: 2085bf0d5f50SAlex Elder if (obj_request->bio_list) 2086bf0d5f50SAlex Elder bio_chain_put(obj_request->bio_list); 2087bf0d5f50SAlex Elder break; 2088788e2df3SAlex Elder case OBJ_REQUEST_PAGES: 2089788e2df3SAlex Elder if (obj_request->pages) 2090788e2df3SAlex Elder ceph_release_page_vector(obj_request->pages, 2091788e2df3SAlex Elder obj_request->page_count); 2092788e2df3SAlex Elder break; 2093bf0d5f50SAlex Elder } 2094bf0d5f50SAlex Elder 2095f907ad55SAlex Elder kfree(obj_request->object_name); 2096868311b1SAlex Elder obj_request->object_name = NULL; 2097868311b1SAlex Elder kmem_cache_free(rbd_obj_request_cache, obj_request); 2098bf0d5f50SAlex Elder } 2099bf0d5f50SAlex Elder 2100fb65d228SAlex Elder /* It's OK to call this for a device with no parent */ 
2101fb65d228SAlex Elder 2102fb65d228SAlex Elder static void rbd_spec_put(struct rbd_spec *spec); 2103fb65d228SAlex Elder static void rbd_dev_unparent(struct rbd_device *rbd_dev) 2104fb65d228SAlex Elder { 2105fb65d228SAlex Elder rbd_dev_remove_parent(rbd_dev); 2106fb65d228SAlex Elder rbd_spec_put(rbd_dev->parent_spec); 2107fb65d228SAlex Elder rbd_dev->parent_spec = NULL; 2108fb65d228SAlex Elder rbd_dev->parent_overlap = 0; 2109fb65d228SAlex Elder } 2110fb65d228SAlex Elder 2111bf0d5f50SAlex Elder /* 2112a2acd00eSAlex Elder * Parent image reference counting is used to determine when an 2113a2acd00eSAlex Elder * image's parent fields can be safely torn down--after there are no 2114a2acd00eSAlex Elder * more in-flight requests to the parent image. When the last 2115a2acd00eSAlex Elder * reference is dropped, cleaning them up is safe. 2116a2acd00eSAlex Elder */ 2117a2acd00eSAlex Elder static void rbd_dev_parent_put(struct rbd_device *rbd_dev) 2118a2acd00eSAlex Elder { 2119a2acd00eSAlex Elder int counter; 2120a2acd00eSAlex Elder 2121a2acd00eSAlex Elder if (!rbd_dev->parent_spec) 2122a2acd00eSAlex Elder return; 2123a2acd00eSAlex Elder 2124a2acd00eSAlex Elder counter = atomic_dec_return_safe(&rbd_dev->parent_ref); 2125a2acd00eSAlex Elder if (counter > 0) 2126a2acd00eSAlex Elder return; 2127a2acd00eSAlex Elder 2128a2acd00eSAlex Elder /* Last reference; clean up parent data structures */ 2129a2acd00eSAlex Elder 2130a2acd00eSAlex Elder if (!counter) 2131a2acd00eSAlex Elder rbd_dev_unparent(rbd_dev); 2132a2acd00eSAlex Elder else 21339584d508SIlya Dryomov rbd_warn(rbd_dev, "parent reference underflow"); 2134a2acd00eSAlex Elder } 2135a2acd00eSAlex Elder 2136a2acd00eSAlex Elder /* 2137a2acd00eSAlex Elder * If an image has a non-zero parent overlap, get a reference to its 2138a2acd00eSAlex Elder * parent. 
2139a2acd00eSAlex Elder * 2140a2acd00eSAlex Elder * Returns true if the rbd device has a parent with a non-zero 2141a2acd00eSAlex Elder * overlap and a reference for it was successfully taken, or 2142a2acd00eSAlex Elder * false otherwise. 2143a2acd00eSAlex Elder */ 2144a2acd00eSAlex Elder static bool rbd_dev_parent_get(struct rbd_device *rbd_dev) 2145a2acd00eSAlex Elder { 2146ae43e9d0SIlya Dryomov int counter = 0; 2147a2acd00eSAlex Elder 2148a2acd00eSAlex Elder if (!rbd_dev->parent_spec) 2149a2acd00eSAlex Elder return false; 2150a2acd00eSAlex Elder 2151ae43e9d0SIlya Dryomov down_read(&rbd_dev->header_rwsem); 2152ae43e9d0SIlya Dryomov if (rbd_dev->parent_overlap) 2153a2acd00eSAlex Elder counter = atomic_inc_return_safe(&rbd_dev->parent_ref); 2154ae43e9d0SIlya Dryomov up_read(&rbd_dev->header_rwsem); 2155a2acd00eSAlex Elder 2156a2acd00eSAlex Elder if (counter < 0) 21579584d508SIlya Dryomov rbd_warn(rbd_dev, "parent reference overflow"); 2158a2acd00eSAlex Elder 2159ae43e9d0SIlya Dryomov return counter > 0; 2160a2acd00eSAlex Elder } 2161a2acd00eSAlex Elder 2162bf0d5f50SAlex Elder /* 2163bf0d5f50SAlex Elder * Caller is responsible for filling in the list of object requests 2164bf0d5f50SAlex Elder * that comprises the image request, and the Linux request pointer 2165bf0d5f50SAlex Elder * (if there is one). 
2166bf0d5f50SAlex Elder */ 2167cc344fa1SAlex Elder static struct rbd_img_request *rbd_img_request_create( 2168cc344fa1SAlex Elder struct rbd_device *rbd_dev, 2169bf0d5f50SAlex Elder u64 offset, u64 length, 21706d2940c8SGuangliang Zhao enum obj_operation_type op_type, 21714e752f0aSJosh Durgin struct ceph_snap_context *snapc) 2172bf0d5f50SAlex Elder { 2173bf0d5f50SAlex Elder struct rbd_img_request *img_request; 2174bf0d5f50SAlex Elder 21757a716aacSIlya Dryomov img_request = kmem_cache_alloc(rbd_img_request_cache, GFP_NOIO); 2176bf0d5f50SAlex Elder if (!img_request) 2177bf0d5f50SAlex Elder return NULL; 2178bf0d5f50SAlex Elder 2179bf0d5f50SAlex Elder img_request->rq = NULL; 2180bf0d5f50SAlex Elder img_request->rbd_dev = rbd_dev; 2181bf0d5f50SAlex Elder img_request->offset = offset; 2182bf0d5f50SAlex Elder img_request->length = length; 21830c425248SAlex Elder img_request->flags = 0; 218490e98c52SGuangliang Zhao if (op_type == OBJ_OP_DISCARD) { 218590e98c52SGuangliang Zhao img_request_discard_set(img_request); 218690e98c52SGuangliang Zhao img_request->snapc = snapc; 218790e98c52SGuangliang Zhao } else if (op_type == OBJ_OP_WRITE) { 21880c425248SAlex Elder img_request_write_set(img_request); 21894e752f0aSJosh Durgin img_request->snapc = snapc; 21900c425248SAlex Elder } else { 2191bf0d5f50SAlex Elder img_request->snap_id = rbd_dev->spec->snap_id; 21920c425248SAlex Elder } 2193a2acd00eSAlex Elder if (rbd_dev_parent_get(rbd_dev)) 2194d0b2e944SAlex Elder img_request_layered_set(img_request); 2195bf0d5f50SAlex Elder spin_lock_init(&img_request->completion_lock); 2196bf0d5f50SAlex Elder img_request->next_completion = 0; 2197bf0d5f50SAlex Elder img_request->callback = NULL; 2198a5a337d4SAlex Elder img_request->result = 0; 2199bf0d5f50SAlex Elder img_request->obj_request_count = 0; 2200bf0d5f50SAlex Elder INIT_LIST_HEAD(&img_request->obj_requests); 2201bf0d5f50SAlex Elder kref_init(&img_request->kref); 2202bf0d5f50SAlex Elder 220337206ee5SAlex Elder dout("%s: rbd_dev %p %s 
%llu/%llu -> img %p\n", __func__, rbd_dev, 22046d2940c8SGuangliang Zhao obj_op_name(op_type), offset, length, img_request); 220537206ee5SAlex Elder 2206bf0d5f50SAlex Elder return img_request; 2207bf0d5f50SAlex Elder } 2208bf0d5f50SAlex Elder 2209bf0d5f50SAlex Elder static void rbd_img_request_destroy(struct kref *kref) 2210bf0d5f50SAlex Elder { 2211bf0d5f50SAlex Elder struct rbd_img_request *img_request; 2212bf0d5f50SAlex Elder struct rbd_obj_request *obj_request; 2213bf0d5f50SAlex Elder struct rbd_obj_request *next_obj_request; 2214bf0d5f50SAlex Elder 2215bf0d5f50SAlex Elder img_request = container_of(kref, struct rbd_img_request, kref); 2216bf0d5f50SAlex Elder 221737206ee5SAlex Elder dout("%s: img %p\n", __func__, img_request); 221837206ee5SAlex Elder 2219bf0d5f50SAlex Elder for_each_obj_request_safe(img_request, obj_request, next_obj_request) 2220bf0d5f50SAlex Elder rbd_img_obj_request_del(img_request, obj_request); 222125dcf954SAlex Elder rbd_assert(img_request->obj_request_count == 0); 2222bf0d5f50SAlex Elder 2223a2acd00eSAlex Elder if (img_request_layered_test(img_request)) { 2224a2acd00eSAlex Elder img_request_layered_clear(img_request); 2225a2acd00eSAlex Elder rbd_dev_parent_put(img_request->rbd_dev); 2226a2acd00eSAlex Elder } 2227a2acd00eSAlex Elder 2228bef95455SJosh Durgin if (img_request_write_test(img_request) || 2229bef95455SJosh Durgin img_request_discard_test(img_request)) 2230812164f8SAlex Elder ceph_put_snap_context(img_request->snapc); 2231bf0d5f50SAlex Elder 22321c2a9dfeSAlex Elder kmem_cache_free(rbd_img_request_cache, img_request); 2233bf0d5f50SAlex Elder } 2234bf0d5f50SAlex Elder 2235e93f3152SAlex Elder static struct rbd_img_request *rbd_parent_request_create( 2236e93f3152SAlex Elder struct rbd_obj_request *obj_request, 2237e93f3152SAlex Elder u64 img_offset, u64 length) 2238e93f3152SAlex Elder { 2239e93f3152SAlex Elder struct rbd_img_request *parent_request; 2240e93f3152SAlex Elder struct rbd_device *rbd_dev; 2241e93f3152SAlex Elder 
2242e93f3152SAlex Elder rbd_assert(obj_request->img_request); 2243e93f3152SAlex Elder rbd_dev = obj_request->img_request->rbd_dev; 2244e93f3152SAlex Elder 22454e752f0aSJosh Durgin parent_request = rbd_img_request_create(rbd_dev->parent, img_offset, 22466d2940c8SGuangliang Zhao length, OBJ_OP_READ, NULL); 2247e93f3152SAlex Elder if (!parent_request) 2248e93f3152SAlex Elder return NULL; 2249e93f3152SAlex Elder 2250e93f3152SAlex Elder img_request_child_set(parent_request); 2251e93f3152SAlex Elder rbd_obj_request_get(obj_request); 2252e93f3152SAlex Elder parent_request->obj_request = obj_request; 2253e93f3152SAlex Elder 2254e93f3152SAlex Elder return parent_request; 2255e93f3152SAlex Elder } 2256e93f3152SAlex Elder 2257e93f3152SAlex Elder static void rbd_parent_request_destroy(struct kref *kref) 2258e93f3152SAlex Elder { 2259e93f3152SAlex Elder struct rbd_img_request *parent_request; 2260e93f3152SAlex Elder struct rbd_obj_request *orig_request; 2261e93f3152SAlex Elder 2262e93f3152SAlex Elder parent_request = container_of(kref, struct rbd_img_request, kref); 2263e93f3152SAlex Elder orig_request = parent_request->obj_request; 2264e93f3152SAlex Elder 2265e93f3152SAlex Elder parent_request->obj_request = NULL; 2266e93f3152SAlex Elder rbd_obj_request_put(orig_request); 2267e93f3152SAlex Elder img_request_child_clear(parent_request); 2268e93f3152SAlex Elder 2269e93f3152SAlex Elder rbd_img_request_destroy(kref); 2270e93f3152SAlex Elder } 2271e93f3152SAlex Elder 22721217857fSAlex Elder static bool rbd_img_obj_end_request(struct rbd_obj_request *obj_request) 22731217857fSAlex Elder { 22746365d33aSAlex Elder struct rbd_img_request *img_request; 22751217857fSAlex Elder unsigned int xferred; 22761217857fSAlex Elder int result; 22778b3e1a56SAlex Elder bool more; 22781217857fSAlex Elder 22796365d33aSAlex Elder rbd_assert(obj_request_img_data_test(obj_request)); 22806365d33aSAlex Elder img_request = obj_request->img_request; 22816365d33aSAlex Elder 22821217857fSAlex Elder 
rbd_assert(obj_request->xferred <= (u64)UINT_MAX); 22831217857fSAlex Elder xferred = (unsigned int)obj_request->xferred; 22841217857fSAlex Elder result = obj_request->result; 22851217857fSAlex Elder if (result) { 22861217857fSAlex Elder struct rbd_device *rbd_dev = img_request->rbd_dev; 22876d2940c8SGuangliang Zhao enum obj_operation_type op_type; 22886d2940c8SGuangliang Zhao 228990e98c52SGuangliang Zhao if (img_request_discard_test(img_request)) 229090e98c52SGuangliang Zhao op_type = OBJ_OP_DISCARD; 229190e98c52SGuangliang Zhao else if (img_request_write_test(img_request)) 229290e98c52SGuangliang Zhao op_type = OBJ_OP_WRITE; 229390e98c52SGuangliang Zhao else 229490e98c52SGuangliang Zhao op_type = OBJ_OP_READ; 22951217857fSAlex Elder 22969584d508SIlya Dryomov rbd_warn(rbd_dev, "%s %llx at %llx (%llx)", 22976d2940c8SGuangliang Zhao obj_op_name(op_type), obj_request->length, 22986d2940c8SGuangliang Zhao obj_request->img_offset, obj_request->offset); 22999584d508SIlya Dryomov rbd_warn(rbd_dev, " result %d xferred %x", 23001217857fSAlex Elder result, xferred); 23011217857fSAlex Elder if (!img_request->result) 23021217857fSAlex Elder img_request->result = result; 2303082a75daSIlya Dryomov /* 2304082a75daSIlya Dryomov * Need to end I/O on the entire obj_request worth of 2305082a75daSIlya Dryomov * bytes in case of error. 
2306082a75daSIlya Dryomov */ 2307082a75daSIlya Dryomov xferred = obj_request->length; 23081217857fSAlex Elder } 23091217857fSAlex Elder 2310f1a4739fSAlex Elder /* Image object requests don't own their page array */ 2311f1a4739fSAlex Elder 2312f1a4739fSAlex Elder if (obj_request->type == OBJ_REQUEST_PAGES) { 2313f1a4739fSAlex Elder obj_request->pages = NULL; 2314f1a4739fSAlex Elder obj_request->page_count = 0; 2315f1a4739fSAlex Elder } 2316f1a4739fSAlex Elder 23178b3e1a56SAlex Elder if (img_request_child_test(img_request)) { 23188b3e1a56SAlex Elder rbd_assert(img_request->obj_request != NULL); 23198b3e1a56SAlex Elder more = obj_request->which < img_request->obj_request_count - 1; 23208b3e1a56SAlex Elder } else { 23218b3e1a56SAlex Elder rbd_assert(img_request->rq != NULL); 23227ad18afaSChristoph Hellwig 23237ad18afaSChristoph Hellwig more = blk_update_request(img_request->rq, result, xferred); 23247ad18afaSChristoph Hellwig if (!more) 23257ad18afaSChristoph Hellwig __blk_mq_end_request(img_request->rq, result); 23268b3e1a56SAlex Elder } 23278b3e1a56SAlex Elder 23288b3e1a56SAlex Elder return more; 23291217857fSAlex Elder } 23301217857fSAlex Elder 23312169238dSAlex Elder static void rbd_img_obj_callback(struct rbd_obj_request *obj_request) 23322169238dSAlex Elder { 23332169238dSAlex Elder struct rbd_img_request *img_request; 23342169238dSAlex Elder u32 which = obj_request->which; 23352169238dSAlex Elder bool more = true; 23362169238dSAlex Elder 23376365d33aSAlex Elder rbd_assert(obj_request_img_data_test(obj_request)); 23382169238dSAlex Elder img_request = obj_request->img_request; 23392169238dSAlex Elder 23402169238dSAlex Elder dout("%s: img %p obj %p\n", __func__, img_request, obj_request); 23412169238dSAlex Elder rbd_assert(img_request != NULL); 23422169238dSAlex Elder rbd_assert(img_request->obj_request_count > 0); 23432169238dSAlex Elder rbd_assert(which != BAD_WHICH); 23442169238dSAlex Elder rbd_assert(which < img_request->obj_request_count); 23452169238dSAlex 
Elder 23462169238dSAlex Elder spin_lock_irq(&img_request->completion_lock); 23472169238dSAlex Elder if (which != img_request->next_completion) 23482169238dSAlex Elder goto out; 23492169238dSAlex Elder 23502169238dSAlex Elder for_each_obj_request_from(img_request, obj_request) { 23512169238dSAlex Elder rbd_assert(more); 23522169238dSAlex Elder rbd_assert(which < img_request->obj_request_count); 23532169238dSAlex Elder 23542169238dSAlex Elder if (!obj_request_done_test(obj_request)) 23552169238dSAlex Elder break; 23561217857fSAlex Elder more = rbd_img_obj_end_request(obj_request); 23572169238dSAlex Elder which++; 23582169238dSAlex Elder } 23592169238dSAlex Elder 23602169238dSAlex Elder rbd_assert(more ^ (which == img_request->obj_request_count)); 23612169238dSAlex Elder img_request->next_completion = which; 23622169238dSAlex Elder out: 23632169238dSAlex Elder spin_unlock_irq(&img_request->completion_lock); 23640f2d5be7SAlex Elder rbd_img_request_put(img_request); 23652169238dSAlex Elder 23662169238dSAlex Elder if (!more) 23672169238dSAlex Elder rbd_img_request_complete(img_request); 23682169238dSAlex Elder } 23692169238dSAlex Elder 2370f1a4739fSAlex Elder /* 23713b434a2aSJosh Durgin * Add individual osd ops to the given ceph_osd_request and prepare 23723b434a2aSJosh Durgin * them for submission. num_ops is the current number of 23733b434a2aSJosh Durgin * osd operations already to the object request. 
23743b434a2aSJosh Durgin */ 23753b434a2aSJosh Durgin static void rbd_img_obj_request_fill(struct rbd_obj_request *obj_request, 23763b434a2aSJosh Durgin struct ceph_osd_request *osd_request, 23773b434a2aSJosh Durgin enum obj_operation_type op_type, 23783b434a2aSJosh Durgin unsigned int num_ops) 23793b434a2aSJosh Durgin { 23803b434a2aSJosh Durgin struct rbd_img_request *img_request = obj_request->img_request; 23813b434a2aSJosh Durgin struct rbd_device *rbd_dev = img_request->rbd_dev; 23823b434a2aSJosh Durgin u64 object_size = rbd_obj_bytes(&rbd_dev->header); 23833b434a2aSJosh Durgin u64 offset = obj_request->offset; 23843b434a2aSJosh Durgin u64 length = obj_request->length; 23853b434a2aSJosh Durgin u64 img_end; 23863b434a2aSJosh Durgin u16 opcode; 23873b434a2aSJosh Durgin 23883b434a2aSJosh Durgin if (op_type == OBJ_OP_DISCARD) { 2389d3246fb0SJosh Durgin if (!offset && length == object_size && 2390d3246fb0SJosh Durgin (!img_request_layered_test(img_request) || 2391d3246fb0SJosh Durgin !obj_request_overlaps_parent(obj_request))) { 23923b434a2aSJosh Durgin opcode = CEPH_OSD_OP_DELETE; 23933b434a2aSJosh Durgin } else if ((offset + length == object_size)) { 23943b434a2aSJosh Durgin opcode = CEPH_OSD_OP_TRUNCATE; 23953b434a2aSJosh Durgin } else { 23963b434a2aSJosh Durgin down_read(&rbd_dev->header_rwsem); 23973b434a2aSJosh Durgin img_end = rbd_dev->header.image_size; 23983b434a2aSJosh Durgin up_read(&rbd_dev->header_rwsem); 23993b434a2aSJosh Durgin 24003b434a2aSJosh Durgin if (obj_request->img_offset + length == img_end) 24013b434a2aSJosh Durgin opcode = CEPH_OSD_OP_TRUNCATE; 24023b434a2aSJosh Durgin else 24033b434a2aSJosh Durgin opcode = CEPH_OSD_OP_ZERO; 24043b434a2aSJosh Durgin } 24053b434a2aSJosh Durgin } else if (op_type == OBJ_OP_WRITE) { 2406e30b7577SIlya Dryomov if (!offset && length == object_size) 2407e30b7577SIlya Dryomov opcode = CEPH_OSD_OP_WRITEFULL; 2408e30b7577SIlya Dryomov else 24093b434a2aSJosh Durgin opcode = CEPH_OSD_OP_WRITE; 24103b434a2aSJosh Durgin 
osd_req_op_alloc_hint_init(osd_request, num_ops, 24113b434a2aSJosh Durgin object_size, object_size); 24123b434a2aSJosh Durgin num_ops++; 24133b434a2aSJosh Durgin } else { 24143b434a2aSJosh Durgin opcode = CEPH_OSD_OP_READ; 24153b434a2aSJosh Durgin } 24163b434a2aSJosh Durgin 24177e868b6eSIlya Dryomov if (opcode == CEPH_OSD_OP_DELETE) 2418144cba14SYan, Zheng osd_req_op_init(osd_request, num_ops, opcode, 0); 24197e868b6eSIlya Dryomov else 24207e868b6eSIlya Dryomov osd_req_op_extent_init(osd_request, num_ops, opcode, 24217e868b6eSIlya Dryomov offset, length, 0, 0); 24227e868b6eSIlya Dryomov 24233b434a2aSJosh Durgin if (obj_request->type == OBJ_REQUEST_BIO) 24243b434a2aSJosh Durgin osd_req_op_extent_osd_data_bio(osd_request, num_ops, 24253b434a2aSJosh Durgin obj_request->bio_list, length); 24263b434a2aSJosh Durgin else if (obj_request->type == OBJ_REQUEST_PAGES) 24273b434a2aSJosh Durgin osd_req_op_extent_osd_data_pages(osd_request, num_ops, 24283b434a2aSJosh Durgin obj_request->pages, length, 24293b434a2aSJosh Durgin offset & ~PAGE_MASK, false, false); 24303b434a2aSJosh Durgin 24313b434a2aSJosh Durgin /* Discards are also writes */ 24323b434a2aSJosh Durgin if (op_type == OBJ_OP_WRITE || op_type == OBJ_OP_DISCARD) 24333b434a2aSJosh Durgin rbd_osd_req_format_write(obj_request); 24343b434a2aSJosh Durgin else 24353b434a2aSJosh Durgin rbd_osd_req_format_read(obj_request); 24363b434a2aSJosh Durgin } 24373b434a2aSJosh Durgin 24383b434a2aSJosh Durgin /* 2439f1a4739fSAlex Elder * Split up an image request into one or more object requests, each 2440f1a4739fSAlex Elder * to a different object. The "type" parameter indicates whether 2441f1a4739fSAlex Elder * "data_desc" is the pointer to the head of a list of bio 2442f1a4739fSAlex Elder * structures, or the base of a page array. In either case this 2443f1a4739fSAlex Elder * function assumes data_desc describes memory sufficient to hold 2444f1a4739fSAlex Elder * all data described by the image request. 
2445f1a4739fSAlex Elder */ 2446f1a4739fSAlex Elder static int rbd_img_request_fill(struct rbd_img_request *img_request, 2447f1a4739fSAlex Elder enum obj_request_type type, 2448f1a4739fSAlex Elder void *data_desc) 2449bf0d5f50SAlex Elder { 2450bf0d5f50SAlex Elder struct rbd_device *rbd_dev = img_request->rbd_dev; 2451bf0d5f50SAlex Elder struct rbd_obj_request *obj_request = NULL; 2452bf0d5f50SAlex Elder struct rbd_obj_request *next_obj_request; 2453a158073cSJingoo Han struct bio *bio_list = NULL; 2454f1a4739fSAlex Elder unsigned int bio_offset = 0; 2455a158073cSJingoo Han struct page **pages = NULL; 24566d2940c8SGuangliang Zhao enum obj_operation_type op_type; 24577da22d29SAlex Elder u64 img_offset; 2458bf0d5f50SAlex Elder u64 resid; 2459bf0d5f50SAlex Elder 2460f1a4739fSAlex Elder dout("%s: img %p type %d data_desc %p\n", __func__, img_request, 2461f1a4739fSAlex Elder (int)type, data_desc); 246237206ee5SAlex Elder 24637da22d29SAlex Elder img_offset = img_request->offset; 2464bf0d5f50SAlex Elder resid = img_request->length; 24654dda41d3SAlex Elder rbd_assert(resid > 0); 24663b434a2aSJosh Durgin op_type = rbd_img_request_op_type(img_request); 2467f1a4739fSAlex Elder 2468f1a4739fSAlex Elder if (type == OBJ_REQUEST_BIO) { 2469f1a4739fSAlex Elder bio_list = data_desc; 24704f024f37SKent Overstreet rbd_assert(img_offset == 24714f024f37SKent Overstreet bio_list->bi_iter.bi_sector << SECTOR_SHIFT); 247290e98c52SGuangliang Zhao } else if (type == OBJ_REQUEST_PAGES) { 2473f1a4739fSAlex Elder pages = data_desc; 2474f1a4739fSAlex Elder } 2475f1a4739fSAlex Elder 2476bf0d5f50SAlex Elder while (resid) { 24772fa12320SAlex Elder struct ceph_osd_request *osd_req; 2478bf0d5f50SAlex Elder const char *object_name; 2479bf0d5f50SAlex Elder u64 offset; 2480bf0d5f50SAlex Elder u64 length; 2481bf0d5f50SAlex Elder 24827da22d29SAlex Elder object_name = rbd_segment_name(rbd_dev, img_offset); 2483bf0d5f50SAlex Elder if (!object_name) 2484bf0d5f50SAlex Elder goto out_unwind; 24857da22d29SAlex 
Elder offset = rbd_segment_offset(rbd_dev, img_offset); 24867da22d29SAlex Elder length = rbd_segment_length(rbd_dev, img_offset, resid); 2487bf0d5f50SAlex Elder obj_request = rbd_obj_request_create(object_name, 2488f1a4739fSAlex Elder offset, length, type); 248978c2a44aSAlex Elder /* object request has its own copy of the object name */ 249078c2a44aSAlex Elder rbd_segment_name_free(object_name); 2491bf0d5f50SAlex Elder if (!obj_request) 2492bf0d5f50SAlex Elder goto out_unwind; 249362054da6SIlya Dryomov 249403507db6SJosh Durgin /* 249503507db6SJosh Durgin * set obj_request->img_request before creating the 249603507db6SJosh Durgin * osd_request so that it gets the right snapc 249703507db6SJosh Durgin */ 249803507db6SJosh Durgin rbd_img_obj_request_add(img_request, obj_request); 2499bf0d5f50SAlex Elder 2500f1a4739fSAlex Elder if (type == OBJ_REQUEST_BIO) { 2501f1a4739fSAlex Elder unsigned int clone_size; 2502f1a4739fSAlex Elder 2503bf0d5f50SAlex Elder rbd_assert(length <= (u64)UINT_MAX); 2504bf0d5f50SAlex Elder clone_size = (unsigned int)length; 2505f1a4739fSAlex Elder obj_request->bio_list = 2506f1a4739fSAlex Elder bio_chain_clone_range(&bio_list, 2507f1a4739fSAlex Elder &bio_offset, 2508f1a4739fSAlex Elder clone_size, 2509bf0d5f50SAlex Elder GFP_ATOMIC); 2510bf0d5f50SAlex Elder if (!obj_request->bio_list) 251162054da6SIlya Dryomov goto out_unwind; 251290e98c52SGuangliang Zhao } else if (type == OBJ_REQUEST_PAGES) { 2513f1a4739fSAlex Elder unsigned int page_count; 2514f1a4739fSAlex Elder 2515f1a4739fSAlex Elder obj_request->pages = pages; 2516f1a4739fSAlex Elder page_count = (u32)calc_pages_for(offset, length); 2517f1a4739fSAlex Elder obj_request->page_count = page_count; 2518f1a4739fSAlex Elder if ((offset + length) & ~PAGE_MASK) 2519f1a4739fSAlex Elder page_count--; /* more on last page */ 2520f1a4739fSAlex Elder pages += page_count; 2521f1a4739fSAlex Elder } 2522bf0d5f50SAlex Elder 25236d2940c8SGuangliang Zhao osd_req = rbd_osd_req_create(rbd_dev, op_type, 
25246d2940c8SGuangliang Zhao (op_type == OBJ_OP_WRITE) ? 2 : 1, 25252fa12320SAlex Elder obj_request); 25262fa12320SAlex Elder if (!osd_req) 252762054da6SIlya Dryomov goto out_unwind; 25283b434a2aSJosh Durgin 25292fa12320SAlex Elder obj_request->osd_req = osd_req; 25302169238dSAlex Elder obj_request->callback = rbd_img_obj_callback; 25317da22d29SAlex Elder obj_request->img_offset = img_offset; 2532bf0d5f50SAlex Elder 25333b434a2aSJosh Durgin rbd_img_obj_request_fill(obj_request, osd_req, op_type, 0); 25343b434a2aSJosh Durgin 25353b434a2aSJosh Durgin rbd_img_request_get(img_request); 25363b434a2aSJosh Durgin 25377da22d29SAlex Elder img_offset += length; 2538bf0d5f50SAlex Elder resid -= length; 2539bf0d5f50SAlex Elder } 2540bf0d5f50SAlex Elder 2541bf0d5f50SAlex Elder return 0; 2542bf0d5f50SAlex Elder 2543bf0d5f50SAlex Elder out_unwind: 2544bf0d5f50SAlex Elder for_each_obj_request_safe(img_request, obj_request, next_obj_request) 254542dd037cSIlya Dryomov rbd_img_obj_request_del(img_request, obj_request); 2546bf0d5f50SAlex Elder 2547bf0d5f50SAlex Elder return -ENOMEM; 2548bf0d5f50SAlex Elder } 2549bf0d5f50SAlex Elder 25503d7efd18SAlex Elder static void 25512761713dSIlya Dryomov rbd_osd_copyup_callback(struct rbd_obj_request *obj_request) 25520eefd470SAlex Elder { 25530eefd470SAlex Elder struct rbd_img_request *img_request; 25540eefd470SAlex Elder struct rbd_device *rbd_dev; 2555ebda6408SAlex Elder struct page **pages; 25560eefd470SAlex Elder u32 page_count; 25570eefd470SAlex Elder 25582761713dSIlya Dryomov dout("%s: obj %p\n", __func__, obj_request); 25592761713dSIlya Dryomov 2560d3246fb0SJosh Durgin rbd_assert(obj_request->type == OBJ_REQUEST_BIO || 2561d3246fb0SJosh Durgin obj_request->type == OBJ_REQUEST_NODATA); 25620eefd470SAlex Elder rbd_assert(obj_request_img_data_test(obj_request)); 25630eefd470SAlex Elder img_request = obj_request->img_request; 25640eefd470SAlex Elder rbd_assert(img_request); 25650eefd470SAlex Elder 25660eefd470SAlex Elder rbd_dev = 
img_request->rbd_dev; 25670eefd470SAlex Elder rbd_assert(rbd_dev); 25680eefd470SAlex Elder 2569ebda6408SAlex Elder pages = obj_request->copyup_pages; 2570ebda6408SAlex Elder rbd_assert(pages != NULL); 25710eefd470SAlex Elder obj_request->copyup_pages = NULL; 2572ebda6408SAlex Elder page_count = obj_request->copyup_page_count; 2573ebda6408SAlex Elder rbd_assert(page_count); 2574ebda6408SAlex Elder obj_request->copyup_page_count = 0; 2575ebda6408SAlex Elder ceph_release_page_vector(pages, page_count); 25760eefd470SAlex Elder 25770eefd470SAlex Elder /* 25780eefd470SAlex Elder * We want the transfer count to reflect the size of the 25790eefd470SAlex Elder * original write request. There is no such thing as a 25800eefd470SAlex Elder * successful short write, so if the request was successful 25810eefd470SAlex Elder * we can just set it to the originally-requested length. 25820eefd470SAlex Elder */ 25830eefd470SAlex Elder if (!obj_request->result) 25840eefd470SAlex Elder obj_request->xferred = obj_request->length; 25850eefd470SAlex Elder 25862761713dSIlya Dryomov obj_request_done_set(obj_request); 25870eefd470SAlex Elder } 25880eefd470SAlex Elder 25890eefd470SAlex Elder static void 25903d7efd18SAlex Elder rbd_img_obj_parent_read_full_callback(struct rbd_img_request *img_request) 25913d7efd18SAlex Elder { 25923d7efd18SAlex Elder struct rbd_obj_request *orig_request; 25930eefd470SAlex Elder struct ceph_osd_request *osd_req; 25940eefd470SAlex Elder struct ceph_osd_client *osdc; 25950eefd470SAlex Elder struct rbd_device *rbd_dev; 25963d7efd18SAlex Elder struct page **pages; 2597d3246fb0SJosh Durgin enum obj_operation_type op_type; 2598ebda6408SAlex Elder u32 page_count; 2599bbea1c1aSAlex Elder int img_result; 2600ebda6408SAlex Elder u64 parent_length; 26013d7efd18SAlex Elder 26023d7efd18SAlex Elder rbd_assert(img_request_child_test(img_request)); 26033d7efd18SAlex Elder 26043d7efd18SAlex Elder /* First get what we need from the image request */ 26053d7efd18SAlex Elder 
26063d7efd18SAlex Elder pages = img_request->copyup_pages; 26073d7efd18SAlex Elder rbd_assert(pages != NULL); 26083d7efd18SAlex Elder img_request->copyup_pages = NULL; 2609ebda6408SAlex Elder page_count = img_request->copyup_page_count; 2610ebda6408SAlex Elder rbd_assert(page_count); 2611ebda6408SAlex Elder img_request->copyup_page_count = 0; 26123d7efd18SAlex Elder 26133d7efd18SAlex Elder orig_request = img_request->obj_request; 26143d7efd18SAlex Elder rbd_assert(orig_request != NULL); 2615b91f09f1SAlex Elder rbd_assert(obj_request_type_valid(orig_request->type)); 2616bbea1c1aSAlex Elder img_result = img_request->result; 2617ebda6408SAlex Elder parent_length = img_request->length; 2618ebda6408SAlex Elder rbd_assert(parent_length == img_request->xferred); 26193d7efd18SAlex Elder rbd_img_request_put(img_request); 26203d7efd18SAlex Elder 262191c6febbSAlex Elder rbd_assert(orig_request->img_request); 262291c6febbSAlex Elder rbd_dev = orig_request->img_request->rbd_dev; 26233d7efd18SAlex Elder rbd_assert(rbd_dev); 26243d7efd18SAlex Elder 2625bbea1c1aSAlex Elder /* 2626bbea1c1aSAlex Elder * If the overlap has become 0 (most likely because the 2627bbea1c1aSAlex Elder * image has been flattened) we need to free the pages 2628bbea1c1aSAlex Elder * and re-submit the original write request. 
2629bbea1c1aSAlex Elder */ 2630bbea1c1aSAlex Elder if (!rbd_dev->parent_overlap) { 2631bbea1c1aSAlex Elder struct ceph_osd_client *osdc; 2632bbea1c1aSAlex Elder 2633bbea1c1aSAlex Elder ceph_release_page_vector(pages, page_count); 2634bbea1c1aSAlex Elder osdc = &rbd_dev->rbd_client->client->osdc; 2635bbea1c1aSAlex Elder img_result = rbd_obj_request_submit(osdc, orig_request); 2636bbea1c1aSAlex Elder if (!img_result) 2637bbea1c1aSAlex Elder return; 2638bbea1c1aSAlex Elder } 2639bbea1c1aSAlex Elder 2640bbea1c1aSAlex Elder if (img_result) 26410eefd470SAlex Elder goto out_err; 26423d7efd18SAlex Elder 26438785b1d4SAlex Elder /* 26448785b1d4SAlex Elder * The original osd request is of no use to use any more. 26450ccd5926SIlya Dryomov * We need a new one that can hold the three ops in a copyup 26468785b1d4SAlex Elder * request. Allocate the new copyup osd request for the 26478785b1d4SAlex Elder * original request, and release the old one. 26488785b1d4SAlex Elder */ 2649bbea1c1aSAlex Elder img_result = -ENOMEM; 26500eefd470SAlex Elder osd_req = rbd_osd_req_create_copyup(orig_request); 26510eefd470SAlex Elder if (!osd_req) 26520eefd470SAlex Elder goto out_err; 26538785b1d4SAlex Elder rbd_osd_req_destroy(orig_request->osd_req); 26540eefd470SAlex Elder orig_request->osd_req = osd_req; 26550eefd470SAlex Elder orig_request->copyup_pages = pages; 2656ebda6408SAlex Elder orig_request->copyup_page_count = page_count; 26573d7efd18SAlex Elder 26580eefd470SAlex Elder /* Initialize the copyup op */ 26590eefd470SAlex Elder 26600eefd470SAlex Elder osd_req_op_cls_init(osd_req, 0, CEPH_OSD_OP_CALL, "rbd", "copyup"); 2661ebda6408SAlex Elder osd_req_op_cls_request_data_pages(osd_req, 0, pages, parent_length, 0, 26620eefd470SAlex Elder false, false); 26630eefd470SAlex Elder 2664d3246fb0SJosh Durgin /* Add the other op(s) */ 26650ccd5926SIlya Dryomov 2666d3246fb0SJosh Durgin op_type = rbd_img_request_op_type(orig_request->img_request); 2667d3246fb0SJosh Durgin 
rbd_img_obj_request_fill(orig_request, osd_req, op_type, 1); 26680eefd470SAlex Elder 26690eefd470SAlex Elder /* All set, send it off. */ 26700eefd470SAlex Elder 26710eefd470SAlex Elder osdc = &rbd_dev->rbd_client->client->osdc; 2672bbea1c1aSAlex Elder img_result = rbd_obj_request_submit(osdc, orig_request); 2673bbea1c1aSAlex Elder if (!img_result) 26740eefd470SAlex Elder return; 26750eefd470SAlex Elder out_err: 26760eefd470SAlex Elder /* Record the error code and complete the request */ 26770eefd470SAlex Elder 2678bbea1c1aSAlex Elder orig_request->result = img_result; 26790eefd470SAlex Elder orig_request->xferred = 0; 26803d7efd18SAlex Elder obj_request_done_set(orig_request); 26813d7efd18SAlex Elder rbd_obj_request_complete(orig_request); 26823d7efd18SAlex Elder } 26833d7efd18SAlex Elder 26843d7efd18SAlex Elder /* 26853d7efd18SAlex Elder * Read from the parent image the range of data that covers the 26863d7efd18SAlex Elder * entire target of the given object request. This is used for 26873d7efd18SAlex Elder * satisfying a layered image write request when the target of an 26883d7efd18SAlex Elder * object request from the image request does not exist. 26893d7efd18SAlex Elder * 26903d7efd18SAlex Elder * A page array big enough to hold the returned data is allocated 26913d7efd18SAlex Elder * and supplied to rbd_img_request_fill() as the "data descriptor." 26923d7efd18SAlex Elder * When the read completes, this page array will be transferred to 26933d7efd18SAlex Elder * the original object request for the copyup operation. 26943d7efd18SAlex Elder * 26953d7efd18SAlex Elder * If an error occurs, record it as the result of the original 26963d7efd18SAlex Elder * object request and mark it done so it gets completed. 
26973d7efd18SAlex Elder */ 26983d7efd18SAlex Elder static int rbd_img_obj_parent_read_full(struct rbd_obj_request *obj_request) 26993d7efd18SAlex Elder { 27003d7efd18SAlex Elder struct rbd_img_request *img_request = NULL; 27013d7efd18SAlex Elder struct rbd_img_request *parent_request = NULL; 27023d7efd18SAlex Elder struct rbd_device *rbd_dev; 27033d7efd18SAlex Elder u64 img_offset; 27043d7efd18SAlex Elder u64 length; 27053d7efd18SAlex Elder struct page **pages = NULL; 27063d7efd18SAlex Elder u32 page_count; 27073d7efd18SAlex Elder int result; 27083d7efd18SAlex Elder 27093d7efd18SAlex Elder rbd_assert(obj_request_img_data_test(obj_request)); 2710b91f09f1SAlex Elder rbd_assert(obj_request_type_valid(obj_request->type)); 27113d7efd18SAlex Elder 27123d7efd18SAlex Elder img_request = obj_request->img_request; 27133d7efd18SAlex Elder rbd_assert(img_request != NULL); 27143d7efd18SAlex Elder rbd_dev = img_request->rbd_dev; 27153d7efd18SAlex Elder rbd_assert(rbd_dev->parent != NULL); 27163d7efd18SAlex Elder 27173d7efd18SAlex Elder /* 27183d7efd18SAlex Elder * Determine the byte range covered by the object in the 27193d7efd18SAlex Elder * child image to which the original request was to be sent. 27203d7efd18SAlex Elder */ 27213d7efd18SAlex Elder img_offset = obj_request->img_offset - obj_request->offset; 27223d7efd18SAlex Elder length = (u64)1 << rbd_dev->header.obj_order; 27233d7efd18SAlex Elder 27243d7efd18SAlex Elder /* 2725a9e8ba2cSAlex Elder * There is no defined parent data beyond the parent 2726a9e8ba2cSAlex Elder * overlap, so limit what we read at that boundary if 2727a9e8ba2cSAlex Elder * necessary. 
2728a9e8ba2cSAlex Elder */ 2729a9e8ba2cSAlex Elder if (img_offset + length > rbd_dev->parent_overlap) { 2730a9e8ba2cSAlex Elder rbd_assert(img_offset < rbd_dev->parent_overlap); 2731a9e8ba2cSAlex Elder length = rbd_dev->parent_overlap - img_offset; 2732a9e8ba2cSAlex Elder } 2733a9e8ba2cSAlex Elder 2734a9e8ba2cSAlex Elder /* 27353d7efd18SAlex Elder * Allocate a page array big enough to receive the data read 27363d7efd18SAlex Elder * from the parent. 27373d7efd18SAlex Elder */ 27383d7efd18SAlex Elder page_count = (u32)calc_pages_for(0, length); 27393d7efd18SAlex Elder pages = ceph_alloc_page_vector(page_count, GFP_KERNEL); 27403d7efd18SAlex Elder if (IS_ERR(pages)) { 27413d7efd18SAlex Elder result = PTR_ERR(pages); 27423d7efd18SAlex Elder pages = NULL; 27433d7efd18SAlex Elder goto out_err; 27443d7efd18SAlex Elder } 27453d7efd18SAlex Elder 27463d7efd18SAlex Elder result = -ENOMEM; 2747e93f3152SAlex Elder parent_request = rbd_parent_request_create(obj_request, 2748e93f3152SAlex Elder img_offset, length); 27493d7efd18SAlex Elder if (!parent_request) 27503d7efd18SAlex Elder goto out_err; 27513d7efd18SAlex Elder 27523d7efd18SAlex Elder result = rbd_img_request_fill(parent_request, OBJ_REQUEST_PAGES, pages); 27533d7efd18SAlex Elder if (result) 27543d7efd18SAlex Elder goto out_err; 27553d7efd18SAlex Elder parent_request->copyup_pages = pages; 2756ebda6408SAlex Elder parent_request->copyup_page_count = page_count; 27573d7efd18SAlex Elder 27583d7efd18SAlex Elder parent_request->callback = rbd_img_obj_parent_read_full_callback; 27593d7efd18SAlex Elder result = rbd_img_request_submit(parent_request); 27603d7efd18SAlex Elder if (!result) 27613d7efd18SAlex Elder return 0; 27623d7efd18SAlex Elder 27633d7efd18SAlex Elder parent_request->copyup_pages = NULL; 2764ebda6408SAlex Elder parent_request->copyup_page_count = 0; 27653d7efd18SAlex Elder parent_request->obj_request = NULL; 27663d7efd18SAlex Elder rbd_obj_request_put(obj_request); 27673d7efd18SAlex Elder out_err: 
27683d7efd18SAlex Elder if (pages) 27693d7efd18SAlex Elder ceph_release_page_vector(pages, page_count); 27703d7efd18SAlex Elder if (parent_request) 27713d7efd18SAlex Elder rbd_img_request_put(parent_request); 27723d7efd18SAlex Elder obj_request->result = result; 27733d7efd18SAlex Elder obj_request->xferred = 0; 27743d7efd18SAlex Elder obj_request_done_set(obj_request); 27753d7efd18SAlex Elder 27763d7efd18SAlex Elder return result; 27773d7efd18SAlex Elder } 27783d7efd18SAlex Elder 2779c5b5ef6cSAlex Elder static void rbd_img_obj_exists_callback(struct rbd_obj_request *obj_request) 2780c5b5ef6cSAlex Elder { 2781c5b5ef6cSAlex Elder struct rbd_obj_request *orig_request; 2782638f5abeSAlex Elder struct rbd_device *rbd_dev; 2783c5b5ef6cSAlex Elder int result; 2784c5b5ef6cSAlex Elder 2785c5b5ef6cSAlex Elder rbd_assert(!obj_request_img_data_test(obj_request)); 2786c5b5ef6cSAlex Elder 2787c5b5ef6cSAlex Elder /* 2788c5b5ef6cSAlex Elder * All we need from the object request is the original 2789c5b5ef6cSAlex Elder * request and the result of the STAT op. Grab those, then 2790c5b5ef6cSAlex Elder * we're done with the request. 
2791c5b5ef6cSAlex Elder */ 2792c5b5ef6cSAlex Elder orig_request = obj_request->obj_request; 2793c5b5ef6cSAlex Elder obj_request->obj_request = NULL; 2794912c317dSAlex Elder rbd_obj_request_put(orig_request); 2795c5b5ef6cSAlex Elder rbd_assert(orig_request); 2796c5b5ef6cSAlex Elder rbd_assert(orig_request->img_request); 2797c5b5ef6cSAlex Elder 2798c5b5ef6cSAlex Elder result = obj_request->result; 2799c5b5ef6cSAlex Elder obj_request->result = 0; 2800c5b5ef6cSAlex Elder 2801c5b5ef6cSAlex Elder dout("%s: obj %p for obj %p result %d %llu/%llu\n", __func__, 2802c5b5ef6cSAlex Elder obj_request, orig_request, result, 2803c5b5ef6cSAlex Elder obj_request->xferred, obj_request->length); 2804c5b5ef6cSAlex Elder rbd_obj_request_put(obj_request); 2805c5b5ef6cSAlex Elder 2806638f5abeSAlex Elder /* 2807638f5abeSAlex Elder * If the overlap has become 0 (most likely because the 2808638f5abeSAlex Elder * image has been flattened) we need to free the pages 2809638f5abeSAlex Elder * and re-submit the original write request. 2810638f5abeSAlex Elder */ 2811638f5abeSAlex Elder rbd_dev = orig_request->img_request->rbd_dev; 2812638f5abeSAlex Elder if (!rbd_dev->parent_overlap) { 2813638f5abeSAlex Elder struct ceph_osd_client *osdc; 2814638f5abeSAlex Elder 2815638f5abeSAlex Elder osdc = &rbd_dev->rbd_client->client->osdc; 2816638f5abeSAlex Elder result = rbd_obj_request_submit(osdc, orig_request); 2817638f5abeSAlex Elder if (!result) 2818638f5abeSAlex Elder return; 2819638f5abeSAlex Elder } 2820c5b5ef6cSAlex Elder 2821c5b5ef6cSAlex Elder /* 2822c5b5ef6cSAlex Elder * Our only purpose here is to determine whether the object 2823c5b5ef6cSAlex Elder * exists, and we don't want to treat the non-existence as 2824c5b5ef6cSAlex Elder * an error. If something else comes back, transfer the 2825c5b5ef6cSAlex Elder * error to the original request and complete it now. 
2826c5b5ef6cSAlex Elder */ 2827c5b5ef6cSAlex Elder if (!result) { 2828c5b5ef6cSAlex Elder obj_request_existence_set(orig_request, true); 2829c5b5ef6cSAlex Elder } else if (result == -ENOENT) { 2830c5b5ef6cSAlex Elder obj_request_existence_set(orig_request, false); 2831c5b5ef6cSAlex Elder } else if (result) { 2832c5b5ef6cSAlex Elder orig_request->result = result; 28333d7efd18SAlex Elder goto out; 2834c5b5ef6cSAlex Elder } 2835c5b5ef6cSAlex Elder 2836c5b5ef6cSAlex Elder /* 2837c5b5ef6cSAlex Elder * Resubmit the original request now that we have recorded 2838c5b5ef6cSAlex Elder * whether the target object exists. 2839c5b5ef6cSAlex Elder */ 2840b454e36dSAlex Elder orig_request->result = rbd_img_obj_request_submit(orig_request); 28413d7efd18SAlex Elder out: 2842c5b5ef6cSAlex Elder if (orig_request->result) 2843c5b5ef6cSAlex Elder rbd_obj_request_complete(orig_request); 2844c5b5ef6cSAlex Elder } 2845c5b5ef6cSAlex Elder 2846c5b5ef6cSAlex Elder static int rbd_img_obj_exists_submit(struct rbd_obj_request *obj_request) 2847c5b5ef6cSAlex Elder { 2848c5b5ef6cSAlex Elder struct rbd_obj_request *stat_request; 2849c5b5ef6cSAlex Elder struct rbd_device *rbd_dev; 2850c5b5ef6cSAlex Elder struct ceph_osd_client *osdc; 2851c5b5ef6cSAlex Elder struct page **pages = NULL; 2852c5b5ef6cSAlex Elder u32 page_count; 2853c5b5ef6cSAlex Elder size_t size; 2854c5b5ef6cSAlex Elder int ret; 2855c5b5ef6cSAlex Elder 2856c5b5ef6cSAlex Elder /* 2857c5b5ef6cSAlex Elder * The response data for a STAT call consists of: 2858c5b5ef6cSAlex Elder * le64 length; 2859c5b5ef6cSAlex Elder * struct { 2860c5b5ef6cSAlex Elder * le32 tv_sec; 2861c5b5ef6cSAlex Elder * le32 tv_nsec; 2862c5b5ef6cSAlex Elder * } mtime; 2863c5b5ef6cSAlex Elder */ 2864c5b5ef6cSAlex Elder size = sizeof (__le64) + sizeof (__le32) + sizeof (__le32); 2865c5b5ef6cSAlex Elder page_count = (u32)calc_pages_for(0, size); 2866c5b5ef6cSAlex Elder pages = ceph_alloc_page_vector(page_count, GFP_KERNEL); 2867c5b5ef6cSAlex Elder if (IS_ERR(pages)) 
2868c5b5ef6cSAlex Elder return PTR_ERR(pages); 2869c5b5ef6cSAlex Elder 2870c5b5ef6cSAlex Elder ret = -ENOMEM; 2871c5b5ef6cSAlex Elder stat_request = rbd_obj_request_create(obj_request->object_name, 0, 0, 2872c5b5ef6cSAlex Elder OBJ_REQUEST_PAGES); 2873c5b5ef6cSAlex Elder if (!stat_request) 2874c5b5ef6cSAlex Elder goto out; 2875c5b5ef6cSAlex Elder 2876c5b5ef6cSAlex Elder rbd_obj_request_get(obj_request); 2877c5b5ef6cSAlex Elder stat_request->obj_request = obj_request; 2878c5b5ef6cSAlex Elder stat_request->pages = pages; 2879c5b5ef6cSAlex Elder stat_request->page_count = page_count; 2880c5b5ef6cSAlex Elder 2881c5b5ef6cSAlex Elder rbd_assert(obj_request->img_request); 2882c5b5ef6cSAlex Elder rbd_dev = obj_request->img_request->rbd_dev; 28836d2940c8SGuangliang Zhao stat_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_READ, 1, 2884c5b5ef6cSAlex Elder stat_request); 2885c5b5ef6cSAlex Elder if (!stat_request->osd_req) 2886c5b5ef6cSAlex Elder goto out; 2887c5b5ef6cSAlex Elder stat_request->callback = rbd_img_obj_exists_callback; 2888c5b5ef6cSAlex Elder 2889144cba14SYan, Zheng osd_req_op_init(stat_request->osd_req, 0, CEPH_OSD_OP_STAT, 0); 2890c5b5ef6cSAlex Elder osd_req_op_raw_data_in_pages(stat_request->osd_req, 0, pages, size, 0, 2891c5b5ef6cSAlex Elder false, false); 28929d4df01fSAlex Elder rbd_osd_req_format_read(stat_request); 2893c5b5ef6cSAlex Elder 2894c5b5ef6cSAlex Elder osdc = &rbd_dev->rbd_client->client->osdc; 2895c5b5ef6cSAlex Elder ret = rbd_obj_request_submit(osdc, stat_request); 2896c5b5ef6cSAlex Elder out: 2897c5b5ef6cSAlex Elder if (ret) 2898c5b5ef6cSAlex Elder rbd_obj_request_put(obj_request); 2899c5b5ef6cSAlex Elder 2900c5b5ef6cSAlex Elder return ret; 2901c5b5ef6cSAlex Elder } 2902c5b5ef6cSAlex Elder 290370d045f6SIlya Dryomov static bool img_obj_request_simple(struct rbd_obj_request *obj_request) 2904b454e36dSAlex Elder { 2905b454e36dSAlex Elder struct rbd_img_request *img_request; 2906a9e8ba2cSAlex Elder struct rbd_device *rbd_dev; 
2907b454e36dSAlex Elder 2908b454e36dSAlex Elder rbd_assert(obj_request_img_data_test(obj_request)); 2909b454e36dSAlex Elder 2910b454e36dSAlex Elder img_request = obj_request->img_request; 2911b454e36dSAlex Elder rbd_assert(img_request); 2912a9e8ba2cSAlex Elder rbd_dev = img_request->rbd_dev; 2913b454e36dSAlex Elder 291470d045f6SIlya Dryomov /* Reads */ 29151c220881SJosh Durgin if (!img_request_write_test(img_request) && 29161c220881SJosh Durgin !img_request_discard_test(img_request)) 291770d045f6SIlya Dryomov return true; 2918b454e36dSAlex Elder 291970d045f6SIlya Dryomov /* Non-layered writes */ 292070d045f6SIlya Dryomov if (!img_request_layered_test(img_request)) 292170d045f6SIlya Dryomov return true; 292270d045f6SIlya Dryomov 292370d045f6SIlya Dryomov /* 292470d045f6SIlya Dryomov * Layered writes outside of the parent overlap range don't 292570d045f6SIlya Dryomov * share any data with the parent. 292670d045f6SIlya Dryomov */ 292770d045f6SIlya Dryomov if (!obj_request_overlaps_parent(obj_request)) 292870d045f6SIlya Dryomov return true; 292970d045f6SIlya Dryomov 293070d045f6SIlya Dryomov /* 2931c622d226SGuangliang Zhao * Entire-object layered writes - we will overwrite whatever 2932c622d226SGuangliang Zhao * parent data there is anyway. 2933c622d226SGuangliang Zhao */ 2934c622d226SGuangliang Zhao if (!obj_request->offset && 2935c622d226SGuangliang Zhao obj_request->length == rbd_obj_bytes(&rbd_dev->header)) 2936c622d226SGuangliang Zhao return true; 2937c622d226SGuangliang Zhao 2938c622d226SGuangliang Zhao /* 293970d045f6SIlya Dryomov * If the object is known to already exist, its parent data has 294070d045f6SIlya Dryomov * already been copied. 
294170d045f6SIlya Dryomov */ 294270d045f6SIlya Dryomov if (obj_request_known_test(obj_request) && 294370d045f6SIlya Dryomov obj_request_exists_test(obj_request)) 294470d045f6SIlya Dryomov return true; 294570d045f6SIlya Dryomov 294670d045f6SIlya Dryomov return false; 294770d045f6SIlya Dryomov } 294870d045f6SIlya Dryomov 294970d045f6SIlya Dryomov static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request) 295070d045f6SIlya Dryomov { 295170d045f6SIlya Dryomov if (img_obj_request_simple(obj_request)) { 2952b454e36dSAlex Elder struct rbd_device *rbd_dev; 2953b454e36dSAlex Elder struct ceph_osd_client *osdc; 2954b454e36dSAlex Elder 2955b454e36dSAlex Elder rbd_dev = obj_request->img_request->rbd_dev; 2956b454e36dSAlex Elder osdc = &rbd_dev->rbd_client->client->osdc; 2957b454e36dSAlex Elder 2958b454e36dSAlex Elder return rbd_obj_request_submit(osdc, obj_request); 2959b454e36dSAlex Elder } 2960b454e36dSAlex Elder 2961b454e36dSAlex Elder /* 29623d7efd18SAlex Elder * It's a layered write. The target object might exist but 29633d7efd18SAlex Elder * we may not know that yet. If we know it doesn't exist, 29643d7efd18SAlex Elder * start by reading the data for the full target object from 29653d7efd18SAlex Elder * the parent so we can use it for a copyup to the target. 2966b454e36dSAlex Elder */ 296770d045f6SIlya Dryomov if (obj_request_known_test(obj_request)) 29683d7efd18SAlex Elder return rbd_img_obj_parent_read_full(obj_request); 29693d7efd18SAlex Elder 29703d7efd18SAlex Elder /* We don't know whether the target exists. Go find out. 
*/ 2971b454e36dSAlex Elder 2972b454e36dSAlex Elder return rbd_img_obj_exists_submit(obj_request); 2973b454e36dSAlex Elder } 2974b454e36dSAlex Elder 2975bf0d5f50SAlex Elder static int rbd_img_request_submit(struct rbd_img_request *img_request) 2976bf0d5f50SAlex Elder { 2977bf0d5f50SAlex Elder struct rbd_obj_request *obj_request; 297846faeed4SAlex Elder struct rbd_obj_request *next_obj_request; 2979bf0d5f50SAlex Elder 298037206ee5SAlex Elder dout("%s: img %p\n", __func__, img_request); 298146faeed4SAlex Elder for_each_obj_request_safe(img_request, obj_request, next_obj_request) { 2982bf0d5f50SAlex Elder int ret; 2983bf0d5f50SAlex Elder 2984b454e36dSAlex Elder ret = rbd_img_obj_request_submit(obj_request); 2985bf0d5f50SAlex Elder if (ret) 2986bf0d5f50SAlex Elder return ret; 2987bf0d5f50SAlex Elder } 2988bf0d5f50SAlex Elder 2989bf0d5f50SAlex Elder return 0; 2990bf0d5f50SAlex Elder } 2991bf0d5f50SAlex Elder 29928b3e1a56SAlex Elder static void rbd_img_parent_read_callback(struct rbd_img_request *img_request) 29938b3e1a56SAlex Elder { 29948b3e1a56SAlex Elder struct rbd_obj_request *obj_request; 2995a9e8ba2cSAlex Elder struct rbd_device *rbd_dev; 2996a9e8ba2cSAlex Elder u64 obj_end; 299702c74fbaSAlex Elder u64 img_xferred; 299802c74fbaSAlex Elder int img_result; 29998b3e1a56SAlex Elder 30008b3e1a56SAlex Elder rbd_assert(img_request_child_test(img_request)); 30018b3e1a56SAlex Elder 300202c74fbaSAlex Elder /* First get what we need from the image request and release it */ 300302c74fbaSAlex Elder 30048b3e1a56SAlex Elder obj_request = img_request->obj_request; 300502c74fbaSAlex Elder img_xferred = img_request->xferred; 300602c74fbaSAlex Elder img_result = img_request->result; 300702c74fbaSAlex Elder rbd_img_request_put(img_request); 300802c74fbaSAlex Elder 300902c74fbaSAlex Elder /* 301002c74fbaSAlex Elder * If the overlap has become 0 (most likely because the 301102c74fbaSAlex Elder * image has been flattened) we need to re-submit the 301202c74fbaSAlex Elder * original 
request. 301302c74fbaSAlex Elder */ 3014a9e8ba2cSAlex Elder rbd_assert(obj_request); 3015a9e8ba2cSAlex Elder rbd_assert(obj_request->img_request); 301602c74fbaSAlex Elder rbd_dev = obj_request->img_request->rbd_dev; 301702c74fbaSAlex Elder if (!rbd_dev->parent_overlap) { 301802c74fbaSAlex Elder struct ceph_osd_client *osdc; 30198b3e1a56SAlex Elder 302002c74fbaSAlex Elder osdc = &rbd_dev->rbd_client->client->osdc; 302102c74fbaSAlex Elder img_result = rbd_obj_request_submit(osdc, obj_request); 302202c74fbaSAlex Elder if (!img_result) 302302c74fbaSAlex Elder return; 302402c74fbaSAlex Elder } 302502c74fbaSAlex Elder 302602c74fbaSAlex Elder obj_request->result = img_result; 3027a9e8ba2cSAlex Elder if (obj_request->result) 3028a9e8ba2cSAlex Elder goto out; 3029a9e8ba2cSAlex Elder 3030a9e8ba2cSAlex Elder /* 3031a9e8ba2cSAlex Elder * We need to zero anything beyond the parent overlap 3032a9e8ba2cSAlex Elder * boundary. Since rbd_img_obj_request_read_callback() 3033a9e8ba2cSAlex Elder * will zero anything beyond the end of a short read, an 3034a9e8ba2cSAlex Elder * easy way to do this is to pretend the data from the 3035a9e8ba2cSAlex Elder * parent came up short--ending at the overlap boundary. 
3036a9e8ba2cSAlex Elder */ 3037a9e8ba2cSAlex Elder rbd_assert(obj_request->img_offset < U64_MAX - obj_request->length); 3038a9e8ba2cSAlex Elder obj_end = obj_request->img_offset + obj_request->length; 3039a9e8ba2cSAlex Elder if (obj_end > rbd_dev->parent_overlap) { 3040a9e8ba2cSAlex Elder u64 xferred = 0; 3041a9e8ba2cSAlex Elder 3042a9e8ba2cSAlex Elder if (obj_request->img_offset < rbd_dev->parent_overlap) 3043a9e8ba2cSAlex Elder xferred = rbd_dev->parent_overlap - 3044a9e8ba2cSAlex Elder obj_request->img_offset; 3045a9e8ba2cSAlex Elder 304602c74fbaSAlex Elder obj_request->xferred = min(img_xferred, xferred); 3047a9e8ba2cSAlex Elder } else { 304802c74fbaSAlex Elder obj_request->xferred = img_xferred; 3049a9e8ba2cSAlex Elder } 3050a9e8ba2cSAlex Elder out: 30518b3e1a56SAlex Elder rbd_img_obj_request_read_callback(obj_request); 30528b3e1a56SAlex Elder rbd_obj_request_complete(obj_request); 30538b3e1a56SAlex Elder } 30548b3e1a56SAlex Elder 30558b3e1a56SAlex Elder static void rbd_img_parent_read(struct rbd_obj_request *obj_request) 30568b3e1a56SAlex Elder { 30578b3e1a56SAlex Elder struct rbd_img_request *img_request; 30588b3e1a56SAlex Elder int result; 30598b3e1a56SAlex Elder 30608b3e1a56SAlex Elder rbd_assert(obj_request_img_data_test(obj_request)); 30618b3e1a56SAlex Elder rbd_assert(obj_request->img_request != NULL); 30628b3e1a56SAlex Elder rbd_assert(obj_request->result == (s32) -ENOENT); 30635b2ab72dSAlex Elder rbd_assert(obj_request_type_valid(obj_request->type)); 30648b3e1a56SAlex Elder 30658b3e1a56SAlex Elder /* rbd_read_finish(obj_request, obj_request->length); */ 3066e93f3152SAlex Elder img_request = rbd_parent_request_create(obj_request, 30678b3e1a56SAlex Elder obj_request->img_offset, 3068e93f3152SAlex Elder obj_request->length); 30698b3e1a56SAlex Elder result = -ENOMEM; 30708b3e1a56SAlex Elder if (!img_request) 30718b3e1a56SAlex Elder goto out_err; 30728b3e1a56SAlex Elder 30735b2ab72dSAlex Elder if (obj_request->type == OBJ_REQUEST_BIO) 3074f1a4739fSAlex 
Elder result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO, 3075f1a4739fSAlex Elder obj_request->bio_list); 30765b2ab72dSAlex Elder else 30775b2ab72dSAlex Elder result = rbd_img_request_fill(img_request, OBJ_REQUEST_PAGES, 30785b2ab72dSAlex Elder obj_request->pages); 30798b3e1a56SAlex Elder if (result) 30808b3e1a56SAlex Elder goto out_err; 30818b3e1a56SAlex Elder 30828b3e1a56SAlex Elder img_request->callback = rbd_img_parent_read_callback; 30838b3e1a56SAlex Elder result = rbd_img_request_submit(img_request); 30848b3e1a56SAlex Elder if (result) 30858b3e1a56SAlex Elder goto out_err; 30868b3e1a56SAlex Elder 30878b3e1a56SAlex Elder return; 30888b3e1a56SAlex Elder out_err: 30898b3e1a56SAlex Elder if (img_request) 30908b3e1a56SAlex Elder rbd_img_request_put(img_request); 30918b3e1a56SAlex Elder obj_request->result = result; 30928b3e1a56SAlex Elder obj_request->xferred = 0; 30938b3e1a56SAlex Elder obj_request_done_set(obj_request); 30948b3e1a56SAlex Elder } 30958b3e1a56SAlex Elder 309620e0af67SJosh Durgin static int rbd_obj_notify_ack_sync(struct rbd_device *rbd_dev, u64 notify_id) 3097b8d70035SAlex Elder { 3098b8d70035SAlex Elder struct rbd_obj_request *obj_request; 30992169238dSAlex Elder struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 3100b8d70035SAlex Elder int ret; 3101b8d70035SAlex Elder 3102b8d70035SAlex Elder obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0, 3103b8d70035SAlex Elder OBJ_REQUEST_NODATA); 3104b8d70035SAlex Elder if (!obj_request) 3105b8d70035SAlex Elder return -ENOMEM; 3106b8d70035SAlex Elder 3107b8d70035SAlex Elder ret = -ENOMEM; 31086d2940c8SGuangliang Zhao obj_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_READ, 1, 3109deb236b3SIlya Dryomov obj_request); 3110b8d70035SAlex Elder if (!obj_request->osd_req) 3111b8d70035SAlex Elder goto out; 3112b8d70035SAlex Elder 3113c99d2d4aSAlex Elder osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_NOTIFY_ACK, 3114cc4a38bdSAlex Elder notify_id, 0, 0); 
31159d4df01fSAlex Elder rbd_osd_req_format_read(obj_request); 3116430c28c3SAlex Elder 3117b8d70035SAlex Elder ret = rbd_obj_request_submit(osdc, obj_request); 3118cf81b60eSAlex Elder if (ret) 311920e0af67SJosh Durgin goto out; 312020e0af67SJosh Durgin ret = rbd_obj_request_wait(obj_request); 312120e0af67SJosh Durgin out: 3122b8d70035SAlex Elder rbd_obj_request_put(obj_request); 3123b8d70035SAlex Elder 3124b8d70035SAlex Elder return ret; 3125b8d70035SAlex Elder } 3126b8d70035SAlex Elder 3127b8d70035SAlex Elder static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data) 3128b8d70035SAlex Elder { 3129b8d70035SAlex Elder struct rbd_device *rbd_dev = (struct rbd_device *)data; 3130e627db08SAlex Elder int ret; 3131b8d70035SAlex Elder 3132b8d70035SAlex Elder if (!rbd_dev) 3133b8d70035SAlex Elder return; 3134b8d70035SAlex Elder 313537206ee5SAlex Elder dout("%s: \"%s\" notify_id %llu opcode %u\n", __func__, 3136b8d70035SAlex Elder rbd_dev->header_name, (unsigned long long)notify_id, 3137b8d70035SAlex Elder (unsigned int)opcode); 313852bb1f9bSIlya Dryomov 313952bb1f9bSIlya Dryomov /* 314052bb1f9bSIlya Dryomov * Until adequate refresh error handling is in place, there is 314152bb1f9bSIlya Dryomov * not much we can do here, except warn. 314252bb1f9bSIlya Dryomov * 314352bb1f9bSIlya Dryomov * See http://tracker.ceph.com/issues/5040 314452bb1f9bSIlya Dryomov */ 3145e627db08SAlex Elder ret = rbd_dev_refresh(rbd_dev); 3146e627db08SAlex Elder if (ret) 31479584d508SIlya Dryomov rbd_warn(rbd_dev, "refresh failed: %d", ret); 3148b8d70035SAlex Elder 314952bb1f9bSIlya Dryomov ret = rbd_obj_notify_ack_sync(rbd_dev, notify_id); 315052bb1f9bSIlya Dryomov if (ret) 31519584d508SIlya Dryomov rbd_warn(rbd_dev, "notify_ack ret %d", ret); 3152b8d70035SAlex Elder } 3153b8d70035SAlex Elder 31549969ebc5SAlex Elder /* 3155bb040aa0SIlya Dryomov * Send a (un)watch request and wait for the ack. Return a request 3156bb040aa0SIlya Dryomov * with a ref held on success or error. 
3157bb040aa0SIlya Dryomov */ 3158bb040aa0SIlya Dryomov static struct rbd_obj_request *rbd_obj_watch_request_helper( 3159bb040aa0SIlya Dryomov struct rbd_device *rbd_dev, 3160bb040aa0SIlya Dryomov bool watch) 3161bb040aa0SIlya Dryomov { 3162bb040aa0SIlya Dryomov struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 31632894e1d7SIlya Dryomov struct ceph_options *opts = osdc->client->options; 3164bb040aa0SIlya Dryomov struct rbd_obj_request *obj_request; 3165bb040aa0SIlya Dryomov int ret; 3166bb040aa0SIlya Dryomov 3167bb040aa0SIlya Dryomov obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0, 3168bb040aa0SIlya Dryomov OBJ_REQUEST_NODATA); 3169bb040aa0SIlya Dryomov if (!obj_request) 3170bb040aa0SIlya Dryomov return ERR_PTR(-ENOMEM); 3171bb040aa0SIlya Dryomov 31726d2940c8SGuangliang Zhao obj_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_WRITE, 1, 3173bb040aa0SIlya Dryomov obj_request); 3174bb040aa0SIlya Dryomov if (!obj_request->osd_req) { 3175bb040aa0SIlya Dryomov ret = -ENOMEM; 3176bb040aa0SIlya Dryomov goto out; 3177bb040aa0SIlya Dryomov } 3178bb040aa0SIlya Dryomov 3179bb040aa0SIlya Dryomov osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_WATCH, 3180bb040aa0SIlya Dryomov rbd_dev->watch_event->cookie, 0, watch); 3181bb040aa0SIlya Dryomov rbd_osd_req_format_write(obj_request); 3182bb040aa0SIlya Dryomov 3183bb040aa0SIlya Dryomov if (watch) 3184bb040aa0SIlya Dryomov ceph_osdc_set_request_linger(osdc, obj_request->osd_req); 3185bb040aa0SIlya Dryomov 3186bb040aa0SIlya Dryomov ret = rbd_obj_request_submit(osdc, obj_request); 3187bb040aa0SIlya Dryomov if (ret) 3188bb040aa0SIlya Dryomov goto out; 3189bb040aa0SIlya Dryomov 31902894e1d7SIlya Dryomov ret = rbd_obj_request_wait_timeout(obj_request, opts->mount_timeout); 3191bb040aa0SIlya Dryomov if (ret) 3192bb040aa0SIlya Dryomov goto out; 3193bb040aa0SIlya Dryomov 3194bb040aa0SIlya Dryomov ret = obj_request->result; 3195bb040aa0SIlya Dryomov if (ret) { 3196bb040aa0SIlya Dryomov if 
(watch) 3197bb040aa0SIlya Dryomov rbd_obj_request_end(obj_request); 3198bb040aa0SIlya Dryomov goto out; 3199bb040aa0SIlya Dryomov } 3200bb040aa0SIlya Dryomov 3201bb040aa0SIlya Dryomov return obj_request; 3202bb040aa0SIlya Dryomov 3203bb040aa0SIlya Dryomov out: 3204bb040aa0SIlya Dryomov rbd_obj_request_put(obj_request); 3205bb040aa0SIlya Dryomov return ERR_PTR(ret); 3206bb040aa0SIlya Dryomov } 3207bb040aa0SIlya Dryomov 3208bb040aa0SIlya Dryomov /* 3209b30a01f2SIlya Dryomov * Initiate a watch request, synchronously. 32109969ebc5SAlex Elder */ 3211b30a01f2SIlya Dryomov static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev) 32129969ebc5SAlex Elder { 32139969ebc5SAlex Elder struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 32149969ebc5SAlex Elder struct rbd_obj_request *obj_request; 32159969ebc5SAlex Elder int ret; 32169969ebc5SAlex Elder 3217b30a01f2SIlya Dryomov rbd_assert(!rbd_dev->watch_event); 3218b30a01f2SIlya Dryomov rbd_assert(!rbd_dev->watch_request); 32199969ebc5SAlex Elder 32203c663bbdSAlex Elder ret = ceph_osdc_create_event(osdc, rbd_watch_cb, rbd_dev, 32219969ebc5SAlex Elder &rbd_dev->watch_event); 32229969ebc5SAlex Elder if (ret < 0) 32239969ebc5SAlex Elder return ret; 32249969ebc5SAlex Elder 322576756a51SIlya Dryomov obj_request = rbd_obj_watch_request_helper(rbd_dev, true); 322676756a51SIlya Dryomov if (IS_ERR(obj_request)) { 322776756a51SIlya Dryomov ceph_osdc_cancel_event(rbd_dev->watch_event); 322876756a51SIlya Dryomov rbd_dev->watch_event = NULL; 322976756a51SIlya Dryomov return PTR_ERR(obj_request); 3230b30a01f2SIlya Dryomov } 32319969ebc5SAlex Elder 32328eb87565SAlex Elder /* 32338eb87565SAlex Elder * A watch request is set to linger, so the underlying osd 32348eb87565SAlex Elder * request won't go away until we unregister it. We retain 32358eb87565SAlex Elder * a pointer to the object request during that time (in 323676756a51SIlya Dryomov * rbd_dev->watch_request), so we'll keep a reference to it. 
323776756a51SIlya Dryomov * We'll drop that reference after we've unregistered it in 323876756a51SIlya Dryomov * rbd_dev_header_unwatch_sync(). 32398eb87565SAlex Elder */ 32408eb87565SAlex Elder rbd_dev->watch_request = obj_request; 32418eb87565SAlex Elder 32428eb87565SAlex Elder return 0; 32439969ebc5SAlex Elder } 32449969ebc5SAlex Elder 3245b30a01f2SIlya Dryomov /* 3246b30a01f2SIlya Dryomov * Tear down a watch request, synchronously. 3247b30a01f2SIlya Dryomov */ 324876756a51SIlya Dryomov static void rbd_dev_header_unwatch_sync(struct rbd_device *rbd_dev) 3249fca27065SIlya Dryomov { 3250b30a01f2SIlya Dryomov struct rbd_obj_request *obj_request; 3251b30a01f2SIlya Dryomov 3252b30a01f2SIlya Dryomov rbd_assert(rbd_dev->watch_event); 3253b30a01f2SIlya Dryomov rbd_assert(rbd_dev->watch_request); 3254b30a01f2SIlya Dryomov 325576756a51SIlya Dryomov rbd_obj_request_end(rbd_dev->watch_request); 3256b30a01f2SIlya Dryomov rbd_obj_request_put(rbd_dev->watch_request); 3257b30a01f2SIlya Dryomov rbd_dev->watch_request = NULL; 3258b30a01f2SIlya Dryomov 325976756a51SIlya Dryomov obj_request = rbd_obj_watch_request_helper(rbd_dev, false); 326076756a51SIlya Dryomov if (!IS_ERR(obj_request)) 3261b30a01f2SIlya Dryomov rbd_obj_request_put(obj_request); 326276756a51SIlya Dryomov else 326376756a51SIlya Dryomov rbd_warn(rbd_dev, "unable to tear down watch request (%ld)", 326476756a51SIlya Dryomov PTR_ERR(obj_request)); 326576756a51SIlya Dryomov 3266b30a01f2SIlya Dryomov ceph_osdc_cancel_event(rbd_dev->watch_event); 3267b30a01f2SIlya Dryomov rbd_dev->watch_event = NULL; 3268fca27065SIlya Dryomov } 3269fca27065SIlya Dryomov 327036be9a76SAlex Elder /* 3271f40eb349SAlex Elder * Synchronous osd object method call. Returns the number of bytes 3272f40eb349SAlex Elder * returned in the outbound buffer, or a negative error code. 
327336be9a76SAlex Elder */ 327436be9a76SAlex Elder static int rbd_obj_method_sync(struct rbd_device *rbd_dev, 327536be9a76SAlex Elder const char *object_name, 327636be9a76SAlex Elder const char *class_name, 327736be9a76SAlex Elder const char *method_name, 32784157976bSAlex Elder const void *outbound, 327936be9a76SAlex Elder size_t outbound_size, 32804157976bSAlex Elder void *inbound, 3281e2a58ee5SAlex Elder size_t inbound_size) 328236be9a76SAlex Elder { 32832169238dSAlex Elder struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 328436be9a76SAlex Elder struct rbd_obj_request *obj_request; 328536be9a76SAlex Elder struct page **pages; 328636be9a76SAlex Elder u32 page_count; 328736be9a76SAlex Elder int ret; 328836be9a76SAlex Elder 328936be9a76SAlex Elder /* 32906010a451SAlex Elder * Method calls are ultimately read operations. The result 32916010a451SAlex Elder * should placed into the inbound buffer provided. They 32926010a451SAlex Elder * also supply outbound data--parameters for the object 32936010a451SAlex Elder * method. Currently if this is present it will be a 32946010a451SAlex Elder * snapshot id. 
329536be9a76SAlex Elder */ 329636be9a76SAlex Elder page_count = (u32)calc_pages_for(0, inbound_size); 329736be9a76SAlex Elder pages = ceph_alloc_page_vector(page_count, GFP_KERNEL); 329836be9a76SAlex Elder if (IS_ERR(pages)) 329936be9a76SAlex Elder return PTR_ERR(pages); 330036be9a76SAlex Elder 330136be9a76SAlex Elder ret = -ENOMEM; 33026010a451SAlex Elder obj_request = rbd_obj_request_create(object_name, 0, inbound_size, 330336be9a76SAlex Elder OBJ_REQUEST_PAGES); 330436be9a76SAlex Elder if (!obj_request) 330536be9a76SAlex Elder goto out; 330636be9a76SAlex Elder 330736be9a76SAlex Elder obj_request->pages = pages; 330836be9a76SAlex Elder obj_request->page_count = page_count; 330936be9a76SAlex Elder 33106d2940c8SGuangliang Zhao obj_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_READ, 1, 3311deb236b3SIlya Dryomov obj_request); 331236be9a76SAlex Elder if (!obj_request->osd_req) 331336be9a76SAlex Elder goto out; 331436be9a76SAlex Elder 3315c99d2d4aSAlex Elder osd_req_op_cls_init(obj_request->osd_req, 0, CEPH_OSD_OP_CALL, 331604017e29SAlex Elder class_name, method_name); 331704017e29SAlex Elder if (outbound_size) { 331804017e29SAlex Elder struct ceph_pagelist *pagelist; 331904017e29SAlex Elder 332004017e29SAlex Elder pagelist = kmalloc(sizeof (*pagelist), GFP_NOFS); 332104017e29SAlex Elder if (!pagelist) 332204017e29SAlex Elder goto out; 332304017e29SAlex Elder 332404017e29SAlex Elder ceph_pagelist_init(pagelist); 332504017e29SAlex Elder ceph_pagelist_append(pagelist, outbound, outbound_size); 332604017e29SAlex Elder osd_req_op_cls_request_data_pagelist(obj_request->osd_req, 0, 332704017e29SAlex Elder pagelist); 332804017e29SAlex Elder } 3329a4ce40a9SAlex Elder osd_req_op_cls_response_data_pages(obj_request->osd_req, 0, 3330a4ce40a9SAlex Elder obj_request->pages, inbound_size, 333144cd188dSAlex Elder 0, false, false); 33329d4df01fSAlex Elder rbd_osd_req_format_read(obj_request); 3333430c28c3SAlex Elder 333436be9a76SAlex Elder ret = rbd_obj_request_submit(osdc, 
obj_request); 333536be9a76SAlex Elder if (ret) 333636be9a76SAlex Elder goto out; 333736be9a76SAlex Elder ret = rbd_obj_request_wait(obj_request); 333836be9a76SAlex Elder if (ret) 333936be9a76SAlex Elder goto out; 334036be9a76SAlex Elder 334136be9a76SAlex Elder ret = obj_request->result; 334236be9a76SAlex Elder if (ret < 0) 334336be9a76SAlex Elder goto out; 334457385b51SAlex Elder 334557385b51SAlex Elder rbd_assert(obj_request->xferred < (u64)INT_MAX); 334657385b51SAlex Elder ret = (int)obj_request->xferred; 3347903bb32eSAlex Elder ceph_copy_from_page_vector(pages, inbound, 0, obj_request->xferred); 334836be9a76SAlex Elder out: 334936be9a76SAlex Elder if (obj_request) 335036be9a76SAlex Elder rbd_obj_request_put(obj_request); 335136be9a76SAlex Elder else 335236be9a76SAlex Elder ceph_release_page_vector(pages, page_count); 335336be9a76SAlex Elder 335436be9a76SAlex Elder return ret; 335536be9a76SAlex Elder } 335636be9a76SAlex Elder 33577ad18afaSChristoph Hellwig static void rbd_queue_workfn(struct work_struct *work) 3358bc1ecc65SIlya Dryomov { 33597ad18afaSChristoph Hellwig struct request *rq = blk_mq_rq_from_pdu(work); 33607ad18afaSChristoph Hellwig struct rbd_device *rbd_dev = rq->q->queuedata; 3361bc1ecc65SIlya Dryomov struct rbd_img_request *img_request; 33624e752f0aSJosh Durgin struct ceph_snap_context *snapc = NULL; 3363bc1ecc65SIlya Dryomov u64 offset = (u64)blk_rq_pos(rq) << SECTOR_SHIFT; 3364bc1ecc65SIlya Dryomov u64 length = blk_rq_bytes(rq); 33656d2940c8SGuangliang Zhao enum obj_operation_type op_type; 33664e752f0aSJosh Durgin u64 mapping_size; 3367bc1ecc65SIlya Dryomov int result; 3368bc1ecc65SIlya Dryomov 33697ad18afaSChristoph Hellwig if (rq->cmd_type != REQ_TYPE_FS) { 33707ad18afaSChristoph Hellwig dout("%s: non-fs request type %d\n", __func__, 33717ad18afaSChristoph Hellwig (int) rq->cmd_type); 33727ad18afaSChristoph Hellwig result = -EIO; 33737ad18afaSChristoph Hellwig goto err; 33747ad18afaSChristoph Hellwig } 33757ad18afaSChristoph Hellwig 
337690e98c52SGuangliang Zhao if (rq->cmd_flags & REQ_DISCARD) 337790e98c52SGuangliang Zhao op_type = OBJ_OP_DISCARD; 337890e98c52SGuangliang Zhao else if (rq->cmd_flags & REQ_WRITE) 33796d2940c8SGuangliang Zhao op_type = OBJ_OP_WRITE; 33806d2940c8SGuangliang Zhao else 33816d2940c8SGuangliang Zhao op_type = OBJ_OP_READ; 33826d2940c8SGuangliang Zhao 3383bc1ecc65SIlya Dryomov /* Ignore/skip any zero-length requests */ 3384bc1ecc65SIlya Dryomov 3385bc1ecc65SIlya Dryomov if (!length) { 3386bc1ecc65SIlya Dryomov dout("%s: zero-length request\n", __func__); 3387bc1ecc65SIlya Dryomov result = 0; 3388bc1ecc65SIlya Dryomov goto err_rq; 3389bc1ecc65SIlya Dryomov } 3390bc1ecc65SIlya Dryomov 33916d2940c8SGuangliang Zhao /* Only reads are allowed to a read-only device */ 3392bc1ecc65SIlya Dryomov 33936d2940c8SGuangliang Zhao if (op_type != OBJ_OP_READ) { 3394bc1ecc65SIlya Dryomov if (rbd_dev->mapping.read_only) { 3395bc1ecc65SIlya Dryomov result = -EROFS; 3396bc1ecc65SIlya Dryomov goto err_rq; 3397bc1ecc65SIlya Dryomov } 3398bc1ecc65SIlya Dryomov rbd_assert(rbd_dev->spec->snap_id == CEPH_NOSNAP); 3399bc1ecc65SIlya Dryomov } 3400bc1ecc65SIlya Dryomov 3401bc1ecc65SIlya Dryomov /* 3402bc1ecc65SIlya Dryomov * Quit early if the mapped snapshot no longer exists. It's 3403bc1ecc65SIlya Dryomov * still possible the snapshot will have disappeared by the 3404bc1ecc65SIlya Dryomov * time our request arrives at the osd, but there's no sense in 3405bc1ecc65SIlya Dryomov * sending it if we already know. 
3406bc1ecc65SIlya Dryomov */ 3407bc1ecc65SIlya Dryomov if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) { 3408bc1ecc65SIlya Dryomov dout("request for non-existent snapshot"); 3409bc1ecc65SIlya Dryomov rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP); 3410bc1ecc65SIlya Dryomov result = -ENXIO; 3411bc1ecc65SIlya Dryomov goto err_rq; 3412bc1ecc65SIlya Dryomov } 3413bc1ecc65SIlya Dryomov 3414bc1ecc65SIlya Dryomov if (offset && length > U64_MAX - offset + 1) { 3415bc1ecc65SIlya Dryomov rbd_warn(rbd_dev, "bad request range (%llu~%llu)", offset, 3416bc1ecc65SIlya Dryomov length); 3417bc1ecc65SIlya Dryomov result = -EINVAL; 3418bc1ecc65SIlya Dryomov goto err_rq; /* Shouldn't happen */ 3419bc1ecc65SIlya Dryomov } 3420bc1ecc65SIlya Dryomov 34217ad18afaSChristoph Hellwig blk_mq_start_request(rq); 34227ad18afaSChristoph Hellwig 34234e752f0aSJosh Durgin down_read(&rbd_dev->header_rwsem); 34244e752f0aSJosh Durgin mapping_size = rbd_dev->mapping.size; 34256d2940c8SGuangliang Zhao if (op_type != OBJ_OP_READ) { 34264e752f0aSJosh Durgin snapc = rbd_dev->header.snapc; 34274e752f0aSJosh Durgin ceph_get_snap_context(snapc); 34284e752f0aSJosh Durgin } 34294e752f0aSJosh Durgin up_read(&rbd_dev->header_rwsem); 34304e752f0aSJosh Durgin 34314e752f0aSJosh Durgin if (offset + length > mapping_size) { 3432bc1ecc65SIlya Dryomov rbd_warn(rbd_dev, "beyond EOD (%llu~%llu > %llu)", offset, 34334e752f0aSJosh Durgin length, mapping_size); 3434bc1ecc65SIlya Dryomov result = -EIO; 3435bc1ecc65SIlya Dryomov goto err_rq; 3436bc1ecc65SIlya Dryomov } 3437bc1ecc65SIlya Dryomov 34386d2940c8SGuangliang Zhao img_request = rbd_img_request_create(rbd_dev, offset, length, op_type, 34394e752f0aSJosh Durgin snapc); 3440bc1ecc65SIlya Dryomov if (!img_request) { 3441bc1ecc65SIlya Dryomov result = -ENOMEM; 3442bc1ecc65SIlya Dryomov goto err_rq; 3443bc1ecc65SIlya Dryomov } 3444bc1ecc65SIlya Dryomov img_request->rq = rq; 3445bc1ecc65SIlya Dryomov 344690e98c52SGuangliang Zhao if (op_type == OBJ_OP_DISCARD) 
344790e98c52SGuangliang Zhao result = rbd_img_request_fill(img_request, OBJ_REQUEST_NODATA, 344890e98c52SGuangliang Zhao NULL); 344990e98c52SGuangliang Zhao else 345090e98c52SGuangliang Zhao result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO, 345190e98c52SGuangliang Zhao rq->bio); 3452bc1ecc65SIlya Dryomov if (result) 3453bc1ecc65SIlya Dryomov goto err_img_request; 3454bc1ecc65SIlya Dryomov 3455bc1ecc65SIlya Dryomov result = rbd_img_request_submit(img_request); 3456bc1ecc65SIlya Dryomov if (result) 3457bc1ecc65SIlya Dryomov goto err_img_request; 3458bc1ecc65SIlya Dryomov 3459bc1ecc65SIlya Dryomov return; 3460bc1ecc65SIlya Dryomov 3461bc1ecc65SIlya Dryomov err_img_request: 3462bc1ecc65SIlya Dryomov rbd_img_request_put(img_request); 3463bc1ecc65SIlya Dryomov err_rq: 3464bc1ecc65SIlya Dryomov if (result) 3465bc1ecc65SIlya Dryomov rbd_warn(rbd_dev, "%s %llx at %llx result %d", 34666d2940c8SGuangliang Zhao obj_op_name(op_type), length, offset, result); 34674e752f0aSJosh Durgin ceph_put_snap_context(snapc); 34687ad18afaSChristoph Hellwig err: 34697ad18afaSChristoph Hellwig blk_mq_end_request(rq, result); 3470bc1ecc65SIlya Dryomov } 3471bc1ecc65SIlya Dryomov 34727ad18afaSChristoph Hellwig static int rbd_queue_rq(struct blk_mq_hw_ctx *hctx, 34737ad18afaSChristoph Hellwig const struct blk_mq_queue_data *bd) 3474bc1ecc65SIlya Dryomov { 34757ad18afaSChristoph Hellwig struct request *rq = bd->rq; 34767ad18afaSChristoph Hellwig struct work_struct *work = blk_mq_rq_to_pdu(rq); 3477bc1ecc65SIlya Dryomov 34787ad18afaSChristoph Hellwig queue_work(rbd_wq, work); 34797ad18afaSChristoph Hellwig return BLK_MQ_RQ_QUEUE_OK; 3480bf0d5f50SAlex Elder } 3481bf0d5f50SAlex Elder 3482602adf40SYehuda Sadeh static void rbd_free_disk(struct rbd_device *rbd_dev) 3483602adf40SYehuda Sadeh { 3484602adf40SYehuda Sadeh struct gendisk *disk = rbd_dev->disk; 3485602adf40SYehuda Sadeh 3486602adf40SYehuda Sadeh if (!disk) 3487602adf40SYehuda Sadeh return; 3488602adf40SYehuda Sadeh 3489a0cab924SAlex 
Elder rbd_dev->disk = NULL; 3490a0cab924SAlex Elder if (disk->flags & GENHD_FL_UP) { 3491602adf40SYehuda Sadeh del_gendisk(disk); 3492602adf40SYehuda Sadeh if (disk->queue) 3493602adf40SYehuda Sadeh blk_cleanup_queue(disk->queue); 34947ad18afaSChristoph Hellwig blk_mq_free_tag_set(&rbd_dev->tag_set); 3495a0cab924SAlex Elder } 3496602adf40SYehuda Sadeh put_disk(disk); 3497602adf40SYehuda Sadeh } 3498602adf40SYehuda Sadeh 3499788e2df3SAlex Elder static int rbd_obj_read_sync(struct rbd_device *rbd_dev, 3500788e2df3SAlex Elder const char *object_name, 35017097f8dfSAlex Elder u64 offset, u64 length, void *buf) 3502788e2df3SAlex Elder 3503788e2df3SAlex Elder { 35042169238dSAlex Elder struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 3505788e2df3SAlex Elder struct rbd_obj_request *obj_request; 3506788e2df3SAlex Elder struct page **pages = NULL; 3507788e2df3SAlex Elder u32 page_count; 35081ceae7efSAlex Elder size_t size; 3509788e2df3SAlex Elder int ret; 3510788e2df3SAlex Elder 3511788e2df3SAlex Elder page_count = (u32) calc_pages_for(offset, length); 3512788e2df3SAlex Elder pages = ceph_alloc_page_vector(page_count, GFP_KERNEL); 3513788e2df3SAlex Elder if (IS_ERR(pages)) 3514a8d42056SJan Kara return PTR_ERR(pages); 3515788e2df3SAlex Elder 3516788e2df3SAlex Elder ret = -ENOMEM; 3517788e2df3SAlex Elder obj_request = rbd_obj_request_create(object_name, offset, length, 3518788e2df3SAlex Elder OBJ_REQUEST_PAGES); 3519788e2df3SAlex Elder if (!obj_request) 3520788e2df3SAlex Elder goto out; 3521788e2df3SAlex Elder 3522788e2df3SAlex Elder obj_request->pages = pages; 3523788e2df3SAlex Elder obj_request->page_count = page_count; 3524788e2df3SAlex Elder 35256d2940c8SGuangliang Zhao obj_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_READ, 1, 3526deb236b3SIlya Dryomov obj_request); 3527788e2df3SAlex Elder if (!obj_request->osd_req) 3528788e2df3SAlex Elder goto out; 3529788e2df3SAlex Elder 3530c99d2d4aSAlex Elder osd_req_op_extent_init(obj_request->osd_req, 0, 
CEPH_OSD_OP_READ, 3531c99d2d4aSAlex Elder offset, length, 0, 0); 3532406e2c9fSAlex Elder osd_req_op_extent_osd_data_pages(obj_request->osd_req, 0, 3533a4ce40a9SAlex Elder obj_request->pages, 353444cd188dSAlex Elder obj_request->length, 353544cd188dSAlex Elder obj_request->offset & ~PAGE_MASK, 353644cd188dSAlex Elder false, false); 35379d4df01fSAlex Elder rbd_osd_req_format_read(obj_request); 3538430c28c3SAlex Elder 3539788e2df3SAlex Elder ret = rbd_obj_request_submit(osdc, obj_request); 3540788e2df3SAlex Elder if (ret) 3541788e2df3SAlex Elder goto out; 3542788e2df3SAlex Elder ret = rbd_obj_request_wait(obj_request); 3543788e2df3SAlex Elder if (ret) 3544788e2df3SAlex Elder goto out; 3545788e2df3SAlex Elder 3546788e2df3SAlex Elder ret = obj_request->result; 3547788e2df3SAlex Elder if (ret < 0) 3548788e2df3SAlex Elder goto out; 35491ceae7efSAlex Elder 35501ceae7efSAlex Elder rbd_assert(obj_request->xferred <= (u64) SIZE_MAX); 35511ceae7efSAlex Elder size = (size_t) obj_request->xferred; 3552903bb32eSAlex Elder ceph_copy_from_page_vector(pages, buf, 0, size); 355323ed6e13SAlex Elder rbd_assert(size <= (size_t)INT_MAX); 355423ed6e13SAlex Elder ret = (int)size; 3555788e2df3SAlex Elder out: 3556788e2df3SAlex Elder if (obj_request) 3557788e2df3SAlex Elder rbd_obj_request_put(obj_request); 3558788e2df3SAlex Elder else 3559788e2df3SAlex Elder ceph_release_page_vector(pages, page_count); 3560788e2df3SAlex Elder 3561788e2df3SAlex Elder return ret; 3562788e2df3SAlex Elder } 3563788e2df3SAlex Elder 3564602adf40SYehuda Sadeh /* 3565662518b1SAlex Elder * Read the complete header for the given rbd device. On successful 3566662518b1SAlex Elder * return, the rbd_dev->header field will contain up-to-date 3567662518b1SAlex Elder * information about the image. 
35684156d998SAlex Elder */ 356999a41ebcSAlex Elder static int rbd_dev_v1_header_info(struct rbd_device *rbd_dev) 35704156d998SAlex Elder { 35714156d998SAlex Elder struct rbd_image_header_ondisk *ondisk = NULL; 35724156d998SAlex Elder u32 snap_count = 0; 35734156d998SAlex Elder u64 names_size = 0; 35744156d998SAlex Elder u32 want_count; 35754156d998SAlex Elder int ret; 35764156d998SAlex Elder 35774156d998SAlex Elder /* 35784156d998SAlex Elder * The complete header will include an array of its 64-bit 35794156d998SAlex Elder * snapshot ids, followed by the names of those snapshots as 35804156d998SAlex Elder * a contiguous block of NUL-terminated strings. Note that 35814156d998SAlex Elder * the number of snapshots could change by the time we read 35824156d998SAlex Elder * it in, in which case we re-read it. 35834156d998SAlex Elder */ 35844156d998SAlex Elder do { 35854156d998SAlex Elder size_t size; 35864156d998SAlex Elder 35874156d998SAlex Elder kfree(ondisk); 35884156d998SAlex Elder 35894156d998SAlex Elder size = sizeof (*ondisk); 35904156d998SAlex Elder size += snap_count * sizeof (struct rbd_image_snap_ondisk); 35914156d998SAlex Elder size += names_size; 35924156d998SAlex Elder ondisk = kmalloc(size, GFP_KERNEL); 35934156d998SAlex Elder if (!ondisk) 3594662518b1SAlex Elder return -ENOMEM; 35954156d998SAlex Elder 3596788e2df3SAlex Elder ret = rbd_obj_read_sync(rbd_dev, rbd_dev->header_name, 35977097f8dfSAlex Elder 0, size, ondisk); 35984156d998SAlex Elder if (ret < 0) 3599662518b1SAlex Elder goto out; 3600c0cd10dbSAlex Elder if ((size_t)ret < size) { 36014156d998SAlex Elder ret = -ENXIO; 360206ecc6cbSAlex Elder rbd_warn(rbd_dev, "short header read (want %zd got %d)", 360306ecc6cbSAlex Elder size, ret); 3604662518b1SAlex Elder goto out; 36054156d998SAlex Elder } 36064156d998SAlex Elder if (!rbd_dev_ondisk_valid(ondisk)) { 36074156d998SAlex Elder ret = -ENXIO; 360806ecc6cbSAlex Elder rbd_warn(rbd_dev, "invalid header"); 3609662518b1SAlex Elder goto out; 
36104156d998SAlex Elder } 36114156d998SAlex Elder 36124156d998SAlex Elder names_size = le64_to_cpu(ondisk->snap_names_len); 36134156d998SAlex Elder want_count = snap_count; 36144156d998SAlex Elder snap_count = le32_to_cpu(ondisk->snap_count); 36154156d998SAlex Elder } while (snap_count != want_count); 36164156d998SAlex Elder 3617662518b1SAlex Elder ret = rbd_header_from_disk(rbd_dev, ondisk); 3618662518b1SAlex Elder out: 36194156d998SAlex Elder kfree(ondisk); 36204156d998SAlex Elder 3621dfc5606dSYehuda Sadeh return ret; 3622602adf40SYehuda Sadeh } 3623602adf40SYehuda Sadeh 362415228edeSAlex Elder /* 362515228edeSAlex Elder * Clear the rbd device's EXISTS flag if the snapshot it's mapped to 362615228edeSAlex Elder * has disappeared from the (just updated) snapshot context. 362715228edeSAlex Elder */ 362815228edeSAlex Elder static void rbd_exists_validate(struct rbd_device *rbd_dev) 362915228edeSAlex Elder { 363015228edeSAlex Elder u64 snap_id; 363115228edeSAlex Elder 363215228edeSAlex Elder if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) 363315228edeSAlex Elder return; 363415228edeSAlex Elder 363515228edeSAlex Elder snap_id = rbd_dev->spec->snap_id; 363615228edeSAlex Elder if (snap_id == CEPH_NOSNAP) 363715228edeSAlex Elder return; 363815228edeSAlex Elder 363915228edeSAlex Elder if (rbd_dev_snap_index(rbd_dev, snap_id) == BAD_SNAP_INDEX) 364015228edeSAlex Elder clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags); 364115228edeSAlex Elder } 364215228edeSAlex Elder 36439875201eSJosh Durgin static void rbd_dev_update_size(struct rbd_device *rbd_dev) 36449875201eSJosh Durgin { 36459875201eSJosh Durgin sector_t size; 36469875201eSJosh Durgin bool removing; 36479875201eSJosh Durgin 36489875201eSJosh Durgin /* 36499875201eSJosh Durgin * Don't hold the lock while doing disk operations, 36509875201eSJosh Durgin * or lock ordering will conflict with the bdev mutex via: 36519875201eSJosh Durgin * rbd_add() -> blkdev_get() -> rbd_open() 36529875201eSJosh Durgin */ 
36539875201eSJosh Durgin spin_lock_irq(&rbd_dev->lock); 36549875201eSJosh Durgin removing = test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags); 36559875201eSJosh Durgin spin_unlock_irq(&rbd_dev->lock); 36569875201eSJosh Durgin /* 36579875201eSJosh Durgin * If the device is being removed, rbd_dev->disk has 36589875201eSJosh Durgin * been destroyed, so don't try to update its size 36599875201eSJosh Durgin */ 36609875201eSJosh Durgin if (!removing) { 36619875201eSJosh Durgin size = (sector_t)rbd_dev->mapping.size / SECTOR_SIZE; 36629875201eSJosh Durgin dout("setting size to %llu sectors", (unsigned long long)size); 36639875201eSJosh Durgin set_capacity(rbd_dev->disk, size); 36649875201eSJosh Durgin revalidate_disk(rbd_dev->disk); 36659875201eSJosh Durgin } 36669875201eSJosh Durgin } 36679875201eSJosh Durgin 3668cc4a38bdSAlex Elder static int rbd_dev_refresh(struct rbd_device *rbd_dev) 36691fe5e993SAlex Elder { 3670e627db08SAlex Elder u64 mapping_size; 36711fe5e993SAlex Elder int ret; 36721fe5e993SAlex Elder 3673cfbf6377SAlex Elder down_write(&rbd_dev->header_rwsem); 36743b5cf2a2SAlex Elder mapping_size = rbd_dev->mapping.size; 3675a720ae09SIlya Dryomov 3676a720ae09SIlya Dryomov ret = rbd_dev_header_info(rbd_dev); 367752bb1f9bSIlya Dryomov if (ret) 367873e39e4dSIlya Dryomov goto out; 367915228edeSAlex Elder 3680e8f59b59SIlya Dryomov /* 3681e8f59b59SIlya Dryomov * If there is a parent, see if it has disappeared due to the 3682e8f59b59SIlya Dryomov * mapped image getting flattened. 
3683e8f59b59SIlya Dryomov */ 3684e8f59b59SIlya Dryomov if (rbd_dev->parent) { 3685e8f59b59SIlya Dryomov ret = rbd_dev_v2_parent_info(rbd_dev); 3686e8f59b59SIlya Dryomov if (ret) 368773e39e4dSIlya Dryomov goto out; 3688e8f59b59SIlya Dryomov } 3689e8f59b59SIlya Dryomov 36905ff1108cSIlya Dryomov if (rbd_dev->spec->snap_id == CEPH_NOSNAP) { 36915ff1108cSIlya Dryomov rbd_dev->mapping.size = rbd_dev->header.image_size; 36925ff1108cSIlya Dryomov } else { 36935ff1108cSIlya Dryomov /* validate mapped snapshot's EXISTS flag */ 369415228edeSAlex Elder rbd_exists_validate(rbd_dev); 36955ff1108cSIlya Dryomov } 36965ff1108cSIlya Dryomov 369773e39e4dSIlya Dryomov out: 3698cfbf6377SAlex Elder up_write(&rbd_dev->header_rwsem); 369973e39e4dSIlya Dryomov if (!ret && mapping_size != rbd_dev->mapping.size) 37009875201eSJosh Durgin rbd_dev_update_size(rbd_dev); 37011fe5e993SAlex Elder 370273e39e4dSIlya Dryomov return ret; 37031fe5e993SAlex Elder } 37041fe5e993SAlex Elder 37057ad18afaSChristoph Hellwig static int rbd_init_request(void *data, struct request *rq, 37067ad18afaSChristoph Hellwig unsigned int hctx_idx, unsigned int request_idx, 37077ad18afaSChristoph Hellwig unsigned int numa_node) 37087ad18afaSChristoph Hellwig { 37097ad18afaSChristoph Hellwig struct work_struct *work = blk_mq_rq_to_pdu(rq); 37107ad18afaSChristoph Hellwig 37117ad18afaSChristoph Hellwig INIT_WORK(work, rbd_queue_workfn); 37127ad18afaSChristoph Hellwig return 0; 37137ad18afaSChristoph Hellwig } 37147ad18afaSChristoph Hellwig 37157ad18afaSChristoph Hellwig static struct blk_mq_ops rbd_mq_ops = { 37167ad18afaSChristoph Hellwig .queue_rq = rbd_queue_rq, 37177ad18afaSChristoph Hellwig .map_queue = blk_mq_map_queue, 37187ad18afaSChristoph Hellwig .init_request = rbd_init_request, 37197ad18afaSChristoph Hellwig }; 37207ad18afaSChristoph Hellwig 3721602adf40SYehuda Sadeh static int rbd_init_disk(struct rbd_device *rbd_dev) 3722602adf40SYehuda Sadeh { 3723602adf40SYehuda Sadeh struct gendisk *disk; 3724602adf40SYehuda 
Sadeh struct request_queue *q; 3725593a9e7bSAlex Elder u64 segment_size; 37267ad18afaSChristoph Hellwig int err; 3727602adf40SYehuda Sadeh 3728602adf40SYehuda Sadeh /* create gendisk info */ 37297e513d43SIlya Dryomov disk = alloc_disk(single_major ? 37307e513d43SIlya Dryomov (1 << RBD_SINGLE_MAJOR_PART_SHIFT) : 37317e513d43SIlya Dryomov RBD_MINORS_PER_MAJOR); 3732602adf40SYehuda Sadeh if (!disk) 37331fcdb8aaSAlex Elder return -ENOMEM; 3734602adf40SYehuda Sadeh 3735f0f8cef5SAlex Elder snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d", 3736de71a297SAlex Elder rbd_dev->dev_id); 3737602adf40SYehuda Sadeh disk->major = rbd_dev->major; 3738dd82fff1SIlya Dryomov disk->first_minor = rbd_dev->minor; 37397e513d43SIlya Dryomov if (single_major) 37407e513d43SIlya Dryomov disk->flags |= GENHD_FL_EXT_DEVT; 3741602adf40SYehuda Sadeh disk->fops = &rbd_bd_ops; 3742602adf40SYehuda Sadeh disk->private_data = rbd_dev; 3743602adf40SYehuda Sadeh 37447ad18afaSChristoph Hellwig memset(&rbd_dev->tag_set, 0, sizeof(rbd_dev->tag_set)); 37457ad18afaSChristoph Hellwig rbd_dev->tag_set.ops = &rbd_mq_ops; 3746b5584180SIlya Dryomov rbd_dev->tag_set.queue_depth = rbd_dev->opts->queue_depth; 37477ad18afaSChristoph Hellwig rbd_dev->tag_set.numa_node = NUMA_NO_NODE; 3748b5584180SIlya Dryomov rbd_dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE; 37497ad18afaSChristoph Hellwig rbd_dev->tag_set.nr_hw_queues = 1; 37507ad18afaSChristoph Hellwig rbd_dev->tag_set.cmd_size = sizeof(struct work_struct); 37517ad18afaSChristoph Hellwig 37527ad18afaSChristoph Hellwig err = blk_mq_alloc_tag_set(&rbd_dev->tag_set); 37537ad18afaSChristoph Hellwig if (err) 3754602adf40SYehuda Sadeh goto out_disk; 3755029bcbd8SJosh Durgin 37567ad18afaSChristoph Hellwig q = blk_mq_init_queue(&rbd_dev->tag_set); 37577ad18afaSChristoph Hellwig if (IS_ERR(q)) { 37587ad18afaSChristoph Hellwig err = PTR_ERR(q); 37597ad18afaSChristoph Hellwig goto out_tag_set; 37607ad18afaSChristoph Hellwig } 
37617ad18afaSChristoph Hellwig 3762d8a2c89cSIlya Dryomov queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q); 3763d8a2c89cSIlya Dryomov /* QUEUE_FLAG_ADD_RANDOM is off by default for blk-mq */ 3764593a9e7bSAlex Elder 3765029bcbd8SJosh Durgin /* set io sizes to object size */ 3766593a9e7bSAlex Elder segment_size = rbd_obj_bytes(&rbd_dev->header); 3767593a9e7bSAlex Elder blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE); 37680d9fde4fSIlya Dryomov q->limits.max_sectors = queue_max_hw_sectors(q); 3769d3834fefSIlya Dryomov blk_queue_max_segments(q, segment_size / SECTOR_SIZE); 3770593a9e7bSAlex Elder blk_queue_max_segment_size(q, segment_size); 3771593a9e7bSAlex Elder blk_queue_io_min(q, segment_size); 3772593a9e7bSAlex Elder blk_queue_io_opt(q, segment_size); 3773029bcbd8SJosh Durgin 377490e98c52SGuangliang Zhao /* enable the discard support */ 377590e98c52SGuangliang Zhao queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q); 377690e98c52SGuangliang Zhao q->limits.discard_granularity = segment_size; 377790e98c52SGuangliang Zhao q->limits.discard_alignment = segment_size; 37782bb4cd5cSJens Axboe blk_queue_max_discard_sectors(q, segment_size / SECTOR_SIZE); 3779b76f8239SJosh Durgin q->limits.discard_zeroes_data = 1; 378090e98c52SGuangliang Zhao 3781bae818eeSRonny Hegewald if (!ceph_test_opt(rbd_dev->rbd_client->client, NOCRC)) 3782bae818eeSRonny Hegewald q->backing_dev_info.capabilities |= BDI_CAP_STABLE_WRITES; 3783bae818eeSRonny Hegewald 3784602adf40SYehuda Sadeh disk->queue = q; 3785602adf40SYehuda Sadeh 3786602adf40SYehuda Sadeh q->queuedata = rbd_dev; 3787602adf40SYehuda Sadeh 3788602adf40SYehuda Sadeh rbd_dev->disk = disk; 3789602adf40SYehuda Sadeh 3790602adf40SYehuda Sadeh return 0; 37917ad18afaSChristoph Hellwig out_tag_set: 37927ad18afaSChristoph Hellwig blk_mq_free_tag_set(&rbd_dev->tag_set); 3793602adf40SYehuda Sadeh out_disk: 3794602adf40SYehuda Sadeh put_disk(disk); 37957ad18afaSChristoph Hellwig return err; 3796602adf40SYehuda Sadeh } 3797602adf40SYehuda Sadeh 
3798dfc5606dSYehuda Sadeh /* 3799dfc5606dSYehuda Sadeh sysfs 3800dfc5606dSYehuda Sadeh */ 3801602adf40SYehuda Sadeh 3802593a9e7bSAlex Elder static struct rbd_device *dev_to_rbd_dev(struct device *dev) 3803593a9e7bSAlex Elder { 3804593a9e7bSAlex Elder return container_of(dev, struct rbd_device, dev); 3805593a9e7bSAlex Elder } 3806593a9e7bSAlex Elder 3807dfc5606dSYehuda Sadeh static ssize_t rbd_size_show(struct device *dev, 3808dfc5606dSYehuda Sadeh struct device_attribute *attr, char *buf) 3809602adf40SYehuda Sadeh { 3810593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 3811dfc5606dSYehuda Sadeh 3812fc71d833SAlex Elder return sprintf(buf, "%llu\n", 3813fc71d833SAlex Elder (unsigned long long)rbd_dev->mapping.size); 3814602adf40SYehuda Sadeh } 3815602adf40SYehuda Sadeh 381634b13184SAlex Elder /* 381734b13184SAlex Elder * Note this shows the features for whatever's mapped, which is not 381834b13184SAlex Elder * necessarily the base image. 381934b13184SAlex Elder */ 382034b13184SAlex Elder static ssize_t rbd_features_show(struct device *dev, 382134b13184SAlex Elder struct device_attribute *attr, char *buf) 382234b13184SAlex Elder { 382334b13184SAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 382434b13184SAlex Elder 382534b13184SAlex Elder return sprintf(buf, "0x%016llx\n", 382634b13184SAlex Elder (unsigned long long)rbd_dev->mapping.features); 382734b13184SAlex Elder } 382834b13184SAlex Elder 3829dfc5606dSYehuda Sadeh static ssize_t rbd_major_show(struct device *dev, 3830dfc5606dSYehuda Sadeh struct device_attribute *attr, char *buf) 3831602adf40SYehuda Sadeh { 3832593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 3833dfc5606dSYehuda Sadeh 3834fc71d833SAlex Elder if (rbd_dev->major) 3835dfc5606dSYehuda Sadeh return sprintf(buf, "%d\n", rbd_dev->major); 3836fc71d833SAlex Elder 3837fc71d833SAlex Elder return sprintf(buf, "(none)\n"); 3838dd82fff1SIlya Dryomov } 3839fc71d833SAlex Elder 3840dd82fff1SIlya Dryomov static 
ssize_t rbd_minor_show(struct device *dev, 3841dd82fff1SIlya Dryomov struct device_attribute *attr, char *buf) 3842dd82fff1SIlya Dryomov { 3843dd82fff1SIlya Dryomov struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 3844dd82fff1SIlya Dryomov 3845dd82fff1SIlya Dryomov return sprintf(buf, "%d\n", rbd_dev->minor); 3846dfc5606dSYehuda Sadeh } 3847dfc5606dSYehuda Sadeh 3848dfc5606dSYehuda Sadeh static ssize_t rbd_client_id_show(struct device *dev, 3849dfc5606dSYehuda Sadeh struct device_attribute *attr, char *buf) 3850dfc5606dSYehuda Sadeh { 3851593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 3852dfc5606dSYehuda Sadeh 38531dbb4399SAlex Elder return sprintf(buf, "client%lld\n", 38541dbb4399SAlex Elder ceph_client_id(rbd_dev->rbd_client->client)); 3855dfc5606dSYehuda Sadeh } 3856dfc5606dSYehuda Sadeh 3857dfc5606dSYehuda Sadeh static ssize_t rbd_pool_show(struct device *dev, 3858dfc5606dSYehuda Sadeh struct device_attribute *attr, char *buf) 3859dfc5606dSYehuda Sadeh { 3860593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 3861dfc5606dSYehuda Sadeh 38620d7dbfceSAlex Elder return sprintf(buf, "%s\n", rbd_dev->spec->pool_name); 3863dfc5606dSYehuda Sadeh } 3864dfc5606dSYehuda Sadeh 38659bb2f334SAlex Elder static ssize_t rbd_pool_id_show(struct device *dev, 38669bb2f334SAlex Elder struct device_attribute *attr, char *buf) 38679bb2f334SAlex Elder { 38689bb2f334SAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 38699bb2f334SAlex Elder 38700d7dbfceSAlex Elder return sprintf(buf, "%llu\n", 38710d7dbfceSAlex Elder (unsigned long long) rbd_dev->spec->pool_id); 38729bb2f334SAlex Elder } 38739bb2f334SAlex Elder 3874dfc5606dSYehuda Sadeh static ssize_t rbd_name_show(struct device *dev, 3875dfc5606dSYehuda Sadeh struct device_attribute *attr, char *buf) 3876dfc5606dSYehuda Sadeh { 3877593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 3878dfc5606dSYehuda Sadeh 3879a92ffdf8SAlex Elder if (rbd_dev->spec->image_name) 
38800d7dbfceSAlex Elder return sprintf(buf, "%s\n", rbd_dev->spec->image_name); 3881a92ffdf8SAlex Elder 3882a92ffdf8SAlex Elder return sprintf(buf, "(unknown)\n"); 3883dfc5606dSYehuda Sadeh } 3884dfc5606dSYehuda Sadeh 3885589d30e0SAlex Elder static ssize_t rbd_image_id_show(struct device *dev, 3886589d30e0SAlex Elder struct device_attribute *attr, char *buf) 3887589d30e0SAlex Elder { 3888589d30e0SAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 3889589d30e0SAlex Elder 38900d7dbfceSAlex Elder return sprintf(buf, "%s\n", rbd_dev->spec->image_id); 3891589d30e0SAlex Elder } 3892589d30e0SAlex Elder 389334b13184SAlex Elder /* 389434b13184SAlex Elder * Shows the name of the currently-mapped snapshot (or 389534b13184SAlex Elder * RBD_SNAP_HEAD_NAME for the base image). 389634b13184SAlex Elder */ 3897dfc5606dSYehuda Sadeh static ssize_t rbd_snap_show(struct device *dev, 3898dfc5606dSYehuda Sadeh struct device_attribute *attr, 3899dfc5606dSYehuda Sadeh char *buf) 3900dfc5606dSYehuda Sadeh { 3901593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 3902dfc5606dSYehuda Sadeh 39030d7dbfceSAlex Elder return sprintf(buf, "%s\n", rbd_dev->spec->snap_name); 3904dfc5606dSYehuda Sadeh } 3905dfc5606dSYehuda Sadeh 390686b00e0dSAlex Elder /* 3907ff96128fSIlya Dryomov * For a v2 image, shows the chain of parent images, separated by empty 3908ff96128fSIlya Dryomov * lines. For v1 images or if there is no parent, shows "(no parent 3909ff96128fSIlya Dryomov * image)". 
391086b00e0dSAlex Elder */ 391186b00e0dSAlex Elder static ssize_t rbd_parent_show(struct device *dev, 391286b00e0dSAlex Elder struct device_attribute *attr, 391386b00e0dSAlex Elder char *buf) 391486b00e0dSAlex Elder { 391586b00e0dSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 3916ff96128fSIlya Dryomov ssize_t count = 0; 391786b00e0dSAlex Elder 3918ff96128fSIlya Dryomov if (!rbd_dev->parent) 391986b00e0dSAlex Elder return sprintf(buf, "(no parent image)\n"); 392086b00e0dSAlex Elder 3921ff96128fSIlya Dryomov for ( ; rbd_dev->parent; rbd_dev = rbd_dev->parent) { 3922ff96128fSIlya Dryomov struct rbd_spec *spec = rbd_dev->parent_spec; 392386b00e0dSAlex Elder 3924ff96128fSIlya Dryomov count += sprintf(&buf[count], "%s" 3925ff96128fSIlya Dryomov "pool_id %llu\npool_name %s\n" 3926ff96128fSIlya Dryomov "image_id %s\nimage_name %s\n" 3927ff96128fSIlya Dryomov "snap_id %llu\nsnap_name %s\n" 3928ff96128fSIlya Dryomov "overlap %llu\n", 3929ff96128fSIlya Dryomov !count ? "" : "\n", /* first? 
*/ 3930ff96128fSIlya Dryomov spec->pool_id, spec->pool_name, 3931ff96128fSIlya Dryomov spec->image_id, spec->image_name ?: "(unknown)", 3932ff96128fSIlya Dryomov spec->snap_id, spec->snap_name, 3933ff96128fSIlya Dryomov rbd_dev->parent_overlap); 3934ff96128fSIlya Dryomov } 393586b00e0dSAlex Elder 393686b00e0dSAlex Elder return count; 393786b00e0dSAlex Elder } 393886b00e0dSAlex Elder 3939dfc5606dSYehuda Sadeh static ssize_t rbd_image_refresh(struct device *dev, 3940dfc5606dSYehuda Sadeh struct device_attribute *attr, 3941dfc5606dSYehuda Sadeh const char *buf, 3942dfc5606dSYehuda Sadeh size_t size) 3943dfc5606dSYehuda Sadeh { 3944593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 3945b813623aSAlex Elder int ret; 3946602adf40SYehuda Sadeh 3947cc4a38bdSAlex Elder ret = rbd_dev_refresh(rbd_dev); 3948e627db08SAlex Elder if (ret) 394952bb1f9bSIlya Dryomov return ret; 3950b813623aSAlex Elder 395152bb1f9bSIlya Dryomov return size; 3952dfc5606dSYehuda Sadeh } 3953602adf40SYehuda Sadeh 3954dfc5606dSYehuda Sadeh static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL); 395534b13184SAlex Elder static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL); 3956dfc5606dSYehuda Sadeh static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL); 3957dd82fff1SIlya Dryomov static DEVICE_ATTR(minor, S_IRUGO, rbd_minor_show, NULL); 3958dfc5606dSYehuda Sadeh static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL); 3959dfc5606dSYehuda Sadeh static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL); 39609bb2f334SAlex Elder static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL); 3961dfc5606dSYehuda Sadeh static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL); 3962589d30e0SAlex Elder static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL); 3963dfc5606dSYehuda Sadeh static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh); 3964dfc5606dSYehuda Sadeh static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL); 396586b00e0dSAlex Elder static 
DEVICE_ATTR(parent, S_IRUGO, rbd_parent_show, NULL); 3966dfc5606dSYehuda Sadeh 3967dfc5606dSYehuda Sadeh static struct attribute *rbd_attrs[] = { 3968dfc5606dSYehuda Sadeh &dev_attr_size.attr, 396934b13184SAlex Elder &dev_attr_features.attr, 3970dfc5606dSYehuda Sadeh &dev_attr_major.attr, 3971dd82fff1SIlya Dryomov &dev_attr_minor.attr, 3972dfc5606dSYehuda Sadeh &dev_attr_client_id.attr, 3973dfc5606dSYehuda Sadeh &dev_attr_pool.attr, 39749bb2f334SAlex Elder &dev_attr_pool_id.attr, 3975dfc5606dSYehuda Sadeh &dev_attr_name.attr, 3976589d30e0SAlex Elder &dev_attr_image_id.attr, 3977dfc5606dSYehuda Sadeh &dev_attr_current_snap.attr, 397886b00e0dSAlex Elder &dev_attr_parent.attr, 3979dfc5606dSYehuda Sadeh &dev_attr_refresh.attr, 3980dfc5606dSYehuda Sadeh NULL 3981dfc5606dSYehuda Sadeh }; 3982dfc5606dSYehuda Sadeh 3983dfc5606dSYehuda Sadeh static struct attribute_group rbd_attr_group = { 3984dfc5606dSYehuda Sadeh .attrs = rbd_attrs, 3985dfc5606dSYehuda Sadeh }; 3986dfc5606dSYehuda Sadeh 3987dfc5606dSYehuda Sadeh static const struct attribute_group *rbd_attr_groups[] = { 3988dfc5606dSYehuda Sadeh &rbd_attr_group, 3989dfc5606dSYehuda Sadeh NULL 3990dfc5606dSYehuda Sadeh }; 3991dfc5606dSYehuda Sadeh 39926cac4695SIlya Dryomov static void rbd_dev_release(struct device *dev); 3993dfc5606dSYehuda Sadeh 3994dfc5606dSYehuda Sadeh static struct device_type rbd_device_type = { 3995dfc5606dSYehuda Sadeh .name = "rbd", 3996dfc5606dSYehuda Sadeh .groups = rbd_attr_groups, 39976cac4695SIlya Dryomov .release = rbd_dev_release, 3998dfc5606dSYehuda Sadeh }; 3999dfc5606dSYehuda Sadeh 40008b8fb99cSAlex Elder static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec) 40018b8fb99cSAlex Elder { 40028b8fb99cSAlex Elder kref_get(&spec->kref); 40038b8fb99cSAlex Elder 40048b8fb99cSAlex Elder return spec; 40058b8fb99cSAlex Elder } 40068b8fb99cSAlex Elder 40078b8fb99cSAlex Elder static void rbd_spec_free(struct kref *kref); 40088b8fb99cSAlex Elder static void rbd_spec_put(struct rbd_spec *spec) 
40098b8fb99cSAlex Elder { 40108b8fb99cSAlex Elder if (spec) 40118b8fb99cSAlex Elder kref_put(&spec->kref, rbd_spec_free); 40128b8fb99cSAlex Elder } 40138b8fb99cSAlex Elder 40148b8fb99cSAlex Elder static struct rbd_spec *rbd_spec_alloc(void) 40158b8fb99cSAlex Elder { 40168b8fb99cSAlex Elder struct rbd_spec *spec; 40178b8fb99cSAlex Elder 40188b8fb99cSAlex Elder spec = kzalloc(sizeof (*spec), GFP_KERNEL); 40198b8fb99cSAlex Elder if (!spec) 40208b8fb99cSAlex Elder return NULL; 402104077599SIlya Dryomov 402204077599SIlya Dryomov spec->pool_id = CEPH_NOPOOL; 402304077599SIlya Dryomov spec->snap_id = CEPH_NOSNAP; 40248b8fb99cSAlex Elder kref_init(&spec->kref); 40258b8fb99cSAlex Elder 40268b8fb99cSAlex Elder return spec; 40278b8fb99cSAlex Elder } 40288b8fb99cSAlex Elder 40298b8fb99cSAlex Elder static void rbd_spec_free(struct kref *kref) 40308b8fb99cSAlex Elder { 40318b8fb99cSAlex Elder struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref); 40328b8fb99cSAlex Elder 40338b8fb99cSAlex Elder kfree(spec->pool_name); 40348b8fb99cSAlex Elder kfree(spec->image_id); 40358b8fb99cSAlex Elder kfree(spec->image_name); 40368b8fb99cSAlex Elder kfree(spec->snap_name); 40378b8fb99cSAlex Elder kfree(spec); 40388b8fb99cSAlex Elder } 40398b8fb99cSAlex Elder 4040dd5ac32dSIlya Dryomov static void rbd_dev_release(struct device *dev) 4041dd5ac32dSIlya Dryomov { 4042dd5ac32dSIlya Dryomov struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 4043dd5ac32dSIlya Dryomov bool need_put = !!rbd_dev->opts; 4044dd5ac32dSIlya Dryomov 4045dd5ac32dSIlya Dryomov rbd_put_client(rbd_dev->rbd_client); 4046dd5ac32dSIlya Dryomov rbd_spec_put(rbd_dev->spec); 4047dd5ac32dSIlya Dryomov kfree(rbd_dev->opts); 4048dd5ac32dSIlya Dryomov kfree(rbd_dev); 4049dd5ac32dSIlya Dryomov 4050dd5ac32dSIlya Dryomov /* 4051dd5ac32dSIlya Dryomov * This is racy, but way better than putting module outside of 4052dd5ac32dSIlya Dryomov * the release callback. 
The race window is pretty small, so 4053dd5ac32dSIlya Dryomov * doing something similar to dm (dm-builtin.c) is overkill. 4054dd5ac32dSIlya Dryomov */ 4055dd5ac32dSIlya Dryomov if (need_put) 4056dd5ac32dSIlya Dryomov module_put(THIS_MODULE); 4057dd5ac32dSIlya Dryomov } 4058dd5ac32dSIlya Dryomov 4059cc344fa1SAlex Elder static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc, 4060d147543dSIlya Dryomov struct rbd_spec *spec, 4061d147543dSIlya Dryomov struct rbd_options *opts) 4062c53d5893SAlex Elder { 4063c53d5893SAlex Elder struct rbd_device *rbd_dev; 4064c53d5893SAlex Elder 4065c53d5893SAlex Elder rbd_dev = kzalloc(sizeof (*rbd_dev), GFP_KERNEL); 4066c53d5893SAlex Elder if (!rbd_dev) 4067c53d5893SAlex Elder return NULL; 4068c53d5893SAlex Elder 4069c53d5893SAlex Elder spin_lock_init(&rbd_dev->lock); 40706d292906SAlex Elder rbd_dev->flags = 0; 4071a2acd00eSAlex Elder atomic_set(&rbd_dev->parent_ref, 0); 4072c53d5893SAlex Elder INIT_LIST_HEAD(&rbd_dev->node); 4073c53d5893SAlex Elder init_rwsem(&rbd_dev->header_rwsem); 4074c53d5893SAlex Elder 4075dd5ac32dSIlya Dryomov rbd_dev->dev.bus = &rbd_bus_type; 4076dd5ac32dSIlya Dryomov rbd_dev->dev.type = &rbd_device_type; 4077dd5ac32dSIlya Dryomov rbd_dev->dev.parent = &rbd_root_dev; 4078dd5ac32dSIlya Dryomov device_initialize(&rbd_dev->dev); 4079dd5ac32dSIlya Dryomov 4080c53d5893SAlex Elder rbd_dev->rbd_client = rbdc; 4081d147543dSIlya Dryomov rbd_dev->spec = spec; 4082d147543dSIlya Dryomov rbd_dev->opts = opts; 4083c53d5893SAlex Elder 40840903e875SAlex Elder /* Initialize the layout used for all rbd requests */ 40850903e875SAlex Elder 40860903e875SAlex Elder rbd_dev->layout.fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER); 40870903e875SAlex Elder rbd_dev->layout.fl_stripe_count = cpu_to_le32(1); 40880903e875SAlex Elder rbd_dev->layout.fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER); 40890903e875SAlex Elder rbd_dev->layout.fl_pg_pool = cpu_to_le32((u32) spec->pool_id); 40900903e875SAlex Elder 
4091dd5ac32dSIlya Dryomov /* 4092dd5ac32dSIlya Dryomov * If this is a mapping rbd_dev (as opposed to a parent one), 4093dd5ac32dSIlya Dryomov * pin our module. We have a ref from do_rbd_add(), so use 4094dd5ac32dSIlya Dryomov * __module_get(). 4095dd5ac32dSIlya Dryomov */ 4096dd5ac32dSIlya Dryomov if (rbd_dev->opts) 4097dd5ac32dSIlya Dryomov __module_get(THIS_MODULE); 4098dd5ac32dSIlya Dryomov 4099c53d5893SAlex Elder return rbd_dev; 4100c53d5893SAlex Elder } 4101c53d5893SAlex Elder 4102c53d5893SAlex Elder static void rbd_dev_destroy(struct rbd_device *rbd_dev) 4103c53d5893SAlex Elder { 4104dd5ac32dSIlya Dryomov if (rbd_dev) 4105dd5ac32dSIlya Dryomov put_device(&rbd_dev->dev); 4106c53d5893SAlex Elder } 4107c53d5893SAlex Elder 4108dfc5606dSYehuda Sadeh /* 41099d475de5SAlex Elder * Get the size and object order for an image snapshot, or if 41109d475de5SAlex Elder * snap_id is CEPH_NOSNAP, gets this information for the base 41119d475de5SAlex Elder * image. 41129d475de5SAlex Elder */ 41139d475de5SAlex Elder static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id, 41149d475de5SAlex Elder u8 *order, u64 *snap_size) 41159d475de5SAlex Elder { 41169d475de5SAlex Elder __le64 snapid = cpu_to_le64(snap_id); 41179d475de5SAlex Elder int ret; 41189d475de5SAlex Elder struct { 41199d475de5SAlex Elder u8 order; 41209d475de5SAlex Elder __le64 size; 41219d475de5SAlex Elder } __attribute__ ((packed)) size_buf = { 0 }; 41229d475de5SAlex Elder 412336be9a76SAlex Elder ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, 41249d475de5SAlex Elder "rbd", "get_size", 41254157976bSAlex Elder &snapid, sizeof (snapid), 4126e2a58ee5SAlex Elder &size_buf, sizeof (size_buf)); 412736be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 41289d475de5SAlex Elder if (ret < 0) 41299d475de5SAlex Elder return ret; 413057385b51SAlex Elder if (ret < sizeof (size_buf)) 413157385b51SAlex Elder return -ERANGE; 41329d475de5SAlex Elder 4133c3545579SJosh Durgin if 
(order) { 41349d475de5SAlex Elder *order = size_buf.order; 4135c3545579SJosh Durgin dout(" order %u", (unsigned int)*order); 4136c3545579SJosh Durgin } 41379d475de5SAlex Elder *snap_size = le64_to_cpu(size_buf.size); 41389d475de5SAlex Elder 4139c3545579SJosh Durgin dout(" snap_id 0x%016llx snap_size = %llu\n", 4140c3545579SJosh Durgin (unsigned long long)snap_id, 41419d475de5SAlex Elder (unsigned long long)*snap_size); 41429d475de5SAlex Elder 41439d475de5SAlex Elder return 0; 41449d475de5SAlex Elder } 41459d475de5SAlex Elder 41469d475de5SAlex Elder static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev) 41479d475de5SAlex Elder { 41489d475de5SAlex Elder return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP, 41499d475de5SAlex Elder &rbd_dev->header.obj_order, 41509d475de5SAlex Elder &rbd_dev->header.image_size); 41519d475de5SAlex Elder } 41529d475de5SAlex Elder 41531e130199SAlex Elder static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev) 41541e130199SAlex Elder { 41551e130199SAlex Elder void *reply_buf; 41561e130199SAlex Elder int ret; 41571e130199SAlex Elder void *p; 41581e130199SAlex Elder 41591e130199SAlex Elder reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL); 41601e130199SAlex Elder if (!reply_buf) 41611e130199SAlex Elder return -ENOMEM; 41621e130199SAlex Elder 416336be9a76SAlex Elder ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, 41644157976bSAlex Elder "rbd", "get_object_prefix", NULL, 0, 4165e2a58ee5SAlex Elder reply_buf, RBD_OBJ_PREFIX_LEN_MAX); 416636be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 41671e130199SAlex Elder if (ret < 0) 41681e130199SAlex Elder goto out; 41691e130199SAlex Elder 41701e130199SAlex Elder p = reply_buf; 41711e130199SAlex Elder rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p, 417257385b51SAlex Elder p + ret, NULL, GFP_NOIO); 417357385b51SAlex Elder ret = 0; 41741e130199SAlex Elder 41751e130199SAlex Elder if (IS_ERR(rbd_dev->header.object_prefix)) { 
41761e130199SAlex Elder ret = PTR_ERR(rbd_dev->header.object_prefix); 41771e130199SAlex Elder rbd_dev->header.object_prefix = NULL; 41781e130199SAlex Elder } else { 41791e130199SAlex Elder dout(" object_prefix = %s\n", rbd_dev->header.object_prefix); 41801e130199SAlex Elder } 41811e130199SAlex Elder out: 41821e130199SAlex Elder kfree(reply_buf); 41831e130199SAlex Elder 41841e130199SAlex Elder return ret; 41851e130199SAlex Elder } 41861e130199SAlex Elder 4187b1b5402aSAlex Elder static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id, 4188b1b5402aSAlex Elder u64 *snap_features) 4189b1b5402aSAlex Elder { 4190b1b5402aSAlex Elder __le64 snapid = cpu_to_le64(snap_id); 4191b1b5402aSAlex Elder struct { 4192b1b5402aSAlex Elder __le64 features; 4193b1b5402aSAlex Elder __le64 incompat; 41944157976bSAlex Elder } __attribute__ ((packed)) features_buf = { 0 }; 4195d889140cSAlex Elder u64 incompat; 4196b1b5402aSAlex Elder int ret; 4197b1b5402aSAlex Elder 419836be9a76SAlex Elder ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, 4199b1b5402aSAlex Elder "rbd", "get_features", 42004157976bSAlex Elder &snapid, sizeof (snapid), 4201e2a58ee5SAlex Elder &features_buf, sizeof (features_buf)); 420236be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 4203b1b5402aSAlex Elder if (ret < 0) 4204b1b5402aSAlex Elder return ret; 420557385b51SAlex Elder if (ret < sizeof (features_buf)) 420657385b51SAlex Elder return -ERANGE; 4207d889140cSAlex Elder 4208d889140cSAlex Elder incompat = le64_to_cpu(features_buf.incompat); 42095cbf6f12SAlex Elder if (incompat & ~RBD_FEATURES_SUPPORTED) 4210b8f5c6edSAlex Elder return -ENXIO; 4211d889140cSAlex Elder 4212b1b5402aSAlex Elder *snap_features = le64_to_cpu(features_buf.features); 4213b1b5402aSAlex Elder 4214b1b5402aSAlex Elder dout(" snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n", 4215b1b5402aSAlex Elder (unsigned long long)snap_id, 4216b1b5402aSAlex Elder (unsigned long 
long)*snap_features, 4217b1b5402aSAlex Elder (unsigned long long)le64_to_cpu(features_buf.incompat)); 4218b1b5402aSAlex Elder 4219b1b5402aSAlex Elder return 0; 4220b1b5402aSAlex Elder } 4221b1b5402aSAlex Elder 4222b1b5402aSAlex Elder static int rbd_dev_v2_features(struct rbd_device *rbd_dev) 4223b1b5402aSAlex Elder { 4224b1b5402aSAlex Elder return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP, 4225b1b5402aSAlex Elder &rbd_dev->header.features); 4226b1b5402aSAlex Elder } 4227b1b5402aSAlex Elder 422886b00e0dSAlex Elder static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev) 422986b00e0dSAlex Elder { 423086b00e0dSAlex Elder struct rbd_spec *parent_spec; 423186b00e0dSAlex Elder size_t size; 423286b00e0dSAlex Elder void *reply_buf = NULL; 423386b00e0dSAlex Elder __le64 snapid; 423486b00e0dSAlex Elder void *p; 423586b00e0dSAlex Elder void *end; 4236642a2537SAlex Elder u64 pool_id; 423786b00e0dSAlex Elder char *image_id; 42383b5cf2a2SAlex Elder u64 snap_id; 423986b00e0dSAlex Elder u64 overlap; 424086b00e0dSAlex Elder int ret; 424186b00e0dSAlex Elder 424286b00e0dSAlex Elder parent_spec = rbd_spec_alloc(); 424386b00e0dSAlex Elder if (!parent_spec) 424486b00e0dSAlex Elder return -ENOMEM; 424586b00e0dSAlex Elder 424686b00e0dSAlex Elder size = sizeof (__le64) + /* pool_id */ 424786b00e0dSAlex Elder sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX + /* image_id */ 424886b00e0dSAlex Elder sizeof (__le64) + /* snap_id */ 424986b00e0dSAlex Elder sizeof (__le64); /* overlap */ 425086b00e0dSAlex Elder reply_buf = kmalloc(size, GFP_KERNEL); 425186b00e0dSAlex Elder if (!reply_buf) { 425286b00e0dSAlex Elder ret = -ENOMEM; 425386b00e0dSAlex Elder goto out_err; 425486b00e0dSAlex Elder } 425586b00e0dSAlex Elder 42564d9b67cdSIlya Dryomov snapid = cpu_to_le64(rbd_dev->spec->snap_id); 425736be9a76SAlex Elder ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, 425886b00e0dSAlex Elder "rbd", "get_parent", 42594157976bSAlex Elder &snapid, sizeof (snapid), 4260e2a58ee5SAlex Elder reply_buf, 
size); 426136be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 426286b00e0dSAlex Elder if (ret < 0) 426386b00e0dSAlex Elder goto out_err; 426486b00e0dSAlex Elder 426586b00e0dSAlex Elder p = reply_buf; 426657385b51SAlex Elder end = reply_buf + ret; 426757385b51SAlex Elder ret = -ERANGE; 4268642a2537SAlex Elder ceph_decode_64_safe(&p, end, pool_id, out_err); 4269392a9dadSAlex Elder if (pool_id == CEPH_NOPOOL) { 4270392a9dadSAlex Elder /* 4271392a9dadSAlex Elder * Either the parent never existed, or we have 4272392a9dadSAlex Elder * record of it but the image got flattened so it no 4273392a9dadSAlex Elder * longer has a parent. When the parent of a 4274392a9dadSAlex Elder * layered image disappears we immediately set the 4275392a9dadSAlex Elder * overlap to 0. The effect of this is that all new 4276392a9dadSAlex Elder * requests will be treated as if the image had no 4277392a9dadSAlex Elder * parent. 4278392a9dadSAlex Elder */ 4279392a9dadSAlex Elder if (rbd_dev->parent_overlap) { 4280392a9dadSAlex Elder rbd_dev->parent_overlap = 0; 4281392a9dadSAlex Elder rbd_dev_parent_put(rbd_dev); 4282392a9dadSAlex Elder pr_info("%s: clone image has been flattened\n", 4283392a9dadSAlex Elder rbd_dev->disk->disk_name); 4284392a9dadSAlex Elder } 4285392a9dadSAlex Elder 428686b00e0dSAlex Elder goto out; /* No parent? No problem. 
*/ 4287392a9dadSAlex Elder } 428886b00e0dSAlex Elder 42890903e875SAlex Elder /* The ceph file layout needs to fit pool id in 32 bits */ 42900903e875SAlex Elder 42910903e875SAlex Elder ret = -EIO; 4292642a2537SAlex Elder if (pool_id > (u64)U32_MAX) { 42939584d508SIlya Dryomov rbd_warn(NULL, "parent pool id too large (%llu > %u)", 4294642a2537SAlex Elder (unsigned long long)pool_id, U32_MAX); 429557385b51SAlex Elder goto out_err; 4296c0cd10dbSAlex Elder } 42970903e875SAlex Elder 4298979ed480SAlex Elder image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL); 429986b00e0dSAlex Elder if (IS_ERR(image_id)) { 430086b00e0dSAlex Elder ret = PTR_ERR(image_id); 430186b00e0dSAlex Elder goto out_err; 430286b00e0dSAlex Elder } 43033b5cf2a2SAlex Elder ceph_decode_64_safe(&p, end, snap_id, out_err); 430486b00e0dSAlex Elder ceph_decode_64_safe(&p, end, overlap, out_err); 430586b00e0dSAlex Elder 43063b5cf2a2SAlex Elder /* 43073b5cf2a2SAlex Elder * The parent won't change (except when the clone is 43083b5cf2a2SAlex Elder * flattened, already handled that). So we only need to 43093b5cf2a2SAlex Elder * record the parent spec we have not already done so. 43103b5cf2a2SAlex Elder */ 43113b5cf2a2SAlex Elder if (!rbd_dev->parent_spec) { 43123b5cf2a2SAlex Elder parent_spec->pool_id = pool_id; 43133b5cf2a2SAlex Elder parent_spec->image_id = image_id; 43143b5cf2a2SAlex Elder parent_spec->snap_id = snap_id; 431586b00e0dSAlex Elder rbd_dev->parent_spec = parent_spec; 431686b00e0dSAlex Elder parent_spec = NULL; /* rbd_dev now owns this */ 4317fbba11b3SIlya Dryomov } else { 4318fbba11b3SIlya Dryomov kfree(image_id); 43193b5cf2a2SAlex Elder } 43203b5cf2a2SAlex Elder 43213b5cf2a2SAlex Elder /* 4322cf32bd9cSIlya Dryomov * We always update the parent overlap. If it's zero we issue 4323cf32bd9cSIlya Dryomov * a warning, as we will proceed as if there was no parent. 
43243b5cf2a2SAlex Elder */ 43253b5cf2a2SAlex Elder if (!overlap) { 43263b5cf2a2SAlex Elder if (parent_spec) { 4327cf32bd9cSIlya Dryomov /* refresh, careful to warn just once */ 4328cf32bd9cSIlya Dryomov if (rbd_dev->parent_overlap) 4329cf32bd9cSIlya Dryomov rbd_warn(rbd_dev, 4330cf32bd9cSIlya Dryomov "clone now standalone (overlap became 0)"); 433170cf49cfSAlex Elder } else { 4332cf32bd9cSIlya Dryomov /* initial probe */ 4333cf32bd9cSIlya Dryomov rbd_warn(rbd_dev, "clone is standalone (overlap 0)"); 43343b5cf2a2SAlex Elder } 433570cf49cfSAlex Elder } 4336cf32bd9cSIlya Dryomov rbd_dev->parent_overlap = overlap; 4337cf32bd9cSIlya Dryomov 433886b00e0dSAlex Elder out: 433986b00e0dSAlex Elder ret = 0; 434086b00e0dSAlex Elder out_err: 434186b00e0dSAlex Elder kfree(reply_buf); 434286b00e0dSAlex Elder rbd_spec_put(parent_spec); 434386b00e0dSAlex Elder 434486b00e0dSAlex Elder return ret; 434586b00e0dSAlex Elder } 434686b00e0dSAlex Elder 4347cc070d59SAlex Elder static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev) 4348cc070d59SAlex Elder { 4349cc070d59SAlex Elder struct { 4350cc070d59SAlex Elder __le64 stripe_unit; 4351cc070d59SAlex Elder __le64 stripe_count; 4352cc070d59SAlex Elder } __attribute__ ((packed)) striping_info_buf = { 0 }; 4353cc070d59SAlex Elder size_t size = sizeof (striping_info_buf); 4354cc070d59SAlex Elder void *p; 4355cc070d59SAlex Elder u64 obj_size; 4356cc070d59SAlex Elder u64 stripe_unit; 4357cc070d59SAlex Elder u64 stripe_count; 4358cc070d59SAlex Elder int ret; 4359cc070d59SAlex Elder 4360cc070d59SAlex Elder ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, 4361cc070d59SAlex Elder "rbd", "get_stripe_unit_count", NULL, 0, 4362e2a58ee5SAlex Elder (char *)&striping_info_buf, size); 4363cc070d59SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 4364cc070d59SAlex Elder if (ret < 0) 4365cc070d59SAlex Elder return ret; 4366cc070d59SAlex Elder if (ret < size) 4367cc070d59SAlex Elder return -ERANGE; 4368cc070d59SAlex 
Elder 4369cc070d59SAlex Elder /* 4370cc070d59SAlex Elder * We don't actually support the "fancy striping" feature 4371cc070d59SAlex Elder * (STRIPINGV2) yet, but if the striping sizes are the 4372cc070d59SAlex Elder * defaults the behavior is the same as before. So find 4373cc070d59SAlex Elder * out, and only fail if the image has non-default values. 4374cc070d59SAlex Elder */ 4375cc070d59SAlex Elder ret = -EINVAL; 4376cc070d59SAlex Elder obj_size = (u64)1 << rbd_dev->header.obj_order; 4377cc070d59SAlex Elder p = &striping_info_buf; 4378cc070d59SAlex Elder stripe_unit = ceph_decode_64(&p); 4379cc070d59SAlex Elder if (stripe_unit != obj_size) { 4380cc070d59SAlex Elder rbd_warn(rbd_dev, "unsupported stripe unit " 4381cc070d59SAlex Elder "(got %llu want %llu)", 4382cc070d59SAlex Elder stripe_unit, obj_size); 4383cc070d59SAlex Elder return -EINVAL; 4384cc070d59SAlex Elder } 4385cc070d59SAlex Elder stripe_count = ceph_decode_64(&p); 4386cc070d59SAlex Elder if (stripe_count != 1) { 4387cc070d59SAlex Elder rbd_warn(rbd_dev, "unsupported stripe count " 4388cc070d59SAlex Elder "(got %llu want 1)", stripe_count); 4389cc070d59SAlex Elder return -EINVAL; 4390cc070d59SAlex Elder } 4391500d0c0fSAlex Elder rbd_dev->header.stripe_unit = stripe_unit; 4392500d0c0fSAlex Elder rbd_dev->header.stripe_count = stripe_count; 4393cc070d59SAlex Elder 4394cc070d59SAlex Elder return 0; 4395cc070d59SAlex Elder } 4396cc070d59SAlex Elder 43979e15b77dSAlex Elder static char *rbd_dev_image_name(struct rbd_device *rbd_dev) 43989e15b77dSAlex Elder { 43999e15b77dSAlex Elder size_t image_id_size; 44009e15b77dSAlex Elder char *image_id; 44019e15b77dSAlex Elder void *p; 44029e15b77dSAlex Elder void *end; 44039e15b77dSAlex Elder size_t size; 44049e15b77dSAlex Elder void *reply_buf = NULL; 44059e15b77dSAlex Elder size_t len = 0; 44069e15b77dSAlex Elder char *image_name = NULL; 44079e15b77dSAlex Elder int ret; 44089e15b77dSAlex Elder 44099e15b77dSAlex Elder rbd_assert(!rbd_dev->spec->image_name); 
44109e15b77dSAlex Elder 441169e7a02fSAlex Elder len = strlen(rbd_dev->spec->image_id); 441269e7a02fSAlex Elder image_id_size = sizeof (__le32) + len; 44139e15b77dSAlex Elder image_id = kmalloc(image_id_size, GFP_KERNEL); 44149e15b77dSAlex Elder if (!image_id) 44159e15b77dSAlex Elder return NULL; 44169e15b77dSAlex Elder 44179e15b77dSAlex Elder p = image_id; 44184157976bSAlex Elder end = image_id + image_id_size; 441969e7a02fSAlex Elder ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32)len); 44209e15b77dSAlex Elder 44219e15b77dSAlex Elder size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX; 44229e15b77dSAlex Elder reply_buf = kmalloc(size, GFP_KERNEL); 44239e15b77dSAlex Elder if (!reply_buf) 44249e15b77dSAlex Elder goto out; 44259e15b77dSAlex Elder 442636be9a76SAlex Elder ret = rbd_obj_method_sync(rbd_dev, RBD_DIRECTORY, 44279e15b77dSAlex Elder "rbd", "dir_get_name", 44289e15b77dSAlex Elder image_id, image_id_size, 4429e2a58ee5SAlex Elder reply_buf, size); 44309e15b77dSAlex Elder if (ret < 0) 44319e15b77dSAlex Elder goto out; 44329e15b77dSAlex Elder p = reply_buf; 4433f40eb349SAlex Elder end = reply_buf + ret; 4434f40eb349SAlex Elder 44359e15b77dSAlex Elder image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL); 44369e15b77dSAlex Elder if (IS_ERR(image_name)) 44379e15b77dSAlex Elder image_name = NULL; 44389e15b77dSAlex Elder else 44399e15b77dSAlex Elder dout("%s: name is %s len is %zd\n", __func__, image_name, len); 44409e15b77dSAlex Elder out: 44419e15b77dSAlex Elder kfree(reply_buf); 44429e15b77dSAlex Elder kfree(image_id); 44439e15b77dSAlex Elder 44449e15b77dSAlex Elder return image_name; 44459e15b77dSAlex Elder } 44469e15b77dSAlex Elder 44472ad3d716SAlex Elder static u64 rbd_v1_snap_id_by_name(struct rbd_device *rbd_dev, const char *name) 44482ad3d716SAlex Elder { 44492ad3d716SAlex Elder struct ceph_snap_context *snapc = rbd_dev->header.snapc; 44502ad3d716SAlex Elder const char *snap_name; 44512ad3d716SAlex Elder u32 which = 0; 44522ad3d716SAlex 
Elder 44532ad3d716SAlex Elder /* Skip over names until we find the one we are looking for */ 44542ad3d716SAlex Elder 44552ad3d716SAlex Elder snap_name = rbd_dev->header.snap_names; 44562ad3d716SAlex Elder while (which < snapc->num_snaps) { 44572ad3d716SAlex Elder if (!strcmp(name, snap_name)) 44582ad3d716SAlex Elder return snapc->snaps[which]; 44592ad3d716SAlex Elder snap_name += strlen(snap_name) + 1; 44602ad3d716SAlex Elder which++; 44612ad3d716SAlex Elder } 44622ad3d716SAlex Elder return CEPH_NOSNAP; 44632ad3d716SAlex Elder } 44642ad3d716SAlex Elder 44652ad3d716SAlex Elder static u64 rbd_v2_snap_id_by_name(struct rbd_device *rbd_dev, const char *name) 44662ad3d716SAlex Elder { 44672ad3d716SAlex Elder struct ceph_snap_context *snapc = rbd_dev->header.snapc; 44682ad3d716SAlex Elder u32 which; 44692ad3d716SAlex Elder bool found = false; 44702ad3d716SAlex Elder u64 snap_id; 44712ad3d716SAlex Elder 44722ad3d716SAlex Elder for (which = 0; !found && which < snapc->num_snaps; which++) { 44732ad3d716SAlex Elder const char *snap_name; 44742ad3d716SAlex Elder 44752ad3d716SAlex Elder snap_id = snapc->snaps[which]; 44762ad3d716SAlex Elder snap_name = rbd_dev_v2_snap_name(rbd_dev, snap_id); 4477efadc98aSJosh Durgin if (IS_ERR(snap_name)) { 4478efadc98aSJosh Durgin /* ignore no-longer existing snapshots */ 4479efadc98aSJosh Durgin if (PTR_ERR(snap_name) == -ENOENT) 4480efadc98aSJosh Durgin continue; 4481efadc98aSJosh Durgin else 44822ad3d716SAlex Elder break; 4483efadc98aSJosh Durgin } 44842ad3d716SAlex Elder found = !strcmp(name, snap_name); 44852ad3d716SAlex Elder kfree(snap_name); 44862ad3d716SAlex Elder } 44872ad3d716SAlex Elder return found ? snap_id : CEPH_NOSNAP; 44882ad3d716SAlex Elder } 44892ad3d716SAlex Elder 44902ad3d716SAlex Elder /* 44912ad3d716SAlex Elder * Assumes name is never RBD_SNAP_HEAD_NAME; returns CEPH_NOSNAP if 44922ad3d716SAlex Elder * no snapshot by that name is found, or if an error occurs. 
44932ad3d716SAlex Elder */ 44942ad3d716SAlex Elder static u64 rbd_snap_id_by_name(struct rbd_device *rbd_dev, const char *name) 44952ad3d716SAlex Elder { 44962ad3d716SAlex Elder if (rbd_dev->image_format == 1) 44972ad3d716SAlex Elder return rbd_v1_snap_id_by_name(rbd_dev, name); 44982ad3d716SAlex Elder 44992ad3d716SAlex Elder return rbd_v2_snap_id_by_name(rbd_dev, name); 45002ad3d716SAlex Elder } 45012ad3d716SAlex Elder 45029e15b77dSAlex Elder /* 450304077599SIlya Dryomov * An image being mapped will have everything but the snap id. 45049e15b77dSAlex Elder */ 450504077599SIlya Dryomov static int rbd_spec_fill_snap_id(struct rbd_device *rbd_dev) 450604077599SIlya Dryomov { 450704077599SIlya Dryomov struct rbd_spec *spec = rbd_dev->spec; 450804077599SIlya Dryomov 450904077599SIlya Dryomov rbd_assert(spec->pool_id != CEPH_NOPOOL && spec->pool_name); 451004077599SIlya Dryomov rbd_assert(spec->image_id && spec->image_name); 451104077599SIlya Dryomov rbd_assert(spec->snap_name); 451204077599SIlya Dryomov 451304077599SIlya Dryomov if (strcmp(spec->snap_name, RBD_SNAP_HEAD_NAME)) { 451404077599SIlya Dryomov u64 snap_id; 451504077599SIlya Dryomov 451604077599SIlya Dryomov snap_id = rbd_snap_id_by_name(rbd_dev, spec->snap_name); 451704077599SIlya Dryomov if (snap_id == CEPH_NOSNAP) 451804077599SIlya Dryomov return -ENOENT; 451904077599SIlya Dryomov 452004077599SIlya Dryomov spec->snap_id = snap_id; 452104077599SIlya Dryomov } else { 452204077599SIlya Dryomov spec->snap_id = CEPH_NOSNAP; 452304077599SIlya Dryomov } 452404077599SIlya Dryomov 452504077599SIlya Dryomov return 0; 452604077599SIlya Dryomov } 452704077599SIlya Dryomov 452804077599SIlya Dryomov /* 452904077599SIlya Dryomov * A parent image will have all ids but none of the names. 453004077599SIlya Dryomov * 453104077599SIlya Dryomov * All names in an rbd spec are dynamically allocated. It's OK if we 453204077599SIlya Dryomov * can't figure out the name for an image id. 
 */
static int rbd_spec_fill_names(struct rbd_device *rbd_dev)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	struct rbd_spec *spec = rbd_dev->spec;
	const char *pool_name;
	const char *image_name;
	const char *snap_name;
	int ret;

	/* All three ids must already be known; only the names are filled in */
	rbd_assert(spec->pool_id != CEPH_NOPOOL);
	rbd_assert(spec->image_id);
	rbd_assert(spec->snap_id != CEPH_NOSNAP);

	/* Get the pool name; we have to make our own copy of this */

	pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, spec->pool_id);
	if (!pool_name) {
		rbd_warn(rbd_dev, "no pool with id %llu", spec->pool_id);
		return -EIO;
	}
	pool_name = kstrdup(pool_name, GFP_KERNEL);
	if (!pool_name)
		return -ENOMEM;

	/* Fetch the image name; tolerate failure here */

	image_name = rbd_dev_image_name(rbd_dev);
	if (!image_name)
		rbd_warn(rbd_dev, "unable to get image name");

	/* Fetch the snapshot name */

	snap_name = rbd_snap_name(rbd_dev, spec->snap_id);
	if (IS_ERR(snap_name)) {
		ret = PTR_ERR(snap_name);
		goto out_err;
	}

	/*
	 * Success: the spec now holds the three strings.  Note that
	 * image_name may be NULL here (its lookup failure is tolerated).
	 */
	spec->pool_name = pool_name;
	spec->image_name = image_name;
	spec->snap_name = snap_name;

	return 0;

out_err:
	/* Nothing was stored in the spec yet, so drop our local copies */
	kfree(image_name);
	kfree(pool_name);
	return ret;
}

/*
 * Fetch the snapshot context (seq plus the array of snapshot ids)
 * through the "get_snapcontext" method on the header object and
 * install it as rbd_dev->header.snapc, dropping the reference to the
 * previous context.  Returns 0 on success or a negative errno.
 */
static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev)
{
	size_t size;
	int ret;
	void *reply_buf;
	void *p;
	void *end;
	u64 seq;
	u32 snap_count;
	struct ceph_snap_context *snapc;
	u32 i;

	/*
	 * We'll need room for the seq value (maximum snapshot id),
	 * snapshot count, and array of that many snapshot ids.
	 * For now we have a fixed upper limit on the number we're
	 * prepared to receive.
	 */
	size = sizeof (__le64) + sizeof (__le32) +
			RBD_MAX_SNAP_COUNT * sizeof (__le64);
	reply_buf = kzalloc(size, GFP_KERNEL);
	if (!reply_buf)
		return -ENOMEM;

	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
				"rbd", "get_snapcontext", NULL, 0,
				reply_buf, size);
	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
	if (ret < 0)
		goto out;

	/* A non-negative ret is used as the number of reply bytes */
	p = reply_buf;
	end = reply_buf + ret;
	/*
	 * ret is preset because the _safe decoders below are handed the
	 * "out" label as their bail-out target (presumably taken when
	 * the reply is too short - confirm against the macro definitions).
	 */
	ret = -ERANGE;
	ceph_decode_64_safe(&p, end, seq, out);
	ceph_decode_32_safe(&p, end, snap_count, out);

	/*
	 * Make sure the reported number of snapshot ids wouldn't go
	 * beyond the end of our buffer.  But before checking that,
	 * make sure the computed size of the snapshot context we
	 * allocate is representable in a size_t.
	 */
	if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
				 / sizeof (u64)) {
		ret = -EINVAL;
		goto out;
	}
	if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
		goto out;	/* ret is still -ERANGE here */
	ret = 0;

	snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
	if (!snapc) {
		ret = -ENOMEM;
		goto out;
	}
	snapc->seq = seq;
	for (i = 0; i < snap_count; i++)
		snapc->snaps[i] = ceph_decode_64(&p);

	/* Swap in the new context, releasing our hold on the old one */
	ceph_put_snap_context(rbd_dev->header.snapc);
	rbd_dev->header.snapc = snapc;

	dout(" snap context seq = %llu, snap_count = %u\n",
		(unsigned long long)seq, (unsigned int)snap_count);
out:
	kfree(reply_buf);

	return ret;
}

/*
 * Look up the name of snapshot snap_id via the "get_snapshot_name"
 * method on the header object.  Returns the string produced by
 * ceph_extract_encoded_string(), or an ERR_PTR() value on failure.
 */
static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
					u64 snap_id)
{
	size_t size;
	void *reply_buf;
	__le64 snapid;
	int ret;
	void *p;
	void *end;
	char *snap_name;

	/* The reply is an encoded string: 32-bit length, then the bytes */
	size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
	reply_buf = kmalloc(size, GFP_KERNEL);
	if (!reply_buf)
		return ERR_PTR(-ENOMEM);

	snapid = cpu_to_le64(snap_id);
	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
				"rbd", "get_snapshot_name",
				&snapid, sizeof (snapid),
				reply_buf, size);
	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
	if (ret < 0) {
		snap_name = ERR_PTR(ret);
		goto out;
	}

	p = reply_buf;
	end = reply_buf + ret;
	snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
	if (IS_ERR(snap_name))
		goto out;	/* hand the ERR_PTR straight back */

	dout(" snap_id 0x%016llx snap_name = %s\n",
		(unsigned long long)snap_id, snap_name);
out:
	kfree(reply_buf);

	return snap_name;
}

/*
 * Read (or re-read) format 2 header state: the image size, the
 * one-time fields on the first call, and the snapshot context.
 * Returns 0 on success or the error from the failing step.
 */
static int rbd_dev_v2_header_info(struct rbd_device *rbd_dev)
{
	/* A NULL object prefix marks the first call for this device */
	bool first_time = rbd_dev->header.object_prefix == NULL;
	int ret;

	ret = rbd_dev_v2_image_size(rbd_dev);
	if (ret)
		return ret;

	if (first_time) {
		ret = rbd_dev_v2_header_onetime(rbd_dev);
		if (ret)
			return ret;
	}

	ret = rbd_dev_v2_snap_context(rbd_dev);
	if (ret && first_time) {
		/* Reset the prefix so a later call counts as the first again */
		kfree(rbd_dev->header.object_prefix);
		rbd_dev->header.object_prefix = NULL;
	}

	return ret;
}

/* Dispatch to the v1 or v2 header reader according to image_format. */
static int rbd_dev_header_info(struct rbd_device *rbd_dev)
{
	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));

	if (rbd_dev->image_format == 1)
		return rbd_dev_v1_header_info(rbd_dev);

	return rbd_dev_v2_header_info(rbd_dev);
}

/*
 * Get a unique rbd identifier for the given new rbd_dev, and add
 * the rbd_dev to the global list.
 *
 * Returns 0 on success, or the negative value returned by
 * ida_simple_get() on failure (the device is then not added to the list).
 */
static int rbd_dev_id_get(struct rbd_device *rbd_dev)
{
	int new_dev_id;

	new_dev_id = ida_simple_get(&rbd_dev_id_ida,
				    0, minor_to_rbd_dev_id(1 << MINORBITS),
				    GFP_KERNEL);
	if (new_dev_id < 0)
		return new_dev_id;

	rbd_dev->dev_id = new_dev_id;

	spin_lock(&rbd_dev_list_lock);
	list_add_tail(&rbd_dev->node, &rbd_dev_list);
	spin_unlock(&rbd_dev_list_lock);

	dout("rbd_dev %p given dev id %d\n", rbd_dev, rbd_dev->dev_id);

	return 0;
}

/*
 * Remove an rbd_dev from the global list, and record that its
 * identifier is no longer in
use. 47591ddbe94eSAlex Elder */ 4760e2839308SAlex Elder static void rbd_dev_id_put(struct rbd_device *rbd_dev) 47611ddbe94eSAlex Elder { 4762499afd5bSAlex Elder spin_lock(&rbd_dev_list_lock); 4763499afd5bSAlex Elder list_del_init(&rbd_dev->node); 4764499afd5bSAlex Elder spin_unlock(&rbd_dev_list_lock); 47651ddbe94eSAlex Elder 4766f8a22fc2SIlya Dryomov ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id); 4767f8a22fc2SIlya Dryomov 4768f8a22fc2SIlya Dryomov dout("rbd_dev %p released dev id %d\n", rbd_dev, rbd_dev->dev_id); 4769b7f23c36SAlex Elder } 4770b7f23c36SAlex Elder 4771a725f65eSAlex Elder /* 4772e28fff26SAlex Elder * Skips over white space at *buf, and updates *buf to point to the 4773e28fff26SAlex Elder * first found non-space character (if any). Returns the length of 4774593a9e7bSAlex Elder * the token (string of non-white space characters) found. Note 4775593a9e7bSAlex Elder * that *buf must be terminated with '\0'. 4776e28fff26SAlex Elder */ 4777e28fff26SAlex Elder static inline size_t next_token(const char **buf) 4778e28fff26SAlex Elder { 4779e28fff26SAlex Elder /* 4780e28fff26SAlex Elder * These are the characters that produce nonzero for 4781e28fff26SAlex Elder * isspace() in the "C" and "POSIX" locales. 4782e28fff26SAlex Elder */ 4783e28fff26SAlex Elder const char *spaces = " \f\n\r\t\v"; 4784e28fff26SAlex Elder 4785e28fff26SAlex Elder *buf += strspn(*buf, spaces); /* Find start of token */ 4786e28fff26SAlex Elder 4787e28fff26SAlex Elder return strcspn(*buf, spaces); /* Return token length */ 4788e28fff26SAlex Elder } 4789e28fff26SAlex Elder 4790e28fff26SAlex Elder /* 4791ea3352f4SAlex Elder * Finds the next token in *buf, dynamically allocates a buffer big 4792ea3352f4SAlex Elder * enough to hold a copy of it, and copies the token into the new 4793ea3352f4SAlex Elder * buffer. The copy is guaranteed to be terminated with '\0'. Note 4794ea3352f4SAlex Elder * that a duplicate buffer is created even for a zero-length token. 
4795ea3352f4SAlex Elder * 4796ea3352f4SAlex Elder * Returns a pointer to the newly-allocated duplicate, or a null 4797ea3352f4SAlex Elder * pointer if memory for the duplicate was not available. If 4798ea3352f4SAlex Elder * the lenp argument is a non-null pointer, the length of the token 4799ea3352f4SAlex Elder * (not including the '\0') is returned in *lenp. 4800ea3352f4SAlex Elder * 4801ea3352f4SAlex Elder * If successful, the *buf pointer will be updated to point beyond 4802ea3352f4SAlex Elder * the end of the found token. 4803ea3352f4SAlex Elder * 4804ea3352f4SAlex Elder * Note: uses GFP_KERNEL for allocation. 4805ea3352f4SAlex Elder */ 4806ea3352f4SAlex Elder static inline char *dup_token(const char **buf, size_t *lenp) 4807ea3352f4SAlex Elder { 4808ea3352f4SAlex Elder char *dup; 4809ea3352f4SAlex Elder size_t len; 4810ea3352f4SAlex Elder 4811ea3352f4SAlex Elder len = next_token(buf); 48124caf35f9SAlex Elder dup = kmemdup(*buf, len + 1, GFP_KERNEL); 4813ea3352f4SAlex Elder if (!dup) 4814ea3352f4SAlex Elder return NULL; 4815ea3352f4SAlex Elder *(dup + len) = '\0'; 4816ea3352f4SAlex Elder *buf += len; 4817ea3352f4SAlex Elder 4818ea3352f4SAlex Elder if (lenp) 4819ea3352f4SAlex Elder *lenp = len; 4820ea3352f4SAlex Elder 4821ea3352f4SAlex Elder return dup; 4822ea3352f4SAlex Elder } 4823ea3352f4SAlex Elder 4824ea3352f4SAlex Elder /* 4825859c31dfSAlex Elder * Parse the options provided for an "rbd add" (i.e., rbd image 4826859c31dfSAlex Elder * mapping) request. These arrive via a write to /sys/bus/rbd/add, 4827859c31dfSAlex Elder * and the data written is passed here via a NUL-terminated buffer. 4828859c31dfSAlex Elder * Returns 0 if successful or an error code otherwise. 
 *
 * The information extracted from these options is recorded in
 * the other parameters which return dynamically-allocated
 * structures:
 *  ceph_opts
 *      The address of a pointer that will refer to a ceph options
 *      structure.  Caller must release the returned pointer using
 *      ceph_destroy_options() when it is no longer needed.
 *  opts
 *      Address of an rbd options pointer.  Fully initialized by
 *      this function; caller must release with kfree().
 *  rbd_spec
 *      Address of an rbd image specification pointer.  Fully
 *      initialized by this function based on parsed options.
 *      Caller must release with rbd_spec_put().
 *
 * The options passed take this form:
 *  <mon_addrs> <options> <pool_name> <image_name> [<snap_name>]
 * where:
 *  <mon_addrs>
 *      A comma-separated list of one or more monitor addresses.
 *      A monitor address is an ip address, optionally followed
 *      by a port number (separated by a colon).
 *        I.e.:  ip1[:port1][,ip2[:port2]...]
 *  <options>
 *      A comma-separated list of ceph and/or rbd options.
 *  <pool_name>
 *      The name of the rados pool containing the rbd image.
 *  <image_name>
 *      The name of the image in that pool to map.
 *  <snap_name>
 *      An optional snapshot name.
If provided, the mapping will 4861859c31dfSAlex Elder * present data from the image at the time that snapshot was 4862859c31dfSAlex Elder * created. The image head is used if no snapshot id is 4863859c31dfSAlex Elder * provided. Snapshot mappings are always read-only. 4864a725f65eSAlex Elder */ 4865859c31dfSAlex Elder static int rbd_add_parse_args(const char *buf, 4866dc79b113SAlex Elder struct ceph_options **ceph_opts, 4867859c31dfSAlex Elder struct rbd_options **opts, 4868859c31dfSAlex Elder struct rbd_spec **rbd_spec) 4869a725f65eSAlex Elder { 4870e28fff26SAlex Elder size_t len; 4871859c31dfSAlex Elder char *options; 48720ddebc0cSAlex Elder const char *mon_addrs; 4873ecb4dc22SAlex Elder char *snap_name; 48740ddebc0cSAlex Elder size_t mon_addrs_size; 4875859c31dfSAlex Elder struct rbd_spec *spec = NULL; 48764e9afebaSAlex Elder struct rbd_options *rbd_opts = NULL; 4877859c31dfSAlex Elder struct ceph_options *copts; 4878dc79b113SAlex Elder int ret; 4879e28fff26SAlex Elder 4880e28fff26SAlex Elder /* The first four tokens are required */ 4881e28fff26SAlex Elder 48827ef3214aSAlex Elder len = next_token(&buf); 48834fb5d671SAlex Elder if (!len) { 48844fb5d671SAlex Elder rbd_warn(NULL, "no monitor address(es) provided"); 48854fb5d671SAlex Elder return -EINVAL; 48864fb5d671SAlex Elder } 48870ddebc0cSAlex Elder mon_addrs = buf; 4888f28e565aSAlex Elder mon_addrs_size = len + 1; 48897ef3214aSAlex Elder buf += len; 4890a725f65eSAlex Elder 4891dc79b113SAlex Elder ret = -EINVAL; 4892f28e565aSAlex Elder options = dup_token(&buf, NULL); 4893f28e565aSAlex Elder if (!options) 4894dc79b113SAlex Elder return -ENOMEM; 48954fb5d671SAlex Elder if (!*options) { 48964fb5d671SAlex Elder rbd_warn(NULL, "no options provided"); 48974fb5d671SAlex Elder goto out_err; 48984fb5d671SAlex Elder } 4899a725f65eSAlex Elder 4900859c31dfSAlex Elder spec = rbd_spec_alloc(); 4901859c31dfSAlex Elder if (!spec) 4902f28e565aSAlex Elder goto out_mem; 4903859c31dfSAlex Elder 4904859c31dfSAlex Elder 
spec->pool_name = dup_token(&buf, NULL); 4905859c31dfSAlex Elder if (!spec->pool_name) 4906859c31dfSAlex Elder goto out_mem; 49074fb5d671SAlex Elder if (!*spec->pool_name) { 49084fb5d671SAlex Elder rbd_warn(NULL, "no pool name provided"); 49094fb5d671SAlex Elder goto out_err; 49104fb5d671SAlex Elder } 4911e28fff26SAlex Elder 491269e7a02fSAlex Elder spec->image_name = dup_token(&buf, NULL); 4913859c31dfSAlex Elder if (!spec->image_name) 4914f28e565aSAlex Elder goto out_mem; 49154fb5d671SAlex Elder if (!*spec->image_name) { 49164fb5d671SAlex Elder rbd_warn(NULL, "no image name provided"); 49174fb5d671SAlex Elder goto out_err; 49184fb5d671SAlex Elder } 4919e28fff26SAlex Elder 4920f28e565aSAlex Elder /* 4921f28e565aSAlex Elder * Snapshot name is optional; default is to use "-" 4922f28e565aSAlex Elder * (indicating the head/no snapshot). 4923f28e565aSAlex Elder */ 49243feeb894SAlex Elder len = next_token(&buf); 4925820a5f3eSAlex Elder if (!len) { 49263feeb894SAlex Elder buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */ 49273feeb894SAlex Elder len = sizeof (RBD_SNAP_HEAD_NAME) - 1; 4928f28e565aSAlex Elder } else if (len > RBD_MAX_SNAP_NAME_LEN) { 4929dc79b113SAlex Elder ret = -ENAMETOOLONG; 4930f28e565aSAlex Elder goto out_err; 4931849b4260SAlex Elder } 4932ecb4dc22SAlex Elder snap_name = kmemdup(buf, len + 1, GFP_KERNEL); 4933ecb4dc22SAlex Elder if (!snap_name) 4934f28e565aSAlex Elder goto out_mem; 4935ecb4dc22SAlex Elder *(snap_name + len) = '\0'; 4936ecb4dc22SAlex Elder spec->snap_name = snap_name; 4937e5c35534SAlex Elder 49380ddebc0cSAlex Elder /* Initialize all rbd options to the defaults */ 4939e28fff26SAlex Elder 49404e9afebaSAlex Elder rbd_opts = kzalloc(sizeof (*rbd_opts), GFP_KERNEL); 49414e9afebaSAlex Elder if (!rbd_opts) 49424e9afebaSAlex Elder goto out_mem; 49434e9afebaSAlex Elder 49444e9afebaSAlex Elder rbd_opts->read_only = RBD_READ_ONLY_DEFAULT; 4945b5584180SIlya Dryomov rbd_opts->queue_depth = RBD_QUEUE_DEPTH_DEFAULT; 4946d22f76e7SAlex Elder 
4947859c31dfSAlex Elder copts = ceph_parse_options(options, mon_addrs, 49480ddebc0cSAlex Elder mon_addrs + mon_addrs_size - 1, 49494e9afebaSAlex Elder parse_rbd_opts_token, rbd_opts); 4950859c31dfSAlex Elder if (IS_ERR(copts)) { 4951859c31dfSAlex Elder ret = PTR_ERR(copts); 4952dc79b113SAlex Elder goto out_err; 4953dc79b113SAlex Elder } 4954859c31dfSAlex Elder kfree(options); 4955859c31dfSAlex Elder 4956859c31dfSAlex Elder *ceph_opts = copts; 49574e9afebaSAlex Elder *opts = rbd_opts; 4958859c31dfSAlex Elder *rbd_spec = spec; 49590ddebc0cSAlex Elder 4960dc79b113SAlex Elder return 0; 4961f28e565aSAlex Elder out_mem: 4962dc79b113SAlex Elder ret = -ENOMEM; 4963d22f76e7SAlex Elder out_err: 4964859c31dfSAlex Elder kfree(rbd_opts); 4965859c31dfSAlex Elder rbd_spec_put(spec); 4966f28e565aSAlex Elder kfree(options); 4967d22f76e7SAlex Elder 4968dc79b113SAlex Elder return ret; 4969a725f65eSAlex Elder } 4970a725f65eSAlex Elder 4971589d30e0SAlex Elder /* 497230ba1f02SIlya Dryomov * Return pool id (>= 0) or a negative error code. 
497330ba1f02SIlya Dryomov */ 497430ba1f02SIlya Dryomov static int rbd_add_get_pool_id(struct rbd_client *rbdc, const char *pool_name) 497530ba1f02SIlya Dryomov { 4976a319bf56SIlya Dryomov struct ceph_options *opts = rbdc->client->options; 497730ba1f02SIlya Dryomov u64 newest_epoch; 497830ba1f02SIlya Dryomov int tries = 0; 497930ba1f02SIlya Dryomov int ret; 498030ba1f02SIlya Dryomov 498130ba1f02SIlya Dryomov again: 498230ba1f02SIlya Dryomov ret = ceph_pg_poolid_by_name(rbdc->client->osdc.osdmap, pool_name); 498330ba1f02SIlya Dryomov if (ret == -ENOENT && tries++ < 1) { 498430ba1f02SIlya Dryomov ret = ceph_monc_do_get_version(&rbdc->client->monc, "osdmap", 498530ba1f02SIlya Dryomov &newest_epoch); 498630ba1f02SIlya Dryomov if (ret < 0) 498730ba1f02SIlya Dryomov return ret; 498830ba1f02SIlya Dryomov 498930ba1f02SIlya Dryomov if (rbdc->client->osdc.osdmap->epoch < newest_epoch) { 499030ba1f02SIlya Dryomov ceph_monc_request_next_osdmap(&rbdc->client->monc); 499130ba1f02SIlya Dryomov (void) ceph_monc_wait_osdmap(&rbdc->client->monc, 4992a319bf56SIlya Dryomov newest_epoch, 4993a319bf56SIlya Dryomov opts->mount_timeout); 499430ba1f02SIlya Dryomov goto again; 499530ba1f02SIlya Dryomov } else { 499630ba1f02SIlya Dryomov /* the osdmap we have is new enough */ 499730ba1f02SIlya Dryomov return -ENOENT; 499830ba1f02SIlya Dryomov } 499930ba1f02SIlya Dryomov } 500030ba1f02SIlya Dryomov 500130ba1f02SIlya Dryomov return ret; 500230ba1f02SIlya Dryomov } 500330ba1f02SIlya Dryomov 500430ba1f02SIlya Dryomov /* 5005589d30e0SAlex Elder * An rbd format 2 image has a unique identifier, distinct from the 5006589d30e0SAlex Elder * name given to it by the user. Internally, that identifier is 5007589d30e0SAlex Elder * what's used to specify the names of objects related to the image. 5008589d30e0SAlex Elder * 5009589d30e0SAlex Elder * A special "rbd id" object is used to map an rbd image name to its 5010589d30e0SAlex Elder * id. 
If that object doesn't exist, then there is no v2 rbd image 5011589d30e0SAlex Elder * with the supplied name. 5012589d30e0SAlex Elder * 5013589d30e0SAlex Elder * This function will record the given rbd_dev's image_id field if 5014589d30e0SAlex Elder * it can be determined, and in that case will return 0. If any 5015589d30e0SAlex Elder * errors occur a negative errno will be returned and the rbd_dev's 5016589d30e0SAlex Elder * image_id field will be unchanged (and should be NULL). 5017589d30e0SAlex Elder */ 5018589d30e0SAlex Elder static int rbd_dev_image_id(struct rbd_device *rbd_dev) 5019589d30e0SAlex Elder { 5020589d30e0SAlex Elder int ret; 5021589d30e0SAlex Elder size_t size; 5022589d30e0SAlex Elder char *object_name; 5023589d30e0SAlex Elder void *response; 5024c0fba368SAlex Elder char *image_id; 50252f82ee54SAlex Elder 5026589d30e0SAlex Elder /* 50272c0d0a10SAlex Elder * When probing a parent image, the image id is already 50282c0d0a10SAlex Elder * known (and the image name likely is not). There's no 5029c0fba368SAlex Elder * need to fetch the image id again in this case. We 5030c0fba368SAlex Elder * do still need to set the image format though. 50312c0d0a10SAlex Elder */ 5032c0fba368SAlex Elder if (rbd_dev->spec->image_id) { 5033c0fba368SAlex Elder rbd_dev->image_format = *rbd_dev->spec->image_id ? 2 : 1; 5034c0fba368SAlex Elder 50352c0d0a10SAlex Elder return 0; 5036c0fba368SAlex Elder } 50372c0d0a10SAlex Elder 50382c0d0a10SAlex Elder /* 5039589d30e0SAlex Elder * First, see if the format 2 image id file exists, and if 5040589d30e0SAlex Elder * so, get the image's persistent id from it. 
5041589d30e0SAlex Elder */ 504269e7a02fSAlex Elder size = sizeof (RBD_ID_PREFIX) + strlen(rbd_dev->spec->image_name); 5043589d30e0SAlex Elder object_name = kmalloc(size, GFP_NOIO); 5044589d30e0SAlex Elder if (!object_name) 5045589d30e0SAlex Elder return -ENOMEM; 50460d7dbfceSAlex Elder sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->spec->image_name); 5047589d30e0SAlex Elder dout("rbd id object name is %s\n", object_name); 5048589d30e0SAlex Elder 5049589d30e0SAlex Elder /* Response will be an encoded string, which includes a length */ 5050589d30e0SAlex Elder 5051589d30e0SAlex Elder size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX; 5052589d30e0SAlex Elder response = kzalloc(size, GFP_NOIO); 5053589d30e0SAlex Elder if (!response) { 5054589d30e0SAlex Elder ret = -ENOMEM; 5055589d30e0SAlex Elder goto out; 5056589d30e0SAlex Elder } 5057589d30e0SAlex Elder 5058c0fba368SAlex Elder /* If it doesn't exist we'll assume it's a format 1 image */ 5059c0fba368SAlex Elder 506036be9a76SAlex Elder ret = rbd_obj_method_sync(rbd_dev, object_name, 50614157976bSAlex Elder "rbd", "get_id", NULL, 0, 5062e2a58ee5SAlex Elder response, RBD_IMAGE_ID_LEN_MAX); 506336be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 5064c0fba368SAlex Elder if (ret == -ENOENT) { 5065c0fba368SAlex Elder image_id = kstrdup("", GFP_KERNEL); 5066c0fba368SAlex Elder ret = image_id ? 
0 : -ENOMEM; 5067c0fba368SAlex Elder if (!ret) 5068c0fba368SAlex Elder rbd_dev->image_format = 1; 50697dd440c9SIlya Dryomov } else if (ret >= 0) { 5070c0fba368SAlex Elder void *p = response; 5071589d30e0SAlex Elder 5072c0fba368SAlex Elder image_id = ceph_extract_encoded_string(&p, p + ret, 5073979ed480SAlex Elder NULL, GFP_NOIO); 5074461f758aSDuan Jiong ret = PTR_ERR_OR_ZERO(image_id); 5075c0fba368SAlex Elder if (!ret) 5076c0fba368SAlex Elder rbd_dev->image_format = 2; 5077c0fba368SAlex Elder } 5078c0fba368SAlex Elder 5079c0fba368SAlex Elder if (!ret) { 5080c0fba368SAlex Elder rbd_dev->spec->image_id = image_id; 5081c0fba368SAlex Elder dout("image_id is %s\n", image_id); 5082589d30e0SAlex Elder } 5083589d30e0SAlex Elder out: 5084589d30e0SAlex Elder kfree(response); 5085589d30e0SAlex Elder kfree(object_name); 5086589d30e0SAlex Elder 5087589d30e0SAlex Elder return ret; 5088589d30e0SAlex Elder } 5089589d30e0SAlex Elder 50903abef3b3SAlex Elder /* 50913abef3b3SAlex Elder * Undo whatever state changes are made by v1 or v2 header info 50923abef3b3SAlex Elder * call. 
50933abef3b3SAlex Elder */ 50946fd48b3bSAlex Elder static void rbd_dev_unprobe(struct rbd_device *rbd_dev) 50956fd48b3bSAlex Elder { 50966fd48b3bSAlex Elder struct rbd_image_header *header; 50976fd48b3bSAlex Elder 5098a2acd00eSAlex Elder rbd_dev_parent_put(rbd_dev); 50996fd48b3bSAlex Elder 51006fd48b3bSAlex Elder /* Free dynamic fields from the header, then zero it out */ 51016fd48b3bSAlex Elder 51026fd48b3bSAlex Elder header = &rbd_dev->header; 5103812164f8SAlex Elder ceph_put_snap_context(header->snapc); 51046fd48b3bSAlex Elder kfree(header->snap_sizes); 51056fd48b3bSAlex Elder kfree(header->snap_names); 51066fd48b3bSAlex Elder kfree(header->object_prefix); 51076fd48b3bSAlex Elder memset(header, 0, sizeof (*header)); 51086fd48b3bSAlex Elder } 51096fd48b3bSAlex Elder 51102df3fac7SAlex Elder static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev) 5111a30b71b9SAlex Elder { 5112a30b71b9SAlex Elder int ret; 5113a30b71b9SAlex Elder 51141e130199SAlex Elder ret = rbd_dev_v2_object_prefix(rbd_dev); 511557385b51SAlex Elder if (ret) 51161e130199SAlex Elder goto out_err; 5117b1b5402aSAlex Elder 51182df3fac7SAlex Elder /* 51192df3fac7SAlex Elder * Get the and check features for the image. Currently the 51202df3fac7SAlex Elder * features are assumed to never change. 
51212df3fac7SAlex Elder */ 5122b1b5402aSAlex Elder ret = rbd_dev_v2_features(rbd_dev); 512357385b51SAlex Elder if (ret) 5124b1b5402aSAlex Elder goto out_err; 512535d489f9SAlex Elder 5126cc070d59SAlex Elder /* If the image supports fancy striping, get its parameters */ 5127cc070d59SAlex Elder 5128cc070d59SAlex Elder if (rbd_dev->header.features & RBD_FEATURE_STRIPINGV2) { 5129cc070d59SAlex Elder ret = rbd_dev_v2_striping_info(rbd_dev); 5130cc070d59SAlex Elder if (ret < 0) 5131cc070d59SAlex Elder goto out_err; 5132cc070d59SAlex Elder } 51332df3fac7SAlex Elder /* No support for crypto and compression type format 2 images */ 5134a30b71b9SAlex Elder 513535152979SAlex Elder return 0; 51369d475de5SAlex Elder out_err: 5137642a2537SAlex Elder rbd_dev->header.features = 0; 51381e130199SAlex Elder kfree(rbd_dev->header.object_prefix); 51391e130199SAlex Elder rbd_dev->header.object_prefix = NULL; 51409d475de5SAlex Elder 51419d475de5SAlex Elder return ret; 5142a30b71b9SAlex Elder } 5143a30b71b9SAlex Elder 51446d69bb53SIlya Dryomov /* 51456d69bb53SIlya Dryomov * @depth is rbd_dev_image_probe() -> rbd_dev_probe_parent() -> 51466d69bb53SIlya Dryomov * rbd_dev_image_probe() recursion depth, which means it's also the 51476d69bb53SIlya Dryomov * length of the already discovered part of the parent chain. 
 */
static int rbd_dev_probe_parent(struct rbd_device *rbd_dev, int depth)
{
	struct rbd_device *parent = NULL;
	int ret;

	/* No parent spec recorded: nothing to probe */
	if (!rbd_dev->parent_spec)
		return 0;

	if (++depth > RBD_MAX_PARENT_CHAIN_LEN) {
		pr_info("parent chain is too long (%d)\n", depth);
		ret = -EINVAL;
		goto out_err;
	}

	parent = rbd_dev_create(rbd_dev->rbd_client, rbd_dev->parent_spec,
				NULL);
	if (!parent) {
		ret = -ENOMEM;
		goto out_err;
	}

	/*
	 * Images related by parent/child relationships always share
	 * rbd_client and spec/parent_spec, so bump their refcounts.
	 */
	__rbd_get_client(rbd_dev->rbd_client);
	rbd_spec_get(rbd_dev->parent_spec);

	/* Probe the parent one level deeper in the chain */
	ret = rbd_dev_image_probe(parent, depth);
	if (ret < 0)
		goto out_err;

	rbd_dev->parent = parent;
	atomic_set(&rbd_dev->parent_ref, 1);
	return 0;

out_err:
	/* parent is still NULL if we failed before creating it */
	rbd_dev_unparent(rbd_dev);
	if (parent)
		rbd_dev_destroy(parent);
	return ret;
}

/*
 * Give the device an id and name, a major/minor pair, a disk and a
 * mapping, register it with device_add(), and finally add the disk.
 * Returns 0 on success; on failure the steps already taken are
 * undone through the err_out_* labels and the error is returned.
 */
static int rbd_dev_device_setup(struct rbd_device *rbd_dev)
{
	int ret;

	/* Get an id and fill in device name. */

	ret = rbd_dev_id_get(rbd_dev);
	if (ret)
		return ret;

	/* Compile-time check that name[] can hold the prefix plus any int */
	BUILD_BUG_ON(DEV_NAME_LEN
			< sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
	sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);

	/* Record our major and minor device numbers. */

	if (!single_major) {
		/* Register a block major of our own under the device name */
		ret = register_blkdev(0, rbd_dev->name);
		if (ret < 0)
			goto err_out_id;

		rbd_dev->major = ret;
		rbd_dev->minor = 0;
	} else {
		/* Use the shared rbd_major; the minor comes from the id */
		rbd_dev->major = rbd_major;
		rbd_dev->minor = rbd_dev_id_to_minor(rbd_dev->dev_id);
	}

	/* Set up the blkdev mapping. */

	ret = rbd_init_disk(rbd_dev);
	if (ret)
		goto err_out_blkdev;

	ret = rbd_dev_mapping_set(rbd_dev);
	if (ret)
		goto err_out_disk;

	/* Capacity is given in sectors; mapping.size is in bytes */
	set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
	set_disk_ro(rbd_dev->disk, rbd_dev->mapping.read_only);

	dev_set_name(&rbd_dev->dev, "%d", rbd_dev->dev_id);
	ret = device_add(&rbd_dev->dev);
	if (ret)
		goto err_out_mapping;

	/* Everything's ready.  Announce the disk to the world. */

	set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
	add_disk(rbd_dev->disk);

	pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name,
		(unsigned long long) rbd_dev->mapping.size);

	return ret;	/* 0 here: device_add() succeeded above */

err_out_mapping:
	rbd_dev_mapping_clear(rbd_dev);
err_out_disk:
	rbd_free_disk(rbd_dev);
err_out_blkdev:
	/* A major was registered only in the !single_major case */
	if (!single_major)
		unregister_blkdev(rbd_dev->major, rbd_dev->name);
err_out_id:
	rbd_dev_id_put(rbd_dev);
	/*
	 * NOTE(review): when entered via err_out_mapping this is the
	 * second rbd_dev_mapping_clear() call on the same path; it looks
	 * redundant - confirm against rbd_dev_mapping_clear() before
	 * removing it.
	 */
	rbd_dev_mapping_clear(rbd_dev);

	return ret;
}

/*
 * Build rbd_dev->header_name in a newly allocated buffer: the image
 * name followed by RBD_SUFFIX for format 1, RBD_HEADER_PREFIX followed
 * by the image id otherwise.  Returns 0, or -ENOMEM if the allocation
 * fails.
 */
static int rbd_dev_header_name(struct rbd_device *rbd_dev)
{
	struct rbd_spec *spec = rbd_dev->spec;
	size_t size;

	/* Record the header object name for this rbd image. */

	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));

	/* sizeof on the string literal accounts for the trailing '\0' */
	if (rbd_dev->image_format == 1)
		size = strlen(spec->image_name) + sizeof (RBD_SUFFIX);
	else
		size = sizeof (RBD_HEADER_PREFIX) + strlen(spec->image_id);

	rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
	if (!rbd_dev->header_name)
		return -ENOMEM;

	if (rbd_dev->image_format == 1)
		sprintf(rbd_dev->header_name, "%s%s",
			spec->image_name, RBD_SUFFIX);
	else
		sprintf(rbd_dev->header_name, "%s%s",
			RBD_HEADER_PREFIX, spec->image_id);
	return 0;
}

/*
 * Release the per-image state of rbd_dev - header contents, header
 * name, image format and image id - and then destroy the device.
 */
static void rbd_dev_image_release(struct rbd_device *rbd_dev)
{
	rbd_dev_unprobe(rbd_dev);
	kfree(rbd_dev->header_name);
	rbd_dev->header_name = NULL;
	rbd_dev->image_format = 0;
	kfree(rbd_dev->spec->image_id);
	rbd_dev->spec->image_id = NULL;

	rbd_dev_destroy(rbd_dev);
}

/*
 * Probe for the existence of the header object for the given rbd
 * device.  If this image is the one being mapped (i.e., not a
 * parent), initiate a watch on its header object before using that
 * object to get detailed information about the rbd image.
5306a30b71b9SAlex Elder */ 53076d69bb53SIlya Dryomov static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth) 5308a30b71b9SAlex Elder { 5309a30b71b9SAlex Elder int ret; 5310a30b71b9SAlex Elder 5311a30b71b9SAlex Elder /* 53123abef3b3SAlex Elder * Get the id from the image id object. Unless there's an 53133abef3b3SAlex Elder * error, rbd_dev->spec->image_id will be filled in with 53143abef3b3SAlex Elder * a dynamically-allocated string, and rbd_dev->image_format 53153abef3b3SAlex Elder * will be set to either 1 or 2. 5316a30b71b9SAlex Elder */ 5317a30b71b9SAlex Elder ret = rbd_dev_image_id(rbd_dev); 5318a30b71b9SAlex Elder if (ret) 5319c0fba368SAlex Elder return ret; 5320c0fba368SAlex Elder 5321332bb12dSAlex Elder ret = rbd_dev_header_name(rbd_dev); 5322332bb12dSAlex Elder if (ret) 5323332bb12dSAlex Elder goto err_out_format; 5324332bb12dSAlex Elder 53256d69bb53SIlya Dryomov if (!depth) { 5326fca27065SIlya Dryomov ret = rbd_dev_header_watch_sync(rbd_dev); 53271fe48023SIlya Dryomov if (ret) { 53281fe48023SIlya Dryomov if (ret == -ENOENT) 53291fe48023SIlya Dryomov pr_info("image %s/%s does not exist\n", 53301fe48023SIlya Dryomov rbd_dev->spec->pool_name, 53311fe48023SIlya Dryomov rbd_dev->spec->image_name); 5332b644de2bSAlex Elder goto out_header_name; 53331f3ef788SAlex Elder } 53341fe48023SIlya Dryomov } 5335b644de2bSAlex Elder 5336a720ae09SIlya Dryomov ret = rbd_dev_header_info(rbd_dev); 53375655c4d9SAlex Elder if (ret) 5338b644de2bSAlex Elder goto err_out_watch; 5339a30b71b9SAlex Elder 534004077599SIlya Dryomov /* 534104077599SIlya Dryomov * If this image is the one being mapped, we have pool name and 534204077599SIlya Dryomov * id, image name and id, and snap name - need to fill snap id. 534304077599SIlya Dryomov * Otherwise this is a parent image, identified by pool, image 534404077599SIlya Dryomov * and snap ids - need to fill in names for those ids. 
534504077599SIlya Dryomov */ 53466d69bb53SIlya Dryomov if (!depth) 534704077599SIlya Dryomov ret = rbd_spec_fill_snap_id(rbd_dev); 534804077599SIlya Dryomov else 534904077599SIlya Dryomov ret = rbd_spec_fill_names(rbd_dev); 53501fe48023SIlya Dryomov if (ret) { 53511fe48023SIlya Dryomov if (ret == -ENOENT) 53521fe48023SIlya Dryomov pr_info("snap %s/%s@%s does not exist\n", 53531fe48023SIlya Dryomov rbd_dev->spec->pool_name, 53541fe48023SIlya Dryomov rbd_dev->spec->image_name, 53551fe48023SIlya Dryomov rbd_dev->spec->snap_name); 535633dca39fSAlex Elder goto err_out_probe; 53571fe48023SIlya Dryomov } 53589bb81c9bSAlex Elder 5359e8f59b59SIlya Dryomov if (rbd_dev->header.features & RBD_FEATURE_LAYERING) { 5360e8f59b59SIlya Dryomov ret = rbd_dev_v2_parent_info(rbd_dev); 5361e8f59b59SIlya Dryomov if (ret) 5362e8f59b59SIlya Dryomov goto err_out_probe; 5363e8f59b59SIlya Dryomov 5364e8f59b59SIlya Dryomov /* 5365e8f59b59SIlya Dryomov * Need to warn users if this image is the one being 5366e8f59b59SIlya Dryomov * mapped and has a parent. 
5367e8f59b59SIlya Dryomov */ 53686d69bb53SIlya Dryomov if (!depth && rbd_dev->parent_spec) 5369e8f59b59SIlya Dryomov rbd_warn(rbd_dev, 5370e8f59b59SIlya Dryomov "WARNING: kernel layering is EXPERIMENTAL!"); 5371e8f59b59SIlya Dryomov } 5372e8f59b59SIlya Dryomov 53736d69bb53SIlya Dryomov ret = rbd_dev_probe_parent(rbd_dev, depth); 537430d60ba2SAlex Elder if (ret) 537530d60ba2SAlex Elder goto err_out_probe; 537683a06263SAlex Elder 537730d60ba2SAlex Elder dout("discovered format %u image, header name is %s\n", 537830d60ba2SAlex Elder rbd_dev->image_format, rbd_dev->header_name); 537930d60ba2SAlex Elder return 0; 5380e8f59b59SIlya Dryomov 53816fd48b3bSAlex Elder err_out_probe: 53826fd48b3bSAlex Elder rbd_dev_unprobe(rbd_dev); 5383b644de2bSAlex Elder err_out_watch: 53846d69bb53SIlya Dryomov if (!depth) 5385fca27065SIlya Dryomov rbd_dev_header_unwatch_sync(rbd_dev); 5386332bb12dSAlex Elder out_header_name: 5387332bb12dSAlex Elder kfree(rbd_dev->header_name); 5388332bb12dSAlex Elder rbd_dev->header_name = NULL; 5389332bb12dSAlex Elder err_out_format: 5390332bb12dSAlex Elder rbd_dev->image_format = 0; 53915655c4d9SAlex Elder kfree(rbd_dev->spec->image_id); 53925655c4d9SAlex Elder rbd_dev->spec->image_id = NULL; 53935655c4d9SAlex Elder return ret; 539483a06263SAlex Elder } 539583a06263SAlex Elder 53969b60e70bSIlya Dryomov static ssize_t do_rbd_add(struct bus_type *bus, 539759c2be1eSYehuda Sadeh const char *buf, 539859c2be1eSYehuda Sadeh size_t count) 5399602adf40SYehuda Sadeh { 5400cb8627c7SAlex Elder struct rbd_device *rbd_dev = NULL; 5401dc79b113SAlex Elder struct ceph_options *ceph_opts = NULL; 54024e9afebaSAlex Elder struct rbd_options *rbd_opts = NULL; 5403859c31dfSAlex Elder struct rbd_spec *spec = NULL; 54049d3997fdSAlex Elder struct rbd_client *rbdc; 540551344a38SAlex Elder bool read_only; 5406b51c83c2SIlya Dryomov int rc; 5407602adf40SYehuda Sadeh 5408602adf40SYehuda Sadeh if (!try_module_get(THIS_MODULE)) 5409602adf40SYehuda Sadeh return -ENODEV; 
5410602adf40SYehuda Sadeh 5411a725f65eSAlex Elder /* parse add command */ 5412859c31dfSAlex Elder rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec); 5413dc79b113SAlex Elder if (rc < 0) 5414dd5ac32dSIlya Dryomov goto out; 5415a725f65eSAlex Elder 54169d3997fdSAlex Elder rbdc = rbd_get_client(ceph_opts); 54179d3997fdSAlex Elder if (IS_ERR(rbdc)) { 54189d3997fdSAlex Elder rc = PTR_ERR(rbdc); 54190ddebc0cSAlex Elder goto err_out_args; 54209d3997fdSAlex Elder } 5421602adf40SYehuda Sadeh 5422602adf40SYehuda Sadeh /* pick the pool */ 542330ba1f02SIlya Dryomov rc = rbd_add_get_pool_id(rbdc, spec->pool_name); 54241fe48023SIlya Dryomov if (rc < 0) { 54251fe48023SIlya Dryomov if (rc == -ENOENT) 54261fe48023SIlya Dryomov pr_info("pool %s does not exist\n", spec->pool_name); 5427602adf40SYehuda Sadeh goto err_out_client; 54281fe48023SIlya Dryomov } 5429859c31dfSAlex Elder spec->pool_id = (u64)rc; 5430859c31dfSAlex Elder 54310903e875SAlex Elder /* The ceph file layout needs to fit pool id in 32 bits */ 54320903e875SAlex Elder 5433c0cd10dbSAlex Elder if (spec->pool_id > (u64)U32_MAX) { 54349584d508SIlya Dryomov rbd_warn(NULL, "pool id too large (%llu > %u)", 5435c0cd10dbSAlex Elder (unsigned long long)spec->pool_id, U32_MAX); 54360903e875SAlex Elder rc = -EIO; 54370903e875SAlex Elder goto err_out_client; 54380903e875SAlex Elder } 54390903e875SAlex Elder 5440d147543dSIlya Dryomov rbd_dev = rbd_dev_create(rbdc, spec, rbd_opts); 5441b51c83c2SIlya Dryomov if (!rbd_dev) { 5442b51c83c2SIlya Dryomov rc = -ENOMEM; 5443bd4ba655SAlex Elder goto err_out_client; 5444b51c83c2SIlya Dryomov } 5445c53d5893SAlex Elder rbdc = NULL; /* rbd_dev now owns this */ 5446c53d5893SAlex Elder spec = NULL; /* rbd_dev now owns this */ 5447d147543dSIlya Dryomov rbd_opts = NULL; /* rbd_dev now owns this */ 5448602adf40SYehuda Sadeh 54496d69bb53SIlya Dryomov rc = rbd_dev_image_probe(rbd_dev, 0); 5450a30b71b9SAlex Elder if (rc < 0) 5451c53d5893SAlex Elder goto err_out_rbd_dev; 545205fd6f6fSAlex Elder 
54537ce4eef7SAlex Elder /* If we are mapping a snapshot it must be marked read-only */ 54547ce4eef7SAlex Elder 5455d147543dSIlya Dryomov read_only = rbd_dev->opts->read_only; 54567ce4eef7SAlex Elder if (rbd_dev->spec->snap_id != CEPH_NOSNAP) 54577ce4eef7SAlex Elder read_only = true; 54587ce4eef7SAlex Elder rbd_dev->mapping.read_only = read_only; 54597ce4eef7SAlex Elder 5460b536f69aSAlex Elder rc = rbd_dev_device_setup(rbd_dev); 54613abef3b3SAlex Elder if (rc) { 5462e37180c0SIlya Dryomov /* 5463e37180c0SIlya Dryomov * rbd_dev_header_unwatch_sync() can't be moved into 5464e37180c0SIlya Dryomov * rbd_dev_image_release() without refactoring, see 5465e37180c0SIlya Dryomov * commit 1f3ef78861ac. 5466e37180c0SIlya Dryomov */ 5467e37180c0SIlya Dryomov rbd_dev_header_unwatch_sync(rbd_dev); 54683abef3b3SAlex Elder rbd_dev_image_release(rbd_dev); 5469dd5ac32dSIlya Dryomov goto out; 54703abef3b3SAlex Elder } 54713abef3b3SAlex Elder 5472dd5ac32dSIlya Dryomov rc = count; 5473dd5ac32dSIlya Dryomov out: 5474dd5ac32dSIlya Dryomov module_put(THIS_MODULE); 5475dd5ac32dSIlya Dryomov return rc; 5476b536f69aSAlex Elder 5477c53d5893SAlex Elder err_out_rbd_dev: 5478c53d5893SAlex Elder rbd_dev_destroy(rbd_dev); 5479bd4ba655SAlex Elder err_out_client: 54809d3997fdSAlex Elder rbd_put_client(rbdc); 54810ddebc0cSAlex Elder err_out_args: 5482859c31dfSAlex Elder rbd_spec_put(spec); 5483d147543dSIlya Dryomov kfree(rbd_opts); 5484dd5ac32dSIlya Dryomov goto out; 5485602adf40SYehuda Sadeh } 5486602adf40SYehuda Sadeh 54879b60e70bSIlya Dryomov static ssize_t rbd_add(struct bus_type *bus, 54889b60e70bSIlya Dryomov const char *buf, 54899b60e70bSIlya Dryomov size_t count) 54909b60e70bSIlya Dryomov { 54919b60e70bSIlya Dryomov if (single_major) 54929b60e70bSIlya Dryomov return -EINVAL; 54939b60e70bSIlya Dryomov 54949b60e70bSIlya Dryomov return do_rbd_add(bus, buf, count); 54959b60e70bSIlya Dryomov } 54969b60e70bSIlya Dryomov 54979b60e70bSIlya Dryomov static ssize_t rbd_add_single_major(struct bus_type 
*bus, 54989b60e70bSIlya Dryomov const char *buf, 54999b60e70bSIlya Dryomov size_t count) 55009b60e70bSIlya Dryomov { 55019b60e70bSIlya Dryomov return do_rbd_add(bus, buf, count); 55029b60e70bSIlya Dryomov } 55039b60e70bSIlya Dryomov 5504dd5ac32dSIlya Dryomov static void rbd_dev_device_release(struct rbd_device *rbd_dev) 5505602adf40SYehuda Sadeh { 5506602adf40SYehuda Sadeh rbd_free_disk(rbd_dev); 5507200a6a8bSAlex Elder clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags); 5508dd5ac32dSIlya Dryomov device_del(&rbd_dev->dev); 55096d80b130SAlex Elder rbd_dev_mapping_clear(rbd_dev); 55109b60e70bSIlya Dryomov if (!single_major) 5511602adf40SYehuda Sadeh unregister_blkdev(rbd_dev->major, rbd_dev->name); 5512e2839308SAlex Elder rbd_dev_id_put(rbd_dev); 5513d1cf5788SAlex Elder rbd_dev_mapping_clear(rbd_dev); 5514602adf40SYehuda Sadeh } 5515602adf40SYehuda Sadeh 551605a46afdSAlex Elder static void rbd_dev_remove_parent(struct rbd_device *rbd_dev) 551705a46afdSAlex Elder { 5518ad945fc1SAlex Elder while (rbd_dev->parent) { 551905a46afdSAlex Elder struct rbd_device *first = rbd_dev; 552005a46afdSAlex Elder struct rbd_device *second = first->parent; 552105a46afdSAlex Elder struct rbd_device *third; 552205a46afdSAlex Elder 552305a46afdSAlex Elder /* 552405a46afdSAlex Elder * Follow to the parent with no grandparent and 552505a46afdSAlex Elder * remove it. 
552605a46afdSAlex Elder */ 552705a46afdSAlex Elder while (second && (third = second->parent)) { 552805a46afdSAlex Elder first = second; 552905a46afdSAlex Elder second = third; 553005a46afdSAlex Elder } 5531ad945fc1SAlex Elder rbd_assert(second); 55328ad42cd0SAlex Elder rbd_dev_image_release(second); 5533ad945fc1SAlex Elder first->parent = NULL; 5534ad945fc1SAlex Elder first->parent_overlap = 0; 5535ad945fc1SAlex Elder 5536ad945fc1SAlex Elder rbd_assert(first->parent_spec); 553705a46afdSAlex Elder rbd_spec_put(first->parent_spec); 553805a46afdSAlex Elder first->parent_spec = NULL; 553905a46afdSAlex Elder } 554005a46afdSAlex Elder } 554105a46afdSAlex Elder 55429b60e70bSIlya Dryomov static ssize_t do_rbd_remove(struct bus_type *bus, 5543602adf40SYehuda Sadeh const char *buf, 5544602adf40SYehuda Sadeh size_t count) 5545602adf40SYehuda Sadeh { 5546602adf40SYehuda Sadeh struct rbd_device *rbd_dev = NULL; 5547751cc0e3SAlex Elder struct list_head *tmp; 5548751cc0e3SAlex Elder int dev_id; 5549602adf40SYehuda Sadeh unsigned long ul; 555082a442d2SAlex Elder bool already = false; 55510d8189e1SAlex Elder int ret; 5552602adf40SYehuda Sadeh 5553bb8e0e84SJingoo Han ret = kstrtoul(buf, 10, &ul); 55540d8189e1SAlex Elder if (ret) 55550d8189e1SAlex Elder return ret; 5556602adf40SYehuda Sadeh 5557602adf40SYehuda Sadeh /* convert to int; abort if we lost anything in the conversion */ 5558751cc0e3SAlex Elder dev_id = (int)ul; 5559751cc0e3SAlex Elder if (dev_id != ul) 5560602adf40SYehuda Sadeh return -EINVAL; 5561602adf40SYehuda Sadeh 5562602adf40SYehuda Sadeh ret = -ENOENT; 5563751cc0e3SAlex Elder spin_lock(&rbd_dev_list_lock); 5564751cc0e3SAlex Elder list_for_each(tmp, &rbd_dev_list) { 5565751cc0e3SAlex Elder rbd_dev = list_entry(tmp, struct rbd_device, node); 5566751cc0e3SAlex Elder if (rbd_dev->dev_id == dev_id) { 5567751cc0e3SAlex Elder ret = 0; 5568751cc0e3SAlex Elder break; 5569602adf40SYehuda Sadeh } 5570751cc0e3SAlex Elder } 5571751cc0e3SAlex Elder if (!ret) { 5572a14ea269SAlex 
Elder spin_lock_irq(&rbd_dev->lock); 5573b82d167bSAlex Elder if (rbd_dev->open_count) 557442382b70SAlex Elder ret = -EBUSY; 5575b82d167bSAlex Elder else 557682a442d2SAlex Elder already = test_and_set_bit(RBD_DEV_FLAG_REMOVING, 557782a442d2SAlex Elder &rbd_dev->flags); 5578a14ea269SAlex Elder spin_unlock_irq(&rbd_dev->lock); 5579751cc0e3SAlex Elder } 5580751cc0e3SAlex Elder spin_unlock(&rbd_dev_list_lock); 558182a442d2SAlex Elder if (ret < 0 || already) 55821ba0f1e7SAlex Elder return ret; 5583751cc0e3SAlex Elder 5584fca27065SIlya Dryomov rbd_dev_header_unwatch_sync(rbd_dev); 55859abc5990SJosh Durgin /* 55869abc5990SJosh Durgin * flush remaining watch callbacks - these must be complete 55879abc5990SJosh Durgin * before the osd_client is shutdown 55889abc5990SJosh Durgin */ 55899abc5990SJosh Durgin dout("%s: flushing notifies", __func__); 55909abc5990SJosh Durgin ceph_osdc_flush_notifies(&rbd_dev->rbd_client->client->osdc); 5591fca27065SIlya Dryomov 55929875201eSJosh Durgin /* 55939875201eSJosh Durgin * Don't free anything from rbd_dev->disk until after all 55949875201eSJosh Durgin * notifies are completely processed. Otherwise 55959875201eSJosh Durgin * rbd_bus_del_dev() will race with rbd_watch_cb(), resulting 55969875201eSJosh Durgin * in a potential use after free of rbd_dev->disk or rbd_dev. 
55979875201eSJosh Durgin */ 5598dd5ac32dSIlya Dryomov rbd_dev_device_release(rbd_dev); 55998ad42cd0SAlex Elder rbd_dev_image_release(rbd_dev); 5600aafb230eSAlex Elder 56011ba0f1e7SAlex Elder return count; 5602602adf40SYehuda Sadeh } 5603602adf40SYehuda Sadeh 56049b60e70bSIlya Dryomov static ssize_t rbd_remove(struct bus_type *bus, 56059b60e70bSIlya Dryomov const char *buf, 56069b60e70bSIlya Dryomov size_t count) 56079b60e70bSIlya Dryomov { 56089b60e70bSIlya Dryomov if (single_major) 56099b60e70bSIlya Dryomov return -EINVAL; 56109b60e70bSIlya Dryomov 56119b60e70bSIlya Dryomov return do_rbd_remove(bus, buf, count); 56129b60e70bSIlya Dryomov } 56139b60e70bSIlya Dryomov 56149b60e70bSIlya Dryomov static ssize_t rbd_remove_single_major(struct bus_type *bus, 56159b60e70bSIlya Dryomov const char *buf, 56169b60e70bSIlya Dryomov size_t count) 56179b60e70bSIlya Dryomov { 56189b60e70bSIlya Dryomov return do_rbd_remove(bus, buf, count); 56199b60e70bSIlya Dryomov } 56209b60e70bSIlya Dryomov 5621602adf40SYehuda Sadeh /* 5622602adf40SYehuda Sadeh * create control files in sysfs 5623dfc5606dSYehuda Sadeh * /sys/bus/rbd/... 
5624602adf40SYehuda Sadeh */ 5625602adf40SYehuda Sadeh static int rbd_sysfs_init(void) 5626602adf40SYehuda Sadeh { 5627dfc5606dSYehuda Sadeh int ret; 5628602adf40SYehuda Sadeh 5629fed4c143SAlex Elder ret = device_register(&rbd_root_dev); 5630dfc5606dSYehuda Sadeh if (ret < 0) 5631dfc5606dSYehuda Sadeh return ret; 5632602adf40SYehuda Sadeh 5633fed4c143SAlex Elder ret = bus_register(&rbd_bus_type); 5634fed4c143SAlex Elder if (ret < 0) 5635fed4c143SAlex Elder device_unregister(&rbd_root_dev); 5636602adf40SYehuda Sadeh 5637602adf40SYehuda Sadeh return ret; 5638602adf40SYehuda Sadeh } 5639602adf40SYehuda Sadeh 5640602adf40SYehuda Sadeh static void rbd_sysfs_cleanup(void) 5641602adf40SYehuda Sadeh { 5642dfc5606dSYehuda Sadeh bus_unregister(&rbd_bus_type); 5643fed4c143SAlex Elder device_unregister(&rbd_root_dev); 5644602adf40SYehuda Sadeh } 5645602adf40SYehuda Sadeh 56461c2a9dfeSAlex Elder static int rbd_slab_init(void) 56471c2a9dfeSAlex Elder { 56481c2a9dfeSAlex Elder rbd_assert(!rbd_img_request_cache); 56491c2a9dfeSAlex Elder rbd_img_request_cache = kmem_cache_create("rbd_img_request", 56501c2a9dfeSAlex Elder sizeof (struct rbd_img_request), 56511c2a9dfeSAlex Elder __alignof__(struct rbd_img_request), 56521c2a9dfeSAlex Elder 0, NULL); 5653868311b1SAlex Elder if (!rbd_img_request_cache) 5654868311b1SAlex Elder return -ENOMEM; 5655868311b1SAlex Elder 5656868311b1SAlex Elder rbd_assert(!rbd_obj_request_cache); 5657868311b1SAlex Elder rbd_obj_request_cache = kmem_cache_create("rbd_obj_request", 5658868311b1SAlex Elder sizeof (struct rbd_obj_request), 5659868311b1SAlex Elder __alignof__(struct rbd_obj_request), 5660868311b1SAlex Elder 0, NULL); 566178c2a44aSAlex Elder if (!rbd_obj_request_cache) 566278c2a44aSAlex Elder goto out_err; 566378c2a44aSAlex Elder 566478c2a44aSAlex Elder rbd_assert(!rbd_segment_name_cache); 566578c2a44aSAlex Elder rbd_segment_name_cache = kmem_cache_create("rbd_segment_name", 56662d0ebc5dSIlya Dryomov CEPH_MAX_OID_NAME_LEN + 1, 1, 0, NULL); 
566778c2a44aSAlex Elder if (rbd_segment_name_cache) 56681c2a9dfeSAlex Elder return 0; 566978c2a44aSAlex Elder out_err: 567078c2a44aSAlex Elder kmem_cache_destroy(rbd_obj_request_cache); 567178c2a44aSAlex Elder rbd_obj_request_cache = NULL; 56721c2a9dfeSAlex Elder 5673868311b1SAlex Elder kmem_cache_destroy(rbd_img_request_cache); 5674868311b1SAlex Elder rbd_img_request_cache = NULL; 5675868311b1SAlex Elder 56761c2a9dfeSAlex Elder return -ENOMEM; 56771c2a9dfeSAlex Elder } 56781c2a9dfeSAlex Elder 56791c2a9dfeSAlex Elder static void rbd_slab_exit(void) 56801c2a9dfeSAlex Elder { 568178c2a44aSAlex Elder rbd_assert(rbd_segment_name_cache); 568278c2a44aSAlex Elder kmem_cache_destroy(rbd_segment_name_cache); 568378c2a44aSAlex Elder rbd_segment_name_cache = NULL; 568478c2a44aSAlex Elder 5685868311b1SAlex Elder rbd_assert(rbd_obj_request_cache); 5686868311b1SAlex Elder kmem_cache_destroy(rbd_obj_request_cache); 5687868311b1SAlex Elder rbd_obj_request_cache = NULL; 5688868311b1SAlex Elder 56891c2a9dfeSAlex Elder rbd_assert(rbd_img_request_cache); 56901c2a9dfeSAlex Elder kmem_cache_destroy(rbd_img_request_cache); 56911c2a9dfeSAlex Elder rbd_img_request_cache = NULL; 56921c2a9dfeSAlex Elder } 56931c2a9dfeSAlex Elder 5694cc344fa1SAlex Elder static int __init rbd_init(void) 5695602adf40SYehuda Sadeh { 5696602adf40SYehuda Sadeh int rc; 5697602adf40SYehuda Sadeh 56981e32d34cSAlex Elder if (!libceph_compatible(NULL)) { 56991e32d34cSAlex Elder rbd_warn(NULL, "libceph incompatibility (quitting)"); 57001e32d34cSAlex Elder return -EINVAL; 57011e32d34cSAlex Elder } 5702e1b4d96dSIlya Dryomov 57031c2a9dfeSAlex Elder rc = rbd_slab_init(); 5704602adf40SYehuda Sadeh if (rc) 5705602adf40SYehuda Sadeh return rc; 5706e1b4d96dSIlya Dryomov 5707f5ee37bdSIlya Dryomov /* 5708f5ee37bdSIlya Dryomov * The number of active work items is limited by the number of 5709f77303bdSIlya Dryomov * rbd devices * queue depth, so leave @max_active at default. 
5710f5ee37bdSIlya Dryomov */ 5711f5ee37bdSIlya Dryomov rbd_wq = alloc_workqueue(RBD_DRV_NAME, WQ_MEM_RECLAIM, 0); 5712f5ee37bdSIlya Dryomov if (!rbd_wq) { 5713f5ee37bdSIlya Dryomov rc = -ENOMEM; 5714f5ee37bdSIlya Dryomov goto err_out_slab; 5715f5ee37bdSIlya Dryomov } 5716f5ee37bdSIlya Dryomov 57179b60e70bSIlya Dryomov if (single_major) { 57189b60e70bSIlya Dryomov rbd_major = register_blkdev(0, RBD_DRV_NAME); 57199b60e70bSIlya Dryomov if (rbd_major < 0) { 57209b60e70bSIlya Dryomov rc = rbd_major; 5721f5ee37bdSIlya Dryomov goto err_out_wq; 57229b60e70bSIlya Dryomov } 57239b60e70bSIlya Dryomov } 57249b60e70bSIlya Dryomov 57251c2a9dfeSAlex Elder rc = rbd_sysfs_init(); 57261c2a9dfeSAlex Elder if (rc) 57279b60e70bSIlya Dryomov goto err_out_blkdev; 57281c2a9dfeSAlex Elder 57299b60e70bSIlya Dryomov if (single_major) 57309b60e70bSIlya Dryomov pr_info("loaded (major %d)\n", rbd_major); 57319b60e70bSIlya Dryomov else 5732e1b4d96dSIlya Dryomov pr_info("loaded\n"); 57339b60e70bSIlya Dryomov 5734e1b4d96dSIlya Dryomov return 0; 5735e1b4d96dSIlya Dryomov 57369b60e70bSIlya Dryomov err_out_blkdev: 57379b60e70bSIlya Dryomov if (single_major) 57389b60e70bSIlya Dryomov unregister_blkdev(rbd_major, RBD_DRV_NAME); 5739f5ee37bdSIlya Dryomov err_out_wq: 5740f5ee37bdSIlya Dryomov destroy_workqueue(rbd_wq); 5741e1b4d96dSIlya Dryomov err_out_slab: 5742e1b4d96dSIlya Dryomov rbd_slab_exit(); 57431c2a9dfeSAlex Elder return rc; 5744602adf40SYehuda Sadeh } 5745602adf40SYehuda Sadeh 5746cc344fa1SAlex Elder static void __exit rbd_exit(void) 5747602adf40SYehuda Sadeh { 5748ffe312cfSIlya Dryomov ida_destroy(&rbd_dev_id_ida); 5749602adf40SYehuda Sadeh rbd_sysfs_cleanup(); 57509b60e70bSIlya Dryomov if (single_major) 57519b60e70bSIlya Dryomov unregister_blkdev(rbd_major, RBD_DRV_NAME); 5752f5ee37bdSIlya Dryomov destroy_workqueue(rbd_wq); 57531c2a9dfeSAlex Elder rbd_slab_exit(); 5754602adf40SYehuda Sadeh } 5755602adf40SYehuda Sadeh 5756602adf40SYehuda Sadeh module_init(rbd_init); 5757602adf40SYehuda Sadeh 
module_exit(rbd_exit); 5758602adf40SYehuda Sadeh 5759d552c619SAlex Elder MODULE_AUTHOR("Alex Elder <elder@inktank.com>"); 5760602adf40SYehuda Sadeh MODULE_AUTHOR("Sage Weil <sage@newdream.net>"); 5761602adf40SYehuda Sadeh MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>"); 5762602adf40SYehuda Sadeh /* following authorship retained from original osdblk.c */ 5763602adf40SYehuda Sadeh MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>"); 5764602adf40SYehuda Sadeh 576590da258bSIlya Dryomov MODULE_DESCRIPTION("RADOS Block Device (RBD) driver"); 5766602adf40SYehuda Sadeh MODULE_LICENSE("GPL"); 5767