1e2a58ee5SAlex Elder 2602adf40SYehuda Sadeh /* 3602adf40SYehuda Sadeh rbd.c -- Export ceph rados objects as a Linux block device 4602adf40SYehuda Sadeh 5602adf40SYehuda Sadeh 6602adf40SYehuda Sadeh based on drivers/block/osdblk.c: 7602adf40SYehuda Sadeh 8602adf40SYehuda Sadeh Copyright 2009 Red Hat, Inc. 9602adf40SYehuda Sadeh 10602adf40SYehuda Sadeh This program is free software; you can redistribute it and/or modify 11602adf40SYehuda Sadeh it under the terms of the GNU General Public License as published by 12602adf40SYehuda Sadeh the Free Software Foundation. 13602adf40SYehuda Sadeh 14602adf40SYehuda Sadeh This program is distributed in the hope that it will be useful, 15602adf40SYehuda Sadeh but WITHOUT ANY WARRANTY; without even the implied warranty of 16602adf40SYehuda Sadeh MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17602adf40SYehuda Sadeh GNU General Public License for more details. 18602adf40SYehuda Sadeh 19602adf40SYehuda Sadeh You should have received a copy of the GNU General Public License 20602adf40SYehuda Sadeh along with this program; see the file COPYING. If not, write to 21602adf40SYehuda Sadeh the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. 
22602adf40SYehuda Sadeh 23602adf40SYehuda Sadeh 24602adf40SYehuda Sadeh 25dfc5606dSYehuda Sadeh For usage instructions, please refer to: 26602adf40SYehuda Sadeh 27dfc5606dSYehuda Sadeh Documentation/ABI/testing/sysfs-bus-rbd 28602adf40SYehuda Sadeh 29602adf40SYehuda Sadeh */ 30602adf40SYehuda Sadeh 31602adf40SYehuda Sadeh #include <linux/ceph/libceph.h> 32602adf40SYehuda Sadeh #include <linux/ceph/osd_client.h> 33602adf40SYehuda Sadeh #include <linux/ceph/mon_client.h> 34602adf40SYehuda Sadeh #include <linux/ceph/decode.h> 3559c2be1eSYehuda Sadeh #include <linux/parser.h> 3630d1cff8SAlex Elder #include <linux/bsearch.h> 37602adf40SYehuda Sadeh 38602adf40SYehuda Sadeh #include <linux/kernel.h> 39602adf40SYehuda Sadeh #include <linux/device.h> 40602adf40SYehuda Sadeh #include <linux/module.h> 41602adf40SYehuda Sadeh #include <linux/fs.h> 42602adf40SYehuda Sadeh #include <linux/blkdev.h> 431c2a9dfeSAlex Elder #include <linux/slab.h> 44f8a22fc2SIlya Dryomov #include <linux/idr.h> 45602adf40SYehuda Sadeh 46602adf40SYehuda Sadeh #include "rbd_types.h" 47602adf40SYehuda Sadeh 48aafb230eSAlex Elder #define RBD_DEBUG /* Activate rbd_assert() calls */ 49aafb230eSAlex Elder 50593a9e7bSAlex Elder /* 51593a9e7bSAlex Elder * The basic unit of block I/O is a sector. It is interpreted in a 52593a9e7bSAlex Elder * number of contexts in Linux (blk, bio, genhd), but the default is 53593a9e7bSAlex Elder * universally 512 bytes. These symbols are just slightly more 54593a9e7bSAlex Elder * meaningful than the bare numbers they represent. 55593a9e7bSAlex Elder */ 56593a9e7bSAlex Elder #define SECTOR_SHIFT 9 57593a9e7bSAlex Elder #define SECTOR_SIZE (1ULL << SECTOR_SHIFT) 58593a9e7bSAlex Elder 59a2acd00eSAlex Elder /* 60a2acd00eSAlex Elder * Increment the given counter and return its updated value. 61a2acd00eSAlex Elder * If the counter is already 0 it will not be incremented. 
62a2acd00eSAlex Elder * If the counter is already at its maximum value returns 63a2acd00eSAlex Elder * -EINVAL without updating it. 64a2acd00eSAlex Elder */ 65a2acd00eSAlex Elder static int atomic_inc_return_safe(atomic_t *v) 66a2acd00eSAlex Elder { 67a2acd00eSAlex Elder unsigned int counter; 68a2acd00eSAlex Elder 69a2acd00eSAlex Elder counter = (unsigned int)__atomic_add_unless(v, 1, 0); 70a2acd00eSAlex Elder if (counter <= (unsigned int)INT_MAX) 71a2acd00eSAlex Elder return (int)counter; 72a2acd00eSAlex Elder 73a2acd00eSAlex Elder atomic_dec(v); 74a2acd00eSAlex Elder 75a2acd00eSAlex Elder return -EINVAL; 76a2acd00eSAlex Elder } 77a2acd00eSAlex Elder 78a2acd00eSAlex Elder /* Decrement the counter. Return the resulting value, or -EINVAL */ 79a2acd00eSAlex Elder static int atomic_dec_return_safe(atomic_t *v) 80a2acd00eSAlex Elder { 81a2acd00eSAlex Elder int counter; 82a2acd00eSAlex Elder 83a2acd00eSAlex Elder counter = atomic_dec_return(v); 84a2acd00eSAlex Elder if (counter >= 0) 85a2acd00eSAlex Elder return counter; 86a2acd00eSAlex Elder 87a2acd00eSAlex Elder atomic_inc(v); 88a2acd00eSAlex Elder 89a2acd00eSAlex Elder return -EINVAL; 90a2acd00eSAlex Elder } 91a2acd00eSAlex Elder 92f0f8cef5SAlex Elder #define RBD_DRV_NAME "rbd" 93602adf40SYehuda Sadeh 947e513d43SIlya Dryomov #define RBD_MINORS_PER_MAJOR 256 957e513d43SIlya Dryomov #define RBD_SINGLE_MAJOR_PART_SHIFT 4 96602adf40SYehuda Sadeh 97d4b125e9SAlex Elder #define RBD_SNAP_DEV_NAME_PREFIX "snap_" 98d4b125e9SAlex Elder #define RBD_MAX_SNAP_NAME_LEN \ 99d4b125e9SAlex Elder (NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1)) 100d4b125e9SAlex Elder 10135d489f9SAlex Elder #define RBD_MAX_SNAP_COUNT 510 /* allows max snapc to fit in 4KB */ 102602adf40SYehuda Sadeh 103602adf40SYehuda Sadeh #define RBD_SNAP_HEAD_NAME "-" 104602adf40SYehuda Sadeh 1059682fc6dSAlex Elder #define BAD_SNAP_INDEX U32_MAX /* invalid index into snap array */ 1069682fc6dSAlex Elder 1079e15b77dSAlex Elder /* This allows a single page to hold 
an image name sent by OSD */ 1089e15b77dSAlex Elder #define RBD_IMAGE_NAME_LEN_MAX (PAGE_SIZE - sizeof (__le32) - 1) 109589d30e0SAlex Elder #define RBD_IMAGE_ID_LEN_MAX 64 1109e15b77dSAlex Elder 1111e130199SAlex Elder #define RBD_OBJ_PREFIX_LEN_MAX 64 112589d30e0SAlex Elder 113d889140cSAlex Elder /* Feature bits */ 114d889140cSAlex Elder 1155cbf6f12SAlex Elder #define RBD_FEATURE_LAYERING (1<<0) 1165cbf6f12SAlex Elder #define RBD_FEATURE_STRIPINGV2 (1<<1) 1175cbf6f12SAlex Elder #define RBD_FEATURES_ALL \ 1185cbf6f12SAlex Elder (RBD_FEATURE_LAYERING | RBD_FEATURE_STRIPINGV2) 119d889140cSAlex Elder 120d889140cSAlex Elder /* Features supported by this (client software) implementation. */ 121d889140cSAlex Elder 122770eba6eSAlex Elder #define RBD_FEATURES_SUPPORTED (RBD_FEATURES_ALL) 123d889140cSAlex Elder 12481a89793SAlex Elder /* 12581a89793SAlex Elder * An RBD device name will be "rbd#", where the "rbd" comes from 12681a89793SAlex Elder * RBD_DRV_NAME above, and # is a unique integer identifier. 12781a89793SAlex Elder * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big 12881a89793SAlex Elder * enough to hold all possible device names. 12981a89793SAlex Elder */ 130602adf40SYehuda Sadeh #define DEV_NAME_LEN 32 13181a89793SAlex Elder #define MAX_INT_FORMAT_WIDTH ((5 * sizeof (int)) / 2 + 1) 132602adf40SYehuda Sadeh 133602adf40SYehuda Sadeh /* 134602adf40SYehuda Sadeh * block device image metadata (in-memory version) 135602adf40SYehuda Sadeh */ 136602adf40SYehuda Sadeh struct rbd_image_header { 137f35a4deeSAlex Elder /* These six fields never change for a given rbd image */ 138849b4260SAlex Elder char *object_prefix; 139602adf40SYehuda Sadeh __u8 obj_order; 140602adf40SYehuda Sadeh __u8 crypt_type; 141602adf40SYehuda Sadeh __u8 comp_type; 142f35a4deeSAlex Elder u64 stripe_unit; 143f35a4deeSAlex Elder u64 stripe_count; 144f35a4deeSAlex Elder u64 features; /* Might be changeable someday? 
*/ 145602adf40SYehuda Sadeh 146f84344f3SAlex Elder /* The remaining fields need to be updated occasionally */ 147f84344f3SAlex Elder u64 image_size; 148f84344f3SAlex Elder struct ceph_snap_context *snapc; 149f35a4deeSAlex Elder char *snap_names; /* format 1 only */ 150f35a4deeSAlex Elder u64 *snap_sizes; /* format 1 only */ 15159c2be1eSYehuda Sadeh }; 15259c2be1eSYehuda Sadeh 1530d7dbfceSAlex Elder /* 1540d7dbfceSAlex Elder * An rbd image specification. 1550d7dbfceSAlex Elder * 1560d7dbfceSAlex Elder * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely 157c66c6e0cSAlex Elder * identify an image. Each rbd_dev structure includes a pointer to 158c66c6e0cSAlex Elder * an rbd_spec structure that encapsulates this identity. 159c66c6e0cSAlex Elder * 160c66c6e0cSAlex Elder * Each of the id's in an rbd_spec has an associated name. For a 161c66c6e0cSAlex Elder * user-mapped image, the names are supplied and the id's associated 162c66c6e0cSAlex Elder * with them are looked up. For a layered image, a parent image is 163c66c6e0cSAlex Elder * defined by the tuple, and the names are looked up. 164c66c6e0cSAlex Elder * 165c66c6e0cSAlex Elder * An rbd_dev structure contains a parent_spec pointer which is 166c66c6e0cSAlex Elder * non-null if the image it represents is a child in a layered 167c66c6e0cSAlex Elder * image. This pointer will refer to the rbd_spec structure used 168c66c6e0cSAlex Elder * by the parent rbd_dev for its own identity (i.e., the structure 169c66c6e0cSAlex Elder * is shared between the parent and child). 170c66c6e0cSAlex Elder * 171c66c6e0cSAlex Elder * Since these structures are populated once, during the discovery 172c66c6e0cSAlex Elder * phase of image construction, they are effectively immutable so 173c66c6e0cSAlex Elder * we make no effort to synchronize access to them. 174c66c6e0cSAlex Elder * 175c66c6e0cSAlex Elder * Note that code herein does not assume the image name is known (it 176c66c6e0cSAlex Elder * could be a null pointer). 
1770d7dbfceSAlex Elder */ 1780d7dbfceSAlex Elder struct rbd_spec { 1790d7dbfceSAlex Elder u64 pool_id; 180ecb4dc22SAlex Elder const char *pool_name; 1810d7dbfceSAlex Elder 182ecb4dc22SAlex Elder const char *image_id; 183ecb4dc22SAlex Elder const char *image_name; 1840d7dbfceSAlex Elder 1850d7dbfceSAlex Elder u64 snap_id; 186ecb4dc22SAlex Elder const char *snap_name; 1870d7dbfceSAlex Elder 1880d7dbfceSAlex Elder struct kref kref; 1890d7dbfceSAlex Elder }; 1900d7dbfceSAlex Elder 191602adf40SYehuda Sadeh /* 192f0f8cef5SAlex Elder * an instance of the client. multiple devices may share an rbd client. 193602adf40SYehuda Sadeh */ 194602adf40SYehuda Sadeh struct rbd_client { 195602adf40SYehuda Sadeh struct ceph_client *client; 196602adf40SYehuda Sadeh struct kref kref; 197602adf40SYehuda Sadeh struct list_head node; 198602adf40SYehuda Sadeh }; 199602adf40SYehuda Sadeh 200bf0d5f50SAlex Elder struct rbd_img_request; 201bf0d5f50SAlex Elder typedef void (*rbd_img_callback_t)(struct rbd_img_request *); 202bf0d5f50SAlex Elder 203bf0d5f50SAlex Elder #define BAD_WHICH U32_MAX /* Good which or bad which, which? 
*/ 204bf0d5f50SAlex Elder 205bf0d5f50SAlex Elder struct rbd_obj_request; 206bf0d5f50SAlex Elder typedef void (*rbd_obj_callback_t)(struct rbd_obj_request *); 207bf0d5f50SAlex Elder 2089969ebc5SAlex Elder enum obj_request_type { 2099969ebc5SAlex Elder OBJ_REQUEST_NODATA, OBJ_REQUEST_BIO, OBJ_REQUEST_PAGES 2109969ebc5SAlex Elder }; 211bf0d5f50SAlex Elder 212926f9b3fSAlex Elder enum obj_req_flags { 213926f9b3fSAlex Elder OBJ_REQ_DONE, /* completion flag: not done = 0, done = 1 */ 2146365d33aSAlex Elder OBJ_REQ_IMG_DATA, /* object usage: standalone = 0, image = 1 */ 2155679c59fSAlex Elder OBJ_REQ_KNOWN, /* EXISTS flag valid: no = 0, yes = 1 */ 2165679c59fSAlex Elder OBJ_REQ_EXISTS, /* target exists: no = 0, yes = 1 */ 217926f9b3fSAlex Elder }; 218926f9b3fSAlex Elder 219bf0d5f50SAlex Elder struct rbd_obj_request { 220bf0d5f50SAlex Elder const char *object_name; 221bf0d5f50SAlex Elder u64 offset; /* object start byte */ 222bf0d5f50SAlex Elder u64 length; /* bytes from offset */ 223926f9b3fSAlex Elder unsigned long flags; 224bf0d5f50SAlex Elder 225c5b5ef6cSAlex Elder /* 226c5b5ef6cSAlex Elder * An object request associated with an image will have its 227c5b5ef6cSAlex Elder * img_data flag set; a standalone object request will not. 228c5b5ef6cSAlex Elder * 229c5b5ef6cSAlex Elder * A standalone object request will have which == BAD_WHICH 230c5b5ef6cSAlex Elder * and a null obj_request pointer. 231c5b5ef6cSAlex Elder * 232c5b5ef6cSAlex Elder * An object request initiated in support of a layered image 233c5b5ef6cSAlex Elder * object (to check for its existence before a write) will 234c5b5ef6cSAlex Elder * have which == BAD_WHICH and a non-null obj_request pointer. 235c5b5ef6cSAlex Elder * 236c5b5ef6cSAlex Elder * Finally, an object request for rbd image data will have 237c5b5ef6cSAlex Elder * which != BAD_WHICH, and will have a non-null img_request 238c5b5ef6cSAlex Elder * pointer. 
The value of which will be in the range 239c5b5ef6cSAlex Elder * 0..(img_request->obj_request_count-1). 240c5b5ef6cSAlex Elder */ 241c5b5ef6cSAlex Elder union { 242c5b5ef6cSAlex Elder struct rbd_obj_request *obj_request; /* STAT op */ 243c5b5ef6cSAlex Elder struct { 244bf0d5f50SAlex Elder struct rbd_img_request *img_request; 245c5b5ef6cSAlex Elder u64 img_offset; 246c5b5ef6cSAlex Elder /* links for img_request->obj_requests list */ 247c5b5ef6cSAlex Elder struct list_head links; 248c5b5ef6cSAlex Elder }; 249c5b5ef6cSAlex Elder }; 250bf0d5f50SAlex Elder u32 which; /* posn image request list */ 251bf0d5f50SAlex Elder 252bf0d5f50SAlex Elder enum obj_request_type type; 253788e2df3SAlex Elder union { 254bf0d5f50SAlex Elder struct bio *bio_list; 255788e2df3SAlex Elder struct { 256788e2df3SAlex Elder struct page **pages; 257788e2df3SAlex Elder u32 page_count; 258788e2df3SAlex Elder }; 259788e2df3SAlex Elder }; 2600eefd470SAlex Elder struct page **copyup_pages; 261ebda6408SAlex Elder u32 copyup_page_count; 262bf0d5f50SAlex Elder 263bf0d5f50SAlex Elder struct ceph_osd_request *osd_req; 264bf0d5f50SAlex Elder 265bf0d5f50SAlex Elder u64 xferred; /* bytes transferred */ 2661b83bef2SSage Weil int result; 267bf0d5f50SAlex Elder 268bf0d5f50SAlex Elder rbd_obj_callback_t callback; 269788e2df3SAlex Elder struct completion completion; 270bf0d5f50SAlex Elder 271bf0d5f50SAlex Elder struct kref kref; 272bf0d5f50SAlex Elder }; 273bf0d5f50SAlex Elder 2740c425248SAlex Elder enum img_req_flags { 2759849e986SAlex Elder IMG_REQ_WRITE, /* I/O direction: read = 0, write = 1 */ 2769849e986SAlex Elder IMG_REQ_CHILD, /* initiator: block = 0, child image = 1 */ 277d0b2e944SAlex Elder IMG_REQ_LAYERED, /* ENOENT handling: normal = 0, layered = 1 */ 2780c425248SAlex Elder }; 2790c425248SAlex Elder 280bf0d5f50SAlex Elder struct rbd_img_request { 281bf0d5f50SAlex Elder struct rbd_device *rbd_dev; 282bf0d5f50SAlex Elder u64 offset; /* starting image byte offset */ 283bf0d5f50SAlex Elder u64 length; /* 
byte count from offset */ 2840c425248SAlex Elder unsigned long flags; 285bf0d5f50SAlex Elder union { 286bf0d5f50SAlex Elder u64 snap_id; /* for reads */ 2879849e986SAlex Elder struct ceph_snap_context *snapc; /* for writes */ 2889849e986SAlex Elder }; 2899849e986SAlex Elder union { 2909849e986SAlex Elder struct request *rq; /* block request */ 2919849e986SAlex Elder struct rbd_obj_request *obj_request; /* obj req initiator */ 292bf0d5f50SAlex Elder }; 2933d7efd18SAlex Elder struct page **copyup_pages; 294ebda6408SAlex Elder u32 copyup_page_count; 295bf0d5f50SAlex Elder spinlock_t completion_lock;/* protects next_completion */ 296bf0d5f50SAlex Elder u32 next_completion; 297bf0d5f50SAlex Elder rbd_img_callback_t callback; 29855f27e09SAlex Elder u64 xferred;/* aggregate bytes transferred */ 299a5a337d4SAlex Elder int result; /* first nonzero obj_request result */ 300bf0d5f50SAlex Elder 301bf0d5f50SAlex Elder u32 obj_request_count; 302bf0d5f50SAlex Elder struct list_head obj_requests; /* rbd_obj_request structs */ 303bf0d5f50SAlex Elder 304bf0d5f50SAlex Elder struct kref kref; 305bf0d5f50SAlex Elder }; 306bf0d5f50SAlex Elder 307bf0d5f50SAlex Elder #define for_each_obj_request(ireq, oreq) \ 308ef06f4d3SAlex Elder list_for_each_entry(oreq, &(ireq)->obj_requests, links) 309bf0d5f50SAlex Elder #define for_each_obj_request_from(ireq, oreq) \ 310ef06f4d3SAlex Elder list_for_each_entry_from(oreq, &(ireq)->obj_requests, links) 311bf0d5f50SAlex Elder #define for_each_obj_request_safe(ireq, oreq, n) \ 312ef06f4d3SAlex Elder list_for_each_entry_safe_reverse(oreq, n, &(ireq)->obj_requests, links) 313bf0d5f50SAlex Elder 314f84344f3SAlex Elder struct rbd_mapping { 31599c1f08fSAlex Elder u64 size; 31634b13184SAlex Elder u64 features; 317f84344f3SAlex Elder bool read_only; 318f84344f3SAlex Elder }; 319f84344f3SAlex Elder 320602adf40SYehuda Sadeh /* 321602adf40SYehuda Sadeh * a single device 322602adf40SYehuda Sadeh */ 323602adf40SYehuda Sadeh struct rbd_device { 324de71a297SAlex Elder 
int dev_id; /* blkdev unique id */ 325602adf40SYehuda Sadeh 326602adf40SYehuda Sadeh int major; /* blkdev assigned major */ 327dd82fff1SIlya Dryomov int minor; 328602adf40SYehuda Sadeh struct gendisk *disk; /* blkdev's gendisk and rq */ 329602adf40SYehuda Sadeh 330a30b71b9SAlex Elder u32 image_format; /* Either 1 or 2 */ 331602adf40SYehuda Sadeh struct rbd_client *rbd_client; 332602adf40SYehuda Sadeh 333602adf40SYehuda Sadeh char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */ 334602adf40SYehuda Sadeh 335b82d167bSAlex Elder spinlock_t lock; /* queue, flags, open_count */ 336602adf40SYehuda Sadeh 337602adf40SYehuda Sadeh struct rbd_image_header header; 338b82d167bSAlex Elder unsigned long flags; /* possibly lock protected */ 3390d7dbfceSAlex Elder struct rbd_spec *spec; 340602adf40SYehuda Sadeh 3410d7dbfceSAlex Elder char *header_name; 342971f839aSAlex Elder 3430903e875SAlex Elder struct ceph_file_layout layout; 3440903e875SAlex Elder 34559c2be1eSYehuda Sadeh struct ceph_osd_event *watch_event; 346975241afSAlex Elder struct rbd_obj_request *watch_request; 34759c2be1eSYehuda Sadeh 34886b00e0dSAlex Elder struct rbd_spec *parent_spec; 34986b00e0dSAlex Elder u64 parent_overlap; 350a2acd00eSAlex Elder atomic_t parent_ref; 3512f82ee54SAlex Elder struct rbd_device *parent; 35286b00e0dSAlex Elder 353c666601aSJosh Durgin /* protects updating the header */ 354c666601aSJosh Durgin struct rw_semaphore header_rwsem; 355f84344f3SAlex Elder 356f84344f3SAlex Elder struct rbd_mapping mapping; 357602adf40SYehuda Sadeh 358602adf40SYehuda Sadeh struct list_head node; 359dfc5606dSYehuda Sadeh 360dfc5606dSYehuda Sadeh /* sysfs related */ 361dfc5606dSYehuda Sadeh struct device dev; 362b82d167bSAlex Elder unsigned long open_count; /* protected by lock */ 363dfc5606dSYehuda Sadeh }; 364dfc5606dSYehuda Sadeh 365b82d167bSAlex Elder /* 366b82d167bSAlex Elder * Flag bits for rbd_dev->flags. If atomicity is required, 367b82d167bSAlex Elder * rbd_dev->lock is used to protect access. 
368b82d167bSAlex Elder * 369b82d167bSAlex Elder * Currently, only the "removing" flag (which is coupled with the 370b82d167bSAlex Elder * "open_count" field) requires atomic access. 371b82d167bSAlex Elder */ 3726d292906SAlex Elder enum rbd_dev_flags { 3736d292906SAlex Elder RBD_DEV_FLAG_EXISTS, /* mapped snapshot has not been deleted */ 374b82d167bSAlex Elder RBD_DEV_FLAG_REMOVING, /* this mapping is being removed */ 3756d292906SAlex Elder }; 3766d292906SAlex Elder 377cfbf6377SAlex Elder static DEFINE_MUTEX(client_mutex); /* Serialize client creation */ 378e124a82fSAlex Elder 379602adf40SYehuda Sadeh static LIST_HEAD(rbd_dev_list); /* devices */ 380e124a82fSAlex Elder static DEFINE_SPINLOCK(rbd_dev_list_lock); 381e124a82fSAlex Elder 382602adf40SYehuda Sadeh static LIST_HEAD(rbd_client_list); /* clients */ 383432b8587SAlex Elder static DEFINE_SPINLOCK(rbd_client_list_lock); 384602adf40SYehuda Sadeh 38578c2a44aSAlex Elder /* Slab caches for frequently-allocated structures */ 38678c2a44aSAlex Elder 3871c2a9dfeSAlex Elder static struct kmem_cache *rbd_img_request_cache; 388868311b1SAlex Elder static struct kmem_cache *rbd_obj_request_cache; 38978c2a44aSAlex Elder static struct kmem_cache *rbd_segment_name_cache; 3901c2a9dfeSAlex Elder 3919b60e70bSIlya Dryomov static int rbd_major; 392f8a22fc2SIlya Dryomov static DEFINE_IDA(rbd_dev_id_ida); 393f8a22fc2SIlya Dryomov 3949b60e70bSIlya Dryomov /* 3959b60e70bSIlya Dryomov * Default to false for now, as single-major requires >= 0.75 version of 3969b60e70bSIlya Dryomov * userspace rbd utility. 
3979b60e70bSIlya Dryomov */ 3989b60e70bSIlya Dryomov static bool single_major = false; 3999b60e70bSIlya Dryomov module_param(single_major, bool, S_IRUGO); 4009b60e70bSIlya Dryomov MODULE_PARM_DESC(single_major, "Use a single major number for all rbd devices (default: false)"); 4019b60e70bSIlya Dryomov 4023d7efd18SAlex Elder static int rbd_img_request_submit(struct rbd_img_request *img_request); 4033d7efd18SAlex Elder 404200a6a8bSAlex Elder static void rbd_dev_device_release(struct device *dev); 405dfc5606dSYehuda Sadeh 406f0f8cef5SAlex Elder static ssize_t rbd_add(struct bus_type *bus, const char *buf, 407f0f8cef5SAlex Elder size_t count); 408f0f8cef5SAlex Elder static ssize_t rbd_remove(struct bus_type *bus, const char *buf, 409f0f8cef5SAlex Elder size_t count); 4109b60e70bSIlya Dryomov static ssize_t rbd_add_single_major(struct bus_type *bus, const char *buf, 4119b60e70bSIlya Dryomov size_t count); 4129b60e70bSIlya Dryomov static ssize_t rbd_remove_single_major(struct bus_type *bus, const char *buf, 4139b60e70bSIlya Dryomov size_t count); 4141f3ef788SAlex Elder static int rbd_dev_image_probe(struct rbd_device *rbd_dev, bool mapping); 415a2acd00eSAlex Elder static void rbd_spec_put(struct rbd_spec *spec); 416f0f8cef5SAlex Elder 4179b60e70bSIlya Dryomov static int rbd_dev_id_to_minor(int dev_id) 4189b60e70bSIlya Dryomov { 4197e513d43SIlya Dryomov return dev_id << RBD_SINGLE_MAJOR_PART_SHIFT; 4209b60e70bSIlya Dryomov } 4219b60e70bSIlya Dryomov 4229b60e70bSIlya Dryomov static int minor_to_rbd_dev_id(int minor) 4239b60e70bSIlya Dryomov { 4247e513d43SIlya Dryomov return minor >> RBD_SINGLE_MAJOR_PART_SHIFT; 4259b60e70bSIlya Dryomov } 4269b60e70bSIlya Dryomov 427b15a21ddSGreg Kroah-Hartman static BUS_ATTR(add, S_IWUSR, NULL, rbd_add); 428b15a21ddSGreg Kroah-Hartman static BUS_ATTR(remove, S_IWUSR, NULL, rbd_remove); 4299b60e70bSIlya Dryomov static BUS_ATTR(add_single_major, S_IWUSR, NULL, rbd_add_single_major); 4309b60e70bSIlya Dryomov static 
BUS_ATTR(remove_single_major, S_IWUSR, NULL, rbd_remove_single_major); 431b15a21ddSGreg Kroah-Hartman 432b15a21ddSGreg Kroah-Hartman static struct attribute *rbd_bus_attrs[] = { 433b15a21ddSGreg Kroah-Hartman &bus_attr_add.attr, 434b15a21ddSGreg Kroah-Hartman &bus_attr_remove.attr, 4359b60e70bSIlya Dryomov &bus_attr_add_single_major.attr, 4369b60e70bSIlya Dryomov &bus_attr_remove_single_major.attr, 437b15a21ddSGreg Kroah-Hartman NULL, 438f0f8cef5SAlex Elder }; 43992c76dc0SIlya Dryomov 44092c76dc0SIlya Dryomov static umode_t rbd_bus_is_visible(struct kobject *kobj, 44192c76dc0SIlya Dryomov struct attribute *attr, int index) 44292c76dc0SIlya Dryomov { 4439b60e70bSIlya Dryomov if (!single_major && 4449b60e70bSIlya Dryomov (attr == &bus_attr_add_single_major.attr || 4459b60e70bSIlya Dryomov attr == &bus_attr_remove_single_major.attr)) 4469b60e70bSIlya Dryomov return 0; 4479b60e70bSIlya Dryomov 44892c76dc0SIlya Dryomov return attr->mode; 44992c76dc0SIlya Dryomov } 45092c76dc0SIlya Dryomov 45192c76dc0SIlya Dryomov static const struct attribute_group rbd_bus_group = { 45292c76dc0SIlya Dryomov .attrs = rbd_bus_attrs, 45392c76dc0SIlya Dryomov .is_visible = rbd_bus_is_visible, 45492c76dc0SIlya Dryomov }; 45592c76dc0SIlya Dryomov __ATTRIBUTE_GROUPS(rbd_bus); 456f0f8cef5SAlex Elder 457f0f8cef5SAlex Elder static struct bus_type rbd_bus_type = { 458f0f8cef5SAlex Elder .name = "rbd", 459b15a21ddSGreg Kroah-Hartman .bus_groups = rbd_bus_groups, 460f0f8cef5SAlex Elder }; 461f0f8cef5SAlex Elder 462f0f8cef5SAlex Elder static void rbd_root_dev_release(struct device *dev) 463f0f8cef5SAlex Elder { 464f0f8cef5SAlex Elder } 465f0f8cef5SAlex Elder 466f0f8cef5SAlex Elder static struct device rbd_root_dev = { 467f0f8cef5SAlex Elder .init_name = "rbd", 468f0f8cef5SAlex Elder .release = rbd_root_dev_release, 469f0f8cef5SAlex Elder }; 470f0f8cef5SAlex Elder 47106ecc6cbSAlex Elder static __printf(2, 3) 47206ecc6cbSAlex Elder void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...) 
47306ecc6cbSAlex Elder { 47406ecc6cbSAlex Elder struct va_format vaf; 47506ecc6cbSAlex Elder va_list args; 47606ecc6cbSAlex Elder 47706ecc6cbSAlex Elder va_start(args, fmt); 47806ecc6cbSAlex Elder vaf.fmt = fmt; 47906ecc6cbSAlex Elder vaf.va = &args; 48006ecc6cbSAlex Elder 48106ecc6cbSAlex Elder if (!rbd_dev) 48206ecc6cbSAlex Elder printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf); 48306ecc6cbSAlex Elder else if (rbd_dev->disk) 48406ecc6cbSAlex Elder printk(KERN_WARNING "%s: %s: %pV\n", 48506ecc6cbSAlex Elder RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf); 48606ecc6cbSAlex Elder else if (rbd_dev->spec && rbd_dev->spec->image_name) 48706ecc6cbSAlex Elder printk(KERN_WARNING "%s: image %s: %pV\n", 48806ecc6cbSAlex Elder RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf); 48906ecc6cbSAlex Elder else if (rbd_dev->spec && rbd_dev->spec->image_id) 49006ecc6cbSAlex Elder printk(KERN_WARNING "%s: id %s: %pV\n", 49106ecc6cbSAlex Elder RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf); 49206ecc6cbSAlex Elder else /* punt */ 49306ecc6cbSAlex Elder printk(KERN_WARNING "%s: rbd_dev %p: %pV\n", 49406ecc6cbSAlex Elder RBD_DRV_NAME, rbd_dev, &vaf); 49506ecc6cbSAlex Elder va_end(args); 49606ecc6cbSAlex Elder } 49706ecc6cbSAlex Elder 498aafb230eSAlex Elder #ifdef RBD_DEBUG 499aafb230eSAlex Elder #define rbd_assert(expr) \ 500aafb230eSAlex Elder if (unlikely(!(expr))) { \ 501aafb230eSAlex Elder printk(KERN_ERR "\nAssertion failure in %s() " \ 502aafb230eSAlex Elder "at line %d:\n\n" \ 503aafb230eSAlex Elder "\trbd_assert(%s);\n\n", \ 504aafb230eSAlex Elder __func__, __LINE__, #expr); \ 505aafb230eSAlex Elder BUG(); \ 506aafb230eSAlex Elder } 507aafb230eSAlex Elder #else /* !RBD_DEBUG */ 508aafb230eSAlex Elder # define rbd_assert(expr) ((void) 0) 509aafb230eSAlex Elder #endif /* !RBD_DEBUG */ 510dfc5606dSYehuda Sadeh 511b454e36dSAlex Elder static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request); 51205a46afdSAlex Elder static void rbd_img_parent_read(struct rbd_obj_request 
*obj_request); 51305a46afdSAlex Elder static void rbd_dev_remove_parent(struct rbd_device *rbd_dev); 5148b3e1a56SAlex Elder 515cc4a38bdSAlex Elder static int rbd_dev_refresh(struct rbd_device *rbd_dev); 5162df3fac7SAlex Elder static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev); 5172df3fac7SAlex Elder static int rbd_dev_v2_header_info(struct rbd_device *rbd_dev); 51854cac61fSAlex Elder static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, 51954cac61fSAlex Elder u64 snap_id); 5202ad3d716SAlex Elder static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id, 5212ad3d716SAlex Elder u8 *order, u64 *snap_size); 5222ad3d716SAlex Elder static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id, 5232ad3d716SAlex Elder u64 *snap_features); 5242ad3d716SAlex Elder static u64 rbd_snap_id_by_name(struct rbd_device *rbd_dev, const char *name); 52559c2be1eSYehuda Sadeh 526602adf40SYehuda Sadeh static int rbd_open(struct block_device *bdev, fmode_t mode) 527602adf40SYehuda Sadeh { 528f0f8cef5SAlex Elder struct rbd_device *rbd_dev = bdev->bd_disk->private_data; 529b82d167bSAlex Elder bool removing = false; 530602adf40SYehuda Sadeh 531f84344f3SAlex Elder if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only) 532602adf40SYehuda Sadeh return -EROFS; 533602adf40SYehuda Sadeh 534a14ea269SAlex Elder spin_lock_irq(&rbd_dev->lock); 535b82d167bSAlex Elder if (test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags)) 536b82d167bSAlex Elder removing = true; 537b82d167bSAlex Elder else 538b82d167bSAlex Elder rbd_dev->open_count++; 539a14ea269SAlex Elder spin_unlock_irq(&rbd_dev->lock); 540b82d167bSAlex Elder if (removing) 541b82d167bSAlex Elder return -ENOENT; 542b82d167bSAlex Elder 543c3e946ceSAlex Elder (void) get_device(&rbd_dev->dev); 544f84344f3SAlex Elder set_device_ro(bdev, rbd_dev->mapping.read_only); 545340c7a2bSAlex Elder 546602adf40SYehuda Sadeh return 0; 547602adf40SYehuda Sadeh } 548602adf40SYehuda Sadeh 549db2a144bSAl Viro static 
void rbd_release(struct gendisk *disk, fmode_t mode) 550dfc5606dSYehuda Sadeh { 551dfc5606dSYehuda Sadeh struct rbd_device *rbd_dev = disk->private_data; 552b82d167bSAlex Elder unsigned long open_count_before; 553b82d167bSAlex Elder 554a14ea269SAlex Elder spin_lock_irq(&rbd_dev->lock); 555b82d167bSAlex Elder open_count_before = rbd_dev->open_count--; 556a14ea269SAlex Elder spin_unlock_irq(&rbd_dev->lock); 557b82d167bSAlex Elder rbd_assert(open_count_before > 0); 558dfc5606dSYehuda Sadeh 559c3e946ceSAlex Elder put_device(&rbd_dev->dev); 560dfc5606dSYehuda Sadeh } 561dfc5606dSYehuda Sadeh 562602adf40SYehuda Sadeh static const struct block_device_operations rbd_bd_ops = { 563602adf40SYehuda Sadeh .owner = THIS_MODULE, 564602adf40SYehuda Sadeh .open = rbd_open, 565dfc5606dSYehuda Sadeh .release = rbd_release, 566602adf40SYehuda Sadeh }; 567602adf40SYehuda Sadeh 568602adf40SYehuda Sadeh /* 5697262cfcaSAlex Elder * Initialize an rbd client instance. Success or not, this function 570cfbf6377SAlex Elder * consumes ceph_opts. Caller holds client_mutex. 
571602adf40SYehuda Sadeh */ 572f8c38929SAlex Elder static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts) 573602adf40SYehuda Sadeh { 574602adf40SYehuda Sadeh struct rbd_client *rbdc; 575602adf40SYehuda Sadeh int ret = -ENOMEM; 576602adf40SYehuda Sadeh 57737206ee5SAlex Elder dout("%s:\n", __func__); 578602adf40SYehuda Sadeh rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL); 579602adf40SYehuda Sadeh if (!rbdc) 580602adf40SYehuda Sadeh goto out_opt; 581602adf40SYehuda Sadeh 582602adf40SYehuda Sadeh kref_init(&rbdc->kref); 583602adf40SYehuda Sadeh INIT_LIST_HEAD(&rbdc->node); 584602adf40SYehuda Sadeh 58543ae4701SAlex Elder rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0); 586602adf40SYehuda Sadeh if (IS_ERR(rbdc->client)) 58708f75463SAlex Elder goto out_rbdc; 58843ae4701SAlex Elder ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */ 589602adf40SYehuda Sadeh 590602adf40SYehuda Sadeh ret = ceph_open_session(rbdc->client); 591602adf40SYehuda Sadeh if (ret < 0) 59208f75463SAlex Elder goto out_client; 593602adf40SYehuda Sadeh 594432b8587SAlex Elder spin_lock(&rbd_client_list_lock); 595602adf40SYehuda Sadeh list_add_tail(&rbdc->node, &rbd_client_list); 596432b8587SAlex Elder spin_unlock(&rbd_client_list_lock); 597602adf40SYehuda Sadeh 59837206ee5SAlex Elder dout("%s: rbdc %p\n", __func__, rbdc); 599bc534d86SAlex Elder 600602adf40SYehuda Sadeh return rbdc; 60108f75463SAlex Elder out_client: 602602adf40SYehuda Sadeh ceph_destroy_client(rbdc->client); 60308f75463SAlex Elder out_rbdc: 604602adf40SYehuda Sadeh kfree(rbdc); 605602adf40SYehuda Sadeh out_opt: 60643ae4701SAlex Elder if (ceph_opts) 60743ae4701SAlex Elder ceph_destroy_options(ceph_opts); 60837206ee5SAlex Elder dout("%s: error %d\n", __func__, ret); 60937206ee5SAlex Elder 61028f259b7SVasiliy Kulikov return ERR_PTR(ret); 611602adf40SYehuda Sadeh } 612602adf40SYehuda Sadeh 6132f82ee54SAlex Elder static struct rbd_client *__rbd_get_client(struct rbd_client *rbdc) 
6142f82ee54SAlex Elder { 6152f82ee54SAlex Elder kref_get(&rbdc->kref); 6162f82ee54SAlex Elder 6172f82ee54SAlex Elder return rbdc; 6182f82ee54SAlex Elder } 6192f82ee54SAlex Elder 620602adf40SYehuda Sadeh /* 6211f7ba331SAlex Elder * Find a ceph client with specific addr and configuration. If 6221f7ba331SAlex Elder * found, bump its reference count. 623602adf40SYehuda Sadeh */ 6241f7ba331SAlex Elder static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts) 625602adf40SYehuda Sadeh { 626602adf40SYehuda Sadeh struct rbd_client *client_node; 6271f7ba331SAlex Elder bool found = false; 628602adf40SYehuda Sadeh 62943ae4701SAlex Elder if (ceph_opts->flags & CEPH_OPT_NOSHARE) 630602adf40SYehuda Sadeh return NULL; 631602adf40SYehuda Sadeh 6321f7ba331SAlex Elder spin_lock(&rbd_client_list_lock); 6331f7ba331SAlex Elder list_for_each_entry(client_node, &rbd_client_list, node) { 6341f7ba331SAlex Elder if (!ceph_compare_options(ceph_opts, client_node->client)) { 6352f82ee54SAlex Elder __rbd_get_client(client_node); 6362f82ee54SAlex Elder 6371f7ba331SAlex Elder found = true; 6381f7ba331SAlex Elder break; 6391f7ba331SAlex Elder } 6401f7ba331SAlex Elder } 6411f7ba331SAlex Elder spin_unlock(&rbd_client_list_lock); 6421f7ba331SAlex Elder 6431f7ba331SAlex Elder return found ? 
client_node : NULL; 644602adf40SYehuda Sadeh } 645602adf40SYehuda Sadeh 646602adf40SYehuda Sadeh /* 64759c2be1eSYehuda Sadeh * mount options 64859c2be1eSYehuda Sadeh */ 64959c2be1eSYehuda Sadeh enum { 65059c2be1eSYehuda Sadeh Opt_last_int, 65159c2be1eSYehuda Sadeh /* int args above */ 65259c2be1eSYehuda Sadeh Opt_last_string, 65359c2be1eSYehuda Sadeh /* string args above */ 654cc0538b6SAlex Elder Opt_read_only, 655cc0538b6SAlex Elder Opt_read_write, 656cc0538b6SAlex Elder /* Boolean args above */ 657cc0538b6SAlex Elder Opt_last_bool, 65859c2be1eSYehuda Sadeh }; 65959c2be1eSYehuda Sadeh 66043ae4701SAlex Elder static match_table_t rbd_opts_tokens = { 66159c2be1eSYehuda Sadeh /* int args above */ 66259c2be1eSYehuda Sadeh /* string args above */ 663be466c1cSAlex Elder {Opt_read_only, "read_only"}, 664cc0538b6SAlex Elder {Opt_read_only, "ro"}, /* Alternate spelling */ 665cc0538b6SAlex Elder {Opt_read_write, "read_write"}, 666cc0538b6SAlex Elder {Opt_read_write, "rw"}, /* Alternate spelling */ 667cc0538b6SAlex Elder /* Boolean args above */ 66859c2be1eSYehuda Sadeh {-1, NULL} 66959c2be1eSYehuda Sadeh }; 67059c2be1eSYehuda Sadeh 67198571b5aSAlex Elder struct rbd_options { 67298571b5aSAlex Elder bool read_only; 67398571b5aSAlex Elder }; 67498571b5aSAlex Elder 67598571b5aSAlex Elder #define RBD_READ_ONLY_DEFAULT false 67698571b5aSAlex Elder 67759c2be1eSYehuda Sadeh static int parse_rbd_opts_token(char *c, void *private) 67859c2be1eSYehuda Sadeh { 67943ae4701SAlex Elder struct rbd_options *rbd_opts = private; 68059c2be1eSYehuda Sadeh substring_t argstr[MAX_OPT_ARGS]; 68159c2be1eSYehuda Sadeh int token, intval, ret; 68259c2be1eSYehuda Sadeh 68343ae4701SAlex Elder token = match_token(c, rbd_opts_tokens, argstr); 68459c2be1eSYehuda Sadeh if (token < 0) 68559c2be1eSYehuda Sadeh return -EINVAL; 68659c2be1eSYehuda Sadeh 68759c2be1eSYehuda Sadeh if (token < Opt_last_int) { 68859c2be1eSYehuda Sadeh ret = match_int(&argstr[0], &intval); 68959c2be1eSYehuda Sadeh if (ret < 0) { 
69059c2be1eSYehuda Sadeh pr_err("bad mount option arg (not int) " 69159c2be1eSYehuda Sadeh "at '%s'\n", c); 69259c2be1eSYehuda Sadeh return ret; 69359c2be1eSYehuda Sadeh } 69459c2be1eSYehuda Sadeh dout("got int token %d val %d\n", token, intval); 69559c2be1eSYehuda Sadeh } else if (token > Opt_last_int && token < Opt_last_string) { 69659c2be1eSYehuda Sadeh dout("got string token %d val %s\n", token, 69759c2be1eSYehuda Sadeh argstr[0].from); 698cc0538b6SAlex Elder } else if (token > Opt_last_string && token < Opt_last_bool) { 699cc0538b6SAlex Elder dout("got Boolean token %d\n", token); 70059c2be1eSYehuda Sadeh } else { 70159c2be1eSYehuda Sadeh dout("got token %d\n", token); 70259c2be1eSYehuda Sadeh } 70359c2be1eSYehuda Sadeh 70459c2be1eSYehuda Sadeh switch (token) { 705cc0538b6SAlex Elder case Opt_read_only: 706cc0538b6SAlex Elder rbd_opts->read_only = true; 707cc0538b6SAlex Elder break; 708cc0538b6SAlex Elder case Opt_read_write: 709cc0538b6SAlex Elder rbd_opts->read_only = false; 710cc0538b6SAlex Elder break; 71159c2be1eSYehuda Sadeh default: 712aafb230eSAlex Elder rbd_assert(false); 713aafb230eSAlex Elder break; 71459c2be1eSYehuda Sadeh } 71559c2be1eSYehuda Sadeh return 0; 71659c2be1eSYehuda Sadeh } 71759c2be1eSYehuda Sadeh 71859c2be1eSYehuda Sadeh /* 719602adf40SYehuda Sadeh * Get a ceph client with specific addr and configuration, if one does 7207262cfcaSAlex Elder * not exist create it. Either way, ceph_opts is consumed by this 7217262cfcaSAlex Elder * function. 
722602adf40SYehuda Sadeh */ 7239d3997fdSAlex Elder static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts) 724602adf40SYehuda Sadeh { 725f8c38929SAlex Elder struct rbd_client *rbdc; 72659c2be1eSYehuda Sadeh 727cfbf6377SAlex Elder mutex_lock_nested(&client_mutex, SINGLE_DEPTH_NESTING); 7281f7ba331SAlex Elder rbdc = rbd_client_find(ceph_opts); 7299d3997fdSAlex Elder if (rbdc) /* using an existing client */ 73043ae4701SAlex Elder ceph_destroy_options(ceph_opts); 7319d3997fdSAlex Elder else 732f8c38929SAlex Elder rbdc = rbd_client_create(ceph_opts); 733cfbf6377SAlex Elder mutex_unlock(&client_mutex); 734d720bcb0SAlex Elder 7359d3997fdSAlex Elder return rbdc; 736602adf40SYehuda Sadeh } 737602adf40SYehuda Sadeh 738602adf40SYehuda Sadeh /* 739602adf40SYehuda Sadeh * Destroy ceph client 740d23a4b3fSAlex Elder * 741432b8587SAlex Elder * Caller must hold rbd_client_list_lock. 742602adf40SYehuda Sadeh */ 743602adf40SYehuda Sadeh static void rbd_client_release(struct kref *kref) 744602adf40SYehuda Sadeh { 745602adf40SYehuda Sadeh struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref); 746602adf40SYehuda Sadeh 74737206ee5SAlex Elder dout("%s: rbdc %p\n", __func__, rbdc); 748cd9d9f5dSAlex Elder spin_lock(&rbd_client_list_lock); 749602adf40SYehuda Sadeh list_del(&rbdc->node); 750cd9d9f5dSAlex Elder spin_unlock(&rbd_client_list_lock); 751602adf40SYehuda Sadeh 752602adf40SYehuda Sadeh ceph_destroy_client(rbdc->client); 753602adf40SYehuda Sadeh kfree(rbdc); 754602adf40SYehuda Sadeh } 755602adf40SYehuda Sadeh 756602adf40SYehuda Sadeh /* 757602adf40SYehuda Sadeh * Drop reference to ceph client node. If it's not referenced anymore, release 758602adf40SYehuda Sadeh * it. 
759602adf40SYehuda Sadeh */ 7609d3997fdSAlex Elder static void rbd_put_client(struct rbd_client *rbdc) 761602adf40SYehuda Sadeh { 762c53d5893SAlex Elder if (rbdc) 7639d3997fdSAlex Elder kref_put(&rbdc->kref, rbd_client_release); 764602adf40SYehuda Sadeh } 765602adf40SYehuda Sadeh 766a30b71b9SAlex Elder static bool rbd_image_format_valid(u32 image_format) 767a30b71b9SAlex Elder { 768a30b71b9SAlex Elder return image_format == 1 || image_format == 2; 769a30b71b9SAlex Elder } 770a30b71b9SAlex Elder 7718e94af8eSAlex Elder static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk) 7728e94af8eSAlex Elder { 773103a150fSAlex Elder size_t size; 774103a150fSAlex Elder u32 snap_count; 775103a150fSAlex Elder 776103a150fSAlex Elder /* The header has to start with the magic rbd header text */ 777103a150fSAlex Elder if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT))) 778103a150fSAlex Elder return false; 779103a150fSAlex Elder 780db2388b6SAlex Elder /* The bio layer requires at least sector-sized I/O */ 781db2388b6SAlex Elder 782db2388b6SAlex Elder if (ondisk->options.order < SECTOR_SHIFT) 783db2388b6SAlex Elder return false; 784db2388b6SAlex Elder 785db2388b6SAlex Elder /* If we use u64 in a few spots we may be able to loosen this */ 786db2388b6SAlex Elder 787db2388b6SAlex Elder if (ondisk->options.order > 8 * sizeof (int) - 1) 788db2388b6SAlex Elder return false; 789db2388b6SAlex Elder 790103a150fSAlex Elder /* 791103a150fSAlex Elder * The size of a snapshot header has to fit in a size_t, and 792103a150fSAlex Elder * that limits the number of snapshots. 
793103a150fSAlex Elder */ 794103a150fSAlex Elder snap_count = le32_to_cpu(ondisk->snap_count); 795103a150fSAlex Elder size = SIZE_MAX - sizeof (struct ceph_snap_context); 796103a150fSAlex Elder if (snap_count > size / sizeof (__le64)) 797103a150fSAlex Elder return false; 798103a150fSAlex Elder 799103a150fSAlex Elder /* 800103a150fSAlex Elder * Not only that, but the size of the entire the snapshot 801103a150fSAlex Elder * header must also be representable in a size_t. 802103a150fSAlex Elder */ 803103a150fSAlex Elder size -= snap_count * sizeof (__le64); 804103a150fSAlex Elder if ((u64) size < le64_to_cpu(ondisk->snap_names_len)) 805103a150fSAlex Elder return false; 806103a150fSAlex Elder 807103a150fSAlex Elder return true; 8088e94af8eSAlex Elder } 8098e94af8eSAlex Elder 810602adf40SYehuda Sadeh /* 811bb23e37aSAlex Elder * Fill an rbd image header with information from the given format 1 812bb23e37aSAlex Elder * on-disk header. 813602adf40SYehuda Sadeh */ 814662518b1SAlex Elder static int rbd_header_from_disk(struct rbd_device *rbd_dev, 8154156d998SAlex Elder struct rbd_image_header_ondisk *ondisk) 816602adf40SYehuda Sadeh { 817662518b1SAlex Elder struct rbd_image_header *header = &rbd_dev->header; 818bb23e37aSAlex Elder bool first_time = header->object_prefix == NULL; 819bb23e37aSAlex Elder struct ceph_snap_context *snapc; 820bb23e37aSAlex Elder char *object_prefix = NULL; 821bb23e37aSAlex Elder char *snap_names = NULL; 822bb23e37aSAlex Elder u64 *snap_sizes = NULL; 823ccece235SAlex Elder u32 snap_count; 824d2bb24e5SAlex Elder size_t size; 825bb23e37aSAlex Elder int ret = -ENOMEM; 826621901d6SAlex Elder u32 i; 827602adf40SYehuda Sadeh 828bb23e37aSAlex Elder /* Allocate this now to avoid having to handle failure below */ 829103a150fSAlex Elder 830bb23e37aSAlex Elder if (first_time) { 831bb23e37aSAlex Elder size_t len; 832bb23e37aSAlex Elder 833bb23e37aSAlex Elder len = strnlen(ondisk->object_prefix, 834bb23e37aSAlex Elder sizeof (ondisk->object_prefix)); 
835bb23e37aSAlex Elder object_prefix = kmalloc(len + 1, GFP_KERNEL); 836bb23e37aSAlex Elder if (!object_prefix) 837602adf40SYehuda Sadeh return -ENOMEM; 838bb23e37aSAlex Elder memcpy(object_prefix, ondisk->object_prefix, len); 839bb23e37aSAlex Elder object_prefix[len] = '\0'; 840bb23e37aSAlex Elder } 84100f1f36fSAlex Elder 842bb23e37aSAlex Elder /* Allocate the snapshot context and fill it in */ 843d2bb24e5SAlex Elder 844602adf40SYehuda Sadeh snap_count = le32_to_cpu(ondisk->snap_count); 845bb23e37aSAlex Elder snapc = ceph_create_snap_context(snap_count, GFP_KERNEL); 846bb23e37aSAlex Elder if (!snapc) 847bb23e37aSAlex Elder goto out_err; 848bb23e37aSAlex Elder snapc->seq = le64_to_cpu(ondisk->snap_seq); 849602adf40SYehuda Sadeh if (snap_count) { 850bb23e37aSAlex Elder struct rbd_image_snap_ondisk *snaps; 851f785cc1dSAlex Elder u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len); 852f785cc1dSAlex Elder 853bb23e37aSAlex Elder /* We'll keep a copy of the snapshot names... */ 854621901d6SAlex Elder 855f785cc1dSAlex Elder if (snap_names_len > (u64)SIZE_MAX) 856bb23e37aSAlex Elder goto out_2big; 857bb23e37aSAlex Elder snap_names = kmalloc(snap_names_len, GFP_KERNEL); 858bb23e37aSAlex Elder if (!snap_names) 859602adf40SYehuda Sadeh goto out_err; 860bb23e37aSAlex Elder 861bb23e37aSAlex Elder /* ...as well as the array of their sizes. */ 862bb23e37aSAlex Elder 863bb23e37aSAlex Elder size = snap_count * sizeof (*header->snap_sizes); 864bb23e37aSAlex Elder snap_sizes = kmalloc(size, GFP_KERNEL); 865bb23e37aSAlex Elder if (!snap_sizes) 866bb23e37aSAlex Elder goto out_err; 867bb23e37aSAlex Elder 868f785cc1dSAlex Elder /* 869bb23e37aSAlex Elder * Copy the names, and fill in each snapshot's id 870bb23e37aSAlex Elder * and size. 
871bb23e37aSAlex Elder * 87299a41ebcSAlex Elder * Note that rbd_dev_v1_header_info() guarantees the 873bb23e37aSAlex Elder * ondisk buffer we're working with has 874f785cc1dSAlex Elder * snap_names_len bytes beyond the end of the 875f785cc1dSAlex Elder * snapshot id array, this memcpy() is safe. 876f785cc1dSAlex Elder */ 877bb23e37aSAlex Elder memcpy(snap_names, &ondisk->snaps[snap_count], snap_names_len); 878bb23e37aSAlex Elder snaps = ondisk->snaps; 879bb23e37aSAlex Elder for (i = 0; i < snap_count; i++) { 880bb23e37aSAlex Elder snapc->snaps[i] = le64_to_cpu(snaps[i].id); 881bb23e37aSAlex Elder snap_sizes[i] = le64_to_cpu(snaps[i].image_size); 882bb23e37aSAlex Elder } 883602adf40SYehuda Sadeh } 884849b4260SAlex Elder 885bb23e37aSAlex Elder /* We won't fail any more, fill in the header */ 886bb23e37aSAlex Elder 887bb23e37aSAlex Elder if (first_time) { 888bb23e37aSAlex Elder header->object_prefix = object_prefix; 889602adf40SYehuda Sadeh header->obj_order = ondisk->options.order; 890602adf40SYehuda Sadeh header->crypt_type = ondisk->options.crypt_type; 891602adf40SYehuda Sadeh header->comp_type = ondisk->options.comp_type; 892bb23e37aSAlex Elder /* The rest aren't used for format 1 images */ 893bb23e37aSAlex Elder header->stripe_unit = 0; 894bb23e37aSAlex Elder header->stripe_count = 0; 895bb23e37aSAlex Elder header->features = 0; 896662518b1SAlex Elder } else { 897662518b1SAlex Elder ceph_put_snap_context(header->snapc); 898662518b1SAlex Elder kfree(header->snap_names); 899662518b1SAlex Elder kfree(header->snap_sizes); 900bb23e37aSAlex Elder } 9016a52325fSAlex Elder 902bb23e37aSAlex Elder /* The remaining fields always get updated (when we refresh) */ 903621901d6SAlex Elder 904f84344f3SAlex Elder header->image_size = le64_to_cpu(ondisk->image_size); 905bb23e37aSAlex Elder header->snapc = snapc; 906bb23e37aSAlex Elder header->snap_names = snap_names; 907bb23e37aSAlex Elder header->snap_sizes = snap_sizes; 908468521c1SAlex Elder 909662518b1SAlex Elder /* Make sure 
mapping size is consistent with header info */ 910662518b1SAlex Elder 911662518b1SAlex Elder if (rbd_dev->spec->snap_id == CEPH_NOSNAP || first_time) 912662518b1SAlex Elder if (rbd_dev->mapping.size != header->image_size) 913662518b1SAlex Elder rbd_dev->mapping.size = header->image_size; 914662518b1SAlex Elder 915602adf40SYehuda Sadeh return 0; 916bb23e37aSAlex Elder out_2big: 917bb23e37aSAlex Elder ret = -EIO; 9186a52325fSAlex Elder out_err: 919bb23e37aSAlex Elder kfree(snap_sizes); 920bb23e37aSAlex Elder kfree(snap_names); 921bb23e37aSAlex Elder ceph_put_snap_context(snapc); 922bb23e37aSAlex Elder kfree(object_prefix); 923ccece235SAlex Elder 924bb23e37aSAlex Elder return ret; 925602adf40SYehuda Sadeh } 926602adf40SYehuda Sadeh 9279682fc6dSAlex Elder static const char *_rbd_dev_v1_snap_name(struct rbd_device *rbd_dev, u32 which) 9289682fc6dSAlex Elder { 9299682fc6dSAlex Elder const char *snap_name; 9309682fc6dSAlex Elder 9319682fc6dSAlex Elder rbd_assert(which < rbd_dev->header.snapc->num_snaps); 9329682fc6dSAlex Elder 9339682fc6dSAlex Elder /* Skip over names until we find the one we are looking for */ 9349682fc6dSAlex Elder 9359682fc6dSAlex Elder snap_name = rbd_dev->header.snap_names; 9369682fc6dSAlex Elder while (which--) 9379682fc6dSAlex Elder snap_name += strlen(snap_name) + 1; 9389682fc6dSAlex Elder 9399682fc6dSAlex Elder return kstrdup(snap_name, GFP_KERNEL); 9409682fc6dSAlex Elder } 9419682fc6dSAlex Elder 94230d1cff8SAlex Elder /* 94330d1cff8SAlex Elder * Snapshot id comparison function for use with qsort()/bsearch(). 94430d1cff8SAlex Elder * Note that result is for snapshots in *descending* order. 
94530d1cff8SAlex Elder */ 94630d1cff8SAlex Elder static int snapid_compare_reverse(const void *s1, const void *s2) 94730d1cff8SAlex Elder { 94830d1cff8SAlex Elder u64 snap_id1 = *(u64 *)s1; 94930d1cff8SAlex Elder u64 snap_id2 = *(u64 *)s2; 95030d1cff8SAlex Elder 95130d1cff8SAlex Elder if (snap_id1 < snap_id2) 95230d1cff8SAlex Elder return 1; 95330d1cff8SAlex Elder return snap_id1 == snap_id2 ? 0 : -1; 95430d1cff8SAlex Elder } 95530d1cff8SAlex Elder 95630d1cff8SAlex Elder /* 95730d1cff8SAlex Elder * Search a snapshot context to see if the given snapshot id is 95830d1cff8SAlex Elder * present. 95930d1cff8SAlex Elder * 96030d1cff8SAlex Elder * Returns the position of the snapshot id in the array if it's found, 96130d1cff8SAlex Elder * or BAD_SNAP_INDEX otherwise. 96230d1cff8SAlex Elder * 96330d1cff8SAlex Elder * Note: The snapshot array is in kept sorted (by the osd) in 96430d1cff8SAlex Elder * reverse order, highest snapshot id first. 96530d1cff8SAlex Elder */ 9669682fc6dSAlex Elder static u32 rbd_dev_snap_index(struct rbd_device *rbd_dev, u64 snap_id) 9679682fc6dSAlex Elder { 9689682fc6dSAlex Elder struct ceph_snap_context *snapc = rbd_dev->header.snapc; 96930d1cff8SAlex Elder u64 *found; 9709682fc6dSAlex Elder 97130d1cff8SAlex Elder found = bsearch(&snap_id, &snapc->snaps, snapc->num_snaps, 97230d1cff8SAlex Elder sizeof (snap_id), snapid_compare_reverse); 9739682fc6dSAlex Elder 97430d1cff8SAlex Elder return found ? 
(u32)(found - &snapc->snaps[0]) : BAD_SNAP_INDEX; 9759682fc6dSAlex Elder } 9769682fc6dSAlex Elder 9772ad3d716SAlex Elder static const char *rbd_dev_v1_snap_name(struct rbd_device *rbd_dev, 9782ad3d716SAlex Elder u64 snap_id) 97954cac61fSAlex Elder { 98054cac61fSAlex Elder u32 which; 981da6a6b63SJosh Durgin const char *snap_name; 98254cac61fSAlex Elder 98354cac61fSAlex Elder which = rbd_dev_snap_index(rbd_dev, snap_id); 98454cac61fSAlex Elder if (which == BAD_SNAP_INDEX) 985da6a6b63SJosh Durgin return ERR_PTR(-ENOENT); 98654cac61fSAlex Elder 987da6a6b63SJosh Durgin snap_name = _rbd_dev_v1_snap_name(rbd_dev, which); 988da6a6b63SJosh Durgin return snap_name ? snap_name : ERR_PTR(-ENOMEM); 98954cac61fSAlex Elder } 99054cac61fSAlex Elder 9919e15b77dSAlex Elder static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id) 9929e15b77dSAlex Elder { 9939e15b77dSAlex Elder if (snap_id == CEPH_NOSNAP) 9949e15b77dSAlex Elder return RBD_SNAP_HEAD_NAME; 9959e15b77dSAlex Elder 99654cac61fSAlex Elder rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); 99754cac61fSAlex Elder if (rbd_dev->image_format == 1) 99854cac61fSAlex Elder return rbd_dev_v1_snap_name(rbd_dev, snap_id); 9999e15b77dSAlex Elder 100054cac61fSAlex Elder return rbd_dev_v2_snap_name(rbd_dev, snap_id); 10019e15b77dSAlex Elder } 10029e15b77dSAlex Elder 10032ad3d716SAlex Elder static int rbd_snap_size(struct rbd_device *rbd_dev, u64 snap_id, 10042ad3d716SAlex Elder u64 *snap_size) 1005602adf40SYehuda Sadeh { 10062ad3d716SAlex Elder rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); 10072ad3d716SAlex Elder if (snap_id == CEPH_NOSNAP) { 10082ad3d716SAlex Elder *snap_size = rbd_dev->header.image_size; 10092ad3d716SAlex Elder } else if (rbd_dev->image_format == 1) { 10102ad3d716SAlex Elder u32 which; 101100f1f36fSAlex Elder 10122ad3d716SAlex Elder which = rbd_dev_snap_index(rbd_dev, snap_id); 10132ad3d716SAlex Elder if (which == BAD_SNAP_INDEX) 10142ad3d716SAlex Elder return -ENOENT; 
101500f1f36fSAlex Elder 10162ad3d716SAlex Elder *snap_size = rbd_dev->header.snap_sizes[which]; 10172ad3d716SAlex Elder } else { 10182ad3d716SAlex Elder u64 size = 0; 10192ad3d716SAlex Elder int ret; 10202ad3d716SAlex Elder 10212ad3d716SAlex Elder ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, NULL, &size); 10222ad3d716SAlex Elder if (ret) 10232ad3d716SAlex Elder return ret; 10242ad3d716SAlex Elder 10252ad3d716SAlex Elder *snap_size = size; 10262ad3d716SAlex Elder } 10272ad3d716SAlex Elder return 0; 10282ad3d716SAlex Elder } 10292ad3d716SAlex Elder 10302ad3d716SAlex Elder static int rbd_snap_features(struct rbd_device *rbd_dev, u64 snap_id, 10312ad3d716SAlex Elder u64 *snap_features) 10322ad3d716SAlex Elder { 10332ad3d716SAlex Elder rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); 10342ad3d716SAlex Elder if (snap_id == CEPH_NOSNAP) { 10352ad3d716SAlex Elder *snap_features = rbd_dev->header.features; 10362ad3d716SAlex Elder } else if (rbd_dev->image_format == 1) { 10372ad3d716SAlex Elder *snap_features = 0; /* No features for format 1 */ 10382ad3d716SAlex Elder } else { 10392ad3d716SAlex Elder u64 features = 0; 10402ad3d716SAlex Elder int ret; 10412ad3d716SAlex Elder 10422ad3d716SAlex Elder ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, &features); 10432ad3d716SAlex Elder if (ret) 10442ad3d716SAlex Elder return ret; 10452ad3d716SAlex Elder 10462ad3d716SAlex Elder *snap_features = features; 10472ad3d716SAlex Elder } 10482ad3d716SAlex Elder return 0; 104900f1f36fSAlex Elder } 1050602adf40SYehuda Sadeh 1051d1cf5788SAlex Elder static int rbd_dev_mapping_set(struct rbd_device *rbd_dev) 1052602adf40SYehuda Sadeh { 10538f4b7d98SAlex Elder u64 snap_id = rbd_dev->spec->snap_id; 10542ad3d716SAlex Elder u64 size = 0; 10552ad3d716SAlex Elder u64 features = 0; 10562ad3d716SAlex Elder int ret; 10578b0241f8SAlex Elder 10582ad3d716SAlex Elder ret = rbd_snap_size(rbd_dev, snap_id, &size); 10592ad3d716SAlex Elder if (ret) 10602ad3d716SAlex Elder return ret; 
10612ad3d716SAlex Elder ret = rbd_snap_features(rbd_dev, snap_id, &features); 10622ad3d716SAlex Elder if (ret) 10632ad3d716SAlex Elder return ret; 10642ad3d716SAlex Elder 10652ad3d716SAlex Elder rbd_dev->mapping.size = size; 10662ad3d716SAlex Elder rbd_dev->mapping.features = features; 10672ad3d716SAlex Elder 10688b0241f8SAlex Elder return 0; 1069602adf40SYehuda Sadeh } 1070602adf40SYehuda Sadeh 1071d1cf5788SAlex Elder static void rbd_dev_mapping_clear(struct rbd_device *rbd_dev) 1072d1cf5788SAlex Elder { 1073d1cf5788SAlex Elder rbd_dev->mapping.size = 0; 1074d1cf5788SAlex Elder rbd_dev->mapping.features = 0; 1075200a6a8bSAlex Elder } 1076200a6a8bSAlex Elder 107798571b5aSAlex Elder static const char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset) 1078602adf40SYehuda Sadeh { 107965ccfe21SAlex Elder char *name; 108065ccfe21SAlex Elder u64 segment; 108165ccfe21SAlex Elder int ret; 10823a96d5cdSJosh Durgin char *name_format; 1083602adf40SYehuda Sadeh 108478c2a44aSAlex Elder name = kmem_cache_alloc(rbd_segment_name_cache, GFP_NOIO); 108565ccfe21SAlex Elder if (!name) 108665ccfe21SAlex Elder return NULL; 108765ccfe21SAlex Elder segment = offset >> rbd_dev->header.obj_order; 10883a96d5cdSJosh Durgin name_format = "%s.%012llx"; 10893a96d5cdSJosh Durgin if (rbd_dev->image_format == 2) 10903a96d5cdSJosh Durgin name_format = "%s.%016llx"; 10912d0ebc5dSIlya Dryomov ret = snprintf(name, CEPH_MAX_OID_NAME_LEN + 1, name_format, 109265ccfe21SAlex Elder rbd_dev->header.object_prefix, segment); 10932d0ebc5dSIlya Dryomov if (ret < 0 || ret > CEPH_MAX_OID_NAME_LEN) { 109465ccfe21SAlex Elder pr_err("error formatting segment name for #%llu (%d)\n", 109565ccfe21SAlex Elder segment, ret); 109665ccfe21SAlex Elder kfree(name); 109765ccfe21SAlex Elder name = NULL; 109865ccfe21SAlex Elder } 1099602adf40SYehuda Sadeh 110065ccfe21SAlex Elder return name; 110165ccfe21SAlex Elder } 1102602adf40SYehuda Sadeh 110378c2a44aSAlex Elder static void rbd_segment_name_free(const char *name) 
110478c2a44aSAlex Elder { 110578c2a44aSAlex Elder /* The explicit cast here is needed to drop the const qualifier */ 110678c2a44aSAlex Elder 110778c2a44aSAlex Elder kmem_cache_free(rbd_segment_name_cache, (void *)name); 110878c2a44aSAlex Elder } 110978c2a44aSAlex Elder 111065ccfe21SAlex Elder static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset) 111165ccfe21SAlex Elder { 111265ccfe21SAlex Elder u64 segment_size = (u64) 1 << rbd_dev->header.obj_order; 1113602adf40SYehuda Sadeh 111465ccfe21SAlex Elder return offset & (segment_size - 1); 111565ccfe21SAlex Elder } 111665ccfe21SAlex Elder 111765ccfe21SAlex Elder static u64 rbd_segment_length(struct rbd_device *rbd_dev, 111865ccfe21SAlex Elder u64 offset, u64 length) 111965ccfe21SAlex Elder { 112065ccfe21SAlex Elder u64 segment_size = (u64) 1 << rbd_dev->header.obj_order; 112165ccfe21SAlex Elder 112265ccfe21SAlex Elder offset &= segment_size - 1; 112365ccfe21SAlex Elder 1124aafb230eSAlex Elder rbd_assert(length <= U64_MAX - offset); 112565ccfe21SAlex Elder if (offset + length > segment_size) 112665ccfe21SAlex Elder length = segment_size - offset; 112765ccfe21SAlex Elder 112865ccfe21SAlex Elder return length; 1129602adf40SYehuda Sadeh } 1130602adf40SYehuda Sadeh 1131602adf40SYehuda Sadeh /* 1132029bcbd8SJosh Durgin * returns the size of an object in the image 1133029bcbd8SJosh Durgin */ 1134029bcbd8SJosh Durgin static u64 rbd_obj_bytes(struct rbd_image_header *header) 1135029bcbd8SJosh Durgin { 1136029bcbd8SJosh Durgin return 1 << header->obj_order; 1137029bcbd8SJosh Durgin } 1138029bcbd8SJosh Durgin 1139029bcbd8SJosh Durgin /* 1140602adf40SYehuda Sadeh * bio helpers 1141602adf40SYehuda Sadeh */ 1142602adf40SYehuda Sadeh 1143602adf40SYehuda Sadeh static void bio_chain_put(struct bio *chain) 1144602adf40SYehuda Sadeh { 1145602adf40SYehuda Sadeh struct bio *tmp; 1146602adf40SYehuda Sadeh 1147602adf40SYehuda Sadeh while (chain) { 1148602adf40SYehuda Sadeh tmp = chain; 1149602adf40SYehuda Sadeh chain = 
chain->bi_next; 1150602adf40SYehuda Sadeh bio_put(tmp); 1151602adf40SYehuda Sadeh } 1152602adf40SYehuda Sadeh } 1153602adf40SYehuda Sadeh 1154602adf40SYehuda Sadeh /* 1155602adf40SYehuda Sadeh * zeros a bio chain, starting at specific offset 1156602adf40SYehuda Sadeh */ 1157602adf40SYehuda Sadeh static void zero_bio_chain(struct bio *chain, int start_ofs) 1158602adf40SYehuda Sadeh { 11597988613bSKent Overstreet struct bio_vec bv; 11607988613bSKent Overstreet struct bvec_iter iter; 1161602adf40SYehuda Sadeh unsigned long flags; 1162602adf40SYehuda Sadeh void *buf; 1163602adf40SYehuda Sadeh int pos = 0; 1164602adf40SYehuda Sadeh 1165602adf40SYehuda Sadeh while (chain) { 11667988613bSKent Overstreet bio_for_each_segment(bv, chain, iter) { 11677988613bSKent Overstreet if (pos + bv.bv_len > start_ofs) { 1168602adf40SYehuda Sadeh int remainder = max(start_ofs - pos, 0); 11697988613bSKent Overstreet buf = bvec_kmap_irq(&bv, &flags); 1170602adf40SYehuda Sadeh memset(buf + remainder, 0, 11717988613bSKent Overstreet bv.bv_len - remainder); 11727988613bSKent Overstreet flush_dcache_page(bv.bv_page); 117385b5aaa6SDan Carpenter bvec_kunmap_irq(buf, &flags); 1174602adf40SYehuda Sadeh } 11757988613bSKent Overstreet pos += bv.bv_len; 1176602adf40SYehuda Sadeh } 1177602adf40SYehuda Sadeh 1178602adf40SYehuda Sadeh chain = chain->bi_next; 1179602adf40SYehuda Sadeh } 1180602adf40SYehuda Sadeh } 1181602adf40SYehuda Sadeh 1182602adf40SYehuda Sadeh /* 1183b9434c5bSAlex Elder * similar to zero_bio_chain(), zeros data defined by a page array, 1184b9434c5bSAlex Elder * starting at the given byte offset from the start of the array and 1185b9434c5bSAlex Elder * continuing up to the given end offset. The pages array is 1186b9434c5bSAlex Elder * assumed to be big enough to hold all bytes up to the end. 
1187b9434c5bSAlex Elder */ 1188b9434c5bSAlex Elder static void zero_pages(struct page **pages, u64 offset, u64 end) 1189b9434c5bSAlex Elder { 1190b9434c5bSAlex Elder struct page **page = &pages[offset >> PAGE_SHIFT]; 1191b9434c5bSAlex Elder 1192b9434c5bSAlex Elder rbd_assert(end > offset); 1193b9434c5bSAlex Elder rbd_assert(end - offset <= (u64)SIZE_MAX); 1194b9434c5bSAlex Elder while (offset < end) { 1195b9434c5bSAlex Elder size_t page_offset; 1196b9434c5bSAlex Elder size_t length; 1197b9434c5bSAlex Elder unsigned long flags; 1198b9434c5bSAlex Elder void *kaddr; 1199b9434c5bSAlex Elder 1200491205a8SGeert Uytterhoeven page_offset = offset & ~PAGE_MASK; 1201491205a8SGeert Uytterhoeven length = min_t(size_t, PAGE_SIZE - page_offset, end - offset); 1202b9434c5bSAlex Elder local_irq_save(flags); 1203b9434c5bSAlex Elder kaddr = kmap_atomic(*page); 1204b9434c5bSAlex Elder memset(kaddr + page_offset, 0, length); 1205e2156054SAlex Elder flush_dcache_page(*page); 1206b9434c5bSAlex Elder kunmap_atomic(kaddr); 1207b9434c5bSAlex Elder local_irq_restore(flags); 1208b9434c5bSAlex Elder 1209b9434c5bSAlex Elder offset += length; 1210b9434c5bSAlex Elder page++; 1211b9434c5bSAlex Elder } 1212b9434c5bSAlex Elder } 1213b9434c5bSAlex Elder 1214b9434c5bSAlex Elder /* 1215f7760dadSAlex Elder * Clone a portion of a bio, starting at the given byte offset 1216f7760dadSAlex Elder * and continuing for the number of bytes indicated. 
1217602adf40SYehuda Sadeh */ 1218f7760dadSAlex Elder static struct bio *bio_clone_range(struct bio *bio_src, 1219f7760dadSAlex Elder unsigned int offset, 1220f7760dadSAlex Elder unsigned int len, 1221f7760dadSAlex Elder gfp_t gfpmask) 1222602adf40SYehuda Sadeh { 1223f7760dadSAlex Elder struct bio *bio; 1224602adf40SYehuda Sadeh 12255341a627SKent Overstreet bio = bio_clone(bio_src, gfpmask); 1226f7760dadSAlex Elder if (!bio) 1227f7760dadSAlex Elder return NULL; /* ENOMEM */ 1228f7760dadSAlex Elder 12295341a627SKent Overstreet bio_advance(bio, offset); 12304f024f37SKent Overstreet bio->bi_iter.bi_size = len; 1231602adf40SYehuda Sadeh 1232f7760dadSAlex Elder return bio; 1233602adf40SYehuda Sadeh } 1234602adf40SYehuda Sadeh 1235f7760dadSAlex Elder /* 1236f7760dadSAlex Elder * Clone a portion of a bio chain, starting at the given byte offset 1237f7760dadSAlex Elder * into the first bio in the source chain and continuing for the 1238f7760dadSAlex Elder * number of bytes indicated. The result is another bio chain of 1239f7760dadSAlex Elder * exactly the given length, or a null pointer on error. 1240f7760dadSAlex Elder * 1241f7760dadSAlex Elder * The bio_src and offset parameters are both in-out. On entry they 1242f7760dadSAlex Elder * refer to the first source bio and the offset into that bio where 1243f7760dadSAlex Elder * the start of data to be cloned is located. 1244f7760dadSAlex Elder * 1245f7760dadSAlex Elder * On return, bio_src is updated to refer to the bio in the source 1246f7760dadSAlex Elder * chain that contains first un-cloned byte, and *offset will 1247f7760dadSAlex Elder * contain the offset of that byte within that bio. 
1248f7760dadSAlex Elder */ 1249f7760dadSAlex Elder static struct bio *bio_chain_clone_range(struct bio **bio_src, 1250f7760dadSAlex Elder unsigned int *offset, 1251f7760dadSAlex Elder unsigned int len, 1252f7760dadSAlex Elder gfp_t gfpmask) 1253f7760dadSAlex Elder { 1254f7760dadSAlex Elder struct bio *bi = *bio_src; 1255f7760dadSAlex Elder unsigned int off = *offset; 1256f7760dadSAlex Elder struct bio *chain = NULL; 1257f7760dadSAlex Elder struct bio **end; 1258602adf40SYehuda Sadeh 1259f7760dadSAlex Elder /* Build up a chain of clone bios up to the limit */ 1260602adf40SYehuda Sadeh 12614f024f37SKent Overstreet if (!bi || off >= bi->bi_iter.bi_size || !len) 1262f7760dadSAlex Elder return NULL; /* Nothing to clone */ 1263602adf40SYehuda Sadeh 1264f7760dadSAlex Elder end = &chain; 1265f7760dadSAlex Elder while (len) { 1266f7760dadSAlex Elder unsigned int bi_size; 1267f7760dadSAlex Elder struct bio *bio; 1268f7760dadSAlex Elder 1269f5400b7aSAlex Elder if (!bi) { 1270f5400b7aSAlex Elder rbd_warn(NULL, "bio_chain exhausted with %u left", len); 1271f7760dadSAlex Elder goto out_err; /* EINVAL; ran out of bio's */ 1272f5400b7aSAlex Elder } 12734f024f37SKent Overstreet bi_size = min_t(unsigned int, bi->bi_iter.bi_size - off, len); 1274f7760dadSAlex Elder bio = bio_clone_range(bi, off, bi_size, gfpmask); 1275f7760dadSAlex Elder if (!bio) 1276f7760dadSAlex Elder goto out_err; /* ENOMEM */ 1277f7760dadSAlex Elder 1278f7760dadSAlex Elder *end = bio; 1279f7760dadSAlex Elder end = &bio->bi_next; 1280f7760dadSAlex Elder 1281f7760dadSAlex Elder off += bi_size; 12824f024f37SKent Overstreet if (off == bi->bi_iter.bi_size) { 1283f7760dadSAlex Elder bi = bi->bi_next; 1284f7760dadSAlex Elder off = 0; 1285f7760dadSAlex Elder } 1286f7760dadSAlex Elder len -= bi_size; 1287f7760dadSAlex Elder } 1288f7760dadSAlex Elder *bio_src = bi; 1289f7760dadSAlex Elder *offset = off; 1290f7760dadSAlex Elder 1291f7760dadSAlex Elder return chain; 1292f7760dadSAlex Elder out_err: 1293f7760dadSAlex Elder 
bio_chain_put(chain); 1294f7760dadSAlex Elder 1295602adf40SYehuda Sadeh return NULL; 1296602adf40SYehuda Sadeh } 1297602adf40SYehuda Sadeh 1298926f9b3fSAlex Elder /* 1299926f9b3fSAlex Elder * The default/initial value for all object request flags is 0. For 1300926f9b3fSAlex Elder * each flag, once its value is set to 1 it is never reset to 0 1301926f9b3fSAlex Elder * again. 1302926f9b3fSAlex Elder */ 13036365d33aSAlex Elder static void obj_request_img_data_set(struct rbd_obj_request *obj_request) 13046365d33aSAlex Elder { 13056365d33aSAlex Elder if (test_and_set_bit(OBJ_REQ_IMG_DATA, &obj_request->flags)) { 13066365d33aSAlex Elder struct rbd_device *rbd_dev; 13076365d33aSAlex Elder 130857acbaa7SAlex Elder rbd_dev = obj_request->img_request->rbd_dev; 13096365d33aSAlex Elder rbd_warn(rbd_dev, "obj_request %p already marked img_data\n", 13106365d33aSAlex Elder obj_request); 13116365d33aSAlex Elder } 13126365d33aSAlex Elder } 13136365d33aSAlex Elder 13146365d33aSAlex Elder static bool obj_request_img_data_test(struct rbd_obj_request *obj_request) 13156365d33aSAlex Elder { 13166365d33aSAlex Elder smp_mb(); 13176365d33aSAlex Elder return test_bit(OBJ_REQ_IMG_DATA, &obj_request->flags) != 0; 13186365d33aSAlex Elder } 13196365d33aSAlex Elder 132057acbaa7SAlex Elder static void obj_request_done_set(struct rbd_obj_request *obj_request) 132157acbaa7SAlex Elder { 132257acbaa7SAlex Elder if (test_and_set_bit(OBJ_REQ_DONE, &obj_request->flags)) { 132357acbaa7SAlex Elder struct rbd_device *rbd_dev = NULL; 132457acbaa7SAlex Elder 132557acbaa7SAlex Elder if (obj_request_img_data_test(obj_request)) 132657acbaa7SAlex Elder rbd_dev = obj_request->img_request->rbd_dev; 132757acbaa7SAlex Elder rbd_warn(rbd_dev, "obj_request %p already marked done\n", 132857acbaa7SAlex Elder obj_request); 132957acbaa7SAlex Elder } 133057acbaa7SAlex Elder } 133157acbaa7SAlex Elder 133257acbaa7SAlex Elder static bool obj_request_done_test(struct rbd_obj_request *obj_request) 133357acbaa7SAlex Elder { 
133457acbaa7SAlex Elder smp_mb(); 133557acbaa7SAlex Elder return test_bit(OBJ_REQ_DONE, &obj_request->flags) != 0; 133657acbaa7SAlex Elder } 133757acbaa7SAlex Elder 13385679c59fSAlex Elder /* 13395679c59fSAlex Elder * This sets the KNOWN flag after (possibly) setting the EXISTS 13405679c59fSAlex Elder * flag. The latter is set based on the "exists" value provided. 13415679c59fSAlex Elder * 13425679c59fSAlex Elder * Note that for our purposes once an object exists it never goes 13435679c59fSAlex Elder * away again. It's possible that the response from two existence 13445679c59fSAlex Elder * checks are separated by the creation of the target object, and 13455679c59fSAlex Elder * the first ("doesn't exist") response arrives *after* the second 13465679c59fSAlex Elder * ("does exist"). In that case we ignore the second one. 13475679c59fSAlex Elder */ 13485679c59fSAlex Elder static void obj_request_existence_set(struct rbd_obj_request *obj_request, 13495679c59fSAlex Elder bool exists) 13505679c59fSAlex Elder { 13515679c59fSAlex Elder if (exists) 13525679c59fSAlex Elder set_bit(OBJ_REQ_EXISTS, &obj_request->flags); 13535679c59fSAlex Elder set_bit(OBJ_REQ_KNOWN, &obj_request->flags); 13545679c59fSAlex Elder smp_mb(); 13555679c59fSAlex Elder } 13565679c59fSAlex Elder 13575679c59fSAlex Elder static bool obj_request_known_test(struct rbd_obj_request *obj_request) 13585679c59fSAlex Elder { 13595679c59fSAlex Elder smp_mb(); 13605679c59fSAlex Elder return test_bit(OBJ_REQ_KNOWN, &obj_request->flags) != 0; 13615679c59fSAlex Elder } 13625679c59fSAlex Elder 13635679c59fSAlex Elder static bool obj_request_exists_test(struct rbd_obj_request *obj_request) 13645679c59fSAlex Elder { 13655679c59fSAlex Elder smp_mb(); 13665679c59fSAlex Elder return test_bit(OBJ_REQ_EXISTS, &obj_request->flags) != 0; 13675679c59fSAlex Elder } 13685679c59fSAlex Elder 1369bf0d5f50SAlex Elder static void rbd_obj_request_get(struct rbd_obj_request *obj_request) 1370bf0d5f50SAlex Elder { 137137206ee5SAlex Elder 
dout("%s: obj %p (was %d)\n", __func__, obj_request, 137237206ee5SAlex Elder atomic_read(&obj_request->kref.refcount)); 1373bf0d5f50SAlex Elder kref_get(&obj_request->kref); 1374bf0d5f50SAlex Elder } 1375bf0d5f50SAlex Elder 1376bf0d5f50SAlex Elder static void rbd_obj_request_destroy(struct kref *kref); 1377bf0d5f50SAlex Elder static void rbd_obj_request_put(struct rbd_obj_request *obj_request) 1378bf0d5f50SAlex Elder { 1379bf0d5f50SAlex Elder rbd_assert(obj_request != NULL); 138037206ee5SAlex Elder dout("%s: obj %p (was %d)\n", __func__, obj_request, 138137206ee5SAlex Elder atomic_read(&obj_request->kref.refcount)); 1382bf0d5f50SAlex Elder kref_put(&obj_request->kref, rbd_obj_request_destroy); 1383bf0d5f50SAlex Elder } 1384bf0d5f50SAlex Elder 1385e93f3152SAlex Elder static bool img_request_child_test(struct rbd_img_request *img_request); 1386e93f3152SAlex Elder static void rbd_parent_request_destroy(struct kref *kref); 1387bf0d5f50SAlex Elder static void rbd_img_request_destroy(struct kref *kref); 1388bf0d5f50SAlex Elder static void rbd_img_request_put(struct rbd_img_request *img_request) 1389bf0d5f50SAlex Elder { 1390bf0d5f50SAlex Elder rbd_assert(img_request != NULL); 139137206ee5SAlex Elder dout("%s: img %p (was %d)\n", __func__, img_request, 139237206ee5SAlex Elder atomic_read(&img_request->kref.refcount)); 1393e93f3152SAlex Elder if (img_request_child_test(img_request)) 1394e93f3152SAlex Elder kref_put(&img_request->kref, rbd_parent_request_destroy); 1395e93f3152SAlex Elder else 1396bf0d5f50SAlex Elder kref_put(&img_request->kref, rbd_img_request_destroy); 1397bf0d5f50SAlex Elder } 1398bf0d5f50SAlex Elder 1399bf0d5f50SAlex Elder static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request, 1400bf0d5f50SAlex Elder struct rbd_obj_request *obj_request) 1401bf0d5f50SAlex Elder { 140225dcf954SAlex Elder rbd_assert(obj_request->img_request == NULL); 140325dcf954SAlex Elder 1404b155e86cSAlex Elder /* Image request now owns object's original 
reference */ 1405bf0d5f50SAlex Elder obj_request->img_request = img_request; 140625dcf954SAlex Elder obj_request->which = img_request->obj_request_count; 14076365d33aSAlex Elder rbd_assert(!obj_request_img_data_test(obj_request)); 14086365d33aSAlex Elder obj_request_img_data_set(obj_request); 1409bf0d5f50SAlex Elder rbd_assert(obj_request->which != BAD_WHICH); 141025dcf954SAlex Elder img_request->obj_request_count++; 141125dcf954SAlex Elder list_add_tail(&obj_request->links, &img_request->obj_requests); 141237206ee5SAlex Elder dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request, 141337206ee5SAlex Elder obj_request->which); 1414bf0d5f50SAlex Elder } 1415bf0d5f50SAlex Elder 1416bf0d5f50SAlex Elder static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request, 1417bf0d5f50SAlex Elder struct rbd_obj_request *obj_request) 1418bf0d5f50SAlex Elder { 1419bf0d5f50SAlex Elder rbd_assert(obj_request->which != BAD_WHICH); 142025dcf954SAlex Elder 142137206ee5SAlex Elder dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request, 142237206ee5SAlex Elder obj_request->which); 1423bf0d5f50SAlex Elder list_del(&obj_request->links); 142425dcf954SAlex Elder rbd_assert(img_request->obj_request_count > 0); 142525dcf954SAlex Elder img_request->obj_request_count--; 142625dcf954SAlex Elder rbd_assert(obj_request->which == img_request->obj_request_count); 142725dcf954SAlex Elder obj_request->which = BAD_WHICH; 14286365d33aSAlex Elder rbd_assert(obj_request_img_data_test(obj_request)); 1429bf0d5f50SAlex Elder rbd_assert(obj_request->img_request == img_request); 1430bf0d5f50SAlex Elder obj_request->img_request = NULL; 143125dcf954SAlex Elder obj_request->callback = NULL; 1432bf0d5f50SAlex Elder rbd_obj_request_put(obj_request); 1433bf0d5f50SAlex Elder } 1434bf0d5f50SAlex Elder 1435bf0d5f50SAlex Elder static bool obj_request_type_valid(enum obj_request_type type) 1436bf0d5f50SAlex Elder { 1437bf0d5f50SAlex Elder switch (type) { 14389969ebc5SAlex Elder 
case OBJ_REQUEST_NODATA: 1439bf0d5f50SAlex Elder case OBJ_REQUEST_BIO: 1440788e2df3SAlex Elder case OBJ_REQUEST_PAGES: 1441bf0d5f50SAlex Elder return true; 1442bf0d5f50SAlex Elder default: 1443bf0d5f50SAlex Elder return false; 1444bf0d5f50SAlex Elder } 1445bf0d5f50SAlex Elder } 1446bf0d5f50SAlex Elder 1447bf0d5f50SAlex Elder static int rbd_obj_request_submit(struct ceph_osd_client *osdc, 1448bf0d5f50SAlex Elder struct rbd_obj_request *obj_request) 1449bf0d5f50SAlex Elder { 145037206ee5SAlex Elder dout("%s: osdc %p obj %p\n", __func__, osdc, obj_request); 145137206ee5SAlex Elder 1452bf0d5f50SAlex Elder return ceph_osdc_start_request(osdc, obj_request->osd_req, false); 1453bf0d5f50SAlex Elder } 1454bf0d5f50SAlex Elder 1455bf0d5f50SAlex Elder static void rbd_img_request_complete(struct rbd_img_request *img_request) 1456bf0d5f50SAlex Elder { 145755f27e09SAlex Elder 145837206ee5SAlex Elder dout("%s: img %p\n", __func__, img_request); 145955f27e09SAlex Elder 146055f27e09SAlex Elder /* 146155f27e09SAlex Elder * If no error occurred, compute the aggregate transfer 146255f27e09SAlex Elder * count for the image request. We could instead use 146355f27e09SAlex Elder * atomic64_cmpxchg() to update it as each object request 146455f27e09SAlex Elder * completes; not clear which way is better off hand. 
146555f27e09SAlex Elder */ 146655f27e09SAlex Elder if (!img_request->result) { 146755f27e09SAlex Elder struct rbd_obj_request *obj_request; 146855f27e09SAlex Elder u64 xferred = 0; 146955f27e09SAlex Elder 147055f27e09SAlex Elder for_each_obj_request(img_request, obj_request) 147155f27e09SAlex Elder xferred += obj_request->xferred; 147255f27e09SAlex Elder img_request->xferred = xferred; 147355f27e09SAlex Elder } 147455f27e09SAlex Elder 1475bf0d5f50SAlex Elder if (img_request->callback) 1476bf0d5f50SAlex Elder img_request->callback(img_request); 1477bf0d5f50SAlex Elder else 1478bf0d5f50SAlex Elder rbd_img_request_put(img_request); 1479bf0d5f50SAlex Elder } 1480bf0d5f50SAlex Elder 1481788e2df3SAlex Elder /* Caller is responsible for rbd_obj_request_destroy(obj_request) */ 1482788e2df3SAlex Elder 1483788e2df3SAlex Elder static int rbd_obj_request_wait(struct rbd_obj_request *obj_request) 1484788e2df3SAlex Elder { 148537206ee5SAlex Elder dout("%s: obj %p\n", __func__, obj_request); 148637206ee5SAlex Elder 1487788e2df3SAlex Elder return wait_for_completion_interruptible(&obj_request->completion); 1488788e2df3SAlex Elder } 1489788e2df3SAlex Elder 14900c425248SAlex Elder /* 14910c425248SAlex Elder * The default/initial value for all image request flags is 0. Each 14920c425248SAlex Elder * is conditionally set to 1 at image request initialization time 14930c425248SAlex Elder * and currently never change thereafter. 
14940c425248SAlex Elder */ 14950c425248SAlex Elder static void img_request_write_set(struct rbd_img_request *img_request) 14960c425248SAlex Elder { 14970c425248SAlex Elder set_bit(IMG_REQ_WRITE, &img_request->flags); 14980c425248SAlex Elder smp_mb(); 14990c425248SAlex Elder } 15000c425248SAlex Elder 15010c425248SAlex Elder static bool img_request_write_test(struct rbd_img_request *img_request) 15020c425248SAlex Elder { 15030c425248SAlex Elder smp_mb(); 15040c425248SAlex Elder return test_bit(IMG_REQ_WRITE, &img_request->flags) != 0; 15050c425248SAlex Elder } 15060c425248SAlex Elder 15079849e986SAlex Elder static void img_request_child_set(struct rbd_img_request *img_request) 15089849e986SAlex Elder { 15099849e986SAlex Elder set_bit(IMG_REQ_CHILD, &img_request->flags); 15109849e986SAlex Elder smp_mb(); 15119849e986SAlex Elder } 15129849e986SAlex Elder 1513e93f3152SAlex Elder static void img_request_child_clear(struct rbd_img_request *img_request) 1514e93f3152SAlex Elder { 1515e93f3152SAlex Elder clear_bit(IMG_REQ_CHILD, &img_request->flags); 1516e93f3152SAlex Elder smp_mb(); 1517e93f3152SAlex Elder } 1518e93f3152SAlex Elder 15199849e986SAlex Elder static bool img_request_child_test(struct rbd_img_request *img_request) 15209849e986SAlex Elder { 15219849e986SAlex Elder smp_mb(); 15229849e986SAlex Elder return test_bit(IMG_REQ_CHILD, &img_request->flags) != 0; 15239849e986SAlex Elder } 15249849e986SAlex Elder 1525d0b2e944SAlex Elder static void img_request_layered_set(struct rbd_img_request *img_request) 1526d0b2e944SAlex Elder { 1527d0b2e944SAlex Elder set_bit(IMG_REQ_LAYERED, &img_request->flags); 1528d0b2e944SAlex Elder smp_mb(); 1529d0b2e944SAlex Elder } 1530d0b2e944SAlex Elder 1531a2acd00eSAlex Elder static void img_request_layered_clear(struct rbd_img_request *img_request) 1532a2acd00eSAlex Elder { 1533a2acd00eSAlex Elder clear_bit(IMG_REQ_LAYERED, &img_request->flags); 1534a2acd00eSAlex Elder smp_mb(); 1535a2acd00eSAlex Elder } 1536a2acd00eSAlex Elder 
1537d0b2e944SAlex Elder static bool img_request_layered_test(struct rbd_img_request *img_request) 1538d0b2e944SAlex Elder { 1539d0b2e944SAlex Elder smp_mb(); 1540d0b2e944SAlex Elder return test_bit(IMG_REQ_LAYERED, &img_request->flags) != 0; 1541d0b2e944SAlex Elder } 1542d0b2e944SAlex Elder 15436e2a4505SAlex Elder static void 15446e2a4505SAlex Elder rbd_img_obj_request_read_callback(struct rbd_obj_request *obj_request) 15456e2a4505SAlex Elder { 1546b9434c5bSAlex Elder u64 xferred = obj_request->xferred; 1547b9434c5bSAlex Elder u64 length = obj_request->length; 1548b9434c5bSAlex Elder 15496e2a4505SAlex Elder dout("%s: obj %p img %p result %d %llu/%llu\n", __func__, 15506e2a4505SAlex Elder obj_request, obj_request->img_request, obj_request->result, 1551b9434c5bSAlex Elder xferred, length); 15526e2a4505SAlex Elder /* 155317c1cc1dSJosh Durgin * ENOENT means a hole in the image. We zero-fill the entire 155417c1cc1dSJosh Durgin * length of the request. A short read also implies zero-fill 155517c1cc1dSJosh Durgin * to the end of the request. An error requires the whole 155617c1cc1dSJosh Durgin * length of the request to be reported finished with an error 155717c1cc1dSJosh Durgin * to the block layer. In each case we update the xferred 155817c1cc1dSJosh Durgin * count to indicate the whole request was satisfied. 
15596e2a4505SAlex Elder */ 1560b9434c5bSAlex Elder rbd_assert(obj_request->type != OBJ_REQUEST_NODATA); 15616e2a4505SAlex Elder if (obj_request->result == -ENOENT) { 1562b9434c5bSAlex Elder if (obj_request->type == OBJ_REQUEST_BIO) 15636e2a4505SAlex Elder zero_bio_chain(obj_request->bio_list, 0); 1564b9434c5bSAlex Elder else 1565b9434c5bSAlex Elder zero_pages(obj_request->pages, 0, length); 15666e2a4505SAlex Elder obj_request->result = 0; 1567b9434c5bSAlex Elder } else if (xferred < length && !obj_request->result) { 1568b9434c5bSAlex Elder if (obj_request->type == OBJ_REQUEST_BIO) 1569b9434c5bSAlex Elder zero_bio_chain(obj_request->bio_list, xferred); 1570b9434c5bSAlex Elder else 1571b9434c5bSAlex Elder zero_pages(obj_request->pages, xferred, length); 15726e2a4505SAlex Elder } 157317c1cc1dSJosh Durgin obj_request->xferred = length; 15746e2a4505SAlex Elder obj_request_done_set(obj_request); 15756e2a4505SAlex Elder } 15766e2a4505SAlex Elder 1577bf0d5f50SAlex Elder static void rbd_obj_request_complete(struct rbd_obj_request *obj_request) 1578bf0d5f50SAlex Elder { 157937206ee5SAlex Elder dout("%s: obj %p cb %p\n", __func__, obj_request, 158037206ee5SAlex Elder obj_request->callback); 1581bf0d5f50SAlex Elder if (obj_request->callback) 1582bf0d5f50SAlex Elder obj_request->callback(obj_request); 1583788e2df3SAlex Elder else 1584788e2df3SAlex Elder complete_all(&obj_request->completion); 1585bf0d5f50SAlex Elder } 1586bf0d5f50SAlex Elder 1587c47f9371SAlex Elder static void rbd_osd_trivial_callback(struct rbd_obj_request *obj_request) 158839bf2c5dSAlex Elder { 158939bf2c5dSAlex Elder dout("%s: obj %p\n", __func__, obj_request); 159039bf2c5dSAlex Elder obj_request_done_set(obj_request); 159139bf2c5dSAlex Elder } 159239bf2c5dSAlex Elder 1593c47f9371SAlex Elder static void rbd_osd_read_callback(struct rbd_obj_request *obj_request) 1594bf0d5f50SAlex Elder { 159557acbaa7SAlex Elder struct rbd_img_request *img_request = NULL; 1596a9e8ba2cSAlex Elder struct rbd_device *rbd_dev = 
NULL; 159757acbaa7SAlex Elder bool layered = false; 159857acbaa7SAlex Elder 159957acbaa7SAlex Elder if (obj_request_img_data_test(obj_request)) { 160057acbaa7SAlex Elder img_request = obj_request->img_request; 160157acbaa7SAlex Elder layered = img_request && img_request_layered_test(img_request); 1602a9e8ba2cSAlex Elder rbd_dev = img_request->rbd_dev; 160357acbaa7SAlex Elder } 16048b3e1a56SAlex Elder 16058b3e1a56SAlex Elder dout("%s: obj %p img %p result %d %llu/%llu\n", __func__, 16068b3e1a56SAlex Elder obj_request, img_request, obj_request->result, 16078b3e1a56SAlex Elder obj_request->xferred, obj_request->length); 1608a9e8ba2cSAlex Elder if (layered && obj_request->result == -ENOENT && 1609a9e8ba2cSAlex Elder obj_request->img_offset < rbd_dev->parent_overlap) 16108b3e1a56SAlex Elder rbd_img_parent_read(obj_request); 16118b3e1a56SAlex Elder else if (img_request) 16126e2a4505SAlex Elder rbd_img_obj_request_read_callback(obj_request); 16136e2a4505SAlex Elder else 161407741308SAlex Elder obj_request_done_set(obj_request); 1615bf0d5f50SAlex Elder } 1616bf0d5f50SAlex Elder 1617c47f9371SAlex Elder static void rbd_osd_write_callback(struct rbd_obj_request *obj_request) 1618bf0d5f50SAlex Elder { 16191b83bef2SSage Weil dout("%s: obj %p result %d %llu\n", __func__, obj_request, 16201b83bef2SSage Weil obj_request->result, obj_request->length); 16211b83bef2SSage Weil /* 16228b3e1a56SAlex Elder * There is no such thing as a successful short write. Set 16238b3e1a56SAlex Elder * it to our originally-requested length. 16241b83bef2SSage Weil */ 16251b83bef2SSage Weil obj_request->xferred = obj_request->length; 162607741308SAlex Elder obj_request_done_set(obj_request); 1627bf0d5f50SAlex Elder } 1628bf0d5f50SAlex Elder 1629fbfab539SAlex Elder /* 1630fbfab539SAlex Elder * For a simple stat call there's nothing to do. We'll do more if 1631fbfab539SAlex Elder * this is part of a write sequence for a layered image. 
1632fbfab539SAlex Elder */ 1633c47f9371SAlex Elder static void rbd_osd_stat_callback(struct rbd_obj_request *obj_request) 1634fbfab539SAlex Elder { 163537206ee5SAlex Elder dout("%s: obj %p\n", __func__, obj_request); 1636fbfab539SAlex Elder obj_request_done_set(obj_request); 1637fbfab539SAlex Elder } 1638fbfab539SAlex Elder 1639bf0d5f50SAlex Elder static void rbd_osd_req_callback(struct ceph_osd_request *osd_req, 1640bf0d5f50SAlex Elder struct ceph_msg *msg) 1641bf0d5f50SAlex Elder { 1642bf0d5f50SAlex Elder struct rbd_obj_request *obj_request = osd_req->r_priv; 1643bf0d5f50SAlex Elder u16 opcode; 1644bf0d5f50SAlex Elder 164537206ee5SAlex Elder dout("%s: osd_req %p msg %p\n", __func__, osd_req, msg); 1646bf0d5f50SAlex Elder rbd_assert(osd_req == obj_request->osd_req); 164757acbaa7SAlex Elder if (obj_request_img_data_test(obj_request)) { 164857acbaa7SAlex Elder rbd_assert(obj_request->img_request); 164957acbaa7SAlex Elder rbd_assert(obj_request->which != BAD_WHICH); 165057acbaa7SAlex Elder } else { 165157acbaa7SAlex Elder rbd_assert(obj_request->which == BAD_WHICH); 165257acbaa7SAlex Elder } 1653bf0d5f50SAlex Elder 16541b83bef2SSage Weil if (osd_req->r_result < 0) 16551b83bef2SSage Weil obj_request->result = osd_req->r_result; 1656bf0d5f50SAlex Elder 16577cc69d42SIlya Dryomov rbd_assert(osd_req->r_num_ops <= CEPH_OSD_MAX_OP); 1658bf0d5f50SAlex Elder 1659c47f9371SAlex Elder /* 1660c47f9371SAlex Elder * We support a 64-bit length, but ultimately it has to be 1661c47f9371SAlex Elder * passed to blk_end_request(), which takes an unsigned int. 
1662c47f9371SAlex Elder */ 16631b83bef2SSage Weil obj_request->xferred = osd_req->r_reply_op_len[0]; 1664c47f9371SAlex Elder rbd_assert(obj_request->xferred < (u64)UINT_MAX); 16650ccd5926SIlya Dryomov 166679528734SAlex Elder opcode = osd_req->r_ops[0].op; 1667bf0d5f50SAlex Elder switch (opcode) { 1668bf0d5f50SAlex Elder case CEPH_OSD_OP_READ: 1669c47f9371SAlex Elder rbd_osd_read_callback(obj_request); 1670bf0d5f50SAlex Elder break; 16710ccd5926SIlya Dryomov case CEPH_OSD_OP_SETALLOCHINT: 16720ccd5926SIlya Dryomov rbd_assert(osd_req->r_ops[1].op == CEPH_OSD_OP_WRITE); 16730ccd5926SIlya Dryomov /* fall through */ 1674bf0d5f50SAlex Elder case CEPH_OSD_OP_WRITE: 1675c47f9371SAlex Elder rbd_osd_write_callback(obj_request); 1676bf0d5f50SAlex Elder break; 1677fbfab539SAlex Elder case CEPH_OSD_OP_STAT: 1678c47f9371SAlex Elder rbd_osd_stat_callback(obj_request); 1679fbfab539SAlex Elder break; 168036be9a76SAlex Elder case CEPH_OSD_OP_CALL: 1681b8d70035SAlex Elder case CEPH_OSD_OP_NOTIFY_ACK: 16829969ebc5SAlex Elder case CEPH_OSD_OP_WATCH: 1683c47f9371SAlex Elder rbd_osd_trivial_callback(obj_request); 16849969ebc5SAlex Elder break; 1685bf0d5f50SAlex Elder default: 1686bf0d5f50SAlex Elder rbd_warn(NULL, "%s: unsupported op %hu\n", 1687bf0d5f50SAlex Elder obj_request->object_name, (unsigned short) opcode); 1688bf0d5f50SAlex Elder break; 1689bf0d5f50SAlex Elder } 1690bf0d5f50SAlex Elder 169107741308SAlex Elder if (obj_request_done_test(obj_request)) 1692bf0d5f50SAlex Elder rbd_obj_request_complete(obj_request); 1693bf0d5f50SAlex Elder } 1694bf0d5f50SAlex Elder 16959d4df01fSAlex Elder static void rbd_osd_req_format_read(struct rbd_obj_request *obj_request) 1696430c28c3SAlex Elder { 1697430c28c3SAlex Elder struct rbd_img_request *img_request = obj_request->img_request; 16988c042b0dSAlex Elder struct ceph_osd_request *osd_req = obj_request->osd_req; 16999d4df01fSAlex Elder u64 snap_id; 1700430c28c3SAlex Elder 17018c042b0dSAlex Elder rbd_assert(osd_req != NULL); 1702430c28c3SAlex 
Elder 17039d4df01fSAlex Elder snap_id = img_request ? img_request->snap_id : CEPH_NOSNAP; 17048c042b0dSAlex Elder ceph_osdc_build_request(osd_req, obj_request->offset, 17059d4df01fSAlex Elder NULL, snap_id, NULL); 17069d4df01fSAlex Elder } 17079d4df01fSAlex Elder 17089d4df01fSAlex Elder static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request) 17099d4df01fSAlex Elder { 17109d4df01fSAlex Elder struct rbd_img_request *img_request = obj_request->img_request; 17119d4df01fSAlex Elder struct ceph_osd_request *osd_req = obj_request->osd_req; 17129d4df01fSAlex Elder struct ceph_snap_context *snapc; 17139d4df01fSAlex Elder struct timespec mtime = CURRENT_TIME; 17149d4df01fSAlex Elder 17159d4df01fSAlex Elder rbd_assert(osd_req != NULL); 17169d4df01fSAlex Elder 17179d4df01fSAlex Elder snapc = img_request ? img_request->snapc : NULL; 17189d4df01fSAlex Elder ceph_osdc_build_request(osd_req, obj_request->offset, 17199d4df01fSAlex Elder snapc, CEPH_NOSNAP, &mtime); 1720430c28c3SAlex Elder } 1721430c28c3SAlex Elder 17220ccd5926SIlya Dryomov /* 17230ccd5926SIlya Dryomov * Create an osd request. A read request has one osd op (read). 17240ccd5926SIlya Dryomov * A write request has either one (watch) or two (hint+write) osd ops. 17250ccd5926SIlya Dryomov * (All rbd data writes are prefixed with an allocation hint op, but 17260ccd5926SIlya Dryomov * technically osd watch is a write request, hence this distinction.) 
17270ccd5926SIlya Dryomov */ 1728bf0d5f50SAlex Elder static struct ceph_osd_request *rbd_osd_req_create( 1729bf0d5f50SAlex Elder struct rbd_device *rbd_dev, 1730bf0d5f50SAlex Elder bool write_request, 1731deb236b3SIlya Dryomov unsigned int num_ops, 1732430c28c3SAlex Elder struct rbd_obj_request *obj_request) 1733bf0d5f50SAlex Elder { 1734bf0d5f50SAlex Elder struct ceph_snap_context *snapc = NULL; 1735bf0d5f50SAlex Elder struct ceph_osd_client *osdc; 1736bf0d5f50SAlex Elder struct ceph_osd_request *osd_req; 1737bf0d5f50SAlex Elder 17386365d33aSAlex Elder if (obj_request_img_data_test(obj_request)) { 17396365d33aSAlex Elder struct rbd_img_request *img_request = obj_request->img_request; 17406365d33aSAlex Elder 17410c425248SAlex Elder rbd_assert(write_request == 17420c425248SAlex Elder img_request_write_test(img_request)); 17430c425248SAlex Elder if (write_request) 1744bf0d5f50SAlex Elder snapc = img_request->snapc; 1745bf0d5f50SAlex Elder } 1746bf0d5f50SAlex Elder 17470ccd5926SIlya Dryomov rbd_assert(num_ops == 1 || (write_request && num_ops == 2)); 1748deb236b3SIlya Dryomov 1749deb236b3SIlya Dryomov /* Allocate and initialize the request, for the num_ops ops */ 1750bf0d5f50SAlex Elder 1751bf0d5f50SAlex Elder osdc = &rbd_dev->rbd_client->client->osdc; 1752deb236b3SIlya Dryomov osd_req = ceph_osdc_alloc_request(osdc, snapc, num_ops, false, 1753deb236b3SIlya Dryomov GFP_ATOMIC); 1754bf0d5f50SAlex Elder if (!osd_req) 1755bf0d5f50SAlex Elder return NULL; /* ENOMEM */ 1756bf0d5f50SAlex Elder 1757430c28c3SAlex Elder if (write_request) 1758bf0d5f50SAlex Elder osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK; 1759430c28c3SAlex Elder else 1760bf0d5f50SAlex Elder osd_req->r_flags = CEPH_OSD_FLAG_READ; 1761bf0d5f50SAlex Elder 1762bf0d5f50SAlex Elder osd_req->r_callback = rbd_osd_req_callback; 1763bf0d5f50SAlex Elder osd_req->r_priv = obj_request; 1764bf0d5f50SAlex Elder 17653c972c95SIlya Dryomov osd_req->r_base_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout); 
17663c972c95SIlya Dryomov ceph_oid_set_name(&osd_req->r_base_oid, obj_request->object_name); 1767bf0d5f50SAlex Elder 1768bf0d5f50SAlex Elder return osd_req; 1769bf0d5f50SAlex Elder } 1770bf0d5f50SAlex Elder 17710eefd470SAlex Elder /* 17720eefd470SAlex Elder * Create a copyup osd request based on the information in the 17730ccd5926SIlya Dryomov * object request supplied. A copyup request has three osd ops, 17740ccd5926SIlya Dryomov * a copyup method call, a hint op, and a write op. 17750eefd470SAlex Elder */ 17760eefd470SAlex Elder static struct ceph_osd_request * 17770eefd470SAlex Elder rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request) 17780eefd470SAlex Elder { 17790eefd470SAlex Elder struct rbd_img_request *img_request; 17800eefd470SAlex Elder struct ceph_snap_context *snapc; 17810eefd470SAlex Elder struct rbd_device *rbd_dev; 17820eefd470SAlex Elder struct ceph_osd_client *osdc; 17830eefd470SAlex Elder struct ceph_osd_request *osd_req; 17840eefd470SAlex Elder 17850eefd470SAlex Elder rbd_assert(obj_request_img_data_test(obj_request)); 17860eefd470SAlex Elder img_request = obj_request->img_request; 17870eefd470SAlex Elder rbd_assert(img_request); 17880eefd470SAlex Elder rbd_assert(img_request_write_test(img_request)); 17890eefd470SAlex Elder 17900ccd5926SIlya Dryomov /* Allocate and initialize the request, for the three ops */ 17910eefd470SAlex Elder 17920eefd470SAlex Elder snapc = img_request->snapc; 17930eefd470SAlex Elder rbd_dev = img_request->rbd_dev; 17940eefd470SAlex Elder osdc = &rbd_dev->rbd_client->client->osdc; 17950ccd5926SIlya Dryomov osd_req = ceph_osdc_alloc_request(osdc, snapc, 3, false, GFP_ATOMIC); 17960eefd470SAlex Elder if (!osd_req) 17970eefd470SAlex Elder return NULL; /* ENOMEM */ 17980eefd470SAlex Elder 17990eefd470SAlex Elder osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK; 18000eefd470SAlex Elder osd_req->r_callback = rbd_osd_req_callback; 18010eefd470SAlex Elder osd_req->r_priv = obj_request; 18020eefd470SAlex 
Elder 18033c972c95SIlya Dryomov osd_req->r_base_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout); 18043c972c95SIlya Dryomov ceph_oid_set_name(&osd_req->r_base_oid, obj_request->object_name); 18050eefd470SAlex Elder 18060eefd470SAlex Elder return osd_req; 18070eefd470SAlex Elder } 18080eefd470SAlex Elder 18090eefd470SAlex Elder 1810bf0d5f50SAlex Elder static void rbd_osd_req_destroy(struct ceph_osd_request *osd_req) 1811bf0d5f50SAlex Elder { 1812bf0d5f50SAlex Elder ceph_osdc_put_request(osd_req); 1813bf0d5f50SAlex Elder } 1814bf0d5f50SAlex Elder 1815bf0d5f50SAlex Elder /* object_name is assumed to be a non-null pointer and NUL-terminated */ 1816bf0d5f50SAlex Elder 1817bf0d5f50SAlex Elder static struct rbd_obj_request *rbd_obj_request_create(const char *object_name, 1818bf0d5f50SAlex Elder u64 offset, u64 length, 1819bf0d5f50SAlex Elder enum obj_request_type type) 1820bf0d5f50SAlex Elder { 1821bf0d5f50SAlex Elder struct rbd_obj_request *obj_request; 1822bf0d5f50SAlex Elder size_t size; 1823bf0d5f50SAlex Elder char *name; 1824bf0d5f50SAlex Elder 1825bf0d5f50SAlex Elder rbd_assert(obj_request_type_valid(type)); 1826bf0d5f50SAlex Elder 1827bf0d5f50SAlex Elder size = strlen(object_name) + 1; 1828f907ad55SAlex Elder name = kmalloc(size, GFP_KERNEL); 1829f907ad55SAlex Elder if (!name) 1830bf0d5f50SAlex Elder return NULL; 1831bf0d5f50SAlex Elder 1832868311b1SAlex Elder obj_request = kmem_cache_zalloc(rbd_obj_request_cache, GFP_KERNEL); 1833f907ad55SAlex Elder if (!obj_request) { 1834f907ad55SAlex Elder kfree(name); 1835f907ad55SAlex Elder return NULL; 1836f907ad55SAlex Elder } 1837f907ad55SAlex Elder 1838bf0d5f50SAlex Elder obj_request->object_name = memcpy(name, object_name, size); 1839bf0d5f50SAlex Elder obj_request->offset = offset; 1840bf0d5f50SAlex Elder obj_request->length = length; 1841926f9b3fSAlex Elder obj_request->flags = 0; 1842bf0d5f50SAlex Elder obj_request->which = BAD_WHICH; 1843bf0d5f50SAlex Elder obj_request->type = type; 1844bf0d5f50SAlex Elder 
INIT_LIST_HEAD(&obj_request->links); 1845788e2df3SAlex Elder init_completion(&obj_request->completion); 1846bf0d5f50SAlex Elder kref_init(&obj_request->kref); 1847bf0d5f50SAlex Elder 184837206ee5SAlex Elder dout("%s: \"%s\" %llu/%llu %d -> obj %p\n", __func__, object_name, 184937206ee5SAlex Elder offset, length, (int)type, obj_request); 185037206ee5SAlex Elder 1851bf0d5f50SAlex Elder return obj_request; 1852bf0d5f50SAlex Elder } 1853bf0d5f50SAlex Elder 1854bf0d5f50SAlex Elder static void rbd_obj_request_destroy(struct kref *kref) 1855bf0d5f50SAlex Elder { 1856bf0d5f50SAlex Elder struct rbd_obj_request *obj_request; 1857bf0d5f50SAlex Elder 1858bf0d5f50SAlex Elder obj_request = container_of(kref, struct rbd_obj_request, kref); 1859bf0d5f50SAlex Elder 186037206ee5SAlex Elder dout("%s: obj %p\n", __func__, obj_request); 186137206ee5SAlex Elder 1862bf0d5f50SAlex Elder rbd_assert(obj_request->img_request == NULL); 1863bf0d5f50SAlex Elder rbd_assert(obj_request->which == BAD_WHICH); 1864bf0d5f50SAlex Elder 1865bf0d5f50SAlex Elder if (obj_request->osd_req) 1866bf0d5f50SAlex Elder rbd_osd_req_destroy(obj_request->osd_req); 1867bf0d5f50SAlex Elder 1868bf0d5f50SAlex Elder rbd_assert(obj_request_type_valid(obj_request->type)); 1869bf0d5f50SAlex Elder switch (obj_request->type) { 18709969ebc5SAlex Elder case OBJ_REQUEST_NODATA: 18719969ebc5SAlex Elder break; /* Nothing to do */ 1872bf0d5f50SAlex Elder case OBJ_REQUEST_BIO: 1873bf0d5f50SAlex Elder if (obj_request->bio_list) 1874bf0d5f50SAlex Elder bio_chain_put(obj_request->bio_list); 1875bf0d5f50SAlex Elder break; 1876788e2df3SAlex Elder case OBJ_REQUEST_PAGES: 1877788e2df3SAlex Elder if (obj_request->pages) 1878788e2df3SAlex Elder ceph_release_page_vector(obj_request->pages, 1879788e2df3SAlex Elder obj_request->page_count); 1880788e2df3SAlex Elder break; 1881bf0d5f50SAlex Elder } 1882bf0d5f50SAlex Elder 1883f907ad55SAlex Elder kfree(obj_request->object_name); 1884868311b1SAlex Elder obj_request->object_name = NULL; 
1885868311b1SAlex Elder kmem_cache_free(rbd_obj_request_cache, obj_request); 1886bf0d5f50SAlex Elder } 1887bf0d5f50SAlex Elder 1888fb65d228SAlex Elder /* It's OK to call this for a device with no parent */ 1889fb65d228SAlex Elder 1890fb65d228SAlex Elder static void rbd_spec_put(struct rbd_spec *spec); 1891fb65d228SAlex Elder static void rbd_dev_unparent(struct rbd_device *rbd_dev) 1892fb65d228SAlex Elder { 1893fb65d228SAlex Elder rbd_dev_remove_parent(rbd_dev); 1894fb65d228SAlex Elder rbd_spec_put(rbd_dev->parent_spec); 1895fb65d228SAlex Elder rbd_dev->parent_spec = NULL; 1896fb65d228SAlex Elder rbd_dev->parent_overlap = 0; 1897fb65d228SAlex Elder } 1898fb65d228SAlex Elder 1899bf0d5f50SAlex Elder /* 1900a2acd00eSAlex Elder * Parent image reference counting is used to determine when an 1901a2acd00eSAlex Elder * image's parent fields can be safely torn down--after there are no 1902a2acd00eSAlex Elder * more in-flight requests to the parent image. When the last 1903a2acd00eSAlex Elder * reference is dropped, cleaning them up is safe. 
1904a2acd00eSAlex Elder */ 1905a2acd00eSAlex Elder static void rbd_dev_parent_put(struct rbd_device *rbd_dev) 1906a2acd00eSAlex Elder { 1907a2acd00eSAlex Elder int counter; 1908a2acd00eSAlex Elder 1909a2acd00eSAlex Elder if (!rbd_dev->parent_spec) 1910a2acd00eSAlex Elder return; 1911a2acd00eSAlex Elder 1912a2acd00eSAlex Elder counter = atomic_dec_return_safe(&rbd_dev->parent_ref); 1913a2acd00eSAlex Elder if (counter > 0) 1914a2acd00eSAlex Elder return; 1915a2acd00eSAlex Elder 1916a2acd00eSAlex Elder /* Last reference; clean up parent data structures */ 1917a2acd00eSAlex Elder 1918a2acd00eSAlex Elder if (!counter) 1919a2acd00eSAlex Elder rbd_dev_unparent(rbd_dev); 1920a2acd00eSAlex Elder else 1921a2acd00eSAlex Elder rbd_warn(rbd_dev, "parent reference underflow\n"); 1922a2acd00eSAlex Elder } 1923a2acd00eSAlex Elder 1924a2acd00eSAlex Elder /* 1925a2acd00eSAlex Elder * If an image has a non-zero parent overlap, get a reference to its 1926a2acd00eSAlex Elder * parent. 1927a2acd00eSAlex Elder * 1928392a9dadSAlex Elder * We must get the reference before checking for the overlap to 1929392a9dadSAlex Elder * coordinate properly with zeroing the parent overlap in 1930392a9dadSAlex Elder * rbd_dev_v2_parent_info() when an image gets flattened. We 1931392a9dadSAlex Elder * drop it again if there is no overlap. 1932392a9dadSAlex Elder * 1933a2acd00eSAlex Elder * Returns true if the rbd device has a parent with a non-zero 1934a2acd00eSAlex Elder * overlap and a reference for it was successfully taken, or 1935a2acd00eSAlex Elder * false otherwise. 
1936a2acd00eSAlex Elder */ 1937a2acd00eSAlex Elder static bool rbd_dev_parent_get(struct rbd_device *rbd_dev) 1938a2acd00eSAlex Elder { 1939a2acd00eSAlex Elder int counter; 1940a2acd00eSAlex Elder 1941a2acd00eSAlex Elder if (!rbd_dev->parent_spec) 1942a2acd00eSAlex Elder return false; 1943a2acd00eSAlex Elder 1944a2acd00eSAlex Elder counter = atomic_inc_return_safe(&rbd_dev->parent_ref); 1945a2acd00eSAlex Elder if (counter > 0 && rbd_dev->parent_overlap) 1946a2acd00eSAlex Elder return true; 1947a2acd00eSAlex Elder 1948a2acd00eSAlex Elder /* Image was flattened, but parent is not yet torn down */ 1949a2acd00eSAlex Elder 1950a2acd00eSAlex Elder if (counter < 0) 1951a2acd00eSAlex Elder rbd_warn(rbd_dev, "parent reference overflow\n"); 1952a2acd00eSAlex Elder 1953a2acd00eSAlex Elder return false; 1954a2acd00eSAlex Elder } 1955a2acd00eSAlex Elder 1956bf0d5f50SAlex Elder /* 1957bf0d5f50SAlex Elder * Caller is responsible for filling in the list of object requests 1958bf0d5f50SAlex Elder * that comprises the image request, and the Linux request pointer 1959bf0d5f50SAlex Elder * (if there is one). 
1960bf0d5f50SAlex Elder */ 1961cc344fa1SAlex Elder static struct rbd_img_request *rbd_img_request_create( 1962cc344fa1SAlex Elder struct rbd_device *rbd_dev, 1963bf0d5f50SAlex Elder u64 offset, u64 length, 1964e93f3152SAlex Elder bool write_request) 1965bf0d5f50SAlex Elder { 1966bf0d5f50SAlex Elder struct rbd_img_request *img_request; 1967bf0d5f50SAlex Elder 19681c2a9dfeSAlex Elder img_request = kmem_cache_alloc(rbd_img_request_cache, GFP_ATOMIC); 1969bf0d5f50SAlex Elder if (!img_request) 1970bf0d5f50SAlex Elder return NULL; 1971bf0d5f50SAlex Elder 1972bf0d5f50SAlex Elder if (write_request) { 1973bf0d5f50SAlex Elder down_read(&rbd_dev->header_rwsem); 1974812164f8SAlex Elder ceph_get_snap_context(rbd_dev->header.snapc); 1975bf0d5f50SAlex Elder up_read(&rbd_dev->header_rwsem); 1976bf0d5f50SAlex Elder } 1977bf0d5f50SAlex Elder 1978bf0d5f50SAlex Elder img_request->rq = NULL; 1979bf0d5f50SAlex Elder img_request->rbd_dev = rbd_dev; 1980bf0d5f50SAlex Elder img_request->offset = offset; 1981bf0d5f50SAlex Elder img_request->length = length; 19820c425248SAlex Elder img_request->flags = 0; 19830c425248SAlex Elder if (write_request) { 19840c425248SAlex Elder img_request_write_set(img_request); 1985468521c1SAlex Elder img_request->snapc = rbd_dev->header.snapc; 19860c425248SAlex Elder } else { 1987bf0d5f50SAlex Elder img_request->snap_id = rbd_dev->spec->snap_id; 19880c425248SAlex Elder } 1989a2acd00eSAlex Elder if (rbd_dev_parent_get(rbd_dev)) 1990d0b2e944SAlex Elder img_request_layered_set(img_request); 1991bf0d5f50SAlex Elder spin_lock_init(&img_request->completion_lock); 1992bf0d5f50SAlex Elder img_request->next_completion = 0; 1993bf0d5f50SAlex Elder img_request->callback = NULL; 1994a5a337d4SAlex Elder img_request->result = 0; 1995bf0d5f50SAlex Elder img_request->obj_request_count = 0; 1996bf0d5f50SAlex Elder INIT_LIST_HEAD(&img_request->obj_requests); 1997bf0d5f50SAlex Elder kref_init(&img_request->kref); 1998bf0d5f50SAlex Elder 199937206ee5SAlex Elder dout("%s: rbd_dev 
%p %s %llu/%llu -> img %p\n", __func__, rbd_dev, 200037206ee5SAlex Elder write_request ? "write" : "read", offset, length, 200137206ee5SAlex Elder img_request); 200237206ee5SAlex Elder 2003bf0d5f50SAlex Elder return img_request; 2004bf0d5f50SAlex Elder } 2005bf0d5f50SAlex Elder 2006bf0d5f50SAlex Elder static void rbd_img_request_destroy(struct kref *kref) 2007bf0d5f50SAlex Elder { 2008bf0d5f50SAlex Elder struct rbd_img_request *img_request; 2009bf0d5f50SAlex Elder struct rbd_obj_request *obj_request; 2010bf0d5f50SAlex Elder struct rbd_obj_request *next_obj_request; 2011bf0d5f50SAlex Elder 2012bf0d5f50SAlex Elder img_request = container_of(kref, struct rbd_img_request, kref); 2013bf0d5f50SAlex Elder 201437206ee5SAlex Elder dout("%s: img %p\n", __func__, img_request); 201537206ee5SAlex Elder 2016bf0d5f50SAlex Elder for_each_obj_request_safe(img_request, obj_request, next_obj_request) 2017bf0d5f50SAlex Elder rbd_img_obj_request_del(img_request, obj_request); 201825dcf954SAlex Elder rbd_assert(img_request->obj_request_count == 0); 2019bf0d5f50SAlex Elder 2020a2acd00eSAlex Elder if (img_request_layered_test(img_request)) { 2021a2acd00eSAlex Elder img_request_layered_clear(img_request); 2022a2acd00eSAlex Elder rbd_dev_parent_put(img_request->rbd_dev); 2023a2acd00eSAlex Elder } 2024a2acd00eSAlex Elder 20250c425248SAlex Elder if (img_request_write_test(img_request)) 2026812164f8SAlex Elder ceph_put_snap_context(img_request->snapc); 2027bf0d5f50SAlex Elder 20281c2a9dfeSAlex Elder kmem_cache_free(rbd_img_request_cache, img_request); 2029bf0d5f50SAlex Elder } 2030bf0d5f50SAlex Elder 2031e93f3152SAlex Elder static struct rbd_img_request *rbd_parent_request_create( 2032e93f3152SAlex Elder struct rbd_obj_request *obj_request, 2033e93f3152SAlex Elder u64 img_offset, u64 length) 2034e93f3152SAlex Elder { 2035e93f3152SAlex Elder struct rbd_img_request *parent_request; 2036e93f3152SAlex Elder struct rbd_device *rbd_dev; 2037e93f3152SAlex Elder 2038e93f3152SAlex Elder 
rbd_assert(obj_request->img_request); 2039e93f3152SAlex Elder rbd_dev = obj_request->img_request->rbd_dev; 2040e93f3152SAlex Elder 2041e93f3152SAlex Elder parent_request = rbd_img_request_create(rbd_dev->parent, 2042e93f3152SAlex Elder img_offset, length, false); 2043e93f3152SAlex Elder if (!parent_request) 2044e93f3152SAlex Elder return NULL; 2045e93f3152SAlex Elder 2046e93f3152SAlex Elder img_request_child_set(parent_request); 2047e93f3152SAlex Elder rbd_obj_request_get(obj_request); 2048e93f3152SAlex Elder parent_request->obj_request = obj_request; 2049e93f3152SAlex Elder 2050e93f3152SAlex Elder return parent_request; 2051e93f3152SAlex Elder } 2052e93f3152SAlex Elder 2053e93f3152SAlex Elder static void rbd_parent_request_destroy(struct kref *kref) 2054e93f3152SAlex Elder { 2055e93f3152SAlex Elder struct rbd_img_request *parent_request; 2056e93f3152SAlex Elder struct rbd_obj_request *orig_request; 2057e93f3152SAlex Elder 2058e93f3152SAlex Elder parent_request = container_of(kref, struct rbd_img_request, kref); 2059e93f3152SAlex Elder orig_request = parent_request->obj_request; 2060e93f3152SAlex Elder 2061e93f3152SAlex Elder parent_request->obj_request = NULL; 2062e93f3152SAlex Elder rbd_obj_request_put(orig_request); 2063e93f3152SAlex Elder img_request_child_clear(parent_request); 2064e93f3152SAlex Elder 2065e93f3152SAlex Elder rbd_img_request_destroy(kref); 2066e93f3152SAlex Elder } 2067e93f3152SAlex Elder 20681217857fSAlex Elder static bool rbd_img_obj_end_request(struct rbd_obj_request *obj_request) 20691217857fSAlex Elder { 20706365d33aSAlex Elder struct rbd_img_request *img_request; 20711217857fSAlex Elder unsigned int xferred; 20721217857fSAlex Elder int result; 20738b3e1a56SAlex Elder bool more; 20741217857fSAlex Elder 20756365d33aSAlex Elder rbd_assert(obj_request_img_data_test(obj_request)); 20766365d33aSAlex Elder img_request = obj_request->img_request; 20776365d33aSAlex Elder 20781217857fSAlex Elder rbd_assert(obj_request->xferred <= (u64)UINT_MAX); 
20791217857fSAlex Elder xferred = (unsigned int)obj_request->xferred; 20801217857fSAlex Elder result = obj_request->result; 20811217857fSAlex Elder if (result) { 20821217857fSAlex Elder struct rbd_device *rbd_dev = img_request->rbd_dev; 20831217857fSAlex Elder 20841217857fSAlex Elder rbd_warn(rbd_dev, "%s %llx at %llx (%llx)\n", 20851217857fSAlex Elder img_request_write_test(img_request) ? "write" : "read", 20861217857fSAlex Elder obj_request->length, obj_request->img_offset, 20871217857fSAlex Elder obj_request->offset); 20881217857fSAlex Elder rbd_warn(rbd_dev, " result %d xferred %x\n", 20891217857fSAlex Elder result, xferred); 20901217857fSAlex Elder if (!img_request->result) 20911217857fSAlex Elder img_request->result = result; 20921217857fSAlex Elder } 20931217857fSAlex Elder 2094f1a4739fSAlex Elder /* Image object requests don't own their page array */ 2095f1a4739fSAlex Elder 2096f1a4739fSAlex Elder if (obj_request->type == OBJ_REQUEST_PAGES) { 2097f1a4739fSAlex Elder obj_request->pages = NULL; 2098f1a4739fSAlex Elder obj_request->page_count = 0; 2099f1a4739fSAlex Elder } 2100f1a4739fSAlex Elder 21018b3e1a56SAlex Elder if (img_request_child_test(img_request)) { 21028b3e1a56SAlex Elder rbd_assert(img_request->obj_request != NULL); 21038b3e1a56SAlex Elder more = obj_request->which < img_request->obj_request_count - 1; 21048b3e1a56SAlex Elder } else { 21058b3e1a56SAlex Elder rbd_assert(img_request->rq != NULL); 21068b3e1a56SAlex Elder more = blk_end_request(img_request->rq, result, xferred); 21078b3e1a56SAlex Elder } 21088b3e1a56SAlex Elder 21098b3e1a56SAlex Elder return more; 21101217857fSAlex Elder } 21111217857fSAlex Elder 21122169238dSAlex Elder static void rbd_img_obj_callback(struct rbd_obj_request *obj_request) 21132169238dSAlex Elder { 21142169238dSAlex Elder struct rbd_img_request *img_request; 21152169238dSAlex Elder u32 which = obj_request->which; 21162169238dSAlex Elder bool more = true; 21172169238dSAlex Elder 21186365d33aSAlex Elder 
rbd_assert(obj_request_img_data_test(obj_request)); 21192169238dSAlex Elder img_request = obj_request->img_request; 21202169238dSAlex Elder 21212169238dSAlex Elder dout("%s: img %p obj %p\n", __func__, img_request, obj_request); 21222169238dSAlex Elder rbd_assert(img_request != NULL); 21232169238dSAlex Elder rbd_assert(img_request->obj_request_count > 0); 21242169238dSAlex Elder rbd_assert(which != BAD_WHICH); 21252169238dSAlex Elder rbd_assert(which < img_request->obj_request_count); 21262169238dSAlex Elder 21272169238dSAlex Elder spin_lock_irq(&img_request->completion_lock); 21282169238dSAlex Elder if (which != img_request->next_completion) 21292169238dSAlex Elder goto out; 21302169238dSAlex Elder 21312169238dSAlex Elder for_each_obj_request_from(img_request, obj_request) { 21322169238dSAlex Elder rbd_assert(more); 21332169238dSAlex Elder rbd_assert(which < img_request->obj_request_count); 21342169238dSAlex Elder 21352169238dSAlex Elder if (!obj_request_done_test(obj_request)) 21362169238dSAlex Elder break; 21371217857fSAlex Elder more = rbd_img_obj_end_request(obj_request); 21382169238dSAlex Elder which++; 21392169238dSAlex Elder } 21402169238dSAlex Elder 21412169238dSAlex Elder rbd_assert(more ^ (which == img_request->obj_request_count)); 21422169238dSAlex Elder img_request->next_completion = which; 21432169238dSAlex Elder out: 21442169238dSAlex Elder spin_unlock_irq(&img_request->completion_lock); 21452169238dSAlex Elder 21462169238dSAlex Elder if (!more) 21472169238dSAlex Elder rbd_img_request_complete(img_request); 21482169238dSAlex Elder } 21492169238dSAlex Elder 2150f1a4739fSAlex Elder /* 2151f1a4739fSAlex Elder * Split up an image request into one or more object requests, each 2152f1a4739fSAlex Elder * to a different object. The "type" parameter indicates whether 2153f1a4739fSAlex Elder * "data_desc" is the pointer to the head of a list of bio 2154f1a4739fSAlex Elder * structures, or the base of a page array. 
In either case this 2155f1a4739fSAlex Elder * function assumes data_desc describes memory sufficient to hold 2156f1a4739fSAlex Elder * all data described by the image request. 2157f1a4739fSAlex Elder */ 2158f1a4739fSAlex Elder static int rbd_img_request_fill(struct rbd_img_request *img_request, 2159f1a4739fSAlex Elder enum obj_request_type type, 2160f1a4739fSAlex Elder void *data_desc) 2161bf0d5f50SAlex Elder { 2162bf0d5f50SAlex Elder struct rbd_device *rbd_dev = img_request->rbd_dev; 2163bf0d5f50SAlex Elder struct rbd_obj_request *obj_request = NULL; 2164bf0d5f50SAlex Elder struct rbd_obj_request *next_obj_request; 21650c425248SAlex Elder bool write_request = img_request_write_test(img_request); 2166a158073cSJingoo Han struct bio *bio_list = NULL; 2167f1a4739fSAlex Elder unsigned int bio_offset = 0; 2168a158073cSJingoo Han struct page **pages = NULL; 21697da22d29SAlex Elder u64 img_offset; 2170bf0d5f50SAlex Elder u64 resid; 2171bf0d5f50SAlex Elder u16 opcode; 2172bf0d5f50SAlex Elder 2173f1a4739fSAlex Elder dout("%s: img %p type %d data_desc %p\n", __func__, img_request, 2174f1a4739fSAlex Elder (int)type, data_desc); 217537206ee5SAlex Elder 2176430c28c3SAlex Elder opcode = write_request ? 
CEPH_OSD_OP_WRITE : CEPH_OSD_OP_READ; 21777da22d29SAlex Elder img_offset = img_request->offset; 2178bf0d5f50SAlex Elder resid = img_request->length; 21794dda41d3SAlex Elder rbd_assert(resid > 0); 2180f1a4739fSAlex Elder 2181f1a4739fSAlex Elder if (type == OBJ_REQUEST_BIO) { 2182f1a4739fSAlex Elder bio_list = data_desc; 21834f024f37SKent Overstreet rbd_assert(img_offset == 21844f024f37SKent Overstreet bio_list->bi_iter.bi_sector << SECTOR_SHIFT); 2185f1a4739fSAlex Elder } else { 2186f1a4739fSAlex Elder rbd_assert(type == OBJ_REQUEST_PAGES); 2187f1a4739fSAlex Elder pages = data_desc; 2188f1a4739fSAlex Elder } 2189f1a4739fSAlex Elder 2190bf0d5f50SAlex Elder while (resid) { 21912fa12320SAlex Elder struct ceph_osd_request *osd_req; 2192bf0d5f50SAlex Elder const char *object_name; 2193bf0d5f50SAlex Elder u64 offset; 2194bf0d5f50SAlex Elder u64 length; 21950ccd5926SIlya Dryomov unsigned int which = 0; 2196bf0d5f50SAlex Elder 21977da22d29SAlex Elder object_name = rbd_segment_name(rbd_dev, img_offset); 2198bf0d5f50SAlex Elder if (!object_name) 2199bf0d5f50SAlex Elder goto out_unwind; 22007da22d29SAlex Elder offset = rbd_segment_offset(rbd_dev, img_offset); 22017da22d29SAlex Elder length = rbd_segment_length(rbd_dev, img_offset, resid); 2202bf0d5f50SAlex Elder obj_request = rbd_obj_request_create(object_name, 2203f1a4739fSAlex Elder offset, length, type); 220478c2a44aSAlex Elder /* object request has its own copy of the object name */ 220578c2a44aSAlex Elder rbd_segment_name_free(object_name); 2206bf0d5f50SAlex Elder if (!obj_request) 2207bf0d5f50SAlex Elder goto out_unwind; 220862054da6SIlya Dryomov 220903507db6SJosh Durgin /* 221003507db6SJosh Durgin * set obj_request->img_request before creating the 221103507db6SJosh Durgin * osd_request so that it gets the right snapc 221203507db6SJosh Durgin */ 221303507db6SJosh Durgin rbd_img_obj_request_add(img_request, obj_request); 2214bf0d5f50SAlex Elder 2215f1a4739fSAlex Elder if (type == OBJ_REQUEST_BIO) { 2216f1a4739fSAlex Elder 
unsigned int clone_size; 2217f1a4739fSAlex Elder 2218bf0d5f50SAlex Elder rbd_assert(length <= (u64)UINT_MAX); 2219bf0d5f50SAlex Elder clone_size = (unsigned int)length; 2220f1a4739fSAlex Elder obj_request->bio_list = 2221f1a4739fSAlex Elder bio_chain_clone_range(&bio_list, 2222f1a4739fSAlex Elder &bio_offset, 2223f1a4739fSAlex Elder clone_size, 2224bf0d5f50SAlex Elder GFP_ATOMIC); 2225bf0d5f50SAlex Elder if (!obj_request->bio_list) 222662054da6SIlya Dryomov goto out_unwind; 2227f1a4739fSAlex Elder } else { 2228f1a4739fSAlex Elder unsigned int page_count; 2229f1a4739fSAlex Elder 2230f1a4739fSAlex Elder obj_request->pages = pages; 2231f1a4739fSAlex Elder page_count = (u32)calc_pages_for(offset, length); 2232f1a4739fSAlex Elder obj_request->page_count = page_count; 2233f1a4739fSAlex Elder if ((offset + length) & ~PAGE_MASK) 2234f1a4739fSAlex Elder page_count--; /* more on last page */ 2235f1a4739fSAlex Elder pages += page_count; 2236f1a4739fSAlex Elder } 2237bf0d5f50SAlex Elder 22380ccd5926SIlya Dryomov osd_req = rbd_osd_req_create(rbd_dev, write_request, 22390ccd5926SIlya Dryomov (write_request ? 
2 : 1), 22402fa12320SAlex Elder obj_request); 22412fa12320SAlex Elder if (!osd_req) 224262054da6SIlya Dryomov goto out_unwind; 22432fa12320SAlex Elder obj_request->osd_req = osd_req; 22442169238dSAlex Elder obj_request->callback = rbd_img_obj_callback; 2245430c28c3SAlex Elder 22460ccd5926SIlya Dryomov if (write_request) { 22470ccd5926SIlya Dryomov osd_req_op_alloc_hint_init(osd_req, which, 22480ccd5926SIlya Dryomov rbd_obj_bytes(&rbd_dev->header), 22490ccd5926SIlya Dryomov rbd_obj_bytes(&rbd_dev->header)); 22500ccd5926SIlya Dryomov which++; 22510ccd5926SIlya Dryomov } 22520ccd5926SIlya Dryomov 22530ccd5926SIlya Dryomov osd_req_op_extent_init(osd_req, which, opcode, offset, length, 22542fa12320SAlex Elder 0, 0); 2255f1a4739fSAlex Elder if (type == OBJ_REQUEST_BIO) 22560ccd5926SIlya Dryomov osd_req_op_extent_osd_data_bio(osd_req, which, 2257f1a4739fSAlex Elder obj_request->bio_list, length); 2258f1a4739fSAlex Elder else 22590ccd5926SIlya Dryomov osd_req_op_extent_osd_data_pages(osd_req, which, 2260f1a4739fSAlex Elder obj_request->pages, length, 2261f1a4739fSAlex Elder offset & ~PAGE_MASK, false, false); 22629d4df01fSAlex Elder 22639d4df01fSAlex Elder if (write_request) 22649d4df01fSAlex Elder rbd_osd_req_format_write(obj_request); 22659d4df01fSAlex Elder else 22669d4df01fSAlex Elder rbd_osd_req_format_read(obj_request); 2267430c28c3SAlex Elder 22687da22d29SAlex Elder obj_request->img_offset = img_offset; 2269bf0d5f50SAlex Elder 22707da22d29SAlex Elder img_offset += length; 2271bf0d5f50SAlex Elder resid -= length; 2272bf0d5f50SAlex Elder } 2273bf0d5f50SAlex Elder 2274bf0d5f50SAlex Elder return 0; 2275bf0d5f50SAlex Elder 2276bf0d5f50SAlex Elder out_unwind: 2277bf0d5f50SAlex Elder for_each_obj_request_safe(img_request, obj_request, next_obj_request) 227842dd037cSIlya Dryomov rbd_img_obj_request_del(img_request, obj_request); 2279bf0d5f50SAlex Elder 2280bf0d5f50SAlex Elder return -ENOMEM; 2281bf0d5f50SAlex Elder } 2282bf0d5f50SAlex Elder 22833d7efd18SAlex Elder static 
void 22840eefd470SAlex Elder rbd_img_obj_copyup_callback(struct rbd_obj_request *obj_request) 22850eefd470SAlex Elder { 22860eefd470SAlex Elder struct rbd_img_request *img_request; 22870eefd470SAlex Elder struct rbd_device *rbd_dev; 2288ebda6408SAlex Elder struct page **pages; 22890eefd470SAlex Elder u32 page_count; 22900eefd470SAlex Elder 22910eefd470SAlex Elder rbd_assert(obj_request->type == OBJ_REQUEST_BIO); 22920eefd470SAlex Elder rbd_assert(obj_request_img_data_test(obj_request)); 22930eefd470SAlex Elder img_request = obj_request->img_request; 22940eefd470SAlex Elder rbd_assert(img_request); 22950eefd470SAlex Elder 22960eefd470SAlex Elder rbd_dev = img_request->rbd_dev; 22970eefd470SAlex Elder rbd_assert(rbd_dev); 22980eefd470SAlex Elder 2299ebda6408SAlex Elder pages = obj_request->copyup_pages; 2300ebda6408SAlex Elder rbd_assert(pages != NULL); 23010eefd470SAlex Elder obj_request->copyup_pages = NULL; 2302ebda6408SAlex Elder page_count = obj_request->copyup_page_count; 2303ebda6408SAlex Elder rbd_assert(page_count); 2304ebda6408SAlex Elder obj_request->copyup_page_count = 0; 2305ebda6408SAlex Elder ceph_release_page_vector(pages, page_count); 23060eefd470SAlex Elder 23070eefd470SAlex Elder /* 23080eefd470SAlex Elder * We want the transfer count to reflect the size of the 23090eefd470SAlex Elder * original write request. There is no such thing as a 23100eefd470SAlex Elder * successful short write, so if the request was successful 23110eefd470SAlex Elder * we can just set it to the originally-requested length. 
23120eefd470SAlex Elder */ 23130eefd470SAlex Elder if (!obj_request->result) 23140eefd470SAlex Elder obj_request->xferred = obj_request->length; 23150eefd470SAlex Elder 23160eefd470SAlex Elder /* Finish up with the normal image object callback */ 23170eefd470SAlex Elder 23180eefd470SAlex Elder rbd_img_obj_callback(obj_request); 23190eefd470SAlex Elder } 23200eefd470SAlex Elder 23210eefd470SAlex Elder static void 23223d7efd18SAlex Elder rbd_img_obj_parent_read_full_callback(struct rbd_img_request *img_request) 23233d7efd18SAlex Elder { 23243d7efd18SAlex Elder struct rbd_obj_request *orig_request; 23250eefd470SAlex Elder struct ceph_osd_request *osd_req; 23260eefd470SAlex Elder struct ceph_osd_client *osdc; 23270eefd470SAlex Elder struct rbd_device *rbd_dev; 23283d7efd18SAlex Elder struct page **pages; 2329ebda6408SAlex Elder u32 page_count; 2330bbea1c1aSAlex Elder int img_result; 2331ebda6408SAlex Elder u64 parent_length; 2332b91f09f1SAlex Elder u64 offset; 2333b91f09f1SAlex Elder u64 length; 23343d7efd18SAlex Elder 23353d7efd18SAlex Elder rbd_assert(img_request_child_test(img_request)); 23363d7efd18SAlex Elder 23373d7efd18SAlex Elder /* First get what we need from the image request */ 23383d7efd18SAlex Elder 23393d7efd18SAlex Elder pages = img_request->copyup_pages; 23403d7efd18SAlex Elder rbd_assert(pages != NULL); 23413d7efd18SAlex Elder img_request->copyup_pages = NULL; 2342ebda6408SAlex Elder page_count = img_request->copyup_page_count; 2343ebda6408SAlex Elder rbd_assert(page_count); 2344ebda6408SAlex Elder img_request->copyup_page_count = 0; 23453d7efd18SAlex Elder 23463d7efd18SAlex Elder orig_request = img_request->obj_request; 23473d7efd18SAlex Elder rbd_assert(orig_request != NULL); 2348b91f09f1SAlex Elder rbd_assert(obj_request_type_valid(orig_request->type)); 2349bbea1c1aSAlex Elder img_result = img_request->result; 2350ebda6408SAlex Elder parent_length = img_request->length; 2351ebda6408SAlex Elder rbd_assert(parent_length == img_request->xferred); 
23523d7efd18SAlex Elder rbd_img_request_put(img_request); 23533d7efd18SAlex Elder 235491c6febbSAlex Elder rbd_assert(orig_request->img_request); 235591c6febbSAlex Elder rbd_dev = orig_request->img_request->rbd_dev; 23563d7efd18SAlex Elder rbd_assert(rbd_dev); 23573d7efd18SAlex Elder 2358bbea1c1aSAlex Elder /* 2359bbea1c1aSAlex Elder * If the overlap has become 0 (most likely because the 2360bbea1c1aSAlex Elder * image has been flattened) we need to free the pages 2361bbea1c1aSAlex Elder * and re-submit the original write request. 2362bbea1c1aSAlex Elder */ 2363bbea1c1aSAlex Elder if (!rbd_dev->parent_overlap) { 2364bbea1c1aSAlex Elder struct ceph_osd_client *osdc; 2365bbea1c1aSAlex Elder 2366bbea1c1aSAlex Elder ceph_release_page_vector(pages, page_count); 2367bbea1c1aSAlex Elder osdc = &rbd_dev->rbd_client->client->osdc; 2368bbea1c1aSAlex Elder img_result = rbd_obj_request_submit(osdc, orig_request); 2369bbea1c1aSAlex Elder if (!img_result) 2370bbea1c1aSAlex Elder return; 2371bbea1c1aSAlex Elder } 2372bbea1c1aSAlex Elder 2373bbea1c1aSAlex Elder if (img_result) 23740eefd470SAlex Elder goto out_err; 23753d7efd18SAlex Elder 23768785b1d4SAlex Elder /* 23778785b1d4SAlex Elder * The original osd request is of no use to use any more. 23780ccd5926SIlya Dryomov * We need a new one that can hold the three ops in a copyup 23798785b1d4SAlex Elder * request. Allocate the new copyup osd request for the 23808785b1d4SAlex Elder * original request, and release the old one. 
23818785b1d4SAlex Elder */ 2382bbea1c1aSAlex Elder img_result = -ENOMEM; 23830eefd470SAlex Elder osd_req = rbd_osd_req_create_copyup(orig_request); 23840eefd470SAlex Elder if (!osd_req) 23850eefd470SAlex Elder goto out_err; 23868785b1d4SAlex Elder rbd_osd_req_destroy(orig_request->osd_req); 23870eefd470SAlex Elder orig_request->osd_req = osd_req; 23880eefd470SAlex Elder orig_request->copyup_pages = pages; 2389ebda6408SAlex Elder orig_request->copyup_page_count = page_count; 23903d7efd18SAlex Elder 23910eefd470SAlex Elder /* Initialize the copyup op */ 23920eefd470SAlex Elder 23930eefd470SAlex Elder osd_req_op_cls_init(osd_req, 0, CEPH_OSD_OP_CALL, "rbd", "copyup"); 2394ebda6408SAlex Elder osd_req_op_cls_request_data_pages(osd_req, 0, pages, parent_length, 0, 23950eefd470SAlex Elder false, false); 23960eefd470SAlex Elder 23970ccd5926SIlya Dryomov /* Then the hint op */ 23980ccd5926SIlya Dryomov 23990ccd5926SIlya Dryomov osd_req_op_alloc_hint_init(osd_req, 1, rbd_obj_bytes(&rbd_dev->header), 24000ccd5926SIlya Dryomov rbd_obj_bytes(&rbd_dev->header)); 24010ccd5926SIlya Dryomov 24020ccd5926SIlya Dryomov /* And the original write request op */ 24030eefd470SAlex Elder 2404b91f09f1SAlex Elder offset = orig_request->offset; 2405b91f09f1SAlex Elder length = orig_request->length; 24060ccd5926SIlya Dryomov osd_req_op_extent_init(osd_req, 2, CEPH_OSD_OP_WRITE, 2407b91f09f1SAlex Elder offset, length, 0, 0); 2408b91f09f1SAlex Elder if (orig_request->type == OBJ_REQUEST_BIO) 24090ccd5926SIlya Dryomov osd_req_op_extent_osd_data_bio(osd_req, 2, 2410b91f09f1SAlex Elder orig_request->bio_list, length); 2411b91f09f1SAlex Elder else 24120ccd5926SIlya Dryomov osd_req_op_extent_osd_data_pages(osd_req, 2, 2413b91f09f1SAlex Elder orig_request->pages, length, 2414b91f09f1SAlex Elder offset & ~PAGE_MASK, false, false); 24150eefd470SAlex Elder 24160eefd470SAlex Elder rbd_osd_req_format_write(orig_request); 24170eefd470SAlex Elder 24180eefd470SAlex Elder /* All set, send it off. 
*/ 24190eefd470SAlex Elder 24200eefd470SAlex Elder orig_request->callback = rbd_img_obj_copyup_callback; 24210eefd470SAlex Elder osdc = &rbd_dev->rbd_client->client->osdc; 2422bbea1c1aSAlex Elder img_result = rbd_obj_request_submit(osdc, orig_request); 2423bbea1c1aSAlex Elder if (!img_result) 24240eefd470SAlex Elder return; 24250eefd470SAlex Elder out_err: 24260eefd470SAlex Elder /* Record the error code and complete the request */ 24270eefd470SAlex Elder 2428bbea1c1aSAlex Elder orig_request->result = img_result; 24290eefd470SAlex Elder orig_request->xferred = 0; 24303d7efd18SAlex Elder obj_request_done_set(orig_request); 24313d7efd18SAlex Elder rbd_obj_request_complete(orig_request); 24323d7efd18SAlex Elder } 24333d7efd18SAlex Elder 24343d7efd18SAlex Elder /* 24353d7efd18SAlex Elder * Read from the parent image the range of data that covers the 24363d7efd18SAlex Elder * entire target of the given object request. This is used for 24373d7efd18SAlex Elder * satisfying a layered image write request when the target of an 24383d7efd18SAlex Elder * object request from the image request does not exist. 24393d7efd18SAlex Elder * 24403d7efd18SAlex Elder * A page array big enough to hold the returned data is allocated 24413d7efd18SAlex Elder * and supplied to rbd_img_request_fill() as the "data descriptor." 24423d7efd18SAlex Elder * When the read completes, this page array will be transferred to 24433d7efd18SAlex Elder * the original object request for the copyup operation. 24443d7efd18SAlex Elder * 24453d7efd18SAlex Elder * If an error occurs, record it as the result of the original 24463d7efd18SAlex Elder * object request and mark it done so it gets completed. 
24473d7efd18SAlex Elder */ 24483d7efd18SAlex Elder static int rbd_img_obj_parent_read_full(struct rbd_obj_request *obj_request) 24493d7efd18SAlex Elder { 24503d7efd18SAlex Elder struct rbd_img_request *img_request = NULL; 24513d7efd18SAlex Elder struct rbd_img_request *parent_request = NULL; 24523d7efd18SAlex Elder struct rbd_device *rbd_dev; 24533d7efd18SAlex Elder u64 img_offset; 24543d7efd18SAlex Elder u64 length; 24553d7efd18SAlex Elder struct page **pages = NULL; 24563d7efd18SAlex Elder u32 page_count; 24573d7efd18SAlex Elder int result; 24583d7efd18SAlex Elder 24593d7efd18SAlex Elder rbd_assert(obj_request_img_data_test(obj_request)); 2460b91f09f1SAlex Elder rbd_assert(obj_request_type_valid(obj_request->type)); 24613d7efd18SAlex Elder 24623d7efd18SAlex Elder img_request = obj_request->img_request; 24633d7efd18SAlex Elder rbd_assert(img_request != NULL); 24643d7efd18SAlex Elder rbd_dev = img_request->rbd_dev; 24653d7efd18SAlex Elder rbd_assert(rbd_dev->parent != NULL); 24663d7efd18SAlex Elder 24673d7efd18SAlex Elder /* 24683d7efd18SAlex Elder * Determine the byte range covered by the object in the 24693d7efd18SAlex Elder * child image to which the original request was to be sent. 24703d7efd18SAlex Elder */ 24713d7efd18SAlex Elder img_offset = obj_request->img_offset - obj_request->offset; 24723d7efd18SAlex Elder length = (u64)1 << rbd_dev->header.obj_order; 24733d7efd18SAlex Elder 24743d7efd18SAlex Elder /* 2475a9e8ba2cSAlex Elder * There is no defined parent data beyond the parent 2476a9e8ba2cSAlex Elder * overlap, so limit what we read at that boundary if 2477a9e8ba2cSAlex Elder * necessary. 
2478a9e8ba2cSAlex Elder */ 2479a9e8ba2cSAlex Elder if (img_offset + length > rbd_dev->parent_overlap) { 2480a9e8ba2cSAlex Elder rbd_assert(img_offset < rbd_dev->parent_overlap); 2481a9e8ba2cSAlex Elder length = rbd_dev->parent_overlap - img_offset; 2482a9e8ba2cSAlex Elder } 2483a9e8ba2cSAlex Elder 2484a9e8ba2cSAlex Elder /* 24853d7efd18SAlex Elder * Allocate a page array big enough to receive the data read 24863d7efd18SAlex Elder * from the parent. 24873d7efd18SAlex Elder */ 24883d7efd18SAlex Elder page_count = (u32)calc_pages_for(0, length); 24893d7efd18SAlex Elder pages = ceph_alloc_page_vector(page_count, GFP_KERNEL); 24903d7efd18SAlex Elder if (IS_ERR(pages)) { 24913d7efd18SAlex Elder result = PTR_ERR(pages); 24923d7efd18SAlex Elder pages = NULL; 24933d7efd18SAlex Elder goto out_err; 24943d7efd18SAlex Elder } 24953d7efd18SAlex Elder 24963d7efd18SAlex Elder result = -ENOMEM; 2497e93f3152SAlex Elder parent_request = rbd_parent_request_create(obj_request, 2498e93f3152SAlex Elder img_offset, length); 24993d7efd18SAlex Elder if (!parent_request) 25003d7efd18SAlex Elder goto out_err; 25013d7efd18SAlex Elder 25023d7efd18SAlex Elder result = rbd_img_request_fill(parent_request, OBJ_REQUEST_PAGES, pages); 25033d7efd18SAlex Elder if (result) 25043d7efd18SAlex Elder goto out_err; 25053d7efd18SAlex Elder parent_request->copyup_pages = pages; 2506ebda6408SAlex Elder parent_request->copyup_page_count = page_count; 25073d7efd18SAlex Elder 25083d7efd18SAlex Elder parent_request->callback = rbd_img_obj_parent_read_full_callback; 25093d7efd18SAlex Elder result = rbd_img_request_submit(parent_request); 25103d7efd18SAlex Elder if (!result) 25113d7efd18SAlex Elder return 0; 25123d7efd18SAlex Elder 25133d7efd18SAlex Elder parent_request->copyup_pages = NULL; 2514ebda6408SAlex Elder parent_request->copyup_page_count = 0; 25153d7efd18SAlex Elder parent_request->obj_request = NULL; 25163d7efd18SAlex Elder rbd_obj_request_put(obj_request); 25173d7efd18SAlex Elder out_err: 
25183d7efd18SAlex Elder if (pages) 25193d7efd18SAlex Elder ceph_release_page_vector(pages, page_count); 25203d7efd18SAlex Elder if (parent_request) 25213d7efd18SAlex Elder rbd_img_request_put(parent_request); 25223d7efd18SAlex Elder obj_request->result = result; 25233d7efd18SAlex Elder obj_request->xferred = 0; 25243d7efd18SAlex Elder obj_request_done_set(obj_request); 25253d7efd18SAlex Elder 25263d7efd18SAlex Elder return result; 25273d7efd18SAlex Elder } 25283d7efd18SAlex Elder 2529c5b5ef6cSAlex Elder static void rbd_img_obj_exists_callback(struct rbd_obj_request *obj_request) 2530c5b5ef6cSAlex Elder { 2531c5b5ef6cSAlex Elder struct rbd_obj_request *orig_request; 2532638f5abeSAlex Elder struct rbd_device *rbd_dev; 2533c5b5ef6cSAlex Elder int result; 2534c5b5ef6cSAlex Elder 2535c5b5ef6cSAlex Elder rbd_assert(!obj_request_img_data_test(obj_request)); 2536c5b5ef6cSAlex Elder 2537c5b5ef6cSAlex Elder /* 2538c5b5ef6cSAlex Elder * All we need from the object request is the original 2539c5b5ef6cSAlex Elder * request and the result of the STAT op. Grab those, then 2540c5b5ef6cSAlex Elder * we're done with the request. 
2541c5b5ef6cSAlex Elder */ 2542c5b5ef6cSAlex Elder orig_request = obj_request->obj_request; 2543c5b5ef6cSAlex Elder obj_request->obj_request = NULL; 2544912c317dSAlex Elder rbd_obj_request_put(orig_request); 2545c5b5ef6cSAlex Elder rbd_assert(orig_request); 2546c5b5ef6cSAlex Elder rbd_assert(orig_request->img_request); 2547c5b5ef6cSAlex Elder 2548c5b5ef6cSAlex Elder result = obj_request->result; 2549c5b5ef6cSAlex Elder obj_request->result = 0; 2550c5b5ef6cSAlex Elder 2551c5b5ef6cSAlex Elder dout("%s: obj %p for obj %p result %d %llu/%llu\n", __func__, 2552c5b5ef6cSAlex Elder obj_request, orig_request, result, 2553c5b5ef6cSAlex Elder obj_request->xferred, obj_request->length); 2554c5b5ef6cSAlex Elder rbd_obj_request_put(obj_request); 2555c5b5ef6cSAlex Elder 2556638f5abeSAlex Elder /* 2557638f5abeSAlex Elder * If the overlap has become 0 (most likely because the 2558638f5abeSAlex Elder * image has been flattened) we need to free the pages 2559638f5abeSAlex Elder * and re-submit the original write request. 2560638f5abeSAlex Elder */ 2561638f5abeSAlex Elder rbd_dev = orig_request->img_request->rbd_dev; 2562638f5abeSAlex Elder if (!rbd_dev->parent_overlap) { 2563638f5abeSAlex Elder struct ceph_osd_client *osdc; 2564638f5abeSAlex Elder 2565638f5abeSAlex Elder osdc = &rbd_dev->rbd_client->client->osdc; 2566638f5abeSAlex Elder result = rbd_obj_request_submit(osdc, orig_request); 2567638f5abeSAlex Elder if (!result) 2568638f5abeSAlex Elder return; 2569638f5abeSAlex Elder } 2570c5b5ef6cSAlex Elder 2571c5b5ef6cSAlex Elder /* 2572c5b5ef6cSAlex Elder * Our only purpose here is to determine whether the object 2573c5b5ef6cSAlex Elder * exists, and we don't want to treat the non-existence as 2574c5b5ef6cSAlex Elder * an error. If something else comes back, transfer the 2575c5b5ef6cSAlex Elder * error to the original request and complete it now. 
2576c5b5ef6cSAlex Elder */ 2577c5b5ef6cSAlex Elder if (!result) { 2578c5b5ef6cSAlex Elder obj_request_existence_set(orig_request, true); 2579c5b5ef6cSAlex Elder } else if (result == -ENOENT) { 2580c5b5ef6cSAlex Elder obj_request_existence_set(orig_request, false); 2581c5b5ef6cSAlex Elder } else if (result) { 2582c5b5ef6cSAlex Elder orig_request->result = result; 25833d7efd18SAlex Elder goto out; 2584c5b5ef6cSAlex Elder } 2585c5b5ef6cSAlex Elder 2586c5b5ef6cSAlex Elder /* 2587c5b5ef6cSAlex Elder * Resubmit the original request now that we have recorded 2588c5b5ef6cSAlex Elder * whether the target object exists. 2589c5b5ef6cSAlex Elder */ 2590b454e36dSAlex Elder orig_request->result = rbd_img_obj_request_submit(orig_request); 25913d7efd18SAlex Elder out: 2592c5b5ef6cSAlex Elder if (orig_request->result) 2593c5b5ef6cSAlex Elder rbd_obj_request_complete(orig_request); 2594c5b5ef6cSAlex Elder } 2595c5b5ef6cSAlex Elder 2596c5b5ef6cSAlex Elder static int rbd_img_obj_exists_submit(struct rbd_obj_request *obj_request) 2597c5b5ef6cSAlex Elder { 2598c5b5ef6cSAlex Elder struct rbd_obj_request *stat_request; 2599c5b5ef6cSAlex Elder struct rbd_device *rbd_dev; 2600c5b5ef6cSAlex Elder struct ceph_osd_client *osdc; 2601c5b5ef6cSAlex Elder struct page **pages = NULL; 2602c5b5ef6cSAlex Elder u32 page_count; 2603c5b5ef6cSAlex Elder size_t size; 2604c5b5ef6cSAlex Elder int ret; 2605c5b5ef6cSAlex Elder 2606c5b5ef6cSAlex Elder /* 2607c5b5ef6cSAlex Elder * The response data for a STAT call consists of: 2608c5b5ef6cSAlex Elder * le64 length; 2609c5b5ef6cSAlex Elder * struct { 2610c5b5ef6cSAlex Elder * le32 tv_sec; 2611c5b5ef6cSAlex Elder * le32 tv_nsec; 2612c5b5ef6cSAlex Elder * } mtime; 2613c5b5ef6cSAlex Elder */ 2614c5b5ef6cSAlex Elder size = sizeof (__le64) + sizeof (__le32) + sizeof (__le32); 2615c5b5ef6cSAlex Elder page_count = (u32)calc_pages_for(0, size); 2616c5b5ef6cSAlex Elder pages = ceph_alloc_page_vector(page_count, GFP_KERNEL); 2617c5b5ef6cSAlex Elder if (IS_ERR(pages)) 
2618c5b5ef6cSAlex Elder return PTR_ERR(pages); 2619c5b5ef6cSAlex Elder 2620c5b5ef6cSAlex Elder ret = -ENOMEM; 2621c5b5ef6cSAlex Elder stat_request = rbd_obj_request_create(obj_request->object_name, 0, 0, 2622c5b5ef6cSAlex Elder OBJ_REQUEST_PAGES); 2623c5b5ef6cSAlex Elder if (!stat_request) 2624c5b5ef6cSAlex Elder goto out; 2625c5b5ef6cSAlex Elder 2626c5b5ef6cSAlex Elder rbd_obj_request_get(obj_request); 2627c5b5ef6cSAlex Elder stat_request->obj_request = obj_request; 2628c5b5ef6cSAlex Elder stat_request->pages = pages; 2629c5b5ef6cSAlex Elder stat_request->page_count = page_count; 2630c5b5ef6cSAlex Elder 2631c5b5ef6cSAlex Elder rbd_assert(obj_request->img_request); 2632c5b5ef6cSAlex Elder rbd_dev = obj_request->img_request->rbd_dev; 2633deb236b3SIlya Dryomov stat_request->osd_req = rbd_osd_req_create(rbd_dev, false, 1, 2634c5b5ef6cSAlex Elder stat_request); 2635c5b5ef6cSAlex Elder if (!stat_request->osd_req) 2636c5b5ef6cSAlex Elder goto out; 2637c5b5ef6cSAlex Elder stat_request->callback = rbd_img_obj_exists_callback; 2638c5b5ef6cSAlex Elder 2639c5b5ef6cSAlex Elder osd_req_op_init(stat_request->osd_req, 0, CEPH_OSD_OP_STAT); 2640c5b5ef6cSAlex Elder osd_req_op_raw_data_in_pages(stat_request->osd_req, 0, pages, size, 0, 2641c5b5ef6cSAlex Elder false, false); 26429d4df01fSAlex Elder rbd_osd_req_format_read(stat_request); 2643c5b5ef6cSAlex Elder 2644c5b5ef6cSAlex Elder osdc = &rbd_dev->rbd_client->client->osdc; 2645c5b5ef6cSAlex Elder ret = rbd_obj_request_submit(osdc, stat_request); 2646c5b5ef6cSAlex Elder out: 2647c5b5ef6cSAlex Elder if (ret) 2648c5b5ef6cSAlex Elder rbd_obj_request_put(obj_request); 2649c5b5ef6cSAlex Elder 2650c5b5ef6cSAlex Elder return ret; 2651c5b5ef6cSAlex Elder } 2652c5b5ef6cSAlex Elder 2653b454e36dSAlex Elder static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request) 2654b454e36dSAlex Elder { 2655b454e36dSAlex Elder struct rbd_img_request *img_request; 2656a9e8ba2cSAlex Elder struct rbd_device *rbd_dev; 26573d7efd18SAlex Elder 
bool known; 2658b454e36dSAlex Elder 2659b454e36dSAlex Elder rbd_assert(obj_request_img_data_test(obj_request)); 2660b454e36dSAlex Elder 2661b454e36dSAlex Elder img_request = obj_request->img_request; 2662b454e36dSAlex Elder rbd_assert(img_request); 2663a9e8ba2cSAlex Elder rbd_dev = img_request->rbd_dev; 2664b454e36dSAlex Elder 2665b454e36dSAlex Elder /* 2666a9e8ba2cSAlex Elder * Only writes to layered images need special handling. 2667a9e8ba2cSAlex Elder * Reads and non-layered writes are simple object requests. 2668a9e8ba2cSAlex Elder * Layered writes that start beyond the end of the overlap 2669a9e8ba2cSAlex Elder * with the parent have no parent data, so they too are 2670a9e8ba2cSAlex Elder * simple object requests. Finally, if the target object is 2671a9e8ba2cSAlex Elder * known to already exist, its parent data has already been 2672a9e8ba2cSAlex Elder * copied, so a write to the object can also be handled as a 2673a9e8ba2cSAlex Elder * simple object request. 2674b454e36dSAlex Elder */ 2675b454e36dSAlex Elder if (!img_request_write_test(img_request) || 2676b454e36dSAlex Elder !img_request_layered_test(img_request) || 2677a9e8ba2cSAlex Elder rbd_dev->parent_overlap <= obj_request->img_offset || 26783d7efd18SAlex Elder ((known = obj_request_known_test(obj_request)) && 26793d7efd18SAlex Elder obj_request_exists_test(obj_request))) { 2680b454e36dSAlex Elder 2681b454e36dSAlex Elder struct rbd_device *rbd_dev; 2682b454e36dSAlex Elder struct ceph_osd_client *osdc; 2683b454e36dSAlex Elder 2684b454e36dSAlex Elder rbd_dev = obj_request->img_request->rbd_dev; 2685b454e36dSAlex Elder osdc = &rbd_dev->rbd_client->client->osdc; 2686b454e36dSAlex Elder 2687b454e36dSAlex Elder return rbd_obj_request_submit(osdc, obj_request); 2688b454e36dSAlex Elder } 2689b454e36dSAlex Elder 2690b454e36dSAlex Elder /* 26913d7efd18SAlex Elder * It's a layered write. The target object might exist but 26923d7efd18SAlex Elder * we may not know that yet. 
If we know it doesn't exist, 26933d7efd18SAlex Elder * start by reading the data for the full target object from 26943d7efd18SAlex Elder * the parent so we can use it for a copyup to the target. 2695b454e36dSAlex Elder */ 26963d7efd18SAlex Elder if (known) 26973d7efd18SAlex Elder return rbd_img_obj_parent_read_full(obj_request); 26983d7efd18SAlex Elder 26993d7efd18SAlex Elder /* We don't know whether the target exists. Go find out. */ 2700b454e36dSAlex Elder 2701b454e36dSAlex Elder return rbd_img_obj_exists_submit(obj_request); 2702b454e36dSAlex Elder } 2703b454e36dSAlex Elder 2704bf0d5f50SAlex Elder static int rbd_img_request_submit(struct rbd_img_request *img_request) 2705bf0d5f50SAlex Elder { 2706bf0d5f50SAlex Elder struct rbd_obj_request *obj_request; 270746faeed4SAlex Elder struct rbd_obj_request *next_obj_request; 2708bf0d5f50SAlex Elder 270937206ee5SAlex Elder dout("%s: img %p\n", __func__, img_request); 271046faeed4SAlex Elder for_each_obj_request_safe(img_request, obj_request, next_obj_request) { 2711bf0d5f50SAlex Elder int ret; 2712bf0d5f50SAlex Elder 2713b454e36dSAlex Elder ret = rbd_img_obj_request_submit(obj_request); 2714bf0d5f50SAlex Elder if (ret) 2715bf0d5f50SAlex Elder return ret; 2716bf0d5f50SAlex Elder } 2717bf0d5f50SAlex Elder 2718bf0d5f50SAlex Elder return 0; 2719bf0d5f50SAlex Elder } 2720bf0d5f50SAlex Elder 27218b3e1a56SAlex Elder static void rbd_img_parent_read_callback(struct rbd_img_request *img_request) 27228b3e1a56SAlex Elder { 27238b3e1a56SAlex Elder struct rbd_obj_request *obj_request; 2724a9e8ba2cSAlex Elder struct rbd_device *rbd_dev; 2725a9e8ba2cSAlex Elder u64 obj_end; 272602c74fbaSAlex Elder u64 img_xferred; 272702c74fbaSAlex Elder int img_result; 27288b3e1a56SAlex Elder 27298b3e1a56SAlex Elder rbd_assert(img_request_child_test(img_request)); 27308b3e1a56SAlex Elder 273102c74fbaSAlex Elder /* First get what we need from the image request and release it */ 273202c74fbaSAlex Elder 27338b3e1a56SAlex Elder obj_request = 
img_request->obj_request; 273402c74fbaSAlex Elder img_xferred = img_request->xferred; 273502c74fbaSAlex Elder img_result = img_request->result; 273602c74fbaSAlex Elder rbd_img_request_put(img_request); 273702c74fbaSAlex Elder 273802c74fbaSAlex Elder /* 273902c74fbaSAlex Elder * If the overlap has become 0 (most likely because the 274002c74fbaSAlex Elder * image has been flattened) we need to re-submit the 274102c74fbaSAlex Elder * original request. 274202c74fbaSAlex Elder */ 2743a9e8ba2cSAlex Elder rbd_assert(obj_request); 2744a9e8ba2cSAlex Elder rbd_assert(obj_request->img_request); 274502c74fbaSAlex Elder rbd_dev = obj_request->img_request->rbd_dev; 274602c74fbaSAlex Elder if (!rbd_dev->parent_overlap) { 274702c74fbaSAlex Elder struct ceph_osd_client *osdc; 27488b3e1a56SAlex Elder 274902c74fbaSAlex Elder osdc = &rbd_dev->rbd_client->client->osdc; 275002c74fbaSAlex Elder img_result = rbd_obj_request_submit(osdc, obj_request); 275102c74fbaSAlex Elder if (!img_result) 275202c74fbaSAlex Elder return; 275302c74fbaSAlex Elder } 275402c74fbaSAlex Elder 275502c74fbaSAlex Elder obj_request->result = img_result; 2756a9e8ba2cSAlex Elder if (obj_request->result) 2757a9e8ba2cSAlex Elder goto out; 2758a9e8ba2cSAlex Elder 2759a9e8ba2cSAlex Elder /* 2760a9e8ba2cSAlex Elder * We need to zero anything beyond the parent overlap 2761a9e8ba2cSAlex Elder * boundary. Since rbd_img_obj_request_read_callback() 2762a9e8ba2cSAlex Elder * will zero anything beyond the end of a short read, an 2763a9e8ba2cSAlex Elder * easy way to do this is to pretend the data from the 2764a9e8ba2cSAlex Elder * parent came up short--ending at the overlap boundary. 
2765a9e8ba2cSAlex Elder */ 2766a9e8ba2cSAlex Elder rbd_assert(obj_request->img_offset < U64_MAX - obj_request->length); 2767a9e8ba2cSAlex Elder obj_end = obj_request->img_offset + obj_request->length; 2768a9e8ba2cSAlex Elder if (obj_end > rbd_dev->parent_overlap) { 2769a9e8ba2cSAlex Elder u64 xferred = 0; 2770a9e8ba2cSAlex Elder 2771a9e8ba2cSAlex Elder if (obj_request->img_offset < rbd_dev->parent_overlap) 2772a9e8ba2cSAlex Elder xferred = rbd_dev->parent_overlap - 2773a9e8ba2cSAlex Elder obj_request->img_offset; 2774a9e8ba2cSAlex Elder 277502c74fbaSAlex Elder obj_request->xferred = min(img_xferred, xferred); 2776a9e8ba2cSAlex Elder } else { 277702c74fbaSAlex Elder obj_request->xferred = img_xferred; 2778a9e8ba2cSAlex Elder } 2779a9e8ba2cSAlex Elder out: 27808b3e1a56SAlex Elder rbd_img_obj_request_read_callback(obj_request); 27818b3e1a56SAlex Elder rbd_obj_request_complete(obj_request); 27828b3e1a56SAlex Elder } 27838b3e1a56SAlex Elder 27848b3e1a56SAlex Elder static void rbd_img_parent_read(struct rbd_obj_request *obj_request) 27858b3e1a56SAlex Elder { 27868b3e1a56SAlex Elder struct rbd_img_request *img_request; 27878b3e1a56SAlex Elder int result; 27888b3e1a56SAlex Elder 27898b3e1a56SAlex Elder rbd_assert(obj_request_img_data_test(obj_request)); 27908b3e1a56SAlex Elder rbd_assert(obj_request->img_request != NULL); 27918b3e1a56SAlex Elder rbd_assert(obj_request->result == (s32) -ENOENT); 27925b2ab72dSAlex Elder rbd_assert(obj_request_type_valid(obj_request->type)); 27938b3e1a56SAlex Elder 27948b3e1a56SAlex Elder /* rbd_read_finish(obj_request, obj_request->length); */ 2795e93f3152SAlex Elder img_request = rbd_parent_request_create(obj_request, 27968b3e1a56SAlex Elder obj_request->img_offset, 2797e93f3152SAlex Elder obj_request->length); 27988b3e1a56SAlex Elder result = -ENOMEM; 27998b3e1a56SAlex Elder if (!img_request) 28008b3e1a56SAlex Elder goto out_err; 28018b3e1a56SAlex Elder 28025b2ab72dSAlex Elder if (obj_request->type == OBJ_REQUEST_BIO) 2803f1a4739fSAlex 
Elder result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO, 2804f1a4739fSAlex Elder obj_request->bio_list); 28055b2ab72dSAlex Elder else 28065b2ab72dSAlex Elder result = rbd_img_request_fill(img_request, OBJ_REQUEST_PAGES, 28075b2ab72dSAlex Elder obj_request->pages); 28088b3e1a56SAlex Elder if (result) 28098b3e1a56SAlex Elder goto out_err; 28108b3e1a56SAlex Elder 28118b3e1a56SAlex Elder img_request->callback = rbd_img_parent_read_callback; 28128b3e1a56SAlex Elder result = rbd_img_request_submit(img_request); 28138b3e1a56SAlex Elder if (result) 28148b3e1a56SAlex Elder goto out_err; 28158b3e1a56SAlex Elder 28168b3e1a56SAlex Elder return; 28178b3e1a56SAlex Elder out_err: 28188b3e1a56SAlex Elder if (img_request) 28198b3e1a56SAlex Elder rbd_img_request_put(img_request); 28208b3e1a56SAlex Elder obj_request->result = result; 28218b3e1a56SAlex Elder obj_request->xferred = 0; 28228b3e1a56SAlex Elder obj_request_done_set(obj_request); 28238b3e1a56SAlex Elder } 28248b3e1a56SAlex Elder 282520e0af67SJosh Durgin static int rbd_obj_notify_ack_sync(struct rbd_device *rbd_dev, u64 notify_id) 2826b8d70035SAlex Elder { 2827b8d70035SAlex Elder struct rbd_obj_request *obj_request; 28282169238dSAlex Elder struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 2829b8d70035SAlex Elder int ret; 2830b8d70035SAlex Elder 2831b8d70035SAlex Elder obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0, 2832b8d70035SAlex Elder OBJ_REQUEST_NODATA); 2833b8d70035SAlex Elder if (!obj_request) 2834b8d70035SAlex Elder return -ENOMEM; 2835b8d70035SAlex Elder 2836b8d70035SAlex Elder ret = -ENOMEM; 2837deb236b3SIlya Dryomov obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, 1, 2838deb236b3SIlya Dryomov obj_request); 2839b8d70035SAlex Elder if (!obj_request->osd_req) 2840b8d70035SAlex Elder goto out; 2841b8d70035SAlex Elder 2842c99d2d4aSAlex Elder osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_NOTIFY_ACK, 2843cc4a38bdSAlex Elder notify_id, 0, 0); 
28449d4df01fSAlex Elder rbd_osd_req_format_read(obj_request); 2845430c28c3SAlex Elder 2846b8d70035SAlex Elder ret = rbd_obj_request_submit(osdc, obj_request); 2847cf81b60eSAlex Elder if (ret) 284820e0af67SJosh Durgin goto out; 284920e0af67SJosh Durgin ret = rbd_obj_request_wait(obj_request); 285020e0af67SJosh Durgin out: 2851b8d70035SAlex Elder rbd_obj_request_put(obj_request); 2852b8d70035SAlex Elder 2853b8d70035SAlex Elder return ret; 2854b8d70035SAlex Elder } 2855b8d70035SAlex Elder 2856b8d70035SAlex Elder static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data) 2857b8d70035SAlex Elder { 2858b8d70035SAlex Elder struct rbd_device *rbd_dev = (struct rbd_device *)data; 2859e627db08SAlex Elder int ret; 2860b8d70035SAlex Elder 2861b8d70035SAlex Elder if (!rbd_dev) 2862b8d70035SAlex Elder return; 2863b8d70035SAlex Elder 286437206ee5SAlex Elder dout("%s: \"%s\" notify_id %llu opcode %u\n", __func__, 2865b8d70035SAlex Elder rbd_dev->header_name, (unsigned long long)notify_id, 2866b8d70035SAlex Elder (unsigned int)opcode); 2867e627db08SAlex Elder ret = rbd_dev_refresh(rbd_dev); 2868e627db08SAlex Elder if (ret) 28693b5cf2a2SAlex Elder rbd_warn(rbd_dev, "header refresh error (%d)\n", ret); 2870b8d70035SAlex Elder 287120e0af67SJosh Durgin rbd_obj_notify_ack_sync(rbd_dev, notify_id); 2872b8d70035SAlex Elder } 2873b8d70035SAlex Elder 28749969ebc5SAlex Elder /* 28759969ebc5SAlex Elder * Request sync osd watch/unwatch. The value of "start" determines 28769969ebc5SAlex Elder * whether a watch request is being initiated or torn down. 
28779969ebc5SAlex Elder */ 2878fca27065SIlya Dryomov static int __rbd_dev_header_watch_sync(struct rbd_device *rbd_dev, bool start) 28799969ebc5SAlex Elder { 28809969ebc5SAlex Elder struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 28819969ebc5SAlex Elder struct rbd_obj_request *obj_request; 28829969ebc5SAlex Elder int ret; 28839969ebc5SAlex Elder 28849969ebc5SAlex Elder rbd_assert(start ^ !!rbd_dev->watch_event); 28859969ebc5SAlex Elder rbd_assert(start ^ !!rbd_dev->watch_request); 28869969ebc5SAlex Elder 28879969ebc5SAlex Elder if (start) { 28883c663bbdSAlex Elder ret = ceph_osdc_create_event(osdc, rbd_watch_cb, rbd_dev, 28899969ebc5SAlex Elder &rbd_dev->watch_event); 28909969ebc5SAlex Elder if (ret < 0) 28919969ebc5SAlex Elder return ret; 28928eb87565SAlex Elder rbd_assert(rbd_dev->watch_event != NULL); 28939969ebc5SAlex Elder } 28949969ebc5SAlex Elder 28959969ebc5SAlex Elder ret = -ENOMEM; 28969969ebc5SAlex Elder obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0, 28979969ebc5SAlex Elder OBJ_REQUEST_NODATA); 28989969ebc5SAlex Elder if (!obj_request) 28999969ebc5SAlex Elder goto out_cancel; 29009969ebc5SAlex Elder 2901deb236b3SIlya Dryomov obj_request->osd_req = rbd_osd_req_create(rbd_dev, true, 1, 2902deb236b3SIlya Dryomov obj_request); 2903430c28c3SAlex Elder if (!obj_request->osd_req) 2904430c28c3SAlex Elder goto out_cancel; 2905430c28c3SAlex Elder 29068eb87565SAlex Elder if (start) 2907975241afSAlex Elder ceph_osdc_set_request_linger(osdc, obj_request->osd_req); 29088eb87565SAlex Elder else 29096977c3f9SAlex Elder ceph_osdc_unregister_linger_request(osdc, 2910975241afSAlex Elder rbd_dev->watch_request->osd_req); 29112169238dSAlex Elder 29122169238dSAlex Elder osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_WATCH, 29131f3ef788SAlex Elder rbd_dev->watch_event->cookie, 0, start ? 
1 : 0); 29149d4df01fSAlex Elder rbd_osd_req_format_write(obj_request); 29152169238dSAlex Elder 29169969ebc5SAlex Elder ret = rbd_obj_request_submit(osdc, obj_request); 29179969ebc5SAlex Elder if (ret) 29189969ebc5SAlex Elder goto out_cancel; 29199969ebc5SAlex Elder ret = rbd_obj_request_wait(obj_request); 29209969ebc5SAlex Elder if (ret) 29219969ebc5SAlex Elder goto out_cancel; 29229969ebc5SAlex Elder ret = obj_request->result; 29239969ebc5SAlex Elder if (ret) 29249969ebc5SAlex Elder goto out_cancel; 29259969ebc5SAlex Elder 29268eb87565SAlex Elder /* 29278eb87565SAlex Elder * A watch request is set to linger, so the underlying osd 29288eb87565SAlex Elder * request won't go away until we unregister it. We retain 29298eb87565SAlex Elder * a pointer to the object request during that time (in 29308eb87565SAlex Elder * rbd_dev->watch_request), so we'll keep a reference to 29318eb87565SAlex Elder * it. We'll drop that reference (below) after we've 29328eb87565SAlex Elder * unregistered it. 29338eb87565SAlex Elder */ 29348eb87565SAlex Elder if (start) { 29358eb87565SAlex Elder rbd_dev->watch_request = obj_request; 29368eb87565SAlex Elder 29378eb87565SAlex Elder return 0; 29388eb87565SAlex Elder } 29398eb87565SAlex Elder 29408eb87565SAlex Elder /* We have successfully torn down the watch request */ 29418eb87565SAlex Elder 29428eb87565SAlex Elder rbd_obj_request_put(rbd_dev->watch_request); 29438eb87565SAlex Elder rbd_dev->watch_request = NULL; 29449969ebc5SAlex Elder out_cancel: 29459969ebc5SAlex Elder /* Cancel the event if we're tearing down, or on error */ 29469969ebc5SAlex Elder ceph_osdc_cancel_event(rbd_dev->watch_event); 29479969ebc5SAlex Elder rbd_dev->watch_event = NULL; 29489969ebc5SAlex Elder if (obj_request) 29499969ebc5SAlex Elder rbd_obj_request_put(obj_request); 29509969ebc5SAlex Elder 29519969ebc5SAlex Elder return ret; 29529969ebc5SAlex Elder } 29539969ebc5SAlex Elder 2954fca27065SIlya Dryomov static int rbd_dev_header_watch_sync(struct rbd_device 
*rbd_dev) 2955fca27065SIlya Dryomov { 2956fca27065SIlya Dryomov return __rbd_dev_header_watch_sync(rbd_dev, true); 2957fca27065SIlya Dryomov } 2958fca27065SIlya Dryomov 2959fca27065SIlya Dryomov static void rbd_dev_header_unwatch_sync(struct rbd_device *rbd_dev) 2960fca27065SIlya Dryomov { 2961fca27065SIlya Dryomov int ret; 2962fca27065SIlya Dryomov 2963fca27065SIlya Dryomov ret = __rbd_dev_header_watch_sync(rbd_dev, false); 2964fca27065SIlya Dryomov if (ret) { 2965fca27065SIlya Dryomov rbd_warn(rbd_dev, "unable to tear down watch request: %d\n", 2966fca27065SIlya Dryomov ret); 2967fca27065SIlya Dryomov } 2968fca27065SIlya Dryomov } 2969fca27065SIlya Dryomov 297036be9a76SAlex Elder /* 2971f40eb349SAlex Elder * Synchronous osd object method call. Returns the number of bytes 2972f40eb349SAlex Elder * returned in the outbound buffer, or a negative error code. 297336be9a76SAlex Elder */ 297436be9a76SAlex Elder static int rbd_obj_method_sync(struct rbd_device *rbd_dev, 297536be9a76SAlex Elder const char *object_name, 297636be9a76SAlex Elder const char *class_name, 297736be9a76SAlex Elder const char *method_name, 29784157976bSAlex Elder const void *outbound, 297936be9a76SAlex Elder size_t outbound_size, 29804157976bSAlex Elder void *inbound, 2981e2a58ee5SAlex Elder size_t inbound_size) 298236be9a76SAlex Elder { 29832169238dSAlex Elder struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 298436be9a76SAlex Elder struct rbd_obj_request *obj_request; 298536be9a76SAlex Elder struct page **pages; 298636be9a76SAlex Elder u32 page_count; 298736be9a76SAlex Elder int ret; 298836be9a76SAlex Elder 298936be9a76SAlex Elder /* 29906010a451SAlex Elder * Method calls are ultimately read operations. The result 29916010a451SAlex Elder * should placed into the inbound buffer provided. They 29926010a451SAlex Elder * also supply outbound data--parameters for the object 29936010a451SAlex Elder * method. 
Currently if this is present it will be a 29946010a451SAlex Elder * snapshot id. 299536be9a76SAlex Elder */ 299636be9a76SAlex Elder page_count = (u32)calc_pages_for(0, inbound_size); 299736be9a76SAlex Elder pages = ceph_alloc_page_vector(page_count, GFP_KERNEL); 299836be9a76SAlex Elder if (IS_ERR(pages)) 299936be9a76SAlex Elder return PTR_ERR(pages); 300036be9a76SAlex Elder 300136be9a76SAlex Elder ret = -ENOMEM; 30026010a451SAlex Elder obj_request = rbd_obj_request_create(object_name, 0, inbound_size, 300336be9a76SAlex Elder OBJ_REQUEST_PAGES); 300436be9a76SAlex Elder if (!obj_request) 300536be9a76SAlex Elder goto out; 300636be9a76SAlex Elder 300736be9a76SAlex Elder obj_request->pages = pages; 300836be9a76SAlex Elder obj_request->page_count = page_count; 300936be9a76SAlex Elder 3010deb236b3SIlya Dryomov obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, 1, 3011deb236b3SIlya Dryomov obj_request); 301236be9a76SAlex Elder if (!obj_request->osd_req) 301336be9a76SAlex Elder goto out; 301436be9a76SAlex Elder 3015c99d2d4aSAlex Elder osd_req_op_cls_init(obj_request->osd_req, 0, CEPH_OSD_OP_CALL, 301604017e29SAlex Elder class_name, method_name); 301704017e29SAlex Elder if (outbound_size) { 301804017e29SAlex Elder struct ceph_pagelist *pagelist; 301904017e29SAlex Elder 302004017e29SAlex Elder pagelist = kmalloc(sizeof (*pagelist), GFP_NOFS); 302104017e29SAlex Elder if (!pagelist) 302204017e29SAlex Elder goto out; 302304017e29SAlex Elder 302404017e29SAlex Elder ceph_pagelist_init(pagelist); 302504017e29SAlex Elder ceph_pagelist_append(pagelist, outbound, outbound_size); 302604017e29SAlex Elder osd_req_op_cls_request_data_pagelist(obj_request->osd_req, 0, 302704017e29SAlex Elder pagelist); 302804017e29SAlex Elder } 3029a4ce40a9SAlex Elder osd_req_op_cls_response_data_pages(obj_request->osd_req, 0, 3030a4ce40a9SAlex Elder obj_request->pages, inbound_size, 303144cd188dSAlex Elder 0, false, false); 30329d4df01fSAlex Elder rbd_osd_req_format_read(obj_request); 
3033430c28c3SAlex Elder 303436be9a76SAlex Elder ret = rbd_obj_request_submit(osdc, obj_request); 303536be9a76SAlex Elder if (ret) 303636be9a76SAlex Elder goto out; 303736be9a76SAlex Elder ret = rbd_obj_request_wait(obj_request); 303836be9a76SAlex Elder if (ret) 303936be9a76SAlex Elder goto out; 304036be9a76SAlex Elder 304136be9a76SAlex Elder ret = obj_request->result; 304236be9a76SAlex Elder if (ret < 0) 304336be9a76SAlex Elder goto out; 304457385b51SAlex Elder 304557385b51SAlex Elder rbd_assert(obj_request->xferred < (u64)INT_MAX); 304657385b51SAlex Elder ret = (int)obj_request->xferred; 3047903bb32eSAlex Elder ceph_copy_from_page_vector(pages, inbound, 0, obj_request->xferred); 304836be9a76SAlex Elder out: 304936be9a76SAlex Elder if (obj_request) 305036be9a76SAlex Elder rbd_obj_request_put(obj_request); 305136be9a76SAlex Elder else 305236be9a76SAlex Elder ceph_release_page_vector(pages, page_count); 305336be9a76SAlex Elder 305436be9a76SAlex Elder return ret; 305536be9a76SAlex Elder } 305636be9a76SAlex Elder 3057bf0d5f50SAlex Elder static void rbd_request_fn(struct request_queue *q) 3058cc344fa1SAlex Elder __releases(q->queue_lock) __acquires(q->queue_lock) 3059bf0d5f50SAlex Elder { 3060bf0d5f50SAlex Elder struct rbd_device *rbd_dev = q->queuedata; 3061bf0d5f50SAlex Elder bool read_only = rbd_dev->mapping.read_only; 3062bf0d5f50SAlex Elder struct request *rq; 3063bf0d5f50SAlex Elder int result; 3064bf0d5f50SAlex Elder 3065bf0d5f50SAlex Elder while ((rq = blk_fetch_request(q))) { 3066bf0d5f50SAlex Elder bool write_request = rq_data_dir(rq) == WRITE; 3067bf0d5f50SAlex Elder struct rbd_img_request *img_request; 3068bf0d5f50SAlex Elder u64 offset; 3069bf0d5f50SAlex Elder u64 length; 3070bf0d5f50SAlex Elder 3071bf0d5f50SAlex Elder /* Ignore any non-FS requests that filter through. 
*/ 3072bf0d5f50SAlex Elder 3073bf0d5f50SAlex Elder if (rq->cmd_type != REQ_TYPE_FS) { 30744dda41d3SAlex Elder dout("%s: non-fs request type %d\n", __func__, 30754dda41d3SAlex Elder (int) rq->cmd_type); 30764dda41d3SAlex Elder __blk_end_request_all(rq, 0); 30774dda41d3SAlex Elder continue; 30784dda41d3SAlex Elder } 30794dda41d3SAlex Elder 30804dda41d3SAlex Elder /* Ignore/skip any zero-length requests */ 30814dda41d3SAlex Elder 30824dda41d3SAlex Elder offset = (u64) blk_rq_pos(rq) << SECTOR_SHIFT; 30834dda41d3SAlex Elder length = (u64) blk_rq_bytes(rq); 30844dda41d3SAlex Elder 30854dda41d3SAlex Elder if (!length) { 30864dda41d3SAlex Elder dout("%s: zero-length request\n", __func__); 3087bf0d5f50SAlex Elder __blk_end_request_all(rq, 0); 3088bf0d5f50SAlex Elder continue; 3089bf0d5f50SAlex Elder } 3090bf0d5f50SAlex Elder 3091bf0d5f50SAlex Elder spin_unlock_irq(q->queue_lock); 3092bf0d5f50SAlex Elder 3093bf0d5f50SAlex Elder /* Disallow writes to a read-only device */ 3094bf0d5f50SAlex Elder 3095bf0d5f50SAlex Elder if (write_request) { 3096bf0d5f50SAlex Elder result = -EROFS; 3097bf0d5f50SAlex Elder if (read_only) 3098bf0d5f50SAlex Elder goto end_request; 3099bf0d5f50SAlex Elder rbd_assert(rbd_dev->spec->snap_id == CEPH_NOSNAP); 3100bf0d5f50SAlex Elder } 3101bf0d5f50SAlex Elder 31026d292906SAlex Elder /* 31036d292906SAlex Elder * Quit early if the mapped snapshot no longer 31046d292906SAlex Elder * exists. It's still possible the snapshot will 31056d292906SAlex Elder * have disappeared by the time our request arrives 31066d292906SAlex Elder * at the osd, but there's no sense in sending it if 31076d292906SAlex Elder * we already know. 
31086d292906SAlex Elder */ 31096d292906SAlex Elder if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) { 3110bf0d5f50SAlex Elder dout("request for non-existent snapshot"); 3111bf0d5f50SAlex Elder rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP); 3112bf0d5f50SAlex Elder result = -ENXIO; 3113bf0d5f50SAlex Elder goto end_request; 3114bf0d5f50SAlex Elder } 3115bf0d5f50SAlex Elder 3116bf0d5f50SAlex Elder result = -EINVAL; 3117c0cd10dbSAlex Elder if (offset && length > U64_MAX - offset + 1) { 3118c0cd10dbSAlex Elder rbd_warn(rbd_dev, "bad request range (%llu~%llu)\n", 3119c0cd10dbSAlex Elder offset, length); 3120bf0d5f50SAlex Elder goto end_request; /* Shouldn't happen */ 3121c0cd10dbSAlex Elder } 3122bf0d5f50SAlex Elder 312300a653e2SAlex Elder result = -EIO; 312400a653e2SAlex Elder if (offset + length > rbd_dev->mapping.size) { 312500a653e2SAlex Elder rbd_warn(rbd_dev, "beyond EOD (%llu~%llu > %llu)\n", 312600a653e2SAlex Elder offset, length, rbd_dev->mapping.size); 312700a653e2SAlex Elder goto end_request; 312800a653e2SAlex Elder } 312900a653e2SAlex Elder 3130bf0d5f50SAlex Elder result = -ENOMEM; 3131bf0d5f50SAlex Elder img_request = rbd_img_request_create(rbd_dev, offset, length, 3132e93f3152SAlex Elder write_request); 3133bf0d5f50SAlex Elder if (!img_request) 3134bf0d5f50SAlex Elder goto end_request; 3135bf0d5f50SAlex Elder 3136bf0d5f50SAlex Elder img_request->rq = rq; 3137bf0d5f50SAlex Elder 3138f1a4739fSAlex Elder result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO, 3139f1a4739fSAlex Elder rq->bio); 3140bf0d5f50SAlex Elder if (!result) 3141bf0d5f50SAlex Elder result = rbd_img_request_submit(img_request); 3142bf0d5f50SAlex Elder if (result) 3143bf0d5f50SAlex Elder rbd_img_request_put(img_request); 3144bf0d5f50SAlex Elder end_request: 3145bf0d5f50SAlex Elder spin_lock_irq(q->queue_lock); 3146bf0d5f50SAlex Elder if (result < 0) { 31477da22d29SAlex Elder rbd_warn(rbd_dev, "%s %llx at %llx result %d\n", 31487da22d29SAlex Elder write_request ? 
"write" : "read", 31497da22d29SAlex Elder length, offset, result); 31507da22d29SAlex Elder 3151bf0d5f50SAlex Elder __blk_end_request_all(rq, result); 3152bf0d5f50SAlex Elder } 3153bf0d5f50SAlex Elder } 3154bf0d5f50SAlex Elder } 3155bf0d5f50SAlex Elder 3156602adf40SYehuda Sadeh /* 3157602adf40SYehuda Sadeh * a queue callback. Makes sure that we don't create a bio that spans across 3158602adf40SYehuda Sadeh * multiple osd objects. One exception would be with a single page bios, 3159f7760dadSAlex Elder * which we handle later at bio_chain_clone_range() 3160602adf40SYehuda Sadeh */ 3161602adf40SYehuda Sadeh static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd, 3162602adf40SYehuda Sadeh struct bio_vec *bvec) 3163602adf40SYehuda Sadeh { 3164602adf40SYehuda Sadeh struct rbd_device *rbd_dev = q->queuedata; 3165e5cfeed2SAlex Elder sector_t sector_offset; 3166e5cfeed2SAlex Elder sector_t sectors_per_obj; 3167e5cfeed2SAlex Elder sector_t obj_sector_offset; 3168e5cfeed2SAlex Elder int ret; 3169602adf40SYehuda Sadeh 3170e5cfeed2SAlex Elder /* 3171e5cfeed2SAlex Elder * Find how far into its rbd object the partition-relative 3172e5cfeed2SAlex Elder * bio start sector is to offset relative to the enclosing 3173e5cfeed2SAlex Elder * device. 3174e5cfeed2SAlex Elder */ 3175e5cfeed2SAlex Elder sector_offset = get_start_sect(bmd->bi_bdev) + bmd->bi_sector; 3176e5cfeed2SAlex Elder sectors_per_obj = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT); 3177e5cfeed2SAlex Elder obj_sector_offset = sector_offset & (sectors_per_obj - 1); 3178593a9e7bSAlex Elder 3179e5cfeed2SAlex Elder /* 3180e5cfeed2SAlex Elder * Compute the number of bytes from that offset to the end 3181e5cfeed2SAlex Elder * of the object. Account for what's already used by the bio. 
3182e5cfeed2SAlex Elder */ 3183e5cfeed2SAlex Elder ret = (int) (sectors_per_obj - obj_sector_offset) << SECTOR_SHIFT; 3184e5cfeed2SAlex Elder if (ret > bmd->bi_size) 3185e5cfeed2SAlex Elder ret -= bmd->bi_size; 3186e5cfeed2SAlex Elder else 3187e5cfeed2SAlex Elder ret = 0; 3188e5cfeed2SAlex Elder 3189e5cfeed2SAlex Elder /* 3190e5cfeed2SAlex Elder * Don't send back more than was asked for. And if the bio 3191e5cfeed2SAlex Elder * was empty, let the whole thing through because: "Note 3192e5cfeed2SAlex Elder * that a block device *must* allow a single page to be 3193e5cfeed2SAlex Elder * added to an empty bio." 3194e5cfeed2SAlex Elder */ 3195e5cfeed2SAlex Elder rbd_assert(bvec->bv_len <= PAGE_SIZE); 3196e5cfeed2SAlex Elder if (ret > (int) bvec->bv_len || !bmd->bi_size) 3197e5cfeed2SAlex Elder ret = (int) bvec->bv_len; 3198e5cfeed2SAlex Elder 3199e5cfeed2SAlex Elder return ret; 3200602adf40SYehuda Sadeh } 3201602adf40SYehuda Sadeh 3202602adf40SYehuda Sadeh static void rbd_free_disk(struct rbd_device *rbd_dev) 3203602adf40SYehuda Sadeh { 3204602adf40SYehuda Sadeh struct gendisk *disk = rbd_dev->disk; 3205602adf40SYehuda Sadeh 3206602adf40SYehuda Sadeh if (!disk) 3207602adf40SYehuda Sadeh return; 3208602adf40SYehuda Sadeh 3209a0cab924SAlex Elder rbd_dev->disk = NULL; 3210a0cab924SAlex Elder if (disk->flags & GENHD_FL_UP) { 3211602adf40SYehuda Sadeh del_gendisk(disk); 3212602adf40SYehuda Sadeh if (disk->queue) 3213602adf40SYehuda Sadeh blk_cleanup_queue(disk->queue); 3214a0cab924SAlex Elder } 3215602adf40SYehuda Sadeh put_disk(disk); 3216602adf40SYehuda Sadeh } 3217602adf40SYehuda Sadeh 3218788e2df3SAlex Elder static int rbd_obj_read_sync(struct rbd_device *rbd_dev, 3219788e2df3SAlex Elder const char *object_name, 32207097f8dfSAlex Elder u64 offset, u64 length, void *buf) 3221788e2df3SAlex Elder 3222788e2df3SAlex Elder { 32232169238dSAlex Elder struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 3224788e2df3SAlex Elder struct rbd_obj_request *obj_request; 
3225788e2df3SAlex Elder struct page **pages = NULL; 3226788e2df3SAlex Elder u32 page_count; 32271ceae7efSAlex Elder size_t size; 3228788e2df3SAlex Elder int ret; 3229788e2df3SAlex Elder 3230788e2df3SAlex Elder page_count = (u32) calc_pages_for(offset, length); 3231788e2df3SAlex Elder pages = ceph_alloc_page_vector(page_count, GFP_KERNEL); 3232788e2df3SAlex Elder if (IS_ERR(pages)) 3233788e2df3SAlex Elder ret = PTR_ERR(pages); 3234788e2df3SAlex Elder 3235788e2df3SAlex Elder ret = -ENOMEM; 3236788e2df3SAlex Elder obj_request = rbd_obj_request_create(object_name, offset, length, 3237788e2df3SAlex Elder OBJ_REQUEST_PAGES); 3238788e2df3SAlex Elder if (!obj_request) 3239788e2df3SAlex Elder goto out; 3240788e2df3SAlex Elder 3241788e2df3SAlex Elder obj_request->pages = pages; 3242788e2df3SAlex Elder obj_request->page_count = page_count; 3243788e2df3SAlex Elder 3244deb236b3SIlya Dryomov obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, 1, 3245deb236b3SIlya Dryomov obj_request); 3246788e2df3SAlex Elder if (!obj_request->osd_req) 3247788e2df3SAlex Elder goto out; 3248788e2df3SAlex Elder 3249c99d2d4aSAlex Elder osd_req_op_extent_init(obj_request->osd_req, 0, CEPH_OSD_OP_READ, 3250c99d2d4aSAlex Elder offset, length, 0, 0); 3251406e2c9fSAlex Elder osd_req_op_extent_osd_data_pages(obj_request->osd_req, 0, 3252a4ce40a9SAlex Elder obj_request->pages, 325344cd188dSAlex Elder obj_request->length, 325444cd188dSAlex Elder obj_request->offset & ~PAGE_MASK, 325544cd188dSAlex Elder false, false); 32569d4df01fSAlex Elder rbd_osd_req_format_read(obj_request); 3257430c28c3SAlex Elder 3258788e2df3SAlex Elder ret = rbd_obj_request_submit(osdc, obj_request); 3259788e2df3SAlex Elder if (ret) 3260788e2df3SAlex Elder goto out; 3261788e2df3SAlex Elder ret = rbd_obj_request_wait(obj_request); 3262788e2df3SAlex Elder if (ret) 3263788e2df3SAlex Elder goto out; 3264788e2df3SAlex Elder 3265788e2df3SAlex Elder ret = obj_request->result; 3266788e2df3SAlex Elder if (ret < 0) 3267788e2df3SAlex Elder 
goto out; 32681ceae7efSAlex Elder 32691ceae7efSAlex Elder rbd_assert(obj_request->xferred <= (u64) SIZE_MAX); 32701ceae7efSAlex Elder size = (size_t) obj_request->xferred; 3271903bb32eSAlex Elder ceph_copy_from_page_vector(pages, buf, 0, size); 327223ed6e13SAlex Elder rbd_assert(size <= (size_t)INT_MAX); 327323ed6e13SAlex Elder ret = (int)size; 3274788e2df3SAlex Elder out: 3275788e2df3SAlex Elder if (obj_request) 3276788e2df3SAlex Elder rbd_obj_request_put(obj_request); 3277788e2df3SAlex Elder else 3278788e2df3SAlex Elder ceph_release_page_vector(pages, page_count); 3279788e2df3SAlex Elder 3280788e2df3SAlex Elder return ret; 3281788e2df3SAlex Elder } 3282788e2df3SAlex Elder 3283602adf40SYehuda Sadeh /* 3284662518b1SAlex Elder * Read the complete header for the given rbd device. On successful 3285662518b1SAlex Elder * return, the rbd_dev->header field will contain up-to-date 3286662518b1SAlex Elder * information about the image. 32874156d998SAlex Elder */ 328899a41ebcSAlex Elder static int rbd_dev_v1_header_info(struct rbd_device *rbd_dev) 32894156d998SAlex Elder { 32904156d998SAlex Elder struct rbd_image_header_ondisk *ondisk = NULL; 32914156d998SAlex Elder u32 snap_count = 0; 32924156d998SAlex Elder u64 names_size = 0; 32934156d998SAlex Elder u32 want_count; 32944156d998SAlex Elder int ret; 32954156d998SAlex Elder 32964156d998SAlex Elder /* 32974156d998SAlex Elder * The complete header will include an array of its 64-bit 32984156d998SAlex Elder * snapshot ids, followed by the names of those snapshots as 32994156d998SAlex Elder * a contiguous block of NUL-terminated strings. Note that 33004156d998SAlex Elder * the number of snapshots could change by the time we read 33014156d998SAlex Elder * it in, in which case we re-read it. 
33024156d998SAlex Elder */ 33034156d998SAlex Elder do { 33044156d998SAlex Elder size_t size; 33054156d998SAlex Elder 33064156d998SAlex Elder kfree(ondisk); 33074156d998SAlex Elder 33084156d998SAlex Elder size = sizeof (*ondisk); 33094156d998SAlex Elder size += snap_count * sizeof (struct rbd_image_snap_ondisk); 33104156d998SAlex Elder size += names_size; 33114156d998SAlex Elder ondisk = kmalloc(size, GFP_KERNEL); 33124156d998SAlex Elder if (!ondisk) 3313662518b1SAlex Elder return -ENOMEM; 33144156d998SAlex Elder 3315788e2df3SAlex Elder ret = rbd_obj_read_sync(rbd_dev, rbd_dev->header_name, 33167097f8dfSAlex Elder 0, size, ondisk); 33174156d998SAlex Elder if (ret < 0) 3318662518b1SAlex Elder goto out; 3319c0cd10dbSAlex Elder if ((size_t)ret < size) { 33204156d998SAlex Elder ret = -ENXIO; 332106ecc6cbSAlex Elder rbd_warn(rbd_dev, "short header read (want %zd got %d)", 332206ecc6cbSAlex Elder size, ret); 3323662518b1SAlex Elder goto out; 33244156d998SAlex Elder } 33254156d998SAlex Elder if (!rbd_dev_ondisk_valid(ondisk)) { 33264156d998SAlex Elder ret = -ENXIO; 332706ecc6cbSAlex Elder rbd_warn(rbd_dev, "invalid header"); 3328662518b1SAlex Elder goto out; 33294156d998SAlex Elder } 33304156d998SAlex Elder 33314156d998SAlex Elder names_size = le64_to_cpu(ondisk->snap_names_len); 33324156d998SAlex Elder want_count = snap_count; 33334156d998SAlex Elder snap_count = le32_to_cpu(ondisk->snap_count); 33344156d998SAlex Elder } while (snap_count != want_count); 33354156d998SAlex Elder 3336662518b1SAlex Elder ret = rbd_header_from_disk(rbd_dev, ondisk); 3337662518b1SAlex Elder out: 33384156d998SAlex Elder kfree(ondisk); 33394156d998SAlex Elder 3340dfc5606dSYehuda Sadeh return ret; 3341602adf40SYehuda Sadeh } 3342602adf40SYehuda Sadeh 334315228edeSAlex Elder /* 334415228edeSAlex Elder * Clear the rbd device's EXISTS flag if the snapshot it's mapped to 334515228edeSAlex Elder * has disappeared from the (just updated) snapshot context. 
334615228edeSAlex Elder */ 334715228edeSAlex Elder static void rbd_exists_validate(struct rbd_device *rbd_dev) 334815228edeSAlex Elder { 334915228edeSAlex Elder u64 snap_id; 335015228edeSAlex Elder 335115228edeSAlex Elder if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) 335215228edeSAlex Elder return; 335315228edeSAlex Elder 335415228edeSAlex Elder snap_id = rbd_dev->spec->snap_id; 335515228edeSAlex Elder if (snap_id == CEPH_NOSNAP) 335615228edeSAlex Elder return; 335715228edeSAlex Elder 335815228edeSAlex Elder if (rbd_dev_snap_index(rbd_dev, snap_id) == BAD_SNAP_INDEX) 335915228edeSAlex Elder clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags); 336015228edeSAlex Elder } 336115228edeSAlex Elder 33629875201eSJosh Durgin static void rbd_dev_update_size(struct rbd_device *rbd_dev) 33639875201eSJosh Durgin { 33649875201eSJosh Durgin sector_t size; 33659875201eSJosh Durgin bool removing; 33669875201eSJosh Durgin 33679875201eSJosh Durgin /* 33689875201eSJosh Durgin * Don't hold the lock while doing disk operations, 33699875201eSJosh Durgin * or lock ordering will conflict with the bdev mutex via: 33709875201eSJosh Durgin * rbd_add() -> blkdev_get() -> rbd_open() 33719875201eSJosh Durgin */ 33729875201eSJosh Durgin spin_lock_irq(&rbd_dev->lock); 33739875201eSJosh Durgin removing = test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags); 33749875201eSJosh Durgin spin_unlock_irq(&rbd_dev->lock); 33759875201eSJosh Durgin /* 33769875201eSJosh Durgin * If the device is being removed, rbd_dev->disk has 33779875201eSJosh Durgin * been destroyed, so don't try to update its size 33789875201eSJosh Durgin */ 33799875201eSJosh Durgin if (!removing) { 33809875201eSJosh Durgin size = (sector_t)rbd_dev->mapping.size / SECTOR_SIZE; 33819875201eSJosh Durgin dout("setting size to %llu sectors", (unsigned long long)size); 33829875201eSJosh Durgin set_capacity(rbd_dev->disk, size); 33839875201eSJosh Durgin revalidate_disk(rbd_dev->disk); 33849875201eSJosh Durgin } 33859875201eSJosh Durgin } 
33869875201eSJosh Durgin 3387cc4a38bdSAlex Elder static int rbd_dev_refresh(struct rbd_device *rbd_dev) 33881fe5e993SAlex Elder { 3389e627db08SAlex Elder u64 mapping_size; 33901fe5e993SAlex Elder int ret; 33911fe5e993SAlex Elder 3392117973fbSAlex Elder rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); 3393cfbf6377SAlex Elder down_write(&rbd_dev->header_rwsem); 33943b5cf2a2SAlex Elder mapping_size = rbd_dev->mapping.size; 3395117973fbSAlex Elder if (rbd_dev->image_format == 1) 339699a41ebcSAlex Elder ret = rbd_dev_v1_header_info(rbd_dev); 3397117973fbSAlex Elder else 33982df3fac7SAlex Elder ret = rbd_dev_v2_header_info(rbd_dev); 339915228edeSAlex Elder 340015228edeSAlex Elder /* If it's a mapped snapshot, validate its EXISTS flag */ 340115228edeSAlex Elder 340215228edeSAlex Elder rbd_exists_validate(rbd_dev); 3403cfbf6377SAlex Elder up_write(&rbd_dev->header_rwsem); 3404cfbf6377SAlex Elder 340500a653e2SAlex Elder if (mapping_size != rbd_dev->mapping.size) { 34069875201eSJosh Durgin rbd_dev_update_size(rbd_dev); 340700a653e2SAlex Elder } 34081fe5e993SAlex Elder 34091fe5e993SAlex Elder return ret; 34101fe5e993SAlex Elder } 34111fe5e993SAlex Elder 3412602adf40SYehuda Sadeh static int rbd_init_disk(struct rbd_device *rbd_dev) 3413602adf40SYehuda Sadeh { 3414602adf40SYehuda Sadeh struct gendisk *disk; 3415602adf40SYehuda Sadeh struct request_queue *q; 3416593a9e7bSAlex Elder u64 segment_size; 3417602adf40SYehuda Sadeh 3418602adf40SYehuda Sadeh /* create gendisk info */ 34197e513d43SIlya Dryomov disk = alloc_disk(single_major ? 
34207e513d43SIlya Dryomov (1 << RBD_SINGLE_MAJOR_PART_SHIFT) : 34217e513d43SIlya Dryomov RBD_MINORS_PER_MAJOR); 3422602adf40SYehuda Sadeh if (!disk) 34231fcdb8aaSAlex Elder return -ENOMEM; 3424602adf40SYehuda Sadeh 3425f0f8cef5SAlex Elder snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d", 3426de71a297SAlex Elder rbd_dev->dev_id); 3427602adf40SYehuda Sadeh disk->major = rbd_dev->major; 3428dd82fff1SIlya Dryomov disk->first_minor = rbd_dev->minor; 34297e513d43SIlya Dryomov if (single_major) 34307e513d43SIlya Dryomov disk->flags |= GENHD_FL_EXT_DEVT; 3431602adf40SYehuda Sadeh disk->fops = &rbd_bd_ops; 3432602adf40SYehuda Sadeh disk->private_data = rbd_dev; 3433602adf40SYehuda Sadeh 3434bf0d5f50SAlex Elder q = blk_init_queue(rbd_request_fn, &rbd_dev->lock); 3435602adf40SYehuda Sadeh if (!q) 3436602adf40SYehuda Sadeh goto out_disk; 3437029bcbd8SJosh Durgin 3438593a9e7bSAlex Elder /* We use the default size, but let's be explicit about it. */ 3439593a9e7bSAlex Elder blk_queue_physical_block_size(q, SECTOR_SIZE); 3440593a9e7bSAlex Elder 3441029bcbd8SJosh Durgin /* set io sizes to object size */ 3442593a9e7bSAlex Elder segment_size = rbd_obj_bytes(&rbd_dev->header); 3443593a9e7bSAlex Elder blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE); 3444593a9e7bSAlex Elder blk_queue_max_segment_size(q, segment_size); 3445593a9e7bSAlex Elder blk_queue_io_min(q, segment_size); 3446593a9e7bSAlex Elder blk_queue_io_opt(q, segment_size); 3447029bcbd8SJosh Durgin 3448602adf40SYehuda Sadeh blk_queue_merge_bvec(q, rbd_merge_bvec); 3449602adf40SYehuda Sadeh disk->queue = q; 3450602adf40SYehuda Sadeh 3451602adf40SYehuda Sadeh q->queuedata = rbd_dev; 3452602adf40SYehuda Sadeh 3453602adf40SYehuda Sadeh rbd_dev->disk = disk; 3454602adf40SYehuda Sadeh 3455602adf40SYehuda Sadeh return 0; 3456602adf40SYehuda Sadeh out_disk: 3457602adf40SYehuda Sadeh put_disk(disk); 34581fcdb8aaSAlex Elder 34591fcdb8aaSAlex Elder return -ENOMEM; 3460602adf40SYehuda Sadeh } 
3461602adf40SYehuda Sadeh 3462dfc5606dSYehuda Sadeh /* 3463dfc5606dSYehuda Sadeh sysfs 3464dfc5606dSYehuda Sadeh */ 3465602adf40SYehuda Sadeh 3466593a9e7bSAlex Elder static struct rbd_device *dev_to_rbd_dev(struct device *dev) 3467593a9e7bSAlex Elder { 3468593a9e7bSAlex Elder return container_of(dev, struct rbd_device, dev); 3469593a9e7bSAlex Elder } 3470593a9e7bSAlex Elder 3471dfc5606dSYehuda Sadeh static ssize_t rbd_size_show(struct device *dev, 3472dfc5606dSYehuda Sadeh struct device_attribute *attr, char *buf) 3473602adf40SYehuda Sadeh { 3474593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 3475dfc5606dSYehuda Sadeh 3476fc71d833SAlex Elder return sprintf(buf, "%llu\n", 3477fc71d833SAlex Elder (unsigned long long)rbd_dev->mapping.size); 3478602adf40SYehuda Sadeh } 3479602adf40SYehuda Sadeh 348034b13184SAlex Elder /* 348134b13184SAlex Elder * Note this shows the features for whatever's mapped, which is not 348234b13184SAlex Elder * necessarily the base image. 348334b13184SAlex Elder */ 348434b13184SAlex Elder static ssize_t rbd_features_show(struct device *dev, 348534b13184SAlex Elder struct device_attribute *attr, char *buf) 348634b13184SAlex Elder { 348734b13184SAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 348834b13184SAlex Elder 348934b13184SAlex Elder return sprintf(buf, "0x%016llx\n", 349034b13184SAlex Elder (unsigned long long)rbd_dev->mapping.features); 349134b13184SAlex Elder } 349234b13184SAlex Elder 3493dfc5606dSYehuda Sadeh static ssize_t rbd_major_show(struct device *dev, 3494dfc5606dSYehuda Sadeh struct device_attribute *attr, char *buf) 3495602adf40SYehuda Sadeh { 3496593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 3497dfc5606dSYehuda Sadeh 3498fc71d833SAlex Elder if (rbd_dev->major) 3499dfc5606dSYehuda Sadeh return sprintf(buf, "%d\n", rbd_dev->major); 3500fc71d833SAlex Elder 3501fc71d833SAlex Elder return sprintf(buf, "(none)\n"); 3502dd82fff1SIlya Dryomov } 3503fc71d833SAlex Elder 
3504dd82fff1SIlya Dryomov static ssize_t rbd_minor_show(struct device *dev, 3505dd82fff1SIlya Dryomov struct device_attribute *attr, char *buf) 3506dd82fff1SIlya Dryomov { 3507dd82fff1SIlya Dryomov struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 3508dd82fff1SIlya Dryomov 3509dd82fff1SIlya Dryomov return sprintf(buf, "%d\n", rbd_dev->minor); 3510dfc5606dSYehuda Sadeh } 3511dfc5606dSYehuda Sadeh 3512dfc5606dSYehuda Sadeh static ssize_t rbd_client_id_show(struct device *dev, 3513dfc5606dSYehuda Sadeh struct device_attribute *attr, char *buf) 3514dfc5606dSYehuda Sadeh { 3515593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 3516dfc5606dSYehuda Sadeh 35171dbb4399SAlex Elder return sprintf(buf, "client%lld\n", 35181dbb4399SAlex Elder ceph_client_id(rbd_dev->rbd_client->client)); 3519dfc5606dSYehuda Sadeh } 3520dfc5606dSYehuda Sadeh 3521dfc5606dSYehuda Sadeh static ssize_t rbd_pool_show(struct device *dev, 3522dfc5606dSYehuda Sadeh struct device_attribute *attr, char *buf) 3523dfc5606dSYehuda Sadeh { 3524593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 3525dfc5606dSYehuda Sadeh 35260d7dbfceSAlex Elder return sprintf(buf, "%s\n", rbd_dev->spec->pool_name); 3527dfc5606dSYehuda Sadeh } 3528dfc5606dSYehuda Sadeh 35299bb2f334SAlex Elder static ssize_t rbd_pool_id_show(struct device *dev, 35309bb2f334SAlex Elder struct device_attribute *attr, char *buf) 35319bb2f334SAlex Elder { 35329bb2f334SAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 35339bb2f334SAlex Elder 35340d7dbfceSAlex Elder return sprintf(buf, "%llu\n", 35350d7dbfceSAlex Elder (unsigned long long) rbd_dev->spec->pool_id); 35369bb2f334SAlex Elder } 35379bb2f334SAlex Elder 3538dfc5606dSYehuda Sadeh static ssize_t rbd_name_show(struct device *dev, 3539dfc5606dSYehuda Sadeh struct device_attribute *attr, char *buf) 3540dfc5606dSYehuda Sadeh { 3541593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 3542dfc5606dSYehuda Sadeh 3543a92ffdf8SAlex Elder 
if (rbd_dev->spec->image_name) 35440d7dbfceSAlex Elder return sprintf(buf, "%s\n", rbd_dev->spec->image_name); 3545a92ffdf8SAlex Elder 3546a92ffdf8SAlex Elder return sprintf(buf, "(unknown)\n"); 3547dfc5606dSYehuda Sadeh } 3548dfc5606dSYehuda Sadeh 3549589d30e0SAlex Elder static ssize_t rbd_image_id_show(struct device *dev, 3550589d30e0SAlex Elder struct device_attribute *attr, char *buf) 3551589d30e0SAlex Elder { 3552589d30e0SAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 3553589d30e0SAlex Elder 35540d7dbfceSAlex Elder return sprintf(buf, "%s\n", rbd_dev->spec->image_id); 3555589d30e0SAlex Elder } 3556589d30e0SAlex Elder 355734b13184SAlex Elder /* 355834b13184SAlex Elder * Shows the name of the currently-mapped snapshot (or 355934b13184SAlex Elder * RBD_SNAP_HEAD_NAME for the base image). 356034b13184SAlex Elder */ 3561dfc5606dSYehuda Sadeh static ssize_t rbd_snap_show(struct device *dev, 3562dfc5606dSYehuda Sadeh struct device_attribute *attr, 3563dfc5606dSYehuda Sadeh char *buf) 3564dfc5606dSYehuda Sadeh { 3565593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 3566dfc5606dSYehuda Sadeh 35670d7dbfceSAlex Elder return sprintf(buf, "%s\n", rbd_dev->spec->snap_name); 3568dfc5606dSYehuda Sadeh } 3569dfc5606dSYehuda Sadeh 357086b00e0dSAlex Elder /* 357186b00e0dSAlex Elder * For an rbd v2 image, shows the pool id, image id, and snapshot id 357286b00e0dSAlex Elder * for the parent image. If there is no parent, simply shows 357386b00e0dSAlex Elder * "(no parent image)". 
357486b00e0dSAlex Elder */ 357586b00e0dSAlex Elder static ssize_t rbd_parent_show(struct device *dev, 357686b00e0dSAlex Elder struct device_attribute *attr, 357786b00e0dSAlex Elder char *buf) 357886b00e0dSAlex Elder { 357986b00e0dSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 358086b00e0dSAlex Elder struct rbd_spec *spec = rbd_dev->parent_spec; 358186b00e0dSAlex Elder int count; 358286b00e0dSAlex Elder char *bufp = buf; 358386b00e0dSAlex Elder 358486b00e0dSAlex Elder if (!spec) 358586b00e0dSAlex Elder return sprintf(buf, "(no parent image)\n"); 358686b00e0dSAlex Elder 358786b00e0dSAlex Elder count = sprintf(bufp, "pool_id %llu\npool_name %s\n", 358886b00e0dSAlex Elder (unsigned long long) spec->pool_id, spec->pool_name); 358986b00e0dSAlex Elder if (count < 0) 359086b00e0dSAlex Elder return count; 359186b00e0dSAlex Elder bufp += count; 359286b00e0dSAlex Elder 359386b00e0dSAlex Elder count = sprintf(bufp, "image_id %s\nimage_name %s\n", spec->image_id, 359486b00e0dSAlex Elder spec->image_name ? 
spec->image_name : "(unknown)"); 359586b00e0dSAlex Elder if (count < 0) 359686b00e0dSAlex Elder return count; 359786b00e0dSAlex Elder bufp += count; 359886b00e0dSAlex Elder 359986b00e0dSAlex Elder count = sprintf(bufp, "snap_id %llu\nsnap_name %s\n", 360086b00e0dSAlex Elder (unsigned long long) spec->snap_id, spec->snap_name); 360186b00e0dSAlex Elder if (count < 0) 360286b00e0dSAlex Elder return count; 360386b00e0dSAlex Elder bufp += count; 360486b00e0dSAlex Elder 360586b00e0dSAlex Elder count = sprintf(bufp, "overlap %llu\n", rbd_dev->parent_overlap); 360686b00e0dSAlex Elder if (count < 0) 360786b00e0dSAlex Elder return count; 360886b00e0dSAlex Elder bufp += count; 360986b00e0dSAlex Elder 361086b00e0dSAlex Elder return (ssize_t) (bufp - buf); 361186b00e0dSAlex Elder } 361286b00e0dSAlex Elder 3613dfc5606dSYehuda Sadeh static ssize_t rbd_image_refresh(struct device *dev, 3614dfc5606dSYehuda Sadeh struct device_attribute *attr, 3615dfc5606dSYehuda Sadeh const char *buf, 3616dfc5606dSYehuda Sadeh size_t size) 3617dfc5606dSYehuda Sadeh { 3618593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 3619b813623aSAlex Elder int ret; 3620602adf40SYehuda Sadeh 3621cc4a38bdSAlex Elder ret = rbd_dev_refresh(rbd_dev); 3622e627db08SAlex Elder if (ret) 3623e627db08SAlex Elder rbd_warn(rbd_dev, ": manual header refresh error (%d)\n", ret); 3624b813623aSAlex Elder 3625b813623aSAlex Elder return ret < 0 ? 
ret : size; 3626dfc5606dSYehuda Sadeh } 3627602adf40SYehuda Sadeh 3628dfc5606dSYehuda Sadeh static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL); 362934b13184SAlex Elder static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL); 3630dfc5606dSYehuda Sadeh static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL); 3631dd82fff1SIlya Dryomov static DEVICE_ATTR(minor, S_IRUGO, rbd_minor_show, NULL); 3632dfc5606dSYehuda Sadeh static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL); 3633dfc5606dSYehuda Sadeh static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL); 36349bb2f334SAlex Elder static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL); 3635dfc5606dSYehuda Sadeh static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL); 3636589d30e0SAlex Elder static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL); 3637dfc5606dSYehuda Sadeh static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh); 3638dfc5606dSYehuda Sadeh static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL); 363986b00e0dSAlex Elder static DEVICE_ATTR(parent, S_IRUGO, rbd_parent_show, NULL); 3640dfc5606dSYehuda Sadeh 3641dfc5606dSYehuda Sadeh static struct attribute *rbd_attrs[] = { 3642dfc5606dSYehuda Sadeh &dev_attr_size.attr, 364334b13184SAlex Elder &dev_attr_features.attr, 3644dfc5606dSYehuda Sadeh &dev_attr_major.attr, 3645dd82fff1SIlya Dryomov &dev_attr_minor.attr, 3646dfc5606dSYehuda Sadeh &dev_attr_client_id.attr, 3647dfc5606dSYehuda Sadeh &dev_attr_pool.attr, 36489bb2f334SAlex Elder &dev_attr_pool_id.attr, 3649dfc5606dSYehuda Sadeh &dev_attr_name.attr, 3650589d30e0SAlex Elder &dev_attr_image_id.attr, 3651dfc5606dSYehuda Sadeh &dev_attr_current_snap.attr, 365286b00e0dSAlex Elder &dev_attr_parent.attr, 3653dfc5606dSYehuda Sadeh &dev_attr_refresh.attr, 3654dfc5606dSYehuda Sadeh NULL 3655dfc5606dSYehuda Sadeh }; 3656dfc5606dSYehuda Sadeh 3657dfc5606dSYehuda Sadeh static struct attribute_group rbd_attr_group = { 3658dfc5606dSYehuda Sadeh .attrs = rbd_attrs, 
3659dfc5606dSYehuda Sadeh }; 3660dfc5606dSYehuda Sadeh 3661dfc5606dSYehuda Sadeh static const struct attribute_group *rbd_attr_groups[] = { 3662dfc5606dSYehuda Sadeh &rbd_attr_group, 3663dfc5606dSYehuda Sadeh NULL 3664dfc5606dSYehuda Sadeh }; 3665dfc5606dSYehuda Sadeh 3666dfc5606dSYehuda Sadeh static void rbd_sysfs_dev_release(struct device *dev) 3667dfc5606dSYehuda Sadeh { 3668dfc5606dSYehuda Sadeh } 3669dfc5606dSYehuda Sadeh 3670dfc5606dSYehuda Sadeh static struct device_type rbd_device_type = { 3671dfc5606dSYehuda Sadeh .name = "rbd", 3672dfc5606dSYehuda Sadeh .groups = rbd_attr_groups, 3673dfc5606dSYehuda Sadeh .release = rbd_sysfs_dev_release, 3674dfc5606dSYehuda Sadeh }; 3675dfc5606dSYehuda Sadeh 36768b8fb99cSAlex Elder static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec) 36778b8fb99cSAlex Elder { 36788b8fb99cSAlex Elder kref_get(&spec->kref); 36798b8fb99cSAlex Elder 36808b8fb99cSAlex Elder return spec; 36818b8fb99cSAlex Elder } 36828b8fb99cSAlex Elder 36838b8fb99cSAlex Elder static void rbd_spec_free(struct kref *kref); 36848b8fb99cSAlex Elder static void rbd_spec_put(struct rbd_spec *spec) 36858b8fb99cSAlex Elder { 36868b8fb99cSAlex Elder if (spec) 36878b8fb99cSAlex Elder kref_put(&spec->kref, rbd_spec_free); 36888b8fb99cSAlex Elder } 36898b8fb99cSAlex Elder 36908b8fb99cSAlex Elder static struct rbd_spec *rbd_spec_alloc(void) 36918b8fb99cSAlex Elder { 36928b8fb99cSAlex Elder struct rbd_spec *spec; 36938b8fb99cSAlex Elder 36948b8fb99cSAlex Elder spec = kzalloc(sizeof (*spec), GFP_KERNEL); 36958b8fb99cSAlex Elder if (!spec) 36968b8fb99cSAlex Elder return NULL; 36978b8fb99cSAlex Elder kref_init(&spec->kref); 36988b8fb99cSAlex Elder 36998b8fb99cSAlex Elder return spec; 37008b8fb99cSAlex Elder } 37018b8fb99cSAlex Elder 37028b8fb99cSAlex Elder static void rbd_spec_free(struct kref *kref) 37038b8fb99cSAlex Elder { 37048b8fb99cSAlex Elder struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref); 37058b8fb99cSAlex Elder 37068b8fb99cSAlex Elder 
kfree(spec->pool_name); 37078b8fb99cSAlex Elder kfree(spec->image_id); 37088b8fb99cSAlex Elder kfree(spec->image_name); 37098b8fb99cSAlex Elder kfree(spec->snap_name); 37108b8fb99cSAlex Elder kfree(spec); 37118b8fb99cSAlex Elder } 37128b8fb99cSAlex Elder 3713cc344fa1SAlex Elder static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc, 3714c53d5893SAlex Elder struct rbd_spec *spec) 3715c53d5893SAlex Elder { 3716c53d5893SAlex Elder struct rbd_device *rbd_dev; 3717c53d5893SAlex Elder 3718c53d5893SAlex Elder rbd_dev = kzalloc(sizeof (*rbd_dev), GFP_KERNEL); 3719c53d5893SAlex Elder if (!rbd_dev) 3720c53d5893SAlex Elder return NULL; 3721c53d5893SAlex Elder 3722c53d5893SAlex Elder spin_lock_init(&rbd_dev->lock); 37236d292906SAlex Elder rbd_dev->flags = 0; 3724a2acd00eSAlex Elder atomic_set(&rbd_dev->parent_ref, 0); 3725c53d5893SAlex Elder INIT_LIST_HEAD(&rbd_dev->node); 3726c53d5893SAlex Elder init_rwsem(&rbd_dev->header_rwsem); 3727c53d5893SAlex Elder 3728c53d5893SAlex Elder rbd_dev->spec = spec; 3729c53d5893SAlex Elder rbd_dev->rbd_client = rbdc; 3730c53d5893SAlex Elder 37310903e875SAlex Elder /* Initialize the layout used for all rbd requests */ 37320903e875SAlex Elder 37330903e875SAlex Elder rbd_dev->layout.fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER); 37340903e875SAlex Elder rbd_dev->layout.fl_stripe_count = cpu_to_le32(1); 37350903e875SAlex Elder rbd_dev->layout.fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER); 37360903e875SAlex Elder rbd_dev->layout.fl_pg_pool = cpu_to_le32((u32) spec->pool_id); 37370903e875SAlex Elder 3738c53d5893SAlex Elder return rbd_dev; 3739c53d5893SAlex Elder } 3740c53d5893SAlex Elder 3741c53d5893SAlex Elder static void rbd_dev_destroy(struct rbd_device *rbd_dev) 3742c53d5893SAlex Elder { 3743c53d5893SAlex Elder rbd_put_client(rbd_dev->rbd_client); 3744c53d5893SAlex Elder rbd_spec_put(rbd_dev->spec); 3745c53d5893SAlex Elder kfree(rbd_dev); 3746c53d5893SAlex Elder } 3747c53d5893SAlex Elder 3748dfc5606dSYehuda Sadeh /* 
37499d475de5SAlex Elder * Get the size and object order for an image snapshot, or if 37509d475de5SAlex Elder * snap_id is CEPH_NOSNAP, gets this information for the base 37519d475de5SAlex Elder * image. 37529d475de5SAlex Elder */ 37539d475de5SAlex Elder static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id, 37549d475de5SAlex Elder u8 *order, u64 *snap_size) 37559d475de5SAlex Elder { 37569d475de5SAlex Elder __le64 snapid = cpu_to_le64(snap_id); 37579d475de5SAlex Elder int ret; 37589d475de5SAlex Elder struct { 37599d475de5SAlex Elder u8 order; 37609d475de5SAlex Elder __le64 size; 37619d475de5SAlex Elder } __attribute__ ((packed)) size_buf = { 0 }; 37629d475de5SAlex Elder 376336be9a76SAlex Elder ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, 37649d475de5SAlex Elder "rbd", "get_size", 37654157976bSAlex Elder &snapid, sizeof (snapid), 3766e2a58ee5SAlex Elder &size_buf, sizeof (size_buf)); 376736be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 37689d475de5SAlex Elder if (ret < 0) 37699d475de5SAlex Elder return ret; 377057385b51SAlex Elder if (ret < sizeof (size_buf)) 377157385b51SAlex Elder return -ERANGE; 37729d475de5SAlex Elder 3773c3545579SJosh Durgin if (order) { 37749d475de5SAlex Elder *order = size_buf.order; 3775c3545579SJosh Durgin dout(" order %u", (unsigned int)*order); 3776c3545579SJosh Durgin } 37779d475de5SAlex Elder *snap_size = le64_to_cpu(size_buf.size); 37789d475de5SAlex Elder 3779c3545579SJosh Durgin dout(" snap_id 0x%016llx snap_size = %llu\n", 3780c3545579SJosh Durgin (unsigned long long)snap_id, 37819d475de5SAlex Elder (unsigned long long)*snap_size); 37829d475de5SAlex Elder 37839d475de5SAlex Elder return 0; 37849d475de5SAlex Elder } 37859d475de5SAlex Elder 37869d475de5SAlex Elder static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev) 37879d475de5SAlex Elder { 37889d475de5SAlex Elder return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP, 37899d475de5SAlex Elder &rbd_dev->header.obj_order, 
37909d475de5SAlex Elder &rbd_dev->header.image_size); 37919d475de5SAlex Elder } 37929d475de5SAlex Elder 37931e130199SAlex Elder static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev) 37941e130199SAlex Elder { 37951e130199SAlex Elder void *reply_buf; 37961e130199SAlex Elder int ret; 37971e130199SAlex Elder void *p; 37981e130199SAlex Elder 37991e130199SAlex Elder reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL); 38001e130199SAlex Elder if (!reply_buf) 38011e130199SAlex Elder return -ENOMEM; 38021e130199SAlex Elder 380336be9a76SAlex Elder ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, 38044157976bSAlex Elder "rbd", "get_object_prefix", NULL, 0, 3805e2a58ee5SAlex Elder reply_buf, RBD_OBJ_PREFIX_LEN_MAX); 380636be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 38071e130199SAlex Elder if (ret < 0) 38081e130199SAlex Elder goto out; 38091e130199SAlex Elder 38101e130199SAlex Elder p = reply_buf; 38111e130199SAlex Elder rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p, 381257385b51SAlex Elder p + ret, NULL, GFP_NOIO); 381357385b51SAlex Elder ret = 0; 38141e130199SAlex Elder 38151e130199SAlex Elder if (IS_ERR(rbd_dev->header.object_prefix)) { 38161e130199SAlex Elder ret = PTR_ERR(rbd_dev->header.object_prefix); 38171e130199SAlex Elder rbd_dev->header.object_prefix = NULL; 38181e130199SAlex Elder } else { 38191e130199SAlex Elder dout(" object_prefix = %s\n", rbd_dev->header.object_prefix); 38201e130199SAlex Elder } 38211e130199SAlex Elder out: 38221e130199SAlex Elder kfree(reply_buf); 38231e130199SAlex Elder 38241e130199SAlex Elder return ret; 38251e130199SAlex Elder } 38261e130199SAlex Elder 3827b1b5402aSAlex Elder static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id, 3828b1b5402aSAlex Elder u64 *snap_features) 3829b1b5402aSAlex Elder { 3830b1b5402aSAlex Elder __le64 snapid = cpu_to_le64(snap_id); 3831b1b5402aSAlex Elder struct { 3832b1b5402aSAlex Elder __le64 features; 
3833b1b5402aSAlex Elder __le64 incompat; 38344157976bSAlex Elder } __attribute__ ((packed)) features_buf = { 0 }; 3835d889140cSAlex Elder u64 incompat; 3836b1b5402aSAlex Elder int ret; 3837b1b5402aSAlex Elder 383836be9a76SAlex Elder ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, 3839b1b5402aSAlex Elder "rbd", "get_features", 38404157976bSAlex Elder &snapid, sizeof (snapid), 3841e2a58ee5SAlex Elder &features_buf, sizeof (features_buf)); 384236be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 3843b1b5402aSAlex Elder if (ret < 0) 3844b1b5402aSAlex Elder return ret; 384557385b51SAlex Elder if (ret < sizeof (features_buf)) 384657385b51SAlex Elder return -ERANGE; 3847d889140cSAlex Elder 3848d889140cSAlex Elder incompat = le64_to_cpu(features_buf.incompat); 38495cbf6f12SAlex Elder if (incompat & ~RBD_FEATURES_SUPPORTED) 3850b8f5c6edSAlex Elder return -ENXIO; 3851d889140cSAlex Elder 3852b1b5402aSAlex Elder *snap_features = le64_to_cpu(features_buf.features); 3853b1b5402aSAlex Elder 3854b1b5402aSAlex Elder dout(" snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n", 3855b1b5402aSAlex Elder (unsigned long long)snap_id, 3856b1b5402aSAlex Elder (unsigned long long)*snap_features, 3857b1b5402aSAlex Elder (unsigned long long)le64_to_cpu(features_buf.incompat)); 3858b1b5402aSAlex Elder 3859b1b5402aSAlex Elder return 0; 3860b1b5402aSAlex Elder } 3861b1b5402aSAlex Elder 3862b1b5402aSAlex Elder static int rbd_dev_v2_features(struct rbd_device *rbd_dev) 3863b1b5402aSAlex Elder { 3864b1b5402aSAlex Elder return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP, 3865b1b5402aSAlex Elder &rbd_dev->header.features); 3866b1b5402aSAlex Elder } 3867b1b5402aSAlex Elder 386886b00e0dSAlex Elder static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev) 386986b00e0dSAlex Elder { 387086b00e0dSAlex Elder struct rbd_spec *parent_spec; 387186b00e0dSAlex Elder size_t size; 387286b00e0dSAlex Elder void *reply_buf = NULL; 387386b00e0dSAlex Elder __le64 
snapid; 387486b00e0dSAlex Elder void *p; 387586b00e0dSAlex Elder void *end; 3876642a2537SAlex Elder u64 pool_id; 387786b00e0dSAlex Elder char *image_id; 38783b5cf2a2SAlex Elder u64 snap_id; 387986b00e0dSAlex Elder u64 overlap; 388086b00e0dSAlex Elder int ret; 388186b00e0dSAlex Elder 388286b00e0dSAlex Elder parent_spec = rbd_spec_alloc(); 388386b00e0dSAlex Elder if (!parent_spec) 388486b00e0dSAlex Elder return -ENOMEM; 388586b00e0dSAlex Elder 388686b00e0dSAlex Elder size = sizeof (__le64) + /* pool_id */ 388786b00e0dSAlex Elder sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX + /* image_id */ 388886b00e0dSAlex Elder sizeof (__le64) + /* snap_id */ 388986b00e0dSAlex Elder sizeof (__le64); /* overlap */ 389086b00e0dSAlex Elder reply_buf = kmalloc(size, GFP_KERNEL); 389186b00e0dSAlex Elder if (!reply_buf) { 389286b00e0dSAlex Elder ret = -ENOMEM; 389386b00e0dSAlex Elder goto out_err; 389486b00e0dSAlex Elder } 389586b00e0dSAlex Elder 389686b00e0dSAlex Elder snapid = cpu_to_le64(CEPH_NOSNAP); 389736be9a76SAlex Elder ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, 389886b00e0dSAlex Elder "rbd", "get_parent", 38994157976bSAlex Elder &snapid, sizeof (snapid), 3900e2a58ee5SAlex Elder reply_buf, size); 390136be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 390286b00e0dSAlex Elder if (ret < 0) 390386b00e0dSAlex Elder goto out_err; 390486b00e0dSAlex Elder 390586b00e0dSAlex Elder p = reply_buf; 390657385b51SAlex Elder end = reply_buf + ret; 390757385b51SAlex Elder ret = -ERANGE; 3908642a2537SAlex Elder ceph_decode_64_safe(&p, end, pool_id, out_err); 3909392a9dadSAlex Elder if (pool_id == CEPH_NOPOOL) { 3910392a9dadSAlex Elder /* 3911392a9dadSAlex Elder * Either the parent never existed, or we have 3912392a9dadSAlex Elder * record of it but the image got flattened so it no 3913392a9dadSAlex Elder * longer has a parent. When the parent of a 3914392a9dadSAlex Elder * layered image disappears we immediately set the 3915392a9dadSAlex Elder * overlap to 0. 
The effect of this is that all new 3916392a9dadSAlex Elder * requests will be treated as if the image had no 3917392a9dadSAlex Elder * parent. 3918392a9dadSAlex Elder */ 3919392a9dadSAlex Elder if (rbd_dev->parent_overlap) { 3920392a9dadSAlex Elder rbd_dev->parent_overlap = 0; 3921392a9dadSAlex Elder smp_mb(); 3922392a9dadSAlex Elder rbd_dev_parent_put(rbd_dev); 3923392a9dadSAlex Elder pr_info("%s: clone image has been flattened\n", 3924392a9dadSAlex Elder rbd_dev->disk->disk_name); 3925392a9dadSAlex Elder } 3926392a9dadSAlex Elder 392786b00e0dSAlex Elder goto out; /* No parent? No problem. */ 3928392a9dadSAlex Elder } 392986b00e0dSAlex Elder 39300903e875SAlex Elder /* The ceph file layout needs to fit pool id in 32 bits */ 39310903e875SAlex Elder 39320903e875SAlex Elder ret = -EIO; 3933642a2537SAlex Elder if (pool_id > (u64)U32_MAX) { 3934c0cd10dbSAlex Elder rbd_warn(NULL, "parent pool id too large (%llu > %u)\n", 3935642a2537SAlex Elder (unsigned long long)pool_id, U32_MAX); 393657385b51SAlex Elder goto out_err; 3937c0cd10dbSAlex Elder } 39380903e875SAlex Elder 3939979ed480SAlex Elder image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL); 394086b00e0dSAlex Elder if (IS_ERR(image_id)) { 394186b00e0dSAlex Elder ret = PTR_ERR(image_id); 394286b00e0dSAlex Elder goto out_err; 394386b00e0dSAlex Elder } 39443b5cf2a2SAlex Elder ceph_decode_64_safe(&p, end, snap_id, out_err); 394586b00e0dSAlex Elder ceph_decode_64_safe(&p, end, overlap, out_err); 394686b00e0dSAlex Elder 39473b5cf2a2SAlex Elder /* 39483b5cf2a2SAlex Elder * The parent won't change (except when the clone is 39493b5cf2a2SAlex Elder * flattened, already handled that). So we only need to 39503b5cf2a2SAlex Elder * record the parent spec we have not already done so. 
39513b5cf2a2SAlex Elder */ 39523b5cf2a2SAlex Elder if (!rbd_dev->parent_spec) { 39533b5cf2a2SAlex Elder parent_spec->pool_id = pool_id; 39543b5cf2a2SAlex Elder parent_spec->image_id = image_id; 39553b5cf2a2SAlex Elder parent_spec->snap_id = snap_id; 395686b00e0dSAlex Elder rbd_dev->parent_spec = parent_spec; 395786b00e0dSAlex Elder parent_spec = NULL; /* rbd_dev now owns this */ 39583b5cf2a2SAlex Elder } 39593b5cf2a2SAlex Elder 39603b5cf2a2SAlex Elder /* 39613b5cf2a2SAlex Elder * We always update the parent overlap. If it's zero we 39623b5cf2a2SAlex Elder * treat it specially. 39633b5cf2a2SAlex Elder */ 396470cf49cfSAlex Elder rbd_dev->parent_overlap = overlap; 39653b5cf2a2SAlex Elder smp_mb(); 39663b5cf2a2SAlex Elder if (!overlap) { 39673b5cf2a2SAlex Elder 39683b5cf2a2SAlex Elder /* A null parent_spec indicates it's the initial probe */ 39693b5cf2a2SAlex Elder 39703b5cf2a2SAlex Elder if (parent_spec) { 39713b5cf2a2SAlex Elder /* 39723b5cf2a2SAlex Elder * The overlap has become zero, so the clone 39733b5cf2a2SAlex Elder * must have been resized down to 0 at some 39743b5cf2a2SAlex Elder * point. Treat this the same as a flatten. 39753b5cf2a2SAlex Elder */ 39763b5cf2a2SAlex Elder rbd_dev_parent_put(rbd_dev); 39773b5cf2a2SAlex Elder pr_info("%s: clone image now standalone\n", 39783b5cf2a2SAlex Elder rbd_dev->disk->disk_name); 397970cf49cfSAlex Elder } else { 39803b5cf2a2SAlex Elder /* 39813b5cf2a2SAlex Elder * For the initial probe, if we find the 39823b5cf2a2SAlex Elder * overlap is zero we just pretend there was 39833b5cf2a2SAlex Elder * no parent image. 
39843b5cf2a2SAlex Elder */ 39853b5cf2a2SAlex Elder rbd_warn(rbd_dev, "ignoring parent of " 39863b5cf2a2SAlex Elder "clone with overlap 0\n"); 39873b5cf2a2SAlex Elder } 398870cf49cfSAlex Elder } 398986b00e0dSAlex Elder out: 399086b00e0dSAlex Elder ret = 0; 399186b00e0dSAlex Elder out_err: 399286b00e0dSAlex Elder kfree(reply_buf); 399386b00e0dSAlex Elder rbd_spec_put(parent_spec); 399486b00e0dSAlex Elder 399586b00e0dSAlex Elder return ret; 399686b00e0dSAlex Elder } 399786b00e0dSAlex Elder 3998cc070d59SAlex Elder static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev) 3999cc070d59SAlex Elder { 4000cc070d59SAlex Elder struct { 4001cc070d59SAlex Elder __le64 stripe_unit; 4002cc070d59SAlex Elder __le64 stripe_count; 4003cc070d59SAlex Elder } __attribute__ ((packed)) striping_info_buf = { 0 }; 4004cc070d59SAlex Elder size_t size = sizeof (striping_info_buf); 4005cc070d59SAlex Elder void *p; 4006cc070d59SAlex Elder u64 obj_size; 4007cc070d59SAlex Elder u64 stripe_unit; 4008cc070d59SAlex Elder u64 stripe_count; 4009cc070d59SAlex Elder int ret; 4010cc070d59SAlex Elder 4011cc070d59SAlex Elder ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, 4012cc070d59SAlex Elder "rbd", "get_stripe_unit_count", NULL, 0, 4013e2a58ee5SAlex Elder (char *)&striping_info_buf, size); 4014cc070d59SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 4015cc070d59SAlex Elder if (ret < 0) 4016cc070d59SAlex Elder return ret; 4017cc070d59SAlex Elder if (ret < size) 4018cc070d59SAlex Elder return -ERANGE; 4019cc070d59SAlex Elder 4020cc070d59SAlex Elder /* 4021cc070d59SAlex Elder * We don't actually support the "fancy striping" feature 4022cc070d59SAlex Elder * (STRIPINGV2) yet, but if the striping sizes are the 4023cc070d59SAlex Elder * defaults the behavior is the same as before. So find 4024cc070d59SAlex Elder * out, and only fail if the image has non-default values. 
4025cc070d59SAlex Elder */ 4026cc070d59SAlex Elder ret = -EINVAL; 4027cc070d59SAlex Elder obj_size = (u64)1 << rbd_dev->header.obj_order; 4028cc070d59SAlex Elder p = &striping_info_buf; 4029cc070d59SAlex Elder stripe_unit = ceph_decode_64(&p); 4030cc070d59SAlex Elder if (stripe_unit != obj_size) { 4031cc070d59SAlex Elder rbd_warn(rbd_dev, "unsupported stripe unit " 4032cc070d59SAlex Elder "(got %llu want %llu)", 4033cc070d59SAlex Elder stripe_unit, obj_size); 4034cc070d59SAlex Elder return -EINVAL; 4035cc070d59SAlex Elder } 4036cc070d59SAlex Elder stripe_count = ceph_decode_64(&p); 4037cc070d59SAlex Elder if (stripe_count != 1) { 4038cc070d59SAlex Elder rbd_warn(rbd_dev, "unsupported stripe count " 4039cc070d59SAlex Elder "(got %llu want 1)", stripe_count); 4040cc070d59SAlex Elder return -EINVAL; 4041cc070d59SAlex Elder } 4042500d0c0fSAlex Elder rbd_dev->header.stripe_unit = stripe_unit; 4043500d0c0fSAlex Elder rbd_dev->header.stripe_count = stripe_count; 4044cc070d59SAlex Elder 4045cc070d59SAlex Elder return 0; 4046cc070d59SAlex Elder } 4047cc070d59SAlex Elder 40489e15b77dSAlex Elder static char *rbd_dev_image_name(struct rbd_device *rbd_dev) 40499e15b77dSAlex Elder { 40509e15b77dSAlex Elder size_t image_id_size; 40519e15b77dSAlex Elder char *image_id; 40529e15b77dSAlex Elder void *p; 40539e15b77dSAlex Elder void *end; 40549e15b77dSAlex Elder size_t size; 40559e15b77dSAlex Elder void *reply_buf = NULL; 40569e15b77dSAlex Elder size_t len = 0; 40579e15b77dSAlex Elder char *image_name = NULL; 40589e15b77dSAlex Elder int ret; 40599e15b77dSAlex Elder 40609e15b77dSAlex Elder rbd_assert(!rbd_dev->spec->image_name); 40619e15b77dSAlex Elder 406269e7a02fSAlex Elder len = strlen(rbd_dev->spec->image_id); 406369e7a02fSAlex Elder image_id_size = sizeof (__le32) + len; 40649e15b77dSAlex Elder image_id = kmalloc(image_id_size, GFP_KERNEL); 40659e15b77dSAlex Elder if (!image_id) 40669e15b77dSAlex Elder return NULL; 40679e15b77dSAlex Elder 40689e15b77dSAlex Elder p = image_id; 
40694157976bSAlex Elder end = image_id + image_id_size; 407069e7a02fSAlex Elder ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32)len); 40719e15b77dSAlex Elder 40729e15b77dSAlex Elder size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX; 40739e15b77dSAlex Elder reply_buf = kmalloc(size, GFP_KERNEL); 40749e15b77dSAlex Elder if (!reply_buf) 40759e15b77dSAlex Elder goto out; 40769e15b77dSAlex Elder 407736be9a76SAlex Elder ret = rbd_obj_method_sync(rbd_dev, RBD_DIRECTORY, 40789e15b77dSAlex Elder "rbd", "dir_get_name", 40799e15b77dSAlex Elder image_id, image_id_size, 4080e2a58ee5SAlex Elder reply_buf, size); 40819e15b77dSAlex Elder if (ret < 0) 40829e15b77dSAlex Elder goto out; 40839e15b77dSAlex Elder p = reply_buf; 4084f40eb349SAlex Elder end = reply_buf + ret; 4085f40eb349SAlex Elder 40869e15b77dSAlex Elder image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL); 40879e15b77dSAlex Elder if (IS_ERR(image_name)) 40889e15b77dSAlex Elder image_name = NULL; 40899e15b77dSAlex Elder else 40909e15b77dSAlex Elder dout("%s: name is %s len is %zd\n", __func__, image_name, len); 40919e15b77dSAlex Elder out: 40929e15b77dSAlex Elder kfree(reply_buf); 40939e15b77dSAlex Elder kfree(image_id); 40949e15b77dSAlex Elder 40959e15b77dSAlex Elder return image_name; 40969e15b77dSAlex Elder } 40979e15b77dSAlex Elder 40982ad3d716SAlex Elder static u64 rbd_v1_snap_id_by_name(struct rbd_device *rbd_dev, const char *name) 40992ad3d716SAlex Elder { 41002ad3d716SAlex Elder struct ceph_snap_context *snapc = rbd_dev->header.snapc; 41012ad3d716SAlex Elder const char *snap_name; 41022ad3d716SAlex Elder u32 which = 0; 41032ad3d716SAlex Elder 41042ad3d716SAlex Elder /* Skip over names until we find the one we are looking for */ 41052ad3d716SAlex Elder 41062ad3d716SAlex Elder snap_name = rbd_dev->header.snap_names; 41072ad3d716SAlex Elder while (which < snapc->num_snaps) { 41082ad3d716SAlex Elder if (!strcmp(name, snap_name)) 41092ad3d716SAlex Elder return snapc->snaps[which]; 
41102ad3d716SAlex Elder snap_name += strlen(snap_name) + 1; 41112ad3d716SAlex Elder which++; 41122ad3d716SAlex Elder } 41132ad3d716SAlex Elder return CEPH_NOSNAP; 41142ad3d716SAlex Elder } 41152ad3d716SAlex Elder 41162ad3d716SAlex Elder static u64 rbd_v2_snap_id_by_name(struct rbd_device *rbd_dev, const char *name) 41172ad3d716SAlex Elder { 41182ad3d716SAlex Elder struct ceph_snap_context *snapc = rbd_dev->header.snapc; 41192ad3d716SAlex Elder u32 which; 41202ad3d716SAlex Elder bool found = false; 41212ad3d716SAlex Elder u64 snap_id; 41222ad3d716SAlex Elder 41232ad3d716SAlex Elder for (which = 0; !found && which < snapc->num_snaps; which++) { 41242ad3d716SAlex Elder const char *snap_name; 41252ad3d716SAlex Elder 41262ad3d716SAlex Elder snap_id = snapc->snaps[which]; 41272ad3d716SAlex Elder snap_name = rbd_dev_v2_snap_name(rbd_dev, snap_id); 4128efadc98aSJosh Durgin if (IS_ERR(snap_name)) { 4129efadc98aSJosh Durgin /* ignore no-longer existing snapshots */ 4130efadc98aSJosh Durgin if (PTR_ERR(snap_name) == -ENOENT) 4131efadc98aSJosh Durgin continue; 4132efadc98aSJosh Durgin else 41332ad3d716SAlex Elder break; 4134efadc98aSJosh Durgin } 41352ad3d716SAlex Elder found = !strcmp(name, snap_name); 41362ad3d716SAlex Elder kfree(snap_name); 41372ad3d716SAlex Elder } 41382ad3d716SAlex Elder return found ? snap_id : CEPH_NOSNAP; 41392ad3d716SAlex Elder } 41402ad3d716SAlex Elder 41412ad3d716SAlex Elder /* 41422ad3d716SAlex Elder * Assumes name is never RBD_SNAP_HEAD_NAME; returns CEPH_NOSNAP if 41432ad3d716SAlex Elder * no snapshot by that name is found, or if an error occurs. 
41442ad3d716SAlex Elder */ 41452ad3d716SAlex Elder static u64 rbd_snap_id_by_name(struct rbd_device *rbd_dev, const char *name) 41462ad3d716SAlex Elder { 41472ad3d716SAlex Elder if (rbd_dev->image_format == 1) 41482ad3d716SAlex Elder return rbd_v1_snap_id_by_name(rbd_dev, name); 41492ad3d716SAlex Elder 41502ad3d716SAlex Elder return rbd_v2_snap_id_by_name(rbd_dev, name); 41512ad3d716SAlex Elder } 41522ad3d716SAlex Elder 41539e15b77dSAlex Elder /* 41542e9f7f1cSAlex Elder * When an rbd image has a parent image, it is identified by the 41552e9f7f1cSAlex Elder * pool, image, and snapshot ids (not names). This function fills 41562e9f7f1cSAlex Elder * in the names for those ids. (It's OK if we can't figure out the 41572e9f7f1cSAlex Elder * name for an image id, but the pool and snapshot ids should always 41582e9f7f1cSAlex Elder * exist and have names.) All names in an rbd spec are dynamically 41592e9f7f1cSAlex Elder * allocated. 4160e1d4213fSAlex Elder * 4161e1d4213fSAlex Elder * When an image being mapped (not a parent) is probed, we have the 4162e1d4213fSAlex Elder * pool name and pool id, image name and image id, and the snapshot 4163e1d4213fSAlex Elder * name. The only thing we're missing is the snapshot id. 41649e15b77dSAlex Elder */ 41652e9f7f1cSAlex Elder static int rbd_dev_spec_update(struct rbd_device *rbd_dev) 41669e15b77dSAlex Elder { 41672e9f7f1cSAlex Elder struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 41682e9f7f1cSAlex Elder struct rbd_spec *spec = rbd_dev->spec; 41692e9f7f1cSAlex Elder const char *pool_name; 41702e9f7f1cSAlex Elder const char *image_name; 41712e9f7f1cSAlex Elder const char *snap_name; 41729e15b77dSAlex Elder int ret; 41739e15b77dSAlex Elder 4174e1d4213fSAlex Elder /* 4175e1d4213fSAlex Elder * An image being mapped will have the pool name (etc.), but 4176e1d4213fSAlex Elder * we need to look up the snapshot id. 
4177e1d4213fSAlex Elder */ 41782e9f7f1cSAlex Elder if (spec->pool_name) { 41792e9f7f1cSAlex Elder if (strcmp(spec->snap_name, RBD_SNAP_HEAD_NAME)) { 41802ad3d716SAlex Elder u64 snap_id; 4181e1d4213fSAlex Elder 41822ad3d716SAlex Elder snap_id = rbd_snap_id_by_name(rbd_dev, spec->snap_name); 41832ad3d716SAlex Elder if (snap_id == CEPH_NOSNAP) 4184e1d4213fSAlex Elder return -ENOENT; 41852ad3d716SAlex Elder spec->snap_id = snap_id; 4186e1d4213fSAlex Elder } else { 41872e9f7f1cSAlex Elder spec->snap_id = CEPH_NOSNAP; 4188e1d4213fSAlex Elder } 4189e1d4213fSAlex Elder 4190e1d4213fSAlex Elder return 0; 4191e1d4213fSAlex Elder } 41929e15b77dSAlex Elder 41932e9f7f1cSAlex Elder /* Get the pool name; we have to make our own copy of this */ 41949e15b77dSAlex Elder 41952e9f7f1cSAlex Elder pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, spec->pool_id); 41962e9f7f1cSAlex Elder if (!pool_name) { 41972e9f7f1cSAlex Elder rbd_warn(rbd_dev, "no pool with id %llu", spec->pool_id); 4198935dc89fSAlex Elder return -EIO; 4199935dc89fSAlex Elder } 42002e9f7f1cSAlex Elder pool_name = kstrdup(pool_name, GFP_KERNEL); 42012e9f7f1cSAlex Elder if (!pool_name) 42029e15b77dSAlex Elder return -ENOMEM; 42039e15b77dSAlex Elder 42049e15b77dSAlex Elder /* Fetch the image name; tolerate failure here */ 42059e15b77dSAlex Elder 42062e9f7f1cSAlex Elder image_name = rbd_dev_image_name(rbd_dev); 42072e9f7f1cSAlex Elder if (!image_name) 420806ecc6cbSAlex Elder rbd_warn(rbd_dev, "unable to get image name"); 42099e15b77dSAlex Elder 42102e9f7f1cSAlex Elder /* Look up the snapshot name, and make a copy */ 42119e15b77dSAlex Elder 42122e9f7f1cSAlex Elder snap_name = rbd_snap_name(rbd_dev, spec->snap_id); 4213da6a6b63SJosh Durgin if (IS_ERR(snap_name)) { 4214da6a6b63SJosh Durgin ret = PTR_ERR(snap_name); 42159e15b77dSAlex Elder goto out_err; 42162e9f7f1cSAlex Elder } 42172e9f7f1cSAlex Elder 42182e9f7f1cSAlex Elder spec->pool_name = pool_name; 42192e9f7f1cSAlex Elder spec->image_name = image_name; 42202e9f7f1cSAlex 
Elder spec->snap_name = snap_name; 42219e15b77dSAlex Elder 42229e15b77dSAlex Elder return 0; 42239e15b77dSAlex Elder out_err: 42242e9f7f1cSAlex Elder kfree(image_name); 42252e9f7f1cSAlex Elder kfree(pool_name); 42269e15b77dSAlex Elder 42279e15b77dSAlex Elder return ret; 42289e15b77dSAlex Elder } 42299e15b77dSAlex Elder 4230cc4a38bdSAlex Elder static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev) 423135d489f9SAlex Elder { 423235d489f9SAlex Elder size_t size; 423335d489f9SAlex Elder int ret; 423435d489f9SAlex Elder void *reply_buf; 423535d489f9SAlex Elder void *p; 423635d489f9SAlex Elder void *end; 423735d489f9SAlex Elder u64 seq; 423835d489f9SAlex Elder u32 snap_count; 423935d489f9SAlex Elder struct ceph_snap_context *snapc; 424035d489f9SAlex Elder u32 i; 424135d489f9SAlex Elder 424235d489f9SAlex Elder /* 424335d489f9SAlex Elder * We'll need room for the seq value (maximum snapshot id), 424435d489f9SAlex Elder * snapshot count, and array of that many snapshot ids. 424535d489f9SAlex Elder * For now we have a fixed upper limit on the number we're 424635d489f9SAlex Elder * prepared to receive. 
424735d489f9SAlex Elder */ 424835d489f9SAlex Elder size = sizeof (__le64) + sizeof (__le32) + 424935d489f9SAlex Elder RBD_MAX_SNAP_COUNT * sizeof (__le64); 425035d489f9SAlex Elder reply_buf = kzalloc(size, GFP_KERNEL); 425135d489f9SAlex Elder if (!reply_buf) 425235d489f9SAlex Elder return -ENOMEM; 425335d489f9SAlex Elder 425436be9a76SAlex Elder ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, 42554157976bSAlex Elder "rbd", "get_snapcontext", NULL, 0, 4256e2a58ee5SAlex Elder reply_buf, size); 425736be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 425835d489f9SAlex Elder if (ret < 0) 425935d489f9SAlex Elder goto out; 426035d489f9SAlex Elder 426135d489f9SAlex Elder p = reply_buf; 426257385b51SAlex Elder end = reply_buf + ret; 426357385b51SAlex Elder ret = -ERANGE; 426435d489f9SAlex Elder ceph_decode_64_safe(&p, end, seq, out); 426535d489f9SAlex Elder ceph_decode_32_safe(&p, end, snap_count, out); 426635d489f9SAlex Elder 426735d489f9SAlex Elder /* 426835d489f9SAlex Elder * Make sure the reported number of snapshot ids wouldn't go 426935d489f9SAlex Elder * beyond the end of our buffer. But before checking that, 427035d489f9SAlex Elder * make sure the computed size of the snapshot context we 427135d489f9SAlex Elder * allocate is representable in a size_t. 
427235d489f9SAlex Elder */ 427335d489f9SAlex Elder if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context)) 427435d489f9SAlex Elder / sizeof (u64)) { 427535d489f9SAlex Elder ret = -EINVAL; 427635d489f9SAlex Elder goto out; 427735d489f9SAlex Elder } 427835d489f9SAlex Elder if (!ceph_has_room(&p, end, snap_count * sizeof (__le64))) 427935d489f9SAlex Elder goto out; 4280468521c1SAlex Elder ret = 0; 428135d489f9SAlex Elder 4282812164f8SAlex Elder snapc = ceph_create_snap_context(snap_count, GFP_KERNEL); 428335d489f9SAlex Elder if (!snapc) { 428435d489f9SAlex Elder ret = -ENOMEM; 428535d489f9SAlex Elder goto out; 428635d489f9SAlex Elder } 428735d489f9SAlex Elder snapc->seq = seq; 428835d489f9SAlex Elder for (i = 0; i < snap_count; i++) 428935d489f9SAlex Elder snapc->snaps[i] = ceph_decode_64(&p); 429035d489f9SAlex Elder 429149ece554SAlex Elder ceph_put_snap_context(rbd_dev->header.snapc); 429235d489f9SAlex Elder rbd_dev->header.snapc = snapc; 429335d489f9SAlex Elder 429435d489f9SAlex Elder dout(" snap context seq = %llu, snap_count = %u\n", 429535d489f9SAlex Elder (unsigned long long)seq, (unsigned int)snap_count); 429635d489f9SAlex Elder out: 429735d489f9SAlex Elder kfree(reply_buf); 429835d489f9SAlex Elder 429957385b51SAlex Elder return ret; 430035d489f9SAlex Elder } 430135d489f9SAlex Elder 430254cac61fSAlex Elder static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, 430354cac61fSAlex Elder u64 snap_id) 4304b8b1e2dbSAlex Elder { 4305b8b1e2dbSAlex Elder size_t size; 4306b8b1e2dbSAlex Elder void *reply_buf; 430754cac61fSAlex Elder __le64 snapid; 4308b8b1e2dbSAlex Elder int ret; 4309b8b1e2dbSAlex Elder void *p; 4310b8b1e2dbSAlex Elder void *end; 4311b8b1e2dbSAlex Elder char *snap_name; 4312b8b1e2dbSAlex Elder 4313b8b1e2dbSAlex Elder size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN; 4314b8b1e2dbSAlex Elder reply_buf = kmalloc(size, GFP_KERNEL); 4315b8b1e2dbSAlex Elder if (!reply_buf) 4316b8b1e2dbSAlex Elder return ERR_PTR(-ENOMEM); 4317b8b1e2dbSAlex 
Elder 431854cac61fSAlex Elder snapid = cpu_to_le64(snap_id); 431936be9a76SAlex Elder ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, 4320b8b1e2dbSAlex Elder "rbd", "get_snapshot_name", 432154cac61fSAlex Elder &snapid, sizeof (snapid), 4322e2a58ee5SAlex Elder reply_buf, size); 432336be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 4324f40eb349SAlex Elder if (ret < 0) { 4325f40eb349SAlex Elder snap_name = ERR_PTR(ret); 4326b8b1e2dbSAlex Elder goto out; 4327f40eb349SAlex Elder } 4328b8b1e2dbSAlex Elder 4329b8b1e2dbSAlex Elder p = reply_buf; 4330f40eb349SAlex Elder end = reply_buf + ret; 4331e5c35534SAlex Elder snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL); 4332f40eb349SAlex Elder if (IS_ERR(snap_name)) 4333b8b1e2dbSAlex Elder goto out; 4334f40eb349SAlex Elder 4335b8b1e2dbSAlex Elder dout(" snap_id 0x%016llx snap_name = %s\n", 433654cac61fSAlex Elder (unsigned long long)snap_id, snap_name); 4337b8b1e2dbSAlex Elder out: 4338b8b1e2dbSAlex Elder kfree(reply_buf); 4339b8b1e2dbSAlex Elder 4340f40eb349SAlex Elder return snap_name; 4341b8b1e2dbSAlex Elder } 4342b8b1e2dbSAlex Elder 43432df3fac7SAlex Elder static int rbd_dev_v2_header_info(struct rbd_device *rbd_dev) 4344117973fbSAlex Elder { 43452df3fac7SAlex Elder bool first_time = rbd_dev->header.object_prefix == NULL; 4346117973fbSAlex Elder int ret; 4347117973fbSAlex Elder 43481617e40cSJosh Durgin ret = rbd_dev_v2_image_size(rbd_dev); 43491617e40cSJosh Durgin if (ret) 4350cfbf6377SAlex Elder return ret; 43511617e40cSJosh Durgin 43522df3fac7SAlex Elder if (first_time) { 43532df3fac7SAlex Elder ret = rbd_dev_v2_header_onetime(rbd_dev); 43542df3fac7SAlex Elder if (ret) 4355cfbf6377SAlex Elder return ret; 43562df3fac7SAlex Elder } 43572df3fac7SAlex Elder 4358642a2537SAlex Elder /* 4359642a2537SAlex Elder * If the image supports layering, get the parent info. We 4360642a2537SAlex Elder * need to probe the first time regardless. 
Thereafter we 4361642a2537SAlex Elder * only need to if there's a parent, to see if it has 4362642a2537SAlex Elder * disappeared due to the mapped image getting flattened. 4363642a2537SAlex Elder */ 4364642a2537SAlex Elder if (rbd_dev->header.features & RBD_FEATURE_LAYERING && 4365642a2537SAlex Elder (first_time || rbd_dev->parent_spec)) { 4366642a2537SAlex Elder bool warn; 4367642a2537SAlex Elder 4368642a2537SAlex Elder ret = rbd_dev_v2_parent_info(rbd_dev); 4369642a2537SAlex Elder if (ret) 4370cfbf6377SAlex Elder return ret; 4371642a2537SAlex Elder 4372642a2537SAlex Elder /* 4373642a2537SAlex Elder * Print a warning if this is the initial probe and 4374642a2537SAlex Elder * the image has a parent. Don't print it if the 4375642a2537SAlex Elder * image now being probed is itself a parent. We 4376642a2537SAlex Elder * can tell at this point because we won't know its 4377642a2537SAlex Elder * pool name yet (just its pool id). 4378642a2537SAlex Elder */ 4379642a2537SAlex Elder warn = rbd_dev->parent_spec && rbd_dev->spec->pool_name; 4380642a2537SAlex Elder if (first_time && warn) 4381642a2537SAlex Elder rbd_warn(rbd_dev, "WARNING: kernel layering " 4382642a2537SAlex Elder "is EXPERIMENTAL!"); 4383642a2537SAlex Elder } 4384642a2537SAlex Elder 438529334ba4SAlex Elder if (rbd_dev->spec->snap_id == CEPH_NOSNAP) 438629334ba4SAlex Elder if (rbd_dev->mapping.size != rbd_dev->header.image_size) 438729334ba4SAlex Elder rbd_dev->mapping.size = rbd_dev->header.image_size; 4388117973fbSAlex Elder 4389cc4a38bdSAlex Elder ret = rbd_dev_v2_snap_context(rbd_dev); 4390117973fbSAlex Elder dout("rbd_dev_v2_snap_context returned %d\n", ret); 4391117973fbSAlex Elder 4392117973fbSAlex Elder return ret; 4393117973fbSAlex Elder } 4394117973fbSAlex Elder 4395dfc5606dSYehuda Sadeh static int rbd_bus_add_dev(struct rbd_device *rbd_dev) 4396dfc5606dSYehuda Sadeh { 4397dfc5606dSYehuda Sadeh struct device *dev; 4398cd789ab9SAlex Elder int ret; 4399dfc5606dSYehuda Sadeh 4400cd789ab9SAlex Elder dev 
= &rbd_dev->dev; 4401dfc5606dSYehuda Sadeh dev->bus = &rbd_bus_type; 4402dfc5606dSYehuda Sadeh dev->type = &rbd_device_type; 4403dfc5606dSYehuda Sadeh dev->parent = &rbd_root_dev; 4404200a6a8bSAlex Elder dev->release = rbd_dev_device_release; 4405de71a297SAlex Elder dev_set_name(dev, "%d", rbd_dev->dev_id); 4406dfc5606dSYehuda Sadeh ret = device_register(dev); 4407dfc5606dSYehuda Sadeh 4408dfc5606dSYehuda Sadeh return ret; 4409602adf40SYehuda Sadeh } 4410602adf40SYehuda Sadeh 4411dfc5606dSYehuda Sadeh static void rbd_bus_del_dev(struct rbd_device *rbd_dev) 4412dfc5606dSYehuda Sadeh { 4413dfc5606dSYehuda Sadeh device_unregister(&rbd_dev->dev); 4414dfc5606dSYehuda Sadeh } 4415dfc5606dSYehuda Sadeh 44161ddbe94eSAlex Elder /* 4417499afd5bSAlex Elder * Get a unique rbd identifier for the given new rbd_dev, and add 4418f8a22fc2SIlya Dryomov * the rbd_dev to the global list. 44191ddbe94eSAlex Elder */ 4420f8a22fc2SIlya Dryomov static int rbd_dev_id_get(struct rbd_device *rbd_dev) 4421b7f23c36SAlex Elder { 4422f8a22fc2SIlya Dryomov int new_dev_id; 4423f8a22fc2SIlya Dryomov 44249b60e70bSIlya Dryomov new_dev_id = ida_simple_get(&rbd_dev_id_ida, 44259b60e70bSIlya Dryomov 0, minor_to_rbd_dev_id(1 << MINORBITS), 44269b60e70bSIlya Dryomov GFP_KERNEL); 4427f8a22fc2SIlya Dryomov if (new_dev_id < 0) 4428f8a22fc2SIlya Dryomov return new_dev_id; 4429f8a22fc2SIlya Dryomov 4430f8a22fc2SIlya Dryomov rbd_dev->dev_id = new_dev_id; 4431499afd5bSAlex Elder 4432499afd5bSAlex Elder spin_lock(&rbd_dev_list_lock); 4433499afd5bSAlex Elder list_add_tail(&rbd_dev->node, &rbd_dev_list); 4434499afd5bSAlex Elder spin_unlock(&rbd_dev_list_lock); 4435f8a22fc2SIlya Dryomov 443670eebd20SIlya Dryomov dout("rbd_dev %p given dev id %d\n", rbd_dev, rbd_dev->dev_id); 4437f8a22fc2SIlya Dryomov 4438f8a22fc2SIlya Dryomov return 0; 4439b7f23c36SAlex Elder } 4440b7f23c36SAlex Elder 44411ddbe94eSAlex Elder /* 4442499afd5bSAlex Elder * Remove an rbd_dev from the global list, and record that its 4443499afd5bSAlex 
Elder * identifier is no longer in use. 44441ddbe94eSAlex Elder */ 4445e2839308SAlex Elder static void rbd_dev_id_put(struct rbd_device *rbd_dev) 44461ddbe94eSAlex Elder { 4447499afd5bSAlex Elder spin_lock(&rbd_dev_list_lock); 4448499afd5bSAlex Elder list_del_init(&rbd_dev->node); 4449499afd5bSAlex Elder spin_unlock(&rbd_dev_list_lock); 44501ddbe94eSAlex Elder 4451f8a22fc2SIlya Dryomov ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id); 4452f8a22fc2SIlya Dryomov 4453f8a22fc2SIlya Dryomov dout("rbd_dev %p released dev id %d\n", rbd_dev, rbd_dev->dev_id); 4454b7f23c36SAlex Elder } 4455b7f23c36SAlex Elder 4456a725f65eSAlex Elder /* 4457e28fff26SAlex Elder * Skips over white space at *buf, and updates *buf to point to the 4458e28fff26SAlex Elder * first found non-space character (if any). Returns the length of 4459593a9e7bSAlex Elder * the token (string of non-white space characters) found. Note 4460593a9e7bSAlex Elder * that *buf must be terminated with '\0'. 4461e28fff26SAlex Elder */ 4462e28fff26SAlex Elder static inline size_t next_token(const char **buf) 4463e28fff26SAlex Elder { 4464e28fff26SAlex Elder /* 4465e28fff26SAlex Elder * These are the characters that produce nonzero for 4466e28fff26SAlex Elder * isspace() in the "C" and "POSIX" locales. 4467e28fff26SAlex Elder */ 4468e28fff26SAlex Elder const char *spaces = " \f\n\r\t\v"; 4469e28fff26SAlex Elder 4470e28fff26SAlex Elder *buf += strspn(*buf, spaces); /* Find start of token */ 4471e28fff26SAlex Elder 4472e28fff26SAlex Elder return strcspn(*buf, spaces); /* Return token length */ 4473e28fff26SAlex Elder } 4474e28fff26SAlex Elder 4475e28fff26SAlex Elder /* 4476e28fff26SAlex Elder * Finds the next token in *buf, and if the provided token buffer is 4477e28fff26SAlex Elder * big enough, copies the found token into it. The result, if 4478593a9e7bSAlex Elder * copied, is guaranteed to be terminated with '\0'. Note that *buf 4479593a9e7bSAlex Elder * must be terminated with '\0' on entry. 
4480e28fff26SAlex Elder * 4481e28fff26SAlex Elder * Returns the length of the token found (not including the '\0'). 4482e28fff26SAlex Elder * Return value will be 0 if no token is found, and it will be >= 4483e28fff26SAlex Elder * token_size if the token would not fit. 4484e28fff26SAlex Elder * 4485593a9e7bSAlex Elder * The *buf pointer will be updated to point beyond the end of the 4486e28fff26SAlex Elder * found token. Note that this occurs even if the token buffer is 4487e28fff26SAlex Elder * too small to hold it. 4488e28fff26SAlex Elder */ 4489e28fff26SAlex Elder static inline size_t copy_token(const char **buf, 4490e28fff26SAlex Elder char *token, 4491e28fff26SAlex Elder size_t token_size) 4492e28fff26SAlex Elder { 4493e28fff26SAlex Elder size_t len; 4494e28fff26SAlex Elder 4495e28fff26SAlex Elder len = next_token(buf); 4496e28fff26SAlex Elder if (len < token_size) { 4497e28fff26SAlex Elder memcpy(token, *buf, len); 4498e28fff26SAlex Elder *(token + len) = '\0'; 4499e28fff26SAlex Elder } 4500e28fff26SAlex Elder *buf += len; 4501e28fff26SAlex Elder 4502e28fff26SAlex Elder return len; 4503e28fff26SAlex Elder } 4504e28fff26SAlex Elder 4505e28fff26SAlex Elder /* 4506ea3352f4SAlex Elder * Finds the next token in *buf, dynamically allocates a buffer big 4507ea3352f4SAlex Elder * enough to hold a copy of it, and copies the token into the new 4508ea3352f4SAlex Elder * buffer. The copy is guaranteed to be terminated with '\0'. Note 4509ea3352f4SAlex Elder * that a duplicate buffer is created even for a zero-length token. 4510ea3352f4SAlex Elder * 4511ea3352f4SAlex Elder * Returns a pointer to the newly-allocated duplicate, or a null 4512ea3352f4SAlex Elder * pointer if memory for the duplicate was not available. If 4513ea3352f4SAlex Elder * the lenp argument is a non-null pointer, the length of the token 4514ea3352f4SAlex Elder * (not including the '\0') is returned in *lenp. 
4515ea3352f4SAlex Elder * 4516ea3352f4SAlex Elder * If successful, the *buf pointer will be updated to point beyond 4517ea3352f4SAlex Elder * the end of the found token. 4518ea3352f4SAlex Elder * 4519ea3352f4SAlex Elder * Note: uses GFP_KERNEL for allocation. 4520ea3352f4SAlex Elder */ 4521ea3352f4SAlex Elder static inline char *dup_token(const char **buf, size_t *lenp) 4522ea3352f4SAlex Elder { 4523ea3352f4SAlex Elder char *dup; 4524ea3352f4SAlex Elder size_t len; 4525ea3352f4SAlex Elder 4526ea3352f4SAlex Elder len = next_token(buf); 45274caf35f9SAlex Elder dup = kmemdup(*buf, len + 1, GFP_KERNEL); 4528ea3352f4SAlex Elder if (!dup) 4529ea3352f4SAlex Elder return NULL; 4530ea3352f4SAlex Elder *(dup + len) = '\0'; 4531ea3352f4SAlex Elder *buf += len; 4532ea3352f4SAlex Elder 4533ea3352f4SAlex Elder if (lenp) 4534ea3352f4SAlex Elder *lenp = len; 4535ea3352f4SAlex Elder 4536ea3352f4SAlex Elder return dup; 4537ea3352f4SAlex Elder } 4538ea3352f4SAlex Elder 4539ea3352f4SAlex Elder /* 4540859c31dfSAlex Elder * Parse the options provided for an "rbd add" (i.e., rbd image 4541859c31dfSAlex Elder * mapping) request. These arrive via a write to /sys/bus/rbd/add, 4542859c31dfSAlex Elder * and the data written is passed here via a NUL-terminated buffer. 4543859c31dfSAlex Elder * Returns 0 if successful or an error code otherwise. 4544d22f76e7SAlex Elder * 4545859c31dfSAlex Elder * The information extracted from these options is recorded in 4546859c31dfSAlex Elder * the other parameters which return dynamically-allocated 4547859c31dfSAlex Elder * structures: 4548859c31dfSAlex Elder * ceph_opts 4549859c31dfSAlex Elder * The address of a pointer that will refer to a ceph options 4550859c31dfSAlex Elder * structure. Caller must release the returned pointer using 4551859c31dfSAlex Elder * ceph_destroy_options() when it is no longer needed. 4552859c31dfSAlex Elder * rbd_opts 4553859c31dfSAlex Elder * Address of an rbd options pointer. 
 *      Fully initialized by this function; caller must release
 *      with kfree().
 *  rbd_spec
 *      Address of an rbd image specification pointer.  Fully
 *      initialized by this function based on parsed options.
 *      Caller must release with rbd_spec_put().
 *
 * The options passed take this form:
 *  <mon_addrs> <options> <pool_name> <image_name> [<snap_name>]
 * where:
 *  <mon_addrs>
 *      A comma-separated list of one or more monitor addresses.
 *      A monitor address is an ip address, optionally followed
 *      by a port number (separated by a colon).
 *        I.e.:  ip1[:port1][,ip2[:port2]...]
 *  <options>
 *      A comma-separated list of ceph and/or rbd options.
 *  <pool_name>
 *      The name of the rados pool containing the rbd image.
 *  <image_name>
 *      The name of the image in that pool to map.
 *  <snap_name>
 *      An optional snapshot name.  If provided, the mapping will
 *      present data from the image at the time that snapshot was
 *      created.  The image head is used if no snapshot name is
 *      provided.  Snapshot mappings are always read-only.
4579a725f65eSAlex Elder */ 4580859c31dfSAlex Elder static int rbd_add_parse_args(const char *buf, 4581dc79b113SAlex Elder struct ceph_options **ceph_opts, 4582859c31dfSAlex Elder struct rbd_options **opts, 4583859c31dfSAlex Elder struct rbd_spec **rbd_spec) 4584a725f65eSAlex Elder { 4585e28fff26SAlex Elder size_t len; 4586859c31dfSAlex Elder char *options; 45870ddebc0cSAlex Elder const char *mon_addrs; 4588ecb4dc22SAlex Elder char *snap_name; 45890ddebc0cSAlex Elder size_t mon_addrs_size; 4590859c31dfSAlex Elder struct rbd_spec *spec = NULL; 45914e9afebaSAlex Elder struct rbd_options *rbd_opts = NULL; 4592859c31dfSAlex Elder struct ceph_options *copts; 4593dc79b113SAlex Elder int ret; 4594e28fff26SAlex Elder 4595e28fff26SAlex Elder /* The first four tokens are required */ 4596e28fff26SAlex Elder 45977ef3214aSAlex Elder len = next_token(&buf); 45984fb5d671SAlex Elder if (!len) { 45994fb5d671SAlex Elder rbd_warn(NULL, "no monitor address(es) provided"); 46004fb5d671SAlex Elder return -EINVAL; 46014fb5d671SAlex Elder } 46020ddebc0cSAlex Elder mon_addrs = buf; 4603f28e565aSAlex Elder mon_addrs_size = len + 1; 46047ef3214aSAlex Elder buf += len; 4605a725f65eSAlex Elder 4606dc79b113SAlex Elder ret = -EINVAL; 4607f28e565aSAlex Elder options = dup_token(&buf, NULL); 4608f28e565aSAlex Elder if (!options) 4609dc79b113SAlex Elder return -ENOMEM; 46104fb5d671SAlex Elder if (!*options) { 46114fb5d671SAlex Elder rbd_warn(NULL, "no options provided"); 46124fb5d671SAlex Elder goto out_err; 46134fb5d671SAlex Elder } 4614a725f65eSAlex Elder 4615859c31dfSAlex Elder spec = rbd_spec_alloc(); 4616859c31dfSAlex Elder if (!spec) 4617f28e565aSAlex Elder goto out_mem; 4618859c31dfSAlex Elder 4619859c31dfSAlex Elder spec->pool_name = dup_token(&buf, NULL); 4620859c31dfSAlex Elder if (!spec->pool_name) 4621859c31dfSAlex Elder goto out_mem; 46224fb5d671SAlex Elder if (!*spec->pool_name) { 46234fb5d671SAlex Elder rbd_warn(NULL, "no pool name provided"); 46244fb5d671SAlex Elder goto out_err; 
46254fb5d671SAlex Elder } 4626e28fff26SAlex Elder 462769e7a02fSAlex Elder spec->image_name = dup_token(&buf, NULL); 4628859c31dfSAlex Elder if (!spec->image_name) 4629f28e565aSAlex Elder goto out_mem; 46304fb5d671SAlex Elder if (!*spec->image_name) { 46314fb5d671SAlex Elder rbd_warn(NULL, "no image name provided"); 46324fb5d671SAlex Elder goto out_err; 46334fb5d671SAlex Elder } 4634e28fff26SAlex Elder 4635f28e565aSAlex Elder /* 4636f28e565aSAlex Elder * Snapshot name is optional; default is to use "-" 4637f28e565aSAlex Elder * (indicating the head/no snapshot). 4638f28e565aSAlex Elder */ 46393feeb894SAlex Elder len = next_token(&buf); 4640820a5f3eSAlex Elder if (!len) { 46413feeb894SAlex Elder buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */ 46423feeb894SAlex Elder len = sizeof (RBD_SNAP_HEAD_NAME) - 1; 4643f28e565aSAlex Elder } else if (len > RBD_MAX_SNAP_NAME_LEN) { 4644dc79b113SAlex Elder ret = -ENAMETOOLONG; 4645f28e565aSAlex Elder goto out_err; 4646849b4260SAlex Elder } 4647ecb4dc22SAlex Elder snap_name = kmemdup(buf, len + 1, GFP_KERNEL); 4648ecb4dc22SAlex Elder if (!snap_name) 4649f28e565aSAlex Elder goto out_mem; 4650ecb4dc22SAlex Elder *(snap_name + len) = '\0'; 4651ecb4dc22SAlex Elder spec->snap_name = snap_name; 4652e5c35534SAlex Elder 46530ddebc0cSAlex Elder /* Initialize all rbd options to the defaults */ 4654e28fff26SAlex Elder 46554e9afebaSAlex Elder rbd_opts = kzalloc(sizeof (*rbd_opts), GFP_KERNEL); 46564e9afebaSAlex Elder if (!rbd_opts) 46574e9afebaSAlex Elder goto out_mem; 46584e9afebaSAlex Elder 46594e9afebaSAlex Elder rbd_opts->read_only = RBD_READ_ONLY_DEFAULT; 4660d22f76e7SAlex Elder 4661859c31dfSAlex Elder copts = ceph_parse_options(options, mon_addrs, 46620ddebc0cSAlex Elder mon_addrs + mon_addrs_size - 1, 46634e9afebaSAlex Elder parse_rbd_opts_token, rbd_opts); 4664859c31dfSAlex Elder if (IS_ERR(copts)) { 4665859c31dfSAlex Elder ret = PTR_ERR(copts); 4666dc79b113SAlex Elder goto out_err; 4667dc79b113SAlex Elder } 4668859c31dfSAlex 
Elder kfree(options); 4669859c31dfSAlex Elder 4670859c31dfSAlex Elder *ceph_opts = copts; 46714e9afebaSAlex Elder *opts = rbd_opts; 4672859c31dfSAlex Elder *rbd_spec = spec; 46730ddebc0cSAlex Elder 4674dc79b113SAlex Elder return 0; 4675f28e565aSAlex Elder out_mem: 4676dc79b113SAlex Elder ret = -ENOMEM; 4677d22f76e7SAlex Elder out_err: 4678859c31dfSAlex Elder kfree(rbd_opts); 4679859c31dfSAlex Elder rbd_spec_put(spec); 4680f28e565aSAlex Elder kfree(options); 4681d22f76e7SAlex Elder 4682dc79b113SAlex Elder return ret; 4683a725f65eSAlex Elder } 4684a725f65eSAlex Elder 4685589d30e0SAlex Elder /* 468630ba1f02SIlya Dryomov * Return pool id (>= 0) or a negative error code. 468730ba1f02SIlya Dryomov */ 468830ba1f02SIlya Dryomov static int rbd_add_get_pool_id(struct rbd_client *rbdc, const char *pool_name) 468930ba1f02SIlya Dryomov { 469030ba1f02SIlya Dryomov u64 newest_epoch; 469130ba1f02SIlya Dryomov unsigned long timeout = rbdc->client->options->mount_timeout * HZ; 469230ba1f02SIlya Dryomov int tries = 0; 469330ba1f02SIlya Dryomov int ret; 469430ba1f02SIlya Dryomov 469530ba1f02SIlya Dryomov again: 469630ba1f02SIlya Dryomov ret = ceph_pg_poolid_by_name(rbdc->client->osdc.osdmap, pool_name); 469730ba1f02SIlya Dryomov if (ret == -ENOENT && tries++ < 1) { 469830ba1f02SIlya Dryomov ret = ceph_monc_do_get_version(&rbdc->client->monc, "osdmap", 469930ba1f02SIlya Dryomov &newest_epoch); 470030ba1f02SIlya Dryomov if (ret < 0) 470130ba1f02SIlya Dryomov return ret; 470230ba1f02SIlya Dryomov 470330ba1f02SIlya Dryomov if (rbdc->client->osdc.osdmap->epoch < newest_epoch) { 470430ba1f02SIlya Dryomov ceph_monc_request_next_osdmap(&rbdc->client->monc); 470530ba1f02SIlya Dryomov (void) ceph_monc_wait_osdmap(&rbdc->client->monc, 470630ba1f02SIlya Dryomov newest_epoch, timeout); 470730ba1f02SIlya Dryomov goto again; 470830ba1f02SIlya Dryomov } else { 470930ba1f02SIlya Dryomov /* the osdmap we have is new enough */ 471030ba1f02SIlya Dryomov return -ENOENT; 471130ba1f02SIlya Dryomov } 
471230ba1f02SIlya Dryomov } 471330ba1f02SIlya Dryomov 471430ba1f02SIlya Dryomov return ret; 471530ba1f02SIlya Dryomov } 471630ba1f02SIlya Dryomov 471730ba1f02SIlya Dryomov /* 4718589d30e0SAlex Elder * An rbd format 2 image has a unique identifier, distinct from the 4719589d30e0SAlex Elder * name given to it by the user. Internally, that identifier is 4720589d30e0SAlex Elder * what's used to specify the names of objects related to the image. 4721589d30e0SAlex Elder * 4722589d30e0SAlex Elder * A special "rbd id" object is used to map an rbd image name to its 4723589d30e0SAlex Elder * id. If that object doesn't exist, then there is no v2 rbd image 4724589d30e0SAlex Elder * with the supplied name. 4725589d30e0SAlex Elder * 4726589d30e0SAlex Elder * This function will record the given rbd_dev's image_id field if 4727589d30e0SAlex Elder * it can be determined, and in that case will return 0. If any 4728589d30e0SAlex Elder * errors occur a negative errno will be returned and the rbd_dev's 4729589d30e0SAlex Elder * image_id field will be unchanged (and should be NULL). 4730589d30e0SAlex Elder */ 4731589d30e0SAlex Elder static int rbd_dev_image_id(struct rbd_device *rbd_dev) 4732589d30e0SAlex Elder { 4733589d30e0SAlex Elder int ret; 4734589d30e0SAlex Elder size_t size; 4735589d30e0SAlex Elder char *object_name; 4736589d30e0SAlex Elder void *response; 4737c0fba368SAlex Elder char *image_id; 47382f82ee54SAlex Elder 4739589d30e0SAlex Elder /* 47402c0d0a10SAlex Elder * When probing a parent image, the image id is already 47412c0d0a10SAlex Elder * known (and the image name likely is not). There's no 4742c0fba368SAlex Elder * need to fetch the image id again in this case. We 4743c0fba368SAlex Elder * do still need to set the image format though. 47442c0d0a10SAlex Elder */ 4745c0fba368SAlex Elder if (rbd_dev->spec->image_id) { 4746c0fba368SAlex Elder rbd_dev->image_format = *rbd_dev->spec->image_id ? 
2 : 1; 4747c0fba368SAlex Elder 47482c0d0a10SAlex Elder return 0; 4749c0fba368SAlex Elder } 47502c0d0a10SAlex Elder 47512c0d0a10SAlex Elder /* 4752589d30e0SAlex Elder * First, see if the format 2 image id file exists, and if 4753589d30e0SAlex Elder * so, get the image's persistent id from it. 4754589d30e0SAlex Elder */ 475569e7a02fSAlex Elder size = sizeof (RBD_ID_PREFIX) + strlen(rbd_dev->spec->image_name); 4756589d30e0SAlex Elder object_name = kmalloc(size, GFP_NOIO); 4757589d30e0SAlex Elder if (!object_name) 4758589d30e0SAlex Elder return -ENOMEM; 47590d7dbfceSAlex Elder sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->spec->image_name); 4760589d30e0SAlex Elder dout("rbd id object name is %s\n", object_name); 4761589d30e0SAlex Elder 4762589d30e0SAlex Elder /* Response will be an encoded string, which includes a length */ 4763589d30e0SAlex Elder 4764589d30e0SAlex Elder size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX; 4765589d30e0SAlex Elder response = kzalloc(size, GFP_NOIO); 4766589d30e0SAlex Elder if (!response) { 4767589d30e0SAlex Elder ret = -ENOMEM; 4768589d30e0SAlex Elder goto out; 4769589d30e0SAlex Elder } 4770589d30e0SAlex Elder 4771c0fba368SAlex Elder /* If it doesn't exist we'll assume it's a format 1 image */ 4772c0fba368SAlex Elder 477336be9a76SAlex Elder ret = rbd_obj_method_sync(rbd_dev, object_name, 47744157976bSAlex Elder "rbd", "get_id", NULL, 0, 4775e2a58ee5SAlex Elder response, RBD_IMAGE_ID_LEN_MAX); 477636be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 4777c0fba368SAlex Elder if (ret == -ENOENT) { 4778c0fba368SAlex Elder image_id = kstrdup("", GFP_KERNEL); 4779c0fba368SAlex Elder ret = image_id ? 
0 : -ENOMEM; 4780c0fba368SAlex Elder if (!ret) 4781c0fba368SAlex Elder rbd_dev->image_format = 1; 4782c0fba368SAlex Elder } else if (ret > sizeof (__le32)) { 4783c0fba368SAlex Elder void *p = response; 4784589d30e0SAlex Elder 4785c0fba368SAlex Elder image_id = ceph_extract_encoded_string(&p, p + ret, 4786979ed480SAlex Elder NULL, GFP_NOIO); 4787461f758aSDuan Jiong ret = PTR_ERR_OR_ZERO(image_id); 4788c0fba368SAlex Elder if (!ret) 4789c0fba368SAlex Elder rbd_dev->image_format = 2; 4790589d30e0SAlex Elder } else { 4791c0fba368SAlex Elder ret = -EINVAL; 4792c0fba368SAlex Elder } 4793c0fba368SAlex Elder 4794c0fba368SAlex Elder if (!ret) { 4795c0fba368SAlex Elder rbd_dev->spec->image_id = image_id; 4796c0fba368SAlex Elder dout("image_id is %s\n", image_id); 4797589d30e0SAlex Elder } 4798589d30e0SAlex Elder out: 4799589d30e0SAlex Elder kfree(response); 4800589d30e0SAlex Elder kfree(object_name); 4801589d30e0SAlex Elder 4802589d30e0SAlex Elder return ret; 4803589d30e0SAlex Elder } 4804589d30e0SAlex Elder 48053abef3b3SAlex Elder /* 48063abef3b3SAlex Elder * Undo whatever state changes are made by v1 or v2 header info 48073abef3b3SAlex Elder * call. 
48083abef3b3SAlex Elder */ 48096fd48b3bSAlex Elder static void rbd_dev_unprobe(struct rbd_device *rbd_dev) 48106fd48b3bSAlex Elder { 48116fd48b3bSAlex Elder struct rbd_image_header *header; 48126fd48b3bSAlex Elder 4813392a9dadSAlex Elder /* Drop parent reference unless it's already been done (or none) */ 4814392a9dadSAlex Elder 4815392a9dadSAlex Elder if (rbd_dev->parent_overlap) 4816a2acd00eSAlex Elder rbd_dev_parent_put(rbd_dev); 48176fd48b3bSAlex Elder 48186fd48b3bSAlex Elder /* Free dynamic fields from the header, then zero it out */ 48196fd48b3bSAlex Elder 48206fd48b3bSAlex Elder header = &rbd_dev->header; 4821812164f8SAlex Elder ceph_put_snap_context(header->snapc); 48226fd48b3bSAlex Elder kfree(header->snap_sizes); 48236fd48b3bSAlex Elder kfree(header->snap_names); 48246fd48b3bSAlex Elder kfree(header->object_prefix); 48256fd48b3bSAlex Elder memset(header, 0, sizeof (*header)); 48266fd48b3bSAlex Elder } 48276fd48b3bSAlex Elder 48282df3fac7SAlex Elder static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev) 4829a30b71b9SAlex Elder { 4830a30b71b9SAlex Elder int ret; 4831a30b71b9SAlex Elder 48321e130199SAlex Elder ret = rbd_dev_v2_object_prefix(rbd_dev); 483357385b51SAlex Elder if (ret) 48341e130199SAlex Elder goto out_err; 4835b1b5402aSAlex Elder 48362df3fac7SAlex Elder /* 48372df3fac7SAlex Elder * Get the and check features for the image. Currently the 48382df3fac7SAlex Elder * features are assumed to never change. 
48392df3fac7SAlex Elder */ 4840b1b5402aSAlex Elder ret = rbd_dev_v2_features(rbd_dev); 484157385b51SAlex Elder if (ret) 4842b1b5402aSAlex Elder goto out_err; 484335d489f9SAlex Elder 4844cc070d59SAlex Elder /* If the image supports fancy striping, get its parameters */ 4845cc070d59SAlex Elder 4846cc070d59SAlex Elder if (rbd_dev->header.features & RBD_FEATURE_STRIPINGV2) { 4847cc070d59SAlex Elder ret = rbd_dev_v2_striping_info(rbd_dev); 4848cc070d59SAlex Elder if (ret < 0) 4849cc070d59SAlex Elder goto out_err; 4850cc070d59SAlex Elder } 48512df3fac7SAlex Elder /* No support for crypto and compression type format 2 images */ 4852a30b71b9SAlex Elder 485335152979SAlex Elder return 0; 48549d475de5SAlex Elder out_err: 4855642a2537SAlex Elder rbd_dev->header.features = 0; 48561e130199SAlex Elder kfree(rbd_dev->header.object_prefix); 48571e130199SAlex Elder rbd_dev->header.object_prefix = NULL; 48589d475de5SAlex Elder 48599d475de5SAlex Elder return ret; 4860a30b71b9SAlex Elder } 4861a30b71b9SAlex Elder 4862124afba2SAlex Elder static int rbd_dev_probe_parent(struct rbd_device *rbd_dev) 486383a06263SAlex Elder { 48642f82ee54SAlex Elder struct rbd_device *parent = NULL; 4865124afba2SAlex Elder struct rbd_spec *parent_spec; 4866124afba2SAlex Elder struct rbd_client *rbdc; 4867124afba2SAlex Elder int ret; 4868124afba2SAlex Elder 4869124afba2SAlex Elder if (!rbd_dev->parent_spec) 4870124afba2SAlex Elder return 0; 4871124afba2SAlex Elder /* 4872124afba2SAlex Elder * We need to pass a reference to the client and the parent 4873124afba2SAlex Elder * spec when creating the parent rbd_dev. Images related by 4874124afba2SAlex Elder * parent/child relationships always share both. 
4875124afba2SAlex Elder */ 4876124afba2SAlex Elder parent_spec = rbd_spec_get(rbd_dev->parent_spec); 4877124afba2SAlex Elder rbdc = __rbd_get_client(rbd_dev->rbd_client); 4878124afba2SAlex Elder 4879124afba2SAlex Elder ret = -ENOMEM; 4880124afba2SAlex Elder parent = rbd_dev_create(rbdc, parent_spec); 4881124afba2SAlex Elder if (!parent) 4882124afba2SAlex Elder goto out_err; 4883124afba2SAlex Elder 48841f3ef788SAlex Elder ret = rbd_dev_image_probe(parent, false); 4885124afba2SAlex Elder if (ret < 0) 4886124afba2SAlex Elder goto out_err; 4887124afba2SAlex Elder rbd_dev->parent = parent; 4888a2acd00eSAlex Elder atomic_set(&rbd_dev->parent_ref, 1); 4889124afba2SAlex Elder 4890124afba2SAlex Elder return 0; 4891124afba2SAlex Elder out_err: 4892124afba2SAlex Elder if (parent) { 4893fb65d228SAlex Elder rbd_dev_unparent(rbd_dev); 4894124afba2SAlex Elder kfree(rbd_dev->header_name); 4895124afba2SAlex Elder rbd_dev_destroy(parent); 4896124afba2SAlex Elder } else { 4897124afba2SAlex Elder rbd_put_client(rbdc); 4898124afba2SAlex Elder rbd_spec_put(parent_spec); 4899124afba2SAlex Elder } 4900124afba2SAlex Elder 4901124afba2SAlex Elder return ret; 4902124afba2SAlex Elder } 4903124afba2SAlex Elder 4904200a6a8bSAlex Elder static int rbd_dev_device_setup(struct rbd_device *rbd_dev) 4905124afba2SAlex Elder { 490683a06263SAlex Elder int ret; 490783a06263SAlex Elder 4908f8a22fc2SIlya Dryomov /* Get an id and fill in device name. */ 490983a06263SAlex Elder 4910f8a22fc2SIlya Dryomov ret = rbd_dev_id_get(rbd_dev); 4911f8a22fc2SIlya Dryomov if (ret) 4912f8a22fc2SIlya Dryomov return ret; 4913f8a22fc2SIlya Dryomov 491483a06263SAlex Elder BUILD_BUG_ON(DEV_NAME_LEN 491583a06263SAlex Elder < sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH); 491683a06263SAlex Elder sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id); 491783a06263SAlex Elder 49189b60e70bSIlya Dryomov /* Record our major and minor device numbers. 
*/ 491983a06263SAlex Elder 49209b60e70bSIlya Dryomov if (!single_major) { 492183a06263SAlex Elder ret = register_blkdev(0, rbd_dev->name); 492283a06263SAlex Elder if (ret < 0) 492383a06263SAlex Elder goto err_out_id; 49249b60e70bSIlya Dryomov 492583a06263SAlex Elder rbd_dev->major = ret; 4926dd82fff1SIlya Dryomov rbd_dev->minor = 0; 49279b60e70bSIlya Dryomov } else { 49289b60e70bSIlya Dryomov rbd_dev->major = rbd_major; 49299b60e70bSIlya Dryomov rbd_dev->minor = rbd_dev_id_to_minor(rbd_dev->dev_id); 49309b60e70bSIlya Dryomov } 493183a06263SAlex Elder 493283a06263SAlex Elder /* Set up the blkdev mapping. */ 493383a06263SAlex Elder 493483a06263SAlex Elder ret = rbd_init_disk(rbd_dev); 493583a06263SAlex Elder if (ret) 493683a06263SAlex Elder goto err_out_blkdev; 493783a06263SAlex Elder 4938f35a4deeSAlex Elder ret = rbd_dev_mapping_set(rbd_dev); 493983a06263SAlex Elder if (ret) 494083a06263SAlex Elder goto err_out_disk; 4941f35a4deeSAlex Elder set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE); 4942f35a4deeSAlex Elder 4943f35a4deeSAlex Elder ret = rbd_bus_add_dev(rbd_dev); 4944f35a4deeSAlex Elder if (ret) 4945f35a4deeSAlex Elder goto err_out_mapping; 494683a06263SAlex Elder 494783a06263SAlex Elder /* Everything's ready. Announce the disk to the world. 
*/ 494883a06263SAlex Elder 4949129b79d4SAlex Elder set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags); 495083a06263SAlex Elder add_disk(rbd_dev->disk); 495183a06263SAlex Elder 495283a06263SAlex Elder pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name, 495383a06263SAlex Elder (unsigned long long) rbd_dev->mapping.size); 495483a06263SAlex Elder 495583a06263SAlex Elder return ret; 49562f82ee54SAlex Elder 4957f35a4deeSAlex Elder err_out_mapping: 4958f35a4deeSAlex Elder rbd_dev_mapping_clear(rbd_dev); 495983a06263SAlex Elder err_out_disk: 496083a06263SAlex Elder rbd_free_disk(rbd_dev); 496183a06263SAlex Elder err_out_blkdev: 49629b60e70bSIlya Dryomov if (!single_major) 496383a06263SAlex Elder unregister_blkdev(rbd_dev->major, rbd_dev->name); 496483a06263SAlex Elder err_out_id: 496583a06263SAlex Elder rbd_dev_id_put(rbd_dev); 4966d1cf5788SAlex Elder rbd_dev_mapping_clear(rbd_dev); 496783a06263SAlex Elder 496883a06263SAlex Elder return ret; 496983a06263SAlex Elder } 497083a06263SAlex Elder 4971332bb12dSAlex Elder static int rbd_dev_header_name(struct rbd_device *rbd_dev) 4972332bb12dSAlex Elder { 4973332bb12dSAlex Elder struct rbd_spec *spec = rbd_dev->spec; 4974332bb12dSAlex Elder size_t size; 4975332bb12dSAlex Elder 4976332bb12dSAlex Elder /* Record the header object name for this rbd image. 
*/ 4977332bb12dSAlex Elder 4978332bb12dSAlex Elder rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); 4979332bb12dSAlex Elder 4980332bb12dSAlex Elder if (rbd_dev->image_format == 1) 4981332bb12dSAlex Elder size = strlen(spec->image_name) + sizeof (RBD_SUFFIX); 4982332bb12dSAlex Elder else 4983332bb12dSAlex Elder size = sizeof (RBD_HEADER_PREFIX) + strlen(spec->image_id); 4984332bb12dSAlex Elder 4985332bb12dSAlex Elder rbd_dev->header_name = kmalloc(size, GFP_KERNEL); 4986332bb12dSAlex Elder if (!rbd_dev->header_name) 4987332bb12dSAlex Elder return -ENOMEM; 4988332bb12dSAlex Elder 4989332bb12dSAlex Elder if (rbd_dev->image_format == 1) 4990332bb12dSAlex Elder sprintf(rbd_dev->header_name, "%s%s", 4991332bb12dSAlex Elder spec->image_name, RBD_SUFFIX); 4992332bb12dSAlex Elder else 4993332bb12dSAlex Elder sprintf(rbd_dev->header_name, "%s%s", 4994332bb12dSAlex Elder RBD_HEADER_PREFIX, spec->image_id); 4995332bb12dSAlex Elder return 0; 4996332bb12dSAlex Elder } 4997332bb12dSAlex Elder 4998200a6a8bSAlex Elder static void rbd_dev_image_release(struct rbd_device *rbd_dev) 4999200a6a8bSAlex Elder { 50006fd48b3bSAlex Elder rbd_dev_unprobe(rbd_dev); 5001200a6a8bSAlex Elder kfree(rbd_dev->header_name); 50026fd48b3bSAlex Elder rbd_dev->header_name = NULL; 50036fd48b3bSAlex Elder rbd_dev->image_format = 0; 50046fd48b3bSAlex Elder kfree(rbd_dev->spec->image_id); 50056fd48b3bSAlex Elder rbd_dev->spec->image_id = NULL; 50066fd48b3bSAlex Elder 5007200a6a8bSAlex Elder rbd_dev_destroy(rbd_dev); 5008200a6a8bSAlex Elder } 5009200a6a8bSAlex Elder 5010a30b71b9SAlex Elder /* 5011a30b71b9SAlex Elder * Probe for the existence of the header object for the given rbd 50121f3ef788SAlex Elder * device. If this image is the one being mapped (i.e., not a 50131f3ef788SAlex Elder * parent), initiate a watch on its header object before using that 50141f3ef788SAlex Elder * object to get detailed information about the rbd image. 
5015a30b71b9SAlex Elder */ 50161f3ef788SAlex Elder static int rbd_dev_image_probe(struct rbd_device *rbd_dev, bool mapping) 5017a30b71b9SAlex Elder { 5018a30b71b9SAlex Elder int ret; 5019a30b71b9SAlex Elder 5020a30b71b9SAlex Elder /* 50213abef3b3SAlex Elder * Get the id from the image id object. Unless there's an 50223abef3b3SAlex Elder * error, rbd_dev->spec->image_id will be filled in with 50233abef3b3SAlex Elder * a dynamically-allocated string, and rbd_dev->image_format 50243abef3b3SAlex Elder * will be set to either 1 or 2. 5025a30b71b9SAlex Elder */ 5026a30b71b9SAlex Elder ret = rbd_dev_image_id(rbd_dev); 5027a30b71b9SAlex Elder if (ret) 5028c0fba368SAlex Elder return ret; 5029c0fba368SAlex Elder rbd_assert(rbd_dev->spec->image_id); 5030c0fba368SAlex Elder rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); 5031c0fba368SAlex Elder 5032332bb12dSAlex Elder ret = rbd_dev_header_name(rbd_dev); 5033332bb12dSAlex Elder if (ret) 5034332bb12dSAlex Elder goto err_out_format; 5035332bb12dSAlex Elder 50361f3ef788SAlex Elder if (mapping) { 5037fca27065SIlya Dryomov ret = rbd_dev_header_watch_sync(rbd_dev); 5038b644de2bSAlex Elder if (ret) 5039b644de2bSAlex Elder goto out_header_name; 50401f3ef788SAlex Elder } 5041b644de2bSAlex Elder 5042c0fba368SAlex Elder if (rbd_dev->image_format == 1) 504399a41ebcSAlex Elder ret = rbd_dev_v1_header_info(rbd_dev); 5044a30b71b9SAlex Elder else 50452df3fac7SAlex Elder ret = rbd_dev_v2_header_info(rbd_dev); 50465655c4d9SAlex Elder if (ret) 5047b644de2bSAlex Elder goto err_out_watch; 5048a30b71b9SAlex Elder 50499bb81c9bSAlex Elder ret = rbd_dev_spec_update(rbd_dev); 50509bb81c9bSAlex Elder if (ret) 505133dca39fSAlex Elder goto err_out_probe; 50529bb81c9bSAlex Elder 50539bb81c9bSAlex Elder ret = rbd_dev_probe_parent(rbd_dev); 505430d60ba2SAlex Elder if (ret) 505530d60ba2SAlex Elder goto err_out_probe; 505683a06263SAlex Elder 505730d60ba2SAlex Elder dout("discovered format %u image, header name is %s\n", 505830d60ba2SAlex Elder 
rbd_dev->image_format, rbd_dev->header_name); 505930d60ba2SAlex Elder 506030d60ba2SAlex Elder return 0; 50616fd48b3bSAlex Elder err_out_probe: 50626fd48b3bSAlex Elder rbd_dev_unprobe(rbd_dev); 5063b644de2bSAlex Elder err_out_watch: 5064fca27065SIlya Dryomov if (mapping) 5065fca27065SIlya Dryomov rbd_dev_header_unwatch_sync(rbd_dev); 5066332bb12dSAlex Elder out_header_name: 5067332bb12dSAlex Elder kfree(rbd_dev->header_name); 5068332bb12dSAlex Elder rbd_dev->header_name = NULL; 5069332bb12dSAlex Elder err_out_format: 5070332bb12dSAlex Elder rbd_dev->image_format = 0; 50715655c4d9SAlex Elder kfree(rbd_dev->spec->image_id); 50725655c4d9SAlex Elder rbd_dev->spec->image_id = NULL; 50735655c4d9SAlex Elder 50745655c4d9SAlex Elder dout("probe failed, returning %d\n", ret); 50755655c4d9SAlex Elder 50765655c4d9SAlex Elder return ret; 507783a06263SAlex Elder } 507883a06263SAlex Elder 50799b60e70bSIlya Dryomov static ssize_t do_rbd_add(struct bus_type *bus, 508059c2be1eSYehuda Sadeh const char *buf, 508159c2be1eSYehuda Sadeh size_t count) 5082602adf40SYehuda Sadeh { 5083cb8627c7SAlex Elder struct rbd_device *rbd_dev = NULL; 5084dc79b113SAlex Elder struct ceph_options *ceph_opts = NULL; 50854e9afebaSAlex Elder struct rbd_options *rbd_opts = NULL; 5086859c31dfSAlex Elder struct rbd_spec *spec = NULL; 50879d3997fdSAlex Elder struct rbd_client *rbdc; 508851344a38SAlex Elder bool read_only; 508927cc2594SAlex Elder int rc = -ENOMEM; 5090602adf40SYehuda Sadeh 5091602adf40SYehuda Sadeh if (!try_module_get(THIS_MODULE)) 5092602adf40SYehuda Sadeh return -ENODEV; 5093602adf40SYehuda Sadeh 5094a725f65eSAlex Elder /* parse add command */ 5095859c31dfSAlex Elder rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec); 5096dc79b113SAlex Elder if (rc < 0) 5097bd4ba655SAlex Elder goto err_out_module; 509851344a38SAlex Elder read_only = rbd_opts->read_only; 509951344a38SAlex Elder kfree(rbd_opts); 510051344a38SAlex Elder rbd_opts = NULL; /* done with this */ 5101a725f65eSAlex Elder 
51029d3997fdSAlex Elder rbdc = rbd_get_client(ceph_opts); 51039d3997fdSAlex Elder if (IS_ERR(rbdc)) { 51049d3997fdSAlex Elder rc = PTR_ERR(rbdc); 51050ddebc0cSAlex Elder goto err_out_args; 51069d3997fdSAlex Elder } 5107602adf40SYehuda Sadeh 5108602adf40SYehuda Sadeh /* pick the pool */ 510930ba1f02SIlya Dryomov rc = rbd_add_get_pool_id(rbdc, spec->pool_name); 5110602adf40SYehuda Sadeh if (rc < 0) 5111602adf40SYehuda Sadeh goto err_out_client; 5112859c31dfSAlex Elder spec->pool_id = (u64)rc; 5113859c31dfSAlex Elder 51140903e875SAlex Elder /* The ceph file layout needs to fit pool id in 32 bits */ 51150903e875SAlex Elder 5116c0cd10dbSAlex Elder if (spec->pool_id > (u64)U32_MAX) { 5117c0cd10dbSAlex Elder rbd_warn(NULL, "pool id too large (%llu > %u)\n", 5118c0cd10dbSAlex Elder (unsigned long long)spec->pool_id, U32_MAX); 51190903e875SAlex Elder rc = -EIO; 51200903e875SAlex Elder goto err_out_client; 51210903e875SAlex Elder } 51220903e875SAlex Elder 5123c53d5893SAlex Elder rbd_dev = rbd_dev_create(rbdc, spec); 5124bd4ba655SAlex Elder if (!rbd_dev) 5125bd4ba655SAlex Elder goto err_out_client; 5126c53d5893SAlex Elder rbdc = NULL; /* rbd_dev now owns this */ 5127c53d5893SAlex Elder spec = NULL; /* rbd_dev now owns this */ 5128602adf40SYehuda Sadeh 51291f3ef788SAlex Elder rc = rbd_dev_image_probe(rbd_dev, true); 5130a30b71b9SAlex Elder if (rc < 0) 5131c53d5893SAlex Elder goto err_out_rbd_dev; 513205fd6f6fSAlex Elder 51337ce4eef7SAlex Elder /* If we are mapping a snapshot it must be marked read-only */ 51347ce4eef7SAlex Elder 51357ce4eef7SAlex Elder if (rbd_dev->spec->snap_id != CEPH_NOSNAP) 51367ce4eef7SAlex Elder read_only = true; 51377ce4eef7SAlex Elder rbd_dev->mapping.read_only = read_only; 51387ce4eef7SAlex Elder 5139b536f69aSAlex Elder rc = rbd_dev_device_setup(rbd_dev); 51403abef3b3SAlex Elder if (rc) { 5141e37180c0SIlya Dryomov /* 5142e37180c0SIlya Dryomov * rbd_dev_header_unwatch_sync() can't be moved into 5143e37180c0SIlya Dryomov * rbd_dev_image_release() 
without refactoring, see 5144e37180c0SIlya Dryomov * commit 1f3ef78861ac. 5145e37180c0SIlya Dryomov */ 5146e37180c0SIlya Dryomov rbd_dev_header_unwatch_sync(rbd_dev); 51473abef3b3SAlex Elder rbd_dev_image_release(rbd_dev); 51483abef3b3SAlex Elder goto err_out_module; 51493abef3b3SAlex Elder } 51503abef3b3SAlex Elder 5151602adf40SYehuda Sadeh return count; 5152b536f69aSAlex Elder 5153c53d5893SAlex Elder err_out_rbd_dev: 5154c53d5893SAlex Elder rbd_dev_destroy(rbd_dev); 5155bd4ba655SAlex Elder err_out_client: 51569d3997fdSAlex Elder rbd_put_client(rbdc); 51570ddebc0cSAlex Elder err_out_args: 5158859c31dfSAlex Elder rbd_spec_put(spec); 5159bd4ba655SAlex Elder err_out_module: 5160bd4ba655SAlex Elder module_put(THIS_MODULE); 516127cc2594SAlex Elder 5162602adf40SYehuda Sadeh dout("Error adding device %s\n", buf); 516327cc2594SAlex Elder 516427cc2594SAlex Elder return (ssize_t)rc; 5165602adf40SYehuda Sadeh } 5166602adf40SYehuda Sadeh 51679b60e70bSIlya Dryomov static ssize_t rbd_add(struct bus_type *bus, 51689b60e70bSIlya Dryomov const char *buf, 51699b60e70bSIlya Dryomov size_t count) 51709b60e70bSIlya Dryomov { 51719b60e70bSIlya Dryomov if (single_major) 51729b60e70bSIlya Dryomov return -EINVAL; 51739b60e70bSIlya Dryomov 51749b60e70bSIlya Dryomov return do_rbd_add(bus, buf, count); 51759b60e70bSIlya Dryomov } 51769b60e70bSIlya Dryomov 51779b60e70bSIlya Dryomov static ssize_t rbd_add_single_major(struct bus_type *bus, 51789b60e70bSIlya Dryomov const char *buf, 51799b60e70bSIlya Dryomov size_t count) 51809b60e70bSIlya Dryomov { 51819b60e70bSIlya Dryomov return do_rbd_add(bus, buf, count); 51829b60e70bSIlya Dryomov } 51839b60e70bSIlya Dryomov 5184200a6a8bSAlex Elder static void rbd_dev_device_release(struct device *dev) 5185602adf40SYehuda Sadeh { 5186593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 5187602adf40SYehuda Sadeh 5188602adf40SYehuda Sadeh rbd_free_disk(rbd_dev); 5189200a6a8bSAlex Elder clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags); 
51906d80b130SAlex Elder rbd_dev_mapping_clear(rbd_dev); 51919b60e70bSIlya Dryomov if (!single_major) 5192602adf40SYehuda Sadeh unregister_blkdev(rbd_dev->major, rbd_dev->name); 5193e2839308SAlex Elder rbd_dev_id_put(rbd_dev); 5194d1cf5788SAlex Elder rbd_dev_mapping_clear(rbd_dev); 5195602adf40SYehuda Sadeh } 5196602adf40SYehuda Sadeh 519705a46afdSAlex Elder static void rbd_dev_remove_parent(struct rbd_device *rbd_dev) 519805a46afdSAlex Elder { 5199ad945fc1SAlex Elder while (rbd_dev->parent) { 520005a46afdSAlex Elder struct rbd_device *first = rbd_dev; 520105a46afdSAlex Elder struct rbd_device *second = first->parent; 520205a46afdSAlex Elder struct rbd_device *third; 520305a46afdSAlex Elder 520405a46afdSAlex Elder /* 520505a46afdSAlex Elder * Follow to the parent with no grandparent and 520605a46afdSAlex Elder * remove it. 520705a46afdSAlex Elder */ 520805a46afdSAlex Elder while (second && (third = second->parent)) { 520905a46afdSAlex Elder first = second; 521005a46afdSAlex Elder second = third; 521105a46afdSAlex Elder } 5212ad945fc1SAlex Elder rbd_assert(second); 52138ad42cd0SAlex Elder rbd_dev_image_release(second); 5214ad945fc1SAlex Elder first->parent = NULL; 5215ad945fc1SAlex Elder first->parent_overlap = 0; 5216ad945fc1SAlex Elder 5217ad945fc1SAlex Elder rbd_assert(first->parent_spec); 521805a46afdSAlex Elder rbd_spec_put(first->parent_spec); 521905a46afdSAlex Elder first->parent_spec = NULL; 522005a46afdSAlex Elder } 522105a46afdSAlex Elder } 522205a46afdSAlex Elder 52239b60e70bSIlya Dryomov static ssize_t do_rbd_remove(struct bus_type *bus, 5224602adf40SYehuda Sadeh const char *buf, 5225602adf40SYehuda Sadeh size_t count) 5226602adf40SYehuda Sadeh { 5227602adf40SYehuda Sadeh struct rbd_device *rbd_dev = NULL; 5228751cc0e3SAlex Elder struct list_head *tmp; 5229751cc0e3SAlex Elder int dev_id; 5230602adf40SYehuda Sadeh unsigned long ul; 523182a442d2SAlex Elder bool already = false; 52320d8189e1SAlex Elder int ret; 5233602adf40SYehuda Sadeh 5234bb8e0e84SJingoo 
Han ret = kstrtoul(buf, 10, &ul); 52350d8189e1SAlex Elder if (ret) 52360d8189e1SAlex Elder return ret; 5237602adf40SYehuda Sadeh 5238602adf40SYehuda Sadeh /* convert to int; abort if we lost anything in the conversion */ 5239751cc0e3SAlex Elder dev_id = (int)ul; 5240751cc0e3SAlex Elder if (dev_id != ul) 5241602adf40SYehuda Sadeh return -EINVAL; 5242602adf40SYehuda Sadeh 5243602adf40SYehuda Sadeh ret = -ENOENT; 5244751cc0e3SAlex Elder spin_lock(&rbd_dev_list_lock); 5245751cc0e3SAlex Elder list_for_each(tmp, &rbd_dev_list) { 5246751cc0e3SAlex Elder rbd_dev = list_entry(tmp, struct rbd_device, node); 5247751cc0e3SAlex Elder if (rbd_dev->dev_id == dev_id) { 5248751cc0e3SAlex Elder ret = 0; 5249751cc0e3SAlex Elder break; 5250602adf40SYehuda Sadeh } 5251751cc0e3SAlex Elder } 5252751cc0e3SAlex Elder if (!ret) { 5253a14ea269SAlex Elder spin_lock_irq(&rbd_dev->lock); 5254b82d167bSAlex Elder if (rbd_dev->open_count) 525542382b70SAlex Elder ret = -EBUSY; 5256b82d167bSAlex Elder else 525782a442d2SAlex Elder already = test_and_set_bit(RBD_DEV_FLAG_REMOVING, 525882a442d2SAlex Elder &rbd_dev->flags); 5259a14ea269SAlex Elder spin_unlock_irq(&rbd_dev->lock); 5260751cc0e3SAlex Elder } 5261751cc0e3SAlex Elder spin_unlock(&rbd_dev_list_lock); 526282a442d2SAlex Elder if (ret < 0 || already) 52631ba0f1e7SAlex Elder return ret; 5264751cc0e3SAlex Elder 5265fca27065SIlya Dryomov rbd_dev_header_unwatch_sync(rbd_dev); 52669abc5990SJosh Durgin /* 52679abc5990SJosh Durgin * flush remaining watch callbacks - these must be complete 52689abc5990SJosh Durgin * before the osd_client is shutdown 52699abc5990SJosh Durgin */ 52709abc5990SJosh Durgin dout("%s: flushing notifies", __func__); 52719abc5990SJosh Durgin ceph_osdc_flush_notifies(&rbd_dev->rbd_client->client->osdc); 5272fca27065SIlya Dryomov 52739875201eSJosh Durgin /* 52749875201eSJosh Durgin * Don't free anything from rbd_dev->disk until after all 52759875201eSJosh Durgin * notifies are completely processed. 
Otherwise 52769875201eSJosh Durgin * rbd_bus_del_dev() will race with rbd_watch_cb(), resulting 52779875201eSJosh Durgin * in a potential use after free of rbd_dev->disk or rbd_dev. 52789875201eSJosh Durgin */ 52799875201eSJosh Durgin rbd_bus_del_dev(rbd_dev); 52808ad42cd0SAlex Elder rbd_dev_image_release(rbd_dev); 528179ab7558SAlex Elder module_put(THIS_MODULE); 5282aafb230eSAlex Elder 52831ba0f1e7SAlex Elder return count; 5284602adf40SYehuda Sadeh } 5285602adf40SYehuda Sadeh 52869b60e70bSIlya Dryomov static ssize_t rbd_remove(struct bus_type *bus, 52879b60e70bSIlya Dryomov const char *buf, 52889b60e70bSIlya Dryomov size_t count) 52899b60e70bSIlya Dryomov { 52909b60e70bSIlya Dryomov if (single_major) 52919b60e70bSIlya Dryomov return -EINVAL; 52929b60e70bSIlya Dryomov 52939b60e70bSIlya Dryomov return do_rbd_remove(bus, buf, count); 52949b60e70bSIlya Dryomov } 52959b60e70bSIlya Dryomov 52969b60e70bSIlya Dryomov static ssize_t rbd_remove_single_major(struct bus_type *bus, 52979b60e70bSIlya Dryomov const char *buf, 52989b60e70bSIlya Dryomov size_t count) 52999b60e70bSIlya Dryomov { 53009b60e70bSIlya Dryomov return do_rbd_remove(bus, buf, count); 53019b60e70bSIlya Dryomov } 53029b60e70bSIlya Dryomov 5303602adf40SYehuda Sadeh /* 5304602adf40SYehuda Sadeh * create control files in sysfs 5305dfc5606dSYehuda Sadeh * /sys/bus/rbd/... 
5306602adf40SYehuda Sadeh */ 5307602adf40SYehuda Sadeh static int rbd_sysfs_init(void) 5308602adf40SYehuda Sadeh { 5309dfc5606dSYehuda Sadeh int ret; 5310602adf40SYehuda Sadeh 5311fed4c143SAlex Elder ret = device_register(&rbd_root_dev); 5312dfc5606dSYehuda Sadeh if (ret < 0) 5313dfc5606dSYehuda Sadeh return ret; 5314602adf40SYehuda Sadeh 5315fed4c143SAlex Elder ret = bus_register(&rbd_bus_type); 5316fed4c143SAlex Elder if (ret < 0) 5317fed4c143SAlex Elder device_unregister(&rbd_root_dev); 5318602adf40SYehuda Sadeh 5319602adf40SYehuda Sadeh return ret; 5320602adf40SYehuda Sadeh } 5321602adf40SYehuda Sadeh 5322602adf40SYehuda Sadeh static void rbd_sysfs_cleanup(void) 5323602adf40SYehuda Sadeh { 5324dfc5606dSYehuda Sadeh bus_unregister(&rbd_bus_type); 5325fed4c143SAlex Elder device_unregister(&rbd_root_dev); 5326602adf40SYehuda Sadeh } 5327602adf40SYehuda Sadeh 53281c2a9dfeSAlex Elder static int rbd_slab_init(void) 53291c2a9dfeSAlex Elder { 53301c2a9dfeSAlex Elder rbd_assert(!rbd_img_request_cache); 53311c2a9dfeSAlex Elder rbd_img_request_cache = kmem_cache_create("rbd_img_request", 53321c2a9dfeSAlex Elder sizeof (struct rbd_img_request), 53331c2a9dfeSAlex Elder __alignof__(struct rbd_img_request), 53341c2a9dfeSAlex Elder 0, NULL); 5335868311b1SAlex Elder if (!rbd_img_request_cache) 5336868311b1SAlex Elder return -ENOMEM; 5337868311b1SAlex Elder 5338868311b1SAlex Elder rbd_assert(!rbd_obj_request_cache); 5339868311b1SAlex Elder rbd_obj_request_cache = kmem_cache_create("rbd_obj_request", 5340868311b1SAlex Elder sizeof (struct rbd_obj_request), 5341868311b1SAlex Elder __alignof__(struct rbd_obj_request), 5342868311b1SAlex Elder 0, NULL); 534378c2a44aSAlex Elder if (!rbd_obj_request_cache) 534478c2a44aSAlex Elder goto out_err; 534578c2a44aSAlex Elder 534678c2a44aSAlex Elder rbd_assert(!rbd_segment_name_cache); 534778c2a44aSAlex Elder rbd_segment_name_cache = kmem_cache_create("rbd_segment_name", 53482d0ebc5dSIlya Dryomov CEPH_MAX_OID_NAME_LEN + 1, 1, 0, NULL); 
534978c2a44aSAlex Elder if (rbd_segment_name_cache) 53501c2a9dfeSAlex Elder return 0; 535178c2a44aSAlex Elder out_err: 535278c2a44aSAlex Elder if (rbd_obj_request_cache) { 535378c2a44aSAlex Elder kmem_cache_destroy(rbd_obj_request_cache); 535478c2a44aSAlex Elder rbd_obj_request_cache = NULL; 535578c2a44aSAlex Elder } 53561c2a9dfeSAlex Elder 5357868311b1SAlex Elder kmem_cache_destroy(rbd_img_request_cache); 5358868311b1SAlex Elder rbd_img_request_cache = NULL; 5359868311b1SAlex Elder 53601c2a9dfeSAlex Elder return -ENOMEM; 53611c2a9dfeSAlex Elder } 53621c2a9dfeSAlex Elder 53631c2a9dfeSAlex Elder static void rbd_slab_exit(void) 53641c2a9dfeSAlex Elder { 536578c2a44aSAlex Elder rbd_assert(rbd_segment_name_cache); 536678c2a44aSAlex Elder kmem_cache_destroy(rbd_segment_name_cache); 536778c2a44aSAlex Elder rbd_segment_name_cache = NULL; 536878c2a44aSAlex Elder 5369868311b1SAlex Elder rbd_assert(rbd_obj_request_cache); 5370868311b1SAlex Elder kmem_cache_destroy(rbd_obj_request_cache); 5371868311b1SAlex Elder rbd_obj_request_cache = NULL; 5372868311b1SAlex Elder 53731c2a9dfeSAlex Elder rbd_assert(rbd_img_request_cache); 53741c2a9dfeSAlex Elder kmem_cache_destroy(rbd_img_request_cache); 53751c2a9dfeSAlex Elder rbd_img_request_cache = NULL; 53761c2a9dfeSAlex Elder } 53771c2a9dfeSAlex Elder 5378cc344fa1SAlex Elder static int __init rbd_init(void) 5379602adf40SYehuda Sadeh { 5380602adf40SYehuda Sadeh int rc; 5381602adf40SYehuda Sadeh 53821e32d34cSAlex Elder if (!libceph_compatible(NULL)) { 53831e32d34cSAlex Elder rbd_warn(NULL, "libceph incompatibility (quitting)"); 53841e32d34cSAlex Elder return -EINVAL; 53851e32d34cSAlex Elder } 5386e1b4d96dSIlya Dryomov 53871c2a9dfeSAlex Elder rc = rbd_slab_init(); 5388602adf40SYehuda Sadeh if (rc) 5389602adf40SYehuda Sadeh return rc; 5390e1b4d96dSIlya Dryomov 53919b60e70bSIlya Dryomov if (single_major) { 53929b60e70bSIlya Dryomov rbd_major = register_blkdev(0, RBD_DRV_NAME); 53939b60e70bSIlya Dryomov if (rbd_major < 0) { 53949b60e70bSIlya 
Dryomov rc = rbd_major; 53959b60e70bSIlya Dryomov goto err_out_slab; 53969b60e70bSIlya Dryomov } 53979b60e70bSIlya Dryomov } 53989b60e70bSIlya Dryomov 53991c2a9dfeSAlex Elder rc = rbd_sysfs_init(); 54001c2a9dfeSAlex Elder if (rc) 54019b60e70bSIlya Dryomov goto err_out_blkdev; 54021c2a9dfeSAlex Elder 54039b60e70bSIlya Dryomov if (single_major) 54049b60e70bSIlya Dryomov pr_info("loaded (major %d)\n", rbd_major); 54059b60e70bSIlya Dryomov else 5406e1b4d96dSIlya Dryomov pr_info("loaded\n"); 54079b60e70bSIlya Dryomov 5408e1b4d96dSIlya Dryomov return 0; 5409e1b4d96dSIlya Dryomov 54109b60e70bSIlya Dryomov err_out_blkdev: 54119b60e70bSIlya Dryomov if (single_major) 54129b60e70bSIlya Dryomov unregister_blkdev(rbd_major, RBD_DRV_NAME); 5413e1b4d96dSIlya Dryomov err_out_slab: 5414e1b4d96dSIlya Dryomov rbd_slab_exit(); 54151c2a9dfeSAlex Elder return rc; 5416602adf40SYehuda Sadeh } 5417602adf40SYehuda Sadeh 5418cc344fa1SAlex Elder static void __exit rbd_exit(void) 5419602adf40SYehuda Sadeh { 5420602adf40SYehuda Sadeh rbd_sysfs_cleanup(); 54219b60e70bSIlya Dryomov if (single_major) 54229b60e70bSIlya Dryomov unregister_blkdev(rbd_major, RBD_DRV_NAME); 54231c2a9dfeSAlex Elder rbd_slab_exit(); 5424602adf40SYehuda Sadeh } 5425602adf40SYehuda Sadeh 5426602adf40SYehuda Sadeh module_init(rbd_init); 5427602adf40SYehuda Sadeh module_exit(rbd_exit); 5428602adf40SYehuda Sadeh 5429d552c619SAlex Elder MODULE_AUTHOR("Alex Elder <elder@inktank.com>"); 5430602adf40SYehuda Sadeh MODULE_AUTHOR("Sage Weil <sage@newdream.net>"); 5431602adf40SYehuda Sadeh MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>"); 5432602adf40SYehuda Sadeh /* following authorship retained from original osdblk.c */ 5433602adf40SYehuda Sadeh MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>"); 5434602adf40SYehuda Sadeh 543590da258bSIlya Dryomov MODULE_DESCRIPTION("RADOS Block Device (RBD) driver"); 5436602adf40SYehuda Sadeh MODULE_LICENSE("GPL"); 5437