1e2a58ee5SAlex Elder 2602adf40SYehuda Sadeh /* 3602adf40SYehuda Sadeh rbd.c -- Export ceph rados objects as a Linux block device 4602adf40SYehuda Sadeh 5602adf40SYehuda Sadeh 6602adf40SYehuda Sadeh based on drivers/block/osdblk.c: 7602adf40SYehuda Sadeh 8602adf40SYehuda Sadeh Copyright 2009 Red Hat, Inc. 9602adf40SYehuda Sadeh 10602adf40SYehuda Sadeh This program is free software; you can redistribute it and/or modify 11602adf40SYehuda Sadeh it under the terms of the GNU General Public License as published by 12602adf40SYehuda Sadeh the Free Software Foundation. 13602adf40SYehuda Sadeh 14602adf40SYehuda Sadeh This program is distributed in the hope that it will be useful, 15602adf40SYehuda Sadeh but WITHOUT ANY WARRANTY; without even the implied warranty of 16602adf40SYehuda Sadeh MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17602adf40SYehuda Sadeh GNU General Public License for more details. 18602adf40SYehuda Sadeh 19602adf40SYehuda Sadeh You should have received a copy of the GNU General Public License 20602adf40SYehuda Sadeh along with this program; see the file COPYING. If not, write to 21602adf40SYehuda Sadeh the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. 
22602adf40SYehuda Sadeh 23602adf40SYehuda Sadeh 24602adf40SYehuda Sadeh 25dfc5606dSYehuda Sadeh For usage instructions, please refer to: 26602adf40SYehuda Sadeh 27dfc5606dSYehuda Sadeh Documentation/ABI/testing/sysfs-bus-rbd 28602adf40SYehuda Sadeh 29602adf40SYehuda Sadeh */ 30602adf40SYehuda Sadeh 31602adf40SYehuda Sadeh #include <linux/ceph/libceph.h> 32602adf40SYehuda Sadeh #include <linux/ceph/osd_client.h> 33602adf40SYehuda Sadeh #include <linux/ceph/mon_client.h> 34602adf40SYehuda Sadeh #include <linux/ceph/decode.h> 3559c2be1eSYehuda Sadeh #include <linux/parser.h> 3630d1cff8SAlex Elder #include <linux/bsearch.h> 37602adf40SYehuda Sadeh 38602adf40SYehuda Sadeh #include <linux/kernel.h> 39602adf40SYehuda Sadeh #include <linux/device.h> 40602adf40SYehuda Sadeh #include <linux/module.h> 41602adf40SYehuda Sadeh #include <linux/fs.h> 42602adf40SYehuda Sadeh #include <linux/blkdev.h> 431c2a9dfeSAlex Elder #include <linux/slab.h> 44602adf40SYehuda Sadeh 45602adf40SYehuda Sadeh #include "rbd_types.h" 46602adf40SYehuda Sadeh 47aafb230eSAlex Elder #define RBD_DEBUG /* Activate rbd_assert() calls */ 48aafb230eSAlex Elder 49593a9e7bSAlex Elder /* 50593a9e7bSAlex Elder * The basic unit of block I/O is a sector. It is interpreted in a 51593a9e7bSAlex Elder * number of contexts in Linux (blk, bio, genhd), but the default is 52593a9e7bSAlex Elder * universally 512 bytes. These symbols are just slightly more 53593a9e7bSAlex Elder * meaningful than the bare numbers they represent. 
54593a9e7bSAlex Elder */ 55593a9e7bSAlex Elder #define SECTOR_SHIFT 9 56593a9e7bSAlex Elder #define SECTOR_SIZE (1ULL << SECTOR_SHIFT) 57593a9e7bSAlex Elder 58f0f8cef5SAlex Elder #define RBD_DRV_NAME "rbd" 59f0f8cef5SAlex Elder #define RBD_DRV_NAME_LONG "rbd (rados block device)" 60602adf40SYehuda Sadeh 61602adf40SYehuda Sadeh #define RBD_MINORS_PER_MAJOR 256 /* max minors per blkdev */ 62602adf40SYehuda Sadeh 63d4b125e9SAlex Elder #define RBD_SNAP_DEV_NAME_PREFIX "snap_" 64d4b125e9SAlex Elder #define RBD_MAX_SNAP_NAME_LEN \ 65d4b125e9SAlex Elder (NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1)) 66d4b125e9SAlex Elder 6735d489f9SAlex Elder #define RBD_MAX_SNAP_COUNT 510 /* allows max snapc to fit in 4KB */ 68602adf40SYehuda Sadeh 69602adf40SYehuda Sadeh #define RBD_SNAP_HEAD_NAME "-" 70602adf40SYehuda Sadeh 719682fc6dSAlex Elder #define BAD_SNAP_INDEX U32_MAX /* invalid index into snap array */ 729682fc6dSAlex Elder 739e15b77dSAlex Elder /* This allows a single page to hold an image name sent by OSD */ 749e15b77dSAlex Elder #define RBD_IMAGE_NAME_LEN_MAX (PAGE_SIZE - sizeof (__le32) - 1) 75589d30e0SAlex Elder #define RBD_IMAGE_ID_LEN_MAX 64 769e15b77dSAlex Elder 771e130199SAlex Elder #define RBD_OBJ_PREFIX_LEN_MAX 64 78589d30e0SAlex Elder 79d889140cSAlex Elder /* Feature bits */ 80d889140cSAlex Elder 815cbf6f12SAlex Elder #define RBD_FEATURE_LAYERING (1<<0) 825cbf6f12SAlex Elder #define RBD_FEATURE_STRIPINGV2 (1<<1) 835cbf6f12SAlex Elder #define RBD_FEATURES_ALL \ 845cbf6f12SAlex Elder (RBD_FEATURE_LAYERING | RBD_FEATURE_STRIPINGV2) 85d889140cSAlex Elder 86d889140cSAlex Elder /* Features supported by this (client software) implementation. */ 87d889140cSAlex Elder 88770eba6eSAlex Elder #define RBD_FEATURES_SUPPORTED (RBD_FEATURES_ALL) 89d889140cSAlex Elder 9081a89793SAlex Elder /* 9181a89793SAlex Elder * An RBD device name will be "rbd#", where the "rbd" comes from 9281a89793SAlex Elder * RBD_DRV_NAME above, and # is a unique integer identifier. 
9381a89793SAlex Elder * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big 9481a89793SAlex Elder * enough to hold all possible device names. 9581a89793SAlex Elder */ 96602adf40SYehuda Sadeh #define DEV_NAME_LEN 32 9781a89793SAlex Elder #define MAX_INT_FORMAT_WIDTH ((5 * sizeof (int)) / 2 + 1) 98602adf40SYehuda Sadeh 99602adf40SYehuda Sadeh /* 100602adf40SYehuda Sadeh * block device image metadata (in-memory version) 101602adf40SYehuda Sadeh */ 102602adf40SYehuda Sadeh struct rbd_image_header { 103f35a4deeSAlex Elder /* These six fields never change for a given rbd image */ 104849b4260SAlex Elder char *object_prefix; 105602adf40SYehuda Sadeh __u8 obj_order; 106602adf40SYehuda Sadeh __u8 crypt_type; 107602adf40SYehuda Sadeh __u8 comp_type; 108f35a4deeSAlex Elder u64 stripe_unit; 109f35a4deeSAlex Elder u64 stripe_count; 110f35a4deeSAlex Elder u64 features; /* Might be changeable someday? */ 111602adf40SYehuda Sadeh 112f84344f3SAlex Elder /* The remaining fields need to be updated occasionally */ 113f84344f3SAlex Elder u64 image_size; 114f84344f3SAlex Elder struct ceph_snap_context *snapc; 115f35a4deeSAlex Elder char *snap_names; /* format 1 only */ 116f35a4deeSAlex Elder u64 *snap_sizes; /* format 1 only */ 11759c2be1eSYehuda Sadeh }; 11859c2be1eSYehuda Sadeh 1190d7dbfceSAlex Elder /* 1200d7dbfceSAlex Elder * An rbd image specification. 1210d7dbfceSAlex Elder * 1220d7dbfceSAlex Elder * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely 123c66c6e0cSAlex Elder * identify an image. Each rbd_dev structure includes a pointer to 124c66c6e0cSAlex Elder * an rbd_spec structure that encapsulates this identity. 125c66c6e0cSAlex Elder * 126c66c6e0cSAlex Elder * Each of the id's in an rbd_spec has an associated name. For a 127c66c6e0cSAlex Elder * user-mapped image, the names are supplied and the id's associated 128c66c6e0cSAlex Elder * with them are looked up. 
For a layered image, a parent image is 129c66c6e0cSAlex Elder * defined by the tuple, and the names are looked up. 130c66c6e0cSAlex Elder * 131c66c6e0cSAlex Elder * An rbd_dev structure contains a parent_spec pointer which is 132c66c6e0cSAlex Elder * non-null if the image it represents is a child in a layered 133c66c6e0cSAlex Elder * image. This pointer will refer to the rbd_spec structure used 134c66c6e0cSAlex Elder * by the parent rbd_dev for its own identity (i.e., the structure 135c66c6e0cSAlex Elder * is shared between the parent and child). 136c66c6e0cSAlex Elder * 137c66c6e0cSAlex Elder * Since these structures are populated once, during the discovery 138c66c6e0cSAlex Elder * phase of image construction, they are effectively immutable so 139c66c6e0cSAlex Elder * we make no effort to synchronize access to them. 140c66c6e0cSAlex Elder * 141c66c6e0cSAlex Elder * Note that code herein does not assume the image name is known (it 142c66c6e0cSAlex Elder * could be a null pointer). 1430d7dbfceSAlex Elder */ 1440d7dbfceSAlex Elder struct rbd_spec { 1450d7dbfceSAlex Elder u64 pool_id; 146ecb4dc22SAlex Elder const char *pool_name; 1470d7dbfceSAlex Elder 148ecb4dc22SAlex Elder const char *image_id; 149ecb4dc22SAlex Elder const char *image_name; 1500d7dbfceSAlex Elder 1510d7dbfceSAlex Elder u64 snap_id; 152ecb4dc22SAlex Elder const char *snap_name; 1530d7dbfceSAlex Elder 1540d7dbfceSAlex Elder struct kref kref; 1550d7dbfceSAlex Elder }; 1560d7dbfceSAlex Elder 157602adf40SYehuda Sadeh /* 158f0f8cef5SAlex Elder * an instance of the client. multiple devices may share an rbd client. 
159602adf40SYehuda Sadeh */ 160602adf40SYehuda Sadeh struct rbd_client { 161602adf40SYehuda Sadeh struct ceph_client *client; 162602adf40SYehuda Sadeh struct kref kref; 163602adf40SYehuda Sadeh struct list_head node; 164602adf40SYehuda Sadeh }; 165602adf40SYehuda Sadeh 166bf0d5f50SAlex Elder struct rbd_img_request; 167bf0d5f50SAlex Elder typedef void (*rbd_img_callback_t)(struct rbd_img_request *); 168bf0d5f50SAlex Elder 169bf0d5f50SAlex Elder #define BAD_WHICH U32_MAX /* Good which or bad which, which? */ 170bf0d5f50SAlex Elder 171bf0d5f50SAlex Elder struct rbd_obj_request; 172bf0d5f50SAlex Elder typedef void (*rbd_obj_callback_t)(struct rbd_obj_request *); 173bf0d5f50SAlex Elder 1749969ebc5SAlex Elder enum obj_request_type { 1759969ebc5SAlex Elder OBJ_REQUEST_NODATA, OBJ_REQUEST_BIO, OBJ_REQUEST_PAGES 1769969ebc5SAlex Elder }; 177bf0d5f50SAlex Elder 178926f9b3fSAlex Elder enum obj_req_flags { 179926f9b3fSAlex Elder OBJ_REQ_DONE, /* completion flag: not done = 0, done = 1 */ 1806365d33aSAlex Elder OBJ_REQ_IMG_DATA, /* object usage: standalone = 0, image = 1 */ 1815679c59fSAlex Elder OBJ_REQ_KNOWN, /* EXISTS flag valid: no = 0, yes = 1 */ 1825679c59fSAlex Elder OBJ_REQ_EXISTS, /* target exists: no = 0, yes = 1 */ 183926f9b3fSAlex Elder }; 184926f9b3fSAlex Elder 185bf0d5f50SAlex Elder struct rbd_obj_request { 186bf0d5f50SAlex Elder const char *object_name; 187bf0d5f50SAlex Elder u64 offset; /* object start byte */ 188bf0d5f50SAlex Elder u64 length; /* bytes from offset */ 189926f9b3fSAlex Elder unsigned long flags; 190bf0d5f50SAlex Elder 191c5b5ef6cSAlex Elder /* 192c5b5ef6cSAlex Elder * An object request associated with an image will have its 193c5b5ef6cSAlex Elder * img_data flag set; a standalone object request will not. 194c5b5ef6cSAlex Elder * 195c5b5ef6cSAlex Elder * A standalone object request will have which == BAD_WHICH 196c5b5ef6cSAlex Elder * and a null obj_request pointer. 
197c5b5ef6cSAlex Elder * 198c5b5ef6cSAlex Elder * An object request initiated in support of a layered image 199c5b5ef6cSAlex Elder * object (to check for its existence before a write) will 200c5b5ef6cSAlex Elder * have which == BAD_WHICH and a non-null obj_request pointer. 201c5b5ef6cSAlex Elder * 202c5b5ef6cSAlex Elder * Finally, an object request for rbd image data will have 203c5b5ef6cSAlex Elder * which != BAD_WHICH, and will have a non-null img_request 204c5b5ef6cSAlex Elder * pointer. The value of which will be in the range 205c5b5ef6cSAlex Elder * 0..(img_request->obj_request_count-1). 206c5b5ef6cSAlex Elder */ 207c5b5ef6cSAlex Elder union { 208c5b5ef6cSAlex Elder struct rbd_obj_request *obj_request; /* STAT op */ 209c5b5ef6cSAlex Elder struct { 210bf0d5f50SAlex Elder struct rbd_img_request *img_request; 211c5b5ef6cSAlex Elder u64 img_offset; 212c5b5ef6cSAlex Elder /* links for img_request->obj_requests list */ 213c5b5ef6cSAlex Elder struct list_head links; 214c5b5ef6cSAlex Elder }; 215c5b5ef6cSAlex Elder }; 216bf0d5f50SAlex Elder u32 which; /* posn image request list */ 217bf0d5f50SAlex Elder 218bf0d5f50SAlex Elder enum obj_request_type type; 219788e2df3SAlex Elder union { 220bf0d5f50SAlex Elder struct bio *bio_list; 221788e2df3SAlex Elder struct { 222788e2df3SAlex Elder struct page **pages; 223788e2df3SAlex Elder u32 page_count; 224788e2df3SAlex Elder }; 225788e2df3SAlex Elder }; 2260eefd470SAlex Elder struct page **copyup_pages; 227bf0d5f50SAlex Elder 228bf0d5f50SAlex Elder struct ceph_osd_request *osd_req; 229bf0d5f50SAlex Elder 230bf0d5f50SAlex Elder u64 xferred; /* bytes transferred */ 2311b83bef2SSage Weil int result; 232bf0d5f50SAlex Elder 233bf0d5f50SAlex Elder rbd_obj_callback_t callback; 234788e2df3SAlex Elder struct completion completion; 235bf0d5f50SAlex Elder 236bf0d5f50SAlex Elder struct kref kref; 237bf0d5f50SAlex Elder }; 238bf0d5f50SAlex Elder 2390c425248SAlex Elder enum img_req_flags { 2409849e986SAlex Elder IMG_REQ_WRITE, /* I/O 
direction: read = 0, write = 1 */ 2419849e986SAlex Elder IMG_REQ_CHILD, /* initiator: block = 0, child image = 1 */ 242d0b2e944SAlex Elder IMG_REQ_LAYERED, /* ENOENT handling: normal = 0, layered = 1 */ 2430c425248SAlex Elder }; 2440c425248SAlex Elder 245bf0d5f50SAlex Elder struct rbd_img_request { 246bf0d5f50SAlex Elder struct rbd_device *rbd_dev; 247bf0d5f50SAlex Elder u64 offset; /* starting image byte offset */ 248bf0d5f50SAlex Elder u64 length; /* byte count from offset */ 2490c425248SAlex Elder unsigned long flags; 250bf0d5f50SAlex Elder union { 251bf0d5f50SAlex Elder u64 snap_id; /* for reads */ 2529849e986SAlex Elder struct ceph_snap_context *snapc; /* for writes */ 2539849e986SAlex Elder }; 2549849e986SAlex Elder union { 2559849e986SAlex Elder struct request *rq; /* block request */ 2569849e986SAlex Elder struct rbd_obj_request *obj_request; /* obj req initiator */ 257bf0d5f50SAlex Elder }; 2583d7efd18SAlex Elder struct page **copyup_pages; 259bf0d5f50SAlex Elder spinlock_t completion_lock;/* protects next_completion */ 260bf0d5f50SAlex Elder u32 next_completion; 261bf0d5f50SAlex Elder rbd_img_callback_t callback; 26255f27e09SAlex Elder u64 xferred;/* aggregate bytes transferred */ 263a5a337d4SAlex Elder int result; /* first nonzero obj_request result */ 264bf0d5f50SAlex Elder 265bf0d5f50SAlex Elder u32 obj_request_count; 266bf0d5f50SAlex Elder struct list_head obj_requests; /* rbd_obj_request structs */ 267bf0d5f50SAlex Elder 268bf0d5f50SAlex Elder struct kref kref; 269bf0d5f50SAlex Elder }; 270bf0d5f50SAlex Elder 271bf0d5f50SAlex Elder #define for_each_obj_request(ireq, oreq) \ 272ef06f4d3SAlex Elder list_for_each_entry(oreq, &(ireq)->obj_requests, links) 273bf0d5f50SAlex Elder #define for_each_obj_request_from(ireq, oreq) \ 274ef06f4d3SAlex Elder list_for_each_entry_from(oreq, &(ireq)->obj_requests, links) 275bf0d5f50SAlex Elder #define for_each_obj_request_safe(ireq, oreq, n) \ 276ef06f4d3SAlex Elder list_for_each_entry_safe_reverse(oreq, n, 
&(ireq)->obj_requests, links) 277bf0d5f50SAlex Elder 278f84344f3SAlex Elder struct rbd_mapping { 27999c1f08fSAlex Elder u64 size; 28034b13184SAlex Elder u64 features; 281f84344f3SAlex Elder bool read_only; 282f84344f3SAlex Elder }; 283f84344f3SAlex Elder 284602adf40SYehuda Sadeh /* 285602adf40SYehuda Sadeh * a single device 286602adf40SYehuda Sadeh */ 287602adf40SYehuda Sadeh struct rbd_device { 288de71a297SAlex Elder int dev_id; /* blkdev unique id */ 289602adf40SYehuda Sadeh 290602adf40SYehuda Sadeh int major; /* blkdev assigned major */ 291602adf40SYehuda Sadeh struct gendisk *disk; /* blkdev's gendisk and rq */ 292602adf40SYehuda Sadeh 293a30b71b9SAlex Elder u32 image_format; /* Either 1 or 2 */ 294602adf40SYehuda Sadeh struct rbd_client *rbd_client; 295602adf40SYehuda Sadeh 296602adf40SYehuda Sadeh char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */ 297602adf40SYehuda Sadeh 298b82d167bSAlex Elder spinlock_t lock; /* queue, flags, open_count */ 299602adf40SYehuda Sadeh 300602adf40SYehuda Sadeh struct rbd_image_header header; 301b82d167bSAlex Elder unsigned long flags; /* possibly lock protected */ 3020d7dbfceSAlex Elder struct rbd_spec *spec; 303602adf40SYehuda Sadeh 3040d7dbfceSAlex Elder char *header_name; 305971f839aSAlex Elder 3060903e875SAlex Elder struct ceph_file_layout layout; 3070903e875SAlex Elder 30859c2be1eSYehuda Sadeh struct ceph_osd_event *watch_event; 309975241afSAlex Elder struct rbd_obj_request *watch_request; 31059c2be1eSYehuda Sadeh 31186b00e0dSAlex Elder struct rbd_spec *parent_spec; 31286b00e0dSAlex Elder u64 parent_overlap; 3132f82ee54SAlex Elder struct rbd_device *parent; 31486b00e0dSAlex Elder 315c666601aSJosh Durgin /* protects updating the header */ 316c666601aSJosh Durgin struct rw_semaphore header_rwsem; 317f84344f3SAlex Elder 318f84344f3SAlex Elder struct rbd_mapping mapping; 319602adf40SYehuda Sadeh 320602adf40SYehuda Sadeh struct list_head node; 321dfc5606dSYehuda Sadeh 322dfc5606dSYehuda Sadeh /* sysfs related */ 
323dfc5606dSYehuda Sadeh struct device dev; 324b82d167bSAlex Elder unsigned long open_count; /* protected by lock */ 325dfc5606dSYehuda Sadeh }; 326dfc5606dSYehuda Sadeh 327b82d167bSAlex Elder /* 328b82d167bSAlex Elder * Flag bits for rbd_dev->flags. If atomicity is required, 329b82d167bSAlex Elder * rbd_dev->lock is used to protect access. 330b82d167bSAlex Elder * 331b82d167bSAlex Elder * Currently, only the "removing" flag (which is coupled with the 332b82d167bSAlex Elder * "open_count" field) requires atomic access. 333b82d167bSAlex Elder */ 3346d292906SAlex Elder enum rbd_dev_flags { 3356d292906SAlex Elder RBD_DEV_FLAG_EXISTS, /* mapped snapshot has not been deleted */ 336b82d167bSAlex Elder RBD_DEV_FLAG_REMOVING, /* this mapping is being removed */ 3376d292906SAlex Elder }; 3386d292906SAlex Elder 339602adf40SYehuda Sadeh static DEFINE_MUTEX(ctl_mutex); /* Serialize open/close/setup/teardown */ 340e124a82fSAlex Elder 341602adf40SYehuda Sadeh static LIST_HEAD(rbd_dev_list); /* devices */ 342e124a82fSAlex Elder static DEFINE_SPINLOCK(rbd_dev_list_lock); 343e124a82fSAlex Elder 344602adf40SYehuda Sadeh static LIST_HEAD(rbd_client_list); /* clients */ 345432b8587SAlex Elder static DEFINE_SPINLOCK(rbd_client_list_lock); 346602adf40SYehuda Sadeh 34778c2a44aSAlex Elder /* Slab caches for frequently-allocated structures */ 34878c2a44aSAlex Elder 3491c2a9dfeSAlex Elder static struct kmem_cache *rbd_img_request_cache; 350868311b1SAlex Elder static struct kmem_cache *rbd_obj_request_cache; 35178c2a44aSAlex Elder static struct kmem_cache *rbd_segment_name_cache; 3521c2a9dfeSAlex Elder 3533d7efd18SAlex Elder static int rbd_img_request_submit(struct rbd_img_request *img_request); 3543d7efd18SAlex Elder 355200a6a8bSAlex Elder static void rbd_dev_device_release(struct device *dev); 356dfc5606dSYehuda Sadeh 357f0f8cef5SAlex Elder static ssize_t rbd_add(struct bus_type *bus, const char *buf, 358f0f8cef5SAlex Elder size_t count); 359f0f8cef5SAlex Elder static ssize_t 
rbd_remove(struct bus_type *bus, const char *buf, 360f0f8cef5SAlex Elder size_t count); 36151344a38SAlex Elder static int rbd_dev_image_probe(struct rbd_device *rbd_dev, bool read_only); 362f0f8cef5SAlex Elder 363f0f8cef5SAlex Elder static struct bus_attribute rbd_bus_attrs[] = { 364f0f8cef5SAlex Elder __ATTR(add, S_IWUSR, NULL, rbd_add), 365f0f8cef5SAlex Elder __ATTR(remove, S_IWUSR, NULL, rbd_remove), 366f0f8cef5SAlex Elder __ATTR_NULL 367f0f8cef5SAlex Elder }; 368f0f8cef5SAlex Elder 369f0f8cef5SAlex Elder static struct bus_type rbd_bus_type = { 370f0f8cef5SAlex Elder .name = "rbd", 371f0f8cef5SAlex Elder .bus_attrs = rbd_bus_attrs, 372f0f8cef5SAlex Elder }; 373f0f8cef5SAlex Elder 374f0f8cef5SAlex Elder static void rbd_root_dev_release(struct device *dev) 375f0f8cef5SAlex Elder { 376f0f8cef5SAlex Elder } 377f0f8cef5SAlex Elder 378f0f8cef5SAlex Elder static struct device rbd_root_dev = { 379f0f8cef5SAlex Elder .init_name = "rbd", 380f0f8cef5SAlex Elder .release = rbd_root_dev_release, 381f0f8cef5SAlex Elder }; 382f0f8cef5SAlex Elder 38306ecc6cbSAlex Elder static __printf(2, 3) 38406ecc6cbSAlex Elder void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...) 
38506ecc6cbSAlex Elder { 38606ecc6cbSAlex Elder struct va_format vaf; 38706ecc6cbSAlex Elder va_list args; 38806ecc6cbSAlex Elder 38906ecc6cbSAlex Elder va_start(args, fmt); 39006ecc6cbSAlex Elder vaf.fmt = fmt; 39106ecc6cbSAlex Elder vaf.va = &args; 39206ecc6cbSAlex Elder 39306ecc6cbSAlex Elder if (!rbd_dev) 39406ecc6cbSAlex Elder printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf); 39506ecc6cbSAlex Elder else if (rbd_dev->disk) 39606ecc6cbSAlex Elder printk(KERN_WARNING "%s: %s: %pV\n", 39706ecc6cbSAlex Elder RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf); 39806ecc6cbSAlex Elder else if (rbd_dev->spec && rbd_dev->spec->image_name) 39906ecc6cbSAlex Elder printk(KERN_WARNING "%s: image %s: %pV\n", 40006ecc6cbSAlex Elder RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf); 40106ecc6cbSAlex Elder else if (rbd_dev->spec && rbd_dev->spec->image_id) 40206ecc6cbSAlex Elder printk(KERN_WARNING "%s: id %s: %pV\n", 40306ecc6cbSAlex Elder RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf); 40406ecc6cbSAlex Elder else /* punt */ 40506ecc6cbSAlex Elder printk(KERN_WARNING "%s: rbd_dev %p: %pV\n", 40606ecc6cbSAlex Elder RBD_DRV_NAME, rbd_dev, &vaf); 40706ecc6cbSAlex Elder va_end(args); 40806ecc6cbSAlex Elder } 40906ecc6cbSAlex Elder 410aafb230eSAlex Elder #ifdef RBD_DEBUG 411aafb230eSAlex Elder #define rbd_assert(expr) \ 412aafb230eSAlex Elder if (unlikely(!(expr))) { \ 413aafb230eSAlex Elder printk(KERN_ERR "\nAssertion failure in %s() " \ 414aafb230eSAlex Elder "at line %d:\n\n" \ 415aafb230eSAlex Elder "\trbd_assert(%s);\n\n", \ 416aafb230eSAlex Elder __func__, __LINE__, #expr); \ 417aafb230eSAlex Elder BUG(); \ 418aafb230eSAlex Elder } 419aafb230eSAlex Elder #else /* !RBD_DEBUG */ 420aafb230eSAlex Elder # define rbd_assert(expr) ((void) 0) 421aafb230eSAlex Elder #endif /* !RBD_DEBUG */ 422dfc5606dSYehuda Sadeh 423b454e36dSAlex Elder static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request); 42405a46afdSAlex Elder static void rbd_img_parent_read(struct rbd_obj_request 
*obj_request); 42505a46afdSAlex Elder static void rbd_dev_remove_parent(struct rbd_device *rbd_dev); 4268b3e1a56SAlex Elder 427cc4a38bdSAlex Elder static int rbd_dev_refresh(struct rbd_device *rbd_dev); 4282df3fac7SAlex Elder static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev); 4292df3fac7SAlex Elder static int rbd_dev_v2_header_info(struct rbd_device *rbd_dev); 43054cac61fSAlex Elder static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, 43154cac61fSAlex Elder u64 snap_id); 4322ad3d716SAlex Elder static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id, 4332ad3d716SAlex Elder u8 *order, u64 *snap_size); 4342ad3d716SAlex Elder static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id, 4352ad3d716SAlex Elder u64 *snap_features); 4362ad3d716SAlex Elder static u64 rbd_snap_id_by_name(struct rbd_device *rbd_dev, const char *name); 43759c2be1eSYehuda Sadeh 438602adf40SYehuda Sadeh static int rbd_open(struct block_device *bdev, fmode_t mode) 439602adf40SYehuda Sadeh { 440f0f8cef5SAlex Elder struct rbd_device *rbd_dev = bdev->bd_disk->private_data; 441b82d167bSAlex Elder bool removing = false; 442602adf40SYehuda Sadeh 443f84344f3SAlex Elder if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only) 444602adf40SYehuda Sadeh return -EROFS; 445602adf40SYehuda Sadeh 446a14ea269SAlex Elder spin_lock_irq(&rbd_dev->lock); 447b82d167bSAlex Elder if (test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags)) 448b82d167bSAlex Elder removing = true; 449b82d167bSAlex Elder else 450b82d167bSAlex Elder rbd_dev->open_count++; 451a14ea269SAlex Elder spin_unlock_irq(&rbd_dev->lock); 452b82d167bSAlex Elder if (removing) 453b82d167bSAlex Elder return -ENOENT; 454b82d167bSAlex Elder 45542382b70SAlex Elder mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); 456c3e946ceSAlex Elder (void) get_device(&rbd_dev->dev); 457f84344f3SAlex Elder set_device_ro(bdev, rbd_dev->mapping.read_only); 45842382b70SAlex Elder mutex_unlock(&ctl_mutex); 
459340c7a2bSAlex Elder 460602adf40SYehuda Sadeh return 0; 461602adf40SYehuda Sadeh } 462602adf40SYehuda Sadeh 463dfc5606dSYehuda Sadeh static int rbd_release(struct gendisk *disk, fmode_t mode) 464dfc5606dSYehuda Sadeh { 465dfc5606dSYehuda Sadeh struct rbd_device *rbd_dev = disk->private_data; 466b82d167bSAlex Elder unsigned long open_count_before; 467b82d167bSAlex Elder 468a14ea269SAlex Elder spin_lock_irq(&rbd_dev->lock); 469b82d167bSAlex Elder open_count_before = rbd_dev->open_count--; 470a14ea269SAlex Elder spin_unlock_irq(&rbd_dev->lock); 471b82d167bSAlex Elder rbd_assert(open_count_before > 0); 472dfc5606dSYehuda Sadeh 47342382b70SAlex Elder mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); 474c3e946ceSAlex Elder put_device(&rbd_dev->dev); 47542382b70SAlex Elder mutex_unlock(&ctl_mutex); 476dfc5606dSYehuda Sadeh 477dfc5606dSYehuda Sadeh return 0; 478dfc5606dSYehuda Sadeh } 479dfc5606dSYehuda Sadeh 480602adf40SYehuda Sadeh static const struct block_device_operations rbd_bd_ops = { 481602adf40SYehuda Sadeh .owner = THIS_MODULE, 482602adf40SYehuda Sadeh .open = rbd_open, 483dfc5606dSYehuda Sadeh .release = rbd_release, 484602adf40SYehuda Sadeh }; 485602adf40SYehuda Sadeh 486602adf40SYehuda Sadeh /* 487602adf40SYehuda Sadeh * Initialize an rbd client instance. 48843ae4701SAlex Elder * We own *ceph_opts. 
489602adf40SYehuda Sadeh */ 490f8c38929SAlex Elder static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts) 491602adf40SYehuda Sadeh { 492602adf40SYehuda Sadeh struct rbd_client *rbdc; 493602adf40SYehuda Sadeh int ret = -ENOMEM; 494602adf40SYehuda Sadeh 49537206ee5SAlex Elder dout("%s:\n", __func__); 496602adf40SYehuda Sadeh rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL); 497602adf40SYehuda Sadeh if (!rbdc) 498602adf40SYehuda Sadeh goto out_opt; 499602adf40SYehuda Sadeh 500602adf40SYehuda Sadeh kref_init(&rbdc->kref); 501602adf40SYehuda Sadeh INIT_LIST_HEAD(&rbdc->node); 502602adf40SYehuda Sadeh 503bc534d86SAlex Elder mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); 504bc534d86SAlex Elder 50543ae4701SAlex Elder rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0); 506602adf40SYehuda Sadeh if (IS_ERR(rbdc->client)) 507bc534d86SAlex Elder goto out_mutex; 50843ae4701SAlex Elder ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */ 509602adf40SYehuda Sadeh 510602adf40SYehuda Sadeh ret = ceph_open_session(rbdc->client); 511602adf40SYehuda Sadeh if (ret < 0) 512602adf40SYehuda Sadeh goto out_err; 513602adf40SYehuda Sadeh 514432b8587SAlex Elder spin_lock(&rbd_client_list_lock); 515602adf40SYehuda Sadeh list_add_tail(&rbdc->node, &rbd_client_list); 516432b8587SAlex Elder spin_unlock(&rbd_client_list_lock); 517602adf40SYehuda Sadeh 518bc534d86SAlex Elder mutex_unlock(&ctl_mutex); 51937206ee5SAlex Elder dout("%s: rbdc %p\n", __func__, rbdc); 520bc534d86SAlex Elder 521602adf40SYehuda Sadeh return rbdc; 522602adf40SYehuda Sadeh 523602adf40SYehuda Sadeh out_err: 524602adf40SYehuda Sadeh ceph_destroy_client(rbdc->client); 525bc534d86SAlex Elder out_mutex: 526bc534d86SAlex Elder mutex_unlock(&ctl_mutex); 527602adf40SYehuda Sadeh kfree(rbdc); 528602adf40SYehuda Sadeh out_opt: 52943ae4701SAlex Elder if (ceph_opts) 53043ae4701SAlex Elder ceph_destroy_options(ceph_opts); 53137206ee5SAlex Elder dout("%s: error %d\n", __func__, ret); 
53237206ee5SAlex Elder 53328f259b7SVasiliy Kulikov return ERR_PTR(ret); 534602adf40SYehuda Sadeh } 535602adf40SYehuda Sadeh 5362f82ee54SAlex Elder static struct rbd_client *__rbd_get_client(struct rbd_client *rbdc) 5372f82ee54SAlex Elder { 5382f82ee54SAlex Elder kref_get(&rbdc->kref); 5392f82ee54SAlex Elder 5402f82ee54SAlex Elder return rbdc; 5412f82ee54SAlex Elder } 5422f82ee54SAlex Elder 543602adf40SYehuda Sadeh /* 5441f7ba331SAlex Elder * Find a ceph client with specific addr and configuration. If 5451f7ba331SAlex Elder * found, bump its reference count. 546602adf40SYehuda Sadeh */ 5471f7ba331SAlex Elder static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts) 548602adf40SYehuda Sadeh { 549602adf40SYehuda Sadeh struct rbd_client *client_node; 5501f7ba331SAlex Elder bool found = false; 551602adf40SYehuda Sadeh 55243ae4701SAlex Elder if (ceph_opts->flags & CEPH_OPT_NOSHARE) 553602adf40SYehuda Sadeh return NULL; 554602adf40SYehuda Sadeh 5551f7ba331SAlex Elder spin_lock(&rbd_client_list_lock); 5561f7ba331SAlex Elder list_for_each_entry(client_node, &rbd_client_list, node) { 5571f7ba331SAlex Elder if (!ceph_compare_options(ceph_opts, client_node->client)) { 5582f82ee54SAlex Elder __rbd_get_client(client_node); 5592f82ee54SAlex Elder 5601f7ba331SAlex Elder found = true; 5611f7ba331SAlex Elder break; 5621f7ba331SAlex Elder } 5631f7ba331SAlex Elder } 5641f7ba331SAlex Elder spin_unlock(&rbd_client_list_lock); 5651f7ba331SAlex Elder 5661f7ba331SAlex Elder return found ? 
client_node : NULL; 567602adf40SYehuda Sadeh } 568602adf40SYehuda Sadeh 569602adf40SYehuda Sadeh /* 57059c2be1eSYehuda Sadeh * mount options 57159c2be1eSYehuda Sadeh */ 57259c2be1eSYehuda Sadeh enum { 57359c2be1eSYehuda Sadeh Opt_last_int, 57459c2be1eSYehuda Sadeh /* int args above */ 57559c2be1eSYehuda Sadeh Opt_last_string, 57659c2be1eSYehuda Sadeh /* string args above */ 577cc0538b6SAlex Elder Opt_read_only, 578cc0538b6SAlex Elder Opt_read_write, 579cc0538b6SAlex Elder /* Boolean args above */ 580cc0538b6SAlex Elder Opt_last_bool, 58159c2be1eSYehuda Sadeh }; 58259c2be1eSYehuda Sadeh 58343ae4701SAlex Elder static match_table_t rbd_opts_tokens = { 58459c2be1eSYehuda Sadeh /* int args above */ 58559c2be1eSYehuda Sadeh /* string args above */ 586be466c1cSAlex Elder {Opt_read_only, "read_only"}, 587cc0538b6SAlex Elder {Opt_read_only, "ro"}, /* Alternate spelling */ 588cc0538b6SAlex Elder {Opt_read_write, "read_write"}, 589cc0538b6SAlex Elder {Opt_read_write, "rw"}, /* Alternate spelling */ 590cc0538b6SAlex Elder /* Boolean args above */ 59159c2be1eSYehuda Sadeh {-1, NULL} 59259c2be1eSYehuda Sadeh }; 59359c2be1eSYehuda Sadeh 59498571b5aSAlex Elder struct rbd_options { 59598571b5aSAlex Elder bool read_only; 59698571b5aSAlex Elder }; 59798571b5aSAlex Elder 59898571b5aSAlex Elder #define RBD_READ_ONLY_DEFAULT false 59998571b5aSAlex Elder 60059c2be1eSYehuda Sadeh static int parse_rbd_opts_token(char *c, void *private) 60159c2be1eSYehuda Sadeh { 60243ae4701SAlex Elder struct rbd_options *rbd_opts = private; 60359c2be1eSYehuda Sadeh substring_t argstr[MAX_OPT_ARGS]; 60459c2be1eSYehuda Sadeh int token, intval, ret; 60559c2be1eSYehuda Sadeh 60643ae4701SAlex Elder token = match_token(c, rbd_opts_tokens, argstr); 60759c2be1eSYehuda Sadeh if (token < 0) 60859c2be1eSYehuda Sadeh return -EINVAL; 60959c2be1eSYehuda Sadeh 61059c2be1eSYehuda Sadeh if (token < Opt_last_int) { 61159c2be1eSYehuda Sadeh ret = match_int(&argstr[0], &intval); 61259c2be1eSYehuda Sadeh if (ret < 0) { 
61359c2be1eSYehuda Sadeh pr_err("bad mount option arg (not int) " 61459c2be1eSYehuda Sadeh "at '%s'\n", c); 61559c2be1eSYehuda Sadeh return ret; 61659c2be1eSYehuda Sadeh } 61759c2be1eSYehuda Sadeh dout("got int token %d val %d\n", token, intval); 61859c2be1eSYehuda Sadeh } else if (token > Opt_last_int && token < Opt_last_string) { 61959c2be1eSYehuda Sadeh dout("got string token %d val %s\n", token, 62059c2be1eSYehuda Sadeh argstr[0].from); 621cc0538b6SAlex Elder } else if (token > Opt_last_string && token < Opt_last_bool) { 622cc0538b6SAlex Elder dout("got Boolean token %d\n", token); 62359c2be1eSYehuda Sadeh } else { 62459c2be1eSYehuda Sadeh dout("got token %d\n", token); 62559c2be1eSYehuda Sadeh } 62659c2be1eSYehuda Sadeh 62759c2be1eSYehuda Sadeh switch (token) { 628cc0538b6SAlex Elder case Opt_read_only: 629cc0538b6SAlex Elder rbd_opts->read_only = true; 630cc0538b6SAlex Elder break; 631cc0538b6SAlex Elder case Opt_read_write: 632cc0538b6SAlex Elder rbd_opts->read_only = false; 633cc0538b6SAlex Elder break; 63459c2be1eSYehuda Sadeh default: 635aafb230eSAlex Elder rbd_assert(false); 636aafb230eSAlex Elder break; 63759c2be1eSYehuda Sadeh } 63859c2be1eSYehuda Sadeh return 0; 63959c2be1eSYehuda Sadeh } 64059c2be1eSYehuda Sadeh 64159c2be1eSYehuda Sadeh /* 642602adf40SYehuda Sadeh * Get a ceph client with specific addr and configuration, if one does 643602adf40SYehuda Sadeh * not exist create it. 
644602adf40SYehuda Sadeh */ 6459d3997fdSAlex Elder static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts) 646602adf40SYehuda Sadeh { 647f8c38929SAlex Elder struct rbd_client *rbdc; 64859c2be1eSYehuda Sadeh 6491f7ba331SAlex Elder rbdc = rbd_client_find(ceph_opts); 6509d3997fdSAlex Elder if (rbdc) /* using an existing client */ 65143ae4701SAlex Elder ceph_destroy_options(ceph_opts); 6529d3997fdSAlex Elder else 653f8c38929SAlex Elder rbdc = rbd_client_create(ceph_opts); 654d720bcb0SAlex Elder 6559d3997fdSAlex Elder return rbdc; 656602adf40SYehuda Sadeh } 657602adf40SYehuda Sadeh 658602adf40SYehuda Sadeh /* 659602adf40SYehuda Sadeh * Destroy ceph client 660d23a4b3fSAlex Elder * 661432b8587SAlex Elder * Caller must hold rbd_client_list_lock. 662602adf40SYehuda Sadeh */ 663602adf40SYehuda Sadeh static void rbd_client_release(struct kref *kref) 664602adf40SYehuda Sadeh { 665602adf40SYehuda Sadeh struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref); 666602adf40SYehuda Sadeh 66737206ee5SAlex Elder dout("%s: rbdc %p\n", __func__, rbdc); 668cd9d9f5dSAlex Elder spin_lock(&rbd_client_list_lock); 669602adf40SYehuda Sadeh list_del(&rbdc->node); 670cd9d9f5dSAlex Elder spin_unlock(&rbd_client_list_lock); 671602adf40SYehuda Sadeh 672602adf40SYehuda Sadeh ceph_destroy_client(rbdc->client); 673602adf40SYehuda Sadeh kfree(rbdc); 674602adf40SYehuda Sadeh } 675602adf40SYehuda Sadeh 676602adf40SYehuda Sadeh /* 677602adf40SYehuda Sadeh * Drop reference to ceph client node. If it's not referenced anymore, release 678602adf40SYehuda Sadeh * it. 
679602adf40SYehuda Sadeh */ 6809d3997fdSAlex Elder static void rbd_put_client(struct rbd_client *rbdc) 681602adf40SYehuda Sadeh { 682c53d5893SAlex Elder if (rbdc) 6839d3997fdSAlex Elder kref_put(&rbdc->kref, rbd_client_release); 684602adf40SYehuda Sadeh } 685602adf40SYehuda Sadeh 686a30b71b9SAlex Elder static bool rbd_image_format_valid(u32 image_format) 687a30b71b9SAlex Elder { 688a30b71b9SAlex Elder return image_format == 1 || image_format == 2; 689a30b71b9SAlex Elder } 690a30b71b9SAlex Elder 6918e94af8eSAlex Elder static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk) 6928e94af8eSAlex Elder { 693103a150fSAlex Elder size_t size; 694103a150fSAlex Elder u32 snap_count; 695103a150fSAlex Elder 696103a150fSAlex Elder /* The header has to start with the magic rbd header text */ 697103a150fSAlex Elder if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT))) 698103a150fSAlex Elder return false; 699103a150fSAlex Elder 700db2388b6SAlex Elder /* The bio layer requires at least sector-sized I/O */ 701db2388b6SAlex Elder 702db2388b6SAlex Elder if (ondisk->options.order < SECTOR_SHIFT) 703db2388b6SAlex Elder return false; 704db2388b6SAlex Elder 705db2388b6SAlex Elder /* If we use u64 in a few spots we may be able to loosen this */ 706db2388b6SAlex Elder 707db2388b6SAlex Elder if (ondisk->options.order > 8 * sizeof (int) - 1) 708db2388b6SAlex Elder return false; 709db2388b6SAlex Elder 710103a150fSAlex Elder /* 711103a150fSAlex Elder * The size of a snapshot header has to fit in a size_t, and 712103a150fSAlex Elder * that limits the number of snapshots. 
713103a150fSAlex Elder */ 714103a150fSAlex Elder snap_count = le32_to_cpu(ondisk->snap_count); 715103a150fSAlex Elder size = SIZE_MAX - sizeof (struct ceph_snap_context); 716103a150fSAlex Elder if (snap_count > size / sizeof (__le64)) 717103a150fSAlex Elder return false; 718103a150fSAlex Elder 719103a150fSAlex Elder /* 720103a150fSAlex Elder * Not only that, but the size of the entire the snapshot 721103a150fSAlex Elder * header must also be representable in a size_t. 722103a150fSAlex Elder */ 723103a150fSAlex Elder size -= snap_count * sizeof (__le64); 724103a150fSAlex Elder if ((u64) size < le64_to_cpu(ondisk->snap_names_len)) 725103a150fSAlex Elder return false; 726103a150fSAlex Elder 727103a150fSAlex Elder return true; 7288e94af8eSAlex Elder } 7298e94af8eSAlex Elder 730602adf40SYehuda Sadeh /* 731bb23e37aSAlex Elder * Fill an rbd image header with information from the given format 1 732bb23e37aSAlex Elder * on-disk header. 733602adf40SYehuda Sadeh */ 734662518b1SAlex Elder static int rbd_header_from_disk(struct rbd_device *rbd_dev, 7354156d998SAlex Elder struct rbd_image_header_ondisk *ondisk) 736602adf40SYehuda Sadeh { 737662518b1SAlex Elder struct rbd_image_header *header = &rbd_dev->header; 738bb23e37aSAlex Elder bool first_time = header->object_prefix == NULL; 739bb23e37aSAlex Elder struct ceph_snap_context *snapc; 740bb23e37aSAlex Elder char *object_prefix = NULL; 741bb23e37aSAlex Elder char *snap_names = NULL; 742bb23e37aSAlex Elder u64 *snap_sizes = NULL; 743ccece235SAlex Elder u32 snap_count; 744d2bb24e5SAlex Elder size_t size; 745bb23e37aSAlex Elder int ret = -ENOMEM; 746621901d6SAlex Elder u32 i; 747602adf40SYehuda Sadeh 748bb23e37aSAlex Elder /* Allocate this now to avoid having to handle failure below */ 749103a150fSAlex Elder 750bb23e37aSAlex Elder if (first_time) { 751bb23e37aSAlex Elder size_t len; 752bb23e37aSAlex Elder 753bb23e37aSAlex Elder len = strnlen(ondisk->object_prefix, 754bb23e37aSAlex Elder sizeof (ondisk->object_prefix)); 
755bb23e37aSAlex Elder object_prefix = kmalloc(len + 1, GFP_KERNEL); 756bb23e37aSAlex Elder if (!object_prefix) 757602adf40SYehuda Sadeh return -ENOMEM; 758bb23e37aSAlex Elder memcpy(object_prefix, ondisk->object_prefix, len); 759bb23e37aSAlex Elder object_prefix[len] = '\0'; 760bb23e37aSAlex Elder } 76100f1f36fSAlex Elder 762bb23e37aSAlex Elder /* Allocate the snapshot context and fill it in */ 763bb23e37aSAlex Elder 764bb23e37aSAlex Elder snap_count = le32_to_cpu(ondisk->snap_count); 765bb23e37aSAlex Elder snapc = ceph_create_snap_context(snap_count, GFP_KERNEL); 766bb23e37aSAlex Elder if (!snapc) 767bb23e37aSAlex Elder goto out_err; 768bb23e37aSAlex Elder snapc->seq = le64_to_cpu(ondisk->snap_seq); 769602adf40SYehuda Sadeh if (snap_count) { 770bb23e37aSAlex Elder struct rbd_image_snap_ondisk *snaps; 771f785cc1dSAlex Elder u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len); 772f785cc1dSAlex Elder 773bb23e37aSAlex Elder /* We'll keep a copy of the snapshot names... */ 774621901d6SAlex Elder 775f785cc1dSAlex Elder if (snap_names_len > (u64)SIZE_MAX) 776bb23e37aSAlex Elder goto out_2big; 777bb23e37aSAlex Elder snap_names = kmalloc(snap_names_len, GFP_KERNEL); 778bb23e37aSAlex Elder if (!snap_names) 7796a52325fSAlex Elder goto out_err; 780bb23e37aSAlex Elder 781bb23e37aSAlex Elder /* ...as well as the array of their sizes. */ 782bb23e37aSAlex Elder 783bb23e37aSAlex Elder size = snap_count * sizeof (*header->snap_sizes); 784bb23e37aSAlex Elder snap_sizes = kmalloc(size, GFP_KERNEL); 785bb23e37aSAlex Elder if (!snap_sizes) 786bb23e37aSAlex Elder goto out_err; 787bb23e37aSAlex Elder 788f785cc1dSAlex Elder /* 789bb23e37aSAlex Elder * Copy the names, and fill in each snapshot's id 790bb23e37aSAlex Elder * and size. 
791bb23e37aSAlex Elder * 79299a41ebcSAlex Elder * Note that rbd_dev_v1_header_info() guarantees the 793bb23e37aSAlex Elder * ondisk buffer we're working with has 794f785cc1dSAlex Elder * snap_names_len bytes beyond the end of the 795f785cc1dSAlex Elder * snapshot id array, this memcpy() is safe. 796f785cc1dSAlex Elder */ 797bb23e37aSAlex Elder memcpy(snap_names, &ondisk->snaps[snap_count], snap_names_len); 798bb23e37aSAlex Elder snaps = ondisk->snaps; 799bb23e37aSAlex Elder for (i = 0; i < snap_count; i++) { 800bb23e37aSAlex Elder snapc->snaps[i] = le64_to_cpu(snaps[i].id); 801bb23e37aSAlex Elder snap_sizes[i] = le64_to_cpu(snaps[i].image_size); 802bb23e37aSAlex Elder } 803602adf40SYehuda Sadeh } 804849b4260SAlex Elder 805bb23e37aSAlex Elder /* We won't fail any more, fill in the header */ 806bb23e37aSAlex Elder 807662518b1SAlex Elder down_write(&rbd_dev->header_rwsem); 808bb23e37aSAlex Elder if (first_time) { 809bb23e37aSAlex Elder header->object_prefix = object_prefix; 810602adf40SYehuda Sadeh header->obj_order = ondisk->options.order; 811602adf40SYehuda Sadeh header->crypt_type = ondisk->options.crypt_type; 812602adf40SYehuda Sadeh header->comp_type = ondisk->options.comp_type; 813bb23e37aSAlex Elder /* The rest aren't used for format 1 images */ 814bb23e37aSAlex Elder header->stripe_unit = 0; 815bb23e37aSAlex Elder header->stripe_count = 0; 816bb23e37aSAlex Elder header->features = 0; 817662518b1SAlex Elder } else { 818662518b1SAlex Elder ceph_put_snap_context(header->snapc); 819662518b1SAlex Elder kfree(header->snap_names); 820662518b1SAlex Elder kfree(header->snap_sizes); 821bb23e37aSAlex Elder } 8226a52325fSAlex Elder 823bb23e37aSAlex Elder /* The remaining fields always get updated (when we refresh) */ 824621901d6SAlex Elder 825f84344f3SAlex Elder header->image_size = le64_to_cpu(ondisk->image_size); 826bb23e37aSAlex Elder header->snapc = snapc; 827bb23e37aSAlex Elder header->snap_names = snap_names; 828bb23e37aSAlex Elder header->snap_sizes = snap_sizes; 
829602adf40SYehuda Sadeh 830662518b1SAlex Elder /* Make sure mapping size is consistent with header info */ 831662518b1SAlex Elder 832662518b1SAlex Elder if (rbd_dev->spec->snap_id == CEPH_NOSNAP || first_time) 833662518b1SAlex Elder if (rbd_dev->mapping.size != header->image_size) 834662518b1SAlex Elder rbd_dev->mapping.size = header->image_size; 835662518b1SAlex Elder 836662518b1SAlex Elder up_write(&rbd_dev->header_rwsem); 837662518b1SAlex Elder 838602adf40SYehuda Sadeh return 0; 839bb23e37aSAlex Elder out_2big: 840bb23e37aSAlex Elder ret = -EIO; 8416a52325fSAlex Elder out_err: 842bb23e37aSAlex Elder kfree(snap_sizes); 843bb23e37aSAlex Elder kfree(snap_names); 844bb23e37aSAlex Elder ceph_put_snap_context(snapc); 845bb23e37aSAlex Elder kfree(object_prefix); 846ccece235SAlex Elder 847bb23e37aSAlex Elder return ret; 848602adf40SYehuda Sadeh } 849602adf40SYehuda Sadeh 8509682fc6dSAlex Elder static const char *_rbd_dev_v1_snap_name(struct rbd_device *rbd_dev, u32 which) 8519682fc6dSAlex Elder { 8529682fc6dSAlex Elder const char *snap_name; 8539682fc6dSAlex Elder 8549682fc6dSAlex Elder rbd_assert(which < rbd_dev->header.snapc->num_snaps); 8559682fc6dSAlex Elder 8569682fc6dSAlex Elder /* Skip over names until we find the one we are looking for */ 8579682fc6dSAlex Elder 8589682fc6dSAlex Elder snap_name = rbd_dev->header.snap_names; 8599682fc6dSAlex Elder while (which--) 8609682fc6dSAlex Elder snap_name += strlen(snap_name) + 1; 8619682fc6dSAlex Elder 8629682fc6dSAlex Elder return kstrdup(snap_name, GFP_KERNEL); 8639682fc6dSAlex Elder } 8649682fc6dSAlex Elder 86530d1cff8SAlex Elder /* 86630d1cff8SAlex Elder * Snapshot id comparison function for use with qsort()/bsearch(). 86730d1cff8SAlex Elder * Note that result is for snapshots in *descending* order. 
86830d1cff8SAlex Elder */ 86930d1cff8SAlex Elder static int snapid_compare_reverse(const void *s1, const void *s2) 87030d1cff8SAlex Elder { 87130d1cff8SAlex Elder u64 snap_id1 = *(u64 *)s1; 87230d1cff8SAlex Elder u64 snap_id2 = *(u64 *)s2; 87330d1cff8SAlex Elder 87430d1cff8SAlex Elder if (snap_id1 < snap_id2) 87530d1cff8SAlex Elder return 1; 87630d1cff8SAlex Elder return snap_id1 == snap_id2 ? 0 : -1; 87730d1cff8SAlex Elder } 87830d1cff8SAlex Elder 87930d1cff8SAlex Elder /* 88030d1cff8SAlex Elder * Search a snapshot context to see if the given snapshot id is 88130d1cff8SAlex Elder * present. 88230d1cff8SAlex Elder * 88330d1cff8SAlex Elder * Returns the position of the snapshot id in the array if it's found, 88430d1cff8SAlex Elder * or BAD_SNAP_INDEX otherwise. 88530d1cff8SAlex Elder * 88630d1cff8SAlex Elder * Note: The snapshot array is in kept sorted (by the osd) in 88730d1cff8SAlex Elder * reverse order, highest snapshot id first. 88830d1cff8SAlex Elder */ 8899682fc6dSAlex Elder static u32 rbd_dev_snap_index(struct rbd_device *rbd_dev, u64 snap_id) 8909682fc6dSAlex Elder { 8919682fc6dSAlex Elder struct ceph_snap_context *snapc = rbd_dev->header.snapc; 89230d1cff8SAlex Elder u64 *found; 8939682fc6dSAlex Elder 89430d1cff8SAlex Elder found = bsearch(&snap_id, &snapc->snaps, snapc->num_snaps, 89530d1cff8SAlex Elder sizeof (snap_id), snapid_compare_reverse); 8969682fc6dSAlex Elder 89730d1cff8SAlex Elder return found ? 
(u32)(found - &snapc->snaps[0]) : BAD_SNAP_INDEX; 8989682fc6dSAlex Elder } 8999682fc6dSAlex Elder 9002ad3d716SAlex Elder static const char *rbd_dev_v1_snap_name(struct rbd_device *rbd_dev, 9012ad3d716SAlex Elder u64 snap_id) 90254cac61fSAlex Elder { 90354cac61fSAlex Elder u32 which; 90454cac61fSAlex Elder 90554cac61fSAlex Elder which = rbd_dev_snap_index(rbd_dev, snap_id); 90654cac61fSAlex Elder if (which == BAD_SNAP_INDEX) 90754cac61fSAlex Elder return NULL; 90854cac61fSAlex Elder 90954cac61fSAlex Elder return _rbd_dev_v1_snap_name(rbd_dev, which); 91054cac61fSAlex Elder } 91154cac61fSAlex Elder 9129e15b77dSAlex Elder static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id) 9139e15b77dSAlex Elder { 9149e15b77dSAlex Elder if (snap_id == CEPH_NOSNAP) 9159e15b77dSAlex Elder return RBD_SNAP_HEAD_NAME; 9169e15b77dSAlex Elder 91754cac61fSAlex Elder rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); 91854cac61fSAlex Elder if (rbd_dev->image_format == 1) 91954cac61fSAlex Elder return rbd_dev_v1_snap_name(rbd_dev, snap_id); 9209e15b77dSAlex Elder 92154cac61fSAlex Elder return rbd_dev_v2_snap_name(rbd_dev, snap_id); 9229e15b77dSAlex Elder } 9239e15b77dSAlex Elder 9242ad3d716SAlex Elder static int rbd_snap_size(struct rbd_device *rbd_dev, u64 snap_id, 9252ad3d716SAlex Elder u64 *snap_size) 926602adf40SYehuda Sadeh { 9272ad3d716SAlex Elder rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); 9282ad3d716SAlex Elder if (snap_id == CEPH_NOSNAP) { 9292ad3d716SAlex Elder *snap_size = rbd_dev->header.image_size; 9302ad3d716SAlex Elder } else if (rbd_dev->image_format == 1) { 9312ad3d716SAlex Elder u32 which; 93200f1f36fSAlex Elder 9332ad3d716SAlex Elder which = rbd_dev_snap_index(rbd_dev, snap_id); 9342ad3d716SAlex Elder if (which == BAD_SNAP_INDEX) 9352ad3d716SAlex Elder return -ENOENT; 93600f1f36fSAlex Elder 9372ad3d716SAlex Elder *snap_size = rbd_dev->header.snap_sizes[which]; 9382ad3d716SAlex Elder } else { 9392ad3d716SAlex Elder u64 size = 0; 
9402ad3d716SAlex Elder int ret; 9412ad3d716SAlex Elder 9422ad3d716SAlex Elder ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, NULL, &size); 9432ad3d716SAlex Elder if (ret) 9442ad3d716SAlex Elder return ret; 9452ad3d716SAlex Elder 9462ad3d716SAlex Elder *snap_size = size; 9472ad3d716SAlex Elder } 9482ad3d716SAlex Elder return 0; 9492ad3d716SAlex Elder } 9502ad3d716SAlex Elder 9512ad3d716SAlex Elder static int rbd_snap_features(struct rbd_device *rbd_dev, u64 snap_id, 9522ad3d716SAlex Elder u64 *snap_features) 9532ad3d716SAlex Elder { 9542ad3d716SAlex Elder rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); 9552ad3d716SAlex Elder if (snap_id == CEPH_NOSNAP) { 9562ad3d716SAlex Elder *snap_features = rbd_dev->header.features; 9572ad3d716SAlex Elder } else if (rbd_dev->image_format == 1) { 9582ad3d716SAlex Elder *snap_features = 0; /* No features for format 1 */ 9592ad3d716SAlex Elder } else { 9602ad3d716SAlex Elder u64 features = 0; 9612ad3d716SAlex Elder int ret; 9622ad3d716SAlex Elder 9632ad3d716SAlex Elder ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, &features); 9642ad3d716SAlex Elder if (ret) 9652ad3d716SAlex Elder return ret; 9662ad3d716SAlex Elder 9672ad3d716SAlex Elder *snap_features = features; 9682ad3d716SAlex Elder } 9692ad3d716SAlex Elder return 0; 97000f1f36fSAlex Elder } 971602adf40SYehuda Sadeh 972d1cf5788SAlex Elder static int rbd_dev_mapping_set(struct rbd_device *rbd_dev) 973602adf40SYehuda Sadeh { 9748f4b7d98SAlex Elder u64 snap_id = rbd_dev->spec->snap_id; 9752ad3d716SAlex Elder u64 size = 0; 9762ad3d716SAlex Elder u64 features = 0; 9772ad3d716SAlex Elder int ret; 9788b0241f8SAlex Elder 9792ad3d716SAlex Elder ret = rbd_snap_size(rbd_dev, snap_id, &size); 9802ad3d716SAlex Elder if (ret) 9812ad3d716SAlex Elder return ret; 9822ad3d716SAlex Elder ret = rbd_snap_features(rbd_dev, snap_id, &features); 9832ad3d716SAlex Elder if (ret) 9842ad3d716SAlex Elder return ret; 9852ad3d716SAlex Elder 9862ad3d716SAlex Elder rbd_dev->mapping.size = size; 
9872ad3d716SAlex Elder rbd_dev->mapping.features = features; 9882ad3d716SAlex Elder 9898b0241f8SAlex Elder return 0; 990602adf40SYehuda Sadeh } 991602adf40SYehuda Sadeh 992d1cf5788SAlex Elder static void rbd_dev_mapping_clear(struct rbd_device *rbd_dev) 993d1cf5788SAlex Elder { 994d1cf5788SAlex Elder rbd_dev->mapping.size = 0; 995d1cf5788SAlex Elder rbd_dev->mapping.features = 0; 996d1cf5788SAlex Elder } 997d1cf5788SAlex Elder 99898571b5aSAlex Elder static const char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset) 999602adf40SYehuda Sadeh { 100065ccfe21SAlex Elder char *name; 100165ccfe21SAlex Elder u64 segment; 100265ccfe21SAlex Elder int ret; 1003602adf40SYehuda Sadeh 100478c2a44aSAlex Elder name = kmem_cache_alloc(rbd_segment_name_cache, GFP_NOIO); 100565ccfe21SAlex Elder if (!name) 100665ccfe21SAlex Elder return NULL; 100765ccfe21SAlex Elder segment = offset >> rbd_dev->header.obj_order; 10082fd82b9eSAlex Elder ret = snprintf(name, MAX_OBJ_NAME_SIZE + 1, "%s.%012llx", 100965ccfe21SAlex Elder rbd_dev->header.object_prefix, segment); 10102fd82b9eSAlex Elder if (ret < 0 || ret > MAX_OBJ_NAME_SIZE) { 101165ccfe21SAlex Elder pr_err("error formatting segment name for #%llu (%d)\n", 101265ccfe21SAlex Elder segment, ret); 101365ccfe21SAlex Elder kfree(name); 101465ccfe21SAlex Elder name = NULL; 101565ccfe21SAlex Elder } 1016602adf40SYehuda Sadeh 101765ccfe21SAlex Elder return name; 101865ccfe21SAlex Elder } 1019602adf40SYehuda Sadeh 102078c2a44aSAlex Elder static void rbd_segment_name_free(const char *name) 102178c2a44aSAlex Elder { 102278c2a44aSAlex Elder /* The explicit cast here is needed to drop the const qualifier */ 102378c2a44aSAlex Elder 102478c2a44aSAlex Elder kmem_cache_free(rbd_segment_name_cache, (void *)name); 102578c2a44aSAlex Elder } 102678c2a44aSAlex Elder 102765ccfe21SAlex Elder static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset) 102865ccfe21SAlex Elder { 102965ccfe21SAlex Elder u64 segment_size = (u64) 1 << 
rbd_dev->header.obj_order; 1030602adf40SYehuda Sadeh 103165ccfe21SAlex Elder return offset & (segment_size - 1); 103265ccfe21SAlex Elder } 103365ccfe21SAlex Elder 103465ccfe21SAlex Elder static u64 rbd_segment_length(struct rbd_device *rbd_dev, 103565ccfe21SAlex Elder u64 offset, u64 length) 103665ccfe21SAlex Elder { 103765ccfe21SAlex Elder u64 segment_size = (u64) 1 << rbd_dev->header.obj_order; 103865ccfe21SAlex Elder 103965ccfe21SAlex Elder offset &= segment_size - 1; 104065ccfe21SAlex Elder 1041aafb230eSAlex Elder rbd_assert(length <= U64_MAX - offset); 104265ccfe21SAlex Elder if (offset + length > segment_size) 104365ccfe21SAlex Elder length = segment_size - offset; 104465ccfe21SAlex Elder 104565ccfe21SAlex Elder return length; 1046602adf40SYehuda Sadeh } 1047602adf40SYehuda Sadeh 1048602adf40SYehuda Sadeh /* 1049029bcbd8SJosh Durgin * returns the size of an object in the image 1050029bcbd8SJosh Durgin */ 1051029bcbd8SJosh Durgin static u64 rbd_obj_bytes(struct rbd_image_header *header) 1052029bcbd8SJosh Durgin { 1053029bcbd8SJosh Durgin return 1 << header->obj_order; 1054029bcbd8SJosh Durgin } 1055029bcbd8SJosh Durgin 1056029bcbd8SJosh Durgin /* 1057602adf40SYehuda Sadeh * bio helpers 1058602adf40SYehuda Sadeh */ 1059602adf40SYehuda Sadeh 1060602adf40SYehuda Sadeh static void bio_chain_put(struct bio *chain) 1061602adf40SYehuda Sadeh { 1062602adf40SYehuda Sadeh struct bio *tmp; 1063602adf40SYehuda Sadeh 1064602adf40SYehuda Sadeh while (chain) { 1065602adf40SYehuda Sadeh tmp = chain; 1066602adf40SYehuda Sadeh chain = chain->bi_next; 1067602adf40SYehuda Sadeh bio_put(tmp); 1068602adf40SYehuda Sadeh } 1069602adf40SYehuda Sadeh } 1070602adf40SYehuda Sadeh 1071602adf40SYehuda Sadeh /* 1072602adf40SYehuda Sadeh * zeros a bio chain, starting at specific offset 1073602adf40SYehuda Sadeh */ 1074602adf40SYehuda Sadeh static void zero_bio_chain(struct bio *chain, int start_ofs) 1075602adf40SYehuda Sadeh { 1076602adf40SYehuda Sadeh struct bio_vec *bv; 1077602adf40SYehuda 
Sadeh unsigned long flags; 1078602adf40SYehuda Sadeh void *buf; 1079602adf40SYehuda Sadeh int i; 1080602adf40SYehuda Sadeh int pos = 0; 1081602adf40SYehuda Sadeh 1082602adf40SYehuda Sadeh while (chain) { 1083602adf40SYehuda Sadeh bio_for_each_segment(bv, chain, i) { 1084602adf40SYehuda Sadeh if (pos + bv->bv_len > start_ofs) { 1085602adf40SYehuda Sadeh int remainder = max(start_ofs - pos, 0); 1086602adf40SYehuda Sadeh buf = bvec_kmap_irq(bv, &flags); 1087602adf40SYehuda Sadeh memset(buf + remainder, 0, 1088602adf40SYehuda Sadeh bv->bv_len - remainder); 108985b5aaa6SDan Carpenter bvec_kunmap_irq(buf, &flags); 1090602adf40SYehuda Sadeh } 1091602adf40SYehuda Sadeh pos += bv->bv_len; 1092602adf40SYehuda Sadeh } 1093602adf40SYehuda Sadeh 1094602adf40SYehuda Sadeh chain = chain->bi_next; 1095602adf40SYehuda Sadeh } 1096602adf40SYehuda Sadeh } 1097602adf40SYehuda Sadeh 1098602adf40SYehuda Sadeh /* 1099b9434c5bSAlex Elder * similar to zero_bio_chain(), zeros data defined by a page array, 1100b9434c5bSAlex Elder * starting at the given byte offset from the start of the array and 1101b9434c5bSAlex Elder * continuing up to the given end offset. The pages array is 1102b9434c5bSAlex Elder * assumed to be big enough to hold all bytes up to the end. 
1103b9434c5bSAlex Elder */ 1104b9434c5bSAlex Elder static void zero_pages(struct page **pages, u64 offset, u64 end) 1105b9434c5bSAlex Elder { 1106b9434c5bSAlex Elder struct page **page = &pages[offset >> PAGE_SHIFT]; 1107b9434c5bSAlex Elder 1108b9434c5bSAlex Elder rbd_assert(end > offset); 1109b9434c5bSAlex Elder rbd_assert(end - offset <= (u64)SIZE_MAX); 1110b9434c5bSAlex Elder while (offset < end) { 1111b9434c5bSAlex Elder size_t page_offset; 1112b9434c5bSAlex Elder size_t length; 1113b9434c5bSAlex Elder unsigned long flags; 1114b9434c5bSAlex Elder void *kaddr; 1115b9434c5bSAlex Elder 1116b9434c5bSAlex Elder page_offset = (size_t)(offset & ~PAGE_MASK); 1117b9434c5bSAlex Elder length = min(PAGE_SIZE - page_offset, (size_t)(end - offset)); 1118b9434c5bSAlex Elder local_irq_save(flags); 1119b9434c5bSAlex Elder kaddr = kmap_atomic(*page); 1120b9434c5bSAlex Elder memset(kaddr + page_offset, 0, length); 1121b9434c5bSAlex Elder kunmap_atomic(kaddr); 1122b9434c5bSAlex Elder local_irq_restore(flags); 1123b9434c5bSAlex Elder 1124b9434c5bSAlex Elder offset += length; 1125b9434c5bSAlex Elder page++; 1126b9434c5bSAlex Elder } 1127b9434c5bSAlex Elder } 1128b9434c5bSAlex Elder 1129b9434c5bSAlex Elder /* 1130f7760dadSAlex Elder * Clone a portion of a bio, starting at the given byte offset 1131f7760dadSAlex Elder * and continuing for the number of bytes indicated. 
1132602adf40SYehuda Sadeh */ 1133f7760dadSAlex Elder static struct bio *bio_clone_range(struct bio *bio_src, 1134f7760dadSAlex Elder unsigned int offset, 1135f7760dadSAlex Elder unsigned int len, 1136f7760dadSAlex Elder gfp_t gfpmask) 1137602adf40SYehuda Sadeh { 1138f7760dadSAlex Elder struct bio_vec *bv; 1139f7760dadSAlex Elder unsigned int resid; 1140f7760dadSAlex Elder unsigned short idx; 1141f7760dadSAlex Elder unsigned int voff; 1142f7760dadSAlex Elder unsigned short end_idx; 1143f7760dadSAlex Elder unsigned short vcnt; 1144f7760dadSAlex Elder struct bio *bio; 1145602adf40SYehuda Sadeh 1146f7760dadSAlex Elder /* Handle the easy case for the caller */ 1147f7760dadSAlex Elder 1148f7760dadSAlex Elder if (!offset && len == bio_src->bi_size) 1149f7760dadSAlex Elder return bio_clone(bio_src, gfpmask); 1150f7760dadSAlex Elder 1151f7760dadSAlex Elder if (WARN_ON_ONCE(!len)) 1152f7760dadSAlex Elder return NULL; 1153f7760dadSAlex Elder if (WARN_ON_ONCE(len > bio_src->bi_size)) 1154f7760dadSAlex Elder return NULL; 1155f7760dadSAlex Elder if (WARN_ON_ONCE(offset > bio_src->bi_size - len)) 1156f7760dadSAlex Elder return NULL; 1157f7760dadSAlex Elder 1158f7760dadSAlex Elder /* Find first affected segment... 
*/ 1159f7760dadSAlex Elder 1160f7760dadSAlex Elder resid = offset; 1161f7760dadSAlex Elder __bio_for_each_segment(bv, bio_src, idx, 0) { 1162f7760dadSAlex Elder if (resid < bv->bv_len) 1163f7760dadSAlex Elder break; 1164f7760dadSAlex Elder resid -= bv->bv_len; 1165602adf40SYehuda Sadeh } 1166f7760dadSAlex Elder voff = resid; 1167602adf40SYehuda Sadeh 1168f7760dadSAlex Elder /* ...and the last affected segment */ 1169542582fcSAlex Elder 1170f7760dadSAlex Elder resid += len; 1171f7760dadSAlex Elder __bio_for_each_segment(bv, bio_src, end_idx, idx) { 1172f7760dadSAlex Elder if (resid <= bv->bv_len) 1173f7760dadSAlex Elder break; 1174f7760dadSAlex Elder resid -= bv->bv_len; 1175f7760dadSAlex Elder } 1176f7760dadSAlex Elder vcnt = end_idx - idx + 1; 1177602adf40SYehuda Sadeh 1178f7760dadSAlex Elder /* Build the clone */ 1179f7760dadSAlex Elder 1180f7760dadSAlex Elder bio = bio_alloc(gfpmask, (unsigned int) vcnt); 1181f7760dadSAlex Elder if (!bio) 1182f7760dadSAlex Elder return NULL; /* ENOMEM */ 1183f7760dadSAlex Elder 1184f7760dadSAlex Elder bio->bi_bdev = bio_src->bi_bdev; 1185f7760dadSAlex Elder bio->bi_sector = bio_src->bi_sector + (offset >> SECTOR_SHIFT); 1186f7760dadSAlex Elder bio->bi_rw = bio_src->bi_rw; 1187f7760dadSAlex Elder bio->bi_flags |= 1 << BIO_CLONED; 1188602adf40SYehuda Sadeh 1189602adf40SYehuda Sadeh /* 1190f7760dadSAlex Elder * Copy over our part of the bio_vec, then update the first 1191f7760dadSAlex Elder * and last (or only) entries. 
1192602adf40SYehuda Sadeh */ 1193f7760dadSAlex Elder memcpy(&bio->bi_io_vec[0], &bio_src->bi_io_vec[idx], 1194f7760dadSAlex Elder vcnt * sizeof (struct bio_vec)); 1195f7760dadSAlex Elder bio->bi_io_vec[0].bv_offset += voff; 1196f7760dadSAlex Elder if (vcnt > 1) { 1197f7760dadSAlex Elder bio->bi_io_vec[0].bv_len -= voff; 1198f7760dadSAlex Elder bio->bi_io_vec[vcnt - 1].bv_len = resid; 1199602adf40SYehuda Sadeh } else { 1200f7760dadSAlex Elder bio->bi_io_vec[0].bv_len = len; 1201602adf40SYehuda Sadeh } 1202602adf40SYehuda Sadeh 1203f7760dadSAlex Elder bio->bi_vcnt = vcnt; 1204f7760dadSAlex Elder bio->bi_size = len; 1205f7760dadSAlex Elder bio->bi_idx = 0; 1206602adf40SYehuda Sadeh 1207f7760dadSAlex Elder return bio; 1208602adf40SYehuda Sadeh } 1209602adf40SYehuda Sadeh 1210f7760dadSAlex Elder /* 1211f7760dadSAlex Elder * Clone a portion of a bio chain, starting at the given byte offset 1212f7760dadSAlex Elder * into the first bio in the source chain and continuing for the 1213f7760dadSAlex Elder * number of bytes indicated. The result is another bio chain of 1214f7760dadSAlex Elder * exactly the given length, or a null pointer on error. 1215f7760dadSAlex Elder * 1216f7760dadSAlex Elder * The bio_src and offset parameters are both in-out. On entry they 1217f7760dadSAlex Elder * refer to the first source bio and the offset into that bio where 1218f7760dadSAlex Elder * the start of data to be cloned is located. 1219f7760dadSAlex Elder * 1220f7760dadSAlex Elder * On return, bio_src is updated to refer to the bio in the source 1221f7760dadSAlex Elder * chain that contains first un-cloned byte, and *offset will 1222f7760dadSAlex Elder * contain the offset of that byte within that bio. 
 */
static struct bio *bio_chain_clone_range(struct bio **bio_src,
					unsigned int *offset,
					unsigned int len,
					gfp_t gfpmask)
{
	struct bio *bi = *bio_src;
	unsigned int off = *offset;
	struct bio *chain = NULL;
	struct bio **end;	/* where the next clone gets linked in */

	/* Build up a chain of clone bios up to the limit */

	if (!bi || off >= bi->bi_size || !len)
		return NULL;		/* Nothing to clone */

	end = &chain;
	while (len) {
		unsigned int bi_size;
		struct bio *bio;

		if (!bi) {
			rbd_warn(NULL, "bio_chain exhausted with %u left", len);
			goto out_err;	/* EINVAL; ran out of bio's */
		}
		/* Take what's left of this source bio, or less if that's enough */
		bi_size = min_t(unsigned int, bi->bi_size - off, len);
		bio = bio_clone_range(bi, off, bi_size, gfpmask);
		if (!bio)
			goto out_err;	/* ENOMEM */

		*end = bio;
		end = &bio->bi_next;

		/* Move on to the next source bio once this one is used up */
		off += bi_size;
		if (off == bi->bi_size) {
			bi = bi->bi_next;
			off = 0;
		}
		len -= bi_size;
	}
	/* The in-out parameters are only updated on success */
	*bio_src = bi;
	*offset = off;

	return chain;
out_err:
	/* Release whatever clones were built before the failure */
	bio_chain_put(chain);

	return NULL;
}

/*
 * The default/initial value for all object request flags is 0.  For
 * each flag, once its value is set to 1 it is never reset to 0
 * again.
 */

/* Set the IMG_DATA flag, warning if it had already been set */
static void obj_request_img_data_set(struct rbd_obj_request *obj_request)
{
	if (test_and_set_bit(OBJ_REQ_IMG_DATA, &obj_request->flags)) {
		struct rbd_device *rbd_dev;

		rbd_dev = obj_request->img_request->rbd_dev;
		rbd_warn(rbd_dev, "obj_request %p already marked img_data\n",
			obj_request);
	}
}

/* Test the IMG_DATA flag; a full memory barrier precedes the read */
static bool obj_request_img_data_test(struct rbd_obj_request *obj_request)
{
	smp_mb();
	return test_bit(OBJ_REQ_IMG_DATA, &obj_request->flags) != 0;
}

/* Set the DONE flag, warning if it had already been set */
static void obj_request_done_set(struct rbd_obj_request *obj_request)
{
	if (test_and_set_bit(OBJ_REQ_DONE, &obj_request->flags)) {
		struct rbd_device *rbd_dev = NULL;

		/* img_request is only looked at if IMG_DATA is set */
		if (obj_request_img_data_test(obj_request))
			rbd_dev = obj_request->img_request->rbd_dev;
		rbd_warn(rbd_dev, "obj_request %p already marked done\n",
			obj_request);
	}
}

/* Test the DONE flag; a full memory barrier precedes the read */
static bool obj_request_done_test(struct rbd_obj_request *obj_request)
{
	smp_mb();
	return test_bit(OBJ_REQ_DONE, &obj_request->flags) != 0;
}

/*
 * This sets the KNOWN flag after (possibly) setting the EXISTS
 * flag.  The latter is set based on the "exists" value provided.
 *
 * Note that for our purposes once an object exists it never goes
 * away again.  It's possible that the responses from two existence
 * checks are separated by the creation of the target object, and
 * the first ("doesn't exist") response arrives *after* the second
 * ("does exist").  In that case the late "doesn't exist" response
 * is ignored: the EXISTS flag is never cleared once it has been set.
 */
static void obj_request_existence_set(struct rbd_obj_request *obj_request,
				bool exists)
{
	if (exists)
		set_bit(OBJ_REQ_EXISTS, &obj_request->flags);
	set_bit(OBJ_REQ_KNOWN, &obj_request->flags);
	smp_mb();
}

/* Test the KNOWN flag; a full memory barrier precedes the read */
static bool obj_request_known_test(struct rbd_obj_request *obj_request)
{
	smp_mb();
	return test_bit(OBJ_REQ_KNOWN, &obj_request->flags) != 0;
}

/* Test the EXISTS flag; a full memory barrier precedes the read */
static bool obj_request_exists_test(struct rbd_obj_request *obj_request)
{
	smp_mb();
	return test_bit(OBJ_REQ_EXISTS, &obj_request->flags) != 0;
}

/* Take a reference to an object request (the prior count is logged) */
static void rbd_obj_request_get(struct rbd_obj_request *obj_request)
{
	dout("%s: obj %p (was %d)\n", __func__, obj_request,
		atomic_read(&obj_request->kref.refcount));
	kref_get(&obj_request->kref);
}

static void rbd_obj_request_destroy(struct kref *kref);

/*
 * Drop a reference to an object request.  rbd_obj_request_destroy()
 * is the release function handed to kref_put().
 */
static void rbd_obj_request_put(struct rbd_obj_request *obj_request)
{
	rbd_assert(obj_request != NULL);
	dout("%s: obj %p (was %d)\n", __func__, obj_request,
		atomic_read(&obj_request->kref.refcount));
	kref_put(&obj_request->kref, rbd_obj_request_destroy);
}

/* Take a reference to an image request (the prior count is logged) */
static void rbd_img_request_get(struct rbd_img_request *img_request)
{
	dout("%s: img %p (was %d)\n", __func__, img_request,
		atomic_read(&img_request->kref.refcount));
	kref_get(&img_request->kref);
}

static void rbd_img_request_destroy(struct kref *kref);

/*
 * Drop a reference to an image request.  rbd_img_request_destroy()
 * is the release function handed to kref_put().
 */
static void rbd_img_request_put(struct rbd_img_request *img_request)
{
	rbd_assert(img_request != NULL);
	dout("%s: img %p (was %d)\n", __func__, img_request,
		atomic_read(&img_request->kref.refcount));
	kref_put(&img_request->kref, rbd_img_request_destroy);
}

/*
 * Attach an object request to an image request.  The object request
 * must not already belong to one.  It is appended to the image
 * request's list, its "which" value is set to its position there
 * (the image request's object request count before the addition),
 * and it is marked as carrying image data.
 */
static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request,
					struct rbd_obj_request *obj_request)
{
	rbd_assert(obj_request->img_request == NULL);

	/* Image request now owns object's original reference */
	obj_request->img_request = img_request;
	obj_request->which = img_request->obj_request_count;
	rbd_assert(!obj_request_img_data_test(obj_request));
	obj_request_img_data_set(obj_request);
	rbd_assert(obj_request->which != BAD_WHICH);
	img_request->obj_request_count++;
	list_add_tail(&obj_request->links, &img_request->obj_requests);
	dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request,
		obj_request->which);
}

/*
 * Detach an object request from its image request and drop a
 * reference to it.  The assertion on "which" below means object
 * requests have to be removed in the reverse of the order in which
 * they were added (most recently added first).
 */
static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request,
					struct rbd_obj_request *obj_request)
{
	rbd_assert(obj_request->which != BAD_WHICH);

	dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request,
		obj_request->which);
	list_del(&obj_request->links);
	rbd_assert(img_request->obj_request_count > 0);
	img_request->obj_request_count--;
	/* After the decrement the count equals the last request's index */
	rbd_assert(obj_request->which == img_request->obj_request_count);
	obj_request->which = BAD_WHICH;
	rbd_assert(obj_request_img_data_test(obj_request));
	rbd_assert(obj_request->img_request == img_request);
	obj_request->img_request = NULL;
	obj_request->callback = NULL;
	rbd_obj_request_put(obj_request);
}

/* Returns true if the value is one of the known object request types */
static bool obj_request_type_valid(enum obj_request_type type)
{
	switch (type) {
	case OBJ_REQUEST_NODATA:
	case OBJ_REQUEST_BIO:
	case OBJ_REQUEST_PAGES:
		return true;
	default:
		return false;
	}
}

/*
 * Start the osd request associated with an object request on the
 * given osd client, returning ceph_osdc_start_request()'s result.
 */
static int rbd_obj_request_submit(struct ceph_osd_client *osdc,
				struct rbd_obj_request *obj_request)
{
	dout("%s: osdc %p obj %p\n", __func__, osdc, obj_request);

	return ceph_osdc_start_request(osdc, obj_request->osd_req, false);
}

static void rbd_img_request_complete(struct rbd_img_request *img_request)
{

	dout("%s: img %p\n", __func__, img_request);

	/*
	 * If no error occurred, compute the aggregate transfer
	 * count for the image request.  We could instead use
	 * atomic64_cmpxchg() to update it as each object request
	 * completes; not clear which way is better off hand.
144255f27e09SAlex Elder */ 144355f27e09SAlex Elder if (!img_request->result) { 144455f27e09SAlex Elder struct rbd_obj_request *obj_request; 144555f27e09SAlex Elder u64 xferred = 0; 144655f27e09SAlex Elder 144755f27e09SAlex Elder for_each_obj_request(img_request, obj_request) 144855f27e09SAlex Elder xferred += obj_request->xferred; 144955f27e09SAlex Elder img_request->xferred = xferred; 145055f27e09SAlex Elder } 145155f27e09SAlex Elder 1452bf0d5f50SAlex Elder if (img_request->callback) 1453bf0d5f50SAlex Elder img_request->callback(img_request); 1454bf0d5f50SAlex Elder else 1455bf0d5f50SAlex Elder rbd_img_request_put(img_request); 1456bf0d5f50SAlex Elder } 1457bf0d5f50SAlex Elder 1458788e2df3SAlex Elder /* Caller is responsible for rbd_obj_request_destroy(obj_request) */ 1459788e2df3SAlex Elder 1460788e2df3SAlex Elder static int rbd_obj_request_wait(struct rbd_obj_request *obj_request) 1461788e2df3SAlex Elder { 146237206ee5SAlex Elder dout("%s: obj %p\n", __func__, obj_request); 146337206ee5SAlex Elder 1464788e2df3SAlex Elder return wait_for_completion_interruptible(&obj_request->completion); 1465788e2df3SAlex Elder } 1466788e2df3SAlex Elder 14670c425248SAlex Elder /* 14680c425248SAlex Elder * The default/initial value for all image request flags is 0. Each 14690c425248SAlex Elder * is conditionally set to 1 at image request initialization time 14700c425248SAlex Elder * and currently never change thereafter. 
14710c425248SAlex Elder */ 14720c425248SAlex Elder static void img_request_write_set(struct rbd_img_request *img_request) 14730c425248SAlex Elder { 14740c425248SAlex Elder set_bit(IMG_REQ_WRITE, &img_request->flags); 14750c425248SAlex Elder smp_mb(); 14760c425248SAlex Elder } 14770c425248SAlex Elder 14780c425248SAlex Elder static bool img_request_write_test(struct rbd_img_request *img_request) 14790c425248SAlex Elder { 14800c425248SAlex Elder smp_mb(); 14810c425248SAlex Elder return test_bit(IMG_REQ_WRITE, &img_request->flags) != 0; 14820c425248SAlex Elder } 14830c425248SAlex Elder 14849849e986SAlex Elder static void img_request_child_set(struct rbd_img_request *img_request) 14859849e986SAlex Elder { 14869849e986SAlex Elder set_bit(IMG_REQ_CHILD, &img_request->flags); 14879849e986SAlex Elder smp_mb(); 14889849e986SAlex Elder } 14899849e986SAlex Elder 14909849e986SAlex Elder static bool img_request_child_test(struct rbd_img_request *img_request) 14919849e986SAlex Elder { 14929849e986SAlex Elder smp_mb(); 14939849e986SAlex Elder return test_bit(IMG_REQ_CHILD, &img_request->flags) != 0; 14949849e986SAlex Elder } 14959849e986SAlex Elder 1496d0b2e944SAlex Elder static void img_request_layered_set(struct rbd_img_request *img_request) 1497d0b2e944SAlex Elder { 1498d0b2e944SAlex Elder set_bit(IMG_REQ_LAYERED, &img_request->flags); 1499d0b2e944SAlex Elder smp_mb(); 1500d0b2e944SAlex Elder } 1501d0b2e944SAlex Elder 1502d0b2e944SAlex Elder static bool img_request_layered_test(struct rbd_img_request *img_request) 1503d0b2e944SAlex Elder { 1504d0b2e944SAlex Elder smp_mb(); 1505d0b2e944SAlex Elder return test_bit(IMG_REQ_LAYERED, &img_request->flags) != 0; 1506d0b2e944SAlex Elder } 1507d0b2e944SAlex Elder 15086e2a4505SAlex Elder static void 15096e2a4505SAlex Elder rbd_img_obj_request_read_callback(struct rbd_obj_request *obj_request) 15106e2a4505SAlex Elder { 1511b9434c5bSAlex Elder u64 xferred = obj_request->xferred; 1512b9434c5bSAlex Elder u64 length = obj_request->length; 
1513b9434c5bSAlex Elder 15146e2a4505SAlex Elder dout("%s: obj %p img %p result %d %llu/%llu\n", __func__, 15156e2a4505SAlex Elder obj_request, obj_request->img_request, obj_request->result, 1516b9434c5bSAlex Elder xferred, length); 15176e2a4505SAlex Elder /* 15186e2a4505SAlex Elder * ENOENT means a hole in the image. We zero-fill the 15196e2a4505SAlex Elder * entire length of the request. A short read also implies 15206e2a4505SAlex Elder * zero-fill to the end of the request. Either way we 15216e2a4505SAlex Elder * update the xferred count to indicate the whole request 15226e2a4505SAlex Elder * was satisfied. 15236e2a4505SAlex Elder */ 1524b9434c5bSAlex Elder rbd_assert(obj_request->type != OBJ_REQUEST_NODATA); 15256e2a4505SAlex Elder if (obj_request->result == -ENOENT) { 1526b9434c5bSAlex Elder if (obj_request->type == OBJ_REQUEST_BIO) 15276e2a4505SAlex Elder zero_bio_chain(obj_request->bio_list, 0); 1528b9434c5bSAlex Elder else 1529b9434c5bSAlex Elder zero_pages(obj_request->pages, 0, length); 15306e2a4505SAlex Elder obj_request->result = 0; 1531b9434c5bSAlex Elder obj_request->xferred = length; 1532b9434c5bSAlex Elder } else if (xferred < length && !obj_request->result) { 1533b9434c5bSAlex Elder if (obj_request->type == OBJ_REQUEST_BIO) 1534b9434c5bSAlex Elder zero_bio_chain(obj_request->bio_list, xferred); 1535b9434c5bSAlex Elder else 1536b9434c5bSAlex Elder zero_pages(obj_request->pages, xferred, length); 1537b9434c5bSAlex Elder obj_request->xferred = length; 15386e2a4505SAlex Elder } 15396e2a4505SAlex Elder obj_request_done_set(obj_request); 15406e2a4505SAlex Elder } 15416e2a4505SAlex Elder 1542bf0d5f50SAlex Elder static void rbd_obj_request_complete(struct rbd_obj_request *obj_request) 1543bf0d5f50SAlex Elder { 154437206ee5SAlex Elder dout("%s: obj %p cb %p\n", __func__, obj_request, 154537206ee5SAlex Elder obj_request->callback); 1546bf0d5f50SAlex Elder if (obj_request->callback) 1547bf0d5f50SAlex Elder obj_request->callback(obj_request); 1548788e2df3SAlex 
Elder else 1549788e2df3SAlex Elder complete_all(&obj_request->completion); 1550bf0d5f50SAlex Elder } 1551bf0d5f50SAlex Elder 1552c47f9371SAlex Elder static void rbd_osd_trivial_callback(struct rbd_obj_request *obj_request) 155339bf2c5dSAlex Elder { 155439bf2c5dSAlex Elder dout("%s: obj %p\n", __func__, obj_request); 155539bf2c5dSAlex Elder obj_request_done_set(obj_request); 155639bf2c5dSAlex Elder } 155739bf2c5dSAlex Elder 1558c47f9371SAlex Elder static void rbd_osd_read_callback(struct rbd_obj_request *obj_request) 1559bf0d5f50SAlex Elder { 156057acbaa7SAlex Elder struct rbd_img_request *img_request = NULL; 1561a9e8ba2cSAlex Elder struct rbd_device *rbd_dev = NULL; 156257acbaa7SAlex Elder bool layered = false; 156357acbaa7SAlex Elder 156457acbaa7SAlex Elder if (obj_request_img_data_test(obj_request)) { 156557acbaa7SAlex Elder img_request = obj_request->img_request; 156657acbaa7SAlex Elder layered = img_request && img_request_layered_test(img_request); 1567a9e8ba2cSAlex Elder rbd_dev = img_request->rbd_dev; 156857acbaa7SAlex Elder } 15698b3e1a56SAlex Elder 15708b3e1a56SAlex Elder dout("%s: obj %p img %p result %d %llu/%llu\n", __func__, 15718b3e1a56SAlex Elder obj_request, img_request, obj_request->result, 15728b3e1a56SAlex Elder obj_request->xferred, obj_request->length); 1573a9e8ba2cSAlex Elder if (layered && obj_request->result == -ENOENT && 1574a9e8ba2cSAlex Elder obj_request->img_offset < rbd_dev->parent_overlap) 15758b3e1a56SAlex Elder rbd_img_parent_read(obj_request); 15768b3e1a56SAlex Elder else if (img_request) 15776e2a4505SAlex Elder rbd_img_obj_request_read_callback(obj_request); 15786e2a4505SAlex Elder else 157907741308SAlex Elder obj_request_done_set(obj_request); 1580bf0d5f50SAlex Elder } 1581bf0d5f50SAlex Elder 1582c47f9371SAlex Elder static void rbd_osd_write_callback(struct rbd_obj_request *obj_request) 1583bf0d5f50SAlex Elder { 15841b83bef2SSage Weil dout("%s: obj %p result %d %llu\n", __func__, obj_request, 15851b83bef2SSage Weil 
obj_request->result, obj_request->length); 15861b83bef2SSage Weil /* 15878b3e1a56SAlex Elder * There is no such thing as a successful short write. Set 15888b3e1a56SAlex Elder * it to our originally-requested length. 15891b83bef2SSage Weil */ 15901b83bef2SSage Weil obj_request->xferred = obj_request->length; 159107741308SAlex Elder obj_request_done_set(obj_request); 1592bf0d5f50SAlex Elder } 1593bf0d5f50SAlex Elder 1594fbfab539SAlex Elder /* 1595fbfab539SAlex Elder * For a simple stat call there's nothing to do. We'll do more if 1596fbfab539SAlex Elder * this is part of a write sequence for a layered image. 1597fbfab539SAlex Elder */ 1598c47f9371SAlex Elder static void rbd_osd_stat_callback(struct rbd_obj_request *obj_request) 1599fbfab539SAlex Elder { 160037206ee5SAlex Elder dout("%s: obj %p\n", __func__, obj_request); 1601fbfab539SAlex Elder obj_request_done_set(obj_request); 1602fbfab539SAlex Elder } 1603fbfab539SAlex Elder 1604bf0d5f50SAlex Elder static void rbd_osd_req_callback(struct ceph_osd_request *osd_req, 1605bf0d5f50SAlex Elder struct ceph_msg *msg) 1606bf0d5f50SAlex Elder { 1607bf0d5f50SAlex Elder struct rbd_obj_request *obj_request = osd_req->r_priv; 1608bf0d5f50SAlex Elder u16 opcode; 1609bf0d5f50SAlex Elder 161037206ee5SAlex Elder dout("%s: osd_req %p msg %p\n", __func__, osd_req, msg); 1611bf0d5f50SAlex Elder rbd_assert(osd_req == obj_request->osd_req); 161257acbaa7SAlex Elder if (obj_request_img_data_test(obj_request)) { 161357acbaa7SAlex Elder rbd_assert(obj_request->img_request); 161457acbaa7SAlex Elder rbd_assert(obj_request->which != BAD_WHICH); 161557acbaa7SAlex Elder } else { 161657acbaa7SAlex Elder rbd_assert(obj_request->which == BAD_WHICH); 161757acbaa7SAlex Elder } 1618bf0d5f50SAlex Elder 16191b83bef2SSage Weil if (osd_req->r_result < 0) 16201b83bef2SSage Weil obj_request->result = osd_req->r_result; 1621bf0d5f50SAlex Elder 16220eefd470SAlex Elder BUG_ON(osd_req->r_num_ops > 2); 1623bf0d5f50SAlex Elder 1624c47f9371SAlex Elder /* 
1625c47f9371SAlex Elder * We support a 64-bit length, but ultimately it has to be 1626c47f9371SAlex Elder * passed to blk_end_request(), which takes an unsigned int. 1627c47f9371SAlex Elder */ 16281b83bef2SSage Weil obj_request->xferred = osd_req->r_reply_op_len[0]; 1629c47f9371SAlex Elder rbd_assert(obj_request->xferred < (u64)UINT_MAX); 163079528734SAlex Elder opcode = osd_req->r_ops[0].op; 1631bf0d5f50SAlex Elder switch (opcode) { 1632bf0d5f50SAlex Elder case CEPH_OSD_OP_READ: 1633c47f9371SAlex Elder rbd_osd_read_callback(obj_request); 1634bf0d5f50SAlex Elder break; 1635bf0d5f50SAlex Elder case CEPH_OSD_OP_WRITE: 1636c47f9371SAlex Elder rbd_osd_write_callback(obj_request); 1637bf0d5f50SAlex Elder break; 1638fbfab539SAlex Elder case CEPH_OSD_OP_STAT: 1639c47f9371SAlex Elder rbd_osd_stat_callback(obj_request); 1640fbfab539SAlex Elder break; 164136be9a76SAlex Elder case CEPH_OSD_OP_CALL: 1642b8d70035SAlex Elder case CEPH_OSD_OP_NOTIFY_ACK: 16439969ebc5SAlex Elder case CEPH_OSD_OP_WATCH: 1644c47f9371SAlex Elder rbd_osd_trivial_callback(obj_request); 16459969ebc5SAlex Elder break; 1646bf0d5f50SAlex Elder default: 1647bf0d5f50SAlex Elder rbd_warn(NULL, "%s: unsupported op %hu\n", 1648bf0d5f50SAlex Elder obj_request->object_name, (unsigned short) opcode); 1649bf0d5f50SAlex Elder break; 1650bf0d5f50SAlex Elder } 1651bf0d5f50SAlex Elder 165207741308SAlex Elder if (obj_request_done_test(obj_request)) 1653bf0d5f50SAlex Elder rbd_obj_request_complete(obj_request); 1654bf0d5f50SAlex Elder } 1655bf0d5f50SAlex Elder 16569d4df01fSAlex Elder static void rbd_osd_req_format_read(struct rbd_obj_request *obj_request) 1657430c28c3SAlex Elder { 1658430c28c3SAlex Elder struct rbd_img_request *img_request = obj_request->img_request; 16598c042b0dSAlex Elder struct ceph_osd_request *osd_req = obj_request->osd_req; 16609d4df01fSAlex Elder u64 snap_id; 1661430c28c3SAlex Elder 16628c042b0dSAlex Elder rbd_assert(osd_req != NULL); 1663430c28c3SAlex Elder 16649d4df01fSAlex Elder snap_id = 
img_request ? img_request->snap_id : CEPH_NOSNAP; 16658c042b0dSAlex Elder ceph_osdc_build_request(osd_req, obj_request->offset, 16669d4df01fSAlex Elder NULL, snap_id, NULL); 16679d4df01fSAlex Elder } 16689d4df01fSAlex Elder 16699d4df01fSAlex Elder static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request) 16709d4df01fSAlex Elder { 16719d4df01fSAlex Elder struct rbd_img_request *img_request = obj_request->img_request; 16729d4df01fSAlex Elder struct ceph_osd_request *osd_req = obj_request->osd_req; 16739d4df01fSAlex Elder struct ceph_snap_context *snapc; 16749d4df01fSAlex Elder struct timespec mtime = CURRENT_TIME; 16759d4df01fSAlex Elder 16769d4df01fSAlex Elder rbd_assert(osd_req != NULL); 16779d4df01fSAlex Elder 16789d4df01fSAlex Elder snapc = img_request ? img_request->snapc : NULL; 16799d4df01fSAlex Elder ceph_osdc_build_request(osd_req, obj_request->offset, 16809d4df01fSAlex Elder snapc, CEPH_NOSNAP, &mtime); 1681430c28c3SAlex Elder } 1682430c28c3SAlex Elder 1683bf0d5f50SAlex Elder static struct ceph_osd_request *rbd_osd_req_create( 1684bf0d5f50SAlex Elder struct rbd_device *rbd_dev, 1685bf0d5f50SAlex Elder bool write_request, 1686430c28c3SAlex Elder struct rbd_obj_request *obj_request) 1687bf0d5f50SAlex Elder { 1688bf0d5f50SAlex Elder struct ceph_snap_context *snapc = NULL; 1689bf0d5f50SAlex Elder struct ceph_osd_client *osdc; 1690bf0d5f50SAlex Elder struct ceph_osd_request *osd_req; 1691bf0d5f50SAlex Elder 16926365d33aSAlex Elder if (obj_request_img_data_test(obj_request)) { 16936365d33aSAlex Elder struct rbd_img_request *img_request = obj_request->img_request; 16946365d33aSAlex Elder 16950c425248SAlex Elder rbd_assert(write_request == 16960c425248SAlex Elder img_request_write_test(img_request)); 16970c425248SAlex Elder if (write_request) 1698bf0d5f50SAlex Elder snapc = img_request->snapc; 1699bf0d5f50SAlex Elder } 1700bf0d5f50SAlex Elder 1701bf0d5f50SAlex Elder /* Allocate and initialize the request, for the single op */ 1702bf0d5f50SAlex Elder 
1703bf0d5f50SAlex Elder osdc = &rbd_dev->rbd_client->client->osdc; 1704bf0d5f50SAlex Elder osd_req = ceph_osdc_alloc_request(osdc, snapc, 1, false, GFP_ATOMIC); 1705bf0d5f50SAlex Elder if (!osd_req) 1706bf0d5f50SAlex Elder return NULL; /* ENOMEM */ 1707bf0d5f50SAlex Elder 1708430c28c3SAlex Elder if (write_request) 1709bf0d5f50SAlex Elder osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK; 1710430c28c3SAlex Elder else 1711bf0d5f50SAlex Elder osd_req->r_flags = CEPH_OSD_FLAG_READ; 1712bf0d5f50SAlex Elder 1713bf0d5f50SAlex Elder osd_req->r_callback = rbd_osd_req_callback; 1714bf0d5f50SAlex Elder osd_req->r_priv = obj_request; 1715bf0d5f50SAlex Elder 1716bf0d5f50SAlex Elder osd_req->r_oid_len = strlen(obj_request->object_name); 1717bf0d5f50SAlex Elder rbd_assert(osd_req->r_oid_len < sizeof (osd_req->r_oid)); 1718bf0d5f50SAlex Elder memcpy(osd_req->r_oid, obj_request->object_name, osd_req->r_oid_len); 1719bf0d5f50SAlex Elder 1720bf0d5f50SAlex Elder osd_req->r_file_layout = rbd_dev->layout; /* struct */ 1721bf0d5f50SAlex Elder 1722bf0d5f50SAlex Elder return osd_req; 1723bf0d5f50SAlex Elder } 1724bf0d5f50SAlex Elder 17250eefd470SAlex Elder /* 17260eefd470SAlex Elder * Create a copyup osd request based on the information in the 17270eefd470SAlex Elder * object request supplied. A copyup request has two osd ops, 17280eefd470SAlex Elder * a copyup method call, and a "normal" write request. 
17290eefd470SAlex Elder */ 17300eefd470SAlex Elder static struct ceph_osd_request * 17310eefd470SAlex Elder rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request) 17320eefd470SAlex Elder { 17330eefd470SAlex Elder struct rbd_img_request *img_request; 17340eefd470SAlex Elder struct ceph_snap_context *snapc; 17350eefd470SAlex Elder struct rbd_device *rbd_dev; 17360eefd470SAlex Elder struct ceph_osd_client *osdc; 17370eefd470SAlex Elder struct ceph_osd_request *osd_req; 17380eefd470SAlex Elder 17390eefd470SAlex Elder rbd_assert(obj_request_img_data_test(obj_request)); 17400eefd470SAlex Elder img_request = obj_request->img_request; 17410eefd470SAlex Elder rbd_assert(img_request); 17420eefd470SAlex Elder rbd_assert(img_request_write_test(img_request)); 17430eefd470SAlex Elder 17440eefd470SAlex Elder /* Allocate and initialize the request, for the two ops */ 17450eefd470SAlex Elder 17460eefd470SAlex Elder snapc = img_request->snapc; 17470eefd470SAlex Elder rbd_dev = img_request->rbd_dev; 17480eefd470SAlex Elder osdc = &rbd_dev->rbd_client->client->osdc; 17490eefd470SAlex Elder osd_req = ceph_osdc_alloc_request(osdc, snapc, 2, false, GFP_ATOMIC); 17500eefd470SAlex Elder if (!osd_req) 17510eefd470SAlex Elder return NULL; /* ENOMEM */ 17520eefd470SAlex Elder 17530eefd470SAlex Elder osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK; 17540eefd470SAlex Elder osd_req->r_callback = rbd_osd_req_callback; 17550eefd470SAlex Elder osd_req->r_priv = obj_request; 17560eefd470SAlex Elder 17570eefd470SAlex Elder osd_req->r_oid_len = strlen(obj_request->object_name); 17580eefd470SAlex Elder rbd_assert(osd_req->r_oid_len < sizeof (osd_req->r_oid)); 17590eefd470SAlex Elder memcpy(osd_req->r_oid, obj_request->object_name, osd_req->r_oid_len); 17600eefd470SAlex Elder 17610eefd470SAlex Elder osd_req->r_file_layout = rbd_dev->layout; /* struct */ 17620eefd470SAlex Elder 17630eefd470SAlex Elder return osd_req; 17640eefd470SAlex Elder } 17650eefd470SAlex Elder 
17660eefd470SAlex Elder 1767bf0d5f50SAlex Elder static void rbd_osd_req_destroy(struct ceph_osd_request *osd_req) 1768bf0d5f50SAlex Elder { 1769bf0d5f50SAlex Elder ceph_osdc_put_request(osd_req); 1770bf0d5f50SAlex Elder } 1771bf0d5f50SAlex Elder 1772bf0d5f50SAlex Elder /* object_name is assumed to be a non-null pointer and NUL-terminated */ 1773bf0d5f50SAlex Elder 1774bf0d5f50SAlex Elder static struct rbd_obj_request *rbd_obj_request_create(const char *object_name, 1775bf0d5f50SAlex Elder u64 offset, u64 length, 1776bf0d5f50SAlex Elder enum obj_request_type type) 1777bf0d5f50SAlex Elder { 1778bf0d5f50SAlex Elder struct rbd_obj_request *obj_request; 1779bf0d5f50SAlex Elder size_t size; 1780bf0d5f50SAlex Elder char *name; 1781bf0d5f50SAlex Elder 1782bf0d5f50SAlex Elder rbd_assert(obj_request_type_valid(type)); 1783bf0d5f50SAlex Elder 1784bf0d5f50SAlex Elder size = strlen(object_name) + 1; 1785f907ad55SAlex Elder name = kmalloc(size, GFP_KERNEL); 1786f907ad55SAlex Elder if (!name) 1787bf0d5f50SAlex Elder return NULL; 1788bf0d5f50SAlex Elder 1789868311b1SAlex Elder obj_request = kmem_cache_zalloc(rbd_obj_request_cache, GFP_KERNEL); 1790f907ad55SAlex Elder if (!obj_request) { 1791f907ad55SAlex Elder kfree(name); 1792f907ad55SAlex Elder return NULL; 1793f907ad55SAlex Elder } 1794f907ad55SAlex Elder 1795bf0d5f50SAlex Elder obj_request->object_name = memcpy(name, object_name, size); 1796bf0d5f50SAlex Elder obj_request->offset = offset; 1797bf0d5f50SAlex Elder obj_request->length = length; 1798926f9b3fSAlex Elder obj_request->flags = 0; 1799bf0d5f50SAlex Elder obj_request->which = BAD_WHICH; 1800bf0d5f50SAlex Elder obj_request->type = type; 1801bf0d5f50SAlex Elder INIT_LIST_HEAD(&obj_request->links); 1802788e2df3SAlex Elder init_completion(&obj_request->completion); 1803bf0d5f50SAlex Elder kref_init(&obj_request->kref); 1804bf0d5f50SAlex Elder 180537206ee5SAlex Elder dout("%s: \"%s\" %llu/%llu %d -> obj %p\n", __func__, object_name, 180637206ee5SAlex Elder offset, length, 
(int)type, obj_request); 180737206ee5SAlex Elder 1808bf0d5f50SAlex Elder return obj_request; 1809bf0d5f50SAlex Elder } 1810bf0d5f50SAlex Elder 1811bf0d5f50SAlex Elder static void rbd_obj_request_destroy(struct kref *kref) 1812bf0d5f50SAlex Elder { 1813bf0d5f50SAlex Elder struct rbd_obj_request *obj_request; 1814bf0d5f50SAlex Elder 1815bf0d5f50SAlex Elder obj_request = container_of(kref, struct rbd_obj_request, kref); 1816bf0d5f50SAlex Elder 181737206ee5SAlex Elder dout("%s: obj %p\n", __func__, obj_request); 181837206ee5SAlex Elder 1819bf0d5f50SAlex Elder rbd_assert(obj_request->img_request == NULL); 1820bf0d5f50SAlex Elder rbd_assert(obj_request->which == BAD_WHICH); 1821bf0d5f50SAlex Elder 1822bf0d5f50SAlex Elder if (obj_request->osd_req) 1823bf0d5f50SAlex Elder rbd_osd_req_destroy(obj_request->osd_req); 1824bf0d5f50SAlex Elder 1825bf0d5f50SAlex Elder rbd_assert(obj_request_type_valid(obj_request->type)); 1826bf0d5f50SAlex Elder switch (obj_request->type) { 18279969ebc5SAlex Elder case OBJ_REQUEST_NODATA: 18289969ebc5SAlex Elder break; /* Nothing to do */ 1829bf0d5f50SAlex Elder case OBJ_REQUEST_BIO: 1830bf0d5f50SAlex Elder if (obj_request->bio_list) 1831bf0d5f50SAlex Elder bio_chain_put(obj_request->bio_list); 1832bf0d5f50SAlex Elder break; 1833788e2df3SAlex Elder case OBJ_REQUEST_PAGES: 1834788e2df3SAlex Elder if (obj_request->pages) 1835788e2df3SAlex Elder ceph_release_page_vector(obj_request->pages, 1836788e2df3SAlex Elder obj_request->page_count); 1837788e2df3SAlex Elder break; 1838bf0d5f50SAlex Elder } 1839bf0d5f50SAlex Elder 1840f907ad55SAlex Elder kfree(obj_request->object_name); 1841868311b1SAlex Elder obj_request->object_name = NULL; 1842868311b1SAlex Elder kmem_cache_free(rbd_obj_request_cache, obj_request); 1843bf0d5f50SAlex Elder } 1844bf0d5f50SAlex Elder 1845bf0d5f50SAlex Elder /* 1846bf0d5f50SAlex Elder * Caller is responsible for filling in the list of object requests 1847bf0d5f50SAlex Elder * that comprises the image request, and the Linux 
request pointer 1848bf0d5f50SAlex Elder * (if there is one). 1849bf0d5f50SAlex Elder */ 1850cc344fa1SAlex Elder static struct rbd_img_request *rbd_img_request_create( 1851cc344fa1SAlex Elder struct rbd_device *rbd_dev, 1852bf0d5f50SAlex Elder u64 offset, u64 length, 18539849e986SAlex Elder bool write_request, 18549849e986SAlex Elder bool child_request) 1855bf0d5f50SAlex Elder { 1856bf0d5f50SAlex Elder struct rbd_img_request *img_request; 1857bf0d5f50SAlex Elder 18581c2a9dfeSAlex Elder img_request = kmem_cache_alloc(rbd_img_request_cache, GFP_ATOMIC); 1859bf0d5f50SAlex Elder if (!img_request) 1860bf0d5f50SAlex Elder return NULL; 1861bf0d5f50SAlex Elder 1862bf0d5f50SAlex Elder if (write_request) { 1863bf0d5f50SAlex Elder down_read(&rbd_dev->header_rwsem); 1864812164f8SAlex Elder ceph_get_snap_context(rbd_dev->header.snapc); 1865bf0d5f50SAlex Elder up_read(&rbd_dev->header_rwsem); 1866bf0d5f50SAlex Elder } 1867bf0d5f50SAlex Elder 1868bf0d5f50SAlex Elder img_request->rq = NULL; 1869bf0d5f50SAlex Elder img_request->rbd_dev = rbd_dev; 1870bf0d5f50SAlex Elder img_request->offset = offset; 1871bf0d5f50SAlex Elder img_request->length = length; 18720c425248SAlex Elder img_request->flags = 0; 18730c425248SAlex Elder if (write_request) { 18740c425248SAlex Elder img_request_write_set(img_request); 1875468521c1SAlex Elder img_request->snapc = rbd_dev->header.snapc; 18760c425248SAlex Elder } else { 1877bf0d5f50SAlex Elder img_request->snap_id = rbd_dev->spec->snap_id; 18780c425248SAlex Elder } 18799849e986SAlex Elder if (child_request) 18809849e986SAlex Elder img_request_child_set(img_request); 1881d0b2e944SAlex Elder if (rbd_dev->parent_spec) 1882d0b2e944SAlex Elder img_request_layered_set(img_request); 1883bf0d5f50SAlex Elder spin_lock_init(&img_request->completion_lock); 1884bf0d5f50SAlex Elder img_request->next_completion = 0; 1885bf0d5f50SAlex Elder img_request->callback = NULL; 1886a5a337d4SAlex Elder img_request->result = 0; 1887bf0d5f50SAlex Elder 
img_request->obj_request_count = 0; 1888bf0d5f50SAlex Elder INIT_LIST_HEAD(&img_request->obj_requests); 1889bf0d5f50SAlex Elder kref_init(&img_request->kref); 1890bf0d5f50SAlex Elder 1891bf0d5f50SAlex Elder rbd_img_request_get(img_request); /* Avoid a warning */ 1892bf0d5f50SAlex Elder rbd_img_request_put(img_request); /* TEMPORARY */ 1893bf0d5f50SAlex Elder 189437206ee5SAlex Elder dout("%s: rbd_dev %p %s %llu/%llu -> img %p\n", __func__, rbd_dev, 189537206ee5SAlex Elder write_request ? "write" : "read", offset, length, 189637206ee5SAlex Elder img_request); 189737206ee5SAlex Elder 1898bf0d5f50SAlex Elder return img_request; 1899bf0d5f50SAlex Elder } 1900bf0d5f50SAlex Elder 1901bf0d5f50SAlex Elder static void rbd_img_request_destroy(struct kref *kref) 1902bf0d5f50SAlex Elder { 1903bf0d5f50SAlex Elder struct rbd_img_request *img_request; 1904bf0d5f50SAlex Elder struct rbd_obj_request *obj_request; 1905bf0d5f50SAlex Elder struct rbd_obj_request *next_obj_request; 1906bf0d5f50SAlex Elder 1907bf0d5f50SAlex Elder img_request = container_of(kref, struct rbd_img_request, kref); 1908bf0d5f50SAlex Elder 190937206ee5SAlex Elder dout("%s: img %p\n", __func__, img_request); 191037206ee5SAlex Elder 1911bf0d5f50SAlex Elder for_each_obj_request_safe(img_request, obj_request, next_obj_request) 1912bf0d5f50SAlex Elder rbd_img_obj_request_del(img_request, obj_request); 191325dcf954SAlex Elder rbd_assert(img_request->obj_request_count == 0); 1914bf0d5f50SAlex Elder 19150c425248SAlex Elder if (img_request_write_test(img_request)) 1916812164f8SAlex Elder ceph_put_snap_context(img_request->snapc); 1917bf0d5f50SAlex Elder 19188b3e1a56SAlex Elder if (img_request_child_test(img_request)) 19198b3e1a56SAlex Elder rbd_obj_request_put(img_request->obj_request); 19208b3e1a56SAlex Elder 19211c2a9dfeSAlex Elder kmem_cache_free(rbd_img_request_cache, img_request); 1922bf0d5f50SAlex Elder } 1923bf0d5f50SAlex Elder 19241217857fSAlex Elder static bool rbd_img_obj_end_request(struct rbd_obj_request 
*obj_request) 19251217857fSAlex Elder { 19266365d33aSAlex Elder struct rbd_img_request *img_request; 19271217857fSAlex Elder unsigned int xferred; 19281217857fSAlex Elder int result; 19298b3e1a56SAlex Elder bool more; 19301217857fSAlex Elder 19316365d33aSAlex Elder rbd_assert(obj_request_img_data_test(obj_request)); 19326365d33aSAlex Elder img_request = obj_request->img_request; 19336365d33aSAlex Elder 19341217857fSAlex Elder rbd_assert(obj_request->xferred <= (u64)UINT_MAX); 19351217857fSAlex Elder xferred = (unsigned int)obj_request->xferred; 19361217857fSAlex Elder result = obj_request->result; 19371217857fSAlex Elder if (result) { 19381217857fSAlex Elder struct rbd_device *rbd_dev = img_request->rbd_dev; 19391217857fSAlex Elder 19401217857fSAlex Elder rbd_warn(rbd_dev, "%s %llx at %llx (%llx)\n", 19411217857fSAlex Elder img_request_write_test(img_request) ? "write" : "read", 19421217857fSAlex Elder obj_request->length, obj_request->img_offset, 19431217857fSAlex Elder obj_request->offset); 19441217857fSAlex Elder rbd_warn(rbd_dev, " result %d xferred %x\n", 19451217857fSAlex Elder result, xferred); 19461217857fSAlex Elder if (!img_request->result) 19471217857fSAlex Elder img_request->result = result; 19481217857fSAlex Elder } 19491217857fSAlex Elder 1950f1a4739fSAlex Elder /* Image object requests don't own their page array */ 1951f1a4739fSAlex Elder 1952f1a4739fSAlex Elder if (obj_request->type == OBJ_REQUEST_PAGES) { 1953f1a4739fSAlex Elder obj_request->pages = NULL; 1954f1a4739fSAlex Elder obj_request->page_count = 0; 1955f1a4739fSAlex Elder } 1956f1a4739fSAlex Elder 19578b3e1a56SAlex Elder if (img_request_child_test(img_request)) { 19588b3e1a56SAlex Elder rbd_assert(img_request->obj_request != NULL); 19598b3e1a56SAlex Elder more = obj_request->which < img_request->obj_request_count - 1; 19608b3e1a56SAlex Elder } else { 19618b3e1a56SAlex Elder rbd_assert(img_request->rq != NULL); 19628b3e1a56SAlex Elder more = blk_end_request(img_request->rq, result, 
xferred); 19638b3e1a56SAlex Elder } 19648b3e1a56SAlex Elder 19658b3e1a56SAlex Elder return more; 19661217857fSAlex Elder } 19671217857fSAlex Elder 19682169238dSAlex Elder static void rbd_img_obj_callback(struct rbd_obj_request *obj_request) 19692169238dSAlex Elder { 19702169238dSAlex Elder struct rbd_img_request *img_request; 19712169238dSAlex Elder u32 which = obj_request->which; 19722169238dSAlex Elder bool more = true; 19732169238dSAlex Elder 19746365d33aSAlex Elder rbd_assert(obj_request_img_data_test(obj_request)); 19752169238dSAlex Elder img_request = obj_request->img_request; 19762169238dSAlex Elder 19772169238dSAlex Elder dout("%s: img %p obj %p\n", __func__, img_request, obj_request); 19782169238dSAlex Elder rbd_assert(img_request != NULL); 19792169238dSAlex Elder rbd_assert(img_request->obj_request_count > 0); 19802169238dSAlex Elder rbd_assert(which != BAD_WHICH); 19812169238dSAlex Elder rbd_assert(which < img_request->obj_request_count); 19822169238dSAlex Elder rbd_assert(which >= img_request->next_completion); 19832169238dSAlex Elder 19842169238dSAlex Elder spin_lock_irq(&img_request->completion_lock); 19852169238dSAlex Elder if (which != img_request->next_completion) 19862169238dSAlex Elder goto out; 19872169238dSAlex Elder 19882169238dSAlex Elder for_each_obj_request_from(img_request, obj_request) { 19892169238dSAlex Elder rbd_assert(more); 19902169238dSAlex Elder rbd_assert(which < img_request->obj_request_count); 19912169238dSAlex Elder 19922169238dSAlex Elder if (!obj_request_done_test(obj_request)) 19932169238dSAlex Elder break; 19941217857fSAlex Elder more = rbd_img_obj_end_request(obj_request); 19952169238dSAlex Elder which++; 19962169238dSAlex Elder } 19972169238dSAlex Elder 19982169238dSAlex Elder rbd_assert(more ^ (which == img_request->obj_request_count)); 19992169238dSAlex Elder img_request->next_completion = which; 20002169238dSAlex Elder out: 20012169238dSAlex Elder spin_unlock_irq(&img_request->completion_lock); 20022169238dSAlex Elder 
20032169238dSAlex Elder if (!more) 20042169238dSAlex Elder rbd_img_request_complete(img_request); 20052169238dSAlex Elder } 20062169238dSAlex Elder 2007f1a4739fSAlex Elder /* 2008f1a4739fSAlex Elder * Split up an image request into one or more object requests, each 2009f1a4739fSAlex Elder * to a different object. The "type" parameter indicates whether 2010f1a4739fSAlex Elder * "data_desc" is the pointer to the head of a list of bio 2011f1a4739fSAlex Elder * structures, or the base of a page array. In either case this 2012f1a4739fSAlex Elder * function assumes data_desc describes memory sufficient to hold 2013f1a4739fSAlex Elder * all data described by the image request. 2014f1a4739fSAlex Elder */ 2015f1a4739fSAlex Elder static int rbd_img_request_fill(struct rbd_img_request *img_request, 2016f1a4739fSAlex Elder enum obj_request_type type, 2017f1a4739fSAlex Elder void *data_desc) 2018bf0d5f50SAlex Elder { 2019bf0d5f50SAlex Elder struct rbd_device *rbd_dev = img_request->rbd_dev; 2020bf0d5f50SAlex Elder struct rbd_obj_request *obj_request = NULL; 2021bf0d5f50SAlex Elder struct rbd_obj_request *next_obj_request; 20220c425248SAlex Elder bool write_request = img_request_write_test(img_request); 2023f1a4739fSAlex Elder struct bio *bio_list; 2024f1a4739fSAlex Elder unsigned int bio_offset = 0; 2025f1a4739fSAlex Elder struct page **pages; 20267da22d29SAlex Elder u64 img_offset; 2027bf0d5f50SAlex Elder u64 resid; 2028bf0d5f50SAlex Elder u16 opcode; 2029bf0d5f50SAlex Elder 2030f1a4739fSAlex Elder dout("%s: img %p type %d data_desc %p\n", __func__, img_request, 2031f1a4739fSAlex Elder (int)type, data_desc); 203237206ee5SAlex Elder 2033430c28c3SAlex Elder opcode = write_request ? 
CEPH_OSD_OP_WRITE : CEPH_OSD_OP_READ; 20347da22d29SAlex Elder img_offset = img_request->offset; 2035bf0d5f50SAlex Elder resid = img_request->length; 20364dda41d3SAlex Elder rbd_assert(resid > 0); 2037f1a4739fSAlex Elder 2038f1a4739fSAlex Elder if (type == OBJ_REQUEST_BIO) { 2039f1a4739fSAlex Elder bio_list = data_desc; 2040f1a4739fSAlex Elder rbd_assert(img_offset == bio_list->bi_sector << SECTOR_SHIFT); 2041f1a4739fSAlex Elder } else { 2042f1a4739fSAlex Elder rbd_assert(type == OBJ_REQUEST_PAGES); 2043f1a4739fSAlex Elder pages = data_desc; 2044f1a4739fSAlex Elder } 2045f1a4739fSAlex Elder 2046bf0d5f50SAlex Elder while (resid) { 20472fa12320SAlex Elder struct ceph_osd_request *osd_req; 2048bf0d5f50SAlex Elder const char *object_name; 2049bf0d5f50SAlex Elder u64 offset; 2050bf0d5f50SAlex Elder u64 length; 2051bf0d5f50SAlex Elder 20527da22d29SAlex Elder object_name = rbd_segment_name(rbd_dev, img_offset); 2053bf0d5f50SAlex Elder if (!object_name) 2054bf0d5f50SAlex Elder goto out_unwind; 20557da22d29SAlex Elder offset = rbd_segment_offset(rbd_dev, img_offset); 20567da22d29SAlex Elder length = rbd_segment_length(rbd_dev, img_offset, resid); 2057bf0d5f50SAlex Elder obj_request = rbd_obj_request_create(object_name, 2058f1a4739fSAlex Elder offset, length, type); 205978c2a44aSAlex Elder /* object request has its own copy of the object name */ 206078c2a44aSAlex Elder rbd_segment_name_free(object_name); 2061bf0d5f50SAlex Elder if (!obj_request) 2062bf0d5f50SAlex Elder goto out_unwind; 2063bf0d5f50SAlex Elder 2064f1a4739fSAlex Elder if (type == OBJ_REQUEST_BIO) { 2065f1a4739fSAlex Elder unsigned int clone_size; 2066f1a4739fSAlex Elder 2067bf0d5f50SAlex Elder rbd_assert(length <= (u64)UINT_MAX); 2068bf0d5f50SAlex Elder clone_size = (unsigned int)length; 2069f1a4739fSAlex Elder obj_request->bio_list = 2070f1a4739fSAlex Elder bio_chain_clone_range(&bio_list, 2071f1a4739fSAlex Elder &bio_offset, 2072f1a4739fSAlex Elder clone_size, 2073bf0d5f50SAlex Elder GFP_ATOMIC); 
2074bf0d5f50SAlex Elder if (!obj_request->bio_list) 2075bf0d5f50SAlex Elder goto out_partial; 2076f1a4739fSAlex Elder } else { 2077f1a4739fSAlex Elder unsigned int page_count; 2078f1a4739fSAlex Elder 2079f1a4739fSAlex Elder obj_request->pages = pages; 2080f1a4739fSAlex Elder page_count = (u32)calc_pages_for(offset, length); 2081f1a4739fSAlex Elder obj_request->page_count = page_count; 2082f1a4739fSAlex Elder if ((offset + length) & ~PAGE_MASK) 2083f1a4739fSAlex Elder page_count--; /* more on last page */ 2084f1a4739fSAlex Elder pages += page_count; 2085f1a4739fSAlex Elder } 2086bf0d5f50SAlex Elder 20872fa12320SAlex Elder osd_req = rbd_osd_req_create(rbd_dev, write_request, 20882fa12320SAlex Elder obj_request); 20892fa12320SAlex Elder if (!osd_req) 2090bf0d5f50SAlex Elder goto out_partial; 20912fa12320SAlex Elder obj_request->osd_req = osd_req; 20922169238dSAlex Elder obj_request->callback = rbd_img_obj_callback; 2093430c28c3SAlex Elder 20942fa12320SAlex Elder osd_req_op_extent_init(osd_req, 0, opcode, offset, length, 20952fa12320SAlex Elder 0, 0); 2096f1a4739fSAlex Elder if (type == OBJ_REQUEST_BIO) 2097406e2c9fSAlex Elder osd_req_op_extent_osd_data_bio(osd_req, 0, 2098f1a4739fSAlex Elder obj_request->bio_list, length); 2099f1a4739fSAlex Elder else 2100f1a4739fSAlex Elder osd_req_op_extent_osd_data_pages(osd_req, 0, 2101f1a4739fSAlex Elder obj_request->pages, length, 2102f1a4739fSAlex Elder offset & ~PAGE_MASK, false, false); 21039d4df01fSAlex Elder 21049d4df01fSAlex Elder if (write_request) 21059d4df01fSAlex Elder rbd_osd_req_format_write(obj_request); 21069d4df01fSAlex Elder else 21079d4df01fSAlex Elder rbd_osd_req_format_read(obj_request); 2108430c28c3SAlex Elder 21097da22d29SAlex Elder obj_request->img_offset = img_offset; 2110bf0d5f50SAlex Elder rbd_img_obj_request_add(img_request, obj_request); 2111bf0d5f50SAlex Elder 21127da22d29SAlex Elder img_offset += length; 2113bf0d5f50SAlex Elder resid -= length; 2114bf0d5f50SAlex Elder } 2115bf0d5f50SAlex Elder 
2116bf0d5f50SAlex Elder return 0; 2117bf0d5f50SAlex Elder 2118bf0d5f50SAlex Elder out_partial: 2119bf0d5f50SAlex Elder rbd_obj_request_put(obj_request); 2120bf0d5f50SAlex Elder out_unwind: 2121bf0d5f50SAlex Elder for_each_obj_request_safe(img_request, obj_request, next_obj_request) 2122bf0d5f50SAlex Elder rbd_obj_request_put(obj_request); 2123bf0d5f50SAlex Elder 2124bf0d5f50SAlex Elder return -ENOMEM; 2125bf0d5f50SAlex Elder } 2126bf0d5f50SAlex Elder 21273d7efd18SAlex Elder static void 21280eefd470SAlex Elder rbd_img_obj_copyup_callback(struct rbd_obj_request *obj_request) 21290eefd470SAlex Elder { 21300eefd470SAlex Elder struct rbd_img_request *img_request; 21310eefd470SAlex Elder struct rbd_device *rbd_dev; 21320eefd470SAlex Elder u64 length; 21330eefd470SAlex Elder u32 page_count; 21340eefd470SAlex Elder 21350eefd470SAlex Elder rbd_assert(obj_request->type == OBJ_REQUEST_BIO); 21360eefd470SAlex Elder rbd_assert(obj_request_img_data_test(obj_request)); 21370eefd470SAlex Elder img_request = obj_request->img_request; 21380eefd470SAlex Elder rbd_assert(img_request); 21390eefd470SAlex Elder 21400eefd470SAlex Elder rbd_dev = img_request->rbd_dev; 21410eefd470SAlex Elder rbd_assert(rbd_dev); 21420eefd470SAlex Elder length = (u64)1 << rbd_dev->header.obj_order; 21430eefd470SAlex Elder page_count = (u32)calc_pages_for(0, length); 21440eefd470SAlex Elder 21450eefd470SAlex Elder rbd_assert(obj_request->copyup_pages); 21460eefd470SAlex Elder ceph_release_page_vector(obj_request->copyup_pages, page_count); 21470eefd470SAlex Elder obj_request->copyup_pages = NULL; 21480eefd470SAlex Elder 21490eefd470SAlex Elder /* 21500eefd470SAlex Elder * We want the transfer count to reflect the size of the 21510eefd470SAlex Elder * original write request. There is no such thing as a 21520eefd470SAlex Elder * successful short write, so if the request was successful 21530eefd470SAlex Elder * we can just set it to the originally-requested length. 
21540eefd470SAlex Elder */ 21550eefd470SAlex Elder if (!obj_request->result) 21560eefd470SAlex Elder obj_request->xferred = obj_request->length; 21570eefd470SAlex Elder 21580eefd470SAlex Elder /* Finish up with the normal image object callback */ 21590eefd470SAlex Elder 21600eefd470SAlex Elder rbd_img_obj_callback(obj_request); 21610eefd470SAlex Elder } 21620eefd470SAlex Elder 21630eefd470SAlex Elder static void 21643d7efd18SAlex Elder rbd_img_obj_parent_read_full_callback(struct rbd_img_request *img_request) 21653d7efd18SAlex Elder { 21663d7efd18SAlex Elder struct rbd_obj_request *orig_request; 21670eefd470SAlex Elder struct ceph_osd_request *osd_req; 21680eefd470SAlex Elder struct ceph_osd_client *osdc; 21690eefd470SAlex Elder struct rbd_device *rbd_dev; 21703d7efd18SAlex Elder struct page **pages; 21713d7efd18SAlex Elder int result; 21723d7efd18SAlex Elder u64 obj_size; 21733d7efd18SAlex Elder u64 xferred; 21743d7efd18SAlex Elder 21753d7efd18SAlex Elder rbd_assert(img_request_child_test(img_request)); 21763d7efd18SAlex Elder 21773d7efd18SAlex Elder /* First get what we need from the image request */ 21783d7efd18SAlex Elder 21793d7efd18SAlex Elder pages = img_request->copyup_pages; 21803d7efd18SAlex Elder rbd_assert(pages != NULL); 21813d7efd18SAlex Elder img_request->copyup_pages = NULL; 21823d7efd18SAlex Elder 21833d7efd18SAlex Elder orig_request = img_request->obj_request; 21843d7efd18SAlex Elder rbd_assert(orig_request != NULL); 21850eefd470SAlex Elder rbd_assert(orig_request->type == OBJ_REQUEST_BIO); 21863d7efd18SAlex Elder result = img_request->result; 21873d7efd18SAlex Elder obj_size = img_request->length; 21883d7efd18SAlex Elder xferred = img_request->xferred; 21893d7efd18SAlex Elder 21900eefd470SAlex Elder rbd_dev = img_request->rbd_dev; 21910eefd470SAlex Elder rbd_assert(rbd_dev); 21920eefd470SAlex Elder rbd_assert(obj_size == (u64)1 << rbd_dev->header.obj_order); 21930eefd470SAlex Elder 21943d7efd18SAlex Elder rbd_img_request_put(img_request); 
21953d7efd18SAlex Elder 21960eefd470SAlex Elder if (result) 21970eefd470SAlex Elder goto out_err; 21983d7efd18SAlex Elder 21990eefd470SAlex Elder /* Allocate the new copyup osd request for the original request */ 22003d7efd18SAlex Elder 22010eefd470SAlex Elder result = -ENOMEM; 22020eefd470SAlex Elder rbd_assert(!orig_request->osd_req); 22030eefd470SAlex Elder osd_req = rbd_osd_req_create_copyup(orig_request); 22040eefd470SAlex Elder if (!osd_req) 22050eefd470SAlex Elder goto out_err; 22060eefd470SAlex Elder orig_request->osd_req = osd_req; 22070eefd470SAlex Elder orig_request->copyup_pages = pages; 22083d7efd18SAlex Elder 22090eefd470SAlex Elder /* Initialize the copyup op */ 22100eefd470SAlex Elder 22110eefd470SAlex Elder osd_req_op_cls_init(osd_req, 0, CEPH_OSD_OP_CALL, "rbd", "copyup"); 22120eefd470SAlex Elder osd_req_op_cls_request_data_pages(osd_req, 0, pages, obj_size, 0, 22130eefd470SAlex Elder false, false); 22140eefd470SAlex Elder 22150eefd470SAlex Elder /* Then the original write request op */ 22160eefd470SAlex Elder 22170eefd470SAlex Elder osd_req_op_extent_init(osd_req, 1, CEPH_OSD_OP_WRITE, 22180eefd470SAlex Elder orig_request->offset, 22190eefd470SAlex Elder orig_request->length, 0, 0); 22200eefd470SAlex Elder osd_req_op_extent_osd_data_bio(osd_req, 1, orig_request->bio_list, 22210eefd470SAlex Elder orig_request->length); 22220eefd470SAlex Elder 22230eefd470SAlex Elder rbd_osd_req_format_write(orig_request); 22240eefd470SAlex Elder 22250eefd470SAlex Elder /* All set, send it off. 
*/ 22260eefd470SAlex Elder 22270eefd470SAlex Elder orig_request->callback = rbd_img_obj_copyup_callback; 22280eefd470SAlex Elder osdc = &rbd_dev->rbd_client->client->osdc; 22290eefd470SAlex Elder result = rbd_obj_request_submit(osdc, orig_request); 22300eefd470SAlex Elder if (!result) 22310eefd470SAlex Elder return; 22320eefd470SAlex Elder out_err: 22330eefd470SAlex Elder /* Record the error code and complete the request */ 22340eefd470SAlex Elder 22350eefd470SAlex Elder orig_request->result = result; 22360eefd470SAlex Elder orig_request->xferred = 0; 22373d7efd18SAlex Elder obj_request_done_set(orig_request); 22383d7efd18SAlex Elder rbd_obj_request_complete(orig_request); 22393d7efd18SAlex Elder } 22403d7efd18SAlex Elder 22413d7efd18SAlex Elder /* 22423d7efd18SAlex Elder * Read from the parent image the range of data that covers the 22433d7efd18SAlex Elder * entire target of the given object request. This is used for 22443d7efd18SAlex Elder * satisfying a layered image write request when the target of an 22453d7efd18SAlex Elder * object request from the image request does not exist. 22463d7efd18SAlex Elder * 22473d7efd18SAlex Elder * A page array big enough to hold the returned data is allocated 22483d7efd18SAlex Elder * and supplied to rbd_img_request_fill() as the "data descriptor." 22493d7efd18SAlex Elder * When the read completes, this page array will be transferred to 22503d7efd18SAlex Elder * the original object request for the copyup operation. 22513d7efd18SAlex Elder * 22523d7efd18SAlex Elder * If an error occurs, record it as the result of the original 22533d7efd18SAlex Elder * object request and mark it done so it gets completed. 
22543d7efd18SAlex Elder */ 22553d7efd18SAlex Elder static int rbd_img_obj_parent_read_full(struct rbd_obj_request *obj_request) 22563d7efd18SAlex Elder { 22573d7efd18SAlex Elder struct rbd_img_request *img_request = NULL; 22583d7efd18SAlex Elder struct rbd_img_request *parent_request = NULL; 22593d7efd18SAlex Elder struct rbd_device *rbd_dev; 22603d7efd18SAlex Elder u64 img_offset; 22613d7efd18SAlex Elder u64 length; 22623d7efd18SAlex Elder struct page **pages = NULL; 22633d7efd18SAlex Elder u32 page_count; 22643d7efd18SAlex Elder int result; 22653d7efd18SAlex Elder 22663d7efd18SAlex Elder rbd_assert(obj_request_img_data_test(obj_request)); 22673d7efd18SAlex Elder rbd_assert(obj_request->type == OBJ_REQUEST_BIO); 22683d7efd18SAlex Elder 22693d7efd18SAlex Elder img_request = obj_request->img_request; 22703d7efd18SAlex Elder rbd_assert(img_request != NULL); 22713d7efd18SAlex Elder rbd_dev = img_request->rbd_dev; 22723d7efd18SAlex Elder rbd_assert(rbd_dev->parent != NULL); 22733d7efd18SAlex Elder 22743d7efd18SAlex Elder /* 22750eefd470SAlex Elder * First things first. The original osd request is of no 22760eefd470SAlex Elder * use to use any more, we'll need a new one that can hold 22770eefd470SAlex Elder * the two ops in a copyup request. We'll get that later, 22780eefd470SAlex Elder * but for now we can release the old one. 22790eefd470SAlex Elder */ 22800eefd470SAlex Elder rbd_osd_req_destroy(obj_request->osd_req); 22810eefd470SAlex Elder obj_request->osd_req = NULL; 22820eefd470SAlex Elder 22830eefd470SAlex Elder /* 22843d7efd18SAlex Elder * Determine the byte range covered by the object in the 22853d7efd18SAlex Elder * child image to which the original request was to be sent. 
22863d7efd18SAlex Elder */ 22873d7efd18SAlex Elder img_offset = obj_request->img_offset - obj_request->offset; 22883d7efd18SAlex Elder length = (u64)1 << rbd_dev->header.obj_order; 22893d7efd18SAlex Elder 22903d7efd18SAlex Elder /* 2291a9e8ba2cSAlex Elder * There is no defined parent data beyond the parent 2292a9e8ba2cSAlex Elder * overlap, so limit what we read at that boundary if 2293a9e8ba2cSAlex Elder * necessary. 2294a9e8ba2cSAlex Elder */ 2295a9e8ba2cSAlex Elder if (img_offset + length > rbd_dev->parent_overlap) { 2296a9e8ba2cSAlex Elder rbd_assert(img_offset < rbd_dev->parent_overlap); 2297a9e8ba2cSAlex Elder length = rbd_dev->parent_overlap - img_offset; 2298a9e8ba2cSAlex Elder } 2299a9e8ba2cSAlex Elder 2300a9e8ba2cSAlex Elder /* 23013d7efd18SAlex Elder * Allocate a page array big enough to receive the data read 23023d7efd18SAlex Elder * from the parent. 23033d7efd18SAlex Elder */ 23043d7efd18SAlex Elder page_count = (u32)calc_pages_for(0, length); 23053d7efd18SAlex Elder pages = ceph_alloc_page_vector(page_count, GFP_KERNEL); 23063d7efd18SAlex Elder if (IS_ERR(pages)) { 23073d7efd18SAlex Elder result = PTR_ERR(pages); 23083d7efd18SAlex Elder pages = NULL; 23093d7efd18SAlex Elder goto out_err; 23103d7efd18SAlex Elder } 23113d7efd18SAlex Elder 23123d7efd18SAlex Elder result = -ENOMEM; 23133d7efd18SAlex Elder parent_request = rbd_img_request_create(rbd_dev->parent, 23143d7efd18SAlex Elder img_offset, length, 23153d7efd18SAlex Elder false, true); 23163d7efd18SAlex Elder if (!parent_request) 23173d7efd18SAlex Elder goto out_err; 23183d7efd18SAlex Elder rbd_obj_request_get(obj_request); 23193d7efd18SAlex Elder parent_request->obj_request = obj_request; 23203d7efd18SAlex Elder 23213d7efd18SAlex Elder result = rbd_img_request_fill(parent_request, OBJ_REQUEST_PAGES, pages); 23223d7efd18SAlex Elder if (result) 23233d7efd18SAlex Elder goto out_err; 23243d7efd18SAlex Elder parent_request->copyup_pages = pages; 23253d7efd18SAlex Elder 23263d7efd18SAlex Elder 
parent_request->callback = rbd_img_obj_parent_read_full_callback; 23273d7efd18SAlex Elder result = rbd_img_request_submit(parent_request); 23283d7efd18SAlex Elder if (!result) 23293d7efd18SAlex Elder return 0; 23303d7efd18SAlex Elder 23313d7efd18SAlex Elder parent_request->copyup_pages = NULL; 23323d7efd18SAlex Elder parent_request->obj_request = NULL; 23333d7efd18SAlex Elder rbd_obj_request_put(obj_request); 23343d7efd18SAlex Elder out_err: 23353d7efd18SAlex Elder if (pages) 23363d7efd18SAlex Elder ceph_release_page_vector(pages, page_count); 23373d7efd18SAlex Elder if (parent_request) 23383d7efd18SAlex Elder rbd_img_request_put(parent_request); 23393d7efd18SAlex Elder obj_request->result = result; 23403d7efd18SAlex Elder obj_request->xferred = 0; 23413d7efd18SAlex Elder obj_request_done_set(obj_request); 23423d7efd18SAlex Elder 23433d7efd18SAlex Elder return result; 23443d7efd18SAlex Elder } 23453d7efd18SAlex Elder 2346c5b5ef6cSAlex Elder static void rbd_img_obj_exists_callback(struct rbd_obj_request *obj_request) 2347c5b5ef6cSAlex Elder { 2348c5b5ef6cSAlex Elder struct rbd_obj_request *orig_request; 2349c5b5ef6cSAlex Elder int result; 2350c5b5ef6cSAlex Elder 2351c5b5ef6cSAlex Elder rbd_assert(!obj_request_img_data_test(obj_request)); 2352c5b5ef6cSAlex Elder 2353c5b5ef6cSAlex Elder /* 2354c5b5ef6cSAlex Elder * All we need from the object request is the original 2355c5b5ef6cSAlex Elder * request and the result of the STAT op. Grab those, then 2356c5b5ef6cSAlex Elder * we're done with the request. 
2357c5b5ef6cSAlex Elder */ 2358c5b5ef6cSAlex Elder orig_request = obj_request->obj_request; 2359c5b5ef6cSAlex Elder obj_request->obj_request = NULL; 2360c5b5ef6cSAlex Elder rbd_assert(orig_request); 2361c5b5ef6cSAlex Elder rbd_assert(orig_request->img_request); 2362c5b5ef6cSAlex Elder 2363c5b5ef6cSAlex Elder result = obj_request->result; 2364c5b5ef6cSAlex Elder obj_request->result = 0; 2365c5b5ef6cSAlex Elder 2366c5b5ef6cSAlex Elder dout("%s: obj %p for obj %p result %d %llu/%llu\n", __func__, 2367c5b5ef6cSAlex Elder obj_request, orig_request, result, 2368c5b5ef6cSAlex Elder obj_request->xferred, obj_request->length); 2369c5b5ef6cSAlex Elder rbd_obj_request_put(obj_request); 2370c5b5ef6cSAlex Elder 2371c5b5ef6cSAlex Elder rbd_assert(orig_request); 2372c5b5ef6cSAlex Elder rbd_assert(orig_request->img_request); 2373c5b5ef6cSAlex Elder 2374c5b5ef6cSAlex Elder /* 2375c5b5ef6cSAlex Elder * Our only purpose here is to determine whether the object 2376c5b5ef6cSAlex Elder * exists, and we don't want to treat the non-existence as 2377c5b5ef6cSAlex Elder * an error. If something else comes back, transfer the 2378c5b5ef6cSAlex Elder * error to the original request and complete it now. 2379c5b5ef6cSAlex Elder */ 2380c5b5ef6cSAlex Elder if (!result) { 2381c5b5ef6cSAlex Elder obj_request_existence_set(orig_request, true); 2382c5b5ef6cSAlex Elder } else if (result == -ENOENT) { 2383c5b5ef6cSAlex Elder obj_request_existence_set(orig_request, false); 2384c5b5ef6cSAlex Elder } else if (result) { 2385c5b5ef6cSAlex Elder orig_request->result = result; 23863d7efd18SAlex Elder goto out; 2387c5b5ef6cSAlex Elder } 2388c5b5ef6cSAlex Elder 2389c5b5ef6cSAlex Elder /* 2390c5b5ef6cSAlex Elder * Resubmit the original request now that we have recorded 2391c5b5ef6cSAlex Elder * whether the target object exists. 
2392c5b5ef6cSAlex Elder */ 2393b454e36dSAlex Elder orig_request->result = rbd_img_obj_request_submit(orig_request); 23943d7efd18SAlex Elder out: 2395c5b5ef6cSAlex Elder if (orig_request->result) 2396c5b5ef6cSAlex Elder rbd_obj_request_complete(orig_request); 2397c5b5ef6cSAlex Elder rbd_obj_request_put(orig_request); 2398c5b5ef6cSAlex Elder } 2399c5b5ef6cSAlex Elder 2400c5b5ef6cSAlex Elder static int rbd_img_obj_exists_submit(struct rbd_obj_request *obj_request) 2401c5b5ef6cSAlex Elder { 2402c5b5ef6cSAlex Elder struct rbd_obj_request *stat_request; 2403c5b5ef6cSAlex Elder struct rbd_device *rbd_dev; 2404c5b5ef6cSAlex Elder struct ceph_osd_client *osdc; 2405c5b5ef6cSAlex Elder struct page **pages = NULL; 2406c5b5ef6cSAlex Elder u32 page_count; 2407c5b5ef6cSAlex Elder size_t size; 2408c5b5ef6cSAlex Elder int ret; 2409c5b5ef6cSAlex Elder 2410c5b5ef6cSAlex Elder /* 2411c5b5ef6cSAlex Elder * The response data for a STAT call consists of: 2412c5b5ef6cSAlex Elder * le64 length; 2413c5b5ef6cSAlex Elder * struct { 2414c5b5ef6cSAlex Elder * le32 tv_sec; 2415c5b5ef6cSAlex Elder * le32 tv_nsec; 2416c5b5ef6cSAlex Elder * } mtime; 2417c5b5ef6cSAlex Elder */ 2418c5b5ef6cSAlex Elder size = sizeof (__le64) + sizeof (__le32) + sizeof (__le32); 2419c5b5ef6cSAlex Elder page_count = (u32)calc_pages_for(0, size); 2420c5b5ef6cSAlex Elder pages = ceph_alloc_page_vector(page_count, GFP_KERNEL); 2421c5b5ef6cSAlex Elder if (IS_ERR(pages)) 2422c5b5ef6cSAlex Elder return PTR_ERR(pages); 2423c5b5ef6cSAlex Elder 2424c5b5ef6cSAlex Elder ret = -ENOMEM; 2425c5b5ef6cSAlex Elder stat_request = rbd_obj_request_create(obj_request->object_name, 0, 0, 2426c5b5ef6cSAlex Elder OBJ_REQUEST_PAGES); 2427c5b5ef6cSAlex Elder if (!stat_request) 2428c5b5ef6cSAlex Elder goto out; 2429c5b5ef6cSAlex Elder 2430c5b5ef6cSAlex Elder rbd_obj_request_get(obj_request); 2431c5b5ef6cSAlex Elder stat_request->obj_request = obj_request; 2432c5b5ef6cSAlex Elder stat_request->pages = pages; 2433c5b5ef6cSAlex Elder 
stat_request->page_count = page_count; 2434c5b5ef6cSAlex Elder 2435c5b5ef6cSAlex Elder rbd_assert(obj_request->img_request); 2436c5b5ef6cSAlex Elder rbd_dev = obj_request->img_request->rbd_dev; 2437c5b5ef6cSAlex Elder stat_request->osd_req = rbd_osd_req_create(rbd_dev, false, 2438c5b5ef6cSAlex Elder stat_request); 2439c5b5ef6cSAlex Elder if (!stat_request->osd_req) 2440c5b5ef6cSAlex Elder goto out; 2441c5b5ef6cSAlex Elder stat_request->callback = rbd_img_obj_exists_callback; 2442c5b5ef6cSAlex Elder 2443c5b5ef6cSAlex Elder osd_req_op_init(stat_request->osd_req, 0, CEPH_OSD_OP_STAT); 2444c5b5ef6cSAlex Elder osd_req_op_raw_data_in_pages(stat_request->osd_req, 0, pages, size, 0, 2445c5b5ef6cSAlex Elder false, false); 24469d4df01fSAlex Elder rbd_osd_req_format_read(stat_request); 2447c5b5ef6cSAlex Elder 2448c5b5ef6cSAlex Elder osdc = &rbd_dev->rbd_client->client->osdc; 2449c5b5ef6cSAlex Elder ret = rbd_obj_request_submit(osdc, stat_request); 2450c5b5ef6cSAlex Elder out: 2451c5b5ef6cSAlex Elder if (ret) 2452c5b5ef6cSAlex Elder rbd_obj_request_put(obj_request); 2453c5b5ef6cSAlex Elder 2454c5b5ef6cSAlex Elder return ret; 2455c5b5ef6cSAlex Elder } 2456c5b5ef6cSAlex Elder 2457b454e36dSAlex Elder static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request) 2458b454e36dSAlex Elder { 2459b454e36dSAlex Elder struct rbd_img_request *img_request; 2460a9e8ba2cSAlex Elder struct rbd_device *rbd_dev; 24613d7efd18SAlex Elder bool known; 2462b454e36dSAlex Elder 2463b454e36dSAlex Elder rbd_assert(obj_request_img_data_test(obj_request)); 2464b454e36dSAlex Elder 2465b454e36dSAlex Elder img_request = obj_request->img_request; 2466b454e36dSAlex Elder rbd_assert(img_request); 2467a9e8ba2cSAlex Elder rbd_dev = img_request->rbd_dev; 2468b454e36dSAlex Elder 2469b454e36dSAlex Elder /* 2470a9e8ba2cSAlex Elder * Only writes to layered images need special handling. 2471a9e8ba2cSAlex Elder * Reads and non-layered writes are simple object requests. 
2472a9e8ba2cSAlex Elder * Layered writes that start beyond the end of the overlap 2473a9e8ba2cSAlex Elder * with the parent have no parent data, so they too are 2474a9e8ba2cSAlex Elder * simple object requests. Finally, if the target object is 2475a9e8ba2cSAlex Elder * known to already exist, its parent data has already been 2476a9e8ba2cSAlex Elder * copied, so a write to the object can also be handled as a 2477a9e8ba2cSAlex Elder * simple object request. 2478b454e36dSAlex Elder */ 2479b454e36dSAlex Elder if (!img_request_write_test(img_request) || 2480b454e36dSAlex Elder !img_request_layered_test(img_request) || 2481a9e8ba2cSAlex Elder rbd_dev->parent_overlap <= obj_request->img_offset || 24823d7efd18SAlex Elder ((known = obj_request_known_test(obj_request)) && 24833d7efd18SAlex Elder obj_request_exists_test(obj_request))) { 2484b454e36dSAlex Elder 2485b454e36dSAlex Elder struct rbd_device *rbd_dev; 2486b454e36dSAlex Elder struct ceph_osd_client *osdc; 2487b454e36dSAlex Elder 2488b454e36dSAlex Elder rbd_dev = obj_request->img_request->rbd_dev; 2489b454e36dSAlex Elder osdc = &rbd_dev->rbd_client->client->osdc; 2490b454e36dSAlex Elder 2491b454e36dSAlex Elder return rbd_obj_request_submit(osdc, obj_request); 2492b454e36dSAlex Elder } 2493b454e36dSAlex Elder 2494b454e36dSAlex Elder /* 24953d7efd18SAlex Elder * It's a layered write. The target object might exist but 24963d7efd18SAlex Elder * we may not know that yet. If we know it doesn't exist, 24973d7efd18SAlex Elder * start by reading the data for the full target object from 24983d7efd18SAlex Elder * the parent so we can use it for a copyup to the target. 2499b454e36dSAlex Elder */ 25003d7efd18SAlex Elder if (known) 25013d7efd18SAlex Elder return rbd_img_obj_parent_read_full(obj_request); 25023d7efd18SAlex Elder 25033d7efd18SAlex Elder /* We don't know whether the target exists. Go find out. 
*/ 2504b454e36dSAlex Elder 2505b454e36dSAlex Elder return rbd_img_obj_exists_submit(obj_request); 2506b454e36dSAlex Elder } 2507b454e36dSAlex Elder 2508bf0d5f50SAlex Elder static int rbd_img_request_submit(struct rbd_img_request *img_request) 2509bf0d5f50SAlex Elder { 2510bf0d5f50SAlex Elder struct rbd_obj_request *obj_request; 251146faeed4SAlex Elder struct rbd_obj_request *next_obj_request; 2512bf0d5f50SAlex Elder 251337206ee5SAlex Elder dout("%s: img %p\n", __func__, img_request); 251446faeed4SAlex Elder for_each_obj_request_safe(img_request, obj_request, next_obj_request) { 2515bf0d5f50SAlex Elder int ret; 2516bf0d5f50SAlex Elder 2517b454e36dSAlex Elder ret = rbd_img_obj_request_submit(obj_request); 2518bf0d5f50SAlex Elder if (ret) 2519bf0d5f50SAlex Elder return ret; 2520bf0d5f50SAlex Elder } 2521bf0d5f50SAlex Elder 2522bf0d5f50SAlex Elder return 0; 2523bf0d5f50SAlex Elder } 2524bf0d5f50SAlex Elder 25258b3e1a56SAlex Elder static void rbd_img_parent_read_callback(struct rbd_img_request *img_request) 25268b3e1a56SAlex Elder { 25278b3e1a56SAlex Elder struct rbd_obj_request *obj_request; 2528a9e8ba2cSAlex Elder struct rbd_device *rbd_dev; 2529a9e8ba2cSAlex Elder u64 obj_end; 25308b3e1a56SAlex Elder 25318b3e1a56SAlex Elder rbd_assert(img_request_child_test(img_request)); 25328b3e1a56SAlex Elder 25338b3e1a56SAlex Elder obj_request = img_request->obj_request; 2534a9e8ba2cSAlex Elder rbd_assert(obj_request); 2535a9e8ba2cSAlex Elder rbd_assert(obj_request->img_request); 25368b3e1a56SAlex Elder 2537a9e8ba2cSAlex Elder obj_request->result = img_request->result; 2538a9e8ba2cSAlex Elder if (obj_request->result) 2539a9e8ba2cSAlex Elder goto out; 2540a9e8ba2cSAlex Elder 2541a9e8ba2cSAlex Elder /* 2542a9e8ba2cSAlex Elder * We need to zero anything beyond the parent overlap 2543a9e8ba2cSAlex Elder * boundary. 
Since rbd_img_obj_request_read_callback() 2544a9e8ba2cSAlex Elder * will zero anything beyond the end of a short read, an 2545a9e8ba2cSAlex Elder * easy way to do this is to pretend the data from the 2546a9e8ba2cSAlex Elder * parent came up short--ending at the overlap boundary. 2547a9e8ba2cSAlex Elder */ 2548a9e8ba2cSAlex Elder rbd_assert(obj_request->img_offset < U64_MAX - obj_request->length); 2549a9e8ba2cSAlex Elder obj_end = obj_request->img_offset + obj_request->length; 2550a9e8ba2cSAlex Elder rbd_dev = obj_request->img_request->rbd_dev; 2551a9e8ba2cSAlex Elder if (obj_end > rbd_dev->parent_overlap) { 2552a9e8ba2cSAlex Elder u64 xferred = 0; 2553a9e8ba2cSAlex Elder 2554a9e8ba2cSAlex Elder if (obj_request->img_offset < rbd_dev->parent_overlap) 2555a9e8ba2cSAlex Elder xferred = rbd_dev->parent_overlap - 2556a9e8ba2cSAlex Elder obj_request->img_offset; 2557a9e8ba2cSAlex Elder 2558a9e8ba2cSAlex Elder obj_request->xferred = min(img_request->xferred, xferred); 2559a9e8ba2cSAlex Elder } else { 2560a9e8ba2cSAlex Elder obj_request->xferred = img_request->xferred; 2561a9e8ba2cSAlex Elder } 2562a9e8ba2cSAlex Elder out: 2563b5b09be3SAlex Elder rbd_img_request_put(img_request); 25648b3e1a56SAlex Elder rbd_img_obj_request_read_callback(obj_request); 25658b3e1a56SAlex Elder rbd_obj_request_complete(obj_request); 25668b3e1a56SAlex Elder } 25678b3e1a56SAlex Elder 25688b3e1a56SAlex Elder static void rbd_img_parent_read(struct rbd_obj_request *obj_request) 25698b3e1a56SAlex Elder { 25708b3e1a56SAlex Elder struct rbd_device *rbd_dev; 25718b3e1a56SAlex Elder struct rbd_img_request *img_request; 25728b3e1a56SAlex Elder int result; 25738b3e1a56SAlex Elder 25748b3e1a56SAlex Elder rbd_assert(obj_request_img_data_test(obj_request)); 25758b3e1a56SAlex Elder rbd_assert(obj_request->img_request != NULL); 25768b3e1a56SAlex Elder rbd_assert(obj_request->result == (s32) -ENOENT); 25778b3e1a56SAlex Elder rbd_assert(obj_request->type == OBJ_REQUEST_BIO); 25788b3e1a56SAlex Elder 
25798b3e1a56SAlex Elder rbd_dev = obj_request->img_request->rbd_dev; 25808b3e1a56SAlex Elder rbd_assert(rbd_dev->parent != NULL); 25818b3e1a56SAlex Elder /* rbd_read_finish(obj_request, obj_request->length); */ 25828b3e1a56SAlex Elder img_request = rbd_img_request_create(rbd_dev->parent, 25838b3e1a56SAlex Elder obj_request->img_offset, 25848b3e1a56SAlex Elder obj_request->length, 25858b3e1a56SAlex Elder false, true); 25868b3e1a56SAlex Elder result = -ENOMEM; 25878b3e1a56SAlex Elder if (!img_request) 25888b3e1a56SAlex Elder goto out_err; 25898b3e1a56SAlex Elder 25908b3e1a56SAlex Elder rbd_obj_request_get(obj_request); 25918b3e1a56SAlex Elder img_request->obj_request = obj_request; 25928b3e1a56SAlex Elder 2593f1a4739fSAlex Elder result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO, 2594f1a4739fSAlex Elder obj_request->bio_list); 25958b3e1a56SAlex Elder if (result) 25968b3e1a56SAlex Elder goto out_err; 25978b3e1a56SAlex Elder 25988b3e1a56SAlex Elder img_request->callback = rbd_img_parent_read_callback; 25998b3e1a56SAlex Elder result = rbd_img_request_submit(img_request); 26008b3e1a56SAlex Elder if (result) 26018b3e1a56SAlex Elder goto out_err; 26028b3e1a56SAlex Elder 26038b3e1a56SAlex Elder return; 26048b3e1a56SAlex Elder out_err: 26058b3e1a56SAlex Elder if (img_request) 26068b3e1a56SAlex Elder rbd_img_request_put(img_request); 26078b3e1a56SAlex Elder obj_request->result = result; 26088b3e1a56SAlex Elder obj_request->xferred = 0; 26098b3e1a56SAlex Elder obj_request_done_set(obj_request); 26108b3e1a56SAlex Elder } 26118b3e1a56SAlex Elder 2612cc4a38bdSAlex Elder static int rbd_obj_notify_ack(struct rbd_device *rbd_dev, u64 notify_id) 2613b8d70035SAlex Elder { 2614b8d70035SAlex Elder struct rbd_obj_request *obj_request; 26152169238dSAlex Elder struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 2616b8d70035SAlex Elder int ret; 2617b8d70035SAlex Elder 2618b8d70035SAlex Elder obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0, 
2619b8d70035SAlex Elder OBJ_REQUEST_NODATA); 2620b8d70035SAlex Elder if (!obj_request) 2621b8d70035SAlex Elder return -ENOMEM; 2622b8d70035SAlex Elder 2623b8d70035SAlex Elder ret = -ENOMEM; 2624430c28c3SAlex Elder obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request); 2625b8d70035SAlex Elder if (!obj_request->osd_req) 2626b8d70035SAlex Elder goto out; 26272169238dSAlex Elder obj_request->callback = rbd_obj_request_put; 2628b8d70035SAlex Elder 2629c99d2d4aSAlex Elder osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_NOTIFY_ACK, 2630cc4a38bdSAlex Elder notify_id, 0, 0); 26319d4df01fSAlex Elder rbd_osd_req_format_read(obj_request); 2632430c28c3SAlex Elder 2633b8d70035SAlex Elder ret = rbd_obj_request_submit(osdc, obj_request); 2634b8d70035SAlex Elder out: 2635cf81b60eSAlex Elder if (ret) 2636b8d70035SAlex Elder rbd_obj_request_put(obj_request); 2637b8d70035SAlex Elder 2638b8d70035SAlex Elder return ret; 2639b8d70035SAlex Elder } 2640b8d70035SAlex Elder 2641b8d70035SAlex Elder static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data) 2642b8d70035SAlex Elder { 2643b8d70035SAlex Elder struct rbd_device *rbd_dev = (struct rbd_device *)data; 2644e627db08SAlex Elder int ret; 2645b8d70035SAlex Elder 2646b8d70035SAlex Elder if (!rbd_dev) 2647b8d70035SAlex Elder return; 2648b8d70035SAlex Elder 264937206ee5SAlex Elder dout("%s: \"%s\" notify_id %llu opcode %u\n", __func__, 2650b8d70035SAlex Elder rbd_dev->header_name, (unsigned long long)notify_id, 2651b8d70035SAlex Elder (unsigned int)opcode); 2652e627db08SAlex Elder ret = rbd_dev_refresh(rbd_dev); 2653e627db08SAlex Elder if (ret) 2654e627db08SAlex Elder rbd_warn(rbd_dev, ": header refresh error (%d)\n", ret); 2655b8d70035SAlex Elder 2656cc4a38bdSAlex Elder rbd_obj_notify_ack(rbd_dev, notify_id); 2657b8d70035SAlex Elder } 2658b8d70035SAlex Elder 26599969ebc5SAlex Elder /* 26609969ebc5SAlex Elder * Request sync osd watch/unwatch. 
The value of "start" determines 26619969ebc5SAlex Elder * whether a watch request is being initiated or torn down. 26629969ebc5SAlex Elder */ 26639969ebc5SAlex Elder static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev, int start) 26649969ebc5SAlex Elder { 26659969ebc5SAlex Elder struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 26669969ebc5SAlex Elder struct rbd_obj_request *obj_request; 26679969ebc5SAlex Elder int ret; 26689969ebc5SAlex Elder 26699969ebc5SAlex Elder rbd_assert(start ^ !!rbd_dev->watch_event); 26709969ebc5SAlex Elder rbd_assert(start ^ !!rbd_dev->watch_request); 26719969ebc5SAlex Elder 26729969ebc5SAlex Elder if (start) { 26733c663bbdSAlex Elder ret = ceph_osdc_create_event(osdc, rbd_watch_cb, rbd_dev, 26749969ebc5SAlex Elder &rbd_dev->watch_event); 26759969ebc5SAlex Elder if (ret < 0) 26769969ebc5SAlex Elder return ret; 26778eb87565SAlex Elder rbd_assert(rbd_dev->watch_event != NULL); 26789969ebc5SAlex Elder } 26799969ebc5SAlex Elder 26809969ebc5SAlex Elder ret = -ENOMEM; 26819969ebc5SAlex Elder obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0, 26829969ebc5SAlex Elder OBJ_REQUEST_NODATA); 26839969ebc5SAlex Elder if (!obj_request) 26849969ebc5SAlex Elder goto out_cancel; 26859969ebc5SAlex Elder 2686430c28c3SAlex Elder obj_request->osd_req = rbd_osd_req_create(rbd_dev, true, obj_request); 2687430c28c3SAlex Elder if (!obj_request->osd_req) 2688430c28c3SAlex Elder goto out_cancel; 2689430c28c3SAlex Elder 26908eb87565SAlex Elder if (start) 2691975241afSAlex Elder ceph_osdc_set_request_linger(osdc, obj_request->osd_req); 26928eb87565SAlex Elder else 26936977c3f9SAlex Elder ceph_osdc_unregister_linger_request(osdc, 2694975241afSAlex Elder rbd_dev->watch_request->osd_req); 26952169238dSAlex Elder 26962169238dSAlex Elder osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_WATCH, 2697b21ebdddSAlex Elder rbd_dev->watch_event->cookie, 0, start); 26989d4df01fSAlex Elder rbd_osd_req_format_write(obj_request); 
26992169238dSAlex Elder 27009969ebc5SAlex Elder ret = rbd_obj_request_submit(osdc, obj_request); 27019969ebc5SAlex Elder if (ret) 27029969ebc5SAlex Elder goto out_cancel; 27039969ebc5SAlex Elder ret = rbd_obj_request_wait(obj_request); 27049969ebc5SAlex Elder if (ret) 27059969ebc5SAlex Elder goto out_cancel; 27069969ebc5SAlex Elder ret = obj_request->result; 27079969ebc5SAlex Elder if (ret) 27089969ebc5SAlex Elder goto out_cancel; 27099969ebc5SAlex Elder 27108eb87565SAlex Elder /* 27118eb87565SAlex Elder * A watch request is set to linger, so the underlying osd 27128eb87565SAlex Elder * request won't go away until we unregister it. We retain 27138eb87565SAlex Elder * a pointer to the object request during that time (in 27148eb87565SAlex Elder * rbd_dev->watch_request), so we'll keep a reference to 27158eb87565SAlex Elder * it. We'll drop that reference (below) after we've 27168eb87565SAlex Elder * unregistered it. 27178eb87565SAlex Elder */ 27188eb87565SAlex Elder if (start) { 27198eb87565SAlex Elder rbd_dev->watch_request = obj_request; 27208eb87565SAlex Elder 27218eb87565SAlex Elder return 0; 27228eb87565SAlex Elder } 27238eb87565SAlex Elder 27248eb87565SAlex Elder /* We have successfully torn down the watch request */ 27258eb87565SAlex Elder 27268eb87565SAlex Elder rbd_obj_request_put(rbd_dev->watch_request); 27278eb87565SAlex Elder rbd_dev->watch_request = NULL; 27289969ebc5SAlex Elder out_cancel: 27299969ebc5SAlex Elder /* Cancel the event if we're tearing down, or on error */ 27309969ebc5SAlex Elder ceph_osdc_cancel_event(rbd_dev->watch_event); 27319969ebc5SAlex Elder rbd_dev->watch_event = NULL; 27329969ebc5SAlex Elder if (obj_request) 27339969ebc5SAlex Elder rbd_obj_request_put(obj_request); 27349969ebc5SAlex Elder 27359969ebc5SAlex Elder return ret; 27369969ebc5SAlex Elder } 27379969ebc5SAlex Elder 273836be9a76SAlex Elder /* 2739f40eb349SAlex Elder * Synchronous osd object method call. 
Returns the number of bytes 2740f40eb349SAlex Elder * returned in the outbound buffer, or a negative error code. 274136be9a76SAlex Elder */ 274236be9a76SAlex Elder static int rbd_obj_method_sync(struct rbd_device *rbd_dev, 274336be9a76SAlex Elder const char *object_name, 274436be9a76SAlex Elder const char *class_name, 274536be9a76SAlex Elder const char *method_name, 27464157976bSAlex Elder const void *outbound, 274736be9a76SAlex Elder size_t outbound_size, 27484157976bSAlex Elder void *inbound, 2749e2a58ee5SAlex Elder size_t inbound_size) 275036be9a76SAlex Elder { 27512169238dSAlex Elder struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 275236be9a76SAlex Elder struct rbd_obj_request *obj_request; 275336be9a76SAlex Elder struct page **pages; 275436be9a76SAlex Elder u32 page_count; 275536be9a76SAlex Elder int ret; 275636be9a76SAlex Elder 275736be9a76SAlex Elder /* 27586010a451SAlex Elder * Method calls are ultimately read operations. The result 27596010a451SAlex Elder * should placed into the inbound buffer provided. They 27606010a451SAlex Elder * also supply outbound data--parameters for the object 27616010a451SAlex Elder * method. Currently if this is present it will be a 27626010a451SAlex Elder * snapshot id. 
276336be9a76SAlex Elder */ 276436be9a76SAlex Elder page_count = (u32)calc_pages_for(0, inbound_size); 276536be9a76SAlex Elder pages = ceph_alloc_page_vector(page_count, GFP_KERNEL); 276636be9a76SAlex Elder if (IS_ERR(pages)) 276736be9a76SAlex Elder return PTR_ERR(pages); 276836be9a76SAlex Elder 276936be9a76SAlex Elder ret = -ENOMEM; 27706010a451SAlex Elder obj_request = rbd_obj_request_create(object_name, 0, inbound_size, 277136be9a76SAlex Elder OBJ_REQUEST_PAGES); 277236be9a76SAlex Elder if (!obj_request) 277336be9a76SAlex Elder goto out; 277436be9a76SAlex Elder 277536be9a76SAlex Elder obj_request->pages = pages; 277636be9a76SAlex Elder obj_request->page_count = page_count; 277736be9a76SAlex Elder 2778430c28c3SAlex Elder obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request); 277936be9a76SAlex Elder if (!obj_request->osd_req) 278036be9a76SAlex Elder goto out; 278136be9a76SAlex Elder 2782c99d2d4aSAlex Elder osd_req_op_cls_init(obj_request->osd_req, 0, CEPH_OSD_OP_CALL, 278304017e29SAlex Elder class_name, method_name); 278404017e29SAlex Elder if (outbound_size) { 278504017e29SAlex Elder struct ceph_pagelist *pagelist; 278604017e29SAlex Elder 278704017e29SAlex Elder pagelist = kmalloc(sizeof (*pagelist), GFP_NOFS); 278804017e29SAlex Elder if (!pagelist) 278904017e29SAlex Elder goto out; 279004017e29SAlex Elder 279104017e29SAlex Elder ceph_pagelist_init(pagelist); 279204017e29SAlex Elder ceph_pagelist_append(pagelist, outbound, outbound_size); 279304017e29SAlex Elder osd_req_op_cls_request_data_pagelist(obj_request->osd_req, 0, 279404017e29SAlex Elder pagelist); 279504017e29SAlex Elder } 2796a4ce40a9SAlex Elder osd_req_op_cls_response_data_pages(obj_request->osd_req, 0, 2797a4ce40a9SAlex Elder obj_request->pages, inbound_size, 279844cd188dSAlex Elder 0, false, false); 27999d4df01fSAlex Elder rbd_osd_req_format_read(obj_request); 2800430c28c3SAlex Elder 280136be9a76SAlex Elder ret = rbd_obj_request_submit(osdc, obj_request); 280236be9a76SAlex Elder if 
(ret) 280336be9a76SAlex Elder goto out; 280436be9a76SAlex Elder ret = rbd_obj_request_wait(obj_request); 280536be9a76SAlex Elder if (ret) 280636be9a76SAlex Elder goto out; 280736be9a76SAlex Elder 280836be9a76SAlex Elder ret = obj_request->result; 280936be9a76SAlex Elder if (ret < 0) 281036be9a76SAlex Elder goto out; 281157385b51SAlex Elder 281257385b51SAlex Elder rbd_assert(obj_request->xferred < (u64)INT_MAX); 281357385b51SAlex Elder ret = (int)obj_request->xferred; 2814903bb32eSAlex Elder ceph_copy_from_page_vector(pages, inbound, 0, obj_request->xferred); 281536be9a76SAlex Elder out: 281636be9a76SAlex Elder if (obj_request) 281736be9a76SAlex Elder rbd_obj_request_put(obj_request); 281836be9a76SAlex Elder else 281936be9a76SAlex Elder ceph_release_page_vector(pages, page_count); 282036be9a76SAlex Elder 282136be9a76SAlex Elder return ret; 282236be9a76SAlex Elder } 282336be9a76SAlex Elder 2824bf0d5f50SAlex Elder static void rbd_request_fn(struct request_queue *q) 2825cc344fa1SAlex Elder __releases(q->queue_lock) __acquires(q->queue_lock) 2826bf0d5f50SAlex Elder { 2827bf0d5f50SAlex Elder struct rbd_device *rbd_dev = q->queuedata; 2828bf0d5f50SAlex Elder bool read_only = rbd_dev->mapping.read_only; 2829bf0d5f50SAlex Elder struct request *rq; 2830bf0d5f50SAlex Elder int result; 2831bf0d5f50SAlex Elder 2832bf0d5f50SAlex Elder while ((rq = blk_fetch_request(q))) { 2833bf0d5f50SAlex Elder bool write_request = rq_data_dir(rq) == WRITE; 2834bf0d5f50SAlex Elder struct rbd_img_request *img_request; 2835bf0d5f50SAlex Elder u64 offset; 2836bf0d5f50SAlex Elder u64 length; 2837bf0d5f50SAlex Elder 2838bf0d5f50SAlex Elder /* Ignore any non-FS requests that filter through. 
*/ 2839bf0d5f50SAlex Elder 2840bf0d5f50SAlex Elder if (rq->cmd_type != REQ_TYPE_FS) { 28414dda41d3SAlex Elder dout("%s: non-fs request type %d\n", __func__, 28424dda41d3SAlex Elder (int) rq->cmd_type); 28434dda41d3SAlex Elder __blk_end_request_all(rq, 0); 28444dda41d3SAlex Elder continue; 28454dda41d3SAlex Elder } 28464dda41d3SAlex Elder 28474dda41d3SAlex Elder /* Ignore/skip any zero-length requests */ 28484dda41d3SAlex Elder 28494dda41d3SAlex Elder offset = (u64) blk_rq_pos(rq) << SECTOR_SHIFT; 28504dda41d3SAlex Elder length = (u64) blk_rq_bytes(rq); 28514dda41d3SAlex Elder 28524dda41d3SAlex Elder if (!length) { 28534dda41d3SAlex Elder dout("%s: zero-length request\n", __func__); 2854bf0d5f50SAlex Elder __blk_end_request_all(rq, 0); 2855bf0d5f50SAlex Elder continue; 2856bf0d5f50SAlex Elder } 2857bf0d5f50SAlex Elder 2858bf0d5f50SAlex Elder spin_unlock_irq(q->queue_lock); 2859bf0d5f50SAlex Elder 2860bf0d5f50SAlex Elder /* Disallow writes to a read-only device */ 2861bf0d5f50SAlex Elder 2862bf0d5f50SAlex Elder if (write_request) { 2863bf0d5f50SAlex Elder result = -EROFS; 2864bf0d5f50SAlex Elder if (read_only) 2865bf0d5f50SAlex Elder goto end_request; 2866bf0d5f50SAlex Elder rbd_assert(rbd_dev->spec->snap_id == CEPH_NOSNAP); 2867bf0d5f50SAlex Elder } 2868bf0d5f50SAlex Elder 28696d292906SAlex Elder /* 28706d292906SAlex Elder * Quit early if the mapped snapshot no longer 28716d292906SAlex Elder * exists. It's still possible the snapshot will 28726d292906SAlex Elder * have disappeared by the time our request arrives 28736d292906SAlex Elder * at the osd, but there's no sense in sending it if 28746d292906SAlex Elder * we already know. 
28756d292906SAlex Elder */ 28766d292906SAlex Elder if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) { 2877bf0d5f50SAlex Elder dout("request for non-existent snapshot"); 2878bf0d5f50SAlex Elder rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP); 2879bf0d5f50SAlex Elder result = -ENXIO; 2880bf0d5f50SAlex Elder goto end_request; 2881bf0d5f50SAlex Elder } 2882bf0d5f50SAlex Elder 2883bf0d5f50SAlex Elder result = -EINVAL; 2884c0cd10dbSAlex Elder if (offset && length > U64_MAX - offset + 1) { 2885c0cd10dbSAlex Elder rbd_warn(rbd_dev, "bad request range (%llu~%llu)\n", 2886c0cd10dbSAlex Elder offset, length); 2887bf0d5f50SAlex Elder goto end_request; /* Shouldn't happen */ 2888c0cd10dbSAlex Elder } 2889bf0d5f50SAlex Elder 289000a653e2SAlex Elder result = -EIO; 289100a653e2SAlex Elder if (offset + length > rbd_dev->mapping.size) { 289200a653e2SAlex Elder rbd_warn(rbd_dev, "beyond EOD (%llu~%llu > %llu)\n", 289300a653e2SAlex Elder offset, length, rbd_dev->mapping.size); 289400a653e2SAlex Elder goto end_request; 289500a653e2SAlex Elder } 289600a653e2SAlex Elder 2897bf0d5f50SAlex Elder result = -ENOMEM; 2898bf0d5f50SAlex Elder img_request = rbd_img_request_create(rbd_dev, offset, length, 28999849e986SAlex Elder write_request, false); 2900bf0d5f50SAlex Elder if (!img_request) 2901bf0d5f50SAlex Elder goto end_request; 2902bf0d5f50SAlex Elder 2903bf0d5f50SAlex Elder img_request->rq = rq; 2904bf0d5f50SAlex Elder 2905f1a4739fSAlex Elder result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO, 2906f1a4739fSAlex Elder rq->bio); 2907bf0d5f50SAlex Elder if (!result) 2908bf0d5f50SAlex Elder result = rbd_img_request_submit(img_request); 2909bf0d5f50SAlex Elder if (result) 2910bf0d5f50SAlex Elder rbd_img_request_put(img_request); 2911bf0d5f50SAlex Elder end_request: 2912bf0d5f50SAlex Elder spin_lock_irq(q->queue_lock); 2913bf0d5f50SAlex Elder if (result < 0) { 29147da22d29SAlex Elder rbd_warn(rbd_dev, "%s %llx at %llx result %d\n", 29157da22d29SAlex Elder write_request ? 
"write" : "read", 29167da22d29SAlex Elder length, offset, result); 29177da22d29SAlex Elder 2918bf0d5f50SAlex Elder __blk_end_request_all(rq, result); 2919bf0d5f50SAlex Elder } 2920bf0d5f50SAlex Elder } 2921bf0d5f50SAlex Elder } 2922bf0d5f50SAlex Elder 2923602adf40SYehuda Sadeh /* 2924602adf40SYehuda Sadeh * a queue callback. Makes sure that we don't create a bio that spans across 2925602adf40SYehuda Sadeh * multiple osd objects. One exception would be with a single page bios, 2926f7760dadSAlex Elder * which we handle later at bio_chain_clone_range() 2927602adf40SYehuda Sadeh */ 2928602adf40SYehuda Sadeh static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd, 2929602adf40SYehuda Sadeh struct bio_vec *bvec) 2930602adf40SYehuda Sadeh { 2931602adf40SYehuda Sadeh struct rbd_device *rbd_dev = q->queuedata; 2932e5cfeed2SAlex Elder sector_t sector_offset; 2933e5cfeed2SAlex Elder sector_t sectors_per_obj; 2934e5cfeed2SAlex Elder sector_t obj_sector_offset; 2935e5cfeed2SAlex Elder int ret; 2936602adf40SYehuda Sadeh 2937e5cfeed2SAlex Elder /* 2938e5cfeed2SAlex Elder * Find how far into its rbd object the partition-relative 2939e5cfeed2SAlex Elder * bio start sector is to offset relative to the enclosing 2940e5cfeed2SAlex Elder * device. 2941e5cfeed2SAlex Elder */ 2942e5cfeed2SAlex Elder sector_offset = get_start_sect(bmd->bi_bdev) + bmd->bi_sector; 2943e5cfeed2SAlex Elder sectors_per_obj = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT); 2944e5cfeed2SAlex Elder obj_sector_offset = sector_offset & (sectors_per_obj - 1); 2945593a9e7bSAlex Elder 2946e5cfeed2SAlex Elder /* 2947e5cfeed2SAlex Elder * Compute the number of bytes from that offset to the end 2948e5cfeed2SAlex Elder * of the object. Account for what's already used by the bio. 
2949e5cfeed2SAlex Elder */ 2950e5cfeed2SAlex Elder ret = (int) (sectors_per_obj - obj_sector_offset) << SECTOR_SHIFT; 2951e5cfeed2SAlex Elder if (ret > bmd->bi_size) 2952e5cfeed2SAlex Elder ret -= bmd->bi_size; 2953e5cfeed2SAlex Elder else 2954e5cfeed2SAlex Elder ret = 0; 2955e5cfeed2SAlex Elder 2956e5cfeed2SAlex Elder /* 2957e5cfeed2SAlex Elder * Don't send back more than was asked for. And if the bio 2958e5cfeed2SAlex Elder * was empty, let the whole thing through because: "Note 2959e5cfeed2SAlex Elder * that a block device *must* allow a single page to be 2960e5cfeed2SAlex Elder * added to an empty bio." 2961e5cfeed2SAlex Elder */ 2962e5cfeed2SAlex Elder rbd_assert(bvec->bv_len <= PAGE_SIZE); 2963e5cfeed2SAlex Elder if (ret > (int) bvec->bv_len || !bmd->bi_size) 2964e5cfeed2SAlex Elder ret = (int) bvec->bv_len; 2965e5cfeed2SAlex Elder 2966e5cfeed2SAlex Elder return ret; 2967602adf40SYehuda Sadeh } 2968602adf40SYehuda Sadeh 2969602adf40SYehuda Sadeh static void rbd_free_disk(struct rbd_device *rbd_dev) 2970602adf40SYehuda Sadeh { 2971602adf40SYehuda Sadeh struct gendisk *disk = rbd_dev->disk; 2972602adf40SYehuda Sadeh 2973602adf40SYehuda Sadeh if (!disk) 2974602adf40SYehuda Sadeh return; 2975602adf40SYehuda Sadeh 2976a0cab924SAlex Elder rbd_dev->disk = NULL; 2977a0cab924SAlex Elder if (disk->flags & GENHD_FL_UP) { 2978602adf40SYehuda Sadeh del_gendisk(disk); 2979602adf40SYehuda Sadeh if (disk->queue) 2980602adf40SYehuda Sadeh blk_cleanup_queue(disk->queue); 2981a0cab924SAlex Elder } 2982602adf40SYehuda Sadeh put_disk(disk); 2983602adf40SYehuda Sadeh } 2984602adf40SYehuda Sadeh 2985788e2df3SAlex Elder static int rbd_obj_read_sync(struct rbd_device *rbd_dev, 2986788e2df3SAlex Elder const char *object_name, 29877097f8dfSAlex Elder u64 offset, u64 length, void *buf) 2988788e2df3SAlex Elder 2989788e2df3SAlex Elder { 29902169238dSAlex Elder struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 2991788e2df3SAlex Elder struct rbd_obj_request *obj_request; 
2992788e2df3SAlex Elder struct page **pages = NULL; 2993788e2df3SAlex Elder u32 page_count; 29941ceae7efSAlex Elder size_t size; 2995788e2df3SAlex Elder int ret; 2996788e2df3SAlex Elder 2997788e2df3SAlex Elder page_count = (u32) calc_pages_for(offset, length); 2998788e2df3SAlex Elder pages = ceph_alloc_page_vector(page_count, GFP_KERNEL); 2999788e2df3SAlex Elder if (IS_ERR(pages)) 3000788e2df3SAlex Elder ret = PTR_ERR(pages); 3001788e2df3SAlex Elder 3002788e2df3SAlex Elder ret = -ENOMEM; 3003788e2df3SAlex Elder obj_request = rbd_obj_request_create(object_name, offset, length, 3004788e2df3SAlex Elder OBJ_REQUEST_PAGES); 3005788e2df3SAlex Elder if (!obj_request) 3006788e2df3SAlex Elder goto out; 3007788e2df3SAlex Elder 3008788e2df3SAlex Elder obj_request->pages = pages; 3009788e2df3SAlex Elder obj_request->page_count = page_count; 3010788e2df3SAlex Elder 3011430c28c3SAlex Elder obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request); 3012788e2df3SAlex Elder if (!obj_request->osd_req) 3013788e2df3SAlex Elder goto out; 3014788e2df3SAlex Elder 3015c99d2d4aSAlex Elder osd_req_op_extent_init(obj_request->osd_req, 0, CEPH_OSD_OP_READ, 3016c99d2d4aSAlex Elder offset, length, 0, 0); 3017406e2c9fSAlex Elder osd_req_op_extent_osd_data_pages(obj_request->osd_req, 0, 3018a4ce40a9SAlex Elder obj_request->pages, 301944cd188dSAlex Elder obj_request->length, 302044cd188dSAlex Elder obj_request->offset & ~PAGE_MASK, 302144cd188dSAlex Elder false, false); 30229d4df01fSAlex Elder rbd_osd_req_format_read(obj_request); 3023430c28c3SAlex Elder 3024788e2df3SAlex Elder ret = rbd_obj_request_submit(osdc, obj_request); 3025788e2df3SAlex Elder if (ret) 3026788e2df3SAlex Elder goto out; 3027788e2df3SAlex Elder ret = rbd_obj_request_wait(obj_request); 3028788e2df3SAlex Elder if (ret) 3029788e2df3SAlex Elder goto out; 3030788e2df3SAlex Elder 3031788e2df3SAlex Elder ret = obj_request->result; 3032788e2df3SAlex Elder if (ret < 0) 3033788e2df3SAlex Elder goto out; 30341ceae7efSAlex 
Elder 30351ceae7efSAlex Elder rbd_assert(obj_request->xferred <= (u64) SIZE_MAX); 30361ceae7efSAlex Elder size = (size_t) obj_request->xferred; 3037903bb32eSAlex Elder ceph_copy_from_page_vector(pages, buf, 0, size); 303823ed6e13SAlex Elder rbd_assert(size <= (size_t)INT_MAX); 303923ed6e13SAlex Elder ret = (int)size; 3040788e2df3SAlex Elder out: 3041788e2df3SAlex Elder if (obj_request) 3042788e2df3SAlex Elder rbd_obj_request_put(obj_request); 3043788e2df3SAlex Elder else 3044788e2df3SAlex Elder ceph_release_page_vector(pages, page_count); 3045788e2df3SAlex Elder 3046788e2df3SAlex Elder return ret; 3047788e2df3SAlex Elder } 3048788e2df3SAlex Elder 3049602adf40SYehuda Sadeh /* 3050662518b1SAlex Elder * Read the complete header for the given rbd device. On successful 3051662518b1SAlex Elder * return, the rbd_dev->header field will contain up-to-date 3052662518b1SAlex Elder * information about the image. 30534156d998SAlex Elder */ 305499a41ebcSAlex Elder static int rbd_dev_v1_header_info(struct rbd_device *rbd_dev) 30554156d998SAlex Elder { 30564156d998SAlex Elder struct rbd_image_header_ondisk *ondisk = NULL; 30574156d998SAlex Elder u32 snap_count = 0; 30584156d998SAlex Elder u64 names_size = 0; 30594156d998SAlex Elder u32 want_count; 30604156d998SAlex Elder int ret; 30614156d998SAlex Elder 30624156d998SAlex Elder /* 30634156d998SAlex Elder * The complete header will include an array of its 64-bit 30644156d998SAlex Elder * snapshot ids, followed by the names of those snapshots as 30654156d998SAlex Elder * a contiguous block of NUL-terminated strings. Note that 30664156d998SAlex Elder * the number of snapshots could change by the time we read 30674156d998SAlex Elder * it in, in which case we re-read it. 
30684156d998SAlex Elder */ 30694156d998SAlex Elder do { 30704156d998SAlex Elder size_t size; 30714156d998SAlex Elder 30724156d998SAlex Elder kfree(ondisk); 30734156d998SAlex Elder 30744156d998SAlex Elder size = sizeof (*ondisk); 30754156d998SAlex Elder size += snap_count * sizeof (struct rbd_image_snap_ondisk); 30764156d998SAlex Elder size += names_size; 30774156d998SAlex Elder ondisk = kmalloc(size, GFP_KERNEL); 30784156d998SAlex Elder if (!ondisk) 3079662518b1SAlex Elder return -ENOMEM; 30804156d998SAlex Elder 3081788e2df3SAlex Elder ret = rbd_obj_read_sync(rbd_dev, rbd_dev->header_name, 30827097f8dfSAlex Elder 0, size, ondisk); 30834156d998SAlex Elder if (ret < 0) 3084662518b1SAlex Elder goto out; 3085c0cd10dbSAlex Elder if ((size_t)ret < size) { 30864156d998SAlex Elder ret = -ENXIO; 308706ecc6cbSAlex Elder rbd_warn(rbd_dev, "short header read (want %zd got %d)", 308806ecc6cbSAlex Elder size, ret); 3089662518b1SAlex Elder goto out; 30904156d998SAlex Elder } 30914156d998SAlex Elder if (!rbd_dev_ondisk_valid(ondisk)) { 30924156d998SAlex Elder ret = -ENXIO; 309306ecc6cbSAlex Elder rbd_warn(rbd_dev, "invalid header"); 3094662518b1SAlex Elder goto out; 30954156d998SAlex Elder } 30964156d998SAlex Elder 30974156d998SAlex Elder names_size = le64_to_cpu(ondisk->snap_names_len); 30984156d998SAlex Elder want_count = snap_count; 30994156d998SAlex Elder snap_count = le32_to_cpu(ondisk->snap_count); 31004156d998SAlex Elder } while (snap_count != want_count); 31014156d998SAlex Elder 3102662518b1SAlex Elder ret = rbd_header_from_disk(rbd_dev, ondisk); 3103662518b1SAlex Elder out: 31044156d998SAlex Elder kfree(ondisk); 3105602adf40SYehuda Sadeh 31064156d998SAlex Elder return ret; 3107602adf40SYehuda Sadeh } 3108602adf40SYehuda Sadeh 3109602adf40SYehuda Sadeh /* 311015228edeSAlex Elder * Clear the rbd device's EXISTS flag if the snapshot it's mapped to 311115228edeSAlex Elder * has disappeared from the (just updated) snapshot context. 
311215228edeSAlex Elder */ 311315228edeSAlex Elder static void rbd_exists_validate(struct rbd_device *rbd_dev) 311415228edeSAlex Elder { 311515228edeSAlex Elder u64 snap_id; 311615228edeSAlex Elder 311715228edeSAlex Elder if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) 311815228edeSAlex Elder return; 311915228edeSAlex Elder 312015228edeSAlex Elder snap_id = rbd_dev->spec->snap_id; 312115228edeSAlex Elder if (snap_id == CEPH_NOSNAP) 312215228edeSAlex Elder return; 312315228edeSAlex Elder 312415228edeSAlex Elder if (rbd_dev_snap_index(rbd_dev, snap_id) == BAD_SNAP_INDEX) 312515228edeSAlex Elder clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags); 312615228edeSAlex Elder } 312715228edeSAlex Elder 3128cc4a38bdSAlex Elder static int rbd_dev_refresh(struct rbd_device *rbd_dev) 31291fe5e993SAlex Elder { 3130e627db08SAlex Elder u64 mapping_size; 31311fe5e993SAlex Elder int ret; 31321fe5e993SAlex Elder 3133117973fbSAlex Elder rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); 3134e627db08SAlex Elder mapping_size = rbd_dev->mapping.size; 31351fe5e993SAlex Elder mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); 3136117973fbSAlex Elder if (rbd_dev->image_format == 1) 313799a41ebcSAlex Elder ret = rbd_dev_v1_header_info(rbd_dev); 3138117973fbSAlex Elder else 31392df3fac7SAlex Elder ret = rbd_dev_v2_header_info(rbd_dev); 314015228edeSAlex Elder 314115228edeSAlex Elder /* If it's a mapped snapshot, validate its EXISTS flag */ 314215228edeSAlex Elder 314315228edeSAlex Elder rbd_exists_validate(rbd_dev); 31441fe5e993SAlex Elder mutex_unlock(&ctl_mutex); 314500a653e2SAlex Elder if (mapping_size != rbd_dev->mapping.size) { 314600a653e2SAlex Elder sector_t size; 314700a653e2SAlex Elder 314800a653e2SAlex Elder size = (sector_t)rbd_dev->mapping.size / SECTOR_SIZE; 314900a653e2SAlex Elder dout("setting size to %llu sectors", (unsigned long long)size); 315000a653e2SAlex Elder set_capacity(rbd_dev->disk, size); 3151a3fbe5d4SAlex Elder revalidate_disk(rbd_dev->disk); 
315200a653e2SAlex Elder } 31531fe5e993SAlex Elder 31541fe5e993SAlex Elder return ret; 31551fe5e993SAlex Elder } 31561fe5e993SAlex Elder 3157602adf40SYehuda Sadeh static int rbd_init_disk(struct rbd_device *rbd_dev) 3158602adf40SYehuda Sadeh { 3159602adf40SYehuda Sadeh struct gendisk *disk; 3160602adf40SYehuda Sadeh struct request_queue *q; 3161593a9e7bSAlex Elder u64 segment_size; 3162602adf40SYehuda Sadeh 3163602adf40SYehuda Sadeh /* create gendisk info */ 3164602adf40SYehuda Sadeh disk = alloc_disk(RBD_MINORS_PER_MAJOR); 3165602adf40SYehuda Sadeh if (!disk) 31661fcdb8aaSAlex Elder return -ENOMEM; 3167602adf40SYehuda Sadeh 3168f0f8cef5SAlex Elder snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d", 3169de71a297SAlex Elder rbd_dev->dev_id); 3170602adf40SYehuda Sadeh disk->major = rbd_dev->major; 3171602adf40SYehuda Sadeh disk->first_minor = 0; 3172602adf40SYehuda Sadeh disk->fops = &rbd_bd_ops; 3173602adf40SYehuda Sadeh disk->private_data = rbd_dev; 3174602adf40SYehuda Sadeh 3175bf0d5f50SAlex Elder q = blk_init_queue(rbd_request_fn, &rbd_dev->lock); 3176602adf40SYehuda Sadeh if (!q) 3177602adf40SYehuda Sadeh goto out_disk; 3178029bcbd8SJosh Durgin 3179593a9e7bSAlex Elder /* We use the default size, but let's be explicit about it. 
*/ 3180593a9e7bSAlex Elder blk_queue_physical_block_size(q, SECTOR_SIZE); 3181593a9e7bSAlex Elder 3182029bcbd8SJosh Durgin /* set io sizes to object size */ 3183593a9e7bSAlex Elder segment_size = rbd_obj_bytes(&rbd_dev->header); 3184593a9e7bSAlex Elder blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE); 3185593a9e7bSAlex Elder blk_queue_max_segment_size(q, segment_size); 3186593a9e7bSAlex Elder blk_queue_io_min(q, segment_size); 3187593a9e7bSAlex Elder blk_queue_io_opt(q, segment_size); 3188029bcbd8SJosh Durgin 3189602adf40SYehuda Sadeh blk_queue_merge_bvec(q, rbd_merge_bvec); 3190602adf40SYehuda Sadeh disk->queue = q; 3191602adf40SYehuda Sadeh 3192602adf40SYehuda Sadeh q->queuedata = rbd_dev; 3193602adf40SYehuda Sadeh 3194602adf40SYehuda Sadeh rbd_dev->disk = disk; 3195602adf40SYehuda Sadeh 3196602adf40SYehuda Sadeh return 0; 3197602adf40SYehuda Sadeh out_disk: 3198602adf40SYehuda Sadeh put_disk(disk); 31991fcdb8aaSAlex Elder 32001fcdb8aaSAlex Elder return -ENOMEM; 3201602adf40SYehuda Sadeh } 3202602adf40SYehuda Sadeh 3203dfc5606dSYehuda Sadeh /* 3204dfc5606dSYehuda Sadeh sysfs 3205dfc5606dSYehuda Sadeh */ 3206602adf40SYehuda Sadeh 3207593a9e7bSAlex Elder static struct rbd_device *dev_to_rbd_dev(struct device *dev) 3208593a9e7bSAlex Elder { 3209593a9e7bSAlex Elder return container_of(dev, struct rbd_device, dev); 3210593a9e7bSAlex Elder } 3211593a9e7bSAlex Elder 3212dfc5606dSYehuda Sadeh static ssize_t rbd_size_show(struct device *dev, 3213dfc5606dSYehuda Sadeh struct device_attribute *attr, char *buf) 3214602adf40SYehuda Sadeh { 3215593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 3216dfc5606dSYehuda Sadeh 3217fc71d833SAlex Elder return sprintf(buf, "%llu\n", 3218fc71d833SAlex Elder (unsigned long long)rbd_dev->mapping.size); 3219602adf40SYehuda Sadeh } 3220602adf40SYehuda Sadeh 322134b13184SAlex Elder /* 322234b13184SAlex Elder * Note this shows the features for whatever's mapped, which is not 322334b13184SAlex Elder * necessarily the 
base image. 322434b13184SAlex Elder */ 322534b13184SAlex Elder static ssize_t rbd_features_show(struct device *dev, 322634b13184SAlex Elder struct device_attribute *attr, char *buf) 322734b13184SAlex Elder { 322834b13184SAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 322934b13184SAlex Elder 323034b13184SAlex Elder return sprintf(buf, "0x%016llx\n", 323134b13184SAlex Elder (unsigned long long)rbd_dev->mapping.features); 323234b13184SAlex Elder } 323334b13184SAlex Elder 3234dfc5606dSYehuda Sadeh static ssize_t rbd_major_show(struct device *dev, 3235dfc5606dSYehuda Sadeh struct device_attribute *attr, char *buf) 3236602adf40SYehuda Sadeh { 3237593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 3238dfc5606dSYehuda Sadeh 3239fc71d833SAlex Elder if (rbd_dev->major) 3240dfc5606dSYehuda Sadeh return sprintf(buf, "%d\n", rbd_dev->major); 3241fc71d833SAlex Elder 3242fc71d833SAlex Elder return sprintf(buf, "(none)\n"); 3243fc71d833SAlex Elder 3244dfc5606dSYehuda Sadeh } 3245dfc5606dSYehuda Sadeh 3246dfc5606dSYehuda Sadeh static ssize_t rbd_client_id_show(struct device *dev, 3247dfc5606dSYehuda Sadeh struct device_attribute *attr, char *buf) 3248dfc5606dSYehuda Sadeh { 3249593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 3250dfc5606dSYehuda Sadeh 32511dbb4399SAlex Elder return sprintf(buf, "client%lld\n", 32521dbb4399SAlex Elder ceph_client_id(rbd_dev->rbd_client->client)); 3253dfc5606dSYehuda Sadeh } 3254dfc5606dSYehuda Sadeh 3255dfc5606dSYehuda Sadeh static ssize_t rbd_pool_show(struct device *dev, 3256dfc5606dSYehuda Sadeh struct device_attribute *attr, char *buf) 3257dfc5606dSYehuda Sadeh { 3258593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 3259dfc5606dSYehuda Sadeh 32600d7dbfceSAlex Elder return sprintf(buf, "%s\n", rbd_dev->spec->pool_name); 3261dfc5606dSYehuda Sadeh } 3262dfc5606dSYehuda Sadeh 32639bb2f334SAlex Elder static ssize_t rbd_pool_id_show(struct device *dev, 32649bb2f334SAlex Elder 
struct device_attribute *attr, char *buf) 32659bb2f334SAlex Elder { 32669bb2f334SAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 32679bb2f334SAlex Elder 32680d7dbfceSAlex Elder return sprintf(buf, "%llu\n", 32690d7dbfceSAlex Elder (unsigned long long) rbd_dev->spec->pool_id); 32709bb2f334SAlex Elder } 32719bb2f334SAlex Elder 3272dfc5606dSYehuda Sadeh static ssize_t rbd_name_show(struct device *dev, 3273dfc5606dSYehuda Sadeh struct device_attribute *attr, char *buf) 3274dfc5606dSYehuda Sadeh { 3275593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 3276dfc5606dSYehuda Sadeh 3277a92ffdf8SAlex Elder if (rbd_dev->spec->image_name) 32780d7dbfceSAlex Elder return sprintf(buf, "%s\n", rbd_dev->spec->image_name); 3279a92ffdf8SAlex Elder 3280a92ffdf8SAlex Elder return sprintf(buf, "(unknown)\n"); 3281dfc5606dSYehuda Sadeh } 3282dfc5606dSYehuda Sadeh 3283589d30e0SAlex Elder static ssize_t rbd_image_id_show(struct device *dev, 3284589d30e0SAlex Elder struct device_attribute *attr, char *buf) 3285589d30e0SAlex Elder { 3286589d30e0SAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 3287589d30e0SAlex Elder 32880d7dbfceSAlex Elder return sprintf(buf, "%s\n", rbd_dev->spec->image_id); 3289589d30e0SAlex Elder } 3290589d30e0SAlex Elder 329134b13184SAlex Elder /* 329234b13184SAlex Elder * Shows the name of the currently-mapped snapshot (or 329334b13184SAlex Elder * RBD_SNAP_HEAD_NAME for the base image). 
329434b13184SAlex Elder */ 3295dfc5606dSYehuda Sadeh static ssize_t rbd_snap_show(struct device *dev, 3296dfc5606dSYehuda Sadeh struct device_attribute *attr, 3297dfc5606dSYehuda Sadeh char *buf) 3298dfc5606dSYehuda Sadeh { 3299593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 3300dfc5606dSYehuda Sadeh 33010d7dbfceSAlex Elder return sprintf(buf, "%s\n", rbd_dev->spec->snap_name); 3302dfc5606dSYehuda Sadeh } 3303dfc5606dSYehuda Sadeh 330486b00e0dSAlex Elder /* 330586b00e0dSAlex Elder * For an rbd v2 image, shows the pool id, image id, and snapshot id 330686b00e0dSAlex Elder * for the parent image. If there is no parent, simply shows 330786b00e0dSAlex Elder * "(no parent image)". 330886b00e0dSAlex Elder */ 330986b00e0dSAlex Elder static ssize_t rbd_parent_show(struct device *dev, 331086b00e0dSAlex Elder struct device_attribute *attr, 331186b00e0dSAlex Elder char *buf) 331286b00e0dSAlex Elder { 331386b00e0dSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 331486b00e0dSAlex Elder struct rbd_spec *spec = rbd_dev->parent_spec; 331586b00e0dSAlex Elder int count; 331686b00e0dSAlex Elder char *bufp = buf; 331786b00e0dSAlex Elder 331886b00e0dSAlex Elder if (!spec) 331986b00e0dSAlex Elder return sprintf(buf, "(no parent image)\n"); 332086b00e0dSAlex Elder 332186b00e0dSAlex Elder count = sprintf(bufp, "pool_id %llu\npool_name %s\n", 332286b00e0dSAlex Elder (unsigned long long) spec->pool_id, spec->pool_name); 332386b00e0dSAlex Elder if (count < 0) 332486b00e0dSAlex Elder return count; 332586b00e0dSAlex Elder bufp += count; 332686b00e0dSAlex Elder 332786b00e0dSAlex Elder count = sprintf(bufp, "image_id %s\nimage_name %s\n", spec->image_id, 332886b00e0dSAlex Elder spec->image_name ? 
spec->image_name : "(unknown)"); 332986b00e0dSAlex Elder if (count < 0) 333086b00e0dSAlex Elder return count; 333186b00e0dSAlex Elder bufp += count; 333286b00e0dSAlex Elder 333386b00e0dSAlex Elder count = sprintf(bufp, "snap_id %llu\nsnap_name %s\n", 333486b00e0dSAlex Elder (unsigned long long) spec->snap_id, spec->snap_name); 333586b00e0dSAlex Elder if (count < 0) 333686b00e0dSAlex Elder return count; 333786b00e0dSAlex Elder bufp += count; 333886b00e0dSAlex Elder 333986b00e0dSAlex Elder count = sprintf(bufp, "overlap %llu\n", rbd_dev->parent_overlap); 334086b00e0dSAlex Elder if (count < 0) 334186b00e0dSAlex Elder return count; 334286b00e0dSAlex Elder bufp += count; 334386b00e0dSAlex Elder 334486b00e0dSAlex Elder return (ssize_t) (bufp - buf); 334586b00e0dSAlex Elder } 334686b00e0dSAlex Elder 3347dfc5606dSYehuda Sadeh static ssize_t rbd_image_refresh(struct device *dev, 3348dfc5606dSYehuda Sadeh struct device_attribute *attr, 3349dfc5606dSYehuda Sadeh const char *buf, 3350dfc5606dSYehuda Sadeh size_t size) 3351dfc5606dSYehuda Sadeh { 3352593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 3353b813623aSAlex Elder int ret; 3354602adf40SYehuda Sadeh 3355cc4a38bdSAlex Elder ret = rbd_dev_refresh(rbd_dev); 3356e627db08SAlex Elder if (ret) 3357e627db08SAlex Elder rbd_warn(rbd_dev, ": manual header refresh error (%d)\n", ret); 3358b813623aSAlex Elder 3359b813623aSAlex Elder return ret < 0 ? 
ret : size; 3360dfc5606dSYehuda Sadeh } 3361602adf40SYehuda Sadeh 3362dfc5606dSYehuda Sadeh static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL); 336334b13184SAlex Elder static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL); 3364dfc5606dSYehuda Sadeh static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL); 3365dfc5606dSYehuda Sadeh static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL); 3366dfc5606dSYehuda Sadeh static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL); 33679bb2f334SAlex Elder static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL); 3368dfc5606dSYehuda Sadeh static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL); 3369589d30e0SAlex Elder static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL); 3370dfc5606dSYehuda Sadeh static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh); 3371dfc5606dSYehuda Sadeh static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL); 337286b00e0dSAlex Elder static DEVICE_ATTR(parent, S_IRUGO, rbd_parent_show, NULL); 3373dfc5606dSYehuda Sadeh 3374dfc5606dSYehuda Sadeh static struct attribute *rbd_attrs[] = { 3375dfc5606dSYehuda Sadeh &dev_attr_size.attr, 337634b13184SAlex Elder &dev_attr_features.attr, 3377dfc5606dSYehuda Sadeh &dev_attr_major.attr, 3378dfc5606dSYehuda Sadeh &dev_attr_client_id.attr, 3379dfc5606dSYehuda Sadeh &dev_attr_pool.attr, 33809bb2f334SAlex Elder &dev_attr_pool_id.attr, 3381dfc5606dSYehuda Sadeh &dev_attr_name.attr, 3382589d30e0SAlex Elder &dev_attr_image_id.attr, 3383dfc5606dSYehuda Sadeh &dev_attr_current_snap.attr, 338486b00e0dSAlex Elder &dev_attr_parent.attr, 3385dfc5606dSYehuda Sadeh &dev_attr_refresh.attr, 3386dfc5606dSYehuda Sadeh NULL 3387dfc5606dSYehuda Sadeh }; 3388dfc5606dSYehuda Sadeh 3389dfc5606dSYehuda Sadeh static struct attribute_group rbd_attr_group = { 3390dfc5606dSYehuda Sadeh .attrs = rbd_attrs, 3391dfc5606dSYehuda Sadeh }; 3392dfc5606dSYehuda Sadeh 3393dfc5606dSYehuda Sadeh static const struct attribute_group *rbd_attr_groups[] = { 
3394dfc5606dSYehuda Sadeh &rbd_attr_group, 3395dfc5606dSYehuda Sadeh NULL 3396dfc5606dSYehuda Sadeh }; 3397dfc5606dSYehuda Sadeh 3398dfc5606dSYehuda Sadeh static void rbd_sysfs_dev_release(struct device *dev) 3399dfc5606dSYehuda Sadeh { 3400dfc5606dSYehuda Sadeh } 3401dfc5606dSYehuda Sadeh 3402dfc5606dSYehuda Sadeh static struct device_type rbd_device_type = { 3403dfc5606dSYehuda Sadeh .name = "rbd", 3404dfc5606dSYehuda Sadeh .groups = rbd_attr_groups, 3405dfc5606dSYehuda Sadeh .release = rbd_sysfs_dev_release, 3406dfc5606dSYehuda Sadeh }; 3407dfc5606dSYehuda Sadeh 34088b8fb99cSAlex Elder static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec) 34098b8fb99cSAlex Elder { 34108b8fb99cSAlex Elder kref_get(&spec->kref); 34118b8fb99cSAlex Elder 34128b8fb99cSAlex Elder return spec; 34138b8fb99cSAlex Elder } 34148b8fb99cSAlex Elder 34158b8fb99cSAlex Elder static void rbd_spec_free(struct kref *kref); 34168b8fb99cSAlex Elder static void rbd_spec_put(struct rbd_spec *spec) 34178b8fb99cSAlex Elder { 34188b8fb99cSAlex Elder if (spec) 34198b8fb99cSAlex Elder kref_put(&spec->kref, rbd_spec_free); 34208b8fb99cSAlex Elder } 34218b8fb99cSAlex Elder 34228b8fb99cSAlex Elder static struct rbd_spec *rbd_spec_alloc(void) 34238b8fb99cSAlex Elder { 34248b8fb99cSAlex Elder struct rbd_spec *spec; 34258b8fb99cSAlex Elder 34268b8fb99cSAlex Elder spec = kzalloc(sizeof (*spec), GFP_KERNEL); 34278b8fb99cSAlex Elder if (!spec) 34288b8fb99cSAlex Elder return NULL; 34298b8fb99cSAlex Elder kref_init(&spec->kref); 34308b8fb99cSAlex Elder 34318b8fb99cSAlex Elder return spec; 34328b8fb99cSAlex Elder } 34338b8fb99cSAlex Elder 34348b8fb99cSAlex Elder static void rbd_spec_free(struct kref *kref) 34358b8fb99cSAlex Elder { 34368b8fb99cSAlex Elder struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref); 34378b8fb99cSAlex Elder 34388b8fb99cSAlex Elder kfree(spec->pool_name); 34398b8fb99cSAlex Elder kfree(spec->image_id); 34408b8fb99cSAlex Elder kfree(spec->image_name); 34418b8fb99cSAlex Elder 
kfree(spec->snap_name); 34428b8fb99cSAlex Elder kfree(spec); 34438b8fb99cSAlex Elder } 34448b8fb99cSAlex Elder 3445cc344fa1SAlex Elder static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc, 3446c53d5893SAlex Elder struct rbd_spec *spec) 3447c53d5893SAlex Elder { 3448c53d5893SAlex Elder struct rbd_device *rbd_dev; 3449c53d5893SAlex Elder 3450c53d5893SAlex Elder rbd_dev = kzalloc(sizeof (*rbd_dev), GFP_KERNEL); 3451c53d5893SAlex Elder if (!rbd_dev) 3452c53d5893SAlex Elder return NULL; 3453c53d5893SAlex Elder 3454c53d5893SAlex Elder spin_lock_init(&rbd_dev->lock); 34556d292906SAlex Elder rbd_dev->flags = 0; 3456c53d5893SAlex Elder INIT_LIST_HEAD(&rbd_dev->node); 3457c53d5893SAlex Elder init_rwsem(&rbd_dev->header_rwsem); 3458c53d5893SAlex Elder 3459c53d5893SAlex Elder rbd_dev->spec = spec; 3460c53d5893SAlex Elder rbd_dev->rbd_client = rbdc; 3461c53d5893SAlex Elder 34620903e875SAlex Elder /* Initialize the layout used for all rbd requests */ 34630903e875SAlex Elder 34640903e875SAlex Elder rbd_dev->layout.fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER); 34650903e875SAlex Elder rbd_dev->layout.fl_stripe_count = cpu_to_le32(1); 34660903e875SAlex Elder rbd_dev->layout.fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER); 34670903e875SAlex Elder rbd_dev->layout.fl_pg_pool = cpu_to_le32((u32) spec->pool_id); 34680903e875SAlex Elder 3469c53d5893SAlex Elder return rbd_dev; 3470c53d5893SAlex Elder } 3471c53d5893SAlex Elder 3472c53d5893SAlex Elder static void rbd_dev_destroy(struct rbd_device *rbd_dev) 3473c53d5893SAlex Elder { 3474c53d5893SAlex Elder rbd_put_client(rbd_dev->rbd_client); 3475c53d5893SAlex Elder rbd_spec_put(rbd_dev->spec); 3476c53d5893SAlex Elder kfree(rbd_dev); 3477c53d5893SAlex Elder } 3478c53d5893SAlex Elder 3479dfc5606dSYehuda Sadeh /* 34809d475de5SAlex Elder * Get the size and object order for an image snapshot, or if 34819d475de5SAlex Elder * snap_id is CEPH_NOSNAP, gets this information for the base 34829d475de5SAlex Elder * image. 
34839d475de5SAlex Elder */ 34849d475de5SAlex Elder static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id, 34859d475de5SAlex Elder u8 *order, u64 *snap_size) 34869d475de5SAlex Elder { 34879d475de5SAlex Elder __le64 snapid = cpu_to_le64(snap_id); 34889d475de5SAlex Elder int ret; 34899d475de5SAlex Elder struct { 34909d475de5SAlex Elder u8 order; 34919d475de5SAlex Elder __le64 size; 34929d475de5SAlex Elder } __attribute__ ((packed)) size_buf = { 0 }; 34939d475de5SAlex Elder 349436be9a76SAlex Elder ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, 34959d475de5SAlex Elder "rbd", "get_size", 34964157976bSAlex Elder &snapid, sizeof (snapid), 3497e2a58ee5SAlex Elder &size_buf, sizeof (size_buf)); 349836be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 34999d475de5SAlex Elder if (ret < 0) 35009d475de5SAlex Elder return ret; 350157385b51SAlex Elder if (ret < sizeof (size_buf)) 350257385b51SAlex Elder return -ERANGE; 35039d475de5SAlex Elder 3504c86f86e9SAlex Elder if (order) 35059d475de5SAlex Elder *order = size_buf.order; 35069d475de5SAlex Elder *snap_size = le64_to_cpu(size_buf.size); 35079d475de5SAlex Elder 35089d475de5SAlex Elder dout(" snap_id 0x%016llx order = %u, snap_size = %llu\n", 35099d475de5SAlex Elder (unsigned long long)snap_id, (unsigned int)*order, 35109d475de5SAlex Elder (unsigned long long)*snap_size); 35119d475de5SAlex Elder 35129d475de5SAlex Elder return 0; 35139d475de5SAlex Elder } 35149d475de5SAlex Elder 35159d475de5SAlex Elder static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev) 35169d475de5SAlex Elder { 35179d475de5SAlex Elder return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP, 35189d475de5SAlex Elder &rbd_dev->header.obj_order, 35199d475de5SAlex Elder &rbd_dev->header.image_size); 35209d475de5SAlex Elder } 35219d475de5SAlex Elder 35221e130199SAlex Elder static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev) 35231e130199SAlex Elder { 35241e130199SAlex Elder void *reply_buf; 
35251e130199SAlex Elder int ret; 35261e130199SAlex Elder void *p; 35271e130199SAlex Elder 35281e130199SAlex Elder reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL); 35291e130199SAlex Elder if (!reply_buf) 35301e130199SAlex Elder return -ENOMEM; 35311e130199SAlex Elder 353236be9a76SAlex Elder ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, 35334157976bSAlex Elder "rbd", "get_object_prefix", NULL, 0, 3534e2a58ee5SAlex Elder reply_buf, RBD_OBJ_PREFIX_LEN_MAX); 353536be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 35361e130199SAlex Elder if (ret < 0) 35371e130199SAlex Elder goto out; 35381e130199SAlex Elder 35391e130199SAlex Elder p = reply_buf; 35401e130199SAlex Elder rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p, 354157385b51SAlex Elder p + ret, NULL, GFP_NOIO); 354257385b51SAlex Elder ret = 0; 35431e130199SAlex Elder 35441e130199SAlex Elder if (IS_ERR(rbd_dev->header.object_prefix)) { 35451e130199SAlex Elder ret = PTR_ERR(rbd_dev->header.object_prefix); 35461e130199SAlex Elder rbd_dev->header.object_prefix = NULL; 35471e130199SAlex Elder } else { 35481e130199SAlex Elder dout(" object_prefix = %s\n", rbd_dev->header.object_prefix); 35491e130199SAlex Elder } 35501e130199SAlex Elder out: 35511e130199SAlex Elder kfree(reply_buf); 35521e130199SAlex Elder 35531e130199SAlex Elder return ret; 35541e130199SAlex Elder } 35551e130199SAlex Elder 3556b1b5402aSAlex Elder static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id, 3557b1b5402aSAlex Elder u64 *snap_features) 3558b1b5402aSAlex Elder { 3559b1b5402aSAlex Elder __le64 snapid = cpu_to_le64(snap_id); 3560b1b5402aSAlex Elder struct { 3561b1b5402aSAlex Elder __le64 features; 3562b1b5402aSAlex Elder __le64 incompat; 35634157976bSAlex Elder } __attribute__ ((packed)) features_buf = { 0 }; 3564d889140cSAlex Elder u64 incompat; 3565b1b5402aSAlex Elder int ret; 3566b1b5402aSAlex Elder 356736be9a76SAlex Elder ret = rbd_obj_method_sync(rbd_dev, 
rbd_dev->header_name, 3568b1b5402aSAlex Elder "rbd", "get_features", 35694157976bSAlex Elder &snapid, sizeof (snapid), 3570e2a58ee5SAlex Elder &features_buf, sizeof (features_buf)); 357136be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 3572b1b5402aSAlex Elder if (ret < 0) 3573b1b5402aSAlex Elder return ret; 357457385b51SAlex Elder if (ret < sizeof (features_buf)) 357557385b51SAlex Elder return -ERANGE; 3576d889140cSAlex Elder 3577d889140cSAlex Elder incompat = le64_to_cpu(features_buf.incompat); 35785cbf6f12SAlex Elder if (incompat & ~RBD_FEATURES_SUPPORTED) 3579b8f5c6edSAlex Elder return -ENXIO; 3580d889140cSAlex Elder 3581b1b5402aSAlex Elder *snap_features = le64_to_cpu(features_buf.features); 3582b1b5402aSAlex Elder 3583b1b5402aSAlex Elder dout(" snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n", 3584b1b5402aSAlex Elder (unsigned long long)snap_id, 3585b1b5402aSAlex Elder (unsigned long long)*snap_features, 3586b1b5402aSAlex Elder (unsigned long long)le64_to_cpu(features_buf.incompat)); 3587b1b5402aSAlex Elder 3588b1b5402aSAlex Elder return 0; 3589b1b5402aSAlex Elder } 3590b1b5402aSAlex Elder 3591b1b5402aSAlex Elder static int rbd_dev_v2_features(struct rbd_device *rbd_dev) 3592b1b5402aSAlex Elder { 3593b1b5402aSAlex Elder return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP, 3594b1b5402aSAlex Elder &rbd_dev->header.features); 3595b1b5402aSAlex Elder } 3596b1b5402aSAlex Elder 359786b00e0dSAlex Elder static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev) 359886b00e0dSAlex Elder { 359986b00e0dSAlex Elder struct rbd_spec *parent_spec; 360086b00e0dSAlex Elder size_t size; 360186b00e0dSAlex Elder void *reply_buf = NULL; 360286b00e0dSAlex Elder __le64 snapid; 360386b00e0dSAlex Elder void *p; 360486b00e0dSAlex Elder void *end; 360586b00e0dSAlex Elder char *image_id; 360686b00e0dSAlex Elder u64 overlap; 360786b00e0dSAlex Elder int ret; 360886b00e0dSAlex Elder 360986b00e0dSAlex Elder parent_spec = rbd_spec_alloc(); 
361086b00e0dSAlex Elder if (!parent_spec) 361186b00e0dSAlex Elder return -ENOMEM; 361286b00e0dSAlex Elder 361386b00e0dSAlex Elder size = sizeof (__le64) + /* pool_id */ 361486b00e0dSAlex Elder sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX + /* image_id */ 361586b00e0dSAlex Elder sizeof (__le64) + /* snap_id */ 361686b00e0dSAlex Elder sizeof (__le64); /* overlap */ 361786b00e0dSAlex Elder reply_buf = kmalloc(size, GFP_KERNEL); 361886b00e0dSAlex Elder if (!reply_buf) { 361986b00e0dSAlex Elder ret = -ENOMEM; 362086b00e0dSAlex Elder goto out_err; 362186b00e0dSAlex Elder } 362286b00e0dSAlex Elder 362386b00e0dSAlex Elder snapid = cpu_to_le64(CEPH_NOSNAP); 362436be9a76SAlex Elder ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, 362586b00e0dSAlex Elder "rbd", "get_parent", 36264157976bSAlex Elder &snapid, sizeof (snapid), 3627e2a58ee5SAlex Elder reply_buf, size); 362836be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 362986b00e0dSAlex Elder if (ret < 0) 363086b00e0dSAlex Elder goto out_err; 363186b00e0dSAlex Elder 363286b00e0dSAlex Elder p = reply_buf; 363357385b51SAlex Elder end = reply_buf + ret; 363457385b51SAlex Elder ret = -ERANGE; 363586b00e0dSAlex Elder ceph_decode_64_safe(&p, end, parent_spec->pool_id, out_err); 363686b00e0dSAlex Elder if (parent_spec->pool_id == CEPH_NOPOOL) 363786b00e0dSAlex Elder goto out; /* No parent? No problem. 
*/ 363886b00e0dSAlex Elder 36390903e875SAlex Elder /* The ceph file layout needs to fit pool id in 32 bits */ 36400903e875SAlex Elder 36410903e875SAlex Elder ret = -EIO; 3642c0cd10dbSAlex Elder if (parent_spec->pool_id > (u64)U32_MAX) { 3643c0cd10dbSAlex Elder rbd_warn(NULL, "parent pool id too large (%llu > %u)\n", 3644c0cd10dbSAlex Elder (unsigned long long)parent_spec->pool_id, U32_MAX); 364557385b51SAlex Elder goto out_err; 3646c0cd10dbSAlex Elder } 36470903e875SAlex Elder 3648979ed480SAlex Elder image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL); 364986b00e0dSAlex Elder if (IS_ERR(image_id)) { 365086b00e0dSAlex Elder ret = PTR_ERR(image_id); 365186b00e0dSAlex Elder goto out_err; 365286b00e0dSAlex Elder } 365386b00e0dSAlex Elder parent_spec->image_id = image_id; 365486b00e0dSAlex Elder ceph_decode_64_safe(&p, end, parent_spec->snap_id, out_err); 365586b00e0dSAlex Elder ceph_decode_64_safe(&p, end, overlap, out_err); 365686b00e0dSAlex Elder 365786b00e0dSAlex Elder rbd_dev->parent_overlap = overlap; 365886b00e0dSAlex Elder rbd_dev->parent_spec = parent_spec; 365986b00e0dSAlex Elder parent_spec = NULL; /* rbd_dev now owns this */ 366086b00e0dSAlex Elder out: 366186b00e0dSAlex Elder ret = 0; 366286b00e0dSAlex Elder out_err: 366386b00e0dSAlex Elder kfree(reply_buf); 366486b00e0dSAlex Elder rbd_spec_put(parent_spec); 366586b00e0dSAlex Elder 366686b00e0dSAlex Elder return ret; 366786b00e0dSAlex Elder } 366886b00e0dSAlex Elder 3669cc070d59SAlex Elder static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev) 3670cc070d59SAlex Elder { 3671cc070d59SAlex Elder struct { 3672cc070d59SAlex Elder __le64 stripe_unit; 3673cc070d59SAlex Elder __le64 stripe_count; 3674cc070d59SAlex Elder } __attribute__ ((packed)) striping_info_buf = { 0 }; 3675cc070d59SAlex Elder size_t size = sizeof (striping_info_buf); 3676cc070d59SAlex Elder void *p; 3677cc070d59SAlex Elder u64 obj_size; 3678cc070d59SAlex Elder u64 stripe_unit; 3679cc070d59SAlex Elder u64 stripe_count; 
3680cc070d59SAlex Elder int ret; 3681cc070d59SAlex Elder 3682cc070d59SAlex Elder ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, 3683cc070d59SAlex Elder "rbd", "get_stripe_unit_count", NULL, 0, 3684e2a58ee5SAlex Elder (char *)&striping_info_buf, size); 3685cc070d59SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 3686cc070d59SAlex Elder if (ret < 0) 3687cc070d59SAlex Elder return ret; 3688cc070d59SAlex Elder if (ret < size) 3689cc070d59SAlex Elder return -ERANGE; 3690cc070d59SAlex Elder 3691cc070d59SAlex Elder /* 3692cc070d59SAlex Elder * We don't actually support the "fancy striping" feature 3693cc070d59SAlex Elder * (STRIPINGV2) yet, but if the striping sizes are the 3694cc070d59SAlex Elder * defaults the behavior is the same as before. So find 3695cc070d59SAlex Elder * out, and only fail if the image has non-default values. 3696cc070d59SAlex Elder */ 3697cc070d59SAlex Elder ret = -EINVAL; 3698cc070d59SAlex Elder obj_size = (u64)1 << rbd_dev->header.obj_order; 3699cc070d59SAlex Elder p = &striping_info_buf; 3700cc070d59SAlex Elder stripe_unit = ceph_decode_64(&p); 3701cc070d59SAlex Elder if (stripe_unit != obj_size) { 3702cc070d59SAlex Elder rbd_warn(rbd_dev, "unsupported stripe unit " 3703cc070d59SAlex Elder "(got %llu want %llu)", 3704cc070d59SAlex Elder stripe_unit, obj_size); 3705cc070d59SAlex Elder return -EINVAL; 3706cc070d59SAlex Elder } 3707cc070d59SAlex Elder stripe_count = ceph_decode_64(&p); 3708cc070d59SAlex Elder if (stripe_count != 1) { 3709cc070d59SAlex Elder rbd_warn(rbd_dev, "unsupported stripe count " 3710cc070d59SAlex Elder "(got %llu want 1)", stripe_count); 3711cc070d59SAlex Elder return -EINVAL; 3712cc070d59SAlex Elder } 3713500d0c0fSAlex Elder rbd_dev->header.stripe_unit = stripe_unit; 3714500d0c0fSAlex Elder rbd_dev->header.stripe_count = stripe_count; 3715cc070d59SAlex Elder 3716cc070d59SAlex Elder return 0; 3717cc070d59SAlex Elder } 3718cc070d59SAlex Elder 37199e15b77dSAlex Elder static char 
*rbd_dev_image_name(struct rbd_device *rbd_dev) 37209e15b77dSAlex Elder { 37219e15b77dSAlex Elder size_t image_id_size; 37229e15b77dSAlex Elder char *image_id; 37239e15b77dSAlex Elder void *p; 37249e15b77dSAlex Elder void *end; 37259e15b77dSAlex Elder size_t size; 37269e15b77dSAlex Elder void *reply_buf = NULL; 37279e15b77dSAlex Elder size_t len = 0; 37289e15b77dSAlex Elder char *image_name = NULL; 37299e15b77dSAlex Elder int ret; 37309e15b77dSAlex Elder 37319e15b77dSAlex Elder rbd_assert(!rbd_dev->spec->image_name); 37329e15b77dSAlex Elder 373369e7a02fSAlex Elder len = strlen(rbd_dev->spec->image_id); 373469e7a02fSAlex Elder image_id_size = sizeof (__le32) + len; 37359e15b77dSAlex Elder image_id = kmalloc(image_id_size, GFP_KERNEL); 37369e15b77dSAlex Elder if (!image_id) 37379e15b77dSAlex Elder return NULL; 37389e15b77dSAlex Elder 37399e15b77dSAlex Elder p = image_id; 37404157976bSAlex Elder end = image_id + image_id_size; 374169e7a02fSAlex Elder ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32)len); 37429e15b77dSAlex Elder 37439e15b77dSAlex Elder size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX; 37449e15b77dSAlex Elder reply_buf = kmalloc(size, GFP_KERNEL); 37459e15b77dSAlex Elder if (!reply_buf) 37469e15b77dSAlex Elder goto out; 37479e15b77dSAlex Elder 374836be9a76SAlex Elder ret = rbd_obj_method_sync(rbd_dev, RBD_DIRECTORY, 37499e15b77dSAlex Elder "rbd", "dir_get_name", 37509e15b77dSAlex Elder image_id, image_id_size, 3751e2a58ee5SAlex Elder reply_buf, size); 37529e15b77dSAlex Elder if (ret < 0) 37539e15b77dSAlex Elder goto out; 37549e15b77dSAlex Elder p = reply_buf; 3755f40eb349SAlex Elder end = reply_buf + ret; 3756f40eb349SAlex Elder 37579e15b77dSAlex Elder image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL); 37589e15b77dSAlex Elder if (IS_ERR(image_name)) 37599e15b77dSAlex Elder image_name = NULL; 37609e15b77dSAlex Elder else 37619e15b77dSAlex Elder dout("%s: name is %s len is %zd\n", __func__, image_name, len); 37629e15b77dSAlex 
Elder out: 37639e15b77dSAlex Elder kfree(reply_buf); 37649e15b77dSAlex Elder kfree(image_id); 37659e15b77dSAlex Elder 37669e15b77dSAlex Elder return image_name; 37679e15b77dSAlex Elder } 37689e15b77dSAlex Elder 37692ad3d716SAlex Elder static u64 rbd_v1_snap_id_by_name(struct rbd_device *rbd_dev, const char *name) 37702ad3d716SAlex Elder { 37712ad3d716SAlex Elder struct ceph_snap_context *snapc = rbd_dev->header.snapc; 37722ad3d716SAlex Elder const char *snap_name; 37732ad3d716SAlex Elder u32 which = 0; 37742ad3d716SAlex Elder 37752ad3d716SAlex Elder /* Skip over names until we find the one we are looking for */ 37762ad3d716SAlex Elder 37772ad3d716SAlex Elder snap_name = rbd_dev->header.snap_names; 37782ad3d716SAlex Elder while (which < snapc->num_snaps) { 37792ad3d716SAlex Elder if (!strcmp(name, snap_name)) 37802ad3d716SAlex Elder return snapc->snaps[which]; 37812ad3d716SAlex Elder snap_name += strlen(snap_name) + 1; 37822ad3d716SAlex Elder which++; 37832ad3d716SAlex Elder } 37842ad3d716SAlex Elder return CEPH_NOSNAP; 37852ad3d716SAlex Elder } 37862ad3d716SAlex Elder 37872ad3d716SAlex Elder static u64 rbd_v2_snap_id_by_name(struct rbd_device *rbd_dev, const char *name) 37882ad3d716SAlex Elder { 37892ad3d716SAlex Elder struct ceph_snap_context *snapc = rbd_dev->header.snapc; 37902ad3d716SAlex Elder u32 which; 37912ad3d716SAlex Elder bool found = false; 37922ad3d716SAlex Elder u64 snap_id; 37932ad3d716SAlex Elder 37942ad3d716SAlex Elder for (which = 0; !found && which < snapc->num_snaps; which++) { 37952ad3d716SAlex Elder const char *snap_name; 37962ad3d716SAlex Elder 37972ad3d716SAlex Elder snap_id = snapc->snaps[which]; 37982ad3d716SAlex Elder snap_name = rbd_dev_v2_snap_name(rbd_dev, snap_id); 37992ad3d716SAlex Elder if (IS_ERR(snap_name)) 38002ad3d716SAlex Elder break; 38012ad3d716SAlex Elder found = !strcmp(name, snap_name); 38022ad3d716SAlex Elder kfree(snap_name); 38032ad3d716SAlex Elder } 38042ad3d716SAlex Elder return found ? 
snap_id : CEPH_NOSNAP; 38052ad3d716SAlex Elder } 38062ad3d716SAlex Elder 38072ad3d716SAlex Elder /* 38082ad3d716SAlex Elder * Assumes name is never RBD_SNAP_HEAD_NAME; returns CEPH_NOSNAP if 38092ad3d716SAlex Elder * no snapshot by that name is found, or if an error occurs. 38102ad3d716SAlex Elder */ 38112ad3d716SAlex Elder static u64 rbd_snap_id_by_name(struct rbd_device *rbd_dev, const char *name) 38122ad3d716SAlex Elder { 38132ad3d716SAlex Elder if (rbd_dev->image_format == 1) 38142ad3d716SAlex Elder return rbd_v1_snap_id_by_name(rbd_dev, name); 38152ad3d716SAlex Elder 38162ad3d716SAlex Elder return rbd_v2_snap_id_by_name(rbd_dev, name); 38172ad3d716SAlex Elder } 38182ad3d716SAlex Elder 38199e15b77dSAlex Elder /* 38202e9f7f1cSAlex Elder * When an rbd image has a parent image, it is identified by the 38212e9f7f1cSAlex Elder * pool, image, and snapshot ids (not names). This function fills 38222e9f7f1cSAlex Elder * in the names for those ids. (It's OK if we can't figure out the 38232e9f7f1cSAlex Elder * name for an image id, but the pool and snapshot ids should always 38242e9f7f1cSAlex Elder * exist and have names.) All names in an rbd spec are dynamically 38252e9f7f1cSAlex Elder * allocated. 3826e1d4213fSAlex Elder * 3827e1d4213fSAlex Elder * When an image being mapped (not a parent) is probed, we have the 3828e1d4213fSAlex Elder * pool name and pool id, image name and image id, and the snapshot 3829e1d4213fSAlex Elder * name. The only thing we're missing is the snapshot id. 
38309e15b77dSAlex Elder */ 38312e9f7f1cSAlex Elder static int rbd_dev_spec_update(struct rbd_device *rbd_dev) 38329e15b77dSAlex Elder { 38332e9f7f1cSAlex Elder struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 38342e9f7f1cSAlex Elder struct rbd_spec *spec = rbd_dev->spec; 38352e9f7f1cSAlex Elder const char *pool_name; 38362e9f7f1cSAlex Elder const char *image_name; 38372e9f7f1cSAlex Elder const char *snap_name; 38389e15b77dSAlex Elder int ret; 38399e15b77dSAlex Elder 3840e1d4213fSAlex Elder /* 3841e1d4213fSAlex Elder * An image being mapped will have the pool name (etc.), but 3842e1d4213fSAlex Elder * we need to look up the snapshot id. 3843e1d4213fSAlex Elder */ 38442e9f7f1cSAlex Elder if (spec->pool_name) { 38452e9f7f1cSAlex Elder if (strcmp(spec->snap_name, RBD_SNAP_HEAD_NAME)) { 38462ad3d716SAlex Elder u64 snap_id; 3847e1d4213fSAlex Elder 38482ad3d716SAlex Elder snap_id = rbd_snap_id_by_name(rbd_dev, spec->snap_name); 38492ad3d716SAlex Elder if (snap_id == CEPH_NOSNAP) 3850e1d4213fSAlex Elder return -ENOENT; 38512ad3d716SAlex Elder spec->snap_id = snap_id; 3852e1d4213fSAlex Elder } else { 38532e9f7f1cSAlex Elder spec->snap_id = CEPH_NOSNAP; 3854e1d4213fSAlex Elder } 3855e1d4213fSAlex Elder 3856e1d4213fSAlex Elder return 0; 3857e1d4213fSAlex Elder } 38589e15b77dSAlex Elder 38592e9f7f1cSAlex Elder /* Get the pool name; we have to make our own copy of this */ 38609e15b77dSAlex Elder 38612e9f7f1cSAlex Elder pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, spec->pool_id); 38622e9f7f1cSAlex Elder if (!pool_name) { 38632e9f7f1cSAlex Elder rbd_warn(rbd_dev, "no pool with id %llu", spec->pool_id); 3864935dc89fSAlex Elder return -EIO; 3865935dc89fSAlex Elder } 38662e9f7f1cSAlex Elder pool_name = kstrdup(pool_name, GFP_KERNEL); 38672e9f7f1cSAlex Elder if (!pool_name) 38689e15b77dSAlex Elder return -ENOMEM; 38699e15b77dSAlex Elder 38709e15b77dSAlex Elder /* Fetch the image name; tolerate failure here */ 38719e15b77dSAlex Elder 38722e9f7f1cSAlex Elder 
image_name = rbd_dev_image_name(rbd_dev); 38732e9f7f1cSAlex Elder if (!image_name) 387406ecc6cbSAlex Elder rbd_warn(rbd_dev, "unable to get image name"); 38759e15b77dSAlex Elder 38762e9f7f1cSAlex Elder /* Look up the snapshot name, and make a copy */ 38779e15b77dSAlex Elder 38782e9f7f1cSAlex Elder snap_name = rbd_snap_name(rbd_dev, spec->snap_id); 38792e9f7f1cSAlex Elder if (!snap_name) { 38802e9f7f1cSAlex Elder ret = -ENOMEM; 38819e15b77dSAlex Elder goto out_err; 38822e9f7f1cSAlex Elder } 38832e9f7f1cSAlex Elder 38842e9f7f1cSAlex Elder spec->pool_name = pool_name; 38852e9f7f1cSAlex Elder spec->image_name = image_name; 38862e9f7f1cSAlex Elder spec->snap_name = snap_name; 38879e15b77dSAlex Elder 38889e15b77dSAlex Elder return 0; 38899e15b77dSAlex Elder out_err: 38902e9f7f1cSAlex Elder kfree(image_name); 38912e9f7f1cSAlex Elder kfree(pool_name); 38929e15b77dSAlex Elder 38939e15b77dSAlex Elder return ret; 38949e15b77dSAlex Elder } 38959e15b77dSAlex Elder 3896cc4a38bdSAlex Elder static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev) 389735d489f9SAlex Elder { 389835d489f9SAlex Elder size_t size; 389935d489f9SAlex Elder int ret; 390035d489f9SAlex Elder void *reply_buf; 390135d489f9SAlex Elder void *p; 390235d489f9SAlex Elder void *end; 390335d489f9SAlex Elder u64 seq; 390435d489f9SAlex Elder u32 snap_count; 390535d489f9SAlex Elder struct ceph_snap_context *snapc; 390635d489f9SAlex Elder u32 i; 390735d489f9SAlex Elder 390835d489f9SAlex Elder /* 390935d489f9SAlex Elder * We'll need room for the seq value (maximum snapshot id), 391035d489f9SAlex Elder * snapshot count, and array of that many snapshot ids. 391135d489f9SAlex Elder * For now we have a fixed upper limit on the number we're 391235d489f9SAlex Elder * prepared to receive. 
391335d489f9SAlex Elder */ 391435d489f9SAlex Elder size = sizeof (__le64) + sizeof (__le32) + 391535d489f9SAlex Elder RBD_MAX_SNAP_COUNT * sizeof (__le64); 391635d489f9SAlex Elder reply_buf = kzalloc(size, GFP_KERNEL); 391735d489f9SAlex Elder if (!reply_buf) 391835d489f9SAlex Elder return -ENOMEM; 391935d489f9SAlex Elder 392036be9a76SAlex Elder ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, 39214157976bSAlex Elder "rbd", "get_snapcontext", NULL, 0, 3922e2a58ee5SAlex Elder reply_buf, size); 392336be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 392435d489f9SAlex Elder if (ret < 0) 392535d489f9SAlex Elder goto out; 392635d489f9SAlex Elder 392735d489f9SAlex Elder p = reply_buf; 392857385b51SAlex Elder end = reply_buf + ret; 392957385b51SAlex Elder ret = -ERANGE; 393035d489f9SAlex Elder ceph_decode_64_safe(&p, end, seq, out); 393135d489f9SAlex Elder ceph_decode_32_safe(&p, end, snap_count, out); 393235d489f9SAlex Elder 393335d489f9SAlex Elder /* 393435d489f9SAlex Elder * Make sure the reported number of snapshot ids wouldn't go 393535d489f9SAlex Elder * beyond the end of our buffer. But before checking that, 393635d489f9SAlex Elder * make sure the computed size of the snapshot context we 393735d489f9SAlex Elder * allocate is representable in a size_t. 
393835d489f9SAlex Elder */ 393935d489f9SAlex Elder if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context)) 394035d489f9SAlex Elder / sizeof (u64)) { 394135d489f9SAlex Elder ret = -EINVAL; 394235d489f9SAlex Elder goto out; 394335d489f9SAlex Elder } 394435d489f9SAlex Elder if (!ceph_has_room(&p, end, snap_count * sizeof (__le64))) 394535d489f9SAlex Elder goto out; 3946468521c1SAlex Elder ret = 0; 394735d489f9SAlex Elder 3948812164f8SAlex Elder snapc = ceph_create_snap_context(snap_count, GFP_KERNEL); 394935d489f9SAlex Elder if (!snapc) { 395035d489f9SAlex Elder ret = -ENOMEM; 395135d489f9SAlex Elder goto out; 395235d489f9SAlex Elder } 395335d489f9SAlex Elder snapc->seq = seq; 395435d489f9SAlex Elder for (i = 0; i < snap_count; i++) 395535d489f9SAlex Elder snapc->snaps[i] = ceph_decode_64(&p); 395635d489f9SAlex Elder 395749ece554SAlex Elder ceph_put_snap_context(rbd_dev->header.snapc); 395835d489f9SAlex Elder rbd_dev->header.snapc = snapc; 395935d489f9SAlex Elder 396035d489f9SAlex Elder dout(" snap context seq = %llu, snap_count = %u\n", 396135d489f9SAlex Elder (unsigned long long)seq, (unsigned int)snap_count); 396235d489f9SAlex Elder out: 396335d489f9SAlex Elder kfree(reply_buf); 396435d489f9SAlex Elder 396557385b51SAlex Elder return ret; 396635d489f9SAlex Elder } 396735d489f9SAlex Elder 396854cac61fSAlex Elder static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, 396954cac61fSAlex Elder u64 snap_id) 3970b8b1e2dbSAlex Elder { 3971b8b1e2dbSAlex Elder size_t size; 3972b8b1e2dbSAlex Elder void *reply_buf; 397354cac61fSAlex Elder __le64 snapid; 3974b8b1e2dbSAlex Elder int ret; 3975b8b1e2dbSAlex Elder void *p; 3976b8b1e2dbSAlex Elder void *end; 3977b8b1e2dbSAlex Elder char *snap_name; 3978b8b1e2dbSAlex Elder 3979b8b1e2dbSAlex Elder size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN; 3980b8b1e2dbSAlex Elder reply_buf = kmalloc(size, GFP_KERNEL); 3981b8b1e2dbSAlex Elder if (!reply_buf) 3982b8b1e2dbSAlex Elder return ERR_PTR(-ENOMEM); 3983b8b1e2dbSAlex 
Elder 398454cac61fSAlex Elder snapid = cpu_to_le64(snap_id); 398536be9a76SAlex Elder ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, 3986b8b1e2dbSAlex Elder "rbd", "get_snapshot_name", 398754cac61fSAlex Elder &snapid, sizeof (snapid), 3988e2a58ee5SAlex Elder reply_buf, size); 398936be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 3990f40eb349SAlex Elder if (ret < 0) { 3991f40eb349SAlex Elder snap_name = ERR_PTR(ret); 3992b8b1e2dbSAlex Elder goto out; 3993f40eb349SAlex Elder } 3994b8b1e2dbSAlex Elder 3995b8b1e2dbSAlex Elder p = reply_buf; 3996f40eb349SAlex Elder end = reply_buf + ret; 3997e5c35534SAlex Elder snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL); 3998f40eb349SAlex Elder if (IS_ERR(snap_name)) 3999b8b1e2dbSAlex Elder goto out; 4000f40eb349SAlex Elder 4001b8b1e2dbSAlex Elder dout(" snap_id 0x%016llx snap_name = %s\n", 400254cac61fSAlex Elder (unsigned long long)snap_id, snap_name); 4003b8b1e2dbSAlex Elder out: 4004b8b1e2dbSAlex Elder kfree(reply_buf); 4005b8b1e2dbSAlex Elder 4006f40eb349SAlex Elder return snap_name; 4007b8b1e2dbSAlex Elder } 4008b8b1e2dbSAlex Elder 40092df3fac7SAlex Elder static int rbd_dev_v2_header_info(struct rbd_device *rbd_dev) 4010117973fbSAlex Elder { 40112df3fac7SAlex Elder bool first_time = rbd_dev->header.object_prefix == NULL; 4012117973fbSAlex Elder int ret; 4013117973fbSAlex Elder 4014117973fbSAlex Elder down_write(&rbd_dev->header_rwsem); 4015117973fbSAlex Elder 40162df3fac7SAlex Elder if (first_time) { 40172df3fac7SAlex Elder ret = rbd_dev_v2_header_onetime(rbd_dev); 40182df3fac7SAlex Elder if (ret) 40192df3fac7SAlex Elder goto out; 40202df3fac7SAlex Elder } 40212df3fac7SAlex Elder 4022117973fbSAlex Elder ret = rbd_dev_v2_image_size(rbd_dev); 4023117973fbSAlex Elder if (ret) 4024117973fbSAlex Elder goto out; 402529334ba4SAlex Elder if (rbd_dev->spec->snap_id == CEPH_NOSNAP) 402629334ba4SAlex Elder if (rbd_dev->mapping.size != rbd_dev->header.image_size) 
402729334ba4SAlex Elder rbd_dev->mapping.size = rbd_dev->header.image_size; 4028117973fbSAlex Elder 4029cc4a38bdSAlex Elder ret = rbd_dev_v2_snap_context(rbd_dev); 4030117973fbSAlex Elder dout("rbd_dev_v2_snap_context returned %d\n", ret); 4031117973fbSAlex Elder if (ret) 4032117973fbSAlex Elder goto out; 4033117973fbSAlex Elder out: 4034117973fbSAlex Elder up_write(&rbd_dev->header_rwsem); 4035117973fbSAlex Elder 4036117973fbSAlex Elder return ret; 4037117973fbSAlex Elder } 4038117973fbSAlex Elder 4039dfc5606dSYehuda Sadeh static int rbd_bus_add_dev(struct rbd_device *rbd_dev) 4040dfc5606dSYehuda Sadeh { 4041dfc5606dSYehuda Sadeh struct device *dev; 4042cd789ab9SAlex Elder int ret; 4043dfc5606dSYehuda Sadeh 4044dfc5606dSYehuda Sadeh mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); 4045dfc5606dSYehuda Sadeh 4046cd789ab9SAlex Elder dev = &rbd_dev->dev; 4047dfc5606dSYehuda Sadeh dev->bus = &rbd_bus_type; 4048dfc5606dSYehuda Sadeh dev->type = &rbd_device_type; 4049dfc5606dSYehuda Sadeh dev->parent = &rbd_root_dev; 4050200a6a8bSAlex Elder dev->release = rbd_dev_device_release; 4051de71a297SAlex Elder dev_set_name(dev, "%d", rbd_dev->dev_id); 4052dfc5606dSYehuda Sadeh ret = device_register(dev); 4053dfc5606dSYehuda Sadeh 4054dfc5606dSYehuda Sadeh mutex_unlock(&ctl_mutex); 4055cd789ab9SAlex Elder 4056dfc5606dSYehuda Sadeh return ret; 4057602adf40SYehuda Sadeh } 4058602adf40SYehuda Sadeh 4059dfc5606dSYehuda Sadeh static void rbd_bus_del_dev(struct rbd_device *rbd_dev) 4060dfc5606dSYehuda Sadeh { 4061dfc5606dSYehuda Sadeh device_unregister(&rbd_dev->dev); 4062dfc5606dSYehuda Sadeh } 4063dfc5606dSYehuda Sadeh 4064e2839308SAlex Elder static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0); 40651ddbe94eSAlex Elder 40661ddbe94eSAlex Elder /* 4067499afd5bSAlex Elder * Get a unique rbd identifier for the given new rbd_dev, and add 4068499afd5bSAlex Elder * the rbd_dev to the global list. The minimum rbd id is 1. 
40691ddbe94eSAlex Elder */ 4070e2839308SAlex Elder static void rbd_dev_id_get(struct rbd_device *rbd_dev) 4071b7f23c36SAlex Elder { 4072e2839308SAlex Elder rbd_dev->dev_id = atomic64_inc_return(&rbd_dev_id_max); 4073499afd5bSAlex Elder 4074499afd5bSAlex Elder spin_lock(&rbd_dev_list_lock); 4075499afd5bSAlex Elder list_add_tail(&rbd_dev->node, &rbd_dev_list); 4076499afd5bSAlex Elder spin_unlock(&rbd_dev_list_lock); 4077e2839308SAlex Elder dout("rbd_dev %p given dev id %llu\n", rbd_dev, 4078e2839308SAlex Elder (unsigned long long) rbd_dev->dev_id); 4079b7f23c36SAlex Elder } 4080b7f23c36SAlex Elder 40811ddbe94eSAlex Elder /* 4082499afd5bSAlex Elder * Remove an rbd_dev from the global list, and record that its 4083499afd5bSAlex Elder * identifier is no longer in use. 40841ddbe94eSAlex Elder */ 4085e2839308SAlex Elder static void rbd_dev_id_put(struct rbd_device *rbd_dev) 40861ddbe94eSAlex Elder { 4087d184f6bfSAlex Elder struct list_head *tmp; 4088de71a297SAlex Elder int rbd_id = rbd_dev->dev_id; 4089d184f6bfSAlex Elder int max_id; 4090d184f6bfSAlex Elder 4091aafb230eSAlex Elder rbd_assert(rbd_id > 0); 4092499afd5bSAlex Elder 4093e2839308SAlex Elder dout("rbd_dev %p released dev id %llu\n", rbd_dev, 4094e2839308SAlex Elder (unsigned long long) rbd_dev->dev_id); 4095499afd5bSAlex Elder spin_lock(&rbd_dev_list_lock); 4096499afd5bSAlex Elder list_del_init(&rbd_dev->node); 4097d184f6bfSAlex Elder 4098d184f6bfSAlex Elder /* 4099d184f6bfSAlex Elder * If the id being "put" is not the current maximum, there 4100d184f6bfSAlex Elder * is nothing special we need to do. 4101d184f6bfSAlex Elder */ 4102e2839308SAlex Elder if (rbd_id != atomic64_read(&rbd_dev_id_max)) { 4103d184f6bfSAlex Elder spin_unlock(&rbd_dev_list_lock); 4104d184f6bfSAlex Elder return; 4105d184f6bfSAlex Elder } 4106d184f6bfSAlex Elder 4107d184f6bfSAlex Elder /* 4108d184f6bfSAlex Elder * We need to update the current maximum id. Search the 4109d184f6bfSAlex Elder * list to find out what it is. 
We're more likely to find 4110d184f6bfSAlex Elder * the maximum at the end, so search the list backward. 4111d184f6bfSAlex Elder */ 4112d184f6bfSAlex Elder max_id = 0; 4113d184f6bfSAlex Elder list_for_each_prev(tmp, &rbd_dev_list) { 4114d184f6bfSAlex Elder struct rbd_device *rbd_dev; 4115d184f6bfSAlex Elder 4116d184f6bfSAlex Elder rbd_dev = list_entry(tmp, struct rbd_device, node); 4117b213e0b1SAlex Elder if (rbd_dev->dev_id > max_id) 4118b213e0b1SAlex Elder max_id = rbd_dev->dev_id; 4119d184f6bfSAlex Elder } 4120499afd5bSAlex Elder spin_unlock(&rbd_dev_list_lock); 41211ddbe94eSAlex Elder 41221ddbe94eSAlex Elder /* 4123e2839308SAlex Elder * The max id could have been updated by rbd_dev_id_get(), in 4124d184f6bfSAlex Elder * which case it now accurately reflects the new maximum. 4125d184f6bfSAlex Elder * Be careful not to overwrite the maximum value in that 4126d184f6bfSAlex Elder * case. 41271ddbe94eSAlex Elder */ 4128e2839308SAlex Elder atomic64_cmpxchg(&rbd_dev_id_max, rbd_id, max_id); 4129e2839308SAlex Elder dout(" max dev id has been reset\n"); 4130b7f23c36SAlex Elder } 4131b7f23c36SAlex Elder 4132a725f65eSAlex Elder /* 4133e28fff26SAlex Elder * Skips over white space at *buf, and updates *buf to point to the 4134e28fff26SAlex Elder * first found non-space character (if any). Returns the length of 4135593a9e7bSAlex Elder * the token (string of non-white space characters) found. Note 4136593a9e7bSAlex Elder * that *buf must be terminated with '\0'. 4137e28fff26SAlex Elder */ 4138e28fff26SAlex Elder static inline size_t next_token(const char **buf) 4139e28fff26SAlex Elder { 4140e28fff26SAlex Elder /* 4141e28fff26SAlex Elder * These are the characters that produce nonzero for 4142e28fff26SAlex Elder * isspace() in the "C" and "POSIX" locales. 
4143e28fff26SAlex Elder */ 4144e28fff26SAlex Elder const char *spaces = " \f\n\r\t\v"; 4145e28fff26SAlex Elder 4146e28fff26SAlex Elder *buf += strspn(*buf, spaces); /* Find start of token */ 4147e28fff26SAlex Elder 4148e28fff26SAlex Elder return strcspn(*buf, spaces); /* Return token length */ 4149e28fff26SAlex Elder } 4150e28fff26SAlex Elder 4151e28fff26SAlex Elder /* 4152e28fff26SAlex Elder * Finds the next token in *buf, and if the provided token buffer is 4153e28fff26SAlex Elder * big enough, copies the found token into it. The result, if 4154593a9e7bSAlex Elder * copied, is guaranteed to be terminated with '\0'. Note that *buf 4155593a9e7bSAlex Elder * must be terminated with '\0' on entry. 4156e28fff26SAlex Elder * 4157e28fff26SAlex Elder * Returns the length of the token found (not including the '\0'). 4158e28fff26SAlex Elder * Return value will be 0 if no token is found, and it will be >= 4159e28fff26SAlex Elder * token_size if the token would not fit. 4160e28fff26SAlex Elder * 4161593a9e7bSAlex Elder * The *buf pointer will be updated to point beyond the end of the 4162e28fff26SAlex Elder * found token. Note that this occurs even if the token buffer is 4163e28fff26SAlex Elder * too small to hold it. 
4164e28fff26SAlex Elder */ 4165e28fff26SAlex Elder static inline size_t copy_token(const char **buf, 4166e28fff26SAlex Elder char *token, 4167e28fff26SAlex Elder size_t token_size) 4168e28fff26SAlex Elder { 4169e28fff26SAlex Elder size_t len; 4170e28fff26SAlex Elder 4171e28fff26SAlex Elder len = next_token(buf); 4172e28fff26SAlex Elder if (len < token_size) { 4173e28fff26SAlex Elder memcpy(token, *buf, len); 4174e28fff26SAlex Elder *(token + len) = '\0'; 4175e28fff26SAlex Elder } 4176e28fff26SAlex Elder *buf += len; 4177e28fff26SAlex Elder 4178e28fff26SAlex Elder return len; 4179e28fff26SAlex Elder } 4180e28fff26SAlex Elder 4181e28fff26SAlex Elder /* 4182ea3352f4SAlex Elder * Finds the next token in *buf, dynamically allocates a buffer big 4183ea3352f4SAlex Elder * enough to hold a copy of it, and copies the token into the new 4184ea3352f4SAlex Elder * buffer. The copy is guaranteed to be terminated with '\0'. Note 4185ea3352f4SAlex Elder * that a duplicate buffer is created even for a zero-length token. 4186ea3352f4SAlex Elder * 4187ea3352f4SAlex Elder * Returns a pointer to the newly-allocated duplicate, or a null 4188ea3352f4SAlex Elder * pointer if memory for the duplicate was not available. If 4189ea3352f4SAlex Elder * the lenp argument is a non-null pointer, the length of the token 4190ea3352f4SAlex Elder * (not including the '\0') is returned in *lenp. 4191ea3352f4SAlex Elder * 4192ea3352f4SAlex Elder * If successful, the *buf pointer will be updated to point beyond 4193ea3352f4SAlex Elder * the end of the found token. 4194ea3352f4SAlex Elder * 4195ea3352f4SAlex Elder * Note: uses GFP_KERNEL for allocation. 
4196ea3352f4SAlex Elder */ 4197ea3352f4SAlex Elder static inline char *dup_token(const char **buf, size_t *lenp) 4198ea3352f4SAlex Elder { 4199ea3352f4SAlex Elder char *dup; 4200ea3352f4SAlex Elder size_t len; 4201ea3352f4SAlex Elder 4202ea3352f4SAlex Elder len = next_token(buf); 42034caf35f9SAlex Elder dup = kmemdup(*buf, len + 1, GFP_KERNEL); 4204ea3352f4SAlex Elder if (!dup) 4205ea3352f4SAlex Elder return NULL; 4206ea3352f4SAlex Elder *(dup + len) = '\0'; 4207ea3352f4SAlex Elder *buf += len; 4208ea3352f4SAlex Elder 4209ea3352f4SAlex Elder if (lenp) 4210ea3352f4SAlex Elder *lenp = len; 4211ea3352f4SAlex Elder 4212ea3352f4SAlex Elder return dup; 4213ea3352f4SAlex Elder } 4214ea3352f4SAlex Elder 4215ea3352f4SAlex Elder /* 4216859c31dfSAlex Elder * Parse the options provided for an "rbd add" (i.e., rbd image 4217859c31dfSAlex Elder * mapping) request. These arrive via a write to /sys/bus/rbd/add, 4218859c31dfSAlex Elder * and the data written is passed here via a NUL-terminated buffer. 4219859c31dfSAlex Elder * Returns 0 if successful or an error code otherwise. 4220d22f76e7SAlex Elder * 4221859c31dfSAlex Elder * The information extracted from these options is recorded in 4222859c31dfSAlex Elder * the other parameters which return dynamically-allocated 4223859c31dfSAlex Elder * structures: 4224859c31dfSAlex Elder * ceph_opts 4225859c31dfSAlex Elder * The address of a pointer that will refer to a ceph options 4226859c31dfSAlex Elder * structure. Caller must release the returned pointer using 4227859c31dfSAlex Elder * ceph_destroy_options() when it is no longer needed. 4228859c31dfSAlex Elder * rbd_opts 4229859c31dfSAlex Elder * Address of an rbd options pointer. Fully initialized by 4230859c31dfSAlex Elder * this function; caller must release with kfree(). 4231859c31dfSAlex Elder * spec 4232859c31dfSAlex Elder * Address of an rbd image specification pointer. Fully 4233859c31dfSAlex Elder * initialized by this function based on parsed options. 
 *      Caller must release with rbd_spec_put().
 *
 * The options passed take this form:
 *  <mon_addrs> <options> <pool_name> <image_name> [<snap_name>]
 * where:
 *  <mon_addrs>
 *      A comma-separated list of one or more monitor addresses.
 *      A monitor address is an IP address, optionally followed
 *      by a port number (separated by a colon).
 *        I.e.:  ip1[:port1][,ip2[:port2]...]
 *  <options>
 *      A comma-separated list of ceph and/or rbd options.
 *  <pool_name>
 *      The name of the rados pool containing the rbd image.
 *  <image_name>
 *      The name of the image in that pool to map.
 *  <snap_name>
 *      An optional snapshot name.  If provided, the mapping will
 *      present data from the image at the time that snapshot was
 *      created.  The image head is used if no snapshot name is
 *      provided.  Snapshot mappings are always read-only.
4255a725f65eSAlex Elder */ 4256859c31dfSAlex Elder static int rbd_add_parse_args(const char *buf, 4257dc79b113SAlex Elder struct ceph_options **ceph_opts, 4258859c31dfSAlex Elder struct rbd_options **opts, 4259859c31dfSAlex Elder struct rbd_spec **rbd_spec) 4260a725f65eSAlex Elder { 4261e28fff26SAlex Elder size_t len; 4262859c31dfSAlex Elder char *options; 42630ddebc0cSAlex Elder const char *mon_addrs; 4264ecb4dc22SAlex Elder char *snap_name; 42650ddebc0cSAlex Elder size_t mon_addrs_size; 4266859c31dfSAlex Elder struct rbd_spec *spec = NULL; 42674e9afebaSAlex Elder struct rbd_options *rbd_opts = NULL; 4268859c31dfSAlex Elder struct ceph_options *copts; 4269dc79b113SAlex Elder int ret; 4270e28fff26SAlex Elder 4271e28fff26SAlex Elder /* The first four tokens are required */ 4272e28fff26SAlex Elder 42737ef3214aSAlex Elder len = next_token(&buf); 42744fb5d671SAlex Elder if (!len) { 42754fb5d671SAlex Elder rbd_warn(NULL, "no monitor address(es) provided"); 42764fb5d671SAlex Elder return -EINVAL; 42774fb5d671SAlex Elder } 42780ddebc0cSAlex Elder mon_addrs = buf; 4279f28e565aSAlex Elder mon_addrs_size = len + 1; 42807ef3214aSAlex Elder buf += len; 4281a725f65eSAlex Elder 4282dc79b113SAlex Elder ret = -EINVAL; 4283f28e565aSAlex Elder options = dup_token(&buf, NULL); 4284f28e565aSAlex Elder if (!options) 4285dc79b113SAlex Elder return -ENOMEM; 42864fb5d671SAlex Elder if (!*options) { 42874fb5d671SAlex Elder rbd_warn(NULL, "no options provided"); 42884fb5d671SAlex Elder goto out_err; 42894fb5d671SAlex Elder } 4290a725f65eSAlex Elder 4291859c31dfSAlex Elder spec = rbd_spec_alloc(); 4292859c31dfSAlex Elder if (!spec) 4293f28e565aSAlex Elder goto out_mem; 4294859c31dfSAlex Elder 4295859c31dfSAlex Elder spec->pool_name = dup_token(&buf, NULL); 4296859c31dfSAlex Elder if (!spec->pool_name) 4297859c31dfSAlex Elder goto out_mem; 42984fb5d671SAlex Elder if (!*spec->pool_name) { 42994fb5d671SAlex Elder rbd_warn(NULL, "no pool name provided"); 43004fb5d671SAlex Elder goto out_err; 
43014fb5d671SAlex Elder } 4302e28fff26SAlex Elder 430369e7a02fSAlex Elder spec->image_name = dup_token(&buf, NULL); 4304859c31dfSAlex Elder if (!spec->image_name) 4305f28e565aSAlex Elder goto out_mem; 43064fb5d671SAlex Elder if (!*spec->image_name) { 43074fb5d671SAlex Elder rbd_warn(NULL, "no image name provided"); 43084fb5d671SAlex Elder goto out_err; 43094fb5d671SAlex Elder } 4310e28fff26SAlex Elder 4311f28e565aSAlex Elder /* 4312f28e565aSAlex Elder * Snapshot name is optional; default is to use "-" 4313f28e565aSAlex Elder * (indicating the head/no snapshot). 4314f28e565aSAlex Elder */ 43153feeb894SAlex Elder len = next_token(&buf); 4316820a5f3eSAlex Elder if (!len) { 43173feeb894SAlex Elder buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */ 43183feeb894SAlex Elder len = sizeof (RBD_SNAP_HEAD_NAME) - 1; 4319f28e565aSAlex Elder } else if (len > RBD_MAX_SNAP_NAME_LEN) { 4320dc79b113SAlex Elder ret = -ENAMETOOLONG; 4321f28e565aSAlex Elder goto out_err; 4322849b4260SAlex Elder } 4323ecb4dc22SAlex Elder snap_name = kmemdup(buf, len + 1, GFP_KERNEL); 4324ecb4dc22SAlex Elder if (!snap_name) 4325f28e565aSAlex Elder goto out_mem; 4326ecb4dc22SAlex Elder *(snap_name + len) = '\0'; 4327ecb4dc22SAlex Elder spec->snap_name = snap_name; 4328e5c35534SAlex Elder 43290ddebc0cSAlex Elder /* Initialize all rbd options to the defaults */ 4330e28fff26SAlex Elder 43314e9afebaSAlex Elder rbd_opts = kzalloc(sizeof (*rbd_opts), GFP_KERNEL); 43324e9afebaSAlex Elder if (!rbd_opts) 43334e9afebaSAlex Elder goto out_mem; 43344e9afebaSAlex Elder 43354e9afebaSAlex Elder rbd_opts->read_only = RBD_READ_ONLY_DEFAULT; 4336d22f76e7SAlex Elder 4337859c31dfSAlex Elder copts = ceph_parse_options(options, mon_addrs, 43380ddebc0cSAlex Elder mon_addrs + mon_addrs_size - 1, 43394e9afebaSAlex Elder parse_rbd_opts_token, rbd_opts); 4340859c31dfSAlex Elder if (IS_ERR(copts)) { 4341859c31dfSAlex Elder ret = PTR_ERR(copts); 4342dc79b113SAlex Elder goto out_err; 4343dc79b113SAlex Elder } 4344859c31dfSAlex 
Elder kfree(options); 4345859c31dfSAlex Elder 4346859c31dfSAlex Elder *ceph_opts = copts; 43474e9afebaSAlex Elder *opts = rbd_opts; 4348859c31dfSAlex Elder *rbd_spec = spec; 43490ddebc0cSAlex Elder 4350dc79b113SAlex Elder return 0; 4351f28e565aSAlex Elder out_mem: 4352dc79b113SAlex Elder ret = -ENOMEM; 4353d22f76e7SAlex Elder out_err: 4354859c31dfSAlex Elder kfree(rbd_opts); 4355859c31dfSAlex Elder rbd_spec_put(spec); 4356f28e565aSAlex Elder kfree(options); 4357d22f76e7SAlex Elder 4358dc79b113SAlex Elder return ret; 4359a725f65eSAlex Elder } 4360a725f65eSAlex Elder 4361589d30e0SAlex Elder /* 4362589d30e0SAlex Elder * An rbd format 2 image has a unique identifier, distinct from the 4363589d30e0SAlex Elder * name given to it by the user. Internally, that identifier is 4364589d30e0SAlex Elder * what's used to specify the names of objects related to the image. 4365589d30e0SAlex Elder * 4366589d30e0SAlex Elder * A special "rbd id" object is used to map an rbd image name to its 4367589d30e0SAlex Elder * id. If that object doesn't exist, then there is no v2 rbd image 4368589d30e0SAlex Elder * with the supplied name. 4369589d30e0SAlex Elder * 4370589d30e0SAlex Elder * This function will record the given rbd_dev's image_id field if 4371589d30e0SAlex Elder * it can be determined, and in that case will return 0. If any 4372589d30e0SAlex Elder * errors occur a negative errno will be returned and the rbd_dev's 4373589d30e0SAlex Elder * image_id field will be unchanged (and should be NULL). 
4374589d30e0SAlex Elder */ 4375589d30e0SAlex Elder static int rbd_dev_image_id(struct rbd_device *rbd_dev) 4376589d30e0SAlex Elder { 4377589d30e0SAlex Elder int ret; 4378589d30e0SAlex Elder size_t size; 4379589d30e0SAlex Elder char *object_name; 4380589d30e0SAlex Elder void *response; 4381c0fba368SAlex Elder char *image_id; 43822f82ee54SAlex Elder 4383589d30e0SAlex Elder /* 43842c0d0a10SAlex Elder * When probing a parent image, the image id is already 43852c0d0a10SAlex Elder * known (and the image name likely is not). There's no 4386c0fba368SAlex Elder * need to fetch the image id again in this case. We 4387c0fba368SAlex Elder * do still need to set the image format though. 43882c0d0a10SAlex Elder */ 4389c0fba368SAlex Elder if (rbd_dev->spec->image_id) { 4390c0fba368SAlex Elder rbd_dev->image_format = *rbd_dev->spec->image_id ? 2 : 1; 4391c0fba368SAlex Elder 43922c0d0a10SAlex Elder return 0; 4393c0fba368SAlex Elder } 43942c0d0a10SAlex Elder 43952c0d0a10SAlex Elder /* 4396589d30e0SAlex Elder * First, see if the format 2 image id file exists, and if 4397589d30e0SAlex Elder * so, get the image's persistent id from it. 
4398589d30e0SAlex Elder */ 439969e7a02fSAlex Elder size = sizeof (RBD_ID_PREFIX) + strlen(rbd_dev->spec->image_name); 4400589d30e0SAlex Elder object_name = kmalloc(size, GFP_NOIO); 4401589d30e0SAlex Elder if (!object_name) 4402589d30e0SAlex Elder return -ENOMEM; 44030d7dbfceSAlex Elder sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->spec->image_name); 4404589d30e0SAlex Elder dout("rbd id object name is %s\n", object_name); 4405589d30e0SAlex Elder 4406589d30e0SAlex Elder /* Response will be an encoded string, which includes a length */ 4407589d30e0SAlex Elder 4408589d30e0SAlex Elder size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX; 4409589d30e0SAlex Elder response = kzalloc(size, GFP_NOIO); 4410589d30e0SAlex Elder if (!response) { 4411589d30e0SAlex Elder ret = -ENOMEM; 4412589d30e0SAlex Elder goto out; 4413589d30e0SAlex Elder } 4414589d30e0SAlex Elder 4415c0fba368SAlex Elder /* If it doesn't exist we'll assume it's a format 1 image */ 4416c0fba368SAlex Elder 441736be9a76SAlex Elder ret = rbd_obj_method_sync(rbd_dev, object_name, 44184157976bSAlex Elder "rbd", "get_id", NULL, 0, 4419e2a58ee5SAlex Elder response, RBD_IMAGE_ID_LEN_MAX); 442036be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 4421c0fba368SAlex Elder if (ret == -ENOENT) { 4422c0fba368SAlex Elder image_id = kstrdup("", GFP_KERNEL); 4423c0fba368SAlex Elder ret = image_id ? 0 : -ENOMEM; 4424c0fba368SAlex Elder if (!ret) 4425c0fba368SAlex Elder rbd_dev->image_format = 1; 4426c0fba368SAlex Elder } else if (ret > sizeof (__le32)) { 4427c0fba368SAlex Elder void *p = response; 4428589d30e0SAlex Elder 4429c0fba368SAlex Elder image_id = ceph_extract_encoded_string(&p, p + ret, 4430979ed480SAlex Elder NULL, GFP_NOIO); 4431c0fba368SAlex Elder ret = IS_ERR(image_id) ? 
PTR_ERR(image_id) : 0; 4432c0fba368SAlex Elder if (!ret) 4433c0fba368SAlex Elder rbd_dev->image_format = 2; 4434589d30e0SAlex Elder } else { 4435c0fba368SAlex Elder ret = -EINVAL; 4436c0fba368SAlex Elder } 4437c0fba368SAlex Elder 4438c0fba368SAlex Elder if (!ret) { 4439c0fba368SAlex Elder rbd_dev->spec->image_id = image_id; 4440c0fba368SAlex Elder dout("image_id is %s\n", image_id); 4441589d30e0SAlex Elder } 4442589d30e0SAlex Elder out: 4443589d30e0SAlex Elder kfree(response); 4444589d30e0SAlex Elder kfree(object_name); 4445589d30e0SAlex Elder 4446589d30e0SAlex Elder return ret; 4447589d30e0SAlex Elder } 4448589d30e0SAlex Elder 44496fd48b3bSAlex Elder /* Undo whatever state changes are made by v1 or v2 image probe */ 44506fd48b3bSAlex Elder 44516fd48b3bSAlex Elder static void rbd_dev_unprobe(struct rbd_device *rbd_dev) 44526fd48b3bSAlex Elder { 44536fd48b3bSAlex Elder struct rbd_image_header *header; 44546fd48b3bSAlex Elder 44556fd48b3bSAlex Elder rbd_dev_remove_parent(rbd_dev); 44566fd48b3bSAlex Elder rbd_spec_put(rbd_dev->parent_spec); 44576fd48b3bSAlex Elder rbd_dev->parent_spec = NULL; 44586fd48b3bSAlex Elder rbd_dev->parent_overlap = 0; 44596fd48b3bSAlex Elder 44606fd48b3bSAlex Elder /* Free dynamic fields from the header, then zero it out */ 44616fd48b3bSAlex Elder 44626fd48b3bSAlex Elder header = &rbd_dev->header; 4463812164f8SAlex Elder ceph_put_snap_context(header->snapc); 44646fd48b3bSAlex Elder kfree(header->snap_sizes); 44656fd48b3bSAlex Elder kfree(header->snap_names); 44666fd48b3bSAlex Elder kfree(header->object_prefix); 44676fd48b3bSAlex Elder memset(header, 0, sizeof (*header)); 44686fd48b3bSAlex Elder } 44696fd48b3bSAlex Elder 44702df3fac7SAlex Elder static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev) 4471a30b71b9SAlex Elder { 44729d475de5SAlex Elder int ret; 4473a30b71b9SAlex Elder 44741e130199SAlex Elder ret = rbd_dev_v2_object_prefix(rbd_dev); 447557385b51SAlex Elder if (ret) 44761e130199SAlex Elder goto out_err; 4477b1b5402aSAlex 
Elder 44782df3fac7SAlex Elder /* 44792df3fac7SAlex Elder * Get the and check features for the image. Currently the 44802df3fac7SAlex Elder * features are assumed to never change. 44812df3fac7SAlex Elder */ 4482b1b5402aSAlex Elder ret = rbd_dev_v2_features(rbd_dev); 448357385b51SAlex Elder if (ret) 4484b1b5402aSAlex Elder goto out_err; 448535d489f9SAlex Elder 448686b00e0dSAlex Elder /* If the image supports layering, get the parent info */ 448786b00e0dSAlex Elder 448886b00e0dSAlex Elder if (rbd_dev->header.features & RBD_FEATURE_LAYERING) { 448986b00e0dSAlex Elder ret = rbd_dev_v2_parent_info(rbd_dev); 449057385b51SAlex Elder if (ret) 449186b00e0dSAlex Elder goto out_err; 449296882f55SAlex Elder /* 4493c734b796SAlex Elder * Print a warning if this image has a parent. 4494c734b796SAlex Elder * Don't print it if the image now being probed 4495c734b796SAlex Elder * is itself a parent. We can tell at this point 4496c734b796SAlex Elder * because we won't know its pool name yet (just its 4497c734b796SAlex Elder * pool id). 
449896882f55SAlex Elder */ 4499c734b796SAlex Elder if (rbd_dev->parent_spec && rbd_dev->spec->pool_name) 450096882f55SAlex Elder rbd_warn(rbd_dev, "WARNING: kernel layering " 450196882f55SAlex Elder "is EXPERIMENTAL!"); 450286b00e0dSAlex Elder } 450386b00e0dSAlex Elder 4504cc070d59SAlex Elder /* If the image supports fancy striping, get its parameters */ 4505cc070d59SAlex Elder 4506cc070d59SAlex Elder if (rbd_dev->header.features & RBD_FEATURE_STRIPINGV2) { 4507cc070d59SAlex Elder ret = rbd_dev_v2_striping_info(rbd_dev); 4508cc070d59SAlex Elder if (ret < 0) 4509cc070d59SAlex Elder goto out_err; 4510cc070d59SAlex Elder } 45112df3fac7SAlex Elder /* No support for crypto and compression type format 2 images */ 45126e14b1a6SAlex Elder 451335152979SAlex Elder return 0; 45149d475de5SAlex Elder out_err: 451586b00e0dSAlex Elder rbd_dev->parent_overlap = 0; 451686b00e0dSAlex Elder rbd_spec_put(rbd_dev->parent_spec); 451786b00e0dSAlex Elder rbd_dev->parent_spec = NULL; 45189d475de5SAlex Elder kfree(rbd_dev->header_name); 45199d475de5SAlex Elder rbd_dev->header_name = NULL; 45201e130199SAlex Elder kfree(rbd_dev->header.object_prefix); 45211e130199SAlex Elder rbd_dev->header.object_prefix = NULL; 45229d475de5SAlex Elder 45239d475de5SAlex Elder return ret; 4524a30b71b9SAlex Elder } 4525a30b71b9SAlex Elder 4526124afba2SAlex Elder static int rbd_dev_probe_parent(struct rbd_device *rbd_dev) 452783a06263SAlex Elder { 45282f82ee54SAlex Elder struct rbd_device *parent = NULL; 4529124afba2SAlex Elder struct rbd_spec *parent_spec; 4530124afba2SAlex Elder struct rbd_client *rbdc; 4531124afba2SAlex Elder int ret; 4532124afba2SAlex Elder 4533124afba2SAlex Elder if (!rbd_dev->parent_spec) 4534124afba2SAlex Elder return 0; 4535124afba2SAlex Elder /* 4536124afba2SAlex Elder * We need to pass a reference to the client and the parent 4537124afba2SAlex Elder * spec when creating the parent rbd_dev. Images related by 4538124afba2SAlex Elder * parent/child relationships always share both. 
4539124afba2SAlex Elder */ 4540124afba2SAlex Elder parent_spec = rbd_spec_get(rbd_dev->parent_spec); 4541124afba2SAlex Elder rbdc = __rbd_get_client(rbd_dev->rbd_client); 4542124afba2SAlex Elder 4543124afba2SAlex Elder ret = -ENOMEM; 4544124afba2SAlex Elder parent = rbd_dev_create(rbdc, parent_spec); 4545124afba2SAlex Elder if (!parent) 4546124afba2SAlex Elder goto out_err; 4547124afba2SAlex Elder 454851344a38SAlex Elder ret = rbd_dev_image_probe(parent, true); 4549124afba2SAlex Elder if (ret < 0) 4550124afba2SAlex Elder goto out_err; 4551124afba2SAlex Elder rbd_dev->parent = parent; 4552124afba2SAlex Elder 4553124afba2SAlex Elder return 0; 4554124afba2SAlex Elder out_err: 4555124afba2SAlex Elder if (parent) { 4556124afba2SAlex Elder rbd_spec_put(rbd_dev->parent_spec); 4557124afba2SAlex Elder kfree(rbd_dev->header_name); 4558124afba2SAlex Elder rbd_dev_destroy(parent); 4559124afba2SAlex Elder } else { 4560124afba2SAlex Elder rbd_put_client(rbdc); 4561124afba2SAlex Elder rbd_spec_put(parent_spec); 4562124afba2SAlex Elder } 4563124afba2SAlex Elder 4564124afba2SAlex Elder return ret; 4565124afba2SAlex Elder } 4566124afba2SAlex Elder 4567200a6a8bSAlex Elder static int rbd_dev_device_setup(struct rbd_device *rbd_dev) 4568124afba2SAlex Elder { 456983a06263SAlex Elder int ret; 457083a06263SAlex Elder 457183a06263SAlex Elder /* generate unique id: find highest unique id, add one */ 457283a06263SAlex Elder rbd_dev_id_get(rbd_dev); 457383a06263SAlex Elder 457483a06263SAlex Elder /* Fill in the device name, now that we have its id. */ 457583a06263SAlex Elder BUILD_BUG_ON(DEV_NAME_LEN 457683a06263SAlex Elder < sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH); 457783a06263SAlex Elder sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id); 457883a06263SAlex Elder 457983a06263SAlex Elder /* Get our block major device number. 
*/ 458083a06263SAlex Elder 458183a06263SAlex Elder ret = register_blkdev(0, rbd_dev->name); 458283a06263SAlex Elder if (ret < 0) 458383a06263SAlex Elder goto err_out_id; 458483a06263SAlex Elder rbd_dev->major = ret; 458583a06263SAlex Elder 458683a06263SAlex Elder /* Set up the blkdev mapping. */ 458783a06263SAlex Elder 458883a06263SAlex Elder ret = rbd_init_disk(rbd_dev); 458983a06263SAlex Elder if (ret) 459083a06263SAlex Elder goto err_out_blkdev; 459183a06263SAlex Elder 4592f35a4deeSAlex Elder ret = rbd_dev_mapping_set(rbd_dev); 459383a06263SAlex Elder if (ret) 459483a06263SAlex Elder goto err_out_disk; 4595f35a4deeSAlex Elder set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE); 4596f35a4deeSAlex Elder 4597f35a4deeSAlex Elder ret = rbd_bus_add_dev(rbd_dev); 4598f35a4deeSAlex Elder if (ret) 4599f35a4deeSAlex Elder goto err_out_mapping; 460083a06263SAlex Elder 460183a06263SAlex Elder /* Everything's ready. Announce the disk to the world. */ 460283a06263SAlex Elder 4603129b79d4SAlex Elder set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags); 460483a06263SAlex Elder add_disk(rbd_dev->disk); 460583a06263SAlex Elder 460683a06263SAlex Elder pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name, 460783a06263SAlex Elder (unsigned long long) rbd_dev->mapping.size); 460883a06263SAlex Elder 460983a06263SAlex Elder return ret; 46102f82ee54SAlex Elder 4611f35a4deeSAlex Elder err_out_mapping: 4612f35a4deeSAlex Elder rbd_dev_mapping_clear(rbd_dev); 461383a06263SAlex Elder err_out_disk: 461483a06263SAlex Elder rbd_free_disk(rbd_dev); 461583a06263SAlex Elder err_out_blkdev: 461683a06263SAlex Elder unregister_blkdev(rbd_dev->major, rbd_dev->name); 461783a06263SAlex Elder err_out_id: 461883a06263SAlex Elder rbd_dev_id_put(rbd_dev); 4619d1cf5788SAlex Elder rbd_dev_mapping_clear(rbd_dev); 462083a06263SAlex Elder 462183a06263SAlex Elder return ret; 462283a06263SAlex Elder } 462383a06263SAlex Elder 4624332bb12dSAlex Elder static int rbd_dev_header_name(struct rbd_device 
*rbd_dev) 4625332bb12dSAlex Elder { 4626332bb12dSAlex Elder struct rbd_spec *spec = rbd_dev->spec; 4627332bb12dSAlex Elder size_t size; 4628332bb12dSAlex Elder 4629332bb12dSAlex Elder /* Record the header object name for this rbd image. */ 4630332bb12dSAlex Elder 4631332bb12dSAlex Elder rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); 4632332bb12dSAlex Elder 4633332bb12dSAlex Elder if (rbd_dev->image_format == 1) 4634332bb12dSAlex Elder size = strlen(spec->image_name) + sizeof (RBD_SUFFIX); 4635332bb12dSAlex Elder else 4636332bb12dSAlex Elder size = sizeof (RBD_HEADER_PREFIX) + strlen(spec->image_id); 4637332bb12dSAlex Elder 4638332bb12dSAlex Elder rbd_dev->header_name = kmalloc(size, GFP_KERNEL); 4639332bb12dSAlex Elder if (!rbd_dev->header_name) 4640332bb12dSAlex Elder return -ENOMEM; 4641332bb12dSAlex Elder 4642332bb12dSAlex Elder if (rbd_dev->image_format == 1) 4643332bb12dSAlex Elder sprintf(rbd_dev->header_name, "%s%s", 4644332bb12dSAlex Elder spec->image_name, RBD_SUFFIX); 4645332bb12dSAlex Elder else 4646332bb12dSAlex Elder sprintf(rbd_dev->header_name, "%s%s", 4647332bb12dSAlex Elder RBD_HEADER_PREFIX, spec->image_id); 4648332bb12dSAlex Elder return 0; 4649332bb12dSAlex Elder } 4650332bb12dSAlex Elder 4651200a6a8bSAlex Elder static void rbd_dev_image_release(struct rbd_device *rbd_dev) 4652200a6a8bSAlex Elder { 46536fd48b3bSAlex Elder int ret; 46546fd48b3bSAlex Elder 46556fd48b3bSAlex Elder rbd_dev_unprobe(rbd_dev); 46566fd48b3bSAlex Elder ret = rbd_dev_header_watch_sync(rbd_dev, 0); 46576fd48b3bSAlex Elder if (ret) 46586fd48b3bSAlex Elder rbd_warn(rbd_dev, "failed to cancel watch event (%d)\n", ret); 4659200a6a8bSAlex Elder kfree(rbd_dev->header_name); 46606fd48b3bSAlex Elder rbd_dev->header_name = NULL; 46616fd48b3bSAlex Elder rbd_dev->image_format = 0; 46626fd48b3bSAlex Elder kfree(rbd_dev->spec->image_id); 46636fd48b3bSAlex Elder rbd_dev->spec->image_id = NULL; 46646fd48b3bSAlex Elder 4665200a6a8bSAlex Elder rbd_dev_destroy(rbd_dev); 
4666200a6a8bSAlex Elder } 4667200a6a8bSAlex Elder 4668a30b71b9SAlex Elder /* 4669a30b71b9SAlex Elder * Probe for the existence of the header object for the given rbd 4670a30b71b9SAlex Elder * device. For format 2 images this includes determining the image 4671a30b71b9SAlex Elder * id. 4672a30b71b9SAlex Elder */ 467351344a38SAlex Elder static int rbd_dev_image_probe(struct rbd_device *rbd_dev, bool read_only) 4674a30b71b9SAlex Elder { 4675a30b71b9SAlex Elder int ret; 4676b644de2bSAlex Elder int tmp; 4677a30b71b9SAlex Elder 4678a30b71b9SAlex Elder /* 4679a30b71b9SAlex Elder * Get the id from the image id object. If it's not a 4680a30b71b9SAlex Elder * format 2 image, we'll get ENOENT back, and we'll assume 4681a30b71b9SAlex Elder * it's a format 1 image. 4682a30b71b9SAlex Elder */ 4683a30b71b9SAlex Elder ret = rbd_dev_image_id(rbd_dev); 4684a30b71b9SAlex Elder if (ret) 4685c0fba368SAlex Elder return ret; 4686c0fba368SAlex Elder rbd_assert(rbd_dev->spec->image_id); 4687c0fba368SAlex Elder rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); 4688c0fba368SAlex Elder 4689332bb12dSAlex Elder ret = rbd_dev_header_name(rbd_dev); 4690332bb12dSAlex Elder if (ret) 4691332bb12dSAlex Elder goto err_out_format; 4692332bb12dSAlex Elder 4693b644de2bSAlex Elder ret = rbd_dev_header_watch_sync(rbd_dev, 1); 4694b644de2bSAlex Elder if (ret) 4695b644de2bSAlex Elder goto out_header_name; 4696b644de2bSAlex Elder 4697c0fba368SAlex Elder if (rbd_dev->image_format == 1) 469899a41ebcSAlex Elder ret = rbd_dev_v1_header_info(rbd_dev); 4699a30b71b9SAlex Elder else 47002df3fac7SAlex Elder ret = rbd_dev_v2_header_info(rbd_dev); 47015655c4d9SAlex Elder if (ret) 4702b644de2bSAlex Elder goto err_out_watch; 4703a30b71b9SAlex Elder 47049bb81c9bSAlex Elder ret = rbd_dev_spec_update(rbd_dev); 47059bb81c9bSAlex Elder if (ret) 470633dca39fSAlex Elder goto err_out_probe; 47079bb81c9bSAlex Elder 470851344a38SAlex Elder /* If we are mapping a snapshot it must be marked read-only */ 470951344a38SAlex 
Elder 471051344a38SAlex Elder if (rbd_dev->spec->snap_id != CEPH_NOSNAP) 471151344a38SAlex Elder read_only = true; 471251344a38SAlex Elder rbd_dev->mapping.read_only = read_only; 471351344a38SAlex Elder 47149bb81c9bSAlex Elder ret = rbd_dev_probe_parent(rbd_dev); 471530d60ba2SAlex Elder if (ret) 471630d60ba2SAlex Elder goto err_out_probe; 471783a06263SAlex Elder 471830d60ba2SAlex Elder dout("discovered format %u image, header name is %s\n", 471930d60ba2SAlex Elder rbd_dev->image_format, rbd_dev->header_name); 472030d60ba2SAlex Elder 472130d60ba2SAlex Elder return 0; 47226fd48b3bSAlex Elder err_out_probe: 47236fd48b3bSAlex Elder rbd_dev_unprobe(rbd_dev); 4724b644de2bSAlex Elder err_out_watch: 4725b644de2bSAlex Elder tmp = rbd_dev_header_watch_sync(rbd_dev, 0); 4726b644de2bSAlex Elder if (tmp) 4727b644de2bSAlex Elder rbd_warn(rbd_dev, "unable to tear down watch request\n"); 4728332bb12dSAlex Elder out_header_name: 4729332bb12dSAlex Elder kfree(rbd_dev->header_name); 4730332bb12dSAlex Elder rbd_dev->header_name = NULL; 4731332bb12dSAlex Elder err_out_format: 4732332bb12dSAlex Elder rbd_dev->image_format = 0; 47335655c4d9SAlex Elder kfree(rbd_dev->spec->image_id); 47345655c4d9SAlex Elder rbd_dev->spec->image_id = NULL; 47355655c4d9SAlex Elder 47365655c4d9SAlex Elder dout("probe failed, returning %d\n", ret); 47375655c4d9SAlex Elder 47385655c4d9SAlex Elder return ret; 473983a06263SAlex Elder } 474083a06263SAlex Elder 474159c2be1eSYehuda Sadeh static ssize_t rbd_add(struct bus_type *bus, 474259c2be1eSYehuda Sadeh const char *buf, 474359c2be1eSYehuda Sadeh size_t count) 4744602adf40SYehuda Sadeh { 4745cb8627c7SAlex Elder struct rbd_device *rbd_dev = NULL; 4746dc79b113SAlex Elder struct ceph_options *ceph_opts = NULL; 47474e9afebaSAlex Elder struct rbd_options *rbd_opts = NULL; 4748859c31dfSAlex Elder struct rbd_spec *spec = NULL; 47499d3997fdSAlex Elder struct rbd_client *rbdc; 475027cc2594SAlex Elder struct ceph_osd_client *osdc; 475151344a38SAlex Elder bool read_only; 
475227cc2594SAlex Elder int rc = -ENOMEM; 4753602adf40SYehuda Sadeh 4754602adf40SYehuda Sadeh if (!try_module_get(THIS_MODULE)) 4755602adf40SYehuda Sadeh return -ENODEV; 4756602adf40SYehuda Sadeh 4757a725f65eSAlex Elder /* parse add command */ 4758859c31dfSAlex Elder rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec); 4759dc79b113SAlex Elder if (rc < 0) 4760bd4ba655SAlex Elder goto err_out_module; 476151344a38SAlex Elder read_only = rbd_opts->read_only; 476251344a38SAlex Elder kfree(rbd_opts); 476351344a38SAlex Elder rbd_opts = NULL; /* done with this */ 4764a725f65eSAlex Elder 47659d3997fdSAlex Elder rbdc = rbd_get_client(ceph_opts); 47669d3997fdSAlex Elder if (IS_ERR(rbdc)) { 47679d3997fdSAlex Elder rc = PTR_ERR(rbdc); 47680ddebc0cSAlex Elder goto err_out_args; 47699d3997fdSAlex Elder } 4770c53d5893SAlex Elder ceph_opts = NULL; /* rbd_dev client now owns this */ 4771602adf40SYehuda Sadeh 4772602adf40SYehuda Sadeh /* pick the pool */ 47739d3997fdSAlex Elder osdc = &rbdc->client->osdc; 4774859c31dfSAlex Elder rc = ceph_pg_poolid_by_name(osdc->osdmap, spec->pool_name); 4775602adf40SYehuda Sadeh if (rc < 0) 4776602adf40SYehuda Sadeh goto err_out_client; 4777859c31dfSAlex Elder spec->pool_id = (u64)rc; 4778859c31dfSAlex Elder 47790903e875SAlex Elder /* The ceph file layout needs to fit pool id in 32 bits */ 47800903e875SAlex Elder 4781c0cd10dbSAlex Elder if (spec->pool_id > (u64)U32_MAX) { 4782c0cd10dbSAlex Elder rbd_warn(NULL, "pool id too large (%llu > %u)\n", 4783c0cd10dbSAlex Elder (unsigned long long)spec->pool_id, U32_MAX); 47840903e875SAlex Elder rc = -EIO; 47850903e875SAlex Elder goto err_out_client; 47860903e875SAlex Elder } 47870903e875SAlex Elder 4788c53d5893SAlex Elder rbd_dev = rbd_dev_create(rbdc, spec); 4789bd4ba655SAlex Elder if (!rbd_dev) 4790bd4ba655SAlex Elder goto err_out_client; 4791c53d5893SAlex Elder rbdc = NULL; /* rbd_dev now owns this */ 4792c53d5893SAlex Elder spec = NULL; /* rbd_dev now owns this */ 4793602adf40SYehuda Sadeh 
479451344a38SAlex Elder rc = rbd_dev_image_probe(rbd_dev, read_only); 4795a30b71b9SAlex Elder if (rc < 0) 4796c53d5893SAlex Elder goto err_out_rbd_dev; 479705fd6f6fSAlex Elder 4798b536f69aSAlex Elder rc = rbd_dev_device_setup(rbd_dev); 4799b536f69aSAlex Elder if (!rc) 4800602adf40SYehuda Sadeh return count; 4801b536f69aSAlex Elder 4802b536f69aSAlex Elder rbd_dev_image_release(rbd_dev); 4803c53d5893SAlex Elder err_out_rbd_dev: 4804c53d5893SAlex Elder rbd_dev_destroy(rbd_dev); 4805bd4ba655SAlex Elder err_out_client: 48069d3997fdSAlex Elder rbd_put_client(rbdc); 48070ddebc0cSAlex Elder err_out_args: 480878cea76eSAlex Elder if (ceph_opts) 480978cea76eSAlex Elder ceph_destroy_options(ceph_opts); 48104e9afebaSAlex Elder kfree(rbd_opts); 4811859c31dfSAlex Elder rbd_spec_put(spec); 4812bd4ba655SAlex Elder err_out_module: 4813bd4ba655SAlex Elder module_put(THIS_MODULE); 481427cc2594SAlex Elder 4815602adf40SYehuda Sadeh dout("Error adding device %s\n", buf); 481627cc2594SAlex Elder 481727cc2594SAlex Elder return (ssize_t)rc; 4818602adf40SYehuda Sadeh } 4819602adf40SYehuda Sadeh 4820de71a297SAlex Elder static struct rbd_device *__rbd_get_dev(unsigned long dev_id) 4821602adf40SYehuda Sadeh { 4822602adf40SYehuda Sadeh struct list_head *tmp; 4823602adf40SYehuda Sadeh struct rbd_device *rbd_dev; 4824602adf40SYehuda Sadeh 4825e124a82fSAlex Elder spin_lock(&rbd_dev_list_lock); 4826602adf40SYehuda Sadeh list_for_each(tmp, &rbd_dev_list) { 4827602adf40SYehuda Sadeh rbd_dev = list_entry(tmp, struct rbd_device, node); 4828de71a297SAlex Elder if (rbd_dev->dev_id == dev_id) { 4829e124a82fSAlex Elder spin_unlock(&rbd_dev_list_lock); 4830602adf40SYehuda Sadeh return rbd_dev; 4831602adf40SYehuda Sadeh } 4832e124a82fSAlex Elder } 4833e124a82fSAlex Elder spin_unlock(&rbd_dev_list_lock); 4834602adf40SYehuda Sadeh return NULL; 4835602adf40SYehuda Sadeh } 4836602adf40SYehuda Sadeh 4837200a6a8bSAlex Elder static void rbd_dev_device_release(struct device *dev) 4838602adf40SYehuda Sadeh { 
4839593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 4840602adf40SYehuda Sadeh 4841602adf40SYehuda Sadeh rbd_free_disk(rbd_dev); 4842200a6a8bSAlex Elder clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags); 48436d80b130SAlex Elder rbd_dev_mapping_clear(rbd_dev); 4844602adf40SYehuda Sadeh unregister_blkdev(rbd_dev->major, rbd_dev->name); 4845200a6a8bSAlex Elder rbd_dev->major = 0; 4846e2839308SAlex Elder rbd_dev_id_put(rbd_dev); 4847d1cf5788SAlex Elder rbd_dev_mapping_clear(rbd_dev); 4848602adf40SYehuda Sadeh } 4849602adf40SYehuda Sadeh 485005a46afdSAlex Elder static void rbd_dev_remove_parent(struct rbd_device *rbd_dev) 485105a46afdSAlex Elder { 4852ad945fc1SAlex Elder while (rbd_dev->parent) { 485305a46afdSAlex Elder struct rbd_device *first = rbd_dev; 485405a46afdSAlex Elder struct rbd_device *second = first->parent; 485505a46afdSAlex Elder struct rbd_device *third; 485605a46afdSAlex Elder 485705a46afdSAlex Elder /* 485805a46afdSAlex Elder * Follow to the parent with no grandparent and 485905a46afdSAlex Elder * remove it. 
486005a46afdSAlex Elder */ 486105a46afdSAlex Elder while (second && (third = second->parent)) { 486205a46afdSAlex Elder first = second; 486305a46afdSAlex Elder second = third; 486405a46afdSAlex Elder } 4865ad945fc1SAlex Elder rbd_assert(second); 48668ad42cd0SAlex Elder rbd_dev_image_release(second); 4867ad945fc1SAlex Elder first->parent = NULL; 4868ad945fc1SAlex Elder first->parent_overlap = 0; 4869ad945fc1SAlex Elder 4870ad945fc1SAlex Elder rbd_assert(first->parent_spec); 487105a46afdSAlex Elder rbd_spec_put(first->parent_spec); 487205a46afdSAlex Elder first->parent_spec = NULL; 487305a46afdSAlex Elder } 487405a46afdSAlex Elder } 487505a46afdSAlex Elder 4876dfc5606dSYehuda Sadeh static ssize_t rbd_remove(struct bus_type *bus, 4877602adf40SYehuda Sadeh const char *buf, 4878602adf40SYehuda Sadeh size_t count) 4879602adf40SYehuda Sadeh { 4880602adf40SYehuda Sadeh struct rbd_device *rbd_dev = NULL; 48810d8189e1SAlex Elder int target_id; 4882602adf40SYehuda Sadeh unsigned long ul; 48830d8189e1SAlex Elder int ret; 4884602adf40SYehuda Sadeh 48850d8189e1SAlex Elder ret = strict_strtoul(buf, 10, &ul); 48860d8189e1SAlex Elder if (ret) 48870d8189e1SAlex Elder return ret; 4888602adf40SYehuda Sadeh 4889602adf40SYehuda Sadeh /* convert to int; abort if we lost anything in the conversion */ 4890602adf40SYehuda Sadeh target_id = (int) ul; 4891602adf40SYehuda Sadeh if (target_id != ul) 4892602adf40SYehuda Sadeh return -EINVAL; 4893602adf40SYehuda Sadeh 4894602adf40SYehuda Sadeh mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); 4895602adf40SYehuda Sadeh 4896602adf40SYehuda Sadeh rbd_dev = __rbd_get_dev(target_id); 4897602adf40SYehuda Sadeh if (!rbd_dev) { 4898602adf40SYehuda Sadeh ret = -ENOENT; 4899602adf40SYehuda Sadeh goto done; 4900602adf40SYehuda Sadeh } 4901602adf40SYehuda Sadeh 4902a14ea269SAlex Elder spin_lock_irq(&rbd_dev->lock); 4903b82d167bSAlex Elder if (rbd_dev->open_count) 490442382b70SAlex Elder ret = -EBUSY; 4905b82d167bSAlex Elder else 4906b82d167bSAlex Elder 
set_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags); 4907a14ea269SAlex Elder spin_unlock_irq(&rbd_dev->lock); 4908b82d167bSAlex Elder if (ret < 0) 490942382b70SAlex Elder goto done; 49100d8189e1SAlex Elder ret = count; 4911b480815aSAlex Elder rbd_bus_del_dev(rbd_dev); 49128ad42cd0SAlex Elder rbd_dev_image_release(rbd_dev); 491379ab7558SAlex Elder module_put(THIS_MODULE); 4914602adf40SYehuda Sadeh done: 4915602adf40SYehuda Sadeh mutex_unlock(&ctl_mutex); 4916aafb230eSAlex Elder 4917602adf40SYehuda Sadeh return ret; 4918602adf40SYehuda Sadeh } 4919602adf40SYehuda Sadeh 4920602adf40SYehuda Sadeh /* 4921602adf40SYehuda Sadeh * create control files in sysfs 4922dfc5606dSYehuda Sadeh * /sys/bus/rbd/... 4923602adf40SYehuda Sadeh */ 4924602adf40SYehuda Sadeh static int rbd_sysfs_init(void) 4925602adf40SYehuda Sadeh { 4926dfc5606dSYehuda Sadeh int ret; 4927602adf40SYehuda Sadeh 4928fed4c143SAlex Elder ret = device_register(&rbd_root_dev); 4929dfc5606dSYehuda Sadeh if (ret < 0) 4930dfc5606dSYehuda Sadeh return ret; 4931602adf40SYehuda Sadeh 4932fed4c143SAlex Elder ret = bus_register(&rbd_bus_type); 4933fed4c143SAlex Elder if (ret < 0) 4934fed4c143SAlex Elder device_unregister(&rbd_root_dev); 4935602adf40SYehuda Sadeh 4936602adf40SYehuda Sadeh return ret; 4937602adf40SYehuda Sadeh } 4938602adf40SYehuda Sadeh 4939602adf40SYehuda Sadeh static void rbd_sysfs_cleanup(void) 4940602adf40SYehuda Sadeh { 4941dfc5606dSYehuda Sadeh bus_unregister(&rbd_bus_type); 4942fed4c143SAlex Elder device_unregister(&rbd_root_dev); 4943602adf40SYehuda Sadeh } 4944602adf40SYehuda Sadeh 49451c2a9dfeSAlex Elder static int rbd_slab_init(void) 49461c2a9dfeSAlex Elder { 49471c2a9dfeSAlex Elder rbd_assert(!rbd_img_request_cache); 49481c2a9dfeSAlex Elder rbd_img_request_cache = kmem_cache_create("rbd_img_request", 49491c2a9dfeSAlex Elder sizeof (struct rbd_img_request), 49501c2a9dfeSAlex Elder __alignof__(struct rbd_img_request), 49511c2a9dfeSAlex Elder 0, NULL); 4952868311b1SAlex Elder if 
(!rbd_img_request_cache) 4953868311b1SAlex Elder return -ENOMEM; 4954868311b1SAlex Elder 4955868311b1SAlex Elder rbd_assert(!rbd_obj_request_cache); 4956868311b1SAlex Elder rbd_obj_request_cache = kmem_cache_create("rbd_obj_request", 4957868311b1SAlex Elder sizeof (struct rbd_obj_request), 4958868311b1SAlex Elder __alignof__(struct rbd_obj_request), 4959868311b1SAlex Elder 0, NULL); 496078c2a44aSAlex Elder if (!rbd_obj_request_cache) 496178c2a44aSAlex Elder goto out_err; 496278c2a44aSAlex Elder 496378c2a44aSAlex Elder rbd_assert(!rbd_segment_name_cache); 496478c2a44aSAlex Elder rbd_segment_name_cache = kmem_cache_create("rbd_segment_name", 496578c2a44aSAlex Elder MAX_OBJ_NAME_SIZE + 1, 1, 0, NULL); 496678c2a44aSAlex Elder if (rbd_segment_name_cache) 49671c2a9dfeSAlex Elder return 0; 496878c2a44aSAlex Elder out_err: 496978c2a44aSAlex Elder if (rbd_obj_request_cache) { 497078c2a44aSAlex Elder kmem_cache_destroy(rbd_obj_request_cache); 497178c2a44aSAlex Elder rbd_obj_request_cache = NULL; 497278c2a44aSAlex Elder } 49731c2a9dfeSAlex Elder 4974868311b1SAlex Elder kmem_cache_destroy(rbd_img_request_cache); 4975868311b1SAlex Elder rbd_img_request_cache = NULL; 4976868311b1SAlex Elder 49771c2a9dfeSAlex Elder return -ENOMEM; 49781c2a9dfeSAlex Elder } 49791c2a9dfeSAlex Elder 49801c2a9dfeSAlex Elder static void rbd_slab_exit(void) 49811c2a9dfeSAlex Elder { 498278c2a44aSAlex Elder rbd_assert(rbd_segment_name_cache); 498378c2a44aSAlex Elder kmem_cache_destroy(rbd_segment_name_cache); 498478c2a44aSAlex Elder rbd_segment_name_cache = NULL; 498578c2a44aSAlex Elder 4986868311b1SAlex Elder rbd_assert(rbd_obj_request_cache); 4987868311b1SAlex Elder kmem_cache_destroy(rbd_obj_request_cache); 4988868311b1SAlex Elder rbd_obj_request_cache = NULL; 4989868311b1SAlex Elder 49901c2a9dfeSAlex Elder rbd_assert(rbd_img_request_cache); 49911c2a9dfeSAlex Elder kmem_cache_destroy(rbd_img_request_cache); 49921c2a9dfeSAlex Elder rbd_img_request_cache = NULL; 49931c2a9dfeSAlex Elder } 
49941c2a9dfeSAlex Elder 4995cc344fa1SAlex Elder static int __init rbd_init(void) 4996602adf40SYehuda Sadeh { 4997602adf40SYehuda Sadeh int rc; 4998602adf40SYehuda Sadeh 49991e32d34cSAlex Elder if (!libceph_compatible(NULL)) { 50001e32d34cSAlex Elder rbd_warn(NULL, "libceph incompatibility (quitting)"); 50011e32d34cSAlex Elder 50021e32d34cSAlex Elder return -EINVAL; 50031e32d34cSAlex Elder } 50041c2a9dfeSAlex Elder rc = rbd_slab_init(); 5005602adf40SYehuda Sadeh if (rc) 5006602adf40SYehuda Sadeh return rc; 50071c2a9dfeSAlex Elder rc = rbd_sysfs_init(); 50081c2a9dfeSAlex Elder if (rc) 50091c2a9dfeSAlex Elder rbd_slab_exit(); 50101c2a9dfeSAlex Elder else 5011f0f8cef5SAlex Elder pr_info("loaded " RBD_DRV_NAME_LONG "\n"); 50121c2a9dfeSAlex Elder 50131c2a9dfeSAlex Elder return rc; 5014602adf40SYehuda Sadeh } 5015602adf40SYehuda Sadeh 5016cc344fa1SAlex Elder static void __exit rbd_exit(void) 5017602adf40SYehuda Sadeh { 5018602adf40SYehuda Sadeh rbd_sysfs_cleanup(); 50191c2a9dfeSAlex Elder rbd_slab_exit(); 5020602adf40SYehuda Sadeh } 5021602adf40SYehuda Sadeh 5022602adf40SYehuda Sadeh module_init(rbd_init); 5023602adf40SYehuda Sadeh module_exit(rbd_exit); 5024602adf40SYehuda Sadeh 5025602adf40SYehuda Sadeh MODULE_AUTHOR("Sage Weil <sage@newdream.net>"); 5026602adf40SYehuda Sadeh MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>"); 5027602adf40SYehuda Sadeh MODULE_DESCRIPTION("rados block device"); 5028602adf40SYehuda Sadeh 5029602adf40SYehuda Sadeh /* following authorship retained from original osdblk.c */ 5030602adf40SYehuda Sadeh MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>"); 5031602adf40SYehuda Sadeh 5032602adf40SYehuda Sadeh MODULE_LICENSE("GPL"); 5033