1e2a58ee5SAlex Elder 2602adf40SYehuda Sadeh /* 3602adf40SYehuda Sadeh rbd.c -- Export ceph rados objects as a Linux block device 4602adf40SYehuda Sadeh 5602adf40SYehuda Sadeh 6602adf40SYehuda Sadeh based on drivers/block/osdblk.c: 7602adf40SYehuda Sadeh 8602adf40SYehuda Sadeh Copyright 2009 Red Hat, Inc. 9602adf40SYehuda Sadeh 10602adf40SYehuda Sadeh This program is free software; you can redistribute it and/or modify 11602adf40SYehuda Sadeh it under the terms of the GNU General Public License as published by 12602adf40SYehuda Sadeh the Free Software Foundation. 13602adf40SYehuda Sadeh 14602adf40SYehuda Sadeh This program is distributed in the hope that it will be useful, 15602adf40SYehuda Sadeh but WITHOUT ANY WARRANTY; without even the implied warranty of 16602adf40SYehuda Sadeh MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17602adf40SYehuda Sadeh GNU General Public License for more details. 18602adf40SYehuda Sadeh 19602adf40SYehuda Sadeh You should have received a copy of the GNU General Public License 20602adf40SYehuda Sadeh along with this program; see the file COPYING. If not, write to 21602adf40SYehuda Sadeh the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. 
22602adf40SYehuda Sadeh 23602adf40SYehuda Sadeh 24602adf40SYehuda Sadeh 25dfc5606dSYehuda Sadeh For usage instructions, please refer to: 26602adf40SYehuda Sadeh 27dfc5606dSYehuda Sadeh Documentation/ABI/testing/sysfs-bus-rbd 28602adf40SYehuda Sadeh 29602adf40SYehuda Sadeh */ 30602adf40SYehuda Sadeh 31602adf40SYehuda Sadeh #include <linux/ceph/libceph.h> 32602adf40SYehuda Sadeh #include <linux/ceph/osd_client.h> 33602adf40SYehuda Sadeh #include <linux/ceph/mon_client.h> 34602adf40SYehuda Sadeh #include <linux/ceph/decode.h> 3559c2be1eSYehuda Sadeh #include <linux/parser.h> 3630d1cff8SAlex Elder #include <linux/bsearch.h> 37602adf40SYehuda Sadeh 38602adf40SYehuda Sadeh #include <linux/kernel.h> 39602adf40SYehuda Sadeh #include <linux/device.h> 40602adf40SYehuda Sadeh #include <linux/module.h> 41602adf40SYehuda Sadeh #include <linux/fs.h> 42602adf40SYehuda Sadeh #include <linux/blkdev.h> 431c2a9dfeSAlex Elder #include <linux/slab.h> 44602adf40SYehuda Sadeh 45602adf40SYehuda Sadeh #include "rbd_types.h" 46602adf40SYehuda Sadeh 47aafb230eSAlex Elder #define RBD_DEBUG /* Activate rbd_assert() calls */ 48aafb230eSAlex Elder 49593a9e7bSAlex Elder /* 50593a9e7bSAlex Elder * The basic unit of block I/O is a sector. It is interpreted in a 51593a9e7bSAlex Elder * number of contexts in Linux (blk, bio, genhd), but the default is 52593a9e7bSAlex Elder * universally 512 bytes. These symbols are just slightly more 53593a9e7bSAlex Elder * meaningful than the bare numbers they represent. 54593a9e7bSAlex Elder */ 55593a9e7bSAlex Elder #define SECTOR_SHIFT 9 56593a9e7bSAlex Elder #define SECTOR_SIZE (1ULL << SECTOR_SHIFT) 57593a9e7bSAlex Elder 58a2acd00eSAlex Elder /* 59a2acd00eSAlex Elder * Increment the given counter and return its updated value. 60a2acd00eSAlex Elder * If the counter is already 0 it will not be incremented. 61a2acd00eSAlex Elder * If the counter is already at its maximum value returns 62a2acd00eSAlex Elder * -EINVAL without updating it. 
63a2acd00eSAlex Elder */ 64a2acd00eSAlex Elder static int atomic_inc_return_safe(atomic_t *v) 65a2acd00eSAlex Elder { 66a2acd00eSAlex Elder unsigned int counter; 67a2acd00eSAlex Elder 68a2acd00eSAlex Elder counter = (unsigned int)__atomic_add_unless(v, 1, 0); 69a2acd00eSAlex Elder if (counter <= (unsigned int)INT_MAX) 70a2acd00eSAlex Elder return (int)counter; 71a2acd00eSAlex Elder 72a2acd00eSAlex Elder atomic_dec(v); 73a2acd00eSAlex Elder 74a2acd00eSAlex Elder return -EINVAL; 75a2acd00eSAlex Elder } 76a2acd00eSAlex Elder 77a2acd00eSAlex Elder /* Decrement the counter. Return the resulting value, or -EINVAL */ 78a2acd00eSAlex Elder static int atomic_dec_return_safe(atomic_t *v) 79a2acd00eSAlex Elder { 80a2acd00eSAlex Elder int counter; 81a2acd00eSAlex Elder 82a2acd00eSAlex Elder counter = atomic_dec_return(v); 83a2acd00eSAlex Elder if (counter >= 0) 84a2acd00eSAlex Elder return counter; 85a2acd00eSAlex Elder 86a2acd00eSAlex Elder atomic_inc(v); 87a2acd00eSAlex Elder 88a2acd00eSAlex Elder return -EINVAL; 89a2acd00eSAlex Elder } 90a2acd00eSAlex Elder 91f0f8cef5SAlex Elder #define RBD_DRV_NAME "rbd" 92f0f8cef5SAlex Elder #define RBD_DRV_NAME_LONG "rbd (rados block device)" 93602adf40SYehuda Sadeh 94602adf40SYehuda Sadeh #define RBD_MINORS_PER_MAJOR 256 /* max minors per blkdev */ 95602adf40SYehuda Sadeh 96d4b125e9SAlex Elder #define RBD_SNAP_DEV_NAME_PREFIX "snap_" 97d4b125e9SAlex Elder #define RBD_MAX_SNAP_NAME_LEN \ 98d4b125e9SAlex Elder (NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1)) 99d4b125e9SAlex Elder 10035d489f9SAlex Elder #define RBD_MAX_SNAP_COUNT 510 /* allows max snapc to fit in 4KB */ 101602adf40SYehuda Sadeh 102602adf40SYehuda Sadeh #define RBD_SNAP_HEAD_NAME "-" 103602adf40SYehuda Sadeh 1049682fc6dSAlex Elder #define BAD_SNAP_INDEX U32_MAX /* invalid index into snap array */ 1059682fc6dSAlex Elder 1069e15b77dSAlex Elder /* This allows a single page to hold an image name sent by OSD */ 1079e15b77dSAlex Elder #define RBD_IMAGE_NAME_LEN_MAX (PAGE_SIZE 
- sizeof (__le32) - 1) 108589d30e0SAlex Elder #define RBD_IMAGE_ID_LEN_MAX 64 1099e15b77dSAlex Elder 1101e130199SAlex Elder #define RBD_OBJ_PREFIX_LEN_MAX 64 111589d30e0SAlex Elder 112d889140cSAlex Elder /* Feature bits */ 113d889140cSAlex Elder 1145cbf6f12SAlex Elder #define RBD_FEATURE_LAYERING (1<<0) 1155cbf6f12SAlex Elder #define RBD_FEATURE_STRIPINGV2 (1<<1) 1165cbf6f12SAlex Elder #define RBD_FEATURES_ALL \ 1175cbf6f12SAlex Elder (RBD_FEATURE_LAYERING | RBD_FEATURE_STRIPINGV2) 118d889140cSAlex Elder 119d889140cSAlex Elder /* Features supported by this (client software) implementation. */ 120d889140cSAlex Elder 121770eba6eSAlex Elder #define RBD_FEATURES_SUPPORTED (RBD_FEATURES_ALL) 122d889140cSAlex Elder 12381a89793SAlex Elder /* 12481a89793SAlex Elder * An RBD device name will be "rbd#", where the "rbd" comes from 12581a89793SAlex Elder * RBD_DRV_NAME above, and # is a unique integer identifier. 12681a89793SAlex Elder * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big 12781a89793SAlex Elder * enough to hold all possible device names. 12881a89793SAlex Elder */ 129602adf40SYehuda Sadeh #define DEV_NAME_LEN 32 13081a89793SAlex Elder #define MAX_INT_FORMAT_WIDTH ((5 * sizeof (int)) / 2 + 1) 131602adf40SYehuda Sadeh 132602adf40SYehuda Sadeh /* 133602adf40SYehuda Sadeh * block device image metadata (in-memory version) 134602adf40SYehuda Sadeh */ 135602adf40SYehuda Sadeh struct rbd_image_header { 136f35a4deeSAlex Elder /* These six fields never change for a given rbd image */ 137849b4260SAlex Elder char *object_prefix; 138602adf40SYehuda Sadeh __u8 obj_order; 139602adf40SYehuda Sadeh __u8 crypt_type; 140602adf40SYehuda Sadeh __u8 comp_type; 141f35a4deeSAlex Elder u64 stripe_unit; 142f35a4deeSAlex Elder u64 stripe_count; 143f35a4deeSAlex Elder u64 features; /* Might be changeable someday? 
*/ 144602adf40SYehuda Sadeh 145f84344f3SAlex Elder /* The remaining fields need to be updated occasionally */ 146f84344f3SAlex Elder u64 image_size; 147f84344f3SAlex Elder struct ceph_snap_context *snapc; 148f35a4deeSAlex Elder char *snap_names; /* format 1 only */ 149f35a4deeSAlex Elder u64 *snap_sizes; /* format 1 only */ 15059c2be1eSYehuda Sadeh }; 15159c2be1eSYehuda Sadeh 1520d7dbfceSAlex Elder /* 1530d7dbfceSAlex Elder * An rbd image specification. 1540d7dbfceSAlex Elder * 1550d7dbfceSAlex Elder * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely 156c66c6e0cSAlex Elder * identify an image. Each rbd_dev structure includes a pointer to 157c66c6e0cSAlex Elder * an rbd_spec structure that encapsulates this identity. 158c66c6e0cSAlex Elder * 159c66c6e0cSAlex Elder * Each of the id's in an rbd_spec has an associated name. For a 160c66c6e0cSAlex Elder * user-mapped image, the names are supplied and the id's associated 161c66c6e0cSAlex Elder * with them are looked up. For a layered image, a parent image is 162c66c6e0cSAlex Elder * defined by the tuple, and the names are looked up. 163c66c6e0cSAlex Elder * 164c66c6e0cSAlex Elder * An rbd_dev structure contains a parent_spec pointer which is 165c66c6e0cSAlex Elder * non-null if the image it represents is a child in a layered 166c66c6e0cSAlex Elder * image. This pointer will refer to the rbd_spec structure used 167c66c6e0cSAlex Elder * by the parent rbd_dev for its own identity (i.e., the structure 168c66c6e0cSAlex Elder * is shared between the parent and child). 169c66c6e0cSAlex Elder * 170c66c6e0cSAlex Elder * Since these structures are populated once, during the discovery 171c66c6e0cSAlex Elder * phase of image construction, they are effectively immutable so 172c66c6e0cSAlex Elder * we make no effort to synchronize access to them. 173c66c6e0cSAlex Elder * 174c66c6e0cSAlex Elder * Note that code herein does not assume the image name is known (it 175c66c6e0cSAlex Elder * could be a null pointer). 
1760d7dbfceSAlex Elder */ 1770d7dbfceSAlex Elder struct rbd_spec { 1780d7dbfceSAlex Elder u64 pool_id; 179ecb4dc22SAlex Elder const char *pool_name; 1800d7dbfceSAlex Elder 181ecb4dc22SAlex Elder const char *image_id; 182ecb4dc22SAlex Elder const char *image_name; 1830d7dbfceSAlex Elder 1840d7dbfceSAlex Elder u64 snap_id; 185ecb4dc22SAlex Elder const char *snap_name; 1860d7dbfceSAlex Elder 1870d7dbfceSAlex Elder struct kref kref; 1880d7dbfceSAlex Elder }; 1890d7dbfceSAlex Elder 190602adf40SYehuda Sadeh /* 191f0f8cef5SAlex Elder * an instance of the client. multiple devices may share an rbd client. 192602adf40SYehuda Sadeh */ 193602adf40SYehuda Sadeh struct rbd_client { 194602adf40SYehuda Sadeh struct ceph_client *client; 195602adf40SYehuda Sadeh struct kref kref; 196602adf40SYehuda Sadeh struct list_head node; 197602adf40SYehuda Sadeh }; 198602adf40SYehuda Sadeh 199bf0d5f50SAlex Elder struct rbd_img_request; 200bf0d5f50SAlex Elder typedef void (*rbd_img_callback_t)(struct rbd_img_request *); 201bf0d5f50SAlex Elder 202bf0d5f50SAlex Elder #define BAD_WHICH U32_MAX /* Good which or bad which, which? 
*/ 203bf0d5f50SAlex Elder 204bf0d5f50SAlex Elder struct rbd_obj_request; 205bf0d5f50SAlex Elder typedef void (*rbd_obj_callback_t)(struct rbd_obj_request *); 206bf0d5f50SAlex Elder 2079969ebc5SAlex Elder enum obj_request_type { 2089969ebc5SAlex Elder OBJ_REQUEST_NODATA, OBJ_REQUEST_BIO, OBJ_REQUEST_PAGES 2099969ebc5SAlex Elder }; 210bf0d5f50SAlex Elder 211926f9b3fSAlex Elder enum obj_req_flags { 212926f9b3fSAlex Elder OBJ_REQ_DONE, /* completion flag: not done = 0, done = 1 */ 2136365d33aSAlex Elder OBJ_REQ_IMG_DATA, /* object usage: standalone = 0, image = 1 */ 2145679c59fSAlex Elder OBJ_REQ_KNOWN, /* EXISTS flag valid: no = 0, yes = 1 */ 2155679c59fSAlex Elder OBJ_REQ_EXISTS, /* target exists: no = 0, yes = 1 */ 216926f9b3fSAlex Elder }; 217926f9b3fSAlex Elder 218bf0d5f50SAlex Elder struct rbd_obj_request { 219bf0d5f50SAlex Elder const char *object_name; 220bf0d5f50SAlex Elder u64 offset; /* object start byte */ 221bf0d5f50SAlex Elder u64 length; /* bytes from offset */ 222926f9b3fSAlex Elder unsigned long flags; 223bf0d5f50SAlex Elder 224c5b5ef6cSAlex Elder /* 225c5b5ef6cSAlex Elder * An object request associated with an image will have its 226c5b5ef6cSAlex Elder * img_data flag set; a standalone object request will not. 227c5b5ef6cSAlex Elder * 228c5b5ef6cSAlex Elder * A standalone object request will have which == BAD_WHICH 229c5b5ef6cSAlex Elder * and a null obj_request pointer. 230c5b5ef6cSAlex Elder * 231c5b5ef6cSAlex Elder * An object request initiated in support of a layered image 232c5b5ef6cSAlex Elder * object (to check for its existence before a write) will 233c5b5ef6cSAlex Elder * have which == BAD_WHICH and a non-null obj_request pointer. 234c5b5ef6cSAlex Elder * 235c5b5ef6cSAlex Elder * Finally, an object request for rbd image data will have 236c5b5ef6cSAlex Elder * which != BAD_WHICH, and will have a non-null img_request 237c5b5ef6cSAlex Elder * pointer. 
The value of which will be in the range 238c5b5ef6cSAlex Elder * 0..(img_request->obj_request_count-1). 239c5b5ef6cSAlex Elder */ 240c5b5ef6cSAlex Elder union { 241c5b5ef6cSAlex Elder struct rbd_obj_request *obj_request; /* STAT op */ 242c5b5ef6cSAlex Elder struct { 243bf0d5f50SAlex Elder struct rbd_img_request *img_request; 244c5b5ef6cSAlex Elder u64 img_offset; 245c5b5ef6cSAlex Elder /* links for img_request->obj_requests list */ 246c5b5ef6cSAlex Elder struct list_head links; 247c5b5ef6cSAlex Elder }; 248c5b5ef6cSAlex Elder }; 249bf0d5f50SAlex Elder u32 which; /* posn image request list */ 250bf0d5f50SAlex Elder 251bf0d5f50SAlex Elder enum obj_request_type type; 252788e2df3SAlex Elder union { 253bf0d5f50SAlex Elder struct bio *bio_list; 254788e2df3SAlex Elder struct { 255788e2df3SAlex Elder struct page **pages; 256788e2df3SAlex Elder u32 page_count; 257788e2df3SAlex Elder }; 258788e2df3SAlex Elder }; 2590eefd470SAlex Elder struct page **copyup_pages; 260ebda6408SAlex Elder u32 copyup_page_count; 261bf0d5f50SAlex Elder 262bf0d5f50SAlex Elder struct ceph_osd_request *osd_req; 263bf0d5f50SAlex Elder 264bf0d5f50SAlex Elder u64 xferred; /* bytes transferred */ 2651b83bef2SSage Weil int result; 266bf0d5f50SAlex Elder 267bf0d5f50SAlex Elder rbd_obj_callback_t callback; 268788e2df3SAlex Elder struct completion completion; 269bf0d5f50SAlex Elder 270bf0d5f50SAlex Elder struct kref kref; 271bf0d5f50SAlex Elder }; 272bf0d5f50SAlex Elder 2730c425248SAlex Elder enum img_req_flags { 2749849e986SAlex Elder IMG_REQ_WRITE, /* I/O direction: read = 0, write = 1 */ 2759849e986SAlex Elder IMG_REQ_CHILD, /* initiator: block = 0, child image = 1 */ 276d0b2e944SAlex Elder IMG_REQ_LAYERED, /* ENOENT handling: normal = 0, layered = 1 */ 2770c425248SAlex Elder }; 2780c425248SAlex Elder 279bf0d5f50SAlex Elder struct rbd_img_request { 280bf0d5f50SAlex Elder struct rbd_device *rbd_dev; 281bf0d5f50SAlex Elder u64 offset; /* starting image byte offset */ 282bf0d5f50SAlex Elder u64 length; /* 
byte count from offset */ 2830c425248SAlex Elder unsigned long flags; 284bf0d5f50SAlex Elder union { 285bf0d5f50SAlex Elder u64 snap_id; /* for reads */ 2869849e986SAlex Elder struct ceph_snap_context *snapc; /* for writes */ 2879849e986SAlex Elder }; 2889849e986SAlex Elder union { 2899849e986SAlex Elder struct request *rq; /* block request */ 2909849e986SAlex Elder struct rbd_obj_request *obj_request; /* obj req initiator */ 291bf0d5f50SAlex Elder }; 2923d7efd18SAlex Elder struct page **copyup_pages; 293ebda6408SAlex Elder u32 copyup_page_count; 294bf0d5f50SAlex Elder spinlock_t completion_lock;/* protects next_completion */ 295bf0d5f50SAlex Elder u32 next_completion; 296bf0d5f50SAlex Elder rbd_img_callback_t callback; 29755f27e09SAlex Elder u64 xferred;/* aggregate bytes transferred */ 298a5a337d4SAlex Elder int result; /* first nonzero obj_request result */ 299bf0d5f50SAlex Elder 300bf0d5f50SAlex Elder u32 obj_request_count; 301bf0d5f50SAlex Elder struct list_head obj_requests; /* rbd_obj_request structs */ 302bf0d5f50SAlex Elder 303bf0d5f50SAlex Elder struct kref kref; 304bf0d5f50SAlex Elder }; 305bf0d5f50SAlex Elder 306bf0d5f50SAlex Elder #define for_each_obj_request(ireq, oreq) \ 307ef06f4d3SAlex Elder list_for_each_entry(oreq, &(ireq)->obj_requests, links) 308bf0d5f50SAlex Elder #define for_each_obj_request_from(ireq, oreq) \ 309ef06f4d3SAlex Elder list_for_each_entry_from(oreq, &(ireq)->obj_requests, links) 310bf0d5f50SAlex Elder #define for_each_obj_request_safe(ireq, oreq, n) \ 311ef06f4d3SAlex Elder list_for_each_entry_safe_reverse(oreq, n, &(ireq)->obj_requests, links) 312bf0d5f50SAlex Elder 313f84344f3SAlex Elder struct rbd_mapping { 31499c1f08fSAlex Elder u64 size; 31534b13184SAlex Elder u64 features; 316f84344f3SAlex Elder bool read_only; 317f84344f3SAlex Elder }; 318f84344f3SAlex Elder 319602adf40SYehuda Sadeh /* 320602adf40SYehuda Sadeh * a single device 321602adf40SYehuda Sadeh */ 322602adf40SYehuda Sadeh struct rbd_device { 323de71a297SAlex Elder 
int dev_id; /* blkdev unique id */ 324602adf40SYehuda Sadeh 325602adf40SYehuda Sadeh int major; /* blkdev assigned major */ 326602adf40SYehuda Sadeh struct gendisk *disk; /* blkdev's gendisk and rq */ 327602adf40SYehuda Sadeh 328a30b71b9SAlex Elder u32 image_format; /* Either 1 or 2 */ 329602adf40SYehuda Sadeh struct rbd_client *rbd_client; 330602adf40SYehuda Sadeh 331602adf40SYehuda Sadeh char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */ 332602adf40SYehuda Sadeh 333b82d167bSAlex Elder spinlock_t lock; /* queue, flags, open_count */ 334602adf40SYehuda Sadeh 335602adf40SYehuda Sadeh struct rbd_image_header header; 336b82d167bSAlex Elder unsigned long flags; /* possibly lock protected */ 3370d7dbfceSAlex Elder struct rbd_spec *spec; 338602adf40SYehuda Sadeh 3390d7dbfceSAlex Elder char *header_name; 340971f839aSAlex Elder 3410903e875SAlex Elder struct ceph_file_layout layout; 3420903e875SAlex Elder 34359c2be1eSYehuda Sadeh struct ceph_osd_event *watch_event; 344975241afSAlex Elder struct rbd_obj_request *watch_request; 34559c2be1eSYehuda Sadeh 34686b00e0dSAlex Elder struct rbd_spec *parent_spec; 34786b00e0dSAlex Elder u64 parent_overlap; 348a2acd00eSAlex Elder atomic_t parent_ref; 3492f82ee54SAlex Elder struct rbd_device *parent; 35086b00e0dSAlex Elder 351c666601aSJosh Durgin /* protects updating the header */ 352c666601aSJosh Durgin struct rw_semaphore header_rwsem; 353f84344f3SAlex Elder 354f84344f3SAlex Elder struct rbd_mapping mapping; 355602adf40SYehuda Sadeh 356602adf40SYehuda Sadeh struct list_head node; 357dfc5606dSYehuda Sadeh 358dfc5606dSYehuda Sadeh /* sysfs related */ 359dfc5606dSYehuda Sadeh struct device dev; 360b82d167bSAlex Elder unsigned long open_count; /* protected by lock */ 361dfc5606dSYehuda Sadeh }; 362dfc5606dSYehuda Sadeh 363b82d167bSAlex Elder /* 364b82d167bSAlex Elder * Flag bits for rbd_dev->flags. If atomicity is required, 365b82d167bSAlex Elder * rbd_dev->lock is used to protect access. 
366b82d167bSAlex Elder * 367b82d167bSAlex Elder * Currently, only the "removing" flag (which is coupled with the 368b82d167bSAlex Elder * "open_count" field) requires atomic access. 369b82d167bSAlex Elder */ 3706d292906SAlex Elder enum rbd_dev_flags { 3716d292906SAlex Elder RBD_DEV_FLAG_EXISTS, /* mapped snapshot has not been deleted */ 372b82d167bSAlex Elder RBD_DEV_FLAG_REMOVING, /* this mapping is being removed */ 3736d292906SAlex Elder }; 3746d292906SAlex Elder 375cfbf6377SAlex Elder static DEFINE_MUTEX(client_mutex); /* Serialize client creation */ 376e124a82fSAlex Elder 377602adf40SYehuda Sadeh static LIST_HEAD(rbd_dev_list); /* devices */ 378e124a82fSAlex Elder static DEFINE_SPINLOCK(rbd_dev_list_lock); 379e124a82fSAlex Elder 380602adf40SYehuda Sadeh static LIST_HEAD(rbd_client_list); /* clients */ 381432b8587SAlex Elder static DEFINE_SPINLOCK(rbd_client_list_lock); 382602adf40SYehuda Sadeh 38378c2a44aSAlex Elder /* Slab caches for frequently-allocated structures */ 38478c2a44aSAlex Elder 3851c2a9dfeSAlex Elder static struct kmem_cache *rbd_img_request_cache; 386868311b1SAlex Elder static struct kmem_cache *rbd_obj_request_cache; 38778c2a44aSAlex Elder static struct kmem_cache *rbd_segment_name_cache; 3881c2a9dfeSAlex Elder 3893d7efd18SAlex Elder static int rbd_img_request_submit(struct rbd_img_request *img_request); 3903d7efd18SAlex Elder 391200a6a8bSAlex Elder static void rbd_dev_device_release(struct device *dev); 392dfc5606dSYehuda Sadeh 393f0f8cef5SAlex Elder static ssize_t rbd_add(struct bus_type *bus, const char *buf, 394f0f8cef5SAlex Elder size_t count); 395f0f8cef5SAlex Elder static ssize_t rbd_remove(struct bus_type *bus, const char *buf, 396f0f8cef5SAlex Elder size_t count); 3971f3ef788SAlex Elder static int rbd_dev_image_probe(struct rbd_device *rbd_dev, bool mapping); 398a2acd00eSAlex Elder static void rbd_spec_put(struct rbd_spec *spec); 399f0f8cef5SAlex Elder 400f0f8cef5SAlex Elder static struct bus_attribute rbd_bus_attrs[] = { 
401f0f8cef5SAlex Elder __ATTR(add, S_IWUSR, NULL, rbd_add), 402f0f8cef5SAlex Elder __ATTR(remove, S_IWUSR, NULL, rbd_remove), 403f0f8cef5SAlex Elder __ATTR_NULL 404f0f8cef5SAlex Elder }; 405f0f8cef5SAlex Elder 406f0f8cef5SAlex Elder static struct bus_type rbd_bus_type = { 407f0f8cef5SAlex Elder .name = "rbd", 408f0f8cef5SAlex Elder .bus_attrs = rbd_bus_attrs, 409f0f8cef5SAlex Elder }; 410f0f8cef5SAlex Elder 411f0f8cef5SAlex Elder static void rbd_root_dev_release(struct device *dev) 412f0f8cef5SAlex Elder { 413f0f8cef5SAlex Elder } 414f0f8cef5SAlex Elder 415f0f8cef5SAlex Elder static struct device rbd_root_dev = { 416f0f8cef5SAlex Elder .init_name = "rbd", 417f0f8cef5SAlex Elder .release = rbd_root_dev_release, 418f0f8cef5SAlex Elder }; 419f0f8cef5SAlex Elder 42006ecc6cbSAlex Elder static __printf(2, 3) 42106ecc6cbSAlex Elder void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...) 42206ecc6cbSAlex Elder { 42306ecc6cbSAlex Elder struct va_format vaf; 42406ecc6cbSAlex Elder va_list args; 42506ecc6cbSAlex Elder 42606ecc6cbSAlex Elder va_start(args, fmt); 42706ecc6cbSAlex Elder vaf.fmt = fmt; 42806ecc6cbSAlex Elder vaf.va = &args; 42906ecc6cbSAlex Elder 43006ecc6cbSAlex Elder if (!rbd_dev) 43106ecc6cbSAlex Elder printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf); 43206ecc6cbSAlex Elder else if (rbd_dev->disk) 43306ecc6cbSAlex Elder printk(KERN_WARNING "%s: %s: %pV\n", 43406ecc6cbSAlex Elder RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf); 43506ecc6cbSAlex Elder else if (rbd_dev->spec && rbd_dev->spec->image_name) 43606ecc6cbSAlex Elder printk(KERN_WARNING "%s: image %s: %pV\n", 43706ecc6cbSAlex Elder RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf); 43806ecc6cbSAlex Elder else if (rbd_dev->spec && rbd_dev->spec->image_id) 43906ecc6cbSAlex Elder printk(KERN_WARNING "%s: id %s: %pV\n", 44006ecc6cbSAlex Elder RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf); 44106ecc6cbSAlex Elder else /* punt */ 44206ecc6cbSAlex Elder printk(KERN_WARNING "%s: rbd_dev %p: %pV\n", 
44306ecc6cbSAlex Elder RBD_DRV_NAME, rbd_dev, &vaf); 44406ecc6cbSAlex Elder va_end(args); 44506ecc6cbSAlex Elder } 44606ecc6cbSAlex Elder 447aafb230eSAlex Elder #ifdef RBD_DEBUG 448aafb230eSAlex Elder #define rbd_assert(expr) \ 449aafb230eSAlex Elder if (unlikely(!(expr))) { \ 450aafb230eSAlex Elder printk(KERN_ERR "\nAssertion failure in %s() " \ 451aafb230eSAlex Elder "at line %d:\n\n" \ 452aafb230eSAlex Elder "\trbd_assert(%s);\n\n", \ 453aafb230eSAlex Elder __func__, __LINE__, #expr); \ 454aafb230eSAlex Elder BUG(); \ 455aafb230eSAlex Elder } 456aafb230eSAlex Elder #else /* !RBD_DEBUG */ 457aafb230eSAlex Elder # define rbd_assert(expr) ((void) 0) 458aafb230eSAlex Elder #endif /* !RBD_DEBUG */ 459dfc5606dSYehuda Sadeh 460b454e36dSAlex Elder static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request); 46105a46afdSAlex Elder static void rbd_img_parent_read(struct rbd_obj_request *obj_request); 46205a46afdSAlex Elder static void rbd_dev_remove_parent(struct rbd_device *rbd_dev); 4638b3e1a56SAlex Elder 464cc4a38bdSAlex Elder static int rbd_dev_refresh(struct rbd_device *rbd_dev); 4652df3fac7SAlex Elder static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev); 4662df3fac7SAlex Elder static int rbd_dev_v2_header_info(struct rbd_device *rbd_dev); 46754cac61fSAlex Elder static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, 46854cac61fSAlex Elder u64 snap_id); 4692ad3d716SAlex Elder static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id, 4702ad3d716SAlex Elder u8 *order, u64 *snap_size); 4712ad3d716SAlex Elder static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id, 4722ad3d716SAlex Elder u64 *snap_features); 4732ad3d716SAlex Elder static u64 rbd_snap_id_by_name(struct rbd_device *rbd_dev, const char *name); 47459c2be1eSYehuda Sadeh 475602adf40SYehuda Sadeh static int rbd_open(struct block_device *bdev, fmode_t mode) 476602adf40SYehuda Sadeh { 477f0f8cef5SAlex Elder struct rbd_device *rbd_dev = 
bdev->bd_disk->private_data; 478b82d167bSAlex Elder bool removing = false; 479602adf40SYehuda Sadeh 480f84344f3SAlex Elder if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only) 481602adf40SYehuda Sadeh return -EROFS; 482602adf40SYehuda Sadeh 483a14ea269SAlex Elder spin_lock_irq(&rbd_dev->lock); 484b82d167bSAlex Elder if (test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags)) 485b82d167bSAlex Elder removing = true; 486b82d167bSAlex Elder else 487b82d167bSAlex Elder rbd_dev->open_count++; 488a14ea269SAlex Elder spin_unlock_irq(&rbd_dev->lock); 489b82d167bSAlex Elder if (removing) 490b82d167bSAlex Elder return -ENOENT; 491b82d167bSAlex Elder 492c3e946ceSAlex Elder (void) get_device(&rbd_dev->dev); 493f84344f3SAlex Elder set_device_ro(bdev, rbd_dev->mapping.read_only); 494340c7a2bSAlex Elder 495602adf40SYehuda Sadeh return 0; 496602adf40SYehuda Sadeh } 497602adf40SYehuda Sadeh 498db2a144bSAl Viro static void rbd_release(struct gendisk *disk, fmode_t mode) 499dfc5606dSYehuda Sadeh { 500dfc5606dSYehuda Sadeh struct rbd_device *rbd_dev = disk->private_data; 501b82d167bSAlex Elder unsigned long open_count_before; 502b82d167bSAlex Elder 503a14ea269SAlex Elder spin_lock_irq(&rbd_dev->lock); 504b82d167bSAlex Elder open_count_before = rbd_dev->open_count--; 505a14ea269SAlex Elder spin_unlock_irq(&rbd_dev->lock); 506b82d167bSAlex Elder rbd_assert(open_count_before > 0); 507dfc5606dSYehuda Sadeh 508c3e946ceSAlex Elder put_device(&rbd_dev->dev); 509dfc5606dSYehuda Sadeh } 510dfc5606dSYehuda Sadeh 511602adf40SYehuda Sadeh static const struct block_device_operations rbd_bd_ops = { 512602adf40SYehuda Sadeh .owner = THIS_MODULE, 513602adf40SYehuda Sadeh .open = rbd_open, 514dfc5606dSYehuda Sadeh .release = rbd_release, 515602adf40SYehuda Sadeh }; 516602adf40SYehuda Sadeh 517602adf40SYehuda Sadeh /* 5187262cfcaSAlex Elder * Initialize an rbd client instance. Success or not, this function 519cfbf6377SAlex Elder * consumes ceph_opts. Caller holds client_mutex. 
520602adf40SYehuda Sadeh */ 521f8c38929SAlex Elder static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts) 522602adf40SYehuda Sadeh { 523602adf40SYehuda Sadeh struct rbd_client *rbdc; 524602adf40SYehuda Sadeh int ret = -ENOMEM; 525602adf40SYehuda Sadeh 52637206ee5SAlex Elder dout("%s:\n", __func__); 527602adf40SYehuda Sadeh rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL); 528602adf40SYehuda Sadeh if (!rbdc) 529602adf40SYehuda Sadeh goto out_opt; 530602adf40SYehuda Sadeh 531602adf40SYehuda Sadeh kref_init(&rbdc->kref); 532602adf40SYehuda Sadeh INIT_LIST_HEAD(&rbdc->node); 533602adf40SYehuda Sadeh 53443ae4701SAlex Elder rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0); 535602adf40SYehuda Sadeh if (IS_ERR(rbdc->client)) 53608f75463SAlex Elder goto out_rbdc; 53743ae4701SAlex Elder ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */ 538602adf40SYehuda Sadeh 539602adf40SYehuda Sadeh ret = ceph_open_session(rbdc->client); 540602adf40SYehuda Sadeh if (ret < 0) 54108f75463SAlex Elder goto out_client; 542602adf40SYehuda Sadeh 543432b8587SAlex Elder spin_lock(&rbd_client_list_lock); 544602adf40SYehuda Sadeh list_add_tail(&rbdc->node, &rbd_client_list); 545432b8587SAlex Elder spin_unlock(&rbd_client_list_lock); 546602adf40SYehuda Sadeh 54737206ee5SAlex Elder dout("%s: rbdc %p\n", __func__, rbdc); 548bc534d86SAlex Elder 549602adf40SYehuda Sadeh return rbdc; 55008f75463SAlex Elder out_client: 551602adf40SYehuda Sadeh ceph_destroy_client(rbdc->client); 55208f75463SAlex Elder out_rbdc: 553602adf40SYehuda Sadeh kfree(rbdc); 554602adf40SYehuda Sadeh out_opt: 55543ae4701SAlex Elder if (ceph_opts) 55643ae4701SAlex Elder ceph_destroy_options(ceph_opts); 55737206ee5SAlex Elder dout("%s: error %d\n", __func__, ret); 55837206ee5SAlex Elder 55928f259b7SVasiliy Kulikov return ERR_PTR(ret); 560602adf40SYehuda Sadeh } 561602adf40SYehuda Sadeh 5622f82ee54SAlex Elder static struct rbd_client *__rbd_get_client(struct rbd_client *rbdc) 
5632f82ee54SAlex Elder { 5642f82ee54SAlex Elder kref_get(&rbdc->kref); 5652f82ee54SAlex Elder 5662f82ee54SAlex Elder return rbdc; 5672f82ee54SAlex Elder } 5682f82ee54SAlex Elder 569602adf40SYehuda Sadeh /* 5701f7ba331SAlex Elder * Find a ceph client with specific addr and configuration. If 5711f7ba331SAlex Elder * found, bump its reference count. 572602adf40SYehuda Sadeh */ 5731f7ba331SAlex Elder static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts) 574602adf40SYehuda Sadeh { 575602adf40SYehuda Sadeh struct rbd_client *client_node; 5761f7ba331SAlex Elder bool found = false; 577602adf40SYehuda Sadeh 57843ae4701SAlex Elder if (ceph_opts->flags & CEPH_OPT_NOSHARE) 579602adf40SYehuda Sadeh return NULL; 580602adf40SYehuda Sadeh 5811f7ba331SAlex Elder spin_lock(&rbd_client_list_lock); 5821f7ba331SAlex Elder list_for_each_entry(client_node, &rbd_client_list, node) { 5831f7ba331SAlex Elder if (!ceph_compare_options(ceph_opts, client_node->client)) { 5842f82ee54SAlex Elder __rbd_get_client(client_node); 5852f82ee54SAlex Elder 5861f7ba331SAlex Elder found = true; 5871f7ba331SAlex Elder break; 5881f7ba331SAlex Elder } 5891f7ba331SAlex Elder } 5901f7ba331SAlex Elder spin_unlock(&rbd_client_list_lock); 5911f7ba331SAlex Elder 5921f7ba331SAlex Elder return found ? 
client_node : NULL; 593602adf40SYehuda Sadeh } 594602adf40SYehuda Sadeh 595602adf40SYehuda Sadeh /* 59659c2be1eSYehuda Sadeh * mount options 59759c2be1eSYehuda Sadeh */ 59859c2be1eSYehuda Sadeh enum { 59959c2be1eSYehuda Sadeh Opt_last_int, 60059c2be1eSYehuda Sadeh /* int args above */ 60159c2be1eSYehuda Sadeh Opt_last_string, 60259c2be1eSYehuda Sadeh /* string args above */ 603cc0538b6SAlex Elder Opt_read_only, 604cc0538b6SAlex Elder Opt_read_write, 605cc0538b6SAlex Elder /* Boolean args above */ 606cc0538b6SAlex Elder Opt_last_bool, 60759c2be1eSYehuda Sadeh }; 60859c2be1eSYehuda Sadeh 60943ae4701SAlex Elder static match_table_t rbd_opts_tokens = { 61059c2be1eSYehuda Sadeh /* int args above */ 61159c2be1eSYehuda Sadeh /* string args above */ 612be466c1cSAlex Elder {Opt_read_only, "read_only"}, 613cc0538b6SAlex Elder {Opt_read_only, "ro"}, /* Alternate spelling */ 614cc0538b6SAlex Elder {Opt_read_write, "read_write"}, 615cc0538b6SAlex Elder {Opt_read_write, "rw"}, /* Alternate spelling */ 616cc0538b6SAlex Elder /* Boolean args above */ 61759c2be1eSYehuda Sadeh {-1, NULL} 61859c2be1eSYehuda Sadeh }; 61959c2be1eSYehuda Sadeh 62098571b5aSAlex Elder struct rbd_options { 62198571b5aSAlex Elder bool read_only; 62298571b5aSAlex Elder }; 62398571b5aSAlex Elder 62498571b5aSAlex Elder #define RBD_READ_ONLY_DEFAULT false 62598571b5aSAlex Elder 62659c2be1eSYehuda Sadeh static int parse_rbd_opts_token(char *c, void *private) 62759c2be1eSYehuda Sadeh { 62843ae4701SAlex Elder struct rbd_options *rbd_opts = private; 62959c2be1eSYehuda Sadeh substring_t argstr[MAX_OPT_ARGS]; 63059c2be1eSYehuda Sadeh int token, intval, ret; 63159c2be1eSYehuda Sadeh 63243ae4701SAlex Elder token = match_token(c, rbd_opts_tokens, argstr); 63359c2be1eSYehuda Sadeh if (token < 0) 63459c2be1eSYehuda Sadeh return -EINVAL; 63559c2be1eSYehuda Sadeh 63659c2be1eSYehuda Sadeh if (token < Opt_last_int) { 63759c2be1eSYehuda Sadeh ret = match_int(&argstr[0], &intval); 63859c2be1eSYehuda Sadeh if (ret < 0) { 
63959c2be1eSYehuda Sadeh pr_err("bad mount option arg (not int) " 64059c2be1eSYehuda Sadeh "at '%s'\n", c); 64159c2be1eSYehuda Sadeh return ret; 64259c2be1eSYehuda Sadeh } 64359c2be1eSYehuda Sadeh dout("got int token %d val %d\n", token, intval); 64459c2be1eSYehuda Sadeh } else if (token > Opt_last_int && token < Opt_last_string) { 64559c2be1eSYehuda Sadeh dout("got string token %d val %s\n", token, 64659c2be1eSYehuda Sadeh argstr[0].from); 647cc0538b6SAlex Elder } else if (token > Opt_last_string && token < Opt_last_bool) { 648cc0538b6SAlex Elder dout("got Boolean token %d\n", token); 64959c2be1eSYehuda Sadeh } else { 65059c2be1eSYehuda Sadeh dout("got token %d\n", token); 65159c2be1eSYehuda Sadeh } 65259c2be1eSYehuda Sadeh 65359c2be1eSYehuda Sadeh switch (token) { 654cc0538b6SAlex Elder case Opt_read_only: 655cc0538b6SAlex Elder rbd_opts->read_only = true; 656cc0538b6SAlex Elder break; 657cc0538b6SAlex Elder case Opt_read_write: 658cc0538b6SAlex Elder rbd_opts->read_only = false; 659cc0538b6SAlex Elder break; 66059c2be1eSYehuda Sadeh default: 661aafb230eSAlex Elder rbd_assert(false); 662aafb230eSAlex Elder break; 66359c2be1eSYehuda Sadeh } 66459c2be1eSYehuda Sadeh return 0; 66559c2be1eSYehuda Sadeh } 66659c2be1eSYehuda Sadeh 66759c2be1eSYehuda Sadeh /* 668602adf40SYehuda Sadeh * Get a ceph client with specific addr and configuration, if one does 6697262cfcaSAlex Elder * not exist create it. Either way, ceph_opts is consumed by this 6707262cfcaSAlex Elder * function. 
671602adf40SYehuda Sadeh */ 6729d3997fdSAlex Elder static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts) 673602adf40SYehuda Sadeh { 674f8c38929SAlex Elder struct rbd_client *rbdc; 67559c2be1eSYehuda Sadeh 676cfbf6377SAlex Elder mutex_lock_nested(&client_mutex, SINGLE_DEPTH_NESTING); 6771f7ba331SAlex Elder rbdc = rbd_client_find(ceph_opts); 6789d3997fdSAlex Elder if (rbdc) /* using an existing client */ 67943ae4701SAlex Elder ceph_destroy_options(ceph_opts); 6809d3997fdSAlex Elder else 681f8c38929SAlex Elder rbdc = rbd_client_create(ceph_opts); 682cfbf6377SAlex Elder mutex_unlock(&client_mutex); 683d720bcb0SAlex Elder 6849d3997fdSAlex Elder return rbdc; 685602adf40SYehuda Sadeh } 686602adf40SYehuda Sadeh 687602adf40SYehuda Sadeh /* 688602adf40SYehuda Sadeh * Destroy ceph client 689d23a4b3fSAlex Elder * 690432b8587SAlex Elder * Caller must hold rbd_client_list_lock. 691602adf40SYehuda Sadeh */ 692602adf40SYehuda Sadeh static void rbd_client_release(struct kref *kref) 693602adf40SYehuda Sadeh { 694602adf40SYehuda Sadeh struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref); 695602adf40SYehuda Sadeh 69637206ee5SAlex Elder dout("%s: rbdc %p\n", __func__, rbdc); 697cd9d9f5dSAlex Elder spin_lock(&rbd_client_list_lock); 698602adf40SYehuda Sadeh list_del(&rbdc->node); 699cd9d9f5dSAlex Elder spin_unlock(&rbd_client_list_lock); 700602adf40SYehuda Sadeh 701602adf40SYehuda Sadeh ceph_destroy_client(rbdc->client); 702602adf40SYehuda Sadeh kfree(rbdc); 703602adf40SYehuda Sadeh } 704602adf40SYehuda Sadeh 705602adf40SYehuda Sadeh /* 706602adf40SYehuda Sadeh * Drop reference to ceph client node. If it's not referenced anymore, release 707602adf40SYehuda Sadeh * it. 
708602adf40SYehuda Sadeh */ 7099d3997fdSAlex Elder static void rbd_put_client(struct rbd_client *rbdc) 710602adf40SYehuda Sadeh { 711c53d5893SAlex Elder if (rbdc) 7129d3997fdSAlex Elder kref_put(&rbdc->kref, rbd_client_release); 713602adf40SYehuda Sadeh } 714602adf40SYehuda Sadeh 715a30b71b9SAlex Elder static bool rbd_image_format_valid(u32 image_format) 716a30b71b9SAlex Elder { 717a30b71b9SAlex Elder return image_format == 1 || image_format == 2; 718a30b71b9SAlex Elder } 719a30b71b9SAlex Elder 7208e94af8eSAlex Elder static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk) 7218e94af8eSAlex Elder { 722103a150fSAlex Elder size_t size; 723103a150fSAlex Elder u32 snap_count; 724103a150fSAlex Elder 725103a150fSAlex Elder /* The header has to start with the magic rbd header text */ 726103a150fSAlex Elder if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT))) 727103a150fSAlex Elder return false; 728103a150fSAlex Elder 729db2388b6SAlex Elder /* The bio layer requires at least sector-sized I/O */ 730db2388b6SAlex Elder 731db2388b6SAlex Elder if (ondisk->options.order < SECTOR_SHIFT) 732db2388b6SAlex Elder return false; 733db2388b6SAlex Elder 734db2388b6SAlex Elder /* If we use u64 in a few spots we may be able to loosen this */ 735db2388b6SAlex Elder 736db2388b6SAlex Elder if (ondisk->options.order > 8 * sizeof (int) - 1) 737db2388b6SAlex Elder return false; 738db2388b6SAlex Elder 739103a150fSAlex Elder /* 740103a150fSAlex Elder * The size of a snapshot header has to fit in a size_t, and 741103a150fSAlex Elder * that limits the number of snapshots. 
742103a150fSAlex Elder */ 743103a150fSAlex Elder snap_count = le32_to_cpu(ondisk->snap_count); 744103a150fSAlex Elder size = SIZE_MAX - sizeof (struct ceph_snap_context); 745103a150fSAlex Elder if (snap_count > size / sizeof (__le64)) 746103a150fSAlex Elder return false; 747103a150fSAlex Elder 748103a150fSAlex Elder /* 749103a150fSAlex Elder * Not only that, but the size of the entire the snapshot 750103a150fSAlex Elder * header must also be representable in a size_t. 751103a150fSAlex Elder */ 752103a150fSAlex Elder size -= snap_count * sizeof (__le64); 753103a150fSAlex Elder if ((u64) size < le64_to_cpu(ondisk->snap_names_len)) 754103a150fSAlex Elder return false; 755103a150fSAlex Elder 756103a150fSAlex Elder return true; 7578e94af8eSAlex Elder } 7588e94af8eSAlex Elder 759602adf40SYehuda Sadeh /* 760bb23e37aSAlex Elder * Fill an rbd image header with information from the given format 1 761bb23e37aSAlex Elder * on-disk header. 762602adf40SYehuda Sadeh */ 763662518b1SAlex Elder static int rbd_header_from_disk(struct rbd_device *rbd_dev, 7644156d998SAlex Elder struct rbd_image_header_ondisk *ondisk) 765602adf40SYehuda Sadeh { 766662518b1SAlex Elder struct rbd_image_header *header = &rbd_dev->header; 767bb23e37aSAlex Elder bool first_time = header->object_prefix == NULL; 768bb23e37aSAlex Elder struct ceph_snap_context *snapc; 769bb23e37aSAlex Elder char *object_prefix = NULL; 770bb23e37aSAlex Elder char *snap_names = NULL; 771bb23e37aSAlex Elder u64 *snap_sizes = NULL; 772ccece235SAlex Elder u32 snap_count; 773d2bb24e5SAlex Elder size_t size; 774bb23e37aSAlex Elder int ret = -ENOMEM; 775621901d6SAlex Elder u32 i; 776602adf40SYehuda Sadeh 777bb23e37aSAlex Elder /* Allocate this now to avoid having to handle failure below */ 778103a150fSAlex Elder 779bb23e37aSAlex Elder if (first_time) { 780bb23e37aSAlex Elder size_t len; 781bb23e37aSAlex Elder 782bb23e37aSAlex Elder len = strnlen(ondisk->object_prefix, 783bb23e37aSAlex Elder sizeof (ondisk->object_prefix)); 
784bb23e37aSAlex Elder object_prefix = kmalloc(len + 1, GFP_KERNEL); 785bb23e37aSAlex Elder if (!object_prefix) 786602adf40SYehuda Sadeh return -ENOMEM; 787bb23e37aSAlex Elder memcpy(object_prefix, ondisk->object_prefix, len); 788bb23e37aSAlex Elder object_prefix[len] = '\0'; 789bb23e37aSAlex Elder } 79000f1f36fSAlex Elder 791bb23e37aSAlex Elder /* Allocate the snapshot context and fill it in */ 792d2bb24e5SAlex Elder 793602adf40SYehuda Sadeh snap_count = le32_to_cpu(ondisk->snap_count); 794bb23e37aSAlex Elder snapc = ceph_create_snap_context(snap_count, GFP_KERNEL); 795bb23e37aSAlex Elder if (!snapc) 796bb23e37aSAlex Elder goto out_err; 797bb23e37aSAlex Elder snapc->seq = le64_to_cpu(ondisk->snap_seq); 798602adf40SYehuda Sadeh if (snap_count) { 799bb23e37aSAlex Elder struct rbd_image_snap_ondisk *snaps; 800f785cc1dSAlex Elder u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len); 801f785cc1dSAlex Elder 802bb23e37aSAlex Elder /* We'll keep a copy of the snapshot names... */ 803621901d6SAlex Elder 804f785cc1dSAlex Elder if (snap_names_len > (u64)SIZE_MAX) 805bb23e37aSAlex Elder goto out_2big; 806bb23e37aSAlex Elder snap_names = kmalloc(snap_names_len, GFP_KERNEL); 807bb23e37aSAlex Elder if (!snap_names) 808602adf40SYehuda Sadeh goto out_err; 809bb23e37aSAlex Elder 810bb23e37aSAlex Elder /* ...as well as the array of their sizes. */ 811bb23e37aSAlex Elder 812bb23e37aSAlex Elder size = snap_count * sizeof (*header->snap_sizes); 813bb23e37aSAlex Elder snap_sizes = kmalloc(size, GFP_KERNEL); 814bb23e37aSAlex Elder if (!snap_sizes) 815bb23e37aSAlex Elder goto out_err; 816bb23e37aSAlex Elder 817f785cc1dSAlex Elder /* 818bb23e37aSAlex Elder * Copy the names, and fill in each snapshot's id 819bb23e37aSAlex Elder * and size. 
820bb23e37aSAlex Elder * 82199a41ebcSAlex Elder * Note that rbd_dev_v1_header_info() guarantees the 822bb23e37aSAlex Elder * ondisk buffer we're working with has 823f785cc1dSAlex Elder * snap_names_len bytes beyond the end of the 824f785cc1dSAlex Elder * snapshot id array, this memcpy() is safe. 825f785cc1dSAlex Elder */ 826bb23e37aSAlex Elder memcpy(snap_names, &ondisk->snaps[snap_count], snap_names_len); 827bb23e37aSAlex Elder snaps = ondisk->snaps; 828bb23e37aSAlex Elder for (i = 0; i < snap_count; i++) { 829bb23e37aSAlex Elder snapc->snaps[i] = le64_to_cpu(snaps[i].id); 830bb23e37aSAlex Elder snap_sizes[i] = le64_to_cpu(snaps[i].image_size); 831bb23e37aSAlex Elder } 832602adf40SYehuda Sadeh } 833849b4260SAlex Elder 834bb23e37aSAlex Elder /* We won't fail any more, fill in the header */ 835bb23e37aSAlex Elder 836bb23e37aSAlex Elder if (first_time) { 837bb23e37aSAlex Elder header->object_prefix = object_prefix; 838602adf40SYehuda Sadeh header->obj_order = ondisk->options.order; 839602adf40SYehuda Sadeh header->crypt_type = ondisk->options.crypt_type; 840602adf40SYehuda Sadeh header->comp_type = ondisk->options.comp_type; 841bb23e37aSAlex Elder /* The rest aren't used for format 1 images */ 842bb23e37aSAlex Elder header->stripe_unit = 0; 843bb23e37aSAlex Elder header->stripe_count = 0; 844bb23e37aSAlex Elder header->features = 0; 845662518b1SAlex Elder } else { 846662518b1SAlex Elder ceph_put_snap_context(header->snapc); 847662518b1SAlex Elder kfree(header->snap_names); 848662518b1SAlex Elder kfree(header->snap_sizes); 849bb23e37aSAlex Elder } 8506a52325fSAlex Elder 851bb23e37aSAlex Elder /* The remaining fields always get updated (when we refresh) */ 852621901d6SAlex Elder 853f84344f3SAlex Elder header->image_size = le64_to_cpu(ondisk->image_size); 854bb23e37aSAlex Elder header->snapc = snapc; 855bb23e37aSAlex Elder header->snap_names = snap_names; 856bb23e37aSAlex Elder header->snap_sizes = snap_sizes; 857468521c1SAlex Elder 858662518b1SAlex Elder /* Make sure 
mapping size is consistent with header info */ 859662518b1SAlex Elder 860662518b1SAlex Elder if (rbd_dev->spec->snap_id == CEPH_NOSNAP || first_time) 861662518b1SAlex Elder if (rbd_dev->mapping.size != header->image_size) 862662518b1SAlex Elder rbd_dev->mapping.size = header->image_size; 863662518b1SAlex Elder 864602adf40SYehuda Sadeh return 0; 865bb23e37aSAlex Elder out_2big: 866bb23e37aSAlex Elder ret = -EIO; 8676a52325fSAlex Elder out_err: 868bb23e37aSAlex Elder kfree(snap_sizes); 869bb23e37aSAlex Elder kfree(snap_names); 870bb23e37aSAlex Elder ceph_put_snap_context(snapc); 871bb23e37aSAlex Elder kfree(object_prefix); 872ccece235SAlex Elder 873bb23e37aSAlex Elder return ret; 874602adf40SYehuda Sadeh } 875602adf40SYehuda Sadeh 8769682fc6dSAlex Elder static const char *_rbd_dev_v1_snap_name(struct rbd_device *rbd_dev, u32 which) 8779682fc6dSAlex Elder { 8789682fc6dSAlex Elder const char *snap_name; 8799682fc6dSAlex Elder 8809682fc6dSAlex Elder rbd_assert(which < rbd_dev->header.snapc->num_snaps); 8819682fc6dSAlex Elder 8829682fc6dSAlex Elder /* Skip over names until we find the one we are looking for */ 8839682fc6dSAlex Elder 8849682fc6dSAlex Elder snap_name = rbd_dev->header.snap_names; 8859682fc6dSAlex Elder while (which--) 8869682fc6dSAlex Elder snap_name += strlen(snap_name) + 1; 8879682fc6dSAlex Elder 8889682fc6dSAlex Elder return kstrdup(snap_name, GFP_KERNEL); 8899682fc6dSAlex Elder } 8909682fc6dSAlex Elder 89130d1cff8SAlex Elder /* 89230d1cff8SAlex Elder * Snapshot id comparison function for use with qsort()/bsearch(). 89330d1cff8SAlex Elder * Note that result is for snapshots in *descending* order. 
89430d1cff8SAlex Elder */ 89530d1cff8SAlex Elder static int snapid_compare_reverse(const void *s1, const void *s2) 89630d1cff8SAlex Elder { 89730d1cff8SAlex Elder u64 snap_id1 = *(u64 *)s1; 89830d1cff8SAlex Elder u64 snap_id2 = *(u64 *)s2; 89930d1cff8SAlex Elder 90030d1cff8SAlex Elder if (snap_id1 < snap_id2) 90130d1cff8SAlex Elder return 1; 90230d1cff8SAlex Elder return snap_id1 == snap_id2 ? 0 : -1; 90330d1cff8SAlex Elder } 90430d1cff8SAlex Elder 90530d1cff8SAlex Elder /* 90630d1cff8SAlex Elder * Search a snapshot context to see if the given snapshot id is 90730d1cff8SAlex Elder * present. 90830d1cff8SAlex Elder * 90930d1cff8SAlex Elder * Returns the position of the snapshot id in the array if it's found, 91030d1cff8SAlex Elder * or BAD_SNAP_INDEX otherwise. 91130d1cff8SAlex Elder * 91230d1cff8SAlex Elder * Note: The snapshot array is in kept sorted (by the osd) in 91330d1cff8SAlex Elder * reverse order, highest snapshot id first. 91430d1cff8SAlex Elder */ 9159682fc6dSAlex Elder static u32 rbd_dev_snap_index(struct rbd_device *rbd_dev, u64 snap_id) 9169682fc6dSAlex Elder { 9179682fc6dSAlex Elder struct ceph_snap_context *snapc = rbd_dev->header.snapc; 91830d1cff8SAlex Elder u64 *found; 9199682fc6dSAlex Elder 92030d1cff8SAlex Elder found = bsearch(&snap_id, &snapc->snaps, snapc->num_snaps, 92130d1cff8SAlex Elder sizeof (snap_id), snapid_compare_reverse); 9229682fc6dSAlex Elder 92330d1cff8SAlex Elder return found ? 
(u32)(found - &snapc->snaps[0]) : BAD_SNAP_INDEX; 9249682fc6dSAlex Elder } 9259682fc6dSAlex Elder 9262ad3d716SAlex Elder static const char *rbd_dev_v1_snap_name(struct rbd_device *rbd_dev, 9272ad3d716SAlex Elder u64 snap_id) 92854cac61fSAlex Elder { 92954cac61fSAlex Elder u32 which; 93054cac61fSAlex Elder 93154cac61fSAlex Elder which = rbd_dev_snap_index(rbd_dev, snap_id); 93254cac61fSAlex Elder if (which == BAD_SNAP_INDEX) 93354cac61fSAlex Elder return NULL; 93454cac61fSAlex Elder 93554cac61fSAlex Elder return _rbd_dev_v1_snap_name(rbd_dev, which); 93654cac61fSAlex Elder } 93754cac61fSAlex Elder 9389e15b77dSAlex Elder static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id) 9399e15b77dSAlex Elder { 9409e15b77dSAlex Elder if (snap_id == CEPH_NOSNAP) 9419e15b77dSAlex Elder return RBD_SNAP_HEAD_NAME; 9429e15b77dSAlex Elder 94354cac61fSAlex Elder rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); 94454cac61fSAlex Elder if (rbd_dev->image_format == 1) 94554cac61fSAlex Elder return rbd_dev_v1_snap_name(rbd_dev, snap_id); 9469e15b77dSAlex Elder 94754cac61fSAlex Elder return rbd_dev_v2_snap_name(rbd_dev, snap_id); 9489e15b77dSAlex Elder } 9499e15b77dSAlex Elder 9502ad3d716SAlex Elder static int rbd_snap_size(struct rbd_device *rbd_dev, u64 snap_id, 9512ad3d716SAlex Elder u64 *snap_size) 952602adf40SYehuda Sadeh { 9532ad3d716SAlex Elder rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); 9542ad3d716SAlex Elder if (snap_id == CEPH_NOSNAP) { 9552ad3d716SAlex Elder *snap_size = rbd_dev->header.image_size; 9562ad3d716SAlex Elder } else if (rbd_dev->image_format == 1) { 9572ad3d716SAlex Elder u32 which; 95800f1f36fSAlex Elder 9592ad3d716SAlex Elder which = rbd_dev_snap_index(rbd_dev, snap_id); 9602ad3d716SAlex Elder if (which == BAD_SNAP_INDEX) 9612ad3d716SAlex Elder return -ENOENT; 96200f1f36fSAlex Elder 9632ad3d716SAlex Elder *snap_size = rbd_dev->header.snap_sizes[which]; 9642ad3d716SAlex Elder } else { 9652ad3d716SAlex Elder u64 size = 0; 
9662ad3d716SAlex Elder int ret; 9672ad3d716SAlex Elder 9682ad3d716SAlex Elder ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, NULL, &size); 9692ad3d716SAlex Elder if (ret) 9702ad3d716SAlex Elder return ret; 9712ad3d716SAlex Elder 9722ad3d716SAlex Elder *snap_size = size; 9732ad3d716SAlex Elder } 9742ad3d716SAlex Elder return 0; 9752ad3d716SAlex Elder } 9762ad3d716SAlex Elder 9772ad3d716SAlex Elder static int rbd_snap_features(struct rbd_device *rbd_dev, u64 snap_id, 9782ad3d716SAlex Elder u64 *snap_features) 9792ad3d716SAlex Elder { 9802ad3d716SAlex Elder rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); 9812ad3d716SAlex Elder if (snap_id == CEPH_NOSNAP) { 9822ad3d716SAlex Elder *snap_features = rbd_dev->header.features; 9832ad3d716SAlex Elder } else if (rbd_dev->image_format == 1) { 9842ad3d716SAlex Elder *snap_features = 0; /* No features for format 1 */ 9852ad3d716SAlex Elder } else { 9862ad3d716SAlex Elder u64 features = 0; 9872ad3d716SAlex Elder int ret; 9882ad3d716SAlex Elder 9892ad3d716SAlex Elder ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, &features); 9902ad3d716SAlex Elder if (ret) 9912ad3d716SAlex Elder return ret; 9922ad3d716SAlex Elder 9932ad3d716SAlex Elder *snap_features = features; 9942ad3d716SAlex Elder } 9952ad3d716SAlex Elder return 0; 99600f1f36fSAlex Elder } 997602adf40SYehuda Sadeh 998d1cf5788SAlex Elder static int rbd_dev_mapping_set(struct rbd_device *rbd_dev) 999602adf40SYehuda Sadeh { 10008f4b7d98SAlex Elder u64 snap_id = rbd_dev->spec->snap_id; 10012ad3d716SAlex Elder u64 size = 0; 10022ad3d716SAlex Elder u64 features = 0; 10032ad3d716SAlex Elder int ret; 10048b0241f8SAlex Elder 10052ad3d716SAlex Elder ret = rbd_snap_size(rbd_dev, snap_id, &size); 10062ad3d716SAlex Elder if (ret) 10072ad3d716SAlex Elder return ret; 10082ad3d716SAlex Elder ret = rbd_snap_features(rbd_dev, snap_id, &features); 10092ad3d716SAlex Elder if (ret) 10102ad3d716SAlex Elder return ret; 10112ad3d716SAlex Elder 10122ad3d716SAlex Elder 
rbd_dev->mapping.size = size; 10132ad3d716SAlex Elder rbd_dev->mapping.features = features; 10142ad3d716SAlex Elder 10158b0241f8SAlex Elder return 0; 1016602adf40SYehuda Sadeh } 1017602adf40SYehuda Sadeh 1018d1cf5788SAlex Elder static void rbd_dev_mapping_clear(struct rbd_device *rbd_dev) 1019d1cf5788SAlex Elder { 1020d1cf5788SAlex Elder rbd_dev->mapping.size = 0; 1021d1cf5788SAlex Elder rbd_dev->mapping.features = 0; 1022200a6a8bSAlex Elder } 1023200a6a8bSAlex Elder 102498571b5aSAlex Elder static const char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset) 1025602adf40SYehuda Sadeh { 102665ccfe21SAlex Elder char *name; 102765ccfe21SAlex Elder u64 segment; 102865ccfe21SAlex Elder int ret; 10293a96d5cdSJosh Durgin char *name_format; 1030602adf40SYehuda Sadeh 103178c2a44aSAlex Elder name = kmem_cache_alloc(rbd_segment_name_cache, GFP_NOIO); 103265ccfe21SAlex Elder if (!name) 103365ccfe21SAlex Elder return NULL; 103465ccfe21SAlex Elder segment = offset >> rbd_dev->header.obj_order; 10353a96d5cdSJosh Durgin name_format = "%s.%012llx"; 10363a96d5cdSJosh Durgin if (rbd_dev->image_format == 2) 10373a96d5cdSJosh Durgin name_format = "%s.%016llx"; 10383a96d5cdSJosh Durgin ret = snprintf(name, MAX_OBJ_NAME_SIZE + 1, name_format, 103965ccfe21SAlex Elder rbd_dev->header.object_prefix, segment); 10402fd82b9eSAlex Elder if (ret < 0 || ret > MAX_OBJ_NAME_SIZE) { 104165ccfe21SAlex Elder pr_err("error formatting segment name for #%llu (%d)\n", 104265ccfe21SAlex Elder segment, ret); 104365ccfe21SAlex Elder kfree(name); 104465ccfe21SAlex Elder name = NULL; 104565ccfe21SAlex Elder } 1046602adf40SYehuda Sadeh 104765ccfe21SAlex Elder return name; 104865ccfe21SAlex Elder } 1049602adf40SYehuda Sadeh 105078c2a44aSAlex Elder static void rbd_segment_name_free(const char *name) 105178c2a44aSAlex Elder { 105278c2a44aSAlex Elder /* The explicit cast here is needed to drop the const qualifier */ 105378c2a44aSAlex Elder 105478c2a44aSAlex Elder kmem_cache_free(rbd_segment_name_cache, (void 
*)name); 105578c2a44aSAlex Elder } 105678c2a44aSAlex Elder 105765ccfe21SAlex Elder static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset) 105865ccfe21SAlex Elder { 105965ccfe21SAlex Elder u64 segment_size = (u64) 1 << rbd_dev->header.obj_order; 1060602adf40SYehuda Sadeh 106165ccfe21SAlex Elder return offset & (segment_size - 1); 106265ccfe21SAlex Elder } 106365ccfe21SAlex Elder 106465ccfe21SAlex Elder static u64 rbd_segment_length(struct rbd_device *rbd_dev, 106565ccfe21SAlex Elder u64 offset, u64 length) 106665ccfe21SAlex Elder { 106765ccfe21SAlex Elder u64 segment_size = (u64) 1 << rbd_dev->header.obj_order; 106865ccfe21SAlex Elder 106965ccfe21SAlex Elder offset &= segment_size - 1; 107065ccfe21SAlex Elder 1071aafb230eSAlex Elder rbd_assert(length <= U64_MAX - offset); 107265ccfe21SAlex Elder if (offset + length > segment_size) 107365ccfe21SAlex Elder length = segment_size - offset; 107465ccfe21SAlex Elder 107565ccfe21SAlex Elder return length; 1076602adf40SYehuda Sadeh } 1077602adf40SYehuda Sadeh 1078602adf40SYehuda Sadeh /* 1079029bcbd8SJosh Durgin * returns the size of an object in the image 1080029bcbd8SJosh Durgin */ 1081029bcbd8SJosh Durgin static u64 rbd_obj_bytes(struct rbd_image_header *header) 1082029bcbd8SJosh Durgin { 1083029bcbd8SJosh Durgin return 1 << header->obj_order; 1084029bcbd8SJosh Durgin } 1085029bcbd8SJosh Durgin 1086029bcbd8SJosh Durgin /* 1087602adf40SYehuda Sadeh * bio helpers 1088602adf40SYehuda Sadeh */ 1089602adf40SYehuda Sadeh 1090602adf40SYehuda Sadeh static void bio_chain_put(struct bio *chain) 1091602adf40SYehuda Sadeh { 1092602adf40SYehuda Sadeh struct bio *tmp; 1093602adf40SYehuda Sadeh 1094602adf40SYehuda Sadeh while (chain) { 1095602adf40SYehuda Sadeh tmp = chain; 1096602adf40SYehuda Sadeh chain = chain->bi_next; 1097602adf40SYehuda Sadeh bio_put(tmp); 1098602adf40SYehuda Sadeh } 1099602adf40SYehuda Sadeh } 1100602adf40SYehuda Sadeh 1101602adf40SYehuda Sadeh /* 1102602adf40SYehuda Sadeh * zeros a bio chain, 
starting at specific offset 1103602adf40SYehuda Sadeh */ 1104602adf40SYehuda Sadeh static void zero_bio_chain(struct bio *chain, int start_ofs) 1105602adf40SYehuda Sadeh { 1106602adf40SYehuda Sadeh struct bio_vec *bv; 1107602adf40SYehuda Sadeh unsigned long flags; 1108602adf40SYehuda Sadeh void *buf; 1109602adf40SYehuda Sadeh int i; 1110602adf40SYehuda Sadeh int pos = 0; 1111602adf40SYehuda Sadeh 1112602adf40SYehuda Sadeh while (chain) { 1113602adf40SYehuda Sadeh bio_for_each_segment(bv, chain, i) { 1114602adf40SYehuda Sadeh if (pos + bv->bv_len > start_ofs) { 1115602adf40SYehuda Sadeh int remainder = max(start_ofs - pos, 0); 1116602adf40SYehuda Sadeh buf = bvec_kmap_irq(bv, &flags); 1117602adf40SYehuda Sadeh memset(buf + remainder, 0, 1118602adf40SYehuda Sadeh bv->bv_len - remainder); 1119e2156054SAlex Elder flush_dcache_page(bv->bv_page); 112085b5aaa6SDan Carpenter bvec_kunmap_irq(buf, &flags); 1121602adf40SYehuda Sadeh } 1122602adf40SYehuda Sadeh pos += bv->bv_len; 1123602adf40SYehuda Sadeh } 1124602adf40SYehuda Sadeh 1125602adf40SYehuda Sadeh chain = chain->bi_next; 1126602adf40SYehuda Sadeh } 1127602adf40SYehuda Sadeh } 1128602adf40SYehuda Sadeh 1129602adf40SYehuda Sadeh /* 1130b9434c5bSAlex Elder * similar to zero_bio_chain(), zeros data defined by a page array, 1131b9434c5bSAlex Elder * starting at the given byte offset from the start of the array and 1132b9434c5bSAlex Elder * continuing up to the given end offset. The pages array is 1133b9434c5bSAlex Elder * assumed to be big enough to hold all bytes up to the end. 
1134b9434c5bSAlex Elder */ 1135b9434c5bSAlex Elder static void zero_pages(struct page **pages, u64 offset, u64 end) 1136b9434c5bSAlex Elder { 1137b9434c5bSAlex Elder struct page **page = &pages[offset >> PAGE_SHIFT]; 1138b9434c5bSAlex Elder 1139b9434c5bSAlex Elder rbd_assert(end > offset); 1140b9434c5bSAlex Elder rbd_assert(end - offset <= (u64)SIZE_MAX); 1141b9434c5bSAlex Elder while (offset < end) { 1142b9434c5bSAlex Elder size_t page_offset; 1143b9434c5bSAlex Elder size_t length; 1144b9434c5bSAlex Elder unsigned long flags; 1145b9434c5bSAlex Elder void *kaddr; 1146b9434c5bSAlex Elder 1147491205a8SGeert Uytterhoeven page_offset = offset & ~PAGE_MASK; 1148491205a8SGeert Uytterhoeven length = min_t(size_t, PAGE_SIZE - page_offset, end - offset); 1149b9434c5bSAlex Elder local_irq_save(flags); 1150b9434c5bSAlex Elder kaddr = kmap_atomic(*page); 1151b9434c5bSAlex Elder memset(kaddr + page_offset, 0, length); 1152e2156054SAlex Elder flush_dcache_page(*page); 1153b9434c5bSAlex Elder kunmap_atomic(kaddr); 1154b9434c5bSAlex Elder local_irq_restore(flags); 1155b9434c5bSAlex Elder 1156b9434c5bSAlex Elder offset += length; 1157b9434c5bSAlex Elder page++; 1158b9434c5bSAlex Elder } 1159b9434c5bSAlex Elder } 1160b9434c5bSAlex Elder 1161b9434c5bSAlex Elder /* 1162f7760dadSAlex Elder * Clone a portion of a bio, starting at the given byte offset 1163f7760dadSAlex Elder * and continuing for the number of bytes indicated. 
1164602adf40SYehuda Sadeh */ 1165f7760dadSAlex Elder static struct bio *bio_clone_range(struct bio *bio_src, 1166f7760dadSAlex Elder unsigned int offset, 1167f7760dadSAlex Elder unsigned int len, 1168f7760dadSAlex Elder gfp_t gfpmask) 1169602adf40SYehuda Sadeh { 1170f7760dadSAlex Elder struct bio_vec *bv; 1171f7760dadSAlex Elder unsigned int resid; 1172f7760dadSAlex Elder unsigned short idx; 1173f7760dadSAlex Elder unsigned int voff; 1174f7760dadSAlex Elder unsigned short end_idx; 1175f7760dadSAlex Elder unsigned short vcnt; 1176f7760dadSAlex Elder struct bio *bio; 1177602adf40SYehuda Sadeh 1178f7760dadSAlex Elder /* Handle the easy case for the caller */ 1179f7760dadSAlex Elder 1180f7760dadSAlex Elder if (!offset && len == bio_src->bi_size) 1181f7760dadSAlex Elder return bio_clone(bio_src, gfpmask); 1182f7760dadSAlex Elder 1183f7760dadSAlex Elder if (WARN_ON_ONCE(!len)) 1184f7760dadSAlex Elder return NULL; 1185f7760dadSAlex Elder if (WARN_ON_ONCE(len > bio_src->bi_size)) 1186f7760dadSAlex Elder return NULL; 1187f7760dadSAlex Elder if (WARN_ON_ONCE(offset > bio_src->bi_size - len)) 1188f7760dadSAlex Elder return NULL; 1189f7760dadSAlex Elder 1190f7760dadSAlex Elder /* Find first affected segment... 
*/ 1191f7760dadSAlex Elder 1192f7760dadSAlex Elder resid = offset; 1193d74c6d51SKent Overstreet bio_for_each_segment(bv, bio_src, idx) { 1194f7760dadSAlex Elder if (resid < bv->bv_len) 1195f7760dadSAlex Elder break; 1196f7760dadSAlex Elder resid -= bv->bv_len; 1197602adf40SYehuda Sadeh } 1198f7760dadSAlex Elder voff = resid; 1199602adf40SYehuda Sadeh 1200f7760dadSAlex Elder /* ...and the last affected segment */ 1201542582fcSAlex Elder 1202f7760dadSAlex Elder resid += len; 1203f7760dadSAlex Elder __bio_for_each_segment(bv, bio_src, end_idx, idx) { 1204f7760dadSAlex Elder if (resid <= bv->bv_len) 1205f7760dadSAlex Elder break; 1206f7760dadSAlex Elder resid -= bv->bv_len; 1207f7760dadSAlex Elder } 1208f7760dadSAlex Elder vcnt = end_idx - idx + 1; 1209602adf40SYehuda Sadeh 1210f7760dadSAlex Elder /* Build the clone */ 1211f7760dadSAlex Elder 1212f7760dadSAlex Elder bio = bio_alloc(gfpmask, (unsigned int) vcnt); 1213f7760dadSAlex Elder if (!bio) 1214f7760dadSAlex Elder return NULL; /* ENOMEM */ 1215f7760dadSAlex Elder 1216f7760dadSAlex Elder bio->bi_bdev = bio_src->bi_bdev; 1217f7760dadSAlex Elder bio->bi_sector = bio_src->bi_sector + (offset >> SECTOR_SHIFT); 1218f7760dadSAlex Elder bio->bi_rw = bio_src->bi_rw; 1219f7760dadSAlex Elder bio->bi_flags |= 1 << BIO_CLONED; 1220602adf40SYehuda Sadeh 1221602adf40SYehuda Sadeh /* 1222f7760dadSAlex Elder * Copy over our part of the bio_vec, then update the first 1223f7760dadSAlex Elder * and last (or only) entries. 
1224602adf40SYehuda Sadeh */ 1225f7760dadSAlex Elder memcpy(&bio->bi_io_vec[0], &bio_src->bi_io_vec[idx], 1226f7760dadSAlex Elder vcnt * sizeof (struct bio_vec)); 1227f7760dadSAlex Elder bio->bi_io_vec[0].bv_offset += voff; 1228f7760dadSAlex Elder if (vcnt > 1) { 1229f7760dadSAlex Elder bio->bi_io_vec[0].bv_len -= voff; 1230f7760dadSAlex Elder bio->bi_io_vec[vcnt - 1].bv_len = resid; 1231602adf40SYehuda Sadeh } else { 1232f7760dadSAlex Elder bio->bi_io_vec[0].bv_len = len; 1233602adf40SYehuda Sadeh } 1234602adf40SYehuda Sadeh 1235f7760dadSAlex Elder bio->bi_vcnt = vcnt; 1236f7760dadSAlex Elder bio->bi_size = len; 1237f7760dadSAlex Elder bio->bi_idx = 0; 1238602adf40SYehuda Sadeh 1239f7760dadSAlex Elder return bio; 1240602adf40SYehuda Sadeh } 1241602adf40SYehuda Sadeh 1242f7760dadSAlex Elder /* 1243f7760dadSAlex Elder * Clone a portion of a bio chain, starting at the given byte offset 1244f7760dadSAlex Elder * into the first bio in the source chain and continuing for the 1245f7760dadSAlex Elder * number of bytes indicated. The result is another bio chain of 1246f7760dadSAlex Elder * exactly the given length, or a null pointer on error. 1247f7760dadSAlex Elder * 1248f7760dadSAlex Elder * The bio_src and offset parameters are both in-out. On entry they 1249f7760dadSAlex Elder * refer to the first source bio and the offset into that bio where 1250f7760dadSAlex Elder * the start of data to be cloned is located. 1251f7760dadSAlex Elder * 1252f7760dadSAlex Elder * On return, bio_src is updated to refer to the bio in the source 1253f7760dadSAlex Elder * chain that contains first un-cloned byte, and *offset will 1254f7760dadSAlex Elder * contain the offset of that byte within that bio. 
1255f7760dadSAlex Elder */ 1256f7760dadSAlex Elder static struct bio *bio_chain_clone_range(struct bio **bio_src, 1257f7760dadSAlex Elder unsigned int *offset, 1258f7760dadSAlex Elder unsigned int len, 1259f7760dadSAlex Elder gfp_t gfpmask) 1260f7760dadSAlex Elder { 1261f7760dadSAlex Elder struct bio *bi = *bio_src; 1262f7760dadSAlex Elder unsigned int off = *offset; 1263f7760dadSAlex Elder struct bio *chain = NULL; 1264f7760dadSAlex Elder struct bio **end; 1265602adf40SYehuda Sadeh 1266f7760dadSAlex Elder /* Build up a chain of clone bios up to the limit */ 1267602adf40SYehuda Sadeh 1268f7760dadSAlex Elder if (!bi || off >= bi->bi_size || !len) 1269f7760dadSAlex Elder return NULL; /* Nothing to clone */ 1270602adf40SYehuda Sadeh 1271f7760dadSAlex Elder end = &chain; 1272f7760dadSAlex Elder while (len) { 1273f7760dadSAlex Elder unsigned int bi_size; 1274f7760dadSAlex Elder struct bio *bio; 1275f7760dadSAlex Elder 1276f5400b7aSAlex Elder if (!bi) { 1277f5400b7aSAlex Elder rbd_warn(NULL, "bio_chain exhausted with %u left", len); 1278f7760dadSAlex Elder goto out_err; /* EINVAL; ran out of bio's */ 1279f5400b7aSAlex Elder } 1280f7760dadSAlex Elder bi_size = min_t(unsigned int, bi->bi_size - off, len); 1281f7760dadSAlex Elder bio = bio_clone_range(bi, off, bi_size, gfpmask); 1282f7760dadSAlex Elder if (!bio) 1283f7760dadSAlex Elder goto out_err; /* ENOMEM */ 1284f7760dadSAlex Elder 1285f7760dadSAlex Elder *end = bio; 1286f7760dadSAlex Elder end = &bio->bi_next; 1287f7760dadSAlex Elder 1288f7760dadSAlex Elder off += bi_size; 1289f7760dadSAlex Elder if (off == bi->bi_size) { 1290f7760dadSAlex Elder bi = bi->bi_next; 1291f7760dadSAlex Elder off = 0; 1292f7760dadSAlex Elder } 1293f7760dadSAlex Elder len -= bi_size; 1294f7760dadSAlex Elder } 1295f7760dadSAlex Elder *bio_src = bi; 1296f7760dadSAlex Elder *offset = off; 1297f7760dadSAlex Elder 1298f7760dadSAlex Elder return chain; 1299f7760dadSAlex Elder out_err: 1300f7760dadSAlex Elder bio_chain_put(chain); 1301f7760dadSAlex 
Elder 1302602adf40SYehuda Sadeh return NULL; 1303602adf40SYehuda Sadeh } 1304602adf40SYehuda Sadeh 1305926f9b3fSAlex Elder /* 1306926f9b3fSAlex Elder * The default/initial value for all object request flags is 0. For 1307926f9b3fSAlex Elder * each flag, once its value is set to 1 it is never reset to 0 1308926f9b3fSAlex Elder * again. 1309926f9b3fSAlex Elder */ 13106365d33aSAlex Elder static void obj_request_img_data_set(struct rbd_obj_request *obj_request) 13116365d33aSAlex Elder { 13126365d33aSAlex Elder if (test_and_set_bit(OBJ_REQ_IMG_DATA, &obj_request->flags)) { 13136365d33aSAlex Elder struct rbd_device *rbd_dev; 13146365d33aSAlex Elder 131557acbaa7SAlex Elder rbd_dev = obj_request->img_request->rbd_dev; 13166365d33aSAlex Elder rbd_warn(rbd_dev, "obj_request %p already marked img_data\n", 13176365d33aSAlex Elder obj_request); 13186365d33aSAlex Elder } 13196365d33aSAlex Elder } 13206365d33aSAlex Elder 13216365d33aSAlex Elder static bool obj_request_img_data_test(struct rbd_obj_request *obj_request) 13226365d33aSAlex Elder { 13236365d33aSAlex Elder smp_mb(); 13246365d33aSAlex Elder return test_bit(OBJ_REQ_IMG_DATA, &obj_request->flags) != 0; 13256365d33aSAlex Elder } 13266365d33aSAlex Elder 132757acbaa7SAlex Elder static void obj_request_done_set(struct rbd_obj_request *obj_request) 132857acbaa7SAlex Elder { 132957acbaa7SAlex Elder if (test_and_set_bit(OBJ_REQ_DONE, &obj_request->flags)) { 133057acbaa7SAlex Elder struct rbd_device *rbd_dev = NULL; 133157acbaa7SAlex Elder 133257acbaa7SAlex Elder if (obj_request_img_data_test(obj_request)) 133357acbaa7SAlex Elder rbd_dev = obj_request->img_request->rbd_dev; 133457acbaa7SAlex Elder rbd_warn(rbd_dev, "obj_request %p already marked done\n", 133557acbaa7SAlex Elder obj_request); 133657acbaa7SAlex Elder } 133757acbaa7SAlex Elder } 133857acbaa7SAlex Elder 133957acbaa7SAlex Elder static bool obj_request_done_test(struct rbd_obj_request *obj_request) 134057acbaa7SAlex Elder { 134157acbaa7SAlex Elder smp_mb(); 
134257acbaa7SAlex Elder return test_bit(OBJ_REQ_DONE, &obj_request->flags) != 0; 134357acbaa7SAlex Elder } 134457acbaa7SAlex Elder 13455679c59fSAlex Elder /* 13465679c59fSAlex Elder * This sets the KNOWN flag after (possibly) setting the EXISTS 13475679c59fSAlex Elder * flag. The latter is set based on the "exists" value provided. 13485679c59fSAlex Elder * 13495679c59fSAlex Elder * Note that for our purposes once an object exists it never goes 13505679c59fSAlex Elder * away again. It's possible that the response from two existence 13515679c59fSAlex Elder * checks are separated by the creation of the target object, and 13525679c59fSAlex Elder * the first ("doesn't exist") response arrives *after* the second 13535679c59fSAlex Elder * ("does exist"). In that case we ignore the second one. 13545679c59fSAlex Elder */ 13555679c59fSAlex Elder static void obj_request_existence_set(struct rbd_obj_request *obj_request, 13565679c59fSAlex Elder bool exists) 13575679c59fSAlex Elder { 13585679c59fSAlex Elder if (exists) 13595679c59fSAlex Elder set_bit(OBJ_REQ_EXISTS, &obj_request->flags); 13605679c59fSAlex Elder set_bit(OBJ_REQ_KNOWN, &obj_request->flags); 13615679c59fSAlex Elder smp_mb(); 13625679c59fSAlex Elder } 13635679c59fSAlex Elder 13645679c59fSAlex Elder static bool obj_request_known_test(struct rbd_obj_request *obj_request) 13655679c59fSAlex Elder { 13665679c59fSAlex Elder smp_mb(); 13675679c59fSAlex Elder return test_bit(OBJ_REQ_KNOWN, &obj_request->flags) != 0; 13685679c59fSAlex Elder } 13695679c59fSAlex Elder 13705679c59fSAlex Elder static bool obj_request_exists_test(struct rbd_obj_request *obj_request) 13715679c59fSAlex Elder { 13725679c59fSAlex Elder smp_mb(); 13735679c59fSAlex Elder return test_bit(OBJ_REQ_EXISTS, &obj_request->flags) != 0; 13745679c59fSAlex Elder } 13755679c59fSAlex Elder 1376bf0d5f50SAlex Elder static void rbd_obj_request_get(struct rbd_obj_request *obj_request) 1377bf0d5f50SAlex Elder { 137837206ee5SAlex Elder dout("%s: obj %p (was %d)\n", 
__func__, obj_request, 137937206ee5SAlex Elder atomic_read(&obj_request->kref.refcount)); 1380bf0d5f50SAlex Elder kref_get(&obj_request->kref); 1381bf0d5f50SAlex Elder } 1382bf0d5f50SAlex Elder 1383bf0d5f50SAlex Elder static void rbd_obj_request_destroy(struct kref *kref); 1384bf0d5f50SAlex Elder static void rbd_obj_request_put(struct rbd_obj_request *obj_request) 1385bf0d5f50SAlex Elder { 1386bf0d5f50SAlex Elder rbd_assert(obj_request != NULL); 138737206ee5SAlex Elder dout("%s: obj %p (was %d)\n", __func__, obj_request, 138837206ee5SAlex Elder atomic_read(&obj_request->kref.refcount)); 1389bf0d5f50SAlex Elder kref_put(&obj_request->kref, rbd_obj_request_destroy); 1390bf0d5f50SAlex Elder } 1391bf0d5f50SAlex Elder 1392e93f3152SAlex Elder static bool img_request_child_test(struct rbd_img_request *img_request); 1393e93f3152SAlex Elder static void rbd_parent_request_destroy(struct kref *kref); 1394bf0d5f50SAlex Elder static void rbd_img_request_destroy(struct kref *kref); 1395bf0d5f50SAlex Elder static void rbd_img_request_put(struct rbd_img_request *img_request) 1396bf0d5f50SAlex Elder { 1397bf0d5f50SAlex Elder rbd_assert(img_request != NULL); 139837206ee5SAlex Elder dout("%s: img %p (was %d)\n", __func__, img_request, 139937206ee5SAlex Elder atomic_read(&img_request->kref.refcount)); 1400e93f3152SAlex Elder if (img_request_child_test(img_request)) 1401e93f3152SAlex Elder kref_put(&img_request->kref, rbd_parent_request_destroy); 1402e93f3152SAlex Elder else 1403bf0d5f50SAlex Elder kref_put(&img_request->kref, rbd_img_request_destroy); 1404bf0d5f50SAlex Elder } 1405bf0d5f50SAlex Elder 1406bf0d5f50SAlex Elder static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request, 1407bf0d5f50SAlex Elder struct rbd_obj_request *obj_request) 1408bf0d5f50SAlex Elder { 140925dcf954SAlex Elder rbd_assert(obj_request->img_request == NULL); 141025dcf954SAlex Elder 1411b155e86cSAlex Elder /* Image request now owns object's original reference */ 1412bf0d5f50SAlex Elder 
obj_request->img_request = img_request; 141325dcf954SAlex Elder obj_request->which = img_request->obj_request_count; 14146365d33aSAlex Elder rbd_assert(!obj_request_img_data_test(obj_request)); 14156365d33aSAlex Elder obj_request_img_data_set(obj_request); 1416bf0d5f50SAlex Elder rbd_assert(obj_request->which != BAD_WHICH); 141725dcf954SAlex Elder img_request->obj_request_count++; 141825dcf954SAlex Elder list_add_tail(&obj_request->links, &img_request->obj_requests); 141937206ee5SAlex Elder dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request, 142037206ee5SAlex Elder obj_request->which); 1421bf0d5f50SAlex Elder } 1422bf0d5f50SAlex Elder 1423bf0d5f50SAlex Elder static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request, 1424bf0d5f50SAlex Elder struct rbd_obj_request *obj_request) 1425bf0d5f50SAlex Elder { 1426bf0d5f50SAlex Elder rbd_assert(obj_request->which != BAD_WHICH); 142725dcf954SAlex Elder 142837206ee5SAlex Elder dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request, 142937206ee5SAlex Elder obj_request->which); 1430bf0d5f50SAlex Elder list_del(&obj_request->links); 143125dcf954SAlex Elder rbd_assert(img_request->obj_request_count > 0); 143225dcf954SAlex Elder img_request->obj_request_count--; 143325dcf954SAlex Elder rbd_assert(obj_request->which == img_request->obj_request_count); 143425dcf954SAlex Elder obj_request->which = BAD_WHICH; 14356365d33aSAlex Elder rbd_assert(obj_request_img_data_test(obj_request)); 1436bf0d5f50SAlex Elder rbd_assert(obj_request->img_request == img_request); 1437bf0d5f50SAlex Elder obj_request->img_request = NULL; 143825dcf954SAlex Elder obj_request->callback = NULL; 1439bf0d5f50SAlex Elder rbd_obj_request_put(obj_request); 1440bf0d5f50SAlex Elder } 1441bf0d5f50SAlex Elder 1442bf0d5f50SAlex Elder static bool obj_request_type_valid(enum obj_request_type type) 1443bf0d5f50SAlex Elder { 1444bf0d5f50SAlex Elder switch (type) { 14459969ebc5SAlex Elder case OBJ_REQUEST_NODATA: 
1446bf0d5f50SAlex Elder case OBJ_REQUEST_BIO: 1447788e2df3SAlex Elder case OBJ_REQUEST_PAGES: 1448bf0d5f50SAlex Elder return true; 1449bf0d5f50SAlex Elder default: 1450bf0d5f50SAlex Elder return false; 1451bf0d5f50SAlex Elder } 1452bf0d5f50SAlex Elder } 1453bf0d5f50SAlex Elder 1454bf0d5f50SAlex Elder static int rbd_obj_request_submit(struct ceph_osd_client *osdc, 1455bf0d5f50SAlex Elder struct rbd_obj_request *obj_request) 1456bf0d5f50SAlex Elder { 145737206ee5SAlex Elder dout("%s: osdc %p obj %p\n", __func__, osdc, obj_request); 145837206ee5SAlex Elder 1459bf0d5f50SAlex Elder return ceph_osdc_start_request(osdc, obj_request->osd_req, false); 1460bf0d5f50SAlex Elder } 1461bf0d5f50SAlex Elder 1462bf0d5f50SAlex Elder static void rbd_img_request_complete(struct rbd_img_request *img_request) 1463bf0d5f50SAlex Elder { 146455f27e09SAlex Elder 146537206ee5SAlex Elder dout("%s: img %p\n", __func__, img_request); 146655f27e09SAlex Elder 146755f27e09SAlex Elder /* 146855f27e09SAlex Elder * If no error occurred, compute the aggregate transfer 146955f27e09SAlex Elder * count for the image request. We could instead use 147055f27e09SAlex Elder * atomic64_cmpxchg() to update it as each object request 147155f27e09SAlex Elder * completes; not clear which way is better off hand. 
147255f27e09SAlex Elder */ 147355f27e09SAlex Elder if (!img_request->result) { 147455f27e09SAlex Elder struct rbd_obj_request *obj_request; 147555f27e09SAlex Elder u64 xferred = 0; 147655f27e09SAlex Elder 147755f27e09SAlex Elder for_each_obj_request(img_request, obj_request) 147855f27e09SAlex Elder xferred += obj_request->xferred; 147955f27e09SAlex Elder img_request->xferred = xferred; 148055f27e09SAlex Elder } 148155f27e09SAlex Elder 1482bf0d5f50SAlex Elder if (img_request->callback) 1483bf0d5f50SAlex Elder img_request->callback(img_request); 1484bf0d5f50SAlex Elder else 1485bf0d5f50SAlex Elder rbd_img_request_put(img_request); 1486bf0d5f50SAlex Elder } 1487bf0d5f50SAlex Elder 1488788e2df3SAlex Elder /* Caller is responsible for rbd_obj_request_destroy(obj_request) */ 1489788e2df3SAlex Elder 1490788e2df3SAlex Elder static int rbd_obj_request_wait(struct rbd_obj_request *obj_request) 1491788e2df3SAlex Elder { 149237206ee5SAlex Elder dout("%s: obj %p\n", __func__, obj_request); 149337206ee5SAlex Elder 1494788e2df3SAlex Elder return wait_for_completion_interruptible(&obj_request->completion); 1495788e2df3SAlex Elder } 1496788e2df3SAlex Elder 14970c425248SAlex Elder /* 14980c425248SAlex Elder * The default/initial value for all image request flags is 0. Each 14990c425248SAlex Elder * is conditionally set to 1 at image request initialization time 15000c425248SAlex Elder * and currently never change thereafter. 
15010c425248SAlex Elder */ 15020c425248SAlex Elder static void img_request_write_set(struct rbd_img_request *img_request) 15030c425248SAlex Elder { 15040c425248SAlex Elder set_bit(IMG_REQ_WRITE, &img_request->flags); 15050c425248SAlex Elder smp_mb(); 15060c425248SAlex Elder } 15070c425248SAlex Elder 15080c425248SAlex Elder static bool img_request_write_test(struct rbd_img_request *img_request) 15090c425248SAlex Elder { 15100c425248SAlex Elder smp_mb(); 15110c425248SAlex Elder return test_bit(IMG_REQ_WRITE, &img_request->flags) != 0; 15120c425248SAlex Elder } 15130c425248SAlex Elder 15149849e986SAlex Elder static void img_request_child_set(struct rbd_img_request *img_request) 15159849e986SAlex Elder { 15169849e986SAlex Elder set_bit(IMG_REQ_CHILD, &img_request->flags); 15179849e986SAlex Elder smp_mb(); 15189849e986SAlex Elder } 15199849e986SAlex Elder 1520e93f3152SAlex Elder static void img_request_child_clear(struct rbd_img_request *img_request) 1521e93f3152SAlex Elder { 1522e93f3152SAlex Elder clear_bit(IMG_REQ_CHILD, &img_request->flags); 1523e93f3152SAlex Elder smp_mb(); 1524e93f3152SAlex Elder } 1525e93f3152SAlex Elder 15269849e986SAlex Elder static bool img_request_child_test(struct rbd_img_request *img_request) 15279849e986SAlex Elder { 15289849e986SAlex Elder smp_mb(); 15299849e986SAlex Elder return test_bit(IMG_REQ_CHILD, &img_request->flags) != 0; 15309849e986SAlex Elder } 15319849e986SAlex Elder 1532d0b2e944SAlex Elder static void img_request_layered_set(struct rbd_img_request *img_request) 1533d0b2e944SAlex Elder { 1534d0b2e944SAlex Elder set_bit(IMG_REQ_LAYERED, &img_request->flags); 1535d0b2e944SAlex Elder smp_mb(); 1536d0b2e944SAlex Elder } 1537d0b2e944SAlex Elder 1538a2acd00eSAlex Elder static void img_request_layered_clear(struct rbd_img_request *img_request) 1539a2acd00eSAlex Elder { 1540a2acd00eSAlex Elder clear_bit(IMG_REQ_LAYERED, &img_request->flags); 1541a2acd00eSAlex Elder smp_mb(); 1542a2acd00eSAlex Elder } 1543a2acd00eSAlex Elder 
1544d0b2e944SAlex Elder static bool img_request_layered_test(struct rbd_img_request *img_request) 1545d0b2e944SAlex Elder { 1546d0b2e944SAlex Elder smp_mb(); 1547d0b2e944SAlex Elder return test_bit(IMG_REQ_LAYERED, &img_request->flags) != 0; 1548d0b2e944SAlex Elder } 1549d0b2e944SAlex Elder 15506e2a4505SAlex Elder static void 15516e2a4505SAlex Elder rbd_img_obj_request_read_callback(struct rbd_obj_request *obj_request) 15526e2a4505SAlex Elder { 1553b9434c5bSAlex Elder u64 xferred = obj_request->xferred; 1554b9434c5bSAlex Elder u64 length = obj_request->length; 1555b9434c5bSAlex Elder 15566e2a4505SAlex Elder dout("%s: obj %p img %p result %d %llu/%llu\n", __func__, 15576e2a4505SAlex Elder obj_request, obj_request->img_request, obj_request->result, 1558b9434c5bSAlex Elder xferred, length); 15596e2a4505SAlex Elder /* 156017c1cc1dSJosh Durgin * ENOENT means a hole in the image. We zero-fill the entire 156117c1cc1dSJosh Durgin * length of the request. A short read also implies zero-fill 156217c1cc1dSJosh Durgin * to the end of the request. An error requires the whole 156317c1cc1dSJosh Durgin * length of the request to be reported finished with an error 156417c1cc1dSJosh Durgin * to the block layer. In each case we update the xferred 156517c1cc1dSJosh Durgin * count to indicate the whole request was satisfied. 
15666e2a4505SAlex Elder */ 1567b9434c5bSAlex Elder rbd_assert(obj_request->type != OBJ_REQUEST_NODATA); 15686e2a4505SAlex Elder if (obj_request->result == -ENOENT) { 1569b9434c5bSAlex Elder if (obj_request->type == OBJ_REQUEST_BIO) 15706e2a4505SAlex Elder zero_bio_chain(obj_request->bio_list, 0); 1571b9434c5bSAlex Elder else 1572b9434c5bSAlex Elder zero_pages(obj_request->pages, 0, length); 15736e2a4505SAlex Elder obj_request->result = 0; 1574b9434c5bSAlex Elder } else if (xferred < length && !obj_request->result) { 1575b9434c5bSAlex Elder if (obj_request->type == OBJ_REQUEST_BIO) 1576b9434c5bSAlex Elder zero_bio_chain(obj_request->bio_list, xferred); 1577b9434c5bSAlex Elder else 1578b9434c5bSAlex Elder zero_pages(obj_request->pages, xferred, length); 15796e2a4505SAlex Elder } 158017c1cc1dSJosh Durgin obj_request->xferred = length; 15816e2a4505SAlex Elder obj_request_done_set(obj_request); 15826e2a4505SAlex Elder } 15836e2a4505SAlex Elder 1584bf0d5f50SAlex Elder static void rbd_obj_request_complete(struct rbd_obj_request *obj_request) 1585bf0d5f50SAlex Elder { 158637206ee5SAlex Elder dout("%s: obj %p cb %p\n", __func__, obj_request, 158737206ee5SAlex Elder obj_request->callback); 1588bf0d5f50SAlex Elder if (obj_request->callback) 1589bf0d5f50SAlex Elder obj_request->callback(obj_request); 1590788e2df3SAlex Elder else 1591788e2df3SAlex Elder complete_all(&obj_request->completion); 1592bf0d5f50SAlex Elder } 1593bf0d5f50SAlex Elder 1594c47f9371SAlex Elder static void rbd_osd_trivial_callback(struct rbd_obj_request *obj_request) 159539bf2c5dSAlex Elder { 159639bf2c5dSAlex Elder dout("%s: obj %p\n", __func__, obj_request); 159739bf2c5dSAlex Elder obj_request_done_set(obj_request); 159839bf2c5dSAlex Elder } 159939bf2c5dSAlex Elder 1600c47f9371SAlex Elder static void rbd_osd_read_callback(struct rbd_obj_request *obj_request) 1601bf0d5f50SAlex Elder { 160257acbaa7SAlex Elder struct rbd_img_request *img_request = NULL; 1603a9e8ba2cSAlex Elder struct rbd_device *rbd_dev = 
NULL; 160457acbaa7SAlex Elder bool layered = false; 160557acbaa7SAlex Elder 160657acbaa7SAlex Elder if (obj_request_img_data_test(obj_request)) { 160757acbaa7SAlex Elder img_request = obj_request->img_request; 160857acbaa7SAlex Elder layered = img_request && img_request_layered_test(img_request); 1609a9e8ba2cSAlex Elder rbd_dev = img_request->rbd_dev; 161057acbaa7SAlex Elder } 16118b3e1a56SAlex Elder 16128b3e1a56SAlex Elder dout("%s: obj %p img %p result %d %llu/%llu\n", __func__, 16138b3e1a56SAlex Elder obj_request, img_request, obj_request->result, 16148b3e1a56SAlex Elder obj_request->xferred, obj_request->length); 1615a9e8ba2cSAlex Elder if (layered && obj_request->result == -ENOENT && 1616a9e8ba2cSAlex Elder obj_request->img_offset < rbd_dev->parent_overlap) 16178b3e1a56SAlex Elder rbd_img_parent_read(obj_request); 16188b3e1a56SAlex Elder else if (img_request) 16196e2a4505SAlex Elder rbd_img_obj_request_read_callback(obj_request); 16206e2a4505SAlex Elder else 162107741308SAlex Elder obj_request_done_set(obj_request); 1622bf0d5f50SAlex Elder } 1623bf0d5f50SAlex Elder 1624c47f9371SAlex Elder static void rbd_osd_write_callback(struct rbd_obj_request *obj_request) 1625bf0d5f50SAlex Elder { 16261b83bef2SSage Weil dout("%s: obj %p result %d %llu\n", __func__, obj_request, 16271b83bef2SSage Weil obj_request->result, obj_request->length); 16281b83bef2SSage Weil /* 16298b3e1a56SAlex Elder * There is no such thing as a successful short write. Set 16308b3e1a56SAlex Elder * it to our originally-requested length. 16311b83bef2SSage Weil */ 16321b83bef2SSage Weil obj_request->xferred = obj_request->length; 163307741308SAlex Elder obj_request_done_set(obj_request); 1634bf0d5f50SAlex Elder } 1635bf0d5f50SAlex Elder 1636fbfab539SAlex Elder /* 1637fbfab539SAlex Elder * For a simple stat call there's nothing to do. We'll do more if 1638fbfab539SAlex Elder * this is part of a write sequence for a layered image. 
1639fbfab539SAlex Elder */ 1640c47f9371SAlex Elder static void rbd_osd_stat_callback(struct rbd_obj_request *obj_request) 1641fbfab539SAlex Elder { 164237206ee5SAlex Elder dout("%s: obj %p\n", __func__, obj_request); 1643fbfab539SAlex Elder obj_request_done_set(obj_request); 1644fbfab539SAlex Elder } 1645fbfab539SAlex Elder 1646bf0d5f50SAlex Elder static void rbd_osd_req_callback(struct ceph_osd_request *osd_req, 1647bf0d5f50SAlex Elder struct ceph_msg *msg) 1648bf0d5f50SAlex Elder { 1649bf0d5f50SAlex Elder struct rbd_obj_request *obj_request = osd_req->r_priv; 1650bf0d5f50SAlex Elder u16 opcode; 1651bf0d5f50SAlex Elder 165237206ee5SAlex Elder dout("%s: osd_req %p msg %p\n", __func__, osd_req, msg); 1653bf0d5f50SAlex Elder rbd_assert(osd_req == obj_request->osd_req); 165457acbaa7SAlex Elder if (obj_request_img_data_test(obj_request)) { 165557acbaa7SAlex Elder rbd_assert(obj_request->img_request); 165657acbaa7SAlex Elder rbd_assert(obj_request->which != BAD_WHICH); 165757acbaa7SAlex Elder } else { 165857acbaa7SAlex Elder rbd_assert(obj_request->which == BAD_WHICH); 165957acbaa7SAlex Elder } 1660bf0d5f50SAlex Elder 16611b83bef2SSage Weil if (osd_req->r_result < 0) 16621b83bef2SSage Weil obj_request->result = osd_req->r_result; 1663bf0d5f50SAlex Elder 16640eefd470SAlex Elder BUG_ON(osd_req->r_num_ops > 2); 1665bf0d5f50SAlex Elder 1666c47f9371SAlex Elder /* 1667c47f9371SAlex Elder * We support a 64-bit length, but ultimately it has to be 1668c47f9371SAlex Elder * passed to blk_end_request(), which takes an unsigned int. 
1669c47f9371SAlex Elder */ 16701b83bef2SSage Weil obj_request->xferred = osd_req->r_reply_op_len[0]; 1671c47f9371SAlex Elder rbd_assert(obj_request->xferred < (u64)UINT_MAX); 167279528734SAlex Elder opcode = osd_req->r_ops[0].op; 1673bf0d5f50SAlex Elder switch (opcode) { 1674bf0d5f50SAlex Elder case CEPH_OSD_OP_READ: 1675c47f9371SAlex Elder rbd_osd_read_callback(obj_request); 1676bf0d5f50SAlex Elder break; 1677bf0d5f50SAlex Elder case CEPH_OSD_OP_WRITE: 1678c47f9371SAlex Elder rbd_osd_write_callback(obj_request); 1679bf0d5f50SAlex Elder break; 1680fbfab539SAlex Elder case CEPH_OSD_OP_STAT: 1681c47f9371SAlex Elder rbd_osd_stat_callback(obj_request); 1682fbfab539SAlex Elder break; 168336be9a76SAlex Elder case CEPH_OSD_OP_CALL: 1684b8d70035SAlex Elder case CEPH_OSD_OP_NOTIFY_ACK: 16859969ebc5SAlex Elder case CEPH_OSD_OP_WATCH: 1686c47f9371SAlex Elder rbd_osd_trivial_callback(obj_request); 16879969ebc5SAlex Elder break; 1688bf0d5f50SAlex Elder default: 1689bf0d5f50SAlex Elder rbd_warn(NULL, "%s: unsupported op %hu\n", 1690bf0d5f50SAlex Elder obj_request->object_name, (unsigned short) opcode); 1691bf0d5f50SAlex Elder break; 1692bf0d5f50SAlex Elder } 1693bf0d5f50SAlex Elder 169407741308SAlex Elder if (obj_request_done_test(obj_request)) 1695bf0d5f50SAlex Elder rbd_obj_request_complete(obj_request); 1696bf0d5f50SAlex Elder } 1697bf0d5f50SAlex Elder 16989d4df01fSAlex Elder static void rbd_osd_req_format_read(struct rbd_obj_request *obj_request) 1699430c28c3SAlex Elder { 1700430c28c3SAlex Elder struct rbd_img_request *img_request = obj_request->img_request; 17018c042b0dSAlex Elder struct ceph_osd_request *osd_req = obj_request->osd_req; 17029d4df01fSAlex Elder u64 snap_id; 1703430c28c3SAlex Elder 17048c042b0dSAlex Elder rbd_assert(osd_req != NULL); 1705430c28c3SAlex Elder 17069d4df01fSAlex Elder snap_id = img_request ? 
img_request->snap_id : CEPH_NOSNAP; 17078c042b0dSAlex Elder ceph_osdc_build_request(osd_req, obj_request->offset, 17089d4df01fSAlex Elder NULL, snap_id, NULL); 17099d4df01fSAlex Elder } 17109d4df01fSAlex Elder 17119d4df01fSAlex Elder static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request) 17129d4df01fSAlex Elder { 17139d4df01fSAlex Elder struct rbd_img_request *img_request = obj_request->img_request; 17149d4df01fSAlex Elder struct ceph_osd_request *osd_req = obj_request->osd_req; 17159d4df01fSAlex Elder struct ceph_snap_context *snapc; 17169d4df01fSAlex Elder struct timespec mtime = CURRENT_TIME; 17179d4df01fSAlex Elder 17189d4df01fSAlex Elder rbd_assert(osd_req != NULL); 17199d4df01fSAlex Elder 17209d4df01fSAlex Elder snapc = img_request ? img_request->snapc : NULL; 17219d4df01fSAlex Elder ceph_osdc_build_request(osd_req, obj_request->offset, 17229d4df01fSAlex Elder snapc, CEPH_NOSNAP, &mtime); 1723430c28c3SAlex Elder } 1724430c28c3SAlex Elder 1725bf0d5f50SAlex Elder static struct ceph_osd_request *rbd_osd_req_create( 1726bf0d5f50SAlex Elder struct rbd_device *rbd_dev, 1727bf0d5f50SAlex Elder bool write_request, 1728430c28c3SAlex Elder struct rbd_obj_request *obj_request) 1729bf0d5f50SAlex Elder { 1730bf0d5f50SAlex Elder struct ceph_snap_context *snapc = NULL; 1731bf0d5f50SAlex Elder struct ceph_osd_client *osdc; 1732bf0d5f50SAlex Elder struct ceph_osd_request *osd_req; 1733bf0d5f50SAlex Elder 17346365d33aSAlex Elder if (obj_request_img_data_test(obj_request)) { 17356365d33aSAlex Elder struct rbd_img_request *img_request = obj_request->img_request; 17366365d33aSAlex Elder 17370c425248SAlex Elder rbd_assert(write_request == 17380c425248SAlex Elder img_request_write_test(img_request)); 17390c425248SAlex Elder if (write_request) 1740bf0d5f50SAlex Elder snapc = img_request->snapc; 1741bf0d5f50SAlex Elder } 1742bf0d5f50SAlex Elder 1743bf0d5f50SAlex Elder /* Allocate and initialize the request, for the single op */ 1744bf0d5f50SAlex Elder 
1745bf0d5f50SAlex Elder osdc = &rbd_dev->rbd_client->client->osdc; 1746bf0d5f50SAlex Elder osd_req = ceph_osdc_alloc_request(osdc, snapc, 1, false, GFP_ATOMIC); 1747bf0d5f50SAlex Elder if (!osd_req) 1748bf0d5f50SAlex Elder return NULL; /* ENOMEM */ 1749bf0d5f50SAlex Elder 1750430c28c3SAlex Elder if (write_request) 1751bf0d5f50SAlex Elder osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK; 1752430c28c3SAlex Elder else 1753bf0d5f50SAlex Elder osd_req->r_flags = CEPH_OSD_FLAG_READ; 1754bf0d5f50SAlex Elder 1755bf0d5f50SAlex Elder osd_req->r_callback = rbd_osd_req_callback; 1756bf0d5f50SAlex Elder osd_req->r_priv = obj_request; 1757bf0d5f50SAlex Elder 1758bf0d5f50SAlex Elder osd_req->r_oid_len = strlen(obj_request->object_name); 1759bf0d5f50SAlex Elder rbd_assert(osd_req->r_oid_len < sizeof (osd_req->r_oid)); 1760bf0d5f50SAlex Elder memcpy(osd_req->r_oid, obj_request->object_name, osd_req->r_oid_len); 1761bf0d5f50SAlex Elder 1762bf0d5f50SAlex Elder osd_req->r_file_layout = rbd_dev->layout; /* struct */ 1763bf0d5f50SAlex Elder 1764bf0d5f50SAlex Elder return osd_req; 1765bf0d5f50SAlex Elder } 1766bf0d5f50SAlex Elder 17670eefd470SAlex Elder /* 17680eefd470SAlex Elder * Create a copyup osd request based on the information in the 17690eefd470SAlex Elder * object request supplied. A copyup request has two osd ops, 17700eefd470SAlex Elder * a copyup method call, and a "normal" write request. 
17710eefd470SAlex Elder */ 17720eefd470SAlex Elder static struct ceph_osd_request * 17730eefd470SAlex Elder rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request) 17740eefd470SAlex Elder { 17750eefd470SAlex Elder struct rbd_img_request *img_request; 17760eefd470SAlex Elder struct ceph_snap_context *snapc; 17770eefd470SAlex Elder struct rbd_device *rbd_dev; 17780eefd470SAlex Elder struct ceph_osd_client *osdc; 17790eefd470SAlex Elder struct ceph_osd_request *osd_req; 17800eefd470SAlex Elder 17810eefd470SAlex Elder rbd_assert(obj_request_img_data_test(obj_request)); 17820eefd470SAlex Elder img_request = obj_request->img_request; 17830eefd470SAlex Elder rbd_assert(img_request); 17840eefd470SAlex Elder rbd_assert(img_request_write_test(img_request)); 17850eefd470SAlex Elder 17860eefd470SAlex Elder /* Allocate and initialize the request, for the two ops */ 17870eefd470SAlex Elder 17880eefd470SAlex Elder snapc = img_request->snapc; 17890eefd470SAlex Elder rbd_dev = img_request->rbd_dev; 17900eefd470SAlex Elder osdc = &rbd_dev->rbd_client->client->osdc; 17910eefd470SAlex Elder osd_req = ceph_osdc_alloc_request(osdc, snapc, 2, false, GFP_ATOMIC); 17920eefd470SAlex Elder if (!osd_req) 17930eefd470SAlex Elder return NULL; /* ENOMEM */ 17940eefd470SAlex Elder 17950eefd470SAlex Elder osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK; 17960eefd470SAlex Elder osd_req->r_callback = rbd_osd_req_callback; 17970eefd470SAlex Elder osd_req->r_priv = obj_request; 17980eefd470SAlex Elder 17990eefd470SAlex Elder osd_req->r_oid_len = strlen(obj_request->object_name); 18000eefd470SAlex Elder rbd_assert(osd_req->r_oid_len < sizeof (osd_req->r_oid)); 18010eefd470SAlex Elder memcpy(osd_req->r_oid, obj_request->object_name, osd_req->r_oid_len); 18020eefd470SAlex Elder 18030eefd470SAlex Elder osd_req->r_file_layout = rbd_dev->layout; /* struct */ 18040eefd470SAlex Elder 18050eefd470SAlex Elder return osd_req; 18060eefd470SAlex Elder } 18070eefd470SAlex Elder 
18080eefd470SAlex Elder 1809bf0d5f50SAlex Elder static void rbd_osd_req_destroy(struct ceph_osd_request *osd_req) 1810bf0d5f50SAlex Elder { 1811bf0d5f50SAlex Elder ceph_osdc_put_request(osd_req); 1812bf0d5f50SAlex Elder } 1813bf0d5f50SAlex Elder 1814bf0d5f50SAlex Elder /* object_name is assumed to be a non-null pointer and NUL-terminated */ 1815bf0d5f50SAlex Elder 1816bf0d5f50SAlex Elder static struct rbd_obj_request *rbd_obj_request_create(const char *object_name, 1817bf0d5f50SAlex Elder u64 offset, u64 length, 1818bf0d5f50SAlex Elder enum obj_request_type type) 1819bf0d5f50SAlex Elder { 1820bf0d5f50SAlex Elder struct rbd_obj_request *obj_request; 1821bf0d5f50SAlex Elder size_t size; 1822bf0d5f50SAlex Elder char *name; 1823bf0d5f50SAlex Elder 1824bf0d5f50SAlex Elder rbd_assert(obj_request_type_valid(type)); 1825bf0d5f50SAlex Elder 1826bf0d5f50SAlex Elder size = strlen(object_name) + 1; 1827f907ad55SAlex Elder name = kmalloc(size, GFP_KERNEL); 1828f907ad55SAlex Elder if (!name) 1829bf0d5f50SAlex Elder return NULL; 1830bf0d5f50SAlex Elder 1831868311b1SAlex Elder obj_request = kmem_cache_zalloc(rbd_obj_request_cache, GFP_KERNEL); 1832f907ad55SAlex Elder if (!obj_request) { 1833f907ad55SAlex Elder kfree(name); 1834f907ad55SAlex Elder return NULL; 1835f907ad55SAlex Elder } 1836f907ad55SAlex Elder 1837bf0d5f50SAlex Elder obj_request->object_name = memcpy(name, object_name, size); 1838bf0d5f50SAlex Elder obj_request->offset = offset; 1839bf0d5f50SAlex Elder obj_request->length = length; 1840926f9b3fSAlex Elder obj_request->flags = 0; 1841bf0d5f50SAlex Elder obj_request->which = BAD_WHICH; 1842bf0d5f50SAlex Elder obj_request->type = type; 1843bf0d5f50SAlex Elder INIT_LIST_HEAD(&obj_request->links); 1844788e2df3SAlex Elder init_completion(&obj_request->completion); 1845bf0d5f50SAlex Elder kref_init(&obj_request->kref); 1846bf0d5f50SAlex Elder 184737206ee5SAlex Elder dout("%s: \"%s\" %llu/%llu %d -> obj %p\n", __func__, object_name, 184837206ee5SAlex Elder offset, length, 
(int)type, obj_request); 184937206ee5SAlex Elder 1850bf0d5f50SAlex Elder return obj_request; 1851bf0d5f50SAlex Elder } 1852bf0d5f50SAlex Elder 1853bf0d5f50SAlex Elder static void rbd_obj_request_destroy(struct kref *kref) 1854bf0d5f50SAlex Elder { 1855bf0d5f50SAlex Elder struct rbd_obj_request *obj_request; 1856bf0d5f50SAlex Elder 1857bf0d5f50SAlex Elder obj_request = container_of(kref, struct rbd_obj_request, kref); 1858bf0d5f50SAlex Elder 185937206ee5SAlex Elder dout("%s: obj %p\n", __func__, obj_request); 186037206ee5SAlex Elder 1861bf0d5f50SAlex Elder rbd_assert(obj_request->img_request == NULL); 1862bf0d5f50SAlex Elder rbd_assert(obj_request->which == BAD_WHICH); 1863bf0d5f50SAlex Elder 1864bf0d5f50SAlex Elder if (obj_request->osd_req) 1865bf0d5f50SAlex Elder rbd_osd_req_destroy(obj_request->osd_req); 1866bf0d5f50SAlex Elder 1867bf0d5f50SAlex Elder rbd_assert(obj_request_type_valid(obj_request->type)); 1868bf0d5f50SAlex Elder switch (obj_request->type) { 18699969ebc5SAlex Elder case OBJ_REQUEST_NODATA: 18709969ebc5SAlex Elder break; /* Nothing to do */ 1871bf0d5f50SAlex Elder case OBJ_REQUEST_BIO: 1872bf0d5f50SAlex Elder if (obj_request->bio_list) 1873bf0d5f50SAlex Elder bio_chain_put(obj_request->bio_list); 1874bf0d5f50SAlex Elder break; 1875788e2df3SAlex Elder case OBJ_REQUEST_PAGES: 1876788e2df3SAlex Elder if (obj_request->pages) 1877788e2df3SAlex Elder ceph_release_page_vector(obj_request->pages, 1878788e2df3SAlex Elder obj_request->page_count); 1879788e2df3SAlex Elder break; 1880bf0d5f50SAlex Elder } 1881bf0d5f50SAlex Elder 1882f907ad55SAlex Elder kfree(obj_request->object_name); 1883868311b1SAlex Elder obj_request->object_name = NULL; 1884868311b1SAlex Elder kmem_cache_free(rbd_obj_request_cache, obj_request); 1885bf0d5f50SAlex Elder } 1886bf0d5f50SAlex Elder 1887fb65d228SAlex Elder /* It's OK to call this for a device with no parent */ 1888fb65d228SAlex Elder 1889fb65d228SAlex Elder static void rbd_spec_put(struct rbd_spec *spec); 1890fb65d228SAlex 
Elder static void rbd_dev_unparent(struct rbd_device *rbd_dev) 1891fb65d228SAlex Elder { 1892fb65d228SAlex Elder rbd_dev_remove_parent(rbd_dev); 1893fb65d228SAlex Elder rbd_spec_put(rbd_dev->parent_spec); 1894fb65d228SAlex Elder rbd_dev->parent_spec = NULL; 1895fb65d228SAlex Elder rbd_dev->parent_overlap = 0; 1896fb65d228SAlex Elder } 1897fb65d228SAlex Elder 1898bf0d5f50SAlex Elder /* 1899a2acd00eSAlex Elder * Parent image reference counting is used to determine when an 1900a2acd00eSAlex Elder * image's parent fields can be safely torn down--after there are no 1901a2acd00eSAlex Elder * more in-flight requests to the parent image. When the last 1902a2acd00eSAlex Elder * reference is dropped, cleaning them up is safe. 1903a2acd00eSAlex Elder */ 1904a2acd00eSAlex Elder static void rbd_dev_parent_put(struct rbd_device *rbd_dev) 1905a2acd00eSAlex Elder { 1906a2acd00eSAlex Elder int counter; 1907a2acd00eSAlex Elder 1908a2acd00eSAlex Elder if (!rbd_dev->parent_spec) 1909a2acd00eSAlex Elder return; 1910a2acd00eSAlex Elder 1911a2acd00eSAlex Elder counter = atomic_dec_return_safe(&rbd_dev->parent_ref); 1912a2acd00eSAlex Elder if (counter > 0) 1913a2acd00eSAlex Elder return; 1914a2acd00eSAlex Elder 1915a2acd00eSAlex Elder /* Last reference; clean up parent data structures */ 1916a2acd00eSAlex Elder 1917a2acd00eSAlex Elder if (!counter) 1918a2acd00eSAlex Elder rbd_dev_unparent(rbd_dev); 1919a2acd00eSAlex Elder else 1920a2acd00eSAlex Elder rbd_warn(rbd_dev, "parent reference underflow\n"); 1921a2acd00eSAlex Elder } 1922a2acd00eSAlex Elder 1923a2acd00eSAlex Elder /* 1924a2acd00eSAlex Elder * If an image has a non-zero parent overlap, get a reference to its 1925a2acd00eSAlex Elder * parent. 1926a2acd00eSAlex Elder * 1927392a9dadSAlex Elder * We must get the reference before checking for the overlap to 1928392a9dadSAlex Elder * coordinate properly with zeroing the parent overlap in 1929392a9dadSAlex Elder * rbd_dev_v2_parent_info() when an image gets flattened. 
We 1930392a9dadSAlex Elder * drop it again if there is no overlap. 1931392a9dadSAlex Elder * 1932a2acd00eSAlex Elder * Returns true if the rbd device has a parent with a non-zero 1933a2acd00eSAlex Elder * overlap and a reference for it was successfully taken, or 1934a2acd00eSAlex Elder * false otherwise. 1935a2acd00eSAlex Elder */ 1936a2acd00eSAlex Elder static bool rbd_dev_parent_get(struct rbd_device *rbd_dev) 1937a2acd00eSAlex Elder { 1938a2acd00eSAlex Elder int counter; 1939a2acd00eSAlex Elder 1940a2acd00eSAlex Elder if (!rbd_dev->parent_spec) 1941a2acd00eSAlex Elder return false; 1942a2acd00eSAlex Elder 1943a2acd00eSAlex Elder counter = atomic_inc_return_safe(&rbd_dev->parent_ref); 1944a2acd00eSAlex Elder if (counter > 0 && rbd_dev->parent_overlap) 1945a2acd00eSAlex Elder return true; 1946a2acd00eSAlex Elder 1947a2acd00eSAlex Elder /* Image was flattened, but parent is not yet torn down */ 1948a2acd00eSAlex Elder 1949a2acd00eSAlex Elder if (counter < 0) 1950a2acd00eSAlex Elder rbd_warn(rbd_dev, "parent reference overflow\n"); 1951a2acd00eSAlex Elder 1952a2acd00eSAlex Elder return false; 1953a2acd00eSAlex Elder } 1954a2acd00eSAlex Elder 1955bf0d5f50SAlex Elder /* 1956bf0d5f50SAlex Elder * Caller is responsible for filling in the list of object requests 1957bf0d5f50SAlex Elder * that comprises the image request, and the Linux request pointer 1958bf0d5f50SAlex Elder * (if there is one). 
1959bf0d5f50SAlex Elder */ 1960cc344fa1SAlex Elder static struct rbd_img_request *rbd_img_request_create( 1961cc344fa1SAlex Elder struct rbd_device *rbd_dev, 1962bf0d5f50SAlex Elder u64 offset, u64 length, 1963e93f3152SAlex Elder bool write_request) 1964bf0d5f50SAlex Elder { 1965bf0d5f50SAlex Elder struct rbd_img_request *img_request; 1966bf0d5f50SAlex Elder 19671c2a9dfeSAlex Elder img_request = kmem_cache_alloc(rbd_img_request_cache, GFP_ATOMIC); 1968bf0d5f50SAlex Elder if (!img_request) 1969bf0d5f50SAlex Elder return NULL; 1970bf0d5f50SAlex Elder 1971bf0d5f50SAlex Elder if (write_request) { 1972bf0d5f50SAlex Elder down_read(&rbd_dev->header_rwsem); 1973812164f8SAlex Elder ceph_get_snap_context(rbd_dev->header.snapc); 1974bf0d5f50SAlex Elder up_read(&rbd_dev->header_rwsem); 1975bf0d5f50SAlex Elder } 1976bf0d5f50SAlex Elder 1977bf0d5f50SAlex Elder img_request->rq = NULL; 1978bf0d5f50SAlex Elder img_request->rbd_dev = rbd_dev; 1979bf0d5f50SAlex Elder img_request->offset = offset; 1980bf0d5f50SAlex Elder img_request->length = length; 19810c425248SAlex Elder img_request->flags = 0; 19820c425248SAlex Elder if (write_request) { 19830c425248SAlex Elder img_request_write_set(img_request); 1984468521c1SAlex Elder img_request->snapc = rbd_dev->header.snapc; 19850c425248SAlex Elder } else { 1986bf0d5f50SAlex Elder img_request->snap_id = rbd_dev->spec->snap_id; 19870c425248SAlex Elder } 1988a2acd00eSAlex Elder if (rbd_dev_parent_get(rbd_dev)) 1989d0b2e944SAlex Elder img_request_layered_set(img_request); 1990bf0d5f50SAlex Elder spin_lock_init(&img_request->completion_lock); 1991bf0d5f50SAlex Elder img_request->next_completion = 0; 1992bf0d5f50SAlex Elder img_request->callback = NULL; 1993a5a337d4SAlex Elder img_request->result = 0; 1994bf0d5f50SAlex Elder img_request->obj_request_count = 0; 1995bf0d5f50SAlex Elder INIT_LIST_HEAD(&img_request->obj_requests); 1996bf0d5f50SAlex Elder kref_init(&img_request->kref); 1997bf0d5f50SAlex Elder 199837206ee5SAlex Elder dout("%s: rbd_dev 
%p %s %llu/%llu -> img %p\n", __func__, rbd_dev, 199937206ee5SAlex Elder write_request ? "write" : "read", offset, length, 200037206ee5SAlex Elder img_request); 200137206ee5SAlex Elder 2002bf0d5f50SAlex Elder return img_request; 2003bf0d5f50SAlex Elder } 2004bf0d5f50SAlex Elder 2005bf0d5f50SAlex Elder static void rbd_img_request_destroy(struct kref *kref) 2006bf0d5f50SAlex Elder { 2007bf0d5f50SAlex Elder struct rbd_img_request *img_request; 2008bf0d5f50SAlex Elder struct rbd_obj_request *obj_request; 2009bf0d5f50SAlex Elder struct rbd_obj_request *next_obj_request; 2010bf0d5f50SAlex Elder 2011bf0d5f50SAlex Elder img_request = container_of(kref, struct rbd_img_request, kref); 2012bf0d5f50SAlex Elder 201337206ee5SAlex Elder dout("%s: img %p\n", __func__, img_request); 201437206ee5SAlex Elder 2015bf0d5f50SAlex Elder for_each_obj_request_safe(img_request, obj_request, next_obj_request) 2016bf0d5f50SAlex Elder rbd_img_obj_request_del(img_request, obj_request); 201725dcf954SAlex Elder rbd_assert(img_request->obj_request_count == 0); 2018bf0d5f50SAlex Elder 2019a2acd00eSAlex Elder if (img_request_layered_test(img_request)) { 2020a2acd00eSAlex Elder img_request_layered_clear(img_request); 2021a2acd00eSAlex Elder rbd_dev_parent_put(img_request->rbd_dev); 2022a2acd00eSAlex Elder } 2023a2acd00eSAlex Elder 20240c425248SAlex Elder if (img_request_write_test(img_request)) 2025812164f8SAlex Elder ceph_put_snap_context(img_request->snapc); 2026bf0d5f50SAlex Elder 20271c2a9dfeSAlex Elder kmem_cache_free(rbd_img_request_cache, img_request); 2028bf0d5f50SAlex Elder } 2029bf0d5f50SAlex Elder 2030e93f3152SAlex Elder static struct rbd_img_request *rbd_parent_request_create( 2031e93f3152SAlex Elder struct rbd_obj_request *obj_request, 2032e93f3152SAlex Elder u64 img_offset, u64 length) 2033e93f3152SAlex Elder { 2034e93f3152SAlex Elder struct rbd_img_request *parent_request; 2035e93f3152SAlex Elder struct rbd_device *rbd_dev; 2036e93f3152SAlex Elder 2037e93f3152SAlex Elder 
rbd_assert(obj_request->img_request); 2038e93f3152SAlex Elder rbd_dev = obj_request->img_request->rbd_dev; 2039e93f3152SAlex Elder 2040e93f3152SAlex Elder parent_request = rbd_img_request_create(rbd_dev->parent, 2041e93f3152SAlex Elder img_offset, length, false); 2042e93f3152SAlex Elder if (!parent_request) 2043e93f3152SAlex Elder return NULL; 2044e93f3152SAlex Elder 2045e93f3152SAlex Elder img_request_child_set(parent_request); 2046e93f3152SAlex Elder rbd_obj_request_get(obj_request); 2047e93f3152SAlex Elder parent_request->obj_request = obj_request; 2048e93f3152SAlex Elder 2049e93f3152SAlex Elder return parent_request; 2050e93f3152SAlex Elder } 2051e93f3152SAlex Elder 2052e93f3152SAlex Elder static void rbd_parent_request_destroy(struct kref *kref) 2053e93f3152SAlex Elder { 2054e93f3152SAlex Elder struct rbd_img_request *parent_request; 2055e93f3152SAlex Elder struct rbd_obj_request *orig_request; 2056e93f3152SAlex Elder 2057e93f3152SAlex Elder parent_request = container_of(kref, struct rbd_img_request, kref); 2058e93f3152SAlex Elder orig_request = parent_request->obj_request; 2059e93f3152SAlex Elder 2060e93f3152SAlex Elder parent_request->obj_request = NULL; 2061e93f3152SAlex Elder rbd_obj_request_put(orig_request); 2062e93f3152SAlex Elder img_request_child_clear(parent_request); 2063e93f3152SAlex Elder 2064e93f3152SAlex Elder rbd_img_request_destroy(kref); 2065e93f3152SAlex Elder } 2066e93f3152SAlex Elder 20671217857fSAlex Elder static bool rbd_img_obj_end_request(struct rbd_obj_request *obj_request) 20681217857fSAlex Elder { 20696365d33aSAlex Elder struct rbd_img_request *img_request; 20701217857fSAlex Elder unsigned int xferred; 20711217857fSAlex Elder int result; 20728b3e1a56SAlex Elder bool more; 20731217857fSAlex Elder 20746365d33aSAlex Elder rbd_assert(obj_request_img_data_test(obj_request)); 20756365d33aSAlex Elder img_request = obj_request->img_request; 20766365d33aSAlex Elder 20771217857fSAlex Elder rbd_assert(obj_request->xferred <= (u64)UINT_MAX); 
20781217857fSAlex Elder xferred = (unsigned int)obj_request->xferred; 20791217857fSAlex Elder result = obj_request->result; 20801217857fSAlex Elder if (result) { 20811217857fSAlex Elder struct rbd_device *rbd_dev = img_request->rbd_dev; 20821217857fSAlex Elder 20831217857fSAlex Elder rbd_warn(rbd_dev, "%s %llx at %llx (%llx)\n", 20841217857fSAlex Elder img_request_write_test(img_request) ? "write" : "read", 20851217857fSAlex Elder obj_request->length, obj_request->img_offset, 20861217857fSAlex Elder obj_request->offset); 20871217857fSAlex Elder rbd_warn(rbd_dev, " result %d xferred %x\n", 20881217857fSAlex Elder result, xferred); 20891217857fSAlex Elder if (!img_request->result) 20901217857fSAlex Elder img_request->result = result; 20911217857fSAlex Elder } 20921217857fSAlex Elder 2093f1a4739fSAlex Elder /* Image object requests don't own their page array */ 2094f1a4739fSAlex Elder 2095f1a4739fSAlex Elder if (obj_request->type == OBJ_REQUEST_PAGES) { 2096f1a4739fSAlex Elder obj_request->pages = NULL; 2097f1a4739fSAlex Elder obj_request->page_count = 0; 2098f1a4739fSAlex Elder } 2099f1a4739fSAlex Elder 21008b3e1a56SAlex Elder if (img_request_child_test(img_request)) { 21018b3e1a56SAlex Elder rbd_assert(img_request->obj_request != NULL); 21028b3e1a56SAlex Elder more = obj_request->which < img_request->obj_request_count - 1; 21038b3e1a56SAlex Elder } else { 21048b3e1a56SAlex Elder rbd_assert(img_request->rq != NULL); 21058b3e1a56SAlex Elder more = blk_end_request(img_request->rq, result, xferred); 21068b3e1a56SAlex Elder } 21078b3e1a56SAlex Elder 21088b3e1a56SAlex Elder return more; 21091217857fSAlex Elder } 21101217857fSAlex Elder 21112169238dSAlex Elder static void rbd_img_obj_callback(struct rbd_obj_request *obj_request) 21122169238dSAlex Elder { 21132169238dSAlex Elder struct rbd_img_request *img_request; 21142169238dSAlex Elder u32 which = obj_request->which; 21152169238dSAlex Elder bool more = true; 21162169238dSAlex Elder 21176365d33aSAlex Elder 
rbd_assert(obj_request_img_data_test(obj_request)); 21182169238dSAlex Elder img_request = obj_request->img_request; 21192169238dSAlex Elder 21202169238dSAlex Elder dout("%s: img %p obj %p\n", __func__, img_request, obj_request); 21212169238dSAlex Elder rbd_assert(img_request != NULL); 21222169238dSAlex Elder rbd_assert(img_request->obj_request_count > 0); 21232169238dSAlex Elder rbd_assert(which != BAD_WHICH); 21242169238dSAlex Elder rbd_assert(which < img_request->obj_request_count); 21252169238dSAlex Elder rbd_assert(which >= img_request->next_completion); 21262169238dSAlex Elder 21272169238dSAlex Elder spin_lock_irq(&img_request->completion_lock); 21282169238dSAlex Elder if (which != img_request->next_completion) 21292169238dSAlex Elder goto out; 21302169238dSAlex Elder 21312169238dSAlex Elder for_each_obj_request_from(img_request, obj_request) { 21322169238dSAlex Elder rbd_assert(more); 21332169238dSAlex Elder rbd_assert(which < img_request->obj_request_count); 21342169238dSAlex Elder 21352169238dSAlex Elder if (!obj_request_done_test(obj_request)) 21362169238dSAlex Elder break; 21371217857fSAlex Elder more = rbd_img_obj_end_request(obj_request); 21382169238dSAlex Elder which++; 21392169238dSAlex Elder } 21402169238dSAlex Elder 21412169238dSAlex Elder rbd_assert(more ^ (which == img_request->obj_request_count)); 21422169238dSAlex Elder img_request->next_completion = which; 21432169238dSAlex Elder out: 21442169238dSAlex Elder spin_unlock_irq(&img_request->completion_lock); 21452169238dSAlex Elder 21462169238dSAlex Elder if (!more) 21472169238dSAlex Elder rbd_img_request_complete(img_request); 21482169238dSAlex Elder } 21492169238dSAlex Elder 2150f1a4739fSAlex Elder /* 2151f1a4739fSAlex Elder * Split up an image request into one or more object requests, each 2152f1a4739fSAlex Elder * to a different object. 
The "type" parameter indicates whether 2153f1a4739fSAlex Elder * "data_desc" is the pointer to the head of a list of bio 2154f1a4739fSAlex Elder * structures, or the base of a page array. In either case this 2155f1a4739fSAlex Elder * function assumes data_desc describes memory sufficient to hold 2156f1a4739fSAlex Elder * all data described by the image request. 2157f1a4739fSAlex Elder */ 2158f1a4739fSAlex Elder static int rbd_img_request_fill(struct rbd_img_request *img_request, 2159f1a4739fSAlex Elder enum obj_request_type type, 2160f1a4739fSAlex Elder void *data_desc) 2161bf0d5f50SAlex Elder { 2162bf0d5f50SAlex Elder struct rbd_device *rbd_dev = img_request->rbd_dev; 2163bf0d5f50SAlex Elder struct rbd_obj_request *obj_request = NULL; 2164bf0d5f50SAlex Elder struct rbd_obj_request *next_obj_request; 21650c425248SAlex Elder bool write_request = img_request_write_test(img_request); 2166a158073cSJingoo Han struct bio *bio_list = NULL; 2167f1a4739fSAlex Elder unsigned int bio_offset = 0; 2168a158073cSJingoo Han struct page **pages = NULL; 21697da22d29SAlex Elder u64 img_offset; 2170bf0d5f50SAlex Elder u64 resid; 2171bf0d5f50SAlex Elder u16 opcode; 2172bf0d5f50SAlex Elder 2173f1a4739fSAlex Elder dout("%s: img %p type %d data_desc %p\n", __func__, img_request, 2174f1a4739fSAlex Elder (int)type, data_desc); 217537206ee5SAlex Elder 2176430c28c3SAlex Elder opcode = write_request ? 
CEPH_OSD_OP_WRITE : CEPH_OSD_OP_READ; 21777da22d29SAlex Elder img_offset = img_request->offset; 2178bf0d5f50SAlex Elder resid = img_request->length; 21794dda41d3SAlex Elder rbd_assert(resid > 0); 2180f1a4739fSAlex Elder 2181f1a4739fSAlex Elder if (type == OBJ_REQUEST_BIO) { 2182f1a4739fSAlex Elder bio_list = data_desc; 2183f1a4739fSAlex Elder rbd_assert(img_offset == bio_list->bi_sector << SECTOR_SHIFT); 2184f1a4739fSAlex Elder } else { 2185f1a4739fSAlex Elder rbd_assert(type == OBJ_REQUEST_PAGES); 2186f1a4739fSAlex Elder pages = data_desc; 2187f1a4739fSAlex Elder } 2188f1a4739fSAlex Elder 2189bf0d5f50SAlex Elder while (resid) { 21902fa12320SAlex Elder struct ceph_osd_request *osd_req; 2191bf0d5f50SAlex Elder const char *object_name; 2192bf0d5f50SAlex Elder u64 offset; 2193bf0d5f50SAlex Elder u64 length; 2194bf0d5f50SAlex Elder 21957da22d29SAlex Elder object_name = rbd_segment_name(rbd_dev, img_offset); 2196bf0d5f50SAlex Elder if (!object_name) 2197bf0d5f50SAlex Elder goto out_unwind; 21987da22d29SAlex Elder offset = rbd_segment_offset(rbd_dev, img_offset); 21997da22d29SAlex Elder length = rbd_segment_length(rbd_dev, img_offset, resid); 2200bf0d5f50SAlex Elder obj_request = rbd_obj_request_create(object_name, 2201f1a4739fSAlex Elder offset, length, type); 220278c2a44aSAlex Elder /* object request has its own copy of the object name */ 220378c2a44aSAlex Elder rbd_segment_name_free(object_name); 2204bf0d5f50SAlex Elder if (!obj_request) 2205bf0d5f50SAlex Elder goto out_unwind; 220603507db6SJosh Durgin /* 220703507db6SJosh Durgin * set obj_request->img_request before creating the 220803507db6SJosh Durgin * osd_request so that it gets the right snapc 220903507db6SJosh Durgin */ 221003507db6SJosh Durgin rbd_img_obj_request_add(img_request, obj_request); 2211bf0d5f50SAlex Elder 2212f1a4739fSAlex Elder if (type == OBJ_REQUEST_BIO) { 2213f1a4739fSAlex Elder unsigned int clone_size; 2214f1a4739fSAlex Elder 2215bf0d5f50SAlex Elder rbd_assert(length <= (u64)UINT_MAX); 
2216bf0d5f50SAlex Elder clone_size = (unsigned int)length; 2217f1a4739fSAlex Elder obj_request->bio_list = 2218f1a4739fSAlex Elder bio_chain_clone_range(&bio_list, 2219f1a4739fSAlex Elder &bio_offset, 2220f1a4739fSAlex Elder clone_size, 2221bf0d5f50SAlex Elder GFP_ATOMIC); 2222bf0d5f50SAlex Elder if (!obj_request->bio_list) 2223bf0d5f50SAlex Elder goto out_partial; 2224f1a4739fSAlex Elder } else { 2225f1a4739fSAlex Elder unsigned int page_count; 2226f1a4739fSAlex Elder 2227f1a4739fSAlex Elder obj_request->pages = pages; 2228f1a4739fSAlex Elder page_count = (u32)calc_pages_for(offset, length); 2229f1a4739fSAlex Elder obj_request->page_count = page_count; 2230f1a4739fSAlex Elder if ((offset + length) & ~PAGE_MASK) 2231f1a4739fSAlex Elder page_count--; /* more on last page */ 2232f1a4739fSAlex Elder pages += page_count; 2233f1a4739fSAlex Elder } 2234bf0d5f50SAlex Elder 22352fa12320SAlex Elder osd_req = rbd_osd_req_create(rbd_dev, write_request, 22362fa12320SAlex Elder obj_request); 22372fa12320SAlex Elder if (!osd_req) 2238bf0d5f50SAlex Elder goto out_partial; 22392fa12320SAlex Elder obj_request->osd_req = osd_req; 22402169238dSAlex Elder obj_request->callback = rbd_img_obj_callback; 2241430c28c3SAlex Elder 22422fa12320SAlex Elder osd_req_op_extent_init(osd_req, 0, opcode, offset, length, 22432fa12320SAlex Elder 0, 0); 2244f1a4739fSAlex Elder if (type == OBJ_REQUEST_BIO) 2245406e2c9fSAlex Elder osd_req_op_extent_osd_data_bio(osd_req, 0, 2246f1a4739fSAlex Elder obj_request->bio_list, length); 2247f1a4739fSAlex Elder else 2248f1a4739fSAlex Elder osd_req_op_extent_osd_data_pages(osd_req, 0, 2249f1a4739fSAlex Elder obj_request->pages, length, 2250f1a4739fSAlex Elder offset & ~PAGE_MASK, false, false); 22519d4df01fSAlex Elder 22529d4df01fSAlex Elder if (write_request) 22539d4df01fSAlex Elder rbd_osd_req_format_write(obj_request); 22549d4df01fSAlex Elder else 22559d4df01fSAlex Elder rbd_osd_req_format_read(obj_request); 2256430c28c3SAlex Elder 22577da22d29SAlex Elder 
obj_request->img_offset = img_offset; 2258bf0d5f50SAlex Elder 22597da22d29SAlex Elder img_offset += length; 2260bf0d5f50SAlex Elder resid -= length; 2261bf0d5f50SAlex Elder } 2262bf0d5f50SAlex Elder 2263bf0d5f50SAlex Elder return 0; 2264bf0d5f50SAlex Elder 2265bf0d5f50SAlex Elder out_partial: 2266bf0d5f50SAlex Elder rbd_obj_request_put(obj_request); 2267bf0d5f50SAlex Elder out_unwind: 2268bf0d5f50SAlex Elder for_each_obj_request_safe(img_request, obj_request, next_obj_request) 2269bf0d5f50SAlex Elder rbd_obj_request_put(obj_request); 2270bf0d5f50SAlex Elder 2271bf0d5f50SAlex Elder return -ENOMEM; 2272bf0d5f50SAlex Elder } 2273bf0d5f50SAlex Elder 22743d7efd18SAlex Elder static void 22750eefd470SAlex Elder rbd_img_obj_copyup_callback(struct rbd_obj_request *obj_request) 22760eefd470SAlex Elder { 22770eefd470SAlex Elder struct rbd_img_request *img_request; 22780eefd470SAlex Elder struct rbd_device *rbd_dev; 2279ebda6408SAlex Elder struct page **pages; 22800eefd470SAlex Elder u32 page_count; 22810eefd470SAlex Elder 22820eefd470SAlex Elder rbd_assert(obj_request->type == OBJ_REQUEST_BIO); 22830eefd470SAlex Elder rbd_assert(obj_request_img_data_test(obj_request)); 22840eefd470SAlex Elder img_request = obj_request->img_request; 22850eefd470SAlex Elder rbd_assert(img_request); 22860eefd470SAlex Elder 22870eefd470SAlex Elder rbd_dev = img_request->rbd_dev; 22880eefd470SAlex Elder rbd_assert(rbd_dev); 22890eefd470SAlex Elder 2290ebda6408SAlex Elder pages = obj_request->copyup_pages; 2291ebda6408SAlex Elder rbd_assert(pages != NULL); 22920eefd470SAlex Elder obj_request->copyup_pages = NULL; 2293ebda6408SAlex Elder page_count = obj_request->copyup_page_count; 2294ebda6408SAlex Elder rbd_assert(page_count); 2295ebda6408SAlex Elder obj_request->copyup_page_count = 0; 2296ebda6408SAlex Elder ceph_release_page_vector(pages, page_count); 22970eefd470SAlex Elder 22980eefd470SAlex Elder /* 22990eefd470SAlex Elder * We want the transfer count to reflect the size of the 
23000eefd470SAlex Elder * original write request. There is no such thing as a 23010eefd470SAlex Elder * successful short write, so if the request was successful 23020eefd470SAlex Elder * we can just set it to the originally-requested length. 23030eefd470SAlex Elder */ 23040eefd470SAlex Elder if (!obj_request->result) 23050eefd470SAlex Elder obj_request->xferred = obj_request->length; 23060eefd470SAlex Elder 23070eefd470SAlex Elder /* Finish up with the normal image object callback */ 23080eefd470SAlex Elder 23090eefd470SAlex Elder rbd_img_obj_callback(obj_request); 23100eefd470SAlex Elder } 23110eefd470SAlex Elder 23120eefd470SAlex Elder static void 23133d7efd18SAlex Elder rbd_img_obj_parent_read_full_callback(struct rbd_img_request *img_request) 23143d7efd18SAlex Elder { 23153d7efd18SAlex Elder struct rbd_obj_request *orig_request; 23160eefd470SAlex Elder struct ceph_osd_request *osd_req; 23170eefd470SAlex Elder struct ceph_osd_client *osdc; 23180eefd470SAlex Elder struct rbd_device *rbd_dev; 23193d7efd18SAlex Elder struct page **pages; 2320ebda6408SAlex Elder u32 page_count; 2321bbea1c1aSAlex Elder int img_result; 2322ebda6408SAlex Elder u64 parent_length; 2323b91f09f1SAlex Elder u64 offset; 2324b91f09f1SAlex Elder u64 length; 23253d7efd18SAlex Elder 23263d7efd18SAlex Elder rbd_assert(img_request_child_test(img_request)); 23273d7efd18SAlex Elder 23283d7efd18SAlex Elder /* First get what we need from the image request */ 23293d7efd18SAlex Elder 23303d7efd18SAlex Elder pages = img_request->copyup_pages; 23313d7efd18SAlex Elder rbd_assert(pages != NULL); 23323d7efd18SAlex Elder img_request->copyup_pages = NULL; 2333ebda6408SAlex Elder page_count = img_request->copyup_page_count; 2334ebda6408SAlex Elder rbd_assert(page_count); 2335ebda6408SAlex Elder img_request->copyup_page_count = 0; 23363d7efd18SAlex Elder 23373d7efd18SAlex Elder orig_request = img_request->obj_request; 23383d7efd18SAlex Elder rbd_assert(orig_request != NULL); 2339b91f09f1SAlex Elder 
rbd_assert(obj_request_type_valid(orig_request->type)); 2340bbea1c1aSAlex Elder img_result = img_request->result; 2341ebda6408SAlex Elder parent_length = img_request->length; 2342ebda6408SAlex Elder rbd_assert(parent_length == img_request->xferred); 23433d7efd18SAlex Elder rbd_img_request_put(img_request); 23443d7efd18SAlex Elder 234591c6febbSAlex Elder rbd_assert(orig_request->img_request); 234691c6febbSAlex Elder rbd_dev = orig_request->img_request->rbd_dev; 23473d7efd18SAlex Elder rbd_assert(rbd_dev); 23483d7efd18SAlex Elder 2349bbea1c1aSAlex Elder /* 2350bbea1c1aSAlex Elder * If the overlap has become 0 (most likely because the 2351bbea1c1aSAlex Elder * image has been flattened) we need to free the pages 2352bbea1c1aSAlex Elder * and re-submit the original write request. 2353bbea1c1aSAlex Elder */ 2354bbea1c1aSAlex Elder if (!rbd_dev->parent_overlap) { 2355bbea1c1aSAlex Elder struct ceph_osd_client *osdc; 2356bbea1c1aSAlex Elder 2357bbea1c1aSAlex Elder ceph_release_page_vector(pages, page_count); 2358bbea1c1aSAlex Elder osdc = &rbd_dev->rbd_client->client->osdc; 2359bbea1c1aSAlex Elder img_result = rbd_obj_request_submit(osdc, orig_request); 2360bbea1c1aSAlex Elder if (!img_result) 2361bbea1c1aSAlex Elder return; 2362bbea1c1aSAlex Elder } 2363bbea1c1aSAlex Elder 2364bbea1c1aSAlex Elder if (img_result) 23650eefd470SAlex Elder goto out_err; 23663d7efd18SAlex Elder 23678785b1d4SAlex Elder /* 23688785b1d4SAlex Elder * The original osd request is of no use to use any more. 23698785b1d4SAlex Elder * We need a new one that can hold the two ops in a copyup 23708785b1d4SAlex Elder * request. Allocate the new copyup osd request for the 23718785b1d4SAlex Elder * original request, and release the old one. 
23728785b1d4SAlex Elder */ 2373bbea1c1aSAlex Elder img_result = -ENOMEM; 23740eefd470SAlex Elder osd_req = rbd_osd_req_create_copyup(orig_request); 23750eefd470SAlex Elder if (!osd_req) 23760eefd470SAlex Elder goto out_err; 23778785b1d4SAlex Elder rbd_osd_req_destroy(orig_request->osd_req); 23780eefd470SAlex Elder orig_request->osd_req = osd_req; 23790eefd470SAlex Elder orig_request->copyup_pages = pages; 2380ebda6408SAlex Elder orig_request->copyup_page_count = page_count; 23813d7efd18SAlex Elder 23820eefd470SAlex Elder /* Initialize the copyup op */ 23830eefd470SAlex Elder 23840eefd470SAlex Elder osd_req_op_cls_init(osd_req, 0, CEPH_OSD_OP_CALL, "rbd", "copyup"); 2385ebda6408SAlex Elder osd_req_op_cls_request_data_pages(osd_req, 0, pages, parent_length, 0, 23860eefd470SAlex Elder false, false); 23870eefd470SAlex Elder 23880eefd470SAlex Elder /* Then the original write request op */ 23890eefd470SAlex Elder 2390b91f09f1SAlex Elder offset = orig_request->offset; 2391b91f09f1SAlex Elder length = orig_request->length; 23920eefd470SAlex Elder osd_req_op_extent_init(osd_req, 1, CEPH_OSD_OP_WRITE, 2393b91f09f1SAlex Elder offset, length, 0, 0); 2394b91f09f1SAlex Elder if (orig_request->type == OBJ_REQUEST_BIO) 2395b91f09f1SAlex Elder osd_req_op_extent_osd_data_bio(osd_req, 1, 2396b91f09f1SAlex Elder orig_request->bio_list, length); 2397b91f09f1SAlex Elder else 2398b91f09f1SAlex Elder osd_req_op_extent_osd_data_pages(osd_req, 1, 2399b91f09f1SAlex Elder orig_request->pages, length, 2400b91f09f1SAlex Elder offset & ~PAGE_MASK, false, false); 24010eefd470SAlex Elder 24020eefd470SAlex Elder rbd_osd_req_format_write(orig_request); 24030eefd470SAlex Elder 24040eefd470SAlex Elder /* All set, send it off. 
*/ 24050eefd470SAlex Elder 24060eefd470SAlex Elder orig_request->callback = rbd_img_obj_copyup_callback; 24070eefd470SAlex Elder osdc = &rbd_dev->rbd_client->client->osdc; 2408bbea1c1aSAlex Elder img_result = rbd_obj_request_submit(osdc, orig_request); 2409bbea1c1aSAlex Elder if (!img_result) 24100eefd470SAlex Elder return; 24110eefd470SAlex Elder out_err: 24120eefd470SAlex Elder /* Record the error code and complete the request */ 24130eefd470SAlex Elder 2414bbea1c1aSAlex Elder orig_request->result = img_result; 24150eefd470SAlex Elder orig_request->xferred = 0; 24163d7efd18SAlex Elder obj_request_done_set(orig_request); 24173d7efd18SAlex Elder rbd_obj_request_complete(orig_request); 24183d7efd18SAlex Elder } 24193d7efd18SAlex Elder 24203d7efd18SAlex Elder /* 24213d7efd18SAlex Elder * Read from the parent image the range of data that covers the 24223d7efd18SAlex Elder * entire target of the given object request. This is used for 24233d7efd18SAlex Elder * satisfying a layered image write request when the target of an 24243d7efd18SAlex Elder * object request from the image request does not exist. 24253d7efd18SAlex Elder * 24263d7efd18SAlex Elder * A page array big enough to hold the returned data is allocated 24273d7efd18SAlex Elder * and supplied to rbd_img_request_fill() as the "data descriptor." 24283d7efd18SAlex Elder * When the read completes, this page array will be transferred to 24293d7efd18SAlex Elder * the original object request for the copyup operation. 24303d7efd18SAlex Elder * 24313d7efd18SAlex Elder * If an error occurs, record it as the result of the original 24323d7efd18SAlex Elder * object request and mark it done so it gets completed. 
24333d7efd18SAlex Elder */ 24343d7efd18SAlex Elder static int rbd_img_obj_parent_read_full(struct rbd_obj_request *obj_request) 24353d7efd18SAlex Elder { 24363d7efd18SAlex Elder struct rbd_img_request *img_request = NULL; 24373d7efd18SAlex Elder struct rbd_img_request *parent_request = NULL; 24383d7efd18SAlex Elder struct rbd_device *rbd_dev; 24393d7efd18SAlex Elder u64 img_offset; 24403d7efd18SAlex Elder u64 length; 24413d7efd18SAlex Elder struct page **pages = NULL; 24423d7efd18SAlex Elder u32 page_count; 24433d7efd18SAlex Elder int result; 24443d7efd18SAlex Elder 24453d7efd18SAlex Elder rbd_assert(obj_request_img_data_test(obj_request)); 2446b91f09f1SAlex Elder rbd_assert(obj_request_type_valid(obj_request->type)); 24473d7efd18SAlex Elder 24483d7efd18SAlex Elder img_request = obj_request->img_request; 24493d7efd18SAlex Elder rbd_assert(img_request != NULL); 24503d7efd18SAlex Elder rbd_dev = img_request->rbd_dev; 24513d7efd18SAlex Elder rbd_assert(rbd_dev->parent != NULL); 24523d7efd18SAlex Elder 24533d7efd18SAlex Elder /* 24543d7efd18SAlex Elder * Determine the byte range covered by the object in the 24553d7efd18SAlex Elder * child image to which the original request was to be sent. 24563d7efd18SAlex Elder */ 24573d7efd18SAlex Elder img_offset = obj_request->img_offset - obj_request->offset; 24583d7efd18SAlex Elder length = (u64)1 << rbd_dev->header.obj_order; 24593d7efd18SAlex Elder 24603d7efd18SAlex Elder /* 2461a9e8ba2cSAlex Elder * There is no defined parent data beyond the parent 2462a9e8ba2cSAlex Elder * overlap, so limit what we read at that boundary if 2463a9e8ba2cSAlex Elder * necessary. 
2464a9e8ba2cSAlex Elder */ 2465a9e8ba2cSAlex Elder if (img_offset + length > rbd_dev->parent_overlap) { 2466a9e8ba2cSAlex Elder rbd_assert(img_offset < rbd_dev->parent_overlap); 2467a9e8ba2cSAlex Elder length = rbd_dev->parent_overlap - img_offset; 2468a9e8ba2cSAlex Elder } 2469a9e8ba2cSAlex Elder 2470a9e8ba2cSAlex Elder /* 24713d7efd18SAlex Elder * Allocate a page array big enough to receive the data read 24723d7efd18SAlex Elder * from the parent. 24733d7efd18SAlex Elder */ 24743d7efd18SAlex Elder page_count = (u32)calc_pages_for(0, length); 24753d7efd18SAlex Elder pages = ceph_alloc_page_vector(page_count, GFP_KERNEL); 24763d7efd18SAlex Elder if (IS_ERR(pages)) { 24773d7efd18SAlex Elder result = PTR_ERR(pages); 24783d7efd18SAlex Elder pages = NULL; 24793d7efd18SAlex Elder goto out_err; 24803d7efd18SAlex Elder } 24813d7efd18SAlex Elder 24823d7efd18SAlex Elder result = -ENOMEM; 2483e93f3152SAlex Elder parent_request = rbd_parent_request_create(obj_request, 2484e93f3152SAlex Elder img_offset, length); 24853d7efd18SAlex Elder if (!parent_request) 24863d7efd18SAlex Elder goto out_err; 24873d7efd18SAlex Elder 24883d7efd18SAlex Elder result = rbd_img_request_fill(parent_request, OBJ_REQUEST_PAGES, pages); 24893d7efd18SAlex Elder if (result) 24903d7efd18SAlex Elder goto out_err; 24913d7efd18SAlex Elder parent_request->copyup_pages = pages; 2492ebda6408SAlex Elder parent_request->copyup_page_count = page_count; 24933d7efd18SAlex Elder 24943d7efd18SAlex Elder parent_request->callback = rbd_img_obj_parent_read_full_callback; 24953d7efd18SAlex Elder result = rbd_img_request_submit(parent_request); 24963d7efd18SAlex Elder if (!result) 24973d7efd18SAlex Elder return 0; 24983d7efd18SAlex Elder 24993d7efd18SAlex Elder parent_request->copyup_pages = NULL; 2500ebda6408SAlex Elder parent_request->copyup_page_count = 0; 25013d7efd18SAlex Elder parent_request->obj_request = NULL; 25023d7efd18SAlex Elder rbd_obj_request_put(obj_request); 25033d7efd18SAlex Elder out_err: 
25043d7efd18SAlex Elder if (pages) 25053d7efd18SAlex Elder ceph_release_page_vector(pages, page_count); 25063d7efd18SAlex Elder if (parent_request) 25073d7efd18SAlex Elder rbd_img_request_put(parent_request); 25083d7efd18SAlex Elder obj_request->result = result; 25093d7efd18SAlex Elder obj_request->xferred = 0; 25103d7efd18SAlex Elder obj_request_done_set(obj_request); 25113d7efd18SAlex Elder 25123d7efd18SAlex Elder return result; 25133d7efd18SAlex Elder } 25143d7efd18SAlex Elder 2515c5b5ef6cSAlex Elder static void rbd_img_obj_exists_callback(struct rbd_obj_request *obj_request) 2516c5b5ef6cSAlex Elder { 2517c5b5ef6cSAlex Elder struct rbd_obj_request *orig_request; 2518638f5abeSAlex Elder struct rbd_device *rbd_dev; 2519c5b5ef6cSAlex Elder int result; 2520c5b5ef6cSAlex Elder 2521c5b5ef6cSAlex Elder rbd_assert(!obj_request_img_data_test(obj_request)); 2522c5b5ef6cSAlex Elder 2523c5b5ef6cSAlex Elder /* 2524c5b5ef6cSAlex Elder * All we need from the object request is the original 2525c5b5ef6cSAlex Elder * request and the result of the STAT op. Grab those, then 2526c5b5ef6cSAlex Elder * we're done with the request. 
2527c5b5ef6cSAlex Elder */ 2528c5b5ef6cSAlex Elder orig_request = obj_request->obj_request; 2529c5b5ef6cSAlex Elder obj_request->obj_request = NULL; 2530912c317dSAlex Elder rbd_obj_request_put(orig_request); 2531c5b5ef6cSAlex Elder rbd_assert(orig_request); 2532c5b5ef6cSAlex Elder rbd_assert(orig_request->img_request); 2533c5b5ef6cSAlex Elder 2534c5b5ef6cSAlex Elder result = obj_request->result; 2535c5b5ef6cSAlex Elder obj_request->result = 0; 2536c5b5ef6cSAlex Elder 2537c5b5ef6cSAlex Elder dout("%s: obj %p for obj %p result %d %llu/%llu\n", __func__, 2538c5b5ef6cSAlex Elder obj_request, orig_request, result, 2539c5b5ef6cSAlex Elder obj_request->xferred, obj_request->length); 2540c5b5ef6cSAlex Elder rbd_obj_request_put(obj_request); 2541c5b5ef6cSAlex Elder 2542638f5abeSAlex Elder /* 2543638f5abeSAlex Elder * If the overlap has become 0 (most likely because the 2544638f5abeSAlex Elder * image has been flattened) we need to free the pages 2545638f5abeSAlex Elder * and re-submit the original write request. 2546638f5abeSAlex Elder */ 2547638f5abeSAlex Elder rbd_dev = orig_request->img_request->rbd_dev; 2548638f5abeSAlex Elder if (!rbd_dev->parent_overlap) { 2549638f5abeSAlex Elder struct ceph_osd_client *osdc; 2550638f5abeSAlex Elder 2551638f5abeSAlex Elder osdc = &rbd_dev->rbd_client->client->osdc; 2552638f5abeSAlex Elder result = rbd_obj_request_submit(osdc, orig_request); 2553638f5abeSAlex Elder if (!result) 2554638f5abeSAlex Elder return; 2555638f5abeSAlex Elder } 2556c5b5ef6cSAlex Elder 2557c5b5ef6cSAlex Elder /* 2558c5b5ef6cSAlex Elder * Our only purpose here is to determine whether the object 2559c5b5ef6cSAlex Elder * exists, and we don't want to treat the non-existence as 2560c5b5ef6cSAlex Elder * an error. If something else comes back, transfer the 2561c5b5ef6cSAlex Elder * error to the original request and complete it now. 
2562c5b5ef6cSAlex Elder */ 2563c5b5ef6cSAlex Elder if (!result) { 2564c5b5ef6cSAlex Elder obj_request_existence_set(orig_request, true); 2565c5b5ef6cSAlex Elder } else if (result == -ENOENT) { 2566c5b5ef6cSAlex Elder obj_request_existence_set(orig_request, false); 2567c5b5ef6cSAlex Elder } else if (result) { 2568c5b5ef6cSAlex Elder orig_request->result = result; 25693d7efd18SAlex Elder goto out; 2570c5b5ef6cSAlex Elder } 2571c5b5ef6cSAlex Elder 2572c5b5ef6cSAlex Elder /* 2573c5b5ef6cSAlex Elder * Resubmit the original request now that we have recorded 2574c5b5ef6cSAlex Elder * whether the target object exists. 2575c5b5ef6cSAlex Elder */ 2576b454e36dSAlex Elder orig_request->result = rbd_img_obj_request_submit(orig_request); 25773d7efd18SAlex Elder out: 2578c5b5ef6cSAlex Elder if (orig_request->result) 2579c5b5ef6cSAlex Elder rbd_obj_request_complete(orig_request); 2580c5b5ef6cSAlex Elder } 2581c5b5ef6cSAlex Elder 2582c5b5ef6cSAlex Elder static int rbd_img_obj_exists_submit(struct rbd_obj_request *obj_request) 2583c5b5ef6cSAlex Elder { 2584c5b5ef6cSAlex Elder struct rbd_obj_request *stat_request; 2585c5b5ef6cSAlex Elder struct rbd_device *rbd_dev; 2586c5b5ef6cSAlex Elder struct ceph_osd_client *osdc; 2587c5b5ef6cSAlex Elder struct page **pages = NULL; 2588c5b5ef6cSAlex Elder u32 page_count; 2589c5b5ef6cSAlex Elder size_t size; 2590c5b5ef6cSAlex Elder int ret; 2591c5b5ef6cSAlex Elder 2592c5b5ef6cSAlex Elder /* 2593c5b5ef6cSAlex Elder * The response data for a STAT call consists of: 2594c5b5ef6cSAlex Elder * le64 length; 2595c5b5ef6cSAlex Elder * struct { 2596c5b5ef6cSAlex Elder * le32 tv_sec; 2597c5b5ef6cSAlex Elder * le32 tv_nsec; 2598c5b5ef6cSAlex Elder * } mtime; 2599c5b5ef6cSAlex Elder */ 2600c5b5ef6cSAlex Elder size = sizeof (__le64) + sizeof (__le32) + sizeof (__le32); 2601c5b5ef6cSAlex Elder page_count = (u32)calc_pages_for(0, size); 2602c5b5ef6cSAlex Elder pages = ceph_alloc_page_vector(page_count, GFP_KERNEL); 2603c5b5ef6cSAlex Elder if (IS_ERR(pages)) 
2604c5b5ef6cSAlex Elder return PTR_ERR(pages); 2605c5b5ef6cSAlex Elder 2606c5b5ef6cSAlex Elder ret = -ENOMEM; 2607c5b5ef6cSAlex Elder stat_request = rbd_obj_request_create(obj_request->object_name, 0, 0, 2608c5b5ef6cSAlex Elder OBJ_REQUEST_PAGES); 2609c5b5ef6cSAlex Elder if (!stat_request) 2610c5b5ef6cSAlex Elder goto out; 2611c5b5ef6cSAlex Elder 2612c5b5ef6cSAlex Elder rbd_obj_request_get(obj_request); 2613c5b5ef6cSAlex Elder stat_request->obj_request = obj_request; 2614c5b5ef6cSAlex Elder stat_request->pages = pages; 2615c5b5ef6cSAlex Elder stat_request->page_count = page_count; 2616c5b5ef6cSAlex Elder 2617c5b5ef6cSAlex Elder rbd_assert(obj_request->img_request); 2618c5b5ef6cSAlex Elder rbd_dev = obj_request->img_request->rbd_dev; 2619c5b5ef6cSAlex Elder stat_request->osd_req = rbd_osd_req_create(rbd_dev, false, 2620c5b5ef6cSAlex Elder stat_request); 2621c5b5ef6cSAlex Elder if (!stat_request->osd_req) 2622c5b5ef6cSAlex Elder goto out; 2623c5b5ef6cSAlex Elder stat_request->callback = rbd_img_obj_exists_callback; 2624c5b5ef6cSAlex Elder 2625c5b5ef6cSAlex Elder osd_req_op_init(stat_request->osd_req, 0, CEPH_OSD_OP_STAT); 2626c5b5ef6cSAlex Elder osd_req_op_raw_data_in_pages(stat_request->osd_req, 0, pages, size, 0, 2627c5b5ef6cSAlex Elder false, false); 26289d4df01fSAlex Elder rbd_osd_req_format_read(stat_request); 2629c5b5ef6cSAlex Elder 2630c5b5ef6cSAlex Elder osdc = &rbd_dev->rbd_client->client->osdc; 2631c5b5ef6cSAlex Elder ret = rbd_obj_request_submit(osdc, stat_request); 2632c5b5ef6cSAlex Elder out: 2633c5b5ef6cSAlex Elder if (ret) 2634c5b5ef6cSAlex Elder rbd_obj_request_put(obj_request); 2635c5b5ef6cSAlex Elder 2636c5b5ef6cSAlex Elder return ret; 2637c5b5ef6cSAlex Elder } 2638c5b5ef6cSAlex Elder 2639b454e36dSAlex Elder static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request) 2640b454e36dSAlex Elder { 2641b454e36dSAlex Elder struct rbd_img_request *img_request; 2642a9e8ba2cSAlex Elder struct rbd_device *rbd_dev; 26433d7efd18SAlex Elder bool 
known; 2644b454e36dSAlex Elder 2645b454e36dSAlex Elder rbd_assert(obj_request_img_data_test(obj_request)); 2646b454e36dSAlex Elder 2647b454e36dSAlex Elder img_request = obj_request->img_request; 2648b454e36dSAlex Elder rbd_assert(img_request); 2649a9e8ba2cSAlex Elder rbd_dev = img_request->rbd_dev; 2650b454e36dSAlex Elder 2651b454e36dSAlex Elder /* 2652a9e8ba2cSAlex Elder * Only writes to layered images need special handling. 2653a9e8ba2cSAlex Elder * Reads and non-layered writes are simple object requests. 2654a9e8ba2cSAlex Elder * Layered writes that start beyond the end of the overlap 2655a9e8ba2cSAlex Elder * with the parent have no parent data, so they too are 2656a9e8ba2cSAlex Elder * simple object requests. Finally, if the target object is 2657a9e8ba2cSAlex Elder * known to already exist, its parent data has already been 2658a9e8ba2cSAlex Elder * copied, so a write to the object can also be handled as a 2659a9e8ba2cSAlex Elder * simple object request. 2660b454e36dSAlex Elder */ 2661b454e36dSAlex Elder if (!img_request_write_test(img_request) || 2662b454e36dSAlex Elder !img_request_layered_test(img_request) || 2663a9e8ba2cSAlex Elder rbd_dev->parent_overlap <= obj_request->img_offset || 26643d7efd18SAlex Elder ((known = obj_request_known_test(obj_request)) && 26653d7efd18SAlex Elder obj_request_exists_test(obj_request))) { 2666b454e36dSAlex Elder 2667b454e36dSAlex Elder struct rbd_device *rbd_dev; 2668b454e36dSAlex Elder struct ceph_osd_client *osdc; 2669b454e36dSAlex Elder 2670b454e36dSAlex Elder rbd_dev = obj_request->img_request->rbd_dev; 2671b454e36dSAlex Elder osdc = &rbd_dev->rbd_client->client->osdc; 2672b454e36dSAlex Elder 2673b454e36dSAlex Elder return rbd_obj_request_submit(osdc, obj_request); 2674b454e36dSAlex Elder } 2675b454e36dSAlex Elder 2676b454e36dSAlex Elder /* 26773d7efd18SAlex Elder * It's a layered write. The target object might exist but 26783d7efd18SAlex Elder * we may not know that yet. 
If we know it doesn't exist, 26793d7efd18SAlex Elder * start by reading the data for the full target object from 26803d7efd18SAlex Elder * the parent so we can use it for a copyup to the target. 2681b454e36dSAlex Elder */ 26823d7efd18SAlex Elder if (known) 26833d7efd18SAlex Elder return rbd_img_obj_parent_read_full(obj_request); 26843d7efd18SAlex Elder 26853d7efd18SAlex Elder /* We don't know whether the target exists. Go find out. */ 2686b454e36dSAlex Elder 2687b454e36dSAlex Elder return rbd_img_obj_exists_submit(obj_request); 2688b454e36dSAlex Elder } 2689b454e36dSAlex Elder 2690bf0d5f50SAlex Elder static int rbd_img_request_submit(struct rbd_img_request *img_request) 2691bf0d5f50SAlex Elder { 2692bf0d5f50SAlex Elder struct rbd_obj_request *obj_request; 269346faeed4SAlex Elder struct rbd_obj_request *next_obj_request; 2694bf0d5f50SAlex Elder 269537206ee5SAlex Elder dout("%s: img %p\n", __func__, img_request); 269646faeed4SAlex Elder for_each_obj_request_safe(img_request, obj_request, next_obj_request) { 2697bf0d5f50SAlex Elder int ret; 2698bf0d5f50SAlex Elder 2699b454e36dSAlex Elder ret = rbd_img_obj_request_submit(obj_request); 2700bf0d5f50SAlex Elder if (ret) 2701bf0d5f50SAlex Elder return ret; 2702bf0d5f50SAlex Elder } 2703bf0d5f50SAlex Elder 2704bf0d5f50SAlex Elder return 0; 2705bf0d5f50SAlex Elder } 2706bf0d5f50SAlex Elder 27078b3e1a56SAlex Elder static void rbd_img_parent_read_callback(struct rbd_img_request *img_request) 27088b3e1a56SAlex Elder { 27098b3e1a56SAlex Elder struct rbd_obj_request *obj_request; 2710a9e8ba2cSAlex Elder struct rbd_device *rbd_dev; 2711a9e8ba2cSAlex Elder u64 obj_end; 271202c74fbaSAlex Elder u64 img_xferred; 271302c74fbaSAlex Elder int img_result; 27148b3e1a56SAlex Elder 27158b3e1a56SAlex Elder rbd_assert(img_request_child_test(img_request)); 27168b3e1a56SAlex Elder 271702c74fbaSAlex Elder /* First get what we need from the image request and release it */ 271802c74fbaSAlex Elder 27198b3e1a56SAlex Elder obj_request = 
img_request->obj_request; 272002c74fbaSAlex Elder img_xferred = img_request->xferred; 272102c74fbaSAlex Elder img_result = img_request->result; 272202c74fbaSAlex Elder rbd_img_request_put(img_request); 272302c74fbaSAlex Elder 272402c74fbaSAlex Elder /* 272502c74fbaSAlex Elder * If the overlap has become 0 (most likely because the 272602c74fbaSAlex Elder * image has been flattened) we need to re-submit the 272702c74fbaSAlex Elder * original request. 272802c74fbaSAlex Elder */ 2729a9e8ba2cSAlex Elder rbd_assert(obj_request); 2730a9e8ba2cSAlex Elder rbd_assert(obj_request->img_request); 273102c74fbaSAlex Elder rbd_dev = obj_request->img_request->rbd_dev; 273202c74fbaSAlex Elder if (!rbd_dev->parent_overlap) { 273302c74fbaSAlex Elder struct ceph_osd_client *osdc; 27348b3e1a56SAlex Elder 273502c74fbaSAlex Elder osdc = &rbd_dev->rbd_client->client->osdc; 273602c74fbaSAlex Elder img_result = rbd_obj_request_submit(osdc, obj_request); 273702c74fbaSAlex Elder if (!img_result) 273802c74fbaSAlex Elder return; 273902c74fbaSAlex Elder } 274002c74fbaSAlex Elder 274102c74fbaSAlex Elder obj_request->result = img_result; 2742a9e8ba2cSAlex Elder if (obj_request->result) 2743a9e8ba2cSAlex Elder goto out; 2744a9e8ba2cSAlex Elder 2745a9e8ba2cSAlex Elder /* 2746a9e8ba2cSAlex Elder * We need to zero anything beyond the parent overlap 2747a9e8ba2cSAlex Elder * boundary. Since rbd_img_obj_request_read_callback() 2748a9e8ba2cSAlex Elder * will zero anything beyond the end of a short read, an 2749a9e8ba2cSAlex Elder * easy way to do this is to pretend the data from the 2750a9e8ba2cSAlex Elder * parent came up short--ending at the overlap boundary. 
2751a9e8ba2cSAlex Elder */ 2752a9e8ba2cSAlex Elder rbd_assert(obj_request->img_offset < U64_MAX - obj_request->length); 2753a9e8ba2cSAlex Elder obj_end = obj_request->img_offset + obj_request->length; 2754a9e8ba2cSAlex Elder if (obj_end > rbd_dev->parent_overlap) { 2755a9e8ba2cSAlex Elder u64 xferred = 0; 2756a9e8ba2cSAlex Elder 2757a9e8ba2cSAlex Elder if (obj_request->img_offset < rbd_dev->parent_overlap) 2758a9e8ba2cSAlex Elder xferred = rbd_dev->parent_overlap - 2759a9e8ba2cSAlex Elder obj_request->img_offset; 2760a9e8ba2cSAlex Elder 276102c74fbaSAlex Elder obj_request->xferred = min(img_xferred, xferred); 2762a9e8ba2cSAlex Elder } else { 276302c74fbaSAlex Elder obj_request->xferred = img_xferred; 2764a9e8ba2cSAlex Elder } 2765a9e8ba2cSAlex Elder out: 27668b3e1a56SAlex Elder rbd_img_obj_request_read_callback(obj_request); 27678b3e1a56SAlex Elder rbd_obj_request_complete(obj_request); 27688b3e1a56SAlex Elder } 27698b3e1a56SAlex Elder 27708b3e1a56SAlex Elder static void rbd_img_parent_read(struct rbd_obj_request *obj_request) 27718b3e1a56SAlex Elder { 27728b3e1a56SAlex Elder struct rbd_img_request *img_request; 27738b3e1a56SAlex Elder int result; 27748b3e1a56SAlex Elder 27758b3e1a56SAlex Elder rbd_assert(obj_request_img_data_test(obj_request)); 27768b3e1a56SAlex Elder rbd_assert(obj_request->img_request != NULL); 27778b3e1a56SAlex Elder rbd_assert(obj_request->result == (s32) -ENOENT); 27785b2ab72dSAlex Elder rbd_assert(obj_request_type_valid(obj_request->type)); 27798b3e1a56SAlex Elder 27808b3e1a56SAlex Elder /* rbd_read_finish(obj_request, obj_request->length); */ 2781e93f3152SAlex Elder img_request = rbd_parent_request_create(obj_request, 27828b3e1a56SAlex Elder obj_request->img_offset, 2783e93f3152SAlex Elder obj_request->length); 27848b3e1a56SAlex Elder result = -ENOMEM; 27858b3e1a56SAlex Elder if (!img_request) 27868b3e1a56SAlex Elder goto out_err; 27878b3e1a56SAlex Elder 27885b2ab72dSAlex Elder if (obj_request->type == OBJ_REQUEST_BIO) 2789f1a4739fSAlex 
Elder result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO, 2790f1a4739fSAlex Elder obj_request->bio_list); 27915b2ab72dSAlex Elder else 27925b2ab72dSAlex Elder result = rbd_img_request_fill(img_request, OBJ_REQUEST_PAGES, 27935b2ab72dSAlex Elder obj_request->pages); 27948b3e1a56SAlex Elder if (result) 27958b3e1a56SAlex Elder goto out_err; 27968b3e1a56SAlex Elder 27978b3e1a56SAlex Elder img_request->callback = rbd_img_parent_read_callback; 27988b3e1a56SAlex Elder result = rbd_img_request_submit(img_request); 27998b3e1a56SAlex Elder if (result) 28008b3e1a56SAlex Elder goto out_err; 28018b3e1a56SAlex Elder 28028b3e1a56SAlex Elder return; 28038b3e1a56SAlex Elder out_err: 28048b3e1a56SAlex Elder if (img_request) 28058b3e1a56SAlex Elder rbd_img_request_put(img_request); 28068b3e1a56SAlex Elder obj_request->result = result; 28078b3e1a56SAlex Elder obj_request->xferred = 0; 28088b3e1a56SAlex Elder obj_request_done_set(obj_request); 28098b3e1a56SAlex Elder } 28108b3e1a56SAlex Elder 281120e0af67SJosh Durgin static int rbd_obj_notify_ack_sync(struct rbd_device *rbd_dev, u64 notify_id) 2812b8d70035SAlex Elder { 2813b8d70035SAlex Elder struct rbd_obj_request *obj_request; 28142169238dSAlex Elder struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 2815b8d70035SAlex Elder int ret; 2816b8d70035SAlex Elder 2817b8d70035SAlex Elder obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0, 2818b8d70035SAlex Elder OBJ_REQUEST_NODATA); 2819b8d70035SAlex Elder if (!obj_request) 2820b8d70035SAlex Elder return -ENOMEM; 2821b8d70035SAlex Elder 2822b8d70035SAlex Elder ret = -ENOMEM; 2823430c28c3SAlex Elder obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request); 2824b8d70035SAlex Elder if (!obj_request->osd_req) 2825b8d70035SAlex Elder goto out; 2826b8d70035SAlex Elder 2827c99d2d4aSAlex Elder osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_NOTIFY_ACK, 2828cc4a38bdSAlex Elder notify_id, 0, 0); 28299d4df01fSAlex Elder 
rbd_osd_req_format_read(obj_request); 2830430c28c3SAlex Elder 2831b8d70035SAlex Elder ret = rbd_obj_request_submit(osdc, obj_request); 2832cf81b60eSAlex Elder if (ret) 283320e0af67SJosh Durgin goto out; 283420e0af67SJosh Durgin ret = rbd_obj_request_wait(obj_request); 283520e0af67SJosh Durgin out: 2836b8d70035SAlex Elder rbd_obj_request_put(obj_request); 2837b8d70035SAlex Elder 2838b8d70035SAlex Elder return ret; 2839b8d70035SAlex Elder } 2840b8d70035SAlex Elder 2841b8d70035SAlex Elder static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data) 2842b8d70035SAlex Elder { 2843b8d70035SAlex Elder struct rbd_device *rbd_dev = (struct rbd_device *)data; 2844e627db08SAlex Elder int ret; 2845b8d70035SAlex Elder 2846b8d70035SAlex Elder if (!rbd_dev) 2847b8d70035SAlex Elder return; 2848b8d70035SAlex Elder 284937206ee5SAlex Elder dout("%s: \"%s\" notify_id %llu opcode %u\n", __func__, 2850b8d70035SAlex Elder rbd_dev->header_name, (unsigned long long)notify_id, 2851b8d70035SAlex Elder (unsigned int)opcode); 2852e627db08SAlex Elder ret = rbd_dev_refresh(rbd_dev); 2853e627db08SAlex Elder if (ret) 28543b5cf2a2SAlex Elder rbd_warn(rbd_dev, "header refresh error (%d)\n", ret); 2855b8d70035SAlex Elder 285620e0af67SJosh Durgin rbd_obj_notify_ack_sync(rbd_dev, notify_id); 2857b8d70035SAlex Elder } 2858b8d70035SAlex Elder 28599969ebc5SAlex Elder /* 28609969ebc5SAlex Elder * Request sync osd watch/unwatch. The value of "start" determines 28619969ebc5SAlex Elder * whether a watch request is being initiated or torn down. 
28629969ebc5SAlex Elder */ 28631f3ef788SAlex Elder static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev, bool start) 28649969ebc5SAlex Elder { 28659969ebc5SAlex Elder struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 28669969ebc5SAlex Elder struct rbd_obj_request *obj_request; 28679969ebc5SAlex Elder int ret; 28689969ebc5SAlex Elder 28699969ebc5SAlex Elder rbd_assert(start ^ !!rbd_dev->watch_event); 28709969ebc5SAlex Elder rbd_assert(start ^ !!rbd_dev->watch_request); 28719969ebc5SAlex Elder 28729969ebc5SAlex Elder if (start) { 28733c663bbdSAlex Elder ret = ceph_osdc_create_event(osdc, rbd_watch_cb, rbd_dev, 28749969ebc5SAlex Elder &rbd_dev->watch_event); 28759969ebc5SAlex Elder if (ret < 0) 28769969ebc5SAlex Elder return ret; 28778eb87565SAlex Elder rbd_assert(rbd_dev->watch_event != NULL); 28789969ebc5SAlex Elder } 28799969ebc5SAlex Elder 28809969ebc5SAlex Elder ret = -ENOMEM; 28819969ebc5SAlex Elder obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0, 28829969ebc5SAlex Elder OBJ_REQUEST_NODATA); 28839969ebc5SAlex Elder if (!obj_request) 28849969ebc5SAlex Elder goto out_cancel; 28859969ebc5SAlex Elder 2886430c28c3SAlex Elder obj_request->osd_req = rbd_osd_req_create(rbd_dev, true, obj_request); 2887430c28c3SAlex Elder if (!obj_request->osd_req) 2888430c28c3SAlex Elder goto out_cancel; 2889430c28c3SAlex Elder 28908eb87565SAlex Elder if (start) 2891975241afSAlex Elder ceph_osdc_set_request_linger(osdc, obj_request->osd_req); 28928eb87565SAlex Elder else 28936977c3f9SAlex Elder ceph_osdc_unregister_linger_request(osdc, 2894975241afSAlex Elder rbd_dev->watch_request->osd_req); 28952169238dSAlex Elder 28962169238dSAlex Elder osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_WATCH, 28971f3ef788SAlex Elder rbd_dev->watch_event->cookie, 0, start ? 
1 : 0); 28989d4df01fSAlex Elder rbd_osd_req_format_write(obj_request); 28992169238dSAlex Elder 29009969ebc5SAlex Elder ret = rbd_obj_request_submit(osdc, obj_request); 29019969ebc5SAlex Elder if (ret) 29029969ebc5SAlex Elder goto out_cancel; 29039969ebc5SAlex Elder ret = rbd_obj_request_wait(obj_request); 29049969ebc5SAlex Elder if (ret) 29059969ebc5SAlex Elder goto out_cancel; 29069969ebc5SAlex Elder ret = obj_request->result; 29079969ebc5SAlex Elder if (ret) 29089969ebc5SAlex Elder goto out_cancel; 29099969ebc5SAlex Elder 29108eb87565SAlex Elder /* 29118eb87565SAlex Elder * A watch request is set to linger, so the underlying osd 29128eb87565SAlex Elder * request won't go away until we unregister it. We retain 29138eb87565SAlex Elder * a pointer to the object request during that time (in 29148eb87565SAlex Elder * rbd_dev->watch_request), so we'll keep a reference to 29158eb87565SAlex Elder * it. We'll drop that reference (below) after we've 29168eb87565SAlex Elder * unregistered it. 29178eb87565SAlex Elder */ 29188eb87565SAlex Elder if (start) { 29198eb87565SAlex Elder rbd_dev->watch_request = obj_request; 29208eb87565SAlex Elder 29218eb87565SAlex Elder return 0; 29228eb87565SAlex Elder } 29238eb87565SAlex Elder 29248eb87565SAlex Elder /* We have successfully torn down the watch request */ 29258eb87565SAlex Elder 29268eb87565SAlex Elder rbd_obj_request_put(rbd_dev->watch_request); 29278eb87565SAlex Elder rbd_dev->watch_request = NULL; 29289969ebc5SAlex Elder out_cancel: 29299969ebc5SAlex Elder /* Cancel the event if we're tearing down, or on error */ 29309969ebc5SAlex Elder ceph_osdc_cancel_event(rbd_dev->watch_event); 29319969ebc5SAlex Elder rbd_dev->watch_event = NULL; 29329969ebc5SAlex Elder if (obj_request) 29339969ebc5SAlex Elder rbd_obj_request_put(obj_request); 29349969ebc5SAlex Elder 29359969ebc5SAlex Elder return ret; 29369969ebc5SAlex Elder } 29379969ebc5SAlex Elder 293836be9a76SAlex Elder /* 2939f40eb349SAlex Elder * Synchronous osd object method call. 
Returns the number of bytes 2940f40eb349SAlex Elder * returned in the inbound buffer, or a negative error code. 294136be9a76SAlex Elder */ 294236be9a76SAlex Elder static int rbd_obj_method_sync(struct rbd_device *rbd_dev, 294336be9a76SAlex Elder const char *object_name, 294436be9a76SAlex Elder const char *class_name, 294536be9a76SAlex Elder const char *method_name, 29464157976bSAlex Elder const void *outbound, 294736be9a76SAlex Elder size_t outbound_size, 29484157976bSAlex Elder void *inbound, 2949e2a58ee5SAlex Elder size_t inbound_size) 295036be9a76SAlex Elder { 29512169238dSAlex Elder struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 295236be9a76SAlex Elder struct rbd_obj_request *obj_request; 295336be9a76SAlex Elder struct page **pages; 295436be9a76SAlex Elder u32 page_count; 295536be9a76SAlex Elder int ret; 295636be9a76SAlex Elder 295736be9a76SAlex Elder /* 29586010a451SAlex Elder * Method calls are ultimately read operations. The result 29596010a451SAlex Elder * should be placed into the inbound buffer provided. They 29606010a451SAlex Elder * also supply outbound data--parameters for the object 29616010a451SAlex Elder * method. Currently if this is present it will be a 29626010a451SAlex Elder * snapshot id.
296336be9a76SAlex Elder */ 296436be9a76SAlex Elder page_count = (u32)calc_pages_for(0, inbound_size); 296536be9a76SAlex Elder pages = ceph_alloc_page_vector(page_count, GFP_KERNEL); 296636be9a76SAlex Elder if (IS_ERR(pages)) 296736be9a76SAlex Elder return PTR_ERR(pages); 296836be9a76SAlex Elder 296936be9a76SAlex Elder ret = -ENOMEM; 29706010a451SAlex Elder obj_request = rbd_obj_request_create(object_name, 0, inbound_size, 297136be9a76SAlex Elder OBJ_REQUEST_PAGES); 297236be9a76SAlex Elder if (!obj_request) 297336be9a76SAlex Elder goto out; 297436be9a76SAlex Elder 297536be9a76SAlex Elder obj_request->pages = pages; 297636be9a76SAlex Elder obj_request->page_count = page_count; 297736be9a76SAlex Elder 2978430c28c3SAlex Elder obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request); 297936be9a76SAlex Elder if (!obj_request->osd_req) 298036be9a76SAlex Elder goto out; 298136be9a76SAlex Elder 2982c99d2d4aSAlex Elder osd_req_op_cls_init(obj_request->osd_req, 0, CEPH_OSD_OP_CALL, 298304017e29SAlex Elder class_name, method_name); 298404017e29SAlex Elder if (outbound_size) { 298504017e29SAlex Elder struct ceph_pagelist *pagelist; 298604017e29SAlex Elder 298704017e29SAlex Elder pagelist = kmalloc(sizeof (*pagelist), GFP_NOFS); 298804017e29SAlex Elder if (!pagelist) 298904017e29SAlex Elder goto out; 299004017e29SAlex Elder 299104017e29SAlex Elder ceph_pagelist_init(pagelist); 299204017e29SAlex Elder ceph_pagelist_append(pagelist, outbound, outbound_size); 299304017e29SAlex Elder osd_req_op_cls_request_data_pagelist(obj_request->osd_req, 0, 299404017e29SAlex Elder pagelist); 299504017e29SAlex Elder } 2996a4ce40a9SAlex Elder osd_req_op_cls_response_data_pages(obj_request->osd_req, 0, 2997a4ce40a9SAlex Elder obj_request->pages, inbound_size, 299844cd188dSAlex Elder 0, false, false); 29999d4df01fSAlex Elder rbd_osd_req_format_read(obj_request); 3000430c28c3SAlex Elder 300136be9a76SAlex Elder ret = rbd_obj_request_submit(osdc, obj_request); 300236be9a76SAlex Elder if 
(ret) 300336be9a76SAlex Elder goto out; 300436be9a76SAlex Elder ret = rbd_obj_request_wait(obj_request); 300536be9a76SAlex Elder if (ret) 300636be9a76SAlex Elder goto out; 300736be9a76SAlex Elder 300836be9a76SAlex Elder ret = obj_request->result; 300936be9a76SAlex Elder if (ret < 0) 301036be9a76SAlex Elder goto out; 301157385b51SAlex Elder 301257385b51SAlex Elder rbd_assert(obj_request->xferred < (u64)INT_MAX); 301357385b51SAlex Elder ret = (int)obj_request->xferred; 3014903bb32eSAlex Elder ceph_copy_from_page_vector(pages, inbound, 0, obj_request->xferred); 301536be9a76SAlex Elder out: 301636be9a76SAlex Elder if (obj_request) 301736be9a76SAlex Elder rbd_obj_request_put(obj_request); 301836be9a76SAlex Elder else 301936be9a76SAlex Elder ceph_release_page_vector(pages, page_count); 302036be9a76SAlex Elder 302136be9a76SAlex Elder return ret; 302236be9a76SAlex Elder } 302336be9a76SAlex Elder 3024bf0d5f50SAlex Elder static void rbd_request_fn(struct request_queue *q) 3025cc344fa1SAlex Elder __releases(q->queue_lock) __acquires(q->queue_lock) 3026bf0d5f50SAlex Elder { 3027bf0d5f50SAlex Elder struct rbd_device *rbd_dev = q->queuedata; 3028bf0d5f50SAlex Elder bool read_only = rbd_dev->mapping.read_only; 3029bf0d5f50SAlex Elder struct request *rq; 3030bf0d5f50SAlex Elder int result; 3031bf0d5f50SAlex Elder 3032bf0d5f50SAlex Elder while ((rq = blk_fetch_request(q))) { 3033bf0d5f50SAlex Elder bool write_request = rq_data_dir(rq) == WRITE; 3034bf0d5f50SAlex Elder struct rbd_img_request *img_request; 3035bf0d5f50SAlex Elder u64 offset; 3036bf0d5f50SAlex Elder u64 length; 3037bf0d5f50SAlex Elder 3038bf0d5f50SAlex Elder /* Ignore any non-FS requests that filter through. 
*/ 3039bf0d5f50SAlex Elder 3040bf0d5f50SAlex Elder if (rq->cmd_type != REQ_TYPE_FS) { 30414dda41d3SAlex Elder dout("%s: non-fs request type %d\n", __func__, 30424dda41d3SAlex Elder (int) rq->cmd_type); 30434dda41d3SAlex Elder __blk_end_request_all(rq, 0); 30444dda41d3SAlex Elder continue; 30454dda41d3SAlex Elder } 30464dda41d3SAlex Elder 30474dda41d3SAlex Elder /* Ignore/skip any zero-length requests */ 30484dda41d3SAlex Elder 30494dda41d3SAlex Elder offset = (u64) blk_rq_pos(rq) << SECTOR_SHIFT; 30504dda41d3SAlex Elder length = (u64) blk_rq_bytes(rq); 30514dda41d3SAlex Elder 30524dda41d3SAlex Elder if (!length) { 30534dda41d3SAlex Elder dout("%s: zero-length request\n", __func__); 3054bf0d5f50SAlex Elder __blk_end_request_all(rq, 0); 3055bf0d5f50SAlex Elder continue; 3056bf0d5f50SAlex Elder } 3057bf0d5f50SAlex Elder 3058bf0d5f50SAlex Elder spin_unlock_irq(q->queue_lock); 3059bf0d5f50SAlex Elder 3060bf0d5f50SAlex Elder /* Disallow writes to a read-only device */ 3061bf0d5f50SAlex Elder 3062bf0d5f50SAlex Elder if (write_request) { 3063bf0d5f50SAlex Elder result = -EROFS; 3064bf0d5f50SAlex Elder if (read_only) 3065bf0d5f50SAlex Elder goto end_request; 3066bf0d5f50SAlex Elder rbd_assert(rbd_dev->spec->snap_id == CEPH_NOSNAP); 3067bf0d5f50SAlex Elder } 3068bf0d5f50SAlex Elder 30696d292906SAlex Elder /* 30706d292906SAlex Elder * Quit early if the mapped snapshot no longer 30716d292906SAlex Elder * exists. It's still possible the snapshot will 30726d292906SAlex Elder * have disappeared by the time our request arrives 30736d292906SAlex Elder * at the osd, but there's no sense in sending it if 30746d292906SAlex Elder * we already know. 
30756d292906SAlex Elder */ 30766d292906SAlex Elder if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) { 3077bf0d5f50SAlex Elder dout("request for non-existent snapshot"); 3078bf0d5f50SAlex Elder rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP); 3079bf0d5f50SAlex Elder result = -ENXIO; 3080bf0d5f50SAlex Elder goto end_request; 3081bf0d5f50SAlex Elder } 3082bf0d5f50SAlex Elder 3083bf0d5f50SAlex Elder result = -EINVAL; 3084c0cd10dbSAlex Elder if (offset && length > U64_MAX - offset + 1) { 3085c0cd10dbSAlex Elder rbd_warn(rbd_dev, "bad request range (%llu~%llu)\n", 3086c0cd10dbSAlex Elder offset, length); 3087bf0d5f50SAlex Elder goto end_request; /* Shouldn't happen */ 3088c0cd10dbSAlex Elder } 3089bf0d5f50SAlex Elder 309000a653e2SAlex Elder result = -EIO; 309100a653e2SAlex Elder if (offset + length > rbd_dev->mapping.size) { 309200a653e2SAlex Elder rbd_warn(rbd_dev, "beyond EOD (%llu~%llu > %llu)\n", 309300a653e2SAlex Elder offset, length, rbd_dev->mapping.size); 309400a653e2SAlex Elder goto end_request; 309500a653e2SAlex Elder } 309600a653e2SAlex Elder 3097bf0d5f50SAlex Elder result = -ENOMEM; 3098bf0d5f50SAlex Elder img_request = rbd_img_request_create(rbd_dev, offset, length, 3099e93f3152SAlex Elder write_request); 3100bf0d5f50SAlex Elder if (!img_request) 3101bf0d5f50SAlex Elder goto end_request; 3102bf0d5f50SAlex Elder 3103bf0d5f50SAlex Elder img_request->rq = rq; 3104bf0d5f50SAlex Elder 3105f1a4739fSAlex Elder result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO, 3106f1a4739fSAlex Elder rq->bio); 3107bf0d5f50SAlex Elder if (!result) 3108bf0d5f50SAlex Elder result = rbd_img_request_submit(img_request); 3109bf0d5f50SAlex Elder if (result) 3110bf0d5f50SAlex Elder rbd_img_request_put(img_request); 3111bf0d5f50SAlex Elder end_request: 3112bf0d5f50SAlex Elder spin_lock_irq(q->queue_lock); 3113bf0d5f50SAlex Elder if (result < 0) { 31147da22d29SAlex Elder rbd_warn(rbd_dev, "%s %llx at %llx result %d\n", 31157da22d29SAlex Elder write_request ? 
"write" : "read", 31167da22d29SAlex Elder length, offset, result); 31177da22d29SAlex Elder 3118bf0d5f50SAlex Elder __blk_end_request_all(rq, result); 3119bf0d5f50SAlex Elder } 3120bf0d5f50SAlex Elder } 3121bf0d5f50SAlex Elder } 3122bf0d5f50SAlex Elder 3123602adf40SYehuda Sadeh /* 3124602adf40SYehuda Sadeh * a queue callback. Makes sure that we don't create a bio that spans across 3125602adf40SYehuda Sadeh * multiple osd objects. One exception would be with a single page bios, 3126f7760dadSAlex Elder * which we handle later at bio_chain_clone_range() 3127602adf40SYehuda Sadeh */ 3128602adf40SYehuda Sadeh static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd, 3129602adf40SYehuda Sadeh struct bio_vec *bvec) 3130602adf40SYehuda Sadeh { 3131602adf40SYehuda Sadeh struct rbd_device *rbd_dev = q->queuedata; 3132e5cfeed2SAlex Elder sector_t sector_offset; 3133e5cfeed2SAlex Elder sector_t sectors_per_obj; 3134e5cfeed2SAlex Elder sector_t obj_sector_offset; 3135e5cfeed2SAlex Elder int ret; 3136602adf40SYehuda Sadeh 3137e5cfeed2SAlex Elder /* 3138e5cfeed2SAlex Elder * Find how far into its rbd object the partition-relative 3139e5cfeed2SAlex Elder * bio start sector is to offset relative to the enclosing 3140e5cfeed2SAlex Elder * device. 3141e5cfeed2SAlex Elder */ 3142e5cfeed2SAlex Elder sector_offset = get_start_sect(bmd->bi_bdev) + bmd->bi_sector; 3143e5cfeed2SAlex Elder sectors_per_obj = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT); 3144e5cfeed2SAlex Elder obj_sector_offset = sector_offset & (sectors_per_obj - 1); 3145593a9e7bSAlex Elder 3146e5cfeed2SAlex Elder /* 3147e5cfeed2SAlex Elder * Compute the number of bytes from that offset to the end 3148e5cfeed2SAlex Elder * of the object. Account for what's already used by the bio. 
3149e5cfeed2SAlex Elder */ 3150e5cfeed2SAlex Elder ret = (int) (sectors_per_obj - obj_sector_offset) << SECTOR_SHIFT; 3151e5cfeed2SAlex Elder if (ret > bmd->bi_size) 3152e5cfeed2SAlex Elder ret -= bmd->bi_size; 3153e5cfeed2SAlex Elder else 3154e5cfeed2SAlex Elder ret = 0; 3155e5cfeed2SAlex Elder 3156e5cfeed2SAlex Elder /* 3157e5cfeed2SAlex Elder * Don't send back more than was asked for. And if the bio 3158e5cfeed2SAlex Elder * was empty, let the whole thing through because: "Note 3159e5cfeed2SAlex Elder * that a block device *must* allow a single page to be 3160e5cfeed2SAlex Elder * added to an empty bio." 3161e5cfeed2SAlex Elder */ 3162e5cfeed2SAlex Elder rbd_assert(bvec->bv_len <= PAGE_SIZE); 3163e5cfeed2SAlex Elder if (ret > (int) bvec->bv_len || !bmd->bi_size) 3164e5cfeed2SAlex Elder ret = (int) bvec->bv_len; 3165e5cfeed2SAlex Elder 3166e5cfeed2SAlex Elder return ret; 3167602adf40SYehuda Sadeh } 3168602adf40SYehuda Sadeh 3169602adf40SYehuda Sadeh static void rbd_free_disk(struct rbd_device *rbd_dev) 3170602adf40SYehuda Sadeh { 3171602adf40SYehuda Sadeh struct gendisk *disk = rbd_dev->disk; 3172602adf40SYehuda Sadeh 3173602adf40SYehuda Sadeh if (!disk) 3174602adf40SYehuda Sadeh return; 3175602adf40SYehuda Sadeh 3176a0cab924SAlex Elder rbd_dev->disk = NULL; 3177a0cab924SAlex Elder if (disk->flags & GENHD_FL_UP) { 3178602adf40SYehuda Sadeh del_gendisk(disk); 3179602adf40SYehuda Sadeh if (disk->queue) 3180602adf40SYehuda Sadeh blk_cleanup_queue(disk->queue); 3181a0cab924SAlex Elder } 3182602adf40SYehuda Sadeh put_disk(disk); 3183602adf40SYehuda Sadeh } 3184602adf40SYehuda Sadeh 3185788e2df3SAlex Elder static int rbd_obj_read_sync(struct rbd_device *rbd_dev, 3186788e2df3SAlex Elder const char *object_name, 31877097f8dfSAlex Elder u64 offset, u64 length, void *buf) 3188788e2df3SAlex Elder 3189788e2df3SAlex Elder { 31902169238dSAlex Elder struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 3191788e2df3SAlex Elder struct rbd_obj_request *obj_request; 
3192788e2df3SAlex Elder struct page **pages = NULL; 3193788e2df3SAlex Elder u32 page_count; 31941ceae7efSAlex Elder size_t size; 3195788e2df3SAlex Elder int ret; 3196788e2df3SAlex Elder 3197788e2df3SAlex Elder page_count = (u32) calc_pages_for(offset, length); 3198788e2df3SAlex Elder pages = ceph_alloc_page_vector(page_count, GFP_KERNEL); /* Bail out now: an ERR_PTR must never be installed as the request's page vector. */ 3199788e2df3SAlex Elder if (IS_ERR(pages)) 3200788e2df3SAlex Elder return PTR_ERR(pages); 3201788e2df3SAlex Elder 3202788e2df3SAlex Elder ret = -ENOMEM; 3203788e2df3SAlex Elder obj_request = rbd_obj_request_create(object_name, offset, length, 3204788e2df3SAlex Elder OBJ_REQUEST_PAGES); 3205788e2df3SAlex Elder if (!obj_request) 3206788e2df3SAlex Elder goto out; 3207788e2df3SAlex Elder 3208788e2df3SAlex Elder obj_request->pages = pages; 3209788e2df3SAlex Elder obj_request->page_count = page_count; 3210788e2df3SAlex Elder 3211430c28c3SAlex Elder obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request); 3212788e2df3SAlex Elder if (!obj_request->osd_req) 3213788e2df3SAlex Elder goto out; 3214788e2df3SAlex Elder 3215c99d2d4aSAlex Elder osd_req_op_extent_init(obj_request->osd_req, 0, CEPH_OSD_OP_READ, 3216c99d2d4aSAlex Elder offset, length, 0, 0); 3217406e2c9fSAlex Elder osd_req_op_extent_osd_data_pages(obj_request->osd_req, 0, 3218a4ce40a9SAlex Elder obj_request->pages, 321944cd188dSAlex Elder obj_request->length, 322044cd188dSAlex Elder obj_request->offset & ~PAGE_MASK, 322144cd188dSAlex Elder false, false); 32229d4df01fSAlex Elder rbd_osd_req_format_read(obj_request); 3223430c28c3SAlex Elder 3224788e2df3SAlex Elder ret = rbd_obj_request_submit(osdc, obj_request); 3225788e2df3SAlex Elder if (ret) 3226788e2df3SAlex Elder goto out; 3227788e2df3SAlex Elder ret = rbd_obj_request_wait(obj_request); 3228788e2df3SAlex Elder if (ret) 3229788e2df3SAlex Elder goto out; 3230788e2df3SAlex Elder 3231788e2df3SAlex Elder ret = obj_request->result; 3232788e2df3SAlex Elder if (ret < 0) 3233788e2df3SAlex Elder goto out; 32341ceae7efSAlex
Elder 32351ceae7efSAlex Elder rbd_assert(obj_request->xferred <= (u64) SIZE_MAX); 32361ceae7efSAlex Elder size = (size_t) obj_request->xferred; 3237903bb32eSAlex Elder ceph_copy_from_page_vector(pages, buf, 0, size); 323823ed6e13SAlex Elder rbd_assert(size <= (size_t)INT_MAX); 323923ed6e13SAlex Elder ret = (int)size; 3240788e2df3SAlex Elder out: 3241788e2df3SAlex Elder if (obj_request) 3242788e2df3SAlex Elder rbd_obj_request_put(obj_request); 3243788e2df3SAlex Elder else 3244788e2df3SAlex Elder ceph_release_page_vector(pages, page_count); 3245788e2df3SAlex Elder 3246788e2df3SAlex Elder return ret; 3247788e2df3SAlex Elder } 3248788e2df3SAlex Elder 3249602adf40SYehuda Sadeh /* 3250662518b1SAlex Elder * Read the complete header for the given rbd device. On successful 3251662518b1SAlex Elder * return, the rbd_dev->header field will contain up-to-date 3252662518b1SAlex Elder * information about the image. 32534156d998SAlex Elder */ 325499a41ebcSAlex Elder static int rbd_dev_v1_header_info(struct rbd_device *rbd_dev) 32554156d998SAlex Elder { 32564156d998SAlex Elder struct rbd_image_header_ondisk *ondisk = NULL; 32574156d998SAlex Elder u32 snap_count = 0; 32584156d998SAlex Elder u64 names_size = 0; 32594156d998SAlex Elder u32 want_count; 32604156d998SAlex Elder int ret; 32614156d998SAlex Elder 32624156d998SAlex Elder /* 32634156d998SAlex Elder * The complete header will include an array of its 64-bit 32644156d998SAlex Elder * snapshot ids, followed by the names of those snapshots as 32654156d998SAlex Elder * a contiguous block of NUL-terminated strings. Note that 32664156d998SAlex Elder * the number of snapshots could change by the time we read 32674156d998SAlex Elder * it in, in which case we re-read it. 
32684156d998SAlex Elder */ 32694156d998SAlex Elder do { 32704156d998SAlex Elder size_t size; 32714156d998SAlex Elder 32724156d998SAlex Elder kfree(ondisk); 32734156d998SAlex Elder 32744156d998SAlex Elder size = sizeof (*ondisk); 32754156d998SAlex Elder size += snap_count * sizeof (struct rbd_image_snap_ondisk); 32764156d998SAlex Elder size += names_size; 32774156d998SAlex Elder ondisk = kmalloc(size, GFP_KERNEL); 32784156d998SAlex Elder if (!ondisk) 3279662518b1SAlex Elder return -ENOMEM; 32804156d998SAlex Elder 3281788e2df3SAlex Elder ret = rbd_obj_read_sync(rbd_dev, rbd_dev->header_name, 32827097f8dfSAlex Elder 0, size, ondisk); 32834156d998SAlex Elder if (ret < 0) 3284662518b1SAlex Elder goto out; 3285c0cd10dbSAlex Elder if ((size_t)ret < size) { 32864156d998SAlex Elder ret = -ENXIO; 328706ecc6cbSAlex Elder rbd_warn(rbd_dev, "short header read (want %zd got %d)", 328806ecc6cbSAlex Elder size, ret); 3289662518b1SAlex Elder goto out; 32904156d998SAlex Elder } 32914156d998SAlex Elder if (!rbd_dev_ondisk_valid(ondisk)) { 32924156d998SAlex Elder ret = -ENXIO; 329306ecc6cbSAlex Elder rbd_warn(rbd_dev, "invalid header"); 3294662518b1SAlex Elder goto out; 32954156d998SAlex Elder } 32964156d998SAlex Elder 32974156d998SAlex Elder names_size = le64_to_cpu(ondisk->snap_names_len); 32984156d998SAlex Elder want_count = snap_count; 32994156d998SAlex Elder snap_count = le32_to_cpu(ondisk->snap_count); 33004156d998SAlex Elder } while (snap_count != want_count); 33014156d998SAlex Elder 3302662518b1SAlex Elder ret = rbd_header_from_disk(rbd_dev, ondisk); 3303662518b1SAlex Elder out: 33044156d998SAlex Elder kfree(ondisk); 33054156d998SAlex Elder 3306dfc5606dSYehuda Sadeh return ret; 3307602adf40SYehuda Sadeh } 3308602adf40SYehuda Sadeh 330915228edeSAlex Elder /* 331015228edeSAlex Elder * Clear the rbd device's EXISTS flag if the snapshot it's mapped to 331115228edeSAlex Elder * has disappeared from the (just updated) snapshot context. 
331215228edeSAlex Elder */ 331315228edeSAlex Elder static void rbd_exists_validate(struct rbd_device *rbd_dev) 331415228edeSAlex Elder { 331515228edeSAlex Elder u64 snap_id; 331615228edeSAlex Elder 331715228edeSAlex Elder if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) 331815228edeSAlex Elder return; 331915228edeSAlex Elder 332015228edeSAlex Elder snap_id = rbd_dev->spec->snap_id; 332115228edeSAlex Elder if (snap_id == CEPH_NOSNAP) 332215228edeSAlex Elder return; 332315228edeSAlex Elder 332415228edeSAlex Elder if (rbd_dev_snap_index(rbd_dev, snap_id) == BAD_SNAP_INDEX) 332515228edeSAlex Elder clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags); 332615228edeSAlex Elder } 332715228edeSAlex Elder 33289875201eSJosh Durgin static void rbd_dev_update_size(struct rbd_device *rbd_dev) 33299875201eSJosh Durgin { 33309875201eSJosh Durgin sector_t size; 33319875201eSJosh Durgin bool removing; 33329875201eSJosh Durgin 33339875201eSJosh Durgin /* 33349875201eSJosh Durgin * Don't hold the lock while doing disk operations, 33359875201eSJosh Durgin * or lock ordering will conflict with the bdev mutex via: 33369875201eSJosh Durgin * rbd_add() -> blkdev_get() -> rbd_open() 33379875201eSJosh Durgin */ 33389875201eSJosh Durgin spin_lock_irq(&rbd_dev->lock); 33399875201eSJosh Durgin removing = test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags); 33409875201eSJosh Durgin spin_unlock_irq(&rbd_dev->lock); 33419875201eSJosh Durgin /* 33429875201eSJosh Durgin * If the device is being removed, rbd_dev->disk has 33439875201eSJosh Durgin * been destroyed, so don't try to update its size 33449875201eSJosh Durgin */ 33459875201eSJosh Durgin if (!removing) { 33469875201eSJosh Durgin size = (sector_t)rbd_dev->mapping.size / SECTOR_SIZE; 33479875201eSJosh Durgin dout("setting size to %llu sectors", (unsigned long long)size); 33489875201eSJosh Durgin set_capacity(rbd_dev->disk, size); 33499875201eSJosh Durgin revalidate_disk(rbd_dev->disk); 33509875201eSJosh Durgin } 33519875201eSJosh Durgin } 
33529875201eSJosh Durgin 3353cc4a38bdSAlex Elder static int rbd_dev_refresh(struct rbd_device *rbd_dev) 33541fe5e993SAlex Elder { 3355e627db08SAlex Elder u64 mapping_size; 33561fe5e993SAlex Elder int ret; 33571fe5e993SAlex Elder 3358117973fbSAlex Elder rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); 3359cfbf6377SAlex Elder down_write(&rbd_dev->header_rwsem); 33603b5cf2a2SAlex Elder mapping_size = rbd_dev->mapping.size; 3361117973fbSAlex Elder if (rbd_dev->image_format == 1) 336299a41ebcSAlex Elder ret = rbd_dev_v1_header_info(rbd_dev); 3363117973fbSAlex Elder else 33642df3fac7SAlex Elder ret = rbd_dev_v2_header_info(rbd_dev); 336515228edeSAlex Elder 336615228edeSAlex Elder /* If it's a mapped snapshot, validate its EXISTS flag */ 336715228edeSAlex Elder 336815228edeSAlex Elder rbd_exists_validate(rbd_dev); 3369cfbf6377SAlex Elder up_write(&rbd_dev->header_rwsem); 3370cfbf6377SAlex Elder 337100a653e2SAlex Elder if (mapping_size != rbd_dev->mapping.size) { 33729875201eSJosh Durgin rbd_dev_update_size(rbd_dev); 337300a653e2SAlex Elder } 33741fe5e993SAlex Elder 33751fe5e993SAlex Elder return ret; 33761fe5e993SAlex Elder } 33771fe5e993SAlex Elder 3378602adf40SYehuda Sadeh static int rbd_init_disk(struct rbd_device *rbd_dev) 3379602adf40SYehuda Sadeh { 3380602adf40SYehuda Sadeh struct gendisk *disk; 3381602adf40SYehuda Sadeh struct request_queue *q; 3382593a9e7bSAlex Elder u64 segment_size; 3383602adf40SYehuda Sadeh 3384602adf40SYehuda Sadeh /* create gendisk info */ 3385602adf40SYehuda Sadeh disk = alloc_disk(RBD_MINORS_PER_MAJOR); 3386602adf40SYehuda Sadeh if (!disk) 33871fcdb8aaSAlex Elder return -ENOMEM; 3388602adf40SYehuda Sadeh 3389f0f8cef5SAlex Elder snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d", 3390de71a297SAlex Elder rbd_dev->dev_id); 3391602adf40SYehuda Sadeh disk->major = rbd_dev->major; 3392602adf40SYehuda Sadeh disk->first_minor = 0; 3393602adf40SYehuda Sadeh disk->fops = &rbd_bd_ops; 3394602adf40SYehuda Sadeh 
disk->private_data = rbd_dev; 3395602adf40SYehuda Sadeh 3396bf0d5f50SAlex Elder q = blk_init_queue(rbd_request_fn, &rbd_dev->lock); 3397602adf40SYehuda Sadeh if (!q) 3398602adf40SYehuda Sadeh goto out_disk; 3399029bcbd8SJosh Durgin 3400593a9e7bSAlex Elder /* We use the default size, but let's be explicit about it. */ 3401593a9e7bSAlex Elder blk_queue_physical_block_size(q, SECTOR_SIZE); 3402593a9e7bSAlex Elder 3403029bcbd8SJosh Durgin /* set io sizes to object size */ 3404593a9e7bSAlex Elder segment_size = rbd_obj_bytes(&rbd_dev->header); 3405593a9e7bSAlex Elder blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE); 3406593a9e7bSAlex Elder blk_queue_max_segment_size(q, segment_size); 3407593a9e7bSAlex Elder blk_queue_io_min(q, segment_size); 3408593a9e7bSAlex Elder blk_queue_io_opt(q, segment_size); 3409029bcbd8SJosh Durgin 3410602adf40SYehuda Sadeh blk_queue_merge_bvec(q, rbd_merge_bvec); 3411602adf40SYehuda Sadeh disk->queue = q; 3412602adf40SYehuda Sadeh 3413602adf40SYehuda Sadeh q->queuedata = rbd_dev; 3414602adf40SYehuda Sadeh 3415602adf40SYehuda Sadeh rbd_dev->disk = disk; 3416602adf40SYehuda Sadeh 3417602adf40SYehuda Sadeh return 0; 3418602adf40SYehuda Sadeh out_disk: 3419602adf40SYehuda Sadeh put_disk(disk); 34201fcdb8aaSAlex Elder 34211fcdb8aaSAlex Elder return -ENOMEM; 3422602adf40SYehuda Sadeh } 3423602adf40SYehuda Sadeh 3424dfc5606dSYehuda Sadeh /* 3425dfc5606dSYehuda Sadeh sysfs 3426dfc5606dSYehuda Sadeh */ 3427602adf40SYehuda Sadeh 3428593a9e7bSAlex Elder static struct rbd_device *dev_to_rbd_dev(struct device *dev) 3429593a9e7bSAlex Elder { 3430593a9e7bSAlex Elder return container_of(dev, struct rbd_device, dev); 3431593a9e7bSAlex Elder } 3432593a9e7bSAlex Elder 3433dfc5606dSYehuda Sadeh static ssize_t rbd_size_show(struct device *dev, 3434dfc5606dSYehuda Sadeh struct device_attribute *attr, char *buf) 3435602adf40SYehuda Sadeh { 3436593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 3437dfc5606dSYehuda Sadeh 3438fc71d833SAlex 
Elder return sprintf(buf, "%llu\n", 3439fc71d833SAlex Elder (unsigned long long)rbd_dev->mapping.size); 3440602adf40SYehuda Sadeh } 3441602adf40SYehuda Sadeh 344234b13184SAlex Elder /* 344334b13184SAlex Elder * Note this shows the features for whatever's mapped, which is not 344434b13184SAlex Elder * necessarily the base image. 344534b13184SAlex Elder */ 344634b13184SAlex Elder static ssize_t rbd_features_show(struct device *dev, 344734b13184SAlex Elder struct device_attribute *attr, char *buf) 344834b13184SAlex Elder { 344934b13184SAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 345034b13184SAlex Elder 345134b13184SAlex Elder return sprintf(buf, "0x%016llx\n", 345234b13184SAlex Elder (unsigned long long)rbd_dev->mapping.features); 345334b13184SAlex Elder } 345434b13184SAlex Elder 3455dfc5606dSYehuda Sadeh static ssize_t rbd_major_show(struct device *dev, 3456dfc5606dSYehuda Sadeh struct device_attribute *attr, char *buf) 3457602adf40SYehuda Sadeh { 3458593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 3459dfc5606dSYehuda Sadeh 3460fc71d833SAlex Elder if (rbd_dev->major) 3461dfc5606dSYehuda Sadeh return sprintf(buf, "%d\n", rbd_dev->major); 3462fc71d833SAlex Elder 3463fc71d833SAlex Elder return sprintf(buf, "(none)\n"); 3464fc71d833SAlex Elder 3465dfc5606dSYehuda Sadeh } 3466dfc5606dSYehuda Sadeh 3467dfc5606dSYehuda Sadeh static ssize_t rbd_client_id_show(struct device *dev, 3468dfc5606dSYehuda Sadeh struct device_attribute *attr, char *buf) 3469dfc5606dSYehuda Sadeh { 3470593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 3471dfc5606dSYehuda Sadeh 34721dbb4399SAlex Elder return sprintf(buf, "client%lld\n", 34731dbb4399SAlex Elder ceph_client_id(rbd_dev->rbd_client->client)); 3474dfc5606dSYehuda Sadeh } 3475dfc5606dSYehuda Sadeh 3476dfc5606dSYehuda Sadeh static ssize_t rbd_pool_show(struct device *dev, 3477dfc5606dSYehuda Sadeh struct device_attribute *attr, char *buf) 3478dfc5606dSYehuda Sadeh { 3479593a9e7bSAlex 
Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 3480dfc5606dSYehuda Sadeh 34810d7dbfceSAlex Elder return sprintf(buf, "%s\n", rbd_dev->spec->pool_name); 3482dfc5606dSYehuda Sadeh } 3483dfc5606dSYehuda Sadeh 34849bb2f334SAlex Elder static ssize_t rbd_pool_id_show(struct device *dev, 34859bb2f334SAlex Elder struct device_attribute *attr, char *buf) 34869bb2f334SAlex Elder { 34879bb2f334SAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 34889bb2f334SAlex Elder 34890d7dbfceSAlex Elder return sprintf(buf, "%llu\n", 34900d7dbfceSAlex Elder (unsigned long long) rbd_dev->spec->pool_id); 34919bb2f334SAlex Elder } 34929bb2f334SAlex Elder 3493dfc5606dSYehuda Sadeh static ssize_t rbd_name_show(struct device *dev, 3494dfc5606dSYehuda Sadeh struct device_attribute *attr, char *buf) 3495dfc5606dSYehuda Sadeh { 3496593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 3497dfc5606dSYehuda Sadeh 3498a92ffdf8SAlex Elder if (rbd_dev->spec->image_name) 34990d7dbfceSAlex Elder return sprintf(buf, "%s\n", rbd_dev->spec->image_name); 3500a92ffdf8SAlex Elder 3501a92ffdf8SAlex Elder return sprintf(buf, "(unknown)\n"); 3502dfc5606dSYehuda Sadeh } 3503dfc5606dSYehuda Sadeh 3504589d30e0SAlex Elder static ssize_t rbd_image_id_show(struct device *dev, 3505589d30e0SAlex Elder struct device_attribute *attr, char *buf) 3506589d30e0SAlex Elder { 3507589d30e0SAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 3508589d30e0SAlex Elder 35090d7dbfceSAlex Elder return sprintf(buf, "%s\n", rbd_dev->spec->image_id); 3510589d30e0SAlex Elder } 3511589d30e0SAlex Elder 351234b13184SAlex Elder /* 351334b13184SAlex Elder * Shows the name of the currently-mapped snapshot (or 351434b13184SAlex Elder * RBD_SNAP_HEAD_NAME for the base image). 
351534b13184SAlex Elder */ 3516dfc5606dSYehuda Sadeh static ssize_t rbd_snap_show(struct device *dev, 3517dfc5606dSYehuda Sadeh struct device_attribute *attr, 3518dfc5606dSYehuda Sadeh char *buf) 3519dfc5606dSYehuda Sadeh { 3520593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 3521dfc5606dSYehuda Sadeh 35220d7dbfceSAlex Elder return sprintf(buf, "%s\n", rbd_dev->spec->snap_name); 3523dfc5606dSYehuda Sadeh } 3524dfc5606dSYehuda Sadeh 352586b00e0dSAlex Elder /* 352686b00e0dSAlex Elder * For an rbd v2 image, shows the pool id, image id, and snapshot id 352786b00e0dSAlex Elder * for the parent image. If there is no parent, simply shows 352886b00e0dSAlex Elder * "(no parent image)". 352986b00e0dSAlex Elder */ 353086b00e0dSAlex Elder static ssize_t rbd_parent_show(struct device *dev, 353186b00e0dSAlex Elder struct device_attribute *attr, 353286b00e0dSAlex Elder char *buf) 353386b00e0dSAlex Elder { 353486b00e0dSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 353586b00e0dSAlex Elder struct rbd_spec *spec = rbd_dev->parent_spec; 353686b00e0dSAlex Elder int count; 353786b00e0dSAlex Elder char *bufp = buf; 353886b00e0dSAlex Elder 353986b00e0dSAlex Elder if (!spec) 354086b00e0dSAlex Elder return sprintf(buf, "(no parent image)\n"); 354186b00e0dSAlex Elder 354286b00e0dSAlex Elder count = sprintf(bufp, "pool_id %llu\npool_name %s\n", 354386b00e0dSAlex Elder (unsigned long long) spec->pool_id, spec->pool_name); 354486b00e0dSAlex Elder if (count < 0) 354586b00e0dSAlex Elder return count; 354686b00e0dSAlex Elder bufp += count; 354786b00e0dSAlex Elder 354886b00e0dSAlex Elder count = sprintf(bufp, "image_id %s\nimage_name %s\n", spec->image_id, 354986b00e0dSAlex Elder spec->image_name ? 
spec->image_name : "(unknown)"); 355086b00e0dSAlex Elder if (count < 0) 355186b00e0dSAlex Elder return count; 355286b00e0dSAlex Elder bufp += count; 355386b00e0dSAlex Elder 355486b00e0dSAlex Elder count = sprintf(bufp, "snap_id %llu\nsnap_name %s\n", 355586b00e0dSAlex Elder (unsigned long long) spec->snap_id, spec->snap_name); 355686b00e0dSAlex Elder if (count < 0) 355786b00e0dSAlex Elder return count; 355886b00e0dSAlex Elder bufp += count; 355986b00e0dSAlex Elder 356086b00e0dSAlex Elder count = sprintf(bufp, "overlap %llu\n", rbd_dev->parent_overlap); 356186b00e0dSAlex Elder if (count < 0) 356286b00e0dSAlex Elder return count; 356386b00e0dSAlex Elder bufp += count; 356486b00e0dSAlex Elder 356586b00e0dSAlex Elder return (ssize_t) (bufp - buf); 356686b00e0dSAlex Elder } 356786b00e0dSAlex Elder 3568dfc5606dSYehuda Sadeh static ssize_t rbd_image_refresh(struct device *dev, 3569dfc5606dSYehuda Sadeh struct device_attribute *attr, 3570dfc5606dSYehuda Sadeh const char *buf, 3571dfc5606dSYehuda Sadeh size_t size) 3572dfc5606dSYehuda Sadeh { 3573593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 3574b813623aSAlex Elder int ret; 3575602adf40SYehuda Sadeh 3576cc4a38bdSAlex Elder ret = rbd_dev_refresh(rbd_dev); 3577e627db08SAlex Elder if (ret) 3578e627db08SAlex Elder rbd_warn(rbd_dev, ": manual header refresh error (%d)\n", ret); 3579b813623aSAlex Elder 3580b813623aSAlex Elder return ret < 0 ? 
ret : size; 3581dfc5606dSYehuda Sadeh } 3582602adf40SYehuda Sadeh 3583dfc5606dSYehuda Sadeh static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL); 358434b13184SAlex Elder static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL); 3585dfc5606dSYehuda Sadeh static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL); 3586dfc5606dSYehuda Sadeh static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL); 3587dfc5606dSYehuda Sadeh static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL); 35889bb2f334SAlex Elder static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL); 3589dfc5606dSYehuda Sadeh static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL); 3590589d30e0SAlex Elder static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL); 3591dfc5606dSYehuda Sadeh static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh); 3592dfc5606dSYehuda Sadeh static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL); 359386b00e0dSAlex Elder static DEVICE_ATTR(parent, S_IRUGO, rbd_parent_show, NULL); 3594dfc5606dSYehuda Sadeh 3595dfc5606dSYehuda Sadeh static struct attribute *rbd_attrs[] = { 3596dfc5606dSYehuda Sadeh &dev_attr_size.attr, 359734b13184SAlex Elder &dev_attr_features.attr, 3598dfc5606dSYehuda Sadeh &dev_attr_major.attr, 3599dfc5606dSYehuda Sadeh &dev_attr_client_id.attr, 3600dfc5606dSYehuda Sadeh &dev_attr_pool.attr, 36019bb2f334SAlex Elder &dev_attr_pool_id.attr, 3602dfc5606dSYehuda Sadeh &dev_attr_name.attr, 3603589d30e0SAlex Elder &dev_attr_image_id.attr, 3604dfc5606dSYehuda Sadeh &dev_attr_current_snap.attr, 360586b00e0dSAlex Elder &dev_attr_parent.attr, 3606dfc5606dSYehuda Sadeh &dev_attr_refresh.attr, 3607dfc5606dSYehuda Sadeh NULL 3608dfc5606dSYehuda Sadeh }; 3609dfc5606dSYehuda Sadeh 3610dfc5606dSYehuda Sadeh static struct attribute_group rbd_attr_group = { 3611dfc5606dSYehuda Sadeh .attrs = rbd_attrs, 3612dfc5606dSYehuda Sadeh }; 3613dfc5606dSYehuda Sadeh 3614dfc5606dSYehuda Sadeh static const struct attribute_group *rbd_attr_groups[] = { 
3615dfc5606dSYehuda Sadeh &rbd_attr_group, 3616dfc5606dSYehuda Sadeh NULL 3617dfc5606dSYehuda Sadeh }; 3618dfc5606dSYehuda Sadeh 3619dfc5606dSYehuda Sadeh static void rbd_sysfs_dev_release(struct device *dev) 3620dfc5606dSYehuda Sadeh { 3621dfc5606dSYehuda Sadeh } 3622dfc5606dSYehuda Sadeh 3623dfc5606dSYehuda Sadeh static struct device_type rbd_device_type = { 3624dfc5606dSYehuda Sadeh .name = "rbd", 3625dfc5606dSYehuda Sadeh .groups = rbd_attr_groups, 3626dfc5606dSYehuda Sadeh .release = rbd_sysfs_dev_release, 3627dfc5606dSYehuda Sadeh }; 3628dfc5606dSYehuda Sadeh 36298b8fb99cSAlex Elder static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec) 36308b8fb99cSAlex Elder { 36318b8fb99cSAlex Elder kref_get(&spec->kref); 36328b8fb99cSAlex Elder 36338b8fb99cSAlex Elder return spec; 36348b8fb99cSAlex Elder } 36358b8fb99cSAlex Elder 36368b8fb99cSAlex Elder static void rbd_spec_free(struct kref *kref); 36378b8fb99cSAlex Elder static void rbd_spec_put(struct rbd_spec *spec) 36388b8fb99cSAlex Elder { 36398b8fb99cSAlex Elder if (spec) 36408b8fb99cSAlex Elder kref_put(&spec->kref, rbd_spec_free); 36418b8fb99cSAlex Elder } 36428b8fb99cSAlex Elder 36438b8fb99cSAlex Elder static struct rbd_spec *rbd_spec_alloc(void) 36448b8fb99cSAlex Elder { 36458b8fb99cSAlex Elder struct rbd_spec *spec; 36468b8fb99cSAlex Elder 36478b8fb99cSAlex Elder spec = kzalloc(sizeof (*spec), GFP_KERNEL); 36488b8fb99cSAlex Elder if (!spec) 36498b8fb99cSAlex Elder return NULL; 36508b8fb99cSAlex Elder kref_init(&spec->kref); 36518b8fb99cSAlex Elder 36528b8fb99cSAlex Elder return spec; 36538b8fb99cSAlex Elder } 36548b8fb99cSAlex Elder 36558b8fb99cSAlex Elder static void rbd_spec_free(struct kref *kref) 36568b8fb99cSAlex Elder { 36578b8fb99cSAlex Elder struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref); 36588b8fb99cSAlex Elder 36598b8fb99cSAlex Elder kfree(spec->pool_name); 36608b8fb99cSAlex Elder kfree(spec->image_id); 36618b8fb99cSAlex Elder kfree(spec->image_name); 36628b8fb99cSAlex Elder 
kfree(spec->snap_name); 36638b8fb99cSAlex Elder kfree(spec); 36648b8fb99cSAlex Elder } 36658b8fb99cSAlex Elder 3666cc344fa1SAlex Elder static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc, 3667c53d5893SAlex Elder struct rbd_spec *spec) 3668c53d5893SAlex Elder { 3669c53d5893SAlex Elder struct rbd_device *rbd_dev; 3670c53d5893SAlex Elder 3671c53d5893SAlex Elder rbd_dev = kzalloc(sizeof (*rbd_dev), GFP_KERNEL); 3672c53d5893SAlex Elder if (!rbd_dev) 3673c53d5893SAlex Elder return NULL; 3674c53d5893SAlex Elder 3675c53d5893SAlex Elder spin_lock_init(&rbd_dev->lock); 36766d292906SAlex Elder rbd_dev->flags = 0; 3677a2acd00eSAlex Elder atomic_set(&rbd_dev->parent_ref, 0); 3678c53d5893SAlex Elder INIT_LIST_HEAD(&rbd_dev->node); 3679c53d5893SAlex Elder init_rwsem(&rbd_dev->header_rwsem); 3680c53d5893SAlex Elder 3681c53d5893SAlex Elder rbd_dev->spec = spec; 3682c53d5893SAlex Elder rbd_dev->rbd_client = rbdc; 3683c53d5893SAlex Elder 36840903e875SAlex Elder /* Initialize the layout used for all rbd requests */ 36850903e875SAlex Elder 36860903e875SAlex Elder rbd_dev->layout.fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER); 36870903e875SAlex Elder rbd_dev->layout.fl_stripe_count = cpu_to_le32(1); 36880903e875SAlex Elder rbd_dev->layout.fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER); 36890903e875SAlex Elder rbd_dev->layout.fl_pg_pool = cpu_to_le32((u32) spec->pool_id); 36900903e875SAlex Elder 3691c53d5893SAlex Elder return rbd_dev; 3692c53d5893SAlex Elder } 3693c53d5893SAlex Elder 3694c53d5893SAlex Elder static void rbd_dev_destroy(struct rbd_device *rbd_dev) 3695c53d5893SAlex Elder { 3696c53d5893SAlex Elder rbd_put_client(rbd_dev->rbd_client); 3697c53d5893SAlex Elder rbd_spec_put(rbd_dev->spec); 3698c53d5893SAlex Elder kfree(rbd_dev); 3699c53d5893SAlex Elder } 3700c53d5893SAlex Elder 3701dfc5606dSYehuda Sadeh /* 37029d475de5SAlex Elder * Get the size and object order for an image snapshot, or if 37039d475de5SAlex Elder * snap_id is CEPH_NOSNAP, gets this 
information for the base 37049d475de5SAlex Elder * image. 37059d475de5SAlex Elder */ 37069d475de5SAlex Elder static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id, 37079d475de5SAlex Elder u8 *order, u64 *snap_size) 37089d475de5SAlex Elder { 37099d475de5SAlex Elder __le64 snapid = cpu_to_le64(snap_id); 37109d475de5SAlex Elder int ret; 37119d475de5SAlex Elder struct { 37129d475de5SAlex Elder u8 order; 37139d475de5SAlex Elder __le64 size; 37149d475de5SAlex Elder } __attribute__ ((packed)) size_buf = { 0 }; 37159d475de5SAlex Elder 371636be9a76SAlex Elder ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, 37179d475de5SAlex Elder "rbd", "get_size", 37184157976bSAlex Elder &snapid, sizeof (snapid), 3719e2a58ee5SAlex Elder &size_buf, sizeof (size_buf)); 372036be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 37219d475de5SAlex Elder if (ret < 0) 37229d475de5SAlex Elder return ret; 372357385b51SAlex Elder if (ret < sizeof (size_buf)) 372457385b51SAlex Elder return -ERANGE; 37259d475de5SAlex Elder 3726c3545579SJosh Durgin if (order) { 37279d475de5SAlex Elder *order = size_buf.order; 3728c3545579SJosh Durgin dout(" order %u", (unsigned int)*order); 3729c3545579SJosh Durgin } 37309d475de5SAlex Elder *snap_size = le64_to_cpu(size_buf.size); 37319d475de5SAlex Elder 3732c3545579SJosh Durgin dout(" snap_id 0x%016llx snap_size = %llu\n", 3733c3545579SJosh Durgin (unsigned long long)snap_id, 37349d475de5SAlex Elder (unsigned long long)*snap_size); 37359d475de5SAlex Elder 37369d475de5SAlex Elder return 0; 37379d475de5SAlex Elder } 37389d475de5SAlex Elder 37399d475de5SAlex Elder static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev) 37409d475de5SAlex Elder { 37419d475de5SAlex Elder return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP, 37429d475de5SAlex Elder &rbd_dev->header.obj_order, 37439d475de5SAlex Elder &rbd_dev->header.image_size); 37449d475de5SAlex Elder } 37459d475de5SAlex Elder 37461e130199SAlex Elder static int 
rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev) 37471e130199SAlex Elder { 37481e130199SAlex Elder void *reply_buf; 37491e130199SAlex Elder int ret; 37501e130199SAlex Elder void *p; 37511e130199SAlex Elder 37521e130199SAlex Elder reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL); 37531e130199SAlex Elder if (!reply_buf) 37541e130199SAlex Elder return -ENOMEM; 37551e130199SAlex Elder 375636be9a76SAlex Elder ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, 37574157976bSAlex Elder "rbd", "get_object_prefix", NULL, 0, 3758e2a58ee5SAlex Elder reply_buf, RBD_OBJ_PREFIX_LEN_MAX); 375936be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 37601e130199SAlex Elder if (ret < 0) 37611e130199SAlex Elder goto out; 37621e130199SAlex Elder 37631e130199SAlex Elder p = reply_buf; 37641e130199SAlex Elder rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p, 376557385b51SAlex Elder p + ret, NULL, GFP_NOIO); 376657385b51SAlex Elder ret = 0; 37671e130199SAlex Elder 37681e130199SAlex Elder if (IS_ERR(rbd_dev->header.object_prefix)) { 37691e130199SAlex Elder ret = PTR_ERR(rbd_dev->header.object_prefix); 37701e130199SAlex Elder rbd_dev->header.object_prefix = NULL; 37711e130199SAlex Elder } else { 37721e130199SAlex Elder dout(" object_prefix = %s\n", rbd_dev->header.object_prefix); 37731e130199SAlex Elder } 37741e130199SAlex Elder out: 37751e130199SAlex Elder kfree(reply_buf); 37761e130199SAlex Elder 37771e130199SAlex Elder return ret; 37781e130199SAlex Elder } 37791e130199SAlex Elder 3780b1b5402aSAlex Elder static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id, 3781b1b5402aSAlex Elder u64 *snap_features) 3782b1b5402aSAlex Elder { 3783b1b5402aSAlex Elder __le64 snapid = cpu_to_le64(snap_id); 3784b1b5402aSAlex Elder struct { 3785b1b5402aSAlex Elder __le64 features; 3786b1b5402aSAlex Elder __le64 incompat; 37874157976bSAlex Elder } __attribute__ ((packed)) features_buf = { 0 }; 3788d889140cSAlex Elder u64 incompat; 
3789b1b5402aSAlex Elder int ret; 3790b1b5402aSAlex Elder 379136be9a76SAlex Elder ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, 3792b1b5402aSAlex Elder "rbd", "get_features", 37934157976bSAlex Elder &snapid, sizeof (snapid), 3794e2a58ee5SAlex Elder &features_buf, sizeof (features_buf)); 379536be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 3796b1b5402aSAlex Elder if (ret < 0) 3797b1b5402aSAlex Elder return ret; 379857385b51SAlex Elder if (ret < sizeof (features_buf)) 379957385b51SAlex Elder return -ERANGE; 3800d889140cSAlex Elder 3801d889140cSAlex Elder incompat = le64_to_cpu(features_buf.incompat); 38025cbf6f12SAlex Elder if (incompat & ~RBD_FEATURES_SUPPORTED) 3803b8f5c6edSAlex Elder return -ENXIO; 3804d889140cSAlex Elder 3805b1b5402aSAlex Elder *snap_features = le64_to_cpu(features_buf.features); 3806b1b5402aSAlex Elder 3807b1b5402aSAlex Elder dout(" snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n", 3808b1b5402aSAlex Elder (unsigned long long)snap_id, 3809b1b5402aSAlex Elder (unsigned long long)*snap_features, 3810b1b5402aSAlex Elder (unsigned long long)le64_to_cpu(features_buf.incompat)); 3811b1b5402aSAlex Elder 3812b1b5402aSAlex Elder return 0; 3813b1b5402aSAlex Elder } 3814b1b5402aSAlex Elder 3815b1b5402aSAlex Elder static int rbd_dev_v2_features(struct rbd_device *rbd_dev) 3816b1b5402aSAlex Elder { 3817b1b5402aSAlex Elder return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP, 3818b1b5402aSAlex Elder &rbd_dev->header.features); 3819b1b5402aSAlex Elder } 3820b1b5402aSAlex Elder 382186b00e0dSAlex Elder static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev) 382286b00e0dSAlex Elder { 382386b00e0dSAlex Elder struct rbd_spec *parent_spec; 382486b00e0dSAlex Elder size_t size; 382586b00e0dSAlex Elder void *reply_buf = NULL; 382686b00e0dSAlex Elder __le64 snapid; 382786b00e0dSAlex Elder void *p; 382886b00e0dSAlex Elder void *end; 3829642a2537SAlex Elder u64 pool_id; 383086b00e0dSAlex Elder char *image_id; 
38313b5cf2a2SAlex Elder u64 snap_id; 383286b00e0dSAlex Elder u64 overlap; 383386b00e0dSAlex Elder int ret; 383486b00e0dSAlex Elder 383586b00e0dSAlex Elder parent_spec = rbd_spec_alloc(); 383686b00e0dSAlex Elder if (!parent_spec) 383786b00e0dSAlex Elder return -ENOMEM; 383886b00e0dSAlex Elder 383986b00e0dSAlex Elder size = sizeof (__le64) + /* pool_id */ 384086b00e0dSAlex Elder sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX + /* image_id */ 384186b00e0dSAlex Elder sizeof (__le64) + /* snap_id */ 384286b00e0dSAlex Elder sizeof (__le64); /* overlap */ 384386b00e0dSAlex Elder reply_buf = kmalloc(size, GFP_KERNEL); 384486b00e0dSAlex Elder if (!reply_buf) { 384586b00e0dSAlex Elder ret = -ENOMEM; 384686b00e0dSAlex Elder goto out_err; 384786b00e0dSAlex Elder } 384886b00e0dSAlex Elder 384986b00e0dSAlex Elder snapid = cpu_to_le64(CEPH_NOSNAP); 385036be9a76SAlex Elder ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, 385186b00e0dSAlex Elder "rbd", "get_parent", 38524157976bSAlex Elder &snapid, sizeof (snapid), 3853e2a58ee5SAlex Elder reply_buf, size); 385436be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 385586b00e0dSAlex Elder if (ret < 0) 385686b00e0dSAlex Elder goto out_err; 385786b00e0dSAlex Elder 385886b00e0dSAlex Elder p = reply_buf; 385957385b51SAlex Elder end = reply_buf + ret; 386057385b51SAlex Elder ret = -ERANGE; 3861642a2537SAlex Elder ceph_decode_64_safe(&p, end, pool_id, out_err); 3862392a9dadSAlex Elder if (pool_id == CEPH_NOPOOL) { 3863392a9dadSAlex Elder /* 3864392a9dadSAlex Elder * Either the parent never existed, or we have 3865392a9dadSAlex Elder * record of it but the image got flattened so it no 3866392a9dadSAlex Elder * longer has a parent. When the parent of a 3867392a9dadSAlex Elder * layered image disappears we immediately set the 3868392a9dadSAlex Elder * overlap to 0. The effect of this is that all new 3869392a9dadSAlex Elder * requests will be treated as if the image had no 3870392a9dadSAlex Elder * parent. 
3871392a9dadSAlex Elder */ 3872392a9dadSAlex Elder if (rbd_dev->parent_overlap) { 3873392a9dadSAlex Elder rbd_dev->parent_overlap = 0; 3874392a9dadSAlex Elder smp_mb(); 3875392a9dadSAlex Elder rbd_dev_parent_put(rbd_dev); 3876392a9dadSAlex Elder pr_info("%s: clone image has been flattened\n", 3877392a9dadSAlex Elder rbd_dev->disk->disk_name); 3878392a9dadSAlex Elder } 3879392a9dadSAlex Elder 388086b00e0dSAlex Elder goto out; /* No parent? No problem. */ 3881392a9dadSAlex Elder } 388286b00e0dSAlex Elder 38830903e875SAlex Elder /* The ceph file layout needs to fit pool id in 32 bits */ 38840903e875SAlex Elder 38850903e875SAlex Elder ret = -EIO; 3886642a2537SAlex Elder if (pool_id > (u64)U32_MAX) { 3887c0cd10dbSAlex Elder rbd_warn(NULL, "parent pool id too large (%llu > %u)\n", 3888642a2537SAlex Elder (unsigned long long)pool_id, U32_MAX); 388957385b51SAlex Elder goto out_err; 3890c0cd10dbSAlex Elder } 38910903e875SAlex Elder 3892979ed480SAlex Elder image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL); 389386b00e0dSAlex Elder if (IS_ERR(image_id)) { 389486b00e0dSAlex Elder ret = PTR_ERR(image_id); 389586b00e0dSAlex Elder goto out_err; 389686b00e0dSAlex Elder } 38973b5cf2a2SAlex Elder ceph_decode_64_safe(&p, end, snap_id, out_err); 389886b00e0dSAlex Elder ceph_decode_64_safe(&p, end, overlap, out_err); 389986b00e0dSAlex Elder 39003b5cf2a2SAlex Elder /* 39013b5cf2a2SAlex Elder * The parent won't change (except when the clone is 39023b5cf2a2SAlex Elder * flattened, already handled that). So we only need to 39033b5cf2a2SAlex Elder * record the parent spec we have not already done so. 
39043b5cf2a2SAlex Elder */ 39053b5cf2a2SAlex Elder if (!rbd_dev->parent_spec) { 39063b5cf2a2SAlex Elder parent_spec->pool_id = pool_id; 39073b5cf2a2SAlex Elder parent_spec->image_id = image_id; 39083b5cf2a2SAlex Elder parent_spec->snap_id = snap_id; 390986b00e0dSAlex Elder rbd_dev->parent_spec = parent_spec; 391086b00e0dSAlex Elder parent_spec = NULL; /* rbd_dev now owns this */ 39113b5cf2a2SAlex Elder } 39123b5cf2a2SAlex Elder 39133b5cf2a2SAlex Elder /* 39143b5cf2a2SAlex Elder * We always update the parent overlap. If it's zero we 39153b5cf2a2SAlex Elder * treat it specially. 39163b5cf2a2SAlex Elder */ 391770cf49cfSAlex Elder rbd_dev->parent_overlap = overlap; 39183b5cf2a2SAlex Elder smp_mb(); 39193b5cf2a2SAlex Elder if (!overlap) { 39203b5cf2a2SAlex Elder 39213b5cf2a2SAlex Elder /* A null parent_spec indicates it's the initial probe */ 39223b5cf2a2SAlex Elder 39233b5cf2a2SAlex Elder if (parent_spec) { 39243b5cf2a2SAlex Elder /* 39253b5cf2a2SAlex Elder * The overlap has become zero, so the clone 39263b5cf2a2SAlex Elder * must have been resized down to 0 at some 39273b5cf2a2SAlex Elder * point. Treat this the same as a flatten. 39283b5cf2a2SAlex Elder */ 39293b5cf2a2SAlex Elder rbd_dev_parent_put(rbd_dev); 39303b5cf2a2SAlex Elder pr_info("%s: clone image now standalone\n", 39313b5cf2a2SAlex Elder rbd_dev->disk->disk_name); 393270cf49cfSAlex Elder } else { 39333b5cf2a2SAlex Elder /* 39343b5cf2a2SAlex Elder * For the initial probe, if we find the 39353b5cf2a2SAlex Elder * overlap is zero we just pretend there was 39363b5cf2a2SAlex Elder * no parent image. 
39373b5cf2a2SAlex Elder */ 39383b5cf2a2SAlex Elder rbd_warn(rbd_dev, "ignoring parent of " 39393b5cf2a2SAlex Elder "clone with overlap 0\n"); 39403b5cf2a2SAlex Elder } 394170cf49cfSAlex Elder } 394286b00e0dSAlex Elder out: 394386b00e0dSAlex Elder ret = 0; 394486b00e0dSAlex Elder out_err: 394586b00e0dSAlex Elder kfree(reply_buf); 394686b00e0dSAlex Elder rbd_spec_put(parent_spec); 394786b00e0dSAlex Elder 394886b00e0dSAlex Elder return ret; 394986b00e0dSAlex Elder } 395086b00e0dSAlex Elder 3951cc070d59SAlex Elder static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev) 3952cc070d59SAlex Elder { 3953cc070d59SAlex Elder struct { 3954cc070d59SAlex Elder __le64 stripe_unit; 3955cc070d59SAlex Elder __le64 stripe_count; 3956cc070d59SAlex Elder } __attribute__ ((packed)) striping_info_buf = { 0 }; 3957cc070d59SAlex Elder size_t size = sizeof (striping_info_buf); 3958cc070d59SAlex Elder void *p; 3959cc070d59SAlex Elder u64 obj_size; 3960cc070d59SAlex Elder u64 stripe_unit; 3961cc070d59SAlex Elder u64 stripe_count; 3962cc070d59SAlex Elder int ret; 3963cc070d59SAlex Elder 3964cc070d59SAlex Elder ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, 3965cc070d59SAlex Elder "rbd", "get_stripe_unit_count", NULL, 0, 3966e2a58ee5SAlex Elder (char *)&striping_info_buf, size); 3967cc070d59SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 3968cc070d59SAlex Elder if (ret < 0) 3969cc070d59SAlex Elder return ret; 3970cc070d59SAlex Elder if (ret < size) 3971cc070d59SAlex Elder return -ERANGE; 3972cc070d59SAlex Elder 3973cc070d59SAlex Elder /* 3974cc070d59SAlex Elder * We don't actually support the "fancy striping" feature 3975cc070d59SAlex Elder * (STRIPINGV2) yet, but if the striping sizes are the 3976cc070d59SAlex Elder * defaults the behavior is the same as before. So find 3977cc070d59SAlex Elder * out, and only fail if the image has non-default values. 
3978cc070d59SAlex Elder */ 3979cc070d59SAlex Elder ret = -EINVAL; 3980cc070d59SAlex Elder obj_size = (u64)1 << rbd_dev->header.obj_order; 3981cc070d59SAlex Elder p = &striping_info_buf; 3982cc070d59SAlex Elder stripe_unit = ceph_decode_64(&p); 3983cc070d59SAlex Elder if (stripe_unit != obj_size) { 3984cc070d59SAlex Elder rbd_warn(rbd_dev, "unsupported stripe unit " 3985cc070d59SAlex Elder "(got %llu want %llu)", 3986cc070d59SAlex Elder stripe_unit, obj_size); 3987cc070d59SAlex Elder return -EINVAL; 3988cc070d59SAlex Elder } 3989cc070d59SAlex Elder stripe_count = ceph_decode_64(&p); 3990cc070d59SAlex Elder if (stripe_count != 1) { 3991cc070d59SAlex Elder rbd_warn(rbd_dev, "unsupported stripe count " 3992cc070d59SAlex Elder "(got %llu want 1)", stripe_count); 3993cc070d59SAlex Elder return -EINVAL; 3994cc070d59SAlex Elder } 3995500d0c0fSAlex Elder rbd_dev->header.stripe_unit = stripe_unit; 3996500d0c0fSAlex Elder rbd_dev->header.stripe_count = stripe_count; 3997cc070d59SAlex Elder 3998cc070d59SAlex Elder return 0; 3999cc070d59SAlex Elder } 4000cc070d59SAlex Elder 40019e15b77dSAlex Elder static char *rbd_dev_image_name(struct rbd_device *rbd_dev) 40029e15b77dSAlex Elder { 40039e15b77dSAlex Elder size_t image_id_size; 40049e15b77dSAlex Elder char *image_id; 40059e15b77dSAlex Elder void *p; 40069e15b77dSAlex Elder void *end; 40079e15b77dSAlex Elder size_t size; 40089e15b77dSAlex Elder void *reply_buf = NULL; 40099e15b77dSAlex Elder size_t len = 0; 40109e15b77dSAlex Elder char *image_name = NULL; 40119e15b77dSAlex Elder int ret; 40129e15b77dSAlex Elder 40139e15b77dSAlex Elder rbd_assert(!rbd_dev->spec->image_name); 40149e15b77dSAlex Elder 401569e7a02fSAlex Elder len = strlen(rbd_dev->spec->image_id); 401669e7a02fSAlex Elder image_id_size = sizeof (__le32) + len; 40179e15b77dSAlex Elder image_id = kmalloc(image_id_size, GFP_KERNEL); 40189e15b77dSAlex Elder if (!image_id) 40199e15b77dSAlex Elder return NULL; 40209e15b77dSAlex Elder 40219e15b77dSAlex Elder p = image_id; 
40224157976bSAlex Elder end = image_id + image_id_size; 402369e7a02fSAlex Elder ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32)len); 40249e15b77dSAlex Elder 40259e15b77dSAlex Elder size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX; 40269e15b77dSAlex Elder reply_buf = kmalloc(size, GFP_KERNEL); 40279e15b77dSAlex Elder if (!reply_buf) 40289e15b77dSAlex Elder goto out; 40299e15b77dSAlex Elder 403036be9a76SAlex Elder ret = rbd_obj_method_sync(rbd_dev, RBD_DIRECTORY, 40319e15b77dSAlex Elder "rbd", "dir_get_name", 40329e15b77dSAlex Elder image_id, image_id_size, 4033e2a58ee5SAlex Elder reply_buf, size); 40349e15b77dSAlex Elder if (ret < 0) 40359e15b77dSAlex Elder goto out; 40369e15b77dSAlex Elder p = reply_buf; 4037f40eb349SAlex Elder end = reply_buf + ret; 4038f40eb349SAlex Elder 40399e15b77dSAlex Elder image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL); 40409e15b77dSAlex Elder if (IS_ERR(image_name)) 40419e15b77dSAlex Elder image_name = NULL; 40429e15b77dSAlex Elder else 40439e15b77dSAlex Elder dout("%s: name is %s len is %zd\n", __func__, image_name, len); 40449e15b77dSAlex Elder out: 40459e15b77dSAlex Elder kfree(reply_buf); 40469e15b77dSAlex Elder kfree(image_id); 40479e15b77dSAlex Elder 40489e15b77dSAlex Elder return image_name; 40499e15b77dSAlex Elder } 40509e15b77dSAlex Elder 40512ad3d716SAlex Elder static u64 rbd_v1_snap_id_by_name(struct rbd_device *rbd_dev, const char *name) 40522ad3d716SAlex Elder { 40532ad3d716SAlex Elder struct ceph_snap_context *snapc = rbd_dev->header.snapc; 40542ad3d716SAlex Elder const char *snap_name; 40552ad3d716SAlex Elder u32 which = 0; 40562ad3d716SAlex Elder 40572ad3d716SAlex Elder /* Skip over names until we find the one we are looking for */ 40582ad3d716SAlex Elder 40592ad3d716SAlex Elder snap_name = rbd_dev->header.snap_names; 40602ad3d716SAlex Elder while (which < snapc->num_snaps) { 40612ad3d716SAlex Elder if (!strcmp(name, snap_name)) 40622ad3d716SAlex Elder return snapc->snaps[which]; 
40632ad3d716SAlex Elder snap_name += strlen(snap_name) + 1; 40642ad3d716SAlex Elder which++; 40652ad3d716SAlex Elder } 40662ad3d716SAlex Elder return CEPH_NOSNAP; 40672ad3d716SAlex Elder } 40682ad3d716SAlex Elder 40692ad3d716SAlex Elder static u64 rbd_v2_snap_id_by_name(struct rbd_device *rbd_dev, const char *name) 40702ad3d716SAlex Elder { 40712ad3d716SAlex Elder struct ceph_snap_context *snapc = rbd_dev->header.snapc; 40722ad3d716SAlex Elder u32 which; 40732ad3d716SAlex Elder bool found = false; 40742ad3d716SAlex Elder u64 snap_id; 40752ad3d716SAlex Elder 40762ad3d716SAlex Elder for (which = 0; !found && which < snapc->num_snaps; which++) { 40772ad3d716SAlex Elder const char *snap_name; 40782ad3d716SAlex Elder 40792ad3d716SAlex Elder snap_id = snapc->snaps[which]; 40802ad3d716SAlex Elder snap_name = rbd_dev_v2_snap_name(rbd_dev, snap_id); 40812ad3d716SAlex Elder if (IS_ERR(snap_name)) 40822ad3d716SAlex Elder break; 40832ad3d716SAlex Elder found = !strcmp(name, snap_name); 40842ad3d716SAlex Elder kfree(snap_name); 40852ad3d716SAlex Elder } 40862ad3d716SAlex Elder return found ? snap_id : CEPH_NOSNAP; 40872ad3d716SAlex Elder } 40882ad3d716SAlex Elder 40892ad3d716SAlex Elder /* 40902ad3d716SAlex Elder * Assumes name is never RBD_SNAP_HEAD_NAME; returns CEPH_NOSNAP if 40912ad3d716SAlex Elder * no snapshot by that name is found, or if an error occurs. 40922ad3d716SAlex Elder */ 40932ad3d716SAlex Elder static u64 rbd_snap_id_by_name(struct rbd_device *rbd_dev, const char *name) 40942ad3d716SAlex Elder { 40952ad3d716SAlex Elder if (rbd_dev->image_format == 1) 40962ad3d716SAlex Elder return rbd_v1_snap_id_by_name(rbd_dev, name); 40972ad3d716SAlex Elder 40982ad3d716SAlex Elder return rbd_v2_snap_id_by_name(rbd_dev, name); 40992ad3d716SAlex Elder } 41002ad3d716SAlex Elder 41019e15b77dSAlex Elder /* 41022e9f7f1cSAlex Elder * When an rbd image has a parent image, it is identified by the 41032e9f7f1cSAlex Elder * pool, image, and snapshot ids (not names). 
This function fills 41042e9f7f1cSAlex Elder * in the names for those ids. (It's OK if we can't figure out the 41052e9f7f1cSAlex Elder * name for an image id, but the pool and snapshot ids should always 41062e9f7f1cSAlex Elder * exist and have names.) All names in an rbd spec are dynamically 41072e9f7f1cSAlex Elder * allocated. 4108e1d4213fSAlex Elder * 4109e1d4213fSAlex Elder * When an image being mapped (not a parent) is probed, we have the 4110e1d4213fSAlex Elder * pool name and pool id, image name and image id, and the snapshot 4111e1d4213fSAlex Elder * name. The only thing we're missing is the snapshot id. 41129e15b77dSAlex Elder */ 41132e9f7f1cSAlex Elder static int rbd_dev_spec_update(struct rbd_device *rbd_dev) 41149e15b77dSAlex Elder { 41152e9f7f1cSAlex Elder struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 41162e9f7f1cSAlex Elder struct rbd_spec *spec = rbd_dev->spec; 41172e9f7f1cSAlex Elder const char *pool_name; 41182e9f7f1cSAlex Elder const char *image_name; 41192e9f7f1cSAlex Elder const char *snap_name; 41209e15b77dSAlex Elder int ret; 41219e15b77dSAlex Elder 4122e1d4213fSAlex Elder /* 4123e1d4213fSAlex Elder * An image being mapped will have the pool name (etc.), but 4124e1d4213fSAlex Elder * we need to look up the snapshot id. 
4125e1d4213fSAlex Elder */ 41262e9f7f1cSAlex Elder if (spec->pool_name) { 41272e9f7f1cSAlex Elder if (strcmp(spec->snap_name, RBD_SNAP_HEAD_NAME)) { 41282ad3d716SAlex Elder u64 snap_id; 4129e1d4213fSAlex Elder 41302ad3d716SAlex Elder snap_id = rbd_snap_id_by_name(rbd_dev, spec->snap_name); 41312ad3d716SAlex Elder if (snap_id == CEPH_NOSNAP) 4132e1d4213fSAlex Elder return -ENOENT; 41332ad3d716SAlex Elder spec->snap_id = snap_id; 4134e1d4213fSAlex Elder } else { 41352e9f7f1cSAlex Elder spec->snap_id = CEPH_NOSNAP; 4136e1d4213fSAlex Elder } 4137e1d4213fSAlex Elder 4138e1d4213fSAlex Elder return 0; 4139e1d4213fSAlex Elder } 41409e15b77dSAlex Elder 41412e9f7f1cSAlex Elder /* Get the pool name; we have to make our own copy of this */ 41429e15b77dSAlex Elder 41432e9f7f1cSAlex Elder pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, spec->pool_id); 41442e9f7f1cSAlex Elder if (!pool_name) { 41452e9f7f1cSAlex Elder rbd_warn(rbd_dev, "no pool with id %llu", spec->pool_id); 4146935dc89fSAlex Elder return -EIO; 4147935dc89fSAlex Elder } 41482e9f7f1cSAlex Elder pool_name = kstrdup(pool_name, GFP_KERNEL); 41492e9f7f1cSAlex Elder if (!pool_name) 41509e15b77dSAlex Elder return -ENOMEM; 41519e15b77dSAlex Elder 41529e15b77dSAlex Elder /* Fetch the image name; tolerate failure here */ 41539e15b77dSAlex Elder 41542e9f7f1cSAlex Elder image_name = rbd_dev_image_name(rbd_dev); 41552e9f7f1cSAlex Elder if (!image_name) 415606ecc6cbSAlex Elder rbd_warn(rbd_dev, "unable to get image name"); 41579e15b77dSAlex Elder 41582e9f7f1cSAlex Elder /* Look up the snapshot name, and make a copy */ 41599e15b77dSAlex Elder 41602e9f7f1cSAlex Elder snap_name = rbd_snap_name(rbd_dev, spec->snap_id); 41612e9f7f1cSAlex Elder if (!snap_name) { 41622e9f7f1cSAlex Elder ret = -ENOMEM; 41639e15b77dSAlex Elder goto out_err; 41642e9f7f1cSAlex Elder } 41652e9f7f1cSAlex Elder 41662e9f7f1cSAlex Elder spec->pool_name = pool_name; 41672e9f7f1cSAlex Elder spec->image_name = image_name; 41682e9f7f1cSAlex Elder spec->snap_name 
= snap_name; 41699e15b77dSAlex Elder 41709e15b77dSAlex Elder return 0; 41719e15b77dSAlex Elder out_err: 41722e9f7f1cSAlex Elder kfree(image_name); 41732e9f7f1cSAlex Elder kfree(pool_name); 41749e15b77dSAlex Elder 41759e15b77dSAlex Elder return ret; 41769e15b77dSAlex Elder } 41779e15b77dSAlex Elder 4178cc4a38bdSAlex Elder static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev) 417935d489f9SAlex Elder { 418035d489f9SAlex Elder size_t size; 418135d489f9SAlex Elder int ret; 418235d489f9SAlex Elder void *reply_buf; 418335d489f9SAlex Elder void *p; 418435d489f9SAlex Elder void *end; 418535d489f9SAlex Elder u64 seq; 418635d489f9SAlex Elder u32 snap_count; 418735d489f9SAlex Elder struct ceph_snap_context *snapc; 418835d489f9SAlex Elder u32 i; 418935d489f9SAlex Elder 419035d489f9SAlex Elder /* 419135d489f9SAlex Elder * We'll need room for the seq value (maximum snapshot id), 419235d489f9SAlex Elder * snapshot count, and array of that many snapshot ids. 419335d489f9SAlex Elder * For now we have a fixed upper limit on the number we're 419435d489f9SAlex Elder * prepared to receive. 
419535d489f9SAlex Elder */ 419635d489f9SAlex Elder size = sizeof (__le64) + sizeof (__le32) + 419735d489f9SAlex Elder RBD_MAX_SNAP_COUNT * sizeof (__le64); 419835d489f9SAlex Elder reply_buf = kzalloc(size, GFP_KERNEL); 419935d489f9SAlex Elder if (!reply_buf) 420035d489f9SAlex Elder return -ENOMEM; 420135d489f9SAlex Elder 420236be9a76SAlex Elder ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, 42034157976bSAlex Elder "rbd", "get_snapcontext", NULL, 0, 4204e2a58ee5SAlex Elder reply_buf, size); 420536be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 420635d489f9SAlex Elder if (ret < 0) 420735d489f9SAlex Elder goto out; 420835d489f9SAlex Elder 420935d489f9SAlex Elder p = reply_buf; 421057385b51SAlex Elder end = reply_buf + ret; 421157385b51SAlex Elder ret = -ERANGE; 421235d489f9SAlex Elder ceph_decode_64_safe(&p, end, seq, out); 421335d489f9SAlex Elder ceph_decode_32_safe(&p, end, snap_count, out); 421435d489f9SAlex Elder 421535d489f9SAlex Elder /* 421635d489f9SAlex Elder * Make sure the reported number of snapshot ids wouldn't go 421735d489f9SAlex Elder * beyond the end of our buffer. But before checking that, 421835d489f9SAlex Elder * make sure the computed size of the snapshot context we 421935d489f9SAlex Elder * allocate is representable in a size_t. 
422035d489f9SAlex Elder */ 422135d489f9SAlex Elder if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context)) 422235d489f9SAlex Elder / sizeof (u64)) { 422335d489f9SAlex Elder ret = -EINVAL; 422435d489f9SAlex Elder goto out; 422535d489f9SAlex Elder } 422635d489f9SAlex Elder if (!ceph_has_room(&p, end, snap_count * sizeof (__le64))) 422735d489f9SAlex Elder goto out; 4228468521c1SAlex Elder ret = 0; 422935d489f9SAlex Elder 4230812164f8SAlex Elder snapc = ceph_create_snap_context(snap_count, GFP_KERNEL); 423135d489f9SAlex Elder if (!snapc) { 423235d489f9SAlex Elder ret = -ENOMEM; 423335d489f9SAlex Elder goto out; 423435d489f9SAlex Elder } 423535d489f9SAlex Elder snapc->seq = seq; 423635d489f9SAlex Elder for (i = 0; i < snap_count; i++) 423735d489f9SAlex Elder snapc->snaps[i] = ceph_decode_64(&p); 423835d489f9SAlex Elder 423949ece554SAlex Elder ceph_put_snap_context(rbd_dev->header.snapc); 424035d489f9SAlex Elder rbd_dev->header.snapc = snapc; 424135d489f9SAlex Elder 424235d489f9SAlex Elder dout(" snap context seq = %llu, snap_count = %u\n", 424335d489f9SAlex Elder (unsigned long long)seq, (unsigned int)snap_count); 424435d489f9SAlex Elder out: 424535d489f9SAlex Elder kfree(reply_buf); 424635d489f9SAlex Elder 424757385b51SAlex Elder return ret; 424835d489f9SAlex Elder } 424935d489f9SAlex Elder 425054cac61fSAlex Elder static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, 425154cac61fSAlex Elder u64 snap_id) 4252b8b1e2dbSAlex Elder { 4253b8b1e2dbSAlex Elder size_t size; 4254b8b1e2dbSAlex Elder void *reply_buf; 425554cac61fSAlex Elder __le64 snapid; 4256b8b1e2dbSAlex Elder int ret; 4257b8b1e2dbSAlex Elder void *p; 4258b8b1e2dbSAlex Elder void *end; 4259b8b1e2dbSAlex Elder char *snap_name; 4260b8b1e2dbSAlex Elder 4261b8b1e2dbSAlex Elder size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN; 4262b8b1e2dbSAlex Elder reply_buf = kmalloc(size, GFP_KERNEL); 4263b8b1e2dbSAlex Elder if (!reply_buf) 4264b8b1e2dbSAlex Elder return ERR_PTR(-ENOMEM); 4265b8b1e2dbSAlex 
Elder 426654cac61fSAlex Elder snapid = cpu_to_le64(snap_id); 426736be9a76SAlex Elder ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, 4268b8b1e2dbSAlex Elder "rbd", "get_snapshot_name", 426954cac61fSAlex Elder &snapid, sizeof (snapid), 4270e2a58ee5SAlex Elder reply_buf, size); 427136be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 4272f40eb349SAlex Elder if (ret < 0) { 4273f40eb349SAlex Elder snap_name = ERR_PTR(ret); 4274b8b1e2dbSAlex Elder goto out; 4275f40eb349SAlex Elder } 4276b8b1e2dbSAlex Elder 4277b8b1e2dbSAlex Elder p = reply_buf; 4278f40eb349SAlex Elder end = reply_buf + ret; 4279e5c35534SAlex Elder snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL); 4280f40eb349SAlex Elder if (IS_ERR(snap_name)) 4281b8b1e2dbSAlex Elder goto out; 4282f40eb349SAlex Elder 4283b8b1e2dbSAlex Elder dout(" snap_id 0x%016llx snap_name = %s\n", 428454cac61fSAlex Elder (unsigned long long)snap_id, snap_name); 4285b8b1e2dbSAlex Elder out: 4286b8b1e2dbSAlex Elder kfree(reply_buf); 4287b8b1e2dbSAlex Elder 4288f40eb349SAlex Elder return snap_name; 4289b8b1e2dbSAlex Elder } 4290b8b1e2dbSAlex Elder 42912df3fac7SAlex Elder static int rbd_dev_v2_header_info(struct rbd_device *rbd_dev) 4292117973fbSAlex Elder { 42932df3fac7SAlex Elder bool first_time = rbd_dev->header.object_prefix == NULL; 4294117973fbSAlex Elder int ret; 4295117973fbSAlex Elder 42961617e40cSJosh Durgin ret = rbd_dev_v2_image_size(rbd_dev); 42971617e40cSJosh Durgin if (ret) 4298cfbf6377SAlex Elder return ret; 42991617e40cSJosh Durgin 43002df3fac7SAlex Elder if (first_time) { 43012df3fac7SAlex Elder ret = rbd_dev_v2_header_onetime(rbd_dev); 43022df3fac7SAlex Elder if (ret) 4303cfbf6377SAlex Elder return ret; 43042df3fac7SAlex Elder } 43052df3fac7SAlex Elder 4306642a2537SAlex Elder /* 4307642a2537SAlex Elder * If the image supports layering, get the parent info. We 4308642a2537SAlex Elder * need to probe the first time regardless. 
Thereafter we 4309642a2537SAlex Elder * only need to if there's a parent, to see if it has 4310642a2537SAlex Elder * disappeared due to the mapped image getting flattened. 4311642a2537SAlex Elder */ 4312642a2537SAlex Elder if (rbd_dev->header.features & RBD_FEATURE_LAYERING && 4313642a2537SAlex Elder (first_time || rbd_dev->parent_spec)) { 4314642a2537SAlex Elder bool warn; 4315642a2537SAlex Elder 4316642a2537SAlex Elder ret = rbd_dev_v2_parent_info(rbd_dev); 4317642a2537SAlex Elder if (ret) 4318cfbf6377SAlex Elder return ret; 4319642a2537SAlex Elder 4320642a2537SAlex Elder /* 4321642a2537SAlex Elder * Print a warning if this is the initial probe and 4322642a2537SAlex Elder * the image has a parent. Don't print it if the 4323642a2537SAlex Elder * image now being probed is itself a parent. We 4324642a2537SAlex Elder * can tell at this point because we won't know its 4325642a2537SAlex Elder * pool name yet (just its pool id). 4326642a2537SAlex Elder */ 4327642a2537SAlex Elder warn = rbd_dev->parent_spec && rbd_dev->spec->pool_name; 4328642a2537SAlex Elder if (first_time && warn) 4329642a2537SAlex Elder rbd_warn(rbd_dev, "WARNING: kernel layering " 4330642a2537SAlex Elder "is EXPERIMENTAL!"); 4331642a2537SAlex Elder } 4332642a2537SAlex Elder 433329334ba4SAlex Elder if (rbd_dev->spec->snap_id == CEPH_NOSNAP) 433429334ba4SAlex Elder if (rbd_dev->mapping.size != rbd_dev->header.image_size) 433529334ba4SAlex Elder rbd_dev->mapping.size = rbd_dev->header.image_size; 4336117973fbSAlex Elder 4337cc4a38bdSAlex Elder ret = rbd_dev_v2_snap_context(rbd_dev); 4338117973fbSAlex Elder dout("rbd_dev_v2_snap_context returned %d\n", ret); 4339117973fbSAlex Elder 4340117973fbSAlex Elder return ret; 4341117973fbSAlex Elder } 4342117973fbSAlex Elder 4343dfc5606dSYehuda Sadeh static int rbd_bus_add_dev(struct rbd_device *rbd_dev) 4344dfc5606dSYehuda Sadeh { 4345dfc5606dSYehuda Sadeh struct device *dev; 4346cd789ab9SAlex Elder int ret; 4347dfc5606dSYehuda Sadeh 4348cd789ab9SAlex Elder dev 
= &rbd_dev->dev; 4349dfc5606dSYehuda Sadeh dev->bus = &rbd_bus_type; 4350dfc5606dSYehuda Sadeh dev->type = &rbd_device_type; 4351dfc5606dSYehuda Sadeh dev->parent = &rbd_root_dev; 4352200a6a8bSAlex Elder dev->release = rbd_dev_device_release; 4353de71a297SAlex Elder dev_set_name(dev, "%d", rbd_dev->dev_id); 4354dfc5606dSYehuda Sadeh ret = device_register(dev); 4355dfc5606dSYehuda Sadeh 4356dfc5606dSYehuda Sadeh return ret; 4357602adf40SYehuda Sadeh } 4358602adf40SYehuda Sadeh 4359dfc5606dSYehuda Sadeh static void rbd_bus_del_dev(struct rbd_device *rbd_dev) 4360dfc5606dSYehuda Sadeh { 4361dfc5606dSYehuda Sadeh device_unregister(&rbd_dev->dev); 4362dfc5606dSYehuda Sadeh } 4363dfc5606dSYehuda Sadeh 4364e2839308SAlex Elder static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0); 43651ddbe94eSAlex Elder 43661ddbe94eSAlex Elder /* 4367499afd5bSAlex Elder * Get a unique rbd identifier for the given new rbd_dev, and add 4368499afd5bSAlex Elder * the rbd_dev to the global list. The minimum rbd id is 1. 43691ddbe94eSAlex Elder */ 4370e2839308SAlex Elder static void rbd_dev_id_get(struct rbd_device *rbd_dev) 4371b7f23c36SAlex Elder { 4372e2839308SAlex Elder rbd_dev->dev_id = atomic64_inc_return(&rbd_dev_id_max); 4373499afd5bSAlex Elder 4374499afd5bSAlex Elder spin_lock(&rbd_dev_list_lock); 4375499afd5bSAlex Elder list_add_tail(&rbd_dev->node, &rbd_dev_list); 4376499afd5bSAlex Elder spin_unlock(&rbd_dev_list_lock); 4377e2839308SAlex Elder dout("rbd_dev %p given dev id %llu\n", rbd_dev, 4378e2839308SAlex Elder (unsigned long long) rbd_dev->dev_id); 4379b7f23c36SAlex Elder } 4380b7f23c36SAlex Elder 43811ddbe94eSAlex Elder /* 4382499afd5bSAlex Elder * Remove an rbd_dev from the global list, and record that its 4383499afd5bSAlex Elder * identifier is no longer in use. 
43841ddbe94eSAlex Elder */ 4385e2839308SAlex Elder static void rbd_dev_id_put(struct rbd_device *rbd_dev) 43861ddbe94eSAlex Elder { 4387d184f6bfSAlex Elder struct list_head *tmp; 4388de71a297SAlex Elder int rbd_id = rbd_dev->dev_id; 4389d184f6bfSAlex Elder int max_id; 4390d184f6bfSAlex Elder 4391aafb230eSAlex Elder rbd_assert(rbd_id > 0); 4392499afd5bSAlex Elder 4393e2839308SAlex Elder dout("rbd_dev %p released dev id %llu\n", rbd_dev, 4394e2839308SAlex Elder (unsigned long long) rbd_dev->dev_id); 4395499afd5bSAlex Elder spin_lock(&rbd_dev_list_lock); 4396499afd5bSAlex Elder list_del_init(&rbd_dev->node); 4397d184f6bfSAlex Elder 4398d184f6bfSAlex Elder /* 4399d184f6bfSAlex Elder * If the id being "put" is not the current maximum, there 4400d184f6bfSAlex Elder * is nothing special we need to do. 4401d184f6bfSAlex Elder */ 4402e2839308SAlex Elder if (rbd_id != atomic64_read(&rbd_dev_id_max)) { 4403d184f6bfSAlex Elder spin_unlock(&rbd_dev_list_lock); 4404d184f6bfSAlex Elder return; 4405d184f6bfSAlex Elder } 4406d184f6bfSAlex Elder 4407d184f6bfSAlex Elder /* 4408d184f6bfSAlex Elder * We need to update the current maximum id. Search the 4409d184f6bfSAlex Elder * list to find out what it is. We're more likely to find 4410d184f6bfSAlex Elder * the maximum at the end, so search the list backward. 4411d184f6bfSAlex Elder */ 4412d184f6bfSAlex Elder max_id = 0; 4413d184f6bfSAlex Elder list_for_each_prev(tmp, &rbd_dev_list) { 4414d184f6bfSAlex Elder struct rbd_device *rbd_dev; 4415d184f6bfSAlex Elder 4416d184f6bfSAlex Elder rbd_dev = list_entry(tmp, struct rbd_device, node); 4417b213e0b1SAlex Elder if (rbd_dev->dev_id > max_id) 4418b213e0b1SAlex Elder max_id = rbd_dev->dev_id; 4419d184f6bfSAlex Elder } 4420499afd5bSAlex Elder spin_unlock(&rbd_dev_list_lock); 44211ddbe94eSAlex Elder 44221ddbe94eSAlex Elder /* 4423e2839308SAlex Elder * The max id could have been updated by rbd_dev_id_get(), in 4424d184f6bfSAlex Elder * which case it now accurately reflects the new maximum. 
4425d184f6bfSAlex Elder * Be careful not to overwrite the maximum value in that 4426d184f6bfSAlex Elder * case. 44271ddbe94eSAlex Elder */ 4428e2839308SAlex Elder atomic64_cmpxchg(&rbd_dev_id_max, rbd_id, max_id); 4429e2839308SAlex Elder dout(" max dev id has been reset\n"); 4430b7f23c36SAlex Elder } 4431b7f23c36SAlex Elder 4432a725f65eSAlex Elder /* 4433e28fff26SAlex Elder * Skips over white space at *buf, and updates *buf to point to the 4434e28fff26SAlex Elder * first found non-space character (if any). Returns the length of 4435593a9e7bSAlex Elder * the token (string of non-white space characters) found. Note 4436593a9e7bSAlex Elder * that *buf must be terminated with '\0'. 4437e28fff26SAlex Elder */ 4438e28fff26SAlex Elder static inline size_t next_token(const char **buf) 4439e28fff26SAlex Elder { 4440e28fff26SAlex Elder /* 4441e28fff26SAlex Elder * These are the characters that produce nonzero for 4442e28fff26SAlex Elder * isspace() in the "C" and "POSIX" locales. 4443e28fff26SAlex Elder */ 4444e28fff26SAlex Elder const char *spaces = " \f\n\r\t\v"; 4445e28fff26SAlex Elder 4446e28fff26SAlex Elder *buf += strspn(*buf, spaces); /* Find start of token */ 4447e28fff26SAlex Elder 4448e28fff26SAlex Elder return strcspn(*buf, spaces); /* Return token length */ 4449e28fff26SAlex Elder } 4450e28fff26SAlex Elder 4451e28fff26SAlex Elder /* 4452e28fff26SAlex Elder * Finds the next token in *buf, and if the provided token buffer is 4453e28fff26SAlex Elder * big enough, copies the found token into it. The result, if 4454593a9e7bSAlex Elder * copied, is guaranteed to be terminated with '\0'. Note that *buf 4455593a9e7bSAlex Elder * must be terminated with '\0' on entry. 4456e28fff26SAlex Elder * 4457e28fff26SAlex Elder * Returns the length of the token found (not including the '\0'). 4458e28fff26SAlex Elder * Return value will be 0 if no token is found, and it will be >= 4459e28fff26SAlex Elder * token_size if the token would not fit. 
4460e28fff26SAlex Elder * 4461593a9e7bSAlex Elder * The *buf pointer will be updated to point beyond the end of the 4462e28fff26SAlex Elder * found token. Note that this occurs even if the token buffer is 4463e28fff26SAlex Elder * too small to hold it. 4464e28fff26SAlex Elder */ 4465e28fff26SAlex Elder static inline size_t copy_token(const char **buf, 4466e28fff26SAlex Elder char *token, 4467e28fff26SAlex Elder size_t token_size) 4468e28fff26SAlex Elder { 4469e28fff26SAlex Elder size_t len; 4470e28fff26SAlex Elder 4471e28fff26SAlex Elder len = next_token(buf); 4472e28fff26SAlex Elder if (len < token_size) { 4473e28fff26SAlex Elder memcpy(token, *buf, len); 4474e28fff26SAlex Elder *(token + len) = '\0'; 4475e28fff26SAlex Elder } 4476e28fff26SAlex Elder *buf += len; 4477e28fff26SAlex Elder 4478e28fff26SAlex Elder return len; 4479e28fff26SAlex Elder } 4480e28fff26SAlex Elder 4481e28fff26SAlex Elder /* 4482ea3352f4SAlex Elder * Finds the next token in *buf, dynamically allocates a buffer big 4483ea3352f4SAlex Elder * enough to hold a copy of it, and copies the token into the new 4484ea3352f4SAlex Elder * buffer. The copy is guaranteed to be terminated with '\0'. Note 4485ea3352f4SAlex Elder * that a duplicate buffer is created even for a zero-length token. 4486ea3352f4SAlex Elder * 4487ea3352f4SAlex Elder * Returns a pointer to the newly-allocated duplicate, or a null 4488ea3352f4SAlex Elder * pointer if memory for the duplicate was not available. If 4489ea3352f4SAlex Elder * the lenp argument is a non-null pointer, the length of the token 4490ea3352f4SAlex Elder * (not including the '\0') is returned in *lenp. 4491ea3352f4SAlex Elder * 4492ea3352f4SAlex Elder * If successful, the *buf pointer will be updated to point beyond 4493ea3352f4SAlex Elder * the end of the found token. 4494ea3352f4SAlex Elder * 4495ea3352f4SAlex Elder * Note: uses GFP_KERNEL for allocation. 
4496ea3352f4SAlex Elder */ 4497ea3352f4SAlex Elder static inline char *dup_token(const char **buf, size_t *lenp) 4498ea3352f4SAlex Elder { 4499ea3352f4SAlex Elder char *dup; 4500ea3352f4SAlex Elder size_t len; 4501ea3352f4SAlex Elder 4502ea3352f4SAlex Elder len = next_token(buf); 45034caf35f9SAlex Elder dup = kmemdup(*buf, len + 1, GFP_KERNEL); 4504ea3352f4SAlex Elder if (!dup) 4505ea3352f4SAlex Elder return NULL; 4506ea3352f4SAlex Elder *(dup + len) = '\0'; 4507ea3352f4SAlex Elder *buf += len; 4508ea3352f4SAlex Elder 4509ea3352f4SAlex Elder if (lenp) 4510ea3352f4SAlex Elder *lenp = len; 4511ea3352f4SAlex Elder 4512ea3352f4SAlex Elder return dup; 4513ea3352f4SAlex Elder } 4514ea3352f4SAlex Elder 4515ea3352f4SAlex Elder /* 4516859c31dfSAlex Elder * Parse the options provided for an "rbd add" (i.e., rbd image 4517859c31dfSAlex Elder * mapping) request. These arrive via a write to /sys/bus/rbd/add, 4518859c31dfSAlex Elder * and the data written is passed here via a NUL-terminated buffer. 4519859c31dfSAlex Elder * Returns 0 if successful or an error code otherwise. 4520d22f76e7SAlex Elder * 4521859c31dfSAlex Elder * The information extracted from these options is recorded in 4522859c31dfSAlex Elder * the other parameters which return dynamically-allocated 4523859c31dfSAlex Elder * structures: 4524859c31dfSAlex Elder * ceph_opts 4525859c31dfSAlex Elder * The address of a pointer that will refer to a ceph options 4526859c31dfSAlex Elder * structure. Caller must release the returned pointer using 4527859c31dfSAlex Elder * ceph_destroy_options() when it is no longer needed. 4528859c31dfSAlex Elder * rbd_opts 4529859c31dfSAlex Elder * Address of an rbd options pointer. Fully initialized by 4530859c31dfSAlex Elder * this function; caller must release with kfree(). 4531859c31dfSAlex Elder * spec 4532859c31dfSAlex Elder * Address of an rbd image specification pointer. Fully 4533859c31dfSAlex Elder * initialized by this function based on parsed options. 
4534859c31dfSAlex Elder * Caller must release with rbd_spec_put(). 4535859c31dfSAlex Elder * 4536859c31dfSAlex Elder * The options passed take this form: 4537859c31dfSAlex Elder * <mon_addrs> <options> <pool_name> <image_name> [<snap_id>] 4538859c31dfSAlex Elder * where: 4539859c31dfSAlex Elder * <mon_addrs> 4540859c31dfSAlex Elder * A comma-separated list of one or more monitor addresses. 4541859c31dfSAlex Elder * A monitor address is an ip address, optionally followed 4542859c31dfSAlex Elder * by a port number (separated by a colon). 4543859c31dfSAlex Elder * I.e.: ip1[:port1][,ip2[:port2]...] 4544859c31dfSAlex Elder * <options> 4545859c31dfSAlex Elder * A comma-separated list of ceph and/or rbd options. 4546859c31dfSAlex Elder * <pool_name> 4547859c31dfSAlex Elder * The name of the rados pool containing the rbd image. 4548859c31dfSAlex Elder * <image_name> 4549859c31dfSAlex Elder * The name of the image in that pool to map. 4550859c31dfSAlex Elder * <snap_id> 4551859c31dfSAlex Elder * An optional snapshot id. If provided, the mapping will 4552859c31dfSAlex Elder * present data from the image at the time that snapshot was 4553859c31dfSAlex Elder * created. The image head is used if no snapshot id is 4554859c31dfSAlex Elder * provided. Snapshot mappings are always read-only. 
4555a725f65eSAlex Elder */ 4556859c31dfSAlex Elder static int rbd_add_parse_args(const char *buf, 4557dc79b113SAlex Elder struct ceph_options **ceph_opts, 4558859c31dfSAlex Elder struct rbd_options **opts, 4559859c31dfSAlex Elder struct rbd_spec **rbd_spec) 4560a725f65eSAlex Elder { 4561e28fff26SAlex Elder size_t len; 4562859c31dfSAlex Elder char *options; 45630ddebc0cSAlex Elder const char *mon_addrs; 4564ecb4dc22SAlex Elder char *snap_name; 45650ddebc0cSAlex Elder size_t mon_addrs_size; 4566859c31dfSAlex Elder struct rbd_spec *spec = NULL; 45674e9afebaSAlex Elder struct rbd_options *rbd_opts = NULL; 4568859c31dfSAlex Elder struct ceph_options *copts; 4569dc79b113SAlex Elder int ret; 4570e28fff26SAlex Elder 4571e28fff26SAlex Elder /* The first four tokens are required */ 4572e28fff26SAlex Elder 45737ef3214aSAlex Elder len = next_token(&buf); 45744fb5d671SAlex Elder if (!len) { 45754fb5d671SAlex Elder rbd_warn(NULL, "no monitor address(es) provided"); 45764fb5d671SAlex Elder return -EINVAL; 45774fb5d671SAlex Elder } 45780ddebc0cSAlex Elder mon_addrs = buf; 4579f28e565aSAlex Elder mon_addrs_size = len + 1; 45807ef3214aSAlex Elder buf += len; 4581a725f65eSAlex Elder 4582dc79b113SAlex Elder ret = -EINVAL; 4583f28e565aSAlex Elder options = dup_token(&buf, NULL); 4584f28e565aSAlex Elder if (!options) 4585dc79b113SAlex Elder return -ENOMEM; 45864fb5d671SAlex Elder if (!*options) { 45874fb5d671SAlex Elder rbd_warn(NULL, "no options provided"); 45884fb5d671SAlex Elder goto out_err; 45894fb5d671SAlex Elder } 4590a725f65eSAlex Elder 4591859c31dfSAlex Elder spec = rbd_spec_alloc(); 4592859c31dfSAlex Elder if (!spec) 4593f28e565aSAlex Elder goto out_mem; 4594859c31dfSAlex Elder 4595859c31dfSAlex Elder spec->pool_name = dup_token(&buf, NULL); 4596859c31dfSAlex Elder if (!spec->pool_name) 4597859c31dfSAlex Elder goto out_mem; 45984fb5d671SAlex Elder if (!*spec->pool_name) { 45994fb5d671SAlex Elder rbd_warn(NULL, "no pool name provided"); 46004fb5d671SAlex Elder goto out_err; 
46014fb5d671SAlex Elder } 4602e28fff26SAlex Elder 460369e7a02fSAlex Elder spec->image_name = dup_token(&buf, NULL); 4604859c31dfSAlex Elder if (!spec->image_name) 4605f28e565aSAlex Elder goto out_mem; 46064fb5d671SAlex Elder if (!*spec->image_name) { 46074fb5d671SAlex Elder rbd_warn(NULL, "no image name provided"); 46084fb5d671SAlex Elder goto out_err; 46094fb5d671SAlex Elder } 4610e28fff26SAlex Elder 4611f28e565aSAlex Elder /* 4612f28e565aSAlex Elder * Snapshot name is optional; default is to use "-" 4613f28e565aSAlex Elder * (indicating the head/no snapshot). 4614f28e565aSAlex Elder */ 46153feeb894SAlex Elder len = next_token(&buf); 4616820a5f3eSAlex Elder if (!len) { 46173feeb894SAlex Elder buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */ 46183feeb894SAlex Elder len = sizeof (RBD_SNAP_HEAD_NAME) - 1; 4619f28e565aSAlex Elder } else if (len > RBD_MAX_SNAP_NAME_LEN) { 4620dc79b113SAlex Elder ret = -ENAMETOOLONG; 4621f28e565aSAlex Elder goto out_err; 4622849b4260SAlex Elder } 4623ecb4dc22SAlex Elder snap_name = kmemdup(buf, len + 1, GFP_KERNEL); 4624ecb4dc22SAlex Elder if (!snap_name) 4625f28e565aSAlex Elder goto out_mem; 4626ecb4dc22SAlex Elder *(snap_name + len) = '\0'; 4627ecb4dc22SAlex Elder spec->snap_name = snap_name; 4628e5c35534SAlex Elder 46290ddebc0cSAlex Elder /* Initialize all rbd options to the defaults */ 4630e28fff26SAlex Elder 46314e9afebaSAlex Elder rbd_opts = kzalloc(sizeof (*rbd_opts), GFP_KERNEL); 46324e9afebaSAlex Elder if (!rbd_opts) 46334e9afebaSAlex Elder goto out_mem; 46344e9afebaSAlex Elder 46354e9afebaSAlex Elder rbd_opts->read_only = RBD_READ_ONLY_DEFAULT; 4636d22f76e7SAlex Elder 4637859c31dfSAlex Elder copts = ceph_parse_options(options, mon_addrs, 46380ddebc0cSAlex Elder mon_addrs + mon_addrs_size - 1, 46394e9afebaSAlex Elder parse_rbd_opts_token, rbd_opts); 4640859c31dfSAlex Elder if (IS_ERR(copts)) { 4641859c31dfSAlex Elder ret = PTR_ERR(copts); 4642dc79b113SAlex Elder goto out_err; 4643dc79b113SAlex Elder } 4644859c31dfSAlex 
Elder kfree(options); 4645859c31dfSAlex Elder 4646859c31dfSAlex Elder *ceph_opts = copts; 46474e9afebaSAlex Elder *opts = rbd_opts; 4648859c31dfSAlex Elder *rbd_spec = spec; 46490ddebc0cSAlex Elder 4650dc79b113SAlex Elder return 0; 4651f28e565aSAlex Elder out_mem: 4652dc79b113SAlex Elder ret = -ENOMEM; 4653d22f76e7SAlex Elder out_err: 4654859c31dfSAlex Elder kfree(rbd_opts); 4655859c31dfSAlex Elder rbd_spec_put(spec); 4656f28e565aSAlex Elder kfree(options); 4657d22f76e7SAlex Elder 4658dc79b113SAlex Elder return ret; 4659a725f65eSAlex Elder } 4660a725f65eSAlex Elder 4661589d30e0SAlex Elder /* 4662589d30e0SAlex Elder * An rbd format 2 image has a unique identifier, distinct from the 4663589d30e0SAlex Elder * name given to it by the user. Internally, that identifier is 4664589d30e0SAlex Elder * what's used to specify the names of objects related to the image. 4665589d30e0SAlex Elder * 4666589d30e0SAlex Elder * A special "rbd id" object is used to map an rbd image name to its 4667589d30e0SAlex Elder * id. If that object doesn't exist, then there is no v2 rbd image 4668589d30e0SAlex Elder * with the supplied name. 4669589d30e0SAlex Elder * 4670589d30e0SAlex Elder * This function will record the given rbd_dev's image_id field if 4671589d30e0SAlex Elder * it can be determined, and in that case will return 0. If any 4672589d30e0SAlex Elder * errors occur a negative errno will be returned and the rbd_dev's 4673589d30e0SAlex Elder * image_id field will be unchanged (and should be NULL). 
4674589d30e0SAlex Elder */ 4675589d30e0SAlex Elder static int rbd_dev_image_id(struct rbd_device *rbd_dev) 4676589d30e0SAlex Elder { 4677589d30e0SAlex Elder int ret; 4678589d30e0SAlex Elder size_t size; 4679589d30e0SAlex Elder char *object_name; 4680589d30e0SAlex Elder void *response; 4681c0fba368SAlex Elder char *image_id; 46822f82ee54SAlex Elder 4683589d30e0SAlex Elder /* 46842c0d0a10SAlex Elder * When probing a parent image, the image id is already 46852c0d0a10SAlex Elder * known (and the image name likely is not). There's no 4686c0fba368SAlex Elder * need to fetch the image id again in this case. We 4687c0fba368SAlex Elder * do still need to set the image format though. 46882c0d0a10SAlex Elder */ 4689c0fba368SAlex Elder if (rbd_dev->spec->image_id) { 4690c0fba368SAlex Elder rbd_dev->image_format = *rbd_dev->spec->image_id ? 2 : 1; 4691c0fba368SAlex Elder 46922c0d0a10SAlex Elder return 0; 4693c0fba368SAlex Elder } 46942c0d0a10SAlex Elder 46952c0d0a10SAlex Elder /* 4696589d30e0SAlex Elder * First, see if the format 2 image id file exists, and if 4697589d30e0SAlex Elder * so, get the image's persistent id from it. 
4698589d30e0SAlex Elder */ 469969e7a02fSAlex Elder size = sizeof (RBD_ID_PREFIX) + strlen(rbd_dev->spec->image_name); 4700589d30e0SAlex Elder object_name = kmalloc(size, GFP_NOIO); 4701589d30e0SAlex Elder if (!object_name) 4702589d30e0SAlex Elder return -ENOMEM; 47030d7dbfceSAlex Elder sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->spec->image_name); 4704589d30e0SAlex Elder dout("rbd id object name is %s\n", object_name); 4705589d30e0SAlex Elder 4706589d30e0SAlex Elder /* Response will be an encoded string, which includes a length */ 4707589d30e0SAlex Elder 4708589d30e0SAlex Elder size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX; 4709589d30e0SAlex Elder response = kzalloc(size, GFP_NOIO); 4710589d30e0SAlex Elder if (!response) { 4711589d30e0SAlex Elder ret = -ENOMEM; 4712589d30e0SAlex Elder goto out; 4713589d30e0SAlex Elder } 4714589d30e0SAlex Elder 4715c0fba368SAlex Elder /* If it doesn't exist we'll assume it's a format 1 image */ 4716c0fba368SAlex Elder 471736be9a76SAlex Elder ret = rbd_obj_method_sync(rbd_dev, object_name, 47184157976bSAlex Elder "rbd", "get_id", NULL, 0, 4719e2a58ee5SAlex Elder response, RBD_IMAGE_ID_LEN_MAX); 472036be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 4721c0fba368SAlex Elder if (ret == -ENOENT) { 4722c0fba368SAlex Elder image_id = kstrdup("", GFP_KERNEL); 4723c0fba368SAlex Elder ret = image_id ? 0 : -ENOMEM; 4724c0fba368SAlex Elder if (!ret) 4725c0fba368SAlex Elder rbd_dev->image_format = 1; 4726c0fba368SAlex Elder } else if (ret > sizeof (__le32)) { 4727c0fba368SAlex Elder void *p = response; 4728589d30e0SAlex Elder 4729c0fba368SAlex Elder image_id = ceph_extract_encoded_string(&p, p + ret, 4730979ed480SAlex Elder NULL, GFP_NOIO); 4731c0fba368SAlex Elder ret = IS_ERR(image_id) ? 
PTR_ERR(image_id) : 0; 4732c0fba368SAlex Elder if (!ret) 4733c0fba368SAlex Elder rbd_dev->image_format = 2; 4734589d30e0SAlex Elder } else { 4735c0fba368SAlex Elder ret = -EINVAL; 4736c0fba368SAlex Elder } 4737c0fba368SAlex Elder 4738c0fba368SAlex Elder if (!ret) { 4739c0fba368SAlex Elder rbd_dev->spec->image_id = image_id; 4740c0fba368SAlex Elder dout("image_id is %s\n", image_id); 4741589d30e0SAlex Elder } 4742589d30e0SAlex Elder out: 4743589d30e0SAlex Elder kfree(response); 4744589d30e0SAlex Elder kfree(object_name); 4745589d30e0SAlex Elder 4746589d30e0SAlex Elder return ret; 4747589d30e0SAlex Elder } 4748589d30e0SAlex Elder 47493abef3b3SAlex Elder /* 47503abef3b3SAlex Elder * Undo whatever state changes are made by v1 or v2 header info 47513abef3b3SAlex Elder * call. 47523abef3b3SAlex Elder */ 47536fd48b3bSAlex Elder static void rbd_dev_unprobe(struct rbd_device *rbd_dev) 47546fd48b3bSAlex Elder { 47556fd48b3bSAlex Elder struct rbd_image_header *header; 47566fd48b3bSAlex Elder 4757392a9dadSAlex Elder /* Drop parent reference unless it's already been done (or none) */ 4758392a9dadSAlex Elder 4759392a9dadSAlex Elder if (rbd_dev->parent_overlap) 4760a2acd00eSAlex Elder rbd_dev_parent_put(rbd_dev); 47616fd48b3bSAlex Elder 47626fd48b3bSAlex Elder /* Free dynamic fields from the header, then zero it out */ 47636fd48b3bSAlex Elder 47646fd48b3bSAlex Elder header = &rbd_dev->header; 4765812164f8SAlex Elder ceph_put_snap_context(header->snapc); 47666fd48b3bSAlex Elder kfree(header->snap_sizes); 47676fd48b3bSAlex Elder kfree(header->snap_names); 47686fd48b3bSAlex Elder kfree(header->object_prefix); 47696fd48b3bSAlex Elder memset(header, 0, sizeof (*header)); 47706fd48b3bSAlex Elder } 47716fd48b3bSAlex Elder 47722df3fac7SAlex Elder static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev) 4773a30b71b9SAlex Elder { 4774a30b71b9SAlex Elder int ret; 4775a30b71b9SAlex Elder 47761e130199SAlex Elder ret = rbd_dev_v2_object_prefix(rbd_dev); 477757385b51SAlex Elder if (ret) 
47781e130199SAlex Elder goto out_err; 4779b1b5402aSAlex Elder 47802df3fac7SAlex Elder /* 47812df3fac7SAlex Elder * Get the and check features for the image. Currently the 47822df3fac7SAlex Elder * features are assumed to never change. 47832df3fac7SAlex Elder */ 4784b1b5402aSAlex Elder ret = rbd_dev_v2_features(rbd_dev); 478557385b51SAlex Elder if (ret) 4786b1b5402aSAlex Elder goto out_err; 478735d489f9SAlex Elder 4788cc070d59SAlex Elder /* If the image supports fancy striping, get its parameters */ 4789cc070d59SAlex Elder 4790cc070d59SAlex Elder if (rbd_dev->header.features & RBD_FEATURE_STRIPINGV2) { 4791cc070d59SAlex Elder ret = rbd_dev_v2_striping_info(rbd_dev); 4792cc070d59SAlex Elder if (ret < 0) 4793cc070d59SAlex Elder goto out_err; 4794cc070d59SAlex Elder } 47952df3fac7SAlex Elder /* No support for crypto and compression type format 2 images */ 4796a30b71b9SAlex Elder 479735152979SAlex Elder return 0; 47989d475de5SAlex Elder out_err: 4799642a2537SAlex Elder rbd_dev->header.features = 0; 48001e130199SAlex Elder kfree(rbd_dev->header.object_prefix); 48011e130199SAlex Elder rbd_dev->header.object_prefix = NULL; 48029d475de5SAlex Elder 48039d475de5SAlex Elder return ret; 4804a30b71b9SAlex Elder } 4805a30b71b9SAlex Elder 4806124afba2SAlex Elder static int rbd_dev_probe_parent(struct rbd_device *rbd_dev) 480783a06263SAlex Elder { 48082f82ee54SAlex Elder struct rbd_device *parent = NULL; 4809124afba2SAlex Elder struct rbd_spec *parent_spec; 4810124afba2SAlex Elder struct rbd_client *rbdc; 4811124afba2SAlex Elder int ret; 4812124afba2SAlex Elder 4813124afba2SAlex Elder if (!rbd_dev->parent_spec) 4814124afba2SAlex Elder return 0; 4815124afba2SAlex Elder /* 4816124afba2SAlex Elder * We need to pass a reference to the client and the parent 4817124afba2SAlex Elder * spec when creating the parent rbd_dev. Images related by 4818124afba2SAlex Elder * parent/child relationships always share both. 
4819124afba2SAlex Elder */ 4820124afba2SAlex Elder parent_spec = rbd_spec_get(rbd_dev->parent_spec); 4821124afba2SAlex Elder rbdc = __rbd_get_client(rbd_dev->rbd_client); 4822124afba2SAlex Elder 4823124afba2SAlex Elder ret = -ENOMEM; 4824124afba2SAlex Elder parent = rbd_dev_create(rbdc, parent_spec); 4825124afba2SAlex Elder if (!parent) 4826124afba2SAlex Elder goto out_err; 4827124afba2SAlex Elder 48281f3ef788SAlex Elder ret = rbd_dev_image_probe(parent, false); 4829124afba2SAlex Elder if (ret < 0) 4830124afba2SAlex Elder goto out_err; 4831124afba2SAlex Elder rbd_dev->parent = parent; 4832a2acd00eSAlex Elder atomic_set(&rbd_dev->parent_ref, 1); 4833124afba2SAlex Elder 4834124afba2SAlex Elder return 0; 4835124afba2SAlex Elder out_err: 4836124afba2SAlex Elder if (parent) { 4837fb65d228SAlex Elder rbd_dev_unparent(rbd_dev); 4838124afba2SAlex Elder kfree(rbd_dev->header_name); 4839124afba2SAlex Elder rbd_dev_destroy(parent); 4840124afba2SAlex Elder } else { 4841124afba2SAlex Elder rbd_put_client(rbdc); 4842124afba2SAlex Elder rbd_spec_put(parent_spec); 4843124afba2SAlex Elder } 4844124afba2SAlex Elder 4845124afba2SAlex Elder return ret; 4846124afba2SAlex Elder } 4847124afba2SAlex Elder 4848200a6a8bSAlex Elder static int rbd_dev_device_setup(struct rbd_device *rbd_dev) 4849124afba2SAlex Elder { 485083a06263SAlex Elder int ret; 485183a06263SAlex Elder 485283a06263SAlex Elder /* generate unique id: find highest unique id, add one */ 485383a06263SAlex Elder rbd_dev_id_get(rbd_dev); 485483a06263SAlex Elder 485583a06263SAlex Elder /* Fill in the device name, now that we have its id. */ 485683a06263SAlex Elder BUILD_BUG_ON(DEV_NAME_LEN 485783a06263SAlex Elder < sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH); 485883a06263SAlex Elder sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id); 485983a06263SAlex Elder 486083a06263SAlex Elder /* Get our block major device number. 
*/ 486183a06263SAlex Elder 486283a06263SAlex Elder ret = register_blkdev(0, rbd_dev->name); 486383a06263SAlex Elder if (ret < 0) 486483a06263SAlex Elder goto err_out_id; 486583a06263SAlex Elder rbd_dev->major = ret; 486683a06263SAlex Elder 486783a06263SAlex Elder /* Set up the blkdev mapping. */ 486883a06263SAlex Elder 486983a06263SAlex Elder ret = rbd_init_disk(rbd_dev); 487083a06263SAlex Elder if (ret) 487183a06263SAlex Elder goto err_out_blkdev; 487283a06263SAlex Elder 4873f35a4deeSAlex Elder ret = rbd_dev_mapping_set(rbd_dev); 487483a06263SAlex Elder if (ret) 487583a06263SAlex Elder goto err_out_disk; 4876f35a4deeSAlex Elder set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE); 4877f35a4deeSAlex Elder 4878f35a4deeSAlex Elder ret = rbd_bus_add_dev(rbd_dev); 4879f35a4deeSAlex Elder if (ret) 4880f35a4deeSAlex Elder goto err_out_mapping; 488183a06263SAlex Elder 488283a06263SAlex Elder /* Everything's ready. Announce the disk to the world. */ 488383a06263SAlex Elder 4884129b79d4SAlex Elder set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags); 488583a06263SAlex Elder add_disk(rbd_dev->disk); 488683a06263SAlex Elder 488783a06263SAlex Elder pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name, 488883a06263SAlex Elder (unsigned long long) rbd_dev->mapping.size); 488983a06263SAlex Elder 489083a06263SAlex Elder return ret; 48912f82ee54SAlex Elder 4892f35a4deeSAlex Elder err_out_mapping: 4893f35a4deeSAlex Elder rbd_dev_mapping_clear(rbd_dev); 489483a06263SAlex Elder err_out_disk: 489583a06263SAlex Elder rbd_free_disk(rbd_dev); 489683a06263SAlex Elder err_out_blkdev: 489783a06263SAlex Elder unregister_blkdev(rbd_dev->major, rbd_dev->name); 489883a06263SAlex Elder err_out_id: 489983a06263SAlex Elder rbd_dev_id_put(rbd_dev); 4900d1cf5788SAlex Elder rbd_dev_mapping_clear(rbd_dev); 490183a06263SAlex Elder 490283a06263SAlex Elder return ret; 490383a06263SAlex Elder } 490483a06263SAlex Elder 4905332bb12dSAlex Elder static int rbd_dev_header_name(struct rbd_device 
*rbd_dev) 4906332bb12dSAlex Elder { 4907332bb12dSAlex Elder struct rbd_spec *spec = rbd_dev->spec; 4908332bb12dSAlex Elder size_t size; 4909332bb12dSAlex Elder 4910332bb12dSAlex Elder /* Record the header object name for this rbd image. */ 4911332bb12dSAlex Elder 4912332bb12dSAlex Elder rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); 4913332bb12dSAlex Elder 4914332bb12dSAlex Elder if (rbd_dev->image_format == 1) 4915332bb12dSAlex Elder size = strlen(spec->image_name) + sizeof (RBD_SUFFIX); 4916332bb12dSAlex Elder else 4917332bb12dSAlex Elder size = sizeof (RBD_HEADER_PREFIX) + strlen(spec->image_id); 4918332bb12dSAlex Elder 4919332bb12dSAlex Elder rbd_dev->header_name = kmalloc(size, GFP_KERNEL); 4920332bb12dSAlex Elder if (!rbd_dev->header_name) 4921332bb12dSAlex Elder return -ENOMEM; 4922332bb12dSAlex Elder 4923332bb12dSAlex Elder if (rbd_dev->image_format == 1) 4924332bb12dSAlex Elder sprintf(rbd_dev->header_name, "%s%s", 4925332bb12dSAlex Elder spec->image_name, RBD_SUFFIX); 4926332bb12dSAlex Elder else 4927332bb12dSAlex Elder sprintf(rbd_dev->header_name, "%s%s", 4928332bb12dSAlex Elder RBD_HEADER_PREFIX, spec->image_id); 4929332bb12dSAlex Elder return 0; 4930332bb12dSAlex Elder } 4931332bb12dSAlex Elder 4932200a6a8bSAlex Elder static void rbd_dev_image_release(struct rbd_device *rbd_dev) 4933200a6a8bSAlex Elder { 49346fd48b3bSAlex Elder rbd_dev_unprobe(rbd_dev); 4935200a6a8bSAlex Elder kfree(rbd_dev->header_name); 49366fd48b3bSAlex Elder rbd_dev->header_name = NULL; 49376fd48b3bSAlex Elder rbd_dev->image_format = 0; 49386fd48b3bSAlex Elder kfree(rbd_dev->spec->image_id); 49396fd48b3bSAlex Elder rbd_dev->spec->image_id = NULL; 49406fd48b3bSAlex Elder 4941200a6a8bSAlex Elder rbd_dev_destroy(rbd_dev); 4942200a6a8bSAlex Elder } 4943200a6a8bSAlex Elder 4944a30b71b9SAlex Elder /* 4945a30b71b9SAlex Elder * Probe for the existence of the header object for the given rbd 49461f3ef788SAlex Elder * device. 
If this image is the one being mapped (i.e., not a 49471f3ef788SAlex Elder * parent), initiate a watch on its header object before using that 49481f3ef788SAlex Elder * object to get detailed information about the rbd image. 4949a30b71b9SAlex Elder */ 49501f3ef788SAlex Elder static int rbd_dev_image_probe(struct rbd_device *rbd_dev, bool mapping) 4951a30b71b9SAlex Elder { 4952a30b71b9SAlex Elder int ret; 4953b644de2bSAlex Elder int tmp; 4954a30b71b9SAlex Elder 4955a30b71b9SAlex Elder /* 49563abef3b3SAlex Elder * Get the id from the image id object. Unless there's an 49573abef3b3SAlex Elder * error, rbd_dev->spec->image_id will be filled in with 49583abef3b3SAlex Elder * a dynamically-allocated string, and rbd_dev->image_format 49593abef3b3SAlex Elder * will be set to either 1 or 2. 4960a30b71b9SAlex Elder */ 4961a30b71b9SAlex Elder ret = rbd_dev_image_id(rbd_dev); 4962a30b71b9SAlex Elder if (ret) 4963c0fba368SAlex Elder return ret; 4964c0fba368SAlex Elder rbd_assert(rbd_dev->spec->image_id); 4965c0fba368SAlex Elder rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); 4966c0fba368SAlex Elder 4967332bb12dSAlex Elder ret = rbd_dev_header_name(rbd_dev); 4968332bb12dSAlex Elder if (ret) 4969332bb12dSAlex Elder goto err_out_format; 4970332bb12dSAlex Elder 49711f3ef788SAlex Elder if (mapping) { 49721f3ef788SAlex Elder ret = rbd_dev_header_watch_sync(rbd_dev, true); 4973b644de2bSAlex Elder if (ret) 4974b644de2bSAlex Elder goto out_header_name; 49751f3ef788SAlex Elder } 4976b644de2bSAlex Elder 4977c0fba368SAlex Elder if (rbd_dev->image_format == 1) 497899a41ebcSAlex Elder ret = rbd_dev_v1_header_info(rbd_dev); 4979a30b71b9SAlex Elder else 49802df3fac7SAlex Elder ret = rbd_dev_v2_header_info(rbd_dev); 49815655c4d9SAlex Elder if (ret) 4982b644de2bSAlex Elder goto err_out_watch; 4983a30b71b9SAlex Elder 49849bb81c9bSAlex Elder ret = rbd_dev_spec_update(rbd_dev); 49859bb81c9bSAlex Elder if (ret) 498633dca39fSAlex Elder goto err_out_probe; 49879bb81c9bSAlex Elder 
49889bb81c9bSAlex Elder ret = rbd_dev_probe_parent(rbd_dev); 498930d60ba2SAlex Elder if (ret) 499030d60ba2SAlex Elder goto err_out_probe; 499183a06263SAlex Elder 499230d60ba2SAlex Elder dout("discovered format %u image, header name is %s\n", 499330d60ba2SAlex Elder rbd_dev->image_format, rbd_dev->header_name); 499430d60ba2SAlex Elder 499530d60ba2SAlex Elder return 0; 49966fd48b3bSAlex Elder err_out_probe: 49976fd48b3bSAlex Elder rbd_dev_unprobe(rbd_dev); 4998b644de2bSAlex Elder err_out_watch: 49991f3ef788SAlex Elder if (mapping) { 50001f3ef788SAlex Elder tmp = rbd_dev_header_watch_sync(rbd_dev, false); 5001b644de2bSAlex Elder if (tmp) 50021f3ef788SAlex Elder rbd_warn(rbd_dev, "unable to tear down " 50031f3ef788SAlex Elder "watch request (%d)\n", tmp); 50041f3ef788SAlex Elder } 5005332bb12dSAlex Elder out_header_name: 5006332bb12dSAlex Elder kfree(rbd_dev->header_name); 5007332bb12dSAlex Elder rbd_dev->header_name = NULL; 5008332bb12dSAlex Elder err_out_format: 5009332bb12dSAlex Elder rbd_dev->image_format = 0; 50105655c4d9SAlex Elder kfree(rbd_dev->spec->image_id); 50115655c4d9SAlex Elder rbd_dev->spec->image_id = NULL; 50125655c4d9SAlex Elder 50135655c4d9SAlex Elder dout("probe failed, returning %d\n", ret); 50145655c4d9SAlex Elder 50155655c4d9SAlex Elder return ret; 501683a06263SAlex Elder } 501783a06263SAlex Elder 501859c2be1eSYehuda Sadeh static ssize_t rbd_add(struct bus_type *bus, 501959c2be1eSYehuda Sadeh const char *buf, 502059c2be1eSYehuda Sadeh size_t count) 5021602adf40SYehuda Sadeh { 5022cb8627c7SAlex Elder struct rbd_device *rbd_dev = NULL; 5023dc79b113SAlex Elder struct ceph_options *ceph_opts = NULL; 50244e9afebaSAlex Elder struct rbd_options *rbd_opts = NULL; 5025859c31dfSAlex Elder struct rbd_spec *spec = NULL; 50269d3997fdSAlex Elder struct rbd_client *rbdc; 502727cc2594SAlex Elder struct ceph_osd_client *osdc; 502851344a38SAlex Elder bool read_only; 502927cc2594SAlex Elder int rc = -ENOMEM; 5030602adf40SYehuda Sadeh 5031602adf40SYehuda Sadeh if 
(!try_module_get(THIS_MODULE)) 5032602adf40SYehuda Sadeh return -ENODEV; 5033602adf40SYehuda Sadeh 5034a725f65eSAlex Elder /* parse add command */ 5035859c31dfSAlex Elder rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec); 5036dc79b113SAlex Elder if (rc < 0) 5037bd4ba655SAlex Elder goto err_out_module; 503851344a38SAlex Elder read_only = rbd_opts->read_only; 503951344a38SAlex Elder kfree(rbd_opts); 504051344a38SAlex Elder rbd_opts = NULL; /* done with this */ 5041a725f65eSAlex Elder 50429d3997fdSAlex Elder rbdc = rbd_get_client(ceph_opts); 50439d3997fdSAlex Elder if (IS_ERR(rbdc)) { 50449d3997fdSAlex Elder rc = PTR_ERR(rbdc); 50450ddebc0cSAlex Elder goto err_out_args; 50469d3997fdSAlex Elder } 5047602adf40SYehuda Sadeh 5048602adf40SYehuda Sadeh /* pick the pool */ 50499d3997fdSAlex Elder osdc = &rbdc->client->osdc; 5050859c31dfSAlex Elder rc = ceph_pg_poolid_by_name(osdc->osdmap, spec->pool_name); 5051602adf40SYehuda Sadeh if (rc < 0) 5052602adf40SYehuda Sadeh goto err_out_client; 5053859c31dfSAlex Elder spec->pool_id = (u64)rc; 5054859c31dfSAlex Elder 50550903e875SAlex Elder /* The ceph file layout needs to fit pool id in 32 bits */ 50560903e875SAlex Elder 5057c0cd10dbSAlex Elder if (spec->pool_id > (u64)U32_MAX) { 5058c0cd10dbSAlex Elder rbd_warn(NULL, "pool id too large (%llu > %u)\n", 5059c0cd10dbSAlex Elder (unsigned long long)spec->pool_id, U32_MAX); 50600903e875SAlex Elder rc = -EIO; 50610903e875SAlex Elder goto err_out_client; 50620903e875SAlex Elder } 50630903e875SAlex Elder 5064c53d5893SAlex Elder rbd_dev = rbd_dev_create(rbdc, spec); 5065bd4ba655SAlex Elder if (!rbd_dev) 5066bd4ba655SAlex Elder goto err_out_client; 5067c53d5893SAlex Elder rbdc = NULL; /* rbd_dev now owns this */ 5068c53d5893SAlex Elder spec = NULL; /* rbd_dev now owns this */ 5069602adf40SYehuda Sadeh 50701f3ef788SAlex Elder rc = rbd_dev_image_probe(rbd_dev, true); 5071a30b71b9SAlex Elder if (rc < 0) 5072c53d5893SAlex Elder goto err_out_rbd_dev; 507305fd6f6fSAlex Elder 
50747ce4eef7SAlex Elder /* If we are mapping a snapshot it must be marked read-only */ 50757ce4eef7SAlex Elder 50767ce4eef7SAlex Elder if (rbd_dev->spec->snap_id != CEPH_NOSNAP) 50777ce4eef7SAlex Elder read_only = true; 50787ce4eef7SAlex Elder rbd_dev->mapping.read_only = read_only; 50797ce4eef7SAlex Elder 5080b536f69aSAlex Elder rc = rbd_dev_device_setup(rbd_dev); 50813abef3b3SAlex Elder if (rc) { 50823abef3b3SAlex Elder rbd_dev_image_release(rbd_dev); 50833abef3b3SAlex Elder goto err_out_module; 50843abef3b3SAlex Elder } 50853abef3b3SAlex Elder 5086602adf40SYehuda Sadeh return count; 5087b536f69aSAlex Elder 5088c53d5893SAlex Elder err_out_rbd_dev: 5089c53d5893SAlex Elder rbd_dev_destroy(rbd_dev); 5090bd4ba655SAlex Elder err_out_client: 50919d3997fdSAlex Elder rbd_put_client(rbdc); 50920ddebc0cSAlex Elder err_out_args: 5093859c31dfSAlex Elder rbd_spec_put(spec); 5094bd4ba655SAlex Elder err_out_module: 5095bd4ba655SAlex Elder module_put(THIS_MODULE); 509627cc2594SAlex Elder 5097602adf40SYehuda Sadeh dout("Error adding device %s\n", buf); 509827cc2594SAlex Elder 509927cc2594SAlex Elder return (ssize_t)rc; 5100602adf40SYehuda Sadeh } 5101602adf40SYehuda Sadeh 5102200a6a8bSAlex Elder static void rbd_dev_device_release(struct device *dev) 5103602adf40SYehuda Sadeh { 5104593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 5105602adf40SYehuda Sadeh 5106602adf40SYehuda Sadeh rbd_free_disk(rbd_dev); 5107200a6a8bSAlex Elder clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags); 51086d80b130SAlex Elder rbd_dev_mapping_clear(rbd_dev); 5109602adf40SYehuda Sadeh unregister_blkdev(rbd_dev->major, rbd_dev->name); 5110200a6a8bSAlex Elder rbd_dev->major = 0; 5111e2839308SAlex Elder rbd_dev_id_put(rbd_dev); 5112d1cf5788SAlex Elder rbd_dev_mapping_clear(rbd_dev); 5113602adf40SYehuda Sadeh } 5114602adf40SYehuda Sadeh 511505a46afdSAlex Elder static void rbd_dev_remove_parent(struct rbd_device *rbd_dev) 511605a46afdSAlex Elder { 5117ad945fc1SAlex Elder while 
(rbd_dev->parent) { 511805a46afdSAlex Elder struct rbd_device *first = rbd_dev; 511905a46afdSAlex Elder struct rbd_device *second = first->parent; 512005a46afdSAlex Elder struct rbd_device *third; 512105a46afdSAlex Elder 512205a46afdSAlex Elder /* 512305a46afdSAlex Elder * Follow to the parent with no grandparent and 512405a46afdSAlex Elder * remove it. 512505a46afdSAlex Elder */ 512605a46afdSAlex Elder while (second && (third = second->parent)) { 512705a46afdSAlex Elder first = second; 512805a46afdSAlex Elder second = third; 512905a46afdSAlex Elder } 5130ad945fc1SAlex Elder rbd_assert(second); 51318ad42cd0SAlex Elder rbd_dev_image_release(second); 5132ad945fc1SAlex Elder first->parent = NULL; 5133ad945fc1SAlex Elder first->parent_overlap = 0; 5134ad945fc1SAlex Elder 5135ad945fc1SAlex Elder rbd_assert(first->parent_spec); 513605a46afdSAlex Elder rbd_spec_put(first->parent_spec); 513705a46afdSAlex Elder first->parent_spec = NULL; 513805a46afdSAlex Elder } 513905a46afdSAlex Elder } 514005a46afdSAlex Elder 5141dfc5606dSYehuda Sadeh static ssize_t rbd_remove(struct bus_type *bus, 5142602adf40SYehuda Sadeh const char *buf, 5143602adf40SYehuda Sadeh size_t count) 5144602adf40SYehuda Sadeh { 5145602adf40SYehuda Sadeh struct rbd_device *rbd_dev = NULL; 5146751cc0e3SAlex Elder struct list_head *tmp; 5147751cc0e3SAlex Elder int dev_id; 5148602adf40SYehuda Sadeh unsigned long ul; 514982a442d2SAlex Elder bool already = false; 51500d8189e1SAlex Elder int ret; 5151602adf40SYehuda Sadeh 51520d8189e1SAlex Elder ret = strict_strtoul(buf, 10, &ul); 51530d8189e1SAlex Elder if (ret) 51540d8189e1SAlex Elder return ret; 5155602adf40SYehuda Sadeh 5156602adf40SYehuda Sadeh /* convert to int; abort if we lost anything in the conversion */ 5157751cc0e3SAlex Elder dev_id = (int)ul; 5158751cc0e3SAlex Elder if (dev_id != ul) 5159602adf40SYehuda Sadeh return -EINVAL; 5160602adf40SYehuda Sadeh 5161602adf40SYehuda Sadeh ret = -ENOENT; 5162751cc0e3SAlex Elder spin_lock(&rbd_dev_list_lock); 
5163751cc0e3SAlex Elder list_for_each(tmp, &rbd_dev_list) { 5164751cc0e3SAlex Elder rbd_dev = list_entry(tmp, struct rbd_device, node); 5165751cc0e3SAlex Elder if (rbd_dev->dev_id == dev_id) { 5166751cc0e3SAlex Elder ret = 0; 5167751cc0e3SAlex Elder break; 5168602adf40SYehuda Sadeh } 5169751cc0e3SAlex Elder } 5170751cc0e3SAlex Elder if (!ret) { 5171a14ea269SAlex Elder spin_lock_irq(&rbd_dev->lock); 5172b82d167bSAlex Elder if (rbd_dev->open_count) 517342382b70SAlex Elder ret = -EBUSY; 5174b82d167bSAlex Elder else 517582a442d2SAlex Elder already = test_and_set_bit(RBD_DEV_FLAG_REMOVING, 517682a442d2SAlex Elder &rbd_dev->flags); 5177a14ea269SAlex Elder spin_unlock_irq(&rbd_dev->lock); 5178751cc0e3SAlex Elder } 5179751cc0e3SAlex Elder spin_unlock(&rbd_dev_list_lock); 518082a442d2SAlex Elder if (ret < 0 || already) 51811ba0f1e7SAlex Elder return ret; 5182751cc0e3SAlex Elder 51831f3ef788SAlex Elder ret = rbd_dev_header_watch_sync(rbd_dev, false); 51841f3ef788SAlex Elder if (ret) 51851f3ef788SAlex Elder rbd_warn(rbd_dev, "failed to cancel watch event (%d)\n", ret); 51869abc5990SJosh Durgin 51879abc5990SJosh Durgin /* 51889abc5990SJosh Durgin * flush remaining watch callbacks - these must be complete 51899abc5990SJosh Durgin * before the osd_client is shutdown 51909abc5990SJosh Durgin */ 51919abc5990SJosh Durgin dout("%s: flushing notifies", __func__); 51929abc5990SJosh Durgin ceph_osdc_flush_notifies(&rbd_dev->rbd_client->client->osdc); 51939875201eSJosh Durgin /* 51949875201eSJosh Durgin * Don't free anything from rbd_dev->disk until after all 51959875201eSJosh Durgin * notifies are completely processed. Otherwise 51969875201eSJosh Durgin * rbd_bus_del_dev() will race with rbd_watch_cb(), resulting 51979875201eSJosh Durgin * in a potential use after free of rbd_dev->disk or rbd_dev. 
51989875201eSJosh Durgin */ 51999875201eSJosh Durgin rbd_bus_del_dev(rbd_dev); 52008ad42cd0SAlex Elder rbd_dev_image_release(rbd_dev); 520179ab7558SAlex Elder module_put(THIS_MODULE); 5202aafb230eSAlex Elder 52031ba0f1e7SAlex Elder return count; 5204602adf40SYehuda Sadeh } 5205602adf40SYehuda Sadeh 5206602adf40SYehuda Sadeh /* 5207602adf40SYehuda Sadeh * create control files in sysfs 5208dfc5606dSYehuda Sadeh * /sys/bus/rbd/... 5209602adf40SYehuda Sadeh */ 5210602adf40SYehuda Sadeh static int rbd_sysfs_init(void) 5211602adf40SYehuda Sadeh { 5212dfc5606dSYehuda Sadeh int ret; 5213602adf40SYehuda Sadeh 5214fed4c143SAlex Elder ret = device_register(&rbd_root_dev); 5215dfc5606dSYehuda Sadeh if (ret < 0) 5216dfc5606dSYehuda Sadeh return ret; 5217602adf40SYehuda Sadeh 5218fed4c143SAlex Elder ret = bus_register(&rbd_bus_type); 5219fed4c143SAlex Elder if (ret < 0) 5220fed4c143SAlex Elder device_unregister(&rbd_root_dev); 5221602adf40SYehuda Sadeh 5222602adf40SYehuda Sadeh return ret; 5223602adf40SYehuda Sadeh } 5224602adf40SYehuda Sadeh 5225602adf40SYehuda Sadeh static void rbd_sysfs_cleanup(void) 5226602adf40SYehuda Sadeh { 5227dfc5606dSYehuda Sadeh bus_unregister(&rbd_bus_type); 5228fed4c143SAlex Elder device_unregister(&rbd_root_dev); 5229602adf40SYehuda Sadeh } 5230602adf40SYehuda Sadeh 52311c2a9dfeSAlex Elder static int rbd_slab_init(void) 52321c2a9dfeSAlex Elder { 52331c2a9dfeSAlex Elder rbd_assert(!rbd_img_request_cache); 52341c2a9dfeSAlex Elder rbd_img_request_cache = kmem_cache_create("rbd_img_request", 52351c2a9dfeSAlex Elder sizeof (struct rbd_img_request), 52361c2a9dfeSAlex Elder __alignof__(struct rbd_img_request), 52371c2a9dfeSAlex Elder 0, NULL); 5238868311b1SAlex Elder if (!rbd_img_request_cache) 5239868311b1SAlex Elder return -ENOMEM; 5240868311b1SAlex Elder 5241868311b1SAlex Elder rbd_assert(!rbd_obj_request_cache); 5242868311b1SAlex Elder rbd_obj_request_cache = kmem_cache_create("rbd_obj_request", 5243868311b1SAlex Elder sizeof (struct rbd_obj_request), 
5244868311b1SAlex Elder __alignof__(struct rbd_obj_request), 5245868311b1SAlex Elder 0, NULL); 524678c2a44aSAlex Elder if (!rbd_obj_request_cache) 524778c2a44aSAlex Elder goto out_err; 524878c2a44aSAlex Elder 524978c2a44aSAlex Elder rbd_assert(!rbd_segment_name_cache); 525078c2a44aSAlex Elder rbd_segment_name_cache = kmem_cache_create("rbd_segment_name", 525178c2a44aSAlex Elder MAX_OBJ_NAME_SIZE + 1, 1, 0, NULL); 525278c2a44aSAlex Elder if (rbd_segment_name_cache) 52531c2a9dfeSAlex Elder return 0; 525478c2a44aSAlex Elder out_err: 525578c2a44aSAlex Elder if (rbd_obj_request_cache) { 525678c2a44aSAlex Elder kmem_cache_destroy(rbd_obj_request_cache); 525778c2a44aSAlex Elder rbd_obj_request_cache = NULL; 525878c2a44aSAlex Elder } 52591c2a9dfeSAlex Elder 5260868311b1SAlex Elder kmem_cache_destroy(rbd_img_request_cache); 5261868311b1SAlex Elder rbd_img_request_cache = NULL; 5262868311b1SAlex Elder 52631c2a9dfeSAlex Elder return -ENOMEM; 52641c2a9dfeSAlex Elder } 52651c2a9dfeSAlex Elder 52661c2a9dfeSAlex Elder static void rbd_slab_exit(void) 52671c2a9dfeSAlex Elder { 526878c2a44aSAlex Elder rbd_assert(rbd_segment_name_cache); 526978c2a44aSAlex Elder kmem_cache_destroy(rbd_segment_name_cache); 527078c2a44aSAlex Elder rbd_segment_name_cache = NULL; 527178c2a44aSAlex Elder 5272868311b1SAlex Elder rbd_assert(rbd_obj_request_cache); 5273868311b1SAlex Elder kmem_cache_destroy(rbd_obj_request_cache); 5274868311b1SAlex Elder rbd_obj_request_cache = NULL; 5275868311b1SAlex Elder 52761c2a9dfeSAlex Elder rbd_assert(rbd_img_request_cache); 52771c2a9dfeSAlex Elder kmem_cache_destroy(rbd_img_request_cache); 52781c2a9dfeSAlex Elder rbd_img_request_cache = NULL; 52791c2a9dfeSAlex Elder } 52801c2a9dfeSAlex Elder 5281cc344fa1SAlex Elder static int __init rbd_init(void) 5282602adf40SYehuda Sadeh { 5283602adf40SYehuda Sadeh int rc; 5284602adf40SYehuda Sadeh 52851e32d34cSAlex Elder if (!libceph_compatible(NULL)) { 52861e32d34cSAlex Elder rbd_warn(NULL, "libceph incompatibility (quitting)"); 
52871e32d34cSAlex Elder 52881e32d34cSAlex Elder return -EINVAL; 52891e32d34cSAlex Elder } 52901c2a9dfeSAlex Elder rc = rbd_slab_init(); 5291602adf40SYehuda Sadeh if (rc) 5292602adf40SYehuda Sadeh return rc; 52931c2a9dfeSAlex Elder rc = rbd_sysfs_init(); 52941c2a9dfeSAlex Elder if (rc) 52951c2a9dfeSAlex Elder rbd_slab_exit(); 52961c2a9dfeSAlex Elder else 5297f0f8cef5SAlex Elder pr_info("loaded " RBD_DRV_NAME_LONG "\n"); 52981c2a9dfeSAlex Elder 52991c2a9dfeSAlex Elder return rc; 5300602adf40SYehuda Sadeh } 5301602adf40SYehuda Sadeh 5302cc344fa1SAlex Elder static void __exit rbd_exit(void) 5303602adf40SYehuda Sadeh { 5304602adf40SYehuda Sadeh rbd_sysfs_cleanup(); 53051c2a9dfeSAlex Elder rbd_slab_exit(); 5306602adf40SYehuda Sadeh } 5307602adf40SYehuda Sadeh 5308602adf40SYehuda Sadeh module_init(rbd_init); 5309602adf40SYehuda Sadeh module_exit(rbd_exit); 5310602adf40SYehuda Sadeh 5311d552c619SAlex Elder MODULE_AUTHOR("Alex Elder <elder@inktank.com>"); 5312602adf40SYehuda Sadeh MODULE_AUTHOR("Sage Weil <sage@newdream.net>"); 5313602adf40SYehuda Sadeh MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>"); 5314602adf40SYehuda Sadeh MODULE_DESCRIPTION("rados block device"); 5315602adf40SYehuda Sadeh 5316602adf40SYehuda Sadeh /* following authorship retained from original osdblk.c */ 5317602adf40SYehuda Sadeh MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>"); 5318602adf40SYehuda Sadeh 5319602adf40SYehuda Sadeh MODULE_LICENSE("GPL"); 5320