1602adf40SYehuda Sadeh /* 2602adf40SYehuda Sadeh rbd.c -- Export ceph rados objects as a Linux block device 3602adf40SYehuda Sadeh 4602adf40SYehuda Sadeh 5602adf40SYehuda Sadeh based on drivers/block/osdblk.c: 6602adf40SYehuda Sadeh 7602adf40SYehuda Sadeh Copyright 2009 Red Hat, Inc. 8602adf40SYehuda Sadeh 9602adf40SYehuda Sadeh This program is free software; you can redistribute it and/or modify 10602adf40SYehuda Sadeh it under the terms of the GNU General Public License as published by 11602adf40SYehuda Sadeh the Free Software Foundation. 12602adf40SYehuda Sadeh 13602adf40SYehuda Sadeh This program is distributed in the hope that it will be useful, 14602adf40SYehuda Sadeh but WITHOUT ANY WARRANTY; without even the implied warranty of 15602adf40SYehuda Sadeh MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16602adf40SYehuda Sadeh GNU General Public License for more details. 17602adf40SYehuda Sadeh 18602adf40SYehuda Sadeh You should have received a copy of the GNU General Public License 19602adf40SYehuda Sadeh along with this program; see the file COPYING. If not, write to 20602adf40SYehuda Sadeh the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. 
21602adf40SYehuda Sadeh 22602adf40SYehuda Sadeh 23602adf40SYehuda Sadeh 24dfc5606dSYehuda Sadeh For usage instructions, please refer to: 25602adf40SYehuda Sadeh 26dfc5606dSYehuda Sadeh Documentation/ABI/testing/sysfs-bus-rbd 27602adf40SYehuda Sadeh 28602adf40SYehuda Sadeh */ 29602adf40SYehuda Sadeh 30602adf40SYehuda Sadeh #include <linux/ceph/libceph.h> 31602adf40SYehuda Sadeh #include <linux/ceph/osd_client.h> 32602adf40SYehuda Sadeh #include <linux/ceph/mon_client.h> 33602adf40SYehuda Sadeh #include <linux/ceph/decode.h> 3459c2be1eSYehuda Sadeh #include <linux/parser.h> 35602adf40SYehuda Sadeh 36602adf40SYehuda Sadeh #include <linux/kernel.h> 37602adf40SYehuda Sadeh #include <linux/device.h> 38602adf40SYehuda Sadeh #include <linux/module.h> 39602adf40SYehuda Sadeh #include <linux/fs.h> 40602adf40SYehuda Sadeh #include <linux/blkdev.h> 41602adf40SYehuda Sadeh 42602adf40SYehuda Sadeh #include "rbd_types.h" 43602adf40SYehuda Sadeh 44aafb230eSAlex Elder #define RBD_DEBUG /* Activate rbd_assert() calls */ 45aafb230eSAlex Elder 46593a9e7bSAlex Elder /* 47593a9e7bSAlex Elder * The basic unit of block I/O is a sector. It is interpreted in a 48593a9e7bSAlex Elder * number of contexts in Linux (blk, bio, genhd), but the default is 49593a9e7bSAlex Elder * universally 512 bytes. These symbols are just slightly more 50593a9e7bSAlex Elder * meaningful than the bare numbers they represent. 
51593a9e7bSAlex Elder */ 52593a9e7bSAlex Elder #define SECTOR_SHIFT 9 53593a9e7bSAlex Elder #define SECTOR_SIZE (1ULL << SECTOR_SHIFT) 54593a9e7bSAlex Elder 55f0f8cef5SAlex Elder #define RBD_DRV_NAME "rbd" 56f0f8cef5SAlex Elder #define RBD_DRV_NAME_LONG "rbd (rados block device)" 57602adf40SYehuda Sadeh 58602adf40SYehuda Sadeh #define RBD_MINORS_PER_MAJOR 256 /* max minors per blkdev */ 59602adf40SYehuda Sadeh 60d4b125e9SAlex Elder #define RBD_SNAP_DEV_NAME_PREFIX "snap_" 61d4b125e9SAlex Elder #define RBD_MAX_SNAP_NAME_LEN \ 62d4b125e9SAlex Elder (NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1)) 63d4b125e9SAlex Elder 6435d489f9SAlex Elder #define RBD_MAX_SNAP_COUNT 510 /* allows max snapc to fit in 4KB */ 65602adf40SYehuda Sadeh 66602adf40SYehuda Sadeh #define RBD_SNAP_HEAD_NAME "-" 67602adf40SYehuda Sadeh 689e15b77dSAlex Elder /* This allows a single page to hold an image name sent by OSD */ 699e15b77dSAlex Elder #define RBD_IMAGE_NAME_LEN_MAX (PAGE_SIZE - sizeof (__le32) - 1) 70589d30e0SAlex Elder #define RBD_IMAGE_ID_LEN_MAX 64 719e15b77dSAlex Elder 721e130199SAlex Elder #define RBD_OBJ_PREFIX_LEN_MAX 64 73589d30e0SAlex Elder 74d889140cSAlex Elder /* Feature bits */ 75d889140cSAlex Elder 765cbf6f12SAlex Elder #define RBD_FEATURE_LAYERING (1<<0) 775cbf6f12SAlex Elder #define RBD_FEATURE_STRIPINGV2 (1<<1) 785cbf6f12SAlex Elder #define RBD_FEATURES_ALL \ 795cbf6f12SAlex Elder (RBD_FEATURE_LAYERING | RBD_FEATURE_STRIPINGV2) 80d889140cSAlex Elder 81d889140cSAlex Elder /* Features supported by this (client software) implementation. */ 82d889140cSAlex Elder 83770eba6eSAlex Elder #define RBD_FEATURES_SUPPORTED (RBD_FEATURES_ALL) 84d889140cSAlex Elder 8581a89793SAlex Elder /* 8681a89793SAlex Elder * An RBD device name will be "rbd#", where the "rbd" comes from 8781a89793SAlex Elder * RBD_DRV_NAME above, and # is a unique integer identifier. 
8881a89793SAlex Elder * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big 8981a89793SAlex Elder * enough to hold all possible device names. 9081a89793SAlex Elder */ 91602adf40SYehuda Sadeh #define DEV_NAME_LEN 32 9281a89793SAlex Elder #define MAX_INT_FORMAT_WIDTH ((5 * sizeof (int)) / 2 + 1) 93602adf40SYehuda Sadeh 94602adf40SYehuda Sadeh /* 95602adf40SYehuda Sadeh * block device image metadata (in-memory version) 96602adf40SYehuda Sadeh */ 97602adf40SYehuda Sadeh struct rbd_image_header { 98f84344f3SAlex Elder /* These four fields never change for a given rbd image */ 99849b4260SAlex Elder char *object_prefix; 10034b13184SAlex Elder u64 features; 101602adf40SYehuda Sadeh __u8 obj_order; 102602adf40SYehuda Sadeh __u8 crypt_type; 103602adf40SYehuda Sadeh __u8 comp_type; 104602adf40SYehuda Sadeh 105f84344f3SAlex Elder /* The remaining fields need to be updated occasionally */ 106f84344f3SAlex Elder u64 image_size; 107f84344f3SAlex Elder struct ceph_snap_context *snapc; 108602adf40SYehuda Sadeh char *snap_names; 109602adf40SYehuda Sadeh u64 *snap_sizes; 11059c2be1eSYehuda Sadeh 11159c2be1eSYehuda Sadeh u64 obj_version; 11259c2be1eSYehuda Sadeh }; 11359c2be1eSYehuda Sadeh 1140d7dbfceSAlex Elder /* 1150d7dbfceSAlex Elder * An rbd image specification. 1160d7dbfceSAlex Elder * 1170d7dbfceSAlex Elder * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely 118c66c6e0cSAlex Elder * identify an image. Each rbd_dev structure includes a pointer to 119c66c6e0cSAlex Elder * an rbd_spec structure that encapsulates this identity. 120c66c6e0cSAlex Elder * 121c66c6e0cSAlex Elder * Each of the id's in an rbd_spec has an associated name. For a 122c66c6e0cSAlex Elder * user-mapped image, the names are supplied and the id's associated 123c66c6e0cSAlex Elder * with them are looked up. For a layered image, a parent image is 124c66c6e0cSAlex Elder * defined by the tuple, and the names are looked up. 
125c66c6e0cSAlex Elder * 126c66c6e0cSAlex Elder * An rbd_dev structure contains a parent_spec pointer which is 127c66c6e0cSAlex Elder * non-null if the image it represents is a child in a layered 128c66c6e0cSAlex Elder * image. This pointer will refer to the rbd_spec structure used 129c66c6e0cSAlex Elder * by the parent rbd_dev for its own identity (i.e., the structure 130c66c6e0cSAlex Elder * is shared between the parent and child). 131c66c6e0cSAlex Elder * 132c66c6e0cSAlex Elder * Since these structures are populated once, during the discovery 133c66c6e0cSAlex Elder * phase of image construction, they are effectively immutable so 134c66c6e0cSAlex Elder * we make no effort to synchronize access to them. 135c66c6e0cSAlex Elder * 136c66c6e0cSAlex Elder * Note that code herein does not assume the image name is known (it 137c66c6e0cSAlex Elder * could be a null pointer). 1380d7dbfceSAlex Elder */ 1390d7dbfceSAlex Elder struct rbd_spec { 1400d7dbfceSAlex Elder u64 pool_id; 1410d7dbfceSAlex Elder char *pool_name; 1420d7dbfceSAlex Elder 1430d7dbfceSAlex Elder char *image_id; 1440d7dbfceSAlex Elder char *image_name; 1450d7dbfceSAlex Elder 1460d7dbfceSAlex Elder u64 snap_id; 1470d7dbfceSAlex Elder char *snap_name; 1480d7dbfceSAlex Elder 1490d7dbfceSAlex Elder struct kref kref; 1500d7dbfceSAlex Elder }; 1510d7dbfceSAlex Elder 152602adf40SYehuda Sadeh /* 153f0f8cef5SAlex Elder * an instance of the client. multiple devices may share an rbd client. 154602adf40SYehuda Sadeh */ 155602adf40SYehuda Sadeh struct rbd_client { 156602adf40SYehuda Sadeh struct ceph_client *client; 157602adf40SYehuda Sadeh struct kref kref; 158602adf40SYehuda Sadeh struct list_head node; 159602adf40SYehuda Sadeh }; 160602adf40SYehuda Sadeh 161bf0d5f50SAlex Elder struct rbd_img_request; 162bf0d5f50SAlex Elder typedef void (*rbd_img_callback_t)(struct rbd_img_request *); 163bf0d5f50SAlex Elder 164bf0d5f50SAlex Elder #define BAD_WHICH U32_MAX /* Good which or bad which, which? 
*/ 165bf0d5f50SAlex Elder 166bf0d5f50SAlex Elder struct rbd_obj_request; 167bf0d5f50SAlex Elder typedef void (*rbd_obj_callback_t)(struct rbd_obj_request *); 168bf0d5f50SAlex Elder 1699969ebc5SAlex Elder enum obj_request_type { 1709969ebc5SAlex Elder OBJ_REQUEST_NODATA, OBJ_REQUEST_BIO, OBJ_REQUEST_PAGES 1719969ebc5SAlex Elder }; 172bf0d5f50SAlex Elder 173926f9b3fSAlex Elder enum obj_req_flags { 174926f9b3fSAlex Elder OBJ_REQ_DONE, /* completion flag: not done = 0, done = 1 */ 1756365d33aSAlex Elder OBJ_REQ_IMG_DATA, /* object usage: standalone = 0, image = 1 */ 1765679c59fSAlex Elder OBJ_REQ_KNOWN, /* EXISTS flag valid: no = 0, yes = 1 */ 1775679c59fSAlex Elder OBJ_REQ_EXISTS, /* target exists: no = 0, yes = 1 */ 178926f9b3fSAlex Elder }; 179926f9b3fSAlex Elder 180bf0d5f50SAlex Elder struct rbd_obj_request { 181bf0d5f50SAlex Elder const char *object_name; 182bf0d5f50SAlex Elder u64 offset; /* object start byte */ 183bf0d5f50SAlex Elder u64 length; /* bytes from offset */ 184926f9b3fSAlex Elder unsigned long flags; 185bf0d5f50SAlex Elder 186c5b5ef6cSAlex Elder /* 187c5b5ef6cSAlex Elder * An object request associated with an image will have its 188c5b5ef6cSAlex Elder * img_data flag set; a standalone object request will not. 189c5b5ef6cSAlex Elder * 190c5b5ef6cSAlex Elder * A standalone object request will have which == BAD_WHICH 191c5b5ef6cSAlex Elder * and a null obj_request pointer. 192c5b5ef6cSAlex Elder * 193c5b5ef6cSAlex Elder * An object request initiated in support of a layered image 194c5b5ef6cSAlex Elder * object (to check for its existence before a write) will 195c5b5ef6cSAlex Elder * have which == BAD_WHICH and a non-null obj_request pointer. 196c5b5ef6cSAlex Elder * 197c5b5ef6cSAlex Elder * Finally, an object request for rbd image data will have 198c5b5ef6cSAlex Elder * which != BAD_WHICH, and will have a non-null img_request 199c5b5ef6cSAlex Elder * pointer. 
The value of which will be in the range 200c5b5ef6cSAlex Elder * 0..(img_request->obj_request_count-1). 201c5b5ef6cSAlex Elder */ 202c5b5ef6cSAlex Elder union { 203c5b5ef6cSAlex Elder struct rbd_obj_request *obj_request; /* STAT op */ 204c5b5ef6cSAlex Elder struct { 205bf0d5f50SAlex Elder struct rbd_img_request *img_request; 206c5b5ef6cSAlex Elder u64 img_offset; 207c5b5ef6cSAlex Elder /* links for img_request->obj_requests list */ 208c5b5ef6cSAlex Elder struct list_head links; 209c5b5ef6cSAlex Elder }; 210c5b5ef6cSAlex Elder }; 211bf0d5f50SAlex Elder u32 which; /* position in image request list, or BAD_WHICH */ 212bf0d5f50SAlex Elder /* NOTE(review): type presumably selects the valid member of the union below (bio_list for OBJ_REQUEST_BIO, pages/page_count for OBJ_REQUEST_PAGES) - confirm against users */ 213bf0d5f50SAlex Elder enum obj_request_type type; 214788e2df3SAlex Elder union { 215bf0d5f50SAlex Elder struct bio *bio_list; 216788e2df3SAlex Elder struct { 217788e2df3SAlex Elder struct page **pages; 218788e2df3SAlex Elder u32 page_count; 219788e2df3SAlex Elder }; 220788e2df3SAlex Elder }; 2210eefd470SAlex Elder struct page **copyup_pages; 222bf0d5f50SAlex Elder 223bf0d5f50SAlex Elder struct ceph_osd_request *osd_req; 224bf0d5f50SAlex Elder 225bf0d5f50SAlex Elder u64 xferred; /* bytes transferred */ 226bf0d5f50SAlex Elder u64 version; 2271b83bef2SSage Weil int result; 228bf0d5f50SAlex Elder 229bf0d5f50SAlex Elder rbd_obj_callback_t callback; 230788e2df3SAlex Elder struct completion completion; 231bf0d5f50SAlex Elder 232bf0d5f50SAlex Elder struct kref kref; 233bf0d5f50SAlex Elder }; 234bf0d5f50SAlex Elder /* Bit numbers within rbd_img_request.flags */ 2350c425248SAlex Elder enum img_req_flags { 2369849e986SAlex Elder IMG_REQ_WRITE, /* I/O direction: read = 0, write = 1 */ 2379849e986SAlex Elder IMG_REQ_CHILD, /* initiator: block = 0, child image = 1 */ 238d0b2e944SAlex Elder IMG_REQ_LAYERED, /* ENOENT handling: normal = 0, layered = 1 */ 2390c425248SAlex Elder }; 2400c425248SAlex Elder 241bf0d5f50SAlex Elder struct rbd_img_request { 242bf0d5f50SAlex Elder struct rbd_device *rbd_dev; 243bf0d5f50SAlex Elder u64 offset; /* starting image byte offset */ 244bf0d5f50SAlex Elder u64 length; /* byte count 
from offset */ 2450c425248SAlex Elder unsigned long flags; 246bf0d5f50SAlex Elder union { 247bf0d5f50SAlex Elder u64 snap_id; /* for reads */ 2489849e986SAlex Elder struct ceph_snap_context *snapc; /* for writes */ 2499849e986SAlex Elder }; 2509849e986SAlex Elder union { 2519849e986SAlex Elder struct request *rq; /* block request */ 2529849e986SAlex Elder struct rbd_obj_request *obj_request; /* obj req initiator */ 253bf0d5f50SAlex Elder }; 2543d7efd18SAlex Elder struct page **copyup_pages; 255bf0d5f50SAlex Elder spinlock_t completion_lock;/* protects next_completion */ 256bf0d5f50SAlex Elder u32 next_completion; 257bf0d5f50SAlex Elder rbd_img_callback_t callback; 25855f27e09SAlex Elder u64 xferred;/* aggregate bytes transferred */ 259a5a337d4SAlex Elder int result; /* first nonzero obj_request result */ 260bf0d5f50SAlex Elder 261bf0d5f50SAlex Elder u32 obj_request_count; 262bf0d5f50SAlex Elder struct list_head obj_requests; /* rbd_obj_request structs */ 263bf0d5f50SAlex Elder 264bf0d5f50SAlex Elder struct kref kref; 265bf0d5f50SAlex Elder }; 266bf0d5f50SAlex Elder /* Iterators over the object requests on ireq->obj_requests, linked through each entry's links member. for_each_obj_request_from() continues from the current value of oreq. Note that for_each_obj_request_safe() is built on list_for_each_entry_safe_reverse(), so it visits the entries from the tail of the list toward the head, with n holding the next entry to visit. */ 267bf0d5f50SAlex Elder #define for_each_obj_request(ireq, oreq) \ 268ef06f4d3SAlex Elder list_for_each_entry(oreq, &(ireq)->obj_requests, links) 269bf0d5f50SAlex Elder #define for_each_obj_request_from(ireq, oreq) \ 270ef06f4d3SAlex Elder list_for_each_entry_from(oreq, &(ireq)->obj_requests, links) 271bf0d5f50SAlex Elder #define for_each_obj_request_safe(ireq, oreq, n) \ 272ef06f4d3SAlex Elder list_for_each_entry_safe_reverse(oreq, n, &(ireq)->obj_requests, links) 273bf0d5f50SAlex Elder 274dfc5606dSYehuda Sadeh struct rbd_snap { 275dfc5606dSYehuda Sadeh const char *name; 2763591538fSJosh Durgin u64 size; 277dfc5606dSYehuda Sadeh struct list_head node; 278dfc5606dSYehuda Sadeh u64 id; 27934b13184SAlex Elder u64 features; 280dfc5606dSYehuda Sadeh }; 281dfc5606dSYehuda Sadeh 282f84344f3SAlex Elder struct rbd_mapping { 28399c1f08fSAlex Elder u64 size; 28434b13184SAlex Elder u64 features; 285f84344f3SAlex 
Elder bool read_only; 286f84344f3SAlex Elder }; 287f84344f3SAlex Elder 288602adf40SYehuda Sadeh /* 289602adf40SYehuda Sadeh * a single device 290602adf40SYehuda Sadeh */ 291602adf40SYehuda Sadeh struct rbd_device { 292de71a297SAlex Elder int dev_id; /* blkdev unique id */ 293602adf40SYehuda Sadeh 294602adf40SYehuda Sadeh int major; /* blkdev assigned major */ 295602adf40SYehuda Sadeh struct gendisk *disk; /* blkdev's gendisk and rq */ 296602adf40SYehuda Sadeh 297a30b71b9SAlex Elder u32 image_format; /* Either 1 or 2 */ 298602adf40SYehuda Sadeh struct rbd_client *rbd_client; 299602adf40SYehuda Sadeh 300602adf40SYehuda Sadeh char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */ 301602adf40SYehuda Sadeh 302b82d167bSAlex Elder spinlock_t lock; /* queue, flags, open_count */ 303602adf40SYehuda Sadeh 304602adf40SYehuda Sadeh struct rbd_image_header header; 305b82d167bSAlex Elder unsigned long flags; /* possibly lock protected */ 3060d7dbfceSAlex Elder struct rbd_spec *spec; 307602adf40SYehuda Sadeh 3080d7dbfceSAlex Elder char *header_name; 309971f839aSAlex Elder 3100903e875SAlex Elder struct ceph_file_layout layout; 3110903e875SAlex Elder 31259c2be1eSYehuda Sadeh struct ceph_osd_event *watch_event; 313975241afSAlex Elder struct rbd_obj_request *watch_request; 31459c2be1eSYehuda Sadeh 31586b00e0dSAlex Elder struct rbd_spec *parent_spec; 31686b00e0dSAlex Elder u64 parent_overlap; 3172f82ee54SAlex Elder struct rbd_device *parent; 31886b00e0dSAlex Elder 319cc070d59SAlex Elder u64 stripe_unit; 320cc070d59SAlex Elder u64 stripe_count; 321cc070d59SAlex Elder 322c666601aSJosh Durgin /* protects updating the header */ 323c666601aSJosh Durgin struct rw_semaphore header_rwsem; 324f84344f3SAlex Elder 325f84344f3SAlex Elder struct rbd_mapping mapping; 326602adf40SYehuda Sadeh 327602adf40SYehuda Sadeh struct list_head node; 328dfc5606dSYehuda Sadeh 329dfc5606dSYehuda Sadeh /* list of snapshots */ 330dfc5606dSYehuda Sadeh struct list_head snaps; 331dfc5606dSYehuda Sadeh 
332dfc5606dSYehuda Sadeh /* sysfs related */ 333dfc5606dSYehuda Sadeh struct device dev; 334b82d167bSAlex Elder unsigned long open_count; /* protected by lock */ 335dfc5606dSYehuda Sadeh }; 336dfc5606dSYehuda Sadeh 337b82d167bSAlex Elder /* 338b82d167bSAlex Elder * Flag bits for rbd_dev->flags. If atomicity is required, 339b82d167bSAlex Elder * rbd_dev->lock is used to protect access. 340b82d167bSAlex Elder * 341b82d167bSAlex Elder * Currently, only the "removing" flag (which is coupled with the 342b82d167bSAlex Elder * "open_count" field) requires atomic access. 343b82d167bSAlex Elder */ 3446d292906SAlex Elder enum rbd_dev_flags { 3456d292906SAlex Elder RBD_DEV_FLAG_EXISTS, /* mapped snapshot has not been deleted */ 346b82d167bSAlex Elder RBD_DEV_FLAG_REMOVING, /* this mapping is being removed */ 3476d292906SAlex Elder }; 3486d292906SAlex Elder 349602adf40SYehuda Sadeh static DEFINE_MUTEX(ctl_mutex); /* Serialize open/close/setup/teardown */ 350e124a82fSAlex Elder 351602adf40SYehuda Sadeh static LIST_HEAD(rbd_dev_list); /* devices */ 352e124a82fSAlex Elder static DEFINE_SPINLOCK(rbd_dev_list_lock); 353e124a82fSAlex Elder 354602adf40SYehuda Sadeh static LIST_HEAD(rbd_client_list); /* clients */ 355432b8587SAlex Elder static DEFINE_SPINLOCK(rbd_client_list_lock); 356602adf40SYehuda Sadeh 3573d7efd18SAlex Elder static int rbd_img_request_submit(struct rbd_img_request *img_request); 3583d7efd18SAlex Elder 359304f6808SAlex Elder static int rbd_dev_snaps_update(struct rbd_device *rbd_dev); 360304f6808SAlex Elder 361dfc5606dSYehuda Sadeh static void rbd_dev_release(struct device *dev); 3626087b51bSAlex Elder static void rbd_snap_destroy(struct rbd_snap *snap); 363dfc5606dSYehuda Sadeh 364f0f8cef5SAlex Elder static ssize_t rbd_add(struct bus_type *bus, const char *buf, 365f0f8cef5SAlex Elder size_t count); 366f0f8cef5SAlex Elder static ssize_t rbd_remove(struct bus_type *bus, const char *buf, 367f0f8cef5SAlex Elder size_t count); 3682f82ee54SAlex Elder static int 
rbd_dev_probe(struct rbd_device *rbd_dev); 369f0f8cef5SAlex Elder 370f0f8cef5SAlex Elder static struct bus_attribute rbd_bus_attrs[] = { 371f0f8cef5SAlex Elder __ATTR(add, S_IWUSR, NULL, rbd_add), 372f0f8cef5SAlex Elder __ATTR(remove, S_IWUSR, NULL, rbd_remove), 373f0f8cef5SAlex Elder __ATTR_NULL 374f0f8cef5SAlex Elder }; 375f0f8cef5SAlex Elder 376f0f8cef5SAlex Elder static struct bus_type rbd_bus_type = { 377f0f8cef5SAlex Elder .name = "rbd", 378f0f8cef5SAlex Elder .bus_attrs = rbd_bus_attrs, 379f0f8cef5SAlex Elder }; 380f0f8cef5SAlex Elder 381f0f8cef5SAlex Elder static void rbd_root_dev_release(struct device *dev) 382f0f8cef5SAlex Elder { 383f0f8cef5SAlex Elder } 384f0f8cef5SAlex Elder 385f0f8cef5SAlex Elder static struct device rbd_root_dev = { 386f0f8cef5SAlex Elder .init_name = "rbd", 387f0f8cef5SAlex Elder .release = rbd_root_dev_release, 388f0f8cef5SAlex Elder }; 389f0f8cef5SAlex Elder 39006ecc6cbSAlex Elder static __printf(2, 3) 39106ecc6cbSAlex Elder void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...) 
39206ecc6cbSAlex Elder { 39306ecc6cbSAlex Elder struct va_format vaf; 39406ecc6cbSAlex Elder va_list args; 39506ecc6cbSAlex Elder 39606ecc6cbSAlex Elder va_start(args, fmt); 39706ecc6cbSAlex Elder vaf.fmt = fmt; 39806ecc6cbSAlex Elder vaf.va = &args; 39906ecc6cbSAlex Elder /* Prefix the warning with the most specific identifier available, in order: nothing (null rbd_dev), the disk name, the image name, the image id, and finally the rbd_dev pointer itself. */ 40006ecc6cbSAlex Elder if (!rbd_dev) 40106ecc6cbSAlex Elder printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf); 40206ecc6cbSAlex Elder else if (rbd_dev->disk) 40306ecc6cbSAlex Elder printk(KERN_WARNING "%s: %s: %pV\n", 40406ecc6cbSAlex Elder RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf); 40506ecc6cbSAlex Elder else if (rbd_dev->spec && rbd_dev->spec->image_name) 40606ecc6cbSAlex Elder printk(KERN_WARNING "%s: image %s: %pV\n", 40706ecc6cbSAlex Elder RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf); 40806ecc6cbSAlex Elder else if (rbd_dev->spec && rbd_dev->spec->image_id) 40906ecc6cbSAlex Elder printk(KERN_WARNING "%s: id %s: %pV\n", 41006ecc6cbSAlex Elder RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf); 41106ecc6cbSAlex Elder else /* punt */ 41206ecc6cbSAlex Elder printk(KERN_WARNING "%s: rbd_dev %p: %pV\n", 41306ecc6cbSAlex Elder RBD_DRV_NAME, rbd_dev, &vaf); 41406ecc6cbSAlex Elder va_end(args); 41506ecc6cbSAlex Elder } 41606ecc6cbSAlex Elder /* rbd_assert(expr): with RBD_DEBUG defined, a false expr logs the function name, line number and expression text at KERN_ERR and then calls BUG(); without RBD_DEBUG it expands to a no-op and expr is not evaluated. NOTE(review): the debug form expands to a bare if-block rather than do { } while (0), so using it as the unbraced body of an if that has an else would capture that else - confirm no caller does so. */ 417aafb230eSAlex Elder #ifdef RBD_DEBUG 418aafb230eSAlex Elder #define rbd_assert(expr) \ 419aafb230eSAlex Elder if (unlikely(!(expr))) { \ 420aafb230eSAlex Elder printk(KERN_ERR "\nAssertion failure in %s() " \ 421aafb230eSAlex Elder "at line %d:\n\n" \ 422aafb230eSAlex Elder "\trbd_assert(%s);\n\n", \ 423aafb230eSAlex Elder __func__, __LINE__, #expr); \ 424aafb230eSAlex Elder BUG(); \ 425aafb230eSAlex Elder } 426aafb230eSAlex Elder #else /* !RBD_DEBUG */ 427aafb230eSAlex Elder # define rbd_assert(expr) ((void) 0) 428aafb230eSAlex Elder #endif /* !RBD_DEBUG */ 429dfc5606dSYehuda Sadeh 4308b3e1a56SAlex Elder static void rbd_img_parent_read(struct rbd_obj_request *obj_request); 431b454e36dSAlex Elder static int rbd_img_obj_request_submit(struct rbd_obj_request 
*obj_request); 4328b3e1a56SAlex Elder 433117973fbSAlex Elder static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver); 434117973fbSAlex Elder static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver); 43559c2be1eSYehuda Sadeh /* Block device open handler. Returns -EROFS for a write open of a read-only mapping and -ENOENT if the device is flagged as being removed; otherwise increments open_count, takes a reference on the device, applies the read-only setting to bdev and returns 0. */ 436602adf40SYehuda Sadeh static int rbd_open(struct block_device *bdev, fmode_t mode) 437602adf40SYehuda Sadeh { 438f0f8cef5SAlex Elder struct rbd_device *rbd_dev = bdev->bd_disk->private_data; 439b82d167bSAlex Elder bool removing = false; 440602adf40SYehuda Sadeh 441f84344f3SAlex Elder if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only) 442602adf40SYehuda Sadeh return -EROFS; 443602adf40SYehuda Sadeh /* The REMOVING flag is tested and open_count is bumped inside one critical section on rbd_dev->lock. */ 444a14ea269SAlex Elder spin_lock_irq(&rbd_dev->lock); 445b82d167bSAlex Elder if (test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags)) 446b82d167bSAlex Elder removing = true; 447b82d167bSAlex Elder else 448b82d167bSAlex Elder rbd_dev->open_count++; 449a14ea269SAlex Elder spin_unlock_irq(&rbd_dev->lock); 450b82d167bSAlex Elder if (removing) 451b82d167bSAlex Elder return -ENOENT; 452b82d167bSAlex Elder 45342382b70SAlex Elder mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); 454c3e946ceSAlex Elder (void) get_device(&rbd_dev->dev); 455f84344f3SAlex Elder set_device_ro(bdev, rbd_dev->mapping.read_only); 45642382b70SAlex Elder mutex_unlock(&ctl_mutex); 457340c7a2bSAlex Elder 458602adf40SYehuda Sadeh return 0; 459602adf40SYehuda Sadeh } 460602adf40SYehuda Sadeh /* Block device release handler: decrements open_count under rbd_dev->lock, asserts that the count was nonzero beforehand, then drops a device reference under ctl_mutex. Always returns 0. */ 461dfc5606dSYehuda Sadeh static int rbd_release(struct gendisk *disk, fmode_t mode) 462dfc5606dSYehuda Sadeh { 463dfc5606dSYehuda Sadeh struct rbd_device *rbd_dev = disk->private_data; 464b82d167bSAlex Elder unsigned long open_count_before; 465b82d167bSAlex Elder 466a14ea269SAlex Elder spin_lock_irq(&rbd_dev->lock); /* post-decrement: remember the count as it was before this release */ 467b82d167bSAlex Elder open_count_before = rbd_dev->open_count--; 468a14ea269SAlex Elder spin_unlock_irq(&rbd_dev->lock); 469b82d167bSAlex Elder rbd_assert(open_count_before > 0); 470dfc5606dSYehuda Sadeh 47142382b70SAlex Elder mutex_lock_nested(&ctl_mutex, 
SINGLE_DEPTH_NESTING); 472c3e946ceSAlex Elder put_device(&rbd_dev->dev); 47342382b70SAlex Elder mutex_unlock(&ctl_mutex); 474dfc5606dSYehuda Sadeh 475dfc5606dSYehuda Sadeh return 0; 476dfc5606dSYehuda Sadeh } 477dfc5606dSYehuda Sadeh /* Block device operations for rbd devices; only open and release are provided here. */ 478602adf40SYehuda Sadeh static const struct block_device_operations rbd_bd_ops = { 479602adf40SYehuda Sadeh .owner = THIS_MODULE, 480602adf40SYehuda Sadeh .open = rbd_open, 481dfc5606dSYehuda Sadeh .release = rbd_release, 482602adf40SYehuda Sadeh }; 483602adf40SYehuda Sadeh 484602adf40SYehuda Sadeh /* 485602adf40SYehuda Sadeh * Initialize an rbd client instance. 48643ae4701SAlex Elder * We own *ceph_opts. Once ceph_create_client() succeeds the new ceph client is responsible for the options; on any earlier failure they are destroyed here. On success the new client is appended to rbd_client_list and returned; on failure an ERR_PTR() is returned. 487602adf40SYehuda Sadeh */ 488f8c38929SAlex Elder static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts) 489602adf40SYehuda Sadeh { 490602adf40SYehuda Sadeh struct rbd_client *rbdc; 491602adf40SYehuda Sadeh int ret = -ENOMEM; 492602adf40SYehuda Sadeh 49337206ee5SAlex Elder dout("%s:\n", __func__); 494602adf40SYehuda Sadeh rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL); 495602adf40SYehuda Sadeh if (!rbdc) 496602adf40SYehuda Sadeh goto out_opt; 497602adf40SYehuda Sadeh 498602adf40SYehuda Sadeh kref_init(&rbdc->kref); 499602adf40SYehuda Sadeh INIT_LIST_HEAD(&rbdc->node); 500602adf40SYehuda Sadeh 501bc534d86SAlex Elder mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); 502bc534d86SAlex Elder 50343ae4701SAlex Elder rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0); 504602adf40SYehuda Sadeh if (IS_ERR(rbdc->client)) 505bc534d86SAlex Elder goto out_mutex; /* NOTE(review): ret is still -ENOMEM on this path; the error encoded in rbdc->client is not propagated - confirm this is intended */ 50643ae4701SAlex Elder ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */ 507602adf40SYehuda Sadeh 508602adf40SYehuda Sadeh ret = ceph_open_session(rbdc->client); 509602adf40SYehuda Sadeh if (ret < 0) 510602adf40SYehuda Sadeh goto out_err; 511602adf40SYehuda Sadeh 512432b8587SAlex Elder spin_lock(&rbd_client_list_lock); 513602adf40SYehuda Sadeh list_add_tail(&rbdc->node, &rbd_client_list); 514432b8587SAlex Elder 
spin_unlock(&rbd_client_list_lock); 515602adf40SYehuda Sadeh 516bc534d86SAlex Elder mutex_unlock(&ctl_mutex); 51737206ee5SAlex Elder dout("%s: rbdc %p\n", __func__, rbdc); 518bc534d86SAlex Elder 519602adf40SYehuda Sadeh return rbdc; 520602adf40SYehuda Sadeh /* Error unwinding: out_err additionally destroys the ceph client, out_mutex drops ctl_mutex and frees rbdc, out_opt destroys the options if they are still ours. */ 521602adf40SYehuda Sadeh out_err: 522602adf40SYehuda Sadeh ceph_destroy_client(rbdc->client); 523bc534d86SAlex Elder out_mutex: 524bc534d86SAlex Elder mutex_unlock(&ctl_mutex); 525602adf40SYehuda Sadeh kfree(rbdc); 526602adf40SYehuda Sadeh out_opt: /* ceph_opts was set to NULL once a ceph client took responsibility for it */ 52743ae4701SAlex Elder if (ceph_opts) 52843ae4701SAlex Elder ceph_destroy_options(ceph_opts); 52937206ee5SAlex Elder dout("%s: error %d\n", __func__, ret); 53037206ee5SAlex Elder 53128f259b7SVasiliy Kulikov return ERR_PTR(ret); 532602adf40SYehuda Sadeh } 533602adf40SYehuda Sadeh /* Take an additional reference on rbdc and return the same pointer. */ 5342f82ee54SAlex Elder static struct rbd_client *__rbd_get_client(struct rbd_client *rbdc) 5352f82ee54SAlex Elder { 5362f82ee54SAlex Elder kref_get(&rbdc->kref); 5372f82ee54SAlex Elder 5382f82ee54SAlex Elder return rbdc; 5392f82ee54SAlex Elder } 5402f82ee54SAlex Elder 541602adf40SYehuda Sadeh /* 5421f7ba331SAlex Elder * Find a ceph client with specific addr and configuration. If 5431f7ba331SAlex Elder * found, bump its reference count. 
544602adf40SYehuda Sadeh */ /* Returns the matching client with an extra reference held, or NULL if no client on rbd_client_list matches or if ceph_opts has CEPH_OPT_NOSHARE set. The list is walked under rbd_client_list_lock. */ 5451f7ba331SAlex Elder static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts) 546602adf40SYehuda Sadeh { 547602adf40SYehuda Sadeh struct rbd_client *client_node; 5481f7ba331SAlex Elder bool found = false; 549602adf40SYehuda Sadeh /* with NOSHARE set an existing client is never handed back */ 55043ae4701SAlex Elder if (ceph_opts->flags & CEPH_OPT_NOSHARE) 551602adf40SYehuda Sadeh return NULL; 552602adf40SYehuda Sadeh 5531f7ba331SAlex Elder spin_lock(&rbd_client_list_lock); 5541f7ba331SAlex Elder list_for_each_entry(client_node, &rbd_client_list, node) { /* a zero return from ceph_compare_options() is treated as a match */ 5551f7ba331SAlex Elder if (!ceph_compare_options(ceph_opts, client_node->client)) { 5562f82ee54SAlex Elder __rbd_get_client(client_node); 5572f82ee54SAlex Elder 5581f7ba331SAlex Elder found = true; 5591f7ba331SAlex Elder break; 5601f7ba331SAlex Elder } 5611f7ba331SAlex Elder } 5621f7ba331SAlex Elder spin_unlock(&rbd_client_list_lock); 5631f7ba331SAlex Elder /* client_node does not refer to a real entry when the loop runs to completion, hence the separate found flag */ 5641f7ba331SAlex Elder return found ? client_node : NULL; 565602adf40SYehuda Sadeh } 566602adf40SYehuda Sadeh 567602adf40SYehuda Sadeh /* 56859c2be1eSYehuda Sadeh * mount options: token ids grouped by argument type; the Opt_last_* markers bound each group and are compared against in parse_rbd_opts_token() 56959c2be1eSYehuda Sadeh */ 57059c2be1eSYehuda Sadeh enum { 57159c2be1eSYehuda Sadeh Opt_last_int, 57259c2be1eSYehuda Sadeh /* int args above */ 57359c2be1eSYehuda Sadeh Opt_last_string, 57459c2be1eSYehuda Sadeh /* string args above */ 575cc0538b6SAlex Elder Opt_read_only, 576cc0538b6SAlex Elder Opt_read_write, 577cc0538b6SAlex Elder /* Boolean args above */ 578cc0538b6SAlex Elder Opt_last_bool, 57959c2be1eSYehuda Sadeh }; 58059c2be1eSYehuda Sadeh 58143ae4701SAlex Elder static match_table_t rbd_opts_tokens = { 58259c2be1eSYehuda Sadeh /* int args above */ 58359c2be1eSYehuda Sadeh /* string args above */ 584be466c1cSAlex Elder {Opt_read_only, "read_only"}, 585cc0538b6SAlex Elder {Opt_read_only, "ro"}, /* Alternate spelling */ 586cc0538b6SAlex Elder {Opt_read_write, "read_write"}, 587cc0538b6SAlex Elder {Opt_read_write, "rw"}, /* Alternate spelling */ 588cc0538b6SAlex Elder /* Boolean args above */ 
58959c2be1eSYehuda Sadeh {-1, NULL} 59059c2be1eSYehuda Sadeh }; 59159c2be1eSYehuda Sadeh /* rbd-specific options, filled in by parse_rbd_opts_token() */ 59298571b5aSAlex Elder struct rbd_options { 59398571b5aSAlex Elder bool read_only; 59498571b5aSAlex Elder }; 59598571b5aSAlex Elder 59698571b5aSAlex Elder #define RBD_READ_ONLY_DEFAULT false 59798571b5aSAlex Elder /* Parse a single option string c and record its effect in *private, which is used as a struct rbd_options. Returns 0 on success, -EINVAL if c matches no entry in rbd_opts_tokens, or the match_int() error for a malformed integer argument. */ 59859c2be1eSYehuda Sadeh static int parse_rbd_opts_token(char *c, void *private) 59959c2be1eSYehuda Sadeh { 60043ae4701SAlex Elder struct rbd_options *rbd_opts = private; 60159c2be1eSYehuda Sadeh substring_t argstr[MAX_OPT_ARGS]; 60259c2be1eSYehuda Sadeh int token, intval, ret; 60359c2be1eSYehuda Sadeh 60443ae4701SAlex Elder token = match_token(c, rbd_opts_tokens, argstr); 60559c2be1eSYehuda Sadeh if (token < 0) 60659c2be1eSYehuda Sadeh return -EINVAL; 60759c2be1eSYehuda Sadeh /* Classify the token by its position relative to the Opt_last_* markers. No int or string options are defined in the table at present, so only the Boolean branch can currently be taken. */ 60859c2be1eSYehuda Sadeh if (token < Opt_last_int) { 60959c2be1eSYehuda Sadeh ret = match_int(&argstr[0], &intval); 61059c2be1eSYehuda Sadeh if (ret < 0) { 61159c2be1eSYehuda Sadeh pr_err("bad mount option arg (not int) " 61259c2be1eSYehuda Sadeh "at '%s'\n", c); 61359c2be1eSYehuda Sadeh return ret; 61459c2be1eSYehuda Sadeh } 61559c2be1eSYehuda Sadeh dout("got int token %d val %d\n", token, intval); 61659c2be1eSYehuda Sadeh } else if (token > Opt_last_int && token < Opt_last_string) { 61759c2be1eSYehuda Sadeh dout("got string token %d val %s\n", token, 61859c2be1eSYehuda Sadeh argstr[0].from); 619cc0538b6SAlex Elder } else if (token > Opt_last_string && token < Opt_last_bool) { 620cc0538b6SAlex Elder dout("got Boolean token %d\n", token); 62159c2be1eSYehuda Sadeh } else { 62259c2be1eSYehuda Sadeh dout("got token %d\n", token); 62359c2be1eSYehuda Sadeh } 62459c2be1eSYehuda Sadeh /* Apply the option. */ 62559c2be1eSYehuda Sadeh switch (token) { 626cc0538b6SAlex Elder case Opt_read_only: 627cc0538b6SAlex Elder rbd_opts->read_only = true; 628cc0538b6SAlex Elder break; 629cc0538b6SAlex Elder case Opt_read_write: 630cc0538b6SAlex Elder rbd_opts->read_only = false; 631cc0538b6SAlex Elder break; 63259c2be1eSYehuda Sadeh 
default: 633aafb230eSAlex Elder rbd_assert(false); 634aafb230eSAlex Elder break; 63559c2be1eSYehuda Sadeh } 63659c2be1eSYehuda Sadeh return 0; 63759c2be1eSYehuda Sadeh } 63859c2be1eSYehuda Sadeh 63959c2be1eSYehuda Sadeh /* 640602adf40SYehuda Sadeh * Get a ceph client with specific addr and configuration, if one does 641602adf40SYehuda Sadeh * not exist create it. The caller gives up ceph_opts either way: it is destroyed here when an existing client is reused, and handed to rbd_client_create() otherwise. The result may be an ERR_PTR() from rbd_client_create(). 642602adf40SYehuda Sadeh */ 6439d3997fdSAlex Elder static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts) 644602adf40SYehuda Sadeh { 645f8c38929SAlex Elder struct rbd_client *rbdc; 64659c2be1eSYehuda Sadeh 6471f7ba331SAlex Elder rbdc = rbd_client_find(ceph_opts); 6489d3997fdSAlex Elder if (rbdc) /* using an existing client */ 64943ae4701SAlex Elder ceph_destroy_options(ceph_opts); 6509d3997fdSAlex Elder else 651f8c38929SAlex Elder rbdc = rbd_client_create(ceph_opts); 652d720bcb0SAlex Elder 6539d3997fdSAlex Elder return rbdc; 654602adf40SYehuda Sadeh } 655602adf40SYehuda Sadeh 656602adf40SYehuda Sadeh /* 657602adf40SYehuda Sadeh * Destroy ceph client (kref release callback, see rbd_put_client()) 658d23a4b3fSAlex Elder * 659432b8587SAlex Elder * Takes rbd_client_list_lock itself to unlink the client from rbd_client_list, so the caller must not already hold that lock. 660602adf40SYehuda Sadeh */ 661602adf40SYehuda Sadeh static void rbd_client_release(struct kref *kref) 662602adf40SYehuda Sadeh { 663602adf40SYehuda Sadeh struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref); 664602adf40SYehuda Sadeh 66537206ee5SAlex Elder dout("%s: rbdc %p\n", __func__, rbdc); 666cd9d9f5dSAlex Elder spin_lock(&rbd_client_list_lock); 667602adf40SYehuda Sadeh list_del(&rbdc->node); 668cd9d9f5dSAlex Elder spin_unlock(&rbd_client_list_lock); 669602adf40SYehuda Sadeh 670602adf40SYehuda Sadeh ceph_destroy_client(rbdc->client); 671602adf40SYehuda Sadeh kfree(rbdc); 672602adf40SYehuda Sadeh } 673602adf40SYehuda Sadeh 674602adf40SYehuda Sadeh /* 675602adf40SYehuda Sadeh * Drop reference to ceph client node. If it's not referenced anymore, release 676602adf40SYehuda Sadeh * it. 
677602adf40SYehuda Sadeh */ 6789d3997fdSAlex Elder static void rbd_put_client(struct rbd_client *rbdc) 679602adf40SYehuda Sadeh { 680c53d5893SAlex Elder if (rbdc) 6819d3997fdSAlex Elder kref_put(&rbdc->kref, rbd_client_release); 682602adf40SYehuda Sadeh } 683602adf40SYehuda Sadeh 684a30b71b9SAlex Elder static bool rbd_image_format_valid(u32 image_format) 685a30b71b9SAlex Elder { 686a30b71b9SAlex Elder return image_format == 1 || image_format == 2; 687a30b71b9SAlex Elder } 688a30b71b9SAlex Elder 6898e94af8eSAlex Elder static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk) 6908e94af8eSAlex Elder { 691103a150fSAlex Elder size_t size; 692103a150fSAlex Elder u32 snap_count; 693103a150fSAlex Elder 694103a150fSAlex Elder /* The header has to start with the magic rbd header text */ 695103a150fSAlex Elder if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT))) 696103a150fSAlex Elder return false; 697103a150fSAlex Elder 698db2388b6SAlex Elder /* The bio layer requires at least sector-sized I/O */ 699db2388b6SAlex Elder 700db2388b6SAlex Elder if (ondisk->options.order < SECTOR_SHIFT) 701db2388b6SAlex Elder return false; 702db2388b6SAlex Elder 703db2388b6SAlex Elder /* If we use u64 in a few spots we may be able to loosen this */ 704db2388b6SAlex Elder 705db2388b6SAlex Elder if (ondisk->options.order > 8 * sizeof (int) - 1) 706db2388b6SAlex Elder return false; 707db2388b6SAlex Elder 708103a150fSAlex Elder /* 709103a150fSAlex Elder * The size of a snapshot header has to fit in a size_t, and 710103a150fSAlex Elder * that limits the number of snapshots. 
711103a150fSAlex Elder */ 712103a150fSAlex Elder snap_count = le32_to_cpu(ondisk->snap_count); 713103a150fSAlex Elder size = SIZE_MAX - sizeof (struct ceph_snap_context); 714103a150fSAlex Elder if (snap_count > size / sizeof (__le64)) 715103a150fSAlex Elder return false; 716103a150fSAlex Elder 717103a150fSAlex Elder /* 718103a150fSAlex Elder * Not only that, but the size of the entire the snapshot 719103a150fSAlex Elder * header must also be representable in a size_t. 720103a150fSAlex Elder */ 721103a150fSAlex Elder size -= snap_count * sizeof (__le64); 722103a150fSAlex Elder if ((u64) size < le64_to_cpu(ondisk->snap_names_len)) 723103a150fSAlex Elder return false; 724103a150fSAlex Elder 725103a150fSAlex Elder return true; 7268e94af8eSAlex Elder } 7278e94af8eSAlex Elder 728602adf40SYehuda Sadeh /* 729602adf40SYehuda Sadeh * Create a new header structure, translate header format from the on-disk 730602adf40SYehuda Sadeh * header. 731602adf40SYehuda Sadeh */ 732602adf40SYehuda Sadeh static int rbd_header_from_disk(struct rbd_image_header *header, 7334156d998SAlex Elder struct rbd_image_header_ondisk *ondisk) 734602adf40SYehuda Sadeh { 735ccece235SAlex Elder u32 snap_count; 73658c17b0eSAlex Elder size_t len; 737d2bb24e5SAlex Elder size_t size; 738621901d6SAlex Elder u32 i; 739602adf40SYehuda Sadeh 7406a52325fSAlex Elder memset(header, 0, sizeof (*header)); 7416a52325fSAlex Elder 742103a150fSAlex Elder snap_count = le32_to_cpu(ondisk->snap_count); 743103a150fSAlex Elder 74458c17b0eSAlex Elder len = strnlen(ondisk->object_prefix, sizeof (ondisk->object_prefix)); 74558c17b0eSAlex Elder header->object_prefix = kmalloc(len + 1, GFP_KERNEL); 7466a52325fSAlex Elder if (!header->object_prefix) 747602adf40SYehuda Sadeh return -ENOMEM; 74858c17b0eSAlex Elder memcpy(header->object_prefix, ondisk->object_prefix, len); 74958c17b0eSAlex Elder header->object_prefix[len] = '\0'; 75000f1f36fSAlex Elder 751602adf40SYehuda Sadeh if (snap_count) { 752f785cc1dSAlex Elder u64 
snap_names_len = le64_to_cpu(ondisk->snap_names_len); 753f785cc1dSAlex Elder 754621901d6SAlex Elder /* Save a copy of the snapshot names */ 755621901d6SAlex Elder 756f785cc1dSAlex Elder if (snap_names_len > (u64) SIZE_MAX) 757f785cc1dSAlex Elder return -EIO; 758f785cc1dSAlex Elder header->snap_names = kmalloc(snap_names_len, GFP_KERNEL); 759602adf40SYehuda Sadeh if (!header->snap_names) 7606a52325fSAlex Elder goto out_err; 761f785cc1dSAlex Elder /* 762f785cc1dSAlex Elder * Note that rbd_dev_v1_header_read() guarantees 763f785cc1dSAlex Elder * the ondisk buffer we're working with has 764f785cc1dSAlex Elder * snap_names_len bytes beyond the end of the 765f785cc1dSAlex Elder * snapshot id array, this memcpy() is safe. 766f785cc1dSAlex Elder */ 767f785cc1dSAlex Elder memcpy(header->snap_names, &ondisk->snaps[snap_count], 768f785cc1dSAlex Elder snap_names_len); 7696a52325fSAlex Elder 770621901d6SAlex Elder /* Record each snapshot's size */ 771621901d6SAlex Elder 772d2bb24e5SAlex Elder size = snap_count * sizeof (*header->snap_sizes); 773d2bb24e5SAlex Elder header->snap_sizes = kmalloc(size, GFP_KERNEL); 774602adf40SYehuda Sadeh if (!header->snap_sizes) 7756a52325fSAlex Elder goto out_err; 776621901d6SAlex Elder for (i = 0; i < snap_count; i++) 777621901d6SAlex Elder header->snap_sizes[i] = 778621901d6SAlex Elder le64_to_cpu(ondisk->snaps[i].image_size); 779602adf40SYehuda Sadeh } else { 780ccece235SAlex Elder WARN_ON(ondisk->snap_names_len); 781602adf40SYehuda Sadeh header->snap_names = NULL; 782602adf40SYehuda Sadeh header->snap_sizes = NULL; 783602adf40SYehuda Sadeh } 784849b4260SAlex Elder 78534b13184SAlex Elder header->features = 0; /* No features support in v1 images */ 786602adf40SYehuda Sadeh header->obj_order = ondisk->options.order; 787602adf40SYehuda Sadeh header->crypt_type = ondisk->options.crypt_type; 788602adf40SYehuda Sadeh header->comp_type = ondisk->options.comp_type; 7896a52325fSAlex Elder 790621901d6SAlex Elder /* Allocate and fill in the snapshot 
context */ 791621901d6SAlex Elder 792f84344f3SAlex Elder header->image_size = le64_to_cpu(ondisk->image_size); 7936a52325fSAlex Elder size = sizeof (struct ceph_snap_context); 7946a52325fSAlex Elder size += snap_count * sizeof (header->snapc->snaps[0]); 7956a52325fSAlex Elder header->snapc = kzalloc(size, GFP_KERNEL); 7966a52325fSAlex Elder if (!header->snapc) 7976a52325fSAlex Elder goto out_err; 798602adf40SYehuda Sadeh 799602adf40SYehuda Sadeh atomic_set(&header->snapc->nref, 1); 800505cbb9bSAlex Elder header->snapc->seq = le64_to_cpu(ondisk->snap_seq); 801602adf40SYehuda Sadeh header->snapc->num_snaps = snap_count; 802621901d6SAlex Elder for (i = 0; i < snap_count; i++) 803602adf40SYehuda Sadeh header->snapc->snaps[i] = 804602adf40SYehuda Sadeh le64_to_cpu(ondisk->snaps[i].id); 805602adf40SYehuda Sadeh 806602adf40SYehuda Sadeh return 0; 807602adf40SYehuda Sadeh 8086a52325fSAlex Elder out_err: 809849b4260SAlex Elder kfree(header->snap_sizes); 810ccece235SAlex Elder header->snap_sizes = NULL; 811602adf40SYehuda Sadeh kfree(header->snap_names); 812ccece235SAlex Elder header->snap_names = NULL; 8136a52325fSAlex Elder kfree(header->object_prefix); 8146a52325fSAlex Elder header->object_prefix = NULL; 815ccece235SAlex Elder 81600f1f36fSAlex Elder return -ENOMEM; 817602adf40SYehuda Sadeh } 818602adf40SYehuda Sadeh 8199e15b77dSAlex Elder static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id) 8209e15b77dSAlex Elder { 8219e15b77dSAlex Elder struct rbd_snap *snap; 8229e15b77dSAlex Elder 8239e15b77dSAlex Elder if (snap_id == CEPH_NOSNAP) 8249e15b77dSAlex Elder return RBD_SNAP_HEAD_NAME; 8259e15b77dSAlex Elder 8269e15b77dSAlex Elder list_for_each_entry(snap, &rbd_dev->snaps, node) 8279e15b77dSAlex Elder if (snap_id == snap->id) 8289e15b77dSAlex Elder return snap->name; 8299e15b77dSAlex Elder 8309e15b77dSAlex Elder return NULL; 8319e15b77dSAlex Elder } 8329e15b77dSAlex Elder 8338836b995SAlex Elder static int snap_by_name(struct rbd_device *rbd_dev, const 
char *snap_name) 834602adf40SYehuda Sadeh { 835602adf40SYehuda Sadeh 836e86924a8SAlex Elder struct rbd_snap *snap; 83700f1f36fSAlex Elder 838e86924a8SAlex Elder list_for_each_entry(snap, &rbd_dev->snaps, node) { 839e86924a8SAlex Elder if (!strcmp(snap_name, snap->name)) { 8400d7dbfceSAlex Elder rbd_dev->spec->snap_id = snap->id; 841e86924a8SAlex Elder rbd_dev->mapping.size = snap->size; 84234b13184SAlex Elder rbd_dev->mapping.features = snap->features; 84300f1f36fSAlex Elder 844e86924a8SAlex Elder return 0; 845602adf40SYehuda Sadeh } 84600f1f36fSAlex Elder } 847e86924a8SAlex Elder 84800f1f36fSAlex Elder return -ENOENT; 84900f1f36fSAlex Elder } 850602adf40SYehuda Sadeh 851819d52bfSAlex Elder static int rbd_dev_set_mapping(struct rbd_device *rbd_dev) 852602adf40SYehuda Sadeh { 85378dc447dSAlex Elder int ret; 854602adf40SYehuda Sadeh 8550d7dbfceSAlex Elder if (!memcmp(rbd_dev->spec->snap_name, RBD_SNAP_HEAD_NAME, 856cc9d734cSJosh Durgin sizeof (RBD_SNAP_HEAD_NAME))) { 8570d7dbfceSAlex Elder rbd_dev->spec->snap_id = CEPH_NOSNAP; 85899c1f08fSAlex Elder rbd_dev->mapping.size = rbd_dev->header.image_size; 85934b13184SAlex Elder rbd_dev->mapping.features = rbd_dev->header.features; 860e86924a8SAlex Elder ret = 0; 861602adf40SYehuda Sadeh } else { 8620d7dbfceSAlex Elder ret = snap_by_name(rbd_dev, rbd_dev->spec->snap_name); 863602adf40SYehuda Sadeh if (ret < 0) 864602adf40SYehuda Sadeh goto done; 865f84344f3SAlex Elder rbd_dev->mapping.read_only = true; 866602adf40SYehuda Sadeh } 8676d292906SAlex Elder set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags); 8686d292906SAlex Elder 869602adf40SYehuda Sadeh done: 870602adf40SYehuda Sadeh return ret; 871602adf40SYehuda Sadeh } 872602adf40SYehuda Sadeh 873602adf40SYehuda Sadeh static void rbd_header_free(struct rbd_image_header *header) 874602adf40SYehuda Sadeh { 875849b4260SAlex Elder kfree(header->object_prefix); 876d78fd7aeSAlex Elder header->object_prefix = NULL; 877602adf40SYehuda Sadeh kfree(header->snap_sizes); 878d78fd7aeSAlex 
Elder header->snap_sizes = NULL; 879849b4260SAlex Elder kfree(header->snap_names); 880d78fd7aeSAlex Elder header->snap_names = NULL; 881d1d25646SJosh Durgin ceph_put_snap_context(header->snapc); 882d78fd7aeSAlex Elder header->snapc = NULL; 883602adf40SYehuda Sadeh } 884602adf40SYehuda Sadeh 88598571b5aSAlex Elder static const char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset) 886602adf40SYehuda Sadeh { 88765ccfe21SAlex Elder char *name; 88865ccfe21SAlex Elder u64 segment; 88965ccfe21SAlex Elder int ret; 890602adf40SYehuda Sadeh 8912fd82b9eSAlex Elder name = kmalloc(MAX_OBJ_NAME_SIZE + 1, GFP_NOIO); 89265ccfe21SAlex Elder if (!name) 89365ccfe21SAlex Elder return NULL; 89465ccfe21SAlex Elder segment = offset >> rbd_dev->header.obj_order; 8952fd82b9eSAlex Elder ret = snprintf(name, MAX_OBJ_NAME_SIZE + 1, "%s.%012llx", 89665ccfe21SAlex Elder rbd_dev->header.object_prefix, segment); 8972fd82b9eSAlex Elder if (ret < 0 || ret > MAX_OBJ_NAME_SIZE) { 89865ccfe21SAlex Elder pr_err("error formatting segment name for #%llu (%d)\n", 89965ccfe21SAlex Elder segment, ret); 90065ccfe21SAlex Elder kfree(name); 90165ccfe21SAlex Elder name = NULL; 90265ccfe21SAlex Elder } 903602adf40SYehuda Sadeh 90465ccfe21SAlex Elder return name; 90565ccfe21SAlex Elder } 906602adf40SYehuda Sadeh 90765ccfe21SAlex Elder static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset) 90865ccfe21SAlex Elder { 90965ccfe21SAlex Elder u64 segment_size = (u64) 1 << rbd_dev->header.obj_order; 910602adf40SYehuda Sadeh 91165ccfe21SAlex Elder return offset & (segment_size - 1); 91265ccfe21SAlex Elder } 91365ccfe21SAlex Elder 91465ccfe21SAlex Elder static u64 rbd_segment_length(struct rbd_device *rbd_dev, 91565ccfe21SAlex Elder u64 offset, u64 length) 91665ccfe21SAlex Elder { 91765ccfe21SAlex Elder u64 segment_size = (u64) 1 << rbd_dev->header.obj_order; 91865ccfe21SAlex Elder 91965ccfe21SAlex Elder offset &= segment_size - 1; 92065ccfe21SAlex Elder 921aafb230eSAlex Elder rbd_assert(length <= 
U64_MAX - offset); 92265ccfe21SAlex Elder if (offset + length > segment_size) 92365ccfe21SAlex Elder length = segment_size - offset; 92465ccfe21SAlex Elder 92565ccfe21SAlex Elder return length; 926602adf40SYehuda Sadeh } 927602adf40SYehuda Sadeh 928602adf40SYehuda Sadeh /* 929029bcbd8SJosh Durgin * returns the size of an object in the image 930029bcbd8SJosh Durgin */ 931029bcbd8SJosh Durgin static u64 rbd_obj_bytes(struct rbd_image_header *header) 932029bcbd8SJosh Durgin { 933029bcbd8SJosh Durgin return 1 << header->obj_order; 934029bcbd8SJosh Durgin } 935029bcbd8SJosh Durgin 936029bcbd8SJosh Durgin /* 937602adf40SYehuda Sadeh * bio helpers 938602adf40SYehuda Sadeh */ 939602adf40SYehuda Sadeh 940602adf40SYehuda Sadeh static void bio_chain_put(struct bio *chain) 941602adf40SYehuda Sadeh { 942602adf40SYehuda Sadeh struct bio *tmp; 943602adf40SYehuda Sadeh 944602adf40SYehuda Sadeh while (chain) { 945602adf40SYehuda Sadeh tmp = chain; 946602adf40SYehuda Sadeh chain = chain->bi_next; 947602adf40SYehuda Sadeh bio_put(tmp); 948602adf40SYehuda Sadeh } 949602adf40SYehuda Sadeh } 950602adf40SYehuda Sadeh 951602adf40SYehuda Sadeh /* 952602adf40SYehuda Sadeh * zeros a bio chain, starting at specific offset 953602adf40SYehuda Sadeh */ 954602adf40SYehuda Sadeh static void zero_bio_chain(struct bio *chain, int start_ofs) 955602adf40SYehuda Sadeh { 956602adf40SYehuda Sadeh struct bio_vec *bv; 957602adf40SYehuda Sadeh unsigned long flags; 958602adf40SYehuda Sadeh void *buf; 959602adf40SYehuda Sadeh int i; 960602adf40SYehuda Sadeh int pos = 0; 961602adf40SYehuda Sadeh 962602adf40SYehuda Sadeh while (chain) { 963602adf40SYehuda Sadeh bio_for_each_segment(bv, chain, i) { 964602adf40SYehuda Sadeh if (pos + bv->bv_len > start_ofs) { 965602adf40SYehuda Sadeh int remainder = max(start_ofs - pos, 0); 966602adf40SYehuda Sadeh buf = bvec_kmap_irq(bv, &flags); 967602adf40SYehuda Sadeh memset(buf + remainder, 0, 968602adf40SYehuda Sadeh bv->bv_len - remainder); 96985b5aaa6SDan Carpenter 
bvec_kunmap_irq(buf, &flags); 970602adf40SYehuda Sadeh } 971602adf40SYehuda Sadeh pos += bv->bv_len; 972602adf40SYehuda Sadeh } 973602adf40SYehuda Sadeh 974602adf40SYehuda Sadeh chain = chain->bi_next; 975602adf40SYehuda Sadeh } 976602adf40SYehuda Sadeh } 977602adf40SYehuda Sadeh 978602adf40SYehuda Sadeh /* 979b9434c5bSAlex Elder * similar to zero_bio_chain(), zeros data defined by a page array, 980b9434c5bSAlex Elder * starting at the given byte offset from the start of the array and 981b9434c5bSAlex Elder * continuing up to the given end offset. The pages array is 982b9434c5bSAlex Elder * assumed to be big enough to hold all bytes up to the end. 983b9434c5bSAlex Elder */ 984b9434c5bSAlex Elder static void zero_pages(struct page **pages, u64 offset, u64 end) 985b9434c5bSAlex Elder { 986b9434c5bSAlex Elder struct page **page = &pages[offset >> PAGE_SHIFT]; 987b9434c5bSAlex Elder 988b9434c5bSAlex Elder rbd_assert(end > offset); 989b9434c5bSAlex Elder rbd_assert(end - offset <= (u64)SIZE_MAX); 990b9434c5bSAlex Elder while (offset < end) { 991b9434c5bSAlex Elder size_t page_offset; 992b9434c5bSAlex Elder size_t length; 993b9434c5bSAlex Elder unsigned long flags; 994b9434c5bSAlex Elder void *kaddr; 995b9434c5bSAlex Elder 996b9434c5bSAlex Elder page_offset = (size_t)(offset & ~PAGE_MASK); 997b9434c5bSAlex Elder length = min(PAGE_SIZE - page_offset, (size_t)(end - offset)); 998b9434c5bSAlex Elder local_irq_save(flags); 999b9434c5bSAlex Elder kaddr = kmap_atomic(*page); 1000b9434c5bSAlex Elder memset(kaddr + page_offset, 0, length); 1001b9434c5bSAlex Elder kunmap_atomic(kaddr); 1002b9434c5bSAlex Elder local_irq_restore(flags); 1003b9434c5bSAlex Elder 1004b9434c5bSAlex Elder offset += length; 1005b9434c5bSAlex Elder page++; 1006b9434c5bSAlex Elder } 1007b9434c5bSAlex Elder } 1008b9434c5bSAlex Elder 1009b9434c5bSAlex Elder /* 1010f7760dadSAlex Elder * Clone a portion of a bio, starting at the given byte offset 1011f7760dadSAlex Elder * and continuing for the number of bytes 
indicated. 1012602adf40SYehuda Sadeh */ 1013f7760dadSAlex Elder static struct bio *bio_clone_range(struct bio *bio_src, 1014f7760dadSAlex Elder unsigned int offset, 1015f7760dadSAlex Elder unsigned int len, 1016f7760dadSAlex Elder gfp_t gfpmask) 1017602adf40SYehuda Sadeh { 1018f7760dadSAlex Elder struct bio_vec *bv; 1019f7760dadSAlex Elder unsigned int resid; 1020f7760dadSAlex Elder unsigned short idx; 1021f7760dadSAlex Elder unsigned int voff; 1022f7760dadSAlex Elder unsigned short end_idx; 1023f7760dadSAlex Elder unsigned short vcnt; 1024f7760dadSAlex Elder struct bio *bio; 1025602adf40SYehuda Sadeh 1026f7760dadSAlex Elder /* Handle the easy case for the caller */ 1027f7760dadSAlex Elder 1028f7760dadSAlex Elder if (!offset && len == bio_src->bi_size) 1029f7760dadSAlex Elder return bio_clone(bio_src, gfpmask); 1030f7760dadSAlex Elder 1031f7760dadSAlex Elder if (WARN_ON_ONCE(!len)) 1032f7760dadSAlex Elder return NULL; 1033f7760dadSAlex Elder if (WARN_ON_ONCE(len > bio_src->bi_size)) 1034f7760dadSAlex Elder return NULL; 1035f7760dadSAlex Elder if (WARN_ON_ONCE(offset > bio_src->bi_size - len)) 1036f7760dadSAlex Elder return NULL; 1037f7760dadSAlex Elder 1038f7760dadSAlex Elder /* Find first affected segment... 
*/ 1039f7760dadSAlex Elder 1040f7760dadSAlex Elder resid = offset; 1041f7760dadSAlex Elder __bio_for_each_segment(bv, bio_src, idx, 0) { 1042f7760dadSAlex Elder if (resid < bv->bv_len) 1043f7760dadSAlex Elder break; 1044f7760dadSAlex Elder resid -= bv->bv_len; 1045602adf40SYehuda Sadeh } 1046f7760dadSAlex Elder voff = resid; 1047602adf40SYehuda Sadeh 1048f7760dadSAlex Elder /* ...and the last affected segment */ 1049542582fcSAlex Elder 1050f7760dadSAlex Elder resid += len; 1051f7760dadSAlex Elder __bio_for_each_segment(bv, bio_src, end_idx, idx) { 1052f7760dadSAlex Elder if (resid <= bv->bv_len) 1053f7760dadSAlex Elder break; 1054f7760dadSAlex Elder resid -= bv->bv_len; 1055f7760dadSAlex Elder } 1056f7760dadSAlex Elder vcnt = end_idx - idx + 1; 1057602adf40SYehuda Sadeh 1058f7760dadSAlex Elder /* Build the clone */ 1059f7760dadSAlex Elder 1060f7760dadSAlex Elder bio = bio_alloc(gfpmask, (unsigned int) vcnt); 1061f7760dadSAlex Elder if (!bio) 1062f7760dadSAlex Elder return NULL; /* ENOMEM */ 1063f7760dadSAlex Elder 1064f7760dadSAlex Elder bio->bi_bdev = bio_src->bi_bdev; 1065f7760dadSAlex Elder bio->bi_sector = bio_src->bi_sector + (offset >> SECTOR_SHIFT); 1066f7760dadSAlex Elder bio->bi_rw = bio_src->bi_rw; 1067f7760dadSAlex Elder bio->bi_flags |= 1 << BIO_CLONED; 1068602adf40SYehuda Sadeh 1069602adf40SYehuda Sadeh /* 1070f7760dadSAlex Elder * Copy over our part of the bio_vec, then update the first 1071f7760dadSAlex Elder * and last (or only) entries. 
1072602adf40SYehuda Sadeh */ 1073f7760dadSAlex Elder memcpy(&bio->bi_io_vec[0], &bio_src->bi_io_vec[idx], 1074f7760dadSAlex Elder vcnt * sizeof (struct bio_vec)); 1075f7760dadSAlex Elder bio->bi_io_vec[0].bv_offset += voff; 1076f7760dadSAlex Elder if (vcnt > 1) { 1077f7760dadSAlex Elder bio->bi_io_vec[0].bv_len -= voff; 1078f7760dadSAlex Elder bio->bi_io_vec[vcnt - 1].bv_len = resid; 1079602adf40SYehuda Sadeh } else { 1080f7760dadSAlex Elder bio->bi_io_vec[0].bv_len = len; 1081602adf40SYehuda Sadeh } 1082602adf40SYehuda Sadeh 1083f7760dadSAlex Elder bio->bi_vcnt = vcnt; 1084f7760dadSAlex Elder bio->bi_size = len; 1085f7760dadSAlex Elder bio->bi_idx = 0; 1086602adf40SYehuda Sadeh 1087f7760dadSAlex Elder return bio; 1088602adf40SYehuda Sadeh } 1089602adf40SYehuda Sadeh 1090f7760dadSAlex Elder /* 1091f7760dadSAlex Elder * Clone a portion of a bio chain, starting at the given byte offset 1092f7760dadSAlex Elder * into the first bio in the source chain and continuing for the 1093f7760dadSAlex Elder * number of bytes indicated. The result is another bio chain of 1094f7760dadSAlex Elder * exactly the given length, or a null pointer on error. 1095f7760dadSAlex Elder * 1096f7760dadSAlex Elder * The bio_src and offset parameters are both in-out. On entry they 1097f7760dadSAlex Elder * refer to the first source bio and the offset into that bio where 1098f7760dadSAlex Elder * the start of data to be cloned is located. 1099f7760dadSAlex Elder * 1100f7760dadSAlex Elder * On return, bio_src is updated to refer to the bio in the source 1101f7760dadSAlex Elder * chain that contains first un-cloned byte, and *offset will 1102f7760dadSAlex Elder * contain the offset of that byte within that bio. 
1103f7760dadSAlex Elder */ 1104f7760dadSAlex Elder static struct bio *bio_chain_clone_range(struct bio **bio_src, 1105f7760dadSAlex Elder unsigned int *offset, 1106f7760dadSAlex Elder unsigned int len, 1107f7760dadSAlex Elder gfp_t gfpmask) 1108f7760dadSAlex Elder { 1109f7760dadSAlex Elder struct bio *bi = *bio_src; 1110f7760dadSAlex Elder unsigned int off = *offset; 1111f7760dadSAlex Elder struct bio *chain = NULL; 1112f7760dadSAlex Elder struct bio **end; 1113602adf40SYehuda Sadeh 1114f7760dadSAlex Elder /* Build up a chain of clone bios up to the limit */ 1115602adf40SYehuda Sadeh 1116f7760dadSAlex Elder if (!bi || off >= bi->bi_size || !len) 1117f7760dadSAlex Elder return NULL; /* Nothing to clone */ 1118602adf40SYehuda Sadeh 1119f7760dadSAlex Elder end = &chain; 1120f7760dadSAlex Elder while (len) { 1121f7760dadSAlex Elder unsigned int bi_size; 1122f7760dadSAlex Elder struct bio *bio; 1123f7760dadSAlex Elder 1124f5400b7aSAlex Elder if (!bi) { 1125f5400b7aSAlex Elder rbd_warn(NULL, "bio_chain exhausted with %u left", len); 1126f7760dadSAlex Elder goto out_err; /* EINVAL; ran out of bio's */ 1127f5400b7aSAlex Elder } 1128f7760dadSAlex Elder bi_size = min_t(unsigned int, bi->bi_size - off, len); 1129f7760dadSAlex Elder bio = bio_clone_range(bi, off, bi_size, gfpmask); 1130f7760dadSAlex Elder if (!bio) 1131f7760dadSAlex Elder goto out_err; /* ENOMEM */ 1132f7760dadSAlex Elder 1133f7760dadSAlex Elder *end = bio; 1134f7760dadSAlex Elder end = &bio->bi_next; 1135f7760dadSAlex Elder 1136f7760dadSAlex Elder off += bi_size; 1137f7760dadSAlex Elder if (off == bi->bi_size) { 1138f7760dadSAlex Elder bi = bi->bi_next; 1139f7760dadSAlex Elder off = 0; 1140f7760dadSAlex Elder } 1141f7760dadSAlex Elder len -= bi_size; 1142f7760dadSAlex Elder } 1143f7760dadSAlex Elder *bio_src = bi; 1144f7760dadSAlex Elder *offset = off; 1145f7760dadSAlex Elder 1146f7760dadSAlex Elder return chain; 1147f7760dadSAlex Elder out_err: 1148f7760dadSAlex Elder bio_chain_put(chain); 1149f7760dadSAlex 
Elder 1150602adf40SYehuda Sadeh return NULL; 1151602adf40SYehuda Sadeh } 1152602adf40SYehuda Sadeh 1153926f9b3fSAlex Elder /* 1154926f9b3fSAlex Elder * The default/initial value for all object request flags is 0. For 1155926f9b3fSAlex Elder * each flag, once its value is set to 1 it is never reset to 0 1156926f9b3fSAlex Elder * again. 1157926f9b3fSAlex Elder */ 11586365d33aSAlex Elder static void obj_request_img_data_set(struct rbd_obj_request *obj_request) 11596365d33aSAlex Elder { 11606365d33aSAlex Elder if (test_and_set_bit(OBJ_REQ_IMG_DATA, &obj_request->flags)) { 11616365d33aSAlex Elder struct rbd_device *rbd_dev; 11626365d33aSAlex Elder 116357acbaa7SAlex Elder rbd_dev = obj_request->img_request->rbd_dev; 11646365d33aSAlex Elder rbd_warn(rbd_dev, "obj_request %p already marked img_data\n", 11656365d33aSAlex Elder obj_request); 11666365d33aSAlex Elder } 11676365d33aSAlex Elder } 11686365d33aSAlex Elder 11696365d33aSAlex Elder static bool obj_request_img_data_test(struct rbd_obj_request *obj_request) 11706365d33aSAlex Elder { 11716365d33aSAlex Elder smp_mb(); 11726365d33aSAlex Elder return test_bit(OBJ_REQ_IMG_DATA, &obj_request->flags) != 0; 11736365d33aSAlex Elder } 11746365d33aSAlex Elder 117557acbaa7SAlex Elder static void obj_request_done_set(struct rbd_obj_request *obj_request) 117657acbaa7SAlex Elder { 117757acbaa7SAlex Elder if (test_and_set_bit(OBJ_REQ_DONE, &obj_request->flags)) { 117857acbaa7SAlex Elder struct rbd_device *rbd_dev = NULL; 117957acbaa7SAlex Elder 118057acbaa7SAlex Elder if (obj_request_img_data_test(obj_request)) 118157acbaa7SAlex Elder rbd_dev = obj_request->img_request->rbd_dev; 118257acbaa7SAlex Elder rbd_warn(rbd_dev, "obj_request %p already marked done\n", 118357acbaa7SAlex Elder obj_request); 118457acbaa7SAlex Elder } 118557acbaa7SAlex Elder } 118657acbaa7SAlex Elder 118757acbaa7SAlex Elder static bool obj_request_done_test(struct rbd_obj_request *obj_request) 118857acbaa7SAlex Elder { 118957acbaa7SAlex Elder smp_mb(); 
119057acbaa7SAlex Elder return test_bit(OBJ_REQ_DONE, &obj_request->flags) != 0; 119157acbaa7SAlex Elder } 119257acbaa7SAlex Elder 11935679c59fSAlex Elder /* 11945679c59fSAlex Elder * This sets the KNOWN flag after (possibly) setting the EXISTS 11955679c59fSAlex Elder * flag. The latter is set based on the "exists" value provided. 11965679c59fSAlex Elder * 11975679c59fSAlex Elder * Note that for our purposes once an object exists it never goes 11985679c59fSAlex Elder * away again. It's possible that the response from two existence 11995679c59fSAlex Elder * checks are separated by the creation of the target object, and 12005679c59fSAlex Elder * the first ("doesn't exist") response arrives *after* the second 12015679c59fSAlex Elder * ("does exist"). In that case we ignore the second one. 12025679c59fSAlex Elder */ 12035679c59fSAlex Elder static void obj_request_existence_set(struct rbd_obj_request *obj_request, 12045679c59fSAlex Elder bool exists) 12055679c59fSAlex Elder { 12065679c59fSAlex Elder if (exists) 12075679c59fSAlex Elder set_bit(OBJ_REQ_EXISTS, &obj_request->flags); 12085679c59fSAlex Elder set_bit(OBJ_REQ_KNOWN, &obj_request->flags); 12095679c59fSAlex Elder smp_mb(); 12105679c59fSAlex Elder } 12115679c59fSAlex Elder 12125679c59fSAlex Elder static bool obj_request_known_test(struct rbd_obj_request *obj_request) 12135679c59fSAlex Elder { 12145679c59fSAlex Elder smp_mb(); 12155679c59fSAlex Elder return test_bit(OBJ_REQ_KNOWN, &obj_request->flags) != 0; 12165679c59fSAlex Elder } 12175679c59fSAlex Elder 12185679c59fSAlex Elder static bool obj_request_exists_test(struct rbd_obj_request *obj_request) 12195679c59fSAlex Elder { 12205679c59fSAlex Elder smp_mb(); 12215679c59fSAlex Elder return test_bit(OBJ_REQ_EXISTS, &obj_request->flags) != 0; 12225679c59fSAlex Elder } 12235679c59fSAlex Elder 1224bf0d5f50SAlex Elder static void rbd_obj_request_get(struct rbd_obj_request *obj_request) 1225bf0d5f50SAlex Elder { 122637206ee5SAlex Elder dout("%s: obj %p (was %d)\n", 
__func__, obj_request, 122737206ee5SAlex Elder atomic_read(&obj_request->kref.refcount)); 1228bf0d5f50SAlex Elder kref_get(&obj_request->kref); 1229bf0d5f50SAlex Elder } 1230bf0d5f50SAlex Elder 1231bf0d5f50SAlex Elder static void rbd_obj_request_destroy(struct kref *kref); 1232bf0d5f50SAlex Elder static void rbd_obj_request_put(struct rbd_obj_request *obj_request) 1233bf0d5f50SAlex Elder { 1234bf0d5f50SAlex Elder rbd_assert(obj_request != NULL); 123537206ee5SAlex Elder dout("%s: obj %p (was %d)\n", __func__, obj_request, 123637206ee5SAlex Elder atomic_read(&obj_request->kref.refcount)); 1237bf0d5f50SAlex Elder kref_put(&obj_request->kref, rbd_obj_request_destroy); 1238bf0d5f50SAlex Elder } 1239bf0d5f50SAlex Elder 1240bf0d5f50SAlex Elder static void rbd_img_request_get(struct rbd_img_request *img_request) 1241bf0d5f50SAlex Elder { 124237206ee5SAlex Elder dout("%s: img %p (was %d)\n", __func__, img_request, 124337206ee5SAlex Elder atomic_read(&img_request->kref.refcount)); 1244bf0d5f50SAlex Elder kref_get(&img_request->kref); 1245bf0d5f50SAlex Elder } 1246bf0d5f50SAlex Elder 1247bf0d5f50SAlex Elder static void rbd_img_request_destroy(struct kref *kref); 1248bf0d5f50SAlex Elder static void rbd_img_request_put(struct rbd_img_request *img_request) 1249bf0d5f50SAlex Elder { 1250bf0d5f50SAlex Elder rbd_assert(img_request != NULL); 125137206ee5SAlex Elder dout("%s: img %p (was %d)\n", __func__, img_request, 125237206ee5SAlex Elder atomic_read(&img_request->kref.refcount)); 1253bf0d5f50SAlex Elder kref_put(&img_request->kref, rbd_img_request_destroy); 1254bf0d5f50SAlex Elder } 1255bf0d5f50SAlex Elder 1256bf0d5f50SAlex Elder static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request, 1257bf0d5f50SAlex Elder struct rbd_obj_request *obj_request) 1258bf0d5f50SAlex Elder { 125925dcf954SAlex Elder rbd_assert(obj_request->img_request == NULL); 126025dcf954SAlex Elder 1261b155e86cSAlex Elder /* Image request now owns object's original reference */ 
1262bf0d5f50SAlex Elder obj_request->img_request = img_request; 126325dcf954SAlex Elder obj_request->which = img_request->obj_request_count; 12646365d33aSAlex Elder rbd_assert(!obj_request_img_data_test(obj_request)); 12656365d33aSAlex Elder obj_request_img_data_set(obj_request); 1266bf0d5f50SAlex Elder rbd_assert(obj_request->which != BAD_WHICH); 126725dcf954SAlex Elder img_request->obj_request_count++; 126825dcf954SAlex Elder list_add_tail(&obj_request->links, &img_request->obj_requests); 126937206ee5SAlex Elder dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request, 127037206ee5SAlex Elder obj_request->which); 1271bf0d5f50SAlex Elder } 1272bf0d5f50SAlex Elder 1273bf0d5f50SAlex Elder static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request, 1274bf0d5f50SAlex Elder struct rbd_obj_request *obj_request) 1275bf0d5f50SAlex Elder { 1276bf0d5f50SAlex Elder rbd_assert(obj_request->which != BAD_WHICH); 127725dcf954SAlex Elder 127837206ee5SAlex Elder dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request, 127937206ee5SAlex Elder obj_request->which); 1280bf0d5f50SAlex Elder list_del(&obj_request->links); 128125dcf954SAlex Elder rbd_assert(img_request->obj_request_count > 0); 128225dcf954SAlex Elder img_request->obj_request_count--; 128325dcf954SAlex Elder rbd_assert(obj_request->which == img_request->obj_request_count); 128425dcf954SAlex Elder obj_request->which = BAD_WHICH; 12856365d33aSAlex Elder rbd_assert(obj_request_img_data_test(obj_request)); 1286bf0d5f50SAlex Elder rbd_assert(obj_request->img_request == img_request); 1287bf0d5f50SAlex Elder obj_request->img_request = NULL; 128825dcf954SAlex Elder obj_request->callback = NULL; 1289bf0d5f50SAlex Elder rbd_obj_request_put(obj_request); 1290bf0d5f50SAlex Elder } 1291bf0d5f50SAlex Elder 1292bf0d5f50SAlex Elder static bool obj_request_type_valid(enum obj_request_type type) 1293bf0d5f50SAlex Elder { 1294bf0d5f50SAlex Elder switch (type) { 12959969ebc5SAlex Elder case 
OBJ_REQUEST_NODATA: 1296bf0d5f50SAlex Elder case OBJ_REQUEST_BIO: 1297788e2df3SAlex Elder case OBJ_REQUEST_PAGES: 1298bf0d5f50SAlex Elder return true; 1299bf0d5f50SAlex Elder default: 1300bf0d5f50SAlex Elder return false; 1301bf0d5f50SAlex Elder } 1302bf0d5f50SAlex Elder } 1303bf0d5f50SAlex Elder 1304bf0d5f50SAlex Elder static int rbd_obj_request_submit(struct ceph_osd_client *osdc, 1305bf0d5f50SAlex Elder struct rbd_obj_request *obj_request) 1306bf0d5f50SAlex Elder { 130737206ee5SAlex Elder dout("%s: osdc %p obj %p\n", __func__, osdc, obj_request); 130837206ee5SAlex Elder 1309bf0d5f50SAlex Elder return ceph_osdc_start_request(osdc, obj_request->osd_req, false); 1310bf0d5f50SAlex Elder } 1311bf0d5f50SAlex Elder 1312bf0d5f50SAlex Elder static void rbd_img_request_complete(struct rbd_img_request *img_request) 1313bf0d5f50SAlex Elder { 131455f27e09SAlex Elder 131537206ee5SAlex Elder dout("%s: img %p\n", __func__, img_request); 131655f27e09SAlex Elder 131755f27e09SAlex Elder /* 131855f27e09SAlex Elder * If no error occurred, compute the aggregate transfer 131955f27e09SAlex Elder * count for the image request. We could instead use 132055f27e09SAlex Elder * atomic64_cmpxchg() to update it as each object request 132155f27e09SAlex Elder * completes; not clear which way is better off hand. 
132255f27e09SAlex Elder */ 132355f27e09SAlex Elder if (!img_request->result) { 132455f27e09SAlex Elder struct rbd_obj_request *obj_request; 132555f27e09SAlex Elder u64 xferred = 0; 132655f27e09SAlex Elder 132755f27e09SAlex Elder for_each_obj_request(img_request, obj_request) 132855f27e09SAlex Elder xferred += obj_request->xferred; 132955f27e09SAlex Elder img_request->xferred = xferred; 133055f27e09SAlex Elder } 133155f27e09SAlex Elder 1332bf0d5f50SAlex Elder if (img_request->callback) 1333bf0d5f50SAlex Elder img_request->callback(img_request); 1334bf0d5f50SAlex Elder else 1335bf0d5f50SAlex Elder rbd_img_request_put(img_request); 1336bf0d5f50SAlex Elder } 1337bf0d5f50SAlex Elder 1338788e2df3SAlex Elder /* Caller is responsible for rbd_obj_request_destroy(obj_request) */ 1339788e2df3SAlex Elder 1340788e2df3SAlex Elder static int rbd_obj_request_wait(struct rbd_obj_request *obj_request) 1341788e2df3SAlex Elder { 134237206ee5SAlex Elder dout("%s: obj %p\n", __func__, obj_request); 134337206ee5SAlex Elder 1344788e2df3SAlex Elder return wait_for_completion_interruptible(&obj_request->completion); 1345788e2df3SAlex Elder } 1346788e2df3SAlex Elder 13470c425248SAlex Elder /* 13480c425248SAlex Elder * The default/initial value for all image request flags is 0. Each 13490c425248SAlex Elder * is conditionally set to 1 at image request initialization time 13500c425248SAlex Elder * and currently never change thereafter. 
13510c425248SAlex Elder */ 13520c425248SAlex Elder static void img_request_write_set(struct rbd_img_request *img_request) 13530c425248SAlex Elder { 13540c425248SAlex Elder set_bit(IMG_REQ_WRITE, &img_request->flags); 13550c425248SAlex Elder smp_mb(); 13560c425248SAlex Elder } 13570c425248SAlex Elder 13580c425248SAlex Elder static bool img_request_write_test(struct rbd_img_request *img_request) 13590c425248SAlex Elder { 13600c425248SAlex Elder smp_mb(); 13610c425248SAlex Elder return test_bit(IMG_REQ_WRITE, &img_request->flags) != 0; 13620c425248SAlex Elder } 13630c425248SAlex Elder 13649849e986SAlex Elder static void img_request_child_set(struct rbd_img_request *img_request) 13659849e986SAlex Elder { 13669849e986SAlex Elder set_bit(IMG_REQ_CHILD, &img_request->flags); 13679849e986SAlex Elder smp_mb(); 13689849e986SAlex Elder } 13699849e986SAlex Elder 13709849e986SAlex Elder static bool img_request_child_test(struct rbd_img_request *img_request) 13719849e986SAlex Elder { 13729849e986SAlex Elder smp_mb(); 13739849e986SAlex Elder return test_bit(IMG_REQ_CHILD, &img_request->flags) != 0; 13749849e986SAlex Elder } 13759849e986SAlex Elder 1376d0b2e944SAlex Elder static void img_request_layered_set(struct rbd_img_request *img_request) 1377d0b2e944SAlex Elder { 1378d0b2e944SAlex Elder set_bit(IMG_REQ_LAYERED, &img_request->flags); 1379d0b2e944SAlex Elder smp_mb(); 1380d0b2e944SAlex Elder } 1381d0b2e944SAlex Elder 1382d0b2e944SAlex Elder static bool img_request_layered_test(struct rbd_img_request *img_request) 1383d0b2e944SAlex Elder { 1384d0b2e944SAlex Elder smp_mb(); 1385d0b2e944SAlex Elder return test_bit(IMG_REQ_LAYERED, &img_request->flags) != 0; 1386d0b2e944SAlex Elder } 1387d0b2e944SAlex Elder 13886e2a4505SAlex Elder static void 13896e2a4505SAlex Elder rbd_img_obj_request_read_callback(struct rbd_obj_request *obj_request) 13906e2a4505SAlex Elder { 1391b9434c5bSAlex Elder u64 xferred = obj_request->xferred; 1392b9434c5bSAlex Elder u64 length = obj_request->length; 
1393b9434c5bSAlex Elder 13946e2a4505SAlex Elder dout("%s: obj %p img %p result %d %llu/%llu\n", __func__, 13956e2a4505SAlex Elder obj_request, obj_request->img_request, obj_request->result, 1396b9434c5bSAlex Elder xferred, length); 13976e2a4505SAlex Elder /* 13986e2a4505SAlex Elder * ENOENT means a hole in the image. We zero-fill the 13996e2a4505SAlex Elder * entire length of the request. A short read also implies 14006e2a4505SAlex Elder * zero-fill to the end of the request. Either way we 14016e2a4505SAlex Elder * update the xferred count to indicate the whole request 14026e2a4505SAlex Elder * was satisfied. 14036e2a4505SAlex Elder */ 1404b9434c5bSAlex Elder rbd_assert(obj_request->type != OBJ_REQUEST_NODATA); 14056e2a4505SAlex Elder if (obj_request->result == -ENOENT) { 1406b9434c5bSAlex Elder if (obj_request->type == OBJ_REQUEST_BIO) 14076e2a4505SAlex Elder zero_bio_chain(obj_request->bio_list, 0); 1408b9434c5bSAlex Elder else 1409b9434c5bSAlex Elder zero_pages(obj_request->pages, 0, length); 14106e2a4505SAlex Elder obj_request->result = 0; 1411b9434c5bSAlex Elder obj_request->xferred = length; 1412b9434c5bSAlex Elder } else if (xferred < length && !obj_request->result) { 1413b9434c5bSAlex Elder if (obj_request->type == OBJ_REQUEST_BIO) 1414b9434c5bSAlex Elder zero_bio_chain(obj_request->bio_list, xferred); 1415b9434c5bSAlex Elder else 1416b9434c5bSAlex Elder zero_pages(obj_request->pages, xferred, length); 1417b9434c5bSAlex Elder obj_request->xferred = length; 14186e2a4505SAlex Elder } 14196e2a4505SAlex Elder obj_request_done_set(obj_request); 14206e2a4505SAlex Elder } 14216e2a4505SAlex Elder 1422bf0d5f50SAlex Elder static void rbd_obj_request_complete(struct rbd_obj_request *obj_request) 1423bf0d5f50SAlex Elder { 142437206ee5SAlex Elder dout("%s: obj %p cb %p\n", __func__, obj_request, 142537206ee5SAlex Elder obj_request->callback); 1426bf0d5f50SAlex Elder if (obj_request->callback) 1427bf0d5f50SAlex Elder obj_request->callback(obj_request); 1428788e2df3SAlex 
Elder else 1429788e2df3SAlex Elder complete_all(&obj_request->completion); 1430bf0d5f50SAlex Elder } 1431bf0d5f50SAlex Elder 1432c47f9371SAlex Elder static void rbd_osd_trivial_callback(struct rbd_obj_request *obj_request) 143339bf2c5dSAlex Elder { 143439bf2c5dSAlex Elder dout("%s: obj %p\n", __func__, obj_request); 143539bf2c5dSAlex Elder obj_request_done_set(obj_request); 143639bf2c5dSAlex Elder } 143739bf2c5dSAlex Elder 1438c47f9371SAlex Elder static void rbd_osd_read_callback(struct rbd_obj_request *obj_request) 1439bf0d5f50SAlex Elder { 144057acbaa7SAlex Elder struct rbd_img_request *img_request = NULL; 1441a9e8ba2cSAlex Elder struct rbd_device *rbd_dev = NULL; 144257acbaa7SAlex Elder bool layered = false; 144357acbaa7SAlex Elder 144457acbaa7SAlex Elder if (obj_request_img_data_test(obj_request)) { 144557acbaa7SAlex Elder img_request = obj_request->img_request; 144657acbaa7SAlex Elder layered = img_request && img_request_layered_test(img_request); 1447a9e8ba2cSAlex Elder rbd_dev = img_request->rbd_dev; 144857acbaa7SAlex Elder } 14498b3e1a56SAlex Elder 14508b3e1a56SAlex Elder dout("%s: obj %p img %p result %d %llu/%llu\n", __func__, 14518b3e1a56SAlex Elder obj_request, img_request, obj_request->result, 14528b3e1a56SAlex Elder obj_request->xferred, obj_request->length); 1453a9e8ba2cSAlex Elder if (layered && obj_request->result == -ENOENT && 1454a9e8ba2cSAlex Elder obj_request->img_offset < rbd_dev->parent_overlap) 14558b3e1a56SAlex Elder rbd_img_parent_read(obj_request); 14568b3e1a56SAlex Elder else if (img_request) 14576e2a4505SAlex Elder rbd_img_obj_request_read_callback(obj_request); 14586e2a4505SAlex Elder else 145907741308SAlex Elder obj_request_done_set(obj_request); 1460bf0d5f50SAlex Elder } 1461bf0d5f50SAlex Elder 1462c47f9371SAlex Elder static void rbd_osd_write_callback(struct rbd_obj_request *obj_request) 1463bf0d5f50SAlex Elder { 14641b83bef2SSage Weil dout("%s: obj %p result %d %llu\n", __func__, obj_request, 14651b83bef2SSage Weil 
obj_request->result, obj_request->length); 14661b83bef2SSage Weil /* 14678b3e1a56SAlex Elder * There is no such thing as a successful short write. Set 14688b3e1a56SAlex Elder * it to our originally-requested length. 14691b83bef2SSage Weil */ 14701b83bef2SSage Weil obj_request->xferred = obj_request->length; 147107741308SAlex Elder obj_request_done_set(obj_request); 1472bf0d5f50SAlex Elder } 1473bf0d5f50SAlex Elder 1474fbfab539SAlex Elder /* 1475fbfab539SAlex Elder * For a simple stat call there's nothing to do. We'll do more if 1476fbfab539SAlex Elder * this is part of a write sequence for a layered image. 1477fbfab539SAlex Elder */ 1478c47f9371SAlex Elder static void rbd_osd_stat_callback(struct rbd_obj_request *obj_request) 1479fbfab539SAlex Elder { 148037206ee5SAlex Elder dout("%s: obj %p\n", __func__, obj_request); 1481fbfab539SAlex Elder obj_request_done_set(obj_request); 1482fbfab539SAlex Elder } 1483fbfab539SAlex Elder 1484bf0d5f50SAlex Elder static void rbd_osd_req_callback(struct ceph_osd_request *osd_req, 1485bf0d5f50SAlex Elder struct ceph_msg *msg) 1486bf0d5f50SAlex Elder { 1487bf0d5f50SAlex Elder struct rbd_obj_request *obj_request = osd_req->r_priv; 1488bf0d5f50SAlex Elder u16 opcode; 1489bf0d5f50SAlex Elder 149037206ee5SAlex Elder dout("%s: osd_req %p msg %p\n", __func__, osd_req, msg); 1491bf0d5f50SAlex Elder rbd_assert(osd_req == obj_request->osd_req); 149257acbaa7SAlex Elder if (obj_request_img_data_test(obj_request)) { 149357acbaa7SAlex Elder rbd_assert(obj_request->img_request); 149457acbaa7SAlex Elder rbd_assert(obj_request->which != BAD_WHICH); 149557acbaa7SAlex Elder } else { 149657acbaa7SAlex Elder rbd_assert(obj_request->which == BAD_WHICH); 149757acbaa7SAlex Elder } 1498bf0d5f50SAlex Elder 14991b83bef2SSage Weil if (osd_req->r_result < 0) 15001b83bef2SSage Weil obj_request->result = osd_req->r_result; 1501bf0d5f50SAlex Elder obj_request->version = le64_to_cpu(osd_req->r_reassert_version.version); 1502bf0d5f50SAlex Elder 15030eefd470SAlex 
Elder BUG_ON(osd_req->r_num_ops > 2); 1504bf0d5f50SAlex Elder 1505c47f9371SAlex Elder /* 1506c47f9371SAlex Elder * We support a 64-bit length, but ultimately it has to be 1507c47f9371SAlex Elder * passed to blk_end_request(), which takes an unsigned int. 1508c47f9371SAlex Elder */ 15091b83bef2SSage Weil obj_request->xferred = osd_req->r_reply_op_len[0]; 1510c47f9371SAlex Elder rbd_assert(obj_request->xferred < (u64)UINT_MAX); 151179528734SAlex Elder opcode = osd_req->r_ops[0].op; 1512bf0d5f50SAlex Elder switch (opcode) { 1513bf0d5f50SAlex Elder case CEPH_OSD_OP_READ: 1514c47f9371SAlex Elder rbd_osd_read_callback(obj_request); 1515bf0d5f50SAlex Elder break; 1516bf0d5f50SAlex Elder case CEPH_OSD_OP_WRITE: 1517c47f9371SAlex Elder rbd_osd_write_callback(obj_request); 1518bf0d5f50SAlex Elder break; 1519fbfab539SAlex Elder case CEPH_OSD_OP_STAT: 1520c47f9371SAlex Elder rbd_osd_stat_callback(obj_request); 1521fbfab539SAlex Elder break; 152236be9a76SAlex Elder case CEPH_OSD_OP_CALL: 1523b8d70035SAlex Elder case CEPH_OSD_OP_NOTIFY_ACK: 15249969ebc5SAlex Elder case CEPH_OSD_OP_WATCH: 1525c47f9371SAlex Elder rbd_osd_trivial_callback(obj_request); 15269969ebc5SAlex Elder break; 1527bf0d5f50SAlex Elder default: 1528bf0d5f50SAlex Elder rbd_warn(NULL, "%s: unsupported op %hu\n", 1529bf0d5f50SAlex Elder obj_request->object_name, (unsigned short) opcode); 1530bf0d5f50SAlex Elder break; 1531bf0d5f50SAlex Elder } 1532bf0d5f50SAlex Elder 153307741308SAlex Elder if (obj_request_done_test(obj_request)) 1534bf0d5f50SAlex Elder rbd_obj_request_complete(obj_request); 1535bf0d5f50SAlex Elder } 1536bf0d5f50SAlex Elder 15379d4df01fSAlex Elder static void rbd_osd_req_format_read(struct rbd_obj_request *obj_request) 1538430c28c3SAlex Elder { 1539430c28c3SAlex Elder struct rbd_img_request *img_request = obj_request->img_request; 15408c042b0dSAlex Elder struct ceph_osd_request *osd_req = obj_request->osd_req; 15419d4df01fSAlex Elder u64 snap_id; 1542430c28c3SAlex Elder 15438c042b0dSAlex Elder 
rbd_assert(osd_req != NULL); 1544430c28c3SAlex Elder 15459d4df01fSAlex Elder snap_id = img_request ? img_request->snap_id : CEPH_NOSNAP; 15468c042b0dSAlex Elder ceph_osdc_build_request(osd_req, obj_request->offset, 15479d4df01fSAlex Elder NULL, snap_id, NULL); 15489d4df01fSAlex Elder } 15499d4df01fSAlex Elder 15509d4df01fSAlex Elder static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request) 15519d4df01fSAlex Elder { 15529d4df01fSAlex Elder struct rbd_img_request *img_request = obj_request->img_request; 15539d4df01fSAlex Elder struct ceph_osd_request *osd_req = obj_request->osd_req; 15549d4df01fSAlex Elder struct ceph_snap_context *snapc; 15559d4df01fSAlex Elder struct timespec mtime = CURRENT_TIME; 15569d4df01fSAlex Elder 15579d4df01fSAlex Elder rbd_assert(osd_req != NULL); 15589d4df01fSAlex Elder 15599d4df01fSAlex Elder snapc = img_request ? img_request->snapc : NULL; 15609d4df01fSAlex Elder ceph_osdc_build_request(osd_req, obj_request->offset, 15619d4df01fSAlex Elder snapc, CEPH_NOSNAP, &mtime); 1562430c28c3SAlex Elder } 1563430c28c3SAlex Elder 1564bf0d5f50SAlex Elder static struct ceph_osd_request *rbd_osd_req_create( 1565bf0d5f50SAlex Elder struct rbd_device *rbd_dev, 1566bf0d5f50SAlex Elder bool write_request, 1567430c28c3SAlex Elder struct rbd_obj_request *obj_request) 1568bf0d5f50SAlex Elder { 1569bf0d5f50SAlex Elder struct ceph_snap_context *snapc = NULL; 1570bf0d5f50SAlex Elder struct ceph_osd_client *osdc; 1571bf0d5f50SAlex Elder struct ceph_osd_request *osd_req; 1572bf0d5f50SAlex Elder 15736365d33aSAlex Elder if (obj_request_img_data_test(obj_request)) { 15746365d33aSAlex Elder struct rbd_img_request *img_request = obj_request->img_request; 15756365d33aSAlex Elder 15760c425248SAlex Elder rbd_assert(write_request == 15770c425248SAlex Elder img_request_write_test(img_request)); 15780c425248SAlex Elder if (write_request) 1579bf0d5f50SAlex Elder snapc = img_request->snapc; 1580bf0d5f50SAlex Elder } 1581bf0d5f50SAlex Elder 1582bf0d5f50SAlex 
Elder /* Allocate and initialize the request, for the single op */ 1583bf0d5f50SAlex Elder 1584bf0d5f50SAlex Elder osdc = &rbd_dev->rbd_client->client->osdc; 1585bf0d5f50SAlex Elder osd_req = ceph_osdc_alloc_request(osdc, snapc, 1, false, GFP_ATOMIC); 1586bf0d5f50SAlex Elder if (!osd_req) 1587bf0d5f50SAlex Elder return NULL; /* ENOMEM */ 1588bf0d5f50SAlex Elder 1589430c28c3SAlex Elder if (write_request) 1590bf0d5f50SAlex Elder osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK; 1591430c28c3SAlex Elder else 1592bf0d5f50SAlex Elder osd_req->r_flags = CEPH_OSD_FLAG_READ; 1593bf0d5f50SAlex Elder 1594bf0d5f50SAlex Elder osd_req->r_callback = rbd_osd_req_callback; 1595bf0d5f50SAlex Elder osd_req->r_priv = obj_request; 1596bf0d5f50SAlex Elder 1597bf0d5f50SAlex Elder osd_req->r_oid_len = strlen(obj_request->object_name); 1598bf0d5f50SAlex Elder rbd_assert(osd_req->r_oid_len < sizeof (osd_req->r_oid)); 1599bf0d5f50SAlex Elder memcpy(osd_req->r_oid, obj_request->object_name, osd_req->r_oid_len); 1600bf0d5f50SAlex Elder 1601bf0d5f50SAlex Elder osd_req->r_file_layout = rbd_dev->layout; /* struct */ 1602bf0d5f50SAlex Elder 1603bf0d5f50SAlex Elder return osd_req; 1604bf0d5f50SAlex Elder } 1605bf0d5f50SAlex Elder 16060eefd470SAlex Elder /* 16070eefd470SAlex Elder * Create a copyup osd request based on the information in the 16080eefd470SAlex Elder * object request supplied. A copyup request has two osd ops, 16090eefd470SAlex Elder * a copyup method call, and a "normal" write request. 
16100eefd470SAlex Elder */ 16110eefd470SAlex Elder static struct ceph_osd_request * 16120eefd470SAlex Elder rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request) 16130eefd470SAlex Elder { 16140eefd470SAlex Elder struct rbd_img_request *img_request; 16150eefd470SAlex Elder struct ceph_snap_context *snapc; 16160eefd470SAlex Elder struct rbd_device *rbd_dev; 16170eefd470SAlex Elder struct ceph_osd_client *osdc; 16180eefd470SAlex Elder struct ceph_osd_request *osd_req; 16190eefd470SAlex Elder 16200eefd470SAlex Elder rbd_assert(obj_request_img_data_test(obj_request)); 16210eefd470SAlex Elder img_request = obj_request->img_request; 16220eefd470SAlex Elder rbd_assert(img_request); 16230eefd470SAlex Elder rbd_assert(img_request_write_test(img_request)); 16240eefd470SAlex Elder 16250eefd470SAlex Elder /* Allocate and initialize the request, for the two ops */ 16260eefd470SAlex Elder 16270eefd470SAlex Elder snapc = img_request->snapc; 16280eefd470SAlex Elder rbd_dev = img_request->rbd_dev; 16290eefd470SAlex Elder osdc = &rbd_dev->rbd_client->client->osdc; 16300eefd470SAlex Elder osd_req = ceph_osdc_alloc_request(osdc, snapc, 2, false, GFP_ATOMIC); 16310eefd470SAlex Elder if (!osd_req) 16320eefd470SAlex Elder return NULL; /* ENOMEM */ 16330eefd470SAlex Elder 16340eefd470SAlex Elder osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK; 16350eefd470SAlex Elder osd_req->r_callback = rbd_osd_req_callback; 16360eefd470SAlex Elder osd_req->r_priv = obj_request; 16370eefd470SAlex Elder 16380eefd470SAlex Elder osd_req->r_oid_len = strlen(obj_request->object_name); 16390eefd470SAlex Elder rbd_assert(osd_req->r_oid_len < sizeof (osd_req->r_oid)); 16400eefd470SAlex Elder memcpy(osd_req->r_oid, obj_request->object_name, osd_req->r_oid_len); 16410eefd470SAlex Elder 16420eefd470SAlex Elder osd_req->r_file_layout = rbd_dev->layout; /* struct */ 16430eefd470SAlex Elder 16440eefd470SAlex Elder return osd_req; 16450eefd470SAlex Elder } 16460eefd470SAlex Elder 
16470eefd470SAlex Elder 1648bf0d5f50SAlex Elder static void rbd_osd_req_destroy(struct ceph_osd_request *osd_req) 1649bf0d5f50SAlex Elder { 1650bf0d5f50SAlex Elder ceph_osdc_put_request(osd_req); 1651bf0d5f50SAlex Elder } 1652bf0d5f50SAlex Elder 1653bf0d5f50SAlex Elder /* object_name is assumed to be a non-null pointer and NUL-terminated */ 1654bf0d5f50SAlex Elder 1655bf0d5f50SAlex Elder static struct rbd_obj_request *rbd_obj_request_create(const char *object_name, 1656bf0d5f50SAlex Elder u64 offset, u64 length, 1657bf0d5f50SAlex Elder enum obj_request_type type) 1658bf0d5f50SAlex Elder { 1659bf0d5f50SAlex Elder struct rbd_obj_request *obj_request; 1660bf0d5f50SAlex Elder size_t size; 1661bf0d5f50SAlex Elder char *name; 1662bf0d5f50SAlex Elder 1663bf0d5f50SAlex Elder rbd_assert(obj_request_type_valid(type)); 1664bf0d5f50SAlex Elder 1665bf0d5f50SAlex Elder size = strlen(object_name) + 1; 1666bf0d5f50SAlex Elder obj_request = kzalloc(sizeof (*obj_request) + size, GFP_KERNEL); 1667bf0d5f50SAlex Elder if (!obj_request) 1668bf0d5f50SAlex Elder return NULL; 1669bf0d5f50SAlex Elder 1670bf0d5f50SAlex Elder name = (char *)(obj_request + 1); 1671bf0d5f50SAlex Elder obj_request->object_name = memcpy(name, object_name, size); 1672bf0d5f50SAlex Elder obj_request->offset = offset; 1673bf0d5f50SAlex Elder obj_request->length = length; 1674926f9b3fSAlex Elder obj_request->flags = 0; 1675bf0d5f50SAlex Elder obj_request->which = BAD_WHICH; 1676bf0d5f50SAlex Elder obj_request->type = type; 1677bf0d5f50SAlex Elder INIT_LIST_HEAD(&obj_request->links); 1678788e2df3SAlex Elder init_completion(&obj_request->completion); 1679bf0d5f50SAlex Elder kref_init(&obj_request->kref); 1680bf0d5f50SAlex Elder 168137206ee5SAlex Elder dout("%s: \"%s\" %llu/%llu %d -> obj %p\n", __func__, object_name, 168237206ee5SAlex Elder offset, length, (int)type, obj_request); 168337206ee5SAlex Elder 1684bf0d5f50SAlex Elder return obj_request; 1685bf0d5f50SAlex Elder } 1686bf0d5f50SAlex Elder 1687bf0d5f50SAlex 
Elder static void rbd_obj_request_destroy(struct kref *kref) 1688bf0d5f50SAlex Elder { 1689bf0d5f50SAlex Elder struct rbd_obj_request *obj_request; 1690bf0d5f50SAlex Elder 1691bf0d5f50SAlex Elder obj_request = container_of(kref, struct rbd_obj_request, kref); 1692bf0d5f50SAlex Elder 169337206ee5SAlex Elder dout("%s: obj %p\n", __func__, obj_request); 169437206ee5SAlex Elder 1695bf0d5f50SAlex Elder rbd_assert(obj_request->img_request == NULL); 1696bf0d5f50SAlex Elder rbd_assert(obj_request->which == BAD_WHICH); 1697bf0d5f50SAlex Elder 1698bf0d5f50SAlex Elder if (obj_request->osd_req) 1699bf0d5f50SAlex Elder rbd_osd_req_destroy(obj_request->osd_req); 1700bf0d5f50SAlex Elder 1701bf0d5f50SAlex Elder rbd_assert(obj_request_type_valid(obj_request->type)); 1702bf0d5f50SAlex Elder switch (obj_request->type) { 17039969ebc5SAlex Elder case OBJ_REQUEST_NODATA: 17049969ebc5SAlex Elder break; /* Nothing to do */ 1705bf0d5f50SAlex Elder case OBJ_REQUEST_BIO: 1706bf0d5f50SAlex Elder if (obj_request->bio_list) 1707bf0d5f50SAlex Elder bio_chain_put(obj_request->bio_list); 1708bf0d5f50SAlex Elder break; 1709788e2df3SAlex Elder case OBJ_REQUEST_PAGES: 1710788e2df3SAlex Elder if (obj_request->pages) 1711788e2df3SAlex Elder ceph_release_page_vector(obj_request->pages, 1712788e2df3SAlex Elder obj_request->page_count); 1713788e2df3SAlex Elder break; 1714bf0d5f50SAlex Elder } 1715bf0d5f50SAlex Elder 1716bf0d5f50SAlex Elder kfree(obj_request); 1717bf0d5f50SAlex Elder } 1718bf0d5f50SAlex Elder 1719bf0d5f50SAlex Elder /* 1720bf0d5f50SAlex Elder * Caller is responsible for filling in the list of object requests 1721bf0d5f50SAlex Elder * that comprises the image request, and the Linux request pointer 1722bf0d5f50SAlex Elder * (if there is one). 
1723bf0d5f50SAlex Elder */ 1724cc344fa1SAlex Elder static struct rbd_img_request *rbd_img_request_create( 1725cc344fa1SAlex Elder struct rbd_device *rbd_dev, 1726bf0d5f50SAlex Elder u64 offset, u64 length, 17279849e986SAlex Elder bool write_request, 17289849e986SAlex Elder bool child_request) 1729bf0d5f50SAlex Elder { 1730bf0d5f50SAlex Elder struct rbd_img_request *img_request; 1731bf0d5f50SAlex Elder struct ceph_snap_context *snapc = NULL; 1732bf0d5f50SAlex Elder 1733bf0d5f50SAlex Elder img_request = kmalloc(sizeof (*img_request), GFP_ATOMIC); 1734bf0d5f50SAlex Elder if (!img_request) 1735bf0d5f50SAlex Elder return NULL; 1736bf0d5f50SAlex Elder 1737bf0d5f50SAlex Elder if (write_request) { 1738bf0d5f50SAlex Elder down_read(&rbd_dev->header_rwsem); 1739bf0d5f50SAlex Elder snapc = ceph_get_snap_context(rbd_dev->header.snapc); 1740bf0d5f50SAlex Elder up_read(&rbd_dev->header_rwsem); 1741bf0d5f50SAlex Elder if (WARN_ON(!snapc)) { 1742bf0d5f50SAlex Elder kfree(img_request); 1743bf0d5f50SAlex Elder return NULL; /* Shouldn't happen */ 1744bf0d5f50SAlex Elder } 17450c425248SAlex Elder 1746bf0d5f50SAlex Elder } 1747bf0d5f50SAlex Elder 1748bf0d5f50SAlex Elder img_request->rq = NULL; 1749bf0d5f50SAlex Elder img_request->rbd_dev = rbd_dev; 1750bf0d5f50SAlex Elder img_request->offset = offset; 1751bf0d5f50SAlex Elder img_request->length = length; 17520c425248SAlex Elder img_request->flags = 0; 17530c425248SAlex Elder if (write_request) { 17540c425248SAlex Elder img_request_write_set(img_request); 1755bf0d5f50SAlex Elder img_request->snapc = snapc; 17560c425248SAlex Elder } else { 1757bf0d5f50SAlex Elder img_request->snap_id = rbd_dev->spec->snap_id; 17580c425248SAlex Elder } 17599849e986SAlex Elder if (child_request) 17609849e986SAlex Elder img_request_child_set(img_request); 1761d0b2e944SAlex Elder if (rbd_dev->parent_spec) 1762d0b2e944SAlex Elder img_request_layered_set(img_request); 1763bf0d5f50SAlex Elder spin_lock_init(&img_request->completion_lock); 1764bf0d5f50SAlex 
Elder img_request->next_completion = 0; 1765bf0d5f50SAlex Elder img_request->callback = NULL; 1766a5a337d4SAlex Elder img_request->result = 0; 1767bf0d5f50SAlex Elder img_request->obj_request_count = 0; 1768bf0d5f50SAlex Elder INIT_LIST_HEAD(&img_request->obj_requests); 1769bf0d5f50SAlex Elder kref_init(&img_request->kref); 1770bf0d5f50SAlex Elder 1771bf0d5f50SAlex Elder rbd_img_request_get(img_request); /* Avoid a warning */ 1772bf0d5f50SAlex Elder rbd_img_request_put(img_request); /* TEMPORARY */ 1773bf0d5f50SAlex Elder 177437206ee5SAlex Elder dout("%s: rbd_dev %p %s %llu/%llu -> img %p\n", __func__, rbd_dev, 177537206ee5SAlex Elder write_request ? "write" : "read", offset, length, 177637206ee5SAlex Elder img_request); 177737206ee5SAlex Elder 1778bf0d5f50SAlex Elder return img_request; 1779bf0d5f50SAlex Elder } 1780bf0d5f50SAlex Elder 1781bf0d5f50SAlex Elder static void rbd_img_request_destroy(struct kref *kref) 1782bf0d5f50SAlex Elder { 1783bf0d5f50SAlex Elder struct rbd_img_request *img_request; 1784bf0d5f50SAlex Elder struct rbd_obj_request *obj_request; 1785bf0d5f50SAlex Elder struct rbd_obj_request *next_obj_request; 1786bf0d5f50SAlex Elder 1787bf0d5f50SAlex Elder img_request = container_of(kref, struct rbd_img_request, kref); 1788bf0d5f50SAlex Elder 178937206ee5SAlex Elder dout("%s: img %p\n", __func__, img_request); 179037206ee5SAlex Elder 1791bf0d5f50SAlex Elder for_each_obj_request_safe(img_request, obj_request, next_obj_request) 1792bf0d5f50SAlex Elder rbd_img_obj_request_del(img_request, obj_request); 179325dcf954SAlex Elder rbd_assert(img_request->obj_request_count == 0); 1794bf0d5f50SAlex Elder 17950c425248SAlex Elder if (img_request_write_test(img_request)) 1796bf0d5f50SAlex Elder ceph_put_snap_context(img_request->snapc); 1797bf0d5f50SAlex Elder 17988b3e1a56SAlex Elder if (img_request_child_test(img_request)) 17998b3e1a56SAlex Elder rbd_obj_request_put(img_request->obj_request); 18008b3e1a56SAlex Elder 1801bf0d5f50SAlex Elder kfree(img_request); 
1802bf0d5f50SAlex Elder } 1803bf0d5f50SAlex Elder 18041217857fSAlex Elder static bool rbd_img_obj_end_request(struct rbd_obj_request *obj_request) 18051217857fSAlex Elder { 18066365d33aSAlex Elder struct rbd_img_request *img_request; 18071217857fSAlex Elder unsigned int xferred; 18081217857fSAlex Elder int result; 18098b3e1a56SAlex Elder bool more; 18101217857fSAlex Elder 18116365d33aSAlex Elder rbd_assert(obj_request_img_data_test(obj_request)); 18126365d33aSAlex Elder img_request = obj_request->img_request; 18136365d33aSAlex Elder 18141217857fSAlex Elder rbd_assert(obj_request->xferred <= (u64)UINT_MAX); 18151217857fSAlex Elder xferred = (unsigned int)obj_request->xferred; 18161217857fSAlex Elder result = obj_request->result; 18171217857fSAlex Elder if (result) { 18181217857fSAlex Elder struct rbd_device *rbd_dev = img_request->rbd_dev; 18191217857fSAlex Elder 18201217857fSAlex Elder rbd_warn(rbd_dev, "%s %llx at %llx (%llx)\n", 18211217857fSAlex Elder img_request_write_test(img_request) ? 
"write" : "read", 18221217857fSAlex Elder obj_request->length, obj_request->img_offset, 18231217857fSAlex Elder obj_request->offset); 18241217857fSAlex Elder rbd_warn(rbd_dev, " result %d xferred %x\n", 18251217857fSAlex Elder result, xferred); 18261217857fSAlex Elder if (!img_request->result) 18271217857fSAlex Elder img_request->result = result; 18281217857fSAlex Elder } 18291217857fSAlex Elder 1830f1a4739fSAlex Elder /* Image object requests don't own their page array */ 1831f1a4739fSAlex Elder 1832f1a4739fSAlex Elder if (obj_request->type == OBJ_REQUEST_PAGES) { 1833f1a4739fSAlex Elder obj_request->pages = NULL; 1834f1a4739fSAlex Elder obj_request->page_count = 0; 1835f1a4739fSAlex Elder } 1836f1a4739fSAlex Elder 18378b3e1a56SAlex Elder if (img_request_child_test(img_request)) { 18388b3e1a56SAlex Elder rbd_assert(img_request->obj_request != NULL); 18398b3e1a56SAlex Elder more = obj_request->which < img_request->obj_request_count - 1; 18408b3e1a56SAlex Elder } else { 18418b3e1a56SAlex Elder rbd_assert(img_request->rq != NULL); 18428b3e1a56SAlex Elder more = blk_end_request(img_request->rq, result, xferred); 18438b3e1a56SAlex Elder } 18448b3e1a56SAlex Elder 18458b3e1a56SAlex Elder return more; 18461217857fSAlex Elder } 18471217857fSAlex Elder 18482169238dSAlex Elder static void rbd_img_obj_callback(struct rbd_obj_request *obj_request) 18492169238dSAlex Elder { 18502169238dSAlex Elder struct rbd_img_request *img_request; 18512169238dSAlex Elder u32 which = obj_request->which; 18522169238dSAlex Elder bool more = true; 18532169238dSAlex Elder 18546365d33aSAlex Elder rbd_assert(obj_request_img_data_test(obj_request)); 18552169238dSAlex Elder img_request = obj_request->img_request; 18562169238dSAlex Elder 18572169238dSAlex Elder dout("%s: img %p obj %p\n", __func__, img_request, obj_request); 18582169238dSAlex Elder rbd_assert(img_request != NULL); 18592169238dSAlex Elder rbd_assert(img_request->obj_request_count > 0); 18602169238dSAlex Elder rbd_assert(which != 
BAD_WHICH); 18612169238dSAlex Elder rbd_assert(which < img_request->obj_request_count); 18622169238dSAlex Elder rbd_assert(which >= img_request->next_completion); 18632169238dSAlex Elder 18642169238dSAlex Elder spin_lock_irq(&img_request->completion_lock); 18652169238dSAlex Elder if (which != img_request->next_completion) 18662169238dSAlex Elder goto out; 18672169238dSAlex Elder 18682169238dSAlex Elder for_each_obj_request_from(img_request, obj_request) { 18692169238dSAlex Elder rbd_assert(more); 18702169238dSAlex Elder rbd_assert(which < img_request->obj_request_count); 18712169238dSAlex Elder 18722169238dSAlex Elder if (!obj_request_done_test(obj_request)) 18732169238dSAlex Elder break; 18741217857fSAlex Elder more = rbd_img_obj_end_request(obj_request); 18752169238dSAlex Elder which++; 18762169238dSAlex Elder } 18772169238dSAlex Elder 18782169238dSAlex Elder rbd_assert(more ^ (which == img_request->obj_request_count)); 18792169238dSAlex Elder img_request->next_completion = which; 18802169238dSAlex Elder out: 18812169238dSAlex Elder spin_unlock_irq(&img_request->completion_lock); 18822169238dSAlex Elder 18832169238dSAlex Elder if (!more) 18842169238dSAlex Elder rbd_img_request_complete(img_request); 18852169238dSAlex Elder } 18862169238dSAlex Elder 1887f1a4739fSAlex Elder /* 1888f1a4739fSAlex Elder * Split up an image request into one or more object requests, each 1889f1a4739fSAlex Elder * to a different object. The "type" parameter indicates whether 1890f1a4739fSAlex Elder * "data_desc" is the pointer to the head of a list of bio 1891f1a4739fSAlex Elder * structures, or the base of a page array. In either case this 1892f1a4739fSAlex Elder * function assumes data_desc describes memory sufficient to hold 1893f1a4739fSAlex Elder * all data described by the image request. 
1894f1a4739fSAlex Elder */ 1895f1a4739fSAlex Elder static int rbd_img_request_fill(struct rbd_img_request *img_request, 1896f1a4739fSAlex Elder enum obj_request_type type, 1897f1a4739fSAlex Elder void *data_desc) 1898bf0d5f50SAlex Elder { 1899bf0d5f50SAlex Elder struct rbd_device *rbd_dev = img_request->rbd_dev; 1900bf0d5f50SAlex Elder struct rbd_obj_request *obj_request = NULL; 1901bf0d5f50SAlex Elder struct rbd_obj_request *next_obj_request; 19020c425248SAlex Elder bool write_request = img_request_write_test(img_request); 1903f1a4739fSAlex Elder struct bio *bio_list; 1904f1a4739fSAlex Elder unsigned int bio_offset = 0; 1905f1a4739fSAlex Elder struct page **pages; 19067da22d29SAlex Elder u64 img_offset; 1907bf0d5f50SAlex Elder u64 resid; 1908bf0d5f50SAlex Elder u16 opcode; 1909bf0d5f50SAlex Elder 1910f1a4739fSAlex Elder dout("%s: img %p type %d data_desc %p\n", __func__, img_request, 1911f1a4739fSAlex Elder (int)type, data_desc); 191237206ee5SAlex Elder 1913430c28c3SAlex Elder opcode = write_request ? 
CEPH_OSD_OP_WRITE : CEPH_OSD_OP_READ; 19147da22d29SAlex Elder img_offset = img_request->offset; 1915bf0d5f50SAlex Elder resid = img_request->length; 19164dda41d3SAlex Elder rbd_assert(resid > 0); 1917f1a4739fSAlex Elder 1918f1a4739fSAlex Elder if (type == OBJ_REQUEST_BIO) { 1919f1a4739fSAlex Elder bio_list = data_desc; 1920f1a4739fSAlex Elder rbd_assert(img_offset == bio_list->bi_sector << SECTOR_SHIFT); 1921f1a4739fSAlex Elder } else { 1922f1a4739fSAlex Elder rbd_assert(type == OBJ_REQUEST_PAGES); 1923f1a4739fSAlex Elder pages = data_desc; 1924f1a4739fSAlex Elder } 1925f1a4739fSAlex Elder 1926bf0d5f50SAlex Elder while (resid) { 19272fa12320SAlex Elder struct ceph_osd_request *osd_req; 1928bf0d5f50SAlex Elder const char *object_name; 1929bf0d5f50SAlex Elder u64 offset; 1930bf0d5f50SAlex Elder u64 length; 1931bf0d5f50SAlex Elder 19327da22d29SAlex Elder object_name = rbd_segment_name(rbd_dev, img_offset); 1933bf0d5f50SAlex Elder if (!object_name) 1934bf0d5f50SAlex Elder goto out_unwind; 19357da22d29SAlex Elder offset = rbd_segment_offset(rbd_dev, img_offset); 19367da22d29SAlex Elder length = rbd_segment_length(rbd_dev, img_offset, resid); 1937bf0d5f50SAlex Elder obj_request = rbd_obj_request_create(object_name, 1938f1a4739fSAlex Elder offset, length, type); 1939bf0d5f50SAlex Elder kfree(object_name); /* object request has its own copy */ 1940bf0d5f50SAlex Elder if (!obj_request) 1941bf0d5f50SAlex Elder goto out_unwind; 1942bf0d5f50SAlex Elder 1943f1a4739fSAlex Elder if (type == OBJ_REQUEST_BIO) { 1944f1a4739fSAlex Elder unsigned int clone_size; 1945f1a4739fSAlex Elder 1946bf0d5f50SAlex Elder rbd_assert(length <= (u64)UINT_MAX); 1947bf0d5f50SAlex Elder clone_size = (unsigned int)length; 1948f1a4739fSAlex Elder obj_request->bio_list = 1949f1a4739fSAlex Elder bio_chain_clone_range(&bio_list, 1950f1a4739fSAlex Elder &bio_offset, 1951f1a4739fSAlex Elder clone_size, 1952bf0d5f50SAlex Elder GFP_ATOMIC); 1953bf0d5f50SAlex Elder if (!obj_request->bio_list) 1954bf0d5f50SAlex 
Elder goto out_partial; 1955f1a4739fSAlex Elder } else { 1956f1a4739fSAlex Elder unsigned int page_count; 1957f1a4739fSAlex Elder 1958f1a4739fSAlex Elder obj_request->pages = pages; 1959f1a4739fSAlex Elder page_count = (u32)calc_pages_for(offset, length); 1960f1a4739fSAlex Elder obj_request->page_count = page_count; 1961f1a4739fSAlex Elder if ((offset + length) & ~PAGE_MASK) 1962f1a4739fSAlex Elder page_count--; /* more on last page */ 1963f1a4739fSAlex Elder pages += page_count; 1964f1a4739fSAlex Elder } 1965bf0d5f50SAlex Elder 19662fa12320SAlex Elder osd_req = rbd_osd_req_create(rbd_dev, write_request, 19672fa12320SAlex Elder obj_request); 19682fa12320SAlex Elder if (!osd_req) 1969bf0d5f50SAlex Elder goto out_partial; 19702fa12320SAlex Elder obj_request->osd_req = osd_req; 19712169238dSAlex Elder obj_request->callback = rbd_img_obj_callback; 1972430c28c3SAlex Elder 19732fa12320SAlex Elder osd_req_op_extent_init(osd_req, 0, opcode, offset, length, 19742fa12320SAlex Elder 0, 0); 1975f1a4739fSAlex Elder if (type == OBJ_REQUEST_BIO) 1976406e2c9fSAlex Elder osd_req_op_extent_osd_data_bio(osd_req, 0, 1977f1a4739fSAlex Elder obj_request->bio_list, length); 1978f1a4739fSAlex Elder else 1979f1a4739fSAlex Elder osd_req_op_extent_osd_data_pages(osd_req, 0, 1980f1a4739fSAlex Elder obj_request->pages, length, 1981f1a4739fSAlex Elder offset & ~PAGE_MASK, false, false); 19829d4df01fSAlex Elder 19839d4df01fSAlex Elder if (write_request) 19849d4df01fSAlex Elder rbd_osd_req_format_write(obj_request); 19859d4df01fSAlex Elder else 19869d4df01fSAlex Elder rbd_osd_req_format_read(obj_request); 1987430c28c3SAlex Elder 19887da22d29SAlex Elder obj_request->img_offset = img_offset; 1989bf0d5f50SAlex Elder rbd_img_obj_request_add(img_request, obj_request); 1990bf0d5f50SAlex Elder 19917da22d29SAlex Elder img_offset += length; 1992bf0d5f50SAlex Elder resid -= length; 1993bf0d5f50SAlex Elder } 1994bf0d5f50SAlex Elder 1995bf0d5f50SAlex Elder return 0; 1996bf0d5f50SAlex Elder 1997bf0d5f50SAlex 
Elder out_partial: 1998bf0d5f50SAlex Elder rbd_obj_request_put(obj_request); 1999bf0d5f50SAlex Elder out_unwind: 2000bf0d5f50SAlex Elder for_each_obj_request_safe(img_request, obj_request, next_obj_request) 2001bf0d5f50SAlex Elder rbd_obj_request_put(obj_request); 2002bf0d5f50SAlex Elder 2003bf0d5f50SAlex Elder return -ENOMEM; 2004bf0d5f50SAlex Elder } 2005bf0d5f50SAlex Elder 20063d7efd18SAlex Elder static void 20070eefd470SAlex Elder rbd_img_obj_copyup_callback(struct rbd_obj_request *obj_request) 20080eefd470SAlex Elder { 20090eefd470SAlex Elder struct rbd_img_request *img_request; 20100eefd470SAlex Elder struct rbd_device *rbd_dev; 20110eefd470SAlex Elder u64 length; 20120eefd470SAlex Elder u32 page_count; 20130eefd470SAlex Elder 20140eefd470SAlex Elder rbd_assert(obj_request->type == OBJ_REQUEST_BIO); 20150eefd470SAlex Elder rbd_assert(obj_request_img_data_test(obj_request)); 20160eefd470SAlex Elder img_request = obj_request->img_request; 20170eefd470SAlex Elder rbd_assert(img_request); 20180eefd470SAlex Elder 20190eefd470SAlex Elder rbd_dev = img_request->rbd_dev; 20200eefd470SAlex Elder rbd_assert(rbd_dev); 20210eefd470SAlex Elder length = (u64)1 << rbd_dev->header.obj_order; 20220eefd470SAlex Elder page_count = (u32)calc_pages_for(0, length); 20230eefd470SAlex Elder 20240eefd470SAlex Elder rbd_assert(obj_request->copyup_pages); 20250eefd470SAlex Elder ceph_release_page_vector(obj_request->copyup_pages, page_count); 20260eefd470SAlex Elder obj_request->copyup_pages = NULL; 20270eefd470SAlex Elder 20280eefd470SAlex Elder /* 20290eefd470SAlex Elder * We want the transfer count to reflect the size of the 20300eefd470SAlex Elder * original write request. There is no such thing as a 20310eefd470SAlex Elder * successful short write, so if the request was successful 20320eefd470SAlex Elder * we can just set it to the originally-requested length. 
20330eefd470SAlex Elder */ 20340eefd470SAlex Elder if (!obj_request->result) 20350eefd470SAlex Elder obj_request->xferred = obj_request->length; 20360eefd470SAlex Elder 20370eefd470SAlex Elder /* Finish up with the normal image object callback */ 20380eefd470SAlex Elder 20390eefd470SAlex Elder rbd_img_obj_callback(obj_request); 20400eefd470SAlex Elder } 20410eefd470SAlex Elder 20420eefd470SAlex Elder static void 20433d7efd18SAlex Elder rbd_img_obj_parent_read_full_callback(struct rbd_img_request *img_request) 20443d7efd18SAlex Elder { 20453d7efd18SAlex Elder struct rbd_obj_request *orig_request; 20460eefd470SAlex Elder struct ceph_osd_request *osd_req; 20470eefd470SAlex Elder struct ceph_osd_client *osdc; 20480eefd470SAlex Elder struct rbd_device *rbd_dev; 20493d7efd18SAlex Elder struct page **pages; 20503d7efd18SAlex Elder int result; 20513d7efd18SAlex Elder u64 obj_size; 20523d7efd18SAlex Elder u64 xferred; 20533d7efd18SAlex Elder 20543d7efd18SAlex Elder rbd_assert(img_request_child_test(img_request)); 20553d7efd18SAlex Elder 20563d7efd18SAlex Elder /* First get what we need from the image request */ 20573d7efd18SAlex Elder 20583d7efd18SAlex Elder pages = img_request->copyup_pages; 20593d7efd18SAlex Elder rbd_assert(pages != NULL); 20603d7efd18SAlex Elder img_request->copyup_pages = NULL; 20613d7efd18SAlex Elder 20623d7efd18SAlex Elder orig_request = img_request->obj_request; 20633d7efd18SAlex Elder rbd_assert(orig_request != NULL); 20640eefd470SAlex Elder rbd_assert(orig_request->type == OBJ_REQUEST_BIO); 20653d7efd18SAlex Elder result = img_request->result; 20663d7efd18SAlex Elder obj_size = img_request->length; 20673d7efd18SAlex Elder xferred = img_request->xferred; 20683d7efd18SAlex Elder 20690eefd470SAlex Elder rbd_dev = img_request->rbd_dev; 20700eefd470SAlex Elder rbd_assert(rbd_dev); 20710eefd470SAlex Elder rbd_assert(obj_size == (u64)1 << rbd_dev->header.obj_order); 20720eefd470SAlex Elder 20733d7efd18SAlex Elder rbd_img_request_put(img_request); 
20743d7efd18SAlex Elder 20750eefd470SAlex Elder if (result) 20760eefd470SAlex Elder goto out_err; 20773d7efd18SAlex Elder 20780eefd470SAlex Elder /* Allocate the new copyup osd request for the original request */ 20793d7efd18SAlex Elder 20800eefd470SAlex Elder result = -ENOMEM; 20810eefd470SAlex Elder rbd_assert(!orig_request->osd_req); 20820eefd470SAlex Elder osd_req = rbd_osd_req_create_copyup(orig_request); 20830eefd470SAlex Elder if (!osd_req) 20840eefd470SAlex Elder goto out_err; 20850eefd470SAlex Elder orig_request->osd_req = osd_req; 20860eefd470SAlex Elder orig_request->copyup_pages = pages; 20873d7efd18SAlex Elder 20880eefd470SAlex Elder /* Initialize the copyup op */ 20890eefd470SAlex Elder 20900eefd470SAlex Elder osd_req_op_cls_init(osd_req, 0, CEPH_OSD_OP_CALL, "rbd", "copyup"); 20910eefd470SAlex Elder osd_req_op_cls_request_data_pages(osd_req, 0, pages, obj_size, 0, 20920eefd470SAlex Elder false, false); 20930eefd470SAlex Elder 20940eefd470SAlex Elder /* Then the original write request op */ 20950eefd470SAlex Elder 20960eefd470SAlex Elder osd_req_op_extent_init(osd_req, 1, CEPH_OSD_OP_WRITE, 20970eefd470SAlex Elder orig_request->offset, 20980eefd470SAlex Elder orig_request->length, 0, 0); 20990eefd470SAlex Elder osd_req_op_extent_osd_data_bio(osd_req, 1, orig_request->bio_list, 21000eefd470SAlex Elder orig_request->length); 21010eefd470SAlex Elder 21020eefd470SAlex Elder rbd_osd_req_format_write(orig_request); 21030eefd470SAlex Elder 21040eefd470SAlex Elder /* All set, send it off. 
*/ 21050eefd470SAlex Elder 21060eefd470SAlex Elder orig_request->callback = rbd_img_obj_copyup_callback; 21070eefd470SAlex Elder osdc = &rbd_dev->rbd_client->client->osdc; 21080eefd470SAlex Elder result = rbd_obj_request_submit(osdc, orig_request); 21090eefd470SAlex Elder if (!result) 21100eefd470SAlex Elder return; 21110eefd470SAlex Elder out_err: 21120eefd470SAlex Elder /* Record the error code and complete the request */ 21130eefd470SAlex Elder 21140eefd470SAlex Elder orig_request->result = result; 21150eefd470SAlex Elder orig_request->xferred = 0; 21163d7efd18SAlex Elder obj_request_done_set(orig_request); 21173d7efd18SAlex Elder rbd_obj_request_complete(orig_request); 21183d7efd18SAlex Elder } 21193d7efd18SAlex Elder 21203d7efd18SAlex Elder /* 21213d7efd18SAlex Elder * Read from the parent image the range of data that covers the 21223d7efd18SAlex Elder * entire target of the given object request. This is used for 21233d7efd18SAlex Elder * satisfying a layered image write request when the target of an 21243d7efd18SAlex Elder * object request from the image request does not exist. 21253d7efd18SAlex Elder * 21263d7efd18SAlex Elder * A page array big enough to hold the returned data is allocated 21273d7efd18SAlex Elder * and supplied to rbd_img_request_fill() as the "data descriptor." 21283d7efd18SAlex Elder * When the read completes, this page array will be transferred to 21293d7efd18SAlex Elder * the original object request for the copyup operation. 21303d7efd18SAlex Elder * 21313d7efd18SAlex Elder * If an error occurs, record it as the result of the original 21323d7efd18SAlex Elder * object request and mark it done so it gets completed. 
21333d7efd18SAlex Elder */ 21343d7efd18SAlex Elder static int rbd_img_obj_parent_read_full(struct rbd_obj_request *obj_request) 21353d7efd18SAlex Elder { 21363d7efd18SAlex Elder struct rbd_img_request *img_request = NULL; 21373d7efd18SAlex Elder struct rbd_img_request *parent_request = NULL; 21383d7efd18SAlex Elder struct rbd_device *rbd_dev; 21393d7efd18SAlex Elder u64 img_offset; 21403d7efd18SAlex Elder u64 length; 21413d7efd18SAlex Elder struct page **pages = NULL; 21423d7efd18SAlex Elder u32 page_count; 21433d7efd18SAlex Elder int result; 21443d7efd18SAlex Elder 21453d7efd18SAlex Elder rbd_assert(obj_request_img_data_test(obj_request)); 21463d7efd18SAlex Elder rbd_assert(obj_request->type == OBJ_REQUEST_BIO); 21473d7efd18SAlex Elder 21483d7efd18SAlex Elder img_request = obj_request->img_request; 21493d7efd18SAlex Elder rbd_assert(img_request != NULL); 21503d7efd18SAlex Elder rbd_dev = img_request->rbd_dev; 21513d7efd18SAlex Elder rbd_assert(rbd_dev->parent != NULL); 21523d7efd18SAlex Elder 21533d7efd18SAlex Elder /* 21540eefd470SAlex Elder * First things first. The original osd request is of no 21550eefd470SAlex Elder * use to use any more, we'll need a new one that can hold 21560eefd470SAlex Elder * the two ops in a copyup request. We'll get that later, 21570eefd470SAlex Elder * but for now we can release the old one. 21580eefd470SAlex Elder */ 21590eefd470SAlex Elder rbd_osd_req_destroy(obj_request->osd_req); 21600eefd470SAlex Elder obj_request->osd_req = NULL; 21610eefd470SAlex Elder 21620eefd470SAlex Elder /* 21633d7efd18SAlex Elder * Determine the byte range covered by the object in the 21643d7efd18SAlex Elder * child image to which the original request was to be sent. 
21653d7efd18SAlex Elder */ 21663d7efd18SAlex Elder img_offset = obj_request->img_offset - obj_request->offset; 21673d7efd18SAlex Elder length = (u64)1 << rbd_dev->header.obj_order; 21683d7efd18SAlex Elder 21693d7efd18SAlex Elder /* 2170a9e8ba2cSAlex Elder * There is no defined parent data beyond the parent 2171a9e8ba2cSAlex Elder * overlap, so limit what we read at that boundary if 2172a9e8ba2cSAlex Elder * necessary. 2173a9e8ba2cSAlex Elder */ 2174a9e8ba2cSAlex Elder if (img_offset + length > rbd_dev->parent_overlap) { 2175a9e8ba2cSAlex Elder rbd_assert(img_offset < rbd_dev->parent_overlap); 2176a9e8ba2cSAlex Elder length = rbd_dev->parent_overlap - img_offset; 2177a9e8ba2cSAlex Elder } 2178a9e8ba2cSAlex Elder 2179a9e8ba2cSAlex Elder /* 21803d7efd18SAlex Elder * Allocate a page array big enough to receive the data read 21813d7efd18SAlex Elder * from the parent. 21823d7efd18SAlex Elder */ 21833d7efd18SAlex Elder page_count = (u32)calc_pages_for(0, length); 21843d7efd18SAlex Elder pages = ceph_alloc_page_vector(page_count, GFP_KERNEL); 21853d7efd18SAlex Elder if (IS_ERR(pages)) { 21863d7efd18SAlex Elder result = PTR_ERR(pages); 21873d7efd18SAlex Elder pages = NULL; 21883d7efd18SAlex Elder goto out_err; 21893d7efd18SAlex Elder } 21903d7efd18SAlex Elder 21913d7efd18SAlex Elder result = -ENOMEM; 21923d7efd18SAlex Elder parent_request = rbd_img_request_create(rbd_dev->parent, 21933d7efd18SAlex Elder img_offset, length, 21943d7efd18SAlex Elder false, true); 21953d7efd18SAlex Elder if (!parent_request) 21963d7efd18SAlex Elder goto out_err; 21973d7efd18SAlex Elder rbd_obj_request_get(obj_request); 21983d7efd18SAlex Elder parent_request->obj_request = obj_request; 21993d7efd18SAlex Elder 22003d7efd18SAlex Elder result = rbd_img_request_fill(parent_request, OBJ_REQUEST_PAGES, pages); 22013d7efd18SAlex Elder if (result) 22023d7efd18SAlex Elder goto out_err; 22033d7efd18SAlex Elder parent_request->copyup_pages = pages; 22043d7efd18SAlex Elder 22053d7efd18SAlex Elder 
parent_request->callback = rbd_img_obj_parent_read_full_callback; 22063d7efd18SAlex Elder result = rbd_img_request_submit(parent_request); 22073d7efd18SAlex Elder if (!result) 22083d7efd18SAlex Elder return 0; 22093d7efd18SAlex Elder 22103d7efd18SAlex Elder parent_request->copyup_pages = NULL; 22113d7efd18SAlex Elder parent_request->obj_request = NULL; 22123d7efd18SAlex Elder rbd_obj_request_put(obj_request); 22133d7efd18SAlex Elder out_err: 22143d7efd18SAlex Elder if (pages) 22153d7efd18SAlex Elder ceph_release_page_vector(pages, page_count); 22163d7efd18SAlex Elder if (parent_request) 22173d7efd18SAlex Elder rbd_img_request_put(parent_request); 22183d7efd18SAlex Elder obj_request->result = result; 22193d7efd18SAlex Elder obj_request->xferred = 0; 22203d7efd18SAlex Elder obj_request_done_set(obj_request); 22213d7efd18SAlex Elder 22223d7efd18SAlex Elder return result; 22233d7efd18SAlex Elder } 22243d7efd18SAlex Elder 2225c5b5ef6cSAlex Elder static void rbd_img_obj_exists_callback(struct rbd_obj_request *obj_request) 2226c5b5ef6cSAlex Elder { 2227c5b5ef6cSAlex Elder struct rbd_obj_request *orig_request; 2228c5b5ef6cSAlex Elder int result; 2229c5b5ef6cSAlex Elder 2230c5b5ef6cSAlex Elder rbd_assert(!obj_request_img_data_test(obj_request)); 2231c5b5ef6cSAlex Elder 2232c5b5ef6cSAlex Elder /* 2233c5b5ef6cSAlex Elder * All we need from the object request is the original 2234c5b5ef6cSAlex Elder * request and the result of the STAT op. Grab those, then 2235c5b5ef6cSAlex Elder * we're done with the request. 
2236c5b5ef6cSAlex Elder */ 2237c5b5ef6cSAlex Elder orig_request = obj_request->obj_request; 2238c5b5ef6cSAlex Elder obj_request->obj_request = NULL; 2239c5b5ef6cSAlex Elder rbd_assert(orig_request); 2240c5b5ef6cSAlex Elder rbd_assert(orig_request->img_request); 2241c5b5ef6cSAlex Elder 2242c5b5ef6cSAlex Elder result = obj_request->result; 2243c5b5ef6cSAlex Elder obj_request->result = 0; 2244c5b5ef6cSAlex Elder 2245c5b5ef6cSAlex Elder dout("%s: obj %p for obj %p result %d %llu/%llu\n", __func__, 2246c5b5ef6cSAlex Elder obj_request, orig_request, result, 2247c5b5ef6cSAlex Elder obj_request->xferred, obj_request->length); 2248c5b5ef6cSAlex Elder rbd_obj_request_put(obj_request); 2249c5b5ef6cSAlex Elder 2250c5b5ef6cSAlex Elder rbd_assert(orig_request); 2251c5b5ef6cSAlex Elder rbd_assert(orig_request->img_request); 2252c5b5ef6cSAlex Elder 2253c5b5ef6cSAlex Elder /* 2254c5b5ef6cSAlex Elder * Our only purpose here is to determine whether the object 2255c5b5ef6cSAlex Elder * exists, and we don't want to treat the non-existence as 2256c5b5ef6cSAlex Elder * an error. If something else comes back, transfer the 2257c5b5ef6cSAlex Elder * error to the original request and complete it now. 2258c5b5ef6cSAlex Elder */ 2259c5b5ef6cSAlex Elder if (!result) { 2260c5b5ef6cSAlex Elder obj_request_existence_set(orig_request, true); 2261c5b5ef6cSAlex Elder } else if (result == -ENOENT) { 2262c5b5ef6cSAlex Elder obj_request_existence_set(orig_request, false); 2263c5b5ef6cSAlex Elder } else if (result) { 2264c5b5ef6cSAlex Elder orig_request->result = result; 22653d7efd18SAlex Elder goto out; 2266c5b5ef6cSAlex Elder } 2267c5b5ef6cSAlex Elder 2268c5b5ef6cSAlex Elder /* 2269c5b5ef6cSAlex Elder * Resubmit the original request now that we have recorded 2270c5b5ef6cSAlex Elder * whether the target object exists. 
2271c5b5ef6cSAlex Elder */ 2272b454e36dSAlex Elder orig_request->result = rbd_img_obj_request_submit(orig_request); 22733d7efd18SAlex Elder out: 2274c5b5ef6cSAlex Elder if (orig_request->result) 2275c5b5ef6cSAlex Elder rbd_obj_request_complete(orig_request); 2276c5b5ef6cSAlex Elder rbd_obj_request_put(orig_request); 2277c5b5ef6cSAlex Elder } 2278c5b5ef6cSAlex Elder 2279c5b5ef6cSAlex Elder static int rbd_img_obj_exists_submit(struct rbd_obj_request *obj_request) 2280c5b5ef6cSAlex Elder { 2281c5b5ef6cSAlex Elder struct rbd_obj_request *stat_request; 2282c5b5ef6cSAlex Elder struct rbd_device *rbd_dev; 2283c5b5ef6cSAlex Elder struct ceph_osd_client *osdc; 2284c5b5ef6cSAlex Elder struct page **pages = NULL; 2285c5b5ef6cSAlex Elder u32 page_count; 2286c5b5ef6cSAlex Elder size_t size; 2287c5b5ef6cSAlex Elder int ret; 2288c5b5ef6cSAlex Elder 2289c5b5ef6cSAlex Elder /* 2290c5b5ef6cSAlex Elder * The response data for a STAT call consists of: 2291c5b5ef6cSAlex Elder * le64 length; 2292c5b5ef6cSAlex Elder * struct { 2293c5b5ef6cSAlex Elder * le32 tv_sec; 2294c5b5ef6cSAlex Elder * le32 tv_nsec; 2295c5b5ef6cSAlex Elder * } mtime; 2296c5b5ef6cSAlex Elder */ 2297c5b5ef6cSAlex Elder size = sizeof (__le64) + sizeof (__le32) + sizeof (__le32); 2298c5b5ef6cSAlex Elder page_count = (u32)calc_pages_for(0, size); 2299c5b5ef6cSAlex Elder pages = ceph_alloc_page_vector(page_count, GFP_KERNEL); 2300c5b5ef6cSAlex Elder if (IS_ERR(pages)) 2301c5b5ef6cSAlex Elder return PTR_ERR(pages); 2302c5b5ef6cSAlex Elder 2303c5b5ef6cSAlex Elder ret = -ENOMEM; 2304c5b5ef6cSAlex Elder stat_request = rbd_obj_request_create(obj_request->object_name, 0, 0, 2305c5b5ef6cSAlex Elder OBJ_REQUEST_PAGES); 2306c5b5ef6cSAlex Elder if (!stat_request) 2307c5b5ef6cSAlex Elder goto out; 2308c5b5ef6cSAlex Elder 2309c5b5ef6cSAlex Elder rbd_obj_request_get(obj_request); 2310c5b5ef6cSAlex Elder stat_request->obj_request = obj_request; 2311c5b5ef6cSAlex Elder stat_request->pages = pages; 2312c5b5ef6cSAlex Elder 
stat_request->page_count = page_count; 2313c5b5ef6cSAlex Elder 2314c5b5ef6cSAlex Elder rbd_assert(obj_request->img_request); 2315c5b5ef6cSAlex Elder rbd_dev = obj_request->img_request->rbd_dev; 2316c5b5ef6cSAlex Elder stat_request->osd_req = rbd_osd_req_create(rbd_dev, false, 2317c5b5ef6cSAlex Elder stat_request); 2318c5b5ef6cSAlex Elder if (!stat_request->osd_req) 2319c5b5ef6cSAlex Elder goto out; 2320c5b5ef6cSAlex Elder stat_request->callback = rbd_img_obj_exists_callback; 2321c5b5ef6cSAlex Elder 2322c5b5ef6cSAlex Elder osd_req_op_init(stat_request->osd_req, 0, CEPH_OSD_OP_STAT); 2323c5b5ef6cSAlex Elder osd_req_op_raw_data_in_pages(stat_request->osd_req, 0, pages, size, 0, 2324c5b5ef6cSAlex Elder false, false); 23259d4df01fSAlex Elder rbd_osd_req_format_read(stat_request); 2326c5b5ef6cSAlex Elder 2327c5b5ef6cSAlex Elder osdc = &rbd_dev->rbd_client->client->osdc; 2328c5b5ef6cSAlex Elder ret = rbd_obj_request_submit(osdc, stat_request); 2329c5b5ef6cSAlex Elder out: 2330c5b5ef6cSAlex Elder if (ret) 2331c5b5ef6cSAlex Elder rbd_obj_request_put(obj_request); 2332c5b5ef6cSAlex Elder 2333c5b5ef6cSAlex Elder return ret; 2334c5b5ef6cSAlex Elder } 2335c5b5ef6cSAlex Elder 2336b454e36dSAlex Elder static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request) 2337b454e36dSAlex Elder { 2338b454e36dSAlex Elder struct rbd_img_request *img_request; 2339a9e8ba2cSAlex Elder struct rbd_device *rbd_dev; 23403d7efd18SAlex Elder bool known; 2341b454e36dSAlex Elder 2342b454e36dSAlex Elder rbd_assert(obj_request_img_data_test(obj_request)); 2343b454e36dSAlex Elder 2344b454e36dSAlex Elder img_request = obj_request->img_request; 2345b454e36dSAlex Elder rbd_assert(img_request); 2346a9e8ba2cSAlex Elder rbd_dev = img_request->rbd_dev; 2347b454e36dSAlex Elder 2348b454e36dSAlex Elder /* 2349a9e8ba2cSAlex Elder * Only writes to layered images need special handling. 2350a9e8ba2cSAlex Elder * Reads and non-layered writes are simple object requests. 
2351a9e8ba2cSAlex Elder * Layered writes that start beyond the end of the overlap 2352a9e8ba2cSAlex Elder * with the parent have no parent data, so they too are 2353a9e8ba2cSAlex Elder * simple object requests. Finally, if the target object is 2354a9e8ba2cSAlex Elder * known to already exist, its parent data has already been 2355a9e8ba2cSAlex Elder * copied, so a write to the object can also be handled as a 2356a9e8ba2cSAlex Elder * simple object request. 2357b454e36dSAlex Elder */ 2358b454e36dSAlex Elder if (!img_request_write_test(img_request) || 2359b454e36dSAlex Elder !img_request_layered_test(img_request) || 2360a9e8ba2cSAlex Elder rbd_dev->parent_overlap <= obj_request->img_offset || 23613d7efd18SAlex Elder ((known = obj_request_known_test(obj_request)) && 23623d7efd18SAlex Elder obj_request_exists_test(obj_request))) { 2363b454e36dSAlex Elder 2364b454e36dSAlex Elder struct rbd_device *rbd_dev; 2365b454e36dSAlex Elder struct ceph_osd_client *osdc; 2366b454e36dSAlex Elder 2367b454e36dSAlex Elder rbd_dev = obj_request->img_request->rbd_dev; 2368b454e36dSAlex Elder osdc = &rbd_dev->rbd_client->client->osdc; 2369b454e36dSAlex Elder 2370b454e36dSAlex Elder return rbd_obj_request_submit(osdc, obj_request); 2371b454e36dSAlex Elder } 2372b454e36dSAlex Elder 2373b454e36dSAlex Elder /* 23743d7efd18SAlex Elder * It's a layered write. The target object might exist but 23753d7efd18SAlex Elder * we may not know that yet. If we know it doesn't exist, 23763d7efd18SAlex Elder * start by reading the data for the full target object from 23773d7efd18SAlex Elder * the parent so we can use it for a copyup to the target. 2378b454e36dSAlex Elder */ 23793d7efd18SAlex Elder if (known) 23803d7efd18SAlex Elder return rbd_img_obj_parent_read_full(obj_request); 23813d7efd18SAlex Elder 23823d7efd18SAlex Elder /* We don't know whether the target exists. Go find out. 
*/ 2383b454e36dSAlex Elder 2384b454e36dSAlex Elder return rbd_img_obj_exists_submit(obj_request); 2385b454e36dSAlex Elder } 2386b454e36dSAlex Elder 2387bf0d5f50SAlex Elder static int rbd_img_request_submit(struct rbd_img_request *img_request) 2388bf0d5f50SAlex Elder { 2389bf0d5f50SAlex Elder struct rbd_obj_request *obj_request; 239046faeed4SAlex Elder struct rbd_obj_request *next_obj_request; 2391bf0d5f50SAlex Elder 239237206ee5SAlex Elder dout("%s: img %p\n", __func__, img_request); 239346faeed4SAlex Elder for_each_obj_request_safe(img_request, obj_request, next_obj_request) { 2394bf0d5f50SAlex Elder int ret; 2395bf0d5f50SAlex Elder 2396b454e36dSAlex Elder ret = rbd_img_obj_request_submit(obj_request); 2397bf0d5f50SAlex Elder if (ret) 2398bf0d5f50SAlex Elder return ret; 2399bf0d5f50SAlex Elder } 2400bf0d5f50SAlex Elder 2401bf0d5f50SAlex Elder return 0; 2402bf0d5f50SAlex Elder } 2403bf0d5f50SAlex Elder 24048b3e1a56SAlex Elder static void rbd_img_parent_read_callback(struct rbd_img_request *img_request) 24058b3e1a56SAlex Elder { 24068b3e1a56SAlex Elder struct rbd_obj_request *obj_request; 2407a9e8ba2cSAlex Elder struct rbd_device *rbd_dev; 2408a9e8ba2cSAlex Elder u64 obj_end; 24098b3e1a56SAlex Elder 24108b3e1a56SAlex Elder rbd_assert(img_request_child_test(img_request)); 24118b3e1a56SAlex Elder 24128b3e1a56SAlex Elder obj_request = img_request->obj_request; 2413a9e8ba2cSAlex Elder rbd_assert(obj_request); 2414a9e8ba2cSAlex Elder rbd_assert(obj_request->img_request); 24158b3e1a56SAlex Elder 2416a9e8ba2cSAlex Elder obj_request->result = img_request->result; 2417a9e8ba2cSAlex Elder if (obj_request->result) 2418a9e8ba2cSAlex Elder goto out; 2419a9e8ba2cSAlex Elder 2420a9e8ba2cSAlex Elder /* 2421a9e8ba2cSAlex Elder * We need to zero anything beyond the parent overlap 2422a9e8ba2cSAlex Elder * boundary. 
Since rbd_img_obj_request_read_callback() 2423a9e8ba2cSAlex Elder * will zero anything beyond the end of a short read, an 2424a9e8ba2cSAlex Elder * easy way to do this is to pretend the data from the 2425a9e8ba2cSAlex Elder * parent came up short--ending at the overlap boundary. 2426a9e8ba2cSAlex Elder */ 2427a9e8ba2cSAlex Elder rbd_assert(obj_request->img_offset < U64_MAX - obj_request->length); 2428a9e8ba2cSAlex Elder obj_end = obj_request->img_offset + obj_request->length; 2429a9e8ba2cSAlex Elder rbd_dev = obj_request->img_request->rbd_dev; 2430a9e8ba2cSAlex Elder if (obj_end > rbd_dev->parent_overlap) { 2431a9e8ba2cSAlex Elder u64 xferred = 0; 2432a9e8ba2cSAlex Elder 2433a9e8ba2cSAlex Elder if (obj_request->img_offset < rbd_dev->parent_overlap) 2434a9e8ba2cSAlex Elder xferred = rbd_dev->parent_overlap - 2435a9e8ba2cSAlex Elder obj_request->img_offset; 2436a9e8ba2cSAlex Elder 2437a9e8ba2cSAlex Elder obj_request->xferred = min(img_request->xferred, xferred); 2438a9e8ba2cSAlex Elder } else { 2439a9e8ba2cSAlex Elder obj_request->xferred = img_request->xferred; 2440a9e8ba2cSAlex Elder } 2441a9e8ba2cSAlex Elder out: 24428b3e1a56SAlex Elder rbd_img_obj_request_read_callback(obj_request); 24438b3e1a56SAlex Elder rbd_obj_request_complete(obj_request); 24448b3e1a56SAlex Elder } 24458b3e1a56SAlex Elder 24468b3e1a56SAlex Elder static void rbd_img_parent_read(struct rbd_obj_request *obj_request) 24478b3e1a56SAlex Elder { 24488b3e1a56SAlex Elder struct rbd_device *rbd_dev; 24498b3e1a56SAlex Elder struct rbd_img_request *img_request; 24508b3e1a56SAlex Elder int result; 24518b3e1a56SAlex Elder 24528b3e1a56SAlex Elder rbd_assert(obj_request_img_data_test(obj_request)); 24538b3e1a56SAlex Elder rbd_assert(obj_request->img_request != NULL); 24548b3e1a56SAlex Elder rbd_assert(obj_request->result == (s32) -ENOENT); 24558b3e1a56SAlex Elder rbd_assert(obj_request->type == OBJ_REQUEST_BIO); 24568b3e1a56SAlex Elder 24578b3e1a56SAlex Elder rbd_dev = obj_request->img_request->rbd_dev; 
24588b3e1a56SAlex Elder rbd_assert(rbd_dev->parent != NULL); 24598b3e1a56SAlex Elder /* rbd_read_finish(obj_request, obj_request->length); */ 24608b3e1a56SAlex Elder img_request = rbd_img_request_create(rbd_dev->parent, 24618b3e1a56SAlex Elder obj_request->img_offset, 24628b3e1a56SAlex Elder obj_request->length, 24638b3e1a56SAlex Elder false, true); 24648b3e1a56SAlex Elder result = -ENOMEM; 24658b3e1a56SAlex Elder if (!img_request) 24668b3e1a56SAlex Elder goto out_err; 24678b3e1a56SAlex Elder 24688b3e1a56SAlex Elder rbd_obj_request_get(obj_request); 24698b3e1a56SAlex Elder img_request->obj_request = obj_request; 24708b3e1a56SAlex Elder 2471f1a4739fSAlex Elder result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO, 2472f1a4739fSAlex Elder obj_request->bio_list); 24738b3e1a56SAlex Elder if (result) 24748b3e1a56SAlex Elder goto out_err; 24758b3e1a56SAlex Elder 24768b3e1a56SAlex Elder img_request->callback = rbd_img_parent_read_callback; 24778b3e1a56SAlex Elder result = rbd_img_request_submit(img_request); 24788b3e1a56SAlex Elder if (result) 24798b3e1a56SAlex Elder goto out_err; 24808b3e1a56SAlex Elder 24818b3e1a56SAlex Elder return; 24828b3e1a56SAlex Elder out_err: 24838b3e1a56SAlex Elder if (img_request) 24848b3e1a56SAlex Elder rbd_img_request_put(img_request); 24858b3e1a56SAlex Elder obj_request->result = result; 24868b3e1a56SAlex Elder obj_request->xferred = 0; 24878b3e1a56SAlex Elder obj_request_done_set(obj_request); 24888b3e1a56SAlex Elder } 24898b3e1a56SAlex Elder 2490cf81b60eSAlex Elder static int rbd_obj_notify_ack(struct rbd_device *rbd_dev, 2491b8d70035SAlex Elder u64 ver, u64 notify_id) 2492b8d70035SAlex Elder { 2493b8d70035SAlex Elder struct rbd_obj_request *obj_request; 24942169238dSAlex Elder struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 2495b8d70035SAlex Elder int ret; 2496b8d70035SAlex Elder 2497b8d70035SAlex Elder obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0, 2498b8d70035SAlex Elder OBJ_REQUEST_NODATA); 
2499b8d70035SAlex Elder if (!obj_request) 2500b8d70035SAlex Elder return -ENOMEM; 2501b8d70035SAlex Elder 2502b8d70035SAlex Elder ret = -ENOMEM; 2503430c28c3SAlex Elder obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request); 2504b8d70035SAlex Elder if (!obj_request->osd_req) 2505b8d70035SAlex Elder goto out; 25062169238dSAlex Elder obj_request->callback = rbd_obj_request_put; 2507b8d70035SAlex Elder 2508c99d2d4aSAlex Elder osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_NOTIFY_ACK, 2509c99d2d4aSAlex Elder notify_id, ver, 0); 25109d4df01fSAlex Elder rbd_osd_req_format_read(obj_request); 2511430c28c3SAlex Elder 2512b8d70035SAlex Elder ret = rbd_obj_request_submit(osdc, obj_request); 2513b8d70035SAlex Elder out: 2514cf81b60eSAlex Elder if (ret) 2515b8d70035SAlex Elder rbd_obj_request_put(obj_request); 2516b8d70035SAlex Elder 2517b8d70035SAlex Elder return ret; 2518b8d70035SAlex Elder } 2519b8d70035SAlex Elder 2520b8d70035SAlex Elder static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data) 2521b8d70035SAlex Elder { 2522b8d70035SAlex Elder struct rbd_device *rbd_dev = (struct rbd_device *)data; 2523b8d70035SAlex Elder u64 hver; 2524b8d70035SAlex Elder 2525b8d70035SAlex Elder if (!rbd_dev) 2526b8d70035SAlex Elder return; 2527b8d70035SAlex Elder 252837206ee5SAlex Elder dout("%s: \"%s\" notify_id %llu opcode %u\n", __func__, 2529b8d70035SAlex Elder rbd_dev->header_name, (unsigned long long) notify_id, 2530b8d70035SAlex Elder (unsigned int) opcode); 2531522a0cc0SAlex Elder (void)rbd_dev_refresh(rbd_dev, &hver); 2532b8d70035SAlex Elder 2533cf81b60eSAlex Elder rbd_obj_notify_ack(rbd_dev, hver, notify_id); 2534b8d70035SAlex Elder } 2535b8d70035SAlex Elder 25369969ebc5SAlex Elder /* 25379969ebc5SAlex Elder * Request sync osd watch/unwatch. The value of "start" determines 25389969ebc5SAlex Elder * whether a watch request is being initiated or torn down. 
25399969ebc5SAlex Elder */ 25409969ebc5SAlex Elder static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev, int start) 25419969ebc5SAlex Elder { 25429969ebc5SAlex Elder struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 25439969ebc5SAlex Elder struct rbd_obj_request *obj_request; 25449969ebc5SAlex Elder int ret; 25459969ebc5SAlex Elder 25469969ebc5SAlex Elder rbd_assert(start ^ !!rbd_dev->watch_event); 25479969ebc5SAlex Elder rbd_assert(start ^ !!rbd_dev->watch_request); 25489969ebc5SAlex Elder 25499969ebc5SAlex Elder if (start) { 25503c663bbdSAlex Elder ret = ceph_osdc_create_event(osdc, rbd_watch_cb, rbd_dev, 25519969ebc5SAlex Elder &rbd_dev->watch_event); 25529969ebc5SAlex Elder if (ret < 0) 25539969ebc5SAlex Elder return ret; 25548eb87565SAlex Elder rbd_assert(rbd_dev->watch_event != NULL); 25559969ebc5SAlex Elder } 25569969ebc5SAlex Elder 25579969ebc5SAlex Elder ret = -ENOMEM; 25589969ebc5SAlex Elder obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0, 25599969ebc5SAlex Elder OBJ_REQUEST_NODATA); 25609969ebc5SAlex Elder if (!obj_request) 25619969ebc5SAlex Elder goto out_cancel; 25629969ebc5SAlex Elder 2563430c28c3SAlex Elder obj_request->osd_req = rbd_osd_req_create(rbd_dev, true, obj_request); 2564430c28c3SAlex Elder if (!obj_request->osd_req) 2565430c28c3SAlex Elder goto out_cancel; 2566430c28c3SAlex Elder 25678eb87565SAlex Elder if (start) 2568975241afSAlex Elder ceph_osdc_set_request_linger(osdc, obj_request->osd_req); 25698eb87565SAlex Elder else 25706977c3f9SAlex Elder ceph_osdc_unregister_linger_request(osdc, 2571975241afSAlex Elder rbd_dev->watch_request->osd_req); 25722169238dSAlex Elder 25732169238dSAlex Elder osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_WATCH, 25742169238dSAlex Elder rbd_dev->watch_event->cookie, 25752169238dSAlex Elder rbd_dev->header.obj_version, start); 25769d4df01fSAlex Elder rbd_osd_req_format_write(obj_request); 25772169238dSAlex Elder 25789969ebc5SAlex Elder ret = 
rbd_obj_request_submit(osdc, obj_request); 25799969ebc5SAlex Elder if (ret) 25809969ebc5SAlex Elder goto out_cancel; 25819969ebc5SAlex Elder ret = rbd_obj_request_wait(obj_request); 25829969ebc5SAlex Elder if (ret) 25839969ebc5SAlex Elder goto out_cancel; 25849969ebc5SAlex Elder ret = obj_request->result; 25859969ebc5SAlex Elder if (ret) 25869969ebc5SAlex Elder goto out_cancel; 25879969ebc5SAlex Elder 25888eb87565SAlex Elder /* 25898eb87565SAlex Elder * A watch request is set to linger, so the underlying osd 25908eb87565SAlex Elder * request won't go away until we unregister it. We retain 25918eb87565SAlex Elder * a pointer to the object request during that time (in 25928eb87565SAlex Elder * rbd_dev->watch_request), so we'll keep a reference to 25938eb87565SAlex Elder * it. We'll drop that reference (below) after we've 25948eb87565SAlex Elder * unregistered it. 25958eb87565SAlex Elder */ 25968eb87565SAlex Elder if (start) { 25978eb87565SAlex Elder rbd_dev->watch_request = obj_request; 25988eb87565SAlex Elder 25998eb87565SAlex Elder return 0; 26008eb87565SAlex Elder } 26018eb87565SAlex Elder 26028eb87565SAlex Elder /* We have successfully torn down the watch request */ 26038eb87565SAlex Elder 26048eb87565SAlex Elder rbd_obj_request_put(rbd_dev->watch_request); 26058eb87565SAlex Elder rbd_dev->watch_request = NULL; 26069969ebc5SAlex Elder out_cancel: 26079969ebc5SAlex Elder /* Cancel the event if we're tearing down, or on error */ 26089969ebc5SAlex Elder ceph_osdc_cancel_event(rbd_dev->watch_event); 26099969ebc5SAlex Elder rbd_dev->watch_event = NULL; 26109969ebc5SAlex Elder if (obj_request) 26119969ebc5SAlex Elder rbd_obj_request_put(obj_request); 26129969ebc5SAlex Elder 26139969ebc5SAlex Elder return ret; 26149969ebc5SAlex Elder } 26159969ebc5SAlex Elder 261636be9a76SAlex Elder /* 261736be9a76SAlex Elder * Synchronous osd object method call 261836be9a76SAlex Elder */ 261936be9a76SAlex Elder static int rbd_obj_method_sync(struct rbd_device *rbd_dev, 
262036be9a76SAlex Elder const char *object_name, 262136be9a76SAlex Elder const char *class_name, 262236be9a76SAlex Elder const char *method_name, 26234157976bSAlex Elder const void *outbound, 262436be9a76SAlex Elder size_t outbound_size, 26254157976bSAlex Elder void *inbound, 262636be9a76SAlex Elder size_t inbound_size, 262736be9a76SAlex Elder u64 *version) 262836be9a76SAlex Elder { 26292169238dSAlex Elder struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 263036be9a76SAlex Elder struct rbd_obj_request *obj_request; 263136be9a76SAlex Elder struct page **pages; 263236be9a76SAlex Elder u32 page_count; 263336be9a76SAlex Elder int ret; 263436be9a76SAlex Elder 263536be9a76SAlex Elder /* 26366010a451SAlex Elder * Method calls are ultimately read operations. The result 26376010a451SAlex Elder * should placed into the inbound buffer provided. They 26386010a451SAlex Elder * also supply outbound data--parameters for the object 26396010a451SAlex Elder * method. Currently if this is present it will be a 26406010a451SAlex Elder * snapshot id. 
264136be9a76SAlex Elder */ 264236be9a76SAlex Elder page_count = (u32)calc_pages_for(0, inbound_size); 264336be9a76SAlex Elder pages = ceph_alloc_page_vector(page_count, GFP_KERNEL); 264436be9a76SAlex Elder if (IS_ERR(pages)) 264536be9a76SAlex Elder return PTR_ERR(pages); 264636be9a76SAlex Elder 264736be9a76SAlex Elder ret = -ENOMEM; 26486010a451SAlex Elder obj_request = rbd_obj_request_create(object_name, 0, inbound_size, 264936be9a76SAlex Elder OBJ_REQUEST_PAGES); 265036be9a76SAlex Elder if (!obj_request) 265136be9a76SAlex Elder goto out; 265236be9a76SAlex Elder 265336be9a76SAlex Elder obj_request->pages = pages; 265436be9a76SAlex Elder obj_request->page_count = page_count; 265536be9a76SAlex Elder 2656430c28c3SAlex Elder obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request); 265736be9a76SAlex Elder if (!obj_request->osd_req) 265836be9a76SAlex Elder goto out; 265936be9a76SAlex Elder 2660c99d2d4aSAlex Elder osd_req_op_cls_init(obj_request->osd_req, 0, CEPH_OSD_OP_CALL, 266104017e29SAlex Elder class_name, method_name); 266204017e29SAlex Elder if (outbound_size) { 266304017e29SAlex Elder struct ceph_pagelist *pagelist; 266404017e29SAlex Elder 266504017e29SAlex Elder pagelist = kmalloc(sizeof (*pagelist), GFP_NOFS); 266604017e29SAlex Elder if (!pagelist) 266704017e29SAlex Elder goto out; 266804017e29SAlex Elder 266904017e29SAlex Elder ceph_pagelist_init(pagelist); 267004017e29SAlex Elder ceph_pagelist_append(pagelist, outbound, outbound_size); 267104017e29SAlex Elder osd_req_op_cls_request_data_pagelist(obj_request->osd_req, 0, 267204017e29SAlex Elder pagelist); 267304017e29SAlex Elder } 2674a4ce40a9SAlex Elder osd_req_op_cls_response_data_pages(obj_request->osd_req, 0, 2675a4ce40a9SAlex Elder obj_request->pages, inbound_size, 267644cd188dSAlex Elder 0, false, false); 26779d4df01fSAlex Elder rbd_osd_req_format_read(obj_request); 2678430c28c3SAlex Elder 267936be9a76SAlex Elder ret = rbd_obj_request_submit(osdc, obj_request); 268036be9a76SAlex Elder if 
(ret) 268136be9a76SAlex Elder goto out; 268236be9a76SAlex Elder ret = rbd_obj_request_wait(obj_request); 268336be9a76SAlex Elder if (ret) 268436be9a76SAlex Elder goto out; 268536be9a76SAlex Elder 268636be9a76SAlex Elder ret = obj_request->result; 268736be9a76SAlex Elder if (ret < 0) 268836be9a76SAlex Elder goto out; 268957385b51SAlex Elder 269057385b51SAlex Elder rbd_assert(obj_request->xferred < (u64)INT_MAX); 269157385b51SAlex Elder ret = (int)obj_request->xferred; 2692903bb32eSAlex Elder ceph_copy_from_page_vector(pages, inbound, 0, obj_request->xferred); 269336be9a76SAlex Elder if (version) 269436be9a76SAlex Elder *version = obj_request->version; 269536be9a76SAlex Elder out: 269636be9a76SAlex Elder if (obj_request) 269736be9a76SAlex Elder rbd_obj_request_put(obj_request); 269836be9a76SAlex Elder else 269936be9a76SAlex Elder ceph_release_page_vector(pages, page_count); 270036be9a76SAlex Elder 270136be9a76SAlex Elder return ret; 270236be9a76SAlex Elder } 270336be9a76SAlex Elder 2704bf0d5f50SAlex Elder static void rbd_request_fn(struct request_queue *q) 2705cc344fa1SAlex Elder __releases(q->queue_lock) __acquires(q->queue_lock) 2706bf0d5f50SAlex Elder { 2707bf0d5f50SAlex Elder struct rbd_device *rbd_dev = q->queuedata; 2708bf0d5f50SAlex Elder bool read_only = rbd_dev->mapping.read_only; 2709bf0d5f50SAlex Elder struct request *rq; 2710bf0d5f50SAlex Elder int result; 2711bf0d5f50SAlex Elder 2712bf0d5f50SAlex Elder while ((rq = blk_fetch_request(q))) { 2713bf0d5f50SAlex Elder bool write_request = rq_data_dir(rq) == WRITE; 2714bf0d5f50SAlex Elder struct rbd_img_request *img_request; 2715bf0d5f50SAlex Elder u64 offset; 2716bf0d5f50SAlex Elder u64 length; 2717bf0d5f50SAlex Elder 2718bf0d5f50SAlex Elder /* Ignore any non-FS requests that filter through. 
*/ 2719bf0d5f50SAlex Elder 2720bf0d5f50SAlex Elder if (rq->cmd_type != REQ_TYPE_FS) { 27214dda41d3SAlex Elder dout("%s: non-fs request type %d\n", __func__, 27224dda41d3SAlex Elder (int) rq->cmd_type); 27234dda41d3SAlex Elder __blk_end_request_all(rq, 0); 27244dda41d3SAlex Elder continue; 27254dda41d3SAlex Elder } 27264dda41d3SAlex Elder 27274dda41d3SAlex Elder /* Ignore/skip any zero-length requests */ 27284dda41d3SAlex Elder 27294dda41d3SAlex Elder offset = (u64) blk_rq_pos(rq) << SECTOR_SHIFT; 27304dda41d3SAlex Elder length = (u64) blk_rq_bytes(rq); 27314dda41d3SAlex Elder 27324dda41d3SAlex Elder if (!length) { 27334dda41d3SAlex Elder dout("%s: zero-length request\n", __func__); 2734bf0d5f50SAlex Elder __blk_end_request_all(rq, 0); 2735bf0d5f50SAlex Elder continue; 2736bf0d5f50SAlex Elder } 2737bf0d5f50SAlex Elder 2738bf0d5f50SAlex Elder spin_unlock_irq(q->queue_lock); 2739bf0d5f50SAlex Elder 2740bf0d5f50SAlex Elder /* Disallow writes to a read-only device */ 2741bf0d5f50SAlex Elder 2742bf0d5f50SAlex Elder if (write_request) { 2743bf0d5f50SAlex Elder result = -EROFS; 2744bf0d5f50SAlex Elder if (read_only) 2745bf0d5f50SAlex Elder goto end_request; 2746bf0d5f50SAlex Elder rbd_assert(rbd_dev->spec->snap_id == CEPH_NOSNAP); 2747bf0d5f50SAlex Elder } 2748bf0d5f50SAlex Elder 27496d292906SAlex Elder /* 27506d292906SAlex Elder * Quit early if the mapped snapshot no longer 27516d292906SAlex Elder * exists. It's still possible the snapshot will 27526d292906SAlex Elder * have disappeared by the time our request arrives 27536d292906SAlex Elder * at the osd, but there's no sense in sending it if 27546d292906SAlex Elder * we already know. 
27556d292906SAlex Elder */ 27566d292906SAlex Elder if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) { 2757bf0d5f50SAlex Elder dout("request for non-existent snapshot"); 2758bf0d5f50SAlex Elder rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP); 2759bf0d5f50SAlex Elder result = -ENXIO; 2760bf0d5f50SAlex Elder goto end_request; 2761bf0d5f50SAlex Elder } 2762bf0d5f50SAlex Elder 2763bf0d5f50SAlex Elder result = -EINVAL; 2764bf0d5f50SAlex Elder if (WARN_ON(offset && length > U64_MAX - offset + 1)) 2765bf0d5f50SAlex Elder goto end_request; /* Shouldn't happen */ 2766bf0d5f50SAlex Elder 2767bf0d5f50SAlex Elder result = -ENOMEM; 2768bf0d5f50SAlex Elder img_request = rbd_img_request_create(rbd_dev, offset, length, 27699849e986SAlex Elder write_request, false); 2770bf0d5f50SAlex Elder if (!img_request) 2771bf0d5f50SAlex Elder goto end_request; 2772bf0d5f50SAlex Elder 2773bf0d5f50SAlex Elder img_request->rq = rq; 2774bf0d5f50SAlex Elder 2775f1a4739fSAlex Elder result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO, 2776f1a4739fSAlex Elder rq->bio); 2777bf0d5f50SAlex Elder if (!result) 2778bf0d5f50SAlex Elder result = rbd_img_request_submit(img_request); 2779bf0d5f50SAlex Elder if (result) 2780bf0d5f50SAlex Elder rbd_img_request_put(img_request); 2781bf0d5f50SAlex Elder end_request: 2782bf0d5f50SAlex Elder spin_lock_irq(q->queue_lock); 2783bf0d5f50SAlex Elder if (result < 0) { 27847da22d29SAlex Elder rbd_warn(rbd_dev, "%s %llx at %llx result %d\n", 27857da22d29SAlex Elder write_request ? "write" : "read", 27867da22d29SAlex Elder length, offset, result); 27877da22d29SAlex Elder 2788bf0d5f50SAlex Elder __blk_end_request_all(rq, result); 2789bf0d5f50SAlex Elder } 2790bf0d5f50SAlex Elder } 2791bf0d5f50SAlex Elder } 2792bf0d5f50SAlex Elder 2793602adf40SYehuda Sadeh /* 2794602adf40SYehuda Sadeh * a queue callback. Makes sure that we don't create a bio that spans across 2795602adf40SYehuda Sadeh * multiple osd objects. 
One exception would be with a single page bios, 2796f7760dadSAlex Elder * which we handle later at bio_chain_clone_range() 2797602adf40SYehuda Sadeh */ 2798602adf40SYehuda Sadeh static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd, 2799602adf40SYehuda Sadeh struct bio_vec *bvec) 2800602adf40SYehuda Sadeh { 2801602adf40SYehuda Sadeh struct rbd_device *rbd_dev = q->queuedata; 2802e5cfeed2SAlex Elder sector_t sector_offset; 2803e5cfeed2SAlex Elder sector_t sectors_per_obj; 2804e5cfeed2SAlex Elder sector_t obj_sector_offset; 2805e5cfeed2SAlex Elder int ret; 2806602adf40SYehuda Sadeh 2807e5cfeed2SAlex Elder /* 2808e5cfeed2SAlex Elder * Find how far into its rbd object the partition-relative 2809e5cfeed2SAlex Elder * bio start sector is to offset relative to the enclosing 2810e5cfeed2SAlex Elder * device. 2811e5cfeed2SAlex Elder */ 2812e5cfeed2SAlex Elder sector_offset = get_start_sect(bmd->bi_bdev) + bmd->bi_sector; 2813e5cfeed2SAlex Elder sectors_per_obj = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT); 2814e5cfeed2SAlex Elder obj_sector_offset = sector_offset & (sectors_per_obj - 1); 2815593a9e7bSAlex Elder 2816e5cfeed2SAlex Elder /* 2817e5cfeed2SAlex Elder * Compute the number of bytes from that offset to the end 2818e5cfeed2SAlex Elder * of the object. Account for what's already used by the bio. 2819e5cfeed2SAlex Elder */ 2820e5cfeed2SAlex Elder ret = (int) (sectors_per_obj - obj_sector_offset) << SECTOR_SHIFT; 2821e5cfeed2SAlex Elder if (ret > bmd->bi_size) 2822e5cfeed2SAlex Elder ret -= bmd->bi_size; 2823e5cfeed2SAlex Elder else 2824e5cfeed2SAlex Elder ret = 0; 2825e5cfeed2SAlex Elder 2826e5cfeed2SAlex Elder /* 2827e5cfeed2SAlex Elder * Don't send back more than was asked for. And if the bio 2828e5cfeed2SAlex Elder * was empty, let the whole thing through because: "Note 2829e5cfeed2SAlex Elder * that a block device *must* allow a single page to be 2830e5cfeed2SAlex Elder * added to an empty bio." 
2831e5cfeed2SAlex Elder */ 2832e5cfeed2SAlex Elder rbd_assert(bvec->bv_len <= PAGE_SIZE); 2833e5cfeed2SAlex Elder if (ret > (int) bvec->bv_len || !bmd->bi_size) 2834e5cfeed2SAlex Elder ret = (int) bvec->bv_len; 2835e5cfeed2SAlex Elder 2836e5cfeed2SAlex Elder return ret; 2837602adf40SYehuda Sadeh } 2838602adf40SYehuda Sadeh 2839602adf40SYehuda Sadeh static void rbd_free_disk(struct rbd_device *rbd_dev) 2840602adf40SYehuda Sadeh { 2841602adf40SYehuda Sadeh struct gendisk *disk = rbd_dev->disk; 2842602adf40SYehuda Sadeh 2843602adf40SYehuda Sadeh if (!disk) 2844602adf40SYehuda Sadeh return; 2845602adf40SYehuda Sadeh 2846602adf40SYehuda Sadeh if (disk->flags & GENHD_FL_UP) 2847602adf40SYehuda Sadeh del_gendisk(disk); 2848602adf40SYehuda Sadeh if (disk->queue) 2849602adf40SYehuda Sadeh blk_cleanup_queue(disk->queue); 2850602adf40SYehuda Sadeh put_disk(disk); 2851602adf40SYehuda Sadeh } 2852602adf40SYehuda Sadeh 2853788e2df3SAlex Elder static int rbd_obj_read_sync(struct rbd_device *rbd_dev, 2854788e2df3SAlex Elder const char *object_name, 2855788e2df3SAlex Elder u64 offset, u64 length, 285680ef15bfSAlex Elder void *buf, u64 *version) 2857788e2df3SAlex Elder 2858788e2df3SAlex Elder { 28592169238dSAlex Elder struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 2860788e2df3SAlex Elder struct rbd_obj_request *obj_request; 2861788e2df3SAlex Elder struct page **pages = NULL; 2862788e2df3SAlex Elder u32 page_count; 28631ceae7efSAlex Elder size_t size; 2864788e2df3SAlex Elder int ret; 2865788e2df3SAlex Elder 2866788e2df3SAlex Elder page_count = (u32) calc_pages_for(offset, length); 2867788e2df3SAlex Elder pages = ceph_alloc_page_vector(page_count, GFP_KERNEL); 2868788e2df3SAlex Elder if (IS_ERR(pages)) 2869788e2df3SAlex Elder ret = PTR_ERR(pages); 2870788e2df3SAlex Elder 2871788e2df3SAlex Elder ret = -ENOMEM; 2872788e2df3SAlex Elder obj_request = rbd_obj_request_create(object_name, offset, length, 2873788e2df3SAlex Elder OBJ_REQUEST_PAGES); 2874788e2df3SAlex Elder 
if (!obj_request) 2875788e2df3SAlex Elder goto out; 2876788e2df3SAlex Elder 2877788e2df3SAlex Elder obj_request->pages = pages; 2878788e2df3SAlex Elder obj_request->page_count = page_count; 2879788e2df3SAlex Elder 2880430c28c3SAlex Elder obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request); 2881788e2df3SAlex Elder if (!obj_request->osd_req) 2882788e2df3SAlex Elder goto out; 2883788e2df3SAlex Elder 2884c99d2d4aSAlex Elder osd_req_op_extent_init(obj_request->osd_req, 0, CEPH_OSD_OP_READ, 2885c99d2d4aSAlex Elder offset, length, 0, 0); 2886406e2c9fSAlex Elder osd_req_op_extent_osd_data_pages(obj_request->osd_req, 0, 2887a4ce40a9SAlex Elder obj_request->pages, 288844cd188dSAlex Elder obj_request->length, 288944cd188dSAlex Elder obj_request->offset & ~PAGE_MASK, 289044cd188dSAlex Elder false, false); 28919d4df01fSAlex Elder rbd_osd_req_format_read(obj_request); 2892430c28c3SAlex Elder 2893788e2df3SAlex Elder ret = rbd_obj_request_submit(osdc, obj_request); 2894788e2df3SAlex Elder if (ret) 2895788e2df3SAlex Elder goto out; 2896788e2df3SAlex Elder ret = rbd_obj_request_wait(obj_request); 2897788e2df3SAlex Elder if (ret) 2898788e2df3SAlex Elder goto out; 2899788e2df3SAlex Elder 2900788e2df3SAlex Elder ret = obj_request->result; 2901788e2df3SAlex Elder if (ret < 0) 2902788e2df3SAlex Elder goto out; 29031ceae7efSAlex Elder 29041ceae7efSAlex Elder rbd_assert(obj_request->xferred <= (u64) SIZE_MAX); 29051ceae7efSAlex Elder size = (size_t) obj_request->xferred; 2906903bb32eSAlex Elder ceph_copy_from_page_vector(pages, buf, 0, size); 290723ed6e13SAlex Elder rbd_assert(size <= (size_t) INT_MAX); 290823ed6e13SAlex Elder ret = (int) size; 2909788e2df3SAlex Elder if (version) 2910788e2df3SAlex Elder *version = obj_request->version; 2911788e2df3SAlex Elder out: 2912788e2df3SAlex Elder if (obj_request) 2913788e2df3SAlex Elder rbd_obj_request_put(obj_request); 2914788e2df3SAlex Elder else 2915788e2df3SAlex Elder ceph_release_page_vector(pages, page_count); 
2916788e2df3SAlex Elder 2917788e2df3SAlex Elder return ret; 2918788e2df3SAlex Elder } 2919788e2df3SAlex Elder 2920602adf40SYehuda Sadeh /* 29214156d998SAlex Elder * Read the complete header for the given rbd device. 29224156d998SAlex Elder * 29234156d998SAlex Elder * Returns a pointer to a dynamically-allocated buffer containing 29244156d998SAlex Elder * the complete and validated header. Caller can pass the address 29254156d998SAlex Elder * of a variable that will be filled in with the version of the 29264156d998SAlex Elder * header object at the time it was read. 29274156d998SAlex Elder * 29284156d998SAlex Elder * Returns a pointer-coded errno if a failure occurs. 29294156d998SAlex Elder */ 29304156d998SAlex Elder static struct rbd_image_header_ondisk * 29314156d998SAlex Elder rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version) 29324156d998SAlex Elder { 29334156d998SAlex Elder struct rbd_image_header_ondisk *ondisk = NULL; 29344156d998SAlex Elder u32 snap_count = 0; 29354156d998SAlex Elder u64 names_size = 0; 29364156d998SAlex Elder u32 want_count; 29374156d998SAlex Elder int ret; 29384156d998SAlex Elder 29394156d998SAlex Elder /* 29404156d998SAlex Elder * The complete header will include an array of its 64-bit 29414156d998SAlex Elder * snapshot ids, followed by the names of those snapshots as 29424156d998SAlex Elder * a contiguous block of NUL-terminated strings. Note that 29434156d998SAlex Elder * the number of snapshots could change by the time we read 29444156d998SAlex Elder * it in, in which case we re-read it. 
29454156d998SAlex Elder */ 29464156d998SAlex Elder do { 29474156d998SAlex Elder size_t size; 29484156d998SAlex Elder 29494156d998SAlex Elder kfree(ondisk); 29504156d998SAlex Elder 29514156d998SAlex Elder size = sizeof (*ondisk); 29524156d998SAlex Elder size += snap_count * sizeof (struct rbd_image_snap_ondisk); 29534156d998SAlex Elder size += names_size; 29544156d998SAlex Elder ondisk = kmalloc(size, GFP_KERNEL); 29554156d998SAlex Elder if (!ondisk) 29564156d998SAlex Elder return ERR_PTR(-ENOMEM); 29574156d998SAlex Elder 2958788e2df3SAlex Elder ret = rbd_obj_read_sync(rbd_dev, rbd_dev->header_name, 295980ef15bfSAlex Elder 0, size, ondisk, version); 29604156d998SAlex Elder if (ret < 0) 29614156d998SAlex Elder goto out_err; 29624156d998SAlex Elder if (WARN_ON((size_t) ret < size)) { 29634156d998SAlex Elder ret = -ENXIO; 296406ecc6cbSAlex Elder rbd_warn(rbd_dev, "short header read (want %zd got %d)", 296506ecc6cbSAlex Elder size, ret); 29664156d998SAlex Elder goto out_err; 29674156d998SAlex Elder } 29684156d998SAlex Elder if (!rbd_dev_ondisk_valid(ondisk)) { 29694156d998SAlex Elder ret = -ENXIO; 297006ecc6cbSAlex Elder rbd_warn(rbd_dev, "invalid header"); 29714156d998SAlex Elder goto out_err; 29724156d998SAlex Elder } 29734156d998SAlex Elder 29744156d998SAlex Elder names_size = le64_to_cpu(ondisk->snap_names_len); 29754156d998SAlex Elder want_count = snap_count; 29764156d998SAlex Elder snap_count = le32_to_cpu(ondisk->snap_count); 29774156d998SAlex Elder } while (snap_count != want_count); 29784156d998SAlex Elder 29794156d998SAlex Elder return ondisk; 29804156d998SAlex Elder 29814156d998SAlex Elder out_err: 29824156d998SAlex Elder kfree(ondisk); 29834156d998SAlex Elder 29844156d998SAlex Elder return ERR_PTR(ret); 29854156d998SAlex Elder } 29864156d998SAlex Elder 29874156d998SAlex Elder /* 2988602adf40SYehuda Sadeh * reload the ondisk the header 2989602adf40SYehuda Sadeh */ 2990602adf40SYehuda Sadeh static int rbd_read_header(struct rbd_device *rbd_dev, 
2991602adf40SYehuda Sadeh struct rbd_image_header *header) 2992602adf40SYehuda Sadeh { 29934156d998SAlex Elder struct rbd_image_header_ondisk *ondisk; 29944156d998SAlex Elder u64 ver = 0; 29954156d998SAlex Elder int ret; 2996602adf40SYehuda Sadeh 29974156d998SAlex Elder ondisk = rbd_dev_v1_header_read(rbd_dev, &ver); 29984156d998SAlex Elder if (IS_ERR(ondisk)) 29994156d998SAlex Elder return PTR_ERR(ondisk); 30004156d998SAlex Elder ret = rbd_header_from_disk(header, ondisk); 30014156d998SAlex Elder if (ret >= 0) 300259c2be1eSYehuda Sadeh header->obj_version = ver; 30034156d998SAlex Elder kfree(ondisk); 3004602adf40SYehuda Sadeh 30054156d998SAlex Elder return ret; 3006602adf40SYehuda Sadeh } 3007602adf40SYehuda Sadeh 300841f38c2bSAlex Elder static void rbd_remove_all_snaps(struct rbd_device *rbd_dev) 3009dfc5606dSYehuda Sadeh { 3010dfc5606dSYehuda Sadeh struct rbd_snap *snap; 3011a0593290SAlex Elder struct rbd_snap *next; 3012dfc5606dSYehuda Sadeh 30136087b51bSAlex Elder list_for_each_entry_safe(snap, next, &rbd_dev->snaps, node) { 30146087b51bSAlex Elder list_del(&snap->node); 30156087b51bSAlex Elder rbd_snap_destroy(snap); 30166087b51bSAlex Elder } 3017dfc5606dSYehuda Sadeh } 3018dfc5606dSYehuda Sadeh 30199478554aSAlex Elder static void rbd_update_mapping_size(struct rbd_device *rbd_dev) 30209478554aSAlex Elder { 30219478554aSAlex Elder sector_t size; 30229478554aSAlex Elder 30230d7dbfceSAlex Elder if (rbd_dev->spec->snap_id != CEPH_NOSNAP) 30249478554aSAlex Elder return; 30259478554aSAlex Elder 30269478554aSAlex Elder size = (sector_t) rbd_dev->header.image_size / SECTOR_SIZE; 30279478554aSAlex Elder dout("setting size to %llu sectors", (unsigned long long) size); 30289478554aSAlex Elder rbd_dev->mapping.size = (u64) size; 30299478554aSAlex Elder set_capacity(rbd_dev->disk, size); 30309478554aSAlex Elder } 30319478554aSAlex Elder 3032602adf40SYehuda Sadeh /* 3033602adf40SYehuda Sadeh * only read the first part of the ondisk header, without the snaps info 
3034602adf40SYehuda Sadeh */ 3035117973fbSAlex Elder static int rbd_dev_v1_refresh(struct rbd_device *rbd_dev, u64 *hver) 3036602adf40SYehuda Sadeh { 3037602adf40SYehuda Sadeh int ret; 3038602adf40SYehuda Sadeh struct rbd_image_header h; 3039602adf40SYehuda Sadeh 3040602adf40SYehuda Sadeh ret = rbd_read_header(rbd_dev, &h); 3041602adf40SYehuda Sadeh if (ret < 0) 3042602adf40SYehuda Sadeh return ret; 3043602adf40SYehuda Sadeh 3044a51aa0c0SJosh Durgin down_write(&rbd_dev->header_rwsem); 3045a51aa0c0SJosh Durgin 30469478554aSAlex Elder /* Update image size, and check for resize of mapped image */ 30479478554aSAlex Elder rbd_dev->header.image_size = h.image_size; 30489478554aSAlex Elder rbd_update_mapping_size(rbd_dev); 30499db4b3e3SSage Weil 3050849b4260SAlex Elder /* rbd_dev->header.object_prefix shouldn't change */ 3051602adf40SYehuda Sadeh kfree(rbd_dev->header.snap_sizes); 3052849b4260SAlex Elder kfree(rbd_dev->header.snap_names); 3053d1d25646SJosh Durgin /* osd requests may still refer to snapc */ 3054d1d25646SJosh Durgin ceph_put_snap_context(rbd_dev->header.snapc); 3055602adf40SYehuda Sadeh 3056b813623aSAlex Elder if (hver) 3057b813623aSAlex Elder *hver = h.obj_version; 3058a71b891bSJosh Durgin rbd_dev->header.obj_version = h.obj_version; 305993a24e08SJosh Durgin rbd_dev->header.image_size = h.image_size; 3060602adf40SYehuda Sadeh rbd_dev->header.snapc = h.snapc; 3061602adf40SYehuda Sadeh rbd_dev->header.snap_names = h.snap_names; 3062602adf40SYehuda Sadeh rbd_dev->header.snap_sizes = h.snap_sizes; 3063849b4260SAlex Elder /* Free the extra copy of the object prefix */ 3064849b4260SAlex Elder WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix)); 3065849b4260SAlex Elder kfree(h.object_prefix); 3066849b4260SAlex Elder 3067304f6808SAlex Elder ret = rbd_dev_snaps_update(rbd_dev); 3068dfc5606dSYehuda Sadeh 3069c666601aSJosh Durgin up_write(&rbd_dev->header_rwsem); 3070602adf40SYehuda Sadeh 3071dfc5606dSYehuda Sadeh return ret; 3072602adf40SYehuda Sadeh } 
3073602adf40SYehuda Sadeh 3074117973fbSAlex Elder static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver) 30751fe5e993SAlex Elder { 30761fe5e993SAlex Elder int ret; 30771fe5e993SAlex Elder 3078117973fbSAlex Elder rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); 30791fe5e993SAlex Elder mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); 3080117973fbSAlex Elder if (rbd_dev->image_format == 1) 3081117973fbSAlex Elder ret = rbd_dev_v1_refresh(rbd_dev, hver); 3082117973fbSAlex Elder else 3083117973fbSAlex Elder ret = rbd_dev_v2_refresh(rbd_dev, hver); 30841fe5e993SAlex Elder mutex_unlock(&ctl_mutex); 3085d98df63eSLaurent Barbe revalidate_disk(rbd_dev->disk); 3086522a0cc0SAlex Elder if (ret) 3087522a0cc0SAlex Elder rbd_warn(rbd_dev, "got notification but failed to " 3088522a0cc0SAlex Elder " update snaps: %d\n", ret); 30891fe5e993SAlex Elder 30901fe5e993SAlex Elder return ret; 30911fe5e993SAlex Elder } 30921fe5e993SAlex Elder 3093602adf40SYehuda Sadeh static int rbd_init_disk(struct rbd_device *rbd_dev) 3094602adf40SYehuda Sadeh { 3095602adf40SYehuda Sadeh struct gendisk *disk; 3096602adf40SYehuda Sadeh struct request_queue *q; 3097593a9e7bSAlex Elder u64 segment_size; 3098602adf40SYehuda Sadeh 3099602adf40SYehuda Sadeh /* create gendisk info */ 3100602adf40SYehuda Sadeh disk = alloc_disk(RBD_MINORS_PER_MAJOR); 3101602adf40SYehuda Sadeh if (!disk) 31021fcdb8aaSAlex Elder return -ENOMEM; 3103602adf40SYehuda Sadeh 3104f0f8cef5SAlex Elder snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d", 3105de71a297SAlex Elder rbd_dev->dev_id); 3106602adf40SYehuda Sadeh disk->major = rbd_dev->major; 3107602adf40SYehuda Sadeh disk->first_minor = 0; 3108602adf40SYehuda Sadeh disk->fops = &rbd_bd_ops; 3109602adf40SYehuda Sadeh disk->private_data = rbd_dev; 3110602adf40SYehuda Sadeh 3111bf0d5f50SAlex Elder q = blk_init_queue(rbd_request_fn, &rbd_dev->lock); 3112602adf40SYehuda Sadeh if (!q) 3113602adf40SYehuda Sadeh goto out_disk; 3114029bcbd8SJosh 
Durgin 3115593a9e7bSAlex Elder /* We use the default size, but let's be explicit about it. */ 3116593a9e7bSAlex Elder blk_queue_physical_block_size(q, SECTOR_SIZE); 3117593a9e7bSAlex Elder 3118029bcbd8SJosh Durgin /* set io sizes to object size */ 3119593a9e7bSAlex Elder segment_size = rbd_obj_bytes(&rbd_dev->header); 3120593a9e7bSAlex Elder blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE); 3121593a9e7bSAlex Elder blk_queue_max_segment_size(q, segment_size); 3122593a9e7bSAlex Elder blk_queue_io_min(q, segment_size); 3123593a9e7bSAlex Elder blk_queue_io_opt(q, segment_size); 3124029bcbd8SJosh Durgin 3125602adf40SYehuda Sadeh blk_queue_merge_bvec(q, rbd_merge_bvec); 3126602adf40SYehuda Sadeh disk->queue = q; 3127602adf40SYehuda Sadeh 3128602adf40SYehuda Sadeh q->queuedata = rbd_dev; 3129602adf40SYehuda Sadeh 3130602adf40SYehuda Sadeh rbd_dev->disk = disk; 3131602adf40SYehuda Sadeh 313212f02944SAlex Elder set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE); 313312f02944SAlex Elder 3134602adf40SYehuda Sadeh return 0; 3135602adf40SYehuda Sadeh out_disk: 3136602adf40SYehuda Sadeh put_disk(disk); 31371fcdb8aaSAlex Elder 31381fcdb8aaSAlex Elder return -ENOMEM; 3139602adf40SYehuda Sadeh } 3140602adf40SYehuda Sadeh 3141dfc5606dSYehuda Sadeh /* 3142dfc5606dSYehuda Sadeh sysfs 3143dfc5606dSYehuda Sadeh */ 3144602adf40SYehuda Sadeh 3145593a9e7bSAlex Elder static struct rbd_device *dev_to_rbd_dev(struct device *dev) 3146593a9e7bSAlex Elder { 3147593a9e7bSAlex Elder return container_of(dev, struct rbd_device, dev); 3148593a9e7bSAlex Elder } 3149593a9e7bSAlex Elder 3150dfc5606dSYehuda Sadeh static ssize_t rbd_size_show(struct device *dev, 3151dfc5606dSYehuda Sadeh struct device_attribute *attr, char *buf) 3152602adf40SYehuda Sadeh { 3153593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 3154a51aa0c0SJosh Durgin sector_t size; 3155dfc5606dSYehuda Sadeh 3156a51aa0c0SJosh Durgin down_read(&rbd_dev->header_rwsem); 3157a51aa0c0SJosh Durgin size = 
get_capacity(rbd_dev->disk); 3158a51aa0c0SJosh Durgin up_read(&rbd_dev->header_rwsem); 3159a51aa0c0SJosh Durgin 3160a51aa0c0SJosh Durgin return sprintf(buf, "%llu\n", (unsigned long long) size * SECTOR_SIZE); 3161602adf40SYehuda Sadeh } 3162602adf40SYehuda Sadeh 316334b13184SAlex Elder /* 316434b13184SAlex Elder * Note this shows the features for whatever's mapped, which is not 316534b13184SAlex Elder * necessarily the base image. 316634b13184SAlex Elder */ 316734b13184SAlex Elder static ssize_t rbd_features_show(struct device *dev, 316834b13184SAlex Elder struct device_attribute *attr, char *buf) 316934b13184SAlex Elder { 317034b13184SAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 317134b13184SAlex Elder 317234b13184SAlex Elder return sprintf(buf, "0x%016llx\n", 317334b13184SAlex Elder (unsigned long long) rbd_dev->mapping.features); 317434b13184SAlex Elder } 317534b13184SAlex Elder 3176dfc5606dSYehuda Sadeh static ssize_t rbd_major_show(struct device *dev, 3177dfc5606dSYehuda Sadeh struct device_attribute *attr, char *buf) 3178602adf40SYehuda Sadeh { 3179593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 3180dfc5606dSYehuda Sadeh 3181dfc5606dSYehuda Sadeh return sprintf(buf, "%d\n", rbd_dev->major); 3182dfc5606dSYehuda Sadeh } 3183dfc5606dSYehuda Sadeh 3184dfc5606dSYehuda Sadeh static ssize_t rbd_client_id_show(struct device *dev, 3185dfc5606dSYehuda Sadeh struct device_attribute *attr, char *buf) 3186dfc5606dSYehuda Sadeh { 3187593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 3188dfc5606dSYehuda Sadeh 31891dbb4399SAlex Elder return sprintf(buf, "client%lld\n", 31901dbb4399SAlex Elder ceph_client_id(rbd_dev->rbd_client->client)); 3191dfc5606dSYehuda Sadeh } 3192dfc5606dSYehuda Sadeh 3193dfc5606dSYehuda Sadeh static ssize_t rbd_pool_show(struct device *dev, 3194dfc5606dSYehuda Sadeh struct device_attribute *attr, char *buf) 3195dfc5606dSYehuda Sadeh { 3196593a9e7bSAlex Elder struct rbd_device *rbd_dev = 
dev_to_rbd_dev(dev); 3197dfc5606dSYehuda Sadeh 31980d7dbfceSAlex Elder return sprintf(buf, "%s\n", rbd_dev->spec->pool_name); 3199dfc5606dSYehuda Sadeh } 3200dfc5606dSYehuda Sadeh 32019bb2f334SAlex Elder static ssize_t rbd_pool_id_show(struct device *dev, 32029bb2f334SAlex Elder struct device_attribute *attr, char *buf) 32039bb2f334SAlex Elder { 32049bb2f334SAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 32059bb2f334SAlex Elder 32060d7dbfceSAlex Elder return sprintf(buf, "%llu\n", 32070d7dbfceSAlex Elder (unsigned long long) rbd_dev->spec->pool_id); 32089bb2f334SAlex Elder } 32099bb2f334SAlex Elder 3210dfc5606dSYehuda Sadeh static ssize_t rbd_name_show(struct device *dev, 3211dfc5606dSYehuda Sadeh struct device_attribute *attr, char *buf) 3212dfc5606dSYehuda Sadeh { 3213593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 3214dfc5606dSYehuda Sadeh 3215a92ffdf8SAlex Elder if (rbd_dev->spec->image_name) 32160d7dbfceSAlex Elder return sprintf(buf, "%s\n", rbd_dev->spec->image_name); 3217a92ffdf8SAlex Elder 3218a92ffdf8SAlex Elder return sprintf(buf, "(unknown)\n"); 3219dfc5606dSYehuda Sadeh } 3220dfc5606dSYehuda Sadeh 3221589d30e0SAlex Elder static ssize_t rbd_image_id_show(struct device *dev, 3222589d30e0SAlex Elder struct device_attribute *attr, char *buf) 3223589d30e0SAlex Elder { 3224589d30e0SAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 3225589d30e0SAlex Elder 32260d7dbfceSAlex Elder return sprintf(buf, "%s\n", rbd_dev->spec->image_id); 3227589d30e0SAlex Elder } 3228589d30e0SAlex Elder 322934b13184SAlex Elder /* 323034b13184SAlex Elder * Shows the name of the currently-mapped snapshot (or 323134b13184SAlex Elder * RBD_SNAP_HEAD_NAME for the base image). 
323234b13184SAlex Elder */ 3233dfc5606dSYehuda Sadeh static ssize_t rbd_snap_show(struct device *dev, 3234dfc5606dSYehuda Sadeh struct device_attribute *attr, 3235dfc5606dSYehuda Sadeh char *buf) 3236dfc5606dSYehuda Sadeh { 3237593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 3238dfc5606dSYehuda Sadeh 32390d7dbfceSAlex Elder return sprintf(buf, "%s\n", rbd_dev->spec->snap_name); 3240dfc5606dSYehuda Sadeh } 3241dfc5606dSYehuda Sadeh 324286b00e0dSAlex Elder /* 324386b00e0dSAlex Elder * For an rbd v2 image, shows the pool id, image id, and snapshot id 324486b00e0dSAlex Elder * for the parent image. If there is no parent, simply shows 324586b00e0dSAlex Elder * "(no parent image)". 324686b00e0dSAlex Elder */ 324786b00e0dSAlex Elder static ssize_t rbd_parent_show(struct device *dev, 324886b00e0dSAlex Elder struct device_attribute *attr, 324986b00e0dSAlex Elder char *buf) 325086b00e0dSAlex Elder { 325186b00e0dSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 325286b00e0dSAlex Elder struct rbd_spec *spec = rbd_dev->parent_spec; 325386b00e0dSAlex Elder int count; 325486b00e0dSAlex Elder char *bufp = buf; 325586b00e0dSAlex Elder 325686b00e0dSAlex Elder if (!spec) 325786b00e0dSAlex Elder return sprintf(buf, "(no parent image)\n"); 325886b00e0dSAlex Elder 325986b00e0dSAlex Elder count = sprintf(bufp, "pool_id %llu\npool_name %s\n", 326086b00e0dSAlex Elder (unsigned long long) spec->pool_id, spec->pool_name); 326186b00e0dSAlex Elder if (count < 0) 326286b00e0dSAlex Elder return count; 326386b00e0dSAlex Elder bufp += count; 326486b00e0dSAlex Elder 326586b00e0dSAlex Elder count = sprintf(bufp, "image_id %s\nimage_name %s\n", spec->image_id, 326686b00e0dSAlex Elder spec->image_name ? 
spec->image_name : "(unknown)"); 326786b00e0dSAlex Elder if (count < 0) 326886b00e0dSAlex Elder return count; 326986b00e0dSAlex Elder bufp += count; 327086b00e0dSAlex Elder 327186b00e0dSAlex Elder count = sprintf(bufp, "snap_id %llu\nsnap_name %s\n", 327286b00e0dSAlex Elder (unsigned long long) spec->snap_id, spec->snap_name); 327386b00e0dSAlex Elder if (count < 0) 327486b00e0dSAlex Elder return count; 327586b00e0dSAlex Elder bufp += count; 327686b00e0dSAlex Elder 327786b00e0dSAlex Elder count = sprintf(bufp, "overlap %llu\n", rbd_dev->parent_overlap); 327886b00e0dSAlex Elder if (count < 0) 327986b00e0dSAlex Elder return count; 328086b00e0dSAlex Elder bufp += count; 328186b00e0dSAlex Elder 328286b00e0dSAlex Elder return (ssize_t) (bufp - buf); 328386b00e0dSAlex Elder } 328486b00e0dSAlex Elder 3285dfc5606dSYehuda Sadeh static ssize_t rbd_image_refresh(struct device *dev, 3286dfc5606dSYehuda Sadeh struct device_attribute *attr, 3287dfc5606dSYehuda Sadeh const char *buf, 3288dfc5606dSYehuda Sadeh size_t size) 3289dfc5606dSYehuda Sadeh { 3290593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 3291b813623aSAlex Elder int ret; 3292602adf40SYehuda Sadeh 3293117973fbSAlex Elder ret = rbd_dev_refresh(rbd_dev, NULL); 3294b813623aSAlex Elder 3295b813623aSAlex Elder return ret < 0 ? 
ret : size; 3296dfc5606dSYehuda Sadeh } 3297602adf40SYehuda Sadeh 3298dfc5606dSYehuda Sadeh static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL); 329934b13184SAlex Elder static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL); 3300dfc5606dSYehuda Sadeh static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL); 3301dfc5606dSYehuda Sadeh static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL); 3302dfc5606dSYehuda Sadeh static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL); 33039bb2f334SAlex Elder static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL); 3304dfc5606dSYehuda Sadeh static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL); 3305589d30e0SAlex Elder static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL); 3306dfc5606dSYehuda Sadeh static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh); 3307dfc5606dSYehuda Sadeh static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL); 330886b00e0dSAlex Elder static DEVICE_ATTR(parent, S_IRUGO, rbd_parent_show, NULL); 3309dfc5606dSYehuda Sadeh 3310dfc5606dSYehuda Sadeh static struct attribute *rbd_attrs[] = { 3311dfc5606dSYehuda Sadeh &dev_attr_size.attr, 331234b13184SAlex Elder &dev_attr_features.attr, 3313dfc5606dSYehuda Sadeh &dev_attr_major.attr, 3314dfc5606dSYehuda Sadeh &dev_attr_client_id.attr, 3315dfc5606dSYehuda Sadeh &dev_attr_pool.attr, 33169bb2f334SAlex Elder &dev_attr_pool_id.attr, 3317dfc5606dSYehuda Sadeh &dev_attr_name.attr, 3318589d30e0SAlex Elder &dev_attr_image_id.attr, 3319dfc5606dSYehuda Sadeh &dev_attr_current_snap.attr, 332086b00e0dSAlex Elder &dev_attr_parent.attr, 3321dfc5606dSYehuda Sadeh &dev_attr_refresh.attr, 3322dfc5606dSYehuda Sadeh NULL 3323dfc5606dSYehuda Sadeh }; 3324dfc5606dSYehuda Sadeh 3325dfc5606dSYehuda Sadeh static struct attribute_group rbd_attr_group = { 3326dfc5606dSYehuda Sadeh .attrs = rbd_attrs, 3327dfc5606dSYehuda Sadeh }; 3328dfc5606dSYehuda Sadeh 3329dfc5606dSYehuda Sadeh static const struct attribute_group *rbd_attr_groups[] = { 
3330dfc5606dSYehuda Sadeh &rbd_attr_group, 3331dfc5606dSYehuda Sadeh NULL 3332dfc5606dSYehuda Sadeh }; 3333dfc5606dSYehuda Sadeh 3334dfc5606dSYehuda Sadeh static void rbd_sysfs_dev_release(struct device *dev) 3335dfc5606dSYehuda Sadeh { 3336dfc5606dSYehuda Sadeh } 3337dfc5606dSYehuda Sadeh 3338dfc5606dSYehuda Sadeh static struct device_type rbd_device_type = { 3339dfc5606dSYehuda Sadeh .name = "rbd", 3340dfc5606dSYehuda Sadeh .groups = rbd_attr_groups, 3341dfc5606dSYehuda Sadeh .release = rbd_sysfs_dev_release, 3342dfc5606dSYehuda Sadeh }; 3343dfc5606dSYehuda Sadeh 33448b8fb99cSAlex Elder static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec) 33458b8fb99cSAlex Elder { 33468b8fb99cSAlex Elder kref_get(&spec->kref); 33478b8fb99cSAlex Elder 33488b8fb99cSAlex Elder return spec; 33498b8fb99cSAlex Elder } 33508b8fb99cSAlex Elder 33518b8fb99cSAlex Elder static void rbd_spec_free(struct kref *kref); 33528b8fb99cSAlex Elder static void rbd_spec_put(struct rbd_spec *spec) 33538b8fb99cSAlex Elder { 33548b8fb99cSAlex Elder if (spec) 33558b8fb99cSAlex Elder kref_put(&spec->kref, rbd_spec_free); 33568b8fb99cSAlex Elder } 33578b8fb99cSAlex Elder 33588b8fb99cSAlex Elder static struct rbd_spec *rbd_spec_alloc(void) 33598b8fb99cSAlex Elder { 33608b8fb99cSAlex Elder struct rbd_spec *spec; 33618b8fb99cSAlex Elder 33628b8fb99cSAlex Elder spec = kzalloc(sizeof (*spec), GFP_KERNEL); 33638b8fb99cSAlex Elder if (!spec) 33648b8fb99cSAlex Elder return NULL; 33658b8fb99cSAlex Elder kref_init(&spec->kref); 33668b8fb99cSAlex Elder 33678b8fb99cSAlex Elder return spec; 33688b8fb99cSAlex Elder } 33698b8fb99cSAlex Elder 33708b8fb99cSAlex Elder static void rbd_spec_free(struct kref *kref) 33718b8fb99cSAlex Elder { 33728b8fb99cSAlex Elder struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref); 33738b8fb99cSAlex Elder 33748b8fb99cSAlex Elder kfree(spec->pool_name); 33758b8fb99cSAlex Elder kfree(spec->image_id); 33768b8fb99cSAlex Elder kfree(spec->image_name); 33778b8fb99cSAlex Elder 
kfree(spec->snap_name); 33788b8fb99cSAlex Elder kfree(spec); 33798b8fb99cSAlex Elder } 33808b8fb99cSAlex Elder 3381cc344fa1SAlex Elder static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc, 3382c53d5893SAlex Elder struct rbd_spec *spec) 3383c53d5893SAlex Elder { 3384c53d5893SAlex Elder struct rbd_device *rbd_dev; 3385c53d5893SAlex Elder 3386c53d5893SAlex Elder rbd_dev = kzalloc(sizeof (*rbd_dev), GFP_KERNEL); 3387c53d5893SAlex Elder if (!rbd_dev) 3388c53d5893SAlex Elder return NULL; 3389c53d5893SAlex Elder 3390c53d5893SAlex Elder spin_lock_init(&rbd_dev->lock); 33916d292906SAlex Elder rbd_dev->flags = 0; 3392c53d5893SAlex Elder INIT_LIST_HEAD(&rbd_dev->node); 3393c53d5893SAlex Elder INIT_LIST_HEAD(&rbd_dev->snaps); 3394c53d5893SAlex Elder init_rwsem(&rbd_dev->header_rwsem); 3395c53d5893SAlex Elder 3396c53d5893SAlex Elder rbd_dev->spec = spec; 3397c53d5893SAlex Elder rbd_dev->rbd_client = rbdc; 3398c53d5893SAlex Elder 33990903e875SAlex Elder /* Initialize the layout used for all rbd requests */ 34000903e875SAlex Elder 34010903e875SAlex Elder rbd_dev->layout.fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER); 34020903e875SAlex Elder rbd_dev->layout.fl_stripe_count = cpu_to_le32(1); 34030903e875SAlex Elder rbd_dev->layout.fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER); 34040903e875SAlex Elder rbd_dev->layout.fl_pg_pool = cpu_to_le32((u32) spec->pool_id); 34050903e875SAlex Elder 3406c53d5893SAlex Elder return rbd_dev; 3407c53d5893SAlex Elder } 3408c53d5893SAlex Elder 3409c53d5893SAlex Elder static void rbd_dev_destroy(struct rbd_device *rbd_dev) 3410c53d5893SAlex Elder { 341186b00e0dSAlex Elder rbd_spec_put(rbd_dev->parent_spec); 3412c53d5893SAlex Elder kfree(rbd_dev->header_name); 3413c53d5893SAlex Elder rbd_put_client(rbd_dev->rbd_client); 3414c53d5893SAlex Elder rbd_spec_put(rbd_dev->spec); 3415c53d5893SAlex Elder kfree(rbd_dev); 3416c53d5893SAlex Elder } 3417c53d5893SAlex Elder 34186087b51bSAlex Elder static void rbd_snap_destroy(struct rbd_snap 
*snap) 3419dfc5606dSYehuda Sadeh { 34203e83b65bSAlex Elder kfree(snap->name); 34213e83b65bSAlex Elder kfree(snap); 3422dfc5606dSYehuda Sadeh } 3423dfc5606dSYehuda Sadeh 34246087b51bSAlex Elder static struct rbd_snap *rbd_snap_create(struct rbd_device *rbd_dev, 3425c8d18425SAlex Elder const char *snap_name, 342634b13184SAlex Elder u64 snap_id, u64 snap_size, 342734b13184SAlex Elder u64 snap_features) 3428dfc5606dSYehuda Sadeh { 34294e891e0aSAlex Elder struct rbd_snap *snap; 34304e891e0aSAlex Elder 34314e891e0aSAlex Elder snap = kzalloc(sizeof (*snap), GFP_KERNEL); 3432dfc5606dSYehuda Sadeh if (!snap) 34334e891e0aSAlex Elder return ERR_PTR(-ENOMEM); 34344e891e0aSAlex Elder 34356e584f52SAlex Elder snap->name = snap_name; 3436c8d18425SAlex Elder snap->id = snap_id; 3437c8d18425SAlex Elder snap->size = snap_size; 343834b13184SAlex Elder snap->features = snap_features; 34394e891e0aSAlex Elder 34404e891e0aSAlex Elder return snap; 3441dfc5606dSYehuda Sadeh } 3442dfc5606dSYehuda Sadeh 34436e584f52SAlex Elder /* 34446e584f52SAlex Elder * Returns a dynamically-allocated snapshot name if successful, or a 34456e584f52SAlex Elder * pointer-coded error otherwise. 
34466e584f52SAlex Elder */ 3447cd892126SAlex Elder static char *rbd_dev_v1_snap_info(struct rbd_device *rbd_dev, u32 which, 3448cd892126SAlex Elder u64 *snap_size, u64 *snap_features) 3449cd892126SAlex Elder { 3450cd892126SAlex Elder char *snap_name; 34516e584f52SAlex Elder int i; 3452cd892126SAlex Elder 3453cd892126SAlex Elder rbd_assert(which < rbd_dev->header.snapc->num_snaps); 3454cd892126SAlex Elder 3455cd892126SAlex Elder /* Skip over names until we find the one we are looking for */ 3456cd892126SAlex Elder 3457cd892126SAlex Elder snap_name = rbd_dev->header.snap_names; 34586e584f52SAlex Elder for (i = 0; i < which; i++) 3459cd892126SAlex Elder snap_name += strlen(snap_name) + 1; 3460cd892126SAlex Elder 34616e584f52SAlex Elder snap_name = kstrdup(snap_name, GFP_KERNEL); 34626e584f52SAlex Elder if (!snap_name) 34636e584f52SAlex Elder return ERR_PTR(-ENOMEM); 34646e584f52SAlex Elder 34656e584f52SAlex Elder *snap_size = rbd_dev->header.snap_sizes[which]; 34666e584f52SAlex Elder *snap_features = 0; /* No features for v1 */ 34676e584f52SAlex Elder 3468cd892126SAlex Elder return snap_name; 3469cd892126SAlex Elder } 3470cd892126SAlex Elder 3471dfc5606dSYehuda Sadeh /* 34729d475de5SAlex Elder * Get the size and object order for an image snapshot, or if 34739d475de5SAlex Elder * snap_id is CEPH_NOSNAP, gets this information for the base 34749d475de5SAlex Elder * image. 
34759d475de5SAlex Elder */ 34769d475de5SAlex Elder static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id, 34779d475de5SAlex Elder u8 *order, u64 *snap_size) 34789d475de5SAlex Elder { 34799d475de5SAlex Elder __le64 snapid = cpu_to_le64(snap_id); 34809d475de5SAlex Elder int ret; 34819d475de5SAlex Elder struct { 34829d475de5SAlex Elder u8 order; 34839d475de5SAlex Elder __le64 size; 34849d475de5SAlex Elder } __attribute__ ((packed)) size_buf = { 0 }; 34859d475de5SAlex Elder 348636be9a76SAlex Elder ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, 34879d475de5SAlex Elder "rbd", "get_size", 34884157976bSAlex Elder &snapid, sizeof (snapid), 34894157976bSAlex Elder &size_buf, sizeof (size_buf), NULL); 349036be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 34919d475de5SAlex Elder if (ret < 0) 34929d475de5SAlex Elder return ret; 349357385b51SAlex Elder if (ret < sizeof (size_buf)) 349457385b51SAlex Elder return -ERANGE; 34959d475de5SAlex Elder 3496c86f86e9SAlex Elder if (order) 34979d475de5SAlex Elder *order = size_buf.order; 34989d475de5SAlex Elder *snap_size = le64_to_cpu(size_buf.size); 34999d475de5SAlex Elder 35009d475de5SAlex Elder dout(" snap_id 0x%016llx order = %u, snap_size = %llu\n", 35019d475de5SAlex Elder (unsigned long long)snap_id, (unsigned int)*order, 35029d475de5SAlex Elder (unsigned long long)*snap_size); 35039d475de5SAlex Elder 35049d475de5SAlex Elder return 0; 35059d475de5SAlex Elder } 35069d475de5SAlex Elder 35079d475de5SAlex Elder static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev) 35089d475de5SAlex Elder { 35099d475de5SAlex Elder return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP, 35109d475de5SAlex Elder &rbd_dev->header.obj_order, 35119d475de5SAlex Elder &rbd_dev->header.image_size); 35129d475de5SAlex Elder } 35139d475de5SAlex Elder 35141e130199SAlex Elder static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev) 35151e130199SAlex Elder { 35161e130199SAlex Elder void *reply_buf; 
35171e130199SAlex Elder int ret; 35181e130199SAlex Elder void *p; 35191e130199SAlex Elder 35201e130199SAlex Elder reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL); 35211e130199SAlex Elder if (!reply_buf) 35221e130199SAlex Elder return -ENOMEM; 35231e130199SAlex Elder 352436be9a76SAlex Elder ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, 35254157976bSAlex Elder "rbd", "get_object_prefix", NULL, 0, 352607b2391fSAlex Elder reply_buf, RBD_OBJ_PREFIX_LEN_MAX, NULL); 352736be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 35281e130199SAlex Elder if (ret < 0) 35291e130199SAlex Elder goto out; 35301e130199SAlex Elder 35311e130199SAlex Elder p = reply_buf; 35321e130199SAlex Elder rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p, 353357385b51SAlex Elder p + ret, NULL, GFP_NOIO); 353457385b51SAlex Elder ret = 0; 35351e130199SAlex Elder 35361e130199SAlex Elder if (IS_ERR(rbd_dev->header.object_prefix)) { 35371e130199SAlex Elder ret = PTR_ERR(rbd_dev->header.object_prefix); 35381e130199SAlex Elder rbd_dev->header.object_prefix = NULL; 35391e130199SAlex Elder } else { 35401e130199SAlex Elder dout(" object_prefix = %s\n", rbd_dev->header.object_prefix); 35411e130199SAlex Elder } 35421e130199SAlex Elder out: 35431e130199SAlex Elder kfree(reply_buf); 35441e130199SAlex Elder 35451e130199SAlex Elder return ret; 35461e130199SAlex Elder } 35471e130199SAlex Elder 3548b1b5402aSAlex Elder static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id, 3549b1b5402aSAlex Elder u64 *snap_features) 3550b1b5402aSAlex Elder { 3551b1b5402aSAlex Elder __le64 snapid = cpu_to_le64(snap_id); 3552b1b5402aSAlex Elder struct { 3553b1b5402aSAlex Elder __le64 features; 3554b1b5402aSAlex Elder __le64 incompat; 35554157976bSAlex Elder } __attribute__ ((packed)) features_buf = { 0 }; 3556d889140cSAlex Elder u64 incompat; 3557b1b5402aSAlex Elder int ret; 3558b1b5402aSAlex Elder 355936be9a76SAlex Elder ret = 
rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, 3560b1b5402aSAlex Elder "rbd", "get_features", 35614157976bSAlex Elder &snapid, sizeof (snapid), 35624157976bSAlex Elder &features_buf, sizeof (features_buf), NULL); 356336be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 3564b1b5402aSAlex Elder if (ret < 0) 3565b1b5402aSAlex Elder return ret; 356657385b51SAlex Elder if (ret < sizeof (features_buf)) 356757385b51SAlex Elder return -ERANGE; 3568d889140cSAlex Elder 3569d889140cSAlex Elder incompat = le64_to_cpu(features_buf.incompat); 35705cbf6f12SAlex Elder if (incompat & ~RBD_FEATURES_SUPPORTED) 3571b8f5c6edSAlex Elder return -ENXIO; 3572d889140cSAlex Elder 3573b1b5402aSAlex Elder *snap_features = le64_to_cpu(features_buf.features); 3574b1b5402aSAlex Elder 3575b1b5402aSAlex Elder dout(" snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n", 3576b1b5402aSAlex Elder (unsigned long long)snap_id, 3577b1b5402aSAlex Elder (unsigned long long)*snap_features, 3578b1b5402aSAlex Elder (unsigned long long)le64_to_cpu(features_buf.incompat)); 3579b1b5402aSAlex Elder 3580b1b5402aSAlex Elder return 0; 3581b1b5402aSAlex Elder } 3582b1b5402aSAlex Elder 3583b1b5402aSAlex Elder static int rbd_dev_v2_features(struct rbd_device *rbd_dev) 3584b1b5402aSAlex Elder { 3585b1b5402aSAlex Elder return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP, 3586b1b5402aSAlex Elder &rbd_dev->header.features); 3587b1b5402aSAlex Elder } 3588b1b5402aSAlex Elder 358986b00e0dSAlex Elder static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev) 359086b00e0dSAlex Elder { 359186b00e0dSAlex Elder struct rbd_spec *parent_spec; 359286b00e0dSAlex Elder size_t size; 359386b00e0dSAlex Elder void *reply_buf = NULL; 359486b00e0dSAlex Elder __le64 snapid; 359586b00e0dSAlex Elder void *p; 359686b00e0dSAlex Elder void *end; 359786b00e0dSAlex Elder char *image_id; 359886b00e0dSAlex Elder u64 overlap; 359986b00e0dSAlex Elder int ret; 360086b00e0dSAlex Elder 360186b00e0dSAlex Elder 
parent_spec = rbd_spec_alloc(); 360286b00e0dSAlex Elder if (!parent_spec) 360386b00e0dSAlex Elder return -ENOMEM; 360486b00e0dSAlex Elder 360586b00e0dSAlex Elder size = sizeof (__le64) + /* pool_id */ 360686b00e0dSAlex Elder sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX + /* image_id */ 360786b00e0dSAlex Elder sizeof (__le64) + /* snap_id */ 360886b00e0dSAlex Elder sizeof (__le64); /* overlap */ 360986b00e0dSAlex Elder reply_buf = kmalloc(size, GFP_KERNEL); 361086b00e0dSAlex Elder if (!reply_buf) { 361186b00e0dSAlex Elder ret = -ENOMEM; 361286b00e0dSAlex Elder goto out_err; 361386b00e0dSAlex Elder } 361486b00e0dSAlex Elder 361586b00e0dSAlex Elder snapid = cpu_to_le64(CEPH_NOSNAP); 361636be9a76SAlex Elder ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, 361786b00e0dSAlex Elder "rbd", "get_parent", 36184157976bSAlex Elder &snapid, sizeof (snapid), 36194157976bSAlex Elder reply_buf, size, NULL); 362036be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 362186b00e0dSAlex Elder if (ret < 0) 362286b00e0dSAlex Elder goto out_err; 362386b00e0dSAlex Elder 362486b00e0dSAlex Elder p = reply_buf; 362557385b51SAlex Elder end = reply_buf + ret; 362657385b51SAlex Elder ret = -ERANGE; 362786b00e0dSAlex Elder ceph_decode_64_safe(&p, end, parent_spec->pool_id, out_err); 362886b00e0dSAlex Elder if (parent_spec->pool_id == CEPH_NOPOOL) 362986b00e0dSAlex Elder goto out; /* No parent? No problem. 
*/ 363086b00e0dSAlex Elder 36310903e875SAlex Elder /* The ceph file layout needs to fit pool id in 32 bits */ 36320903e875SAlex Elder 36330903e875SAlex Elder ret = -EIO; 36340903e875SAlex Elder if (WARN_ON(parent_spec->pool_id > (u64)U32_MAX)) 363557385b51SAlex Elder goto out_err; 36360903e875SAlex Elder 3637979ed480SAlex Elder image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL); 363886b00e0dSAlex Elder if (IS_ERR(image_id)) { 363986b00e0dSAlex Elder ret = PTR_ERR(image_id); 364086b00e0dSAlex Elder goto out_err; 364186b00e0dSAlex Elder } 364286b00e0dSAlex Elder parent_spec->image_id = image_id; 364386b00e0dSAlex Elder ceph_decode_64_safe(&p, end, parent_spec->snap_id, out_err); 364486b00e0dSAlex Elder ceph_decode_64_safe(&p, end, overlap, out_err); 364586b00e0dSAlex Elder 364686b00e0dSAlex Elder rbd_dev->parent_overlap = overlap; 364786b00e0dSAlex Elder rbd_dev->parent_spec = parent_spec; 364886b00e0dSAlex Elder parent_spec = NULL; /* rbd_dev now owns this */ 364986b00e0dSAlex Elder out: 365086b00e0dSAlex Elder ret = 0; 365186b00e0dSAlex Elder out_err: 365286b00e0dSAlex Elder kfree(reply_buf); 365386b00e0dSAlex Elder rbd_spec_put(parent_spec); 365486b00e0dSAlex Elder 365586b00e0dSAlex Elder return ret; 365686b00e0dSAlex Elder } 365786b00e0dSAlex Elder 3658cc070d59SAlex Elder static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev) 3659cc070d59SAlex Elder { 3660cc070d59SAlex Elder struct { 3661cc070d59SAlex Elder __le64 stripe_unit; 3662cc070d59SAlex Elder __le64 stripe_count; 3663cc070d59SAlex Elder } __attribute__ ((packed)) striping_info_buf = { 0 }; 3664cc070d59SAlex Elder size_t size = sizeof (striping_info_buf); 3665cc070d59SAlex Elder void *p; 3666cc070d59SAlex Elder u64 obj_size; 3667cc070d59SAlex Elder u64 stripe_unit; 3668cc070d59SAlex Elder u64 stripe_count; 3669cc070d59SAlex Elder int ret; 3670cc070d59SAlex Elder 3671cc070d59SAlex Elder ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, 3672cc070d59SAlex Elder "rbd", 
"get_stripe_unit_count", NULL, 0, 3673cc070d59SAlex Elder (char *)&striping_info_buf, size, NULL); 3674cc070d59SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 3675cc070d59SAlex Elder if (ret < 0) 3676cc070d59SAlex Elder return ret; 3677cc070d59SAlex Elder if (ret < size) 3678cc070d59SAlex Elder return -ERANGE; 3679cc070d59SAlex Elder 3680cc070d59SAlex Elder /* 3681cc070d59SAlex Elder * We don't actually support the "fancy striping" feature 3682cc070d59SAlex Elder * (STRIPINGV2) yet, but if the striping sizes are the 3683cc070d59SAlex Elder * defaults the behavior is the same as before. So find 3684cc070d59SAlex Elder * out, and only fail if the image has non-default values. 3685cc070d59SAlex Elder */ 3686cc070d59SAlex Elder ret = -EINVAL; 3687cc070d59SAlex Elder obj_size = (u64)1 << rbd_dev->header.obj_order; 3688cc070d59SAlex Elder p = &striping_info_buf; 3689cc070d59SAlex Elder stripe_unit = ceph_decode_64(&p); 3690cc070d59SAlex Elder if (stripe_unit != obj_size) { 3691cc070d59SAlex Elder rbd_warn(rbd_dev, "unsupported stripe unit " 3692cc070d59SAlex Elder "(got %llu want %llu)", 3693cc070d59SAlex Elder stripe_unit, obj_size); 3694cc070d59SAlex Elder return -EINVAL; 3695cc070d59SAlex Elder } 3696cc070d59SAlex Elder stripe_count = ceph_decode_64(&p); 3697cc070d59SAlex Elder if (stripe_count != 1) { 3698cc070d59SAlex Elder rbd_warn(rbd_dev, "unsupported stripe count " 3699cc070d59SAlex Elder "(got %llu want 1)", stripe_count); 3700cc070d59SAlex Elder return -EINVAL; 3701cc070d59SAlex Elder } 3702cc070d59SAlex Elder rbd_dev->stripe_unit = stripe_unit; 3703cc070d59SAlex Elder rbd_dev->stripe_count = stripe_count; 3704cc070d59SAlex Elder 3705cc070d59SAlex Elder return 0; 3706cc070d59SAlex Elder } 3707cc070d59SAlex Elder 37089e15b77dSAlex Elder static char *rbd_dev_image_name(struct rbd_device *rbd_dev) 37099e15b77dSAlex Elder { 37109e15b77dSAlex Elder size_t image_id_size; 37119e15b77dSAlex Elder char *image_id; 37129e15b77dSAlex Elder void 
*p; 37139e15b77dSAlex Elder void *end; 37149e15b77dSAlex Elder size_t size; 37159e15b77dSAlex Elder void *reply_buf = NULL; 37169e15b77dSAlex Elder size_t len = 0; 37179e15b77dSAlex Elder char *image_name = NULL; 37189e15b77dSAlex Elder int ret; 37199e15b77dSAlex Elder 37209e15b77dSAlex Elder rbd_assert(!rbd_dev->spec->image_name); 37219e15b77dSAlex Elder 372269e7a02fSAlex Elder len = strlen(rbd_dev->spec->image_id); 372369e7a02fSAlex Elder image_id_size = sizeof (__le32) + len; 37249e15b77dSAlex Elder image_id = kmalloc(image_id_size, GFP_KERNEL); 37259e15b77dSAlex Elder if (!image_id) 37269e15b77dSAlex Elder return NULL; 37279e15b77dSAlex Elder 37289e15b77dSAlex Elder p = image_id; 37294157976bSAlex Elder end = image_id + image_id_size; 373069e7a02fSAlex Elder ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32)len); 37319e15b77dSAlex Elder 37329e15b77dSAlex Elder size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX; 37339e15b77dSAlex Elder reply_buf = kmalloc(size, GFP_KERNEL); 37349e15b77dSAlex Elder if (!reply_buf) 37359e15b77dSAlex Elder goto out; 37369e15b77dSAlex Elder 373736be9a76SAlex Elder ret = rbd_obj_method_sync(rbd_dev, RBD_DIRECTORY, 37389e15b77dSAlex Elder "rbd", "dir_get_name", 37399e15b77dSAlex Elder image_id, image_id_size, 37404157976bSAlex Elder reply_buf, size, NULL); 37419e15b77dSAlex Elder if (ret < 0) 37429e15b77dSAlex Elder goto out; 37439e15b77dSAlex Elder p = reply_buf; 37444157976bSAlex Elder end = reply_buf + size; 37459e15b77dSAlex Elder image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL); 37469e15b77dSAlex Elder if (IS_ERR(image_name)) 37479e15b77dSAlex Elder image_name = NULL; 37489e15b77dSAlex Elder else 37499e15b77dSAlex Elder dout("%s: name is %s len is %zd\n", __func__, image_name, len); 37509e15b77dSAlex Elder out: 37519e15b77dSAlex Elder kfree(reply_buf); 37529e15b77dSAlex Elder kfree(image_id); 37539e15b77dSAlex Elder 37549e15b77dSAlex Elder return image_name; 37559e15b77dSAlex Elder } 37569e15b77dSAlex 
Elder 37579e15b77dSAlex Elder /* 37589e15b77dSAlex Elder * When a parent image gets probed, we only have the pool, image, 37599e15b77dSAlex Elder * and snapshot ids but not the names of any of them. This call 37609e15b77dSAlex Elder * is made later to fill in those names. It has to be done after 37619e15b77dSAlex Elder * rbd_dev_snaps_update() has completed because some of the 37629e15b77dSAlex Elder * information (in particular, snapshot name) is not available 37639e15b77dSAlex Elder * until then. 37649e15b77dSAlex Elder */ 37659e15b77dSAlex Elder static int rbd_dev_probe_update_spec(struct rbd_device *rbd_dev) 37669e15b77dSAlex Elder { 37679e15b77dSAlex Elder struct ceph_osd_client *osdc; 37689e15b77dSAlex Elder const char *name; 37699e15b77dSAlex Elder void *reply_buf = NULL; 37709e15b77dSAlex Elder int ret; 37719e15b77dSAlex Elder 37729e15b77dSAlex Elder if (rbd_dev->spec->pool_name) 37739e15b77dSAlex Elder return 0; /* Already have the names */ 37749e15b77dSAlex Elder 37759e15b77dSAlex Elder /* Look up the pool name */ 37769e15b77dSAlex Elder 37779e15b77dSAlex Elder osdc = &rbd_dev->rbd_client->client->osdc; 37789e15b77dSAlex Elder name = ceph_pg_pool_name_by_id(osdc->osdmap, rbd_dev->spec->pool_id); 3779935dc89fSAlex Elder if (!name) { 3780935dc89fSAlex Elder rbd_warn(rbd_dev, "there is no pool with id %llu", 3781935dc89fSAlex Elder rbd_dev->spec->pool_id); /* Really a BUG() */ 3782935dc89fSAlex Elder return -EIO; 3783935dc89fSAlex Elder } 37849e15b77dSAlex Elder 37859e15b77dSAlex Elder rbd_dev->spec->pool_name = kstrdup(name, GFP_KERNEL); 37869e15b77dSAlex Elder if (!rbd_dev->spec->pool_name) 37879e15b77dSAlex Elder return -ENOMEM; 37889e15b77dSAlex Elder 37899e15b77dSAlex Elder /* Fetch the image name; tolerate failure here */ 37909e15b77dSAlex Elder 37919e15b77dSAlex Elder name = rbd_dev_image_name(rbd_dev); 379269e7a02fSAlex Elder if (name) 37939e15b77dSAlex Elder rbd_dev->spec->image_name = (char *)name; 379469e7a02fSAlex Elder else 379506ecc6cbSAlex 
Elder rbd_warn(rbd_dev, "unable to get image name"); 37969e15b77dSAlex Elder 37979e15b77dSAlex Elder /* Look up the snapshot name. */ 37989e15b77dSAlex Elder 37999e15b77dSAlex Elder name = rbd_snap_name(rbd_dev, rbd_dev->spec->snap_id); 38009e15b77dSAlex Elder if (!name) { 3801935dc89fSAlex Elder rbd_warn(rbd_dev, "no snapshot with id %llu", 3802935dc89fSAlex Elder rbd_dev->spec->snap_id); /* Really a BUG() */ 38039e15b77dSAlex Elder ret = -EIO; 38049e15b77dSAlex Elder goto out_err; 38059e15b77dSAlex Elder } 38069e15b77dSAlex Elder rbd_dev->spec->snap_name = kstrdup(name, GFP_KERNEL); 38079e15b77dSAlex Elder if(!rbd_dev->spec->snap_name) 38089e15b77dSAlex Elder goto out_err; 38099e15b77dSAlex Elder 38109e15b77dSAlex Elder return 0; 38119e15b77dSAlex Elder out_err: 38129e15b77dSAlex Elder kfree(reply_buf); 38139e15b77dSAlex Elder kfree(rbd_dev->spec->pool_name); 38149e15b77dSAlex Elder rbd_dev->spec->pool_name = NULL; 38159e15b77dSAlex Elder 38169e15b77dSAlex Elder return ret; 38179e15b77dSAlex Elder } 38189e15b77dSAlex Elder 38196e14b1a6SAlex Elder static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev, u64 *ver) 382035d489f9SAlex Elder { 382135d489f9SAlex Elder size_t size; 382235d489f9SAlex Elder int ret; 382335d489f9SAlex Elder void *reply_buf; 382435d489f9SAlex Elder void *p; 382535d489f9SAlex Elder void *end; 382635d489f9SAlex Elder u64 seq; 382735d489f9SAlex Elder u32 snap_count; 382835d489f9SAlex Elder struct ceph_snap_context *snapc; 382935d489f9SAlex Elder u32 i; 383035d489f9SAlex Elder 383135d489f9SAlex Elder /* 383235d489f9SAlex Elder * We'll need room for the seq value (maximum snapshot id), 383335d489f9SAlex Elder * snapshot count, and array of that many snapshot ids. 383435d489f9SAlex Elder * For now we have a fixed upper limit on the number we're 383535d489f9SAlex Elder * prepared to receive. 
383635d489f9SAlex Elder */ 383735d489f9SAlex Elder size = sizeof (__le64) + sizeof (__le32) + 383835d489f9SAlex Elder RBD_MAX_SNAP_COUNT * sizeof (__le64); 383935d489f9SAlex Elder reply_buf = kzalloc(size, GFP_KERNEL); 384035d489f9SAlex Elder if (!reply_buf) 384135d489f9SAlex Elder return -ENOMEM; 384235d489f9SAlex Elder 384336be9a76SAlex Elder ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, 38444157976bSAlex Elder "rbd", "get_snapcontext", NULL, 0, 384507b2391fSAlex Elder reply_buf, size, ver); 384636be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 384735d489f9SAlex Elder if (ret < 0) 384835d489f9SAlex Elder goto out; 384935d489f9SAlex Elder 385035d489f9SAlex Elder p = reply_buf; 385157385b51SAlex Elder end = reply_buf + ret; 385257385b51SAlex Elder ret = -ERANGE; 385335d489f9SAlex Elder ceph_decode_64_safe(&p, end, seq, out); 385435d489f9SAlex Elder ceph_decode_32_safe(&p, end, snap_count, out); 385535d489f9SAlex Elder 385635d489f9SAlex Elder /* 385735d489f9SAlex Elder * Make sure the reported number of snapshot ids wouldn't go 385835d489f9SAlex Elder * beyond the end of our buffer. But before checking that, 385935d489f9SAlex Elder * make sure the computed size of the snapshot context we 386035d489f9SAlex Elder * allocate is representable in a size_t. 
386135d489f9SAlex Elder */ 386235d489f9SAlex Elder if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context)) 386335d489f9SAlex Elder / sizeof (u64)) { 386435d489f9SAlex Elder ret = -EINVAL; 386535d489f9SAlex Elder goto out; 386635d489f9SAlex Elder } 386735d489f9SAlex Elder if (!ceph_has_room(&p, end, snap_count * sizeof (__le64))) 386835d489f9SAlex Elder goto out; 386935d489f9SAlex Elder 387035d489f9SAlex Elder size = sizeof (struct ceph_snap_context) + 387135d489f9SAlex Elder snap_count * sizeof (snapc->snaps[0]); 387235d489f9SAlex Elder snapc = kmalloc(size, GFP_KERNEL); 387335d489f9SAlex Elder if (!snapc) { 387435d489f9SAlex Elder ret = -ENOMEM; 387535d489f9SAlex Elder goto out; 387635d489f9SAlex Elder } 387757385b51SAlex Elder ret = 0; 387835d489f9SAlex Elder 387935d489f9SAlex Elder atomic_set(&snapc->nref, 1); 388035d489f9SAlex Elder snapc->seq = seq; 388135d489f9SAlex Elder snapc->num_snaps = snap_count; 388235d489f9SAlex Elder for (i = 0; i < snap_count; i++) 388335d489f9SAlex Elder snapc->snaps[i] = ceph_decode_64(&p); 388435d489f9SAlex Elder 388535d489f9SAlex Elder rbd_dev->header.snapc = snapc; 388635d489f9SAlex Elder 388735d489f9SAlex Elder dout(" snap context seq = %llu, snap_count = %u\n", 388835d489f9SAlex Elder (unsigned long long)seq, (unsigned int)snap_count); 388935d489f9SAlex Elder out: 389035d489f9SAlex Elder kfree(reply_buf); 389135d489f9SAlex Elder 389257385b51SAlex Elder return ret; 389335d489f9SAlex Elder } 389435d489f9SAlex Elder 3895b8b1e2dbSAlex Elder static char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, u32 which) 3896b8b1e2dbSAlex Elder { 3897b8b1e2dbSAlex Elder size_t size; 3898b8b1e2dbSAlex Elder void *reply_buf; 3899b8b1e2dbSAlex Elder __le64 snap_id; 3900b8b1e2dbSAlex Elder int ret; 3901b8b1e2dbSAlex Elder void *p; 3902b8b1e2dbSAlex Elder void *end; 3903b8b1e2dbSAlex Elder char *snap_name; 3904b8b1e2dbSAlex Elder 3905b8b1e2dbSAlex Elder size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN; 3906b8b1e2dbSAlex Elder reply_buf = 
kmalloc(size, GFP_KERNEL); 3907b8b1e2dbSAlex Elder if (!reply_buf) 3908b8b1e2dbSAlex Elder return ERR_PTR(-ENOMEM); 3909b8b1e2dbSAlex Elder 3910acb1b6caSAlex Elder rbd_assert(which < rbd_dev->header.snapc->num_snaps); 3911b8b1e2dbSAlex Elder snap_id = cpu_to_le64(rbd_dev->header.snapc->snaps[which]); 391236be9a76SAlex Elder ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, 3913b8b1e2dbSAlex Elder "rbd", "get_snapshot_name", 39144157976bSAlex Elder &snap_id, sizeof (snap_id), 391507b2391fSAlex Elder reply_buf, size, NULL); 391636be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 3917b8b1e2dbSAlex Elder if (ret < 0) 3918b8b1e2dbSAlex Elder goto out; 3919b8b1e2dbSAlex Elder 3920b8b1e2dbSAlex Elder p = reply_buf; 39214157976bSAlex Elder end = reply_buf + size; 3922e5c35534SAlex Elder snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL); 3923b8b1e2dbSAlex Elder if (IS_ERR(snap_name)) { 3924b8b1e2dbSAlex Elder ret = PTR_ERR(snap_name); 3925b8b1e2dbSAlex Elder goto out; 3926b8b1e2dbSAlex Elder } else { 3927b8b1e2dbSAlex Elder dout(" snap_id 0x%016llx snap_name = %s\n", 3928b8b1e2dbSAlex Elder (unsigned long long)le64_to_cpu(snap_id), snap_name); 3929b8b1e2dbSAlex Elder } 3930b8b1e2dbSAlex Elder kfree(reply_buf); 3931b8b1e2dbSAlex Elder 3932b8b1e2dbSAlex Elder return snap_name; 3933b8b1e2dbSAlex Elder out: 3934b8b1e2dbSAlex Elder kfree(reply_buf); 3935b8b1e2dbSAlex Elder 3936b8b1e2dbSAlex Elder return ERR_PTR(ret); 3937b8b1e2dbSAlex Elder } 3938b8b1e2dbSAlex Elder 3939b8b1e2dbSAlex Elder static char *rbd_dev_v2_snap_info(struct rbd_device *rbd_dev, u32 which, 3940b8b1e2dbSAlex Elder u64 *snap_size, u64 *snap_features) 3941b8b1e2dbSAlex Elder { 3942e0b49868SAlex Elder u64 snap_id; 3943acb1b6caSAlex Elder u64 size; 3944acb1b6caSAlex Elder u64 features; 3945acb1b6caSAlex Elder char *snap_name; 3946b8b1e2dbSAlex Elder int ret; 3947b8b1e2dbSAlex Elder 3948acb1b6caSAlex Elder rbd_assert(which < rbd_dev->header.snapc->num_snaps); 
3949b8b1e2dbSAlex Elder snap_id = rbd_dev->header.snapc->snaps[which]; 3950acb1b6caSAlex Elder ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, NULL, &size); 3951b8b1e2dbSAlex Elder if (ret) 3952acb1b6caSAlex Elder goto out_err; 3953b8b1e2dbSAlex Elder 3954acb1b6caSAlex Elder ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, &features); 3955acb1b6caSAlex Elder if (ret) 3956acb1b6caSAlex Elder goto out_err; 3957acb1b6caSAlex Elder 3958acb1b6caSAlex Elder snap_name = rbd_dev_v2_snap_name(rbd_dev, which); 3959acb1b6caSAlex Elder if (!IS_ERR(snap_name)) { 3960acb1b6caSAlex Elder *snap_size = size; 3961acb1b6caSAlex Elder *snap_features = features; 3962acb1b6caSAlex Elder } 3963acb1b6caSAlex Elder 3964acb1b6caSAlex Elder return snap_name; 3965acb1b6caSAlex Elder out_err: 3966acb1b6caSAlex Elder return ERR_PTR(ret); 3967b8b1e2dbSAlex Elder } 3968b8b1e2dbSAlex Elder 3969b8b1e2dbSAlex Elder static char *rbd_dev_snap_info(struct rbd_device *rbd_dev, u32 which, 3970b8b1e2dbSAlex Elder u64 *snap_size, u64 *snap_features) 3971b8b1e2dbSAlex Elder { 3972b8b1e2dbSAlex Elder if (rbd_dev->image_format == 1) 3973b8b1e2dbSAlex Elder return rbd_dev_v1_snap_info(rbd_dev, which, 3974b8b1e2dbSAlex Elder snap_size, snap_features); 3975b8b1e2dbSAlex Elder if (rbd_dev->image_format == 2) 3976b8b1e2dbSAlex Elder return rbd_dev_v2_snap_info(rbd_dev, which, 3977b8b1e2dbSAlex Elder snap_size, snap_features); 3978b8b1e2dbSAlex Elder return ERR_PTR(-EINVAL); 3979b8b1e2dbSAlex Elder } 3980b8b1e2dbSAlex Elder 3981117973fbSAlex Elder static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver) 3982117973fbSAlex Elder { 3983117973fbSAlex Elder int ret; 3984117973fbSAlex Elder __u8 obj_order; 3985117973fbSAlex Elder 3986117973fbSAlex Elder down_write(&rbd_dev->header_rwsem); 3987117973fbSAlex Elder 3988117973fbSAlex Elder /* Grab old order first, to see if it changes */ 3989117973fbSAlex Elder 3990117973fbSAlex Elder obj_order = rbd_dev->header.obj_order, 3991117973fbSAlex Elder ret = 
rbd_dev_v2_image_size(rbd_dev); 3992117973fbSAlex Elder if (ret) 3993117973fbSAlex Elder goto out; 3994117973fbSAlex Elder if (rbd_dev->header.obj_order != obj_order) { 3995117973fbSAlex Elder ret = -EIO; 3996117973fbSAlex Elder goto out; 3997117973fbSAlex Elder } 3998117973fbSAlex Elder rbd_update_mapping_size(rbd_dev); 3999117973fbSAlex Elder 4000117973fbSAlex Elder ret = rbd_dev_v2_snap_context(rbd_dev, hver); 4001117973fbSAlex Elder dout("rbd_dev_v2_snap_context returned %d\n", ret); 4002117973fbSAlex Elder if (ret) 4003117973fbSAlex Elder goto out; 4004117973fbSAlex Elder ret = rbd_dev_snaps_update(rbd_dev); 4005117973fbSAlex Elder dout("rbd_dev_snaps_update returned %d\n", ret); 4006117973fbSAlex Elder if (ret) 4007117973fbSAlex Elder goto out; 4008117973fbSAlex Elder out: 4009117973fbSAlex Elder up_write(&rbd_dev->header_rwsem); 4010117973fbSAlex Elder 4011117973fbSAlex Elder return ret; 4012117973fbSAlex Elder } 4013117973fbSAlex Elder 40149d475de5SAlex Elder /* 401535938150SAlex Elder * Scan the rbd device's current snapshot list and compare it to the 401635938150SAlex Elder * newly-received snapshot context. Remove any existing snapshots 401735938150SAlex Elder * not present in the new snapshot context. Add a new snapshot for 401835938150SAlex Elder * any snaphots in the snapshot context not in the current list. 401935938150SAlex Elder * And verify there are no changes to snapshots we already know 402035938150SAlex Elder * about. 402135938150SAlex Elder * 402235938150SAlex Elder * Assumes the snapshots in the snapshot context are sorted by 402335938150SAlex Elder * snapshot id, highest id first. (Snapshots in the rbd_dev's list 402435938150SAlex Elder * are also maintained in that order.) 4025522a0cc0SAlex Elder * 4026522a0cc0SAlex Elder * Note that any error occurs while updating the snapshot list 4027522a0cc0SAlex Elder * aborts the update, and the entire list is cleared. 
The snapshot 4028522a0cc0SAlex Elder * list becomes inconsistent at that point anyway, so it might as 4029522a0cc0SAlex Elder * well be empty. 4030dfc5606dSYehuda Sadeh */ 4031304f6808SAlex Elder static int rbd_dev_snaps_update(struct rbd_device *rbd_dev) 4032dfc5606dSYehuda Sadeh { 403335938150SAlex Elder struct ceph_snap_context *snapc = rbd_dev->header.snapc; 403435938150SAlex Elder const u32 snap_count = snapc->num_snaps; 403535938150SAlex Elder struct list_head *head = &rbd_dev->snaps; 403635938150SAlex Elder struct list_head *links = head->next; 403735938150SAlex Elder u32 index = 0; 4038522a0cc0SAlex Elder int ret = 0; 4039dfc5606dSYehuda Sadeh 40409fcbb800SAlex Elder dout("%s: snap count is %u\n", __func__, (unsigned int)snap_count); 404135938150SAlex Elder while (index < snap_count || links != head) { 404235938150SAlex Elder u64 snap_id; 404335938150SAlex Elder struct rbd_snap *snap; 4044cd892126SAlex Elder char *snap_name; 4045cd892126SAlex Elder u64 snap_size = 0; 4046cd892126SAlex Elder u64 snap_features = 0; 4047dfc5606dSYehuda Sadeh 404835938150SAlex Elder snap_id = index < snap_count ? snapc->snaps[index] 404935938150SAlex Elder : CEPH_NOSNAP; 405035938150SAlex Elder snap = links != head ? list_entry(links, struct rbd_snap, node) 405135938150SAlex Elder : NULL; 4052aafb230eSAlex Elder rbd_assert(!snap || snap->id != CEPH_NOSNAP); 4053dfc5606dSYehuda Sadeh 405435938150SAlex Elder if (snap_id == CEPH_NOSNAP || (snap && snap->id > snap_id)) { 405535938150SAlex Elder struct list_head *next = links->next; 4056dfc5606dSYehuda Sadeh 40576d292906SAlex Elder /* 40586d292906SAlex Elder * A previously-existing snapshot is not in 40596d292906SAlex Elder * the new snap context. 40606d292906SAlex Elder * 4061522a0cc0SAlex Elder * If the now-missing snapshot is the one 4062522a0cc0SAlex Elder * the image represents, clear its existence 4063522a0cc0SAlex Elder * flag so we can avoid sending any more 4064522a0cc0SAlex Elder * requests to it. 
40656d292906SAlex Elder */ 40660d7dbfceSAlex Elder if (rbd_dev->spec->snap_id == snap->id) 40676d292906SAlex Elder clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags); 40683e83b65bSAlex Elder dout("removing %ssnap id %llu\n", 40690d7dbfceSAlex Elder rbd_dev->spec->snap_id == snap->id ? 40700d7dbfceSAlex Elder "mapped " : "", 40719fcbb800SAlex Elder (unsigned long long)snap->id); 40726087b51bSAlex Elder 40736087b51bSAlex Elder list_del(&snap->node); 40746087b51bSAlex Elder rbd_snap_destroy(snap); 4075dfc5606dSYehuda Sadeh 407635938150SAlex Elder /* Done with this list entry; advance */ 407735938150SAlex Elder 407835938150SAlex Elder links = next; 407935938150SAlex Elder continue; 4080dfc5606dSYehuda Sadeh } 408135938150SAlex Elder 4082b8b1e2dbSAlex Elder snap_name = rbd_dev_snap_info(rbd_dev, index, 4083cd892126SAlex Elder &snap_size, &snap_features); 4084522a0cc0SAlex Elder if (IS_ERR(snap_name)) { 4085522a0cc0SAlex Elder ret = PTR_ERR(snap_name); 4086522a0cc0SAlex Elder dout("failed to get snap info, error %d\n", ret); 4087522a0cc0SAlex Elder goto out_err; 4088522a0cc0SAlex Elder } 4089cd892126SAlex Elder 40909fcbb800SAlex Elder dout("entry %u: snap_id = %llu\n", (unsigned int)snap_count, 40919fcbb800SAlex Elder (unsigned long long)snap_id); 409235938150SAlex Elder if (!snap || (snap_id != CEPH_NOSNAP && snap->id < snap_id)) { 409335938150SAlex Elder struct rbd_snap *new_snap; 409435938150SAlex Elder 409535938150SAlex Elder /* We haven't seen this snapshot before */ 409635938150SAlex Elder 40976087b51bSAlex Elder new_snap = rbd_snap_create(rbd_dev, snap_name, 4098cd892126SAlex Elder snap_id, snap_size, snap_features); 40999fcbb800SAlex Elder if (IS_ERR(new_snap)) { 4100522a0cc0SAlex Elder ret = PTR_ERR(new_snap); 4101522a0cc0SAlex Elder dout(" failed to add dev, error %d\n", ret); 4102522a0cc0SAlex Elder goto out_err; 41039fcbb800SAlex Elder } 410435938150SAlex Elder 410535938150SAlex Elder /* New goes before existing, or at end of list */ 410635938150SAlex Elder 
41079fcbb800SAlex Elder dout(" added dev%s\n", snap ? "" : " at end\n"); 410835938150SAlex Elder if (snap) 410935938150SAlex Elder list_add_tail(&new_snap->node, &snap->node); 411035938150SAlex Elder else 4111523f3258SAlex Elder list_add_tail(&new_snap->node, head); 411235938150SAlex Elder } else { 411335938150SAlex Elder /* Already have this one */ 411435938150SAlex Elder 41159fcbb800SAlex Elder dout(" already present\n"); 41169fcbb800SAlex Elder 4117cd892126SAlex Elder rbd_assert(snap->size == snap_size); 4118aafb230eSAlex Elder rbd_assert(!strcmp(snap->name, snap_name)); 4119cd892126SAlex Elder rbd_assert(snap->features == snap_features); 412035938150SAlex Elder 412135938150SAlex Elder /* Done with this list entry; advance */ 412235938150SAlex Elder 412335938150SAlex Elder links = links->next; 4124dfc5606dSYehuda Sadeh } 412535938150SAlex Elder 412635938150SAlex Elder /* Advance to the next entry in the snapshot context */ 412735938150SAlex Elder 412835938150SAlex Elder index++; 4129dfc5606dSYehuda Sadeh } 41309fcbb800SAlex Elder dout("%s: done\n", __func__); 4131dfc5606dSYehuda Sadeh 4132dfc5606dSYehuda Sadeh return 0; 4133522a0cc0SAlex Elder out_err: 4134522a0cc0SAlex Elder rbd_remove_all_snaps(rbd_dev); 4135522a0cc0SAlex Elder 4136522a0cc0SAlex Elder return ret; 4137dfc5606dSYehuda Sadeh } 4138dfc5606dSYehuda Sadeh 4139dfc5606dSYehuda Sadeh static int rbd_bus_add_dev(struct rbd_device *rbd_dev) 4140dfc5606dSYehuda Sadeh { 4141dfc5606dSYehuda Sadeh struct device *dev; 4142cd789ab9SAlex Elder int ret; 4143dfc5606dSYehuda Sadeh 4144dfc5606dSYehuda Sadeh mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); 4145dfc5606dSYehuda Sadeh 4146cd789ab9SAlex Elder dev = &rbd_dev->dev; 4147dfc5606dSYehuda Sadeh dev->bus = &rbd_bus_type; 4148dfc5606dSYehuda Sadeh dev->type = &rbd_device_type; 4149dfc5606dSYehuda Sadeh dev->parent = &rbd_root_dev; 4150dfc5606dSYehuda Sadeh dev->release = rbd_dev_release; 4151de71a297SAlex Elder dev_set_name(dev, "%d", rbd_dev->dev_id); 
4152dfc5606dSYehuda Sadeh ret = device_register(dev); 4153dfc5606dSYehuda Sadeh 4154dfc5606dSYehuda Sadeh mutex_unlock(&ctl_mutex); 4155cd789ab9SAlex Elder 4156dfc5606dSYehuda Sadeh return ret; 4157602adf40SYehuda Sadeh } 4158602adf40SYehuda Sadeh 4159dfc5606dSYehuda Sadeh static void rbd_bus_del_dev(struct rbd_device *rbd_dev) 4160dfc5606dSYehuda Sadeh { 4161dfc5606dSYehuda Sadeh device_unregister(&rbd_dev->dev); 4162dfc5606dSYehuda Sadeh } 4163dfc5606dSYehuda Sadeh 4164e2839308SAlex Elder static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0); 41651ddbe94eSAlex Elder 41661ddbe94eSAlex Elder /* 4167499afd5bSAlex Elder * Get a unique rbd identifier for the given new rbd_dev, and add 4168499afd5bSAlex Elder * the rbd_dev to the global list. The minimum rbd id is 1. 41691ddbe94eSAlex Elder */ 4170e2839308SAlex Elder static void rbd_dev_id_get(struct rbd_device *rbd_dev) 4171b7f23c36SAlex Elder { 4172e2839308SAlex Elder rbd_dev->dev_id = atomic64_inc_return(&rbd_dev_id_max); 4173499afd5bSAlex Elder 4174499afd5bSAlex Elder spin_lock(&rbd_dev_list_lock); 4175499afd5bSAlex Elder list_add_tail(&rbd_dev->node, &rbd_dev_list); 4176499afd5bSAlex Elder spin_unlock(&rbd_dev_list_lock); 4177e2839308SAlex Elder dout("rbd_dev %p given dev id %llu\n", rbd_dev, 4178e2839308SAlex Elder (unsigned long long) rbd_dev->dev_id); 4179b7f23c36SAlex Elder } 4180b7f23c36SAlex Elder 41811ddbe94eSAlex Elder /* 4182499afd5bSAlex Elder * Remove an rbd_dev from the global list, and record that its 4183499afd5bSAlex Elder * identifier is no longer in use. 
41841ddbe94eSAlex Elder */ 4185e2839308SAlex Elder static void rbd_dev_id_put(struct rbd_device *rbd_dev) 41861ddbe94eSAlex Elder { 4187d184f6bfSAlex Elder struct list_head *tmp; 4188de71a297SAlex Elder int rbd_id = rbd_dev->dev_id; 4189d184f6bfSAlex Elder int max_id; 4190d184f6bfSAlex Elder 4191aafb230eSAlex Elder rbd_assert(rbd_id > 0); 4192499afd5bSAlex Elder 4193e2839308SAlex Elder dout("rbd_dev %p released dev id %llu\n", rbd_dev, 4194e2839308SAlex Elder (unsigned long long) rbd_dev->dev_id); 4195499afd5bSAlex Elder spin_lock(&rbd_dev_list_lock); 4196499afd5bSAlex Elder list_del_init(&rbd_dev->node); 4197d184f6bfSAlex Elder 4198d184f6bfSAlex Elder /* 4199d184f6bfSAlex Elder * If the id being "put" is not the current maximum, there 4200d184f6bfSAlex Elder * is nothing special we need to do. 4201d184f6bfSAlex Elder */ 4202e2839308SAlex Elder if (rbd_id != atomic64_read(&rbd_dev_id_max)) { 4203d184f6bfSAlex Elder spin_unlock(&rbd_dev_list_lock); 4204d184f6bfSAlex Elder return; 4205d184f6bfSAlex Elder } 4206d184f6bfSAlex Elder 4207d184f6bfSAlex Elder /* 4208d184f6bfSAlex Elder * We need to update the current maximum id. Search the 4209d184f6bfSAlex Elder * list to find out what it is. We're more likely to find 4210d184f6bfSAlex Elder * the maximum at the end, so search the list backward. 4211d184f6bfSAlex Elder */ 4212d184f6bfSAlex Elder max_id = 0; 4213d184f6bfSAlex Elder list_for_each_prev(tmp, &rbd_dev_list) { 4214d184f6bfSAlex Elder struct rbd_device *rbd_dev; 4215d184f6bfSAlex Elder 4216d184f6bfSAlex Elder rbd_dev = list_entry(tmp, struct rbd_device, node); 4217b213e0b1SAlex Elder if (rbd_dev->dev_id > max_id) 4218b213e0b1SAlex Elder max_id = rbd_dev->dev_id; 4219d184f6bfSAlex Elder } 4220499afd5bSAlex Elder spin_unlock(&rbd_dev_list_lock); 42211ddbe94eSAlex Elder 42221ddbe94eSAlex Elder /* 4223e2839308SAlex Elder * The max id could have been updated by rbd_dev_id_get(), in 4224d184f6bfSAlex Elder * which case it now accurately reflects the new maximum. 
4225d184f6bfSAlex Elder * Be careful not to overwrite the maximum value in that 4226d184f6bfSAlex Elder * case. 42271ddbe94eSAlex Elder */ 4228e2839308SAlex Elder atomic64_cmpxchg(&rbd_dev_id_max, rbd_id, max_id); 4229e2839308SAlex Elder dout(" max dev id has been reset\n"); 4230b7f23c36SAlex Elder } 4231b7f23c36SAlex Elder 4232a725f65eSAlex Elder /* 4233e28fff26SAlex Elder * Skips over white space at *buf, and updates *buf to point to the 4234e28fff26SAlex Elder * first found non-space character (if any). Returns the length of 4235593a9e7bSAlex Elder * the token (string of non-white space characters) found. Note 4236593a9e7bSAlex Elder * that *buf must be terminated with '\0'. 4237e28fff26SAlex Elder */ 4238e28fff26SAlex Elder static inline size_t next_token(const char **buf) 4239e28fff26SAlex Elder { 4240e28fff26SAlex Elder /* 4241e28fff26SAlex Elder * These are the characters that produce nonzero for 4242e28fff26SAlex Elder * isspace() in the "C" and "POSIX" locales. 4243e28fff26SAlex Elder */ 4244e28fff26SAlex Elder const char *spaces = " \f\n\r\t\v"; 4245e28fff26SAlex Elder 4246e28fff26SAlex Elder *buf += strspn(*buf, spaces); /* Find start of token */ 4247e28fff26SAlex Elder 4248e28fff26SAlex Elder return strcspn(*buf, spaces); /* Return token length */ 4249e28fff26SAlex Elder } 4250e28fff26SAlex Elder 4251e28fff26SAlex Elder /* 4252e28fff26SAlex Elder * Finds the next token in *buf, and if the provided token buffer is 4253e28fff26SAlex Elder * big enough, copies the found token into it. The result, if 4254593a9e7bSAlex Elder * copied, is guaranteed to be terminated with '\0'. Note that *buf 4255593a9e7bSAlex Elder * must be terminated with '\0' on entry. 4256e28fff26SAlex Elder * 4257e28fff26SAlex Elder * Returns the length of the token found (not including the '\0'). 4258e28fff26SAlex Elder * Return value will be 0 if no token is found, and it will be >= 4259e28fff26SAlex Elder * token_size if the token would not fit. 
4260e28fff26SAlex Elder * 4261593a9e7bSAlex Elder * The *buf pointer will be updated to point beyond the end of the 4262e28fff26SAlex Elder * found token. Note that this occurs even if the token buffer is 4263e28fff26SAlex Elder * too small to hold it. 4264e28fff26SAlex Elder */ 4265e28fff26SAlex Elder static inline size_t copy_token(const char **buf, 4266e28fff26SAlex Elder char *token, 4267e28fff26SAlex Elder size_t token_size) 4268e28fff26SAlex Elder { 4269e28fff26SAlex Elder size_t len; 4270e28fff26SAlex Elder 4271e28fff26SAlex Elder len = next_token(buf); 4272e28fff26SAlex Elder if (len < token_size) { 4273e28fff26SAlex Elder memcpy(token, *buf, len); 4274e28fff26SAlex Elder *(token + len) = '\0'; 4275e28fff26SAlex Elder } 4276e28fff26SAlex Elder *buf += len; 4277e28fff26SAlex Elder 4278e28fff26SAlex Elder return len; 4279e28fff26SAlex Elder } 4280e28fff26SAlex Elder 4281e28fff26SAlex Elder /* 4282ea3352f4SAlex Elder * Finds the next token in *buf, dynamically allocates a buffer big 4283ea3352f4SAlex Elder * enough to hold a copy of it, and copies the token into the new 4284ea3352f4SAlex Elder * buffer. The copy is guaranteed to be terminated with '\0'. Note 4285ea3352f4SAlex Elder * that a duplicate buffer is created even for a zero-length token. 4286ea3352f4SAlex Elder * 4287ea3352f4SAlex Elder * Returns a pointer to the newly-allocated duplicate, or a null 4288ea3352f4SAlex Elder * pointer if memory for the duplicate was not available. If 4289ea3352f4SAlex Elder * the lenp argument is a non-null pointer, the length of the token 4290ea3352f4SAlex Elder * (not including the '\0') is returned in *lenp. 4291ea3352f4SAlex Elder * 4292ea3352f4SAlex Elder * If successful, the *buf pointer will be updated to point beyond 4293ea3352f4SAlex Elder * the end of the found token. 4294ea3352f4SAlex Elder * 4295ea3352f4SAlex Elder * Note: uses GFP_KERNEL for allocation. 
4296ea3352f4SAlex Elder */ 4297ea3352f4SAlex Elder static inline char *dup_token(const char **buf, size_t *lenp) 4298ea3352f4SAlex Elder { 4299ea3352f4SAlex Elder char *dup; 4300ea3352f4SAlex Elder size_t len; 4301ea3352f4SAlex Elder 4302ea3352f4SAlex Elder len = next_token(buf); 43034caf35f9SAlex Elder dup = kmemdup(*buf, len + 1, GFP_KERNEL); 4304ea3352f4SAlex Elder if (!dup) 4305ea3352f4SAlex Elder return NULL; 4306ea3352f4SAlex Elder *(dup + len) = '\0'; 4307ea3352f4SAlex Elder *buf += len; 4308ea3352f4SAlex Elder 4309ea3352f4SAlex Elder if (lenp) 4310ea3352f4SAlex Elder *lenp = len; 4311ea3352f4SAlex Elder 4312ea3352f4SAlex Elder return dup; 4313ea3352f4SAlex Elder } 4314ea3352f4SAlex Elder 4315ea3352f4SAlex Elder /* 4316859c31dfSAlex Elder * Parse the options provided for an "rbd add" (i.e., rbd image 4317859c31dfSAlex Elder * mapping) request. These arrive via a write to /sys/bus/rbd/add, 4318859c31dfSAlex Elder * and the data written is passed here via a NUL-terminated buffer. 4319859c31dfSAlex Elder * Returns 0 if successful or an error code otherwise. 4320d22f76e7SAlex Elder * 4321859c31dfSAlex Elder * The information extracted from these options is recorded in 4322859c31dfSAlex Elder * the other parameters which return dynamically-allocated 4323859c31dfSAlex Elder * structures: 4324859c31dfSAlex Elder * ceph_opts 4325859c31dfSAlex Elder * The address of a pointer that will refer to a ceph options 4326859c31dfSAlex Elder * structure. Caller must release the returned pointer using 4327859c31dfSAlex Elder * ceph_destroy_options() when it is no longer needed. 4328859c31dfSAlex Elder * rbd_opts 4329859c31dfSAlex Elder * Address of an rbd options pointer. Fully initialized by 4330859c31dfSAlex Elder * this function; caller must release with kfree(). 4331859c31dfSAlex Elder * spec 4332859c31dfSAlex Elder * Address of an rbd image specification pointer. Fully 4333859c31dfSAlex Elder * initialized by this function based on parsed options. 
4334859c31dfSAlex Elder * Caller must release with rbd_spec_put(). 4335859c31dfSAlex Elder * 4336859c31dfSAlex Elder * The options passed take this form: 4337859c31dfSAlex Elder * <mon_addrs> <options> <pool_name> <image_name> [<snap_id>] 4338859c31dfSAlex Elder * where: 4339859c31dfSAlex Elder * <mon_addrs> 4340859c31dfSAlex Elder * A comma-separated list of one or more monitor addresses. 4341859c31dfSAlex Elder * A monitor address is an ip address, optionally followed 4342859c31dfSAlex Elder * by a port number (separated by a colon). 4343859c31dfSAlex Elder * I.e.: ip1[:port1][,ip2[:port2]...] 4344859c31dfSAlex Elder * <options> 4345859c31dfSAlex Elder * A comma-separated list of ceph and/or rbd options. 4346859c31dfSAlex Elder * <pool_name> 4347859c31dfSAlex Elder * The name of the rados pool containing the rbd image. 4348859c31dfSAlex Elder * <image_name> 4349859c31dfSAlex Elder * The name of the image in that pool to map. 4350859c31dfSAlex Elder * <snap_id> 4351859c31dfSAlex Elder * An optional snapshot id. If provided, the mapping will 4352859c31dfSAlex Elder * present data from the image at the time that snapshot was 4353859c31dfSAlex Elder * created. The image head is used if no snapshot id is 4354859c31dfSAlex Elder * provided. Snapshot mappings are always read-only. 
4355a725f65eSAlex Elder */ 4356859c31dfSAlex Elder static int rbd_add_parse_args(const char *buf, 4357dc79b113SAlex Elder struct ceph_options **ceph_opts, 4358859c31dfSAlex Elder struct rbd_options **opts, 4359859c31dfSAlex Elder struct rbd_spec **rbd_spec) 4360a725f65eSAlex Elder { 4361e28fff26SAlex Elder size_t len; 4362859c31dfSAlex Elder char *options; 43630ddebc0cSAlex Elder const char *mon_addrs; 43640ddebc0cSAlex Elder size_t mon_addrs_size; 4365859c31dfSAlex Elder struct rbd_spec *spec = NULL; 43664e9afebaSAlex Elder struct rbd_options *rbd_opts = NULL; 4367859c31dfSAlex Elder struct ceph_options *copts; 4368dc79b113SAlex Elder int ret; 4369e28fff26SAlex Elder 4370e28fff26SAlex Elder /* The first four tokens are required */ 4371e28fff26SAlex Elder 43727ef3214aSAlex Elder len = next_token(&buf); 43734fb5d671SAlex Elder if (!len) { 43744fb5d671SAlex Elder rbd_warn(NULL, "no monitor address(es) provided"); 43754fb5d671SAlex Elder return -EINVAL; 43764fb5d671SAlex Elder } 43770ddebc0cSAlex Elder mon_addrs = buf; 4378f28e565aSAlex Elder mon_addrs_size = len + 1; 43797ef3214aSAlex Elder buf += len; 4380a725f65eSAlex Elder 4381dc79b113SAlex Elder ret = -EINVAL; 4382f28e565aSAlex Elder options = dup_token(&buf, NULL); 4383f28e565aSAlex Elder if (!options) 4384dc79b113SAlex Elder return -ENOMEM; 43854fb5d671SAlex Elder if (!*options) { 43864fb5d671SAlex Elder rbd_warn(NULL, "no options provided"); 43874fb5d671SAlex Elder goto out_err; 43884fb5d671SAlex Elder } 4389a725f65eSAlex Elder 4390859c31dfSAlex Elder spec = rbd_spec_alloc(); 4391859c31dfSAlex Elder if (!spec) 4392f28e565aSAlex Elder goto out_mem; 4393859c31dfSAlex Elder 4394859c31dfSAlex Elder spec->pool_name = dup_token(&buf, NULL); 4395859c31dfSAlex Elder if (!spec->pool_name) 4396859c31dfSAlex Elder goto out_mem; 43974fb5d671SAlex Elder if (!*spec->pool_name) { 43984fb5d671SAlex Elder rbd_warn(NULL, "no pool name provided"); 43994fb5d671SAlex Elder goto out_err; 44004fb5d671SAlex Elder } 4401e28fff26SAlex 
Elder 440269e7a02fSAlex Elder spec->image_name = dup_token(&buf, NULL); 4403859c31dfSAlex Elder if (!spec->image_name) 4404f28e565aSAlex Elder goto out_mem; 44054fb5d671SAlex Elder if (!*spec->image_name) { 44064fb5d671SAlex Elder rbd_warn(NULL, "no image name provided"); 44074fb5d671SAlex Elder goto out_err; 44084fb5d671SAlex Elder } 4409e28fff26SAlex Elder 4410f28e565aSAlex Elder /* 4411f28e565aSAlex Elder * Snapshot name is optional; default is to use "-" 4412f28e565aSAlex Elder * (indicating the head/no snapshot). 4413f28e565aSAlex Elder */ 44143feeb894SAlex Elder len = next_token(&buf); 4415820a5f3eSAlex Elder if (!len) { 44163feeb894SAlex Elder buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */ 44173feeb894SAlex Elder len = sizeof (RBD_SNAP_HEAD_NAME) - 1; 4418f28e565aSAlex Elder } else if (len > RBD_MAX_SNAP_NAME_LEN) { 4419dc79b113SAlex Elder ret = -ENAMETOOLONG; 4420f28e565aSAlex Elder goto out_err; 4421849b4260SAlex Elder } 44224caf35f9SAlex Elder spec->snap_name = kmemdup(buf, len + 1, GFP_KERNEL); 4423859c31dfSAlex Elder if (!spec->snap_name) 4424f28e565aSAlex Elder goto out_mem; 4425859c31dfSAlex Elder *(spec->snap_name + len) = '\0'; 4426e5c35534SAlex Elder 44270ddebc0cSAlex Elder /* Initialize all rbd options to the defaults */ 4428e28fff26SAlex Elder 44294e9afebaSAlex Elder rbd_opts = kzalloc(sizeof (*rbd_opts), GFP_KERNEL); 44304e9afebaSAlex Elder if (!rbd_opts) 44314e9afebaSAlex Elder goto out_mem; 44324e9afebaSAlex Elder 44334e9afebaSAlex Elder rbd_opts->read_only = RBD_READ_ONLY_DEFAULT; 4434d22f76e7SAlex Elder 4435859c31dfSAlex Elder copts = ceph_parse_options(options, mon_addrs, 44360ddebc0cSAlex Elder mon_addrs + mon_addrs_size - 1, 44374e9afebaSAlex Elder parse_rbd_opts_token, rbd_opts); 4438859c31dfSAlex Elder if (IS_ERR(copts)) { 4439859c31dfSAlex Elder ret = PTR_ERR(copts); 4440dc79b113SAlex Elder goto out_err; 4441dc79b113SAlex Elder } 4442859c31dfSAlex Elder kfree(options); 4443859c31dfSAlex Elder 4444859c31dfSAlex Elder *ceph_opts = 
copts; 44454e9afebaSAlex Elder *opts = rbd_opts; 4446859c31dfSAlex Elder *rbd_spec = spec; 44470ddebc0cSAlex Elder 4448dc79b113SAlex Elder return 0; 4449f28e565aSAlex Elder out_mem: 4450dc79b113SAlex Elder ret = -ENOMEM; 4451d22f76e7SAlex Elder out_err: 4452859c31dfSAlex Elder kfree(rbd_opts); 4453859c31dfSAlex Elder rbd_spec_put(spec); 4454f28e565aSAlex Elder kfree(options); 4455d22f76e7SAlex Elder 4456dc79b113SAlex Elder return ret; 4457a725f65eSAlex Elder } 4458a725f65eSAlex Elder 4459589d30e0SAlex Elder /* 4460589d30e0SAlex Elder * An rbd format 2 image has a unique identifier, distinct from the 4461589d30e0SAlex Elder * name given to it by the user. Internally, that identifier is 4462589d30e0SAlex Elder * what's used to specify the names of objects related to the image. 4463589d30e0SAlex Elder * 4464589d30e0SAlex Elder * A special "rbd id" object is used to map an rbd image name to its 4465589d30e0SAlex Elder * id. If that object doesn't exist, then there is no v2 rbd image 4466589d30e0SAlex Elder * with the supplied name. 4467589d30e0SAlex Elder * 4468589d30e0SAlex Elder * This function will record the given rbd_dev's image_id field if 4469589d30e0SAlex Elder * it can be determined, and in that case will return 0. If any 4470589d30e0SAlex Elder * errors occur a negative errno will be returned and the rbd_dev's 4471589d30e0SAlex Elder * image_id field will be unchanged (and should be NULL). 
4472589d30e0SAlex Elder */ 4473589d30e0SAlex Elder static int rbd_dev_image_id(struct rbd_device *rbd_dev) 4474589d30e0SAlex Elder { 4475589d30e0SAlex Elder int ret; 4476589d30e0SAlex Elder size_t size; 4477589d30e0SAlex Elder char *object_name; 4478589d30e0SAlex Elder void *response; 4479589d30e0SAlex Elder void *p; 4480589d30e0SAlex Elder 44812f82ee54SAlex Elder /* If we already have it we don't need to look it up */ 44822f82ee54SAlex Elder 44832f82ee54SAlex Elder if (rbd_dev->spec->image_id) 44842f82ee54SAlex Elder return 0; 44852f82ee54SAlex Elder 4486589d30e0SAlex Elder /* 44872c0d0a10SAlex Elder * When probing a parent image, the image id is already 44882c0d0a10SAlex Elder * known (and the image name likely is not). There's no 44892c0d0a10SAlex Elder * need to fetch the image id again in this case. 44902c0d0a10SAlex Elder */ 44912c0d0a10SAlex Elder if (rbd_dev->spec->image_id) 44922c0d0a10SAlex Elder return 0; 44932c0d0a10SAlex Elder 44942c0d0a10SAlex Elder /* 4495589d30e0SAlex Elder * First, see if the format 2 image id file exists, and if 4496589d30e0SAlex Elder * so, get the image's persistent id from it. 
4497589d30e0SAlex Elder */ 449869e7a02fSAlex Elder size = sizeof (RBD_ID_PREFIX) + strlen(rbd_dev->spec->image_name); 4499589d30e0SAlex Elder object_name = kmalloc(size, GFP_NOIO); 4500589d30e0SAlex Elder if (!object_name) 4501589d30e0SAlex Elder return -ENOMEM; 45020d7dbfceSAlex Elder sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->spec->image_name); 4503589d30e0SAlex Elder dout("rbd id object name is %s\n", object_name); 4504589d30e0SAlex Elder 4505589d30e0SAlex Elder /* Response will be an encoded string, which includes a length */ 4506589d30e0SAlex Elder 4507589d30e0SAlex Elder size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX; 4508589d30e0SAlex Elder response = kzalloc(size, GFP_NOIO); 4509589d30e0SAlex Elder if (!response) { 4510589d30e0SAlex Elder ret = -ENOMEM; 4511589d30e0SAlex Elder goto out; 4512589d30e0SAlex Elder } 4513589d30e0SAlex Elder 451436be9a76SAlex Elder ret = rbd_obj_method_sync(rbd_dev, object_name, 45154157976bSAlex Elder "rbd", "get_id", NULL, 0, 451607b2391fSAlex Elder response, RBD_IMAGE_ID_LEN_MAX, NULL); 451736be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 4518589d30e0SAlex Elder if (ret < 0) 4519589d30e0SAlex Elder goto out; 4520589d30e0SAlex Elder 4521589d30e0SAlex Elder p = response; 45220d7dbfceSAlex Elder rbd_dev->spec->image_id = ceph_extract_encoded_string(&p, 452357385b51SAlex Elder p + ret, 4524979ed480SAlex Elder NULL, GFP_NOIO); 452557385b51SAlex Elder ret = 0; 452657385b51SAlex Elder 45270d7dbfceSAlex Elder if (IS_ERR(rbd_dev->spec->image_id)) { 45280d7dbfceSAlex Elder ret = PTR_ERR(rbd_dev->spec->image_id); 45290d7dbfceSAlex Elder rbd_dev->spec->image_id = NULL; 4530589d30e0SAlex Elder } else { 45310d7dbfceSAlex Elder dout("image_id is %s\n", rbd_dev->spec->image_id); 4532589d30e0SAlex Elder } 4533589d30e0SAlex Elder out: 4534589d30e0SAlex Elder kfree(response); 4535589d30e0SAlex Elder kfree(object_name); 4536589d30e0SAlex Elder 4537589d30e0SAlex Elder return ret; 4538589d30e0SAlex Elder } 
4539589d30e0SAlex Elder 4540a30b71b9SAlex Elder static int rbd_dev_v1_probe(struct rbd_device *rbd_dev) 4541a30b71b9SAlex Elder { 4542a30b71b9SAlex Elder int ret; 4543a30b71b9SAlex Elder size_t size; 4544a30b71b9SAlex Elder 4545a30b71b9SAlex Elder /* Version 1 images have no id; empty string is used */ 4546a30b71b9SAlex Elder 45470d7dbfceSAlex Elder rbd_dev->spec->image_id = kstrdup("", GFP_KERNEL); 45480d7dbfceSAlex Elder if (!rbd_dev->spec->image_id) 4549a30b71b9SAlex Elder return -ENOMEM; 4550a30b71b9SAlex Elder 4551a30b71b9SAlex Elder /* Record the header object name for this rbd image. */ 4552a30b71b9SAlex Elder 455369e7a02fSAlex Elder size = strlen(rbd_dev->spec->image_name) + sizeof (RBD_SUFFIX); 4554a30b71b9SAlex Elder rbd_dev->header_name = kmalloc(size, GFP_KERNEL); 4555a30b71b9SAlex Elder if (!rbd_dev->header_name) { 4556a30b71b9SAlex Elder ret = -ENOMEM; 4557a30b71b9SAlex Elder goto out_err; 4558a30b71b9SAlex Elder } 45590d7dbfceSAlex Elder sprintf(rbd_dev->header_name, "%s%s", 45600d7dbfceSAlex Elder rbd_dev->spec->image_name, RBD_SUFFIX); 4561a30b71b9SAlex Elder 4562a30b71b9SAlex Elder /* Populate rbd image metadata */ 4563a30b71b9SAlex Elder 4564a30b71b9SAlex Elder ret = rbd_read_header(rbd_dev, &rbd_dev->header); 4565a30b71b9SAlex Elder if (ret < 0) 4566a30b71b9SAlex Elder goto out_err; 456786b00e0dSAlex Elder 456886b00e0dSAlex Elder /* Version 1 images have no parent (no layering) */ 456986b00e0dSAlex Elder 457086b00e0dSAlex Elder rbd_dev->parent_spec = NULL; 457186b00e0dSAlex Elder rbd_dev->parent_overlap = 0; 457286b00e0dSAlex Elder 4573a30b71b9SAlex Elder rbd_dev->image_format = 1; 4574a30b71b9SAlex Elder 4575a30b71b9SAlex Elder dout("discovered version 1 image, header name is %s\n", 4576a30b71b9SAlex Elder rbd_dev->header_name); 4577a30b71b9SAlex Elder 4578a30b71b9SAlex Elder return 0; 4579a30b71b9SAlex Elder 4580a30b71b9SAlex Elder out_err: 4581a30b71b9SAlex Elder kfree(rbd_dev->header_name); 4582a30b71b9SAlex Elder rbd_dev->header_name = 
NULL; 45830d7dbfceSAlex Elder kfree(rbd_dev->spec->image_id); 45840d7dbfceSAlex Elder rbd_dev->spec->image_id = NULL; 4585a30b71b9SAlex Elder 4586a30b71b9SAlex Elder return ret; 4587a30b71b9SAlex Elder } 4588a30b71b9SAlex Elder 4589a30b71b9SAlex Elder static int rbd_dev_v2_probe(struct rbd_device *rbd_dev) 4590a30b71b9SAlex Elder { 4591a30b71b9SAlex Elder size_t size; 45929d475de5SAlex Elder int ret; 45936e14b1a6SAlex Elder u64 ver = 0; 4594a30b71b9SAlex Elder 4595a30b71b9SAlex Elder /* 4596a30b71b9SAlex Elder * Image id was filled in by the caller. Record the header 4597a30b71b9SAlex Elder * object name for this rbd image. 4598a30b71b9SAlex Elder */ 4599979ed480SAlex Elder size = sizeof (RBD_HEADER_PREFIX) + strlen(rbd_dev->spec->image_id); 4600a30b71b9SAlex Elder rbd_dev->header_name = kmalloc(size, GFP_KERNEL); 4601a30b71b9SAlex Elder if (!rbd_dev->header_name) 4602a30b71b9SAlex Elder return -ENOMEM; 4603a30b71b9SAlex Elder sprintf(rbd_dev->header_name, "%s%s", 46040d7dbfceSAlex Elder RBD_HEADER_PREFIX, rbd_dev->spec->image_id); 46059d475de5SAlex Elder 46069d475de5SAlex Elder /* Get the size and object order for the image */ 46079d475de5SAlex Elder ret = rbd_dev_v2_image_size(rbd_dev); 460857385b51SAlex Elder if (ret) 46099d475de5SAlex Elder goto out_err; 46101e130199SAlex Elder 46111e130199SAlex Elder /* Get the object prefix (a.k.a. 
block_name) for the image */ 46121e130199SAlex Elder 46131e130199SAlex Elder ret = rbd_dev_v2_object_prefix(rbd_dev); 461457385b51SAlex Elder if (ret) 46151e130199SAlex Elder goto out_err; 4616b1b5402aSAlex Elder 4617d889140cSAlex Elder /* Get the and check features for the image */ 4618b1b5402aSAlex Elder 4619b1b5402aSAlex Elder ret = rbd_dev_v2_features(rbd_dev); 462057385b51SAlex Elder if (ret) 4621b1b5402aSAlex Elder goto out_err; 462235d489f9SAlex Elder 462386b00e0dSAlex Elder /* If the image supports layering, get the parent info */ 462486b00e0dSAlex Elder 462586b00e0dSAlex Elder if (rbd_dev->header.features & RBD_FEATURE_LAYERING) { 462686b00e0dSAlex Elder ret = rbd_dev_v2_parent_info(rbd_dev); 462757385b51SAlex Elder if (ret) 462886b00e0dSAlex Elder goto out_err; 4629770eba6eSAlex Elder rbd_warn(rbd_dev, "WARNING: kernel support for " 4630770eba6eSAlex Elder "layered rbd images is EXPERIMENTAL!"); 463186b00e0dSAlex Elder } 463286b00e0dSAlex Elder 4633cc070d59SAlex Elder /* If the image supports fancy striping, get its parameters */ 4634cc070d59SAlex Elder 4635cc070d59SAlex Elder if (rbd_dev->header.features & RBD_FEATURE_STRIPINGV2) { 4636cc070d59SAlex Elder ret = rbd_dev_v2_striping_info(rbd_dev); 4637cc070d59SAlex Elder if (ret < 0) 4638cc070d59SAlex Elder goto out_err; 4639cc070d59SAlex Elder } 4640cc070d59SAlex Elder 46416e14b1a6SAlex Elder /* crypto and compression type aren't (yet) supported for v2 images */ 464235d489f9SAlex Elder 46436e14b1a6SAlex Elder rbd_dev->header.crypt_type = 0; 46446e14b1a6SAlex Elder rbd_dev->header.comp_type = 0; 46456e14b1a6SAlex Elder 46466e14b1a6SAlex Elder /* Get the snapshot context, plus the header version */ 46476e14b1a6SAlex Elder 46486e14b1a6SAlex Elder ret = rbd_dev_v2_snap_context(rbd_dev, &ver); 464935d489f9SAlex Elder if (ret) 465035d489f9SAlex Elder goto out_err; 46516e14b1a6SAlex Elder rbd_dev->header.obj_version = ver; 46526e14b1a6SAlex Elder 4653a30b71b9SAlex Elder rbd_dev->image_format = 2; 
4654a30b71b9SAlex Elder 4655a30b71b9SAlex Elder dout("discovered version 2 image, header name is %s\n", 4656a30b71b9SAlex Elder rbd_dev->header_name); 4657a30b71b9SAlex Elder 465835152979SAlex Elder return 0; 46599d475de5SAlex Elder out_err: 466086b00e0dSAlex Elder rbd_dev->parent_overlap = 0; 466186b00e0dSAlex Elder rbd_spec_put(rbd_dev->parent_spec); 466286b00e0dSAlex Elder rbd_dev->parent_spec = NULL; 46639d475de5SAlex Elder kfree(rbd_dev->header_name); 46649d475de5SAlex Elder rbd_dev->header_name = NULL; 46651e130199SAlex Elder kfree(rbd_dev->header.object_prefix); 46661e130199SAlex Elder rbd_dev->header.object_prefix = NULL; 46679d475de5SAlex Elder 46689d475de5SAlex Elder return ret; 4669a30b71b9SAlex Elder } 4670a30b71b9SAlex Elder 467183a06263SAlex Elder static int rbd_dev_probe_finish(struct rbd_device *rbd_dev) 467283a06263SAlex Elder { 46732f82ee54SAlex Elder struct rbd_device *parent = NULL; 46742f82ee54SAlex Elder struct rbd_spec *parent_spec = NULL; 46752f82ee54SAlex Elder struct rbd_client *rbdc = NULL; 467683a06263SAlex Elder int ret; 467783a06263SAlex Elder 467883a06263SAlex Elder /* no need to lock here, as rbd_dev is not registered yet */ 467983a06263SAlex Elder ret = rbd_dev_snaps_update(rbd_dev); 468083a06263SAlex Elder if (ret) 468183a06263SAlex Elder return ret; 468283a06263SAlex Elder 46839e15b77dSAlex Elder ret = rbd_dev_probe_update_spec(rbd_dev); 46849e15b77dSAlex Elder if (ret) 46859e15b77dSAlex Elder goto err_out_snaps; 46869e15b77dSAlex Elder 468783a06263SAlex Elder ret = rbd_dev_set_mapping(rbd_dev); 468883a06263SAlex Elder if (ret) 468983a06263SAlex Elder goto err_out_snaps; 469083a06263SAlex Elder 469183a06263SAlex Elder /* generate unique id: find highest unique id, add one */ 469283a06263SAlex Elder rbd_dev_id_get(rbd_dev); 469383a06263SAlex Elder 469483a06263SAlex Elder /* Fill in the device name, now that we have its id. 
*/ 469583a06263SAlex Elder BUILD_BUG_ON(DEV_NAME_LEN 469683a06263SAlex Elder < sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH); 469783a06263SAlex Elder sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id); 469883a06263SAlex Elder 469983a06263SAlex Elder /* Get our block major device number. */ 470083a06263SAlex Elder 470183a06263SAlex Elder ret = register_blkdev(0, rbd_dev->name); 470283a06263SAlex Elder if (ret < 0) 470383a06263SAlex Elder goto err_out_id; 470483a06263SAlex Elder rbd_dev->major = ret; 470583a06263SAlex Elder 470683a06263SAlex Elder /* Set up the blkdev mapping. */ 470783a06263SAlex Elder 470883a06263SAlex Elder ret = rbd_init_disk(rbd_dev); 470983a06263SAlex Elder if (ret) 471083a06263SAlex Elder goto err_out_blkdev; 471183a06263SAlex Elder 471283a06263SAlex Elder ret = rbd_bus_add_dev(rbd_dev); 471383a06263SAlex Elder if (ret) 471483a06263SAlex Elder goto err_out_disk; 471583a06263SAlex Elder 471683a06263SAlex Elder /* 471783a06263SAlex Elder * At this point cleanup in the event of an error is the job 471883a06263SAlex Elder * of the sysfs code (initiated by rbd_bus_del_dev()). 471983a06263SAlex Elder */ 47202f82ee54SAlex Elder /* Probe the parent if there is one */ 47212f82ee54SAlex Elder 47222f82ee54SAlex Elder if (rbd_dev->parent_spec) { 47232f82ee54SAlex Elder /* 47242f82ee54SAlex Elder * We need to pass a reference to the client and the 47252f82ee54SAlex Elder * parent spec when creating the parent rbd_dev. 47262f82ee54SAlex Elder * Images related by parent/child relationships 47272f82ee54SAlex Elder * always share both. 
47282f82ee54SAlex Elder */ 47292f82ee54SAlex Elder parent_spec = rbd_spec_get(rbd_dev->parent_spec); 47302f82ee54SAlex Elder rbdc = __rbd_get_client(rbd_dev->rbd_client); 47312f82ee54SAlex Elder 47322f82ee54SAlex Elder parent = rbd_dev_create(rbdc, parent_spec); 47332f82ee54SAlex Elder if (!parent) { 47342f82ee54SAlex Elder ret = -ENOMEM; 47352f82ee54SAlex Elder goto err_out_spec; 47362f82ee54SAlex Elder } 47372f82ee54SAlex Elder rbdc = NULL; /* parent now owns reference */ 47382f82ee54SAlex Elder parent_spec = NULL; /* parent now owns reference */ 47392f82ee54SAlex Elder ret = rbd_dev_probe(parent); 47402f82ee54SAlex Elder if (ret < 0) 47412f82ee54SAlex Elder goto err_out_parent; 47422f82ee54SAlex Elder rbd_dev->parent = parent; 47432f82ee54SAlex Elder } 47442f82ee54SAlex Elder 47459969ebc5SAlex Elder ret = rbd_dev_header_watch_sync(rbd_dev, 1); 474683a06263SAlex Elder if (ret) 474783a06263SAlex Elder goto err_out_bus; 474883a06263SAlex Elder 474983a06263SAlex Elder /* Everything's ready. Announce the disk to the world. 
*/ 475083a06263SAlex Elder 475183a06263SAlex Elder add_disk(rbd_dev->disk); 475283a06263SAlex Elder 475383a06263SAlex Elder pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name, 475483a06263SAlex Elder (unsigned long long) rbd_dev->mapping.size); 475583a06263SAlex Elder 475683a06263SAlex Elder return ret; 47572f82ee54SAlex Elder 47582f82ee54SAlex Elder err_out_parent: 47592f82ee54SAlex Elder rbd_dev_destroy(parent); 47602f82ee54SAlex Elder err_out_spec: 47612f82ee54SAlex Elder rbd_spec_put(parent_spec); 47622f82ee54SAlex Elder rbd_put_client(rbdc); 476383a06263SAlex Elder err_out_bus: 476483a06263SAlex Elder /* this will also clean up rest of rbd_dev stuff */ 476583a06263SAlex Elder 476683a06263SAlex Elder rbd_bus_del_dev(rbd_dev); 476783a06263SAlex Elder 476883a06263SAlex Elder return ret; 476983a06263SAlex Elder err_out_disk: 477083a06263SAlex Elder rbd_free_disk(rbd_dev); 477183a06263SAlex Elder err_out_blkdev: 477283a06263SAlex Elder unregister_blkdev(rbd_dev->major, rbd_dev->name); 477383a06263SAlex Elder err_out_id: 477483a06263SAlex Elder rbd_dev_id_put(rbd_dev); 477583a06263SAlex Elder err_out_snaps: 477683a06263SAlex Elder rbd_remove_all_snaps(rbd_dev); 477783a06263SAlex Elder 477883a06263SAlex Elder return ret; 477983a06263SAlex Elder } 478083a06263SAlex Elder 4781a30b71b9SAlex Elder /* 4782a30b71b9SAlex Elder * Probe for the existence of the header object for the given rbd 4783a30b71b9SAlex Elder * device. For format 2 images this includes determining the image 4784a30b71b9SAlex Elder * id. 4785a30b71b9SAlex Elder */ 4786a30b71b9SAlex Elder static int rbd_dev_probe(struct rbd_device *rbd_dev) 4787a30b71b9SAlex Elder { 4788a30b71b9SAlex Elder int ret; 4789a30b71b9SAlex Elder 4790a30b71b9SAlex Elder /* 4791a30b71b9SAlex Elder * Get the id from the image id object. If it's not a 4792a30b71b9SAlex Elder * format 2 image, we'll get ENOENT back, and we'll assume 4793a30b71b9SAlex Elder * it's a format 1 image. 
4794a30b71b9SAlex Elder */ 4795a30b71b9SAlex Elder ret = rbd_dev_image_id(rbd_dev); 4796a30b71b9SAlex Elder if (ret) 4797a30b71b9SAlex Elder ret = rbd_dev_v1_probe(rbd_dev); 4798a30b71b9SAlex Elder else 4799a30b71b9SAlex Elder ret = rbd_dev_v2_probe(rbd_dev); 480083a06263SAlex Elder if (ret) { 4801a30b71b9SAlex Elder dout("probe failed, returning %d\n", ret); 4802a30b71b9SAlex Elder 4803a30b71b9SAlex Elder return ret; 4804a30b71b9SAlex Elder } 4805a30b71b9SAlex Elder 480683a06263SAlex Elder ret = rbd_dev_probe_finish(rbd_dev); 480783a06263SAlex Elder if (ret) 480883a06263SAlex Elder rbd_header_free(&rbd_dev->header); 480983a06263SAlex Elder 481083a06263SAlex Elder return ret; 481183a06263SAlex Elder } 481283a06263SAlex Elder 481359c2be1eSYehuda Sadeh static ssize_t rbd_add(struct bus_type *bus, 481459c2be1eSYehuda Sadeh const char *buf, 481559c2be1eSYehuda Sadeh size_t count) 4816602adf40SYehuda Sadeh { 4817cb8627c7SAlex Elder struct rbd_device *rbd_dev = NULL; 4818dc79b113SAlex Elder struct ceph_options *ceph_opts = NULL; 48194e9afebaSAlex Elder struct rbd_options *rbd_opts = NULL; 4820859c31dfSAlex Elder struct rbd_spec *spec = NULL; 48219d3997fdSAlex Elder struct rbd_client *rbdc; 482227cc2594SAlex Elder struct ceph_osd_client *osdc; 482327cc2594SAlex Elder int rc = -ENOMEM; 4824602adf40SYehuda Sadeh 4825602adf40SYehuda Sadeh if (!try_module_get(THIS_MODULE)) 4826602adf40SYehuda Sadeh return -ENODEV; 4827602adf40SYehuda Sadeh 4828a725f65eSAlex Elder /* parse add command */ 4829859c31dfSAlex Elder rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec); 4830dc79b113SAlex Elder if (rc < 0) 4831bd4ba655SAlex Elder goto err_out_module; 4832a725f65eSAlex Elder 48339d3997fdSAlex Elder rbdc = rbd_get_client(ceph_opts); 48349d3997fdSAlex Elder if (IS_ERR(rbdc)) { 48359d3997fdSAlex Elder rc = PTR_ERR(rbdc); 48360ddebc0cSAlex Elder goto err_out_args; 48379d3997fdSAlex Elder } 4838c53d5893SAlex Elder ceph_opts = NULL; /* rbd_dev client now owns this */ 
4839602adf40SYehuda Sadeh 4840602adf40SYehuda Sadeh /* pick the pool */ 48419d3997fdSAlex Elder osdc = &rbdc->client->osdc; 4842859c31dfSAlex Elder rc = ceph_pg_poolid_by_name(osdc->osdmap, spec->pool_name); 4843602adf40SYehuda Sadeh if (rc < 0) 4844602adf40SYehuda Sadeh goto err_out_client; 4845859c31dfSAlex Elder spec->pool_id = (u64) rc; 4846859c31dfSAlex Elder 48470903e875SAlex Elder /* The ceph file layout needs to fit pool id in 32 bits */ 48480903e875SAlex Elder 48490903e875SAlex Elder if (WARN_ON(spec->pool_id > (u64) U32_MAX)) { 48500903e875SAlex Elder rc = -EIO; 48510903e875SAlex Elder goto err_out_client; 48520903e875SAlex Elder } 48530903e875SAlex Elder 4854c53d5893SAlex Elder rbd_dev = rbd_dev_create(rbdc, spec); 4855bd4ba655SAlex Elder if (!rbd_dev) 4856bd4ba655SAlex Elder goto err_out_client; 4857c53d5893SAlex Elder rbdc = NULL; /* rbd_dev now owns this */ 4858c53d5893SAlex Elder spec = NULL; /* rbd_dev now owns this */ 4859602adf40SYehuda Sadeh 4860bd4ba655SAlex Elder rbd_dev->mapping.read_only = rbd_opts->read_only; 4861c53d5893SAlex Elder kfree(rbd_opts); 4862c53d5893SAlex Elder rbd_opts = NULL; /* done with this */ 4863bd4ba655SAlex Elder 4864a30b71b9SAlex Elder rc = rbd_dev_probe(rbd_dev); 4865a30b71b9SAlex Elder if (rc < 0) 4866c53d5893SAlex Elder goto err_out_rbd_dev; 486705fd6f6fSAlex Elder 4868602adf40SYehuda Sadeh return count; 4869c53d5893SAlex Elder err_out_rbd_dev: 4870c53d5893SAlex Elder rbd_dev_destroy(rbd_dev); 4871bd4ba655SAlex Elder err_out_client: 48729d3997fdSAlex Elder rbd_put_client(rbdc); 48730ddebc0cSAlex Elder err_out_args: 487478cea76eSAlex Elder if (ceph_opts) 487578cea76eSAlex Elder ceph_destroy_options(ceph_opts); 48764e9afebaSAlex Elder kfree(rbd_opts); 4877859c31dfSAlex Elder rbd_spec_put(spec); 4878bd4ba655SAlex Elder err_out_module: 4879bd4ba655SAlex Elder module_put(THIS_MODULE); 488027cc2594SAlex Elder 4881602adf40SYehuda Sadeh dout("Error adding device %s\n", buf); 488227cc2594SAlex Elder 488327cc2594SAlex Elder 
return (ssize_t) rc; 4884602adf40SYehuda Sadeh } 4885602adf40SYehuda Sadeh 4886de71a297SAlex Elder static struct rbd_device *__rbd_get_dev(unsigned long dev_id) 4887602adf40SYehuda Sadeh { 4888602adf40SYehuda Sadeh struct list_head *tmp; 4889602adf40SYehuda Sadeh struct rbd_device *rbd_dev; 4890602adf40SYehuda Sadeh 4891e124a82fSAlex Elder spin_lock(&rbd_dev_list_lock); 4892602adf40SYehuda Sadeh list_for_each(tmp, &rbd_dev_list) { 4893602adf40SYehuda Sadeh rbd_dev = list_entry(tmp, struct rbd_device, node); 4894de71a297SAlex Elder if (rbd_dev->dev_id == dev_id) { 4895e124a82fSAlex Elder spin_unlock(&rbd_dev_list_lock); 4896602adf40SYehuda Sadeh return rbd_dev; 4897602adf40SYehuda Sadeh } 4898e124a82fSAlex Elder } 4899e124a82fSAlex Elder spin_unlock(&rbd_dev_list_lock); 4900602adf40SYehuda Sadeh return NULL; 4901602adf40SYehuda Sadeh } 4902602adf40SYehuda Sadeh 4903dfc5606dSYehuda Sadeh static void rbd_dev_release(struct device *dev) 4904602adf40SYehuda Sadeh { 4905593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 4906602adf40SYehuda Sadeh 490759c2be1eSYehuda Sadeh if (rbd_dev->watch_event) 49089969ebc5SAlex Elder rbd_dev_header_watch_sync(rbd_dev, 0); 4909602adf40SYehuda Sadeh 4910602adf40SYehuda Sadeh /* clean up and free blkdev */ 4911602adf40SYehuda Sadeh rbd_free_disk(rbd_dev); 4912602adf40SYehuda Sadeh unregister_blkdev(rbd_dev->major, rbd_dev->name); 491332eec68dSAlex Elder 49142ac4e75dSAlex Elder /* release allocated disk header fields */ 49152ac4e75dSAlex Elder rbd_header_free(&rbd_dev->header); 49162ac4e75dSAlex Elder 491732eec68dSAlex Elder /* done with the id, and with the rbd_dev */ 4918e2839308SAlex Elder rbd_dev_id_put(rbd_dev); 4919c53d5893SAlex Elder rbd_assert(rbd_dev->rbd_client != NULL); 4920c53d5893SAlex Elder rbd_dev_destroy(rbd_dev); 4921602adf40SYehuda Sadeh 4922602adf40SYehuda Sadeh /* release module ref */ 4923602adf40SYehuda Sadeh module_put(THIS_MODULE); 4924602adf40SYehuda Sadeh } 4925602adf40SYehuda Sadeh 
49262f82ee54SAlex Elder static void __rbd_remove(struct rbd_device *rbd_dev) 49272f82ee54SAlex Elder { 49282f82ee54SAlex Elder rbd_remove_all_snaps(rbd_dev); 49292f82ee54SAlex Elder rbd_bus_del_dev(rbd_dev); 49302f82ee54SAlex Elder } 49312f82ee54SAlex Elder 4932dfc5606dSYehuda Sadeh static ssize_t rbd_remove(struct bus_type *bus, 4933602adf40SYehuda Sadeh const char *buf, 4934602adf40SYehuda Sadeh size_t count) 4935602adf40SYehuda Sadeh { 4936602adf40SYehuda Sadeh struct rbd_device *rbd_dev = NULL; 4937602adf40SYehuda Sadeh int target_id, rc; 4938602adf40SYehuda Sadeh unsigned long ul; 4939602adf40SYehuda Sadeh int ret = count; 4940602adf40SYehuda Sadeh 4941602adf40SYehuda Sadeh rc = strict_strtoul(buf, 10, &ul); 4942602adf40SYehuda Sadeh if (rc) 4943602adf40SYehuda Sadeh return rc; 4944602adf40SYehuda Sadeh 4945602adf40SYehuda Sadeh /* convert to int; abort if we lost anything in the conversion */ 4946602adf40SYehuda Sadeh target_id = (int) ul; 4947602adf40SYehuda Sadeh if (target_id != ul) 4948602adf40SYehuda Sadeh return -EINVAL; 4949602adf40SYehuda Sadeh 4950602adf40SYehuda Sadeh mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); 4951602adf40SYehuda Sadeh 4952602adf40SYehuda Sadeh rbd_dev = __rbd_get_dev(target_id); 4953602adf40SYehuda Sadeh if (!rbd_dev) { 4954602adf40SYehuda Sadeh ret = -ENOENT; 4955602adf40SYehuda Sadeh goto done; 4956602adf40SYehuda Sadeh } 4957602adf40SYehuda Sadeh 4958a14ea269SAlex Elder spin_lock_irq(&rbd_dev->lock); 4959b82d167bSAlex Elder if (rbd_dev->open_count) 496042382b70SAlex Elder ret = -EBUSY; 4961b82d167bSAlex Elder else 4962b82d167bSAlex Elder set_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags); 4963a14ea269SAlex Elder spin_unlock_irq(&rbd_dev->lock); 4964b82d167bSAlex Elder if (ret < 0) 496542382b70SAlex Elder goto done; 496642382b70SAlex Elder 49672f82ee54SAlex Elder while (rbd_dev->parent_spec) { 49682f82ee54SAlex Elder struct rbd_device *first = rbd_dev; 49692f82ee54SAlex Elder struct rbd_device *second = first->parent; 
49702f82ee54SAlex Elder struct rbd_device *third; 49712f82ee54SAlex Elder 49722f82ee54SAlex Elder /* 49732f82ee54SAlex Elder * Follow to the parent with no grandparent and 49742f82ee54SAlex Elder * remove it. 49752f82ee54SAlex Elder */ 49762f82ee54SAlex Elder while (second && (third = second->parent)) { 49772f82ee54SAlex Elder first = second; 49782f82ee54SAlex Elder second = third; 49792f82ee54SAlex Elder } 49802f82ee54SAlex Elder __rbd_remove(second); 49812f82ee54SAlex Elder rbd_spec_put(first->parent_spec); 49822f82ee54SAlex Elder first->parent_spec = NULL; 49832f82ee54SAlex Elder first->parent_overlap = 0; 49842f82ee54SAlex Elder first->parent = NULL; 49852f82ee54SAlex Elder } 49862f82ee54SAlex Elder __rbd_remove(rbd_dev); 4987602adf40SYehuda Sadeh 4988602adf40SYehuda Sadeh done: 4989602adf40SYehuda Sadeh mutex_unlock(&ctl_mutex); 4990aafb230eSAlex Elder 4991602adf40SYehuda Sadeh return ret; 4992602adf40SYehuda Sadeh } 4993602adf40SYehuda Sadeh 4994602adf40SYehuda Sadeh /* 4995602adf40SYehuda Sadeh * create control files in sysfs 4996dfc5606dSYehuda Sadeh * /sys/bus/rbd/... 
4997602adf40SYehuda Sadeh */ 4998602adf40SYehuda Sadeh static int rbd_sysfs_init(void) 4999602adf40SYehuda Sadeh { 5000dfc5606dSYehuda Sadeh int ret; 5001602adf40SYehuda Sadeh 5002fed4c143SAlex Elder ret = device_register(&rbd_root_dev); 5003dfc5606dSYehuda Sadeh if (ret < 0) 5004dfc5606dSYehuda Sadeh return ret; 5005602adf40SYehuda Sadeh 5006fed4c143SAlex Elder ret = bus_register(&rbd_bus_type); 5007fed4c143SAlex Elder if (ret < 0) 5008fed4c143SAlex Elder device_unregister(&rbd_root_dev); 5009602adf40SYehuda Sadeh 5010602adf40SYehuda Sadeh return ret; 5011602adf40SYehuda Sadeh } 5012602adf40SYehuda Sadeh 5013602adf40SYehuda Sadeh static void rbd_sysfs_cleanup(void) 5014602adf40SYehuda Sadeh { 5015dfc5606dSYehuda Sadeh bus_unregister(&rbd_bus_type); 5016fed4c143SAlex Elder device_unregister(&rbd_root_dev); 5017602adf40SYehuda Sadeh } 5018602adf40SYehuda Sadeh 5019cc344fa1SAlex Elder static int __init rbd_init(void) 5020602adf40SYehuda Sadeh { 5021602adf40SYehuda Sadeh int rc; 5022602adf40SYehuda Sadeh 50231e32d34cSAlex Elder if (!libceph_compatible(NULL)) { 50241e32d34cSAlex Elder rbd_warn(NULL, "libceph incompatibility (quitting)"); 50251e32d34cSAlex Elder 50261e32d34cSAlex Elder return -EINVAL; 50271e32d34cSAlex Elder } 5028602adf40SYehuda Sadeh rc = rbd_sysfs_init(); 5029602adf40SYehuda Sadeh if (rc) 5030602adf40SYehuda Sadeh return rc; 5031f0f8cef5SAlex Elder pr_info("loaded " RBD_DRV_NAME_LONG "\n"); 5032602adf40SYehuda Sadeh return 0; 5033602adf40SYehuda Sadeh } 5034602adf40SYehuda Sadeh 5035cc344fa1SAlex Elder static void __exit rbd_exit(void) 5036602adf40SYehuda Sadeh { 5037602adf40SYehuda Sadeh rbd_sysfs_cleanup(); 5038602adf40SYehuda Sadeh } 5039602adf40SYehuda Sadeh 5040602adf40SYehuda Sadeh module_init(rbd_init); 5041602adf40SYehuda Sadeh module_exit(rbd_exit); 5042602adf40SYehuda Sadeh 5043602adf40SYehuda Sadeh MODULE_AUTHOR("Sage Weil <sage@newdream.net>"); 5044602adf40SYehuda Sadeh MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>"); 
5045602adf40SYehuda Sadeh MODULE_DESCRIPTION("rados block device"); 5046602adf40SYehuda Sadeh 5047602adf40SYehuda Sadeh /* following authorship retained from original osdblk.c */ 5048602adf40SYehuda Sadeh MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>"); 5049602adf40SYehuda Sadeh 5050602adf40SYehuda Sadeh MODULE_LICENSE("GPL"); 5051