1602adf40SYehuda Sadeh /* 2602adf40SYehuda Sadeh rbd.c -- Export ceph rados objects as a Linux block device 3602adf40SYehuda Sadeh 4602adf40SYehuda Sadeh 5602adf40SYehuda Sadeh based on drivers/block/osdblk.c: 6602adf40SYehuda Sadeh 7602adf40SYehuda Sadeh Copyright 2009 Red Hat, Inc. 8602adf40SYehuda Sadeh 9602adf40SYehuda Sadeh This program is free software; you can redistribute it and/or modify 10602adf40SYehuda Sadeh it under the terms of the GNU General Public License as published by 11602adf40SYehuda Sadeh the Free Software Foundation. 12602adf40SYehuda Sadeh 13602adf40SYehuda Sadeh This program is distributed in the hope that it will be useful, 14602adf40SYehuda Sadeh but WITHOUT ANY WARRANTY; without even the implied warranty of 15602adf40SYehuda Sadeh MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16602adf40SYehuda Sadeh GNU General Public License for more details. 17602adf40SYehuda Sadeh 18602adf40SYehuda Sadeh You should have received a copy of the GNU General Public License 19602adf40SYehuda Sadeh along with this program; see the file COPYING. If not, write to 20602adf40SYehuda Sadeh the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. 
21602adf40SYehuda Sadeh 22602adf40SYehuda Sadeh 23602adf40SYehuda Sadeh 24dfc5606dSYehuda Sadeh For usage instructions, please refer to: 25602adf40SYehuda Sadeh 26dfc5606dSYehuda Sadeh Documentation/ABI/testing/sysfs-bus-rbd 27602adf40SYehuda Sadeh 28602adf40SYehuda Sadeh */ 29602adf40SYehuda Sadeh 30602adf40SYehuda Sadeh #include <linux/ceph/libceph.h> 31602adf40SYehuda Sadeh #include <linux/ceph/osd_client.h> 32602adf40SYehuda Sadeh #include <linux/ceph/mon_client.h> 33602adf40SYehuda Sadeh #include <linux/ceph/decode.h> 3459c2be1eSYehuda Sadeh #include <linux/parser.h> 35602adf40SYehuda Sadeh 36602adf40SYehuda Sadeh #include <linux/kernel.h> 37602adf40SYehuda Sadeh #include <linux/device.h> 38602adf40SYehuda Sadeh #include <linux/module.h> 39602adf40SYehuda Sadeh #include <linux/fs.h> 40602adf40SYehuda Sadeh #include <linux/blkdev.h> 41602adf40SYehuda Sadeh 42602adf40SYehuda Sadeh #include "rbd_types.h" 43602adf40SYehuda Sadeh 44aafb230eSAlex Elder #define RBD_DEBUG /* Activate rbd_assert() calls */ 45aafb230eSAlex Elder 46593a9e7bSAlex Elder /* 47593a9e7bSAlex Elder * The basic unit of block I/O is a sector. It is interpreted in a 48593a9e7bSAlex Elder * number of contexts in Linux (blk, bio, genhd), but the default is 49593a9e7bSAlex Elder * universally 512 bytes. These symbols are just slightly more 50593a9e7bSAlex Elder * meaningful than the bare numbers they represent. 
51593a9e7bSAlex Elder */ 52593a9e7bSAlex Elder #define SECTOR_SHIFT 9 53593a9e7bSAlex Elder #define SECTOR_SIZE (1ULL << SECTOR_SHIFT) 54593a9e7bSAlex Elder 55f0f8cef5SAlex Elder #define RBD_DRV_NAME "rbd" 56f0f8cef5SAlex Elder #define RBD_DRV_NAME_LONG "rbd (rados block device)" 57602adf40SYehuda Sadeh 58602adf40SYehuda Sadeh #define RBD_MINORS_PER_MAJOR 256 /* max minors per blkdev */ 59602adf40SYehuda Sadeh 60d4b125e9SAlex Elder #define RBD_SNAP_DEV_NAME_PREFIX "snap_" 61d4b125e9SAlex Elder #define RBD_MAX_SNAP_NAME_LEN \ 62d4b125e9SAlex Elder (NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1)) 63d4b125e9SAlex Elder 6435d489f9SAlex Elder #define RBD_MAX_SNAP_COUNT 510 /* allows max snapc to fit in 4KB */ 65602adf40SYehuda Sadeh 66602adf40SYehuda Sadeh #define RBD_SNAP_HEAD_NAME "-" 67602adf40SYehuda Sadeh 689e15b77dSAlex Elder /* This allows a single page to hold an image name sent by OSD */ 699e15b77dSAlex Elder #define RBD_IMAGE_NAME_LEN_MAX (PAGE_SIZE - sizeof (__le32) - 1) 70589d30e0SAlex Elder #define RBD_IMAGE_ID_LEN_MAX 64 719e15b77dSAlex Elder 721e130199SAlex Elder #define RBD_OBJ_PREFIX_LEN_MAX 64 73589d30e0SAlex Elder 74d889140cSAlex Elder /* Feature bits */ 75d889140cSAlex Elder 765cbf6f12SAlex Elder #define RBD_FEATURE_LAYERING (1<<0) 775cbf6f12SAlex Elder #define RBD_FEATURE_STRIPINGV2 (1<<1) 785cbf6f12SAlex Elder #define RBD_FEATURES_ALL \ 795cbf6f12SAlex Elder (RBD_FEATURE_LAYERING | RBD_FEATURE_STRIPINGV2) 80d889140cSAlex Elder 81d889140cSAlex Elder /* Features supported by this (client software) implementation. */ 82d889140cSAlex Elder 835cbf6f12SAlex Elder #define RBD_FEATURES_SUPPORTED (0) 84d889140cSAlex Elder 8581a89793SAlex Elder /* 8681a89793SAlex Elder * An RBD device name will be "rbd#", where the "rbd" comes from 8781a89793SAlex Elder * RBD_DRV_NAME above, and # is a unique integer identifier. 
8881a89793SAlex Elder * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big 8981a89793SAlex Elder * enough to hold all possible device names. 9081a89793SAlex Elder */ 91602adf40SYehuda Sadeh #define DEV_NAME_LEN 32 9281a89793SAlex Elder #define MAX_INT_FORMAT_WIDTH ((5 * sizeof (int)) / 2 + 1) 93602adf40SYehuda Sadeh 94602adf40SYehuda Sadeh /* 95602adf40SYehuda Sadeh * block device image metadata (in-memory version) 96602adf40SYehuda Sadeh */ 97602adf40SYehuda Sadeh struct rbd_image_header { 98f84344f3SAlex Elder /* These four fields never change for a given rbd image */ 99849b4260SAlex Elder char *object_prefix; 10034b13184SAlex Elder u64 features; 101602adf40SYehuda Sadeh __u8 obj_order; 102602adf40SYehuda Sadeh __u8 crypt_type; 103602adf40SYehuda Sadeh __u8 comp_type; 104602adf40SYehuda Sadeh 105f84344f3SAlex Elder /* The remaining fields need to be updated occasionally */ 106f84344f3SAlex Elder u64 image_size; 107f84344f3SAlex Elder struct ceph_snap_context *snapc; 108602adf40SYehuda Sadeh char *snap_names; 109602adf40SYehuda Sadeh u64 *snap_sizes; 11059c2be1eSYehuda Sadeh 11159c2be1eSYehuda Sadeh u64 obj_version; 11259c2be1eSYehuda Sadeh }; 11359c2be1eSYehuda Sadeh 1140d7dbfceSAlex Elder /* 1150d7dbfceSAlex Elder * An rbd image specification. 1160d7dbfceSAlex Elder * 1170d7dbfceSAlex Elder * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely 118c66c6e0cSAlex Elder * identify an image. Each rbd_dev structure includes a pointer to 119c66c6e0cSAlex Elder * an rbd_spec structure that encapsulates this identity. 120c66c6e0cSAlex Elder * 121c66c6e0cSAlex Elder * Each of the id's in an rbd_spec has an associated name. For a 122c66c6e0cSAlex Elder * user-mapped image, the names are supplied and the id's associated 123c66c6e0cSAlex Elder * with them are looked up. For a layered image, a parent image is 124c66c6e0cSAlex Elder * defined by the tuple, and the names are looked up. 
125c66c6e0cSAlex Elder * 126c66c6e0cSAlex Elder * An rbd_dev structure contains a parent_spec pointer which is 127c66c6e0cSAlex Elder * non-null if the image it represents is a child in a layered 128c66c6e0cSAlex Elder * image. This pointer will refer to the rbd_spec structure used 129c66c6e0cSAlex Elder * by the parent rbd_dev for its own identity (i.e., the structure 130c66c6e0cSAlex Elder * is shared between the parent and child). 131c66c6e0cSAlex Elder * 132c66c6e0cSAlex Elder * Since these structures are populated once, during the discovery 133c66c6e0cSAlex Elder * phase of image construction, they are effectively immutable so 134c66c6e0cSAlex Elder * we make no effort to synchronize access to them. 135c66c6e0cSAlex Elder * 136c66c6e0cSAlex Elder * Note that code herein does not assume the image name is known (it 137c66c6e0cSAlex Elder * could be a null pointer). 1380d7dbfceSAlex Elder */ 1390d7dbfceSAlex Elder struct rbd_spec { 1400d7dbfceSAlex Elder u64 pool_id; 1410d7dbfceSAlex Elder char *pool_name; 1420d7dbfceSAlex Elder 1430d7dbfceSAlex Elder char *image_id; 1440d7dbfceSAlex Elder char *image_name; 1450d7dbfceSAlex Elder 1460d7dbfceSAlex Elder u64 snap_id; 1470d7dbfceSAlex Elder char *snap_name; 1480d7dbfceSAlex Elder 1490d7dbfceSAlex Elder struct kref kref; 1500d7dbfceSAlex Elder }; 1510d7dbfceSAlex Elder 152602adf40SYehuda Sadeh /* 153f0f8cef5SAlex Elder * an instance of the client. multiple devices may share an rbd client. 154602adf40SYehuda Sadeh */ 155602adf40SYehuda Sadeh struct rbd_client { 156602adf40SYehuda Sadeh struct ceph_client *client; 157602adf40SYehuda Sadeh struct kref kref; 158602adf40SYehuda Sadeh struct list_head node; 159602adf40SYehuda Sadeh }; 160602adf40SYehuda Sadeh 161bf0d5f50SAlex Elder struct rbd_img_request; 162bf0d5f50SAlex Elder typedef void (*rbd_img_callback_t)(struct rbd_img_request *); 163bf0d5f50SAlex Elder 164bf0d5f50SAlex Elder #define BAD_WHICH U32_MAX /* Good which or bad which, which? 
*/ 165bf0d5f50SAlex Elder 166bf0d5f50SAlex Elder struct rbd_obj_request; 167bf0d5f50SAlex Elder typedef void (*rbd_obj_callback_t)(struct rbd_obj_request *); 168bf0d5f50SAlex Elder 1699969ebc5SAlex Elder enum obj_request_type { 1709969ebc5SAlex Elder OBJ_REQUEST_NODATA, OBJ_REQUEST_BIO, OBJ_REQUEST_PAGES 1719969ebc5SAlex Elder }; 172bf0d5f50SAlex Elder 173926f9b3fSAlex Elder enum obj_req_flags { 174926f9b3fSAlex Elder OBJ_REQ_DONE, /* completion flag: not done = 0, done = 1 */ 1756365d33aSAlex Elder OBJ_REQ_IMG_DATA, /* object usage: standalone = 0, image = 1 */ 176926f9b3fSAlex Elder }; 177926f9b3fSAlex Elder 178bf0d5f50SAlex Elder struct rbd_obj_request { 179bf0d5f50SAlex Elder const char *object_name; 180bf0d5f50SAlex Elder u64 offset; /* object start byte */ 181bf0d5f50SAlex Elder u64 length; /* bytes from offset */ 182926f9b3fSAlex Elder unsigned long flags; 183bf0d5f50SAlex Elder 184bf0d5f50SAlex Elder struct rbd_img_request *img_request; 1857da22d29SAlex Elder u64 img_offset; /* image relative offset */ 186bf0d5f50SAlex Elder struct list_head links; /* img_request->obj_requests */ 187bf0d5f50SAlex Elder u32 which; /* posn image request list */ 188bf0d5f50SAlex Elder 189bf0d5f50SAlex Elder enum obj_request_type type; 190788e2df3SAlex Elder union { 191bf0d5f50SAlex Elder struct bio *bio_list; 192788e2df3SAlex Elder struct { 193788e2df3SAlex Elder struct page **pages; 194788e2df3SAlex Elder u32 page_count; 195788e2df3SAlex Elder }; 196788e2df3SAlex Elder }; 197bf0d5f50SAlex Elder 198bf0d5f50SAlex Elder struct ceph_osd_request *osd_req; 199bf0d5f50SAlex Elder 200bf0d5f50SAlex Elder u64 xferred; /* bytes transferred */ 201bf0d5f50SAlex Elder u64 version; 2021b83bef2SSage Weil int result; 203bf0d5f50SAlex Elder 204bf0d5f50SAlex Elder rbd_obj_callback_t callback; 205788e2df3SAlex Elder struct completion completion; 206bf0d5f50SAlex Elder 207bf0d5f50SAlex Elder struct kref kref; 208bf0d5f50SAlex Elder }; 209bf0d5f50SAlex Elder 2100c425248SAlex Elder enum 
img_req_flags { 2119849e986SAlex Elder IMG_REQ_WRITE, /* I/O direction: read = 0, write = 1 */ 2129849e986SAlex Elder IMG_REQ_CHILD, /* initiator: block = 0, child image = 1 */ 213d0b2e944SAlex Elder IMG_REQ_LAYERED, /* ENOENT handling: normal = 0, layered = 1 */ 2140c425248SAlex Elder }; 2150c425248SAlex Elder 216bf0d5f50SAlex Elder struct rbd_img_request { 217bf0d5f50SAlex Elder struct rbd_device *rbd_dev; 218bf0d5f50SAlex Elder u64 offset; /* starting image byte offset */ 219bf0d5f50SAlex Elder u64 length; /* byte count from offset */ 2200c425248SAlex Elder unsigned long flags; 221bf0d5f50SAlex Elder union { 222bf0d5f50SAlex Elder u64 snap_id; /* for reads */ 2239849e986SAlex Elder struct ceph_snap_context *snapc; /* for writes */ 2249849e986SAlex Elder }; 2259849e986SAlex Elder union { 2269849e986SAlex Elder struct request *rq; /* block request */ 2279849e986SAlex Elder struct rbd_obj_request *obj_request; /* obj req initiator */ 228bf0d5f50SAlex Elder }; 229bf0d5f50SAlex Elder spinlock_t completion_lock;/* protects next_completion */ 230bf0d5f50SAlex Elder u32 next_completion; 231bf0d5f50SAlex Elder rbd_img_callback_t callback; 23255f27e09SAlex Elder u64 xferred;/* aggregate bytes transferred */ 233a5a337d4SAlex Elder int result; /* first nonzero obj_request result */ 234bf0d5f50SAlex Elder 235bf0d5f50SAlex Elder u32 obj_request_count; 236bf0d5f50SAlex Elder struct list_head obj_requests; /* rbd_obj_request structs */ 237bf0d5f50SAlex Elder 238bf0d5f50SAlex Elder struct kref kref; 239bf0d5f50SAlex Elder }; 240bf0d5f50SAlex Elder 241bf0d5f50SAlex Elder #define for_each_obj_request(ireq, oreq) \ 242ef06f4d3SAlex Elder list_for_each_entry(oreq, &(ireq)->obj_requests, links) 243bf0d5f50SAlex Elder #define for_each_obj_request_from(ireq, oreq) \ 244ef06f4d3SAlex Elder list_for_each_entry_from(oreq, &(ireq)->obj_requests, links) 245bf0d5f50SAlex Elder #define for_each_obj_request_safe(ireq, oreq, n) \ 246ef06f4d3SAlex Elder list_for_each_entry_safe_reverse(oreq, n, 
&(ireq)->obj_requests, links) 247bf0d5f50SAlex Elder 248dfc5606dSYehuda Sadeh struct rbd_snap { 249dfc5606dSYehuda Sadeh struct device dev; 250dfc5606dSYehuda Sadeh const char *name; 2513591538fSJosh Durgin u64 size; 252dfc5606dSYehuda Sadeh struct list_head node; 253dfc5606dSYehuda Sadeh u64 id; 25434b13184SAlex Elder u64 features; 255dfc5606dSYehuda Sadeh }; 256dfc5606dSYehuda Sadeh 257f84344f3SAlex Elder struct rbd_mapping { 25899c1f08fSAlex Elder u64 size; 25934b13184SAlex Elder u64 features; 260f84344f3SAlex Elder bool read_only; 261f84344f3SAlex Elder }; 262f84344f3SAlex Elder 263602adf40SYehuda Sadeh /* 264602adf40SYehuda Sadeh * a single device 265602adf40SYehuda Sadeh */ 266602adf40SYehuda Sadeh struct rbd_device { 267de71a297SAlex Elder int dev_id; /* blkdev unique id */ 268602adf40SYehuda Sadeh 269602adf40SYehuda Sadeh int major; /* blkdev assigned major */ 270602adf40SYehuda Sadeh struct gendisk *disk; /* blkdev's gendisk and rq */ 271602adf40SYehuda Sadeh 272a30b71b9SAlex Elder u32 image_format; /* Either 1 or 2 */ 273602adf40SYehuda Sadeh struct rbd_client *rbd_client; 274602adf40SYehuda Sadeh 275602adf40SYehuda Sadeh char name[DEV_NAME_LEN]; /* blkdev name, e.g. 
rbd3 */ 276602adf40SYehuda Sadeh 277b82d167bSAlex Elder spinlock_t lock; /* queue, flags, open_count */ 278602adf40SYehuda Sadeh 279602adf40SYehuda Sadeh struct rbd_image_header header; 280b82d167bSAlex Elder unsigned long flags; /* possibly lock protected */ 2810d7dbfceSAlex Elder struct rbd_spec *spec; 282602adf40SYehuda Sadeh 2830d7dbfceSAlex Elder char *header_name; 284971f839aSAlex Elder 2850903e875SAlex Elder struct ceph_file_layout layout; 2860903e875SAlex Elder 28759c2be1eSYehuda Sadeh struct ceph_osd_event *watch_event; 288975241afSAlex Elder struct rbd_obj_request *watch_request; 28959c2be1eSYehuda Sadeh 29086b00e0dSAlex Elder struct rbd_spec *parent_spec; 29186b00e0dSAlex Elder u64 parent_overlap; 2922f82ee54SAlex Elder struct rbd_device *parent; 29386b00e0dSAlex Elder 294c666601aSJosh Durgin /* protects updating the header */ 295c666601aSJosh Durgin struct rw_semaphore header_rwsem; 296f84344f3SAlex Elder 297f84344f3SAlex Elder struct rbd_mapping mapping; 298602adf40SYehuda Sadeh 299602adf40SYehuda Sadeh struct list_head node; 300dfc5606dSYehuda Sadeh 301dfc5606dSYehuda Sadeh /* list of snapshots */ 302dfc5606dSYehuda Sadeh struct list_head snaps; 303dfc5606dSYehuda Sadeh 304dfc5606dSYehuda Sadeh /* sysfs related */ 305dfc5606dSYehuda Sadeh struct device dev; 306b82d167bSAlex Elder unsigned long open_count; /* protected by lock */ 307dfc5606dSYehuda Sadeh }; 308dfc5606dSYehuda Sadeh 309b82d167bSAlex Elder /* 310b82d167bSAlex Elder * Flag bits for rbd_dev->flags. If atomicity is required, 311b82d167bSAlex Elder * rbd_dev->lock is used to protect access. 312b82d167bSAlex Elder * 313b82d167bSAlex Elder * Currently, only the "removing" flag (which is coupled with the 314b82d167bSAlex Elder * "open_count" field) requires atomic access. 
315b82d167bSAlex Elder */ 3166d292906SAlex Elder enum rbd_dev_flags { 3176d292906SAlex Elder RBD_DEV_FLAG_EXISTS, /* mapped snapshot has not been deleted */ 318b82d167bSAlex Elder RBD_DEV_FLAG_REMOVING, /* this mapping is being removed */ 3196d292906SAlex Elder }; 3206d292906SAlex Elder 321602adf40SYehuda Sadeh static DEFINE_MUTEX(ctl_mutex); /* Serialize open/close/setup/teardown */ 322e124a82fSAlex Elder 323602adf40SYehuda Sadeh static LIST_HEAD(rbd_dev_list); /* devices */ 324e124a82fSAlex Elder static DEFINE_SPINLOCK(rbd_dev_list_lock); 325e124a82fSAlex Elder 326602adf40SYehuda Sadeh static LIST_HEAD(rbd_client_list); /* clients */ 327432b8587SAlex Elder static DEFINE_SPINLOCK(rbd_client_list_lock); 328602adf40SYehuda Sadeh 329304f6808SAlex Elder static int rbd_dev_snaps_update(struct rbd_device *rbd_dev); 330304f6808SAlex Elder static int rbd_dev_snaps_register(struct rbd_device *rbd_dev); 331304f6808SAlex Elder 332dfc5606dSYehuda Sadeh static void rbd_dev_release(struct device *dev); 33341f38c2bSAlex Elder static void rbd_remove_snap_dev(struct rbd_snap *snap); 334dfc5606dSYehuda Sadeh 335f0f8cef5SAlex Elder static ssize_t rbd_add(struct bus_type *bus, const char *buf, 336f0f8cef5SAlex Elder size_t count); 337f0f8cef5SAlex Elder static ssize_t rbd_remove(struct bus_type *bus, const char *buf, 338f0f8cef5SAlex Elder size_t count); 3392f82ee54SAlex Elder static int rbd_dev_probe(struct rbd_device *rbd_dev); 340f0f8cef5SAlex Elder 341f0f8cef5SAlex Elder static struct bus_attribute rbd_bus_attrs[] = { 342f0f8cef5SAlex Elder __ATTR(add, S_IWUSR, NULL, rbd_add), 343f0f8cef5SAlex Elder __ATTR(remove, S_IWUSR, NULL, rbd_remove), 344f0f8cef5SAlex Elder __ATTR_NULL 345f0f8cef5SAlex Elder }; 346f0f8cef5SAlex Elder 347f0f8cef5SAlex Elder static struct bus_type rbd_bus_type = { 348f0f8cef5SAlex Elder .name = "rbd", 349f0f8cef5SAlex Elder .bus_attrs = rbd_bus_attrs, 350f0f8cef5SAlex Elder }; 351f0f8cef5SAlex Elder 352f0f8cef5SAlex Elder static void 
rbd_root_dev_release(struct device *dev) 353f0f8cef5SAlex Elder { 354f0f8cef5SAlex Elder } 355f0f8cef5SAlex Elder 356f0f8cef5SAlex Elder static struct device rbd_root_dev = { 357f0f8cef5SAlex Elder .init_name = "rbd", 358f0f8cef5SAlex Elder .release = rbd_root_dev_release, 359f0f8cef5SAlex Elder }; 360f0f8cef5SAlex Elder 36106ecc6cbSAlex Elder static __printf(2, 3) 36206ecc6cbSAlex Elder void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...) 36306ecc6cbSAlex Elder { 36406ecc6cbSAlex Elder struct va_format vaf; 36506ecc6cbSAlex Elder va_list args; 36606ecc6cbSAlex Elder 36706ecc6cbSAlex Elder va_start(args, fmt); 36806ecc6cbSAlex Elder vaf.fmt = fmt; 36906ecc6cbSAlex Elder vaf.va = &args; 37006ecc6cbSAlex Elder 37106ecc6cbSAlex Elder if (!rbd_dev) 37206ecc6cbSAlex Elder printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf); 37306ecc6cbSAlex Elder else if (rbd_dev->disk) 37406ecc6cbSAlex Elder printk(KERN_WARNING "%s: %s: %pV\n", 37506ecc6cbSAlex Elder RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf); 37606ecc6cbSAlex Elder else if (rbd_dev->spec && rbd_dev->spec->image_name) 37706ecc6cbSAlex Elder printk(KERN_WARNING "%s: image %s: %pV\n", 37806ecc6cbSAlex Elder RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf); 37906ecc6cbSAlex Elder else if (rbd_dev->spec && rbd_dev->spec->image_id) 38006ecc6cbSAlex Elder printk(KERN_WARNING "%s: id %s: %pV\n", 38106ecc6cbSAlex Elder RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf); 38206ecc6cbSAlex Elder else /* punt */ 38306ecc6cbSAlex Elder printk(KERN_WARNING "%s: rbd_dev %p: %pV\n", 38406ecc6cbSAlex Elder RBD_DRV_NAME, rbd_dev, &vaf); 38506ecc6cbSAlex Elder va_end(args); 38606ecc6cbSAlex Elder } 38706ecc6cbSAlex Elder 388aafb230eSAlex Elder #ifdef RBD_DEBUG 389aafb230eSAlex Elder #define rbd_assert(expr) \ 390aafb230eSAlex Elder if (unlikely(!(expr))) { \ 391aafb230eSAlex Elder printk(KERN_ERR "\nAssertion failure in %s() " \ 392aafb230eSAlex Elder "at line %d:\n\n" \ 393aafb230eSAlex Elder "\trbd_assert(%s);\n\n", \ 
394aafb230eSAlex Elder __func__, __LINE__, #expr); \ 395aafb230eSAlex Elder BUG(); \ 396aafb230eSAlex Elder } 397aafb230eSAlex Elder #else /* !RBD_DEBUG */ 398aafb230eSAlex Elder # define rbd_assert(expr) ((void) 0) 399aafb230eSAlex Elder #endif /* !RBD_DEBUG */ 400dfc5606dSYehuda Sadeh 4018b3e1a56SAlex Elder static void rbd_img_parent_read(struct rbd_obj_request *obj_request); 4028b3e1a56SAlex Elder 403117973fbSAlex Elder static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver); 404117973fbSAlex Elder static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver); 40559c2be1eSYehuda Sadeh 406602adf40SYehuda Sadeh static int rbd_open(struct block_device *bdev, fmode_t mode) 407602adf40SYehuda Sadeh { 408f0f8cef5SAlex Elder struct rbd_device *rbd_dev = bdev->bd_disk->private_data; 409b82d167bSAlex Elder bool removing = false; 410602adf40SYehuda Sadeh 411f84344f3SAlex Elder if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only) 412602adf40SYehuda Sadeh return -EROFS; 413602adf40SYehuda Sadeh 414a14ea269SAlex Elder spin_lock_irq(&rbd_dev->lock); 415b82d167bSAlex Elder if (test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags)) 416b82d167bSAlex Elder removing = true; 417b82d167bSAlex Elder else 418b82d167bSAlex Elder rbd_dev->open_count++; 419a14ea269SAlex Elder spin_unlock_irq(&rbd_dev->lock); 420b82d167bSAlex Elder if (removing) 421b82d167bSAlex Elder return -ENOENT; 422b82d167bSAlex Elder 42342382b70SAlex Elder mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); 424c3e946ceSAlex Elder (void) get_device(&rbd_dev->dev); 425f84344f3SAlex Elder set_device_ro(bdev, rbd_dev->mapping.read_only); 42642382b70SAlex Elder mutex_unlock(&ctl_mutex); 427340c7a2bSAlex Elder 428602adf40SYehuda Sadeh return 0; 429602adf40SYehuda Sadeh } 430602adf40SYehuda Sadeh 431dfc5606dSYehuda Sadeh static int rbd_release(struct gendisk *disk, fmode_t mode) 432dfc5606dSYehuda Sadeh { 433dfc5606dSYehuda Sadeh struct rbd_device *rbd_dev = disk->private_data; 434b82d167bSAlex Elder 
unsigned long open_count_before; 435b82d167bSAlex Elder 436a14ea269SAlex Elder spin_lock_irq(&rbd_dev->lock); 437b82d167bSAlex Elder open_count_before = rbd_dev->open_count--; 438a14ea269SAlex Elder spin_unlock_irq(&rbd_dev->lock); 439b82d167bSAlex Elder rbd_assert(open_count_before > 0); 440dfc5606dSYehuda Sadeh 44142382b70SAlex Elder mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); 442c3e946ceSAlex Elder put_device(&rbd_dev->dev); 44342382b70SAlex Elder mutex_unlock(&ctl_mutex); 444dfc5606dSYehuda Sadeh 445dfc5606dSYehuda Sadeh return 0; 446dfc5606dSYehuda Sadeh } 447dfc5606dSYehuda Sadeh 448602adf40SYehuda Sadeh static const struct block_device_operations rbd_bd_ops = { 449602adf40SYehuda Sadeh .owner = THIS_MODULE, 450602adf40SYehuda Sadeh .open = rbd_open, 451dfc5606dSYehuda Sadeh .release = rbd_release, 452602adf40SYehuda Sadeh }; 453602adf40SYehuda Sadeh 454602adf40SYehuda Sadeh /* 455602adf40SYehuda Sadeh * Initialize an rbd client instance. 45643ae4701SAlex Elder * We own *ceph_opts. 
457602adf40SYehuda Sadeh */ 458f8c38929SAlex Elder static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts) 459602adf40SYehuda Sadeh { 460602adf40SYehuda Sadeh struct rbd_client *rbdc; 461602adf40SYehuda Sadeh int ret = -ENOMEM; 462602adf40SYehuda Sadeh 46337206ee5SAlex Elder dout("%s:\n", __func__); 464602adf40SYehuda Sadeh rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL); 465602adf40SYehuda Sadeh if (!rbdc) 466602adf40SYehuda Sadeh goto out_opt; 467602adf40SYehuda Sadeh 468602adf40SYehuda Sadeh kref_init(&rbdc->kref); 469602adf40SYehuda Sadeh INIT_LIST_HEAD(&rbdc->node); 470602adf40SYehuda Sadeh 471bc534d86SAlex Elder mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); 472bc534d86SAlex Elder 47343ae4701SAlex Elder rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0); 474602adf40SYehuda Sadeh if (IS_ERR(rbdc->client)) 475bc534d86SAlex Elder goto out_mutex; 47643ae4701SAlex Elder ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */ 477602adf40SYehuda Sadeh 478602adf40SYehuda Sadeh ret = ceph_open_session(rbdc->client); 479602adf40SYehuda Sadeh if (ret < 0) 480602adf40SYehuda Sadeh goto out_err; 481602adf40SYehuda Sadeh 482432b8587SAlex Elder spin_lock(&rbd_client_list_lock); 483602adf40SYehuda Sadeh list_add_tail(&rbdc->node, &rbd_client_list); 484432b8587SAlex Elder spin_unlock(&rbd_client_list_lock); 485602adf40SYehuda Sadeh 486bc534d86SAlex Elder mutex_unlock(&ctl_mutex); 48737206ee5SAlex Elder dout("%s: rbdc %p\n", __func__, rbdc); 488bc534d86SAlex Elder 489602adf40SYehuda Sadeh return rbdc; 490602adf40SYehuda Sadeh 491602adf40SYehuda Sadeh out_err: 492602adf40SYehuda Sadeh ceph_destroy_client(rbdc->client); 493bc534d86SAlex Elder out_mutex: 494bc534d86SAlex Elder mutex_unlock(&ctl_mutex); 495602adf40SYehuda Sadeh kfree(rbdc); 496602adf40SYehuda Sadeh out_opt: 49743ae4701SAlex Elder if (ceph_opts) 49843ae4701SAlex Elder ceph_destroy_options(ceph_opts); 49937206ee5SAlex Elder dout("%s: error %d\n", __func__, ret); 
50037206ee5SAlex Elder 50128f259b7SVasiliy Kulikov return ERR_PTR(ret); 502602adf40SYehuda Sadeh } 503602adf40SYehuda Sadeh 5042f82ee54SAlex Elder static struct rbd_client *__rbd_get_client(struct rbd_client *rbdc) 5052f82ee54SAlex Elder { 5062f82ee54SAlex Elder kref_get(&rbdc->kref); 5072f82ee54SAlex Elder 5082f82ee54SAlex Elder return rbdc; 5092f82ee54SAlex Elder } 5102f82ee54SAlex Elder 511602adf40SYehuda Sadeh /* 5121f7ba331SAlex Elder * Find a ceph client with specific addr and configuration. If 5131f7ba331SAlex Elder * found, bump its reference count. 514602adf40SYehuda Sadeh */ 5151f7ba331SAlex Elder static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts) 516602adf40SYehuda Sadeh { 517602adf40SYehuda Sadeh struct rbd_client *client_node; 5181f7ba331SAlex Elder bool found = false; 519602adf40SYehuda Sadeh 52043ae4701SAlex Elder if (ceph_opts->flags & CEPH_OPT_NOSHARE) 521602adf40SYehuda Sadeh return NULL; 522602adf40SYehuda Sadeh 5231f7ba331SAlex Elder spin_lock(&rbd_client_list_lock); 5241f7ba331SAlex Elder list_for_each_entry(client_node, &rbd_client_list, node) { 5251f7ba331SAlex Elder if (!ceph_compare_options(ceph_opts, client_node->client)) { 5262f82ee54SAlex Elder __rbd_get_client(client_node); 5272f82ee54SAlex Elder 5281f7ba331SAlex Elder found = true; 5291f7ba331SAlex Elder break; 5301f7ba331SAlex Elder } 5311f7ba331SAlex Elder } 5321f7ba331SAlex Elder spin_unlock(&rbd_client_list_lock); 5331f7ba331SAlex Elder 5341f7ba331SAlex Elder return found ? 
client_node : NULL; 535602adf40SYehuda Sadeh } 536602adf40SYehuda Sadeh 537602adf40SYehuda Sadeh /* 53859c2be1eSYehuda Sadeh * mount options 53959c2be1eSYehuda Sadeh */ 54059c2be1eSYehuda Sadeh enum { 54159c2be1eSYehuda Sadeh Opt_last_int, 54259c2be1eSYehuda Sadeh /* int args above */ 54359c2be1eSYehuda Sadeh Opt_last_string, 54459c2be1eSYehuda Sadeh /* string args above */ 545cc0538b6SAlex Elder Opt_read_only, 546cc0538b6SAlex Elder Opt_read_write, 547cc0538b6SAlex Elder /* Boolean args above */ 548cc0538b6SAlex Elder Opt_last_bool, 54959c2be1eSYehuda Sadeh }; 55059c2be1eSYehuda Sadeh 55143ae4701SAlex Elder static match_table_t rbd_opts_tokens = { 55259c2be1eSYehuda Sadeh /* int args above */ 55359c2be1eSYehuda Sadeh /* string args above */ 554be466c1cSAlex Elder {Opt_read_only, "read_only"}, 555cc0538b6SAlex Elder {Opt_read_only, "ro"}, /* Alternate spelling */ 556cc0538b6SAlex Elder {Opt_read_write, "read_write"}, 557cc0538b6SAlex Elder {Opt_read_write, "rw"}, /* Alternate spelling */ 558cc0538b6SAlex Elder /* Boolean args above */ 55959c2be1eSYehuda Sadeh {-1, NULL} 56059c2be1eSYehuda Sadeh }; 56159c2be1eSYehuda Sadeh 56298571b5aSAlex Elder struct rbd_options { 56398571b5aSAlex Elder bool read_only; 56498571b5aSAlex Elder }; 56598571b5aSAlex Elder 56698571b5aSAlex Elder #define RBD_READ_ONLY_DEFAULT false 56798571b5aSAlex Elder 56859c2be1eSYehuda Sadeh static int parse_rbd_opts_token(char *c, void *private) 56959c2be1eSYehuda Sadeh { 57043ae4701SAlex Elder struct rbd_options *rbd_opts = private; 57159c2be1eSYehuda Sadeh substring_t argstr[MAX_OPT_ARGS]; 57259c2be1eSYehuda Sadeh int token, intval, ret; 57359c2be1eSYehuda Sadeh 57443ae4701SAlex Elder token = match_token(c, rbd_opts_tokens, argstr); 57559c2be1eSYehuda Sadeh if (token < 0) 57659c2be1eSYehuda Sadeh return -EINVAL; 57759c2be1eSYehuda Sadeh 57859c2be1eSYehuda Sadeh if (token < Opt_last_int) { 57959c2be1eSYehuda Sadeh ret = match_int(&argstr[0], &intval); 58059c2be1eSYehuda Sadeh if (ret < 0) { 
58159c2be1eSYehuda Sadeh pr_err("bad mount option arg (not int) " 58259c2be1eSYehuda Sadeh "at '%s'\n", c); 58359c2be1eSYehuda Sadeh return ret; 58459c2be1eSYehuda Sadeh } 58559c2be1eSYehuda Sadeh dout("got int token %d val %d\n", token, intval); 58659c2be1eSYehuda Sadeh } else if (token > Opt_last_int && token < Opt_last_string) { 58759c2be1eSYehuda Sadeh dout("got string token %d val %s\n", token, 58859c2be1eSYehuda Sadeh argstr[0].from); 589cc0538b6SAlex Elder } else if (token > Opt_last_string && token < Opt_last_bool) { 590cc0538b6SAlex Elder dout("got Boolean token %d\n", token); 59159c2be1eSYehuda Sadeh } else { 59259c2be1eSYehuda Sadeh dout("got token %d\n", token); 59359c2be1eSYehuda Sadeh } 59459c2be1eSYehuda Sadeh 59559c2be1eSYehuda Sadeh switch (token) { 596cc0538b6SAlex Elder case Opt_read_only: 597cc0538b6SAlex Elder rbd_opts->read_only = true; 598cc0538b6SAlex Elder break; 599cc0538b6SAlex Elder case Opt_read_write: 600cc0538b6SAlex Elder rbd_opts->read_only = false; 601cc0538b6SAlex Elder break; 60259c2be1eSYehuda Sadeh default: 603aafb230eSAlex Elder rbd_assert(false); 604aafb230eSAlex Elder break; 60559c2be1eSYehuda Sadeh } 60659c2be1eSYehuda Sadeh return 0; 60759c2be1eSYehuda Sadeh } 60859c2be1eSYehuda Sadeh 60959c2be1eSYehuda Sadeh /* 610602adf40SYehuda Sadeh * Get a ceph client with specific addr and configuration, if one does 611602adf40SYehuda Sadeh * not exist create it. 
612602adf40SYehuda Sadeh */ 6139d3997fdSAlex Elder static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts) 614602adf40SYehuda Sadeh { 615f8c38929SAlex Elder struct rbd_client *rbdc; 61659c2be1eSYehuda Sadeh 6171f7ba331SAlex Elder rbdc = rbd_client_find(ceph_opts); 6189d3997fdSAlex Elder if (rbdc) /* using an existing client */ 61943ae4701SAlex Elder ceph_destroy_options(ceph_opts); 6209d3997fdSAlex Elder else 621f8c38929SAlex Elder rbdc = rbd_client_create(ceph_opts); 622d720bcb0SAlex Elder 6239d3997fdSAlex Elder return rbdc; 624602adf40SYehuda Sadeh } 625602adf40SYehuda Sadeh 626602adf40SYehuda Sadeh /* 627602adf40SYehuda Sadeh * Destroy ceph client 628d23a4b3fSAlex Elder * 629432b8587SAlex Elder * Caller must hold rbd_client_list_lock. 630602adf40SYehuda Sadeh */ 631602adf40SYehuda Sadeh static void rbd_client_release(struct kref *kref) 632602adf40SYehuda Sadeh { 633602adf40SYehuda Sadeh struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref); 634602adf40SYehuda Sadeh 63537206ee5SAlex Elder dout("%s: rbdc %p\n", __func__, rbdc); 636cd9d9f5dSAlex Elder spin_lock(&rbd_client_list_lock); 637602adf40SYehuda Sadeh list_del(&rbdc->node); 638cd9d9f5dSAlex Elder spin_unlock(&rbd_client_list_lock); 639602adf40SYehuda Sadeh 640602adf40SYehuda Sadeh ceph_destroy_client(rbdc->client); 641602adf40SYehuda Sadeh kfree(rbdc); 642602adf40SYehuda Sadeh } 643602adf40SYehuda Sadeh 644602adf40SYehuda Sadeh /* 645602adf40SYehuda Sadeh * Drop reference to ceph client node. If it's not referenced anymore, release 646602adf40SYehuda Sadeh * it. 
647602adf40SYehuda Sadeh */ 6489d3997fdSAlex Elder static void rbd_put_client(struct rbd_client *rbdc) 649602adf40SYehuda Sadeh { 650c53d5893SAlex Elder if (rbdc) 6519d3997fdSAlex Elder kref_put(&rbdc->kref, rbd_client_release); 652602adf40SYehuda Sadeh } 653602adf40SYehuda Sadeh 654a30b71b9SAlex Elder static bool rbd_image_format_valid(u32 image_format) 655a30b71b9SAlex Elder { 656a30b71b9SAlex Elder return image_format == 1 || image_format == 2; 657a30b71b9SAlex Elder } 658a30b71b9SAlex Elder 6598e94af8eSAlex Elder static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk) 6608e94af8eSAlex Elder { 661103a150fSAlex Elder size_t size; 662103a150fSAlex Elder u32 snap_count; 663103a150fSAlex Elder 664103a150fSAlex Elder /* The header has to start with the magic rbd header text */ 665103a150fSAlex Elder if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT))) 666103a150fSAlex Elder return false; 667103a150fSAlex Elder 668db2388b6SAlex Elder /* The bio layer requires at least sector-sized I/O */ 669db2388b6SAlex Elder 670db2388b6SAlex Elder if (ondisk->options.order < SECTOR_SHIFT) 671db2388b6SAlex Elder return false; 672db2388b6SAlex Elder 673db2388b6SAlex Elder /* If we use u64 in a few spots we may be able to loosen this */ 674db2388b6SAlex Elder 675db2388b6SAlex Elder if (ondisk->options.order > 8 * sizeof (int) - 1) 676db2388b6SAlex Elder return false; 677db2388b6SAlex Elder 678103a150fSAlex Elder /* 679103a150fSAlex Elder * The size of a snapshot header has to fit in a size_t, and 680103a150fSAlex Elder * that limits the number of snapshots. 
681103a150fSAlex Elder */ 682103a150fSAlex Elder snap_count = le32_to_cpu(ondisk->snap_count); 683103a150fSAlex Elder size = SIZE_MAX - sizeof (struct ceph_snap_context); 684103a150fSAlex Elder if (snap_count > size / sizeof (__le64)) 685103a150fSAlex Elder return false; 686103a150fSAlex Elder 687103a150fSAlex Elder /* 688103a150fSAlex Elder * Not only that, but the size of the entire the snapshot 689103a150fSAlex Elder * header must also be representable in a size_t. 690103a150fSAlex Elder */ 691103a150fSAlex Elder size -= snap_count * sizeof (__le64); 692103a150fSAlex Elder if ((u64) size < le64_to_cpu(ondisk->snap_names_len)) 693103a150fSAlex Elder return false; 694103a150fSAlex Elder 695103a150fSAlex Elder return true; 6968e94af8eSAlex Elder } 6978e94af8eSAlex Elder 698602adf40SYehuda Sadeh /* 699602adf40SYehuda Sadeh * Create a new header structure, translate header format from the on-disk 700602adf40SYehuda Sadeh * header. 701602adf40SYehuda Sadeh */ 702602adf40SYehuda Sadeh static int rbd_header_from_disk(struct rbd_image_header *header, 7034156d998SAlex Elder struct rbd_image_header_ondisk *ondisk) 704602adf40SYehuda Sadeh { 705ccece235SAlex Elder u32 snap_count; 70658c17b0eSAlex Elder size_t len; 707d2bb24e5SAlex Elder size_t size; 708621901d6SAlex Elder u32 i; 709602adf40SYehuda Sadeh 7106a52325fSAlex Elder memset(header, 0, sizeof (*header)); 7116a52325fSAlex Elder 712103a150fSAlex Elder snap_count = le32_to_cpu(ondisk->snap_count); 713103a150fSAlex Elder 71458c17b0eSAlex Elder len = strnlen(ondisk->object_prefix, sizeof (ondisk->object_prefix)); 71558c17b0eSAlex Elder header->object_prefix = kmalloc(len + 1, GFP_KERNEL); 7166a52325fSAlex Elder if (!header->object_prefix) 717602adf40SYehuda Sadeh return -ENOMEM; 71858c17b0eSAlex Elder memcpy(header->object_prefix, ondisk->object_prefix, len); 71958c17b0eSAlex Elder header->object_prefix[len] = '\0'; 72000f1f36fSAlex Elder 721602adf40SYehuda Sadeh if (snap_count) { 722f785cc1dSAlex Elder u64 
snap_names_len = le64_to_cpu(ondisk->snap_names_len); 723f785cc1dSAlex Elder 724621901d6SAlex Elder /* Save a copy of the snapshot names */ 725621901d6SAlex Elder 726f785cc1dSAlex Elder if (snap_names_len > (u64) SIZE_MAX) 727f785cc1dSAlex Elder return -EIO; 728f785cc1dSAlex Elder header->snap_names = kmalloc(snap_names_len, GFP_KERNEL); 729602adf40SYehuda Sadeh if (!header->snap_names) 7306a52325fSAlex Elder goto out_err; 731f785cc1dSAlex Elder /* 732f785cc1dSAlex Elder * Note that rbd_dev_v1_header_read() guarantees 733f785cc1dSAlex Elder * the ondisk buffer we're working with has 734f785cc1dSAlex Elder * snap_names_len bytes beyond the end of the 735f785cc1dSAlex Elder * snapshot id array, this memcpy() is safe. 736f785cc1dSAlex Elder */ 737f785cc1dSAlex Elder memcpy(header->snap_names, &ondisk->snaps[snap_count], 738f785cc1dSAlex Elder snap_names_len); 7396a52325fSAlex Elder 740621901d6SAlex Elder /* Record each snapshot's size */ 741621901d6SAlex Elder 742d2bb24e5SAlex Elder size = snap_count * sizeof (*header->snap_sizes); 743d2bb24e5SAlex Elder header->snap_sizes = kmalloc(size, GFP_KERNEL); 744602adf40SYehuda Sadeh if (!header->snap_sizes) 7456a52325fSAlex Elder goto out_err; 746621901d6SAlex Elder for (i = 0; i < snap_count; i++) 747621901d6SAlex Elder header->snap_sizes[i] = 748621901d6SAlex Elder le64_to_cpu(ondisk->snaps[i].image_size); 749602adf40SYehuda Sadeh } else { 750ccece235SAlex Elder WARN_ON(ondisk->snap_names_len); 751602adf40SYehuda Sadeh header->snap_names = NULL; 752602adf40SYehuda Sadeh header->snap_sizes = NULL; 753602adf40SYehuda Sadeh } 754849b4260SAlex Elder 75534b13184SAlex Elder header->features = 0; /* No features support in v1 images */ 756602adf40SYehuda Sadeh header->obj_order = ondisk->options.order; 757602adf40SYehuda Sadeh header->crypt_type = ondisk->options.crypt_type; 758602adf40SYehuda Sadeh header->comp_type = ondisk->options.comp_type; 7596a52325fSAlex Elder 760621901d6SAlex Elder /* Allocate and fill in the snapshot 
context */ 761621901d6SAlex Elder 762f84344f3SAlex Elder header->image_size = le64_to_cpu(ondisk->image_size); 7636a52325fSAlex Elder size = sizeof (struct ceph_snap_context); 7646a52325fSAlex Elder size += snap_count * sizeof (header->snapc->snaps[0]); 7656a52325fSAlex Elder header->snapc = kzalloc(size, GFP_KERNEL); 7666a52325fSAlex Elder if (!header->snapc) 7676a52325fSAlex Elder goto out_err; 768602adf40SYehuda Sadeh 769602adf40SYehuda Sadeh atomic_set(&header->snapc->nref, 1); 770505cbb9bSAlex Elder header->snapc->seq = le64_to_cpu(ondisk->snap_seq); 771602adf40SYehuda Sadeh header->snapc->num_snaps = snap_count; 772621901d6SAlex Elder for (i = 0; i < snap_count; i++) 773602adf40SYehuda Sadeh header->snapc->snaps[i] = 774602adf40SYehuda Sadeh le64_to_cpu(ondisk->snaps[i].id); 775602adf40SYehuda Sadeh 776602adf40SYehuda Sadeh return 0; 777602adf40SYehuda Sadeh 7786a52325fSAlex Elder out_err: 779849b4260SAlex Elder kfree(header->snap_sizes); 780ccece235SAlex Elder header->snap_sizes = NULL; 781602adf40SYehuda Sadeh kfree(header->snap_names); 782ccece235SAlex Elder header->snap_names = NULL; 7836a52325fSAlex Elder kfree(header->object_prefix); 7846a52325fSAlex Elder header->object_prefix = NULL; 785ccece235SAlex Elder 78600f1f36fSAlex Elder return -ENOMEM; 787602adf40SYehuda Sadeh } 788602adf40SYehuda Sadeh 7899e15b77dSAlex Elder static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id) 7909e15b77dSAlex Elder { 7919e15b77dSAlex Elder struct rbd_snap *snap; 7929e15b77dSAlex Elder 7939e15b77dSAlex Elder if (snap_id == CEPH_NOSNAP) 7949e15b77dSAlex Elder return RBD_SNAP_HEAD_NAME; 7959e15b77dSAlex Elder 7969e15b77dSAlex Elder list_for_each_entry(snap, &rbd_dev->snaps, node) 7979e15b77dSAlex Elder if (snap_id == snap->id) 7989e15b77dSAlex Elder return snap->name; 7999e15b77dSAlex Elder 8009e15b77dSAlex Elder return NULL; 8019e15b77dSAlex Elder } 8029e15b77dSAlex Elder 8038836b995SAlex Elder static int snap_by_name(struct rbd_device *rbd_dev, const 
char *snap_name) 804602adf40SYehuda Sadeh { 805602adf40SYehuda Sadeh 806e86924a8SAlex Elder struct rbd_snap *snap; 80700f1f36fSAlex Elder 808e86924a8SAlex Elder list_for_each_entry(snap, &rbd_dev->snaps, node) { 809e86924a8SAlex Elder if (!strcmp(snap_name, snap->name)) { 8100d7dbfceSAlex Elder rbd_dev->spec->snap_id = snap->id; 811e86924a8SAlex Elder rbd_dev->mapping.size = snap->size; 81234b13184SAlex Elder rbd_dev->mapping.features = snap->features; 81300f1f36fSAlex Elder 814e86924a8SAlex Elder return 0; 815602adf40SYehuda Sadeh } 81600f1f36fSAlex Elder } 817e86924a8SAlex Elder 81800f1f36fSAlex Elder return -ENOENT; 81900f1f36fSAlex Elder } 820602adf40SYehuda Sadeh 821819d52bfSAlex Elder static int rbd_dev_set_mapping(struct rbd_device *rbd_dev) 822602adf40SYehuda Sadeh { 82378dc447dSAlex Elder int ret; 824602adf40SYehuda Sadeh 8250d7dbfceSAlex Elder if (!memcmp(rbd_dev->spec->snap_name, RBD_SNAP_HEAD_NAME, 826cc9d734cSJosh Durgin sizeof (RBD_SNAP_HEAD_NAME))) { 8270d7dbfceSAlex Elder rbd_dev->spec->snap_id = CEPH_NOSNAP; 82899c1f08fSAlex Elder rbd_dev->mapping.size = rbd_dev->header.image_size; 82934b13184SAlex Elder rbd_dev->mapping.features = rbd_dev->header.features; 830e86924a8SAlex Elder ret = 0; 831602adf40SYehuda Sadeh } else { 8320d7dbfceSAlex Elder ret = snap_by_name(rbd_dev, rbd_dev->spec->snap_name); 833602adf40SYehuda Sadeh if (ret < 0) 834602adf40SYehuda Sadeh goto done; 835f84344f3SAlex Elder rbd_dev->mapping.read_only = true; 836602adf40SYehuda Sadeh } 8376d292906SAlex Elder set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags); 8386d292906SAlex Elder 839602adf40SYehuda Sadeh done: 840602adf40SYehuda Sadeh return ret; 841602adf40SYehuda Sadeh } 842602adf40SYehuda Sadeh 843602adf40SYehuda Sadeh static void rbd_header_free(struct rbd_image_header *header) 844602adf40SYehuda Sadeh { 845849b4260SAlex Elder kfree(header->object_prefix); 846d78fd7aeSAlex Elder header->object_prefix = NULL; 847602adf40SYehuda Sadeh kfree(header->snap_sizes); 848d78fd7aeSAlex 
Elder header->snap_sizes = NULL; 849849b4260SAlex Elder kfree(header->snap_names); 850d78fd7aeSAlex Elder header->snap_names = NULL; 851d1d25646SJosh Durgin ceph_put_snap_context(header->snapc); 852d78fd7aeSAlex Elder header->snapc = NULL; 853602adf40SYehuda Sadeh } 854602adf40SYehuda Sadeh 85598571b5aSAlex Elder static const char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset) 856602adf40SYehuda Sadeh { 85765ccfe21SAlex Elder char *name; 85865ccfe21SAlex Elder u64 segment; 85965ccfe21SAlex Elder int ret; 860602adf40SYehuda Sadeh 8612fd82b9eSAlex Elder name = kmalloc(MAX_OBJ_NAME_SIZE + 1, GFP_NOIO); 86265ccfe21SAlex Elder if (!name) 86365ccfe21SAlex Elder return NULL; 86465ccfe21SAlex Elder segment = offset >> rbd_dev->header.obj_order; 8652fd82b9eSAlex Elder ret = snprintf(name, MAX_OBJ_NAME_SIZE + 1, "%s.%012llx", 86665ccfe21SAlex Elder rbd_dev->header.object_prefix, segment); 8672fd82b9eSAlex Elder if (ret < 0 || ret > MAX_OBJ_NAME_SIZE) { 86865ccfe21SAlex Elder pr_err("error formatting segment name for #%llu (%d)\n", 86965ccfe21SAlex Elder segment, ret); 87065ccfe21SAlex Elder kfree(name); 87165ccfe21SAlex Elder name = NULL; 87265ccfe21SAlex Elder } 873602adf40SYehuda Sadeh 87465ccfe21SAlex Elder return name; 87565ccfe21SAlex Elder } 876602adf40SYehuda Sadeh 87765ccfe21SAlex Elder static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset) 87865ccfe21SAlex Elder { 87965ccfe21SAlex Elder u64 segment_size = (u64) 1 << rbd_dev->header.obj_order; 880602adf40SYehuda Sadeh 88165ccfe21SAlex Elder return offset & (segment_size - 1); 88265ccfe21SAlex Elder } 88365ccfe21SAlex Elder 88465ccfe21SAlex Elder static u64 rbd_segment_length(struct rbd_device *rbd_dev, 88565ccfe21SAlex Elder u64 offset, u64 length) 88665ccfe21SAlex Elder { 88765ccfe21SAlex Elder u64 segment_size = (u64) 1 << rbd_dev->header.obj_order; 88865ccfe21SAlex Elder 88965ccfe21SAlex Elder offset &= segment_size - 1; 89065ccfe21SAlex Elder 891aafb230eSAlex Elder rbd_assert(length <= 
U64_MAX - offset); 89265ccfe21SAlex Elder if (offset + length > segment_size) 89365ccfe21SAlex Elder length = segment_size - offset; 89465ccfe21SAlex Elder 89565ccfe21SAlex Elder return length; 896602adf40SYehuda Sadeh } 897602adf40SYehuda Sadeh 898602adf40SYehuda Sadeh /* 899029bcbd8SJosh Durgin * returns the size of an object in the image 900029bcbd8SJosh Durgin */ 901029bcbd8SJosh Durgin static u64 rbd_obj_bytes(struct rbd_image_header *header) 902029bcbd8SJosh Durgin { 903029bcbd8SJosh Durgin return 1 << header->obj_order; 904029bcbd8SJosh Durgin } 905029bcbd8SJosh Durgin 906029bcbd8SJosh Durgin /* 907602adf40SYehuda Sadeh * bio helpers 908602adf40SYehuda Sadeh */ 909602adf40SYehuda Sadeh 910602adf40SYehuda Sadeh static void bio_chain_put(struct bio *chain) 911602adf40SYehuda Sadeh { 912602adf40SYehuda Sadeh struct bio *tmp; 913602adf40SYehuda Sadeh 914602adf40SYehuda Sadeh while (chain) { 915602adf40SYehuda Sadeh tmp = chain; 916602adf40SYehuda Sadeh chain = chain->bi_next; 917602adf40SYehuda Sadeh bio_put(tmp); 918602adf40SYehuda Sadeh } 919602adf40SYehuda Sadeh } 920602adf40SYehuda Sadeh 921602adf40SYehuda Sadeh /* 922602adf40SYehuda Sadeh * zeros a bio chain, starting at specific offset 923602adf40SYehuda Sadeh */ 924602adf40SYehuda Sadeh static void zero_bio_chain(struct bio *chain, int start_ofs) 925602adf40SYehuda Sadeh { 926602adf40SYehuda Sadeh struct bio_vec *bv; 927602adf40SYehuda Sadeh unsigned long flags; 928602adf40SYehuda Sadeh void *buf; 929602adf40SYehuda Sadeh int i; 930602adf40SYehuda Sadeh int pos = 0; 931602adf40SYehuda Sadeh 932602adf40SYehuda Sadeh while (chain) { 933602adf40SYehuda Sadeh bio_for_each_segment(bv, chain, i) { 934602adf40SYehuda Sadeh if (pos + bv->bv_len > start_ofs) { 935602adf40SYehuda Sadeh int remainder = max(start_ofs - pos, 0); 936602adf40SYehuda Sadeh buf = bvec_kmap_irq(bv, &flags); 937602adf40SYehuda Sadeh memset(buf + remainder, 0, 938602adf40SYehuda Sadeh bv->bv_len - remainder); 93985b5aaa6SDan Carpenter 
bvec_kunmap_irq(buf, &flags); 940602adf40SYehuda Sadeh } 941602adf40SYehuda Sadeh pos += bv->bv_len; 942602adf40SYehuda Sadeh } 943602adf40SYehuda Sadeh 944602adf40SYehuda Sadeh chain = chain->bi_next; 945602adf40SYehuda Sadeh } 946602adf40SYehuda Sadeh } 947602adf40SYehuda Sadeh 948602adf40SYehuda Sadeh /* 949f7760dadSAlex Elder * Clone a portion of a bio, starting at the given byte offset 950f7760dadSAlex Elder * and continuing for the number of bytes indicated. 951602adf40SYehuda Sadeh */ 952f7760dadSAlex Elder static struct bio *bio_clone_range(struct bio *bio_src, 953f7760dadSAlex Elder unsigned int offset, 954f7760dadSAlex Elder unsigned int len, 955f7760dadSAlex Elder gfp_t gfpmask) 956602adf40SYehuda Sadeh { 957f7760dadSAlex Elder struct bio_vec *bv; 958f7760dadSAlex Elder unsigned int resid; 959f7760dadSAlex Elder unsigned short idx; 960f7760dadSAlex Elder unsigned int voff; 961f7760dadSAlex Elder unsigned short end_idx; 962f7760dadSAlex Elder unsigned short vcnt; 963f7760dadSAlex Elder struct bio *bio; 964602adf40SYehuda Sadeh 965f7760dadSAlex Elder /* Handle the easy case for the caller */ 966f7760dadSAlex Elder 967f7760dadSAlex Elder if (!offset && len == bio_src->bi_size) 968f7760dadSAlex Elder return bio_clone(bio_src, gfpmask); 969f7760dadSAlex Elder 970f7760dadSAlex Elder if (WARN_ON_ONCE(!len)) 971f7760dadSAlex Elder return NULL; 972f7760dadSAlex Elder if (WARN_ON_ONCE(len > bio_src->bi_size)) 973f7760dadSAlex Elder return NULL; 974f7760dadSAlex Elder if (WARN_ON_ONCE(offset > bio_src->bi_size - len)) 975f7760dadSAlex Elder return NULL; 976f7760dadSAlex Elder 977f7760dadSAlex Elder /* Find first affected segment... 
*/ 978f7760dadSAlex Elder 979f7760dadSAlex Elder resid = offset; 980f7760dadSAlex Elder __bio_for_each_segment(bv, bio_src, idx, 0) { 981f7760dadSAlex Elder if (resid < bv->bv_len) 982f7760dadSAlex Elder break; 983f7760dadSAlex Elder resid -= bv->bv_len; 984602adf40SYehuda Sadeh } 985f7760dadSAlex Elder voff = resid; 986602adf40SYehuda Sadeh 987f7760dadSAlex Elder /* ...and the last affected segment */ 988542582fcSAlex Elder 989f7760dadSAlex Elder resid += len; 990f7760dadSAlex Elder __bio_for_each_segment(bv, bio_src, end_idx, idx) { 991f7760dadSAlex Elder if (resid <= bv->bv_len) 992f7760dadSAlex Elder break; 993f7760dadSAlex Elder resid -= bv->bv_len; 994f7760dadSAlex Elder } 995f7760dadSAlex Elder vcnt = end_idx - idx + 1; 996602adf40SYehuda Sadeh 997f7760dadSAlex Elder /* Build the clone */ 998f7760dadSAlex Elder 999f7760dadSAlex Elder bio = bio_alloc(gfpmask, (unsigned int) vcnt); 1000f7760dadSAlex Elder if (!bio) 1001f7760dadSAlex Elder return NULL; /* ENOMEM */ 1002f7760dadSAlex Elder 1003f7760dadSAlex Elder bio->bi_bdev = bio_src->bi_bdev; 1004f7760dadSAlex Elder bio->bi_sector = bio_src->bi_sector + (offset >> SECTOR_SHIFT); 1005f7760dadSAlex Elder bio->bi_rw = bio_src->bi_rw; 1006f7760dadSAlex Elder bio->bi_flags |= 1 << BIO_CLONED; 1007602adf40SYehuda Sadeh 1008602adf40SYehuda Sadeh /* 1009f7760dadSAlex Elder * Copy over our part of the bio_vec, then update the first 1010f7760dadSAlex Elder * and last (or only) entries. 
1011602adf40SYehuda Sadeh */ 1012f7760dadSAlex Elder memcpy(&bio->bi_io_vec[0], &bio_src->bi_io_vec[idx], 1013f7760dadSAlex Elder vcnt * sizeof (struct bio_vec)); 1014f7760dadSAlex Elder bio->bi_io_vec[0].bv_offset += voff; 1015f7760dadSAlex Elder if (vcnt > 1) { 1016f7760dadSAlex Elder bio->bi_io_vec[0].bv_len -= voff; 1017f7760dadSAlex Elder bio->bi_io_vec[vcnt - 1].bv_len = resid; 1018602adf40SYehuda Sadeh } else { 1019f7760dadSAlex Elder bio->bi_io_vec[0].bv_len = len; 1020602adf40SYehuda Sadeh } 1021602adf40SYehuda Sadeh 1022f7760dadSAlex Elder bio->bi_vcnt = vcnt; 1023f7760dadSAlex Elder bio->bi_size = len; 1024f7760dadSAlex Elder bio->bi_idx = 0; 1025602adf40SYehuda Sadeh 1026f7760dadSAlex Elder return bio; 1027602adf40SYehuda Sadeh } 1028602adf40SYehuda Sadeh 1029f7760dadSAlex Elder /* 1030f7760dadSAlex Elder * Clone a portion of a bio chain, starting at the given byte offset 1031f7760dadSAlex Elder * into the first bio in the source chain and continuing for the 1032f7760dadSAlex Elder * number of bytes indicated. The result is another bio chain of 1033f7760dadSAlex Elder * exactly the given length, or a null pointer on error. 1034f7760dadSAlex Elder * 1035f7760dadSAlex Elder * The bio_src and offset parameters are both in-out. On entry they 1036f7760dadSAlex Elder * refer to the first source bio and the offset into that bio where 1037f7760dadSAlex Elder * the start of data to be cloned is located. 1038f7760dadSAlex Elder * 1039f7760dadSAlex Elder * On return, bio_src is updated to refer to the bio in the source 1040f7760dadSAlex Elder * chain that contains first un-cloned byte, and *offset will 1041f7760dadSAlex Elder * contain the offset of that byte within that bio. 
1042f7760dadSAlex Elder */ 1043f7760dadSAlex Elder static struct bio *bio_chain_clone_range(struct bio **bio_src, 1044f7760dadSAlex Elder unsigned int *offset, 1045f7760dadSAlex Elder unsigned int len, 1046f7760dadSAlex Elder gfp_t gfpmask) 1047f7760dadSAlex Elder { 1048f7760dadSAlex Elder struct bio *bi = *bio_src; 1049f7760dadSAlex Elder unsigned int off = *offset; 1050f7760dadSAlex Elder struct bio *chain = NULL; 1051f7760dadSAlex Elder struct bio **end; 1052602adf40SYehuda Sadeh 1053f7760dadSAlex Elder /* Build up a chain of clone bios up to the limit */ 1054602adf40SYehuda Sadeh 1055f7760dadSAlex Elder if (!bi || off >= bi->bi_size || !len) 1056f7760dadSAlex Elder return NULL; /* Nothing to clone */ 1057602adf40SYehuda Sadeh 1058f7760dadSAlex Elder end = &chain; 1059f7760dadSAlex Elder while (len) { 1060f7760dadSAlex Elder unsigned int bi_size; 1061f7760dadSAlex Elder struct bio *bio; 1062f7760dadSAlex Elder 1063f5400b7aSAlex Elder if (!bi) { 1064f5400b7aSAlex Elder rbd_warn(NULL, "bio_chain exhausted with %u left", len); 1065f7760dadSAlex Elder goto out_err; /* EINVAL; ran out of bio's */ 1066f5400b7aSAlex Elder } 1067f7760dadSAlex Elder bi_size = min_t(unsigned int, bi->bi_size - off, len); 1068f7760dadSAlex Elder bio = bio_clone_range(bi, off, bi_size, gfpmask); 1069f7760dadSAlex Elder if (!bio) 1070f7760dadSAlex Elder goto out_err; /* ENOMEM */ 1071f7760dadSAlex Elder 1072f7760dadSAlex Elder *end = bio; 1073f7760dadSAlex Elder end = &bio->bi_next; 1074f7760dadSAlex Elder 1075f7760dadSAlex Elder off += bi_size; 1076f7760dadSAlex Elder if (off == bi->bi_size) { 1077f7760dadSAlex Elder bi = bi->bi_next; 1078f7760dadSAlex Elder off = 0; 1079f7760dadSAlex Elder } 1080f7760dadSAlex Elder len -= bi_size; 1081f7760dadSAlex Elder } 1082f7760dadSAlex Elder *bio_src = bi; 1083f7760dadSAlex Elder *offset = off; 1084f7760dadSAlex Elder 1085f7760dadSAlex Elder return chain; 1086f7760dadSAlex Elder out_err: 1087f7760dadSAlex Elder bio_chain_put(chain); 1088f7760dadSAlex 
Elder 1089602adf40SYehuda Sadeh return NULL; 1090602adf40SYehuda Sadeh } 1091602adf40SYehuda Sadeh 1092926f9b3fSAlex Elder /* 1093926f9b3fSAlex Elder * The default/initial value for all object request flags is 0. For 1094926f9b3fSAlex Elder * each flag, once its value is set to 1 it is never reset to 0 1095926f9b3fSAlex Elder * again. 1096926f9b3fSAlex Elder */ 1097926f9b3fSAlex Elder static void obj_request_done_set(struct rbd_obj_request *obj_request) 1098926f9b3fSAlex Elder { 1099926f9b3fSAlex Elder if (test_and_set_bit(OBJ_REQ_DONE, &obj_request->flags)) { 1100926f9b3fSAlex Elder struct rbd_img_request *img_request = obj_request->img_request; 1101926f9b3fSAlex Elder struct rbd_device *rbd_dev; 1102926f9b3fSAlex Elder 1103926f9b3fSAlex Elder rbd_dev = img_request ? img_request->rbd_dev : NULL; 1104926f9b3fSAlex Elder rbd_warn(rbd_dev, "obj_request %p already marked done\n", 1105926f9b3fSAlex Elder obj_request); 1106926f9b3fSAlex Elder } 1107926f9b3fSAlex Elder } 1108926f9b3fSAlex Elder 1109926f9b3fSAlex Elder static bool obj_request_done_test(struct rbd_obj_request *obj_request) 1110926f9b3fSAlex Elder { 1111926f9b3fSAlex Elder smp_mb(); 1112926f9b3fSAlex Elder return test_bit(OBJ_REQ_DONE, &obj_request->flags) != 0; 1113926f9b3fSAlex Elder } 1114926f9b3fSAlex Elder 11156365d33aSAlex Elder static void obj_request_img_data_set(struct rbd_obj_request *obj_request) 11166365d33aSAlex Elder { 11176365d33aSAlex Elder if (test_and_set_bit(OBJ_REQ_IMG_DATA, &obj_request->flags)) { 11186365d33aSAlex Elder struct rbd_img_request *img_request = obj_request->img_request; 11196365d33aSAlex Elder struct rbd_device *rbd_dev; 11206365d33aSAlex Elder 11216365d33aSAlex Elder rbd_dev = img_request ? 
img_request->rbd_dev : NULL; 11226365d33aSAlex Elder rbd_warn(rbd_dev, "obj_request %p already marked img_data\n", 11236365d33aSAlex Elder obj_request); 11246365d33aSAlex Elder } 11256365d33aSAlex Elder } 11266365d33aSAlex Elder 11276365d33aSAlex Elder static bool obj_request_img_data_test(struct rbd_obj_request *obj_request) 11286365d33aSAlex Elder { 11296365d33aSAlex Elder smp_mb(); 11306365d33aSAlex Elder return test_bit(OBJ_REQ_IMG_DATA, &obj_request->flags) != 0; 11316365d33aSAlex Elder } 11326365d33aSAlex Elder 1133bf0d5f50SAlex Elder static void rbd_obj_request_get(struct rbd_obj_request *obj_request) 1134bf0d5f50SAlex Elder { 113537206ee5SAlex Elder dout("%s: obj %p (was %d)\n", __func__, obj_request, 113637206ee5SAlex Elder atomic_read(&obj_request->kref.refcount)); 1137bf0d5f50SAlex Elder kref_get(&obj_request->kref); 1138bf0d5f50SAlex Elder } 1139bf0d5f50SAlex Elder 1140bf0d5f50SAlex Elder static void rbd_obj_request_destroy(struct kref *kref); 1141bf0d5f50SAlex Elder static void rbd_obj_request_put(struct rbd_obj_request *obj_request) 1142bf0d5f50SAlex Elder { 1143bf0d5f50SAlex Elder rbd_assert(obj_request != NULL); 114437206ee5SAlex Elder dout("%s: obj %p (was %d)\n", __func__, obj_request, 114537206ee5SAlex Elder atomic_read(&obj_request->kref.refcount)); 1146bf0d5f50SAlex Elder kref_put(&obj_request->kref, rbd_obj_request_destroy); 1147bf0d5f50SAlex Elder } 1148bf0d5f50SAlex Elder 1149bf0d5f50SAlex Elder static void rbd_img_request_get(struct rbd_img_request *img_request) 1150bf0d5f50SAlex Elder { 115137206ee5SAlex Elder dout("%s: img %p (was %d)\n", __func__, img_request, 115237206ee5SAlex Elder atomic_read(&img_request->kref.refcount)); 1153bf0d5f50SAlex Elder kref_get(&img_request->kref); 1154bf0d5f50SAlex Elder } 1155bf0d5f50SAlex Elder 1156bf0d5f50SAlex Elder static void rbd_img_request_destroy(struct kref *kref); 1157bf0d5f50SAlex Elder static void rbd_img_request_put(struct rbd_img_request *img_request) 1158bf0d5f50SAlex Elder { 
1159bf0d5f50SAlex Elder rbd_assert(img_request != NULL); 116037206ee5SAlex Elder dout("%s: img %p (was %d)\n", __func__, img_request, 116137206ee5SAlex Elder atomic_read(&img_request->kref.refcount)); 1162bf0d5f50SAlex Elder kref_put(&img_request->kref, rbd_img_request_destroy); 1163bf0d5f50SAlex Elder } 1164bf0d5f50SAlex Elder 1165bf0d5f50SAlex Elder static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request, 1166bf0d5f50SAlex Elder struct rbd_obj_request *obj_request) 1167bf0d5f50SAlex Elder { 116825dcf954SAlex Elder rbd_assert(obj_request->img_request == NULL); 116925dcf954SAlex Elder 1170b155e86cSAlex Elder /* Image request now owns object's original reference */ 1171bf0d5f50SAlex Elder obj_request->img_request = img_request; 117225dcf954SAlex Elder obj_request->which = img_request->obj_request_count; 11736365d33aSAlex Elder rbd_assert(!obj_request_img_data_test(obj_request)); 11746365d33aSAlex Elder obj_request_img_data_set(obj_request); 1175bf0d5f50SAlex Elder rbd_assert(obj_request->which != BAD_WHICH); 117625dcf954SAlex Elder img_request->obj_request_count++; 117725dcf954SAlex Elder list_add_tail(&obj_request->links, &img_request->obj_requests); 117837206ee5SAlex Elder dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request, 117937206ee5SAlex Elder obj_request->which); 1180bf0d5f50SAlex Elder } 1181bf0d5f50SAlex Elder 1182bf0d5f50SAlex Elder static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request, 1183bf0d5f50SAlex Elder struct rbd_obj_request *obj_request) 1184bf0d5f50SAlex Elder { 1185bf0d5f50SAlex Elder rbd_assert(obj_request->which != BAD_WHICH); 118625dcf954SAlex Elder 118737206ee5SAlex Elder dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request, 118837206ee5SAlex Elder obj_request->which); 1189bf0d5f50SAlex Elder list_del(&obj_request->links); 119025dcf954SAlex Elder rbd_assert(img_request->obj_request_count > 0); 119125dcf954SAlex Elder img_request->obj_request_count--; 
119225dcf954SAlex Elder rbd_assert(obj_request->which == img_request->obj_request_count); 119325dcf954SAlex Elder obj_request->which = BAD_WHICH; 11946365d33aSAlex Elder rbd_assert(obj_request_img_data_test(obj_request)); 1195bf0d5f50SAlex Elder rbd_assert(obj_request->img_request == img_request); 1196bf0d5f50SAlex Elder obj_request->img_request = NULL; 119725dcf954SAlex Elder obj_request->callback = NULL; 1198bf0d5f50SAlex Elder rbd_obj_request_put(obj_request); 1199bf0d5f50SAlex Elder } 1200bf0d5f50SAlex Elder 1201bf0d5f50SAlex Elder static bool obj_request_type_valid(enum obj_request_type type) 1202bf0d5f50SAlex Elder { 1203bf0d5f50SAlex Elder switch (type) { 12049969ebc5SAlex Elder case OBJ_REQUEST_NODATA: 1205bf0d5f50SAlex Elder case OBJ_REQUEST_BIO: 1206788e2df3SAlex Elder case OBJ_REQUEST_PAGES: 1207bf0d5f50SAlex Elder return true; 1208bf0d5f50SAlex Elder default: 1209bf0d5f50SAlex Elder return false; 1210bf0d5f50SAlex Elder } 1211bf0d5f50SAlex Elder } 1212bf0d5f50SAlex Elder 1213bf0d5f50SAlex Elder static int rbd_obj_request_submit(struct ceph_osd_client *osdc, 1214bf0d5f50SAlex Elder struct rbd_obj_request *obj_request) 1215bf0d5f50SAlex Elder { 121637206ee5SAlex Elder dout("%s: osdc %p obj %p\n", __func__, osdc, obj_request); 121737206ee5SAlex Elder 1218bf0d5f50SAlex Elder return ceph_osdc_start_request(osdc, obj_request->osd_req, false); 1219bf0d5f50SAlex Elder } 1220bf0d5f50SAlex Elder 1221bf0d5f50SAlex Elder static void rbd_img_request_complete(struct rbd_img_request *img_request) 1222bf0d5f50SAlex Elder { 122355f27e09SAlex Elder 122437206ee5SAlex Elder dout("%s: img %p\n", __func__, img_request); 122555f27e09SAlex Elder 122655f27e09SAlex Elder /* 122755f27e09SAlex Elder * If no error occurred, compute the aggregate transfer 122855f27e09SAlex Elder * count for the image request. 
We could instead use 122955f27e09SAlex Elder * atomic64_cmpxchg() to update it as each object request 123055f27e09SAlex Elder * completes; not clear which way is better off hand. 123155f27e09SAlex Elder */ 123255f27e09SAlex Elder if (!img_request->result) { 123355f27e09SAlex Elder struct rbd_obj_request *obj_request; 123455f27e09SAlex Elder u64 xferred = 0; 123555f27e09SAlex Elder 123655f27e09SAlex Elder for_each_obj_request(img_request, obj_request) 123755f27e09SAlex Elder xferred += obj_request->xferred; 123855f27e09SAlex Elder img_request->xferred = xferred; 123955f27e09SAlex Elder } 124055f27e09SAlex Elder 1241bf0d5f50SAlex Elder if (img_request->callback) 1242bf0d5f50SAlex Elder img_request->callback(img_request); 1243bf0d5f50SAlex Elder else 1244bf0d5f50SAlex Elder rbd_img_request_put(img_request); 1245bf0d5f50SAlex Elder } 1246bf0d5f50SAlex Elder 1247788e2df3SAlex Elder /* Caller is responsible for rbd_obj_request_destroy(obj_request) */ 1248788e2df3SAlex Elder 1249788e2df3SAlex Elder static int rbd_obj_request_wait(struct rbd_obj_request *obj_request) 1250788e2df3SAlex Elder { 125137206ee5SAlex Elder dout("%s: obj %p\n", __func__, obj_request); 125237206ee5SAlex Elder 1253788e2df3SAlex Elder return wait_for_completion_interruptible(&obj_request->completion); 1254788e2df3SAlex Elder } 1255788e2df3SAlex Elder 12560c425248SAlex Elder /* 12570c425248SAlex Elder * The default/initial value for all image request flags is 0. Each 12580c425248SAlex Elder * is conditionally set to 1 at image request initialization time 12590c425248SAlex Elder * and currently never change thereafter. 
12600c425248SAlex Elder */ 12610c425248SAlex Elder static void img_request_write_set(struct rbd_img_request *img_request) 12620c425248SAlex Elder { 12630c425248SAlex Elder set_bit(IMG_REQ_WRITE, &img_request->flags); 12640c425248SAlex Elder smp_mb(); 12650c425248SAlex Elder } 12660c425248SAlex Elder 12670c425248SAlex Elder static bool img_request_write_test(struct rbd_img_request *img_request) 12680c425248SAlex Elder { 12690c425248SAlex Elder smp_mb(); 12700c425248SAlex Elder return test_bit(IMG_REQ_WRITE, &img_request->flags) != 0; 12710c425248SAlex Elder } 12720c425248SAlex Elder 12739849e986SAlex Elder static void img_request_child_set(struct rbd_img_request *img_request) 12749849e986SAlex Elder { 12759849e986SAlex Elder set_bit(IMG_REQ_CHILD, &img_request->flags); 12769849e986SAlex Elder smp_mb(); 12779849e986SAlex Elder } 12789849e986SAlex Elder 12799849e986SAlex Elder static bool img_request_child_test(struct rbd_img_request *img_request) 12809849e986SAlex Elder { 12819849e986SAlex Elder smp_mb(); 12829849e986SAlex Elder return test_bit(IMG_REQ_CHILD, &img_request->flags) != 0; 12839849e986SAlex Elder } 12849849e986SAlex Elder 1285d0b2e944SAlex Elder static void img_request_layered_set(struct rbd_img_request *img_request) 1286d0b2e944SAlex Elder { 1287d0b2e944SAlex Elder set_bit(IMG_REQ_LAYERED, &img_request->flags); 1288d0b2e944SAlex Elder smp_mb(); 1289d0b2e944SAlex Elder } 1290d0b2e944SAlex Elder 1291d0b2e944SAlex Elder static bool img_request_layered_test(struct rbd_img_request *img_request) 1292d0b2e944SAlex Elder { 1293d0b2e944SAlex Elder smp_mb(); 1294d0b2e944SAlex Elder return test_bit(IMG_REQ_LAYERED, &img_request->flags) != 0; 1295d0b2e944SAlex Elder } 1296d0b2e944SAlex Elder 12976e2a4505SAlex Elder static void 12986e2a4505SAlex Elder rbd_img_obj_request_read_callback(struct rbd_obj_request *obj_request) 12996e2a4505SAlex Elder { 13006e2a4505SAlex Elder dout("%s: obj %p img %p result %d %llu/%llu\n", __func__, 13016e2a4505SAlex Elder obj_request, 
obj_request->img_request, obj_request->result, 13026e2a4505SAlex Elder obj_request->xferred, obj_request->length); 13036e2a4505SAlex Elder /* 13046e2a4505SAlex Elder * ENOENT means a hole in the image. We zero-fill the 13056e2a4505SAlex Elder * entire length of the request. A short read also implies 13066e2a4505SAlex Elder * zero-fill to the end of the request. Either way we 13076e2a4505SAlex Elder * update the xferred count to indicate the whole request 13086e2a4505SAlex Elder * was satisfied. 13096e2a4505SAlex Elder */ 13106e2a4505SAlex Elder BUG_ON(obj_request->type != OBJ_REQUEST_BIO); 13116e2a4505SAlex Elder if (obj_request->result == -ENOENT) { 13126e2a4505SAlex Elder zero_bio_chain(obj_request->bio_list, 0); 13136e2a4505SAlex Elder obj_request->result = 0; 13146e2a4505SAlex Elder obj_request->xferred = obj_request->length; 13156e2a4505SAlex Elder } else if (obj_request->xferred < obj_request->length && 13166e2a4505SAlex Elder !obj_request->result) { 13176e2a4505SAlex Elder zero_bio_chain(obj_request->bio_list, obj_request->xferred); 13186e2a4505SAlex Elder obj_request->xferred = obj_request->length; 13196e2a4505SAlex Elder } 13206e2a4505SAlex Elder obj_request_done_set(obj_request); 13216e2a4505SAlex Elder } 13226e2a4505SAlex Elder 1323bf0d5f50SAlex Elder static void rbd_obj_request_complete(struct rbd_obj_request *obj_request) 1324bf0d5f50SAlex Elder { 132537206ee5SAlex Elder dout("%s: obj %p cb %p\n", __func__, obj_request, 132637206ee5SAlex Elder obj_request->callback); 1327bf0d5f50SAlex Elder if (obj_request->callback) 1328bf0d5f50SAlex Elder obj_request->callback(obj_request); 1329788e2df3SAlex Elder else 1330788e2df3SAlex Elder complete_all(&obj_request->completion); 1331bf0d5f50SAlex Elder } 1332bf0d5f50SAlex Elder 1333c47f9371SAlex Elder static void rbd_osd_trivial_callback(struct rbd_obj_request *obj_request) 133439bf2c5dSAlex Elder { 133539bf2c5dSAlex Elder dout("%s: obj %p\n", __func__, obj_request); 133639bf2c5dSAlex Elder 
obj_request_done_set(obj_request); 133739bf2c5dSAlex Elder } 133839bf2c5dSAlex Elder 1339c47f9371SAlex Elder static void rbd_osd_read_callback(struct rbd_obj_request *obj_request) 1340bf0d5f50SAlex Elder { 13418b3e1a56SAlex Elder struct rbd_img_request *img_request = obj_request->img_request; 13428b3e1a56SAlex Elder bool layered = img_request && img_request_layered_test(img_request); 13438b3e1a56SAlex Elder 13448b3e1a56SAlex Elder dout("%s: obj %p img %p result %d %llu/%llu\n", __func__, 13458b3e1a56SAlex Elder obj_request, img_request, obj_request->result, 13468b3e1a56SAlex Elder obj_request->xferred, obj_request->length); 13478b3e1a56SAlex Elder if (layered && obj_request->result == -ENOENT) 13488b3e1a56SAlex Elder rbd_img_parent_read(obj_request); 13498b3e1a56SAlex Elder else if (img_request) 13506e2a4505SAlex Elder rbd_img_obj_request_read_callback(obj_request); 13516e2a4505SAlex Elder else 135207741308SAlex Elder obj_request_done_set(obj_request); 1353bf0d5f50SAlex Elder } 1354bf0d5f50SAlex Elder 1355c47f9371SAlex Elder static void rbd_osd_write_callback(struct rbd_obj_request *obj_request) 1356bf0d5f50SAlex Elder { 13571b83bef2SSage Weil dout("%s: obj %p result %d %llu\n", __func__, obj_request, 13581b83bef2SSage Weil obj_request->result, obj_request->length); 13591b83bef2SSage Weil /* 13608b3e1a56SAlex Elder * There is no such thing as a successful short write. Set 13618b3e1a56SAlex Elder * it to our originally-requested length. 13621b83bef2SSage Weil */ 13631b83bef2SSage Weil obj_request->xferred = obj_request->length; 136407741308SAlex Elder obj_request_done_set(obj_request); 1365bf0d5f50SAlex Elder } 1366bf0d5f50SAlex Elder 1367fbfab539SAlex Elder /* 1368fbfab539SAlex Elder * For a simple stat call there's nothing to do. We'll do more if 1369fbfab539SAlex Elder * this is part of a write sequence for a layered image. 
1370fbfab539SAlex Elder */ 1371c47f9371SAlex Elder static void rbd_osd_stat_callback(struct rbd_obj_request *obj_request) 1372fbfab539SAlex Elder { 137337206ee5SAlex Elder dout("%s: obj %p\n", __func__, obj_request); 1374fbfab539SAlex Elder obj_request_done_set(obj_request); 1375fbfab539SAlex Elder } 1376fbfab539SAlex Elder 1377bf0d5f50SAlex Elder static void rbd_osd_req_callback(struct ceph_osd_request *osd_req, 1378bf0d5f50SAlex Elder struct ceph_msg *msg) 1379bf0d5f50SAlex Elder { 1380bf0d5f50SAlex Elder struct rbd_obj_request *obj_request = osd_req->r_priv; 1381bf0d5f50SAlex Elder u16 opcode; 1382bf0d5f50SAlex Elder 138337206ee5SAlex Elder dout("%s: osd_req %p msg %p\n", __func__, osd_req, msg); 1384bf0d5f50SAlex Elder rbd_assert(osd_req == obj_request->osd_req); 13856365d33aSAlex Elder rbd_assert(obj_request_img_data_test(obj_request) ^ 13866365d33aSAlex Elder !obj_request->img_request); 13876365d33aSAlex Elder rbd_assert(obj_request_img_data_test(obj_request) ^ 1388bf0d5f50SAlex Elder (obj_request->which == BAD_WHICH)); 1389bf0d5f50SAlex Elder 13901b83bef2SSage Weil if (osd_req->r_result < 0) 13911b83bef2SSage Weil obj_request->result = osd_req->r_result; 1392bf0d5f50SAlex Elder obj_request->version = le64_to_cpu(osd_req->r_reassert_version.version); 1393bf0d5f50SAlex Elder 13941b83bef2SSage Weil WARN_ON(osd_req->r_num_ops != 1); /* For now */ 1395bf0d5f50SAlex Elder 1396c47f9371SAlex Elder /* 1397c47f9371SAlex Elder * We support a 64-bit length, but ultimately it has to be 1398c47f9371SAlex Elder * passed to blk_end_request(), which takes an unsigned int. 
1399c47f9371SAlex Elder */ 14001b83bef2SSage Weil obj_request->xferred = osd_req->r_reply_op_len[0]; 1401c47f9371SAlex Elder rbd_assert(obj_request->xferred < (u64)UINT_MAX); 140279528734SAlex Elder opcode = osd_req->r_ops[0].op; 1403bf0d5f50SAlex Elder switch (opcode) { 1404bf0d5f50SAlex Elder case CEPH_OSD_OP_READ: 1405c47f9371SAlex Elder rbd_osd_read_callback(obj_request); 1406bf0d5f50SAlex Elder break; 1407bf0d5f50SAlex Elder case CEPH_OSD_OP_WRITE: 1408c47f9371SAlex Elder rbd_osd_write_callback(obj_request); 1409bf0d5f50SAlex Elder break; 1410fbfab539SAlex Elder case CEPH_OSD_OP_STAT: 1411c47f9371SAlex Elder rbd_osd_stat_callback(obj_request); 1412fbfab539SAlex Elder break; 141336be9a76SAlex Elder case CEPH_OSD_OP_CALL: 1414b8d70035SAlex Elder case CEPH_OSD_OP_NOTIFY_ACK: 14159969ebc5SAlex Elder case CEPH_OSD_OP_WATCH: 1416c47f9371SAlex Elder rbd_osd_trivial_callback(obj_request); 14179969ebc5SAlex Elder break; 1418bf0d5f50SAlex Elder default: 1419bf0d5f50SAlex Elder rbd_warn(NULL, "%s: unsupported op %hu\n", 1420bf0d5f50SAlex Elder obj_request->object_name, (unsigned short) opcode); 1421bf0d5f50SAlex Elder break; 1422bf0d5f50SAlex Elder } 1423bf0d5f50SAlex Elder 142407741308SAlex Elder if (obj_request_done_test(obj_request)) 1425bf0d5f50SAlex Elder rbd_obj_request_complete(obj_request); 1426bf0d5f50SAlex Elder } 1427bf0d5f50SAlex Elder 14282fa12320SAlex Elder static void rbd_osd_req_format(struct rbd_obj_request *obj_request, 142979528734SAlex Elder bool write_request) 1430430c28c3SAlex Elder { 1431430c28c3SAlex Elder struct rbd_img_request *img_request = obj_request->img_request; 14328c042b0dSAlex Elder struct ceph_osd_request *osd_req = obj_request->osd_req; 1433430c28c3SAlex Elder struct ceph_snap_context *snapc = NULL; 1434430c28c3SAlex Elder u64 snap_id = CEPH_NOSNAP; 1435430c28c3SAlex Elder struct timespec *mtime = NULL; 1436430c28c3SAlex Elder struct timespec now; 1437430c28c3SAlex Elder 14388c042b0dSAlex Elder rbd_assert(osd_req != NULL); 
1439430c28c3SAlex Elder 1440430c28c3SAlex Elder if (write_request) { 1441430c28c3SAlex Elder now = CURRENT_TIME; 1442430c28c3SAlex Elder mtime = &now; 1443430c28c3SAlex Elder if (img_request) 1444430c28c3SAlex Elder snapc = img_request->snapc; 14452fa12320SAlex Elder } else if (img_request) { 1446430c28c3SAlex Elder snap_id = img_request->snap_id; 1447430c28c3SAlex Elder } 14488c042b0dSAlex Elder ceph_osdc_build_request(osd_req, obj_request->offset, 144979528734SAlex Elder snapc, snap_id, mtime); 1450430c28c3SAlex Elder } 1451430c28c3SAlex Elder 1452bf0d5f50SAlex Elder static struct ceph_osd_request *rbd_osd_req_create( 1453bf0d5f50SAlex Elder struct rbd_device *rbd_dev, 1454bf0d5f50SAlex Elder bool write_request, 1455430c28c3SAlex Elder struct rbd_obj_request *obj_request) 1456bf0d5f50SAlex Elder { 1457bf0d5f50SAlex Elder struct ceph_snap_context *snapc = NULL; 1458bf0d5f50SAlex Elder struct ceph_osd_client *osdc; 1459bf0d5f50SAlex Elder struct ceph_osd_request *osd_req; 1460bf0d5f50SAlex Elder 14616365d33aSAlex Elder if (obj_request_img_data_test(obj_request)) { 14626365d33aSAlex Elder struct rbd_img_request *img_request = obj_request->img_request; 14636365d33aSAlex Elder 14640c425248SAlex Elder rbd_assert(write_request == 14650c425248SAlex Elder img_request_write_test(img_request)); 14660c425248SAlex Elder if (write_request) 1467bf0d5f50SAlex Elder snapc = img_request->snapc; 1468bf0d5f50SAlex Elder } 1469bf0d5f50SAlex Elder 1470bf0d5f50SAlex Elder /* Allocate and initialize the request, for the single op */ 1471bf0d5f50SAlex Elder 1472bf0d5f50SAlex Elder osdc = &rbd_dev->rbd_client->client->osdc; 1473bf0d5f50SAlex Elder osd_req = ceph_osdc_alloc_request(osdc, snapc, 1, false, GFP_ATOMIC); 1474bf0d5f50SAlex Elder if (!osd_req) 1475bf0d5f50SAlex Elder return NULL; /* ENOMEM */ 1476bf0d5f50SAlex Elder 1477430c28c3SAlex Elder if (write_request) 1478bf0d5f50SAlex Elder osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK; 1479430c28c3SAlex Elder else 
1480bf0d5f50SAlex Elder osd_req->r_flags = CEPH_OSD_FLAG_READ; 1481bf0d5f50SAlex Elder 1482bf0d5f50SAlex Elder osd_req->r_callback = rbd_osd_req_callback; 1483bf0d5f50SAlex Elder osd_req->r_priv = obj_request; 1484bf0d5f50SAlex Elder 1485bf0d5f50SAlex Elder osd_req->r_oid_len = strlen(obj_request->object_name); 1486bf0d5f50SAlex Elder rbd_assert(osd_req->r_oid_len < sizeof (osd_req->r_oid)); 1487bf0d5f50SAlex Elder memcpy(osd_req->r_oid, obj_request->object_name, osd_req->r_oid_len); 1488bf0d5f50SAlex Elder 1489bf0d5f50SAlex Elder osd_req->r_file_layout = rbd_dev->layout; /* struct */ 1490bf0d5f50SAlex Elder 1491bf0d5f50SAlex Elder return osd_req; 1492bf0d5f50SAlex Elder } 1493bf0d5f50SAlex Elder 1494bf0d5f50SAlex Elder static void rbd_osd_req_destroy(struct ceph_osd_request *osd_req) 1495bf0d5f50SAlex Elder { 1496bf0d5f50SAlex Elder ceph_osdc_put_request(osd_req); 1497bf0d5f50SAlex Elder } 1498bf0d5f50SAlex Elder 1499bf0d5f50SAlex Elder /* object_name is assumed to be a non-null pointer and NUL-terminated */ 1500bf0d5f50SAlex Elder 1501bf0d5f50SAlex Elder static struct rbd_obj_request *rbd_obj_request_create(const char *object_name, 1502bf0d5f50SAlex Elder u64 offset, u64 length, 1503bf0d5f50SAlex Elder enum obj_request_type type) 1504bf0d5f50SAlex Elder { 1505bf0d5f50SAlex Elder struct rbd_obj_request *obj_request; 1506bf0d5f50SAlex Elder size_t size; 1507bf0d5f50SAlex Elder char *name; 1508bf0d5f50SAlex Elder 1509bf0d5f50SAlex Elder rbd_assert(obj_request_type_valid(type)); 1510bf0d5f50SAlex Elder 1511bf0d5f50SAlex Elder size = strlen(object_name) + 1; 1512bf0d5f50SAlex Elder obj_request = kzalloc(sizeof (*obj_request) + size, GFP_KERNEL); 1513bf0d5f50SAlex Elder if (!obj_request) 1514bf0d5f50SAlex Elder return NULL; 1515bf0d5f50SAlex Elder 1516bf0d5f50SAlex Elder name = (char *)(obj_request + 1); 1517bf0d5f50SAlex Elder obj_request->object_name = memcpy(name, object_name, size); 1518bf0d5f50SAlex Elder obj_request->offset = offset; 1519bf0d5f50SAlex Elder 
obj_request->length = length; 1520926f9b3fSAlex Elder obj_request->flags = 0; 1521bf0d5f50SAlex Elder obj_request->which = BAD_WHICH; 1522bf0d5f50SAlex Elder obj_request->type = type; 1523bf0d5f50SAlex Elder INIT_LIST_HEAD(&obj_request->links); 1524788e2df3SAlex Elder init_completion(&obj_request->completion); 1525bf0d5f50SAlex Elder kref_init(&obj_request->kref); 1526bf0d5f50SAlex Elder 152737206ee5SAlex Elder dout("%s: \"%s\" %llu/%llu %d -> obj %p\n", __func__, object_name, 152837206ee5SAlex Elder offset, length, (int)type, obj_request); 152937206ee5SAlex Elder 1530bf0d5f50SAlex Elder return obj_request; 1531bf0d5f50SAlex Elder } 1532bf0d5f50SAlex Elder 1533bf0d5f50SAlex Elder static void rbd_obj_request_destroy(struct kref *kref) 1534bf0d5f50SAlex Elder { 1535bf0d5f50SAlex Elder struct rbd_obj_request *obj_request; 1536bf0d5f50SAlex Elder 1537bf0d5f50SAlex Elder obj_request = container_of(kref, struct rbd_obj_request, kref); 1538bf0d5f50SAlex Elder 153937206ee5SAlex Elder dout("%s: obj %p\n", __func__, obj_request); 154037206ee5SAlex Elder 1541bf0d5f50SAlex Elder rbd_assert(obj_request->img_request == NULL); 1542bf0d5f50SAlex Elder rbd_assert(obj_request->which == BAD_WHICH); 1543bf0d5f50SAlex Elder 1544bf0d5f50SAlex Elder if (obj_request->osd_req) 1545bf0d5f50SAlex Elder rbd_osd_req_destroy(obj_request->osd_req); 1546bf0d5f50SAlex Elder 1547bf0d5f50SAlex Elder rbd_assert(obj_request_type_valid(obj_request->type)); 1548bf0d5f50SAlex Elder switch (obj_request->type) { 15499969ebc5SAlex Elder case OBJ_REQUEST_NODATA: 15509969ebc5SAlex Elder break; /* Nothing to do */ 1551bf0d5f50SAlex Elder case OBJ_REQUEST_BIO: 1552bf0d5f50SAlex Elder if (obj_request->bio_list) 1553bf0d5f50SAlex Elder bio_chain_put(obj_request->bio_list); 1554bf0d5f50SAlex Elder break; 1555788e2df3SAlex Elder case OBJ_REQUEST_PAGES: 1556788e2df3SAlex Elder if (obj_request->pages) 1557788e2df3SAlex Elder ceph_release_page_vector(obj_request->pages, 1558788e2df3SAlex Elder 
obj_request->page_count); 1559788e2df3SAlex Elder break; 1560bf0d5f50SAlex Elder } 1561bf0d5f50SAlex Elder 1562bf0d5f50SAlex Elder kfree(obj_request); 1563bf0d5f50SAlex Elder } 1564bf0d5f50SAlex Elder 1565bf0d5f50SAlex Elder /* 1566bf0d5f50SAlex Elder * Caller is responsible for filling in the list of object requests 1567bf0d5f50SAlex Elder * that comprises the image request, and the Linux request pointer 1568bf0d5f50SAlex Elder * (if there is one). 1569bf0d5f50SAlex Elder */ 1570cc344fa1SAlex Elder static struct rbd_img_request *rbd_img_request_create( 1571cc344fa1SAlex Elder struct rbd_device *rbd_dev, 1572bf0d5f50SAlex Elder u64 offset, u64 length, 15739849e986SAlex Elder bool write_request, 15749849e986SAlex Elder bool child_request) 1575bf0d5f50SAlex Elder { 1576bf0d5f50SAlex Elder struct rbd_img_request *img_request; 1577bf0d5f50SAlex Elder struct ceph_snap_context *snapc = NULL; 1578bf0d5f50SAlex Elder 1579bf0d5f50SAlex Elder img_request = kmalloc(sizeof (*img_request), GFP_ATOMIC); 1580bf0d5f50SAlex Elder if (!img_request) 1581bf0d5f50SAlex Elder return NULL; 1582bf0d5f50SAlex Elder 1583bf0d5f50SAlex Elder if (write_request) { 1584bf0d5f50SAlex Elder down_read(&rbd_dev->header_rwsem); 1585bf0d5f50SAlex Elder snapc = ceph_get_snap_context(rbd_dev->header.snapc); 1586bf0d5f50SAlex Elder up_read(&rbd_dev->header_rwsem); 1587bf0d5f50SAlex Elder if (WARN_ON(!snapc)) { 1588bf0d5f50SAlex Elder kfree(img_request); 1589bf0d5f50SAlex Elder return NULL; /* Shouldn't happen */ 1590bf0d5f50SAlex Elder } 15910c425248SAlex Elder 1592bf0d5f50SAlex Elder } 1593bf0d5f50SAlex Elder 1594bf0d5f50SAlex Elder img_request->rq = NULL; 1595bf0d5f50SAlex Elder img_request->rbd_dev = rbd_dev; 1596bf0d5f50SAlex Elder img_request->offset = offset; 1597bf0d5f50SAlex Elder img_request->length = length; 15980c425248SAlex Elder img_request->flags = 0; 15990c425248SAlex Elder if (write_request) { 16000c425248SAlex Elder img_request_write_set(img_request); 1601bf0d5f50SAlex Elder 
img_request->snapc = snapc; 16020c425248SAlex Elder } else { 1603bf0d5f50SAlex Elder img_request->snap_id = rbd_dev->spec->snap_id; 16040c425248SAlex Elder } 16059849e986SAlex Elder if (child_request) 16069849e986SAlex Elder img_request_child_set(img_request); 1607d0b2e944SAlex Elder if (rbd_dev->parent_spec) 1608d0b2e944SAlex Elder img_request_layered_set(img_request); 1609bf0d5f50SAlex Elder spin_lock_init(&img_request->completion_lock); 1610bf0d5f50SAlex Elder img_request->next_completion = 0; 1611bf0d5f50SAlex Elder img_request->callback = NULL; 1612a5a337d4SAlex Elder img_request->result = 0; 1613bf0d5f50SAlex Elder img_request->obj_request_count = 0; 1614bf0d5f50SAlex Elder INIT_LIST_HEAD(&img_request->obj_requests); 1615bf0d5f50SAlex Elder kref_init(&img_request->kref); 1616bf0d5f50SAlex Elder 1617bf0d5f50SAlex Elder rbd_img_request_get(img_request); /* Avoid a warning */ 1618bf0d5f50SAlex Elder rbd_img_request_put(img_request); /* TEMPORARY */ 1619bf0d5f50SAlex Elder 162037206ee5SAlex Elder dout("%s: rbd_dev %p %s %llu/%llu -> img %p\n", __func__, rbd_dev, 162137206ee5SAlex Elder write_request ? 
"write" : "read", offset, length, 162237206ee5SAlex Elder img_request); 162337206ee5SAlex Elder 1624bf0d5f50SAlex Elder return img_request; 1625bf0d5f50SAlex Elder } 1626bf0d5f50SAlex Elder 1627bf0d5f50SAlex Elder static void rbd_img_request_destroy(struct kref *kref) 1628bf0d5f50SAlex Elder { 1629bf0d5f50SAlex Elder struct rbd_img_request *img_request; 1630bf0d5f50SAlex Elder struct rbd_obj_request *obj_request; 1631bf0d5f50SAlex Elder struct rbd_obj_request *next_obj_request; 1632bf0d5f50SAlex Elder 1633bf0d5f50SAlex Elder img_request = container_of(kref, struct rbd_img_request, kref); 1634bf0d5f50SAlex Elder 163537206ee5SAlex Elder dout("%s: img %p\n", __func__, img_request); 163637206ee5SAlex Elder 1637bf0d5f50SAlex Elder for_each_obj_request_safe(img_request, obj_request, next_obj_request) 1638bf0d5f50SAlex Elder rbd_img_obj_request_del(img_request, obj_request); 163925dcf954SAlex Elder rbd_assert(img_request->obj_request_count == 0); 1640bf0d5f50SAlex Elder 16410c425248SAlex Elder if (img_request_write_test(img_request)) 1642bf0d5f50SAlex Elder ceph_put_snap_context(img_request->snapc); 1643bf0d5f50SAlex Elder 16448b3e1a56SAlex Elder if (img_request_child_test(img_request)) 16458b3e1a56SAlex Elder rbd_obj_request_put(img_request->obj_request); 16468b3e1a56SAlex Elder 1647bf0d5f50SAlex Elder kfree(img_request); 1648bf0d5f50SAlex Elder } 1649bf0d5f50SAlex Elder 16501217857fSAlex Elder static bool rbd_img_obj_end_request(struct rbd_obj_request *obj_request) 16511217857fSAlex Elder { 16526365d33aSAlex Elder struct rbd_img_request *img_request; 16531217857fSAlex Elder unsigned int xferred; 16541217857fSAlex Elder int result; 16558b3e1a56SAlex Elder bool more; 16561217857fSAlex Elder 16576365d33aSAlex Elder rbd_assert(obj_request_img_data_test(obj_request)); 16586365d33aSAlex Elder img_request = obj_request->img_request; 16596365d33aSAlex Elder 16601217857fSAlex Elder rbd_assert(obj_request->xferred <= (u64)UINT_MAX); 16611217857fSAlex Elder xferred = (unsigned 
int)obj_request->xferred; 16621217857fSAlex Elder result = obj_request->result; 16631217857fSAlex Elder if (result) { 16641217857fSAlex Elder struct rbd_device *rbd_dev = img_request->rbd_dev; 16651217857fSAlex Elder 16661217857fSAlex Elder rbd_warn(rbd_dev, "%s %llx at %llx (%llx)\n", 16671217857fSAlex Elder img_request_write_test(img_request) ? "write" : "read", 16681217857fSAlex Elder obj_request->length, obj_request->img_offset, 16691217857fSAlex Elder obj_request->offset); 16701217857fSAlex Elder rbd_warn(rbd_dev, " result %d xferred %x\n", 16711217857fSAlex Elder result, xferred); 16721217857fSAlex Elder if (!img_request->result) 16731217857fSAlex Elder img_request->result = result; 16741217857fSAlex Elder } 16751217857fSAlex Elder 16768b3e1a56SAlex Elder if (img_request_child_test(img_request)) { 16778b3e1a56SAlex Elder rbd_assert(img_request->obj_request != NULL); 16788b3e1a56SAlex Elder more = obj_request->which < img_request->obj_request_count - 1; 16798b3e1a56SAlex Elder } else { 16808b3e1a56SAlex Elder rbd_assert(img_request->rq != NULL); 16818b3e1a56SAlex Elder more = blk_end_request(img_request->rq, result, xferred); 16828b3e1a56SAlex Elder } 16838b3e1a56SAlex Elder 16848b3e1a56SAlex Elder return more; 16851217857fSAlex Elder } 16861217857fSAlex Elder 16872169238dSAlex Elder static void rbd_img_obj_callback(struct rbd_obj_request *obj_request) 16882169238dSAlex Elder { 16892169238dSAlex Elder struct rbd_img_request *img_request; 16902169238dSAlex Elder u32 which = obj_request->which; 16912169238dSAlex Elder bool more = true; 16922169238dSAlex Elder 16936365d33aSAlex Elder rbd_assert(obj_request_img_data_test(obj_request)); 16942169238dSAlex Elder img_request = obj_request->img_request; 16952169238dSAlex Elder 16962169238dSAlex Elder dout("%s: img %p obj %p\n", __func__, img_request, obj_request); 16972169238dSAlex Elder rbd_assert(img_request != NULL); 16982169238dSAlex Elder rbd_assert(img_request->obj_request_count > 0); 16992169238dSAlex Elder 
rbd_assert(which != BAD_WHICH); 17002169238dSAlex Elder rbd_assert(which < img_request->obj_request_count); 17012169238dSAlex Elder rbd_assert(which >= img_request->next_completion); 17022169238dSAlex Elder 17032169238dSAlex Elder spin_lock_irq(&img_request->completion_lock); 17042169238dSAlex Elder if (which != img_request->next_completion) 17052169238dSAlex Elder goto out; 17062169238dSAlex Elder 17072169238dSAlex Elder for_each_obj_request_from(img_request, obj_request) { 17082169238dSAlex Elder rbd_assert(more); 17092169238dSAlex Elder rbd_assert(which < img_request->obj_request_count); 17102169238dSAlex Elder 17112169238dSAlex Elder if (!obj_request_done_test(obj_request)) 17122169238dSAlex Elder break; 17131217857fSAlex Elder more = rbd_img_obj_end_request(obj_request); 17142169238dSAlex Elder which++; 17152169238dSAlex Elder } 17162169238dSAlex Elder 17172169238dSAlex Elder rbd_assert(more ^ (which == img_request->obj_request_count)); 17182169238dSAlex Elder img_request->next_completion = which; 17192169238dSAlex Elder out: 17202169238dSAlex Elder spin_unlock_irq(&img_request->completion_lock); 17212169238dSAlex Elder 17222169238dSAlex Elder if (!more) 17232169238dSAlex Elder rbd_img_request_complete(img_request); 17242169238dSAlex Elder } 17252169238dSAlex Elder 1726bf0d5f50SAlex Elder static int rbd_img_request_fill_bio(struct rbd_img_request *img_request, 1727bf0d5f50SAlex Elder struct bio *bio_list) 1728bf0d5f50SAlex Elder { 1729bf0d5f50SAlex Elder struct rbd_device *rbd_dev = img_request->rbd_dev; 1730bf0d5f50SAlex Elder struct rbd_obj_request *obj_request = NULL; 1731bf0d5f50SAlex Elder struct rbd_obj_request *next_obj_request; 17320c425248SAlex Elder bool write_request = img_request_write_test(img_request); 1733bf0d5f50SAlex Elder unsigned int bio_offset; 17347da22d29SAlex Elder u64 img_offset; 1735bf0d5f50SAlex Elder u64 resid; 1736bf0d5f50SAlex Elder u16 opcode; 1737bf0d5f50SAlex Elder 173837206ee5SAlex Elder dout("%s: img %p bio %p\n", __func__, 
img_request, bio_list); 173937206ee5SAlex Elder 1740430c28c3SAlex Elder opcode = write_request ? CEPH_OSD_OP_WRITE : CEPH_OSD_OP_READ; 1741bf0d5f50SAlex Elder bio_offset = 0; 17427da22d29SAlex Elder img_offset = img_request->offset; 17437da22d29SAlex Elder rbd_assert(img_offset == bio_list->bi_sector << SECTOR_SHIFT); 1744bf0d5f50SAlex Elder resid = img_request->length; 17454dda41d3SAlex Elder rbd_assert(resid > 0); 1746bf0d5f50SAlex Elder while (resid) { 17472fa12320SAlex Elder struct ceph_osd_request *osd_req; 1748bf0d5f50SAlex Elder const char *object_name; 1749bf0d5f50SAlex Elder unsigned int clone_size; 1750bf0d5f50SAlex Elder u64 offset; 1751bf0d5f50SAlex Elder u64 length; 1752bf0d5f50SAlex Elder 17537da22d29SAlex Elder object_name = rbd_segment_name(rbd_dev, img_offset); 1754bf0d5f50SAlex Elder if (!object_name) 1755bf0d5f50SAlex Elder goto out_unwind; 17567da22d29SAlex Elder offset = rbd_segment_offset(rbd_dev, img_offset); 17577da22d29SAlex Elder length = rbd_segment_length(rbd_dev, img_offset, resid); 1758bf0d5f50SAlex Elder obj_request = rbd_obj_request_create(object_name, 1759bf0d5f50SAlex Elder offset, length, 1760bf0d5f50SAlex Elder OBJ_REQUEST_BIO); 1761bf0d5f50SAlex Elder kfree(object_name); /* object request has its own copy */ 1762bf0d5f50SAlex Elder if (!obj_request) 1763bf0d5f50SAlex Elder goto out_unwind; 1764bf0d5f50SAlex Elder 1765bf0d5f50SAlex Elder rbd_assert(length <= (u64) UINT_MAX); 1766bf0d5f50SAlex Elder clone_size = (unsigned int) length; 1767bf0d5f50SAlex Elder obj_request->bio_list = bio_chain_clone_range(&bio_list, 1768bf0d5f50SAlex Elder &bio_offset, clone_size, 1769bf0d5f50SAlex Elder GFP_ATOMIC); 1770bf0d5f50SAlex Elder if (!obj_request->bio_list) 1771bf0d5f50SAlex Elder goto out_partial; 1772bf0d5f50SAlex Elder 17732fa12320SAlex Elder osd_req = rbd_osd_req_create(rbd_dev, write_request, 17742fa12320SAlex Elder obj_request); 17752fa12320SAlex Elder if (!osd_req) 1776bf0d5f50SAlex Elder goto out_partial; 17772fa12320SAlex Elder 
obj_request->osd_req = osd_req; 17782169238dSAlex Elder obj_request->callback = rbd_img_obj_callback; 1779430c28c3SAlex Elder 17802fa12320SAlex Elder osd_req_op_extent_init(osd_req, 0, opcode, offset, length, 17812fa12320SAlex Elder 0, 0); 1782406e2c9fSAlex Elder osd_req_op_extent_osd_data_bio(osd_req, 0, 1783a4ce40a9SAlex Elder obj_request->bio_list, obj_request->length); 17842fa12320SAlex Elder rbd_osd_req_format(obj_request, write_request); 1785430c28c3SAlex Elder 17867da22d29SAlex Elder obj_request->img_offset = img_offset; 1787bf0d5f50SAlex Elder rbd_img_obj_request_add(img_request, obj_request); 1788bf0d5f50SAlex Elder 17897da22d29SAlex Elder img_offset += length; 1790bf0d5f50SAlex Elder resid -= length; 1791bf0d5f50SAlex Elder } 1792bf0d5f50SAlex Elder 1793bf0d5f50SAlex Elder return 0; 1794bf0d5f50SAlex Elder 1795bf0d5f50SAlex Elder out_partial: 1796bf0d5f50SAlex Elder rbd_obj_request_put(obj_request); 1797bf0d5f50SAlex Elder out_unwind: 1798bf0d5f50SAlex Elder for_each_obj_request_safe(img_request, obj_request, next_obj_request) 1799bf0d5f50SAlex Elder rbd_obj_request_put(obj_request); 1800bf0d5f50SAlex Elder 1801bf0d5f50SAlex Elder return -ENOMEM; 1802bf0d5f50SAlex Elder } 1803bf0d5f50SAlex Elder 1804bf0d5f50SAlex Elder static int rbd_img_request_submit(struct rbd_img_request *img_request) 1805bf0d5f50SAlex Elder { 1806bf0d5f50SAlex Elder struct rbd_device *rbd_dev = img_request->rbd_dev; 1807bf0d5f50SAlex Elder struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 1808bf0d5f50SAlex Elder struct rbd_obj_request *obj_request; 180946faeed4SAlex Elder struct rbd_obj_request *next_obj_request; 1810bf0d5f50SAlex Elder 181137206ee5SAlex Elder dout("%s: img %p\n", __func__, img_request); 181246faeed4SAlex Elder for_each_obj_request_safe(img_request, obj_request, next_obj_request) { 1813bf0d5f50SAlex Elder int ret; 1814bf0d5f50SAlex Elder 1815bf0d5f50SAlex Elder ret = rbd_obj_request_submit(osdc, obj_request); 1816bf0d5f50SAlex Elder if (ret) 
1817bf0d5f50SAlex Elder return ret; 1818bf0d5f50SAlex Elder } 1819bf0d5f50SAlex Elder 1820bf0d5f50SAlex Elder return 0; 1821bf0d5f50SAlex Elder } 1822bf0d5f50SAlex Elder 18238b3e1a56SAlex Elder static void rbd_img_parent_read_callback(struct rbd_img_request *img_request) 18248b3e1a56SAlex Elder { 18258b3e1a56SAlex Elder struct rbd_obj_request *obj_request; 18268b3e1a56SAlex Elder 18278b3e1a56SAlex Elder rbd_assert(img_request_child_test(img_request)); 18288b3e1a56SAlex Elder 18298b3e1a56SAlex Elder obj_request = img_request->obj_request; 18308b3e1a56SAlex Elder rbd_assert(obj_request != NULL); 18318b3e1a56SAlex Elder obj_request->result = img_request->result; 18328b3e1a56SAlex Elder obj_request->xferred = img_request->xferred; 18338b3e1a56SAlex Elder 18348b3e1a56SAlex Elder rbd_img_obj_request_read_callback(obj_request); 18358b3e1a56SAlex Elder rbd_obj_request_complete(obj_request); 18368b3e1a56SAlex Elder } 18378b3e1a56SAlex Elder 18388b3e1a56SAlex Elder static void rbd_img_parent_read(struct rbd_obj_request *obj_request) 18398b3e1a56SAlex Elder { 18408b3e1a56SAlex Elder struct rbd_device *rbd_dev; 18418b3e1a56SAlex Elder struct rbd_img_request *img_request; 18428b3e1a56SAlex Elder int result; 18438b3e1a56SAlex Elder 18448b3e1a56SAlex Elder rbd_assert(obj_request_img_data_test(obj_request)); 18458b3e1a56SAlex Elder rbd_assert(obj_request->img_request != NULL); 18468b3e1a56SAlex Elder rbd_assert(obj_request->result == (s32) -ENOENT); 18478b3e1a56SAlex Elder rbd_assert(obj_request->type == OBJ_REQUEST_BIO); 18488b3e1a56SAlex Elder 18498b3e1a56SAlex Elder rbd_dev = obj_request->img_request->rbd_dev; 18508b3e1a56SAlex Elder rbd_assert(rbd_dev->parent != NULL); 18518b3e1a56SAlex Elder /* rbd_read_finish(obj_request, obj_request->length); */ 18528b3e1a56SAlex Elder img_request = rbd_img_request_create(rbd_dev->parent, 18538b3e1a56SAlex Elder obj_request->img_offset, 18548b3e1a56SAlex Elder obj_request->length, 18558b3e1a56SAlex Elder false, true); 18568b3e1a56SAlex 
Elder result = -ENOMEM; 18578b3e1a56SAlex Elder if (!img_request) 18588b3e1a56SAlex Elder goto out_err; 18598b3e1a56SAlex Elder 18608b3e1a56SAlex Elder rbd_obj_request_get(obj_request); 18618b3e1a56SAlex Elder img_request->obj_request = obj_request; 18628b3e1a56SAlex Elder 18638b3e1a56SAlex Elder result = rbd_img_request_fill_bio(img_request, obj_request->bio_list); 18648b3e1a56SAlex Elder if (result) 18658b3e1a56SAlex Elder goto out_err; 18668b3e1a56SAlex Elder 18678b3e1a56SAlex Elder img_request->callback = rbd_img_parent_read_callback; 18688b3e1a56SAlex Elder result = rbd_img_request_submit(img_request); 18698b3e1a56SAlex Elder if (result) 18708b3e1a56SAlex Elder goto out_err; 18718b3e1a56SAlex Elder 18728b3e1a56SAlex Elder return; 18738b3e1a56SAlex Elder out_err: 18748b3e1a56SAlex Elder if (img_request) 18758b3e1a56SAlex Elder rbd_img_request_put(img_request); 18768b3e1a56SAlex Elder obj_request->result = result; 18778b3e1a56SAlex Elder obj_request->xferred = 0; 18788b3e1a56SAlex Elder obj_request_done_set(obj_request); 18798b3e1a56SAlex Elder } 18808b3e1a56SAlex Elder 1881cf81b60eSAlex Elder static int rbd_obj_notify_ack(struct rbd_device *rbd_dev, 1882b8d70035SAlex Elder u64 ver, u64 notify_id) 1883b8d70035SAlex Elder { 1884b8d70035SAlex Elder struct rbd_obj_request *obj_request; 18852169238dSAlex Elder struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 1886b8d70035SAlex Elder int ret; 1887b8d70035SAlex Elder 1888b8d70035SAlex Elder obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0, 1889b8d70035SAlex Elder OBJ_REQUEST_NODATA); 1890b8d70035SAlex Elder if (!obj_request) 1891b8d70035SAlex Elder return -ENOMEM; 1892b8d70035SAlex Elder 1893b8d70035SAlex Elder ret = -ENOMEM; 1894430c28c3SAlex Elder obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request); 1895b8d70035SAlex Elder if (!obj_request->osd_req) 1896b8d70035SAlex Elder goto out; 18972169238dSAlex Elder obj_request->callback = rbd_obj_request_put; 
1898b8d70035SAlex Elder 1899c99d2d4aSAlex Elder osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_NOTIFY_ACK, 1900c99d2d4aSAlex Elder notify_id, ver, 0); 19012fa12320SAlex Elder rbd_osd_req_format(obj_request, false); 1902430c28c3SAlex Elder 1903b8d70035SAlex Elder ret = rbd_obj_request_submit(osdc, obj_request); 1904b8d70035SAlex Elder out: 1905cf81b60eSAlex Elder if (ret) 1906b8d70035SAlex Elder rbd_obj_request_put(obj_request); 1907b8d70035SAlex Elder 1908b8d70035SAlex Elder return ret; 1909b8d70035SAlex Elder } 1910b8d70035SAlex Elder 1911b8d70035SAlex Elder static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data) 1912b8d70035SAlex Elder { 1913b8d70035SAlex Elder struct rbd_device *rbd_dev = (struct rbd_device *)data; 1914b8d70035SAlex Elder u64 hver; 1915b8d70035SAlex Elder int rc; 1916b8d70035SAlex Elder 1917b8d70035SAlex Elder if (!rbd_dev) 1918b8d70035SAlex Elder return; 1919b8d70035SAlex Elder 192037206ee5SAlex Elder dout("%s: \"%s\" notify_id %llu opcode %u\n", __func__, 1921b8d70035SAlex Elder rbd_dev->header_name, (unsigned long long) notify_id, 1922b8d70035SAlex Elder (unsigned int) opcode); 1923b8d70035SAlex Elder rc = rbd_dev_refresh(rbd_dev, &hver); 1924b8d70035SAlex Elder if (rc) 1925b8d70035SAlex Elder rbd_warn(rbd_dev, "got notification but failed to " 1926b8d70035SAlex Elder " update snaps: %d\n", rc); 1927b8d70035SAlex Elder 1928cf81b60eSAlex Elder rbd_obj_notify_ack(rbd_dev, hver, notify_id); 1929b8d70035SAlex Elder } 1930b8d70035SAlex Elder 19319969ebc5SAlex Elder /* 19329969ebc5SAlex Elder * Request sync osd watch/unwatch. The value of "start" determines 19339969ebc5SAlex Elder * whether a watch request is being initiated or torn down. 
19349969ebc5SAlex Elder */ 19359969ebc5SAlex Elder static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev, int start) 19369969ebc5SAlex Elder { 19379969ebc5SAlex Elder struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 19389969ebc5SAlex Elder struct rbd_obj_request *obj_request; 19399969ebc5SAlex Elder int ret; 19409969ebc5SAlex Elder 19419969ebc5SAlex Elder rbd_assert(start ^ !!rbd_dev->watch_event); 19429969ebc5SAlex Elder rbd_assert(start ^ !!rbd_dev->watch_request); 19439969ebc5SAlex Elder 19449969ebc5SAlex Elder if (start) { 19453c663bbdSAlex Elder ret = ceph_osdc_create_event(osdc, rbd_watch_cb, rbd_dev, 19469969ebc5SAlex Elder &rbd_dev->watch_event); 19479969ebc5SAlex Elder if (ret < 0) 19489969ebc5SAlex Elder return ret; 19498eb87565SAlex Elder rbd_assert(rbd_dev->watch_event != NULL); 19509969ebc5SAlex Elder } 19519969ebc5SAlex Elder 19529969ebc5SAlex Elder ret = -ENOMEM; 19539969ebc5SAlex Elder obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0, 19549969ebc5SAlex Elder OBJ_REQUEST_NODATA); 19559969ebc5SAlex Elder if (!obj_request) 19569969ebc5SAlex Elder goto out_cancel; 19579969ebc5SAlex Elder 1958430c28c3SAlex Elder obj_request->osd_req = rbd_osd_req_create(rbd_dev, true, obj_request); 1959430c28c3SAlex Elder if (!obj_request->osd_req) 1960430c28c3SAlex Elder goto out_cancel; 1961430c28c3SAlex Elder 19628eb87565SAlex Elder if (start) 1963975241afSAlex Elder ceph_osdc_set_request_linger(osdc, obj_request->osd_req); 19648eb87565SAlex Elder else 19656977c3f9SAlex Elder ceph_osdc_unregister_linger_request(osdc, 1966975241afSAlex Elder rbd_dev->watch_request->osd_req); 19672169238dSAlex Elder 19682169238dSAlex Elder osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_WATCH, 19692169238dSAlex Elder rbd_dev->watch_event->cookie, 19702169238dSAlex Elder rbd_dev->header.obj_version, start); 19712169238dSAlex Elder rbd_osd_req_format(obj_request, true); 19722169238dSAlex Elder 19739969ebc5SAlex Elder ret = 
rbd_obj_request_submit(osdc, obj_request); 19749969ebc5SAlex Elder if (ret) 19759969ebc5SAlex Elder goto out_cancel; 19769969ebc5SAlex Elder ret = rbd_obj_request_wait(obj_request); 19779969ebc5SAlex Elder if (ret) 19789969ebc5SAlex Elder goto out_cancel; 19799969ebc5SAlex Elder ret = obj_request->result; 19809969ebc5SAlex Elder if (ret) 19819969ebc5SAlex Elder goto out_cancel; 19829969ebc5SAlex Elder 19838eb87565SAlex Elder /* 19848eb87565SAlex Elder * A watch request is set to linger, so the underlying osd 19858eb87565SAlex Elder * request won't go away until we unregister it. We retain 19868eb87565SAlex Elder * a pointer to the object request during that time (in 19878eb87565SAlex Elder * rbd_dev->watch_request), so we'll keep a reference to 19888eb87565SAlex Elder * it. We'll drop that reference (below) after we've 19898eb87565SAlex Elder * unregistered it. 19908eb87565SAlex Elder */ 19918eb87565SAlex Elder if (start) { 19928eb87565SAlex Elder rbd_dev->watch_request = obj_request; 19938eb87565SAlex Elder 19948eb87565SAlex Elder return 0; 19958eb87565SAlex Elder } 19968eb87565SAlex Elder 19978eb87565SAlex Elder /* We have successfully torn down the watch request */ 19988eb87565SAlex Elder 19998eb87565SAlex Elder rbd_obj_request_put(rbd_dev->watch_request); 20008eb87565SAlex Elder rbd_dev->watch_request = NULL; 20019969ebc5SAlex Elder out_cancel: 20029969ebc5SAlex Elder /* Cancel the event if we're tearing down, or on error */ 20039969ebc5SAlex Elder ceph_osdc_cancel_event(rbd_dev->watch_event); 20049969ebc5SAlex Elder rbd_dev->watch_event = NULL; 20059969ebc5SAlex Elder if (obj_request) 20069969ebc5SAlex Elder rbd_obj_request_put(obj_request); 20079969ebc5SAlex Elder 20089969ebc5SAlex Elder return ret; 20099969ebc5SAlex Elder } 20109969ebc5SAlex Elder 201136be9a76SAlex Elder /* 201236be9a76SAlex Elder * Synchronous osd object method call 201336be9a76SAlex Elder */ 201436be9a76SAlex Elder static int rbd_obj_method_sync(struct rbd_device *rbd_dev, 
201536be9a76SAlex Elder const char *object_name, 201636be9a76SAlex Elder const char *class_name, 201736be9a76SAlex Elder const char *method_name, 201836be9a76SAlex Elder const char *outbound, 201936be9a76SAlex Elder size_t outbound_size, 202036be9a76SAlex Elder char *inbound, 202136be9a76SAlex Elder size_t inbound_size, 202236be9a76SAlex Elder u64 *version) 202336be9a76SAlex Elder { 20242169238dSAlex Elder struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 202536be9a76SAlex Elder struct rbd_obj_request *obj_request; 202636be9a76SAlex Elder struct page **pages; 202736be9a76SAlex Elder u32 page_count; 202836be9a76SAlex Elder int ret; 202936be9a76SAlex Elder 203036be9a76SAlex Elder /* 20316010a451SAlex Elder * Method calls are ultimately read operations. The result 20326010a451SAlex Elder * should placed into the inbound buffer provided. They 20336010a451SAlex Elder * also supply outbound data--parameters for the object 20346010a451SAlex Elder * method. Currently if this is present it will be a 20356010a451SAlex Elder * snapshot id. 
203636be9a76SAlex Elder */ 203736be9a76SAlex Elder page_count = (u32) calc_pages_for(0, inbound_size); 203836be9a76SAlex Elder pages = ceph_alloc_page_vector(page_count, GFP_KERNEL); 203936be9a76SAlex Elder if (IS_ERR(pages)) 204036be9a76SAlex Elder return PTR_ERR(pages); 204136be9a76SAlex Elder 204236be9a76SAlex Elder ret = -ENOMEM; 20436010a451SAlex Elder obj_request = rbd_obj_request_create(object_name, 0, inbound_size, 204436be9a76SAlex Elder OBJ_REQUEST_PAGES); 204536be9a76SAlex Elder if (!obj_request) 204636be9a76SAlex Elder goto out; 204736be9a76SAlex Elder 204836be9a76SAlex Elder obj_request->pages = pages; 204936be9a76SAlex Elder obj_request->page_count = page_count; 205036be9a76SAlex Elder 2051430c28c3SAlex Elder obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request); 205236be9a76SAlex Elder if (!obj_request->osd_req) 205336be9a76SAlex Elder goto out; 205436be9a76SAlex Elder 2055c99d2d4aSAlex Elder osd_req_op_cls_init(obj_request->osd_req, 0, CEPH_OSD_OP_CALL, 205604017e29SAlex Elder class_name, method_name); 205704017e29SAlex Elder if (outbound_size) { 205804017e29SAlex Elder struct ceph_pagelist *pagelist; 205904017e29SAlex Elder 206004017e29SAlex Elder pagelist = kmalloc(sizeof (*pagelist), GFP_NOFS); 206104017e29SAlex Elder if (!pagelist) 206204017e29SAlex Elder goto out; 206304017e29SAlex Elder 206404017e29SAlex Elder ceph_pagelist_init(pagelist); 206504017e29SAlex Elder ceph_pagelist_append(pagelist, outbound, outbound_size); 206604017e29SAlex Elder osd_req_op_cls_request_data_pagelist(obj_request->osd_req, 0, 206704017e29SAlex Elder pagelist); 206804017e29SAlex Elder } 2069a4ce40a9SAlex Elder osd_req_op_cls_response_data_pages(obj_request->osd_req, 0, 2070a4ce40a9SAlex Elder obj_request->pages, inbound_size, 207144cd188dSAlex Elder 0, false, false); 20722fa12320SAlex Elder rbd_osd_req_format(obj_request, false); 2073430c28c3SAlex Elder 207436be9a76SAlex Elder ret = rbd_obj_request_submit(osdc, obj_request); 207536be9a76SAlex Elder 
if (ret) 207636be9a76SAlex Elder goto out; 207736be9a76SAlex Elder ret = rbd_obj_request_wait(obj_request); 207836be9a76SAlex Elder if (ret) 207936be9a76SAlex Elder goto out; 208036be9a76SAlex Elder 208136be9a76SAlex Elder ret = obj_request->result; 208236be9a76SAlex Elder if (ret < 0) 208336be9a76SAlex Elder goto out; 208423ed6e13SAlex Elder ret = 0; 2085903bb32eSAlex Elder ceph_copy_from_page_vector(pages, inbound, 0, obj_request->xferred); 208636be9a76SAlex Elder if (version) 208736be9a76SAlex Elder *version = obj_request->version; 208836be9a76SAlex Elder out: 208936be9a76SAlex Elder if (obj_request) 209036be9a76SAlex Elder rbd_obj_request_put(obj_request); 209136be9a76SAlex Elder else 209236be9a76SAlex Elder ceph_release_page_vector(pages, page_count); 209336be9a76SAlex Elder 209436be9a76SAlex Elder return ret; 209536be9a76SAlex Elder } 209636be9a76SAlex Elder 2097bf0d5f50SAlex Elder static void rbd_request_fn(struct request_queue *q) 2098cc344fa1SAlex Elder __releases(q->queue_lock) __acquires(q->queue_lock) 2099bf0d5f50SAlex Elder { 2100bf0d5f50SAlex Elder struct rbd_device *rbd_dev = q->queuedata; 2101bf0d5f50SAlex Elder bool read_only = rbd_dev->mapping.read_only; 2102bf0d5f50SAlex Elder struct request *rq; 2103bf0d5f50SAlex Elder int result; 2104bf0d5f50SAlex Elder 2105bf0d5f50SAlex Elder while ((rq = blk_fetch_request(q))) { 2106bf0d5f50SAlex Elder bool write_request = rq_data_dir(rq) == WRITE; 2107bf0d5f50SAlex Elder struct rbd_img_request *img_request; 2108bf0d5f50SAlex Elder u64 offset; 2109bf0d5f50SAlex Elder u64 length; 2110bf0d5f50SAlex Elder 2111bf0d5f50SAlex Elder /* Ignore any non-FS requests that filter through. 
*/ 2112bf0d5f50SAlex Elder 2113bf0d5f50SAlex Elder if (rq->cmd_type != REQ_TYPE_FS) { 21144dda41d3SAlex Elder dout("%s: non-fs request type %d\n", __func__, 21154dda41d3SAlex Elder (int) rq->cmd_type); 21164dda41d3SAlex Elder __blk_end_request_all(rq, 0); 21174dda41d3SAlex Elder continue; 21184dda41d3SAlex Elder } 21194dda41d3SAlex Elder 21204dda41d3SAlex Elder /* Ignore/skip any zero-length requests */ 21214dda41d3SAlex Elder 21224dda41d3SAlex Elder offset = (u64) blk_rq_pos(rq) << SECTOR_SHIFT; 21234dda41d3SAlex Elder length = (u64) blk_rq_bytes(rq); 21244dda41d3SAlex Elder 21254dda41d3SAlex Elder if (!length) { 21264dda41d3SAlex Elder dout("%s: zero-length request\n", __func__); 2127bf0d5f50SAlex Elder __blk_end_request_all(rq, 0); 2128bf0d5f50SAlex Elder continue; 2129bf0d5f50SAlex Elder } 2130bf0d5f50SAlex Elder 2131bf0d5f50SAlex Elder spin_unlock_irq(q->queue_lock); 2132bf0d5f50SAlex Elder 2133bf0d5f50SAlex Elder /* Disallow writes to a read-only device */ 2134bf0d5f50SAlex Elder 2135bf0d5f50SAlex Elder if (write_request) { 2136bf0d5f50SAlex Elder result = -EROFS; 2137bf0d5f50SAlex Elder if (read_only) 2138bf0d5f50SAlex Elder goto end_request; 2139bf0d5f50SAlex Elder rbd_assert(rbd_dev->spec->snap_id == CEPH_NOSNAP); 2140bf0d5f50SAlex Elder } 2141bf0d5f50SAlex Elder 21426d292906SAlex Elder /* 21436d292906SAlex Elder * Quit early if the mapped snapshot no longer 21446d292906SAlex Elder * exists. It's still possible the snapshot will 21456d292906SAlex Elder * have disappeared by the time our request arrives 21466d292906SAlex Elder * at the osd, but there's no sense in sending it if 21476d292906SAlex Elder * we already know. 
21486d292906SAlex Elder */ 21496d292906SAlex Elder if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) { 2150bf0d5f50SAlex Elder dout("request for non-existent snapshot"); 2151bf0d5f50SAlex Elder rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP); 2152bf0d5f50SAlex Elder result = -ENXIO; 2153bf0d5f50SAlex Elder goto end_request; 2154bf0d5f50SAlex Elder } 2155bf0d5f50SAlex Elder 2156bf0d5f50SAlex Elder result = -EINVAL; 2157bf0d5f50SAlex Elder if (WARN_ON(offset && length > U64_MAX - offset + 1)) 2158bf0d5f50SAlex Elder goto end_request; /* Shouldn't happen */ 2159bf0d5f50SAlex Elder 2160bf0d5f50SAlex Elder result = -ENOMEM; 2161bf0d5f50SAlex Elder img_request = rbd_img_request_create(rbd_dev, offset, length, 21629849e986SAlex Elder write_request, false); 2163bf0d5f50SAlex Elder if (!img_request) 2164bf0d5f50SAlex Elder goto end_request; 2165bf0d5f50SAlex Elder 2166bf0d5f50SAlex Elder img_request->rq = rq; 2167bf0d5f50SAlex Elder 2168bf0d5f50SAlex Elder result = rbd_img_request_fill_bio(img_request, rq->bio); 2169bf0d5f50SAlex Elder if (!result) 2170bf0d5f50SAlex Elder result = rbd_img_request_submit(img_request); 2171bf0d5f50SAlex Elder if (result) 2172bf0d5f50SAlex Elder rbd_img_request_put(img_request); 2173bf0d5f50SAlex Elder end_request: 2174bf0d5f50SAlex Elder spin_lock_irq(q->queue_lock); 2175bf0d5f50SAlex Elder if (result < 0) { 21767da22d29SAlex Elder rbd_warn(rbd_dev, "%s %llx at %llx result %d\n", 21777da22d29SAlex Elder write_request ? "write" : "read", 21787da22d29SAlex Elder length, offset, result); 21797da22d29SAlex Elder 2180bf0d5f50SAlex Elder __blk_end_request_all(rq, result); 2181bf0d5f50SAlex Elder } 2182bf0d5f50SAlex Elder } 2183bf0d5f50SAlex Elder } 2184bf0d5f50SAlex Elder 2185602adf40SYehuda Sadeh /* 2186602adf40SYehuda Sadeh * a queue callback. Makes sure that we don't create a bio that spans across 2187602adf40SYehuda Sadeh * multiple osd objects. 
One exception would be with a single page bios, 2188f7760dadSAlex Elder * which we handle later at bio_chain_clone_range() 2189602adf40SYehuda Sadeh */ 2190602adf40SYehuda Sadeh static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd, 2191602adf40SYehuda Sadeh struct bio_vec *bvec) 2192602adf40SYehuda Sadeh { 2193602adf40SYehuda Sadeh struct rbd_device *rbd_dev = q->queuedata; 2194e5cfeed2SAlex Elder sector_t sector_offset; 2195e5cfeed2SAlex Elder sector_t sectors_per_obj; 2196e5cfeed2SAlex Elder sector_t obj_sector_offset; 2197e5cfeed2SAlex Elder int ret; 2198602adf40SYehuda Sadeh 2199e5cfeed2SAlex Elder /* 2200e5cfeed2SAlex Elder * Find how far into its rbd object the partition-relative 2201e5cfeed2SAlex Elder * bio start sector is to offset relative to the enclosing 2202e5cfeed2SAlex Elder * device. 2203e5cfeed2SAlex Elder */ 2204e5cfeed2SAlex Elder sector_offset = get_start_sect(bmd->bi_bdev) + bmd->bi_sector; 2205e5cfeed2SAlex Elder sectors_per_obj = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT); 2206e5cfeed2SAlex Elder obj_sector_offset = sector_offset & (sectors_per_obj - 1); 2207593a9e7bSAlex Elder 2208e5cfeed2SAlex Elder /* 2209e5cfeed2SAlex Elder * Compute the number of bytes from that offset to the end 2210e5cfeed2SAlex Elder * of the object. Account for what's already used by the bio. 2211e5cfeed2SAlex Elder */ 2212e5cfeed2SAlex Elder ret = (int) (sectors_per_obj - obj_sector_offset) << SECTOR_SHIFT; 2213e5cfeed2SAlex Elder if (ret > bmd->bi_size) 2214e5cfeed2SAlex Elder ret -= bmd->bi_size; 2215e5cfeed2SAlex Elder else 2216e5cfeed2SAlex Elder ret = 0; 2217e5cfeed2SAlex Elder 2218e5cfeed2SAlex Elder /* 2219e5cfeed2SAlex Elder * Don't send back more than was asked for. And if the bio 2220e5cfeed2SAlex Elder * was empty, let the whole thing through because: "Note 2221e5cfeed2SAlex Elder * that a block device *must* allow a single page to be 2222e5cfeed2SAlex Elder * added to an empty bio." 
2223e5cfeed2SAlex Elder */ 2224e5cfeed2SAlex Elder rbd_assert(bvec->bv_len <= PAGE_SIZE); 2225e5cfeed2SAlex Elder if (ret > (int) bvec->bv_len || !bmd->bi_size) 2226e5cfeed2SAlex Elder ret = (int) bvec->bv_len; 2227e5cfeed2SAlex Elder 2228e5cfeed2SAlex Elder return ret; 2229602adf40SYehuda Sadeh } 2230602adf40SYehuda Sadeh 2231602adf40SYehuda Sadeh static void rbd_free_disk(struct rbd_device *rbd_dev) 2232602adf40SYehuda Sadeh { 2233602adf40SYehuda Sadeh struct gendisk *disk = rbd_dev->disk; 2234602adf40SYehuda Sadeh 2235602adf40SYehuda Sadeh if (!disk) 2236602adf40SYehuda Sadeh return; 2237602adf40SYehuda Sadeh 2238602adf40SYehuda Sadeh if (disk->flags & GENHD_FL_UP) 2239602adf40SYehuda Sadeh del_gendisk(disk); 2240602adf40SYehuda Sadeh if (disk->queue) 2241602adf40SYehuda Sadeh blk_cleanup_queue(disk->queue); 2242602adf40SYehuda Sadeh put_disk(disk); 2243602adf40SYehuda Sadeh } 2244602adf40SYehuda Sadeh 2245788e2df3SAlex Elder static int rbd_obj_read_sync(struct rbd_device *rbd_dev, 2246788e2df3SAlex Elder const char *object_name, 2247788e2df3SAlex Elder u64 offset, u64 length, 2248788e2df3SAlex Elder char *buf, u64 *version) 2249788e2df3SAlex Elder 2250788e2df3SAlex Elder { 22512169238dSAlex Elder struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 2252788e2df3SAlex Elder struct rbd_obj_request *obj_request; 2253788e2df3SAlex Elder struct page **pages = NULL; 2254788e2df3SAlex Elder u32 page_count; 22551ceae7efSAlex Elder size_t size; 2256788e2df3SAlex Elder int ret; 2257788e2df3SAlex Elder 2258788e2df3SAlex Elder page_count = (u32) calc_pages_for(offset, length); 2259788e2df3SAlex Elder pages = ceph_alloc_page_vector(page_count, GFP_KERNEL); 2260788e2df3SAlex Elder if (IS_ERR(pages)) 2261788e2df3SAlex Elder ret = PTR_ERR(pages); 2262788e2df3SAlex Elder 2263788e2df3SAlex Elder ret = -ENOMEM; 2264788e2df3SAlex Elder obj_request = rbd_obj_request_create(object_name, offset, length, 2265788e2df3SAlex Elder OBJ_REQUEST_PAGES); 2266788e2df3SAlex Elder 
if (!obj_request) 2267788e2df3SAlex Elder goto out; 2268788e2df3SAlex Elder 2269788e2df3SAlex Elder obj_request->pages = pages; 2270788e2df3SAlex Elder obj_request->page_count = page_count; 2271788e2df3SAlex Elder 2272430c28c3SAlex Elder obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request); 2273788e2df3SAlex Elder if (!obj_request->osd_req) 2274788e2df3SAlex Elder goto out; 2275788e2df3SAlex Elder 2276c99d2d4aSAlex Elder osd_req_op_extent_init(obj_request->osd_req, 0, CEPH_OSD_OP_READ, 2277c99d2d4aSAlex Elder offset, length, 0, 0); 2278406e2c9fSAlex Elder osd_req_op_extent_osd_data_pages(obj_request->osd_req, 0, 2279a4ce40a9SAlex Elder obj_request->pages, 228044cd188dSAlex Elder obj_request->length, 228144cd188dSAlex Elder obj_request->offset & ~PAGE_MASK, 228244cd188dSAlex Elder false, false); 22832fa12320SAlex Elder rbd_osd_req_format(obj_request, false); 2284430c28c3SAlex Elder 2285788e2df3SAlex Elder ret = rbd_obj_request_submit(osdc, obj_request); 2286788e2df3SAlex Elder if (ret) 2287788e2df3SAlex Elder goto out; 2288788e2df3SAlex Elder ret = rbd_obj_request_wait(obj_request); 2289788e2df3SAlex Elder if (ret) 2290788e2df3SAlex Elder goto out; 2291788e2df3SAlex Elder 2292788e2df3SAlex Elder ret = obj_request->result; 2293788e2df3SAlex Elder if (ret < 0) 2294788e2df3SAlex Elder goto out; 22951ceae7efSAlex Elder 22961ceae7efSAlex Elder rbd_assert(obj_request->xferred <= (u64) SIZE_MAX); 22971ceae7efSAlex Elder size = (size_t) obj_request->xferred; 2298903bb32eSAlex Elder ceph_copy_from_page_vector(pages, buf, 0, size); 229923ed6e13SAlex Elder rbd_assert(size <= (size_t) INT_MAX); 230023ed6e13SAlex Elder ret = (int) size; 2301788e2df3SAlex Elder if (version) 2302788e2df3SAlex Elder *version = obj_request->version; 2303788e2df3SAlex Elder out: 2304788e2df3SAlex Elder if (obj_request) 2305788e2df3SAlex Elder rbd_obj_request_put(obj_request); 2306788e2df3SAlex Elder else 2307788e2df3SAlex Elder ceph_release_page_vector(pages, page_count); 
2308788e2df3SAlex Elder 2309788e2df3SAlex Elder return ret; 2310788e2df3SAlex Elder } 2311788e2df3SAlex Elder 2312602adf40SYehuda Sadeh /* 23134156d998SAlex Elder * Read the complete header for the given rbd device. 23144156d998SAlex Elder * 23154156d998SAlex Elder * Returns a pointer to a dynamically-allocated buffer containing 23164156d998SAlex Elder * the complete and validated header. Caller can pass the address 23174156d998SAlex Elder * of a variable that will be filled in with the version of the 23184156d998SAlex Elder * header object at the time it was read. 23194156d998SAlex Elder * 23204156d998SAlex Elder * Returns a pointer-coded errno if a failure occurs. 23214156d998SAlex Elder */ 23224156d998SAlex Elder static struct rbd_image_header_ondisk * 23234156d998SAlex Elder rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version) 23244156d998SAlex Elder { 23254156d998SAlex Elder struct rbd_image_header_ondisk *ondisk = NULL; 23264156d998SAlex Elder u32 snap_count = 0; 23274156d998SAlex Elder u64 names_size = 0; 23284156d998SAlex Elder u32 want_count; 23294156d998SAlex Elder int ret; 23304156d998SAlex Elder 23314156d998SAlex Elder /* 23324156d998SAlex Elder * The complete header will include an array of its 64-bit 23334156d998SAlex Elder * snapshot ids, followed by the names of those snapshots as 23344156d998SAlex Elder * a contiguous block of NUL-terminated strings. Note that 23354156d998SAlex Elder * the number of snapshots could change by the time we read 23364156d998SAlex Elder * it in, in which case we re-read it. 
23374156d998SAlex Elder */ 23384156d998SAlex Elder do { 23394156d998SAlex Elder size_t size; 23404156d998SAlex Elder 23414156d998SAlex Elder kfree(ondisk); 23424156d998SAlex Elder 23434156d998SAlex Elder size = sizeof (*ondisk); 23444156d998SAlex Elder size += snap_count * sizeof (struct rbd_image_snap_ondisk); 23454156d998SAlex Elder size += names_size; 23464156d998SAlex Elder ondisk = kmalloc(size, GFP_KERNEL); 23474156d998SAlex Elder if (!ondisk) 23484156d998SAlex Elder return ERR_PTR(-ENOMEM); 23494156d998SAlex Elder 2350788e2df3SAlex Elder ret = rbd_obj_read_sync(rbd_dev, rbd_dev->header_name, 23514156d998SAlex Elder 0, size, 23524156d998SAlex Elder (char *) ondisk, version); 23534156d998SAlex Elder if (ret < 0) 23544156d998SAlex Elder goto out_err; 23554156d998SAlex Elder if (WARN_ON((size_t) ret < size)) { 23564156d998SAlex Elder ret = -ENXIO; 235706ecc6cbSAlex Elder rbd_warn(rbd_dev, "short header read (want %zd got %d)", 235806ecc6cbSAlex Elder size, ret); 23594156d998SAlex Elder goto out_err; 23604156d998SAlex Elder } 23614156d998SAlex Elder if (!rbd_dev_ondisk_valid(ondisk)) { 23624156d998SAlex Elder ret = -ENXIO; 236306ecc6cbSAlex Elder rbd_warn(rbd_dev, "invalid header"); 23644156d998SAlex Elder goto out_err; 23654156d998SAlex Elder } 23664156d998SAlex Elder 23674156d998SAlex Elder names_size = le64_to_cpu(ondisk->snap_names_len); 23684156d998SAlex Elder want_count = snap_count; 23694156d998SAlex Elder snap_count = le32_to_cpu(ondisk->snap_count); 23704156d998SAlex Elder } while (snap_count != want_count); 23714156d998SAlex Elder 23724156d998SAlex Elder return ondisk; 23734156d998SAlex Elder 23744156d998SAlex Elder out_err: 23754156d998SAlex Elder kfree(ondisk); 23764156d998SAlex Elder 23774156d998SAlex Elder return ERR_PTR(ret); 23784156d998SAlex Elder } 23794156d998SAlex Elder 23804156d998SAlex Elder /* 2381602adf40SYehuda Sadeh * reload the ondisk the header 2382602adf40SYehuda Sadeh */ 2383602adf40SYehuda Sadeh static int rbd_read_header(struct 
rbd_device *rbd_dev, 2384602adf40SYehuda Sadeh struct rbd_image_header *header) 2385602adf40SYehuda Sadeh { 23864156d998SAlex Elder struct rbd_image_header_ondisk *ondisk; 23874156d998SAlex Elder u64 ver = 0; 23884156d998SAlex Elder int ret; 2389602adf40SYehuda Sadeh 23904156d998SAlex Elder ondisk = rbd_dev_v1_header_read(rbd_dev, &ver); 23914156d998SAlex Elder if (IS_ERR(ondisk)) 23924156d998SAlex Elder return PTR_ERR(ondisk); 23934156d998SAlex Elder ret = rbd_header_from_disk(header, ondisk); 23944156d998SAlex Elder if (ret >= 0) 239559c2be1eSYehuda Sadeh header->obj_version = ver; 23964156d998SAlex Elder kfree(ondisk); 2397602adf40SYehuda Sadeh 23984156d998SAlex Elder return ret; 2399602adf40SYehuda Sadeh } 2400602adf40SYehuda Sadeh 240141f38c2bSAlex Elder static void rbd_remove_all_snaps(struct rbd_device *rbd_dev) 2402dfc5606dSYehuda Sadeh { 2403dfc5606dSYehuda Sadeh struct rbd_snap *snap; 2404a0593290SAlex Elder struct rbd_snap *next; 2405dfc5606dSYehuda Sadeh 2406a0593290SAlex Elder list_for_each_entry_safe(snap, next, &rbd_dev->snaps, node) 240741f38c2bSAlex Elder rbd_remove_snap_dev(snap); 2408dfc5606dSYehuda Sadeh } 2409dfc5606dSYehuda Sadeh 24109478554aSAlex Elder static void rbd_update_mapping_size(struct rbd_device *rbd_dev) 24119478554aSAlex Elder { 24129478554aSAlex Elder sector_t size; 24139478554aSAlex Elder 24140d7dbfceSAlex Elder if (rbd_dev->spec->snap_id != CEPH_NOSNAP) 24159478554aSAlex Elder return; 24169478554aSAlex Elder 24179478554aSAlex Elder size = (sector_t) rbd_dev->header.image_size / SECTOR_SIZE; 24189478554aSAlex Elder dout("setting size to %llu sectors", (unsigned long long) size); 24199478554aSAlex Elder rbd_dev->mapping.size = (u64) size; 24209478554aSAlex Elder set_capacity(rbd_dev->disk, size); 24219478554aSAlex Elder } 24229478554aSAlex Elder 2423602adf40SYehuda Sadeh /* 2424602adf40SYehuda Sadeh * only read the first part of the ondisk header, without the snaps info 2425602adf40SYehuda Sadeh */ 2426117973fbSAlex Elder static 
int rbd_dev_v1_refresh(struct rbd_device *rbd_dev, u64 *hver) 2427602adf40SYehuda Sadeh { 2428602adf40SYehuda Sadeh int ret; 2429602adf40SYehuda Sadeh struct rbd_image_header h; 2430602adf40SYehuda Sadeh 2431602adf40SYehuda Sadeh ret = rbd_read_header(rbd_dev, &h); 2432602adf40SYehuda Sadeh if (ret < 0) 2433602adf40SYehuda Sadeh return ret; 2434602adf40SYehuda Sadeh 2435a51aa0c0SJosh Durgin down_write(&rbd_dev->header_rwsem); 2436a51aa0c0SJosh Durgin 24379478554aSAlex Elder /* Update image size, and check for resize of mapped image */ 24389478554aSAlex Elder rbd_dev->header.image_size = h.image_size; 24399478554aSAlex Elder rbd_update_mapping_size(rbd_dev); 24409db4b3e3SSage Weil 2441849b4260SAlex Elder /* rbd_dev->header.object_prefix shouldn't change */ 2442602adf40SYehuda Sadeh kfree(rbd_dev->header.snap_sizes); 2443849b4260SAlex Elder kfree(rbd_dev->header.snap_names); 2444d1d25646SJosh Durgin /* osd requests may still refer to snapc */ 2445d1d25646SJosh Durgin ceph_put_snap_context(rbd_dev->header.snapc); 2446602adf40SYehuda Sadeh 2447b813623aSAlex Elder if (hver) 2448b813623aSAlex Elder *hver = h.obj_version; 2449a71b891bSJosh Durgin rbd_dev->header.obj_version = h.obj_version; 245093a24e08SJosh Durgin rbd_dev->header.image_size = h.image_size; 2451602adf40SYehuda Sadeh rbd_dev->header.snapc = h.snapc; 2452602adf40SYehuda Sadeh rbd_dev->header.snap_names = h.snap_names; 2453602adf40SYehuda Sadeh rbd_dev->header.snap_sizes = h.snap_sizes; 2454849b4260SAlex Elder /* Free the extra copy of the object prefix */ 2455849b4260SAlex Elder WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix)); 2456849b4260SAlex Elder kfree(h.object_prefix); 2457849b4260SAlex Elder 2458304f6808SAlex Elder ret = rbd_dev_snaps_update(rbd_dev); 2459304f6808SAlex Elder if (!ret) 2460304f6808SAlex Elder ret = rbd_dev_snaps_register(rbd_dev); 2461dfc5606dSYehuda Sadeh 2462c666601aSJosh Durgin up_write(&rbd_dev->header_rwsem); 2463602adf40SYehuda Sadeh 2464dfc5606dSYehuda Sadeh 
return ret; 2465602adf40SYehuda Sadeh } 2466602adf40SYehuda Sadeh 2467117973fbSAlex Elder static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver) 24681fe5e993SAlex Elder { 24691fe5e993SAlex Elder int ret; 24701fe5e993SAlex Elder 2471117973fbSAlex Elder rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); 24721fe5e993SAlex Elder mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); 2473117973fbSAlex Elder if (rbd_dev->image_format == 1) 2474117973fbSAlex Elder ret = rbd_dev_v1_refresh(rbd_dev, hver); 2475117973fbSAlex Elder else 2476117973fbSAlex Elder ret = rbd_dev_v2_refresh(rbd_dev, hver); 24771fe5e993SAlex Elder mutex_unlock(&ctl_mutex); 24781fe5e993SAlex Elder 24791fe5e993SAlex Elder return ret; 24801fe5e993SAlex Elder } 24811fe5e993SAlex Elder 2482602adf40SYehuda Sadeh static int rbd_init_disk(struct rbd_device *rbd_dev) 2483602adf40SYehuda Sadeh { 2484602adf40SYehuda Sadeh struct gendisk *disk; 2485602adf40SYehuda Sadeh struct request_queue *q; 2486593a9e7bSAlex Elder u64 segment_size; 2487602adf40SYehuda Sadeh 2488602adf40SYehuda Sadeh /* create gendisk info */ 2489602adf40SYehuda Sadeh disk = alloc_disk(RBD_MINORS_PER_MAJOR); 2490602adf40SYehuda Sadeh if (!disk) 24911fcdb8aaSAlex Elder return -ENOMEM; 2492602adf40SYehuda Sadeh 2493f0f8cef5SAlex Elder snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d", 2494de71a297SAlex Elder rbd_dev->dev_id); 2495602adf40SYehuda Sadeh disk->major = rbd_dev->major; 2496602adf40SYehuda Sadeh disk->first_minor = 0; 2497602adf40SYehuda Sadeh disk->fops = &rbd_bd_ops; 2498602adf40SYehuda Sadeh disk->private_data = rbd_dev; 2499602adf40SYehuda Sadeh 2500bf0d5f50SAlex Elder q = blk_init_queue(rbd_request_fn, &rbd_dev->lock); 2501602adf40SYehuda Sadeh if (!q) 2502602adf40SYehuda Sadeh goto out_disk; 2503029bcbd8SJosh Durgin 2504593a9e7bSAlex Elder /* We use the default size, but let's be explicit about it. 
*/ 2505593a9e7bSAlex Elder blk_queue_physical_block_size(q, SECTOR_SIZE); 2506593a9e7bSAlex Elder 2507029bcbd8SJosh Durgin /* set io sizes to object size */ 2508593a9e7bSAlex Elder segment_size = rbd_obj_bytes(&rbd_dev->header); 2509593a9e7bSAlex Elder blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE); 2510593a9e7bSAlex Elder blk_queue_max_segment_size(q, segment_size); 2511593a9e7bSAlex Elder blk_queue_io_min(q, segment_size); 2512593a9e7bSAlex Elder blk_queue_io_opt(q, segment_size); 2513029bcbd8SJosh Durgin 2514602adf40SYehuda Sadeh blk_queue_merge_bvec(q, rbd_merge_bvec); 2515602adf40SYehuda Sadeh disk->queue = q; 2516602adf40SYehuda Sadeh 2517602adf40SYehuda Sadeh q->queuedata = rbd_dev; 2518602adf40SYehuda Sadeh 2519602adf40SYehuda Sadeh rbd_dev->disk = disk; 2520602adf40SYehuda Sadeh 252112f02944SAlex Elder set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE); 252212f02944SAlex Elder 2523602adf40SYehuda Sadeh return 0; 2524602adf40SYehuda Sadeh out_disk: 2525602adf40SYehuda Sadeh put_disk(disk); 25261fcdb8aaSAlex Elder 25271fcdb8aaSAlex Elder return -ENOMEM; 2528602adf40SYehuda Sadeh } 2529602adf40SYehuda Sadeh 2530dfc5606dSYehuda Sadeh /* 2531dfc5606dSYehuda Sadeh sysfs 2532dfc5606dSYehuda Sadeh */ 2533602adf40SYehuda Sadeh 2534593a9e7bSAlex Elder static struct rbd_device *dev_to_rbd_dev(struct device *dev) 2535593a9e7bSAlex Elder { 2536593a9e7bSAlex Elder return container_of(dev, struct rbd_device, dev); 2537593a9e7bSAlex Elder } 2538593a9e7bSAlex Elder 2539dfc5606dSYehuda Sadeh static ssize_t rbd_size_show(struct device *dev, 2540dfc5606dSYehuda Sadeh struct device_attribute *attr, char *buf) 2541602adf40SYehuda Sadeh { 2542593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 2543a51aa0c0SJosh Durgin sector_t size; 2544dfc5606dSYehuda Sadeh 2545a51aa0c0SJosh Durgin down_read(&rbd_dev->header_rwsem); 2546a51aa0c0SJosh Durgin size = get_capacity(rbd_dev->disk); 2547a51aa0c0SJosh Durgin up_read(&rbd_dev->header_rwsem); 
2548a51aa0c0SJosh Durgin 2549a51aa0c0SJosh Durgin return sprintf(buf, "%llu\n", (unsigned long long) size * SECTOR_SIZE); 2550602adf40SYehuda Sadeh } 2551602adf40SYehuda Sadeh 255234b13184SAlex Elder /* 255334b13184SAlex Elder * Note this shows the features for whatever's mapped, which is not 255434b13184SAlex Elder * necessarily the base image. 255534b13184SAlex Elder */ 255634b13184SAlex Elder static ssize_t rbd_features_show(struct device *dev, 255734b13184SAlex Elder struct device_attribute *attr, char *buf) 255834b13184SAlex Elder { 255934b13184SAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 256034b13184SAlex Elder 256134b13184SAlex Elder return sprintf(buf, "0x%016llx\n", 256234b13184SAlex Elder (unsigned long long) rbd_dev->mapping.features); 256334b13184SAlex Elder } 256434b13184SAlex Elder 2565dfc5606dSYehuda Sadeh static ssize_t rbd_major_show(struct device *dev, 2566dfc5606dSYehuda Sadeh struct device_attribute *attr, char *buf) 2567602adf40SYehuda Sadeh { 2568593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 2569dfc5606dSYehuda Sadeh 2570dfc5606dSYehuda Sadeh return sprintf(buf, "%d\n", rbd_dev->major); 2571dfc5606dSYehuda Sadeh } 2572dfc5606dSYehuda Sadeh 2573dfc5606dSYehuda Sadeh static ssize_t rbd_client_id_show(struct device *dev, 2574dfc5606dSYehuda Sadeh struct device_attribute *attr, char *buf) 2575dfc5606dSYehuda Sadeh { 2576593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 2577dfc5606dSYehuda Sadeh 25781dbb4399SAlex Elder return sprintf(buf, "client%lld\n", 25791dbb4399SAlex Elder ceph_client_id(rbd_dev->rbd_client->client)); 2580dfc5606dSYehuda Sadeh } 2581dfc5606dSYehuda Sadeh 2582dfc5606dSYehuda Sadeh static ssize_t rbd_pool_show(struct device *dev, 2583dfc5606dSYehuda Sadeh struct device_attribute *attr, char *buf) 2584dfc5606dSYehuda Sadeh { 2585593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 2586dfc5606dSYehuda Sadeh 25870d7dbfceSAlex Elder return sprintf(buf, 
"%s\n", rbd_dev->spec->pool_name); 2588dfc5606dSYehuda Sadeh } 2589dfc5606dSYehuda Sadeh 25909bb2f334SAlex Elder static ssize_t rbd_pool_id_show(struct device *dev, 25919bb2f334SAlex Elder struct device_attribute *attr, char *buf) 25929bb2f334SAlex Elder { 25939bb2f334SAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 25949bb2f334SAlex Elder 25950d7dbfceSAlex Elder return sprintf(buf, "%llu\n", 25960d7dbfceSAlex Elder (unsigned long long) rbd_dev->spec->pool_id); 25979bb2f334SAlex Elder } 25989bb2f334SAlex Elder 2599dfc5606dSYehuda Sadeh static ssize_t rbd_name_show(struct device *dev, 2600dfc5606dSYehuda Sadeh struct device_attribute *attr, char *buf) 2601dfc5606dSYehuda Sadeh { 2602593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 2603dfc5606dSYehuda Sadeh 2604a92ffdf8SAlex Elder if (rbd_dev->spec->image_name) 26050d7dbfceSAlex Elder return sprintf(buf, "%s\n", rbd_dev->spec->image_name); 2606a92ffdf8SAlex Elder 2607a92ffdf8SAlex Elder return sprintf(buf, "(unknown)\n"); 2608dfc5606dSYehuda Sadeh } 2609dfc5606dSYehuda Sadeh 2610589d30e0SAlex Elder static ssize_t rbd_image_id_show(struct device *dev, 2611589d30e0SAlex Elder struct device_attribute *attr, char *buf) 2612589d30e0SAlex Elder { 2613589d30e0SAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 2614589d30e0SAlex Elder 26150d7dbfceSAlex Elder return sprintf(buf, "%s\n", rbd_dev->spec->image_id); 2616589d30e0SAlex Elder } 2617589d30e0SAlex Elder 261834b13184SAlex Elder /* 261934b13184SAlex Elder * Shows the name of the currently-mapped snapshot (or 262034b13184SAlex Elder * RBD_SNAP_HEAD_NAME for the base image). 
262134b13184SAlex Elder */ 2622dfc5606dSYehuda Sadeh static ssize_t rbd_snap_show(struct device *dev, 2623dfc5606dSYehuda Sadeh struct device_attribute *attr, 2624dfc5606dSYehuda Sadeh char *buf) 2625dfc5606dSYehuda Sadeh { 2626593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 2627dfc5606dSYehuda Sadeh 26280d7dbfceSAlex Elder return sprintf(buf, "%s\n", rbd_dev->spec->snap_name); 2629dfc5606dSYehuda Sadeh } 2630dfc5606dSYehuda Sadeh 263186b00e0dSAlex Elder /* 263286b00e0dSAlex Elder * For an rbd v2 image, shows the pool id, image id, and snapshot id 263386b00e0dSAlex Elder * for the parent image. If there is no parent, simply shows 263486b00e0dSAlex Elder * "(no parent image)". 263586b00e0dSAlex Elder */ 263686b00e0dSAlex Elder static ssize_t rbd_parent_show(struct device *dev, 263786b00e0dSAlex Elder struct device_attribute *attr, 263886b00e0dSAlex Elder char *buf) 263986b00e0dSAlex Elder { 264086b00e0dSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 264186b00e0dSAlex Elder struct rbd_spec *spec = rbd_dev->parent_spec; 264286b00e0dSAlex Elder int count; 264386b00e0dSAlex Elder char *bufp = buf; 264486b00e0dSAlex Elder 264586b00e0dSAlex Elder if (!spec) 264686b00e0dSAlex Elder return sprintf(buf, "(no parent image)\n"); 264786b00e0dSAlex Elder 264886b00e0dSAlex Elder count = sprintf(bufp, "pool_id %llu\npool_name %s\n", 264986b00e0dSAlex Elder (unsigned long long) spec->pool_id, spec->pool_name); 265086b00e0dSAlex Elder if (count < 0) 265186b00e0dSAlex Elder return count; 265286b00e0dSAlex Elder bufp += count; 265386b00e0dSAlex Elder 265486b00e0dSAlex Elder count = sprintf(bufp, "image_id %s\nimage_name %s\n", spec->image_id, 265586b00e0dSAlex Elder spec->image_name ? 
spec->image_name : "(unknown)"); 265686b00e0dSAlex Elder if (count < 0) 265786b00e0dSAlex Elder return count; 265886b00e0dSAlex Elder bufp += count; 265986b00e0dSAlex Elder 266086b00e0dSAlex Elder count = sprintf(bufp, "snap_id %llu\nsnap_name %s\n", 266186b00e0dSAlex Elder (unsigned long long) spec->snap_id, spec->snap_name); 266286b00e0dSAlex Elder if (count < 0) 266386b00e0dSAlex Elder return count; 266486b00e0dSAlex Elder bufp += count; 266586b00e0dSAlex Elder 266686b00e0dSAlex Elder count = sprintf(bufp, "overlap %llu\n", rbd_dev->parent_overlap); 266786b00e0dSAlex Elder if (count < 0) 266886b00e0dSAlex Elder return count; 266986b00e0dSAlex Elder bufp += count; 267086b00e0dSAlex Elder 267186b00e0dSAlex Elder return (ssize_t) (bufp - buf); 267286b00e0dSAlex Elder } 267386b00e0dSAlex Elder 2674dfc5606dSYehuda Sadeh static ssize_t rbd_image_refresh(struct device *dev, 2675dfc5606dSYehuda Sadeh struct device_attribute *attr, 2676dfc5606dSYehuda Sadeh const char *buf, 2677dfc5606dSYehuda Sadeh size_t size) 2678dfc5606dSYehuda Sadeh { 2679593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 2680b813623aSAlex Elder int ret; 2681602adf40SYehuda Sadeh 2682117973fbSAlex Elder ret = rbd_dev_refresh(rbd_dev, NULL); 2683b813623aSAlex Elder 2684b813623aSAlex Elder return ret < 0 ? 
ret : size; 2685dfc5606dSYehuda Sadeh } 2686602adf40SYehuda Sadeh 2687dfc5606dSYehuda Sadeh static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL); 268834b13184SAlex Elder static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL); 2689dfc5606dSYehuda Sadeh static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL); 2690dfc5606dSYehuda Sadeh static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL); 2691dfc5606dSYehuda Sadeh static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL); 26929bb2f334SAlex Elder static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL); 2693dfc5606dSYehuda Sadeh static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL); 2694589d30e0SAlex Elder static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL); 2695dfc5606dSYehuda Sadeh static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh); 2696dfc5606dSYehuda Sadeh static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL); 269786b00e0dSAlex Elder static DEVICE_ATTR(parent, S_IRUGO, rbd_parent_show, NULL); 2698dfc5606dSYehuda Sadeh 2699dfc5606dSYehuda Sadeh static struct attribute *rbd_attrs[] = { 2700dfc5606dSYehuda Sadeh &dev_attr_size.attr, 270134b13184SAlex Elder &dev_attr_features.attr, 2702dfc5606dSYehuda Sadeh &dev_attr_major.attr, 2703dfc5606dSYehuda Sadeh &dev_attr_client_id.attr, 2704dfc5606dSYehuda Sadeh &dev_attr_pool.attr, 27059bb2f334SAlex Elder &dev_attr_pool_id.attr, 2706dfc5606dSYehuda Sadeh &dev_attr_name.attr, 2707589d30e0SAlex Elder &dev_attr_image_id.attr, 2708dfc5606dSYehuda Sadeh &dev_attr_current_snap.attr, 270986b00e0dSAlex Elder &dev_attr_parent.attr, 2710dfc5606dSYehuda Sadeh &dev_attr_refresh.attr, 2711dfc5606dSYehuda Sadeh NULL 2712dfc5606dSYehuda Sadeh }; 2713dfc5606dSYehuda Sadeh 2714dfc5606dSYehuda Sadeh static struct attribute_group rbd_attr_group = { 2715dfc5606dSYehuda Sadeh .attrs = rbd_attrs, 2716dfc5606dSYehuda Sadeh }; 2717dfc5606dSYehuda Sadeh 2718dfc5606dSYehuda Sadeh static const struct attribute_group *rbd_attr_groups[] = { 
2719dfc5606dSYehuda Sadeh &rbd_attr_group, 2720dfc5606dSYehuda Sadeh NULL 2721dfc5606dSYehuda Sadeh }; 2722dfc5606dSYehuda Sadeh 2723dfc5606dSYehuda Sadeh static void rbd_sysfs_dev_release(struct device *dev) 2724dfc5606dSYehuda Sadeh { 2725dfc5606dSYehuda Sadeh } 2726dfc5606dSYehuda Sadeh 2727dfc5606dSYehuda Sadeh static struct device_type rbd_device_type = { 2728dfc5606dSYehuda Sadeh .name = "rbd", 2729dfc5606dSYehuda Sadeh .groups = rbd_attr_groups, 2730dfc5606dSYehuda Sadeh .release = rbd_sysfs_dev_release, 2731dfc5606dSYehuda Sadeh }; 2732dfc5606dSYehuda Sadeh 2733dfc5606dSYehuda Sadeh 2734dfc5606dSYehuda Sadeh /* 2735dfc5606dSYehuda Sadeh sysfs - snapshots 2736dfc5606dSYehuda Sadeh */ 2737dfc5606dSYehuda Sadeh 2738dfc5606dSYehuda Sadeh static ssize_t rbd_snap_size_show(struct device *dev, 2739dfc5606dSYehuda Sadeh struct device_attribute *attr, 2740dfc5606dSYehuda Sadeh char *buf) 2741dfc5606dSYehuda Sadeh { 2742dfc5606dSYehuda Sadeh struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev); 2743dfc5606dSYehuda Sadeh 27443591538fSJosh Durgin return sprintf(buf, "%llu\n", (unsigned long long)snap->size); 2745dfc5606dSYehuda Sadeh } 2746dfc5606dSYehuda Sadeh 2747dfc5606dSYehuda Sadeh static ssize_t rbd_snap_id_show(struct device *dev, 2748dfc5606dSYehuda Sadeh struct device_attribute *attr, 2749dfc5606dSYehuda Sadeh char *buf) 2750dfc5606dSYehuda Sadeh { 2751dfc5606dSYehuda Sadeh struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev); 2752dfc5606dSYehuda Sadeh 2753593a9e7bSAlex Elder return sprintf(buf, "%llu\n", (unsigned long long)snap->id); 2754dfc5606dSYehuda Sadeh } 2755dfc5606dSYehuda Sadeh 275634b13184SAlex Elder static ssize_t rbd_snap_features_show(struct device *dev, 275734b13184SAlex Elder struct device_attribute *attr, 275834b13184SAlex Elder char *buf) 275934b13184SAlex Elder { 276034b13184SAlex Elder struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev); 276134b13184SAlex Elder 276234b13184SAlex Elder return sprintf(buf, 
"0x%016llx\n", 276334b13184SAlex Elder (unsigned long long) snap->features); 276434b13184SAlex Elder } 276534b13184SAlex Elder 2766dfc5606dSYehuda Sadeh static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL); 2767dfc5606dSYehuda Sadeh static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL); 276834b13184SAlex Elder static DEVICE_ATTR(snap_features, S_IRUGO, rbd_snap_features_show, NULL); 2769dfc5606dSYehuda Sadeh 2770dfc5606dSYehuda Sadeh static struct attribute *rbd_snap_attrs[] = { 2771dfc5606dSYehuda Sadeh &dev_attr_snap_size.attr, 2772dfc5606dSYehuda Sadeh &dev_attr_snap_id.attr, 277334b13184SAlex Elder &dev_attr_snap_features.attr, 2774dfc5606dSYehuda Sadeh NULL, 2775dfc5606dSYehuda Sadeh }; 2776dfc5606dSYehuda Sadeh 2777dfc5606dSYehuda Sadeh static struct attribute_group rbd_snap_attr_group = { 2778dfc5606dSYehuda Sadeh .attrs = rbd_snap_attrs, 2779dfc5606dSYehuda Sadeh }; 2780dfc5606dSYehuda Sadeh 2781dfc5606dSYehuda Sadeh static void rbd_snap_dev_release(struct device *dev) 2782dfc5606dSYehuda Sadeh { 2783dfc5606dSYehuda Sadeh struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev); 2784dfc5606dSYehuda Sadeh kfree(snap->name); 2785dfc5606dSYehuda Sadeh kfree(snap); 2786dfc5606dSYehuda Sadeh } 2787dfc5606dSYehuda Sadeh 2788dfc5606dSYehuda Sadeh static const struct attribute_group *rbd_snap_attr_groups[] = { 2789dfc5606dSYehuda Sadeh &rbd_snap_attr_group, 2790dfc5606dSYehuda Sadeh NULL 2791dfc5606dSYehuda Sadeh }; 2792dfc5606dSYehuda Sadeh 2793dfc5606dSYehuda Sadeh static struct device_type rbd_snap_device_type = { 2794dfc5606dSYehuda Sadeh .groups = rbd_snap_attr_groups, 2795dfc5606dSYehuda Sadeh .release = rbd_snap_dev_release, 2796dfc5606dSYehuda Sadeh }; 2797dfc5606dSYehuda Sadeh 27988b8fb99cSAlex Elder static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec) 27998b8fb99cSAlex Elder { 28008b8fb99cSAlex Elder kref_get(&spec->kref); 28018b8fb99cSAlex Elder 28028b8fb99cSAlex Elder return spec; 28038b8fb99cSAlex Elder } 
28048b8fb99cSAlex Elder 28058b8fb99cSAlex Elder static void rbd_spec_free(struct kref *kref); 28068b8fb99cSAlex Elder static void rbd_spec_put(struct rbd_spec *spec) 28078b8fb99cSAlex Elder { 28088b8fb99cSAlex Elder if (spec) 28098b8fb99cSAlex Elder kref_put(&spec->kref, rbd_spec_free); 28108b8fb99cSAlex Elder } 28118b8fb99cSAlex Elder 28128b8fb99cSAlex Elder static struct rbd_spec *rbd_spec_alloc(void) 28138b8fb99cSAlex Elder { 28148b8fb99cSAlex Elder struct rbd_spec *spec; 28158b8fb99cSAlex Elder 28168b8fb99cSAlex Elder spec = kzalloc(sizeof (*spec), GFP_KERNEL); 28178b8fb99cSAlex Elder if (!spec) 28188b8fb99cSAlex Elder return NULL; 28198b8fb99cSAlex Elder kref_init(&spec->kref); 28208b8fb99cSAlex Elder 28218b8fb99cSAlex Elder return spec; 28228b8fb99cSAlex Elder } 28238b8fb99cSAlex Elder 28248b8fb99cSAlex Elder static void rbd_spec_free(struct kref *kref) 28258b8fb99cSAlex Elder { 28268b8fb99cSAlex Elder struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref); 28278b8fb99cSAlex Elder 28288b8fb99cSAlex Elder kfree(spec->pool_name); 28298b8fb99cSAlex Elder kfree(spec->image_id); 28308b8fb99cSAlex Elder kfree(spec->image_name); 28318b8fb99cSAlex Elder kfree(spec->snap_name); 28328b8fb99cSAlex Elder kfree(spec); 28338b8fb99cSAlex Elder } 28348b8fb99cSAlex Elder 2835cc344fa1SAlex Elder static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc, 2836c53d5893SAlex Elder struct rbd_spec *spec) 2837c53d5893SAlex Elder { 2838c53d5893SAlex Elder struct rbd_device *rbd_dev; 2839c53d5893SAlex Elder 2840c53d5893SAlex Elder rbd_dev = kzalloc(sizeof (*rbd_dev), GFP_KERNEL); 2841c53d5893SAlex Elder if (!rbd_dev) 2842c53d5893SAlex Elder return NULL; 2843c53d5893SAlex Elder 2844c53d5893SAlex Elder spin_lock_init(&rbd_dev->lock); 28456d292906SAlex Elder rbd_dev->flags = 0; 2846c53d5893SAlex Elder INIT_LIST_HEAD(&rbd_dev->node); 2847c53d5893SAlex Elder INIT_LIST_HEAD(&rbd_dev->snaps); 2848c53d5893SAlex Elder init_rwsem(&rbd_dev->header_rwsem); 2849c53d5893SAlex 
Elder 2850c53d5893SAlex Elder rbd_dev->spec = spec; 2851c53d5893SAlex Elder rbd_dev->rbd_client = rbdc; 2852c53d5893SAlex Elder 28530903e875SAlex Elder /* Initialize the layout used for all rbd requests */ 28540903e875SAlex Elder 28550903e875SAlex Elder rbd_dev->layout.fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER); 28560903e875SAlex Elder rbd_dev->layout.fl_stripe_count = cpu_to_le32(1); 28570903e875SAlex Elder rbd_dev->layout.fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER); 28580903e875SAlex Elder rbd_dev->layout.fl_pg_pool = cpu_to_le32((u32) spec->pool_id); 28590903e875SAlex Elder 2860c53d5893SAlex Elder return rbd_dev; 2861c53d5893SAlex Elder } 2862c53d5893SAlex Elder 2863c53d5893SAlex Elder static void rbd_dev_destroy(struct rbd_device *rbd_dev) 2864c53d5893SAlex Elder { 286586b00e0dSAlex Elder rbd_spec_put(rbd_dev->parent_spec); 2866c53d5893SAlex Elder kfree(rbd_dev->header_name); 2867c53d5893SAlex Elder rbd_put_client(rbd_dev->rbd_client); 2868c53d5893SAlex Elder rbd_spec_put(rbd_dev->spec); 2869c53d5893SAlex Elder kfree(rbd_dev); 2870c53d5893SAlex Elder } 2871c53d5893SAlex Elder 2872304f6808SAlex Elder static bool rbd_snap_registered(struct rbd_snap *snap) 2873304f6808SAlex Elder { 2874304f6808SAlex Elder bool ret = snap->dev.type == &rbd_snap_device_type; 2875304f6808SAlex Elder bool reg = device_is_registered(&snap->dev); 2876304f6808SAlex Elder 2877304f6808SAlex Elder rbd_assert(!ret ^ reg); 2878304f6808SAlex Elder 2879304f6808SAlex Elder return ret; 2880304f6808SAlex Elder } 2881304f6808SAlex Elder 288241f38c2bSAlex Elder static void rbd_remove_snap_dev(struct rbd_snap *snap) 2883dfc5606dSYehuda Sadeh { 2884dfc5606dSYehuda Sadeh list_del(&snap->node); 2885304f6808SAlex Elder if (device_is_registered(&snap->dev)) 2886dfc5606dSYehuda Sadeh device_unregister(&snap->dev); 2887dfc5606dSYehuda Sadeh } 2888dfc5606dSYehuda Sadeh 288914e7085dSAlex Elder static int rbd_register_snap_dev(struct rbd_snap *snap, 2890dfc5606dSYehuda Sadeh struct device 
*parent) 2891dfc5606dSYehuda Sadeh { 2892dfc5606dSYehuda Sadeh struct device *dev = &snap->dev; 2893dfc5606dSYehuda Sadeh int ret; 2894dfc5606dSYehuda Sadeh 2895dfc5606dSYehuda Sadeh dev->type = &rbd_snap_device_type; 2896dfc5606dSYehuda Sadeh dev->parent = parent; 2897dfc5606dSYehuda Sadeh dev->release = rbd_snap_dev_release; 2898d4b125e9SAlex Elder dev_set_name(dev, "%s%s", RBD_SNAP_DEV_NAME_PREFIX, snap->name); 2899304f6808SAlex Elder dout("%s: registering device for snapshot %s\n", __func__, snap->name); 2900304f6808SAlex Elder 2901dfc5606dSYehuda Sadeh ret = device_register(dev); 2902dfc5606dSYehuda Sadeh 2903dfc5606dSYehuda Sadeh return ret; 2904dfc5606dSYehuda Sadeh } 2905dfc5606dSYehuda Sadeh 29064e891e0aSAlex Elder static struct rbd_snap *__rbd_add_snap_dev(struct rbd_device *rbd_dev, 2907c8d18425SAlex Elder const char *snap_name, 290834b13184SAlex Elder u64 snap_id, u64 snap_size, 290934b13184SAlex Elder u64 snap_features) 2910dfc5606dSYehuda Sadeh { 29114e891e0aSAlex Elder struct rbd_snap *snap; 2912dfc5606dSYehuda Sadeh int ret; 29134e891e0aSAlex Elder 29144e891e0aSAlex Elder snap = kzalloc(sizeof (*snap), GFP_KERNEL); 2915dfc5606dSYehuda Sadeh if (!snap) 29164e891e0aSAlex Elder return ERR_PTR(-ENOMEM); 29174e891e0aSAlex Elder 29184e891e0aSAlex Elder ret = -ENOMEM; 2919c8d18425SAlex Elder snap->name = kstrdup(snap_name, GFP_KERNEL); 29204e891e0aSAlex Elder if (!snap->name) 29214e891e0aSAlex Elder goto err; 29224e891e0aSAlex Elder 2923c8d18425SAlex Elder snap->id = snap_id; 2924c8d18425SAlex Elder snap->size = snap_size; 292534b13184SAlex Elder snap->features = snap_features; 29264e891e0aSAlex Elder 29274e891e0aSAlex Elder return snap; 29284e891e0aSAlex Elder 2929dfc5606dSYehuda Sadeh err: 2930dfc5606dSYehuda Sadeh kfree(snap->name); 2931dfc5606dSYehuda Sadeh kfree(snap); 29324e891e0aSAlex Elder 29334e891e0aSAlex Elder return ERR_PTR(ret); 2934dfc5606dSYehuda Sadeh } 2935dfc5606dSYehuda Sadeh 2936cd892126SAlex Elder static char 
*rbd_dev_v1_snap_info(struct rbd_device *rbd_dev, u32 which, 2937cd892126SAlex Elder u64 *snap_size, u64 *snap_features) 2938cd892126SAlex Elder { 2939cd892126SAlex Elder char *snap_name; 2940cd892126SAlex Elder 2941cd892126SAlex Elder rbd_assert(which < rbd_dev->header.snapc->num_snaps); 2942cd892126SAlex Elder 2943cd892126SAlex Elder *snap_size = rbd_dev->header.snap_sizes[which]; 2944cd892126SAlex Elder *snap_features = 0; /* No features for v1 */ 2945cd892126SAlex Elder 2946cd892126SAlex Elder /* Skip over names until we find the one we are looking for */ 2947cd892126SAlex Elder 2948cd892126SAlex Elder snap_name = rbd_dev->header.snap_names; 2949cd892126SAlex Elder while (which--) 2950cd892126SAlex Elder snap_name += strlen(snap_name) + 1; 2951cd892126SAlex Elder 2952cd892126SAlex Elder return snap_name; 2953cd892126SAlex Elder } 2954cd892126SAlex Elder 2955dfc5606dSYehuda Sadeh /* 29569d475de5SAlex Elder * Get the size and object order for an image snapshot, or if 29579d475de5SAlex Elder * snap_id is CEPH_NOSNAP, gets this information for the base 29589d475de5SAlex Elder * image. 
29599d475de5SAlex Elder */ 29609d475de5SAlex Elder static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id, 29619d475de5SAlex Elder u8 *order, u64 *snap_size) 29629d475de5SAlex Elder { 29639d475de5SAlex Elder __le64 snapid = cpu_to_le64(snap_id); 29649d475de5SAlex Elder int ret; 29659d475de5SAlex Elder struct { 29669d475de5SAlex Elder u8 order; 29679d475de5SAlex Elder __le64 size; 29689d475de5SAlex Elder } __attribute__ ((packed)) size_buf = { 0 }; 29699d475de5SAlex Elder 297036be9a76SAlex Elder ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, 29719d475de5SAlex Elder "rbd", "get_size", 29729d475de5SAlex Elder (char *) &snapid, sizeof (snapid), 297307b2391fSAlex Elder (char *) &size_buf, sizeof (size_buf), NULL); 297436be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 29759d475de5SAlex Elder if (ret < 0) 29769d475de5SAlex Elder return ret; 29779d475de5SAlex Elder 29789d475de5SAlex Elder *order = size_buf.order; 29799d475de5SAlex Elder *snap_size = le64_to_cpu(size_buf.size); 29809d475de5SAlex Elder 29819d475de5SAlex Elder dout(" snap_id 0x%016llx order = %u, snap_size = %llu\n", 29829d475de5SAlex Elder (unsigned long long) snap_id, (unsigned int) *order, 29839d475de5SAlex Elder (unsigned long long) *snap_size); 29849d475de5SAlex Elder 29859d475de5SAlex Elder return 0; 29869d475de5SAlex Elder } 29879d475de5SAlex Elder 29889d475de5SAlex Elder static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev) 29899d475de5SAlex Elder { 29909d475de5SAlex Elder return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP, 29919d475de5SAlex Elder &rbd_dev->header.obj_order, 29929d475de5SAlex Elder &rbd_dev->header.image_size); 29939d475de5SAlex Elder } 29949d475de5SAlex Elder 29951e130199SAlex Elder static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev) 29961e130199SAlex Elder { 29971e130199SAlex Elder void *reply_buf; 29981e130199SAlex Elder int ret; 29991e130199SAlex Elder void *p; 30001e130199SAlex Elder 30011e130199SAlex 
Elder reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL); 30021e130199SAlex Elder if (!reply_buf) 30031e130199SAlex Elder return -ENOMEM; 30041e130199SAlex Elder 300536be9a76SAlex Elder ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, 30061e130199SAlex Elder "rbd", "get_object_prefix", 30071e130199SAlex Elder NULL, 0, 300807b2391fSAlex Elder reply_buf, RBD_OBJ_PREFIX_LEN_MAX, NULL); 300936be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 30101e130199SAlex Elder if (ret < 0) 30111e130199SAlex Elder goto out; 30121e130199SAlex Elder 30131e130199SAlex Elder p = reply_buf; 30141e130199SAlex Elder rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p, 30151e130199SAlex Elder p + RBD_OBJ_PREFIX_LEN_MAX, 30161e130199SAlex Elder NULL, GFP_NOIO); 30171e130199SAlex Elder 30181e130199SAlex Elder if (IS_ERR(rbd_dev->header.object_prefix)) { 30191e130199SAlex Elder ret = PTR_ERR(rbd_dev->header.object_prefix); 30201e130199SAlex Elder rbd_dev->header.object_prefix = NULL; 30211e130199SAlex Elder } else { 30221e130199SAlex Elder dout(" object_prefix = %s\n", rbd_dev->header.object_prefix); 30231e130199SAlex Elder } 30241e130199SAlex Elder 30251e130199SAlex Elder out: 30261e130199SAlex Elder kfree(reply_buf); 30271e130199SAlex Elder 30281e130199SAlex Elder return ret; 30291e130199SAlex Elder } 30301e130199SAlex Elder 3031b1b5402aSAlex Elder static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id, 3032b1b5402aSAlex Elder u64 *snap_features) 3033b1b5402aSAlex Elder { 3034b1b5402aSAlex Elder __le64 snapid = cpu_to_le64(snap_id); 3035b1b5402aSAlex Elder struct { 3036b1b5402aSAlex Elder __le64 features; 3037b1b5402aSAlex Elder __le64 incompat; 3038b1b5402aSAlex Elder } features_buf = { 0 }; 3039d889140cSAlex Elder u64 incompat; 3040b1b5402aSAlex Elder int ret; 3041b1b5402aSAlex Elder 304236be9a76SAlex Elder ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, 3043b1b5402aSAlex Elder "rbd", "get_features", 
3044b1b5402aSAlex Elder (char *) &snapid, sizeof (snapid), 3045b1b5402aSAlex Elder (char *) &features_buf, sizeof (features_buf), 304607b2391fSAlex Elder NULL); 304736be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 3048b1b5402aSAlex Elder if (ret < 0) 3049b1b5402aSAlex Elder return ret; 3050d889140cSAlex Elder 3051d889140cSAlex Elder incompat = le64_to_cpu(features_buf.incompat); 30525cbf6f12SAlex Elder if (incompat & ~RBD_FEATURES_SUPPORTED) 3053b8f5c6edSAlex Elder return -ENXIO; 3054d889140cSAlex Elder 3055b1b5402aSAlex Elder *snap_features = le64_to_cpu(features_buf.features); 3056b1b5402aSAlex Elder 3057b1b5402aSAlex Elder dout(" snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n", 3058b1b5402aSAlex Elder (unsigned long long) snap_id, 3059b1b5402aSAlex Elder (unsigned long long) *snap_features, 3060b1b5402aSAlex Elder (unsigned long long) le64_to_cpu(features_buf.incompat)); 3061b1b5402aSAlex Elder 3062b1b5402aSAlex Elder return 0; 3063b1b5402aSAlex Elder } 3064b1b5402aSAlex Elder 3065b1b5402aSAlex Elder static int rbd_dev_v2_features(struct rbd_device *rbd_dev) 3066b1b5402aSAlex Elder { 3067b1b5402aSAlex Elder return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP, 3068b1b5402aSAlex Elder &rbd_dev->header.features); 3069b1b5402aSAlex Elder } 3070b1b5402aSAlex Elder 307186b00e0dSAlex Elder static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev) 307286b00e0dSAlex Elder { 307386b00e0dSAlex Elder struct rbd_spec *parent_spec; 307486b00e0dSAlex Elder size_t size; 307586b00e0dSAlex Elder void *reply_buf = NULL; 307686b00e0dSAlex Elder __le64 snapid; 307786b00e0dSAlex Elder void *p; 307886b00e0dSAlex Elder void *end; 307986b00e0dSAlex Elder char *image_id; 308086b00e0dSAlex Elder u64 overlap; 308186b00e0dSAlex Elder int ret; 308286b00e0dSAlex Elder 308386b00e0dSAlex Elder parent_spec = rbd_spec_alloc(); 308486b00e0dSAlex Elder if (!parent_spec) 308586b00e0dSAlex Elder return -ENOMEM; 308686b00e0dSAlex Elder 
308786b00e0dSAlex Elder size = sizeof (__le64) + /* pool_id */ 308886b00e0dSAlex Elder sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX + /* image_id */ 308986b00e0dSAlex Elder sizeof (__le64) + /* snap_id */ 309086b00e0dSAlex Elder sizeof (__le64); /* overlap */ 309186b00e0dSAlex Elder reply_buf = kmalloc(size, GFP_KERNEL); 309286b00e0dSAlex Elder if (!reply_buf) { 309386b00e0dSAlex Elder ret = -ENOMEM; 309486b00e0dSAlex Elder goto out_err; 309586b00e0dSAlex Elder } 309686b00e0dSAlex Elder 309786b00e0dSAlex Elder snapid = cpu_to_le64(CEPH_NOSNAP); 309836be9a76SAlex Elder ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, 309986b00e0dSAlex Elder "rbd", "get_parent", 310086b00e0dSAlex Elder (char *) &snapid, sizeof (snapid), 310107b2391fSAlex Elder (char *) reply_buf, size, NULL); 310236be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 310386b00e0dSAlex Elder if (ret < 0) 310486b00e0dSAlex Elder goto out_err; 310586b00e0dSAlex Elder 310686b00e0dSAlex Elder ret = -ERANGE; 310786b00e0dSAlex Elder p = reply_buf; 310886b00e0dSAlex Elder end = (char *) reply_buf + size; 310986b00e0dSAlex Elder ceph_decode_64_safe(&p, end, parent_spec->pool_id, out_err); 311086b00e0dSAlex Elder if (parent_spec->pool_id == CEPH_NOPOOL) 311186b00e0dSAlex Elder goto out; /* No parent? No problem. 
*/ 311286b00e0dSAlex Elder 31130903e875SAlex Elder /* The ceph file layout needs to fit pool id in 32 bits */ 31140903e875SAlex Elder 31150903e875SAlex Elder ret = -EIO; 31160903e875SAlex Elder if (WARN_ON(parent_spec->pool_id > (u64) U32_MAX)) 31170903e875SAlex Elder goto out; 31180903e875SAlex Elder 3119979ed480SAlex Elder image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL); 312086b00e0dSAlex Elder if (IS_ERR(image_id)) { 312186b00e0dSAlex Elder ret = PTR_ERR(image_id); 312286b00e0dSAlex Elder goto out_err; 312386b00e0dSAlex Elder } 312486b00e0dSAlex Elder parent_spec->image_id = image_id; 312586b00e0dSAlex Elder ceph_decode_64_safe(&p, end, parent_spec->snap_id, out_err); 312686b00e0dSAlex Elder ceph_decode_64_safe(&p, end, overlap, out_err); 312786b00e0dSAlex Elder 312886b00e0dSAlex Elder rbd_dev->parent_overlap = overlap; 312986b00e0dSAlex Elder rbd_dev->parent_spec = parent_spec; 313086b00e0dSAlex Elder parent_spec = NULL; /* rbd_dev now owns this */ 313186b00e0dSAlex Elder out: 313286b00e0dSAlex Elder ret = 0; 313386b00e0dSAlex Elder out_err: 313486b00e0dSAlex Elder kfree(reply_buf); 313586b00e0dSAlex Elder rbd_spec_put(parent_spec); 313686b00e0dSAlex Elder 313786b00e0dSAlex Elder return ret; 313886b00e0dSAlex Elder } 313986b00e0dSAlex Elder 31409e15b77dSAlex Elder static char *rbd_dev_image_name(struct rbd_device *rbd_dev) 31419e15b77dSAlex Elder { 31429e15b77dSAlex Elder size_t image_id_size; 31439e15b77dSAlex Elder char *image_id; 31449e15b77dSAlex Elder void *p; 31459e15b77dSAlex Elder void *end; 31469e15b77dSAlex Elder size_t size; 31479e15b77dSAlex Elder void *reply_buf = NULL; 31489e15b77dSAlex Elder size_t len = 0; 31499e15b77dSAlex Elder char *image_name = NULL; 31509e15b77dSAlex Elder int ret; 31519e15b77dSAlex Elder 31529e15b77dSAlex Elder rbd_assert(!rbd_dev->spec->image_name); 31539e15b77dSAlex Elder 315469e7a02fSAlex Elder len = strlen(rbd_dev->spec->image_id); 315569e7a02fSAlex Elder image_id_size = sizeof (__le32) + len; 
31569e15b77dSAlex Elder image_id = kmalloc(image_id_size, GFP_KERNEL); 31579e15b77dSAlex Elder if (!image_id) 31589e15b77dSAlex Elder return NULL; 31599e15b77dSAlex Elder 31609e15b77dSAlex Elder p = image_id; 31619e15b77dSAlex Elder end = (char *) image_id + image_id_size; 316269e7a02fSAlex Elder ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32) len); 31639e15b77dSAlex Elder 31649e15b77dSAlex Elder size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX; 31659e15b77dSAlex Elder reply_buf = kmalloc(size, GFP_KERNEL); 31669e15b77dSAlex Elder if (!reply_buf) 31679e15b77dSAlex Elder goto out; 31689e15b77dSAlex Elder 316936be9a76SAlex Elder ret = rbd_obj_method_sync(rbd_dev, RBD_DIRECTORY, 31709e15b77dSAlex Elder "rbd", "dir_get_name", 31719e15b77dSAlex Elder image_id, image_id_size, 317207b2391fSAlex Elder (char *) reply_buf, size, NULL); 31739e15b77dSAlex Elder if (ret < 0) 31749e15b77dSAlex Elder goto out; 31759e15b77dSAlex Elder p = reply_buf; 31769e15b77dSAlex Elder end = (char *) reply_buf + size; 31779e15b77dSAlex Elder image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL); 31789e15b77dSAlex Elder if (IS_ERR(image_name)) 31799e15b77dSAlex Elder image_name = NULL; 31809e15b77dSAlex Elder else 31819e15b77dSAlex Elder dout("%s: name is %s len is %zd\n", __func__, image_name, len); 31829e15b77dSAlex Elder out: 31839e15b77dSAlex Elder kfree(reply_buf); 31849e15b77dSAlex Elder kfree(image_id); 31859e15b77dSAlex Elder 31869e15b77dSAlex Elder return image_name; 31879e15b77dSAlex Elder } 31889e15b77dSAlex Elder 31899e15b77dSAlex Elder /* 31909e15b77dSAlex Elder * When a parent image gets probed, we only have the pool, image, 31919e15b77dSAlex Elder * and snapshot ids but not the names of any of them. This call 31929e15b77dSAlex Elder * is made later to fill in those names. 
It has to be done after 31939e15b77dSAlex Elder * rbd_dev_snaps_update() has completed because some of the 31949e15b77dSAlex Elder * information (in particular, snapshot name) is not available 31959e15b77dSAlex Elder * until then. 31969e15b77dSAlex Elder */ 31979e15b77dSAlex Elder static int rbd_dev_probe_update_spec(struct rbd_device *rbd_dev) 31989e15b77dSAlex Elder { 31999e15b77dSAlex Elder struct ceph_osd_client *osdc; 32009e15b77dSAlex Elder const char *name; 32019e15b77dSAlex Elder void *reply_buf = NULL; 32029e15b77dSAlex Elder int ret; 32039e15b77dSAlex Elder 32049e15b77dSAlex Elder if (rbd_dev->spec->pool_name) 32059e15b77dSAlex Elder return 0; /* Already have the names */ 32069e15b77dSAlex Elder 32079e15b77dSAlex Elder /* Look up the pool name */ 32089e15b77dSAlex Elder 32099e15b77dSAlex Elder osdc = &rbd_dev->rbd_client->client->osdc; 32109e15b77dSAlex Elder name = ceph_pg_pool_name_by_id(osdc->osdmap, rbd_dev->spec->pool_id); 3211935dc89fSAlex Elder if (!name) { 3212935dc89fSAlex Elder rbd_warn(rbd_dev, "there is no pool with id %llu", 3213935dc89fSAlex Elder rbd_dev->spec->pool_id); /* Really a BUG() */ 3214935dc89fSAlex Elder return -EIO; 3215935dc89fSAlex Elder } 32169e15b77dSAlex Elder 32179e15b77dSAlex Elder rbd_dev->spec->pool_name = kstrdup(name, GFP_KERNEL); 32189e15b77dSAlex Elder if (!rbd_dev->spec->pool_name) 32199e15b77dSAlex Elder return -ENOMEM; 32209e15b77dSAlex Elder 32219e15b77dSAlex Elder /* Fetch the image name; tolerate failure here */ 32229e15b77dSAlex Elder 32239e15b77dSAlex Elder name = rbd_dev_image_name(rbd_dev); 322469e7a02fSAlex Elder if (name) 32259e15b77dSAlex Elder rbd_dev->spec->image_name = (char *) name; 322669e7a02fSAlex Elder else 322706ecc6cbSAlex Elder rbd_warn(rbd_dev, "unable to get image name"); 32289e15b77dSAlex Elder 32299e15b77dSAlex Elder /* Look up the snapshot name. 
*/ 32309e15b77dSAlex Elder 32319e15b77dSAlex Elder name = rbd_snap_name(rbd_dev, rbd_dev->spec->snap_id); 32329e15b77dSAlex Elder if (!name) { 3233935dc89fSAlex Elder rbd_warn(rbd_dev, "no snapshot with id %llu", 3234935dc89fSAlex Elder rbd_dev->spec->snap_id); /* Really a BUG() */ 32359e15b77dSAlex Elder ret = -EIO; 32369e15b77dSAlex Elder goto out_err; 32379e15b77dSAlex Elder } 32389e15b77dSAlex Elder rbd_dev->spec->snap_name = kstrdup(name, GFP_KERNEL); 32399e15b77dSAlex Elder if(!rbd_dev->spec->snap_name) 32409e15b77dSAlex Elder goto out_err; 32419e15b77dSAlex Elder 32429e15b77dSAlex Elder return 0; 32439e15b77dSAlex Elder out_err: 32449e15b77dSAlex Elder kfree(reply_buf); 32459e15b77dSAlex Elder kfree(rbd_dev->spec->pool_name); 32469e15b77dSAlex Elder rbd_dev->spec->pool_name = NULL; 32479e15b77dSAlex Elder 32489e15b77dSAlex Elder return ret; 32499e15b77dSAlex Elder } 32509e15b77dSAlex Elder 32516e14b1a6SAlex Elder static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev, u64 *ver) 325235d489f9SAlex Elder { 325335d489f9SAlex Elder size_t size; 325435d489f9SAlex Elder int ret; 325535d489f9SAlex Elder void *reply_buf; 325635d489f9SAlex Elder void *p; 325735d489f9SAlex Elder void *end; 325835d489f9SAlex Elder u64 seq; 325935d489f9SAlex Elder u32 snap_count; 326035d489f9SAlex Elder struct ceph_snap_context *snapc; 326135d489f9SAlex Elder u32 i; 326235d489f9SAlex Elder 326335d489f9SAlex Elder /* 326435d489f9SAlex Elder * We'll need room for the seq value (maximum snapshot id), 326535d489f9SAlex Elder * snapshot count, and array of that many snapshot ids. 326635d489f9SAlex Elder * For now we have a fixed upper limit on the number we're 326735d489f9SAlex Elder * prepared to receive. 
326835d489f9SAlex Elder */ 326935d489f9SAlex Elder size = sizeof (__le64) + sizeof (__le32) + 327035d489f9SAlex Elder RBD_MAX_SNAP_COUNT * sizeof (__le64); 327135d489f9SAlex Elder reply_buf = kzalloc(size, GFP_KERNEL); 327235d489f9SAlex Elder if (!reply_buf) 327335d489f9SAlex Elder return -ENOMEM; 327435d489f9SAlex Elder 327536be9a76SAlex Elder ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, 327635d489f9SAlex Elder "rbd", "get_snapcontext", 327735d489f9SAlex Elder NULL, 0, 327807b2391fSAlex Elder reply_buf, size, ver); 327936be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 328035d489f9SAlex Elder if (ret < 0) 328135d489f9SAlex Elder goto out; 328235d489f9SAlex Elder 328335d489f9SAlex Elder ret = -ERANGE; 328435d489f9SAlex Elder p = reply_buf; 328535d489f9SAlex Elder end = (char *) reply_buf + size; 328635d489f9SAlex Elder ceph_decode_64_safe(&p, end, seq, out); 328735d489f9SAlex Elder ceph_decode_32_safe(&p, end, snap_count, out); 328835d489f9SAlex Elder 328935d489f9SAlex Elder /* 329035d489f9SAlex Elder * Make sure the reported number of snapshot ids wouldn't go 329135d489f9SAlex Elder * beyond the end of our buffer. But before checking that, 329235d489f9SAlex Elder * make sure the computed size of the snapshot context we 329335d489f9SAlex Elder * allocate is representable in a size_t. 
329435d489f9SAlex Elder */ 329535d489f9SAlex Elder if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context)) 329635d489f9SAlex Elder / sizeof (u64)) { 329735d489f9SAlex Elder ret = -EINVAL; 329835d489f9SAlex Elder goto out; 329935d489f9SAlex Elder } 330035d489f9SAlex Elder if (!ceph_has_room(&p, end, snap_count * sizeof (__le64))) 330135d489f9SAlex Elder goto out; 330235d489f9SAlex Elder 330335d489f9SAlex Elder size = sizeof (struct ceph_snap_context) + 330435d489f9SAlex Elder snap_count * sizeof (snapc->snaps[0]); 330535d489f9SAlex Elder snapc = kmalloc(size, GFP_KERNEL); 330635d489f9SAlex Elder if (!snapc) { 330735d489f9SAlex Elder ret = -ENOMEM; 330835d489f9SAlex Elder goto out; 330935d489f9SAlex Elder } 331035d489f9SAlex Elder 331135d489f9SAlex Elder atomic_set(&snapc->nref, 1); 331235d489f9SAlex Elder snapc->seq = seq; 331335d489f9SAlex Elder snapc->num_snaps = snap_count; 331435d489f9SAlex Elder for (i = 0; i < snap_count; i++) 331535d489f9SAlex Elder snapc->snaps[i] = ceph_decode_64(&p); 331635d489f9SAlex Elder 331735d489f9SAlex Elder rbd_dev->header.snapc = snapc; 331835d489f9SAlex Elder 331935d489f9SAlex Elder dout(" snap context seq = %llu, snap_count = %u\n", 332035d489f9SAlex Elder (unsigned long long) seq, (unsigned int) snap_count); 332135d489f9SAlex Elder 332235d489f9SAlex Elder out: 332335d489f9SAlex Elder kfree(reply_buf); 332435d489f9SAlex Elder 332535d489f9SAlex Elder return 0; 332635d489f9SAlex Elder } 332735d489f9SAlex Elder 3328b8b1e2dbSAlex Elder static char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, u32 which) 3329b8b1e2dbSAlex Elder { 3330b8b1e2dbSAlex Elder size_t size; 3331b8b1e2dbSAlex Elder void *reply_buf; 3332b8b1e2dbSAlex Elder __le64 snap_id; 3333b8b1e2dbSAlex Elder int ret; 3334b8b1e2dbSAlex Elder void *p; 3335b8b1e2dbSAlex Elder void *end; 3336b8b1e2dbSAlex Elder char *snap_name; 3337b8b1e2dbSAlex Elder 3338b8b1e2dbSAlex Elder size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN; 3339b8b1e2dbSAlex Elder reply_buf = 
kmalloc(size, GFP_KERNEL); 3340b8b1e2dbSAlex Elder if (!reply_buf) 3341b8b1e2dbSAlex Elder return ERR_PTR(-ENOMEM); 3342b8b1e2dbSAlex Elder 3343b8b1e2dbSAlex Elder snap_id = cpu_to_le64(rbd_dev->header.snapc->snaps[which]); 334436be9a76SAlex Elder ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, 3345b8b1e2dbSAlex Elder "rbd", "get_snapshot_name", 3346b8b1e2dbSAlex Elder (char *) &snap_id, sizeof (snap_id), 334707b2391fSAlex Elder reply_buf, size, NULL); 334836be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 3349b8b1e2dbSAlex Elder if (ret < 0) 3350b8b1e2dbSAlex Elder goto out; 3351b8b1e2dbSAlex Elder 3352b8b1e2dbSAlex Elder p = reply_buf; 3353b8b1e2dbSAlex Elder end = (char *) reply_buf + size; 3354e5c35534SAlex Elder snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL); 3355b8b1e2dbSAlex Elder if (IS_ERR(snap_name)) { 3356b8b1e2dbSAlex Elder ret = PTR_ERR(snap_name); 3357b8b1e2dbSAlex Elder goto out; 3358b8b1e2dbSAlex Elder } else { 3359b8b1e2dbSAlex Elder dout(" snap_id 0x%016llx snap_name = %s\n", 3360b8b1e2dbSAlex Elder (unsigned long long) le64_to_cpu(snap_id), snap_name); 3361b8b1e2dbSAlex Elder } 3362b8b1e2dbSAlex Elder kfree(reply_buf); 3363b8b1e2dbSAlex Elder 3364b8b1e2dbSAlex Elder return snap_name; 3365b8b1e2dbSAlex Elder out: 3366b8b1e2dbSAlex Elder kfree(reply_buf); 3367b8b1e2dbSAlex Elder 3368b8b1e2dbSAlex Elder return ERR_PTR(ret); 3369b8b1e2dbSAlex Elder } 3370b8b1e2dbSAlex Elder 3371b8b1e2dbSAlex Elder static char *rbd_dev_v2_snap_info(struct rbd_device *rbd_dev, u32 which, 3372b8b1e2dbSAlex Elder u64 *snap_size, u64 *snap_features) 3373b8b1e2dbSAlex Elder { 3374e0b49868SAlex Elder u64 snap_id; 3375b8b1e2dbSAlex Elder u8 order; 3376b8b1e2dbSAlex Elder int ret; 3377b8b1e2dbSAlex Elder 3378b8b1e2dbSAlex Elder snap_id = rbd_dev->header.snapc->snaps[which]; 3379b8b1e2dbSAlex Elder ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, &order, snap_size); 3380b8b1e2dbSAlex Elder if (ret) 3381b8b1e2dbSAlex 
Elder return ERR_PTR(ret); 3382b8b1e2dbSAlex Elder ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, snap_features); 3383b8b1e2dbSAlex Elder if (ret) 3384b8b1e2dbSAlex Elder return ERR_PTR(ret); 3385b8b1e2dbSAlex Elder 3386b8b1e2dbSAlex Elder return rbd_dev_v2_snap_name(rbd_dev, which); 3387b8b1e2dbSAlex Elder } 3388b8b1e2dbSAlex Elder 3389b8b1e2dbSAlex Elder static char *rbd_dev_snap_info(struct rbd_device *rbd_dev, u32 which, 3390b8b1e2dbSAlex Elder u64 *snap_size, u64 *snap_features) 3391b8b1e2dbSAlex Elder { 3392b8b1e2dbSAlex Elder if (rbd_dev->image_format == 1) 3393b8b1e2dbSAlex Elder return rbd_dev_v1_snap_info(rbd_dev, which, 3394b8b1e2dbSAlex Elder snap_size, snap_features); 3395b8b1e2dbSAlex Elder if (rbd_dev->image_format == 2) 3396b8b1e2dbSAlex Elder return rbd_dev_v2_snap_info(rbd_dev, which, 3397b8b1e2dbSAlex Elder snap_size, snap_features); 3398b8b1e2dbSAlex Elder return ERR_PTR(-EINVAL); 3399b8b1e2dbSAlex Elder } 3400b8b1e2dbSAlex Elder 3401117973fbSAlex Elder static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver) 3402117973fbSAlex Elder { 3403117973fbSAlex Elder int ret; 3404117973fbSAlex Elder __u8 obj_order; 3405117973fbSAlex Elder 3406117973fbSAlex Elder down_write(&rbd_dev->header_rwsem); 3407117973fbSAlex Elder 3408117973fbSAlex Elder /* Grab old order first, to see if it changes */ 3409117973fbSAlex Elder 3410117973fbSAlex Elder obj_order = rbd_dev->header.obj_order, 3411117973fbSAlex Elder ret = rbd_dev_v2_image_size(rbd_dev); 3412117973fbSAlex Elder if (ret) 3413117973fbSAlex Elder goto out; 3414117973fbSAlex Elder if (rbd_dev->header.obj_order != obj_order) { 3415117973fbSAlex Elder ret = -EIO; 3416117973fbSAlex Elder goto out; 3417117973fbSAlex Elder } 3418117973fbSAlex Elder rbd_update_mapping_size(rbd_dev); 3419117973fbSAlex Elder 3420117973fbSAlex Elder ret = rbd_dev_v2_snap_context(rbd_dev, hver); 3421117973fbSAlex Elder dout("rbd_dev_v2_snap_context returned %d\n", ret); 3422117973fbSAlex Elder if (ret) 
3423117973fbSAlex Elder goto out; 3424117973fbSAlex Elder ret = rbd_dev_snaps_update(rbd_dev); 3425117973fbSAlex Elder dout("rbd_dev_snaps_update returned %d\n", ret); 3426117973fbSAlex Elder if (ret) 3427117973fbSAlex Elder goto out; 3428117973fbSAlex Elder ret = rbd_dev_snaps_register(rbd_dev); 3429117973fbSAlex Elder dout("rbd_dev_snaps_register returned %d\n", ret); 3430117973fbSAlex Elder out: 3431117973fbSAlex Elder up_write(&rbd_dev->header_rwsem); 3432117973fbSAlex Elder 3433117973fbSAlex Elder return ret; 3434117973fbSAlex Elder } 3435117973fbSAlex Elder 34369d475de5SAlex Elder /* 343735938150SAlex Elder * Scan the rbd device's current snapshot list and compare it to the 343835938150SAlex Elder * newly-received snapshot context. Remove any existing snapshots 343935938150SAlex Elder * not present in the new snapshot context. Add a new snapshot for 344035938150SAlex Elder * any snaphots in the snapshot context not in the current list. 344135938150SAlex Elder * And verify there are no changes to snapshots we already know 344235938150SAlex Elder * about. 344335938150SAlex Elder * 344435938150SAlex Elder * Assumes the snapshots in the snapshot context are sorted by 344535938150SAlex Elder * snapshot id, highest id first. (Snapshots in the rbd_dev's list 344635938150SAlex Elder * are also maintained in that order.) 
3447dfc5606dSYehuda Sadeh */ 3448304f6808SAlex Elder static int rbd_dev_snaps_update(struct rbd_device *rbd_dev) 3449dfc5606dSYehuda Sadeh { 345035938150SAlex Elder struct ceph_snap_context *snapc = rbd_dev->header.snapc; 345135938150SAlex Elder const u32 snap_count = snapc->num_snaps; 345235938150SAlex Elder struct list_head *head = &rbd_dev->snaps; 345335938150SAlex Elder struct list_head *links = head->next; 345435938150SAlex Elder u32 index = 0; 3455dfc5606dSYehuda Sadeh 34569fcbb800SAlex Elder dout("%s: snap count is %u\n", __func__, (unsigned int) snap_count); 345735938150SAlex Elder while (index < snap_count || links != head) { 345835938150SAlex Elder u64 snap_id; 345935938150SAlex Elder struct rbd_snap *snap; 3460cd892126SAlex Elder char *snap_name; 3461cd892126SAlex Elder u64 snap_size = 0; 3462cd892126SAlex Elder u64 snap_features = 0; 3463dfc5606dSYehuda Sadeh 346435938150SAlex Elder snap_id = index < snap_count ? snapc->snaps[index] 346535938150SAlex Elder : CEPH_NOSNAP; 346635938150SAlex Elder snap = links != head ? list_entry(links, struct rbd_snap, node) 346735938150SAlex Elder : NULL; 3468aafb230eSAlex Elder rbd_assert(!snap || snap->id != CEPH_NOSNAP); 3469dfc5606dSYehuda Sadeh 347035938150SAlex Elder if (snap_id == CEPH_NOSNAP || (snap && snap->id > snap_id)) { 347135938150SAlex Elder struct list_head *next = links->next; 3472dfc5606dSYehuda Sadeh 34736d292906SAlex Elder /* 34746d292906SAlex Elder * A previously-existing snapshot is not in 34756d292906SAlex Elder * the new snap context. 34766d292906SAlex Elder * 34776d292906SAlex Elder * If the now missing snapshot is the one the 34786d292906SAlex Elder * image is mapped to, clear its exists flag 34796d292906SAlex Elder * so we can avoid sending any more requests 34806d292906SAlex Elder * to it. 
34816d292906SAlex Elder */ 34820d7dbfceSAlex Elder if (rbd_dev->spec->snap_id == snap->id) 34836d292906SAlex Elder clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags); 348441f38c2bSAlex Elder rbd_remove_snap_dev(snap); 34859fcbb800SAlex Elder dout("%ssnap id %llu has been removed\n", 34860d7dbfceSAlex Elder rbd_dev->spec->snap_id == snap->id ? 34870d7dbfceSAlex Elder "mapped " : "", 34889fcbb800SAlex Elder (unsigned long long) snap->id); 3489dfc5606dSYehuda Sadeh 349035938150SAlex Elder /* Done with this list entry; advance */ 349135938150SAlex Elder 349235938150SAlex Elder links = next; 349335938150SAlex Elder continue; 3494dfc5606dSYehuda Sadeh } 349535938150SAlex Elder 3496b8b1e2dbSAlex Elder snap_name = rbd_dev_snap_info(rbd_dev, index, 3497cd892126SAlex Elder &snap_size, &snap_features); 3498cd892126SAlex Elder if (IS_ERR(snap_name)) 3499cd892126SAlex Elder return PTR_ERR(snap_name); 3500cd892126SAlex Elder 35019fcbb800SAlex Elder dout("entry %u: snap_id = %llu\n", (unsigned int) snap_count, 35029fcbb800SAlex Elder (unsigned long long) snap_id); 350335938150SAlex Elder if (!snap || (snap_id != CEPH_NOSNAP && snap->id < snap_id)) { 350435938150SAlex Elder struct rbd_snap *new_snap; 350535938150SAlex Elder 350635938150SAlex Elder /* We haven't seen this snapshot before */ 350735938150SAlex Elder 3508c8d18425SAlex Elder new_snap = __rbd_add_snap_dev(rbd_dev, snap_name, 3509cd892126SAlex Elder snap_id, snap_size, snap_features); 35109fcbb800SAlex Elder if (IS_ERR(new_snap)) { 35119fcbb800SAlex Elder int err = PTR_ERR(new_snap); 35129fcbb800SAlex Elder 35139fcbb800SAlex Elder dout(" failed to add dev, error %d\n", err); 35149fcbb800SAlex Elder 35159fcbb800SAlex Elder return err; 35169fcbb800SAlex Elder } 351735938150SAlex Elder 351835938150SAlex Elder /* New goes before existing, or at end of list */ 351935938150SAlex Elder 35209fcbb800SAlex Elder dout(" added dev%s\n", snap ? 
"" : " at end\n"); 352135938150SAlex Elder if (snap) 352235938150SAlex Elder list_add_tail(&new_snap->node, &snap->node); 352335938150SAlex Elder else 3524523f3258SAlex Elder list_add_tail(&new_snap->node, head); 352535938150SAlex Elder } else { 352635938150SAlex Elder /* Already have this one */ 352735938150SAlex Elder 35289fcbb800SAlex Elder dout(" already present\n"); 35299fcbb800SAlex Elder 3530cd892126SAlex Elder rbd_assert(snap->size == snap_size); 3531aafb230eSAlex Elder rbd_assert(!strcmp(snap->name, snap_name)); 3532cd892126SAlex Elder rbd_assert(snap->features == snap_features); 353335938150SAlex Elder 353435938150SAlex Elder /* Done with this list entry; advance */ 353535938150SAlex Elder 353635938150SAlex Elder links = links->next; 3537dfc5606dSYehuda Sadeh } 353835938150SAlex Elder 353935938150SAlex Elder /* Advance to the next entry in the snapshot context */ 354035938150SAlex Elder 354135938150SAlex Elder index++; 3542dfc5606dSYehuda Sadeh } 35439fcbb800SAlex Elder dout("%s: done\n", __func__); 3544dfc5606dSYehuda Sadeh 3545dfc5606dSYehuda Sadeh return 0; 3546dfc5606dSYehuda Sadeh } 3547dfc5606dSYehuda Sadeh 3548304f6808SAlex Elder /* 3549304f6808SAlex Elder * Scan the list of snapshots and register the devices for any that 3550304f6808SAlex Elder * have not already been registered. 
3551304f6808SAlex Elder */ 3552304f6808SAlex Elder static int rbd_dev_snaps_register(struct rbd_device *rbd_dev) 3553304f6808SAlex Elder { 3554304f6808SAlex Elder struct rbd_snap *snap; 3555304f6808SAlex Elder int ret = 0; 3556304f6808SAlex Elder 355737206ee5SAlex Elder dout("%s:\n", __func__); 355886ff77bbSAlex Elder if (WARN_ON(!device_is_registered(&rbd_dev->dev))) 355986ff77bbSAlex Elder return -EIO; 3560304f6808SAlex Elder 3561304f6808SAlex Elder list_for_each_entry(snap, &rbd_dev->snaps, node) { 3562304f6808SAlex Elder if (!rbd_snap_registered(snap)) { 3563304f6808SAlex Elder ret = rbd_register_snap_dev(snap, &rbd_dev->dev); 3564304f6808SAlex Elder if (ret < 0) 3565304f6808SAlex Elder break; 3566304f6808SAlex Elder } 3567304f6808SAlex Elder } 3568304f6808SAlex Elder dout("%s: returning %d\n", __func__, ret); 3569304f6808SAlex Elder 3570304f6808SAlex Elder return ret; 3571304f6808SAlex Elder } 3572304f6808SAlex Elder 3573dfc5606dSYehuda Sadeh static int rbd_bus_add_dev(struct rbd_device *rbd_dev) 3574dfc5606dSYehuda Sadeh { 3575dfc5606dSYehuda Sadeh struct device *dev; 3576cd789ab9SAlex Elder int ret; 3577dfc5606dSYehuda Sadeh 3578dfc5606dSYehuda Sadeh mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); 3579dfc5606dSYehuda Sadeh 3580cd789ab9SAlex Elder dev = &rbd_dev->dev; 3581dfc5606dSYehuda Sadeh dev->bus = &rbd_bus_type; 3582dfc5606dSYehuda Sadeh dev->type = &rbd_device_type; 3583dfc5606dSYehuda Sadeh dev->parent = &rbd_root_dev; 3584dfc5606dSYehuda Sadeh dev->release = rbd_dev_release; 3585de71a297SAlex Elder dev_set_name(dev, "%d", rbd_dev->dev_id); 3586dfc5606dSYehuda Sadeh ret = device_register(dev); 3587dfc5606dSYehuda Sadeh 3588dfc5606dSYehuda Sadeh mutex_unlock(&ctl_mutex); 3589cd789ab9SAlex Elder 3590dfc5606dSYehuda Sadeh return ret; 3591602adf40SYehuda Sadeh } 3592602adf40SYehuda Sadeh 3593dfc5606dSYehuda Sadeh static void rbd_bus_del_dev(struct rbd_device *rbd_dev) 3594dfc5606dSYehuda Sadeh { 3595dfc5606dSYehuda Sadeh 
device_unregister(&rbd_dev->dev); 3596dfc5606dSYehuda Sadeh } 3597dfc5606dSYehuda Sadeh 3598e2839308SAlex Elder static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0); 35991ddbe94eSAlex Elder 36001ddbe94eSAlex Elder /* 3601499afd5bSAlex Elder * Get a unique rbd identifier for the given new rbd_dev, and add 3602499afd5bSAlex Elder * the rbd_dev to the global list. The minimum rbd id is 1. 36031ddbe94eSAlex Elder */ 3604e2839308SAlex Elder static void rbd_dev_id_get(struct rbd_device *rbd_dev) 3605b7f23c36SAlex Elder { 3606e2839308SAlex Elder rbd_dev->dev_id = atomic64_inc_return(&rbd_dev_id_max); 3607499afd5bSAlex Elder 3608499afd5bSAlex Elder spin_lock(&rbd_dev_list_lock); 3609499afd5bSAlex Elder list_add_tail(&rbd_dev->node, &rbd_dev_list); 3610499afd5bSAlex Elder spin_unlock(&rbd_dev_list_lock); 3611e2839308SAlex Elder dout("rbd_dev %p given dev id %llu\n", rbd_dev, 3612e2839308SAlex Elder (unsigned long long) rbd_dev->dev_id); 3613b7f23c36SAlex Elder } 3614b7f23c36SAlex Elder 36151ddbe94eSAlex Elder /* 3616499afd5bSAlex Elder * Remove an rbd_dev from the global list, and record that its 3617499afd5bSAlex Elder * identifier is no longer in use. 36181ddbe94eSAlex Elder */ 3619e2839308SAlex Elder static void rbd_dev_id_put(struct rbd_device *rbd_dev) 36201ddbe94eSAlex Elder { 3621d184f6bfSAlex Elder struct list_head *tmp; 3622de71a297SAlex Elder int rbd_id = rbd_dev->dev_id; 3623d184f6bfSAlex Elder int max_id; 3624d184f6bfSAlex Elder 3625aafb230eSAlex Elder rbd_assert(rbd_id > 0); 3626499afd5bSAlex Elder 3627e2839308SAlex Elder dout("rbd_dev %p released dev id %llu\n", rbd_dev, 3628e2839308SAlex Elder (unsigned long long) rbd_dev->dev_id); 3629499afd5bSAlex Elder spin_lock(&rbd_dev_list_lock); 3630499afd5bSAlex Elder list_del_init(&rbd_dev->node); 3631d184f6bfSAlex Elder 3632d184f6bfSAlex Elder /* 3633d184f6bfSAlex Elder * If the id being "put" is not the current maximum, there 3634d184f6bfSAlex Elder * is nothing special we need to do. 
3635d184f6bfSAlex Elder */ 3636e2839308SAlex Elder if (rbd_id != atomic64_read(&rbd_dev_id_max)) { 3637d184f6bfSAlex Elder spin_unlock(&rbd_dev_list_lock); 3638d184f6bfSAlex Elder return; 3639d184f6bfSAlex Elder } 3640d184f6bfSAlex Elder 3641d184f6bfSAlex Elder /* 3642d184f6bfSAlex Elder * We need to update the current maximum id. Search the 3643d184f6bfSAlex Elder * list to find out what it is. We're more likely to find 3644d184f6bfSAlex Elder * the maximum at the end, so search the list backward. 3645d184f6bfSAlex Elder */ 3646d184f6bfSAlex Elder max_id = 0; 3647d184f6bfSAlex Elder list_for_each_prev(tmp, &rbd_dev_list) { 3648d184f6bfSAlex Elder struct rbd_device *rbd_dev; 3649d184f6bfSAlex Elder 3650d184f6bfSAlex Elder rbd_dev = list_entry(tmp, struct rbd_device, node); 3651b213e0b1SAlex Elder if (rbd_dev->dev_id > max_id) 3652b213e0b1SAlex Elder max_id = rbd_dev->dev_id; 3653d184f6bfSAlex Elder } 3654499afd5bSAlex Elder spin_unlock(&rbd_dev_list_lock); 36551ddbe94eSAlex Elder 36561ddbe94eSAlex Elder /* 3657e2839308SAlex Elder * The max id could have been updated by rbd_dev_id_get(), in 3658d184f6bfSAlex Elder * which case it now accurately reflects the new maximum. 3659d184f6bfSAlex Elder * Be careful not to overwrite the maximum value in that 3660d184f6bfSAlex Elder * case. 36611ddbe94eSAlex Elder */ 3662e2839308SAlex Elder atomic64_cmpxchg(&rbd_dev_id_max, rbd_id, max_id); 3663e2839308SAlex Elder dout(" max dev id has been reset\n"); 3664b7f23c36SAlex Elder } 3665b7f23c36SAlex Elder 3666a725f65eSAlex Elder /* 3667e28fff26SAlex Elder * Skips over white space at *buf, and updates *buf to point to the 3668e28fff26SAlex Elder * first found non-space character (if any). Returns the length of 3669593a9e7bSAlex Elder * the token (string of non-white space characters) found. Note 3670593a9e7bSAlex Elder * that *buf must be terminated with '\0'. 
3671e28fff26SAlex Elder */ 3672e28fff26SAlex Elder static inline size_t next_token(const char **buf) 3673e28fff26SAlex Elder { 3674e28fff26SAlex Elder /* 3675e28fff26SAlex Elder * These are the characters that produce nonzero for 3676e28fff26SAlex Elder * isspace() in the "C" and "POSIX" locales. 3677e28fff26SAlex Elder */ 3678e28fff26SAlex Elder const char *spaces = " \f\n\r\t\v"; 3679e28fff26SAlex Elder 3680e28fff26SAlex Elder *buf += strspn(*buf, spaces); /* Find start of token */ 3681e28fff26SAlex Elder 3682e28fff26SAlex Elder return strcspn(*buf, spaces); /* Return token length */ 3683e28fff26SAlex Elder } 3684e28fff26SAlex Elder 3685e28fff26SAlex Elder /* 3686e28fff26SAlex Elder * Finds the next token in *buf, and if the provided token buffer is 3687e28fff26SAlex Elder * big enough, copies the found token into it. The result, if 3688593a9e7bSAlex Elder * copied, is guaranteed to be terminated with '\0'. Note that *buf 3689593a9e7bSAlex Elder * must be terminated with '\0' on entry. 3690e28fff26SAlex Elder * 3691e28fff26SAlex Elder * Returns the length of the token found (not including the '\0'). 3692e28fff26SAlex Elder * Return value will be 0 if no token is found, and it will be >= 3693e28fff26SAlex Elder * token_size if the token would not fit. 3694e28fff26SAlex Elder * 3695593a9e7bSAlex Elder * The *buf pointer will be updated to point beyond the end of the 3696e28fff26SAlex Elder * found token. Note that this occurs even if the token buffer is 3697e28fff26SAlex Elder * too small to hold it. 
3698e28fff26SAlex Elder */ 3699e28fff26SAlex Elder static inline size_t copy_token(const char **buf, 3700e28fff26SAlex Elder char *token, 3701e28fff26SAlex Elder size_t token_size) 3702e28fff26SAlex Elder { 3703e28fff26SAlex Elder size_t len; 3704e28fff26SAlex Elder 3705e28fff26SAlex Elder len = next_token(buf); 3706e28fff26SAlex Elder if (len < token_size) { 3707e28fff26SAlex Elder memcpy(token, *buf, len); 3708e28fff26SAlex Elder *(token + len) = '\0'; 3709e28fff26SAlex Elder } 3710e28fff26SAlex Elder *buf += len; 3711e28fff26SAlex Elder 3712e28fff26SAlex Elder return len; 3713e28fff26SAlex Elder } 3714e28fff26SAlex Elder 3715e28fff26SAlex Elder /* 3716ea3352f4SAlex Elder * Finds the next token in *buf, dynamically allocates a buffer big 3717ea3352f4SAlex Elder * enough to hold a copy of it, and copies the token into the new 3718ea3352f4SAlex Elder * buffer. The copy is guaranteed to be terminated with '\0'. Note 3719ea3352f4SAlex Elder * that a duplicate buffer is created even for a zero-length token. 3720ea3352f4SAlex Elder * 3721ea3352f4SAlex Elder * Returns a pointer to the newly-allocated duplicate, or a null 3722ea3352f4SAlex Elder * pointer if memory for the duplicate was not available. If 3723ea3352f4SAlex Elder * the lenp argument is a non-null pointer, the length of the token 3724ea3352f4SAlex Elder * (not including the '\0') is returned in *lenp. 3725ea3352f4SAlex Elder * 3726ea3352f4SAlex Elder * If successful, the *buf pointer will be updated to point beyond 3727ea3352f4SAlex Elder * the end of the found token. 3728ea3352f4SAlex Elder * 3729ea3352f4SAlex Elder * Note: uses GFP_KERNEL for allocation. 
3730ea3352f4SAlex Elder */ 3731ea3352f4SAlex Elder static inline char *dup_token(const char **buf, size_t *lenp) 3732ea3352f4SAlex Elder { 3733ea3352f4SAlex Elder char *dup; 3734ea3352f4SAlex Elder size_t len; 3735ea3352f4SAlex Elder 3736ea3352f4SAlex Elder len = next_token(buf); 37374caf35f9SAlex Elder dup = kmemdup(*buf, len + 1, GFP_KERNEL); 3738ea3352f4SAlex Elder if (!dup) 3739ea3352f4SAlex Elder return NULL; 3740ea3352f4SAlex Elder *(dup + len) = '\0'; 3741ea3352f4SAlex Elder *buf += len; 3742ea3352f4SAlex Elder 3743ea3352f4SAlex Elder if (lenp) 3744ea3352f4SAlex Elder *lenp = len; 3745ea3352f4SAlex Elder 3746ea3352f4SAlex Elder return dup; 3747ea3352f4SAlex Elder } 3748ea3352f4SAlex Elder 3749ea3352f4SAlex Elder /* 3750859c31dfSAlex Elder * Parse the options provided for an "rbd add" (i.e., rbd image 3751859c31dfSAlex Elder * mapping) request. These arrive via a write to /sys/bus/rbd/add, 3752859c31dfSAlex Elder * and the data written is passed here via a NUL-terminated buffer. 3753859c31dfSAlex Elder * Returns 0 if successful or an error code otherwise. 3754d22f76e7SAlex Elder * 3755859c31dfSAlex Elder * The information extracted from these options is recorded in 3756859c31dfSAlex Elder * the other parameters which return dynamically-allocated 3757859c31dfSAlex Elder * structures: 3758859c31dfSAlex Elder * ceph_opts 3759859c31dfSAlex Elder * The address of a pointer that will refer to a ceph options 3760859c31dfSAlex Elder * structure. Caller must release the returned pointer using 3761859c31dfSAlex Elder * ceph_destroy_options() when it is no longer needed. 3762859c31dfSAlex Elder * rbd_opts 3763859c31dfSAlex Elder * Address of an rbd options pointer. Fully initialized by 3764859c31dfSAlex Elder * this function; caller must release with kfree(). 3765859c31dfSAlex Elder * spec 3766859c31dfSAlex Elder * Address of an rbd image specification pointer. Fully 3767859c31dfSAlex Elder * initialized by this function based on parsed options. 
 *	Caller must release with rbd_spec_put().
 *
 * The options passed take this form:
 *  <mon_addrs> <options> <pool_name> <image_name> [<snap_name>]
 * where:
 *  <mon_addrs>
 *      A comma-separated list of one or more monitor addresses.
 *      A monitor address is an ip address, optionally followed
 *      by a port number (separated by a colon).
 *        I.e.:  ip1[:port1][,ip2[:port2]...]
 *  <options>
 *      A comma-separated list of ceph and/or rbd options.
 *  <pool_name>
 *      The name of the rados pool containing the rbd image.
 *  <image_name>
 *      The name of the image in that pool to map.
 *  <snap_name>
 *      An optional snapshot name.  If provided, the mapping will
 *      present data from the image at the time that snapshot was
 *      created.  The image head is used if no snapshot name is
 *      provided.  Snapshot mappings are always read-only.
3789a725f65eSAlex Elder */ 3790859c31dfSAlex Elder static int rbd_add_parse_args(const char *buf, 3791dc79b113SAlex Elder struct ceph_options **ceph_opts, 3792859c31dfSAlex Elder struct rbd_options **opts, 3793859c31dfSAlex Elder struct rbd_spec **rbd_spec) 3794a725f65eSAlex Elder { 3795e28fff26SAlex Elder size_t len; 3796859c31dfSAlex Elder char *options; 37970ddebc0cSAlex Elder const char *mon_addrs; 37980ddebc0cSAlex Elder size_t mon_addrs_size; 3799859c31dfSAlex Elder struct rbd_spec *spec = NULL; 38004e9afebaSAlex Elder struct rbd_options *rbd_opts = NULL; 3801859c31dfSAlex Elder struct ceph_options *copts; 3802dc79b113SAlex Elder int ret; 3803e28fff26SAlex Elder 3804e28fff26SAlex Elder /* The first four tokens are required */ 3805e28fff26SAlex Elder 38067ef3214aSAlex Elder len = next_token(&buf); 38074fb5d671SAlex Elder if (!len) { 38084fb5d671SAlex Elder rbd_warn(NULL, "no monitor address(es) provided"); 38094fb5d671SAlex Elder return -EINVAL; 38104fb5d671SAlex Elder } 38110ddebc0cSAlex Elder mon_addrs = buf; 3812f28e565aSAlex Elder mon_addrs_size = len + 1; 38137ef3214aSAlex Elder buf += len; 3814a725f65eSAlex Elder 3815dc79b113SAlex Elder ret = -EINVAL; 3816f28e565aSAlex Elder options = dup_token(&buf, NULL); 3817f28e565aSAlex Elder if (!options) 3818dc79b113SAlex Elder return -ENOMEM; 38194fb5d671SAlex Elder if (!*options) { 38204fb5d671SAlex Elder rbd_warn(NULL, "no options provided"); 38214fb5d671SAlex Elder goto out_err; 38224fb5d671SAlex Elder } 3823a725f65eSAlex Elder 3824859c31dfSAlex Elder spec = rbd_spec_alloc(); 3825859c31dfSAlex Elder if (!spec) 3826f28e565aSAlex Elder goto out_mem; 3827859c31dfSAlex Elder 3828859c31dfSAlex Elder spec->pool_name = dup_token(&buf, NULL); 3829859c31dfSAlex Elder if (!spec->pool_name) 3830859c31dfSAlex Elder goto out_mem; 38314fb5d671SAlex Elder if (!*spec->pool_name) { 38324fb5d671SAlex Elder rbd_warn(NULL, "no pool name provided"); 38334fb5d671SAlex Elder goto out_err; 38344fb5d671SAlex Elder } 3835e28fff26SAlex 
Elder 383669e7a02fSAlex Elder spec->image_name = dup_token(&buf, NULL); 3837859c31dfSAlex Elder if (!spec->image_name) 3838f28e565aSAlex Elder goto out_mem; 38394fb5d671SAlex Elder if (!*spec->image_name) { 38404fb5d671SAlex Elder rbd_warn(NULL, "no image name provided"); 38414fb5d671SAlex Elder goto out_err; 38424fb5d671SAlex Elder } 3843e28fff26SAlex Elder 3844f28e565aSAlex Elder /* 3845f28e565aSAlex Elder * Snapshot name is optional; default is to use "-" 3846f28e565aSAlex Elder * (indicating the head/no snapshot). 3847f28e565aSAlex Elder */ 38483feeb894SAlex Elder len = next_token(&buf); 3849820a5f3eSAlex Elder if (!len) { 38503feeb894SAlex Elder buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */ 38513feeb894SAlex Elder len = sizeof (RBD_SNAP_HEAD_NAME) - 1; 3852f28e565aSAlex Elder } else if (len > RBD_MAX_SNAP_NAME_LEN) { 3853dc79b113SAlex Elder ret = -ENAMETOOLONG; 3854f28e565aSAlex Elder goto out_err; 3855849b4260SAlex Elder } 38564caf35f9SAlex Elder spec->snap_name = kmemdup(buf, len + 1, GFP_KERNEL); 3857859c31dfSAlex Elder if (!spec->snap_name) 3858f28e565aSAlex Elder goto out_mem; 3859859c31dfSAlex Elder *(spec->snap_name + len) = '\0'; 3860e5c35534SAlex Elder 38610ddebc0cSAlex Elder /* Initialize all rbd options to the defaults */ 3862e28fff26SAlex Elder 38634e9afebaSAlex Elder rbd_opts = kzalloc(sizeof (*rbd_opts), GFP_KERNEL); 38644e9afebaSAlex Elder if (!rbd_opts) 38654e9afebaSAlex Elder goto out_mem; 38664e9afebaSAlex Elder 38674e9afebaSAlex Elder rbd_opts->read_only = RBD_READ_ONLY_DEFAULT; 3868d22f76e7SAlex Elder 3869859c31dfSAlex Elder copts = ceph_parse_options(options, mon_addrs, 38700ddebc0cSAlex Elder mon_addrs + mon_addrs_size - 1, 38714e9afebaSAlex Elder parse_rbd_opts_token, rbd_opts); 3872859c31dfSAlex Elder if (IS_ERR(copts)) { 3873859c31dfSAlex Elder ret = PTR_ERR(copts); 3874dc79b113SAlex Elder goto out_err; 3875dc79b113SAlex Elder } 3876859c31dfSAlex Elder kfree(options); 3877859c31dfSAlex Elder 3878859c31dfSAlex Elder *ceph_opts = 
copts; 38794e9afebaSAlex Elder *opts = rbd_opts; 3880859c31dfSAlex Elder *rbd_spec = spec; 38810ddebc0cSAlex Elder 3882dc79b113SAlex Elder return 0; 3883f28e565aSAlex Elder out_mem: 3884dc79b113SAlex Elder ret = -ENOMEM; 3885d22f76e7SAlex Elder out_err: 3886859c31dfSAlex Elder kfree(rbd_opts); 3887859c31dfSAlex Elder rbd_spec_put(spec); 3888f28e565aSAlex Elder kfree(options); 3889d22f76e7SAlex Elder 3890dc79b113SAlex Elder return ret; 3891a725f65eSAlex Elder } 3892a725f65eSAlex Elder 3893589d30e0SAlex Elder /* 3894589d30e0SAlex Elder * An rbd format 2 image has a unique identifier, distinct from the 3895589d30e0SAlex Elder * name given to it by the user. Internally, that identifier is 3896589d30e0SAlex Elder * what's used to specify the names of objects related to the image. 3897589d30e0SAlex Elder * 3898589d30e0SAlex Elder * A special "rbd id" object is used to map an rbd image name to its 3899589d30e0SAlex Elder * id. If that object doesn't exist, then there is no v2 rbd image 3900589d30e0SAlex Elder * with the supplied name. 3901589d30e0SAlex Elder * 3902589d30e0SAlex Elder * This function will record the given rbd_dev's image_id field if 3903589d30e0SAlex Elder * it can be determined, and in that case will return 0. If any 3904589d30e0SAlex Elder * errors occur a negative errno will be returned and the rbd_dev's 3905589d30e0SAlex Elder * image_id field will be unchanged (and should be NULL). 
3906589d30e0SAlex Elder */ 3907589d30e0SAlex Elder static int rbd_dev_image_id(struct rbd_device *rbd_dev) 3908589d30e0SAlex Elder { 3909589d30e0SAlex Elder int ret; 3910589d30e0SAlex Elder size_t size; 3911589d30e0SAlex Elder char *object_name; 3912589d30e0SAlex Elder void *response; 3913589d30e0SAlex Elder void *p; 3914589d30e0SAlex Elder 39152f82ee54SAlex Elder /* If we already have it we don't need to look it up */ 39162f82ee54SAlex Elder 39172f82ee54SAlex Elder if (rbd_dev->spec->image_id) 39182f82ee54SAlex Elder return 0; 39192f82ee54SAlex Elder 3920589d30e0SAlex Elder /* 39212c0d0a10SAlex Elder * When probing a parent image, the image id is already 39222c0d0a10SAlex Elder * known (and the image name likely is not). There's no 39232c0d0a10SAlex Elder * need to fetch the image id again in this case. 39242c0d0a10SAlex Elder */ 39252c0d0a10SAlex Elder if (rbd_dev->spec->image_id) 39262c0d0a10SAlex Elder return 0; 39272c0d0a10SAlex Elder 39282c0d0a10SAlex Elder /* 3929589d30e0SAlex Elder * First, see if the format 2 image id file exists, and if 3930589d30e0SAlex Elder * so, get the image's persistent id from it. 
3931589d30e0SAlex Elder */ 393269e7a02fSAlex Elder size = sizeof (RBD_ID_PREFIX) + strlen(rbd_dev->spec->image_name); 3933589d30e0SAlex Elder object_name = kmalloc(size, GFP_NOIO); 3934589d30e0SAlex Elder if (!object_name) 3935589d30e0SAlex Elder return -ENOMEM; 39360d7dbfceSAlex Elder sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->spec->image_name); 3937589d30e0SAlex Elder dout("rbd id object name is %s\n", object_name); 3938589d30e0SAlex Elder 3939589d30e0SAlex Elder /* Response will be an encoded string, which includes a length */ 3940589d30e0SAlex Elder 3941589d30e0SAlex Elder size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX; 3942589d30e0SAlex Elder response = kzalloc(size, GFP_NOIO); 3943589d30e0SAlex Elder if (!response) { 3944589d30e0SAlex Elder ret = -ENOMEM; 3945589d30e0SAlex Elder goto out; 3946589d30e0SAlex Elder } 3947589d30e0SAlex Elder 394836be9a76SAlex Elder ret = rbd_obj_method_sync(rbd_dev, object_name, 3949589d30e0SAlex Elder "rbd", "get_id", 3950589d30e0SAlex Elder NULL, 0, 395107b2391fSAlex Elder response, RBD_IMAGE_ID_LEN_MAX, NULL); 395236be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 3953589d30e0SAlex Elder if (ret < 0) 3954589d30e0SAlex Elder goto out; 3955589d30e0SAlex Elder 3956589d30e0SAlex Elder p = response; 39570d7dbfceSAlex Elder rbd_dev->spec->image_id = ceph_extract_encoded_string(&p, 3958589d30e0SAlex Elder p + RBD_IMAGE_ID_LEN_MAX, 3959979ed480SAlex Elder NULL, GFP_NOIO); 39600d7dbfceSAlex Elder if (IS_ERR(rbd_dev->spec->image_id)) { 39610d7dbfceSAlex Elder ret = PTR_ERR(rbd_dev->spec->image_id); 39620d7dbfceSAlex Elder rbd_dev->spec->image_id = NULL; 3963589d30e0SAlex Elder } else { 39640d7dbfceSAlex Elder dout("image_id is %s\n", rbd_dev->spec->image_id); 3965589d30e0SAlex Elder } 3966589d30e0SAlex Elder out: 3967589d30e0SAlex Elder kfree(response); 3968589d30e0SAlex Elder kfree(object_name); 3969589d30e0SAlex Elder 3970589d30e0SAlex Elder return ret; 3971589d30e0SAlex Elder } 
3972589d30e0SAlex Elder 3973a30b71b9SAlex Elder static int rbd_dev_v1_probe(struct rbd_device *rbd_dev) 3974a30b71b9SAlex Elder { 3975a30b71b9SAlex Elder int ret; 3976a30b71b9SAlex Elder size_t size; 3977a30b71b9SAlex Elder 3978a30b71b9SAlex Elder /* Version 1 images have no id; empty string is used */ 3979a30b71b9SAlex Elder 39800d7dbfceSAlex Elder rbd_dev->spec->image_id = kstrdup("", GFP_KERNEL); 39810d7dbfceSAlex Elder if (!rbd_dev->spec->image_id) 3982a30b71b9SAlex Elder return -ENOMEM; 3983a30b71b9SAlex Elder 3984a30b71b9SAlex Elder /* Record the header object name for this rbd image. */ 3985a30b71b9SAlex Elder 398669e7a02fSAlex Elder size = strlen(rbd_dev->spec->image_name) + sizeof (RBD_SUFFIX); 3987a30b71b9SAlex Elder rbd_dev->header_name = kmalloc(size, GFP_KERNEL); 3988a30b71b9SAlex Elder if (!rbd_dev->header_name) { 3989a30b71b9SAlex Elder ret = -ENOMEM; 3990a30b71b9SAlex Elder goto out_err; 3991a30b71b9SAlex Elder } 39920d7dbfceSAlex Elder sprintf(rbd_dev->header_name, "%s%s", 39930d7dbfceSAlex Elder rbd_dev->spec->image_name, RBD_SUFFIX); 3994a30b71b9SAlex Elder 3995a30b71b9SAlex Elder /* Populate rbd image metadata */ 3996a30b71b9SAlex Elder 3997a30b71b9SAlex Elder ret = rbd_read_header(rbd_dev, &rbd_dev->header); 3998a30b71b9SAlex Elder if (ret < 0) 3999a30b71b9SAlex Elder goto out_err; 400086b00e0dSAlex Elder 400186b00e0dSAlex Elder /* Version 1 images have no parent (no layering) */ 400286b00e0dSAlex Elder 400386b00e0dSAlex Elder rbd_dev->parent_spec = NULL; 400486b00e0dSAlex Elder rbd_dev->parent_overlap = 0; 400586b00e0dSAlex Elder 4006a30b71b9SAlex Elder rbd_dev->image_format = 1; 4007a30b71b9SAlex Elder 4008a30b71b9SAlex Elder dout("discovered version 1 image, header name is %s\n", 4009a30b71b9SAlex Elder rbd_dev->header_name); 4010a30b71b9SAlex Elder 4011a30b71b9SAlex Elder return 0; 4012a30b71b9SAlex Elder 4013a30b71b9SAlex Elder out_err: 4014a30b71b9SAlex Elder kfree(rbd_dev->header_name); 4015a30b71b9SAlex Elder rbd_dev->header_name = 
NULL; 40160d7dbfceSAlex Elder kfree(rbd_dev->spec->image_id); 40170d7dbfceSAlex Elder rbd_dev->spec->image_id = NULL; 4018a30b71b9SAlex Elder 4019a30b71b9SAlex Elder return ret; 4020a30b71b9SAlex Elder } 4021a30b71b9SAlex Elder 4022a30b71b9SAlex Elder static int rbd_dev_v2_probe(struct rbd_device *rbd_dev) 4023a30b71b9SAlex Elder { 4024a30b71b9SAlex Elder size_t size; 40259d475de5SAlex Elder int ret; 40266e14b1a6SAlex Elder u64 ver = 0; 4027a30b71b9SAlex Elder 4028a30b71b9SAlex Elder /* 4029a30b71b9SAlex Elder * Image id was filled in by the caller. Record the header 4030a30b71b9SAlex Elder * object name for this rbd image. 4031a30b71b9SAlex Elder */ 4032979ed480SAlex Elder size = sizeof (RBD_HEADER_PREFIX) + strlen(rbd_dev->spec->image_id); 4033a30b71b9SAlex Elder rbd_dev->header_name = kmalloc(size, GFP_KERNEL); 4034a30b71b9SAlex Elder if (!rbd_dev->header_name) 4035a30b71b9SAlex Elder return -ENOMEM; 4036a30b71b9SAlex Elder sprintf(rbd_dev->header_name, "%s%s", 40370d7dbfceSAlex Elder RBD_HEADER_PREFIX, rbd_dev->spec->image_id); 40389d475de5SAlex Elder 40399d475de5SAlex Elder /* Get the size and object order for the image */ 40409d475de5SAlex Elder 40419d475de5SAlex Elder ret = rbd_dev_v2_image_size(rbd_dev); 40429d475de5SAlex Elder if (ret < 0) 40439d475de5SAlex Elder goto out_err; 40441e130199SAlex Elder 40451e130199SAlex Elder /* Get the object prefix (a.k.a. 
block_name) for the image */ 40461e130199SAlex Elder 40471e130199SAlex Elder ret = rbd_dev_v2_object_prefix(rbd_dev); 40481e130199SAlex Elder if (ret < 0) 40491e130199SAlex Elder goto out_err; 4050b1b5402aSAlex Elder 4051d889140cSAlex Elder /* Get the and check features for the image */ 4052b1b5402aSAlex Elder 4053b1b5402aSAlex Elder ret = rbd_dev_v2_features(rbd_dev); 4054b1b5402aSAlex Elder if (ret < 0) 4055b1b5402aSAlex Elder goto out_err; 405635d489f9SAlex Elder 405786b00e0dSAlex Elder /* If the image supports layering, get the parent info */ 405886b00e0dSAlex Elder 405986b00e0dSAlex Elder if (rbd_dev->header.features & RBD_FEATURE_LAYERING) { 406086b00e0dSAlex Elder ret = rbd_dev_v2_parent_info(rbd_dev); 406186b00e0dSAlex Elder if (ret < 0) 406286b00e0dSAlex Elder goto out_err; 406386b00e0dSAlex Elder } 406486b00e0dSAlex Elder 40656e14b1a6SAlex Elder /* crypto and compression type aren't (yet) supported for v2 images */ 406635d489f9SAlex Elder 40676e14b1a6SAlex Elder rbd_dev->header.crypt_type = 0; 40686e14b1a6SAlex Elder rbd_dev->header.comp_type = 0; 40696e14b1a6SAlex Elder 40706e14b1a6SAlex Elder /* Get the snapshot context, plus the header version */ 40716e14b1a6SAlex Elder 40726e14b1a6SAlex Elder ret = rbd_dev_v2_snap_context(rbd_dev, &ver); 407335d489f9SAlex Elder if (ret) 407435d489f9SAlex Elder goto out_err; 40756e14b1a6SAlex Elder rbd_dev->header.obj_version = ver; 40766e14b1a6SAlex Elder 4077a30b71b9SAlex Elder rbd_dev->image_format = 2; 4078a30b71b9SAlex Elder 4079a30b71b9SAlex Elder dout("discovered version 2 image, header name is %s\n", 4080a30b71b9SAlex Elder rbd_dev->header_name); 4081a30b71b9SAlex Elder 408235152979SAlex Elder return 0; 40839d475de5SAlex Elder out_err: 408486b00e0dSAlex Elder rbd_dev->parent_overlap = 0; 408586b00e0dSAlex Elder rbd_spec_put(rbd_dev->parent_spec); 408686b00e0dSAlex Elder rbd_dev->parent_spec = NULL; 40879d475de5SAlex Elder kfree(rbd_dev->header_name); 40889d475de5SAlex Elder rbd_dev->header_name = NULL; 
40891e130199SAlex Elder kfree(rbd_dev->header.object_prefix); 40901e130199SAlex Elder rbd_dev->header.object_prefix = NULL; 40919d475de5SAlex Elder 40929d475de5SAlex Elder return ret; 4093a30b71b9SAlex Elder } 4094a30b71b9SAlex Elder 409583a06263SAlex Elder static int rbd_dev_probe_finish(struct rbd_device *rbd_dev) 409683a06263SAlex Elder { 40972f82ee54SAlex Elder struct rbd_device *parent = NULL; 40982f82ee54SAlex Elder struct rbd_spec *parent_spec = NULL; 40992f82ee54SAlex Elder struct rbd_client *rbdc = NULL; 410083a06263SAlex Elder int ret; 410183a06263SAlex Elder 410283a06263SAlex Elder /* no need to lock here, as rbd_dev is not registered yet */ 410383a06263SAlex Elder ret = rbd_dev_snaps_update(rbd_dev); 410483a06263SAlex Elder if (ret) 410583a06263SAlex Elder return ret; 410683a06263SAlex Elder 41079e15b77dSAlex Elder ret = rbd_dev_probe_update_spec(rbd_dev); 41089e15b77dSAlex Elder if (ret) 41099e15b77dSAlex Elder goto err_out_snaps; 41109e15b77dSAlex Elder 411183a06263SAlex Elder ret = rbd_dev_set_mapping(rbd_dev); 411283a06263SAlex Elder if (ret) 411383a06263SAlex Elder goto err_out_snaps; 411483a06263SAlex Elder 411583a06263SAlex Elder /* generate unique id: find highest unique id, add one */ 411683a06263SAlex Elder rbd_dev_id_get(rbd_dev); 411783a06263SAlex Elder 411883a06263SAlex Elder /* Fill in the device name, now that we have its id. */ 411983a06263SAlex Elder BUILD_BUG_ON(DEV_NAME_LEN 412083a06263SAlex Elder < sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH); 412183a06263SAlex Elder sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id); 412283a06263SAlex Elder 412383a06263SAlex Elder /* Get our block major device number. */ 412483a06263SAlex Elder 412583a06263SAlex Elder ret = register_blkdev(0, rbd_dev->name); 412683a06263SAlex Elder if (ret < 0) 412783a06263SAlex Elder goto err_out_id; 412883a06263SAlex Elder rbd_dev->major = ret; 412983a06263SAlex Elder 413083a06263SAlex Elder /* Set up the blkdev mapping. 
*/ 413183a06263SAlex Elder 413283a06263SAlex Elder ret = rbd_init_disk(rbd_dev); 413383a06263SAlex Elder if (ret) 413483a06263SAlex Elder goto err_out_blkdev; 413583a06263SAlex Elder 413683a06263SAlex Elder ret = rbd_bus_add_dev(rbd_dev); 413783a06263SAlex Elder if (ret) 413883a06263SAlex Elder goto err_out_disk; 413983a06263SAlex Elder 414083a06263SAlex Elder /* 414183a06263SAlex Elder * At this point cleanup in the event of an error is the job 414283a06263SAlex Elder * of the sysfs code (initiated by rbd_bus_del_dev()). 414383a06263SAlex Elder */ 41442f82ee54SAlex Elder /* Probe the parent if there is one */ 41452f82ee54SAlex Elder 41462f82ee54SAlex Elder if (rbd_dev->parent_spec) { 41472f82ee54SAlex Elder /* 41482f82ee54SAlex Elder * We need to pass a reference to the client and the 41492f82ee54SAlex Elder * parent spec when creating the parent rbd_dev. 41502f82ee54SAlex Elder * Images related by parent/child relationships 41512f82ee54SAlex Elder * always share both. 41522f82ee54SAlex Elder */ 41532f82ee54SAlex Elder parent_spec = rbd_spec_get(rbd_dev->parent_spec); 41542f82ee54SAlex Elder rbdc = __rbd_get_client(rbd_dev->rbd_client); 41552f82ee54SAlex Elder 41562f82ee54SAlex Elder parent = rbd_dev_create(rbdc, parent_spec); 41572f82ee54SAlex Elder if (!parent) { 41582f82ee54SAlex Elder ret = -ENOMEM; 41592f82ee54SAlex Elder goto err_out_spec; 41602f82ee54SAlex Elder } 41612f82ee54SAlex Elder rbdc = NULL; /* parent now owns reference */ 41622f82ee54SAlex Elder parent_spec = NULL; /* parent now owns reference */ 41632f82ee54SAlex Elder ret = rbd_dev_probe(parent); 41642f82ee54SAlex Elder if (ret < 0) 41652f82ee54SAlex Elder goto err_out_parent; 41662f82ee54SAlex Elder rbd_dev->parent = parent; 41672f82ee54SAlex Elder } 41682f82ee54SAlex Elder 416983a06263SAlex Elder down_write(&rbd_dev->header_rwsem); 417083a06263SAlex Elder ret = rbd_dev_snaps_register(rbd_dev); 417183a06263SAlex Elder up_write(&rbd_dev->header_rwsem); 417283a06263SAlex Elder if (ret) 
417383a06263SAlex Elder goto err_out_bus; 417483a06263SAlex Elder 41759969ebc5SAlex Elder ret = rbd_dev_header_watch_sync(rbd_dev, 1); 417683a06263SAlex Elder if (ret) 417783a06263SAlex Elder goto err_out_bus; 417883a06263SAlex Elder 417983a06263SAlex Elder /* Everything's ready. Announce the disk to the world. */ 418083a06263SAlex Elder 418183a06263SAlex Elder add_disk(rbd_dev->disk); 418283a06263SAlex Elder 418383a06263SAlex Elder pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name, 418483a06263SAlex Elder (unsigned long long) rbd_dev->mapping.size); 418583a06263SAlex Elder 418683a06263SAlex Elder return ret; 41872f82ee54SAlex Elder 41882f82ee54SAlex Elder err_out_parent: 41892f82ee54SAlex Elder rbd_dev_destroy(parent); 41902f82ee54SAlex Elder err_out_spec: 41912f82ee54SAlex Elder rbd_spec_put(parent_spec); 41922f82ee54SAlex Elder rbd_put_client(rbdc); 419383a06263SAlex Elder err_out_bus: 419483a06263SAlex Elder /* this will also clean up rest of rbd_dev stuff */ 419583a06263SAlex Elder 419683a06263SAlex Elder rbd_bus_del_dev(rbd_dev); 419783a06263SAlex Elder 419883a06263SAlex Elder return ret; 419983a06263SAlex Elder err_out_disk: 420083a06263SAlex Elder rbd_free_disk(rbd_dev); 420183a06263SAlex Elder err_out_blkdev: 420283a06263SAlex Elder unregister_blkdev(rbd_dev->major, rbd_dev->name); 420383a06263SAlex Elder err_out_id: 420483a06263SAlex Elder rbd_dev_id_put(rbd_dev); 420583a06263SAlex Elder err_out_snaps: 420683a06263SAlex Elder rbd_remove_all_snaps(rbd_dev); 420783a06263SAlex Elder 420883a06263SAlex Elder return ret; 420983a06263SAlex Elder } 421083a06263SAlex Elder 4211a30b71b9SAlex Elder /* 4212a30b71b9SAlex Elder * Probe for the existence of the header object for the given rbd 4213a30b71b9SAlex Elder * device. For format 2 images this includes determining the image 4214a30b71b9SAlex Elder * id. 
4215a30b71b9SAlex Elder */ 4216a30b71b9SAlex Elder static int rbd_dev_probe(struct rbd_device *rbd_dev) 4217a30b71b9SAlex Elder { 4218a30b71b9SAlex Elder int ret; 4219a30b71b9SAlex Elder 4220a30b71b9SAlex Elder /* 4221a30b71b9SAlex Elder * Get the id from the image id object. If it's not a 4222a30b71b9SAlex Elder * format 2 image, we'll get ENOENT back, and we'll assume 4223a30b71b9SAlex Elder * it's a format 1 image. 4224a30b71b9SAlex Elder */ 4225a30b71b9SAlex Elder ret = rbd_dev_image_id(rbd_dev); 4226a30b71b9SAlex Elder if (ret) 4227a30b71b9SAlex Elder ret = rbd_dev_v1_probe(rbd_dev); 4228a30b71b9SAlex Elder else 4229a30b71b9SAlex Elder ret = rbd_dev_v2_probe(rbd_dev); 423083a06263SAlex Elder if (ret) { 4231a30b71b9SAlex Elder dout("probe failed, returning %d\n", ret); 4232a30b71b9SAlex Elder 4233a30b71b9SAlex Elder return ret; 4234a30b71b9SAlex Elder } 4235a30b71b9SAlex Elder 423683a06263SAlex Elder ret = rbd_dev_probe_finish(rbd_dev); 423783a06263SAlex Elder if (ret) 423883a06263SAlex Elder rbd_header_free(&rbd_dev->header); 423983a06263SAlex Elder 424083a06263SAlex Elder return ret; 424183a06263SAlex Elder } 424283a06263SAlex Elder 424359c2be1eSYehuda Sadeh static ssize_t rbd_add(struct bus_type *bus, 424459c2be1eSYehuda Sadeh const char *buf, 424559c2be1eSYehuda Sadeh size_t count) 4246602adf40SYehuda Sadeh { 4247cb8627c7SAlex Elder struct rbd_device *rbd_dev = NULL; 4248dc79b113SAlex Elder struct ceph_options *ceph_opts = NULL; 42494e9afebaSAlex Elder struct rbd_options *rbd_opts = NULL; 4250859c31dfSAlex Elder struct rbd_spec *spec = NULL; 42519d3997fdSAlex Elder struct rbd_client *rbdc; 425227cc2594SAlex Elder struct ceph_osd_client *osdc; 425327cc2594SAlex Elder int rc = -ENOMEM; 4254602adf40SYehuda Sadeh 4255602adf40SYehuda Sadeh if (!try_module_get(THIS_MODULE)) 4256602adf40SYehuda Sadeh return -ENODEV; 4257602adf40SYehuda Sadeh 4258a725f65eSAlex Elder /* parse add command */ 4259859c31dfSAlex Elder rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, 
&spec); 4260dc79b113SAlex Elder if (rc < 0) 4261bd4ba655SAlex Elder goto err_out_module; 4262a725f65eSAlex Elder 42639d3997fdSAlex Elder rbdc = rbd_get_client(ceph_opts); 42649d3997fdSAlex Elder if (IS_ERR(rbdc)) { 42659d3997fdSAlex Elder rc = PTR_ERR(rbdc); 42660ddebc0cSAlex Elder goto err_out_args; 42679d3997fdSAlex Elder } 4268c53d5893SAlex Elder ceph_opts = NULL; /* rbd_dev client now owns this */ 4269602adf40SYehuda Sadeh 4270602adf40SYehuda Sadeh /* pick the pool */ 42719d3997fdSAlex Elder osdc = &rbdc->client->osdc; 4272859c31dfSAlex Elder rc = ceph_pg_poolid_by_name(osdc->osdmap, spec->pool_name); 4273602adf40SYehuda Sadeh if (rc < 0) 4274602adf40SYehuda Sadeh goto err_out_client; 4275859c31dfSAlex Elder spec->pool_id = (u64) rc; 4276859c31dfSAlex Elder 42770903e875SAlex Elder /* The ceph file layout needs to fit pool id in 32 bits */ 42780903e875SAlex Elder 42790903e875SAlex Elder if (WARN_ON(spec->pool_id > (u64) U32_MAX)) { 42800903e875SAlex Elder rc = -EIO; 42810903e875SAlex Elder goto err_out_client; 42820903e875SAlex Elder } 42830903e875SAlex Elder 4284c53d5893SAlex Elder rbd_dev = rbd_dev_create(rbdc, spec); 4285bd4ba655SAlex Elder if (!rbd_dev) 4286bd4ba655SAlex Elder goto err_out_client; 4287c53d5893SAlex Elder rbdc = NULL; /* rbd_dev now owns this */ 4288c53d5893SAlex Elder spec = NULL; /* rbd_dev now owns this */ 4289602adf40SYehuda Sadeh 4290bd4ba655SAlex Elder rbd_dev->mapping.read_only = rbd_opts->read_only; 4291c53d5893SAlex Elder kfree(rbd_opts); 4292c53d5893SAlex Elder rbd_opts = NULL; /* done with this */ 4293bd4ba655SAlex Elder 4294a30b71b9SAlex Elder rc = rbd_dev_probe(rbd_dev); 4295a30b71b9SAlex Elder if (rc < 0) 4296c53d5893SAlex Elder goto err_out_rbd_dev; 429705fd6f6fSAlex Elder 4298602adf40SYehuda Sadeh return count; 4299c53d5893SAlex Elder err_out_rbd_dev: 4300c53d5893SAlex Elder rbd_dev_destroy(rbd_dev); 4301bd4ba655SAlex Elder err_out_client: 43029d3997fdSAlex Elder rbd_put_client(rbdc); 43030ddebc0cSAlex Elder err_out_args: 
430478cea76eSAlex Elder if (ceph_opts) 430578cea76eSAlex Elder ceph_destroy_options(ceph_opts); 43064e9afebaSAlex Elder kfree(rbd_opts); 4307859c31dfSAlex Elder rbd_spec_put(spec); 4308bd4ba655SAlex Elder err_out_module: 4309bd4ba655SAlex Elder module_put(THIS_MODULE); 431027cc2594SAlex Elder 4311602adf40SYehuda Sadeh dout("Error adding device %s\n", buf); 431227cc2594SAlex Elder 431327cc2594SAlex Elder return (ssize_t) rc; 4314602adf40SYehuda Sadeh } 4315602adf40SYehuda Sadeh 4316de71a297SAlex Elder static struct rbd_device *__rbd_get_dev(unsigned long dev_id) 4317602adf40SYehuda Sadeh { 4318602adf40SYehuda Sadeh struct list_head *tmp; 4319602adf40SYehuda Sadeh struct rbd_device *rbd_dev; 4320602adf40SYehuda Sadeh 4321e124a82fSAlex Elder spin_lock(&rbd_dev_list_lock); 4322602adf40SYehuda Sadeh list_for_each(tmp, &rbd_dev_list) { 4323602adf40SYehuda Sadeh rbd_dev = list_entry(tmp, struct rbd_device, node); 4324de71a297SAlex Elder if (rbd_dev->dev_id == dev_id) { 4325e124a82fSAlex Elder spin_unlock(&rbd_dev_list_lock); 4326602adf40SYehuda Sadeh return rbd_dev; 4327602adf40SYehuda Sadeh } 4328e124a82fSAlex Elder } 4329e124a82fSAlex Elder spin_unlock(&rbd_dev_list_lock); 4330602adf40SYehuda Sadeh return NULL; 4331602adf40SYehuda Sadeh } 4332602adf40SYehuda Sadeh 4333dfc5606dSYehuda Sadeh static void rbd_dev_release(struct device *dev) 4334602adf40SYehuda Sadeh { 4335593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 4336602adf40SYehuda Sadeh 433759c2be1eSYehuda Sadeh if (rbd_dev->watch_event) 43389969ebc5SAlex Elder rbd_dev_header_watch_sync(rbd_dev, 0); 4339602adf40SYehuda Sadeh 4340602adf40SYehuda Sadeh /* clean up and free blkdev */ 4341602adf40SYehuda Sadeh rbd_free_disk(rbd_dev); 4342602adf40SYehuda Sadeh unregister_blkdev(rbd_dev->major, rbd_dev->name); 434332eec68dSAlex Elder 43442ac4e75dSAlex Elder /* release allocated disk header fields */ 43452ac4e75dSAlex Elder rbd_header_free(&rbd_dev->header); 43462ac4e75dSAlex Elder 434732eec68dSAlex 
Elder /* done with the id, and with the rbd_dev */ 4348e2839308SAlex Elder rbd_dev_id_put(rbd_dev); 4349c53d5893SAlex Elder rbd_assert(rbd_dev->rbd_client != NULL); 4350c53d5893SAlex Elder rbd_dev_destroy(rbd_dev); 4351602adf40SYehuda Sadeh 4352602adf40SYehuda Sadeh /* release module ref */ 4353602adf40SYehuda Sadeh module_put(THIS_MODULE); 4354602adf40SYehuda Sadeh } 4355602adf40SYehuda Sadeh 43562f82ee54SAlex Elder static void __rbd_remove(struct rbd_device *rbd_dev) 43572f82ee54SAlex Elder { 43582f82ee54SAlex Elder rbd_remove_all_snaps(rbd_dev); 43592f82ee54SAlex Elder rbd_bus_del_dev(rbd_dev); 43602f82ee54SAlex Elder } 43612f82ee54SAlex Elder 4362dfc5606dSYehuda Sadeh static ssize_t rbd_remove(struct bus_type *bus, 4363602adf40SYehuda Sadeh const char *buf, 4364602adf40SYehuda Sadeh size_t count) 4365602adf40SYehuda Sadeh { 4366602adf40SYehuda Sadeh struct rbd_device *rbd_dev = NULL; 4367602adf40SYehuda Sadeh int target_id, rc; 4368602adf40SYehuda Sadeh unsigned long ul; 4369602adf40SYehuda Sadeh int ret = count; 4370602adf40SYehuda Sadeh 4371602adf40SYehuda Sadeh rc = strict_strtoul(buf, 10, &ul); 4372602adf40SYehuda Sadeh if (rc) 4373602adf40SYehuda Sadeh return rc; 4374602adf40SYehuda Sadeh 4375602adf40SYehuda Sadeh /* convert to int; abort if we lost anything in the conversion */ 4376602adf40SYehuda Sadeh target_id = (int) ul; 4377602adf40SYehuda Sadeh if (target_id != ul) 4378602adf40SYehuda Sadeh return -EINVAL; 4379602adf40SYehuda Sadeh 4380602adf40SYehuda Sadeh mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); 4381602adf40SYehuda Sadeh 4382602adf40SYehuda Sadeh rbd_dev = __rbd_get_dev(target_id); 4383602adf40SYehuda Sadeh if (!rbd_dev) { 4384602adf40SYehuda Sadeh ret = -ENOENT; 4385602adf40SYehuda Sadeh goto done; 4386602adf40SYehuda Sadeh } 4387602adf40SYehuda Sadeh 4388a14ea269SAlex Elder spin_lock_irq(&rbd_dev->lock); 4389b82d167bSAlex Elder if (rbd_dev->open_count) 439042382b70SAlex Elder ret = -EBUSY; 4391b82d167bSAlex Elder else 
4392b82d167bSAlex Elder set_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags); 4393a14ea269SAlex Elder spin_unlock_irq(&rbd_dev->lock); 4394b82d167bSAlex Elder if (ret < 0) 439542382b70SAlex Elder goto done; 439642382b70SAlex Elder 43972f82ee54SAlex Elder while (rbd_dev->parent_spec) { 43982f82ee54SAlex Elder struct rbd_device *first = rbd_dev; 43992f82ee54SAlex Elder struct rbd_device *second = first->parent; 44002f82ee54SAlex Elder struct rbd_device *third; 44012f82ee54SAlex Elder 44022f82ee54SAlex Elder /* 44032f82ee54SAlex Elder * Follow to the parent with no grandparent and 44042f82ee54SAlex Elder * remove it. 44052f82ee54SAlex Elder */ 44062f82ee54SAlex Elder while (second && (third = second->parent)) { 44072f82ee54SAlex Elder first = second; 44082f82ee54SAlex Elder second = third; 44092f82ee54SAlex Elder } 44102f82ee54SAlex Elder __rbd_remove(second); 44112f82ee54SAlex Elder rbd_spec_put(first->parent_spec); 44122f82ee54SAlex Elder first->parent_spec = NULL; 44132f82ee54SAlex Elder first->parent_overlap = 0; 44142f82ee54SAlex Elder first->parent = NULL; 44152f82ee54SAlex Elder } 44162f82ee54SAlex Elder __rbd_remove(rbd_dev); 4417602adf40SYehuda Sadeh 4418602adf40SYehuda Sadeh done: 4419602adf40SYehuda Sadeh mutex_unlock(&ctl_mutex); 4420aafb230eSAlex Elder 4421602adf40SYehuda Sadeh return ret; 4422602adf40SYehuda Sadeh } 4423602adf40SYehuda Sadeh 4424602adf40SYehuda Sadeh /* 4425602adf40SYehuda Sadeh * create control files in sysfs 4426dfc5606dSYehuda Sadeh * /sys/bus/rbd/... 
4427602adf40SYehuda Sadeh */ 4428602adf40SYehuda Sadeh static int rbd_sysfs_init(void) 4429602adf40SYehuda Sadeh { 4430dfc5606dSYehuda Sadeh int ret; 4431602adf40SYehuda Sadeh 4432fed4c143SAlex Elder ret = device_register(&rbd_root_dev); 4433dfc5606dSYehuda Sadeh if (ret < 0) 4434dfc5606dSYehuda Sadeh return ret; 4435602adf40SYehuda Sadeh 4436fed4c143SAlex Elder ret = bus_register(&rbd_bus_type); 4437fed4c143SAlex Elder if (ret < 0) 4438fed4c143SAlex Elder device_unregister(&rbd_root_dev); 4439602adf40SYehuda Sadeh 4440602adf40SYehuda Sadeh return ret; 4441602adf40SYehuda Sadeh } 4442602adf40SYehuda Sadeh 4443602adf40SYehuda Sadeh static void rbd_sysfs_cleanup(void) 4444602adf40SYehuda Sadeh { 4445dfc5606dSYehuda Sadeh bus_unregister(&rbd_bus_type); 4446fed4c143SAlex Elder device_unregister(&rbd_root_dev); 4447602adf40SYehuda Sadeh } 4448602adf40SYehuda Sadeh 4449cc344fa1SAlex Elder static int __init rbd_init(void) 4450602adf40SYehuda Sadeh { 4451602adf40SYehuda Sadeh int rc; 4452602adf40SYehuda Sadeh 44531e32d34cSAlex Elder if (!libceph_compatible(NULL)) { 44541e32d34cSAlex Elder rbd_warn(NULL, "libceph incompatibility (quitting)"); 44551e32d34cSAlex Elder 44561e32d34cSAlex Elder return -EINVAL; 44571e32d34cSAlex Elder } 4458602adf40SYehuda Sadeh rc = rbd_sysfs_init(); 4459602adf40SYehuda Sadeh if (rc) 4460602adf40SYehuda Sadeh return rc; 4461f0f8cef5SAlex Elder pr_info("loaded " RBD_DRV_NAME_LONG "\n"); 4462602adf40SYehuda Sadeh return 0; 4463602adf40SYehuda Sadeh } 4464602adf40SYehuda Sadeh 4465cc344fa1SAlex Elder static void __exit rbd_exit(void) 4466602adf40SYehuda Sadeh { 4467602adf40SYehuda Sadeh rbd_sysfs_cleanup(); 4468602adf40SYehuda Sadeh } 4469602adf40SYehuda Sadeh 4470602adf40SYehuda Sadeh module_init(rbd_init); 4471602adf40SYehuda Sadeh module_exit(rbd_exit); 4472602adf40SYehuda Sadeh 4473602adf40SYehuda Sadeh MODULE_AUTHOR("Sage Weil <sage@newdream.net>"); 4474602adf40SYehuda Sadeh MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>"); 
4475602adf40SYehuda Sadeh MODULE_DESCRIPTION("rados block device"); 4476602adf40SYehuda Sadeh 4477602adf40SYehuda Sadeh /* following authorship retained from original osdblk.c */ 4478602adf40SYehuda Sadeh MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>"); 4479602adf40SYehuda Sadeh 4480602adf40SYehuda Sadeh MODULE_LICENSE("GPL"); 4481