1602adf40SYehuda Sadeh /* 2602adf40SYehuda Sadeh rbd.c -- Export ceph rados objects as a Linux block device 3602adf40SYehuda Sadeh 4602adf40SYehuda Sadeh 5602adf40SYehuda Sadeh based on drivers/block/osdblk.c: 6602adf40SYehuda Sadeh 7602adf40SYehuda Sadeh Copyright 2009 Red Hat, Inc. 8602adf40SYehuda Sadeh 9602adf40SYehuda Sadeh This program is free software; you can redistribute it and/or modify 10602adf40SYehuda Sadeh it under the terms of the GNU General Public License as published by 11602adf40SYehuda Sadeh the Free Software Foundation. 12602adf40SYehuda Sadeh 13602adf40SYehuda Sadeh This program is distributed in the hope that it will be useful, 14602adf40SYehuda Sadeh but WITHOUT ANY WARRANTY; without even the implied warranty of 15602adf40SYehuda Sadeh MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16602adf40SYehuda Sadeh GNU General Public License for more details. 17602adf40SYehuda Sadeh 18602adf40SYehuda Sadeh You should have received a copy of the GNU General Public License 19602adf40SYehuda Sadeh along with this program; see the file COPYING. If not, write to 20602adf40SYehuda Sadeh the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. 
21602adf40SYehuda Sadeh 22602adf40SYehuda Sadeh 23602adf40SYehuda Sadeh 24dfc5606dSYehuda Sadeh For usage instructions, please refer to: 25602adf40SYehuda Sadeh 26dfc5606dSYehuda Sadeh Documentation/ABI/testing/sysfs-bus-rbd 27602adf40SYehuda Sadeh 28602adf40SYehuda Sadeh */ 29602adf40SYehuda Sadeh 30602adf40SYehuda Sadeh #include <linux/ceph/libceph.h> 31602adf40SYehuda Sadeh #include <linux/ceph/osd_client.h> 32602adf40SYehuda Sadeh #include <linux/ceph/mon_client.h> 33602adf40SYehuda Sadeh #include <linux/ceph/decode.h> 3459c2be1eSYehuda Sadeh #include <linux/parser.h> 35602adf40SYehuda Sadeh 36602adf40SYehuda Sadeh #include <linux/kernel.h> 37602adf40SYehuda Sadeh #include <linux/device.h> 38602adf40SYehuda Sadeh #include <linux/module.h> 39602adf40SYehuda Sadeh #include <linux/fs.h> 40602adf40SYehuda Sadeh #include <linux/blkdev.h> 41602adf40SYehuda Sadeh 42602adf40SYehuda Sadeh #include "rbd_types.h" 43602adf40SYehuda Sadeh 44aafb230eSAlex Elder #define RBD_DEBUG /* Activate rbd_assert() calls */ 45aafb230eSAlex Elder 46593a9e7bSAlex Elder /* 47593a9e7bSAlex Elder * The basic unit of block I/O is a sector. It is interpreted in a 48593a9e7bSAlex Elder * number of contexts in Linux (blk, bio, genhd), but the default is 49593a9e7bSAlex Elder * universally 512 bytes. These symbols are just slightly more 50593a9e7bSAlex Elder * meaningful than the bare numbers they represent. 
51593a9e7bSAlex Elder */ 52593a9e7bSAlex Elder #define SECTOR_SHIFT 9 53593a9e7bSAlex Elder #define SECTOR_SIZE (1ULL << SECTOR_SHIFT) 54593a9e7bSAlex Elder 552647ba38SAlex Elder /* It might be useful to have these defined elsewhere */ 56df111be6SAlex Elder 572647ba38SAlex Elder #define U8_MAX ((u8) (~0U)) 582647ba38SAlex Elder #define U16_MAX ((u16) (~0U)) 590ec8ce87SAlex Elder #define U32_MAX ((u32) (~0U)) 60df111be6SAlex Elder #define U64_MAX ((u64) (~0ULL)) 61df111be6SAlex Elder 62f0f8cef5SAlex Elder #define RBD_DRV_NAME "rbd" 63f0f8cef5SAlex Elder #define RBD_DRV_NAME_LONG "rbd (rados block device)" 64602adf40SYehuda Sadeh 65602adf40SYehuda Sadeh #define RBD_MINORS_PER_MAJOR 256 /* max minors per blkdev */ 66602adf40SYehuda Sadeh 67d4b125e9SAlex Elder #define RBD_SNAP_DEV_NAME_PREFIX "snap_" 68d4b125e9SAlex Elder #define RBD_MAX_SNAP_NAME_LEN \ 69d4b125e9SAlex Elder (NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1)) 70d4b125e9SAlex Elder 7135d489f9SAlex Elder #define RBD_MAX_SNAP_COUNT 510 /* allows max snapc to fit in 4KB */ 72602adf40SYehuda Sadeh 73602adf40SYehuda Sadeh #define RBD_SNAP_HEAD_NAME "-" 74602adf40SYehuda Sadeh 759e15b77dSAlex Elder /* This allows a single page to hold an image name sent by OSD */ 769e15b77dSAlex Elder #define RBD_IMAGE_NAME_LEN_MAX (PAGE_SIZE - sizeof (__le32) - 1) 77589d30e0SAlex Elder #define RBD_IMAGE_ID_LEN_MAX 64 789e15b77dSAlex Elder 791e130199SAlex Elder #define RBD_OBJ_PREFIX_LEN_MAX 64 80589d30e0SAlex Elder 81d889140cSAlex Elder /* Feature bits */ 82d889140cSAlex Elder 83d889140cSAlex Elder #define RBD_FEATURE_LAYERING 1 84d889140cSAlex Elder 85d889140cSAlex Elder /* Features supported by this (client software) implementation. */ 86d889140cSAlex Elder 87d889140cSAlex Elder #define RBD_FEATURES_ALL (0) 88d889140cSAlex Elder 8981a89793SAlex Elder /* 9081a89793SAlex Elder * An RBD device name will be "rbd#", where the "rbd" comes from 9181a89793SAlex Elder * RBD_DRV_NAME above, and # is a unique integer identifier. 
9281a89793SAlex Elder * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big 9381a89793SAlex Elder * enough to hold all possible device names. 9481a89793SAlex Elder */ 95602adf40SYehuda Sadeh #define DEV_NAME_LEN 32 9681a89793SAlex Elder #define MAX_INT_FORMAT_WIDTH ((5 * sizeof (int)) / 2 + 1) 97602adf40SYehuda Sadeh 98602adf40SYehuda Sadeh /* 99602adf40SYehuda Sadeh * block device image metadata (in-memory version) 100602adf40SYehuda Sadeh */ 101602adf40SYehuda Sadeh struct rbd_image_header { 102f84344f3SAlex Elder /* These four fields never change for a given rbd image */ 103849b4260SAlex Elder char *object_prefix; 10434b13184SAlex Elder u64 features; 105602adf40SYehuda Sadeh __u8 obj_order; 106602adf40SYehuda Sadeh __u8 crypt_type; 107602adf40SYehuda Sadeh __u8 comp_type; 108602adf40SYehuda Sadeh 109f84344f3SAlex Elder /* The remaining fields need to be updated occasionally */ 110f84344f3SAlex Elder u64 image_size; 111f84344f3SAlex Elder struct ceph_snap_context *snapc; 112602adf40SYehuda Sadeh char *snap_names; 113602adf40SYehuda Sadeh u64 *snap_sizes; 11459c2be1eSYehuda Sadeh 11559c2be1eSYehuda Sadeh u64 obj_version; 11659c2be1eSYehuda Sadeh }; 11759c2be1eSYehuda Sadeh 1180d7dbfceSAlex Elder /* 1190d7dbfceSAlex Elder * An rbd image specification. 1200d7dbfceSAlex Elder * 1210d7dbfceSAlex Elder * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely 122c66c6e0cSAlex Elder * identify an image. Each rbd_dev structure includes a pointer to 123c66c6e0cSAlex Elder * an rbd_spec structure that encapsulates this identity. 124c66c6e0cSAlex Elder * 125c66c6e0cSAlex Elder * Each of the id's in an rbd_spec has an associated name. For a 126c66c6e0cSAlex Elder * user-mapped image, the names are supplied and the id's associated 127c66c6e0cSAlex Elder * with them are looked up. For a layered image, a parent image is 128c66c6e0cSAlex Elder * defined by the tuple, and the names are looked up. 
129c66c6e0cSAlex Elder * 130c66c6e0cSAlex Elder * An rbd_dev structure contains a parent_spec pointer which is 131c66c6e0cSAlex Elder * non-null if the image it represents is a child in a layered 132c66c6e0cSAlex Elder * image. This pointer will refer to the rbd_spec structure used 133c66c6e0cSAlex Elder * by the parent rbd_dev for its own identity (i.e., the structure 134c66c6e0cSAlex Elder * is shared between the parent and child). 135c66c6e0cSAlex Elder * 136c66c6e0cSAlex Elder * Since these structures are populated once, during the discovery 137c66c6e0cSAlex Elder * phase of image construction, they are effectively immutable so 138c66c6e0cSAlex Elder * we make no effort to synchronize access to them. 139c66c6e0cSAlex Elder * 140c66c6e0cSAlex Elder * Note that code herein does not assume the image name is known (it 141c66c6e0cSAlex Elder * could be a null pointer). 1420d7dbfceSAlex Elder */ 1430d7dbfceSAlex Elder struct rbd_spec { 1440d7dbfceSAlex Elder u64 pool_id; 1450d7dbfceSAlex Elder char *pool_name; 1460d7dbfceSAlex Elder 1470d7dbfceSAlex Elder char *image_id; 1480d7dbfceSAlex Elder char *image_name; 1490d7dbfceSAlex Elder 1500d7dbfceSAlex Elder u64 snap_id; 1510d7dbfceSAlex Elder char *snap_name; 1520d7dbfceSAlex Elder 1530d7dbfceSAlex Elder struct kref kref; 1540d7dbfceSAlex Elder }; 1550d7dbfceSAlex Elder 156602adf40SYehuda Sadeh /* 157f0f8cef5SAlex Elder * an instance of the client. multiple devices may share an rbd client. 158602adf40SYehuda Sadeh */ 159602adf40SYehuda Sadeh struct rbd_client { 160602adf40SYehuda Sadeh struct ceph_client *client; 161602adf40SYehuda Sadeh struct kref kref; 162602adf40SYehuda Sadeh struct list_head node; 163602adf40SYehuda Sadeh }; 164602adf40SYehuda Sadeh 165bf0d5f50SAlex Elder struct rbd_img_request; 166bf0d5f50SAlex Elder typedef void (*rbd_img_callback_t)(struct rbd_img_request *); 167bf0d5f50SAlex Elder 168bf0d5f50SAlex Elder #define BAD_WHICH U32_MAX /* Good which or bad which, which? 
*/ 169bf0d5f50SAlex Elder 170bf0d5f50SAlex Elder struct rbd_obj_request; 171bf0d5f50SAlex Elder typedef void (*rbd_obj_callback_t)(struct rbd_obj_request *); 172bf0d5f50SAlex Elder 1739969ebc5SAlex Elder enum obj_request_type { 1749969ebc5SAlex Elder OBJ_REQUEST_NODATA, OBJ_REQUEST_BIO, OBJ_REQUEST_PAGES 1759969ebc5SAlex Elder }; 176bf0d5f50SAlex Elder 177bf0d5f50SAlex Elder struct rbd_obj_request { 178bf0d5f50SAlex Elder const char *object_name; 179bf0d5f50SAlex Elder u64 offset; /* object start byte */ 180bf0d5f50SAlex Elder u64 length; /* bytes from offset */ 181bf0d5f50SAlex Elder 182bf0d5f50SAlex Elder struct rbd_img_request *img_request; 183bf0d5f50SAlex Elder struct list_head links; /* img_request->obj_requests */ 184bf0d5f50SAlex Elder u32 which; /* posn image request list */ 185bf0d5f50SAlex Elder 186bf0d5f50SAlex Elder enum obj_request_type type; 187788e2df3SAlex Elder union { 188bf0d5f50SAlex Elder struct bio *bio_list; 189788e2df3SAlex Elder struct { 190788e2df3SAlex Elder struct page **pages; 191788e2df3SAlex Elder u32 page_count; 192788e2df3SAlex Elder }; 193788e2df3SAlex Elder }; 194bf0d5f50SAlex Elder 195bf0d5f50SAlex Elder struct ceph_osd_request *osd_req; 196bf0d5f50SAlex Elder 197bf0d5f50SAlex Elder u64 xferred; /* bytes transferred */ 198bf0d5f50SAlex Elder u64 version; 199bf0d5f50SAlex Elder s32 result; 200bf0d5f50SAlex Elder atomic_t done; 201bf0d5f50SAlex Elder 202bf0d5f50SAlex Elder rbd_obj_callback_t callback; 203788e2df3SAlex Elder struct completion completion; 204bf0d5f50SAlex Elder 205bf0d5f50SAlex Elder struct kref kref; 206bf0d5f50SAlex Elder }; 207bf0d5f50SAlex Elder 208bf0d5f50SAlex Elder struct rbd_img_request { 209bf0d5f50SAlex Elder struct request *rq; 210bf0d5f50SAlex Elder struct rbd_device *rbd_dev; 211bf0d5f50SAlex Elder u64 offset; /* starting image byte offset */ 212bf0d5f50SAlex Elder u64 length; /* byte count from offset */ 213bf0d5f50SAlex Elder bool write_request; /* false for read */ 214bf0d5f50SAlex Elder union { 
215bf0d5f50SAlex Elder struct ceph_snap_context *snapc; /* for writes */ 216bf0d5f50SAlex Elder u64 snap_id; /* for reads */ 217bf0d5f50SAlex Elder }; 218bf0d5f50SAlex Elder spinlock_t completion_lock;/* protects next_completion */ 219bf0d5f50SAlex Elder u32 next_completion; 220bf0d5f50SAlex Elder rbd_img_callback_t callback; 221bf0d5f50SAlex Elder 222bf0d5f50SAlex Elder u32 obj_request_count; 223bf0d5f50SAlex Elder struct list_head obj_requests; /* rbd_obj_request structs */ 224bf0d5f50SAlex Elder 225bf0d5f50SAlex Elder struct kref kref; 226bf0d5f50SAlex Elder }; 227bf0d5f50SAlex Elder 228bf0d5f50SAlex Elder #define for_each_obj_request(ireq, oreq) \ 229ef06f4d3SAlex Elder list_for_each_entry(oreq, &(ireq)->obj_requests, links) 230bf0d5f50SAlex Elder #define for_each_obj_request_from(ireq, oreq) \ 231ef06f4d3SAlex Elder list_for_each_entry_from(oreq, &(ireq)->obj_requests, links) 232bf0d5f50SAlex Elder #define for_each_obj_request_safe(ireq, oreq, n) \ 233ef06f4d3SAlex Elder list_for_each_entry_safe_reverse(oreq, n, &(ireq)->obj_requests, links) 234bf0d5f50SAlex Elder 235dfc5606dSYehuda Sadeh struct rbd_snap { 236dfc5606dSYehuda Sadeh struct device dev; 237dfc5606dSYehuda Sadeh const char *name; 2383591538fSJosh Durgin u64 size; 239dfc5606dSYehuda Sadeh struct list_head node; 240dfc5606dSYehuda Sadeh u64 id; 24134b13184SAlex Elder u64 features; 242dfc5606dSYehuda Sadeh }; 243dfc5606dSYehuda Sadeh 244f84344f3SAlex Elder struct rbd_mapping { 24599c1f08fSAlex Elder u64 size; 24634b13184SAlex Elder u64 features; 247f84344f3SAlex Elder bool read_only; 248f84344f3SAlex Elder }; 249f84344f3SAlex Elder 250602adf40SYehuda Sadeh /* 251602adf40SYehuda Sadeh * a single device 252602adf40SYehuda Sadeh */ 253602adf40SYehuda Sadeh struct rbd_device { 254de71a297SAlex Elder int dev_id; /* blkdev unique id */ 255602adf40SYehuda Sadeh 256602adf40SYehuda Sadeh int major; /* blkdev assigned major */ 257602adf40SYehuda Sadeh struct gendisk *disk; /* blkdev's gendisk and rq */ 
258602adf40SYehuda Sadeh 259a30b71b9SAlex Elder u32 image_format; /* Either 1 or 2 */ 260602adf40SYehuda Sadeh struct rbd_client *rbd_client; 261602adf40SYehuda Sadeh 262602adf40SYehuda Sadeh char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */ 263602adf40SYehuda Sadeh 264b82d167bSAlex Elder spinlock_t lock; /* queue, flags, open_count */ 265602adf40SYehuda Sadeh 266602adf40SYehuda Sadeh struct rbd_image_header header; 267b82d167bSAlex Elder unsigned long flags; /* possibly lock protected */ 2680d7dbfceSAlex Elder struct rbd_spec *spec; 269602adf40SYehuda Sadeh 2700d7dbfceSAlex Elder char *header_name; 271971f839aSAlex Elder 2720903e875SAlex Elder struct ceph_file_layout layout; 2730903e875SAlex Elder 27459c2be1eSYehuda Sadeh struct ceph_osd_event *watch_event; 275975241afSAlex Elder struct rbd_obj_request *watch_request; 27659c2be1eSYehuda Sadeh 27786b00e0dSAlex Elder struct rbd_spec *parent_spec; 27886b00e0dSAlex Elder u64 parent_overlap; 27986b00e0dSAlex Elder 280c666601aSJosh Durgin /* protects updating the header */ 281c666601aSJosh Durgin struct rw_semaphore header_rwsem; 282f84344f3SAlex Elder 283f84344f3SAlex Elder struct rbd_mapping mapping; 284602adf40SYehuda Sadeh 285602adf40SYehuda Sadeh struct list_head node; 286dfc5606dSYehuda Sadeh 287dfc5606dSYehuda Sadeh /* list of snapshots */ 288dfc5606dSYehuda Sadeh struct list_head snaps; 289dfc5606dSYehuda Sadeh 290dfc5606dSYehuda Sadeh /* sysfs related */ 291dfc5606dSYehuda Sadeh struct device dev; 292b82d167bSAlex Elder unsigned long open_count; /* protected by lock */ 293dfc5606dSYehuda Sadeh }; 294dfc5606dSYehuda Sadeh 295b82d167bSAlex Elder /* 296b82d167bSAlex Elder * Flag bits for rbd_dev->flags. If atomicity is required, 297b82d167bSAlex Elder * rbd_dev->lock is used to protect access. 298b82d167bSAlex Elder * 299b82d167bSAlex Elder * Currently, only the "removing" flag (which is coupled with the 300b82d167bSAlex Elder * "open_count" field) requires atomic access. 
301b82d167bSAlex Elder */ 3026d292906SAlex Elder enum rbd_dev_flags { 3036d292906SAlex Elder RBD_DEV_FLAG_EXISTS, /* mapped snapshot has not been deleted */ 304b82d167bSAlex Elder RBD_DEV_FLAG_REMOVING, /* this mapping is being removed */ 3056d292906SAlex Elder }; 3066d292906SAlex Elder 307602adf40SYehuda Sadeh static DEFINE_MUTEX(ctl_mutex); /* Serialize open/close/setup/teardown */ 308e124a82fSAlex Elder 309602adf40SYehuda Sadeh static LIST_HEAD(rbd_dev_list); /* devices */ 310e124a82fSAlex Elder static DEFINE_SPINLOCK(rbd_dev_list_lock); 311e124a82fSAlex Elder 312602adf40SYehuda Sadeh static LIST_HEAD(rbd_client_list); /* clients */ 313432b8587SAlex Elder static DEFINE_SPINLOCK(rbd_client_list_lock); 314602adf40SYehuda Sadeh 315304f6808SAlex Elder static int rbd_dev_snaps_update(struct rbd_device *rbd_dev); 316304f6808SAlex Elder static int rbd_dev_snaps_register(struct rbd_device *rbd_dev); 317304f6808SAlex Elder 318dfc5606dSYehuda Sadeh static void rbd_dev_release(struct device *dev); 31941f38c2bSAlex Elder static void rbd_remove_snap_dev(struct rbd_snap *snap); 320dfc5606dSYehuda Sadeh 321f0f8cef5SAlex Elder static ssize_t rbd_add(struct bus_type *bus, const char *buf, 322f0f8cef5SAlex Elder size_t count); 323f0f8cef5SAlex Elder static ssize_t rbd_remove(struct bus_type *bus, const char *buf, 324f0f8cef5SAlex Elder size_t count); 325f0f8cef5SAlex Elder 326f0f8cef5SAlex Elder static struct bus_attribute rbd_bus_attrs[] = { 327f0f8cef5SAlex Elder __ATTR(add, S_IWUSR, NULL, rbd_add), 328f0f8cef5SAlex Elder __ATTR(remove, S_IWUSR, NULL, rbd_remove), 329f0f8cef5SAlex Elder __ATTR_NULL 330f0f8cef5SAlex Elder }; 331f0f8cef5SAlex Elder 332f0f8cef5SAlex Elder static struct bus_type rbd_bus_type = { 333f0f8cef5SAlex Elder .name = "rbd", 334f0f8cef5SAlex Elder .bus_attrs = rbd_bus_attrs, 335f0f8cef5SAlex Elder }; 336f0f8cef5SAlex Elder 337f0f8cef5SAlex Elder static void rbd_root_dev_release(struct device *dev) 338f0f8cef5SAlex Elder { 339f0f8cef5SAlex Elder } 
340f0f8cef5SAlex Elder 341f0f8cef5SAlex Elder static struct device rbd_root_dev = { 342f0f8cef5SAlex Elder .init_name = "rbd", 343f0f8cef5SAlex Elder .release = rbd_root_dev_release, 344f0f8cef5SAlex Elder }; 345f0f8cef5SAlex Elder 34606ecc6cbSAlex Elder static __printf(2, 3) 34706ecc6cbSAlex Elder void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...) 34806ecc6cbSAlex Elder { 34906ecc6cbSAlex Elder struct va_format vaf; 35006ecc6cbSAlex Elder va_list args; 35106ecc6cbSAlex Elder 35206ecc6cbSAlex Elder va_start(args, fmt); 35306ecc6cbSAlex Elder vaf.fmt = fmt; 35406ecc6cbSAlex Elder vaf.va = &args; 35506ecc6cbSAlex Elder 35606ecc6cbSAlex Elder if (!rbd_dev) 35706ecc6cbSAlex Elder printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf); 35806ecc6cbSAlex Elder else if (rbd_dev->disk) 35906ecc6cbSAlex Elder printk(KERN_WARNING "%s: %s: %pV\n", 36006ecc6cbSAlex Elder RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf); 36106ecc6cbSAlex Elder else if (rbd_dev->spec && rbd_dev->spec->image_name) 36206ecc6cbSAlex Elder printk(KERN_WARNING "%s: image %s: %pV\n", 36306ecc6cbSAlex Elder RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf); 36406ecc6cbSAlex Elder else if (rbd_dev->spec && rbd_dev->spec->image_id) 36506ecc6cbSAlex Elder printk(KERN_WARNING "%s: id %s: %pV\n", 36606ecc6cbSAlex Elder RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf); 36706ecc6cbSAlex Elder else /* punt */ 36806ecc6cbSAlex Elder printk(KERN_WARNING "%s: rbd_dev %p: %pV\n", 36906ecc6cbSAlex Elder RBD_DRV_NAME, rbd_dev, &vaf); 37006ecc6cbSAlex Elder va_end(args); 37106ecc6cbSAlex Elder } 37206ecc6cbSAlex Elder 373aafb230eSAlex Elder #ifdef RBD_DEBUG 374aafb230eSAlex Elder #define rbd_assert(expr) \ 375aafb230eSAlex Elder if (unlikely(!(expr))) { \ 376aafb230eSAlex Elder printk(KERN_ERR "\nAssertion failure in %s() " \ 377aafb230eSAlex Elder "at line %d:\n\n" \ 378aafb230eSAlex Elder "\trbd_assert(%s);\n\n", \ 379aafb230eSAlex Elder __func__, __LINE__, #expr); \ 380aafb230eSAlex Elder BUG(); \ 381aafb230eSAlex 
Elder } 382aafb230eSAlex Elder #else /* !RBD_DEBUG */ 383aafb230eSAlex Elder # define rbd_assert(expr) ((void) 0) 384aafb230eSAlex Elder #endif /* !RBD_DEBUG */ 385dfc5606dSYehuda Sadeh 386117973fbSAlex Elder static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver); 387117973fbSAlex Elder static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver); 38859c2be1eSYehuda Sadeh 389602adf40SYehuda Sadeh static int rbd_open(struct block_device *bdev, fmode_t mode) 390602adf40SYehuda Sadeh { 391f0f8cef5SAlex Elder struct rbd_device *rbd_dev = bdev->bd_disk->private_data; 392b82d167bSAlex Elder bool removing = false; 393602adf40SYehuda Sadeh 394f84344f3SAlex Elder if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only) 395602adf40SYehuda Sadeh return -EROFS; 396602adf40SYehuda Sadeh 397a14ea269SAlex Elder spin_lock_irq(&rbd_dev->lock); 398b82d167bSAlex Elder if (test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags)) 399b82d167bSAlex Elder removing = true; 400b82d167bSAlex Elder else 401b82d167bSAlex Elder rbd_dev->open_count++; 402a14ea269SAlex Elder spin_unlock_irq(&rbd_dev->lock); 403b82d167bSAlex Elder if (removing) 404b82d167bSAlex Elder return -ENOENT; 405b82d167bSAlex Elder 40642382b70SAlex Elder mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); 407c3e946ceSAlex Elder (void) get_device(&rbd_dev->dev); 408f84344f3SAlex Elder set_device_ro(bdev, rbd_dev->mapping.read_only); 40942382b70SAlex Elder mutex_unlock(&ctl_mutex); 410340c7a2bSAlex Elder 411602adf40SYehuda Sadeh return 0; 412602adf40SYehuda Sadeh } 413602adf40SYehuda Sadeh 414dfc5606dSYehuda Sadeh static int rbd_release(struct gendisk *disk, fmode_t mode) 415dfc5606dSYehuda Sadeh { 416dfc5606dSYehuda Sadeh struct rbd_device *rbd_dev = disk->private_data; 417b82d167bSAlex Elder unsigned long open_count_before; 418b82d167bSAlex Elder 419a14ea269SAlex Elder spin_lock_irq(&rbd_dev->lock); 420b82d167bSAlex Elder open_count_before = rbd_dev->open_count--; 421a14ea269SAlex Elder 
spin_unlock_irq(&rbd_dev->lock); 422b82d167bSAlex Elder rbd_assert(open_count_before > 0); 423dfc5606dSYehuda Sadeh 42442382b70SAlex Elder mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); 425c3e946ceSAlex Elder put_device(&rbd_dev->dev); 42642382b70SAlex Elder mutex_unlock(&ctl_mutex); 427dfc5606dSYehuda Sadeh 428dfc5606dSYehuda Sadeh return 0; 429dfc5606dSYehuda Sadeh } 430dfc5606dSYehuda Sadeh 431602adf40SYehuda Sadeh static const struct block_device_operations rbd_bd_ops = { 432602adf40SYehuda Sadeh .owner = THIS_MODULE, 433602adf40SYehuda Sadeh .open = rbd_open, 434dfc5606dSYehuda Sadeh .release = rbd_release, 435602adf40SYehuda Sadeh }; 436602adf40SYehuda Sadeh 437602adf40SYehuda Sadeh /* 438602adf40SYehuda Sadeh * Initialize an rbd client instance. 43943ae4701SAlex Elder * We own *ceph_opts. 440602adf40SYehuda Sadeh */ 441f8c38929SAlex Elder static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts) 442602adf40SYehuda Sadeh { 443602adf40SYehuda Sadeh struct rbd_client *rbdc; 444602adf40SYehuda Sadeh int ret = -ENOMEM; 445602adf40SYehuda Sadeh 446602adf40SYehuda Sadeh dout("rbd_client_create\n"); 447602adf40SYehuda Sadeh rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL); 448602adf40SYehuda Sadeh if (!rbdc) 449602adf40SYehuda Sadeh goto out_opt; 450602adf40SYehuda Sadeh 451602adf40SYehuda Sadeh kref_init(&rbdc->kref); 452602adf40SYehuda Sadeh INIT_LIST_HEAD(&rbdc->node); 453602adf40SYehuda Sadeh 454bc534d86SAlex Elder mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); 455bc534d86SAlex Elder 45643ae4701SAlex Elder rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0); 457602adf40SYehuda Sadeh if (IS_ERR(rbdc->client)) 458bc534d86SAlex Elder goto out_mutex; 45943ae4701SAlex Elder ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */ 460602adf40SYehuda Sadeh 461602adf40SYehuda Sadeh ret = ceph_open_session(rbdc->client); 462602adf40SYehuda Sadeh if (ret < 0) 463602adf40SYehuda Sadeh goto out_err; 464602adf40SYehuda 
Sadeh 465432b8587SAlex Elder spin_lock(&rbd_client_list_lock); 466602adf40SYehuda Sadeh list_add_tail(&rbdc->node, &rbd_client_list); 467432b8587SAlex Elder spin_unlock(&rbd_client_list_lock); 468602adf40SYehuda Sadeh 469bc534d86SAlex Elder mutex_unlock(&ctl_mutex); 470bc534d86SAlex Elder 471602adf40SYehuda Sadeh dout("rbd_client_create created %p\n", rbdc); 472602adf40SYehuda Sadeh return rbdc; 473602adf40SYehuda Sadeh 474602adf40SYehuda Sadeh out_err: 475602adf40SYehuda Sadeh ceph_destroy_client(rbdc->client); 476bc534d86SAlex Elder out_mutex: 477bc534d86SAlex Elder mutex_unlock(&ctl_mutex); 478602adf40SYehuda Sadeh kfree(rbdc); 479602adf40SYehuda Sadeh out_opt: 48043ae4701SAlex Elder if (ceph_opts) 48143ae4701SAlex Elder ceph_destroy_options(ceph_opts); 48228f259b7SVasiliy Kulikov return ERR_PTR(ret); 483602adf40SYehuda Sadeh } 484602adf40SYehuda Sadeh 485602adf40SYehuda Sadeh /* 4861f7ba331SAlex Elder * Find a ceph client with specific addr and configuration. If 4871f7ba331SAlex Elder * found, bump its reference count. 488602adf40SYehuda Sadeh */ 4891f7ba331SAlex Elder static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts) 490602adf40SYehuda Sadeh { 491602adf40SYehuda Sadeh struct rbd_client *client_node; 4921f7ba331SAlex Elder bool found = false; 493602adf40SYehuda Sadeh 49443ae4701SAlex Elder if (ceph_opts->flags & CEPH_OPT_NOSHARE) 495602adf40SYehuda Sadeh return NULL; 496602adf40SYehuda Sadeh 4971f7ba331SAlex Elder spin_lock(&rbd_client_list_lock); 4981f7ba331SAlex Elder list_for_each_entry(client_node, &rbd_client_list, node) { 4991f7ba331SAlex Elder if (!ceph_compare_options(ceph_opts, client_node->client)) { 5001f7ba331SAlex Elder kref_get(&client_node->kref); 5011f7ba331SAlex Elder found = true; 5021f7ba331SAlex Elder break; 5031f7ba331SAlex Elder } 5041f7ba331SAlex Elder } 5051f7ba331SAlex Elder spin_unlock(&rbd_client_list_lock); 5061f7ba331SAlex Elder 5071f7ba331SAlex Elder return found ? 
client_node : NULL; 508602adf40SYehuda Sadeh } 509602adf40SYehuda Sadeh 510602adf40SYehuda Sadeh /* 51159c2be1eSYehuda Sadeh * mount options 51259c2be1eSYehuda Sadeh */ 51359c2be1eSYehuda Sadeh enum { 51459c2be1eSYehuda Sadeh Opt_last_int, 51559c2be1eSYehuda Sadeh /* int args above */ 51659c2be1eSYehuda Sadeh Opt_last_string, 51759c2be1eSYehuda Sadeh /* string args above */ 518cc0538b6SAlex Elder Opt_read_only, 519cc0538b6SAlex Elder Opt_read_write, 520cc0538b6SAlex Elder /* Boolean args above */ 521cc0538b6SAlex Elder Opt_last_bool, 52259c2be1eSYehuda Sadeh }; 52359c2be1eSYehuda Sadeh 52443ae4701SAlex Elder static match_table_t rbd_opts_tokens = { 52559c2be1eSYehuda Sadeh /* int args above */ 52659c2be1eSYehuda Sadeh /* string args above */ 527be466c1cSAlex Elder {Opt_read_only, "read_only"}, 528cc0538b6SAlex Elder {Opt_read_only, "ro"}, /* Alternate spelling */ 529cc0538b6SAlex Elder {Opt_read_write, "read_write"}, 530cc0538b6SAlex Elder {Opt_read_write, "rw"}, /* Alternate spelling */ 531cc0538b6SAlex Elder /* Boolean args above */ 53259c2be1eSYehuda Sadeh {-1, NULL} 53359c2be1eSYehuda Sadeh }; 53459c2be1eSYehuda Sadeh 53598571b5aSAlex Elder struct rbd_options { 53698571b5aSAlex Elder bool read_only; 53798571b5aSAlex Elder }; 53898571b5aSAlex Elder 53998571b5aSAlex Elder #define RBD_READ_ONLY_DEFAULT false 54098571b5aSAlex Elder 54159c2be1eSYehuda Sadeh static int parse_rbd_opts_token(char *c, void *private) 54259c2be1eSYehuda Sadeh { 54343ae4701SAlex Elder struct rbd_options *rbd_opts = private; 54459c2be1eSYehuda Sadeh substring_t argstr[MAX_OPT_ARGS]; 54559c2be1eSYehuda Sadeh int token, intval, ret; 54659c2be1eSYehuda Sadeh 54743ae4701SAlex Elder token = match_token(c, rbd_opts_tokens, argstr); 54859c2be1eSYehuda Sadeh if (token < 0) 54959c2be1eSYehuda Sadeh return -EINVAL; 55059c2be1eSYehuda Sadeh 55159c2be1eSYehuda Sadeh if (token < Opt_last_int) { 55259c2be1eSYehuda Sadeh ret = match_int(&argstr[0], &intval); 55359c2be1eSYehuda Sadeh if (ret < 0) { 
55459c2be1eSYehuda Sadeh pr_err("bad mount option arg (not int) " 55559c2be1eSYehuda Sadeh "at '%s'\n", c); 55659c2be1eSYehuda Sadeh return ret; 55759c2be1eSYehuda Sadeh } 55859c2be1eSYehuda Sadeh dout("got int token %d val %d\n", token, intval); 55959c2be1eSYehuda Sadeh } else if (token > Opt_last_int && token < Opt_last_string) { 56059c2be1eSYehuda Sadeh dout("got string token %d val %s\n", token, 56159c2be1eSYehuda Sadeh argstr[0].from); 562cc0538b6SAlex Elder } else if (token > Opt_last_string && token < Opt_last_bool) { 563cc0538b6SAlex Elder dout("got Boolean token %d\n", token); 56459c2be1eSYehuda Sadeh } else { 56559c2be1eSYehuda Sadeh dout("got token %d\n", token); 56659c2be1eSYehuda Sadeh } 56759c2be1eSYehuda Sadeh 56859c2be1eSYehuda Sadeh switch (token) { 569cc0538b6SAlex Elder case Opt_read_only: 570cc0538b6SAlex Elder rbd_opts->read_only = true; 571cc0538b6SAlex Elder break; 572cc0538b6SAlex Elder case Opt_read_write: 573cc0538b6SAlex Elder rbd_opts->read_only = false; 574cc0538b6SAlex Elder break; 57559c2be1eSYehuda Sadeh default: 576aafb230eSAlex Elder rbd_assert(false); 577aafb230eSAlex Elder break; 57859c2be1eSYehuda Sadeh } 57959c2be1eSYehuda Sadeh return 0; 58059c2be1eSYehuda Sadeh } 58159c2be1eSYehuda Sadeh 58259c2be1eSYehuda Sadeh /* 583602adf40SYehuda Sadeh * Get a ceph client with specific addr and configuration, if one does 584602adf40SYehuda Sadeh * not exist create it. 
585602adf40SYehuda Sadeh */ 5869d3997fdSAlex Elder static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts) 587602adf40SYehuda Sadeh { 588f8c38929SAlex Elder struct rbd_client *rbdc; 58959c2be1eSYehuda Sadeh 5901f7ba331SAlex Elder rbdc = rbd_client_find(ceph_opts); 5919d3997fdSAlex Elder if (rbdc) /* using an existing client */ 59243ae4701SAlex Elder ceph_destroy_options(ceph_opts); 5939d3997fdSAlex Elder else 594f8c38929SAlex Elder rbdc = rbd_client_create(ceph_opts); 595d720bcb0SAlex Elder 5969d3997fdSAlex Elder return rbdc; 597602adf40SYehuda Sadeh } 598602adf40SYehuda Sadeh 599602adf40SYehuda Sadeh /* 600602adf40SYehuda Sadeh * Destroy ceph client 601d23a4b3fSAlex Elder * 602432b8587SAlex Elder * Caller must hold rbd_client_list_lock. 603602adf40SYehuda Sadeh */ 604602adf40SYehuda Sadeh static void rbd_client_release(struct kref *kref) 605602adf40SYehuda Sadeh { 606602adf40SYehuda Sadeh struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref); 607602adf40SYehuda Sadeh 608602adf40SYehuda Sadeh dout("rbd_release_client %p\n", rbdc); 609cd9d9f5dSAlex Elder spin_lock(&rbd_client_list_lock); 610602adf40SYehuda Sadeh list_del(&rbdc->node); 611cd9d9f5dSAlex Elder spin_unlock(&rbd_client_list_lock); 612602adf40SYehuda Sadeh 613602adf40SYehuda Sadeh ceph_destroy_client(rbdc->client); 614602adf40SYehuda Sadeh kfree(rbdc); 615602adf40SYehuda Sadeh } 616602adf40SYehuda Sadeh 617602adf40SYehuda Sadeh /* 618602adf40SYehuda Sadeh * Drop reference to ceph client node. If it's not referenced anymore, release 619602adf40SYehuda Sadeh * it. 
620602adf40SYehuda Sadeh */ 6219d3997fdSAlex Elder static void rbd_put_client(struct rbd_client *rbdc) 622602adf40SYehuda Sadeh { 623c53d5893SAlex Elder if (rbdc) 6249d3997fdSAlex Elder kref_put(&rbdc->kref, rbd_client_release); 625602adf40SYehuda Sadeh } 626602adf40SYehuda Sadeh 627a30b71b9SAlex Elder static bool rbd_image_format_valid(u32 image_format) 628a30b71b9SAlex Elder { 629a30b71b9SAlex Elder return image_format == 1 || image_format == 2; 630a30b71b9SAlex Elder } 631a30b71b9SAlex Elder 6328e94af8eSAlex Elder static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk) 6338e94af8eSAlex Elder { 634103a150fSAlex Elder size_t size; 635103a150fSAlex Elder u32 snap_count; 636103a150fSAlex Elder 637103a150fSAlex Elder /* The header has to start with the magic rbd header text */ 638103a150fSAlex Elder if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT))) 639103a150fSAlex Elder return false; 640103a150fSAlex Elder 641db2388b6SAlex Elder /* The bio layer requires at least sector-sized I/O */ 642db2388b6SAlex Elder 643db2388b6SAlex Elder if (ondisk->options.order < SECTOR_SHIFT) 644db2388b6SAlex Elder return false; 645db2388b6SAlex Elder 646db2388b6SAlex Elder /* If we use u64 in a few spots we may be able to loosen this */ 647db2388b6SAlex Elder 648db2388b6SAlex Elder if (ondisk->options.order > 8 * sizeof (int) - 1) 649db2388b6SAlex Elder return false; 650db2388b6SAlex Elder 651103a150fSAlex Elder /* 652103a150fSAlex Elder * The size of a snapshot header has to fit in a size_t, and 653103a150fSAlex Elder * that limits the number of snapshots. 
654103a150fSAlex Elder */ 655103a150fSAlex Elder snap_count = le32_to_cpu(ondisk->snap_count); 656103a150fSAlex Elder size = SIZE_MAX - sizeof (struct ceph_snap_context); 657103a150fSAlex Elder if (snap_count > size / sizeof (__le64)) 658103a150fSAlex Elder return false; 659103a150fSAlex Elder 660103a150fSAlex Elder /* 661103a150fSAlex Elder * Not only that, but the size of the entire the snapshot 662103a150fSAlex Elder * header must also be representable in a size_t. 663103a150fSAlex Elder */ 664103a150fSAlex Elder size -= snap_count * sizeof (__le64); 665103a150fSAlex Elder if ((u64) size < le64_to_cpu(ondisk->snap_names_len)) 666103a150fSAlex Elder return false; 667103a150fSAlex Elder 668103a150fSAlex Elder return true; 6698e94af8eSAlex Elder } 6708e94af8eSAlex Elder 671602adf40SYehuda Sadeh /* 672602adf40SYehuda Sadeh * Create a new header structure, translate header format from the on-disk 673602adf40SYehuda Sadeh * header. 674602adf40SYehuda Sadeh */ 675602adf40SYehuda Sadeh static int rbd_header_from_disk(struct rbd_image_header *header, 6764156d998SAlex Elder struct rbd_image_header_ondisk *ondisk) 677602adf40SYehuda Sadeh { 678ccece235SAlex Elder u32 snap_count; 67958c17b0eSAlex Elder size_t len; 680d2bb24e5SAlex Elder size_t size; 681621901d6SAlex Elder u32 i; 682602adf40SYehuda Sadeh 6836a52325fSAlex Elder memset(header, 0, sizeof (*header)); 6846a52325fSAlex Elder 685103a150fSAlex Elder snap_count = le32_to_cpu(ondisk->snap_count); 686103a150fSAlex Elder 68758c17b0eSAlex Elder len = strnlen(ondisk->object_prefix, sizeof (ondisk->object_prefix)); 68858c17b0eSAlex Elder header->object_prefix = kmalloc(len + 1, GFP_KERNEL); 6896a52325fSAlex Elder if (!header->object_prefix) 690602adf40SYehuda Sadeh return -ENOMEM; 69158c17b0eSAlex Elder memcpy(header->object_prefix, ondisk->object_prefix, len); 69258c17b0eSAlex Elder header->object_prefix[len] = '\0'; 69300f1f36fSAlex Elder 694602adf40SYehuda Sadeh if (snap_count) { 695f785cc1dSAlex Elder u64 
snap_names_len = le64_to_cpu(ondisk->snap_names_len); 696f785cc1dSAlex Elder 697621901d6SAlex Elder /* Save a copy of the snapshot names */ 698621901d6SAlex Elder 699f785cc1dSAlex Elder if (snap_names_len > (u64) SIZE_MAX) 700f785cc1dSAlex Elder return -EIO; 701f785cc1dSAlex Elder header->snap_names = kmalloc(snap_names_len, GFP_KERNEL); 702602adf40SYehuda Sadeh if (!header->snap_names) 7036a52325fSAlex Elder goto out_err; 704f785cc1dSAlex Elder /* 705f785cc1dSAlex Elder * Note that rbd_dev_v1_header_read() guarantees 706f785cc1dSAlex Elder * the ondisk buffer we're working with has 707f785cc1dSAlex Elder * snap_names_len bytes beyond the end of the 708f785cc1dSAlex Elder * snapshot id array, this memcpy() is safe. 709f785cc1dSAlex Elder */ 710f785cc1dSAlex Elder memcpy(header->snap_names, &ondisk->snaps[snap_count], 711f785cc1dSAlex Elder snap_names_len); 7126a52325fSAlex Elder 713621901d6SAlex Elder /* Record each snapshot's size */ 714621901d6SAlex Elder 715d2bb24e5SAlex Elder size = snap_count * sizeof (*header->snap_sizes); 716d2bb24e5SAlex Elder header->snap_sizes = kmalloc(size, GFP_KERNEL); 717602adf40SYehuda Sadeh if (!header->snap_sizes) 7186a52325fSAlex Elder goto out_err; 719621901d6SAlex Elder for (i = 0; i < snap_count; i++) 720621901d6SAlex Elder header->snap_sizes[i] = 721621901d6SAlex Elder le64_to_cpu(ondisk->snaps[i].image_size); 722602adf40SYehuda Sadeh } else { 723ccece235SAlex Elder WARN_ON(ondisk->snap_names_len); 724602adf40SYehuda Sadeh header->snap_names = NULL; 725602adf40SYehuda Sadeh header->snap_sizes = NULL; 726602adf40SYehuda Sadeh } 727849b4260SAlex Elder 72834b13184SAlex Elder header->features = 0; /* No features support in v1 images */ 729602adf40SYehuda Sadeh header->obj_order = ondisk->options.order; 730602adf40SYehuda Sadeh header->crypt_type = ondisk->options.crypt_type; 731602adf40SYehuda Sadeh header->comp_type = ondisk->options.comp_type; 7326a52325fSAlex Elder 733621901d6SAlex Elder /* Allocate and fill in the snapshot 
context */ 734621901d6SAlex Elder 735f84344f3SAlex Elder header->image_size = le64_to_cpu(ondisk->image_size); 7366a52325fSAlex Elder size = sizeof (struct ceph_snap_context); 7376a52325fSAlex Elder size += snap_count * sizeof (header->snapc->snaps[0]); 7386a52325fSAlex Elder header->snapc = kzalloc(size, GFP_KERNEL); 7396a52325fSAlex Elder if (!header->snapc) 7406a52325fSAlex Elder goto out_err; 741602adf40SYehuda Sadeh 742602adf40SYehuda Sadeh atomic_set(&header->snapc->nref, 1); 743505cbb9bSAlex Elder header->snapc->seq = le64_to_cpu(ondisk->snap_seq); 744602adf40SYehuda Sadeh header->snapc->num_snaps = snap_count; 745621901d6SAlex Elder for (i = 0; i < snap_count; i++) 746602adf40SYehuda Sadeh header->snapc->snaps[i] = 747602adf40SYehuda Sadeh le64_to_cpu(ondisk->snaps[i].id); 748602adf40SYehuda Sadeh 749602adf40SYehuda Sadeh return 0; 750602adf40SYehuda Sadeh 7516a52325fSAlex Elder out_err: 752849b4260SAlex Elder kfree(header->snap_sizes); 753ccece235SAlex Elder header->snap_sizes = NULL; 754602adf40SYehuda Sadeh kfree(header->snap_names); 755ccece235SAlex Elder header->snap_names = NULL; 7566a52325fSAlex Elder kfree(header->object_prefix); 7576a52325fSAlex Elder header->object_prefix = NULL; 758ccece235SAlex Elder 75900f1f36fSAlex Elder return -ENOMEM; 760602adf40SYehuda Sadeh } 761602adf40SYehuda Sadeh 7629e15b77dSAlex Elder static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id) 7639e15b77dSAlex Elder { 7649e15b77dSAlex Elder struct rbd_snap *snap; 7659e15b77dSAlex Elder 7669e15b77dSAlex Elder if (snap_id == CEPH_NOSNAP) 7679e15b77dSAlex Elder return RBD_SNAP_HEAD_NAME; 7689e15b77dSAlex Elder 7699e15b77dSAlex Elder list_for_each_entry(snap, &rbd_dev->snaps, node) 7709e15b77dSAlex Elder if (snap_id == snap->id) 7719e15b77dSAlex Elder return snap->name; 7729e15b77dSAlex Elder 7739e15b77dSAlex Elder return NULL; 7749e15b77dSAlex Elder } 7759e15b77dSAlex Elder 7768836b995SAlex Elder static int snap_by_name(struct rbd_device *rbd_dev, const 
char *snap_name) 777602adf40SYehuda Sadeh { 778602adf40SYehuda Sadeh 779e86924a8SAlex Elder struct rbd_snap *snap; 78000f1f36fSAlex Elder 781e86924a8SAlex Elder list_for_each_entry(snap, &rbd_dev->snaps, node) { 782e86924a8SAlex Elder if (!strcmp(snap_name, snap->name)) { 7830d7dbfceSAlex Elder rbd_dev->spec->snap_id = snap->id; 784e86924a8SAlex Elder rbd_dev->mapping.size = snap->size; 78534b13184SAlex Elder rbd_dev->mapping.features = snap->features; 78600f1f36fSAlex Elder 787e86924a8SAlex Elder return 0; 788602adf40SYehuda Sadeh } 78900f1f36fSAlex Elder } 790e86924a8SAlex Elder 79100f1f36fSAlex Elder return -ENOENT; 79200f1f36fSAlex Elder } 793602adf40SYehuda Sadeh 794819d52bfSAlex Elder static int rbd_dev_set_mapping(struct rbd_device *rbd_dev) 795602adf40SYehuda Sadeh { 79678dc447dSAlex Elder int ret; 797602adf40SYehuda Sadeh 7980d7dbfceSAlex Elder if (!memcmp(rbd_dev->spec->snap_name, RBD_SNAP_HEAD_NAME, 799cc9d734cSJosh Durgin sizeof (RBD_SNAP_HEAD_NAME))) { 8000d7dbfceSAlex Elder rbd_dev->spec->snap_id = CEPH_NOSNAP; 80199c1f08fSAlex Elder rbd_dev->mapping.size = rbd_dev->header.image_size; 80234b13184SAlex Elder rbd_dev->mapping.features = rbd_dev->header.features; 803e86924a8SAlex Elder ret = 0; 804602adf40SYehuda Sadeh } else { 8050d7dbfceSAlex Elder ret = snap_by_name(rbd_dev, rbd_dev->spec->snap_name); 806602adf40SYehuda Sadeh if (ret < 0) 807602adf40SYehuda Sadeh goto done; 808f84344f3SAlex Elder rbd_dev->mapping.read_only = true; 809602adf40SYehuda Sadeh } 8106d292906SAlex Elder set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags); 8116d292906SAlex Elder 812602adf40SYehuda Sadeh done: 813602adf40SYehuda Sadeh return ret; 814602adf40SYehuda Sadeh } 815602adf40SYehuda Sadeh 816602adf40SYehuda Sadeh static void rbd_header_free(struct rbd_image_header *header) 817602adf40SYehuda Sadeh { 818849b4260SAlex Elder kfree(header->object_prefix); 819d78fd7aeSAlex Elder header->object_prefix = NULL; 820602adf40SYehuda Sadeh kfree(header->snap_sizes); 821d78fd7aeSAlex 
Elder header->snap_sizes = NULL; 822849b4260SAlex Elder kfree(header->snap_names); 823d78fd7aeSAlex Elder header->snap_names = NULL; 824d1d25646SJosh Durgin ceph_put_snap_context(header->snapc); 825d78fd7aeSAlex Elder header->snapc = NULL; 826602adf40SYehuda Sadeh } 827602adf40SYehuda Sadeh 82898571b5aSAlex Elder static const char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset) 829602adf40SYehuda Sadeh { 83065ccfe21SAlex Elder char *name; 83165ccfe21SAlex Elder u64 segment; 83265ccfe21SAlex Elder int ret; 833602adf40SYehuda Sadeh 8342fd82b9eSAlex Elder name = kmalloc(MAX_OBJ_NAME_SIZE + 1, GFP_NOIO); 83565ccfe21SAlex Elder if (!name) 83665ccfe21SAlex Elder return NULL; 83765ccfe21SAlex Elder segment = offset >> rbd_dev->header.obj_order; 8382fd82b9eSAlex Elder ret = snprintf(name, MAX_OBJ_NAME_SIZE + 1, "%s.%012llx", 83965ccfe21SAlex Elder rbd_dev->header.object_prefix, segment); 8402fd82b9eSAlex Elder if (ret < 0 || ret > MAX_OBJ_NAME_SIZE) { 84165ccfe21SAlex Elder pr_err("error formatting segment name for #%llu (%d)\n", 84265ccfe21SAlex Elder segment, ret); 84365ccfe21SAlex Elder kfree(name); 84465ccfe21SAlex Elder name = NULL; 84565ccfe21SAlex Elder } 846602adf40SYehuda Sadeh 84765ccfe21SAlex Elder return name; 84865ccfe21SAlex Elder } 849602adf40SYehuda Sadeh 85065ccfe21SAlex Elder static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset) 85165ccfe21SAlex Elder { 85265ccfe21SAlex Elder u64 segment_size = (u64) 1 << rbd_dev->header.obj_order; 853602adf40SYehuda Sadeh 85465ccfe21SAlex Elder return offset & (segment_size - 1); 85565ccfe21SAlex Elder } 85665ccfe21SAlex Elder 85765ccfe21SAlex Elder static u64 rbd_segment_length(struct rbd_device *rbd_dev, 85865ccfe21SAlex Elder u64 offset, u64 length) 85965ccfe21SAlex Elder { 86065ccfe21SAlex Elder u64 segment_size = (u64) 1 << rbd_dev->header.obj_order; 86165ccfe21SAlex Elder 86265ccfe21SAlex Elder offset &= segment_size - 1; 86365ccfe21SAlex Elder 864aafb230eSAlex Elder rbd_assert(length <= 
U64_MAX - offset); 86565ccfe21SAlex Elder if (offset + length > segment_size) 86665ccfe21SAlex Elder length = segment_size - offset; 86765ccfe21SAlex Elder 86865ccfe21SAlex Elder return length; 869602adf40SYehuda Sadeh } 870602adf40SYehuda Sadeh 871602adf40SYehuda Sadeh /* 872029bcbd8SJosh Durgin * returns the size of an object in the image 873029bcbd8SJosh Durgin */ 874029bcbd8SJosh Durgin static u64 rbd_obj_bytes(struct rbd_image_header *header) 875029bcbd8SJosh Durgin { 876029bcbd8SJosh Durgin return 1 << header->obj_order; 877029bcbd8SJosh Durgin } 878029bcbd8SJosh Durgin 879029bcbd8SJosh Durgin /* 880602adf40SYehuda Sadeh * bio helpers 881602adf40SYehuda Sadeh */ 882602adf40SYehuda Sadeh 883602adf40SYehuda Sadeh static void bio_chain_put(struct bio *chain) 884602adf40SYehuda Sadeh { 885602adf40SYehuda Sadeh struct bio *tmp; 886602adf40SYehuda Sadeh 887602adf40SYehuda Sadeh while (chain) { 888602adf40SYehuda Sadeh tmp = chain; 889602adf40SYehuda Sadeh chain = chain->bi_next; 890602adf40SYehuda Sadeh bio_put(tmp); 891602adf40SYehuda Sadeh } 892602adf40SYehuda Sadeh } 893602adf40SYehuda Sadeh 894602adf40SYehuda Sadeh /* 895602adf40SYehuda Sadeh * zeros a bio chain, starting at specific offset 896602adf40SYehuda Sadeh */ 897602adf40SYehuda Sadeh static void zero_bio_chain(struct bio *chain, int start_ofs) 898602adf40SYehuda Sadeh { 899602adf40SYehuda Sadeh struct bio_vec *bv; 900602adf40SYehuda Sadeh unsigned long flags; 901602adf40SYehuda Sadeh void *buf; 902602adf40SYehuda Sadeh int i; 903602adf40SYehuda Sadeh int pos = 0; 904602adf40SYehuda Sadeh 905602adf40SYehuda Sadeh while (chain) { 906602adf40SYehuda Sadeh bio_for_each_segment(bv, chain, i) { 907602adf40SYehuda Sadeh if (pos + bv->bv_len > start_ofs) { 908602adf40SYehuda Sadeh int remainder = max(start_ofs - pos, 0); 909602adf40SYehuda Sadeh buf = bvec_kmap_irq(bv, &flags); 910602adf40SYehuda Sadeh memset(buf + remainder, 0, 911602adf40SYehuda Sadeh bv->bv_len - remainder); 91285b5aaa6SDan Carpenter 
bvec_kunmap_irq(buf, &flags); 913602adf40SYehuda Sadeh } 914602adf40SYehuda Sadeh pos += bv->bv_len; 915602adf40SYehuda Sadeh } 916602adf40SYehuda Sadeh 917602adf40SYehuda Sadeh chain = chain->bi_next; 918602adf40SYehuda Sadeh } 919602adf40SYehuda Sadeh } 920602adf40SYehuda Sadeh 921602adf40SYehuda Sadeh /* 922f7760dadSAlex Elder * Clone a portion of a bio, starting at the given byte offset 923f7760dadSAlex Elder * and continuing for the number of bytes indicated. 924602adf40SYehuda Sadeh */ 925f7760dadSAlex Elder static struct bio *bio_clone_range(struct bio *bio_src, 926f7760dadSAlex Elder unsigned int offset, 927f7760dadSAlex Elder unsigned int len, 928f7760dadSAlex Elder gfp_t gfpmask) 929602adf40SYehuda Sadeh { 930f7760dadSAlex Elder struct bio_vec *bv; 931f7760dadSAlex Elder unsigned int resid; 932f7760dadSAlex Elder unsigned short idx; 933f7760dadSAlex Elder unsigned int voff; 934f7760dadSAlex Elder unsigned short end_idx; 935f7760dadSAlex Elder unsigned short vcnt; 936f7760dadSAlex Elder struct bio *bio; 937602adf40SYehuda Sadeh 938f7760dadSAlex Elder /* Handle the easy case for the caller */ 939f7760dadSAlex Elder 940f7760dadSAlex Elder if (!offset && len == bio_src->bi_size) 941f7760dadSAlex Elder return bio_clone(bio_src, gfpmask); 942f7760dadSAlex Elder 943f7760dadSAlex Elder if (WARN_ON_ONCE(!len)) 944f7760dadSAlex Elder return NULL; 945f7760dadSAlex Elder if (WARN_ON_ONCE(len > bio_src->bi_size)) 946f7760dadSAlex Elder return NULL; 947f7760dadSAlex Elder if (WARN_ON_ONCE(offset > bio_src->bi_size - len)) 948f7760dadSAlex Elder return NULL; 949f7760dadSAlex Elder 950f7760dadSAlex Elder /* Find first affected segment... 
*/ 951f7760dadSAlex Elder 952f7760dadSAlex Elder resid = offset; 953f7760dadSAlex Elder __bio_for_each_segment(bv, bio_src, idx, 0) { 954f7760dadSAlex Elder if (resid < bv->bv_len) 955f7760dadSAlex Elder break; 956f7760dadSAlex Elder resid -= bv->bv_len; 957602adf40SYehuda Sadeh } 958f7760dadSAlex Elder voff = resid; 959602adf40SYehuda Sadeh 960f7760dadSAlex Elder /* ...and the last affected segment */ 961542582fcSAlex Elder 962f7760dadSAlex Elder resid += len; 963f7760dadSAlex Elder __bio_for_each_segment(bv, bio_src, end_idx, idx) { 964f7760dadSAlex Elder if (resid <= bv->bv_len) 965f7760dadSAlex Elder break; 966f7760dadSAlex Elder resid -= bv->bv_len; 967f7760dadSAlex Elder } 968f7760dadSAlex Elder vcnt = end_idx - idx + 1; 969602adf40SYehuda Sadeh 970f7760dadSAlex Elder /* Build the clone */ 971f7760dadSAlex Elder 972f7760dadSAlex Elder bio = bio_alloc(gfpmask, (unsigned int) vcnt); 973f7760dadSAlex Elder if (!bio) 974f7760dadSAlex Elder return NULL; /* ENOMEM */ 975f7760dadSAlex Elder 976f7760dadSAlex Elder bio->bi_bdev = bio_src->bi_bdev; 977f7760dadSAlex Elder bio->bi_sector = bio_src->bi_sector + (offset >> SECTOR_SHIFT); 978f7760dadSAlex Elder bio->bi_rw = bio_src->bi_rw; 979f7760dadSAlex Elder bio->bi_flags |= 1 << BIO_CLONED; 980602adf40SYehuda Sadeh 981602adf40SYehuda Sadeh /* 982f7760dadSAlex Elder * Copy over our part of the bio_vec, then update the first 983f7760dadSAlex Elder * and last (or only) entries. 
984602adf40SYehuda Sadeh */ 985f7760dadSAlex Elder memcpy(&bio->bi_io_vec[0], &bio_src->bi_io_vec[idx], 986f7760dadSAlex Elder vcnt * sizeof (struct bio_vec)); 987f7760dadSAlex Elder bio->bi_io_vec[0].bv_offset += voff; 988f7760dadSAlex Elder if (vcnt > 1) { 989f7760dadSAlex Elder bio->bi_io_vec[0].bv_len -= voff; 990f7760dadSAlex Elder bio->bi_io_vec[vcnt - 1].bv_len = resid; 991602adf40SYehuda Sadeh } else { 992f7760dadSAlex Elder bio->bi_io_vec[0].bv_len = len; 993602adf40SYehuda Sadeh } 994602adf40SYehuda Sadeh 995f7760dadSAlex Elder bio->bi_vcnt = vcnt; 996f7760dadSAlex Elder bio->bi_size = len; 997f7760dadSAlex Elder bio->bi_idx = 0; 998602adf40SYehuda Sadeh 999f7760dadSAlex Elder return bio; 1000602adf40SYehuda Sadeh } 1001602adf40SYehuda Sadeh 1002f7760dadSAlex Elder /* 1003f7760dadSAlex Elder * Clone a portion of a bio chain, starting at the given byte offset 1004f7760dadSAlex Elder * into the first bio in the source chain and continuing for the 1005f7760dadSAlex Elder * number of bytes indicated. The result is another bio chain of 1006f7760dadSAlex Elder * exactly the given length, or a null pointer on error. 1007f7760dadSAlex Elder * 1008f7760dadSAlex Elder * The bio_src and offset parameters are both in-out. On entry they 1009f7760dadSAlex Elder * refer to the first source bio and the offset into that bio where 1010f7760dadSAlex Elder * the start of data to be cloned is located. 1011f7760dadSAlex Elder * 1012f7760dadSAlex Elder * On return, bio_src is updated to refer to the bio in the source 1013f7760dadSAlex Elder * chain that contains first un-cloned byte, and *offset will 1014f7760dadSAlex Elder * contain the offset of that byte within that bio. 
1015f7760dadSAlex Elder */ 1016f7760dadSAlex Elder static struct bio *bio_chain_clone_range(struct bio **bio_src, 1017f7760dadSAlex Elder unsigned int *offset, 1018f7760dadSAlex Elder unsigned int len, 1019f7760dadSAlex Elder gfp_t gfpmask) 1020f7760dadSAlex Elder { 1021f7760dadSAlex Elder struct bio *bi = *bio_src; 1022f7760dadSAlex Elder unsigned int off = *offset; 1023f7760dadSAlex Elder struct bio *chain = NULL; 1024f7760dadSAlex Elder struct bio **end; 1025602adf40SYehuda Sadeh 1026f7760dadSAlex Elder /* Build up a chain of clone bios up to the limit */ 1027602adf40SYehuda Sadeh 1028f7760dadSAlex Elder if (!bi || off >= bi->bi_size || !len) 1029f7760dadSAlex Elder return NULL; /* Nothing to clone */ 1030602adf40SYehuda Sadeh 1031f7760dadSAlex Elder end = &chain; 1032f7760dadSAlex Elder while (len) { 1033f7760dadSAlex Elder unsigned int bi_size; 1034f7760dadSAlex Elder struct bio *bio; 1035f7760dadSAlex Elder 1036f5400b7aSAlex Elder if (!bi) { 1037f5400b7aSAlex Elder rbd_warn(NULL, "bio_chain exhausted with %u left", len); 1038f7760dadSAlex Elder goto out_err; /* EINVAL; ran out of bio's */ 1039f5400b7aSAlex Elder } 1040f7760dadSAlex Elder bi_size = min_t(unsigned int, bi->bi_size - off, len); 1041f7760dadSAlex Elder bio = bio_clone_range(bi, off, bi_size, gfpmask); 1042f7760dadSAlex Elder if (!bio) 1043f7760dadSAlex Elder goto out_err; /* ENOMEM */ 1044f7760dadSAlex Elder 1045f7760dadSAlex Elder *end = bio; 1046f7760dadSAlex Elder end = &bio->bi_next; 1047f7760dadSAlex Elder 1048f7760dadSAlex Elder off += bi_size; 1049f7760dadSAlex Elder if (off == bi->bi_size) { 1050f7760dadSAlex Elder bi = bi->bi_next; 1051f7760dadSAlex Elder off = 0; 1052f7760dadSAlex Elder } 1053f7760dadSAlex Elder len -= bi_size; 1054f7760dadSAlex Elder } 1055f7760dadSAlex Elder *bio_src = bi; 1056f7760dadSAlex Elder *offset = off; 1057f7760dadSAlex Elder 1058f7760dadSAlex Elder return chain; 1059f7760dadSAlex Elder out_err: 1060f7760dadSAlex Elder bio_chain_put(chain); 1061f7760dadSAlex 
Elder 1062602adf40SYehuda Sadeh return NULL; 1063602adf40SYehuda Sadeh } 1064602adf40SYehuda Sadeh 1065bf0d5f50SAlex Elder static void rbd_obj_request_get(struct rbd_obj_request *obj_request) 1066bf0d5f50SAlex Elder { 1067bf0d5f50SAlex Elder kref_get(&obj_request->kref); 1068bf0d5f50SAlex Elder } 1069bf0d5f50SAlex Elder 1070bf0d5f50SAlex Elder static void rbd_obj_request_destroy(struct kref *kref); 1071bf0d5f50SAlex Elder static void rbd_obj_request_put(struct rbd_obj_request *obj_request) 1072bf0d5f50SAlex Elder { 1073bf0d5f50SAlex Elder rbd_assert(obj_request != NULL); 1074bf0d5f50SAlex Elder kref_put(&obj_request->kref, rbd_obj_request_destroy); 1075bf0d5f50SAlex Elder } 1076bf0d5f50SAlex Elder 1077bf0d5f50SAlex Elder static void rbd_img_request_get(struct rbd_img_request *img_request) 1078bf0d5f50SAlex Elder { 1079bf0d5f50SAlex Elder kref_get(&img_request->kref); 1080bf0d5f50SAlex Elder } 1081bf0d5f50SAlex Elder 1082bf0d5f50SAlex Elder static void rbd_img_request_destroy(struct kref *kref); 1083bf0d5f50SAlex Elder static void rbd_img_request_put(struct rbd_img_request *img_request) 1084bf0d5f50SAlex Elder { 1085bf0d5f50SAlex Elder rbd_assert(img_request != NULL); 1086bf0d5f50SAlex Elder kref_put(&img_request->kref, rbd_img_request_destroy); 1087bf0d5f50SAlex Elder } 1088bf0d5f50SAlex Elder 1089bf0d5f50SAlex Elder static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request, 1090bf0d5f50SAlex Elder struct rbd_obj_request *obj_request) 1091bf0d5f50SAlex Elder { 109225dcf954SAlex Elder rbd_assert(obj_request->img_request == NULL); 109325dcf954SAlex Elder 1094bf0d5f50SAlex Elder rbd_obj_request_get(obj_request); 1095bf0d5f50SAlex Elder obj_request->img_request = img_request; 109625dcf954SAlex Elder obj_request->which = img_request->obj_request_count; 1097bf0d5f50SAlex Elder rbd_assert(obj_request->which != BAD_WHICH); 109825dcf954SAlex Elder img_request->obj_request_count++; 109925dcf954SAlex Elder list_add_tail(&obj_request->links, 
&img_request->obj_requests); 1100bf0d5f50SAlex Elder } 1101bf0d5f50SAlex Elder 1102bf0d5f50SAlex Elder static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request, 1103bf0d5f50SAlex Elder struct rbd_obj_request *obj_request) 1104bf0d5f50SAlex Elder { 1105bf0d5f50SAlex Elder rbd_assert(obj_request->which != BAD_WHICH); 110625dcf954SAlex Elder 1107bf0d5f50SAlex Elder list_del(&obj_request->links); 110825dcf954SAlex Elder rbd_assert(img_request->obj_request_count > 0); 110925dcf954SAlex Elder img_request->obj_request_count--; 111025dcf954SAlex Elder rbd_assert(obj_request->which == img_request->obj_request_count); 111125dcf954SAlex Elder obj_request->which = BAD_WHICH; 1112bf0d5f50SAlex Elder rbd_assert(obj_request->img_request == img_request); 1113bf0d5f50SAlex Elder obj_request->img_request = NULL; 111425dcf954SAlex Elder obj_request->callback = NULL; 1115bf0d5f50SAlex Elder rbd_obj_request_put(obj_request); 1116bf0d5f50SAlex Elder } 1117bf0d5f50SAlex Elder 1118bf0d5f50SAlex Elder static bool obj_request_type_valid(enum obj_request_type type) 1119bf0d5f50SAlex Elder { 1120bf0d5f50SAlex Elder switch (type) { 11219969ebc5SAlex Elder case OBJ_REQUEST_NODATA: 1122bf0d5f50SAlex Elder case OBJ_REQUEST_BIO: 1123788e2df3SAlex Elder case OBJ_REQUEST_PAGES: 1124bf0d5f50SAlex Elder return true; 1125bf0d5f50SAlex Elder default: 1126bf0d5f50SAlex Elder return false; 1127bf0d5f50SAlex Elder } 1128bf0d5f50SAlex Elder } 1129bf0d5f50SAlex Elder 11308d23bf29SAlex Elder struct ceph_osd_req_op *rbd_osd_req_op_create(u16 opcode, ...) 
11318d23bf29SAlex Elder { 11328d23bf29SAlex Elder struct ceph_osd_req_op *op; 11338d23bf29SAlex Elder va_list args; 11342647ba38SAlex Elder size_t size; 11358d23bf29SAlex Elder 11368d23bf29SAlex Elder op = kzalloc(sizeof (*op), GFP_NOIO); 11378d23bf29SAlex Elder if (!op) 11388d23bf29SAlex Elder return NULL; 11398d23bf29SAlex Elder op->op = opcode; 11408d23bf29SAlex Elder va_start(args, opcode); 11418d23bf29SAlex Elder switch (opcode) { 11428d23bf29SAlex Elder case CEPH_OSD_OP_READ: 11438d23bf29SAlex Elder case CEPH_OSD_OP_WRITE: 11448d23bf29SAlex Elder /* rbd_osd_req_op_create(READ, offset, length) */ 11458d23bf29SAlex Elder /* rbd_osd_req_op_create(WRITE, offset, length) */ 11468d23bf29SAlex Elder op->extent.offset = va_arg(args, u64); 11478d23bf29SAlex Elder op->extent.length = va_arg(args, u64); 11488d23bf29SAlex Elder if (opcode == CEPH_OSD_OP_WRITE) 11498d23bf29SAlex Elder op->payload_len = op->extent.length; 11508d23bf29SAlex Elder break; 1151fbfab539SAlex Elder case CEPH_OSD_OP_STAT: 1152fbfab539SAlex Elder break; 11532647ba38SAlex Elder case CEPH_OSD_OP_CALL: 11542647ba38SAlex Elder /* rbd_osd_req_op_create(CALL, class, method, data, datalen) */ 11552647ba38SAlex Elder op->cls.class_name = va_arg(args, char *); 11562647ba38SAlex Elder size = strlen(op->cls.class_name); 11572647ba38SAlex Elder rbd_assert(size <= (size_t) U8_MAX); 11582647ba38SAlex Elder op->cls.class_len = size; 11592647ba38SAlex Elder op->payload_len = size; 11602647ba38SAlex Elder 11612647ba38SAlex Elder op->cls.method_name = va_arg(args, char *); 11622647ba38SAlex Elder size = strlen(op->cls.method_name); 11632647ba38SAlex Elder rbd_assert(size <= (size_t) U8_MAX); 11642647ba38SAlex Elder op->cls.method_len = size; 11652647ba38SAlex Elder op->payload_len += size; 11662647ba38SAlex Elder 11672647ba38SAlex Elder op->cls.argc = 0; 11682647ba38SAlex Elder op->cls.indata = va_arg(args, void *); 11692647ba38SAlex Elder size = va_arg(args, size_t); 11702647ba38SAlex Elder rbd_assert(size <= 
(size_t) U32_MAX); 11712647ba38SAlex Elder op->cls.indata_len = (u32) size; 11722647ba38SAlex Elder op->payload_len += size; 11732647ba38SAlex Elder break; 11745efea49aSAlex Elder case CEPH_OSD_OP_NOTIFY_ACK: 11755efea49aSAlex Elder case CEPH_OSD_OP_WATCH: 11765efea49aSAlex Elder /* rbd_osd_req_op_create(NOTIFY_ACK, cookie, version) */ 11775efea49aSAlex Elder /* rbd_osd_req_op_create(WATCH, cookie, version, flag) */ 11785efea49aSAlex Elder op->watch.cookie = va_arg(args, u64); 11795efea49aSAlex Elder op->watch.ver = va_arg(args, u64); 11805efea49aSAlex Elder op->watch.ver = cpu_to_le64(op->watch.ver); 11815efea49aSAlex Elder if (opcode == CEPH_OSD_OP_WATCH && va_arg(args, int)) 11825efea49aSAlex Elder op->watch.flag = (u8) 1; 11835efea49aSAlex Elder break; 11848d23bf29SAlex Elder default: 11858d23bf29SAlex Elder rbd_warn(NULL, "unsupported opcode %hu\n", opcode); 11868d23bf29SAlex Elder kfree(op); 11878d23bf29SAlex Elder op = NULL; 11888d23bf29SAlex Elder break; 11898d23bf29SAlex Elder } 11908d23bf29SAlex Elder va_end(args); 11918d23bf29SAlex Elder 11928d23bf29SAlex Elder return op; 11938d23bf29SAlex Elder } 11948d23bf29SAlex Elder 11958d23bf29SAlex Elder static void rbd_osd_req_op_destroy(struct ceph_osd_req_op *op) 11968d23bf29SAlex Elder { 11978d23bf29SAlex Elder kfree(op); 11988d23bf29SAlex Elder } 11998d23bf29SAlex Elder 1200bf0d5f50SAlex Elder static int rbd_obj_request_submit(struct ceph_osd_client *osdc, 1201bf0d5f50SAlex Elder struct rbd_obj_request *obj_request) 1202bf0d5f50SAlex Elder { 1203bf0d5f50SAlex Elder return ceph_osdc_start_request(osdc, obj_request->osd_req, false); 1204bf0d5f50SAlex Elder } 1205bf0d5f50SAlex Elder 1206bf0d5f50SAlex Elder static void rbd_img_request_complete(struct rbd_img_request *img_request) 1207bf0d5f50SAlex Elder { 1208bf0d5f50SAlex Elder if (img_request->callback) 1209bf0d5f50SAlex Elder img_request->callback(img_request); 1210bf0d5f50SAlex Elder else 1211bf0d5f50SAlex Elder rbd_img_request_put(img_request); 
1212bf0d5f50SAlex Elder } 1213bf0d5f50SAlex Elder 1214788e2df3SAlex Elder /* Caller is responsible for rbd_obj_request_destroy(obj_request) */ 1215788e2df3SAlex Elder 1216788e2df3SAlex Elder static int rbd_obj_request_wait(struct rbd_obj_request *obj_request) 1217788e2df3SAlex Elder { 1218788e2df3SAlex Elder return wait_for_completion_interruptible(&obj_request->completion); 1219788e2df3SAlex Elder } 1220788e2df3SAlex Elder 122107741308SAlex Elder static void obj_request_done_init(struct rbd_obj_request *obj_request) 122207741308SAlex Elder { 122307741308SAlex Elder atomic_set(&obj_request->done, 0); 122407741308SAlex Elder smp_wmb(); 122507741308SAlex Elder } 122607741308SAlex Elder 122707741308SAlex Elder static void obj_request_done_set(struct rbd_obj_request *obj_request) 122807741308SAlex Elder { 122907741308SAlex Elder atomic_set(&obj_request->done, 1); 123007741308SAlex Elder smp_wmb(); 123107741308SAlex Elder } 123207741308SAlex Elder 123307741308SAlex Elder static bool obj_request_done_test(struct rbd_obj_request *obj_request) 123407741308SAlex Elder { 123507741308SAlex Elder smp_rmb(); 123607741308SAlex Elder return atomic_read(&obj_request->done) != 0; 123707741308SAlex Elder } 123807741308SAlex Elder 12399969ebc5SAlex Elder static void rbd_osd_trivial_callback(struct rbd_obj_request *obj_request, 12409969ebc5SAlex Elder struct ceph_osd_op *op) 12419969ebc5SAlex Elder { 124207741308SAlex Elder obj_request_done_set(obj_request); 12439969ebc5SAlex Elder } 12449969ebc5SAlex Elder 1245bf0d5f50SAlex Elder static void rbd_obj_request_complete(struct rbd_obj_request *obj_request) 1246bf0d5f50SAlex Elder { 1247bf0d5f50SAlex Elder if (obj_request->callback) 1248bf0d5f50SAlex Elder obj_request->callback(obj_request); 1249788e2df3SAlex Elder else 1250788e2df3SAlex Elder complete_all(&obj_request->completion); 1251bf0d5f50SAlex Elder } 1252bf0d5f50SAlex Elder 1253bf0d5f50SAlex Elder static void rbd_osd_read_callback(struct rbd_obj_request *obj_request, 
1254bf0d5f50SAlex Elder struct ceph_osd_op *op) 1255bf0d5f50SAlex Elder { 1256bf0d5f50SAlex Elder u64 xferred; 1257bf0d5f50SAlex Elder 1258bf0d5f50SAlex Elder /* 1259bf0d5f50SAlex Elder * We support a 64-bit length, but ultimately it has to be 1260bf0d5f50SAlex Elder * passed to blk_end_request(), which takes an unsigned int. 1261bf0d5f50SAlex Elder */ 1262bf0d5f50SAlex Elder xferred = le64_to_cpu(op->extent.length); 1263bf0d5f50SAlex Elder rbd_assert(xferred < (u64) UINT_MAX); 1264bf0d5f50SAlex Elder if (obj_request->result == (s32) -ENOENT) { 1265bf0d5f50SAlex Elder zero_bio_chain(obj_request->bio_list, 0); 1266bf0d5f50SAlex Elder obj_request->result = 0; 1267bf0d5f50SAlex Elder } else if (xferred < obj_request->length && !obj_request->result) { 1268bf0d5f50SAlex Elder zero_bio_chain(obj_request->bio_list, xferred); 1269bf0d5f50SAlex Elder xferred = obj_request->length; 1270bf0d5f50SAlex Elder } 1271bf0d5f50SAlex Elder obj_request->xferred = xferred; 127207741308SAlex Elder obj_request_done_set(obj_request); 1273bf0d5f50SAlex Elder } 1274bf0d5f50SAlex Elder 1275bf0d5f50SAlex Elder static void rbd_osd_write_callback(struct rbd_obj_request *obj_request, 1276bf0d5f50SAlex Elder struct ceph_osd_op *op) 1277bf0d5f50SAlex Elder { 1278bf0d5f50SAlex Elder obj_request->xferred = le64_to_cpu(op->extent.length); 127907741308SAlex Elder obj_request_done_set(obj_request); 1280bf0d5f50SAlex Elder } 1281bf0d5f50SAlex Elder 1282fbfab539SAlex Elder /* 1283fbfab539SAlex Elder * For a simple stat call there's nothing to do. We'll do more if 1284fbfab539SAlex Elder * this is part of a write sequence for a layered image. 
1285fbfab539SAlex Elder */ 1286fbfab539SAlex Elder static void rbd_osd_stat_callback(struct rbd_obj_request *obj_request, 1287fbfab539SAlex Elder struct ceph_osd_op *op) 1288fbfab539SAlex Elder { 1289fbfab539SAlex Elder obj_request_done_set(obj_request); 1290fbfab539SAlex Elder } 1291fbfab539SAlex Elder 1292bf0d5f50SAlex Elder static void rbd_osd_req_callback(struct ceph_osd_request *osd_req, 1293bf0d5f50SAlex Elder struct ceph_msg *msg) 1294bf0d5f50SAlex Elder { 1295bf0d5f50SAlex Elder struct rbd_obj_request *obj_request = osd_req->r_priv; 1296bf0d5f50SAlex Elder struct ceph_osd_reply_head *reply_head; 1297bf0d5f50SAlex Elder struct ceph_osd_op *op; 1298bf0d5f50SAlex Elder u32 num_ops; 1299bf0d5f50SAlex Elder u16 opcode; 1300bf0d5f50SAlex Elder 1301bf0d5f50SAlex Elder rbd_assert(osd_req == obj_request->osd_req); 1302bf0d5f50SAlex Elder rbd_assert(!!obj_request->img_request ^ 1303bf0d5f50SAlex Elder (obj_request->which == BAD_WHICH)); 1304bf0d5f50SAlex Elder 1305bf0d5f50SAlex Elder obj_request->xferred = le32_to_cpu(msg->hdr.data_len); 1306bf0d5f50SAlex Elder reply_head = msg->front.iov_base; 1307bf0d5f50SAlex Elder obj_request->result = (s32) le32_to_cpu(reply_head->result); 1308bf0d5f50SAlex Elder obj_request->version = le64_to_cpu(osd_req->r_reassert_version.version); 1309bf0d5f50SAlex Elder 1310bf0d5f50SAlex Elder num_ops = le32_to_cpu(reply_head->num_ops); 1311bf0d5f50SAlex Elder WARN_ON(num_ops != 1); /* For now */ 1312bf0d5f50SAlex Elder 1313bf0d5f50SAlex Elder op = &reply_head->ops[0]; 1314bf0d5f50SAlex Elder opcode = le16_to_cpu(op->op); 1315bf0d5f50SAlex Elder switch (opcode) { 1316bf0d5f50SAlex Elder case CEPH_OSD_OP_READ: 1317bf0d5f50SAlex Elder rbd_osd_read_callback(obj_request, op); 1318bf0d5f50SAlex Elder break; 1319bf0d5f50SAlex Elder case CEPH_OSD_OP_WRITE: 1320bf0d5f50SAlex Elder rbd_osd_write_callback(obj_request, op); 1321bf0d5f50SAlex Elder break; 1322fbfab539SAlex Elder case CEPH_OSD_OP_STAT: 1323fbfab539SAlex Elder 
rbd_osd_stat_callback(obj_request, op); 1324fbfab539SAlex Elder break; 132536be9a76SAlex Elder case CEPH_OSD_OP_CALL: 1326b8d70035SAlex Elder case CEPH_OSD_OP_NOTIFY_ACK: 13279969ebc5SAlex Elder case CEPH_OSD_OP_WATCH: 13289969ebc5SAlex Elder rbd_osd_trivial_callback(obj_request, op); 13299969ebc5SAlex Elder break; 1330bf0d5f50SAlex Elder default: 1331bf0d5f50SAlex Elder rbd_warn(NULL, "%s: unsupported op %hu\n", 1332bf0d5f50SAlex Elder obj_request->object_name, (unsigned short) opcode); 1333bf0d5f50SAlex Elder break; 1334bf0d5f50SAlex Elder } 1335bf0d5f50SAlex Elder 133607741308SAlex Elder if (obj_request_done_test(obj_request)) 1337bf0d5f50SAlex Elder rbd_obj_request_complete(obj_request); 1338bf0d5f50SAlex Elder } 1339bf0d5f50SAlex Elder 1340bf0d5f50SAlex Elder static struct ceph_osd_request *rbd_osd_req_create( 1341bf0d5f50SAlex Elder struct rbd_device *rbd_dev, 1342bf0d5f50SAlex Elder bool write_request, 1343bf0d5f50SAlex Elder struct rbd_obj_request *obj_request, 1344bf0d5f50SAlex Elder struct ceph_osd_req_op *op) 1345bf0d5f50SAlex Elder { 1346bf0d5f50SAlex Elder struct rbd_img_request *img_request = obj_request->img_request; 1347bf0d5f50SAlex Elder struct ceph_snap_context *snapc = NULL; 1348bf0d5f50SAlex Elder struct ceph_osd_client *osdc; 1349bf0d5f50SAlex Elder struct ceph_osd_request *osd_req; 1350bf0d5f50SAlex Elder struct timespec now; 1351bf0d5f50SAlex Elder struct timespec *mtime; 1352bf0d5f50SAlex Elder u64 snap_id = CEPH_NOSNAP; 1353bf0d5f50SAlex Elder u64 offset = obj_request->offset; 1354bf0d5f50SAlex Elder u64 length = obj_request->length; 1355bf0d5f50SAlex Elder 1356bf0d5f50SAlex Elder if (img_request) { 1357bf0d5f50SAlex Elder rbd_assert(img_request->write_request == write_request); 1358bf0d5f50SAlex Elder if (img_request->write_request) 1359bf0d5f50SAlex Elder snapc = img_request->snapc; 1360bf0d5f50SAlex Elder else 1361bf0d5f50SAlex Elder snap_id = img_request->snap_id; 1362bf0d5f50SAlex Elder } 1363bf0d5f50SAlex Elder 1364bf0d5f50SAlex 
Elder /* Allocate and initialize the request, for the single op */ 1365bf0d5f50SAlex Elder 1366bf0d5f50SAlex Elder osdc = &rbd_dev->rbd_client->client->osdc; 1367bf0d5f50SAlex Elder osd_req = ceph_osdc_alloc_request(osdc, snapc, 1, false, GFP_ATOMIC); 1368bf0d5f50SAlex Elder if (!osd_req) 1369bf0d5f50SAlex Elder return NULL; /* ENOMEM */ 1370bf0d5f50SAlex Elder 1371bf0d5f50SAlex Elder rbd_assert(obj_request_type_valid(obj_request->type)); 1372bf0d5f50SAlex Elder switch (obj_request->type) { 13739969ebc5SAlex Elder case OBJ_REQUEST_NODATA: 13749969ebc5SAlex Elder break; /* Nothing to do */ 1375bf0d5f50SAlex Elder case OBJ_REQUEST_BIO: 1376bf0d5f50SAlex Elder rbd_assert(obj_request->bio_list != NULL); 1377bf0d5f50SAlex Elder osd_req->r_bio = obj_request->bio_list; 1378bf0d5f50SAlex Elder break; 1379788e2df3SAlex Elder case OBJ_REQUEST_PAGES: 1380788e2df3SAlex Elder osd_req->r_pages = obj_request->pages; 1381788e2df3SAlex Elder osd_req->r_num_pages = obj_request->page_count; 1382788e2df3SAlex Elder osd_req->r_page_alignment = offset & ~PAGE_MASK; 1383788e2df3SAlex Elder break; 1384bf0d5f50SAlex Elder } 1385bf0d5f50SAlex Elder 1386bf0d5f50SAlex Elder if (write_request) { 1387bf0d5f50SAlex Elder osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK; 1388bf0d5f50SAlex Elder now = CURRENT_TIME; 1389bf0d5f50SAlex Elder mtime = &now; 1390bf0d5f50SAlex Elder } else { 1391bf0d5f50SAlex Elder osd_req->r_flags = CEPH_OSD_FLAG_READ; 1392bf0d5f50SAlex Elder mtime = NULL; /* not needed for reads */ 1393bf0d5f50SAlex Elder offset = 0; /* These are not used... 
*/ 1394bf0d5f50SAlex Elder length = 0; /* ...for osd read requests */ 1395bf0d5f50SAlex Elder } 1396bf0d5f50SAlex Elder 1397bf0d5f50SAlex Elder osd_req->r_callback = rbd_osd_req_callback; 1398bf0d5f50SAlex Elder osd_req->r_priv = obj_request; 1399bf0d5f50SAlex Elder 1400bf0d5f50SAlex Elder osd_req->r_oid_len = strlen(obj_request->object_name); 1401bf0d5f50SAlex Elder rbd_assert(osd_req->r_oid_len < sizeof (osd_req->r_oid)); 1402bf0d5f50SAlex Elder memcpy(osd_req->r_oid, obj_request->object_name, osd_req->r_oid_len); 1403bf0d5f50SAlex Elder 1404bf0d5f50SAlex Elder osd_req->r_file_layout = rbd_dev->layout; /* struct */ 1405bf0d5f50SAlex Elder 1406bf0d5f50SAlex Elder /* osd_req will get its own reference to snapc (if non-null) */ 1407bf0d5f50SAlex Elder 1408bf0d5f50SAlex Elder ceph_osdc_build_request(osd_req, offset, length, 1, op, 1409bf0d5f50SAlex Elder snapc, snap_id, mtime); 1410bf0d5f50SAlex Elder 1411bf0d5f50SAlex Elder return osd_req; 1412bf0d5f50SAlex Elder } 1413bf0d5f50SAlex Elder 1414bf0d5f50SAlex Elder static void rbd_osd_req_destroy(struct ceph_osd_request *osd_req) 1415bf0d5f50SAlex Elder { 1416bf0d5f50SAlex Elder ceph_osdc_put_request(osd_req); 1417bf0d5f50SAlex Elder } 1418bf0d5f50SAlex Elder 1419bf0d5f50SAlex Elder /* object_name is assumed to be a non-null pointer and NUL-terminated */ 1420bf0d5f50SAlex Elder 1421bf0d5f50SAlex Elder static struct rbd_obj_request *rbd_obj_request_create(const char *object_name, 1422bf0d5f50SAlex Elder u64 offset, u64 length, 1423bf0d5f50SAlex Elder enum obj_request_type type) 1424bf0d5f50SAlex Elder { 1425bf0d5f50SAlex Elder struct rbd_obj_request *obj_request; 1426bf0d5f50SAlex Elder size_t size; 1427bf0d5f50SAlex Elder char *name; 1428bf0d5f50SAlex Elder 1429bf0d5f50SAlex Elder rbd_assert(obj_request_type_valid(type)); 1430bf0d5f50SAlex Elder 1431bf0d5f50SAlex Elder size = strlen(object_name) + 1; 1432bf0d5f50SAlex Elder obj_request = kzalloc(sizeof (*obj_request) + size, GFP_KERNEL); 1433bf0d5f50SAlex Elder if 
(!obj_request) 1434bf0d5f50SAlex Elder return NULL; 1435bf0d5f50SAlex Elder 1436bf0d5f50SAlex Elder name = (char *)(obj_request + 1); 1437bf0d5f50SAlex Elder obj_request->object_name = memcpy(name, object_name, size); 1438bf0d5f50SAlex Elder obj_request->offset = offset; 1439bf0d5f50SAlex Elder obj_request->length = length; 1440bf0d5f50SAlex Elder obj_request->which = BAD_WHICH; 1441bf0d5f50SAlex Elder obj_request->type = type; 1442bf0d5f50SAlex Elder INIT_LIST_HEAD(&obj_request->links); 144307741308SAlex Elder obj_request_done_init(obj_request); 1444788e2df3SAlex Elder init_completion(&obj_request->completion); 1445bf0d5f50SAlex Elder kref_init(&obj_request->kref); 1446bf0d5f50SAlex Elder 1447bf0d5f50SAlex Elder return obj_request; 1448bf0d5f50SAlex Elder } 1449bf0d5f50SAlex Elder 1450bf0d5f50SAlex Elder static void rbd_obj_request_destroy(struct kref *kref) 1451bf0d5f50SAlex Elder { 1452bf0d5f50SAlex Elder struct rbd_obj_request *obj_request; 1453bf0d5f50SAlex Elder 1454bf0d5f50SAlex Elder obj_request = container_of(kref, struct rbd_obj_request, kref); 1455bf0d5f50SAlex Elder 1456bf0d5f50SAlex Elder rbd_assert(obj_request->img_request == NULL); 1457bf0d5f50SAlex Elder rbd_assert(obj_request->which == BAD_WHICH); 1458bf0d5f50SAlex Elder 1459bf0d5f50SAlex Elder if (obj_request->osd_req) 1460bf0d5f50SAlex Elder rbd_osd_req_destroy(obj_request->osd_req); 1461bf0d5f50SAlex Elder 1462bf0d5f50SAlex Elder rbd_assert(obj_request_type_valid(obj_request->type)); 1463bf0d5f50SAlex Elder switch (obj_request->type) { 14649969ebc5SAlex Elder case OBJ_REQUEST_NODATA: 14659969ebc5SAlex Elder break; /* Nothing to do */ 1466bf0d5f50SAlex Elder case OBJ_REQUEST_BIO: 1467bf0d5f50SAlex Elder if (obj_request->bio_list) 1468bf0d5f50SAlex Elder bio_chain_put(obj_request->bio_list); 1469bf0d5f50SAlex Elder break; 1470788e2df3SAlex Elder case OBJ_REQUEST_PAGES: 1471788e2df3SAlex Elder if (obj_request->pages) 1472788e2df3SAlex Elder ceph_release_page_vector(obj_request->pages, 
1473788e2df3SAlex Elder obj_request->page_count); 1474788e2df3SAlex Elder break; 1475bf0d5f50SAlex Elder } 1476bf0d5f50SAlex Elder 1477bf0d5f50SAlex Elder kfree(obj_request); 1478bf0d5f50SAlex Elder } 1479bf0d5f50SAlex Elder 1480bf0d5f50SAlex Elder /* 1481bf0d5f50SAlex Elder * Caller is responsible for filling in the list of object requests 1482bf0d5f50SAlex Elder * that comprises the image request, and the Linux request pointer 1483bf0d5f50SAlex Elder * (if there is one). 1484bf0d5f50SAlex Elder */ 1485bf0d5f50SAlex Elder struct rbd_img_request *rbd_img_request_create(struct rbd_device *rbd_dev, 1486bf0d5f50SAlex Elder u64 offset, u64 length, 1487bf0d5f50SAlex Elder bool write_request) 1488bf0d5f50SAlex Elder { 1489bf0d5f50SAlex Elder struct rbd_img_request *img_request; 1490bf0d5f50SAlex Elder struct ceph_snap_context *snapc = NULL; 1491bf0d5f50SAlex Elder 1492bf0d5f50SAlex Elder img_request = kmalloc(sizeof (*img_request), GFP_ATOMIC); 1493bf0d5f50SAlex Elder if (!img_request) 1494bf0d5f50SAlex Elder return NULL; 1495bf0d5f50SAlex Elder 1496bf0d5f50SAlex Elder if (write_request) { 1497bf0d5f50SAlex Elder down_read(&rbd_dev->header_rwsem); 1498bf0d5f50SAlex Elder snapc = ceph_get_snap_context(rbd_dev->header.snapc); 1499bf0d5f50SAlex Elder up_read(&rbd_dev->header_rwsem); 1500bf0d5f50SAlex Elder if (WARN_ON(!snapc)) { 1501bf0d5f50SAlex Elder kfree(img_request); 1502bf0d5f50SAlex Elder return NULL; /* Shouldn't happen */ 1503bf0d5f50SAlex Elder } 1504bf0d5f50SAlex Elder } 1505bf0d5f50SAlex Elder 1506bf0d5f50SAlex Elder img_request->rq = NULL; 1507bf0d5f50SAlex Elder img_request->rbd_dev = rbd_dev; 1508bf0d5f50SAlex Elder img_request->offset = offset; 1509bf0d5f50SAlex Elder img_request->length = length; 1510bf0d5f50SAlex Elder img_request->write_request = write_request; 1511bf0d5f50SAlex Elder if (write_request) 1512bf0d5f50SAlex Elder img_request->snapc = snapc; 1513bf0d5f50SAlex Elder else 1514bf0d5f50SAlex Elder img_request->snap_id = rbd_dev->spec->snap_id; 
1515bf0d5f50SAlex Elder spin_lock_init(&img_request->completion_lock); 1516bf0d5f50SAlex Elder img_request->next_completion = 0; 1517bf0d5f50SAlex Elder img_request->callback = NULL; 1518bf0d5f50SAlex Elder img_request->obj_request_count = 0; 1519bf0d5f50SAlex Elder INIT_LIST_HEAD(&img_request->obj_requests); 1520bf0d5f50SAlex Elder kref_init(&img_request->kref); 1521bf0d5f50SAlex Elder 1522bf0d5f50SAlex Elder rbd_img_request_get(img_request); /* Avoid a warning */ 1523bf0d5f50SAlex Elder rbd_img_request_put(img_request); /* TEMPORARY */ 1524bf0d5f50SAlex Elder 1525bf0d5f50SAlex Elder return img_request; 1526bf0d5f50SAlex Elder } 1527bf0d5f50SAlex Elder 1528bf0d5f50SAlex Elder static void rbd_img_request_destroy(struct kref *kref) 1529bf0d5f50SAlex Elder { 1530bf0d5f50SAlex Elder struct rbd_img_request *img_request; 1531bf0d5f50SAlex Elder struct rbd_obj_request *obj_request; 1532bf0d5f50SAlex Elder struct rbd_obj_request *next_obj_request; 1533bf0d5f50SAlex Elder 1534bf0d5f50SAlex Elder img_request = container_of(kref, struct rbd_img_request, kref); 1535bf0d5f50SAlex Elder 1536bf0d5f50SAlex Elder for_each_obj_request_safe(img_request, obj_request, next_obj_request) 1537bf0d5f50SAlex Elder rbd_img_obj_request_del(img_request, obj_request); 153825dcf954SAlex Elder rbd_assert(img_request->obj_request_count == 0); 1539bf0d5f50SAlex Elder 1540bf0d5f50SAlex Elder if (img_request->write_request) 1541bf0d5f50SAlex Elder ceph_put_snap_context(img_request->snapc); 1542bf0d5f50SAlex Elder 1543bf0d5f50SAlex Elder kfree(img_request); 1544bf0d5f50SAlex Elder } 1545bf0d5f50SAlex Elder 1546bf0d5f50SAlex Elder static int rbd_img_request_fill_bio(struct rbd_img_request *img_request, 1547bf0d5f50SAlex Elder struct bio *bio_list) 1548bf0d5f50SAlex Elder { 1549bf0d5f50SAlex Elder struct rbd_device *rbd_dev = img_request->rbd_dev; 1550bf0d5f50SAlex Elder struct rbd_obj_request *obj_request = NULL; 1551bf0d5f50SAlex Elder struct rbd_obj_request *next_obj_request; 1552bf0d5f50SAlex Elder 
unsigned int bio_offset; 1553bf0d5f50SAlex Elder u64 image_offset; 1554bf0d5f50SAlex Elder u64 resid; 1555bf0d5f50SAlex Elder u16 opcode; 1556bf0d5f50SAlex Elder 1557bf0d5f50SAlex Elder opcode = img_request->write_request ? CEPH_OSD_OP_WRITE 1558bf0d5f50SAlex Elder : CEPH_OSD_OP_READ; 1559bf0d5f50SAlex Elder bio_offset = 0; 1560bf0d5f50SAlex Elder image_offset = img_request->offset; 1561bf0d5f50SAlex Elder rbd_assert(image_offset == bio_list->bi_sector << SECTOR_SHIFT); 1562bf0d5f50SAlex Elder resid = img_request->length; 1563bf0d5f50SAlex Elder while (resid) { 1564bf0d5f50SAlex Elder const char *object_name; 1565bf0d5f50SAlex Elder unsigned int clone_size; 1566bf0d5f50SAlex Elder struct ceph_osd_req_op *op; 1567bf0d5f50SAlex Elder u64 offset; 1568bf0d5f50SAlex Elder u64 length; 1569bf0d5f50SAlex Elder 1570bf0d5f50SAlex Elder object_name = rbd_segment_name(rbd_dev, image_offset); 1571bf0d5f50SAlex Elder if (!object_name) 1572bf0d5f50SAlex Elder goto out_unwind; 1573bf0d5f50SAlex Elder offset = rbd_segment_offset(rbd_dev, image_offset); 1574bf0d5f50SAlex Elder length = rbd_segment_length(rbd_dev, image_offset, resid); 1575bf0d5f50SAlex Elder obj_request = rbd_obj_request_create(object_name, 1576bf0d5f50SAlex Elder offset, length, 1577bf0d5f50SAlex Elder OBJ_REQUEST_BIO); 1578bf0d5f50SAlex Elder kfree(object_name); /* object request has its own copy */ 1579bf0d5f50SAlex Elder if (!obj_request) 1580bf0d5f50SAlex Elder goto out_unwind; 1581bf0d5f50SAlex Elder 1582bf0d5f50SAlex Elder rbd_assert(length <= (u64) UINT_MAX); 1583bf0d5f50SAlex Elder clone_size = (unsigned int) length; 1584bf0d5f50SAlex Elder obj_request->bio_list = bio_chain_clone_range(&bio_list, 1585bf0d5f50SAlex Elder &bio_offset, clone_size, 1586bf0d5f50SAlex Elder GFP_ATOMIC); 1587bf0d5f50SAlex Elder if (!obj_request->bio_list) 1588bf0d5f50SAlex Elder goto out_partial; 1589bf0d5f50SAlex Elder 1590bf0d5f50SAlex Elder /* 1591bf0d5f50SAlex Elder * Build up the op to use in building the osd 
1592bf0d5f50SAlex Elder * request. Note that the contents of the op are 1593bf0d5f50SAlex Elder * copied by rbd_osd_req_create(). 1594bf0d5f50SAlex Elder */ 1595bf0d5f50SAlex Elder op = rbd_osd_req_op_create(opcode, offset, length); 1596bf0d5f50SAlex Elder if (!op) 1597bf0d5f50SAlex Elder goto out_partial; 1598bf0d5f50SAlex Elder obj_request->osd_req = rbd_osd_req_create(rbd_dev, 1599bf0d5f50SAlex Elder img_request->write_request, 1600bf0d5f50SAlex Elder obj_request, op); 1601bf0d5f50SAlex Elder rbd_osd_req_op_destroy(op); 1602bf0d5f50SAlex Elder if (!obj_request->osd_req) 1603bf0d5f50SAlex Elder goto out_partial; 1604bf0d5f50SAlex Elder /* status and version are initially zero-filled */ 1605bf0d5f50SAlex Elder 1606bf0d5f50SAlex Elder rbd_img_obj_request_add(img_request, obj_request); 1607bf0d5f50SAlex Elder 1608bf0d5f50SAlex Elder image_offset += length; 1609bf0d5f50SAlex Elder resid -= length; 1610bf0d5f50SAlex Elder } 1611bf0d5f50SAlex Elder 1612bf0d5f50SAlex Elder return 0; 1613bf0d5f50SAlex Elder 1614bf0d5f50SAlex Elder out_partial: 1615bf0d5f50SAlex Elder rbd_obj_request_put(obj_request); 1616bf0d5f50SAlex Elder out_unwind: 1617bf0d5f50SAlex Elder for_each_obj_request_safe(img_request, obj_request, next_obj_request) 1618bf0d5f50SAlex Elder rbd_obj_request_put(obj_request); 1619bf0d5f50SAlex Elder 1620bf0d5f50SAlex Elder return -ENOMEM; 1621bf0d5f50SAlex Elder } 1622bf0d5f50SAlex Elder 1623bf0d5f50SAlex Elder static void rbd_img_obj_callback(struct rbd_obj_request *obj_request) 1624bf0d5f50SAlex Elder { 1625bf0d5f50SAlex Elder struct rbd_img_request *img_request; 1626bf0d5f50SAlex Elder u32 which = obj_request->which; 1627bf0d5f50SAlex Elder bool more = true; 1628bf0d5f50SAlex Elder 1629bf0d5f50SAlex Elder img_request = obj_request->img_request; 1630bf0d5f50SAlex Elder rbd_assert(img_request != NULL); 1631bf0d5f50SAlex Elder rbd_assert(img_request->rq != NULL); 1632bf0d5f50SAlex Elder rbd_assert(which != BAD_WHICH); 1633bf0d5f50SAlex Elder rbd_assert(which < 
img_request->obj_request_count); 1634bf0d5f50SAlex Elder rbd_assert(which >= img_request->next_completion); 1635bf0d5f50SAlex Elder 1636bf0d5f50SAlex Elder spin_lock_irq(&img_request->completion_lock); 1637bf0d5f50SAlex Elder if (which != img_request->next_completion) 1638bf0d5f50SAlex Elder goto out; 1639bf0d5f50SAlex Elder 1640bf0d5f50SAlex Elder for_each_obj_request_from(img_request, obj_request) { 1641bf0d5f50SAlex Elder unsigned int xferred; 1642bf0d5f50SAlex Elder int result; 1643bf0d5f50SAlex Elder 1644bf0d5f50SAlex Elder rbd_assert(more); 1645bf0d5f50SAlex Elder rbd_assert(which < img_request->obj_request_count); 1646bf0d5f50SAlex Elder 164707741308SAlex Elder if (!obj_request_done_test(obj_request)) 1648bf0d5f50SAlex Elder break; 1649bf0d5f50SAlex Elder 1650bf0d5f50SAlex Elder rbd_assert(obj_request->xferred <= (u64) UINT_MAX); 1651bf0d5f50SAlex Elder xferred = (unsigned int) obj_request->xferred; 1652bf0d5f50SAlex Elder result = (int) obj_request->result; 1653bf0d5f50SAlex Elder if (result) 1654bf0d5f50SAlex Elder rbd_warn(NULL, "obj_request %s result %d xferred %u\n", 1655bf0d5f50SAlex Elder img_request->write_request ? 
"write" : "read", 1656bf0d5f50SAlex Elder result, xferred); 1657bf0d5f50SAlex Elder 1658bf0d5f50SAlex Elder more = blk_end_request(img_request->rq, result, xferred); 1659bf0d5f50SAlex Elder which++; 1660bf0d5f50SAlex Elder } 1661bf0d5f50SAlex Elder rbd_assert(more ^ (which == img_request->obj_request_count)); 1662bf0d5f50SAlex Elder img_request->next_completion = which; 1663bf0d5f50SAlex Elder out: 1664bf0d5f50SAlex Elder spin_unlock_irq(&img_request->completion_lock); 1665bf0d5f50SAlex Elder 1666bf0d5f50SAlex Elder if (!more) 1667bf0d5f50SAlex Elder rbd_img_request_complete(img_request); 1668bf0d5f50SAlex Elder } 1669bf0d5f50SAlex Elder 1670bf0d5f50SAlex Elder static int rbd_img_request_submit(struct rbd_img_request *img_request) 1671bf0d5f50SAlex Elder { 1672bf0d5f50SAlex Elder struct rbd_device *rbd_dev = img_request->rbd_dev; 1673bf0d5f50SAlex Elder struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 1674bf0d5f50SAlex Elder struct rbd_obj_request *obj_request; 1675bf0d5f50SAlex Elder 1676bf0d5f50SAlex Elder for_each_obj_request(img_request, obj_request) { 1677bf0d5f50SAlex Elder int ret; 1678bf0d5f50SAlex Elder 1679bf0d5f50SAlex Elder obj_request->callback = rbd_img_obj_callback; 1680bf0d5f50SAlex Elder ret = rbd_obj_request_submit(osdc, obj_request); 1681bf0d5f50SAlex Elder if (ret) 1682bf0d5f50SAlex Elder return ret; 1683bf0d5f50SAlex Elder /* 1684bf0d5f50SAlex Elder * The image request has its own reference to each 1685bf0d5f50SAlex Elder * of its object requests, so we can safely drop the 1686bf0d5f50SAlex Elder * initial one here. 
1687bf0d5f50SAlex Elder */ 1688bf0d5f50SAlex Elder rbd_obj_request_put(obj_request); 1689bf0d5f50SAlex Elder } 1690bf0d5f50SAlex Elder 1691bf0d5f50SAlex Elder return 0; 1692bf0d5f50SAlex Elder } 1693bf0d5f50SAlex Elder 1694cf81b60eSAlex Elder static int rbd_obj_notify_ack(struct rbd_device *rbd_dev, 1695b8d70035SAlex Elder u64 ver, u64 notify_id) 1696b8d70035SAlex Elder { 1697b8d70035SAlex Elder struct rbd_obj_request *obj_request; 1698b8d70035SAlex Elder struct ceph_osd_req_op *op; 1699b8d70035SAlex Elder struct ceph_osd_client *osdc; 1700b8d70035SAlex Elder int ret; 1701b8d70035SAlex Elder 1702b8d70035SAlex Elder obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0, 1703b8d70035SAlex Elder OBJ_REQUEST_NODATA); 1704b8d70035SAlex Elder if (!obj_request) 1705b8d70035SAlex Elder return -ENOMEM; 1706b8d70035SAlex Elder 1707b8d70035SAlex Elder ret = -ENOMEM; 1708b8d70035SAlex Elder op = rbd_osd_req_op_create(CEPH_OSD_OP_NOTIFY_ACK, notify_id, ver); 1709b8d70035SAlex Elder if (!op) 1710b8d70035SAlex Elder goto out; 1711b8d70035SAlex Elder obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, 1712b8d70035SAlex Elder obj_request, op); 1713b8d70035SAlex Elder rbd_osd_req_op_destroy(op); 1714b8d70035SAlex Elder if (!obj_request->osd_req) 1715b8d70035SAlex Elder goto out; 1716b8d70035SAlex Elder 1717b8d70035SAlex Elder osdc = &rbd_dev->rbd_client->client->osdc; 1718cf81b60eSAlex Elder obj_request->callback = rbd_obj_request_put; 1719b8d70035SAlex Elder ret = rbd_obj_request_submit(osdc, obj_request); 1720b8d70035SAlex Elder out: 1721cf81b60eSAlex Elder if (ret) 1722b8d70035SAlex Elder rbd_obj_request_put(obj_request); 1723b8d70035SAlex Elder 1724b8d70035SAlex Elder return ret; 1725b8d70035SAlex Elder } 1726b8d70035SAlex Elder 1727b8d70035SAlex Elder static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data) 1728b8d70035SAlex Elder { 1729b8d70035SAlex Elder struct rbd_device *rbd_dev = (struct rbd_device *)data; 1730b8d70035SAlex Elder u64 hver; 
1731b8d70035SAlex Elder int rc; 1732b8d70035SAlex Elder 1733b8d70035SAlex Elder if (!rbd_dev) 1734b8d70035SAlex Elder return; 1735b8d70035SAlex Elder 1736b8d70035SAlex Elder dout("rbd_watch_cb %s notify_id=%llu opcode=%u\n", 1737b8d70035SAlex Elder rbd_dev->header_name, (unsigned long long) notify_id, 1738b8d70035SAlex Elder (unsigned int) opcode); 1739b8d70035SAlex Elder rc = rbd_dev_refresh(rbd_dev, &hver); 1740b8d70035SAlex Elder if (rc) 1741b8d70035SAlex Elder rbd_warn(rbd_dev, "got notification but failed to " 1742b8d70035SAlex Elder " update snaps: %d\n", rc); 1743b8d70035SAlex Elder 1744cf81b60eSAlex Elder rbd_obj_notify_ack(rbd_dev, hver, notify_id); 1745b8d70035SAlex Elder } 1746b8d70035SAlex Elder 17479969ebc5SAlex Elder /* 17489969ebc5SAlex Elder * Request sync osd watch/unwatch. The value of "start" determines 17499969ebc5SAlex Elder * whether a watch request is being initiated or torn down. 17509969ebc5SAlex Elder */ 17519969ebc5SAlex Elder static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev, int start) 17529969ebc5SAlex Elder { 17539969ebc5SAlex Elder struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 17549969ebc5SAlex Elder struct rbd_obj_request *obj_request; 17559969ebc5SAlex Elder struct ceph_osd_req_op *op; 17569969ebc5SAlex Elder int ret; 17579969ebc5SAlex Elder 17589969ebc5SAlex Elder rbd_assert(start ^ !!rbd_dev->watch_event); 17599969ebc5SAlex Elder rbd_assert(start ^ !!rbd_dev->watch_request); 17609969ebc5SAlex Elder 17619969ebc5SAlex Elder if (start) { 17623c663bbdSAlex Elder ret = ceph_osdc_create_event(osdc, rbd_watch_cb, rbd_dev, 17639969ebc5SAlex Elder &rbd_dev->watch_event); 17649969ebc5SAlex Elder if (ret < 0) 17659969ebc5SAlex Elder return ret; 17668eb87565SAlex Elder rbd_assert(rbd_dev->watch_event != NULL); 17679969ebc5SAlex Elder } 17689969ebc5SAlex Elder 17699969ebc5SAlex Elder ret = -ENOMEM; 17709969ebc5SAlex Elder obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0, 17719969ebc5SAlex 
Elder OBJ_REQUEST_NODATA); 17729969ebc5SAlex Elder if (!obj_request) 17739969ebc5SAlex Elder goto out_cancel; 17749969ebc5SAlex Elder 17759969ebc5SAlex Elder op = rbd_osd_req_op_create(CEPH_OSD_OP_WATCH, 17769969ebc5SAlex Elder rbd_dev->watch_event->cookie, 17779969ebc5SAlex Elder rbd_dev->header.obj_version, start); 17789969ebc5SAlex Elder if (!op) 17799969ebc5SAlex Elder goto out_cancel; 17809969ebc5SAlex Elder obj_request->osd_req = rbd_osd_req_create(rbd_dev, true, 17819969ebc5SAlex Elder obj_request, op); 17829969ebc5SAlex Elder rbd_osd_req_op_destroy(op); 17839969ebc5SAlex Elder if (!obj_request->osd_req) 17849969ebc5SAlex Elder goto out_cancel; 17859969ebc5SAlex Elder 17868eb87565SAlex Elder if (start) 1787975241afSAlex Elder ceph_osdc_set_request_linger(osdc, obj_request->osd_req); 17888eb87565SAlex Elder else 17896977c3f9SAlex Elder ceph_osdc_unregister_linger_request(osdc, 1790975241afSAlex Elder rbd_dev->watch_request->osd_req); 17919969ebc5SAlex Elder ret = rbd_obj_request_submit(osdc, obj_request); 17929969ebc5SAlex Elder if (ret) 17939969ebc5SAlex Elder goto out_cancel; 17949969ebc5SAlex Elder ret = rbd_obj_request_wait(obj_request); 17959969ebc5SAlex Elder if (ret) 17969969ebc5SAlex Elder goto out_cancel; 17979969ebc5SAlex Elder ret = obj_request->result; 17989969ebc5SAlex Elder if (ret) 17999969ebc5SAlex Elder goto out_cancel; 18009969ebc5SAlex Elder 18018eb87565SAlex Elder /* 18028eb87565SAlex Elder * A watch request is set to linger, so the underlying osd 18038eb87565SAlex Elder * request won't go away until we unregister it. We retain 18048eb87565SAlex Elder * a pointer to the object request during that time (in 18058eb87565SAlex Elder * rbd_dev->watch_request), so we'll keep a reference to 18068eb87565SAlex Elder * it. We'll drop that reference (below) after we've 18078eb87565SAlex Elder * unregistered it. 
18088eb87565SAlex Elder */ 18098eb87565SAlex Elder if (start) { 18108eb87565SAlex Elder rbd_dev->watch_request = obj_request; 18118eb87565SAlex Elder 18128eb87565SAlex Elder return 0; 18138eb87565SAlex Elder } 18148eb87565SAlex Elder 18158eb87565SAlex Elder /* We have successfully torn down the watch request */ 18168eb87565SAlex Elder 18178eb87565SAlex Elder rbd_obj_request_put(rbd_dev->watch_request); 18188eb87565SAlex Elder rbd_dev->watch_request = NULL; 18199969ebc5SAlex Elder out_cancel: 18209969ebc5SAlex Elder /* Cancel the event if we're tearing down, or on error */ 18219969ebc5SAlex Elder ceph_osdc_cancel_event(rbd_dev->watch_event); 18229969ebc5SAlex Elder rbd_dev->watch_event = NULL; 18239969ebc5SAlex Elder if (obj_request) 18249969ebc5SAlex Elder rbd_obj_request_put(obj_request); 18259969ebc5SAlex Elder 18269969ebc5SAlex Elder return ret; 18279969ebc5SAlex Elder } 18289969ebc5SAlex Elder 182936be9a76SAlex Elder /* 183036be9a76SAlex Elder * Synchronous osd object method call 183136be9a76SAlex Elder */ 183236be9a76SAlex Elder static int rbd_obj_method_sync(struct rbd_device *rbd_dev, 183336be9a76SAlex Elder const char *object_name, 183436be9a76SAlex Elder const char *class_name, 183536be9a76SAlex Elder const char *method_name, 183636be9a76SAlex Elder const char *outbound, 183736be9a76SAlex Elder size_t outbound_size, 183836be9a76SAlex Elder char *inbound, 183936be9a76SAlex Elder size_t inbound_size, 184036be9a76SAlex Elder u64 *version) 184136be9a76SAlex Elder { 184236be9a76SAlex Elder struct rbd_obj_request *obj_request; 184336be9a76SAlex Elder struct ceph_osd_client *osdc; 184436be9a76SAlex Elder struct ceph_osd_req_op *op; 184536be9a76SAlex Elder struct page **pages; 184636be9a76SAlex Elder u32 page_count; 184736be9a76SAlex Elder int ret; 184836be9a76SAlex Elder 184936be9a76SAlex Elder /* 185036be9a76SAlex Elder * Method calls are ultimately read operations but they 185136be9a76SAlex Elder * don't involve object data (so no offset or length). 
185236be9a76SAlex Elder * The result should placed into the inbound buffer 185336be9a76SAlex Elder * provided. They also supply outbound data--parameters for 185436be9a76SAlex Elder * the object method. Currently if this is present it will 185536be9a76SAlex Elder * be a snapshot id. 185636be9a76SAlex Elder */ 185736be9a76SAlex Elder page_count = (u32) calc_pages_for(0, inbound_size); 185836be9a76SAlex Elder pages = ceph_alloc_page_vector(page_count, GFP_KERNEL); 185936be9a76SAlex Elder if (IS_ERR(pages)) 186036be9a76SAlex Elder return PTR_ERR(pages); 186136be9a76SAlex Elder 186236be9a76SAlex Elder ret = -ENOMEM; 186336be9a76SAlex Elder obj_request = rbd_obj_request_create(object_name, 0, 0, 186436be9a76SAlex Elder OBJ_REQUEST_PAGES); 186536be9a76SAlex Elder if (!obj_request) 186636be9a76SAlex Elder goto out; 186736be9a76SAlex Elder 186836be9a76SAlex Elder obj_request->pages = pages; 186936be9a76SAlex Elder obj_request->page_count = page_count; 187036be9a76SAlex Elder 187136be9a76SAlex Elder op = rbd_osd_req_op_create(CEPH_OSD_OP_CALL, class_name, 187236be9a76SAlex Elder method_name, outbound, outbound_size); 187336be9a76SAlex Elder if (!op) 187436be9a76SAlex Elder goto out; 187536be9a76SAlex Elder obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, 187636be9a76SAlex Elder obj_request, op); 187736be9a76SAlex Elder rbd_osd_req_op_destroy(op); 187836be9a76SAlex Elder if (!obj_request->osd_req) 187936be9a76SAlex Elder goto out; 188036be9a76SAlex Elder 188136be9a76SAlex Elder osdc = &rbd_dev->rbd_client->client->osdc; 188236be9a76SAlex Elder ret = rbd_obj_request_submit(osdc, obj_request); 188336be9a76SAlex Elder if (ret) 188436be9a76SAlex Elder goto out; 188536be9a76SAlex Elder ret = rbd_obj_request_wait(obj_request); 188636be9a76SAlex Elder if (ret) 188736be9a76SAlex Elder goto out; 188836be9a76SAlex Elder 188936be9a76SAlex Elder ret = obj_request->result; 189036be9a76SAlex Elder if (ret < 0) 189136be9a76SAlex Elder goto out; 189223ed6e13SAlex Elder ret = 0; 
189323ed6e13SAlex Elder (void) ceph_copy_from_page_vector(pages, inbound, 0, 189436be9a76SAlex Elder obj_request->xferred); 189536be9a76SAlex Elder if (version) 189636be9a76SAlex Elder *version = obj_request->version; 189736be9a76SAlex Elder out: 189836be9a76SAlex Elder if (obj_request) 189936be9a76SAlex Elder rbd_obj_request_put(obj_request); 190036be9a76SAlex Elder else 190136be9a76SAlex Elder ceph_release_page_vector(pages, page_count); 190236be9a76SAlex Elder 190336be9a76SAlex Elder return ret; 190436be9a76SAlex Elder } 190536be9a76SAlex Elder 1906bf0d5f50SAlex Elder static void rbd_request_fn(struct request_queue *q) 1907bf0d5f50SAlex Elder { 1908bf0d5f50SAlex Elder struct rbd_device *rbd_dev = q->queuedata; 1909bf0d5f50SAlex Elder bool read_only = rbd_dev->mapping.read_only; 1910bf0d5f50SAlex Elder struct request *rq; 1911bf0d5f50SAlex Elder int result; 1912bf0d5f50SAlex Elder 1913bf0d5f50SAlex Elder while ((rq = blk_fetch_request(q))) { 1914bf0d5f50SAlex Elder bool write_request = rq_data_dir(rq) == WRITE; 1915bf0d5f50SAlex Elder struct rbd_img_request *img_request; 1916bf0d5f50SAlex Elder u64 offset; 1917bf0d5f50SAlex Elder u64 length; 1918bf0d5f50SAlex Elder 1919bf0d5f50SAlex Elder /* Ignore any non-FS requests that filter through. 
*/ 1920bf0d5f50SAlex Elder 1921bf0d5f50SAlex Elder if (rq->cmd_type != REQ_TYPE_FS) { 1922bf0d5f50SAlex Elder __blk_end_request_all(rq, 0); 1923bf0d5f50SAlex Elder continue; 1924bf0d5f50SAlex Elder } 1925bf0d5f50SAlex Elder 1926bf0d5f50SAlex Elder spin_unlock_irq(q->queue_lock); 1927bf0d5f50SAlex Elder 1928bf0d5f50SAlex Elder /* Disallow writes to a read-only device */ 1929bf0d5f50SAlex Elder 1930bf0d5f50SAlex Elder if (write_request) { 1931bf0d5f50SAlex Elder result = -EROFS; 1932bf0d5f50SAlex Elder if (read_only) 1933bf0d5f50SAlex Elder goto end_request; 1934bf0d5f50SAlex Elder rbd_assert(rbd_dev->spec->snap_id == CEPH_NOSNAP); 1935bf0d5f50SAlex Elder } 1936bf0d5f50SAlex Elder 19376d292906SAlex Elder /* 19386d292906SAlex Elder * Quit early if the mapped snapshot no longer 19396d292906SAlex Elder * exists. It's still possible the snapshot will 19406d292906SAlex Elder * have disappeared by the time our request arrives 19416d292906SAlex Elder * at the osd, but there's no sense in sending it if 19426d292906SAlex Elder * we already know. 
19436d292906SAlex Elder */ 19446d292906SAlex Elder if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) { 1945bf0d5f50SAlex Elder dout("request for non-existent snapshot"); 1946bf0d5f50SAlex Elder rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP); 1947bf0d5f50SAlex Elder result = -ENXIO; 1948bf0d5f50SAlex Elder goto end_request; 1949bf0d5f50SAlex Elder } 1950bf0d5f50SAlex Elder 1951bf0d5f50SAlex Elder offset = (u64) blk_rq_pos(rq) << SECTOR_SHIFT; 1952bf0d5f50SAlex Elder length = (u64) blk_rq_bytes(rq); 1953bf0d5f50SAlex Elder 1954bf0d5f50SAlex Elder result = -EINVAL; 1955bf0d5f50SAlex Elder if (WARN_ON(offset && length > U64_MAX - offset + 1)) 1956bf0d5f50SAlex Elder goto end_request; /* Shouldn't happen */ 1957bf0d5f50SAlex Elder 1958bf0d5f50SAlex Elder result = -ENOMEM; 1959bf0d5f50SAlex Elder img_request = rbd_img_request_create(rbd_dev, offset, length, 1960bf0d5f50SAlex Elder write_request); 1961bf0d5f50SAlex Elder if (!img_request) 1962bf0d5f50SAlex Elder goto end_request; 1963bf0d5f50SAlex Elder 1964bf0d5f50SAlex Elder img_request->rq = rq; 1965bf0d5f50SAlex Elder 1966bf0d5f50SAlex Elder result = rbd_img_request_fill_bio(img_request, rq->bio); 1967bf0d5f50SAlex Elder if (!result) 1968bf0d5f50SAlex Elder result = rbd_img_request_submit(img_request); 1969bf0d5f50SAlex Elder if (result) 1970bf0d5f50SAlex Elder rbd_img_request_put(img_request); 1971bf0d5f50SAlex Elder end_request: 1972bf0d5f50SAlex Elder spin_lock_irq(q->queue_lock); 1973bf0d5f50SAlex Elder if (result < 0) { 1974bf0d5f50SAlex Elder rbd_warn(rbd_dev, "obj_request %s result %d\n", 1975bf0d5f50SAlex Elder write_request ? "write" : "read", result); 1976bf0d5f50SAlex Elder __blk_end_request_all(rq, result); 1977bf0d5f50SAlex Elder } 1978bf0d5f50SAlex Elder } 1979bf0d5f50SAlex Elder } 1980bf0d5f50SAlex Elder 1981602adf40SYehuda Sadeh /* 1982602adf40SYehuda Sadeh * a queue callback. Makes sure that we don't create a bio that spans across 1983602adf40SYehuda Sadeh * multiple osd objects. 
One exception would be with a single page bios, 1984f7760dadSAlex Elder * which we handle later at bio_chain_clone_range() 1985602adf40SYehuda Sadeh */ 1986602adf40SYehuda Sadeh static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd, 1987602adf40SYehuda Sadeh struct bio_vec *bvec) 1988602adf40SYehuda Sadeh { 1989602adf40SYehuda Sadeh struct rbd_device *rbd_dev = q->queuedata; 1990e5cfeed2SAlex Elder sector_t sector_offset; 1991e5cfeed2SAlex Elder sector_t sectors_per_obj; 1992e5cfeed2SAlex Elder sector_t obj_sector_offset; 1993e5cfeed2SAlex Elder int ret; 1994602adf40SYehuda Sadeh 1995e5cfeed2SAlex Elder /* 1996e5cfeed2SAlex Elder * Find how far into its rbd object the partition-relative 1997e5cfeed2SAlex Elder * bio start sector is to offset relative to the enclosing 1998e5cfeed2SAlex Elder * device. 1999e5cfeed2SAlex Elder */ 2000e5cfeed2SAlex Elder sector_offset = get_start_sect(bmd->bi_bdev) + bmd->bi_sector; 2001e5cfeed2SAlex Elder sectors_per_obj = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT); 2002e5cfeed2SAlex Elder obj_sector_offset = sector_offset & (sectors_per_obj - 1); 2003593a9e7bSAlex Elder 2004e5cfeed2SAlex Elder /* 2005e5cfeed2SAlex Elder * Compute the number of bytes from that offset to the end 2006e5cfeed2SAlex Elder * of the object. Account for what's already used by the bio. 2007e5cfeed2SAlex Elder */ 2008e5cfeed2SAlex Elder ret = (int) (sectors_per_obj - obj_sector_offset) << SECTOR_SHIFT; 2009e5cfeed2SAlex Elder if (ret > bmd->bi_size) 2010e5cfeed2SAlex Elder ret -= bmd->bi_size; 2011e5cfeed2SAlex Elder else 2012e5cfeed2SAlex Elder ret = 0; 2013e5cfeed2SAlex Elder 2014e5cfeed2SAlex Elder /* 2015e5cfeed2SAlex Elder * Don't send back more than was asked for. And if the bio 2016e5cfeed2SAlex Elder * was empty, let the whole thing through because: "Note 2017e5cfeed2SAlex Elder * that a block device *must* allow a single page to be 2018e5cfeed2SAlex Elder * added to an empty bio." 
2019e5cfeed2SAlex Elder */ 2020e5cfeed2SAlex Elder rbd_assert(bvec->bv_len <= PAGE_SIZE); 2021e5cfeed2SAlex Elder if (ret > (int) bvec->bv_len || !bmd->bi_size) 2022e5cfeed2SAlex Elder ret = (int) bvec->bv_len; 2023e5cfeed2SAlex Elder 2024e5cfeed2SAlex Elder return ret; 2025602adf40SYehuda Sadeh } 2026602adf40SYehuda Sadeh 2027602adf40SYehuda Sadeh static void rbd_free_disk(struct rbd_device *rbd_dev) 2028602adf40SYehuda Sadeh { 2029602adf40SYehuda Sadeh struct gendisk *disk = rbd_dev->disk; 2030602adf40SYehuda Sadeh 2031602adf40SYehuda Sadeh if (!disk) 2032602adf40SYehuda Sadeh return; 2033602adf40SYehuda Sadeh 2034602adf40SYehuda Sadeh if (disk->flags & GENHD_FL_UP) 2035602adf40SYehuda Sadeh del_gendisk(disk); 2036602adf40SYehuda Sadeh if (disk->queue) 2037602adf40SYehuda Sadeh blk_cleanup_queue(disk->queue); 2038602adf40SYehuda Sadeh put_disk(disk); 2039602adf40SYehuda Sadeh } 2040602adf40SYehuda Sadeh 2041788e2df3SAlex Elder static int rbd_obj_read_sync(struct rbd_device *rbd_dev, 2042788e2df3SAlex Elder const char *object_name, 2043788e2df3SAlex Elder u64 offset, u64 length, 2044788e2df3SAlex Elder char *buf, u64 *version) 2045788e2df3SAlex Elder 2046788e2df3SAlex Elder { 2047788e2df3SAlex Elder struct ceph_osd_req_op *op; 2048788e2df3SAlex Elder struct rbd_obj_request *obj_request; 2049788e2df3SAlex Elder struct ceph_osd_client *osdc; 2050788e2df3SAlex Elder struct page **pages = NULL; 2051788e2df3SAlex Elder u32 page_count; 20521ceae7efSAlex Elder size_t size; 2053788e2df3SAlex Elder int ret; 2054788e2df3SAlex Elder 2055788e2df3SAlex Elder page_count = (u32) calc_pages_for(offset, length); 2056788e2df3SAlex Elder pages = ceph_alloc_page_vector(page_count, GFP_KERNEL); 2057788e2df3SAlex Elder if (IS_ERR(pages)) 2058788e2df3SAlex Elder ret = PTR_ERR(pages); 2059788e2df3SAlex Elder 2060788e2df3SAlex Elder ret = -ENOMEM; 2061788e2df3SAlex Elder obj_request = rbd_obj_request_create(object_name, offset, length, 2062788e2df3SAlex Elder OBJ_REQUEST_PAGES); 
2063788e2df3SAlex Elder if (!obj_request) 2064788e2df3SAlex Elder goto out; 2065788e2df3SAlex Elder 2066788e2df3SAlex Elder obj_request->pages = pages; 2067788e2df3SAlex Elder obj_request->page_count = page_count; 2068788e2df3SAlex Elder 2069788e2df3SAlex Elder op = rbd_osd_req_op_create(CEPH_OSD_OP_READ, offset, length); 2070788e2df3SAlex Elder if (!op) 2071788e2df3SAlex Elder goto out; 2072788e2df3SAlex Elder obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, 2073788e2df3SAlex Elder obj_request, op); 2074788e2df3SAlex Elder rbd_osd_req_op_destroy(op); 2075788e2df3SAlex Elder if (!obj_request->osd_req) 2076788e2df3SAlex Elder goto out; 2077788e2df3SAlex Elder 2078788e2df3SAlex Elder osdc = &rbd_dev->rbd_client->client->osdc; 2079788e2df3SAlex Elder ret = rbd_obj_request_submit(osdc, obj_request); 2080788e2df3SAlex Elder if (ret) 2081788e2df3SAlex Elder goto out; 2082788e2df3SAlex Elder ret = rbd_obj_request_wait(obj_request); 2083788e2df3SAlex Elder if (ret) 2084788e2df3SAlex Elder goto out; 2085788e2df3SAlex Elder 2086788e2df3SAlex Elder ret = obj_request->result; 2087788e2df3SAlex Elder if (ret < 0) 2088788e2df3SAlex Elder goto out; 20891ceae7efSAlex Elder 20901ceae7efSAlex Elder rbd_assert(obj_request->xferred <= (u64) SIZE_MAX); 20911ceae7efSAlex Elder size = (size_t) obj_request->xferred; 209223ed6e13SAlex Elder (void) ceph_copy_from_page_vector(pages, buf, 0, size); 209323ed6e13SAlex Elder rbd_assert(size <= (size_t) INT_MAX); 209423ed6e13SAlex Elder ret = (int) size; 2095788e2df3SAlex Elder if (version) 2096788e2df3SAlex Elder *version = obj_request->version; 2097788e2df3SAlex Elder out: 2098788e2df3SAlex Elder if (obj_request) 2099788e2df3SAlex Elder rbd_obj_request_put(obj_request); 2100788e2df3SAlex Elder else 2101788e2df3SAlex Elder ceph_release_page_vector(pages, page_count); 2102788e2df3SAlex Elder 2103788e2df3SAlex Elder return ret; 2104788e2df3SAlex Elder } 2105788e2df3SAlex Elder 2106602adf40SYehuda Sadeh /* 21074156d998SAlex Elder * Read 
the complete header for the given rbd device. 21084156d998SAlex Elder * 21094156d998SAlex Elder * Returns a pointer to a dynamically-allocated buffer containing 21104156d998SAlex Elder * the complete and validated header. Caller can pass the address 21114156d998SAlex Elder * of a variable that will be filled in with the version of the 21124156d998SAlex Elder * header object at the time it was read. 21134156d998SAlex Elder * 21144156d998SAlex Elder * Returns a pointer-coded errno if a failure occurs. 21154156d998SAlex Elder */ 21164156d998SAlex Elder static struct rbd_image_header_ondisk * 21174156d998SAlex Elder rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version) 21184156d998SAlex Elder { 21194156d998SAlex Elder struct rbd_image_header_ondisk *ondisk = NULL; 21204156d998SAlex Elder u32 snap_count = 0; 21214156d998SAlex Elder u64 names_size = 0; 21224156d998SAlex Elder u32 want_count; 21234156d998SAlex Elder int ret; 21244156d998SAlex Elder 21254156d998SAlex Elder /* 21264156d998SAlex Elder * The complete header will include an array of its 64-bit 21274156d998SAlex Elder * snapshot ids, followed by the names of those snapshots as 21284156d998SAlex Elder * a contiguous block of NUL-terminated strings. Note that 21294156d998SAlex Elder * the number of snapshots could change by the time we read 21304156d998SAlex Elder * it in, in which case we re-read it. 
21314156d998SAlex Elder */ 21324156d998SAlex Elder do { 21334156d998SAlex Elder size_t size; 21344156d998SAlex Elder 21354156d998SAlex Elder kfree(ondisk); 21364156d998SAlex Elder 21374156d998SAlex Elder size = sizeof (*ondisk); 21384156d998SAlex Elder size += snap_count * sizeof (struct rbd_image_snap_ondisk); 21394156d998SAlex Elder size += names_size; 21404156d998SAlex Elder ondisk = kmalloc(size, GFP_KERNEL); 21414156d998SAlex Elder if (!ondisk) 21424156d998SAlex Elder return ERR_PTR(-ENOMEM); 21434156d998SAlex Elder 2144788e2df3SAlex Elder ret = rbd_obj_read_sync(rbd_dev, rbd_dev->header_name, 21454156d998SAlex Elder 0, size, 21464156d998SAlex Elder (char *) ondisk, version); 21474156d998SAlex Elder if (ret < 0) 21484156d998SAlex Elder goto out_err; 21494156d998SAlex Elder if (WARN_ON((size_t) ret < size)) { 21504156d998SAlex Elder ret = -ENXIO; 215106ecc6cbSAlex Elder rbd_warn(rbd_dev, "short header read (want %zd got %d)", 215206ecc6cbSAlex Elder size, ret); 21534156d998SAlex Elder goto out_err; 21544156d998SAlex Elder } 21554156d998SAlex Elder if (!rbd_dev_ondisk_valid(ondisk)) { 21564156d998SAlex Elder ret = -ENXIO; 215706ecc6cbSAlex Elder rbd_warn(rbd_dev, "invalid header"); 21584156d998SAlex Elder goto out_err; 21594156d998SAlex Elder } 21604156d998SAlex Elder 21614156d998SAlex Elder names_size = le64_to_cpu(ondisk->snap_names_len); 21624156d998SAlex Elder want_count = snap_count; 21634156d998SAlex Elder snap_count = le32_to_cpu(ondisk->snap_count); 21644156d998SAlex Elder } while (snap_count != want_count); 21654156d998SAlex Elder 21664156d998SAlex Elder return ondisk; 21674156d998SAlex Elder 21684156d998SAlex Elder out_err: 21694156d998SAlex Elder kfree(ondisk); 21704156d998SAlex Elder 21714156d998SAlex Elder return ERR_PTR(ret); 21724156d998SAlex Elder } 21734156d998SAlex Elder 21744156d998SAlex Elder /* 2175602adf40SYehuda Sadeh * reload the ondisk the header 2176602adf40SYehuda Sadeh */ 2177602adf40SYehuda Sadeh static int rbd_read_header(struct 
rbd_device *rbd_dev, 2178602adf40SYehuda Sadeh struct rbd_image_header *header) 2179602adf40SYehuda Sadeh { 21804156d998SAlex Elder struct rbd_image_header_ondisk *ondisk; 21814156d998SAlex Elder u64 ver = 0; 21824156d998SAlex Elder int ret; 2183602adf40SYehuda Sadeh 21844156d998SAlex Elder ondisk = rbd_dev_v1_header_read(rbd_dev, &ver); 21854156d998SAlex Elder if (IS_ERR(ondisk)) 21864156d998SAlex Elder return PTR_ERR(ondisk); 21874156d998SAlex Elder ret = rbd_header_from_disk(header, ondisk); 21884156d998SAlex Elder if (ret >= 0) 218959c2be1eSYehuda Sadeh header->obj_version = ver; 21904156d998SAlex Elder kfree(ondisk); 2191602adf40SYehuda Sadeh 21924156d998SAlex Elder return ret; 2193602adf40SYehuda Sadeh } 2194602adf40SYehuda Sadeh 219541f38c2bSAlex Elder static void rbd_remove_all_snaps(struct rbd_device *rbd_dev) 2196dfc5606dSYehuda Sadeh { 2197dfc5606dSYehuda Sadeh struct rbd_snap *snap; 2198a0593290SAlex Elder struct rbd_snap *next; 2199dfc5606dSYehuda Sadeh 2200a0593290SAlex Elder list_for_each_entry_safe(snap, next, &rbd_dev->snaps, node) 220141f38c2bSAlex Elder rbd_remove_snap_dev(snap); 2202dfc5606dSYehuda Sadeh } 2203dfc5606dSYehuda Sadeh 22049478554aSAlex Elder static void rbd_update_mapping_size(struct rbd_device *rbd_dev) 22059478554aSAlex Elder { 22069478554aSAlex Elder sector_t size; 22079478554aSAlex Elder 22080d7dbfceSAlex Elder if (rbd_dev->spec->snap_id != CEPH_NOSNAP) 22099478554aSAlex Elder return; 22109478554aSAlex Elder 22119478554aSAlex Elder size = (sector_t) rbd_dev->header.image_size / SECTOR_SIZE; 22129478554aSAlex Elder dout("setting size to %llu sectors", (unsigned long long) size); 22139478554aSAlex Elder rbd_dev->mapping.size = (u64) size; 22149478554aSAlex Elder set_capacity(rbd_dev->disk, size); 22159478554aSAlex Elder } 22169478554aSAlex Elder 2217602adf40SYehuda Sadeh /* 2218602adf40SYehuda Sadeh * only read the first part of the ondisk header, without the snaps info 2219602adf40SYehuda Sadeh */ 2220117973fbSAlex Elder static 
int rbd_dev_v1_refresh(struct rbd_device *rbd_dev, u64 *hver) 2221602adf40SYehuda Sadeh { 2222602adf40SYehuda Sadeh int ret; 2223602adf40SYehuda Sadeh struct rbd_image_header h; 2224602adf40SYehuda Sadeh 2225602adf40SYehuda Sadeh ret = rbd_read_header(rbd_dev, &h); 2226602adf40SYehuda Sadeh if (ret < 0) 2227602adf40SYehuda Sadeh return ret; 2228602adf40SYehuda Sadeh 2229a51aa0c0SJosh Durgin down_write(&rbd_dev->header_rwsem); 2230a51aa0c0SJosh Durgin 22319478554aSAlex Elder /* Update image size, and check for resize of mapped image */ 22329478554aSAlex Elder rbd_dev->header.image_size = h.image_size; 22339478554aSAlex Elder rbd_update_mapping_size(rbd_dev); 22349db4b3e3SSage Weil 2235849b4260SAlex Elder /* rbd_dev->header.object_prefix shouldn't change */ 2236602adf40SYehuda Sadeh kfree(rbd_dev->header.snap_sizes); 2237849b4260SAlex Elder kfree(rbd_dev->header.snap_names); 2238d1d25646SJosh Durgin /* osd requests may still refer to snapc */ 2239d1d25646SJosh Durgin ceph_put_snap_context(rbd_dev->header.snapc); 2240602adf40SYehuda Sadeh 2241b813623aSAlex Elder if (hver) 2242b813623aSAlex Elder *hver = h.obj_version; 2243a71b891bSJosh Durgin rbd_dev->header.obj_version = h.obj_version; 224493a24e08SJosh Durgin rbd_dev->header.image_size = h.image_size; 2245602adf40SYehuda Sadeh rbd_dev->header.snapc = h.snapc; 2246602adf40SYehuda Sadeh rbd_dev->header.snap_names = h.snap_names; 2247602adf40SYehuda Sadeh rbd_dev->header.snap_sizes = h.snap_sizes; 2248849b4260SAlex Elder /* Free the extra copy of the object prefix */ 2249849b4260SAlex Elder WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix)); 2250849b4260SAlex Elder kfree(h.object_prefix); 2251849b4260SAlex Elder 2252304f6808SAlex Elder ret = rbd_dev_snaps_update(rbd_dev); 2253304f6808SAlex Elder if (!ret) 2254304f6808SAlex Elder ret = rbd_dev_snaps_register(rbd_dev); 2255dfc5606dSYehuda Sadeh 2256c666601aSJosh Durgin up_write(&rbd_dev->header_rwsem); 2257602adf40SYehuda Sadeh 2258dfc5606dSYehuda Sadeh 
return ret; 2259602adf40SYehuda Sadeh } 2260602adf40SYehuda Sadeh 2261117973fbSAlex Elder static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver) 22621fe5e993SAlex Elder { 22631fe5e993SAlex Elder int ret; 22641fe5e993SAlex Elder 2265117973fbSAlex Elder rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); 22661fe5e993SAlex Elder mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); 2267117973fbSAlex Elder if (rbd_dev->image_format == 1) 2268117973fbSAlex Elder ret = rbd_dev_v1_refresh(rbd_dev, hver); 2269117973fbSAlex Elder else 2270117973fbSAlex Elder ret = rbd_dev_v2_refresh(rbd_dev, hver); 22711fe5e993SAlex Elder mutex_unlock(&ctl_mutex); 22721fe5e993SAlex Elder 22731fe5e993SAlex Elder return ret; 22741fe5e993SAlex Elder } 22751fe5e993SAlex Elder 2276602adf40SYehuda Sadeh static int rbd_init_disk(struct rbd_device *rbd_dev) 2277602adf40SYehuda Sadeh { 2278602adf40SYehuda Sadeh struct gendisk *disk; 2279602adf40SYehuda Sadeh struct request_queue *q; 2280593a9e7bSAlex Elder u64 segment_size; 2281602adf40SYehuda Sadeh 2282602adf40SYehuda Sadeh /* create gendisk info */ 2283602adf40SYehuda Sadeh disk = alloc_disk(RBD_MINORS_PER_MAJOR); 2284602adf40SYehuda Sadeh if (!disk) 22851fcdb8aaSAlex Elder return -ENOMEM; 2286602adf40SYehuda Sadeh 2287f0f8cef5SAlex Elder snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d", 2288de71a297SAlex Elder rbd_dev->dev_id); 2289602adf40SYehuda Sadeh disk->major = rbd_dev->major; 2290602adf40SYehuda Sadeh disk->first_minor = 0; 2291602adf40SYehuda Sadeh disk->fops = &rbd_bd_ops; 2292602adf40SYehuda Sadeh disk->private_data = rbd_dev; 2293602adf40SYehuda Sadeh 2294bf0d5f50SAlex Elder q = blk_init_queue(rbd_request_fn, &rbd_dev->lock); 2295602adf40SYehuda Sadeh if (!q) 2296602adf40SYehuda Sadeh goto out_disk; 2297029bcbd8SJosh Durgin 2298593a9e7bSAlex Elder /* We use the default size, but let's be explicit about it. 
*/ 2299593a9e7bSAlex Elder blk_queue_physical_block_size(q, SECTOR_SIZE); 2300593a9e7bSAlex Elder 2301029bcbd8SJosh Durgin /* set io sizes to object size */ 2302593a9e7bSAlex Elder segment_size = rbd_obj_bytes(&rbd_dev->header); 2303593a9e7bSAlex Elder blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE); 2304593a9e7bSAlex Elder blk_queue_max_segment_size(q, segment_size); 2305593a9e7bSAlex Elder blk_queue_io_min(q, segment_size); 2306593a9e7bSAlex Elder blk_queue_io_opt(q, segment_size); 2307029bcbd8SJosh Durgin 2308602adf40SYehuda Sadeh blk_queue_merge_bvec(q, rbd_merge_bvec); 2309602adf40SYehuda Sadeh disk->queue = q; 2310602adf40SYehuda Sadeh 2311602adf40SYehuda Sadeh q->queuedata = rbd_dev; 2312602adf40SYehuda Sadeh 2313602adf40SYehuda Sadeh rbd_dev->disk = disk; 2314602adf40SYehuda Sadeh 231512f02944SAlex Elder set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE); 231612f02944SAlex Elder 2317602adf40SYehuda Sadeh return 0; 2318602adf40SYehuda Sadeh out_disk: 2319602adf40SYehuda Sadeh put_disk(disk); 23201fcdb8aaSAlex Elder 23211fcdb8aaSAlex Elder return -ENOMEM; 2322602adf40SYehuda Sadeh } 2323602adf40SYehuda Sadeh 2324dfc5606dSYehuda Sadeh /* 2325dfc5606dSYehuda Sadeh sysfs 2326dfc5606dSYehuda Sadeh */ 2327602adf40SYehuda Sadeh 2328593a9e7bSAlex Elder static struct rbd_device *dev_to_rbd_dev(struct device *dev) 2329593a9e7bSAlex Elder { 2330593a9e7bSAlex Elder return container_of(dev, struct rbd_device, dev); 2331593a9e7bSAlex Elder } 2332593a9e7bSAlex Elder 2333dfc5606dSYehuda Sadeh static ssize_t rbd_size_show(struct device *dev, 2334dfc5606dSYehuda Sadeh struct device_attribute *attr, char *buf) 2335602adf40SYehuda Sadeh { 2336593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 2337a51aa0c0SJosh Durgin sector_t size; 2338dfc5606dSYehuda Sadeh 2339a51aa0c0SJosh Durgin down_read(&rbd_dev->header_rwsem); 2340a51aa0c0SJosh Durgin size = get_capacity(rbd_dev->disk); 2341a51aa0c0SJosh Durgin up_read(&rbd_dev->header_rwsem); 
2342a51aa0c0SJosh Durgin 2343a51aa0c0SJosh Durgin return sprintf(buf, "%llu\n", (unsigned long long) size * SECTOR_SIZE); 2344602adf40SYehuda Sadeh } 2345602adf40SYehuda Sadeh 234634b13184SAlex Elder /* 234734b13184SAlex Elder * Note this shows the features for whatever's mapped, which is not 234834b13184SAlex Elder * necessarily the base image. 234934b13184SAlex Elder */ 235034b13184SAlex Elder static ssize_t rbd_features_show(struct device *dev, 235134b13184SAlex Elder struct device_attribute *attr, char *buf) 235234b13184SAlex Elder { 235334b13184SAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 235434b13184SAlex Elder 235534b13184SAlex Elder return sprintf(buf, "0x%016llx\n", 235634b13184SAlex Elder (unsigned long long) rbd_dev->mapping.features); 235734b13184SAlex Elder } 235834b13184SAlex Elder 2359dfc5606dSYehuda Sadeh static ssize_t rbd_major_show(struct device *dev, 2360dfc5606dSYehuda Sadeh struct device_attribute *attr, char *buf) 2361602adf40SYehuda Sadeh { 2362593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 2363dfc5606dSYehuda Sadeh 2364dfc5606dSYehuda Sadeh return sprintf(buf, "%d\n", rbd_dev->major); 2365dfc5606dSYehuda Sadeh } 2366dfc5606dSYehuda Sadeh 2367dfc5606dSYehuda Sadeh static ssize_t rbd_client_id_show(struct device *dev, 2368dfc5606dSYehuda Sadeh struct device_attribute *attr, char *buf) 2369dfc5606dSYehuda Sadeh { 2370593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 2371dfc5606dSYehuda Sadeh 23721dbb4399SAlex Elder return sprintf(buf, "client%lld\n", 23731dbb4399SAlex Elder ceph_client_id(rbd_dev->rbd_client->client)); 2374dfc5606dSYehuda Sadeh } 2375dfc5606dSYehuda Sadeh 2376dfc5606dSYehuda Sadeh static ssize_t rbd_pool_show(struct device *dev, 2377dfc5606dSYehuda Sadeh struct device_attribute *attr, char *buf) 2378dfc5606dSYehuda Sadeh { 2379593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 2380dfc5606dSYehuda Sadeh 23810d7dbfceSAlex Elder return sprintf(buf, 
"%s\n", rbd_dev->spec->pool_name); 2382dfc5606dSYehuda Sadeh } 2383dfc5606dSYehuda Sadeh 23849bb2f334SAlex Elder static ssize_t rbd_pool_id_show(struct device *dev, 23859bb2f334SAlex Elder struct device_attribute *attr, char *buf) 23869bb2f334SAlex Elder { 23879bb2f334SAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 23889bb2f334SAlex Elder 23890d7dbfceSAlex Elder return sprintf(buf, "%llu\n", 23900d7dbfceSAlex Elder (unsigned long long) rbd_dev->spec->pool_id); 23919bb2f334SAlex Elder } 23929bb2f334SAlex Elder 2393dfc5606dSYehuda Sadeh static ssize_t rbd_name_show(struct device *dev, 2394dfc5606dSYehuda Sadeh struct device_attribute *attr, char *buf) 2395dfc5606dSYehuda Sadeh { 2396593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 2397dfc5606dSYehuda Sadeh 2398a92ffdf8SAlex Elder if (rbd_dev->spec->image_name) 23990d7dbfceSAlex Elder return sprintf(buf, "%s\n", rbd_dev->spec->image_name); 2400a92ffdf8SAlex Elder 2401a92ffdf8SAlex Elder return sprintf(buf, "(unknown)\n"); 2402dfc5606dSYehuda Sadeh } 2403dfc5606dSYehuda Sadeh 2404589d30e0SAlex Elder static ssize_t rbd_image_id_show(struct device *dev, 2405589d30e0SAlex Elder struct device_attribute *attr, char *buf) 2406589d30e0SAlex Elder { 2407589d30e0SAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 2408589d30e0SAlex Elder 24090d7dbfceSAlex Elder return sprintf(buf, "%s\n", rbd_dev->spec->image_id); 2410589d30e0SAlex Elder } 2411589d30e0SAlex Elder 241234b13184SAlex Elder /* 241334b13184SAlex Elder * Shows the name of the currently-mapped snapshot (or 241434b13184SAlex Elder * RBD_SNAP_HEAD_NAME for the base image). 
241534b13184SAlex Elder */ 2416dfc5606dSYehuda Sadeh static ssize_t rbd_snap_show(struct device *dev, 2417dfc5606dSYehuda Sadeh struct device_attribute *attr, 2418dfc5606dSYehuda Sadeh char *buf) 2419dfc5606dSYehuda Sadeh { 2420593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 2421dfc5606dSYehuda Sadeh 24220d7dbfceSAlex Elder return sprintf(buf, "%s\n", rbd_dev->spec->snap_name); 2423dfc5606dSYehuda Sadeh } 2424dfc5606dSYehuda Sadeh 242586b00e0dSAlex Elder /* 242686b00e0dSAlex Elder * For an rbd v2 image, shows the pool id, image id, and snapshot id 242786b00e0dSAlex Elder * for the parent image. If there is no parent, simply shows 242886b00e0dSAlex Elder * "(no parent image)". 242986b00e0dSAlex Elder */ 243086b00e0dSAlex Elder static ssize_t rbd_parent_show(struct device *dev, 243186b00e0dSAlex Elder struct device_attribute *attr, 243286b00e0dSAlex Elder char *buf) 243386b00e0dSAlex Elder { 243486b00e0dSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 243586b00e0dSAlex Elder struct rbd_spec *spec = rbd_dev->parent_spec; 243686b00e0dSAlex Elder int count; 243786b00e0dSAlex Elder char *bufp = buf; 243886b00e0dSAlex Elder 243986b00e0dSAlex Elder if (!spec) 244086b00e0dSAlex Elder return sprintf(buf, "(no parent image)\n"); 244186b00e0dSAlex Elder 244286b00e0dSAlex Elder count = sprintf(bufp, "pool_id %llu\npool_name %s\n", 244386b00e0dSAlex Elder (unsigned long long) spec->pool_id, spec->pool_name); 244486b00e0dSAlex Elder if (count < 0) 244586b00e0dSAlex Elder return count; 244686b00e0dSAlex Elder bufp += count; 244786b00e0dSAlex Elder 244886b00e0dSAlex Elder count = sprintf(bufp, "image_id %s\nimage_name %s\n", spec->image_id, 244986b00e0dSAlex Elder spec->image_name ? 
spec->image_name : "(unknown)"); 245086b00e0dSAlex Elder if (count < 0) 245186b00e0dSAlex Elder return count; 245286b00e0dSAlex Elder bufp += count; 245386b00e0dSAlex Elder 245486b00e0dSAlex Elder count = sprintf(bufp, "snap_id %llu\nsnap_name %s\n", 245586b00e0dSAlex Elder (unsigned long long) spec->snap_id, spec->snap_name); 245686b00e0dSAlex Elder if (count < 0) 245786b00e0dSAlex Elder return count; 245886b00e0dSAlex Elder bufp += count; 245986b00e0dSAlex Elder 246086b00e0dSAlex Elder count = sprintf(bufp, "overlap %llu\n", rbd_dev->parent_overlap); 246186b00e0dSAlex Elder if (count < 0) 246286b00e0dSAlex Elder return count; 246386b00e0dSAlex Elder bufp += count; 246486b00e0dSAlex Elder 246586b00e0dSAlex Elder return (ssize_t) (bufp - buf); 246686b00e0dSAlex Elder } 246786b00e0dSAlex Elder 2468dfc5606dSYehuda Sadeh static ssize_t rbd_image_refresh(struct device *dev, 2469dfc5606dSYehuda Sadeh struct device_attribute *attr, 2470dfc5606dSYehuda Sadeh const char *buf, 2471dfc5606dSYehuda Sadeh size_t size) 2472dfc5606dSYehuda Sadeh { 2473593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 2474b813623aSAlex Elder int ret; 2475602adf40SYehuda Sadeh 2476117973fbSAlex Elder ret = rbd_dev_refresh(rbd_dev, NULL); 2477b813623aSAlex Elder 2478b813623aSAlex Elder return ret < 0 ? 
ret : size; 2479dfc5606dSYehuda Sadeh } 2480602adf40SYehuda Sadeh 2481dfc5606dSYehuda Sadeh static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL); 248234b13184SAlex Elder static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL); 2483dfc5606dSYehuda Sadeh static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL); 2484dfc5606dSYehuda Sadeh static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL); 2485dfc5606dSYehuda Sadeh static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL); 24869bb2f334SAlex Elder static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL); 2487dfc5606dSYehuda Sadeh static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL); 2488589d30e0SAlex Elder static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL); 2489dfc5606dSYehuda Sadeh static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh); 2490dfc5606dSYehuda Sadeh static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL); 249186b00e0dSAlex Elder static DEVICE_ATTR(parent, S_IRUGO, rbd_parent_show, NULL); 2492dfc5606dSYehuda Sadeh 2493dfc5606dSYehuda Sadeh static struct attribute *rbd_attrs[] = { 2494dfc5606dSYehuda Sadeh &dev_attr_size.attr, 249534b13184SAlex Elder &dev_attr_features.attr, 2496dfc5606dSYehuda Sadeh &dev_attr_major.attr, 2497dfc5606dSYehuda Sadeh &dev_attr_client_id.attr, 2498dfc5606dSYehuda Sadeh &dev_attr_pool.attr, 24999bb2f334SAlex Elder &dev_attr_pool_id.attr, 2500dfc5606dSYehuda Sadeh &dev_attr_name.attr, 2501589d30e0SAlex Elder &dev_attr_image_id.attr, 2502dfc5606dSYehuda Sadeh &dev_attr_current_snap.attr, 250386b00e0dSAlex Elder &dev_attr_parent.attr, 2504dfc5606dSYehuda Sadeh &dev_attr_refresh.attr, 2505dfc5606dSYehuda Sadeh NULL 2506dfc5606dSYehuda Sadeh }; 2507dfc5606dSYehuda Sadeh 2508dfc5606dSYehuda Sadeh static struct attribute_group rbd_attr_group = { 2509dfc5606dSYehuda Sadeh .attrs = rbd_attrs, 2510dfc5606dSYehuda Sadeh }; 2511dfc5606dSYehuda Sadeh 2512dfc5606dSYehuda Sadeh static const struct attribute_group *rbd_attr_groups[] = { 
2513dfc5606dSYehuda Sadeh &rbd_attr_group, 2514dfc5606dSYehuda Sadeh NULL 2515dfc5606dSYehuda Sadeh }; 2516dfc5606dSYehuda Sadeh 2517dfc5606dSYehuda Sadeh static void rbd_sysfs_dev_release(struct device *dev) 2518dfc5606dSYehuda Sadeh { 2519dfc5606dSYehuda Sadeh } 2520dfc5606dSYehuda Sadeh 2521dfc5606dSYehuda Sadeh static struct device_type rbd_device_type = { 2522dfc5606dSYehuda Sadeh .name = "rbd", 2523dfc5606dSYehuda Sadeh .groups = rbd_attr_groups, 2524dfc5606dSYehuda Sadeh .release = rbd_sysfs_dev_release, 2525dfc5606dSYehuda Sadeh }; 2526dfc5606dSYehuda Sadeh 2527dfc5606dSYehuda Sadeh 2528dfc5606dSYehuda Sadeh /* 2529dfc5606dSYehuda Sadeh sysfs - snapshots 2530dfc5606dSYehuda Sadeh */ 2531dfc5606dSYehuda Sadeh 2532dfc5606dSYehuda Sadeh static ssize_t rbd_snap_size_show(struct device *dev, 2533dfc5606dSYehuda Sadeh struct device_attribute *attr, 2534dfc5606dSYehuda Sadeh char *buf) 2535dfc5606dSYehuda Sadeh { 2536dfc5606dSYehuda Sadeh struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev); 2537dfc5606dSYehuda Sadeh 25383591538fSJosh Durgin return sprintf(buf, "%llu\n", (unsigned long long)snap->size); 2539dfc5606dSYehuda Sadeh } 2540dfc5606dSYehuda Sadeh 2541dfc5606dSYehuda Sadeh static ssize_t rbd_snap_id_show(struct device *dev, 2542dfc5606dSYehuda Sadeh struct device_attribute *attr, 2543dfc5606dSYehuda Sadeh char *buf) 2544dfc5606dSYehuda Sadeh { 2545dfc5606dSYehuda Sadeh struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev); 2546dfc5606dSYehuda Sadeh 2547593a9e7bSAlex Elder return sprintf(buf, "%llu\n", (unsigned long long)snap->id); 2548dfc5606dSYehuda Sadeh } 2549dfc5606dSYehuda Sadeh 255034b13184SAlex Elder static ssize_t rbd_snap_features_show(struct device *dev, 255134b13184SAlex Elder struct device_attribute *attr, 255234b13184SAlex Elder char *buf) 255334b13184SAlex Elder { 255434b13184SAlex Elder struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev); 255534b13184SAlex Elder 255634b13184SAlex Elder return sprintf(buf, 
"0x%016llx\n", 255734b13184SAlex Elder (unsigned long long) snap->features); 255834b13184SAlex Elder } 255934b13184SAlex Elder 2560dfc5606dSYehuda Sadeh static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL); 2561dfc5606dSYehuda Sadeh static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL); 256234b13184SAlex Elder static DEVICE_ATTR(snap_features, S_IRUGO, rbd_snap_features_show, NULL); 2563dfc5606dSYehuda Sadeh 2564dfc5606dSYehuda Sadeh static struct attribute *rbd_snap_attrs[] = { 2565dfc5606dSYehuda Sadeh &dev_attr_snap_size.attr, 2566dfc5606dSYehuda Sadeh &dev_attr_snap_id.attr, 256734b13184SAlex Elder &dev_attr_snap_features.attr, 2568dfc5606dSYehuda Sadeh NULL, 2569dfc5606dSYehuda Sadeh }; 2570dfc5606dSYehuda Sadeh 2571dfc5606dSYehuda Sadeh static struct attribute_group rbd_snap_attr_group = { 2572dfc5606dSYehuda Sadeh .attrs = rbd_snap_attrs, 2573dfc5606dSYehuda Sadeh }; 2574dfc5606dSYehuda Sadeh 2575dfc5606dSYehuda Sadeh static void rbd_snap_dev_release(struct device *dev) 2576dfc5606dSYehuda Sadeh { 2577dfc5606dSYehuda Sadeh struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev); 2578dfc5606dSYehuda Sadeh kfree(snap->name); 2579dfc5606dSYehuda Sadeh kfree(snap); 2580dfc5606dSYehuda Sadeh } 2581dfc5606dSYehuda Sadeh 2582dfc5606dSYehuda Sadeh static const struct attribute_group *rbd_snap_attr_groups[] = { 2583dfc5606dSYehuda Sadeh &rbd_snap_attr_group, 2584dfc5606dSYehuda Sadeh NULL 2585dfc5606dSYehuda Sadeh }; 2586dfc5606dSYehuda Sadeh 2587dfc5606dSYehuda Sadeh static struct device_type rbd_snap_device_type = { 2588dfc5606dSYehuda Sadeh .groups = rbd_snap_attr_groups, 2589dfc5606dSYehuda Sadeh .release = rbd_snap_dev_release, 2590dfc5606dSYehuda Sadeh }; 2591dfc5606dSYehuda Sadeh 25928b8fb99cSAlex Elder static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec) 25938b8fb99cSAlex Elder { 25948b8fb99cSAlex Elder kref_get(&spec->kref); 25958b8fb99cSAlex Elder 25968b8fb99cSAlex Elder return spec; 25978b8fb99cSAlex Elder } 
25988b8fb99cSAlex Elder 25998b8fb99cSAlex Elder static void rbd_spec_free(struct kref *kref); 26008b8fb99cSAlex Elder static void rbd_spec_put(struct rbd_spec *spec) 26018b8fb99cSAlex Elder { 26028b8fb99cSAlex Elder if (spec) 26038b8fb99cSAlex Elder kref_put(&spec->kref, rbd_spec_free); 26048b8fb99cSAlex Elder } 26058b8fb99cSAlex Elder 26068b8fb99cSAlex Elder static struct rbd_spec *rbd_spec_alloc(void) 26078b8fb99cSAlex Elder { 26088b8fb99cSAlex Elder struct rbd_spec *spec; 26098b8fb99cSAlex Elder 26108b8fb99cSAlex Elder spec = kzalloc(sizeof (*spec), GFP_KERNEL); 26118b8fb99cSAlex Elder if (!spec) 26128b8fb99cSAlex Elder return NULL; 26138b8fb99cSAlex Elder kref_init(&spec->kref); 26148b8fb99cSAlex Elder 26158b8fb99cSAlex Elder rbd_spec_put(rbd_spec_get(spec)); /* TEMPORARY */ 26168b8fb99cSAlex Elder 26178b8fb99cSAlex Elder return spec; 26188b8fb99cSAlex Elder } 26198b8fb99cSAlex Elder 26208b8fb99cSAlex Elder static void rbd_spec_free(struct kref *kref) 26218b8fb99cSAlex Elder { 26228b8fb99cSAlex Elder struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref); 26238b8fb99cSAlex Elder 26248b8fb99cSAlex Elder kfree(spec->pool_name); 26258b8fb99cSAlex Elder kfree(spec->image_id); 26268b8fb99cSAlex Elder kfree(spec->image_name); 26278b8fb99cSAlex Elder kfree(spec->snap_name); 26288b8fb99cSAlex Elder kfree(spec); 26298b8fb99cSAlex Elder } 26308b8fb99cSAlex Elder 2631c53d5893SAlex Elder struct rbd_device *rbd_dev_create(struct rbd_client *rbdc, 2632c53d5893SAlex Elder struct rbd_spec *spec) 2633c53d5893SAlex Elder { 2634c53d5893SAlex Elder struct rbd_device *rbd_dev; 2635c53d5893SAlex Elder 2636c53d5893SAlex Elder rbd_dev = kzalloc(sizeof (*rbd_dev), GFP_KERNEL); 2637c53d5893SAlex Elder if (!rbd_dev) 2638c53d5893SAlex Elder return NULL; 2639c53d5893SAlex Elder 2640c53d5893SAlex Elder spin_lock_init(&rbd_dev->lock); 26416d292906SAlex Elder rbd_dev->flags = 0; 2642c53d5893SAlex Elder INIT_LIST_HEAD(&rbd_dev->node); 2643c53d5893SAlex Elder 
INIT_LIST_HEAD(&rbd_dev->snaps); 2644c53d5893SAlex Elder init_rwsem(&rbd_dev->header_rwsem); 2645c53d5893SAlex Elder 2646c53d5893SAlex Elder rbd_dev->spec = spec; 2647c53d5893SAlex Elder rbd_dev->rbd_client = rbdc; 2648c53d5893SAlex Elder 26490903e875SAlex Elder /* Initialize the layout used for all rbd requests */ 26500903e875SAlex Elder 26510903e875SAlex Elder rbd_dev->layout.fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER); 26520903e875SAlex Elder rbd_dev->layout.fl_stripe_count = cpu_to_le32(1); 26530903e875SAlex Elder rbd_dev->layout.fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER); 26540903e875SAlex Elder rbd_dev->layout.fl_pg_pool = cpu_to_le32((u32) spec->pool_id); 26550903e875SAlex Elder 2656c53d5893SAlex Elder return rbd_dev; 2657c53d5893SAlex Elder } 2658c53d5893SAlex Elder 2659c53d5893SAlex Elder static void rbd_dev_destroy(struct rbd_device *rbd_dev) 2660c53d5893SAlex Elder { 266186b00e0dSAlex Elder rbd_spec_put(rbd_dev->parent_spec); 2662c53d5893SAlex Elder kfree(rbd_dev->header_name); 2663c53d5893SAlex Elder rbd_put_client(rbd_dev->rbd_client); 2664c53d5893SAlex Elder rbd_spec_put(rbd_dev->spec); 2665c53d5893SAlex Elder kfree(rbd_dev); 2666c53d5893SAlex Elder } 2667c53d5893SAlex Elder 2668304f6808SAlex Elder static bool rbd_snap_registered(struct rbd_snap *snap) 2669304f6808SAlex Elder { 2670304f6808SAlex Elder bool ret = snap->dev.type == &rbd_snap_device_type; 2671304f6808SAlex Elder bool reg = device_is_registered(&snap->dev); 2672304f6808SAlex Elder 2673304f6808SAlex Elder rbd_assert(!ret ^ reg); 2674304f6808SAlex Elder 2675304f6808SAlex Elder return ret; 2676304f6808SAlex Elder } 2677304f6808SAlex Elder 267841f38c2bSAlex Elder static void rbd_remove_snap_dev(struct rbd_snap *snap) 2679dfc5606dSYehuda Sadeh { 2680dfc5606dSYehuda Sadeh list_del(&snap->node); 2681304f6808SAlex Elder if (device_is_registered(&snap->dev)) 2682dfc5606dSYehuda Sadeh device_unregister(&snap->dev); 2683dfc5606dSYehuda Sadeh } 2684dfc5606dSYehuda Sadeh 
268514e7085dSAlex Elder static int rbd_register_snap_dev(struct rbd_snap *snap, 2686dfc5606dSYehuda Sadeh struct device *parent) 2687dfc5606dSYehuda Sadeh { 2688dfc5606dSYehuda Sadeh struct device *dev = &snap->dev; 2689dfc5606dSYehuda Sadeh int ret; 2690dfc5606dSYehuda Sadeh 2691dfc5606dSYehuda Sadeh dev->type = &rbd_snap_device_type; 2692dfc5606dSYehuda Sadeh dev->parent = parent; 2693dfc5606dSYehuda Sadeh dev->release = rbd_snap_dev_release; 2694d4b125e9SAlex Elder dev_set_name(dev, "%s%s", RBD_SNAP_DEV_NAME_PREFIX, snap->name); 2695304f6808SAlex Elder dout("%s: registering device for snapshot %s\n", __func__, snap->name); 2696304f6808SAlex Elder 2697dfc5606dSYehuda Sadeh ret = device_register(dev); 2698dfc5606dSYehuda Sadeh 2699dfc5606dSYehuda Sadeh return ret; 2700dfc5606dSYehuda Sadeh } 2701dfc5606dSYehuda Sadeh 27024e891e0aSAlex Elder static struct rbd_snap *__rbd_add_snap_dev(struct rbd_device *rbd_dev, 2703c8d18425SAlex Elder const char *snap_name, 270434b13184SAlex Elder u64 snap_id, u64 snap_size, 270534b13184SAlex Elder u64 snap_features) 2706dfc5606dSYehuda Sadeh { 27074e891e0aSAlex Elder struct rbd_snap *snap; 2708dfc5606dSYehuda Sadeh int ret; 27094e891e0aSAlex Elder 27104e891e0aSAlex Elder snap = kzalloc(sizeof (*snap), GFP_KERNEL); 2711dfc5606dSYehuda Sadeh if (!snap) 27124e891e0aSAlex Elder return ERR_PTR(-ENOMEM); 27134e891e0aSAlex Elder 27144e891e0aSAlex Elder ret = -ENOMEM; 2715c8d18425SAlex Elder snap->name = kstrdup(snap_name, GFP_KERNEL); 27164e891e0aSAlex Elder if (!snap->name) 27174e891e0aSAlex Elder goto err; 27184e891e0aSAlex Elder 2719c8d18425SAlex Elder snap->id = snap_id; 2720c8d18425SAlex Elder snap->size = snap_size; 272134b13184SAlex Elder snap->features = snap_features; 27224e891e0aSAlex Elder 27234e891e0aSAlex Elder return snap; 27244e891e0aSAlex Elder 2725dfc5606dSYehuda Sadeh err: 2726dfc5606dSYehuda Sadeh kfree(snap->name); 2727dfc5606dSYehuda Sadeh kfree(snap); 27284e891e0aSAlex Elder 27294e891e0aSAlex Elder return 
ERR_PTR(ret); 2730dfc5606dSYehuda Sadeh } 2731dfc5606dSYehuda Sadeh 2732cd892126SAlex Elder static char *rbd_dev_v1_snap_info(struct rbd_device *rbd_dev, u32 which, 2733cd892126SAlex Elder u64 *snap_size, u64 *snap_features) 2734cd892126SAlex Elder { 2735cd892126SAlex Elder char *snap_name; 2736cd892126SAlex Elder 2737cd892126SAlex Elder rbd_assert(which < rbd_dev->header.snapc->num_snaps); 2738cd892126SAlex Elder 2739cd892126SAlex Elder *snap_size = rbd_dev->header.snap_sizes[which]; 2740cd892126SAlex Elder *snap_features = 0; /* No features for v1 */ 2741cd892126SAlex Elder 2742cd892126SAlex Elder /* Skip over names until we find the one we are looking for */ 2743cd892126SAlex Elder 2744cd892126SAlex Elder snap_name = rbd_dev->header.snap_names; 2745cd892126SAlex Elder while (which--) 2746cd892126SAlex Elder snap_name += strlen(snap_name) + 1; 2747cd892126SAlex Elder 2748cd892126SAlex Elder return snap_name; 2749cd892126SAlex Elder } 2750cd892126SAlex Elder 2751dfc5606dSYehuda Sadeh /* 27529d475de5SAlex Elder * Get the size and object order for an image snapshot, or if 27539d475de5SAlex Elder * snap_id is CEPH_NOSNAP, gets this information for the base 27549d475de5SAlex Elder * image. 
27559d475de5SAlex Elder */ 27569d475de5SAlex Elder static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id, 27579d475de5SAlex Elder u8 *order, u64 *snap_size) 27589d475de5SAlex Elder { 27599d475de5SAlex Elder __le64 snapid = cpu_to_le64(snap_id); 27609d475de5SAlex Elder int ret; 27619d475de5SAlex Elder struct { 27629d475de5SAlex Elder u8 order; 27639d475de5SAlex Elder __le64 size; 27649d475de5SAlex Elder } __attribute__ ((packed)) size_buf = { 0 }; 27659d475de5SAlex Elder 276636be9a76SAlex Elder ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, 27679d475de5SAlex Elder "rbd", "get_size", 27689d475de5SAlex Elder (char *) &snapid, sizeof (snapid), 276907b2391fSAlex Elder (char *) &size_buf, sizeof (size_buf), NULL); 277036be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 27719d475de5SAlex Elder if (ret < 0) 27729d475de5SAlex Elder return ret; 27739d475de5SAlex Elder 27749d475de5SAlex Elder *order = size_buf.order; 27759d475de5SAlex Elder *snap_size = le64_to_cpu(size_buf.size); 27769d475de5SAlex Elder 27779d475de5SAlex Elder dout(" snap_id 0x%016llx order = %u, snap_size = %llu\n", 27789d475de5SAlex Elder (unsigned long long) snap_id, (unsigned int) *order, 27799d475de5SAlex Elder (unsigned long long) *snap_size); 27809d475de5SAlex Elder 27819d475de5SAlex Elder return 0; 27829d475de5SAlex Elder } 27839d475de5SAlex Elder 27849d475de5SAlex Elder static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev) 27859d475de5SAlex Elder { 27869d475de5SAlex Elder return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP, 27879d475de5SAlex Elder &rbd_dev->header.obj_order, 27889d475de5SAlex Elder &rbd_dev->header.image_size); 27899d475de5SAlex Elder } 27909d475de5SAlex Elder 27911e130199SAlex Elder static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev) 27921e130199SAlex Elder { 27931e130199SAlex Elder void *reply_buf; 27941e130199SAlex Elder int ret; 27951e130199SAlex Elder void *p; 27961e130199SAlex Elder 27971e130199SAlex 
Elder reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL); 27981e130199SAlex Elder if (!reply_buf) 27991e130199SAlex Elder return -ENOMEM; 28001e130199SAlex Elder 280136be9a76SAlex Elder ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, 28021e130199SAlex Elder "rbd", "get_object_prefix", 28031e130199SAlex Elder NULL, 0, 280407b2391fSAlex Elder reply_buf, RBD_OBJ_PREFIX_LEN_MAX, NULL); 280536be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 28061e130199SAlex Elder if (ret < 0) 28071e130199SAlex Elder goto out; 28081e130199SAlex Elder 28091e130199SAlex Elder p = reply_buf; 28101e130199SAlex Elder rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p, 28111e130199SAlex Elder p + RBD_OBJ_PREFIX_LEN_MAX, 28121e130199SAlex Elder NULL, GFP_NOIO); 28131e130199SAlex Elder 28141e130199SAlex Elder if (IS_ERR(rbd_dev->header.object_prefix)) { 28151e130199SAlex Elder ret = PTR_ERR(rbd_dev->header.object_prefix); 28161e130199SAlex Elder rbd_dev->header.object_prefix = NULL; 28171e130199SAlex Elder } else { 28181e130199SAlex Elder dout(" object_prefix = %s\n", rbd_dev->header.object_prefix); 28191e130199SAlex Elder } 28201e130199SAlex Elder 28211e130199SAlex Elder out: 28221e130199SAlex Elder kfree(reply_buf); 28231e130199SAlex Elder 28241e130199SAlex Elder return ret; 28251e130199SAlex Elder } 28261e130199SAlex Elder 2827b1b5402aSAlex Elder static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id, 2828b1b5402aSAlex Elder u64 *snap_features) 2829b1b5402aSAlex Elder { 2830b1b5402aSAlex Elder __le64 snapid = cpu_to_le64(snap_id); 2831b1b5402aSAlex Elder struct { 2832b1b5402aSAlex Elder __le64 features; 2833b1b5402aSAlex Elder __le64 incompat; 2834b1b5402aSAlex Elder } features_buf = { 0 }; 2835d889140cSAlex Elder u64 incompat; 2836b1b5402aSAlex Elder int ret; 2837b1b5402aSAlex Elder 283836be9a76SAlex Elder ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, 2839b1b5402aSAlex Elder "rbd", "get_features", 
2840b1b5402aSAlex Elder (char *) &snapid, sizeof (snapid), 2841b1b5402aSAlex Elder (char *) &features_buf, sizeof (features_buf), 284207b2391fSAlex Elder NULL); 284336be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 2844b1b5402aSAlex Elder if (ret < 0) 2845b1b5402aSAlex Elder return ret; 2846d889140cSAlex Elder 2847d889140cSAlex Elder incompat = le64_to_cpu(features_buf.incompat); 2848d889140cSAlex Elder if (incompat & ~RBD_FEATURES_ALL) 2849b8f5c6edSAlex Elder return -ENXIO; 2850d889140cSAlex Elder 2851b1b5402aSAlex Elder *snap_features = le64_to_cpu(features_buf.features); 2852b1b5402aSAlex Elder 2853b1b5402aSAlex Elder dout(" snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n", 2854b1b5402aSAlex Elder (unsigned long long) snap_id, 2855b1b5402aSAlex Elder (unsigned long long) *snap_features, 2856b1b5402aSAlex Elder (unsigned long long) le64_to_cpu(features_buf.incompat)); 2857b1b5402aSAlex Elder 2858b1b5402aSAlex Elder return 0; 2859b1b5402aSAlex Elder } 2860b1b5402aSAlex Elder 2861b1b5402aSAlex Elder static int rbd_dev_v2_features(struct rbd_device *rbd_dev) 2862b1b5402aSAlex Elder { 2863b1b5402aSAlex Elder return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP, 2864b1b5402aSAlex Elder &rbd_dev->header.features); 2865b1b5402aSAlex Elder } 2866b1b5402aSAlex Elder 286786b00e0dSAlex Elder static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev) 286886b00e0dSAlex Elder { 286986b00e0dSAlex Elder struct rbd_spec *parent_spec; 287086b00e0dSAlex Elder size_t size; 287186b00e0dSAlex Elder void *reply_buf = NULL; 287286b00e0dSAlex Elder __le64 snapid; 287386b00e0dSAlex Elder void *p; 287486b00e0dSAlex Elder void *end; 287586b00e0dSAlex Elder char *image_id; 287686b00e0dSAlex Elder u64 overlap; 287786b00e0dSAlex Elder int ret; 287886b00e0dSAlex Elder 287986b00e0dSAlex Elder parent_spec = rbd_spec_alloc(); 288086b00e0dSAlex Elder if (!parent_spec) 288186b00e0dSAlex Elder return -ENOMEM; 288286b00e0dSAlex Elder 288386b00e0dSAlex 
Elder size = sizeof (__le64) + /* pool_id */ 288486b00e0dSAlex Elder sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX + /* image_id */ 288586b00e0dSAlex Elder sizeof (__le64) + /* snap_id */ 288686b00e0dSAlex Elder sizeof (__le64); /* overlap */ 288786b00e0dSAlex Elder reply_buf = kmalloc(size, GFP_KERNEL); 288886b00e0dSAlex Elder if (!reply_buf) { 288986b00e0dSAlex Elder ret = -ENOMEM; 289086b00e0dSAlex Elder goto out_err; 289186b00e0dSAlex Elder } 289286b00e0dSAlex Elder 289386b00e0dSAlex Elder snapid = cpu_to_le64(CEPH_NOSNAP); 289436be9a76SAlex Elder ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, 289586b00e0dSAlex Elder "rbd", "get_parent", 289686b00e0dSAlex Elder (char *) &snapid, sizeof (snapid), 289707b2391fSAlex Elder (char *) reply_buf, size, NULL); 289836be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 289986b00e0dSAlex Elder if (ret < 0) 290086b00e0dSAlex Elder goto out_err; 290186b00e0dSAlex Elder 290286b00e0dSAlex Elder ret = -ERANGE; 290386b00e0dSAlex Elder p = reply_buf; 290486b00e0dSAlex Elder end = (char *) reply_buf + size; 290586b00e0dSAlex Elder ceph_decode_64_safe(&p, end, parent_spec->pool_id, out_err); 290686b00e0dSAlex Elder if (parent_spec->pool_id == CEPH_NOPOOL) 290786b00e0dSAlex Elder goto out; /* No parent? No problem. 
*/ 290886b00e0dSAlex Elder 29090903e875SAlex Elder /* The ceph file layout needs to fit pool id in 32 bits */ 29100903e875SAlex Elder 29110903e875SAlex Elder ret = -EIO; 29120903e875SAlex Elder if (WARN_ON(parent_spec->pool_id > (u64) U32_MAX)) 29130903e875SAlex Elder goto out; 29140903e875SAlex Elder 2915979ed480SAlex Elder image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL); 291686b00e0dSAlex Elder if (IS_ERR(image_id)) { 291786b00e0dSAlex Elder ret = PTR_ERR(image_id); 291886b00e0dSAlex Elder goto out_err; 291986b00e0dSAlex Elder } 292086b00e0dSAlex Elder parent_spec->image_id = image_id; 292186b00e0dSAlex Elder ceph_decode_64_safe(&p, end, parent_spec->snap_id, out_err); 292286b00e0dSAlex Elder ceph_decode_64_safe(&p, end, overlap, out_err); 292386b00e0dSAlex Elder 292486b00e0dSAlex Elder rbd_dev->parent_overlap = overlap; 292586b00e0dSAlex Elder rbd_dev->parent_spec = parent_spec; 292686b00e0dSAlex Elder parent_spec = NULL; /* rbd_dev now owns this */ 292786b00e0dSAlex Elder out: 292886b00e0dSAlex Elder ret = 0; 292986b00e0dSAlex Elder out_err: 293086b00e0dSAlex Elder kfree(reply_buf); 293186b00e0dSAlex Elder rbd_spec_put(parent_spec); 293286b00e0dSAlex Elder 293386b00e0dSAlex Elder return ret; 293486b00e0dSAlex Elder } 293586b00e0dSAlex Elder 29369e15b77dSAlex Elder static char *rbd_dev_image_name(struct rbd_device *rbd_dev) 29379e15b77dSAlex Elder { 29389e15b77dSAlex Elder size_t image_id_size; 29399e15b77dSAlex Elder char *image_id; 29409e15b77dSAlex Elder void *p; 29419e15b77dSAlex Elder void *end; 29429e15b77dSAlex Elder size_t size; 29439e15b77dSAlex Elder void *reply_buf = NULL; 29449e15b77dSAlex Elder size_t len = 0; 29459e15b77dSAlex Elder char *image_name = NULL; 29469e15b77dSAlex Elder int ret; 29479e15b77dSAlex Elder 29489e15b77dSAlex Elder rbd_assert(!rbd_dev->spec->image_name); 29499e15b77dSAlex Elder 295069e7a02fSAlex Elder len = strlen(rbd_dev->spec->image_id); 295169e7a02fSAlex Elder image_id_size = sizeof (__le32) + len; 
29529e15b77dSAlex Elder image_id = kmalloc(image_id_size, GFP_KERNEL); 29539e15b77dSAlex Elder if (!image_id) 29549e15b77dSAlex Elder return NULL; 29559e15b77dSAlex Elder 29569e15b77dSAlex Elder p = image_id; 29579e15b77dSAlex Elder end = (char *) image_id + image_id_size; 295869e7a02fSAlex Elder ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32) len); 29599e15b77dSAlex Elder 29609e15b77dSAlex Elder size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX; 29619e15b77dSAlex Elder reply_buf = kmalloc(size, GFP_KERNEL); 29629e15b77dSAlex Elder if (!reply_buf) 29639e15b77dSAlex Elder goto out; 29649e15b77dSAlex Elder 296536be9a76SAlex Elder ret = rbd_obj_method_sync(rbd_dev, RBD_DIRECTORY, 29669e15b77dSAlex Elder "rbd", "dir_get_name", 29679e15b77dSAlex Elder image_id, image_id_size, 296807b2391fSAlex Elder (char *) reply_buf, size, NULL); 29699e15b77dSAlex Elder if (ret < 0) 29709e15b77dSAlex Elder goto out; 29719e15b77dSAlex Elder p = reply_buf; 29729e15b77dSAlex Elder end = (char *) reply_buf + size; 29739e15b77dSAlex Elder image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL); 29749e15b77dSAlex Elder if (IS_ERR(image_name)) 29759e15b77dSAlex Elder image_name = NULL; 29769e15b77dSAlex Elder else 29779e15b77dSAlex Elder dout("%s: name is %s len is %zd\n", __func__, image_name, len); 29789e15b77dSAlex Elder out: 29799e15b77dSAlex Elder kfree(reply_buf); 29809e15b77dSAlex Elder kfree(image_id); 29819e15b77dSAlex Elder 29829e15b77dSAlex Elder return image_name; 29839e15b77dSAlex Elder } 29849e15b77dSAlex Elder 29859e15b77dSAlex Elder /* 29869e15b77dSAlex Elder * When a parent image gets probed, we only have the pool, image, 29879e15b77dSAlex Elder * and snapshot ids but not the names of any of them. This call 29889e15b77dSAlex Elder * is made later to fill in those names. 
It has to be done after 29899e15b77dSAlex Elder * rbd_dev_snaps_update() has completed because some of the 29909e15b77dSAlex Elder * information (in particular, snapshot name) is not available 29919e15b77dSAlex Elder * until then. 29929e15b77dSAlex Elder */ 29939e15b77dSAlex Elder static int rbd_dev_probe_update_spec(struct rbd_device *rbd_dev) 29949e15b77dSAlex Elder { 29959e15b77dSAlex Elder struct ceph_osd_client *osdc; 29969e15b77dSAlex Elder const char *name; 29979e15b77dSAlex Elder void *reply_buf = NULL; 29989e15b77dSAlex Elder int ret; 29999e15b77dSAlex Elder 30009e15b77dSAlex Elder if (rbd_dev->spec->pool_name) 30019e15b77dSAlex Elder return 0; /* Already have the names */ 30029e15b77dSAlex Elder 30039e15b77dSAlex Elder /* Look up the pool name */ 30049e15b77dSAlex Elder 30059e15b77dSAlex Elder osdc = &rbd_dev->rbd_client->client->osdc; 30069e15b77dSAlex Elder name = ceph_pg_pool_name_by_id(osdc->osdmap, rbd_dev->spec->pool_id); 3007935dc89fSAlex Elder if (!name) { 3008935dc89fSAlex Elder rbd_warn(rbd_dev, "there is no pool with id %llu", 3009935dc89fSAlex Elder rbd_dev->spec->pool_id); /* Really a BUG() */ 3010935dc89fSAlex Elder return -EIO; 3011935dc89fSAlex Elder } 30129e15b77dSAlex Elder 30139e15b77dSAlex Elder rbd_dev->spec->pool_name = kstrdup(name, GFP_KERNEL); 30149e15b77dSAlex Elder if (!rbd_dev->spec->pool_name) 30159e15b77dSAlex Elder return -ENOMEM; 30169e15b77dSAlex Elder 30179e15b77dSAlex Elder /* Fetch the image name; tolerate failure here */ 30189e15b77dSAlex Elder 30199e15b77dSAlex Elder name = rbd_dev_image_name(rbd_dev); 302069e7a02fSAlex Elder if (name) 30219e15b77dSAlex Elder rbd_dev->spec->image_name = (char *) name; 302269e7a02fSAlex Elder else 302306ecc6cbSAlex Elder rbd_warn(rbd_dev, "unable to get image name"); 30249e15b77dSAlex Elder 30259e15b77dSAlex Elder /* Look up the snapshot name. 
*/ 30269e15b77dSAlex Elder 30279e15b77dSAlex Elder name = rbd_snap_name(rbd_dev, rbd_dev->spec->snap_id); 30289e15b77dSAlex Elder if (!name) { 3029935dc89fSAlex Elder rbd_warn(rbd_dev, "no snapshot with id %llu", 3030935dc89fSAlex Elder rbd_dev->spec->snap_id); /* Really a BUG() */ 30319e15b77dSAlex Elder ret = -EIO; 30329e15b77dSAlex Elder goto out_err; 30339e15b77dSAlex Elder } 30349e15b77dSAlex Elder rbd_dev->spec->snap_name = kstrdup(name, GFP_KERNEL); 30359e15b77dSAlex Elder if(!rbd_dev->spec->snap_name) 30369e15b77dSAlex Elder goto out_err; 30379e15b77dSAlex Elder 30389e15b77dSAlex Elder return 0; 30399e15b77dSAlex Elder out_err: 30409e15b77dSAlex Elder kfree(reply_buf); 30419e15b77dSAlex Elder kfree(rbd_dev->spec->pool_name); 30429e15b77dSAlex Elder rbd_dev->spec->pool_name = NULL; 30439e15b77dSAlex Elder 30449e15b77dSAlex Elder return ret; 30459e15b77dSAlex Elder } 30469e15b77dSAlex Elder 30476e14b1a6SAlex Elder static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev, u64 *ver) 304835d489f9SAlex Elder { 304935d489f9SAlex Elder size_t size; 305035d489f9SAlex Elder int ret; 305135d489f9SAlex Elder void *reply_buf; 305235d489f9SAlex Elder void *p; 305335d489f9SAlex Elder void *end; 305435d489f9SAlex Elder u64 seq; 305535d489f9SAlex Elder u32 snap_count; 305635d489f9SAlex Elder struct ceph_snap_context *snapc; 305735d489f9SAlex Elder u32 i; 305835d489f9SAlex Elder 305935d489f9SAlex Elder /* 306035d489f9SAlex Elder * We'll need room for the seq value (maximum snapshot id), 306135d489f9SAlex Elder * snapshot count, and array of that many snapshot ids. 306235d489f9SAlex Elder * For now we have a fixed upper limit on the number we're 306335d489f9SAlex Elder * prepared to receive. 
306435d489f9SAlex Elder */ 306535d489f9SAlex Elder size = sizeof (__le64) + sizeof (__le32) + 306635d489f9SAlex Elder RBD_MAX_SNAP_COUNT * sizeof (__le64); 306735d489f9SAlex Elder reply_buf = kzalloc(size, GFP_KERNEL); 306835d489f9SAlex Elder if (!reply_buf) 306935d489f9SAlex Elder return -ENOMEM; 307035d489f9SAlex Elder 307136be9a76SAlex Elder ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, 307235d489f9SAlex Elder "rbd", "get_snapcontext", 307335d489f9SAlex Elder NULL, 0, 307407b2391fSAlex Elder reply_buf, size, ver); 307536be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 307635d489f9SAlex Elder if (ret < 0) 307735d489f9SAlex Elder goto out; 307835d489f9SAlex Elder 307935d489f9SAlex Elder ret = -ERANGE; 308035d489f9SAlex Elder p = reply_buf; 308135d489f9SAlex Elder end = (char *) reply_buf + size; 308235d489f9SAlex Elder ceph_decode_64_safe(&p, end, seq, out); 308335d489f9SAlex Elder ceph_decode_32_safe(&p, end, snap_count, out); 308435d489f9SAlex Elder 308535d489f9SAlex Elder /* 308635d489f9SAlex Elder * Make sure the reported number of snapshot ids wouldn't go 308735d489f9SAlex Elder * beyond the end of our buffer. But before checking that, 308835d489f9SAlex Elder * make sure the computed size of the snapshot context we 308935d489f9SAlex Elder * allocate is representable in a size_t. 
309035d489f9SAlex Elder */ 309135d489f9SAlex Elder if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context)) 309235d489f9SAlex Elder / sizeof (u64)) { 309335d489f9SAlex Elder ret = -EINVAL; 309435d489f9SAlex Elder goto out; 309535d489f9SAlex Elder } 309635d489f9SAlex Elder if (!ceph_has_room(&p, end, snap_count * sizeof (__le64))) 309735d489f9SAlex Elder goto out; 309835d489f9SAlex Elder 309935d489f9SAlex Elder size = sizeof (struct ceph_snap_context) + 310035d489f9SAlex Elder snap_count * sizeof (snapc->snaps[0]); 310135d489f9SAlex Elder snapc = kmalloc(size, GFP_KERNEL); 310235d489f9SAlex Elder if (!snapc) { 310335d489f9SAlex Elder ret = -ENOMEM; 310435d489f9SAlex Elder goto out; 310535d489f9SAlex Elder } 310635d489f9SAlex Elder 310735d489f9SAlex Elder atomic_set(&snapc->nref, 1); 310835d489f9SAlex Elder snapc->seq = seq; 310935d489f9SAlex Elder snapc->num_snaps = snap_count; 311035d489f9SAlex Elder for (i = 0; i < snap_count; i++) 311135d489f9SAlex Elder snapc->snaps[i] = ceph_decode_64(&p); 311235d489f9SAlex Elder 311335d489f9SAlex Elder rbd_dev->header.snapc = snapc; 311435d489f9SAlex Elder 311535d489f9SAlex Elder dout(" snap context seq = %llu, snap_count = %u\n", 311635d489f9SAlex Elder (unsigned long long) seq, (unsigned int) snap_count); 311735d489f9SAlex Elder 311835d489f9SAlex Elder out: 311935d489f9SAlex Elder kfree(reply_buf); 312035d489f9SAlex Elder 312135d489f9SAlex Elder return 0; 312235d489f9SAlex Elder } 312335d489f9SAlex Elder 3124b8b1e2dbSAlex Elder static char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, u32 which) 3125b8b1e2dbSAlex Elder { 3126b8b1e2dbSAlex Elder size_t size; 3127b8b1e2dbSAlex Elder void *reply_buf; 3128b8b1e2dbSAlex Elder __le64 snap_id; 3129b8b1e2dbSAlex Elder int ret; 3130b8b1e2dbSAlex Elder void *p; 3131b8b1e2dbSAlex Elder void *end; 3132b8b1e2dbSAlex Elder char *snap_name; 3133b8b1e2dbSAlex Elder 3134b8b1e2dbSAlex Elder size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN; 3135b8b1e2dbSAlex Elder reply_buf = 
kmalloc(size, GFP_KERNEL); 3136b8b1e2dbSAlex Elder if (!reply_buf) 3137b8b1e2dbSAlex Elder return ERR_PTR(-ENOMEM); 3138b8b1e2dbSAlex Elder 3139b8b1e2dbSAlex Elder snap_id = cpu_to_le64(rbd_dev->header.snapc->snaps[which]); 314036be9a76SAlex Elder ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, 3141b8b1e2dbSAlex Elder "rbd", "get_snapshot_name", 3142b8b1e2dbSAlex Elder (char *) &snap_id, sizeof (snap_id), 314307b2391fSAlex Elder reply_buf, size, NULL); 314436be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 3145b8b1e2dbSAlex Elder if (ret < 0) 3146b8b1e2dbSAlex Elder goto out; 3147b8b1e2dbSAlex Elder 3148b8b1e2dbSAlex Elder p = reply_buf; 3149b8b1e2dbSAlex Elder end = (char *) reply_buf + size; 3150e5c35534SAlex Elder snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL); 3151b8b1e2dbSAlex Elder if (IS_ERR(snap_name)) { 3152b8b1e2dbSAlex Elder ret = PTR_ERR(snap_name); 3153b8b1e2dbSAlex Elder goto out; 3154b8b1e2dbSAlex Elder } else { 3155b8b1e2dbSAlex Elder dout(" snap_id 0x%016llx snap_name = %s\n", 3156b8b1e2dbSAlex Elder (unsigned long long) le64_to_cpu(snap_id), snap_name); 3157b8b1e2dbSAlex Elder } 3158b8b1e2dbSAlex Elder kfree(reply_buf); 3159b8b1e2dbSAlex Elder 3160b8b1e2dbSAlex Elder return snap_name; 3161b8b1e2dbSAlex Elder out: 3162b8b1e2dbSAlex Elder kfree(reply_buf); 3163b8b1e2dbSAlex Elder 3164b8b1e2dbSAlex Elder return ERR_PTR(ret); 3165b8b1e2dbSAlex Elder } 3166b8b1e2dbSAlex Elder 3167b8b1e2dbSAlex Elder static char *rbd_dev_v2_snap_info(struct rbd_device *rbd_dev, u32 which, 3168b8b1e2dbSAlex Elder u64 *snap_size, u64 *snap_features) 3169b8b1e2dbSAlex Elder { 3170e0b49868SAlex Elder u64 snap_id; 3171b8b1e2dbSAlex Elder u8 order; 3172b8b1e2dbSAlex Elder int ret; 3173b8b1e2dbSAlex Elder 3174b8b1e2dbSAlex Elder snap_id = rbd_dev->header.snapc->snaps[which]; 3175b8b1e2dbSAlex Elder ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, &order, snap_size); 3176b8b1e2dbSAlex Elder if (ret) 3177b8b1e2dbSAlex 
Elder return ERR_PTR(ret); 3178b8b1e2dbSAlex Elder ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, snap_features); 3179b8b1e2dbSAlex Elder if (ret) 3180b8b1e2dbSAlex Elder return ERR_PTR(ret); 3181b8b1e2dbSAlex Elder 3182b8b1e2dbSAlex Elder return rbd_dev_v2_snap_name(rbd_dev, which); 3183b8b1e2dbSAlex Elder } 3184b8b1e2dbSAlex Elder 3185b8b1e2dbSAlex Elder static char *rbd_dev_snap_info(struct rbd_device *rbd_dev, u32 which, 3186b8b1e2dbSAlex Elder u64 *snap_size, u64 *snap_features) 3187b8b1e2dbSAlex Elder { 3188b8b1e2dbSAlex Elder if (rbd_dev->image_format == 1) 3189b8b1e2dbSAlex Elder return rbd_dev_v1_snap_info(rbd_dev, which, 3190b8b1e2dbSAlex Elder snap_size, snap_features); 3191b8b1e2dbSAlex Elder if (rbd_dev->image_format == 2) 3192b8b1e2dbSAlex Elder return rbd_dev_v2_snap_info(rbd_dev, which, 3193b8b1e2dbSAlex Elder snap_size, snap_features); 3194b8b1e2dbSAlex Elder return ERR_PTR(-EINVAL); 3195b8b1e2dbSAlex Elder } 3196b8b1e2dbSAlex Elder 3197117973fbSAlex Elder static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver) 3198117973fbSAlex Elder { 3199117973fbSAlex Elder int ret; 3200117973fbSAlex Elder __u8 obj_order; 3201117973fbSAlex Elder 3202117973fbSAlex Elder down_write(&rbd_dev->header_rwsem); 3203117973fbSAlex Elder 3204117973fbSAlex Elder /* Grab old order first, to see if it changes */ 3205117973fbSAlex Elder 3206117973fbSAlex Elder obj_order = rbd_dev->header.obj_order, 3207117973fbSAlex Elder ret = rbd_dev_v2_image_size(rbd_dev); 3208117973fbSAlex Elder if (ret) 3209117973fbSAlex Elder goto out; 3210117973fbSAlex Elder if (rbd_dev->header.obj_order != obj_order) { 3211117973fbSAlex Elder ret = -EIO; 3212117973fbSAlex Elder goto out; 3213117973fbSAlex Elder } 3214117973fbSAlex Elder rbd_update_mapping_size(rbd_dev); 3215117973fbSAlex Elder 3216117973fbSAlex Elder ret = rbd_dev_v2_snap_context(rbd_dev, hver); 3217117973fbSAlex Elder dout("rbd_dev_v2_snap_context returned %d\n", ret); 3218117973fbSAlex Elder if (ret) 
3219117973fbSAlex Elder goto out; 3220117973fbSAlex Elder ret = rbd_dev_snaps_update(rbd_dev); 3221117973fbSAlex Elder dout("rbd_dev_snaps_update returned %d\n", ret); 3222117973fbSAlex Elder if (ret) 3223117973fbSAlex Elder goto out; 3224117973fbSAlex Elder ret = rbd_dev_snaps_register(rbd_dev); 3225117973fbSAlex Elder dout("rbd_dev_snaps_register returned %d\n", ret); 3226117973fbSAlex Elder out: 3227117973fbSAlex Elder up_write(&rbd_dev->header_rwsem); 3228117973fbSAlex Elder 3229117973fbSAlex Elder return ret; 3230117973fbSAlex Elder } 3231117973fbSAlex Elder 32329d475de5SAlex Elder /* 323335938150SAlex Elder * Scan the rbd device's current snapshot list and compare it to the 323435938150SAlex Elder * newly-received snapshot context. Remove any existing snapshots 323535938150SAlex Elder * not present in the new snapshot context. Add a new snapshot for 323635938150SAlex Elder * any snaphots in the snapshot context not in the current list. 323735938150SAlex Elder * And verify there are no changes to snapshots we already know 323835938150SAlex Elder * about. 323935938150SAlex Elder * 324035938150SAlex Elder * Assumes the snapshots in the snapshot context are sorted by 324135938150SAlex Elder * snapshot id, highest id first. (Snapshots in the rbd_dev's list 324235938150SAlex Elder * are also maintained in that order.) 
3243dfc5606dSYehuda Sadeh */ 3244304f6808SAlex Elder static int rbd_dev_snaps_update(struct rbd_device *rbd_dev) 3245dfc5606dSYehuda Sadeh { 324635938150SAlex Elder struct ceph_snap_context *snapc = rbd_dev->header.snapc; 324735938150SAlex Elder const u32 snap_count = snapc->num_snaps; 324835938150SAlex Elder struct list_head *head = &rbd_dev->snaps; 324935938150SAlex Elder struct list_head *links = head->next; 325035938150SAlex Elder u32 index = 0; 3251dfc5606dSYehuda Sadeh 32529fcbb800SAlex Elder dout("%s: snap count is %u\n", __func__, (unsigned int) snap_count); 325335938150SAlex Elder while (index < snap_count || links != head) { 325435938150SAlex Elder u64 snap_id; 325535938150SAlex Elder struct rbd_snap *snap; 3256cd892126SAlex Elder char *snap_name; 3257cd892126SAlex Elder u64 snap_size = 0; 3258cd892126SAlex Elder u64 snap_features = 0; 3259dfc5606dSYehuda Sadeh 326035938150SAlex Elder snap_id = index < snap_count ? snapc->snaps[index] 326135938150SAlex Elder : CEPH_NOSNAP; 326235938150SAlex Elder snap = links != head ? list_entry(links, struct rbd_snap, node) 326335938150SAlex Elder : NULL; 3264aafb230eSAlex Elder rbd_assert(!snap || snap->id != CEPH_NOSNAP); 3265dfc5606dSYehuda Sadeh 326635938150SAlex Elder if (snap_id == CEPH_NOSNAP || (snap && snap->id > snap_id)) { 326735938150SAlex Elder struct list_head *next = links->next; 3268dfc5606dSYehuda Sadeh 32696d292906SAlex Elder /* 32706d292906SAlex Elder * A previously-existing snapshot is not in 32716d292906SAlex Elder * the new snap context. 32726d292906SAlex Elder * 32736d292906SAlex Elder * If the now missing snapshot is the one the 32746d292906SAlex Elder * image is mapped to, clear its exists flag 32756d292906SAlex Elder * so we can avoid sending any more requests 32766d292906SAlex Elder * to it. 
32776d292906SAlex Elder */ 32780d7dbfceSAlex Elder if (rbd_dev->spec->snap_id == snap->id) 32796d292906SAlex Elder clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags); 328041f38c2bSAlex Elder rbd_remove_snap_dev(snap); 32819fcbb800SAlex Elder dout("%ssnap id %llu has been removed\n", 32820d7dbfceSAlex Elder rbd_dev->spec->snap_id == snap->id ? 32830d7dbfceSAlex Elder "mapped " : "", 32849fcbb800SAlex Elder (unsigned long long) snap->id); 3285dfc5606dSYehuda Sadeh 328635938150SAlex Elder /* Done with this list entry; advance */ 328735938150SAlex Elder 328835938150SAlex Elder links = next; 328935938150SAlex Elder continue; 3290dfc5606dSYehuda Sadeh } 329135938150SAlex Elder 3292b8b1e2dbSAlex Elder snap_name = rbd_dev_snap_info(rbd_dev, index, 3293cd892126SAlex Elder &snap_size, &snap_features); 3294cd892126SAlex Elder if (IS_ERR(snap_name)) 3295cd892126SAlex Elder return PTR_ERR(snap_name); 3296cd892126SAlex Elder 32979fcbb800SAlex Elder dout("entry %u: snap_id = %llu\n", (unsigned int) snap_count, 32989fcbb800SAlex Elder (unsigned long long) snap_id); 329935938150SAlex Elder if (!snap || (snap_id != CEPH_NOSNAP && snap->id < snap_id)) { 330035938150SAlex Elder struct rbd_snap *new_snap; 330135938150SAlex Elder 330235938150SAlex Elder /* We haven't seen this snapshot before */ 330335938150SAlex Elder 3304c8d18425SAlex Elder new_snap = __rbd_add_snap_dev(rbd_dev, snap_name, 3305cd892126SAlex Elder snap_id, snap_size, snap_features); 33069fcbb800SAlex Elder if (IS_ERR(new_snap)) { 33079fcbb800SAlex Elder int err = PTR_ERR(new_snap); 33089fcbb800SAlex Elder 33099fcbb800SAlex Elder dout(" failed to add dev, error %d\n", err); 33109fcbb800SAlex Elder 33119fcbb800SAlex Elder return err; 33129fcbb800SAlex Elder } 331335938150SAlex Elder 331435938150SAlex Elder /* New goes before existing, or at end of list */ 331535938150SAlex Elder 33169fcbb800SAlex Elder dout(" added dev%s\n", snap ? 
"" : " at end\n"); 331735938150SAlex Elder if (snap) 331835938150SAlex Elder list_add_tail(&new_snap->node, &snap->node); 331935938150SAlex Elder else 3320523f3258SAlex Elder list_add_tail(&new_snap->node, head); 332135938150SAlex Elder } else { 332235938150SAlex Elder /* Already have this one */ 332335938150SAlex Elder 33249fcbb800SAlex Elder dout(" already present\n"); 33259fcbb800SAlex Elder 3326cd892126SAlex Elder rbd_assert(snap->size == snap_size); 3327aafb230eSAlex Elder rbd_assert(!strcmp(snap->name, snap_name)); 3328cd892126SAlex Elder rbd_assert(snap->features == snap_features); 332935938150SAlex Elder 333035938150SAlex Elder /* Done with this list entry; advance */ 333135938150SAlex Elder 333235938150SAlex Elder links = links->next; 3333dfc5606dSYehuda Sadeh } 333435938150SAlex Elder 333535938150SAlex Elder /* Advance to the next entry in the snapshot context */ 333635938150SAlex Elder 333735938150SAlex Elder index++; 3338dfc5606dSYehuda Sadeh } 33399fcbb800SAlex Elder dout("%s: done\n", __func__); 3340dfc5606dSYehuda Sadeh 3341dfc5606dSYehuda Sadeh return 0; 3342dfc5606dSYehuda Sadeh } 3343dfc5606dSYehuda Sadeh 3344304f6808SAlex Elder /* 3345304f6808SAlex Elder * Scan the list of snapshots and register the devices for any that 3346304f6808SAlex Elder * have not already been registered. 
3347304f6808SAlex Elder */ 3348304f6808SAlex Elder static int rbd_dev_snaps_register(struct rbd_device *rbd_dev) 3349304f6808SAlex Elder { 3350304f6808SAlex Elder struct rbd_snap *snap; 3351304f6808SAlex Elder int ret = 0; 3352304f6808SAlex Elder 3353304f6808SAlex Elder dout("%s called\n", __func__); 335486ff77bbSAlex Elder if (WARN_ON(!device_is_registered(&rbd_dev->dev))) 335586ff77bbSAlex Elder return -EIO; 3356304f6808SAlex Elder 3357304f6808SAlex Elder list_for_each_entry(snap, &rbd_dev->snaps, node) { 3358304f6808SAlex Elder if (!rbd_snap_registered(snap)) { 3359304f6808SAlex Elder ret = rbd_register_snap_dev(snap, &rbd_dev->dev); 3360304f6808SAlex Elder if (ret < 0) 3361304f6808SAlex Elder break; 3362304f6808SAlex Elder } 3363304f6808SAlex Elder } 3364304f6808SAlex Elder dout("%s: returning %d\n", __func__, ret); 3365304f6808SAlex Elder 3366304f6808SAlex Elder return ret; 3367304f6808SAlex Elder } 3368304f6808SAlex Elder 3369dfc5606dSYehuda Sadeh static int rbd_bus_add_dev(struct rbd_device *rbd_dev) 3370dfc5606dSYehuda Sadeh { 3371dfc5606dSYehuda Sadeh struct device *dev; 3372cd789ab9SAlex Elder int ret; 3373dfc5606dSYehuda Sadeh 3374dfc5606dSYehuda Sadeh mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); 3375dfc5606dSYehuda Sadeh 3376cd789ab9SAlex Elder dev = &rbd_dev->dev; 3377dfc5606dSYehuda Sadeh dev->bus = &rbd_bus_type; 3378dfc5606dSYehuda Sadeh dev->type = &rbd_device_type; 3379dfc5606dSYehuda Sadeh dev->parent = &rbd_root_dev; 3380dfc5606dSYehuda Sadeh dev->release = rbd_dev_release; 3381de71a297SAlex Elder dev_set_name(dev, "%d", rbd_dev->dev_id); 3382dfc5606dSYehuda Sadeh ret = device_register(dev); 3383dfc5606dSYehuda Sadeh 3384dfc5606dSYehuda Sadeh mutex_unlock(&ctl_mutex); 3385cd789ab9SAlex Elder 3386dfc5606dSYehuda Sadeh return ret; 3387602adf40SYehuda Sadeh } 3388602adf40SYehuda Sadeh 3389dfc5606dSYehuda Sadeh static void rbd_bus_del_dev(struct rbd_device *rbd_dev) 3390dfc5606dSYehuda Sadeh { 3391dfc5606dSYehuda Sadeh 
device_unregister(&rbd_dev->dev); 3392dfc5606dSYehuda Sadeh } 3393dfc5606dSYehuda Sadeh 3394e2839308SAlex Elder static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0); 33951ddbe94eSAlex Elder 33961ddbe94eSAlex Elder /* 3397499afd5bSAlex Elder * Get a unique rbd identifier for the given new rbd_dev, and add 3398499afd5bSAlex Elder * the rbd_dev to the global list. The minimum rbd id is 1. 33991ddbe94eSAlex Elder */ 3400e2839308SAlex Elder static void rbd_dev_id_get(struct rbd_device *rbd_dev) 3401b7f23c36SAlex Elder { 3402e2839308SAlex Elder rbd_dev->dev_id = atomic64_inc_return(&rbd_dev_id_max); 3403499afd5bSAlex Elder 3404499afd5bSAlex Elder spin_lock(&rbd_dev_list_lock); 3405499afd5bSAlex Elder list_add_tail(&rbd_dev->node, &rbd_dev_list); 3406499afd5bSAlex Elder spin_unlock(&rbd_dev_list_lock); 3407e2839308SAlex Elder dout("rbd_dev %p given dev id %llu\n", rbd_dev, 3408e2839308SAlex Elder (unsigned long long) rbd_dev->dev_id); 3409b7f23c36SAlex Elder } 3410b7f23c36SAlex Elder 34111ddbe94eSAlex Elder /* 3412499afd5bSAlex Elder * Remove an rbd_dev from the global list, and record that its 3413499afd5bSAlex Elder * identifier is no longer in use. 34141ddbe94eSAlex Elder */ 3415e2839308SAlex Elder static void rbd_dev_id_put(struct rbd_device *rbd_dev) 34161ddbe94eSAlex Elder { 3417d184f6bfSAlex Elder struct list_head *tmp; 3418de71a297SAlex Elder int rbd_id = rbd_dev->dev_id; 3419d184f6bfSAlex Elder int max_id; 3420d184f6bfSAlex Elder 3421aafb230eSAlex Elder rbd_assert(rbd_id > 0); 3422499afd5bSAlex Elder 3423e2839308SAlex Elder dout("rbd_dev %p released dev id %llu\n", rbd_dev, 3424e2839308SAlex Elder (unsigned long long) rbd_dev->dev_id); 3425499afd5bSAlex Elder spin_lock(&rbd_dev_list_lock); 3426499afd5bSAlex Elder list_del_init(&rbd_dev->node); 3427d184f6bfSAlex Elder 3428d184f6bfSAlex Elder /* 3429d184f6bfSAlex Elder * If the id being "put" is not the current maximum, there 3430d184f6bfSAlex Elder * is nothing special we need to do. 
3431d184f6bfSAlex Elder */ 3432e2839308SAlex Elder if (rbd_id != atomic64_read(&rbd_dev_id_max)) { 3433d184f6bfSAlex Elder spin_unlock(&rbd_dev_list_lock); 3434d184f6bfSAlex Elder return; 3435d184f6bfSAlex Elder } 3436d184f6bfSAlex Elder 3437d184f6bfSAlex Elder /* 3438d184f6bfSAlex Elder * We need to update the current maximum id. Search the 3439d184f6bfSAlex Elder * list to find out what it is. We're more likely to find 3440d184f6bfSAlex Elder * the maximum at the end, so search the list backward. 3441d184f6bfSAlex Elder */ 3442d184f6bfSAlex Elder max_id = 0; 3443d184f6bfSAlex Elder list_for_each_prev(tmp, &rbd_dev_list) { 3444d184f6bfSAlex Elder struct rbd_device *rbd_dev; 3445d184f6bfSAlex Elder 3446d184f6bfSAlex Elder rbd_dev = list_entry(tmp, struct rbd_device, node); 3447b213e0b1SAlex Elder if (rbd_dev->dev_id > max_id) 3448b213e0b1SAlex Elder max_id = rbd_dev->dev_id; 3449d184f6bfSAlex Elder } 3450499afd5bSAlex Elder spin_unlock(&rbd_dev_list_lock); 34511ddbe94eSAlex Elder 34521ddbe94eSAlex Elder /* 3453e2839308SAlex Elder * The max id could have been updated by rbd_dev_id_get(), in 3454d184f6bfSAlex Elder * which case it now accurately reflects the new maximum. 3455d184f6bfSAlex Elder * Be careful not to overwrite the maximum value in that 3456d184f6bfSAlex Elder * case. 34571ddbe94eSAlex Elder */ 3458e2839308SAlex Elder atomic64_cmpxchg(&rbd_dev_id_max, rbd_id, max_id); 3459e2839308SAlex Elder dout(" max dev id has been reset\n"); 3460b7f23c36SAlex Elder } 3461b7f23c36SAlex Elder 3462a725f65eSAlex Elder /* 3463e28fff26SAlex Elder * Skips over white space at *buf, and updates *buf to point to the 3464e28fff26SAlex Elder * first found non-space character (if any). Returns the length of 3465593a9e7bSAlex Elder * the token (string of non-white space characters) found. Note 3466593a9e7bSAlex Elder * that *buf must be terminated with '\0'. 
/*
 * Advance *buf past any leading white space so that it points at the
 * start of the next token, or at the terminating '\0' if there is
 * none.
 *
 * buf: address of a pointer into a '\0'-terminated string; updated
 *      in place.
 *
 * Returns the number of non-white-space characters in the token that
 * starts at the updated *buf (0 if no token was found).  *buf is NOT
 * advanced past the token itself.
 */
static inline size_t next_token(const char **buf)
{
	/*
	 * These are the characters that produce nonzero for
	 * isspace() in the "C" and "POSIX" locales.
	 */
	static const char spaces[] = " \f\n\r\t\v";

	*buf += strspn(*buf, spaces);	/* Find start of token */

	return strcspn(*buf, spaces);	/* Return token length */
}

/*
 * Finds the next token in *buf, and if the provided token buffer is
 * big enough, copies the found token into it.  The result, if
 * copied, is guaranteed to be terminated with '\0'.  Note that *buf
 * must be terminated with '\0' on entry.
 *
 * Returns the length of the token found (not including the '\0').
 * Return value will be 0 if no token is found, and it will be >=
 * token_size if the token would not fit.
 *
 * The *buf pointer will be updated to point beyond the end of the
 * found token.  Note that this occurs even if the token buffer is
 * too small to hold it.
 */
/*
 * Locate the next white-space-delimited token in *buf and, when it
 * fits, copy it (with a terminating '\0') into the caller's buffer.
 *
 * buf:        address of a pointer into a '\0'-terminated string; on
 *             return it points just past the token, whether or not
 *             the token was copied.
 * token:      destination buffer.
 * token_size: size of the destination buffer in bytes.
 *
 * Returns the token length, not counting the '\0'.  A return value
 * >= token_size means the token did not fit and the destination was
 * left untouched; 0 means no token was found.
 */
static inline size_t copy_token(const char **buf,
				char *token,
				size_t token_size)
{
	size_t len;

	len = next_token(buf);
	if (len < token_size) {		/* leaves room for the '\0' */
		memcpy(token, *buf, len);
		*(token + len) = '\0';
	}
	*buf += len;

	return len;
}

/*
 * Finds the next token in *buf, dynamically allocates a buffer big
 * enough to hold a copy of it, and copies the token into the new
 * buffer.  The copy is guaranteed to be terminated with '\0'.  Note
 * that a duplicate buffer is created even for a zero-length token.
 *
 * Returns a pointer to the newly-allocated duplicate, or a null
 * pointer if memory for the duplicate was not available.  If
 * the lenp argument is a non-null pointer, the length of the token
 * (not including the '\0') is returned in *lenp.
 *
 * If successful, the *buf pointer will be updated to point beyond
 * the end of the found token.
 *
 * Note: uses GFP_KERNEL for allocation.
 */
3526ea3352f4SAlex Elder */ 3527ea3352f4SAlex Elder static inline char *dup_token(const char **buf, size_t *lenp) 3528ea3352f4SAlex Elder { 3529ea3352f4SAlex Elder char *dup; 3530ea3352f4SAlex Elder size_t len; 3531ea3352f4SAlex Elder 3532ea3352f4SAlex Elder len = next_token(buf); 35334caf35f9SAlex Elder dup = kmemdup(*buf, len + 1, GFP_KERNEL); 3534ea3352f4SAlex Elder if (!dup) 3535ea3352f4SAlex Elder return NULL; 3536ea3352f4SAlex Elder *(dup + len) = '\0'; 3537ea3352f4SAlex Elder *buf += len; 3538ea3352f4SAlex Elder 3539ea3352f4SAlex Elder if (lenp) 3540ea3352f4SAlex Elder *lenp = len; 3541ea3352f4SAlex Elder 3542ea3352f4SAlex Elder return dup; 3543ea3352f4SAlex Elder } 3544ea3352f4SAlex Elder 3545ea3352f4SAlex Elder /* 3546859c31dfSAlex Elder * Parse the options provided for an "rbd add" (i.e., rbd image 3547859c31dfSAlex Elder * mapping) request. These arrive via a write to /sys/bus/rbd/add, 3548859c31dfSAlex Elder * and the data written is passed here via a NUL-terminated buffer. 3549859c31dfSAlex Elder * Returns 0 if successful or an error code otherwise. 3550d22f76e7SAlex Elder * 3551859c31dfSAlex Elder * The information extracted from these options is recorded in 3552859c31dfSAlex Elder * the other parameters which return dynamically-allocated 3553859c31dfSAlex Elder * structures: 3554859c31dfSAlex Elder * ceph_opts 3555859c31dfSAlex Elder * The address of a pointer that will refer to a ceph options 3556859c31dfSAlex Elder * structure. Caller must release the returned pointer using 3557859c31dfSAlex Elder * ceph_destroy_options() when it is no longer needed. 3558859c31dfSAlex Elder * rbd_opts 3559859c31dfSAlex Elder * Address of an rbd options pointer. Fully initialized by 3560859c31dfSAlex Elder * this function; caller must release with kfree(). 3561859c31dfSAlex Elder * spec 3562859c31dfSAlex Elder * Address of an rbd image specification pointer. Fully 3563859c31dfSAlex Elder * initialized by this function based on parsed options. 
3564859c31dfSAlex Elder * Caller must release with rbd_spec_put(). 3565859c31dfSAlex Elder * 3566859c31dfSAlex Elder * The options passed take this form: 3567859c31dfSAlex Elder * <mon_addrs> <options> <pool_name> <image_name> [<snap_id>] 3568859c31dfSAlex Elder * where: 3569859c31dfSAlex Elder * <mon_addrs> 3570859c31dfSAlex Elder * A comma-separated list of one or more monitor addresses. 3571859c31dfSAlex Elder * A monitor address is an ip address, optionally followed 3572859c31dfSAlex Elder * by a port number (separated by a colon). 3573859c31dfSAlex Elder * I.e.: ip1[:port1][,ip2[:port2]...] 3574859c31dfSAlex Elder * <options> 3575859c31dfSAlex Elder * A comma-separated list of ceph and/or rbd options. 3576859c31dfSAlex Elder * <pool_name> 3577859c31dfSAlex Elder * The name of the rados pool containing the rbd image. 3578859c31dfSAlex Elder * <image_name> 3579859c31dfSAlex Elder * The name of the image in that pool to map. 3580859c31dfSAlex Elder * <snap_id> 3581859c31dfSAlex Elder * An optional snapshot id. If provided, the mapping will 3582859c31dfSAlex Elder * present data from the image at the time that snapshot was 3583859c31dfSAlex Elder * created. The image head is used if no snapshot id is 3584859c31dfSAlex Elder * provided. Snapshot mappings are always read-only. 
3585a725f65eSAlex Elder */ 3586859c31dfSAlex Elder static int rbd_add_parse_args(const char *buf, 3587dc79b113SAlex Elder struct ceph_options **ceph_opts, 3588859c31dfSAlex Elder struct rbd_options **opts, 3589859c31dfSAlex Elder struct rbd_spec **rbd_spec) 3590a725f65eSAlex Elder { 3591e28fff26SAlex Elder size_t len; 3592859c31dfSAlex Elder char *options; 35930ddebc0cSAlex Elder const char *mon_addrs; 35940ddebc0cSAlex Elder size_t mon_addrs_size; 3595859c31dfSAlex Elder struct rbd_spec *spec = NULL; 35964e9afebaSAlex Elder struct rbd_options *rbd_opts = NULL; 3597859c31dfSAlex Elder struct ceph_options *copts; 3598dc79b113SAlex Elder int ret; 3599e28fff26SAlex Elder 3600e28fff26SAlex Elder /* The first four tokens are required */ 3601e28fff26SAlex Elder 36027ef3214aSAlex Elder len = next_token(&buf); 36034fb5d671SAlex Elder if (!len) { 36044fb5d671SAlex Elder rbd_warn(NULL, "no monitor address(es) provided"); 36054fb5d671SAlex Elder return -EINVAL; 36064fb5d671SAlex Elder } 36070ddebc0cSAlex Elder mon_addrs = buf; 3608f28e565aSAlex Elder mon_addrs_size = len + 1; 36097ef3214aSAlex Elder buf += len; 3610a725f65eSAlex Elder 3611dc79b113SAlex Elder ret = -EINVAL; 3612f28e565aSAlex Elder options = dup_token(&buf, NULL); 3613f28e565aSAlex Elder if (!options) 3614dc79b113SAlex Elder return -ENOMEM; 36154fb5d671SAlex Elder if (!*options) { 36164fb5d671SAlex Elder rbd_warn(NULL, "no options provided"); 36174fb5d671SAlex Elder goto out_err; 36184fb5d671SAlex Elder } 3619a725f65eSAlex Elder 3620859c31dfSAlex Elder spec = rbd_spec_alloc(); 3621859c31dfSAlex Elder if (!spec) 3622f28e565aSAlex Elder goto out_mem; 3623859c31dfSAlex Elder 3624859c31dfSAlex Elder spec->pool_name = dup_token(&buf, NULL); 3625859c31dfSAlex Elder if (!spec->pool_name) 3626859c31dfSAlex Elder goto out_mem; 36274fb5d671SAlex Elder if (!*spec->pool_name) { 36284fb5d671SAlex Elder rbd_warn(NULL, "no pool name provided"); 36294fb5d671SAlex Elder goto out_err; 36304fb5d671SAlex Elder } 3631e28fff26SAlex 
Elder 363269e7a02fSAlex Elder spec->image_name = dup_token(&buf, NULL); 3633859c31dfSAlex Elder if (!spec->image_name) 3634f28e565aSAlex Elder goto out_mem; 36354fb5d671SAlex Elder if (!*spec->image_name) { 36364fb5d671SAlex Elder rbd_warn(NULL, "no image name provided"); 36374fb5d671SAlex Elder goto out_err; 36384fb5d671SAlex Elder } 3639e28fff26SAlex Elder 3640f28e565aSAlex Elder /* 3641f28e565aSAlex Elder * Snapshot name is optional; default is to use "-" 3642f28e565aSAlex Elder * (indicating the head/no snapshot). 3643f28e565aSAlex Elder */ 36443feeb894SAlex Elder len = next_token(&buf); 3645820a5f3eSAlex Elder if (!len) { 36463feeb894SAlex Elder buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */ 36473feeb894SAlex Elder len = sizeof (RBD_SNAP_HEAD_NAME) - 1; 3648f28e565aSAlex Elder } else if (len > RBD_MAX_SNAP_NAME_LEN) { 3649dc79b113SAlex Elder ret = -ENAMETOOLONG; 3650f28e565aSAlex Elder goto out_err; 3651849b4260SAlex Elder } 36524caf35f9SAlex Elder spec->snap_name = kmemdup(buf, len + 1, GFP_KERNEL); 3653859c31dfSAlex Elder if (!spec->snap_name) 3654f28e565aSAlex Elder goto out_mem; 3655859c31dfSAlex Elder *(spec->snap_name + len) = '\0'; 3656e5c35534SAlex Elder 36570ddebc0cSAlex Elder /* Initialize all rbd options to the defaults */ 3658e28fff26SAlex Elder 36594e9afebaSAlex Elder rbd_opts = kzalloc(sizeof (*rbd_opts), GFP_KERNEL); 36604e9afebaSAlex Elder if (!rbd_opts) 36614e9afebaSAlex Elder goto out_mem; 36624e9afebaSAlex Elder 36634e9afebaSAlex Elder rbd_opts->read_only = RBD_READ_ONLY_DEFAULT; 3664d22f76e7SAlex Elder 3665859c31dfSAlex Elder copts = ceph_parse_options(options, mon_addrs, 36660ddebc0cSAlex Elder mon_addrs + mon_addrs_size - 1, 36674e9afebaSAlex Elder parse_rbd_opts_token, rbd_opts); 3668859c31dfSAlex Elder if (IS_ERR(copts)) { 3669859c31dfSAlex Elder ret = PTR_ERR(copts); 3670dc79b113SAlex Elder goto out_err; 3671dc79b113SAlex Elder } 3672859c31dfSAlex Elder kfree(options); 3673859c31dfSAlex Elder 3674859c31dfSAlex Elder *ceph_opts = 
copts; 36754e9afebaSAlex Elder *opts = rbd_opts; 3676859c31dfSAlex Elder *rbd_spec = spec; 36770ddebc0cSAlex Elder 3678dc79b113SAlex Elder return 0; 3679f28e565aSAlex Elder out_mem: 3680dc79b113SAlex Elder ret = -ENOMEM; 3681d22f76e7SAlex Elder out_err: 3682859c31dfSAlex Elder kfree(rbd_opts); 3683859c31dfSAlex Elder rbd_spec_put(spec); 3684f28e565aSAlex Elder kfree(options); 3685d22f76e7SAlex Elder 3686dc79b113SAlex Elder return ret; 3687a725f65eSAlex Elder } 3688a725f65eSAlex Elder 3689589d30e0SAlex Elder /* 3690589d30e0SAlex Elder * An rbd format 2 image has a unique identifier, distinct from the 3691589d30e0SAlex Elder * name given to it by the user. Internally, that identifier is 3692589d30e0SAlex Elder * what's used to specify the names of objects related to the image. 3693589d30e0SAlex Elder * 3694589d30e0SAlex Elder * A special "rbd id" object is used to map an rbd image name to its 3695589d30e0SAlex Elder * id. If that object doesn't exist, then there is no v2 rbd image 3696589d30e0SAlex Elder * with the supplied name. 3697589d30e0SAlex Elder * 3698589d30e0SAlex Elder * This function will record the given rbd_dev's image_id field if 3699589d30e0SAlex Elder * it can be determined, and in that case will return 0. If any 3700589d30e0SAlex Elder * errors occur a negative errno will be returned and the rbd_dev's 3701589d30e0SAlex Elder * image_id field will be unchanged (and should be NULL). 3702589d30e0SAlex Elder */ 3703589d30e0SAlex Elder static int rbd_dev_image_id(struct rbd_device *rbd_dev) 3704589d30e0SAlex Elder { 3705589d30e0SAlex Elder int ret; 3706589d30e0SAlex Elder size_t size; 3707589d30e0SAlex Elder char *object_name; 3708589d30e0SAlex Elder void *response; 3709589d30e0SAlex Elder void *p; 3710589d30e0SAlex Elder 3711589d30e0SAlex Elder /* 37122c0d0a10SAlex Elder * When probing a parent image, the image id is already 37132c0d0a10SAlex Elder * known (and the image name likely is not). 
There's no 37142c0d0a10SAlex Elder * need to fetch the image id again in this case. 37152c0d0a10SAlex Elder */ 37162c0d0a10SAlex Elder if (rbd_dev->spec->image_id) 37172c0d0a10SAlex Elder return 0; 37182c0d0a10SAlex Elder 37192c0d0a10SAlex Elder /* 3720589d30e0SAlex Elder * First, see if the format 2 image id file exists, and if 3721589d30e0SAlex Elder * so, get the image's persistent id from it. 3722589d30e0SAlex Elder */ 372369e7a02fSAlex Elder size = sizeof (RBD_ID_PREFIX) + strlen(rbd_dev->spec->image_name); 3724589d30e0SAlex Elder object_name = kmalloc(size, GFP_NOIO); 3725589d30e0SAlex Elder if (!object_name) 3726589d30e0SAlex Elder return -ENOMEM; 37270d7dbfceSAlex Elder sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->spec->image_name); 3728589d30e0SAlex Elder dout("rbd id object name is %s\n", object_name); 3729589d30e0SAlex Elder 3730589d30e0SAlex Elder /* Response will be an encoded string, which includes a length */ 3731589d30e0SAlex Elder 3732589d30e0SAlex Elder size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX; 3733589d30e0SAlex Elder response = kzalloc(size, GFP_NOIO); 3734589d30e0SAlex Elder if (!response) { 3735589d30e0SAlex Elder ret = -ENOMEM; 3736589d30e0SAlex Elder goto out; 3737589d30e0SAlex Elder } 3738589d30e0SAlex Elder 373936be9a76SAlex Elder ret = rbd_obj_method_sync(rbd_dev, object_name, 3740589d30e0SAlex Elder "rbd", "get_id", 3741589d30e0SAlex Elder NULL, 0, 374207b2391fSAlex Elder response, RBD_IMAGE_ID_LEN_MAX, NULL); 374336be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 3744589d30e0SAlex Elder if (ret < 0) 3745589d30e0SAlex Elder goto out; 3746589d30e0SAlex Elder 3747589d30e0SAlex Elder p = response; 37480d7dbfceSAlex Elder rbd_dev->spec->image_id = ceph_extract_encoded_string(&p, 3749589d30e0SAlex Elder p + RBD_IMAGE_ID_LEN_MAX, 3750979ed480SAlex Elder NULL, GFP_NOIO); 37510d7dbfceSAlex Elder if (IS_ERR(rbd_dev->spec->image_id)) { 37520d7dbfceSAlex Elder ret = PTR_ERR(rbd_dev->spec->image_id); 
37530d7dbfceSAlex Elder rbd_dev->spec->image_id = NULL; 3754589d30e0SAlex Elder } else { 37550d7dbfceSAlex Elder dout("image_id is %s\n", rbd_dev->spec->image_id); 3756589d30e0SAlex Elder } 3757589d30e0SAlex Elder out: 3758589d30e0SAlex Elder kfree(response); 3759589d30e0SAlex Elder kfree(object_name); 3760589d30e0SAlex Elder 3761589d30e0SAlex Elder return ret; 3762589d30e0SAlex Elder } 3763589d30e0SAlex Elder 3764a30b71b9SAlex Elder static int rbd_dev_v1_probe(struct rbd_device *rbd_dev) 3765a30b71b9SAlex Elder { 3766a30b71b9SAlex Elder int ret; 3767a30b71b9SAlex Elder size_t size; 3768a30b71b9SAlex Elder 3769a30b71b9SAlex Elder /* Version 1 images have no id; empty string is used */ 3770a30b71b9SAlex Elder 37710d7dbfceSAlex Elder rbd_dev->spec->image_id = kstrdup("", GFP_KERNEL); 37720d7dbfceSAlex Elder if (!rbd_dev->spec->image_id) 3773a30b71b9SAlex Elder return -ENOMEM; 3774a30b71b9SAlex Elder 3775a30b71b9SAlex Elder /* Record the header object name for this rbd image. */ 3776a30b71b9SAlex Elder 377769e7a02fSAlex Elder size = strlen(rbd_dev->spec->image_name) + sizeof (RBD_SUFFIX); 3778a30b71b9SAlex Elder rbd_dev->header_name = kmalloc(size, GFP_KERNEL); 3779a30b71b9SAlex Elder if (!rbd_dev->header_name) { 3780a30b71b9SAlex Elder ret = -ENOMEM; 3781a30b71b9SAlex Elder goto out_err; 3782a30b71b9SAlex Elder } 37830d7dbfceSAlex Elder sprintf(rbd_dev->header_name, "%s%s", 37840d7dbfceSAlex Elder rbd_dev->spec->image_name, RBD_SUFFIX); 3785a30b71b9SAlex Elder 3786a30b71b9SAlex Elder /* Populate rbd image metadata */ 3787a30b71b9SAlex Elder 3788a30b71b9SAlex Elder ret = rbd_read_header(rbd_dev, &rbd_dev->header); 3789a30b71b9SAlex Elder if (ret < 0) 3790a30b71b9SAlex Elder goto out_err; 379186b00e0dSAlex Elder 379286b00e0dSAlex Elder /* Version 1 images have no parent (no layering) */ 379386b00e0dSAlex Elder 379486b00e0dSAlex Elder rbd_dev->parent_spec = NULL; 379586b00e0dSAlex Elder rbd_dev->parent_overlap = 0; 379686b00e0dSAlex Elder 3797a30b71b9SAlex Elder 
rbd_dev->image_format = 1; 3798a30b71b9SAlex Elder 3799a30b71b9SAlex Elder dout("discovered version 1 image, header name is %s\n", 3800a30b71b9SAlex Elder rbd_dev->header_name); 3801a30b71b9SAlex Elder 3802a30b71b9SAlex Elder return 0; 3803a30b71b9SAlex Elder 3804a30b71b9SAlex Elder out_err: 3805a30b71b9SAlex Elder kfree(rbd_dev->header_name); 3806a30b71b9SAlex Elder rbd_dev->header_name = NULL; 38070d7dbfceSAlex Elder kfree(rbd_dev->spec->image_id); 38080d7dbfceSAlex Elder rbd_dev->spec->image_id = NULL; 3809a30b71b9SAlex Elder 3810a30b71b9SAlex Elder return ret; 3811a30b71b9SAlex Elder } 3812a30b71b9SAlex Elder 3813a30b71b9SAlex Elder static int rbd_dev_v2_probe(struct rbd_device *rbd_dev) 3814a30b71b9SAlex Elder { 3815a30b71b9SAlex Elder size_t size; 38169d475de5SAlex Elder int ret; 38176e14b1a6SAlex Elder u64 ver = 0; 3818a30b71b9SAlex Elder 3819a30b71b9SAlex Elder /* 3820a30b71b9SAlex Elder * Image id was filled in by the caller. Record the header 3821a30b71b9SAlex Elder * object name for this rbd image. 3822a30b71b9SAlex Elder */ 3823979ed480SAlex Elder size = sizeof (RBD_HEADER_PREFIX) + strlen(rbd_dev->spec->image_id); 3824a30b71b9SAlex Elder rbd_dev->header_name = kmalloc(size, GFP_KERNEL); 3825a30b71b9SAlex Elder if (!rbd_dev->header_name) 3826a30b71b9SAlex Elder return -ENOMEM; 3827a30b71b9SAlex Elder sprintf(rbd_dev->header_name, "%s%s", 38280d7dbfceSAlex Elder RBD_HEADER_PREFIX, rbd_dev->spec->image_id); 38299d475de5SAlex Elder 38309d475de5SAlex Elder /* Get the size and object order for the image */ 38319d475de5SAlex Elder 38329d475de5SAlex Elder ret = rbd_dev_v2_image_size(rbd_dev); 38339d475de5SAlex Elder if (ret < 0) 38349d475de5SAlex Elder goto out_err; 38351e130199SAlex Elder 38361e130199SAlex Elder /* Get the object prefix (a.k.a. 
block_name) for the image */ 38371e130199SAlex Elder 38381e130199SAlex Elder ret = rbd_dev_v2_object_prefix(rbd_dev); 38391e130199SAlex Elder if (ret < 0) 38401e130199SAlex Elder goto out_err; 3841b1b5402aSAlex Elder 3842d889140cSAlex Elder /* Get the and check features for the image */ 3843b1b5402aSAlex Elder 3844b1b5402aSAlex Elder ret = rbd_dev_v2_features(rbd_dev); 3845b1b5402aSAlex Elder if (ret < 0) 3846b1b5402aSAlex Elder goto out_err; 384735d489f9SAlex Elder 384886b00e0dSAlex Elder /* If the image supports layering, get the parent info */ 384986b00e0dSAlex Elder 385086b00e0dSAlex Elder if (rbd_dev->header.features & RBD_FEATURE_LAYERING) { 385186b00e0dSAlex Elder ret = rbd_dev_v2_parent_info(rbd_dev); 385286b00e0dSAlex Elder if (ret < 0) 385386b00e0dSAlex Elder goto out_err; 385486b00e0dSAlex Elder } 385586b00e0dSAlex Elder 38566e14b1a6SAlex Elder /* crypto and compression type aren't (yet) supported for v2 images */ 385735d489f9SAlex Elder 38586e14b1a6SAlex Elder rbd_dev->header.crypt_type = 0; 38596e14b1a6SAlex Elder rbd_dev->header.comp_type = 0; 38606e14b1a6SAlex Elder 38616e14b1a6SAlex Elder /* Get the snapshot context, plus the header version */ 38626e14b1a6SAlex Elder 38636e14b1a6SAlex Elder ret = rbd_dev_v2_snap_context(rbd_dev, &ver); 386435d489f9SAlex Elder if (ret) 386535d489f9SAlex Elder goto out_err; 38666e14b1a6SAlex Elder rbd_dev->header.obj_version = ver; 38676e14b1a6SAlex Elder 3868a30b71b9SAlex Elder rbd_dev->image_format = 2; 3869a30b71b9SAlex Elder 3870a30b71b9SAlex Elder dout("discovered version 2 image, header name is %s\n", 3871a30b71b9SAlex Elder rbd_dev->header_name); 3872a30b71b9SAlex Elder 387335152979SAlex Elder return 0; 38749d475de5SAlex Elder out_err: 387586b00e0dSAlex Elder rbd_dev->parent_overlap = 0; 387686b00e0dSAlex Elder rbd_spec_put(rbd_dev->parent_spec); 387786b00e0dSAlex Elder rbd_dev->parent_spec = NULL; 38789d475de5SAlex Elder kfree(rbd_dev->header_name); 38799d475de5SAlex Elder rbd_dev->header_name = NULL; 
38801e130199SAlex Elder kfree(rbd_dev->header.object_prefix); 38811e130199SAlex Elder rbd_dev->header.object_prefix = NULL; 38829d475de5SAlex Elder 38839d475de5SAlex Elder return ret; 3884a30b71b9SAlex Elder } 3885a30b71b9SAlex Elder 388683a06263SAlex Elder static int rbd_dev_probe_finish(struct rbd_device *rbd_dev) 388783a06263SAlex Elder { 388883a06263SAlex Elder int ret; 388983a06263SAlex Elder 389083a06263SAlex Elder /* no need to lock here, as rbd_dev is not registered yet */ 389183a06263SAlex Elder ret = rbd_dev_snaps_update(rbd_dev); 389283a06263SAlex Elder if (ret) 389383a06263SAlex Elder return ret; 389483a06263SAlex Elder 38959e15b77dSAlex Elder ret = rbd_dev_probe_update_spec(rbd_dev); 38969e15b77dSAlex Elder if (ret) 38979e15b77dSAlex Elder goto err_out_snaps; 38989e15b77dSAlex Elder 389983a06263SAlex Elder ret = rbd_dev_set_mapping(rbd_dev); 390083a06263SAlex Elder if (ret) 390183a06263SAlex Elder goto err_out_snaps; 390283a06263SAlex Elder 390383a06263SAlex Elder /* generate unique id: find highest unique id, add one */ 390483a06263SAlex Elder rbd_dev_id_get(rbd_dev); 390583a06263SAlex Elder 390683a06263SAlex Elder /* Fill in the device name, now that we have its id. */ 390783a06263SAlex Elder BUILD_BUG_ON(DEV_NAME_LEN 390883a06263SAlex Elder < sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH); 390983a06263SAlex Elder sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id); 391083a06263SAlex Elder 391183a06263SAlex Elder /* Get our block major device number. */ 391283a06263SAlex Elder 391383a06263SAlex Elder ret = register_blkdev(0, rbd_dev->name); 391483a06263SAlex Elder if (ret < 0) 391583a06263SAlex Elder goto err_out_id; 391683a06263SAlex Elder rbd_dev->major = ret; 391783a06263SAlex Elder 391883a06263SAlex Elder /* Set up the blkdev mapping. 
*/ 391983a06263SAlex Elder 392083a06263SAlex Elder ret = rbd_init_disk(rbd_dev); 392183a06263SAlex Elder if (ret) 392283a06263SAlex Elder goto err_out_blkdev; 392383a06263SAlex Elder 392483a06263SAlex Elder ret = rbd_bus_add_dev(rbd_dev); 392583a06263SAlex Elder if (ret) 392683a06263SAlex Elder goto err_out_disk; 392783a06263SAlex Elder 392883a06263SAlex Elder /* 392983a06263SAlex Elder * At this point cleanup in the event of an error is the job 393083a06263SAlex Elder * of the sysfs code (initiated by rbd_bus_del_dev()). 393183a06263SAlex Elder */ 393283a06263SAlex Elder down_write(&rbd_dev->header_rwsem); 393383a06263SAlex Elder ret = rbd_dev_snaps_register(rbd_dev); 393483a06263SAlex Elder up_write(&rbd_dev->header_rwsem); 393583a06263SAlex Elder if (ret) 393683a06263SAlex Elder goto err_out_bus; 393783a06263SAlex Elder 39389969ebc5SAlex Elder ret = rbd_dev_header_watch_sync(rbd_dev, 1); 393983a06263SAlex Elder if (ret) 394083a06263SAlex Elder goto err_out_bus; 394183a06263SAlex Elder 394283a06263SAlex Elder /* Everything's ready. Announce the disk to the world. 
*/ 394383a06263SAlex Elder 394483a06263SAlex Elder add_disk(rbd_dev->disk); 394583a06263SAlex Elder 394683a06263SAlex Elder pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name, 394783a06263SAlex Elder (unsigned long long) rbd_dev->mapping.size); 394883a06263SAlex Elder 394983a06263SAlex Elder return ret; 395083a06263SAlex Elder err_out_bus: 395183a06263SAlex Elder /* this will also clean up rest of rbd_dev stuff */ 395283a06263SAlex Elder 395383a06263SAlex Elder rbd_bus_del_dev(rbd_dev); 395483a06263SAlex Elder 395583a06263SAlex Elder return ret; 395683a06263SAlex Elder err_out_disk: 395783a06263SAlex Elder rbd_free_disk(rbd_dev); 395883a06263SAlex Elder err_out_blkdev: 395983a06263SAlex Elder unregister_blkdev(rbd_dev->major, rbd_dev->name); 396083a06263SAlex Elder err_out_id: 396183a06263SAlex Elder rbd_dev_id_put(rbd_dev); 396283a06263SAlex Elder err_out_snaps: 396383a06263SAlex Elder rbd_remove_all_snaps(rbd_dev); 396483a06263SAlex Elder 396583a06263SAlex Elder return ret; 396683a06263SAlex Elder } 396783a06263SAlex Elder 3968a30b71b9SAlex Elder /* 3969a30b71b9SAlex Elder * Probe for the existence of the header object for the given rbd 3970a30b71b9SAlex Elder * device. For format 2 images this includes determining the image 3971a30b71b9SAlex Elder * id. 3972a30b71b9SAlex Elder */ 3973a30b71b9SAlex Elder static int rbd_dev_probe(struct rbd_device *rbd_dev) 3974a30b71b9SAlex Elder { 3975a30b71b9SAlex Elder int ret; 3976a30b71b9SAlex Elder 3977a30b71b9SAlex Elder /* 3978a30b71b9SAlex Elder * Get the id from the image id object. If it's not a 3979a30b71b9SAlex Elder * format 2 image, we'll get ENOENT back, and we'll assume 3980a30b71b9SAlex Elder * it's a format 1 image. 
3981a30b71b9SAlex Elder */ 3982a30b71b9SAlex Elder ret = rbd_dev_image_id(rbd_dev); 3983a30b71b9SAlex Elder if (ret) 3984a30b71b9SAlex Elder ret = rbd_dev_v1_probe(rbd_dev); 3985a30b71b9SAlex Elder else 3986a30b71b9SAlex Elder ret = rbd_dev_v2_probe(rbd_dev); 398783a06263SAlex Elder if (ret) { 3988a30b71b9SAlex Elder dout("probe failed, returning %d\n", ret); 3989a30b71b9SAlex Elder 3990a30b71b9SAlex Elder return ret; 3991a30b71b9SAlex Elder } 3992a30b71b9SAlex Elder 399383a06263SAlex Elder ret = rbd_dev_probe_finish(rbd_dev); 399483a06263SAlex Elder if (ret) 399583a06263SAlex Elder rbd_header_free(&rbd_dev->header); 399683a06263SAlex Elder 399783a06263SAlex Elder return ret; 399883a06263SAlex Elder } 399983a06263SAlex Elder 400059c2be1eSYehuda Sadeh static ssize_t rbd_add(struct bus_type *bus, 400159c2be1eSYehuda Sadeh const char *buf, 400259c2be1eSYehuda Sadeh size_t count) 4003602adf40SYehuda Sadeh { 4004cb8627c7SAlex Elder struct rbd_device *rbd_dev = NULL; 4005dc79b113SAlex Elder struct ceph_options *ceph_opts = NULL; 40064e9afebaSAlex Elder struct rbd_options *rbd_opts = NULL; 4007859c31dfSAlex Elder struct rbd_spec *spec = NULL; 40089d3997fdSAlex Elder struct rbd_client *rbdc; 400927cc2594SAlex Elder struct ceph_osd_client *osdc; 401027cc2594SAlex Elder int rc = -ENOMEM; 4011602adf40SYehuda Sadeh 4012602adf40SYehuda Sadeh if (!try_module_get(THIS_MODULE)) 4013602adf40SYehuda Sadeh return -ENODEV; 4014602adf40SYehuda Sadeh 4015a725f65eSAlex Elder /* parse add command */ 4016859c31dfSAlex Elder rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec); 4017dc79b113SAlex Elder if (rc < 0) 4018bd4ba655SAlex Elder goto err_out_module; 4019a725f65eSAlex Elder 40209d3997fdSAlex Elder rbdc = rbd_get_client(ceph_opts); 40219d3997fdSAlex Elder if (IS_ERR(rbdc)) { 40229d3997fdSAlex Elder rc = PTR_ERR(rbdc); 40230ddebc0cSAlex Elder goto err_out_args; 40249d3997fdSAlex Elder } 4025c53d5893SAlex Elder ceph_opts = NULL; /* rbd_dev client now owns this */ 
4026602adf40SYehuda Sadeh 4027602adf40SYehuda Sadeh /* pick the pool */ 40289d3997fdSAlex Elder osdc = &rbdc->client->osdc; 4029859c31dfSAlex Elder rc = ceph_pg_poolid_by_name(osdc->osdmap, spec->pool_name); 4030602adf40SYehuda Sadeh if (rc < 0) 4031602adf40SYehuda Sadeh goto err_out_client; 4032859c31dfSAlex Elder spec->pool_id = (u64) rc; 4033859c31dfSAlex Elder 40340903e875SAlex Elder /* The ceph file layout needs to fit pool id in 32 bits */ 40350903e875SAlex Elder 40360903e875SAlex Elder if (WARN_ON(spec->pool_id > (u64) U32_MAX)) { 40370903e875SAlex Elder rc = -EIO; 40380903e875SAlex Elder goto err_out_client; 40390903e875SAlex Elder } 40400903e875SAlex Elder 4041c53d5893SAlex Elder rbd_dev = rbd_dev_create(rbdc, spec); 4042bd4ba655SAlex Elder if (!rbd_dev) 4043bd4ba655SAlex Elder goto err_out_client; 4044c53d5893SAlex Elder rbdc = NULL; /* rbd_dev now owns this */ 4045c53d5893SAlex Elder spec = NULL; /* rbd_dev now owns this */ 4046602adf40SYehuda Sadeh 4047bd4ba655SAlex Elder rbd_dev->mapping.read_only = rbd_opts->read_only; 4048c53d5893SAlex Elder kfree(rbd_opts); 4049c53d5893SAlex Elder rbd_opts = NULL; /* done with this */ 4050bd4ba655SAlex Elder 4051a30b71b9SAlex Elder rc = rbd_dev_probe(rbd_dev); 4052a30b71b9SAlex Elder if (rc < 0) 4053c53d5893SAlex Elder goto err_out_rbd_dev; 405405fd6f6fSAlex Elder 4055602adf40SYehuda Sadeh return count; 4056c53d5893SAlex Elder err_out_rbd_dev: 4057c53d5893SAlex Elder rbd_dev_destroy(rbd_dev); 4058bd4ba655SAlex Elder err_out_client: 40599d3997fdSAlex Elder rbd_put_client(rbdc); 40600ddebc0cSAlex Elder err_out_args: 406178cea76eSAlex Elder if (ceph_opts) 406278cea76eSAlex Elder ceph_destroy_options(ceph_opts); 40634e9afebaSAlex Elder kfree(rbd_opts); 4064859c31dfSAlex Elder rbd_spec_put(spec); 4065bd4ba655SAlex Elder err_out_module: 4066bd4ba655SAlex Elder module_put(THIS_MODULE); 406727cc2594SAlex Elder 4068602adf40SYehuda Sadeh dout("Error adding device %s\n", buf); 406927cc2594SAlex Elder 407027cc2594SAlex Elder 
return (ssize_t) rc; 4071602adf40SYehuda Sadeh } 4072602adf40SYehuda Sadeh 4073de71a297SAlex Elder static struct rbd_device *__rbd_get_dev(unsigned long dev_id) 4074602adf40SYehuda Sadeh { 4075602adf40SYehuda Sadeh struct list_head *tmp; 4076602adf40SYehuda Sadeh struct rbd_device *rbd_dev; 4077602adf40SYehuda Sadeh 4078e124a82fSAlex Elder spin_lock(&rbd_dev_list_lock); 4079602adf40SYehuda Sadeh list_for_each(tmp, &rbd_dev_list) { 4080602adf40SYehuda Sadeh rbd_dev = list_entry(tmp, struct rbd_device, node); 4081de71a297SAlex Elder if (rbd_dev->dev_id == dev_id) { 4082e124a82fSAlex Elder spin_unlock(&rbd_dev_list_lock); 4083602adf40SYehuda Sadeh return rbd_dev; 4084602adf40SYehuda Sadeh } 4085e124a82fSAlex Elder } 4086e124a82fSAlex Elder spin_unlock(&rbd_dev_list_lock); 4087602adf40SYehuda Sadeh return NULL; 4088602adf40SYehuda Sadeh } 4089602adf40SYehuda Sadeh 4090dfc5606dSYehuda Sadeh static void rbd_dev_release(struct device *dev) 4091602adf40SYehuda Sadeh { 4092593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 4093602adf40SYehuda Sadeh 409459c2be1eSYehuda Sadeh if (rbd_dev->watch_event) 40959969ebc5SAlex Elder rbd_dev_header_watch_sync(rbd_dev, 0); 4096602adf40SYehuda Sadeh 4097602adf40SYehuda Sadeh /* clean up and free blkdev */ 4098602adf40SYehuda Sadeh rbd_free_disk(rbd_dev); 4099602adf40SYehuda Sadeh unregister_blkdev(rbd_dev->major, rbd_dev->name); 410032eec68dSAlex Elder 41012ac4e75dSAlex Elder /* release allocated disk header fields */ 41022ac4e75dSAlex Elder rbd_header_free(&rbd_dev->header); 41032ac4e75dSAlex Elder 410432eec68dSAlex Elder /* done with the id, and with the rbd_dev */ 4105e2839308SAlex Elder rbd_dev_id_put(rbd_dev); 4106c53d5893SAlex Elder rbd_assert(rbd_dev->rbd_client != NULL); 4107c53d5893SAlex Elder rbd_dev_destroy(rbd_dev); 4108602adf40SYehuda Sadeh 4109602adf40SYehuda Sadeh /* release module ref */ 4110602adf40SYehuda Sadeh module_put(THIS_MODULE); 4111602adf40SYehuda Sadeh } 4112602adf40SYehuda Sadeh 
4113dfc5606dSYehuda Sadeh static ssize_t rbd_remove(struct bus_type *bus, 4114602adf40SYehuda Sadeh const char *buf, 4115602adf40SYehuda Sadeh size_t count) 4116602adf40SYehuda Sadeh { 4117602adf40SYehuda Sadeh struct rbd_device *rbd_dev = NULL; 4118602adf40SYehuda Sadeh int target_id, rc; 4119602adf40SYehuda Sadeh unsigned long ul; 4120602adf40SYehuda Sadeh int ret = count; 4121602adf40SYehuda Sadeh 4122602adf40SYehuda Sadeh rc = strict_strtoul(buf, 10, &ul); 4123602adf40SYehuda Sadeh if (rc) 4124602adf40SYehuda Sadeh return rc; 4125602adf40SYehuda Sadeh 4126602adf40SYehuda Sadeh /* convert to int; abort if we lost anything in the conversion */ 4127602adf40SYehuda Sadeh target_id = (int) ul; 4128602adf40SYehuda Sadeh if (target_id != ul) 4129602adf40SYehuda Sadeh return -EINVAL; 4130602adf40SYehuda Sadeh 4131602adf40SYehuda Sadeh mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); 4132602adf40SYehuda Sadeh 4133602adf40SYehuda Sadeh rbd_dev = __rbd_get_dev(target_id); 4134602adf40SYehuda Sadeh if (!rbd_dev) { 4135602adf40SYehuda Sadeh ret = -ENOENT; 4136602adf40SYehuda Sadeh goto done; 4137602adf40SYehuda Sadeh } 4138602adf40SYehuda Sadeh 4139a14ea269SAlex Elder spin_lock_irq(&rbd_dev->lock); 4140b82d167bSAlex Elder if (rbd_dev->open_count) 414142382b70SAlex Elder ret = -EBUSY; 4142b82d167bSAlex Elder else 4143b82d167bSAlex Elder set_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags); 4144a14ea269SAlex Elder spin_unlock_irq(&rbd_dev->lock); 4145b82d167bSAlex Elder if (ret < 0) 414642382b70SAlex Elder goto done; 414742382b70SAlex Elder 414841f38c2bSAlex Elder rbd_remove_all_snaps(rbd_dev); 4149dfc5606dSYehuda Sadeh rbd_bus_del_dev(rbd_dev); 4150602adf40SYehuda Sadeh 4151602adf40SYehuda Sadeh done: 4152602adf40SYehuda Sadeh mutex_unlock(&ctl_mutex); 4153aafb230eSAlex Elder 4154602adf40SYehuda Sadeh return ret; 4155602adf40SYehuda Sadeh } 4156602adf40SYehuda Sadeh 4157602adf40SYehuda Sadeh /* 4158602adf40SYehuda Sadeh * create control files in sysfs 4159dfc5606dSYehuda 
Sadeh * /sys/bus/rbd/... 4160602adf40SYehuda Sadeh */ 4161602adf40SYehuda Sadeh static int rbd_sysfs_init(void) 4162602adf40SYehuda Sadeh { 4163dfc5606dSYehuda Sadeh int ret; 4164602adf40SYehuda Sadeh 4165fed4c143SAlex Elder ret = device_register(&rbd_root_dev); 4166dfc5606dSYehuda Sadeh if (ret < 0) 4167dfc5606dSYehuda Sadeh return ret; 4168602adf40SYehuda Sadeh 4169fed4c143SAlex Elder ret = bus_register(&rbd_bus_type); 4170fed4c143SAlex Elder if (ret < 0) 4171fed4c143SAlex Elder device_unregister(&rbd_root_dev); 4172602adf40SYehuda Sadeh 4173602adf40SYehuda Sadeh return ret; 4174602adf40SYehuda Sadeh } 4175602adf40SYehuda Sadeh 4176602adf40SYehuda Sadeh static void rbd_sysfs_cleanup(void) 4177602adf40SYehuda Sadeh { 4178dfc5606dSYehuda Sadeh bus_unregister(&rbd_bus_type); 4179fed4c143SAlex Elder device_unregister(&rbd_root_dev); 4180602adf40SYehuda Sadeh } 4181602adf40SYehuda Sadeh 4182602adf40SYehuda Sadeh int __init rbd_init(void) 4183602adf40SYehuda Sadeh { 4184602adf40SYehuda Sadeh int rc; 4185602adf40SYehuda Sadeh 41861e32d34cSAlex Elder if (!libceph_compatible(NULL)) { 41871e32d34cSAlex Elder rbd_warn(NULL, "libceph incompatibility (quitting)"); 41881e32d34cSAlex Elder 41891e32d34cSAlex Elder return -EINVAL; 41901e32d34cSAlex Elder } 4191602adf40SYehuda Sadeh rc = rbd_sysfs_init(); 4192602adf40SYehuda Sadeh if (rc) 4193602adf40SYehuda Sadeh return rc; 4194f0f8cef5SAlex Elder pr_info("loaded " RBD_DRV_NAME_LONG "\n"); 4195602adf40SYehuda Sadeh return 0; 4196602adf40SYehuda Sadeh } 4197602adf40SYehuda Sadeh 4198602adf40SYehuda Sadeh void __exit rbd_exit(void) 4199602adf40SYehuda Sadeh { 4200602adf40SYehuda Sadeh rbd_sysfs_cleanup(); 4201602adf40SYehuda Sadeh } 4202602adf40SYehuda Sadeh 4203602adf40SYehuda Sadeh module_init(rbd_init); 4204602adf40SYehuda Sadeh module_exit(rbd_exit); 4205602adf40SYehuda Sadeh 4206602adf40SYehuda Sadeh MODULE_AUTHOR("Sage Weil <sage@newdream.net>"); 4207602adf40SYehuda Sadeh MODULE_AUTHOR("Yehuda Sadeh 
<yehuda@hq.newdream.net>"); 4208602adf40SYehuda Sadeh MODULE_DESCRIPTION("rados block device"); 4209602adf40SYehuda Sadeh 4210602adf40SYehuda Sadeh /* following authorship retained from original osdblk.c */ 4211602adf40SYehuda Sadeh MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>"); 4212602adf40SYehuda Sadeh 4213602adf40SYehuda Sadeh MODULE_LICENSE("GPL"); 4214