1602adf40SYehuda Sadeh /* 2602adf40SYehuda Sadeh rbd.c -- Export ceph rados objects as a Linux block device 3602adf40SYehuda Sadeh 4602adf40SYehuda Sadeh 5602adf40SYehuda Sadeh based on drivers/block/osdblk.c: 6602adf40SYehuda Sadeh 7602adf40SYehuda Sadeh Copyright 2009 Red Hat, Inc. 8602adf40SYehuda Sadeh 9602adf40SYehuda Sadeh This program is free software; you can redistribute it and/or modify 10602adf40SYehuda Sadeh it under the terms of the GNU General Public License as published by 11602adf40SYehuda Sadeh the Free Software Foundation. 12602adf40SYehuda Sadeh 13602adf40SYehuda Sadeh This program is distributed in the hope that it will be useful, 14602adf40SYehuda Sadeh but WITHOUT ANY WARRANTY; without even the implied warranty of 15602adf40SYehuda Sadeh MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16602adf40SYehuda Sadeh GNU General Public License for more details. 17602adf40SYehuda Sadeh 18602adf40SYehuda Sadeh You should have received a copy of the GNU General Public License 19602adf40SYehuda Sadeh along with this program; see the file COPYING. If not, write to 20602adf40SYehuda Sadeh the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. 
21602adf40SYehuda Sadeh 22602adf40SYehuda Sadeh 23602adf40SYehuda Sadeh 24dfc5606dSYehuda Sadeh For usage instructions, please refer to: 25602adf40SYehuda Sadeh 26dfc5606dSYehuda Sadeh Documentation/ABI/testing/sysfs-bus-rbd 27602adf40SYehuda Sadeh 28602adf40SYehuda Sadeh */ 29602adf40SYehuda Sadeh 30602adf40SYehuda Sadeh #include <linux/ceph/libceph.h> 31602adf40SYehuda Sadeh #include <linux/ceph/osd_client.h> 32602adf40SYehuda Sadeh #include <linux/ceph/mon_client.h> 33602adf40SYehuda Sadeh #include <linux/ceph/decode.h> 3459c2be1eSYehuda Sadeh #include <linux/parser.h> 35602adf40SYehuda Sadeh 36602adf40SYehuda Sadeh #include <linux/kernel.h> 37602adf40SYehuda Sadeh #include <linux/device.h> 38602adf40SYehuda Sadeh #include <linux/module.h> 39602adf40SYehuda Sadeh #include <linux/fs.h> 40602adf40SYehuda Sadeh #include <linux/blkdev.h> 41602adf40SYehuda Sadeh 42602adf40SYehuda Sadeh #include "rbd_types.h" 43602adf40SYehuda Sadeh 44aafb230eSAlex Elder #define RBD_DEBUG /* Activate rbd_assert() calls */ 45aafb230eSAlex Elder 46593a9e7bSAlex Elder /* 47593a9e7bSAlex Elder * The basic unit of block I/O is a sector. It is interpreted in a 48593a9e7bSAlex Elder * number of contexts in Linux (blk, bio, genhd), but the default is 49593a9e7bSAlex Elder * universally 512 bytes. These symbols are just slightly more 50593a9e7bSAlex Elder * meaningful than the bare numbers they represent. 
51593a9e7bSAlex Elder */ 52593a9e7bSAlex Elder #define SECTOR_SHIFT 9 53593a9e7bSAlex Elder #define SECTOR_SIZE (1ULL << SECTOR_SHIFT) 54593a9e7bSAlex Elder 55df111be6SAlex Elder /* It might be useful to have this defined elsewhere too */ 56df111be6SAlex Elder 57df111be6SAlex Elder #define U64_MAX ((u64) (~0ULL)) 58df111be6SAlex Elder 59f0f8cef5SAlex Elder #define RBD_DRV_NAME "rbd" 60f0f8cef5SAlex Elder #define RBD_DRV_NAME_LONG "rbd (rados block device)" 61602adf40SYehuda Sadeh 62602adf40SYehuda Sadeh #define RBD_MINORS_PER_MAJOR 256 /* max minors per blkdev */ 63602adf40SYehuda Sadeh 64d4b125e9SAlex Elder #define RBD_SNAP_DEV_NAME_PREFIX "snap_" 65d4b125e9SAlex Elder #define RBD_MAX_SNAP_NAME_LEN \ 66d4b125e9SAlex Elder (NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1)) 67d4b125e9SAlex Elder 6835d489f9SAlex Elder #define RBD_MAX_SNAP_COUNT 510 /* allows max snapc to fit in 4KB */ 69602adf40SYehuda Sadeh #define RBD_MAX_OPT_LEN 1024 70602adf40SYehuda Sadeh 71602adf40SYehuda Sadeh #define RBD_SNAP_HEAD_NAME "-" 72602adf40SYehuda Sadeh 739e15b77dSAlex Elder /* This allows a single page to hold an image name sent by OSD */ 749e15b77dSAlex Elder #define RBD_IMAGE_NAME_LEN_MAX (PAGE_SIZE - sizeof (__le32) - 1) 75589d30e0SAlex Elder #define RBD_IMAGE_ID_LEN_MAX 64 769e15b77dSAlex Elder 771e130199SAlex Elder #define RBD_OBJ_PREFIX_LEN_MAX 64 78589d30e0SAlex Elder 79d889140cSAlex Elder /* Feature bits */ 80d889140cSAlex Elder 81d889140cSAlex Elder #define RBD_FEATURE_LAYERING 1 82d889140cSAlex Elder 83d889140cSAlex Elder /* Features supported by this (client software) implementation. */ 84d889140cSAlex Elder 85d889140cSAlex Elder #define RBD_FEATURES_ALL (0) 86d889140cSAlex Elder 8781a89793SAlex Elder /* 8881a89793SAlex Elder * An RBD device name will be "rbd#", where the "rbd" comes from 8981a89793SAlex Elder * RBD_DRV_NAME above, and # is a unique integer identifier. 
9081a89793SAlex Elder * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big 9181a89793SAlex Elder * enough to hold all possible device names. 9281a89793SAlex Elder */ 93602adf40SYehuda Sadeh #define DEV_NAME_LEN 32 9481a89793SAlex Elder #define MAX_INT_FORMAT_WIDTH ((5 * sizeof (int)) / 2 + 1) 95602adf40SYehuda Sadeh 96cc0538b6SAlex Elder #define RBD_READ_ONLY_DEFAULT false 9759c2be1eSYehuda Sadeh 98602adf40SYehuda Sadeh /* 99602adf40SYehuda Sadeh * block device image metadata (in-memory version) 100602adf40SYehuda Sadeh */ 101602adf40SYehuda Sadeh struct rbd_image_header { 102f84344f3SAlex Elder /* These four fields never change for a given rbd image */ 103849b4260SAlex Elder char *object_prefix; 10434b13184SAlex Elder u64 features; 105602adf40SYehuda Sadeh __u8 obj_order; 106602adf40SYehuda Sadeh __u8 crypt_type; 107602adf40SYehuda Sadeh __u8 comp_type; 108602adf40SYehuda Sadeh 109f84344f3SAlex Elder /* The remaining fields need to be updated occasionally */ 110f84344f3SAlex Elder u64 image_size; 111f84344f3SAlex Elder struct ceph_snap_context *snapc; 112602adf40SYehuda Sadeh char *snap_names; 113602adf40SYehuda Sadeh u64 *snap_sizes; 11459c2be1eSYehuda Sadeh 11559c2be1eSYehuda Sadeh u64 obj_version; 11659c2be1eSYehuda Sadeh }; 11759c2be1eSYehuda Sadeh 1180d7dbfceSAlex Elder /* 1190d7dbfceSAlex Elder * An rbd image specification. 1200d7dbfceSAlex Elder * 1210d7dbfceSAlex Elder * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely 122c66c6e0cSAlex Elder * identify an image. Each rbd_dev structure includes a pointer to 123c66c6e0cSAlex Elder * an rbd_spec structure that encapsulates this identity. 124c66c6e0cSAlex Elder * 125c66c6e0cSAlex Elder * Each of the id's in an rbd_spec has an associated name. For a 126c66c6e0cSAlex Elder * user-mapped image, the names are supplied and the id's associated 127c66c6e0cSAlex Elder * with them are looked up. 
For a layered image, a parent image is 128c66c6e0cSAlex Elder * defined by the tuple, and the names are looked up. 129c66c6e0cSAlex Elder * 130c66c6e0cSAlex Elder * An rbd_dev structure contains a parent_spec pointer which is 131c66c6e0cSAlex Elder * non-null if the image it represents is a child in a layered 132c66c6e0cSAlex Elder * image. This pointer will refer to the rbd_spec structure used 133c66c6e0cSAlex Elder * by the parent rbd_dev for its own identity (i.e., the structure 134c66c6e0cSAlex Elder * is shared between the parent and child). 135c66c6e0cSAlex Elder * 136c66c6e0cSAlex Elder * Since these structures are populated once, during the discovery 137c66c6e0cSAlex Elder * phase of image construction, they are effectively immutable so 138c66c6e0cSAlex Elder * we make no effort to synchronize access to them. 139c66c6e0cSAlex Elder * 140c66c6e0cSAlex Elder * Note that code herein does not assume the image name is known (it 141c66c6e0cSAlex Elder * could be a null pointer). 1420d7dbfceSAlex Elder */ 1430d7dbfceSAlex Elder struct rbd_spec { 1440d7dbfceSAlex Elder u64 pool_id; 1450d7dbfceSAlex Elder char *pool_name; 1460d7dbfceSAlex Elder 1470d7dbfceSAlex Elder char *image_id; 1480d7dbfceSAlex Elder char *image_name; 1490d7dbfceSAlex Elder 1500d7dbfceSAlex Elder u64 snap_id; 1510d7dbfceSAlex Elder char *snap_name; 1520d7dbfceSAlex Elder 1530d7dbfceSAlex Elder struct kref kref; 1540d7dbfceSAlex Elder }; 1550d7dbfceSAlex Elder 15659c2be1eSYehuda Sadeh struct rbd_options { 157cc0538b6SAlex Elder bool read_only; 158602adf40SYehuda Sadeh }; 159602adf40SYehuda Sadeh 160602adf40SYehuda Sadeh /* 161f0f8cef5SAlex Elder * an instance of the client. multiple devices may share an rbd client. 
162602adf40SYehuda Sadeh */ 163602adf40SYehuda Sadeh struct rbd_client { 164602adf40SYehuda Sadeh struct ceph_client *client; 165602adf40SYehuda Sadeh struct kref kref; 166602adf40SYehuda Sadeh struct list_head node; 167602adf40SYehuda Sadeh }; 168602adf40SYehuda Sadeh 169602adf40SYehuda Sadeh /* 170f0f8cef5SAlex Elder * a request completion status 171602adf40SYehuda Sadeh */ 1721fec7093SYehuda Sadeh struct rbd_req_status { 1731fec7093SYehuda Sadeh int done; 1748986cb37SAlex Elder s32 rc; 1751fec7093SYehuda Sadeh u64 bytes; 1761fec7093SYehuda Sadeh }; 1771fec7093SYehuda Sadeh 1781fec7093SYehuda Sadeh /* 1791fec7093SYehuda Sadeh * a collection of requests 1801fec7093SYehuda Sadeh */ 1811fec7093SYehuda Sadeh struct rbd_req_coll { 1821fec7093SYehuda Sadeh int total; 1831fec7093SYehuda Sadeh int num_done; 1841fec7093SYehuda Sadeh struct kref kref; 1851fec7093SYehuda Sadeh struct rbd_req_status status[0]; 186602adf40SYehuda Sadeh }; 187602adf40SYehuda Sadeh 188f0f8cef5SAlex Elder /* 189f0f8cef5SAlex Elder * a single io request 190f0f8cef5SAlex Elder */ 191f0f8cef5SAlex Elder struct rbd_request { 192f0f8cef5SAlex Elder struct request *rq; /* blk layer request */ 193f0f8cef5SAlex Elder struct bio *bio; /* cloned bio */ 194f0f8cef5SAlex Elder struct page **pages; /* list of used pages */ 195f0f8cef5SAlex Elder u64 len; 196f0f8cef5SAlex Elder int coll_index; 197f0f8cef5SAlex Elder struct rbd_req_coll *coll; 198f0f8cef5SAlex Elder }; 199f0f8cef5SAlex Elder 200dfc5606dSYehuda Sadeh struct rbd_snap { 201dfc5606dSYehuda Sadeh struct device dev; 202dfc5606dSYehuda Sadeh const char *name; 2033591538fSJosh Durgin u64 size; 204dfc5606dSYehuda Sadeh struct list_head node; 205dfc5606dSYehuda Sadeh u64 id; 20634b13184SAlex Elder u64 features; 207dfc5606dSYehuda Sadeh }; 208dfc5606dSYehuda Sadeh 209f84344f3SAlex Elder struct rbd_mapping { 21099c1f08fSAlex Elder u64 size; 21134b13184SAlex Elder u64 features; 212f84344f3SAlex Elder bool read_only; 213f84344f3SAlex Elder }; 
214f84344f3SAlex Elder 215602adf40SYehuda Sadeh /* 216602adf40SYehuda Sadeh * a single device 217602adf40SYehuda Sadeh */ 218602adf40SYehuda Sadeh struct rbd_device { 219de71a297SAlex Elder int dev_id; /* blkdev unique id */ 220602adf40SYehuda Sadeh 221602adf40SYehuda Sadeh int major; /* blkdev assigned major */ 222602adf40SYehuda Sadeh struct gendisk *disk; /* blkdev's gendisk and rq */ 223602adf40SYehuda Sadeh 224a30b71b9SAlex Elder u32 image_format; /* Either 1 or 2 */ 225602adf40SYehuda Sadeh struct rbd_client *rbd_client; 226602adf40SYehuda Sadeh 227602adf40SYehuda Sadeh char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */ 228602adf40SYehuda Sadeh 229602adf40SYehuda Sadeh spinlock_t lock; /* queue lock */ 230602adf40SYehuda Sadeh 231602adf40SYehuda Sadeh struct rbd_image_header header; 232d78b650aSAlex Elder atomic_t exists; 2330d7dbfceSAlex Elder struct rbd_spec *spec; 234602adf40SYehuda Sadeh 2350d7dbfceSAlex Elder char *header_name; 236971f839aSAlex Elder 23759c2be1eSYehuda Sadeh struct ceph_osd_event *watch_event; 23859c2be1eSYehuda Sadeh struct ceph_osd_request *watch_request; 23959c2be1eSYehuda Sadeh 24086b00e0dSAlex Elder struct rbd_spec *parent_spec; 24186b00e0dSAlex Elder u64 parent_overlap; 24286b00e0dSAlex Elder 243c666601aSJosh Durgin /* protects updating the header */ 244c666601aSJosh Durgin struct rw_semaphore header_rwsem; 245f84344f3SAlex Elder 246f84344f3SAlex Elder struct rbd_mapping mapping; 247602adf40SYehuda Sadeh 248602adf40SYehuda Sadeh struct list_head node; 249dfc5606dSYehuda Sadeh 250dfc5606dSYehuda Sadeh /* list of snapshots */ 251dfc5606dSYehuda Sadeh struct list_head snaps; 252dfc5606dSYehuda Sadeh 253dfc5606dSYehuda Sadeh /* sysfs related */ 254dfc5606dSYehuda Sadeh struct device dev; 25542382b70SAlex Elder unsigned long open_count; 256dfc5606dSYehuda Sadeh }; 257dfc5606dSYehuda Sadeh 258602adf40SYehuda Sadeh static DEFINE_MUTEX(ctl_mutex); /* Serialize open/close/setup/teardown */ 259e124a82fSAlex Elder 260602adf40SYehuda Sadeh 
static LIST_HEAD(rbd_dev_list); /* devices */ 261e124a82fSAlex Elder static DEFINE_SPINLOCK(rbd_dev_list_lock); 262e124a82fSAlex Elder 263602adf40SYehuda Sadeh static LIST_HEAD(rbd_client_list); /* clients */ 264432b8587SAlex Elder static DEFINE_SPINLOCK(rbd_client_list_lock); 265602adf40SYehuda Sadeh 266304f6808SAlex Elder static int rbd_dev_snaps_update(struct rbd_device *rbd_dev); 267304f6808SAlex Elder static int rbd_dev_snaps_register(struct rbd_device *rbd_dev); 268304f6808SAlex Elder 269dfc5606dSYehuda Sadeh static void rbd_dev_release(struct device *dev); 27041f38c2bSAlex Elder static void rbd_remove_snap_dev(struct rbd_snap *snap); 271dfc5606dSYehuda Sadeh 272f0f8cef5SAlex Elder static ssize_t rbd_add(struct bus_type *bus, const char *buf, 273f0f8cef5SAlex Elder size_t count); 274f0f8cef5SAlex Elder static ssize_t rbd_remove(struct bus_type *bus, const char *buf, 275f0f8cef5SAlex Elder size_t count); 276f0f8cef5SAlex Elder 277f0f8cef5SAlex Elder static struct bus_attribute rbd_bus_attrs[] = { 278f0f8cef5SAlex Elder __ATTR(add, S_IWUSR, NULL, rbd_add), 279f0f8cef5SAlex Elder __ATTR(remove, S_IWUSR, NULL, rbd_remove), 280f0f8cef5SAlex Elder __ATTR_NULL 281f0f8cef5SAlex Elder }; 282f0f8cef5SAlex Elder 283f0f8cef5SAlex Elder static struct bus_type rbd_bus_type = { 284f0f8cef5SAlex Elder .name = "rbd", 285f0f8cef5SAlex Elder .bus_attrs = rbd_bus_attrs, 286f0f8cef5SAlex Elder }; 287f0f8cef5SAlex Elder 288f0f8cef5SAlex Elder static void rbd_root_dev_release(struct device *dev) 289f0f8cef5SAlex Elder { 290f0f8cef5SAlex Elder } 291f0f8cef5SAlex Elder 292f0f8cef5SAlex Elder static struct device rbd_root_dev = { 293f0f8cef5SAlex Elder .init_name = "rbd", 294f0f8cef5SAlex Elder .release = rbd_root_dev_release, 295f0f8cef5SAlex Elder }; 296f0f8cef5SAlex Elder 29706ecc6cbSAlex Elder static __printf(2, 3) 29806ecc6cbSAlex Elder void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...) 
29906ecc6cbSAlex Elder { 30006ecc6cbSAlex Elder struct va_format vaf; 30106ecc6cbSAlex Elder va_list args; 30206ecc6cbSAlex Elder 30306ecc6cbSAlex Elder va_start(args, fmt); 30406ecc6cbSAlex Elder vaf.fmt = fmt; 30506ecc6cbSAlex Elder vaf.va = &args; 30606ecc6cbSAlex Elder 30706ecc6cbSAlex Elder if (!rbd_dev) 30806ecc6cbSAlex Elder printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf); 30906ecc6cbSAlex Elder else if (rbd_dev->disk) 31006ecc6cbSAlex Elder printk(KERN_WARNING "%s: %s: %pV\n", 31106ecc6cbSAlex Elder RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf); 31206ecc6cbSAlex Elder else if (rbd_dev->spec && rbd_dev->spec->image_name) 31306ecc6cbSAlex Elder printk(KERN_WARNING "%s: image %s: %pV\n", 31406ecc6cbSAlex Elder RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf); 31506ecc6cbSAlex Elder else if (rbd_dev->spec && rbd_dev->spec->image_id) 31606ecc6cbSAlex Elder printk(KERN_WARNING "%s: id %s: %pV\n", 31706ecc6cbSAlex Elder RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf); 31806ecc6cbSAlex Elder else /* punt */ 31906ecc6cbSAlex Elder printk(KERN_WARNING "%s: rbd_dev %p: %pV\n", 32006ecc6cbSAlex Elder RBD_DRV_NAME, rbd_dev, &vaf); 32106ecc6cbSAlex Elder va_end(args); 32206ecc6cbSAlex Elder } 32306ecc6cbSAlex Elder 324aafb230eSAlex Elder #ifdef RBD_DEBUG 325aafb230eSAlex Elder #define rbd_assert(expr) \ 326aafb230eSAlex Elder if (unlikely(!(expr))) { \ 327aafb230eSAlex Elder printk(KERN_ERR "\nAssertion failure in %s() " \ 328aafb230eSAlex Elder "at line %d:\n\n" \ 329aafb230eSAlex Elder "\trbd_assert(%s);\n\n", \ 330aafb230eSAlex Elder __func__, __LINE__, #expr); \ 331aafb230eSAlex Elder BUG(); \ 332aafb230eSAlex Elder } 333aafb230eSAlex Elder #else /* !RBD_DEBUG */ 334aafb230eSAlex Elder # define rbd_assert(expr) ((void) 0) 335aafb230eSAlex Elder #endif /* !RBD_DEBUG */ 336dfc5606dSYehuda Sadeh 337117973fbSAlex Elder static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver); 338117973fbSAlex Elder static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 
*hver); 33959c2be1eSYehuda Sadeh 340602adf40SYehuda Sadeh static int rbd_open(struct block_device *bdev, fmode_t mode) 341602adf40SYehuda Sadeh { 342f0f8cef5SAlex Elder struct rbd_device *rbd_dev = bdev->bd_disk->private_data; 343602adf40SYehuda Sadeh 344f84344f3SAlex Elder if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only) 345602adf40SYehuda Sadeh return -EROFS; 346602adf40SYehuda Sadeh 34742382b70SAlex Elder mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); 348c3e946ceSAlex Elder (void) get_device(&rbd_dev->dev); 349f84344f3SAlex Elder set_device_ro(bdev, rbd_dev->mapping.read_only); 35042382b70SAlex Elder rbd_dev->open_count++; 35142382b70SAlex Elder mutex_unlock(&ctl_mutex); 352340c7a2bSAlex Elder 353602adf40SYehuda Sadeh return 0; 354602adf40SYehuda Sadeh } 355602adf40SYehuda Sadeh 356dfc5606dSYehuda Sadeh static int rbd_release(struct gendisk *disk, fmode_t mode) 357dfc5606dSYehuda Sadeh { 358dfc5606dSYehuda Sadeh struct rbd_device *rbd_dev = disk->private_data; 359dfc5606dSYehuda Sadeh 36042382b70SAlex Elder mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); 36142382b70SAlex Elder rbd_assert(rbd_dev->open_count > 0); 36242382b70SAlex Elder rbd_dev->open_count--; 363c3e946ceSAlex Elder put_device(&rbd_dev->dev); 36442382b70SAlex Elder mutex_unlock(&ctl_mutex); 365dfc5606dSYehuda Sadeh 366dfc5606dSYehuda Sadeh return 0; 367dfc5606dSYehuda Sadeh } 368dfc5606dSYehuda Sadeh 369602adf40SYehuda Sadeh static const struct block_device_operations rbd_bd_ops = { 370602adf40SYehuda Sadeh .owner = THIS_MODULE, 371602adf40SYehuda Sadeh .open = rbd_open, 372dfc5606dSYehuda Sadeh .release = rbd_release, 373602adf40SYehuda Sadeh }; 374602adf40SYehuda Sadeh 375602adf40SYehuda Sadeh /* 376602adf40SYehuda Sadeh * Initialize an rbd client instance. 37743ae4701SAlex Elder * We own *ceph_opts. 
378602adf40SYehuda Sadeh */ 379f8c38929SAlex Elder static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts) 380602adf40SYehuda Sadeh { 381602adf40SYehuda Sadeh struct rbd_client *rbdc; 382602adf40SYehuda Sadeh int ret = -ENOMEM; 383602adf40SYehuda Sadeh 384602adf40SYehuda Sadeh dout("rbd_client_create\n"); 385602adf40SYehuda Sadeh rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL); 386602adf40SYehuda Sadeh if (!rbdc) 387602adf40SYehuda Sadeh goto out_opt; 388602adf40SYehuda Sadeh 389602adf40SYehuda Sadeh kref_init(&rbdc->kref); 390602adf40SYehuda Sadeh INIT_LIST_HEAD(&rbdc->node); 391602adf40SYehuda Sadeh 392bc534d86SAlex Elder mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); 393bc534d86SAlex Elder 39443ae4701SAlex Elder rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0); 395602adf40SYehuda Sadeh if (IS_ERR(rbdc->client)) 396bc534d86SAlex Elder goto out_mutex; 39743ae4701SAlex Elder ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */ 398602adf40SYehuda Sadeh 399602adf40SYehuda Sadeh ret = ceph_open_session(rbdc->client); 400602adf40SYehuda Sadeh if (ret < 0) 401602adf40SYehuda Sadeh goto out_err; 402602adf40SYehuda Sadeh 403432b8587SAlex Elder spin_lock(&rbd_client_list_lock); 404602adf40SYehuda Sadeh list_add_tail(&rbdc->node, &rbd_client_list); 405432b8587SAlex Elder spin_unlock(&rbd_client_list_lock); 406602adf40SYehuda Sadeh 407bc534d86SAlex Elder mutex_unlock(&ctl_mutex); 408bc534d86SAlex Elder 409602adf40SYehuda Sadeh dout("rbd_client_create created %p\n", rbdc); 410602adf40SYehuda Sadeh return rbdc; 411602adf40SYehuda Sadeh 412602adf40SYehuda Sadeh out_err: 413602adf40SYehuda Sadeh ceph_destroy_client(rbdc->client); 414bc534d86SAlex Elder out_mutex: 415bc534d86SAlex Elder mutex_unlock(&ctl_mutex); 416602adf40SYehuda Sadeh kfree(rbdc); 417602adf40SYehuda Sadeh out_opt: 41843ae4701SAlex Elder if (ceph_opts) 41943ae4701SAlex Elder ceph_destroy_options(ceph_opts); 42028f259b7SVasiliy Kulikov return ERR_PTR(ret); 
421602adf40SYehuda Sadeh } 422602adf40SYehuda Sadeh 423602adf40SYehuda Sadeh /* 4241f7ba331SAlex Elder * Find a ceph client with specific addr and configuration. If 4251f7ba331SAlex Elder * found, bump its reference count. 426602adf40SYehuda Sadeh */ 4271f7ba331SAlex Elder static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts) 428602adf40SYehuda Sadeh { 429602adf40SYehuda Sadeh struct rbd_client *client_node; 4301f7ba331SAlex Elder bool found = false; 431602adf40SYehuda Sadeh 43243ae4701SAlex Elder if (ceph_opts->flags & CEPH_OPT_NOSHARE) 433602adf40SYehuda Sadeh return NULL; 434602adf40SYehuda Sadeh 4351f7ba331SAlex Elder spin_lock(&rbd_client_list_lock); 4361f7ba331SAlex Elder list_for_each_entry(client_node, &rbd_client_list, node) { 4371f7ba331SAlex Elder if (!ceph_compare_options(ceph_opts, client_node->client)) { 4381f7ba331SAlex Elder kref_get(&client_node->kref); 4391f7ba331SAlex Elder found = true; 4401f7ba331SAlex Elder break; 4411f7ba331SAlex Elder } 4421f7ba331SAlex Elder } 4431f7ba331SAlex Elder spin_unlock(&rbd_client_list_lock); 4441f7ba331SAlex Elder 4451f7ba331SAlex Elder return found ? 
client_node : NULL; 446602adf40SYehuda Sadeh } 447602adf40SYehuda Sadeh 448602adf40SYehuda Sadeh /* 44959c2be1eSYehuda Sadeh * mount options 45059c2be1eSYehuda Sadeh */ 45159c2be1eSYehuda Sadeh enum { 45259c2be1eSYehuda Sadeh Opt_last_int, 45359c2be1eSYehuda Sadeh /* int args above */ 45459c2be1eSYehuda Sadeh Opt_last_string, 45559c2be1eSYehuda Sadeh /* string args above */ 456cc0538b6SAlex Elder Opt_read_only, 457cc0538b6SAlex Elder Opt_read_write, 458cc0538b6SAlex Elder /* Boolean args above */ 459cc0538b6SAlex Elder Opt_last_bool, 46059c2be1eSYehuda Sadeh }; 46159c2be1eSYehuda Sadeh 46243ae4701SAlex Elder static match_table_t rbd_opts_tokens = { 46359c2be1eSYehuda Sadeh /* int args above */ 46459c2be1eSYehuda Sadeh /* string args above */ 465be466c1cSAlex Elder {Opt_read_only, "read_only"}, 466cc0538b6SAlex Elder {Opt_read_only, "ro"}, /* Alternate spelling */ 467cc0538b6SAlex Elder {Opt_read_write, "read_write"}, 468cc0538b6SAlex Elder {Opt_read_write, "rw"}, /* Alternate spelling */ 469cc0538b6SAlex Elder /* Boolean args above */ 47059c2be1eSYehuda Sadeh {-1, NULL} 47159c2be1eSYehuda Sadeh }; 47259c2be1eSYehuda Sadeh 47359c2be1eSYehuda Sadeh static int parse_rbd_opts_token(char *c, void *private) 47459c2be1eSYehuda Sadeh { 47543ae4701SAlex Elder struct rbd_options *rbd_opts = private; 47659c2be1eSYehuda Sadeh substring_t argstr[MAX_OPT_ARGS]; 47759c2be1eSYehuda Sadeh int token, intval, ret; 47859c2be1eSYehuda Sadeh 47943ae4701SAlex Elder token = match_token(c, rbd_opts_tokens, argstr); 48059c2be1eSYehuda Sadeh if (token < 0) 48159c2be1eSYehuda Sadeh return -EINVAL; 48259c2be1eSYehuda Sadeh 48359c2be1eSYehuda Sadeh if (token < Opt_last_int) { 48459c2be1eSYehuda Sadeh ret = match_int(&argstr[0], &intval); 48559c2be1eSYehuda Sadeh if (ret < 0) { 48659c2be1eSYehuda Sadeh pr_err("bad mount option arg (not int) " 48759c2be1eSYehuda Sadeh "at '%s'\n", c); 48859c2be1eSYehuda Sadeh return ret; 48959c2be1eSYehuda Sadeh } 49059c2be1eSYehuda Sadeh dout("got int token %d 
val %d\n", token, intval); 49159c2be1eSYehuda Sadeh } else if (token > Opt_last_int && token < Opt_last_string) { 49259c2be1eSYehuda Sadeh dout("got string token %d val %s\n", token, 49359c2be1eSYehuda Sadeh argstr[0].from); 494cc0538b6SAlex Elder } else if (token > Opt_last_string && token < Opt_last_bool) { 495cc0538b6SAlex Elder dout("got Boolean token %d\n", token); 49659c2be1eSYehuda Sadeh } else { 49759c2be1eSYehuda Sadeh dout("got token %d\n", token); 49859c2be1eSYehuda Sadeh } 49959c2be1eSYehuda Sadeh 50059c2be1eSYehuda Sadeh switch (token) { 501cc0538b6SAlex Elder case Opt_read_only: 502cc0538b6SAlex Elder rbd_opts->read_only = true; 503cc0538b6SAlex Elder break; 504cc0538b6SAlex Elder case Opt_read_write: 505cc0538b6SAlex Elder rbd_opts->read_only = false; 506cc0538b6SAlex Elder break; 50759c2be1eSYehuda Sadeh default: 508aafb230eSAlex Elder rbd_assert(false); 509aafb230eSAlex Elder break; 51059c2be1eSYehuda Sadeh } 51159c2be1eSYehuda Sadeh return 0; 51259c2be1eSYehuda Sadeh } 51359c2be1eSYehuda Sadeh 51459c2be1eSYehuda Sadeh /* 515602adf40SYehuda Sadeh * Get a ceph client with specific addr and configuration, if one does 516602adf40SYehuda Sadeh * not exist create it. 517602adf40SYehuda Sadeh */ 5189d3997fdSAlex Elder static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts) 519602adf40SYehuda Sadeh { 520f8c38929SAlex Elder struct rbd_client *rbdc; 52159c2be1eSYehuda Sadeh 5221f7ba331SAlex Elder rbdc = rbd_client_find(ceph_opts); 5239d3997fdSAlex Elder if (rbdc) /* using an existing client */ 52443ae4701SAlex Elder ceph_destroy_options(ceph_opts); 5259d3997fdSAlex Elder else 526f8c38929SAlex Elder rbdc = rbd_client_create(ceph_opts); 527d720bcb0SAlex Elder 5289d3997fdSAlex Elder return rbdc; 529602adf40SYehuda Sadeh } 530602adf40SYehuda Sadeh 531602adf40SYehuda Sadeh /* 532602adf40SYehuda Sadeh * Destroy ceph client 533d23a4b3fSAlex Elder * 534432b8587SAlex Elder * Caller must hold rbd_client_list_lock. 
535602adf40SYehuda Sadeh */ 536602adf40SYehuda Sadeh static void rbd_client_release(struct kref *kref) 537602adf40SYehuda Sadeh { 538602adf40SYehuda Sadeh struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref); 539602adf40SYehuda Sadeh 540602adf40SYehuda Sadeh dout("rbd_release_client %p\n", rbdc); 541cd9d9f5dSAlex Elder spin_lock(&rbd_client_list_lock); 542602adf40SYehuda Sadeh list_del(&rbdc->node); 543cd9d9f5dSAlex Elder spin_unlock(&rbd_client_list_lock); 544602adf40SYehuda Sadeh 545602adf40SYehuda Sadeh ceph_destroy_client(rbdc->client); 546602adf40SYehuda Sadeh kfree(rbdc); 547602adf40SYehuda Sadeh } 548602adf40SYehuda Sadeh 549602adf40SYehuda Sadeh /* 550602adf40SYehuda Sadeh * Drop reference to ceph client node. If it's not referenced anymore, release 551602adf40SYehuda Sadeh * it. 552602adf40SYehuda Sadeh */ 5539d3997fdSAlex Elder static void rbd_put_client(struct rbd_client *rbdc) 554602adf40SYehuda Sadeh { 555c53d5893SAlex Elder if (rbdc) 5569d3997fdSAlex Elder kref_put(&rbdc->kref, rbd_client_release); 557602adf40SYehuda Sadeh } 558602adf40SYehuda Sadeh 5591fec7093SYehuda Sadeh /* 5601fec7093SYehuda Sadeh * Destroy requests collection 5611fec7093SYehuda Sadeh */ 5621fec7093SYehuda Sadeh static void rbd_coll_release(struct kref *kref) 5631fec7093SYehuda Sadeh { 5641fec7093SYehuda Sadeh struct rbd_req_coll *coll = 5651fec7093SYehuda Sadeh container_of(kref, struct rbd_req_coll, kref); 5661fec7093SYehuda Sadeh 5671fec7093SYehuda Sadeh dout("rbd_coll_release %p\n", coll); 5681fec7093SYehuda Sadeh kfree(coll); 5691fec7093SYehuda Sadeh } 570602adf40SYehuda Sadeh 571a30b71b9SAlex Elder static bool rbd_image_format_valid(u32 image_format) 572a30b71b9SAlex Elder { 573a30b71b9SAlex Elder return image_format == 1 || image_format == 2; 574a30b71b9SAlex Elder } 575a30b71b9SAlex Elder 5768e94af8eSAlex Elder static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk) 5778e94af8eSAlex Elder { 578103a150fSAlex Elder size_t size; 
579103a150fSAlex Elder u32 snap_count; 580103a150fSAlex Elder 581103a150fSAlex Elder /* The header has to start with the magic rbd header text */ 582103a150fSAlex Elder if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT))) 583103a150fSAlex Elder return false; 584103a150fSAlex Elder 585db2388b6SAlex Elder /* The bio layer requires at least sector-sized I/O */ 586db2388b6SAlex Elder 587db2388b6SAlex Elder if (ondisk->options.order < SECTOR_SHIFT) 588db2388b6SAlex Elder return false; 589db2388b6SAlex Elder 590db2388b6SAlex Elder /* If we use u64 in a few spots we may be able to loosen this */ 591db2388b6SAlex Elder 592db2388b6SAlex Elder if (ondisk->options.order > 8 * sizeof (int) - 1) 593db2388b6SAlex Elder return false; 594db2388b6SAlex Elder 595103a150fSAlex Elder /* 596103a150fSAlex Elder * The size of a snapshot header has to fit in a size_t, and 597103a150fSAlex Elder * that limits the number of snapshots. 598103a150fSAlex Elder */ 599103a150fSAlex Elder snap_count = le32_to_cpu(ondisk->snap_count); 600103a150fSAlex Elder size = SIZE_MAX - sizeof (struct ceph_snap_context); 601103a150fSAlex Elder if (snap_count > size / sizeof (__le64)) 602103a150fSAlex Elder return false; 603103a150fSAlex Elder 604103a150fSAlex Elder /* 605103a150fSAlex Elder * Not only that, but the size of the entire the snapshot 606103a150fSAlex Elder * header must also be representable in a size_t. 607103a150fSAlex Elder */ 608103a150fSAlex Elder size -= snap_count * sizeof (__le64); 609103a150fSAlex Elder if ((u64) size < le64_to_cpu(ondisk->snap_names_len)) 610103a150fSAlex Elder return false; 611103a150fSAlex Elder 612103a150fSAlex Elder return true; 6138e94af8eSAlex Elder } 6148e94af8eSAlex Elder 615602adf40SYehuda Sadeh /* 616602adf40SYehuda Sadeh * Create a new header structure, translate header format from the on-disk 617602adf40SYehuda Sadeh * header. 
618602adf40SYehuda Sadeh */ 619602adf40SYehuda Sadeh static int rbd_header_from_disk(struct rbd_image_header *header, 6204156d998SAlex Elder struct rbd_image_header_ondisk *ondisk) 621602adf40SYehuda Sadeh { 622ccece235SAlex Elder u32 snap_count; 62358c17b0eSAlex Elder size_t len; 624d2bb24e5SAlex Elder size_t size; 625621901d6SAlex Elder u32 i; 626602adf40SYehuda Sadeh 6276a52325fSAlex Elder memset(header, 0, sizeof (*header)); 6286a52325fSAlex Elder 629103a150fSAlex Elder snap_count = le32_to_cpu(ondisk->snap_count); 630103a150fSAlex Elder 63158c17b0eSAlex Elder len = strnlen(ondisk->object_prefix, sizeof (ondisk->object_prefix)); 63258c17b0eSAlex Elder header->object_prefix = kmalloc(len + 1, GFP_KERNEL); 6336a52325fSAlex Elder if (!header->object_prefix) 634602adf40SYehuda Sadeh return -ENOMEM; 63558c17b0eSAlex Elder memcpy(header->object_prefix, ondisk->object_prefix, len); 63658c17b0eSAlex Elder header->object_prefix[len] = '\0'; 63700f1f36fSAlex Elder 638602adf40SYehuda Sadeh if (snap_count) { 639f785cc1dSAlex Elder u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len); 640f785cc1dSAlex Elder 641621901d6SAlex Elder /* Save a copy of the snapshot names */ 642621901d6SAlex Elder 643f785cc1dSAlex Elder if (snap_names_len > (u64) SIZE_MAX) 644f785cc1dSAlex Elder return -EIO; 645f785cc1dSAlex Elder header->snap_names = kmalloc(snap_names_len, GFP_KERNEL); 646602adf40SYehuda Sadeh if (!header->snap_names) 6476a52325fSAlex Elder goto out_err; 648f785cc1dSAlex Elder /* 649f785cc1dSAlex Elder * Note that rbd_dev_v1_header_read() guarantees 650f785cc1dSAlex Elder * the ondisk buffer we're working with has 651f785cc1dSAlex Elder * snap_names_len bytes beyond the end of the 652f785cc1dSAlex Elder * snapshot id array, this memcpy() is safe. 
653f785cc1dSAlex Elder */ 654f785cc1dSAlex Elder memcpy(header->snap_names, &ondisk->snaps[snap_count], 655f785cc1dSAlex Elder snap_names_len); 6566a52325fSAlex Elder 657621901d6SAlex Elder /* Record each snapshot's size */ 658621901d6SAlex Elder 659d2bb24e5SAlex Elder size = snap_count * sizeof (*header->snap_sizes); 660d2bb24e5SAlex Elder header->snap_sizes = kmalloc(size, GFP_KERNEL); 661602adf40SYehuda Sadeh if (!header->snap_sizes) 6626a52325fSAlex Elder goto out_err; 663621901d6SAlex Elder for (i = 0; i < snap_count; i++) 664621901d6SAlex Elder header->snap_sizes[i] = 665621901d6SAlex Elder le64_to_cpu(ondisk->snaps[i].image_size); 666602adf40SYehuda Sadeh } else { 667ccece235SAlex Elder WARN_ON(ondisk->snap_names_len); 668602adf40SYehuda Sadeh header->snap_names = NULL; 669602adf40SYehuda Sadeh header->snap_sizes = NULL; 670602adf40SYehuda Sadeh } 671849b4260SAlex Elder 67234b13184SAlex Elder header->features = 0; /* No features support in v1 images */ 673602adf40SYehuda Sadeh header->obj_order = ondisk->options.order; 674602adf40SYehuda Sadeh header->crypt_type = ondisk->options.crypt_type; 675602adf40SYehuda Sadeh header->comp_type = ondisk->options.comp_type; 6766a52325fSAlex Elder 677621901d6SAlex Elder /* Allocate and fill in the snapshot context */ 678621901d6SAlex Elder 679f84344f3SAlex Elder header->image_size = le64_to_cpu(ondisk->image_size); 6806a52325fSAlex Elder size = sizeof (struct ceph_snap_context); 6816a52325fSAlex Elder size += snap_count * sizeof (header->snapc->snaps[0]); 6826a52325fSAlex Elder header->snapc = kzalloc(size, GFP_KERNEL); 6836a52325fSAlex Elder if (!header->snapc) 6846a52325fSAlex Elder goto out_err; 685602adf40SYehuda Sadeh 686602adf40SYehuda Sadeh atomic_set(&header->snapc->nref, 1); 687505cbb9bSAlex Elder header->snapc->seq = le64_to_cpu(ondisk->snap_seq); 688602adf40SYehuda Sadeh header->snapc->num_snaps = snap_count; 689621901d6SAlex Elder for (i = 0; i < snap_count; i++) 690602adf40SYehuda Sadeh 
header->snapc->snaps[i] = 691602adf40SYehuda Sadeh le64_to_cpu(ondisk->snaps[i].id); 692602adf40SYehuda Sadeh 693602adf40SYehuda Sadeh return 0; 694602adf40SYehuda Sadeh 6956a52325fSAlex Elder out_err: 696849b4260SAlex Elder kfree(header->snap_sizes); 697ccece235SAlex Elder header->snap_sizes = NULL; 698602adf40SYehuda Sadeh kfree(header->snap_names); 699ccece235SAlex Elder header->snap_names = NULL; 7006a52325fSAlex Elder kfree(header->object_prefix); 7016a52325fSAlex Elder header->object_prefix = NULL; 702ccece235SAlex Elder 70300f1f36fSAlex Elder return -ENOMEM; 704602adf40SYehuda Sadeh } 705602adf40SYehuda Sadeh 7069e15b77dSAlex Elder static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id) 7079e15b77dSAlex Elder { 7089e15b77dSAlex Elder struct rbd_snap *snap; 7099e15b77dSAlex Elder 7109e15b77dSAlex Elder if (snap_id == CEPH_NOSNAP) 7119e15b77dSAlex Elder return RBD_SNAP_HEAD_NAME; 7129e15b77dSAlex Elder 7139e15b77dSAlex Elder list_for_each_entry(snap, &rbd_dev->snaps, node) 7149e15b77dSAlex Elder if (snap_id == snap->id) 7159e15b77dSAlex Elder return snap->name; 7169e15b77dSAlex Elder 7179e15b77dSAlex Elder return NULL; 7189e15b77dSAlex Elder } 7199e15b77dSAlex Elder 7208836b995SAlex Elder static int snap_by_name(struct rbd_device *rbd_dev, const char *snap_name) 721602adf40SYehuda Sadeh { 722602adf40SYehuda Sadeh 723e86924a8SAlex Elder struct rbd_snap *snap; 72400f1f36fSAlex Elder 725e86924a8SAlex Elder list_for_each_entry(snap, &rbd_dev->snaps, node) { 726e86924a8SAlex Elder if (!strcmp(snap_name, snap->name)) { 7270d7dbfceSAlex Elder rbd_dev->spec->snap_id = snap->id; 728e86924a8SAlex Elder rbd_dev->mapping.size = snap->size; 72934b13184SAlex Elder rbd_dev->mapping.features = snap->features; 73000f1f36fSAlex Elder 731e86924a8SAlex Elder return 0; 732602adf40SYehuda Sadeh } 73300f1f36fSAlex Elder } 734e86924a8SAlex Elder 73500f1f36fSAlex Elder return -ENOENT; 73600f1f36fSAlex Elder } 737602adf40SYehuda Sadeh 738819d52bfSAlex Elder static int 
rbd_dev_set_mapping(struct rbd_device *rbd_dev) 739602adf40SYehuda Sadeh { 74078dc447dSAlex Elder int ret; 741602adf40SYehuda Sadeh 7420d7dbfceSAlex Elder if (!memcmp(rbd_dev->spec->snap_name, RBD_SNAP_HEAD_NAME, 743cc9d734cSJosh Durgin sizeof (RBD_SNAP_HEAD_NAME))) { 7440d7dbfceSAlex Elder rbd_dev->spec->snap_id = CEPH_NOSNAP; 74599c1f08fSAlex Elder rbd_dev->mapping.size = rbd_dev->header.image_size; 74634b13184SAlex Elder rbd_dev->mapping.features = rbd_dev->header.features; 747e86924a8SAlex Elder ret = 0; 748602adf40SYehuda Sadeh } else { 7490d7dbfceSAlex Elder ret = snap_by_name(rbd_dev, rbd_dev->spec->snap_name); 750602adf40SYehuda Sadeh if (ret < 0) 751602adf40SYehuda Sadeh goto done; 752f84344f3SAlex Elder rbd_dev->mapping.read_only = true; 753602adf40SYehuda Sadeh } 754d78b650aSAlex Elder atomic_set(&rbd_dev->exists, 1); 755602adf40SYehuda Sadeh done: 756602adf40SYehuda Sadeh return ret; 757602adf40SYehuda Sadeh } 758602adf40SYehuda Sadeh 759602adf40SYehuda Sadeh static void rbd_header_free(struct rbd_image_header *header) 760602adf40SYehuda Sadeh { 761849b4260SAlex Elder kfree(header->object_prefix); 762d78fd7aeSAlex Elder header->object_prefix = NULL; 763602adf40SYehuda Sadeh kfree(header->snap_sizes); 764d78fd7aeSAlex Elder header->snap_sizes = NULL; 765849b4260SAlex Elder kfree(header->snap_names); 766d78fd7aeSAlex Elder header->snap_names = NULL; 767d1d25646SJosh Durgin ceph_put_snap_context(header->snapc); 768d78fd7aeSAlex Elder header->snapc = NULL; 769602adf40SYehuda Sadeh } 770602adf40SYehuda Sadeh 77165ccfe21SAlex Elder static char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset) 772602adf40SYehuda Sadeh { 77365ccfe21SAlex Elder char *name; 77465ccfe21SAlex Elder u64 segment; 77565ccfe21SAlex Elder int ret; 776602adf40SYehuda Sadeh 7772fd82b9eSAlex Elder name = kmalloc(MAX_OBJ_NAME_SIZE + 1, GFP_NOIO); 77865ccfe21SAlex Elder if (!name) 77965ccfe21SAlex Elder return NULL; 78065ccfe21SAlex Elder segment = offset >> 
rbd_dev->header.obj_order; 7812fd82b9eSAlex Elder ret = snprintf(name, MAX_OBJ_NAME_SIZE + 1, "%s.%012llx", 78265ccfe21SAlex Elder rbd_dev->header.object_prefix, segment); 7832fd82b9eSAlex Elder if (ret < 0 || ret > MAX_OBJ_NAME_SIZE) { 78465ccfe21SAlex Elder pr_err("error formatting segment name for #%llu (%d)\n", 78565ccfe21SAlex Elder segment, ret); 78665ccfe21SAlex Elder kfree(name); 78765ccfe21SAlex Elder name = NULL; 78865ccfe21SAlex Elder } 789602adf40SYehuda Sadeh 79065ccfe21SAlex Elder return name; 79165ccfe21SAlex Elder } 792602adf40SYehuda Sadeh 79365ccfe21SAlex Elder static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset) 79465ccfe21SAlex Elder { 79565ccfe21SAlex Elder u64 segment_size = (u64) 1 << rbd_dev->header.obj_order; 796602adf40SYehuda Sadeh 79765ccfe21SAlex Elder return offset & (segment_size - 1); 79865ccfe21SAlex Elder } 79965ccfe21SAlex Elder 80065ccfe21SAlex Elder static u64 rbd_segment_length(struct rbd_device *rbd_dev, 80165ccfe21SAlex Elder u64 offset, u64 length) 80265ccfe21SAlex Elder { 80365ccfe21SAlex Elder u64 segment_size = (u64) 1 << rbd_dev->header.obj_order; 80465ccfe21SAlex Elder 80565ccfe21SAlex Elder offset &= segment_size - 1; 80665ccfe21SAlex Elder 807aafb230eSAlex Elder rbd_assert(length <= U64_MAX - offset); 80865ccfe21SAlex Elder if (offset + length > segment_size) 80965ccfe21SAlex Elder length = segment_size - offset; 81065ccfe21SAlex Elder 81165ccfe21SAlex Elder return length; 812602adf40SYehuda Sadeh } 813602adf40SYehuda Sadeh 8141fec7093SYehuda Sadeh static int rbd_get_num_segments(struct rbd_image_header *header, 8151fec7093SYehuda Sadeh u64 ofs, u64 len) 8161fec7093SYehuda Sadeh { 817df111be6SAlex Elder u64 start_seg; 818df111be6SAlex Elder u64 end_seg; 819df111be6SAlex Elder 820df111be6SAlex Elder if (!len) 821df111be6SAlex Elder return 0; 822df111be6SAlex Elder if (len - 1 > U64_MAX - ofs) 823df111be6SAlex Elder return -ERANGE; 824df111be6SAlex Elder 825df111be6SAlex Elder start_seg = ofs >> 
header->obj_order; 826df111be6SAlex Elder end_seg = (ofs + len - 1) >> header->obj_order; 827df111be6SAlex Elder 8281fec7093SYehuda Sadeh return end_seg - start_seg + 1; 8291fec7093SYehuda Sadeh } 8301fec7093SYehuda Sadeh 831602adf40SYehuda Sadeh /* 832029bcbd8SJosh Durgin * returns the size of an object in the image 833029bcbd8SJosh Durgin */ 834029bcbd8SJosh Durgin static u64 rbd_obj_bytes(struct rbd_image_header *header) 835029bcbd8SJosh Durgin { 836029bcbd8SJosh Durgin return 1 << header->obj_order; 837029bcbd8SJosh Durgin } 838029bcbd8SJosh Durgin 839029bcbd8SJosh Durgin /* 840602adf40SYehuda Sadeh * bio helpers 841602adf40SYehuda Sadeh */ 842602adf40SYehuda Sadeh 843602adf40SYehuda Sadeh static void bio_chain_put(struct bio *chain) 844602adf40SYehuda Sadeh { 845602adf40SYehuda Sadeh struct bio *tmp; 846602adf40SYehuda Sadeh 847602adf40SYehuda Sadeh while (chain) { 848602adf40SYehuda Sadeh tmp = chain; 849602adf40SYehuda Sadeh chain = chain->bi_next; 850602adf40SYehuda Sadeh bio_put(tmp); 851602adf40SYehuda Sadeh } 852602adf40SYehuda Sadeh } 853602adf40SYehuda Sadeh 854602adf40SYehuda Sadeh /* 855602adf40SYehuda Sadeh * zeros a bio chain, starting at specific offset 856602adf40SYehuda Sadeh */ 857602adf40SYehuda Sadeh static void zero_bio_chain(struct bio *chain, int start_ofs) 858602adf40SYehuda Sadeh { 859602adf40SYehuda Sadeh struct bio_vec *bv; 860602adf40SYehuda Sadeh unsigned long flags; 861602adf40SYehuda Sadeh void *buf; 862602adf40SYehuda Sadeh int i; 863602adf40SYehuda Sadeh int pos = 0; 864602adf40SYehuda Sadeh 865602adf40SYehuda Sadeh while (chain) { 866602adf40SYehuda Sadeh bio_for_each_segment(bv, chain, i) { 867602adf40SYehuda Sadeh if (pos + bv->bv_len > start_ofs) { 868602adf40SYehuda Sadeh int remainder = max(start_ofs - pos, 0); 869602adf40SYehuda Sadeh buf = bvec_kmap_irq(bv, &flags); 870602adf40SYehuda Sadeh memset(buf + remainder, 0, 871602adf40SYehuda Sadeh bv->bv_len - remainder); 87285b5aaa6SDan Carpenter bvec_kunmap_irq(buf, &flags); 
873602adf40SYehuda Sadeh } 874602adf40SYehuda Sadeh pos += bv->bv_len; 875602adf40SYehuda Sadeh } 876602adf40SYehuda Sadeh 877602adf40SYehuda Sadeh chain = chain->bi_next; 878602adf40SYehuda Sadeh } 879602adf40SYehuda Sadeh } 880602adf40SYehuda Sadeh 881602adf40SYehuda Sadeh /* 882f7760dadSAlex Elder * Clone a portion of a bio, starting at the given byte offset 883f7760dadSAlex Elder * and continuing for the number of bytes indicated. 884602adf40SYehuda Sadeh */ 885f7760dadSAlex Elder static struct bio *bio_clone_range(struct bio *bio_src, 886f7760dadSAlex Elder unsigned int offset, 887f7760dadSAlex Elder unsigned int len, 888f7760dadSAlex Elder gfp_t gfpmask) 889602adf40SYehuda Sadeh { 890f7760dadSAlex Elder struct bio_vec *bv; 891f7760dadSAlex Elder unsigned int resid; 892f7760dadSAlex Elder unsigned short idx; 893f7760dadSAlex Elder unsigned int voff; 894f7760dadSAlex Elder unsigned short end_idx; 895f7760dadSAlex Elder unsigned short vcnt; 896f7760dadSAlex Elder struct bio *bio; 897602adf40SYehuda Sadeh 898f7760dadSAlex Elder /* Handle the easy case for the caller */ 899f7760dadSAlex Elder 900f7760dadSAlex Elder if (!offset && len == bio_src->bi_size) 901f7760dadSAlex Elder return bio_clone(bio_src, gfpmask); 902f7760dadSAlex Elder 903f7760dadSAlex Elder if (WARN_ON_ONCE(!len)) 904f7760dadSAlex Elder return NULL; 905f7760dadSAlex Elder if (WARN_ON_ONCE(len > bio_src->bi_size)) 906f7760dadSAlex Elder return NULL; 907f7760dadSAlex Elder if (WARN_ON_ONCE(offset > bio_src->bi_size - len)) 908f7760dadSAlex Elder return NULL; 909f7760dadSAlex Elder 910f7760dadSAlex Elder /* Find first affected segment... 
*/ 911f7760dadSAlex Elder 912f7760dadSAlex Elder resid = offset; 913f7760dadSAlex Elder __bio_for_each_segment(bv, bio_src, idx, 0) { 914f7760dadSAlex Elder if (resid < bv->bv_len) 915f7760dadSAlex Elder break; 916f7760dadSAlex Elder resid -= bv->bv_len; 917602adf40SYehuda Sadeh } 918f7760dadSAlex Elder voff = resid; 919602adf40SYehuda Sadeh 920f7760dadSAlex Elder /* ...and the last affected segment */ 921542582fcSAlex Elder 922f7760dadSAlex Elder resid += len; 923f7760dadSAlex Elder __bio_for_each_segment(bv, bio_src, end_idx, idx) { 924f7760dadSAlex Elder if (resid <= bv->bv_len) 925f7760dadSAlex Elder break; 926f7760dadSAlex Elder resid -= bv->bv_len; 927f7760dadSAlex Elder } 928f7760dadSAlex Elder vcnt = end_idx - idx + 1; 929602adf40SYehuda Sadeh 930f7760dadSAlex Elder /* Build the clone */ 931f7760dadSAlex Elder 932f7760dadSAlex Elder bio = bio_alloc(gfpmask, (unsigned int) vcnt); 933f7760dadSAlex Elder if (!bio) 934f7760dadSAlex Elder return NULL; /* ENOMEM */ 935f7760dadSAlex Elder 936f7760dadSAlex Elder bio->bi_bdev = bio_src->bi_bdev; 937f7760dadSAlex Elder bio->bi_sector = bio_src->bi_sector + (offset >> SECTOR_SHIFT); 938f7760dadSAlex Elder bio->bi_rw = bio_src->bi_rw; 939f7760dadSAlex Elder bio->bi_flags |= 1 << BIO_CLONED; 940602adf40SYehuda Sadeh 941602adf40SYehuda Sadeh /* 942f7760dadSAlex Elder * Copy over our part of the bio_vec, then update the first 943f7760dadSAlex Elder * and last (or only) entries. 
944602adf40SYehuda Sadeh */ 945f7760dadSAlex Elder memcpy(&bio->bi_io_vec[0], &bio_src->bi_io_vec[idx], 946f7760dadSAlex Elder vcnt * sizeof (struct bio_vec)); 947f7760dadSAlex Elder bio->bi_io_vec[0].bv_offset += voff; 948f7760dadSAlex Elder if (vcnt > 1) { 949f7760dadSAlex Elder bio->bi_io_vec[0].bv_len -= voff; 950f7760dadSAlex Elder bio->bi_io_vec[vcnt - 1].bv_len = resid; 951602adf40SYehuda Sadeh } else { 952f7760dadSAlex Elder bio->bi_io_vec[0].bv_len = len; 953602adf40SYehuda Sadeh } 954602adf40SYehuda Sadeh 955f7760dadSAlex Elder bio->bi_vcnt = vcnt; 956f7760dadSAlex Elder bio->bi_size = len; 957f7760dadSAlex Elder bio->bi_idx = 0; 958602adf40SYehuda Sadeh 959f7760dadSAlex Elder return bio; 960602adf40SYehuda Sadeh } 961602adf40SYehuda Sadeh 962f7760dadSAlex Elder /* 963f7760dadSAlex Elder * Clone a portion of a bio chain, starting at the given byte offset 964f7760dadSAlex Elder * into the first bio in the source chain and continuing for the 965f7760dadSAlex Elder * number of bytes indicated. The result is another bio chain of 966f7760dadSAlex Elder * exactly the given length, or a null pointer on error. 967f7760dadSAlex Elder * 968f7760dadSAlex Elder * The bio_src and offset parameters are both in-out. On entry they 969f7760dadSAlex Elder * refer to the first source bio and the offset into that bio where 970f7760dadSAlex Elder * the start of data to be cloned is located. 971f7760dadSAlex Elder * 972f7760dadSAlex Elder * On return, bio_src is updated to refer to the bio in the source 973f7760dadSAlex Elder * chain that contains first un-cloned byte, and *offset will 974f7760dadSAlex Elder * contain the offset of that byte within that bio. 
975f7760dadSAlex Elder */ 976f7760dadSAlex Elder static struct bio *bio_chain_clone_range(struct bio **bio_src, 977f7760dadSAlex Elder unsigned int *offset, 978f7760dadSAlex Elder unsigned int len, 979f7760dadSAlex Elder gfp_t gfpmask) 980f7760dadSAlex Elder { 981f7760dadSAlex Elder struct bio *bi = *bio_src; 982f7760dadSAlex Elder unsigned int off = *offset; 983f7760dadSAlex Elder struct bio *chain = NULL; 984f7760dadSAlex Elder struct bio **end; 985602adf40SYehuda Sadeh 986f7760dadSAlex Elder /* Build up a chain of clone bios up to the limit */ 987602adf40SYehuda Sadeh 988f7760dadSAlex Elder if (!bi || off >= bi->bi_size || !len) 989f7760dadSAlex Elder return NULL; /* Nothing to clone */ 990602adf40SYehuda Sadeh 991f7760dadSAlex Elder end = &chain; 992f7760dadSAlex Elder while (len) { 993f7760dadSAlex Elder unsigned int bi_size; 994f7760dadSAlex Elder struct bio *bio; 995f7760dadSAlex Elder 996f5400b7aSAlex Elder if (!bi) { 997f5400b7aSAlex Elder rbd_warn(NULL, "bio_chain exhausted with %u left", len); 998f7760dadSAlex Elder goto out_err; /* EINVAL; ran out of bio's */ 999f5400b7aSAlex Elder } 1000f7760dadSAlex Elder bi_size = min_t(unsigned int, bi->bi_size - off, len); 1001f7760dadSAlex Elder bio = bio_clone_range(bi, off, bi_size, gfpmask); 1002f7760dadSAlex Elder if (!bio) 1003f7760dadSAlex Elder goto out_err; /* ENOMEM */ 1004f7760dadSAlex Elder 1005f7760dadSAlex Elder *end = bio; 1006f7760dadSAlex Elder end = &bio->bi_next; 1007f7760dadSAlex Elder 1008f7760dadSAlex Elder off += bi_size; 1009f7760dadSAlex Elder if (off == bi->bi_size) { 1010f7760dadSAlex Elder bi = bi->bi_next; 1011f7760dadSAlex Elder off = 0; 1012f7760dadSAlex Elder } 1013f7760dadSAlex Elder len -= bi_size; 1014f7760dadSAlex Elder } 1015f7760dadSAlex Elder *bio_src = bi; 1016f7760dadSAlex Elder *offset = off; 1017f7760dadSAlex Elder 1018f7760dadSAlex Elder return chain; 1019f7760dadSAlex Elder out_err: 1020f7760dadSAlex Elder bio_chain_put(chain); 1021f7760dadSAlex Elder 1022602adf40SYehuda 
Sadeh return NULL; 1023602adf40SYehuda Sadeh } 1024602adf40SYehuda Sadeh 1025602adf40SYehuda Sadeh /* 1026602adf40SYehuda Sadeh * helpers for osd request op vectors. 1027602adf40SYehuda Sadeh */ 102857cfc106SAlex Elder static struct ceph_osd_req_op *rbd_create_rw_ops(int num_ops, 102957cfc106SAlex Elder int opcode, u32 payload_len) 1030602adf40SYehuda Sadeh { 103157cfc106SAlex Elder struct ceph_osd_req_op *ops; 103257cfc106SAlex Elder 103357cfc106SAlex Elder ops = kzalloc(sizeof (*ops) * (num_ops + 1), GFP_NOIO); 103457cfc106SAlex Elder if (!ops) 103557cfc106SAlex Elder return NULL; 103657cfc106SAlex Elder 103757cfc106SAlex Elder ops[0].op = opcode; 103857cfc106SAlex Elder 1039602adf40SYehuda Sadeh /* 1040602adf40SYehuda Sadeh * op extent offset and length will be set later on 1041602adf40SYehuda Sadeh * in calc_raw_layout() 1042602adf40SYehuda Sadeh */ 104357cfc106SAlex Elder ops[0].payload_len = payload_len; 104457cfc106SAlex Elder 104557cfc106SAlex Elder return ops; 1046602adf40SYehuda Sadeh } 1047602adf40SYehuda Sadeh 1048602adf40SYehuda Sadeh static void rbd_destroy_ops(struct ceph_osd_req_op *ops) 1049602adf40SYehuda Sadeh { 1050602adf40SYehuda Sadeh kfree(ops); 1051602adf40SYehuda Sadeh } 1052602adf40SYehuda Sadeh 10531fec7093SYehuda Sadeh static void rbd_coll_end_req_index(struct request *rq, 10541fec7093SYehuda Sadeh struct rbd_req_coll *coll, 10551fec7093SYehuda Sadeh int index, 10568986cb37SAlex Elder s32 ret, u64 len) 10571fec7093SYehuda Sadeh { 10581fec7093SYehuda Sadeh struct request_queue *q; 10591fec7093SYehuda Sadeh int min, max, i; 10601fec7093SYehuda Sadeh 1061bd919d45SAlex Elder dout("rbd_coll_end_req_index %p index %d ret %d len %llu\n", 10628986cb37SAlex Elder coll, index, (int)ret, (unsigned long long)len); 10631fec7093SYehuda Sadeh 10641fec7093SYehuda Sadeh if (!rq) 10651fec7093SYehuda Sadeh return; 10661fec7093SYehuda Sadeh 10671fec7093SYehuda Sadeh if (!coll) { 10681fec7093SYehuda Sadeh blk_end_request(rq, ret, len); 10691fec7093SYehuda 
Sadeh return; 10701fec7093SYehuda Sadeh } 10711fec7093SYehuda Sadeh 10721fec7093SYehuda Sadeh q = rq->q; 10731fec7093SYehuda Sadeh 10741fec7093SYehuda Sadeh spin_lock_irq(q->queue_lock); 10751fec7093SYehuda Sadeh coll->status[index].done = 1; 10761fec7093SYehuda Sadeh coll->status[index].rc = ret; 10771fec7093SYehuda Sadeh coll->status[index].bytes = len; 10781fec7093SYehuda Sadeh max = min = coll->num_done; 10791fec7093SYehuda Sadeh while (max < coll->total && coll->status[max].done) 10801fec7093SYehuda Sadeh max++; 10811fec7093SYehuda Sadeh 10821fec7093SYehuda Sadeh for (i = min; i<max; i++) { 10838986cb37SAlex Elder __blk_end_request(rq, (int)coll->status[i].rc, 10841fec7093SYehuda Sadeh coll->status[i].bytes); 10851fec7093SYehuda Sadeh coll->num_done++; 10861fec7093SYehuda Sadeh kref_put(&coll->kref, rbd_coll_release); 10871fec7093SYehuda Sadeh } 10881fec7093SYehuda Sadeh spin_unlock_irq(q->queue_lock); 10891fec7093SYehuda Sadeh } 10901fec7093SYehuda Sadeh 1091725afc97SAlex Elder static void rbd_coll_end_req(struct rbd_request *rbd_req, 10928986cb37SAlex Elder s32 ret, u64 len) 10931fec7093SYehuda Sadeh { 1094725afc97SAlex Elder rbd_coll_end_req_index(rbd_req->rq, 1095725afc97SAlex Elder rbd_req->coll, rbd_req->coll_index, 1096725afc97SAlex Elder ret, len); 10971fec7093SYehuda Sadeh } 10981fec7093SYehuda Sadeh 1099602adf40SYehuda Sadeh /* 1100602adf40SYehuda Sadeh * Send ceph osd request 1101602adf40SYehuda Sadeh */ 1102602adf40SYehuda Sadeh static int rbd_do_request(struct request *rq, 11030ce1a794SAlex Elder struct rbd_device *rbd_dev, 1104602adf40SYehuda Sadeh struct ceph_snap_context *snapc, 1105602adf40SYehuda Sadeh u64 snapid, 1106aded07eaSAlex Elder const char *object_name, u64 ofs, u64 len, 1107602adf40SYehuda Sadeh struct bio *bio, 1108602adf40SYehuda Sadeh struct page **pages, 1109602adf40SYehuda Sadeh int num_pages, 1110602adf40SYehuda Sadeh int flags, 1111602adf40SYehuda Sadeh struct ceph_osd_req_op *ops, 11121fec7093SYehuda Sadeh struct 
rbd_req_coll *coll, 11131fec7093SYehuda Sadeh int coll_index, 11145f29ddd4SAlex Elder void (*rbd_cb)(struct ceph_osd_request *, 11155f29ddd4SAlex Elder struct ceph_msg *), 111659c2be1eSYehuda Sadeh struct ceph_osd_request **linger_req, 111759c2be1eSYehuda Sadeh u64 *ver) 1118602adf40SYehuda Sadeh { 11195f29ddd4SAlex Elder struct ceph_osd_request *osd_req; 1120602adf40SYehuda Sadeh struct ceph_file_layout *layout; 1121602adf40SYehuda Sadeh int ret; 1122602adf40SYehuda Sadeh u64 bno; 1123602adf40SYehuda Sadeh struct timespec mtime = CURRENT_TIME; 1124725afc97SAlex Elder struct rbd_request *rbd_req; 1125602adf40SYehuda Sadeh struct ceph_osd_request_head *reqhead; 11261dbb4399SAlex Elder struct ceph_osd_client *osdc; 1127602adf40SYehuda Sadeh 1128725afc97SAlex Elder rbd_req = kzalloc(sizeof(*rbd_req), GFP_NOIO); 1129cd323ac0SAlex Elder if (!rbd_req) 11301fec7093SYehuda Sadeh return -ENOMEM; 1131602adf40SYehuda Sadeh 11321fec7093SYehuda Sadeh if (coll) { 1133725afc97SAlex Elder rbd_req->coll = coll; 1134725afc97SAlex Elder rbd_req->coll_index = coll_index; 11351fec7093SYehuda Sadeh } 11361fec7093SYehuda Sadeh 1137f7760dadSAlex Elder dout("rbd_do_request object_name=%s ofs=%llu len=%llu coll=%p[%d]\n", 1138f7760dadSAlex Elder object_name, (unsigned long long) ofs, 1139f7760dadSAlex Elder (unsigned long long) len, coll, coll_index); 1140602adf40SYehuda Sadeh 11410ce1a794SAlex Elder osdc = &rbd_dev->rbd_client->client->osdc; 11425f29ddd4SAlex Elder osd_req = ceph_osdc_alloc_request(osdc, flags, snapc, ops, 11431dbb4399SAlex Elder false, GFP_NOIO, pages, bio); 11445f29ddd4SAlex Elder if (!osd_req) { 11454ad12621SSage Weil ret = -ENOMEM; 1146602adf40SYehuda Sadeh goto done_pages; 1147602adf40SYehuda Sadeh } 1148602adf40SYehuda Sadeh 11495f29ddd4SAlex Elder osd_req->r_callback = rbd_cb; 1150602adf40SYehuda Sadeh 1151725afc97SAlex Elder rbd_req->rq = rq; 1152725afc97SAlex Elder rbd_req->bio = bio; 1153725afc97SAlex Elder rbd_req->pages = pages; 1154725afc97SAlex Elder 
rbd_req->len = len; 1155602adf40SYehuda Sadeh 11565f29ddd4SAlex Elder osd_req->r_priv = rbd_req; 1157602adf40SYehuda Sadeh 11585f29ddd4SAlex Elder reqhead = osd_req->r_request->front.iov_base; 1159602adf40SYehuda Sadeh reqhead->snapid = cpu_to_le64(CEPH_NOSNAP); 1160602adf40SYehuda Sadeh 11615f29ddd4SAlex Elder strncpy(osd_req->r_oid, object_name, sizeof(osd_req->r_oid)); 11625f29ddd4SAlex Elder osd_req->r_oid_len = strlen(osd_req->r_oid); 1163602adf40SYehuda Sadeh 11645f29ddd4SAlex Elder layout = &osd_req->r_file_layout; 1165602adf40SYehuda Sadeh memset(layout, 0, sizeof(*layout)); 1166602adf40SYehuda Sadeh layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER); 1167602adf40SYehuda Sadeh layout->fl_stripe_count = cpu_to_le32(1); 1168602adf40SYehuda Sadeh layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER); 11690d7dbfceSAlex Elder layout->fl_pg_pool = cpu_to_le32((int) rbd_dev->spec->pool_id); 11706cae3717SSage Weil ret = ceph_calc_raw_layout(osdc, layout, snapid, ofs, &len, &bno, 11715f29ddd4SAlex Elder osd_req, ops); 11726cae3717SSage Weil rbd_assert(ret == 0); 1173602adf40SYehuda Sadeh 11745f29ddd4SAlex Elder ceph_osdc_build_request(osd_req, ofs, &len, 1175602adf40SYehuda Sadeh ops, 1176602adf40SYehuda Sadeh snapc, 1177602adf40SYehuda Sadeh &mtime, 11785f29ddd4SAlex Elder osd_req->r_oid, osd_req->r_oid_len); 1179602adf40SYehuda Sadeh 118059c2be1eSYehuda Sadeh if (linger_req) { 11815f29ddd4SAlex Elder ceph_osdc_set_request_linger(osdc, osd_req); 11825f29ddd4SAlex Elder *linger_req = osd_req; 118359c2be1eSYehuda Sadeh } 118459c2be1eSYehuda Sadeh 11855f29ddd4SAlex Elder ret = ceph_osdc_start_request(osdc, osd_req, false); 1186602adf40SYehuda Sadeh if (ret < 0) 1187602adf40SYehuda Sadeh goto done_err; 1188602adf40SYehuda Sadeh 1189602adf40SYehuda Sadeh if (!rbd_cb) { 11905f29ddd4SAlex Elder u64 version; 11915f29ddd4SAlex Elder 11925f29ddd4SAlex Elder ret = ceph_osdc_wait_request(osdc, osd_req); 11935f29ddd4SAlex Elder version = 
le64_to_cpu(osd_req->r_reassert_version.version); 119459c2be1eSYehuda Sadeh if (ver) 11955f29ddd4SAlex Elder *ver = version; 11965f29ddd4SAlex Elder dout("reassert_ver=%llu\n", (unsigned long long) version); 11975f29ddd4SAlex Elder ceph_osdc_put_request(osd_req); 1198602adf40SYehuda Sadeh } 1199602adf40SYehuda Sadeh return ret; 1200602adf40SYehuda Sadeh 1201602adf40SYehuda Sadeh done_err: 1202725afc97SAlex Elder bio_chain_put(rbd_req->bio); 12035f29ddd4SAlex Elder ceph_osdc_put_request(osd_req); 1204602adf40SYehuda Sadeh done_pages: 1205725afc97SAlex Elder kfree(rbd_req); 1206602adf40SYehuda Sadeh return ret; 1207602adf40SYehuda Sadeh } 1208602adf40SYehuda Sadeh 1209602adf40SYehuda Sadeh /* 1210602adf40SYehuda Sadeh * Ceph osd op callback 1211602adf40SYehuda Sadeh */ 12125f29ddd4SAlex Elder static void rbd_req_cb(struct ceph_osd_request *osd_req, struct ceph_msg *msg) 1213602adf40SYehuda Sadeh { 12145f29ddd4SAlex Elder struct rbd_request *rbd_req = osd_req->r_priv; 1215602adf40SYehuda Sadeh struct ceph_osd_reply_head *replyhead; 1216602adf40SYehuda Sadeh struct ceph_osd_op *op; 12178986cb37SAlex Elder s32 rc; 1218602adf40SYehuda Sadeh u64 bytes; 1219602adf40SYehuda Sadeh int read_op; 1220602adf40SYehuda Sadeh 1221602adf40SYehuda Sadeh /* parse reply */ 1222602adf40SYehuda Sadeh replyhead = msg->front.iov_base; 1223602adf40SYehuda Sadeh WARN_ON(le32_to_cpu(replyhead->num_ops) == 0); 1224602adf40SYehuda Sadeh op = (void *)(replyhead + 1); 12258986cb37SAlex Elder rc = (s32)le32_to_cpu(replyhead->result); 1226602adf40SYehuda Sadeh bytes = le64_to_cpu(op->extent.length); 1227895cfcc8SDan Carpenter read_op = (le16_to_cpu(op->op) == CEPH_OSD_OP_READ); 1228602adf40SYehuda Sadeh 1229bd919d45SAlex Elder dout("rbd_req_cb bytes=%llu readop=%d rc=%d\n", 1230bd919d45SAlex Elder (unsigned long long) bytes, read_op, (int) rc); 1231602adf40SYehuda Sadeh 12328986cb37SAlex Elder if (rc == (s32)-ENOENT && read_op) { 1233725afc97SAlex Elder zero_bio_chain(rbd_req->bio, 0); 
1234602adf40SYehuda Sadeh rc = 0; 1235725afc97SAlex Elder } else if (rc == 0 && read_op && bytes < rbd_req->len) { 1236725afc97SAlex Elder zero_bio_chain(rbd_req->bio, bytes); 1237725afc97SAlex Elder bytes = rbd_req->len; 1238602adf40SYehuda Sadeh } 1239602adf40SYehuda Sadeh 1240725afc97SAlex Elder rbd_coll_end_req(rbd_req, rc, bytes); 1241602adf40SYehuda Sadeh 1242725afc97SAlex Elder if (rbd_req->bio) 1243725afc97SAlex Elder bio_chain_put(rbd_req->bio); 1244602adf40SYehuda Sadeh 12455f29ddd4SAlex Elder ceph_osdc_put_request(osd_req); 1246725afc97SAlex Elder kfree(rbd_req); 1247602adf40SYehuda Sadeh } 1248602adf40SYehuda Sadeh 12495f29ddd4SAlex Elder static void rbd_simple_req_cb(struct ceph_osd_request *osd_req, 12505f29ddd4SAlex Elder struct ceph_msg *msg) 125159c2be1eSYehuda Sadeh { 12525f29ddd4SAlex Elder ceph_osdc_put_request(osd_req); 125359c2be1eSYehuda Sadeh } 125459c2be1eSYehuda Sadeh 1255602adf40SYehuda Sadeh /* 1256602adf40SYehuda Sadeh * Do a synchronous ceph osd operation 1257602adf40SYehuda Sadeh */ 12580ce1a794SAlex Elder static int rbd_req_sync_op(struct rbd_device *rbd_dev, 1259602adf40SYehuda Sadeh struct ceph_snap_context *snapc, 1260602adf40SYehuda Sadeh u64 snapid, 1261602adf40SYehuda Sadeh int flags, 1262913d2fdcSAlex Elder struct ceph_osd_req_op *ops, 1263aded07eaSAlex Elder const char *object_name, 1264f8d4de6eSAlex Elder u64 ofs, u64 inbound_size, 1265f8d4de6eSAlex Elder char *inbound, 126659c2be1eSYehuda Sadeh struct ceph_osd_request **linger_req, 126759c2be1eSYehuda Sadeh u64 *ver) 1268602adf40SYehuda Sadeh { 1269602adf40SYehuda Sadeh int ret; 1270602adf40SYehuda Sadeh struct page **pages; 1271602adf40SYehuda Sadeh int num_pages; 1272913d2fdcSAlex Elder 1273aafb230eSAlex Elder rbd_assert(ops != NULL); 1274602adf40SYehuda Sadeh 1275f8d4de6eSAlex Elder num_pages = calc_pages_for(ofs, inbound_size); 1276602adf40SYehuda Sadeh pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL); 1277b8d0638aSDan Carpenter if (IS_ERR(pages)) 1278b8d0638aSDan 
Carpenter return PTR_ERR(pages); 1279602adf40SYehuda Sadeh 12800ce1a794SAlex Elder ret = rbd_do_request(NULL, rbd_dev, snapc, snapid, 1281f8d4de6eSAlex Elder object_name, ofs, inbound_size, NULL, 1282602adf40SYehuda Sadeh pages, num_pages, 1283602adf40SYehuda Sadeh flags, 1284602adf40SYehuda Sadeh ops, 12851fec7093SYehuda Sadeh NULL, 0, 128659c2be1eSYehuda Sadeh NULL, 128759c2be1eSYehuda Sadeh linger_req, ver); 1288602adf40SYehuda Sadeh if (ret < 0) 1289913d2fdcSAlex Elder goto done; 1290602adf40SYehuda Sadeh 1291f8d4de6eSAlex Elder if ((flags & CEPH_OSD_FLAG_READ) && inbound) 1292f8d4de6eSAlex Elder ret = ceph_copy_from_page_vector(pages, inbound, ofs, ret); 1293602adf40SYehuda Sadeh 1294602adf40SYehuda Sadeh done: 1295602adf40SYehuda Sadeh ceph_release_page_vector(pages, num_pages); 1296602adf40SYehuda Sadeh return ret; 1297602adf40SYehuda Sadeh } 1298602adf40SYehuda Sadeh 1299602adf40SYehuda Sadeh /* 1300602adf40SYehuda Sadeh * Do an asynchronous ceph osd operation 1301602adf40SYehuda Sadeh */ 1302602adf40SYehuda Sadeh static int rbd_do_op(struct request *rq, 1303602adf40SYehuda Sadeh struct rbd_device *rbd_dev, 1304602adf40SYehuda Sadeh struct ceph_snap_context *snapc, 1305602adf40SYehuda Sadeh u64 ofs, u64 len, 13061fec7093SYehuda Sadeh struct bio *bio, 13071fec7093SYehuda Sadeh struct rbd_req_coll *coll, 13081fec7093SYehuda Sadeh int coll_index) 1309602adf40SYehuda Sadeh { 1310602adf40SYehuda Sadeh char *seg_name; 1311602adf40SYehuda Sadeh u64 seg_ofs; 1312602adf40SYehuda Sadeh u64 seg_len; 1313602adf40SYehuda Sadeh int ret; 1314602adf40SYehuda Sadeh struct ceph_osd_req_op *ops; 1315602adf40SYehuda Sadeh u32 payload_len; 1316ff2e4bb5SAlex Elder int opcode; 1317ff2e4bb5SAlex Elder int flags; 13184634246dSAlex Elder u64 snapid; 1319602adf40SYehuda Sadeh 132065ccfe21SAlex Elder seg_name = rbd_segment_name(rbd_dev, ofs); 1321602adf40SYehuda Sadeh if (!seg_name) 1322602adf40SYehuda Sadeh return -ENOMEM; 132365ccfe21SAlex Elder seg_len = rbd_segment_length(rbd_dev, 
ofs, len); 132465ccfe21SAlex Elder seg_ofs = rbd_segment_offset(rbd_dev, ofs); 1325602adf40SYehuda Sadeh 1326ff2e4bb5SAlex Elder if (rq_data_dir(rq) == WRITE) { 1327ff2e4bb5SAlex Elder opcode = CEPH_OSD_OP_WRITE; 1328ff2e4bb5SAlex Elder flags = CEPH_OSD_FLAG_WRITE|CEPH_OSD_FLAG_ONDISK; 13294634246dSAlex Elder snapid = CEPH_NOSNAP; 1330ff2e4bb5SAlex Elder payload_len = seg_len; 1331ff2e4bb5SAlex Elder } else { 1332ff2e4bb5SAlex Elder opcode = CEPH_OSD_OP_READ; 1333ff2e4bb5SAlex Elder flags = CEPH_OSD_FLAG_READ; 13344634246dSAlex Elder snapc = NULL; 13350d7dbfceSAlex Elder snapid = rbd_dev->spec->snap_id; 1336ff2e4bb5SAlex Elder payload_len = 0; 1337ff2e4bb5SAlex Elder } 1338602adf40SYehuda Sadeh 133957cfc106SAlex Elder ret = -ENOMEM; 134057cfc106SAlex Elder ops = rbd_create_rw_ops(1, opcode, payload_len); 134157cfc106SAlex Elder if (!ops) 1342602adf40SYehuda Sadeh goto done; 1343602adf40SYehuda Sadeh 1344602adf40SYehuda Sadeh /* we've taken care of segment sizes earlier when we 1345602adf40SYehuda Sadeh cloned the bios. 
We should never have a segment 1346602adf40SYehuda Sadeh truncated at this point */ 1347aafb230eSAlex Elder rbd_assert(seg_len == len); 1348602adf40SYehuda Sadeh 1349602adf40SYehuda Sadeh ret = rbd_do_request(rq, rbd_dev, snapc, snapid, 1350602adf40SYehuda Sadeh seg_name, seg_ofs, seg_len, 1351602adf40SYehuda Sadeh bio, 1352602adf40SYehuda Sadeh NULL, 0, 1353602adf40SYehuda Sadeh flags, 1354602adf40SYehuda Sadeh ops, 13551fec7093SYehuda Sadeh coll, coll_index, 135659c2be1eSYehuda Sadeh rbd_req_cb, 0, NULL); 1357cd323ac0SAlex Elder if (ret < 0) 1358cd323ac0SAlex Elder rbd_coll_end_req_index(rq, coll, coll_index, 1359cd323ac0SAlex Elder (s32)ret, seg_len); 136011f77002SSage Weil rbd_destroy_ops(ops); 1361602adf40SYehuda Sadeh done: 1362602adf40SYehuda Sadeh kfree(seg_name); 1363602adf40SYehuda Sadeh return ret; 1364602adf40SYehuda Sadeh } 1365602adf40SYehuda Sadeh 1366602adf40SYehuda Sadeh /* 1367602adf40SYehuda Sadeh * Request sync osd read 1368602adf40SYehuda Sadeh */ 13690ce1a794SAlex Elder static int rbd_req_sync_read(struct rbd_device *rbd_dev, 1370602adf40SYehuda Sadeh u64 snapid, 1371aded07eaSAlex Elder const char *object_name, 1372602adf40SYehuda Sadeh u64 ofs, u64 len, 137359c2be1eSYehuda Sadeh char *buf, 137459c2be1eSYehuda Sadeh u64 *ver) 1375602adf40SYehuda Sadeh { 1376913d2fdcSAlex Elder struct ceph_osd_req_op *ops; 1377913d2fdcSAlex Elder int ret; 1378913d2fdcSAlex Elder 1379913d2fdcSAlex Elder ops = rbd_create_rw_ops(1, CEPH_OSD_OP_READ, 0); 1380913d2fdcSAlex Elder if (!ops) 1381913d2fdcSAlex Elder return -ENOMEM; 1382913d2fdcSAlex Elder 1383913d2fdcSAlex Elder ret = rbd_req_sync_op(rbd_dev, NULL, 1384b06e6a6bSJosh Durgin snapid, 1385602adf40SYehuda Sadeh CEPH_OSD_FLAG_READ, 1386913d2fdcSAlex Elder ops, object_name, ofs, len, buf, NULL, ver); 1387913d2fdcSAlex Elder rbd_destroy_ops(ops); 1388913d2fdcSAlex Elder 1389913d2fdcSAlex Elder return ret; 1390602adf40SYehuda Sadeh } 1391602adf40SYehuda Sadeh 1392602adf40SYehuda Sadeh /* 139359c2be1eSYehuda 
Sadeh * Request sync osd watch 139459c2be1eSYehuda Sadeh */ 13950ce1a794SAlex Elder static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev, 139659c2be1eSYehuda Sadeh u64 ver, 13977f0a24d8SAlex Elder u64 notify_id) 139859c2be1eSYehuda Sadeh { 139959c2be1eSYehuda Sadeh struct ceph_osd_req_op *ops; 140011f77002SSage Weil int ret; 140111f77002SSage Weil 140257cfc106SAlex Elder ops = rbd_create_rw_ops(1, CEPH_OSD_OP_NOTIFY_ACK, 0); 140357cfc106SAlex Elder if (!ops) 140457cfc106SAlex Elder return -ENOMEM; 140559c2be1eSYehuda Sadeh 1406a71b891bSJosh Durgin ops[0].watch.ver = cpu_to_le64(ver); 140759c2be1eSYehuda Sadeh ops[0].watch.cookie = notify_id; 140859c2be1eSYehuda Sadeh ops[0].watch.flag = 0; 140959c2be1eSYehuda Sadeh 14100ce1a794SAlex Elder ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP, 14117f0a24d8SAlex Elder rbd_dev->header_name, 0, 0, NULL, 1412ad4f232fSAlex Elder NULL, 0, 141359c2be1eSYehuda Sadeh CEPH_OSD_FLAG_READ, 141459c2be1eSYehuda Sadeh ops, 14151fec7093SYehuda Sadeh NULL, 0, 141659c2be1eSYehuda Sadeh rbd_simple_req_cb, 0, NULL); 141759c2be1eSYehuda Sadeh 141859c2be1eSYehuda Sadeh rbd_destroy_ops(ops); 141959c2be1eSYehuda Sadeh return ret; 142059c2be1eSYehuda Sadeh } 142159c2be1eSYehuda Sadeh 142259c2be1eSYehuda Sadeh static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data) 142359c2be1eSYehuda Sadeh { 14240ce1a794SAlex Elder struct rbd_device *rbd_dev = (struct rbd_device *)data; 1425a71b891bSJosh Durgin u64 hver; 142613143d2dSSage Weil int rc; 142713143d2dSSage Weil 14280ce1a794SAlex Elder if (!rbd_dev) 142959c2be1eSYehuda Sadeh return; 143059c2be1eSYehuda Sadeh 1431bd919d45SAlex Elder dout("rbd_watch_cb %s notify_id=%llu opcode=%u\n", 1432bd919d45SAlex Elder rbd_dev->header_name, (unsigned long long) notify_id, 1433bd919d45SAlex Elder (unsigned int) opcode); 1434117973fbSAlex Elder rc = rbd_dev_refresh(rbd_dev, &hver); 143513143d2dSSage Weil if (rc) 143606ecc6cbSAlex Elder rbd_warn(rbd_dev, "got notification but failed to " 
143706ecc6cbSAlex Elder " update snaps: %d\n", rc); 143859c2be1eSYehuda Sadeh 14397f0a24d8SAlex Elder rbd_req_sync_notify_ack(rbd_dev, hver, notify_id); 144059c2be1eSYehuda Sadeh } 144159c2be1eSYehuda Sadeh 144259c2be1eSYehuda Sadeh /* 144359c2be1eSYehuda Sadeh * Request sync osd watch 144459c2be1eSYehuda Sadeh */ 14450e6f322dSAlex Elder static int rbd_req_sync_watch(struct rbd_device *rbd_dev) 144659c2be1eSYehuda Sadeh { 144759c2be1eSYehuda Sadeh struct ceph_osd_req_op *ops; 14480ce1a794SAlex Elder struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 144957cfc106SAlex Elder int ret; 145059c2be1eSYehuda Sadeh 145157cfc106SAlex Elder ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0); 145257cfc106SAlex Elder if (!ops) 145357cfc106SAlex Elder return -ENOMEM; 145459c2be1eSYehuda Sadeh 145559c2be1eSYehuda Sadeh ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0, 14560ce1a794SAlex Elder (void *)rbd_dev, &rbd_dev->watch_event); 145759c2be1eSYehuda Sadeh if (ret < 0) 145859c2be1eSYehuda Sadeh goto fail; 145959c2be1eSYehuda Sadeh 14600e6f322dSAlex Elder ops[0].watch.ver = cpu_to_le64(rbd_dev->header.obj_version); 14610ce1a794SAlex Elder ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie); 146259c2be1eSYehuda Sadeh ops[0].watch.flag = 1; 146359c2be1eSYehuda Sadeh 14640ce1a794SAlex Elder ret = rbd_req_sync_op(rbd_dev, NULL, 146559c2be1eSYehuda Sadeh CEPH_NOSNAP, 146659c2be1eSYehuda Sadeh CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK, 146759c2be1eSYehuda Sadeh ops, 14680e6f322dSAlex Elder rbd_dev->header_name, 14690e6f322dSAlex Elder 0, 0, NULL, 14700ce1a794SAlex Elder &rbd_dev->watch_request, NULL); 147159c2be1eSYehuda Sadeh 147259c2be1eSYehuda Sadeh if (ret < 0) 147359c2be1eSYehuda Sadeh goto fail_event; 147459c2be1eSYehuda Sadeh 147559c2be1eSYehuda Sadeh rbd_destroy_ops(ops); 147659c2be1eSYehuda Sadeh return 0; 147759c2be1eSYehuda Sadeh 147859c2be1eSYehuda Sadeh fail_event: 14790ce1a794SAlex Elder ceph_osdc_cancel_event(rbd_dev->watch_event); 
14800ce1a794SAlex Elder rbd_dev->watch_event = NULL; 148159c2be1eSYehuda Sadeh fail: 148259c2be1eSYehuda Sadeh rbd_destroy_ops(ops); 148359c2be1eSYehuda Sadeh return ret; 148459c2be1eSYehuda Sadeh } 148559c2be1eSYehuda Sadeh 148679e3057cSYehuda Sadeh /* 148779e3057cSYehuda Sadeh * Request sync osd unwatch 148879e3057cSYehuda Sadeh */ 1489070c633fSAlex Elder static int rbd_req_sync_unwatch(struct rbd_device *rbd_dev) 149079e3057cSYehuda Sadeh { 149179e3057cSYehuda Sadeh struct ceph_osd_req_op *ops; 149257cfc106SAlex Elder int ret; 149379e3057cSYehuda Sadeh 149457cfc106SAlex Elder ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0); 149557cfc106SAlex Elder if (!ops) 149657cfc106SAlex Elder return -ENOMEM; 149779e3057cSYehuda Sadeh 149879e3057cSYehuda Sadeh ops[0].watch.ver = 0; 14990ce1a794SAlex Elder ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie); 150079e3057cSYehuda Sadeh ops[0].watch.flag = 0; 150179e3057cSYehuda Sadeh 15020ce1a794SAlex Elder ret = rbd_req_sync_op(rbd_dev, NULL, 150379e3057cSYehuda Sadeh CEPH_NOSNAP, 150479e3057cSYehuda Sadeh CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK, 150579e3057cSYehuda Sadeh ops, 1506070c633fSAlex Elder rbd_dev->header_name, 1507070c633fSAlex Elder 0, 0, NULL, NULL, NULL); 1508070c633fSAlex Elder 150979e3057cSYehuda Sadeh 151079e3057cSYehuda Sadeh rbd_destroy_ops(ops); 15110ce1a794SAlex Elder ceph_osdc_cancel_event(rbd_dev->watch_event); 15120ce1a794SAlex Elder rbd_dev->watch_event = NULL; 151379e3057cSYehuda Sadeh return ret; 151479e3057cSYehuda Sadeh } 151579e3057cSYehuda Sadeh 151659c2be1eSYehuda Sadeh /* 15173cb4a687SAlex Elder * Synchronous osd object method call 1518602adf40SYehuda Sadeh */ 15190ce1a794SAlex Elder static int rbd_req_sync_exec(struct rbd_device *rbd_dev, 1520aded07eaSAlex Elder const char *object_name, 1521aded07eaSAlex Elder const char *class_name, 1522aded07eaSAlex Elder const char *method_name, 15233cb4a687SAlex Elder const char *outbound, 15243cb4a687SAlex Elder size_t outbound_size, 
1525f8d4de6eSAlex Elder char *inbound, 1526f8d4de6eSAlex Elder size_t inbound_size, 15273cb4a687SAlex Elder int flags, 152859c2be1eSYehuda Sadeh u64 *ver) 1529602adf40SYehuda Sadeh { 1530602adf40SYehuda Sadeh struct ceph_osd_req_op *ops; 1531aded07eaSAlex Elder int class_name_len = strlen(class_name); 1532aded07eaSAlex Elder int method_name_len = strlen(method_name); 15333cb4a687SAlex Elder int payload_size; 153457cfc106SAlex Elder int ret; 153557cfc106SAlex Elder 15363cb4a687SAlex Elder /* 15373cb4a687SAlex Elder * Any input parameters required by the method we're calling 15383cb4a687SAlex Elder * will be sent along with the class and method names as 15393cb4a687SAlex Elder * part of the message payload. That data and its size are 15403cb4a687SAlex Elder * supplied via the indata and indata_len fields (named from 15413cb4a687SAlex Elder * the perspective of the server side) in the OSD request 15423cb4a687SAlex Elder * operation. 15433cb4a687SAlex Elder */ 15443cb4a687SAlex Elder payload_size = class_name_len + method_name_len + outbound_size; 15453cb4a687SAlex Elder ops = rbd_create_rw_ops(1, CEPH_OSD_OP_CALL, payload_size); 154657cfc106SAlex Elder if (!ops) 154757cfc106SAlex Elder return -ENOMEM; 1548602adf40SYehuda Sadeh 1549aded07eaSAlex Elder ops[0].cls.class_name = class_name; 1550aded07eaSAlex Elder ops[0].cls.class_len = (__u8) class_name_len; 1551aded07eaSAlex Elder ops[0].cls.method_name = method_name; 1552aded07eaSAlex Elder ops[0].cls.method_len = (__u8) method_name_len; 1553602adf40SYehuda Sadeh ops[0].cls.argc = 0; 15543cb4a687SAlex Elder ops[0].cls.indata = outbound; 15553cb4a687SAlex Elder ops[0].cls.indata_len = outbound_size; 1556602adf40SYehuda Sadeh 15570ce1a794SAlex Elder ret = rbd_req_sync_op(rbd_dev, NULL, 1558602adf40SYehuda Sadeh CEPH_NOSNAP, 15593cb4a687SAlex Elder flags, ops, 1560f8d4de6eSAlex Elder object_name, 0, inbound_size, inbound, 1561f8d4de6eSAlex Elder NULL, ver); 1562602adf40SYehuda Sadeh 1563602adf40SYehuda Sadeh 
rbd_destroy_ops(ops); 1564602adf40SYehuda Sadeh 1565602adf40SYehuda Sadeh dout("cls_exec returned %d\n", ret); 1566602adf40SYehuda Sadeh return ret; 1567602adf40SYehuda Sadeh } 1568602adf40SYehuda Sadeh 15691fec7093SYehuda Sadeh static struct rbd_req_coll *rbd_alloc_coll(int num_reqs) 15701fec7093SYehuda Sadeh { 15711fec7093SYehuda Sadeh struct rbd_req_coll *coll = 15721fec7093SYehuda Sadeh kzalloc(sizeof(struct rbd_req_coll) + 15731fec7093SYehuda Sadeh sizeof(struct rbd_req_status) * num_reqs, 15741fec7093SYehuda Sadeh GFP_ATOMIC); 15751fec7093SYehuda Sadeh 15761fec7093SYehuda Sadeh if (!coll) 15771fec7093SYehuda Sadeh return NULL; 15781fec7093SYehuda Sadeh coll->total = num_reqs; 15791fec7093SYehuda Sadeh kref_init(&coll->kref); 15801fec7093SYehuda Sadeh return coll; 15811fec7093SYehuda Sadeh } 15821fec7093SYehuda Sadeh 15838295cda7SAlex Elder static int rbd_dev_do_request(struct request *rq, 15848295cda7SAlex Elder struct rbd_device *rbd_dev, 15858295cda7SAlex Elder struct ceph_snap_context *snapc, 15868295cda7SAlex Elder u64 ofs, unsigned int size, 15878295cda7SAlex Elder struct bio *bio_chain) 15888295cda7SAlex Elder { 15898295cda7SAlex Elder int num_segs; 15908295cda7SAlex Elder struct rbd_req_coll *coll; 15918295cda7SAlex Elder unsigned int bio_offset; 15928295cda7SAlex Elder int cur_seg = 0; 15938295cda7SAlex Elder 15948295cda7SAlex Elder dout("%s 0x%x bytes at 0x%llx\n", 15958295cda7SAlex Elder rq_data_dir(rq) == WRITE ? 
"write" : "read", 15968295cda7SAlex Elder size, (unsigned long long) blk_rq_pos(rq) * SECTOR_SIZE); 15978295cda7SAlex Elder 15988295cda7SAlex Elder num_segs = rbd_get_num_segments(&rbd_dev->header, ofs, size); 15998295cda7SAlex Elder if (num_segs <= 0) 16008295cda7SAlex Elder return num_segs; 16018295cda7SAlex Elder 16028295cda7SAlex Elder coll = rbd_alloc_coll(num_segs); 16038295cda7SAlex Elder if (!coll) 16048295cda7SAlex Elder return -ENOMEM; 16058295cda7SAlex Elder 16068295cda7SAlex Elder bio_offset = 0; 16078295cda7SAlex Elder do { 16088295cda7SAlex Elder u64 limit = rbd_segment_length(rbd_dev, ofs, size); 16098295cda7SAlex Elder unsigned int clone_size; 16108295cda7SAlex Elder struct bio *bio_clone; 16118295cda7SAlex Elder 16128295cda7SAlex Elder BUG_ON(limit > (u64)UINT_MAX); 16138295cda7SAlex Elder clone_size = (unsigned int)limit; 16148295cda7SAlex Elder dout("bio_chain->bi_vcnt=%hu\n", bio_chain->bi_vcnt); 16158295cda7SAlex Elder 16168295cda7SAlex Elder kref_get(&coll->kref); 16178295cda7SAlex Elder 16188295cda7SAlex Elder /* Pass a cloned bio chain via an osd request */ 16198295cda7SAlex Elder 16208295cda7SAlex Elder bio_clone = bio_chain_clone_range(&bio_chain, 16218295cda7SAlex Elder &bio_offset, clone_size, 16228295cda7SAlex Elder GFP_ATOMIC); 16238295cda7SAlex Elder if (bio_clone) 16248295cda7SAlex Elder (void)rbd_do_op(rq, rbd_dev, snapc, 16258295cda7SAlex Elder ofs, clone_size, 16268295cda7SAlex Elder bio_clone, coll, cur_seg); 16278295cda7SAlex Elder else 16288295cda7SAlex Elder rbd_coll_end_req_index(rq, coll, cur_seg, 16298295cda7SAlex Elder (s32)-ENOMEM, 16308295cda7SAlex Elder clone_size); 16318295cda7SAlex Elder size -= clone_size; 16328295cda7SAlex Elder ofs += clone_size; 16338295cda7SAlex Elder 16348295cda7SAlex Elder cur_seg++; 16358295cda7SAlex Elder } while (size > 0); 16368295cda7SAlex Elder kref_put(&coll->kref, rbd_coll_release); 16378295cda7SAlex Elder 16388295cda7SAlex Elder return 0; 16398295cda7SAlex Elder } 16408295cda7SAlex 
Elder 1641602adf40SYehuda Sadeh /* 1642602adf40SYehuda Sadeh * block device queue callback 1643602adf40SYehuda Sadeh */ 1644602adf40SYehuda Sadeh static void rbd_rq_fn(struct request_queue *q) 1645602adf40SYehuda Sadeh { 1646602adf40SYehuda Sadeh struct rbd_device *rbd_dev = q->queuedata; 1647b395e8b5SAlex Elder bool read_only = rbd_dev->mapping.read_only; 1648602adf40SYehuda Sadeh struct request *rq; 1649602adf40SYehuda Sadeh 165000f1f36fSAlex Elder while ((rq = blk_fetch_request(q))) { 1651b395e8b5SAlex Elder struct ceph_snap_context *snapc = NULL; 1652b395e8b5SAlex Elder unsigned int size = 0; 16538295cda7SAlex Elder int result; 1654602adf40SYehuda Sadeh 1655602adf40SYehuda Sadeh dout("fetched request\n"); 1656602adf40SYehuda Sadeh 1657b395e8b5SAlex Elder /* Filter out block requests we don't understand */ 1658b395e8b5SAlex Elder 1659602adf40SYehuda Sadeh if ((rq->cmd_type != REQ_TYPE_FS)) { 1660602adf40SYehuda Sadeh __blk_end_request_all(rq, 0); 166100f1f36fSAlex Elder continue; 1662602adf40SYehuda Sadeh } 1663602adf40SYehuda Sadeh spin_unlock_irq(q->queue_lock); 1664602adf40SYehuda Sadeh 1665b395e8b5SAlex Elder /* Stop writes to a read-only device */ 1666e88a36ecSJosh Durgin 1667b395e8b5SAlex Elder result = -EROFS; 1668b395e8b5SAlex Elder if (read_only && rq_data_dir(rq) == WRITE) 1669b395e8b5SAlex Elder goto out_end_request; 1670b395e8b5SAlex Elder 1671b395e8b5SAlex Elder /* Grab a reference to the snapshot context */ 1672b395e8b5SAlex Elder 1673b395e8b5SAlex Elder down_read(&rbd_dev->header_rwsem); 1674d78b650aSAlex Elder if (atomic_read(&rbd_dev->exists)) { 1675b395e8b5SAlex Elder snapc = ceph_get_snap_context(rbd_dev->header.snapc); 1676b395e8b5SAlex Elder rbd_assert(snapc != NULL); 1677b395e8b5SAlex Elder } 1678d1d25646SJosh Durgin up_read(&rbd_dev->header_rwsem); 1679b395e8b5SAlex Elder 1680b395e8b5SAlex Elder if (!snapc) { 1681b395e8b5SAlex Elder rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP); 1682e88a36ecSJosh Durgin dout("request for non-existent 
snapshot"); 1683b395e8b5SAlex Elder result = -ENXIO; 1684b395e8b5SAlex Elder goto out_end_request; 1685e88a36ecSJosh Durgin } 1686d1d25646SJosh Durgin 1687f7760dadSAlex Elder size = blk_rq_bytes(rq); 1688b395e8b5SAlex Elder result = rbd_dev_do_request(rq, rbd_dev, snapc, 1689b395e8b5SAlex Elder blk_rq_pos(rq) * SECTOR_SIZE, 1690b395e8b5SAlex Elder size, rq->bio); 1691b395e8b5SAlex Elder out_end_request: 1692df111be6SAlex Elder ceph_put_snap_context(snapc); 16931fec7093SYehuda Sadeh spin_lock_irq(q->queue_lock); 16948295cda7SAlex Elder if (!size || result < 0) 16958295cda7SAlex Elder __blk_end_request_all(rq, result); 1696602adf40SYehuda Sadeh } 1697602adf40SYehuda Sadeh } 1698602adf40SYehuda Sadeh 1699602adf40SYehuda Sadeh /* 1700602adf40SYehuda Sadeh * a queue callback. Makes sure that we don't create a bio that spans across 1701602adf40SYehuda Sadeh * multiple osd objects. One exception would be with a single page bios, 1702f7760dadSAlex Elder * which we handle later at bio_chain_clone_range() 1703602adf40SYehuda Sadeh */ 1704602adf40SYehuda Sadeh static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd, 1705602adf40SYehuda Sadeh struct bio_vec *bvec) 1706602adf40SYehuda Sadeh { 1707602adf40SYehuda Sadeh struct rbd_device *rbd_dev = q->queuedata; 1708e5cfeed2SAlex Elder sector_t sector_offset; 1709e5cfeed2SAlex Elder sector_t sectors_per_obj; 1710e5cfeed2SAlex Elder sector_t obj_sector_offset; 1711e5cfeed2SAlex Elder int ret; 1712602adf40SYehuda Sadeh 1713e5cfeed2SAlex Elder /* 1714e5cfeed2SAlex Elder * Find how far into its rbd object the partition-relative 1715e5cfeed2SAlex Elder * bio start sector is to offset relative to the enclosing 1716e5cfeed2SAlex Elder * device. 
1717e5cfeed2SAlex Elder */ 1718e5cfeed2SAlex Elder sector_offset = get_start_sect(bmd->bi_bdev) + bmd->bi_sector; 1719e5cfeed2SAlex Elder sectors_per_obj = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT); 1720e5cfeed2SAlex Elder obj_sector_offset = sector_offset & (sectors_per_obj - 1); 1721593a9e7bSAlex Elder 1722e5cfeed2SAlex Elder /* 1723e5cfeed2SAlex Elder * Compute the number of bytes from that offset to the end 1724e5cfeed2SAlex Elder * of the object. Account for what's already used by the bio. 1725e5cfeed2SAlex Elder */ 1726e5cfeed2SAlex Elder ret = (int) (sectors_per_obj - obj_sector_offset) << SECTOR_SHIFT; 1727e5cfeed2SAlex Elder if (ret > bmd->bi_size) 1728e5cfeed2SAlex Elder ret -= bmd->bi_size; 1729e5cfeed2SAlex Elder else 1730e5cfeed2SAlex Elder ret = 0; 1731e5cfeed2SAlex Elder 1732e5cfeed2SAlex Elder /* 1733e5cfeed2SAlex Elder * Don't send back more than was asked for. And if the bio 1734e5cfeed2SAlex Elder * was empty, let the whole thing through because: "Note 1735e5cfeed2SAlex Elder * that a block device *must* allow a single page to be 1736e5cfeed2SAlex Elder * added to an empty bio." 
1737e5cfeed2SAlex Elder */ 1738e5cfeed2SAlex Elder rbd_assert(bvec->bv_len <= PAGE_SIZE); 1739e5cfeed2SAlex Elder if (ret > (int) bvec->bv_len || !bmd->bi_size) 1740e5cfeed2SAlex Elder ret = (int) bvec->bv_len; 1741e5cfeed2SAlex Elder 1742e5cfeed2SAlex Elder return ret; 1743602adf40SYehuda Sadeh } 1744602adf40SYehuda Sadeh 1745602adf40SYehuda Sadeh static void rbd_free_disk(struct rbd_device *rbd_dev) 1746602adf40SYehuda Sadeh { 1747602adf40SYehuda Sadeh struct gendisk *disk = rbd_dev->disk; 1748602adf40SYehuda Sadeh 1749602adf40SYehuda Sadeh if (!disk) 1750602adf40SYehuda Sadeh return; 1751602adf40SYehuda Sadeh 1752602adf40SYehuda Sadeh if (disk->flags & GENHD_FL_UP) 1753602adf40SYehuda Sadeh del_gendisk(disk); 1754602adf40SYehuda Sadeh if (disk->queue) 1755602adf40SYehuda Sadeh blk_cleanup_queue(disk->queue); 1756602adf40SYehuda Sadeh put_disk(disk); 1757602adf40SYehuda Sadeh } 1758602adf40SYehuda Sadeh 1759602adf40SYehuda Sadeh /* 17604156d998SAlex Elder * Read the complete header for the given rbd device. 17614156d998SAlex Elder * 17624156d998SAlex Elder * Returns a pointer to a dynamically-allocated buffer containing 17634156d998SAlex Elder * the complete and validated header. Caller can pass the address 17644156d998SAlex Elder * of a variable that will be filled in with the version of the 17654156d998SAlex Elder * header object at the time it was read. 17664156d998SAlex Elder * 17674156d998SAlex Elder * Returns a pointer-coded errno if a failure occurs. 
17684156d998SAlex Elder */ 17694156d998SAlex Elder static struct rbd_image_header_ondisk * 17704156d998SAlex Elder rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version) 17714156d998SAlex Elder { 17724156d998SAlex Elder struct rbd_image_header_ondisk *ondisk = NULL; 17734156d998SAlex Elder u32 snap_count = 0; 17744156d998SAlex Elder u64 names_size = 0; 17754156d998SAlex Elder u32 want_count; 17764156d998SAlex Elder int ret; 17774156d998SAlex Elder 17784156d998SAlex Elder /* 17794156d998SAlex Elder * The complete header will include an array of its 64-bit 17804156d998SAlex Elder * snapshot ids, followed by the names of those snapshots as 17814156d998SAlex Elder * a contiguous block of NUL-terminated strings. Note that 17824156d998SAlex Elder * the number of snapshots could change by the time we read 17834156d998SAlex Elder * it in, in which case we re-read it. 17844156d998SAlex Elder */ 17854156d998SAlex Elder do { 17864156d998SAlex Elder size_t size; 17874156d998SAlex Elder 17884156d998SAlex Elder kfree(ondisk); 17894156d998SAlex Elder 17904156d998SAlex Elder size = sizeof (*ondisk); 17914156d998SAlex Elder size += snap_count * sizeof (struct rbd_image_snap_ondisk); 17924156d998SAlex Elder size += names_size; 17934156d998SAlex Elder ondisk = kmalloc(size, GFP_KERNEL); 17944156d998SAlex Elder if (!ondisk) 17954156d998SAlex Elder return ERR_PTR(-ENOMEM); 17964156d998SAlex Elder 17974156d998SAlex Elder ret = rbd_req_sync_read(rbd_dev, CEPH_NOSNAP, 17984156d998SAlex Elder rbd_dev->header_name, 17994156d998SAlex Elder 0, size, 18004156d998SAlex Elder (char *) ondisk, version); 18014156d998SAlex Elder 18024156d998SAlex Elder if (ret < 0) 18034156d998SAlex Elder goto out_err; 18044156d998SAlex Elder if (WARN_ON((size_t) ret < size)) { 18054156d998SAlex Elder ret = -ENXIO; 180606ecc6cbSAlex Elder rbd_warn(rbd_dev, "short header read (want %zd got %d)", 180706ecc6cbSAlex Elder size, ret); 18084156d998SAlex Elder goto out_err; 18094156d998SAlex Elder } 
18104156d998SAlex Elder if (!rbd_dev_ondisk_valid(ondisk)) { 18114156d998SAlex Elder ret = -ENXIO; 181206ecc6cbSAlex Elder rbd_warn(rbd_dev, "invalid header"); 18134156d998SAlex Elder goto out_err; 18144156d998SAlex Elder } 18154156d998SAlex Elder 18164156d998SAlex Elder names_size = le64_to_cpu(ondisk->snap_names_len); 18174156d998SAlex Elder want_count = snap_count; 18184156d998SAlex Elder snap_count = le32_to_cpu(ondisk->snap_count); 18194156d998SAlex Elder } while (snap_count != want_count); 18204156d998SAlex Elder 18214156d998SAlex Elder return ondisk; 18224156d998SAlex Elder 18234156d998SAlex Elder out_err: 18244156d998SAlex Elder kfree(ondisk); 18254156d998SAlex Elder 18264156d998SAlex Elder return ERR_PTR(ret); 18274156d998SAlex Elder } 18284156d998SAlex Elder 18294156d998SAlex Elder /* 1830602adf40SYehuda Sadeh * reload the ondisk the header 1831602adf40SYehuda Sadeh */ 1832602adf40SYehuda Sadeh static int rbd_read_header(struct rbd_device *rbd_dev, 1833602adf40SYehuda Sadeh struct rbd_image_header *header) 1834602adf40SYehuda Sadeh { 18354156d998SAlex Elder struct rbd_image_header_ondisk *ondisk; 18364156d998SAlex Elder u64 ver = 0; 18374156d998SAlex Elder int ret; 1838602adf40SYehuda Sadeh 18394156d998SAlex Elder ondisk = rbd_dev_v1_header_read(rbd_dev, &ver); 18404156d998SAlex Elder if (IS_ERR(ondisk)) 18414156d998SAlex Elder return PTR_ERR(ondisk); 18424156d998SAlex Elder ret = rbd_header_from_disk(header, ondisk); 18434156d998SAlex Elder if (ret >= 0) 184459c2be1eSYehuda Sadeh header->obj_version = ver; 18454156d998SAlex Elder kfree(ondisk); 1846602adf40SYehuda Sadeh 18474156d998SAlex Elder return ret; 1848602adf40SYehuda Sadeh } 1849602adf40SYehuda Sadeh 185041f38c2bSAlex Elder static void rbd_remove_all_snaps(struct rbd_device *rbd_dev) 1851dfc5606dSYehuda Sadeh { 1852dfc5606dSYehuda Sadeh struct rbd_snap *snap; 1853a0593290SAlex Elder struct rbd_snap *next; 1854dfc5606dSYehuda Sadeh 1855a0593290SAlex Elder list_for_each_entry_safe(snap, next, 
&rbd_dev->snaps, node) 185641f38c2bSAlex Elder rbd_remove_snap_dev(snap); 1857dfc5606dSYehuda Sadeh } 1858dfc5606dSYehuda Sadeh 18599478554aSAlex Elder static void rbd_update_mapping_size(struct rbd_device *rbd_dev) 18609478554aSAlex Elder { 18619478554aSAlex Elder sector_t size; 18629478554aSAlex Elder 18630d7dbfceSAlex Elder if (rbd_dev->spec->snap_id != CEPH_NOSNAP) 18649478554aSAlex Elder return; 18659478554aSAlex Elder 18669478554aSAlex Elder size = (sector_t) rbd_dev->header.image_size / SECTOR_SIZE; 18679478554aSAlex Elder dout("setting size to %llu sectors", (unsigned long long) size); 18689478554aSAlex Elder rbd_dev->mapping.size = (u64) size; 18699478554aSAlex Elder set_capacity(rbd_dev->disk, size); 18709478554aSAlex Elder } 18719478554aSAlex Elder 1872602adf40SYehuda Sadeh /* 1873602adf40SYehuda Sadeh * only read the first part of the ondisk header, without the snaps info 1874602adf40SYehuda Sadeh */ 1875117973fbSAlex Elder static int rbd_dev_v1_refresh(struct rbd_device *rbd_dev, u64 *hver) 1876602adf40SYehuda Sadeh { 1877602adf40SYehuda Sadeh int ret; 1878602adf40SYehuda Sadeh struct rbd_image_header h; 1879602adf40SYehuda Sadeh 1880602adf40SYehuda Sadeh ret = rbd_read_header(rbd_dev, &h); 1881602adf40SYehuda Sadeh if (ret < 0) 1882602adf40SYehuda Sadeh return ret; 1883602adf40SYehuda Sadeh 1884a51aa0c0SJosh Durgin down_write(&rbd_dev->header_rwsem); 1885a51aa0c0SJosh Durgin 18869478554aSAlex Elder /* Update image size, and check for resize of mapped image */ 18879478554aSAlex Elder rbd_dev->header.image_size = h.image_size; 18889478554aSAlex Elder rbd_update_mapping_size(rbd_dev); 18899db4b3e3SSage Weil 1890849b4260SAlex Elder /* rbd_dev->header.object_prefix shouldn't change */ 1891602adf40SYehuda Sadeh kfree(rbd_dev->header.snap_sizes); 1892849b4260SAlex Elder kfree(rbd_dev->header.snap_names); 1893d1d25646SJosh Durgin /* osd requests may still refer to snapc */ 1894d1d25646SJosh Durgin ceph_put_snap_context(rbd_dev->header.snapc); 
1895602adf40SYehuda Sadeh 1896b813623aSAlex Elder if (hver) 1897b813623aSAlex Elder *hver = h.obj_version; 1898a71b891bSJosh Durgin rbd_dev->header.obj_version = h.obj_version; 189993a24e08SJosh Durgin rbd_dev->header.image_size = h.image_size; 1900602adf40SYehuda Sadeh rbd_dev->header.snapc = h.snapc; 1901602adf40SYehuda Sadeh rbd_dev->header.snap_names = h.snap_names; 1902602adf40SYehuda Sadeh rbd_dev->header.snap_sizes = h.snap_sizes; 1903849b4260SAlex Elder /* Free the extra copy of the object prefix */ 1904849b4260SAlex Elder WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix)); 1905849b4260SAlex Elder kfree(h.object_prefix); 1906849b4260SAlex Elder 1907304f6808SAlex Elder ret = rbd_dev_snaps_update(rbd_dev); 1908304f6808SAlex Elder if (!ret) 1909304f6808SAlex Elder ret = rbd_dev_snaps_register(rbd_dev); 1910dfc5606dSYehuda Sadeh 1911c666601aSJosh Durgin up_write(&rbd_dev->header_rwsem); 1912602adf40SYehuda Sadeh 1913dfc5606dSYehuda Sadeh return ret; 1914602adf40SYehuda Sadeh } 1915602adf40SYehuda Sadeh 1916117973fbSAlex Elder static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver) 19171fe5e993SAlex Elder { 19181fe5e993SAlex Elder int ret; 19191fe5e993SAlex Elder 1920117973fbSAlex Elder rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); 19211fe5e993SAlex Elder mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); 1922117973fbSAlex Elder if (rbd_dev->image_format == 1) 1923117973fbSAlex Elder ret = rbd_dev_v1_refresh(rbd_dev, hver); 1924117973fbSAlex Elder else 1925117973fbSAlex Elder ret = rbd_dev_v2_refresh(rbd_dev, hver); 19261fe5e993SAlex Elder mutex_unlock(&ctl_mutex); 19271fe5e993SAlex Elder 19281fe5e993SAlex Elder return ret; 19291fe5e993SAlex Elder } 19301fe5e993SAlex Elder 1931602adf40SYehuda Sadeh static int rbd_init_disk(struct rbd_device *rbd_dev) 1932602adf40SYehuda Sadeh { 1933602adf40SYehuda Sadeh struct gendisk *disk; 1934602adf40SYehuda Sadeh struct request_queue *q; 1935593a9e7bSAlex Elder u64 segment_size; 
1936602adf40SYehuda Sadeh 1937602adf40SYehuda Sadeh /* create gendisk info */ 1938602adf40SYehuda Sadeh disk = alloc_disk(RBD_MINORS_PER_MAJOR); 1939602adf40SYehuda Sadeh if (!disk) 19401fcdb8aaSAlex Elder return -ENOMEM; 1941602adf40SYehuda Sadeh 1942f0f8cef5SAlex Elder snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d", 1943de71a297SAlex Elder rbd_dev->dev_id); 1944602adf40SYehuda Sadeh disk->major = rbd_dev->major; 1945602adf40SYehuda Sadeh disk->first_minor = 0; 1946602adf40SYehuda Sadeh disk->fops = &rbd_bd_ops; 1947602adf40SYehuda Sadeh disk->private_data = rbd_dev; 1948602adf40SYehuda Sadeh 1949602adf40SYehuda Sadeh /* init rq */ 1950602adf40SYehuda Sadeh q = blk_init_queue(rbd_rq_fn, &rbd_dev->lock); 1951602adf40SYehuda Sadeh if (!q) 1952602adf40SYehuda Sadeh goto out_disk; 1953029bcbd8SJosh Durgin 1954593a9e7bSAlex Elder /* We use the default size, but let's be explicit about it. */ 1955593a9e7bSAlex Elder blk_queue_physical_block_size(q, SECTOR_SIZE); 1956593a9e7bSAlex Elder 1957029bcbd8SJosh Durgin /* set io sizes to object size */ 1958593a9e7bSAlex Elder segment_size = rbd_obj_bytes(&rbd_dev->header); 1959593a9e7bSAlex Elder blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE); 1960593a9e7bSAlex Elder blk_queue_max_segment_size(q, segment_size); 1961593a9e7bSAlex Elder blk_queue_io_min(q, segment_size); 1962593a9e7bSAlex Elder blk_queue_io_opt(q, segment_size); 1963029bcbd8SJosh Durgin 1964602adf40SYehuda Sadeh blk_queue_merge_bvec(q, rbd_merge_bvec); 1965602adf40SYehuda Sadeh disk->queue = q; 1966602adf40SYehuda Sadeh 1967602adf40SYehuda Sadeh q->queuedata = rbd_dev; 1968602adf40SYehuda Sadeh 1969602adf40SYehuda Sadeh rbd_dev->disk = disk; 1970602adf40SYehuda Sadeh 197112f02944SAlex Elder set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE); 197212f02944SAlex Elder 1973602adf40SYehuda Sadeh return 0; 1974602adf40SYehuda Sadeh out_disk: 1975602adf40SYehuda Sadeh put_disk(disk); 19761fcdb8aaSAlex Elder 19771fcdb8aaSAlex 
Elder return -ENOMEM; 1978602adf40SYehuda Sadeh } 1979602adf40SYehuda Sadeh 1980dfc5606dSYehuda Sadeh /* 1981dfc5606dSYehuda Sadeh sysfs 1982dfc5606dSYehuda Sadeh */ 1983602adf40SYehuda Sadeh 1984593a9e7bSAlex Elder static struct rbd_device *dev_to_rbd_dev(struct device *dev) 1985593a9e7bSAlex Elder { 1986593a9e7bSAlex Elder return container_of(dev, struct rbd_device, dev); 1987593a9e7bSAlex Elder } 1988593a9e7bSAlex Elder 1989dfc5606dSYehuda Sadeh static ssize_t rbd_size_show(struct device *dev, 1990dfc5606dSYehuda Sadeh struct device_attribute *attr, char *buf) 1991602adf40SYehuda Sadeh { 1992593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 1993a51aa0c0SJosh Durgin sector_t size; 1994dfc5606dSYehuda Sadeh 1995a51aa0c0SJosh Durgin down_read(&rbd_dev->header_rwsem); 1996a51aa0c0SJosh Durgin size = get_capacity(rbd_dev->disk); 1997a51aa0c0SJosh Durgin up_read(&rbd_dev->header_rwsem); 1998a51aa0c0SJosh Durgin 1999a51aa0c0SJosh Durgin return sprintf(buf, "%llu\n", (unsigned long long) size * SECTOR_SIZE); 2000602adf40SYehuda Sadeh } 2001602adf40SYehuda Sadeh 200234b13184SAlex Elder /* 200334b13184SAlex Elder * Note this shows the features for whatever's mapped, which is not 200434b13184SAlex Elder * necessarily the base image. 
200534b13184SAlex Elder */ 200634b13184SAlex Elder static ssize_t rbd_features_show(struct device *dev, 200734b13184SAlex Elder struct device_attribute *attr, char *buf) 200834b13184SAlex Elder { 200934b13184SAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 201034b13184SAlex Elder 201134b13184SAlex Elder return sprintf(buf, "0x%016llx\n", 201234b13184SAlex Elder (unsigned long long) rbd_dev->mapping.features); 201334b13184SAlex Elder } 201434b13184SAlex Elder 2015dfc5606dSYehuda Sadeh static ssize_t rbd_major_show(struct device *dev, 2016dfc5606dSYehuda Sadeh struct device_attribute *attr, char *buf) 2017602adf40SYehuda Sadeh { 2018593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 2019dfc5606dSYehuda Sadeh 2020dfc5606dSYehuda Sadeh return sprintf(buf, "%d\n", rbd_dev->major); 2021dfc5606dSYehuda Sadeh } 2022dfc5606dSYehuda Sadeh 2023dfc5606dSYehuda Sadeh static ssize_t rbd_client_id_show(struct device *dev, 2024dfc5606dSYehuda Sadeh struct device_attribute *attr, char *buf) 2025dfc5606dSYehuda Sadeh { 2026593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 2027dfc5606dSYehuda Sadeh 20281dbb4399SAlex Elder return sprintf(buf, "client%lld\n", 20291dbb4399SAlex Elder ceph_client_id(rbd_dev->rbd_client->client)); 2030dfc5606dSYehuda Sadeh } 2031dfc5606dSYehuda Sadeh 2032dfc5606dSYehuda Sadeh static ssize_t rbd_pool_show(struct device *dev, 2033dfc5606dSYehuda Sadeh struct device_attribute *attr, char *buf) 2034dfc5606dSYehuda Sadeh { 2035593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 2036dfc5606dSYehuda Sadeh 20370d7dbfceSAlex Elder return sprintf(buf, "%s\n", rbd_dev->spec->pool_name); 2038dfc5606dSYehuda Sadeh } 2039dfc5606dSYehuda Sadeh 20409bb2f334SAlex Elder static ssize_t rbd_pool_id_show(struct device *dev, 20419bb2f334SAlex Elder struct device_attribute *attr, char *buf) 20429bb2f334SAlex Elder { 20439bb2f334SAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 20449bb2f334SAlex 
Elder 20450d7dbfceSAlex Elder return sprintf(buf, "%llu\n", 20460d7dbfceSAlex Elder (unsigned long long) rbd_dev->spec->pool_id); 20479bb2f334SAlex Elder } 20489bb2f334SAlex Elder 2049dfc5606dSYehuda Sadeh static ssize_t rbd_name_show(struct device *dev, 2050dfc5606dSYehuda Sadeh struct device_attribute *attr, char *buf) 2051dfc5606dSYehuda Sadeh { 2052593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 2053dfc5606dSYehuda Sadeh 2054a92ffdf8SAlex Elder if (rbd_dev->spec->image_name) 20550d7dbfceSAlex Elder return sprintf(buf, "%s\n", rbd_dev->spec->image_name); 2056a92ffdf8SAlex Elder 2057a92ffdf8SAlex Elder return sprintf(buf, "(unknown)\n"); 2058dfc5606dSYehuda Sadeh } 2059dfc5606dSYehuda Sadeh 2060589d30e0SAlex Elder static ssize_t rbd_image_id_show(struct device *dev, 2061589d30e0SAlex Elder struct device_attribute *attr, char *buf) 2062589d30e0SAlex Elder { 2063589d30e0SAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 2064589d30e0SAlex Elder 20650d7dbfceSAlex Elder return sprintf(buf, "%s\n", rbd_dev->spec->image_id); 2066589d30e0SAlex Elder } 2067589d30e0SAlex Elder 206834b13184SAlex Elder /* 206934b13184SAlex Elder * Shows the name of the currently-mapped snapshot (or 207034b13184SAlex Elder * RBD_SNAP_HEAD_NAME for the base image). 207134b13184SAlex Elder */ 2072dfc5606dSYehuda Sadeh static ssize_t rbd_snap_show(struct device *dev, 2073dfc5606dSYehuda Sadeh struct device_attribute *attr, 2074dfc5606dSYehuda Sadeh char *buf) 2075dfc5606dSYehuda Sadeh { 2076593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 2077dfc5606dSYehuda Sadeh 20780d7dbfceSAlex Elder return sprintf(buf, "%s\n", rbd_dev->spec->snap_name); 2079dfc5606dSYehuda Sadeh } 2080dfc5606dSYehuda Sadeh 208186b00e0dSAlex Elder /* 208286b00e0dSAlex Elder * For an rbd v2 image, shows the pool id, image id, and snapshot id 208386b00e0dSAlex Elder * for the parent image. If there is no parent, simply shows 208486b00e0dSAlex Elder * "(no parent image)". 
208586b00e0dSAlex Elder */ 208686b00e0dSAlex Elder static ssize_t rbd_parent_show(struct device *dev, 208786b00e0dSAlex Elder struct device_attribute *attr, 208886b00e0dSAlex Elder char *buf) 208986b00e0dSAlex Elder { 209086b00e0dSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 209186b00e0dSAlex Elder struct rbd_spec *spec = rbd_dev->parent_spec; 209286b00e0dSAlex Elder int count; 209386b00e0dSAlex Elder char *bufp = buf; 209486b00e0dSAlex Elder 209586b00e0dSAlex Elder if (!spec) 209686b00e0dSAlex Elder return sprintf(buf, "(no parent image)\n"); 209786b00e0dSAlex Elder 209886b00e0dSAlex Elder count = sprintf(bufp, "pool_id %llu\npool_name %s\n", 209986b00e0dSAlex Elder (unsigned long long) spec->pool_id, spec->pool_name); 210086b00e0dSAlex Elder if (count < 0) 210186b00e0dSAlex Elder return count; 210286b00e0dSAlex Elder bufp += count; 210386b00e0dSAlex Elder 210486b00e0dSAlex Elder count = sprintf(bufp, "image_id %s\nimage_name %s\n", spec->image_id, 210586b00e0dSAlex Elder spec->image_name ? 
spec->image_name : "(unknown)"); 210686b00e0dSAlex Elder if (count < 0) 210786b00e0dSAlex Elder return count; 210886b00e0dSAlex Elder bufp += count; 210986b00e0dSAlex Elder 211086b00e0dSAlex Elder count = sprintf(bufp, "snap_id %llu\nsnap_name %s\n", 211186b00e0dSAlex Elder (unsigned long long) spec->snap_id, spec->snap_name); 211286b00e0dSAlex Elder if (count < 0) 211386b00e0dSAlex Elder return count; 211486b00e0dSAlex Elder bufp += count; 211586b00e0dSAlex Elder 211686b00e0dSAlex Elder count = sprintf(bufp, "overlap %llu\n", rbd_dev->parent_overlap); 211786b00e0dSAlex Elder if (count < 0) 211886b00e0dSAlex Elder return count; 211986b00e0dSAlex Elder bufp += count; 212086b00e0dSAlex Elder 212186b00e0dSAlex Elder return (ssize_t) (bufp - buf); 212286b00e0dSAlex Elder } 212386b00e0dSAlex Elder 2124dfc5606dSYehuda Sadeh static ssize_t rbd_image_refresh(struct device *dev, 2125dfc5606dSYehuda Sadeh struct device_attribute *attr, 2126dfc5606dSYehuda Sadeh const char *buf, 2127dfc5606dSYehuda Sadeh size_t size) 2128dfc5606dSYehuda Sadeh { 2129593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 2130b813623aSAlex Elder int ret; 2131602adf40SYehuda Sadeh 2132117973fbSAlex Elder ret = rbd_dev_refresh(rbd_dev, NULL); 2133b813623aSAlex Elder 2134b813623aSAlex Elder return ret < 0 ? 
ret : size; 2135dfc5606dSYehuda Sadeh } 2136602adf40SYehuda Sadeh 2137dfc5606dSYehuda Sadeh static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL); 213834b13184SAlex Elder static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL); 2139dfc5606dSYehuda Sadeh static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL); 2140dfc5606dSYehuda Sadeh static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL); 2141dfc5606dSYehuda Sadeh static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL); 21429bb2f334SAlex Elder static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL); 2143dfc5606dSYehuda Sadeh static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL); 2144589d30e0SAlex Elder static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL); 2145dfc5606dSYehuda Sadeh static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh); 2146dfc5606dSYehuda Sadeh static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL); 214786b00e0dSAlex Elder static DEVICE_ATTR(parent, S_IRUGO, rbd_parent_show, NULL); 2148dfc5606dSYehuda Sadeh 2149dfc5606dSYehuda Sadeh static struct attribute *rbd_attrs[] = { 2150dfc5606dSYehuda Sadeh &dev_attr_size.attr, 215134b13184SAlex Elder &dev_attr_features.attr, 2152dfc5606dSYehuda Sadeh &dev_attr_major.attr, 2153dfc5606dSYehuda Sadeh &dev_attr_client_id.attr, 2154dfc5606dSYehuda Sadeh &dev_attr_pool.attr, 21559bb2f334SAlex Elder &dev_attr_pool_id.attr, 2156dfc5606dSYehuda Sadeh &dev_attr_name.attr, 2157589d30e0SAlex Elder &dev_attr_image_id.attr, 2158dfc5606dSYehuda Sadeh &dev_attr_current_snap.attr, 215986b00e0dSAlex Elder &dev_attr_parent.attr, 2160dfc5606dSYehuda Sadeh &dev_attr_refresh.attr, 2161dfc5606dSYehuda Sadeh NULL 2162dfc5606dSYehuda Sadeh }; 2163dfc5606dSYehuda Sadeh 2164dfc5606dSYehuda Sadeh static struct attribute_group rbd_attr_group = { 2165dfc5606dSYehuda Sadeh .attrs = rbd_attrs, 2166dfc5606dSYehuda Sadeh }; 2167dfc5606dSYehuda Sadeh 2168dfc5606dSYehuda Sadeh static const struct attribute_group *rbd_attr_groups[] = { 
2169dfc5606dSYehuda Sadeh &rbd_attr_group, 2170dfc5606dSYehuda Sadeh NULL 2171dfc5606dSYehuda Sadeh }; 2172dfc5606dSYehuda Sadeh 2173dfc5606dSYehuda Sadeh static void rbd_sysfs_dev_release(struct device *dev) 2174dfc5606dSYehuda Sadeh { 2175dfc5606dSYehuda Sadeh } 2176dfc5606dSYehuda Sadeh 2177dfc5606dSYehuda Sadeh static struct device_type rbd_device_type = { 2178dfc5606dSYehuda Sadeh .name = "rbd", 2179dfc5606dSYehuda Sadeh .groups = rbd_attr_groups, 2180dfc5606dSYehuda Sadeh .release = rbd_sysfs_dev_release, 2181dfc5606dSYehuda Sadeh }; 2182dfc5606dSYehuda Sadeh 2183dfc5606dSYehuda Sadeh 2184dfc5606dSYehuda Sadeh /* 2185dfc5606dSYehuda Sadeh sysfs - snapshots 2186dfc5606dSYehuda Sadeh */ 2187dfc5606dSYehuda Sadeh 2188dfc5606dSYehuda Sadeh static ssize_t rbd_snap_size_show(struct device *dev, 2189dfc5606dSYehuda Sadeh struct device_attribute *attr, 2190dfc5606dSYehuda Sadeh char *buf) 2191dfc5606dSYehuda Sadeh { 2192dfc5606dSYehuda Sadeh struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev); 2193dfc5606dSYehuda Sadeh 21943591538fSJosh Durgin return sprintf(buf, "%llu\n", (unsigned long long)snap->size); 2195dfc5606dSYehuda Sadeh } 2196dfc5606dSYehuda Sadeh 2197dfc5606dSYehuda Sadeh static ssize_t rbd_snap_id_show(struct device *dev, 2198dfc5606dSYehuda Sadeh struct device_attribute *attr, 2199dfc5606dSYehuda Sadeh char *buf) 2200dfc5606dSYehuda Sadeh { 2201dfc5606dSYehuda Sadeh struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev); 2202dfc5606dSYehuda Sadeh 2203593a9e7bSAlex Elder return sprintf(buf, "%llu\n", (unsigned long long)snap->id); 2204dfc5606dSYehuda Sadeh } 2205dfc5606dSYehuda Sadeh 220634b13184SAlex Elder static ssize_t rbd_snap_features_show(struct device *dev, 220734b13184SAlex Elder struct device_attribute *attr, 220834b13184SAlex Elder char *buf) 220934b13184SAlex Elder { 221034b13184SAlex Elder struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev); 221134b13184SAlex Elder 221234b13184SAlex Elder return sprintf(buf, 
"0x%016llx\n", 221334b13184SAlex Elder (unsigned long long) snap->features); 221434b13184SAlex Elder } 221534b13184SAlex Elder 2216dfc5606dSYehuda Sadeh static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL); 2217dfc5606dSYehuda Sadeh static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL); 221834b13184SAlex Elder static DEVICE_ATTR(snap_features, S_IRUGO, rbd_snap_features_show, NULL); 2219dfc5606dSYehuda Sadeh 2220dfc5606dSYehuda Sadeh static struct attribute *rbd_snap_attrs[] = { 2221dfc5606dSYehuda Sadeh &dev_attr_snap_size.attr, 2222dfc5606dSYehuda Sadeh &dev_attr_snap_id.attr, 222334b13184SAlex Elder &dev_attr_snap_features.attr, 2224dfc5606dSYehuda Sadeh NULL, 2225dfc5606dSYehuda Sadeh }; 2226dfc5606dSYehuda Sadeh 2227dfc5606dSYehuda Sadeh static struct attribute_group rbd_snap_attr_group = { 2228dfc5606dSYehuda Sadeh .attrs = rbd_snap_attrs, 2229dfc5606dSYehuda Sadeh }; 2230dfc5606dSYehuda Sadeh 2231dfc5606dSYehuda Sadeh static void rbd_snap_dev_release(struct device *dev) 2232dfc5606dSYehuda Sadeh { 2233dfc5606dSYehuda Sadeh struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev); 2234dfc5606dSYehuda Sadeh kfree(snap->name); 2235dfc5606dSYehuda Sadeh kfree(snap); 2236dfc5606dSYehuda Sadeh } 2237dfc5606dSYehuda Sadeh 2238dfc5606dSYehuda Sadeh static const struct attribute_group *rbd_snap_attr_groups[] = { 2239dfc5606dSYehuda Sadeh &rbd_snap_attr_group, 2240dfc5606dSYehuda Sadeh NULL 2241dfc5606dSYehuda Sadeh }; 2242dfc5606dSYehuda Sadeh 2243dfc5606dSYehuda Sadeh static struct device_type rbd_snap_device_type = { 2244dfc5606dSYehuda Sadeh .groups = rbd_snap_attr_groups, 2245dfc5606dSYehuda Sadeh .release = rbd_snap_dev_release, 2246dfc5606dSYehuda Sadeh }; 2247dfc5606dSYehuda Sadeh 22488b8fb99cSAlex Elder static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec) 22498b8fb99cSAlex Elder { 22508b8fb99cSAlex Elder kref_get(&spec->kref); 22518b8fb99cSAlex Elder 22528b8fb99cSAlex Elder return spec; 22538b8fb99cSAlex Elder } 
22548b8fb99cSAlex Elder 22558b8fb99cSAlex Elder static void rbd_spec_free(struct kref *kref); 22568b8fb99cSAlex Elder static void rbd_spec_put(struct rbd_spec *spec) 22578b8fb99cSAlex Elder { 22588b8fb99cSAlex Elder if (spec) 22598b8fb99cSAlex Elder kref_put(&spec->kref, rbd_spec_free); 22608b8fb99cSAlex Elder } 22618b8fb99cSAlex Elder 22628b8fb99cSAlex Elder static struct rbd_spec *rbd_spec_alloc(void) 22638b8fb99cSAlex Elder { 22648b8fb99cSAlex Elder struct rbd_spec *spec; 22658b8fb99cSAlex Elder 22668b8fb99cSAlex Elder spec = kzalloc(sizeof (*spec), GFP_KERNEL); 22678b8fb99cSAlex Elder if (!spec) 22688b8fb99cSAlex Elder return NULL; 22698b8fb99cSAlex Elder kref_init(&spec->kref); 22708b8fb99cSAlex Elder 22718b8fb99cSAlex Elder rbd_spec_put(rbd_spec_get(spec)); /* TEMPORARY */ 22728b8fb99cSAlex Elder 22738b8fb99cSAlex Elder return spec; 22748b8fb99cSAlex Elder } 22758b8fb99cSAlex Elder 22768b8fb99cSAlex Elder static void rbd_spec_free(struct kref *kref) 22778b8fb99cSAlex Elder { 22788b8fb99cSAlex Elder struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref); 22798b8fb99cSAlex Elder 22808b8fb99cSAlex Elder kfree(spec->pool_name); 22818b8fb99cSAlex Elder kfree(spec->image_id); 22828b8fb99cSAlex Elder kfree(spec->image_name); 22838b8fb99cSAlex Elder kfree(spec->snap_name); 22848b8fb99cSAlex Elder kfree(spec); 22858b8fb99cSAlex Elder } 22868b8fb99cSAlex Elder 2287c53d5893SAlex Elder struct rbd_device *rbd_dev_create(struct rbd_client *rbdc, 2288c53d5893SAlex Elder struct rbd_spec *spec) 2289c53d5893SAlex Elder { 2290c53d5893SAlex Elder struct rbd_device *rbd_dev; 2291c53d5893SAlex Elder 2292c53d5893SAlex Elder rbd_dev = kzalloc(sizeof (*rbd_dev), GFP_KERNEL); 2293c53d5893SAlex Elder if (!rbd_dev) 2294c53d5893SAlex Elder return NULL; 2295c53d5893SAlex Elder 2296c53d5893SAlex Elder spin_lock_init(&rbd_dev->lock); 2297d78b650aSAlex Elder atomic_set(&rbd_dev->exists, 0); 2298c53d5893SAlex Elder INIT_LIST_HEAD(&rbd_dev->node); 2299c53d5893SAlex Elder 
INIT_LIST_HEAD(&rbd_dev->snaps); 2300c53d5893SAlex Elder init_rwsem(&rbd_dev->header_rwsem); 2301c53d5893SAlex Elder 2302c53d5893SAlex Elder rbd_dev->spec = spec; 2303c53d5893SAlex Elder rbd_dev->rbd_client = rbdc; 2304c53d5893SAlex Elder 2305c53d5893SAlex Elder return rbd_dev; 2306c53d5893SAlex Elder } 2307c53d5893SAlex Elder 2308c53d5893SAlex Elder static void rbd_dev_destroy(struct rbd_device *rbd_dev) 2309c53d5893SAlex Elder { 231086b00e0dSAlex Elder rbd_spec_put(rbd_dev->parent_spec); 2311c53d5893SAlex Elder kfree(rbd_dev->header_name); 2312c53d5893SAlex Elder rbd_put_client(rbd_dev->rbd_client); 2313c53d5893SAlex Elder rbd_spec_put(rbd_dev->spec); 2314c53d5893SAlex Elder kfree(rbd_dev); 2315c53d5893SAlex Elder } 2316c53d5893SAlex Elder 2317304f6808SAlex Elder static bool rbd_snap_registered(struct rbd_snap *snap) 2318304f6808SAlex Elder { 2319304f6808SAlex Elder bool ret = snap->dev.type == &rbd_snap_device_type; 2320304f6808SAlex Elder bool reg = device_is_registered(&snap->dev); 2321304f6808SAlex Elder 2322304f6808SAlex Elder rbd_assert(!ret ^ reg); 2323304f6808SAlex Elder 2324304f6808SAlex Elder return ret; 2325304f6808SAlex Elder } 2326304f6808SAlex Elder 232741f38c2bSAlex Elder static void rbd_remove_snap_dev(struct rbd_snap *snap) 2328dfc5606dSYehuda Sadeh { 2329dfc5606dSYehuda Sadeh list_del(&snap->node); 2330304f6808SAlex Elder if (device_is_registered(&snap->dev)) 2331dfc5606dSYehuda Sadeh device_unregister(&snap->dev); 2332dfc5606dSYehuda Sadeh } 2333dfc5606dSYehuda Sadeh 233414e7085dSAlex Elder static int rbd_register_snap_dev(struct rbd_snap *snap, 2335dfc5606dSYehuda Sadeh struct device *parent) 2336dfc5606dSYehuda Sadeh { 2337dfc5606dSYehuda Sadeh struct device *dev = &snap->dev; 2338dfc5606dSYehuda Sadeh int ret; 2339dfc5606dSYehuda Sadeh 2340dfc5606dSYehuda Sadeh dev->type = &rbd_snap_device_type; 2341dfc5606dSYehuda Sadeh dev->parent = parent; 2342dfc5606dSYehuda Sadeh dev->release = rbd_snap_dev_release; 2343d4b125e9SAlex Elder 
dev_set_name(dev, "%s%s", RBD_SNAP_DEV_NAME_PREFIX, snap->name); 2344304f6808SAlex Elder dout("%s: registering device for snapshot %s\n", __func__, snap->name); 2345304f6808SAlex Elder 2346dfc5606dSYehuda Sadeh ret = device_register(dev); 2347dfc5606dSYehuda Sadeh 2348dfc5606dSYehuda Sadeh return ret; 2349dfc5606dSYehuda Sadeh } 2350dfc5606dSYehuda Sadeh 23514e891e0aSAlex Elder static struct rbd_snap *__rbd_add_snap_dev(struct rbd_device *rbd_dev, 2352c8d18425SAlex Elder const char *snap_name, 235334b13184SAlex Elder u64 snap_id, u64 snap_size, 235434b13184SAlex Elder u64 snap_features) 2355dfc5606dSYehuda Sadeh { 23564e891e0aSAlex Elder struct rbd_snap *snap; 2357dfc5606dSYehuda Sadeh int ret; 23584e891e0aSAlex Elder 23594e891e0aSAlex Elder snap = kzalloc(sizeof (*snap), GFP_KERNEL); 2360dfc5606dSYehuda Sadeh if (!snap) 23614e891e0aSAlex Elder return ERR_PTR(-ENOMEM); 23624e891e0aSAlex Elder 23634e891e0aSAlex Elder ret = -ENOMEM; 2364c8d18425SAlex Elder snap->name = kstrdup(snap_name, GFP_KERNEL); 23654e891e0aSAlex Elder if (!snap->name) 23664e891e0aSAlex Elder goto err; 23674e891e0aSAlex Elder 2368c8d18425SAlex Elder snap->id = snap_id; 2369c8d18425SAlex Elder snap->size = snap_size; 237034b13184SAlex Elder snap->features = snap_features; 23714e891e0aSAlex Elder 23724e891e0aSAlex Elder return snap; 23734e891e0aSAlex Elder 2374dfc5606dSYehuda Sadeh err: 2375dfc5606dSYehuda Sadeh kfree(snap->name); 2376dfc5606dSYehuda Sadeh kfree(snap); 23774e891e0aSAlex Elder 23784e891e0aSAlex Elder return ERR_PTR(ret); 2379dfc5606dSYehuda Sadeh } 2380dfc5606dSYehuda Sadeh 2381cd892126SAlex Elder static char *rbd_dev_v1_snap_info(struct rbd_device *rbd_dev, u32 which, 2382cd892126SAlex Elder u64 *snap_size, u64 *snap_features) 2383cd892126SAlex Elder { 2384cd892126SAlex Elder char *snap_name; 2385cd892126SAlex Elder 2386cd892126SAlex Elder rbd_assert(which < rbd_dev->header.snapc->num_snaps); 2387cd892126SAlex Elder 2388cd892126SAlex Elder *snap_size = 
rbd_dev->header.snap_sizes[which]; 2389cd892126SAlex Elder *snap_features = 0; /* No features for v1 */ 2390cd892126SAlex Elder 2391cd892126SAlex Elder /* Skip over names until we find the one we are looking for */ 2392cd892126SAlex Elder 2393cd892126SAlex Elder snap_name = rbd_dev->header.snap_names; 2394cd892126SAlex Elder while (which--) 2395cd892126SAlex Elder snap_name += strlen(snap_name) + 1; 2396cd892126SAlex Elder 2397cd892126SAlex Elder return snap_name; 2398cd892126SAlex Elder } 2399cd892126SAlex Elder 2400dfc5606dSYehuda Sadeh /* 24019d475de5SAlex Elder * Get the size and object order for an image snapshot, or if 24029d475de5SAlex Elder * snap_id is CEPH_NOSNAP, gets this information for the base 24039d475de5SAlex Elder * image. 24049d475de5SAlex Elder */ 24059d475de5SAlex Elder static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id, 24069d475de5SAlex Elder u8 *order, u64 *snap_size) 24079d475de5SAlex Elder { 24089d475de5SAlex Elder __le64 snapid = cpu_to_le64(snap_id); 24099d475de5SAlex Elder int ret; 24109d475de5SAlex Elder struct { 24119d475de5SAlex Elder u8 order; 24129d475de5SAlex Elder __le64 size; 24139d475de5SAlex Elder } __attribute__ ((packed)) size_buf = { 0 }; 24149d475de5SAlex Elder 24159d475de5SAlex Elder ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name, 24169d475de5SAlex Elder "rbd", "get_size", 24179d475de5SAlex Elder (char *) &snapid, sizeof (snapid), 24189d475de5SAlex Elder (char *) &size_buf, sizeof (size_buf), 24199d475de5SAlex Elder CEPH_OSD_FLAG_READ, NULL); 24209d475de5SAlex Elder dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret); 24219d475de5SAlex Elder if (ret < 0) 24229d475de5SAlex Elder return ret; 24239d475de5SAlex Elder 24249d475de5SAlex Elder *order = size_buf.order; 24259d475de5SAlex Elder *snap_size = le64_to_cpu(size_buf.size); 24269d475de5SAlex Elder 24279d475de5SAlex Elder dout(" snap_id 0x%016llx order = %u, snap_size = %llu\n", 24289d475de5SAlex Elder (unsigned long long) snap_id, 
(unsigned int) *order, 24299d475de5SAlex Elder (unsigned long long) *snap_size); 24309d475de5SAlex Elder 24319d475de5SAlex Elder return 0; 24329d475de5SAlex Elder } 24339d475de5SAlex Elder 24349d475de5SAlex Elder static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev) 24359d475de5SAlex Elder { 24369d475de5SAlex Elder return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP, 24379d475de5SAlex Elder &rbd_dev->header.obj_order, 24389d475de5SAlex Elder &rbd_dev->header.image_size); 24399d475de5SAlex Elder } 24409d475de5SAlex Elder 24411e130199SAlex Elder static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev) 24421e130199SAlex Elder { 24431e130199SAlex Elder void *reply_buf; 24441e130199SAlex Elder int ret; 24451e130199SAlex Elder void *p; 24461e130199SAlex Elder 24471e130199SAlex Elder reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL); 24481e130199SAlex Elder if (!reply_buf) 24491e130199SAlex Elder return -ENOMEM; 24501e130199SAlex Elder 24511e130199SAlex Elder ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name, 24521e130199SAlex Elder "rbd", "get_object_prefix", 24531e130199SAlex Elder NULL, 0, 24541e130199SAlex Elder reply_buf, RBD_OBJ_PREFIX_LEN_MAX, 24551e130199SAlex Elder CEPH_OSD_FLAG_READ, NULL); 24561e130199SAlex Elder dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret); 24571e130199SAlex Elder if (ret < 0) 24581e130199SAlex Elder goto out; 2459a0ea3a40SAlex Elder ret = 0; /* rbd_req_sync_exec() can return positive */ 24601e130199SAlex Elder 24611e130199SAlex Elder p = reply_buf; 24621e130199SAlex Elder rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p, 24631e130199SAlex Elder p + RBD_OBJ_PREFIX_LEN_MAX, 24641e130199SAlex Elder NULL, GFP_NOIO); 24651e130199SAlex Elder 24661e130199SAlex Elder if (IS_ERR(rbd_dev->header.object_prefix)) { 24671e130199SAlex Elder ret = PTR_ERR(rbd_dev->header.object_prefix); 24681e130199SAlex Elder rbd_dev->header.object_prefix = NULL; 24691e130199SAlex Elder } else { 24701e130199SAlex Elder 
dout(" object_prefix = %s\n", rbd_dev->header.object_prefix); 24711e130199SAlex Elder } 24721e130199SAlex Elder 24731e130199SAlex Elder out: 24741e130199SAlex Elder kfree(reply_buf); 24751e130199SAlex Elder 24761e130199SAlex Elder return ret; 24771e130199SAlex Elder } 24781e130199SAlex Elder 2479b1b5402aSAlex Elder static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id, 2480b1b5402aSAlex Elder u64 *snap_features) 2481b1b5402aSAlex Elder { 2482b1b5402aSAlex Elder __le64 snapid = cpu_to_le64(snap_id); 2483b1b5402aSAlex Elder struct { 2484b1b5402aSAlex Elder __le64 features; 2485b1b5402aSAlex Elder __le64 incompat; 2486b1b5402aSAlex Elder } features_buf = { 0 }; 2487d889140cSAlex Elder u64 incompat; 2488b1b5402aSAlex Elder int ret; 2489b1b5402aSAlex Elder 2490b1b5402aSAlex Elder ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name, 2491b1b5402aSAlex Elder "rbd", "get_features", 2492b1b5402aSAlex Elder (char *) &snapid, sizeof (snapid), 2493b1b5402aSAlex Elder (char *) &features_buf, sizeof (features_buf), 2494b1b5402aSAlex Elder CEPH_OSD_FLAG_READ, NULL); 2495b1b5402aSAlex Elder dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret); 2496b1b5402aSAlex Elder if (ret < 0) 2497b1b5402aSAlex Elder return ret; 2498d889140cSAlex Elder 2499d889140cSAlex Elder incompat = le64_to_cpu(features_buf.incompat); 2500d889140cSAlex Elder if (incompat & ~RBD_FEATURES_ALL) 2501b8f5c6edSAlex Elder return -ENXIO; 2502d889140cSAlex Elder 2503b1b5402aSAlex Elder *snap_features = le64_to_cpu(features_buf.features); 2504b1b5402aSAlex Elder 2505b1b5402aSAlex Elder dout(" snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n", 2506b1b5402aSAlex Elder (unsigned long long) snap_id, 2507b1b5402aSAlex Elder (unsigned long long) *snap_features, 2508b1b5402aSAlex Elder (unsigned long long) le64_to_cpu(features_buf.incompat)); 2509b1b5402aSAlex Elder 2510b1b5402aSAlex Elder return 0; 2511b1b5402aSAlex Elder } 2512b1b5402aSAlex Elder 2513b1b5402aSAlex Elder static 
int rbd_dev_v2_features(struct rbd_device *rbd_dev) 2514b1b5402aSAlex Elder { 2515b1b5402aSAlex Elder return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP, 2516b1b5402aSAlex Elder &rbd_dev->header.features); 2517b1b5402aSAlex Elder } 2518b1b5402aSAlex Elder 251986b00e0dSAlex Elder static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev) 252086b00e0dSAlex Elder { 252186b00e0dSAlex Elder struct rbd_spec *parent_spec; 252286b00e0dSAlex Elder size_t size; 252386b00e0dSAlex Elder void *reply_buf = NULL; 252486b00e0dSAlex Elder __le64 snapid; 252586b00e0dSAlex Elder void *p; 252686b00e0dSAlex Elder void *end; 252786b00e0dSAlex Elder char *image_id; 252886b00e0dSAlex Elder u64 overlap; 252986b00e0dSAlex Elder int ret; 253086b00e0dSAlex Elder 253186b00e0dSAlex Elder parent_spec = rbd_spec_alloc(); 253286b00e0dSAlex Elder if (!parent_spec) 253386b00e0dSAlex Elder return -ENOMEM; 253486b00e0dSAlex Elder 253586b00e0dSAlex Elder size = sizeof (__le64) + /* pool_id */ 253686b00e0dSAlex Elder sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX + /* image_id */ 253786b00e0dSAlex Elder sizeof (__le64) + /* snap_id */ 253886b00e0dSAlex Elder sizeof (__le64); /* overlap */ 253986b00e0dSAlex Elder reply_buf = kmalloc(size, GFP_KERNEL); 254086b00e0dSAlex Elder if (!reply_buf) { 254186b00e0dSAlex Elder ret = -ENOMEM; 254286b00e0dSAlex Elder goto out_err; 254386b00e0dSAlex Elder } 254486b00e0dSAlex Elder 254586b00e0dSAlex Elder snapid = cpu_to_le64(CEPH_NOSNAP); 254686b00e0dSAlex Elder ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name, 254786b00e0dSAlex Elder "rbd", "get_parent", 254886b00e0dSAlex Elder (char *) &snapid, sizeof (snapid), 254986b00e0dSAlex Elder (char *) reply_buf, size, 255086b00e0dSAlex Elder CEPH_OSD_FLAG_READ, NULL); 255186b00e0dSAlex Elder dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret); 255286b00e0dSAlex Elder if (ret < 0) 255386b00e0dSAlex Elder goto out_err; 255486b00e0dSAlex Elder 255586b00e0dSAlex Elder ret = -ERANGE; 255686b00e0dSAlex Elder p = 
reply_buf; 255786b00e0dSAlex Elder end = (char *) reply_buf + size; 255886b00e0dSAlex Elder ceph_decode_64_safe(&p, end, parent_spec->pool_id, out_err); 255986b00e0dSAlex Elder if (parent_spec->pool_id == CEPH_NOPOOL) 256086b00e0dSAlex Elder goto out; /* No parent? No problem. */ 256186b00e0dSAlex Elder 2562979ed480SAlex Elder image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL); 256386b00e0dSAlex Elder if (IS_ERR(image_id)) { 256486b00e0dSAlex Elder ret = PTR_ERR(image_id); 256586b00e0dSAlex Elder goto out_err; 256686b00e0dSAlex Elder } 256786b00e0dSAlex Elder parent_spec->image_id = image_id; 256886b00e0dSAlex Elder ceph_decode_64_safe(&p, end, parent_spec->snap_id, out_err); 256986b00e0dSAlex Elder ceph_decode_64_safe(&p, end, overlap, out_err); 257086b00e0dSAlex Elder 257186b00e0dSAlex Elder rbd_dev->parent_overlap = overlap; 257286b00e0dSAlex Elder rbd_dev->parent_spec = parent_spec; 257386b00e0dSAlex Elder parent_spec = NULL; /* rbd_dev now owns this */ 257486b00e0dSAlex Elder out: 257586b00e0dSAlex Elder ret = 0; 257686b00e0dSAlex Elder out_err: 257786b00e0dSAlex Elder kfree(reply_buf); 257886b00e0dSAlex Elder rbd_spec_put(parent_spec); 257986b00e0dSAlex Elder 258086b00e0dSAlex Elder return ret; 258186b00e0dSAlex Elder } 258286b00e0dSAlex Elder 25839e15b77dSAlex Elder static char *rbd_dev_image_name(struct rbd_device *rbd_dev) 25849e15b77dSAlex Elder { 25859e15b77dSAlex Elder size_t image_id_size; 25869e15b77dSAlex Elder char *image_id; 25879e15b77dSAlex Elder void *p; 25889e15b77dSAlex Elder void *end; 25899e15b77dSAlex Elder size_t size; 25909e15b77dSAlex Elder void *reply_buf = NULL; 25919e15b77dSAlex Elder size_t len = 0; 25929e15b77dSAlex Elder char *image_name = NULL; 25939e15b77dSAlex Elder int ret; 25949e15b77dSAlex Elder 25959e15b77dSAlex Elder rbd_assert(!rbd_dev->spec->image_name); 25969e15b77dSAlex Elder 259769e7a02fSAlex Elder len = strlen(rbd_dev->spec->image_id); 259869e7a02fSAlex Elder image_id_size = sizeof (__le32) + len; 
25999e15b77dSAlex Elder image_id = kmalloc(image_id_size, GFP_KERNEL); 26009e15b77dSAlex Elder if (!image_id) 26019e15b77dSAlex Elder return NULL; 26029e15b77dSAlex Elder 26039e15b77dSAlex Elder p = image_id; 26049e15b77dSAlex Elder end = (char *) image_id + image_id_size; 260569e7a02fSAlex Elder ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32) len); 26069e15b77dSAlex Elder 26079e15b77dSAlex Elder size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX; 26089e15b77dSAlex Elder reply_buf = kmalloc(size, GFP_KERNEL); 26099e15b77dSAlex Elder if (!reply_buf) 26109e15b77dSAlex Elder goto out; 26119e15b77dSAlex Elder 26129e15b77dSAlex Elder ret = rbd_req_sync_exec(rbd_dev, RBD_DIRECTORY, 26139e15b77dSAlex Elder "rbd", "dir_get_name", 26149e15b77dSAlex Elder image_id, image_id_size, 26159e15b77dSAlex Elder (char *) reply_buf, size, 26169e15b77dSAlex Elder CEPH_OSD_FLAG_READ, NULL); 26179e15b77dSAlex Elder if (ret < 0) 26189e15b77dSAlex Elder goto out; 26199e15b77dSAlex Elder p = reply_buf; 26209e15b77dSAlex Elder end = (char *) reply_buf + size; 26219e15b77dSAlex Elder image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL); 26229e15b77dSAlex Elder if (IS_ERR(image_name)) 26239e15b77dSAlex Elder image_name = NULL; 26249e15b77dSAlex Elder else 26259e15b77dSAlex Elder dout("%s: name is %s len is %zd\n", __func__, image_name, len); 26269e15b77dSAlex Elder out: 26279e15b77dSAlex Elder kfree(reply_buf); 26289e15b77dSAlex Elder kfree(image_id); 26299e15b77dSAlex Elder 26309e15b77dSAlex Elder return image_name; 26319e15b77dSAlex Elder } 26329e15b77dSAlex Elder 26339e15b77dSAlex Elder /* 26349e15b77dSAlex Elder * When a parent image gets probed, we only have the pool, image, 26359e15b77dSAlex Elder * and snapshot ids but not the names of any of them. This call 26369e15b77dSAlex Elder * is made later to fill in those names. 
It has to be done after 26379e15b77dSAlex Elder * rbd_dev_snaps_update() has completed because some of the 26389e15b77dSAlex Elder * information (in particular, snapshot name) is not available 26399e15b77dSAlex Elder * until then. 26409e15b77dSAlex Elder */ 26419e15b77dSAlex Elder static int rbd_dev_probe_update_spec(struct rbd_device *rbd_dev) 26429e15b77dSAlex Elder { 26439e15b77dSAlex Elder struct ceph_osd_client *osdc; 26449e15b77dSAlex Elder const char *name; 26459e15b77dSAlex Elder void *reply_buf = NULL; 26469e15b77dSAlex Elder int ret; 26479e15b77dSAlex Elder 26489e15b77dSAlex Elder if (rbd_dev->spec->pool_name) 26499e15b77dSAlex Elder return 0; /* Already have the names */ 26509e15b77dSAlex Elder 26519e15b77dSAlex Elder /* Look up the pool name */ 26529e15b77dSAlex Elder 26539e15b77dSAlex Elder osdc = &rbd_dev->rbd_client->client->osdc; 26549e15b77dSAlex Elder name = ceph_pg_pool_name_by_id(osdc->osdmap, rbd_dev->spec->pool_id); 2655935dc89fSAlex Elder if (!name) { 2656935dc89fSAlex Elder rbd_warn(rbd_dev, "there is no pool with id %llu", 2657935dc89fSAlex Elder rbd_dev->spec->pool_id); /* Really a BUG() */ 2658935dc89fSAlex Elder return -EIO; 2659935dc89fSAlex Elder } 26609e15b77dSAlex Elder 26619e15b77dSAlex Elder rbd_dev->spec->pool_name = kstrdup(name, GFP_KERNEL); 26629e15b77dSAlex Elder if (!rbd_dev->spec->pool_name) 26639e15b77dSAlex Elder return -ENOMEM; 26649e15b77dSAlex Elder 26659e15b77dSAlex Elder /* Fetch the image name; tolerate failure here */ 26669e15b77dSAlex Elder 26679e15b77dSAlex Elder name = rbd_dev_image_name(rbd_dev); 266869e7a02fSAlex Elder if (name) 26699e15b77dSAlex Elder rbd_dev->spec->image_name = (char *) name; 267069e7a02fSAlex Elder else 267106ecc6cbSAlex Elder rbd_warn(rbd_dev, "unable to get image name"); 26729e15b77dSAlex Elder 26739e15b77dSAlex Elder /* Look up the snapshot name. 
*/ 26749e15b77dSAlex Elder 26759e15b77dSAlex Elder name = rbd_snap_name(rbd_dev, rbd_dev->spec->snap_id); 26769e15b77dSAlex Elder if (!name) { 2677935dc89fSAlex Elder rbd_warn(rbd_dev, "no snapshot with id %llu", 2678935dc89fSAlex Elder rbd_dev->spec->snap_id); /* Really a BUG() */ 26799e15b77dSAlex Elder ret = -EIO; 26809e15b77dSAlex Elder goto out_err; 26819e15b77dSAlex Elder } 26829e15b77dSAlex Elder rbd_dev->spec->snap_name = kstrdup(name, GFP_KERNEL); 26839e15b77dSAlex Elder if(!rbd_dev->spec->snap_name) 26849e15b77dSAlex Elder goto out_err; 26859e15b77dSAlex Elder 26869e15b77dSAlex Elder return 0; 26879e15b77dSAlex Elder out_err: 26889e15b77dSAlex Elder kfree(reply_buf); 26899e15b77dSAlex Elder kfree(rbd_dev->spec->pool_name); 26909e15b77dSAlex Elder rbd_dev->spec->pool_name = NULL; 26919e15b77dSAlex Elder 26929e15b77dSAlex Elder return ret; 26939e15b77dSAlex Elder } 26949e15b77dSAlex Elder 26956e14b1a6SAlex Elder static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev, u64 *ver) 269635d489f9SAlex Elder { 269735d489f9SAlex Elder size_t size; 269835d489f9SAlex Elder int ret; 269935d489f9SAlex Elder void *reply_buf; 270035d489f9SAlex Elder void *p; 270135d489f9SAlex Elder void *end; 270235d489f9SAlex Elder u64 seq; 270335d489f9SAlex Elder u32 snap_count; 270435d489f9SAlex Elder struct ceph_snap_context *snapc; 270535d489f9SAlex Elder u32 i; 270635d489f9SAlex Elder 270735d489f9SAlex Elder /* 270835d489f9SAlex Elder * We'll need room for the seq value (maximum snapshot id), 270935d489f9SAlex Elder * snapshot count, and array of that many snapshot ids. 271035d489f9SAlex Elder * For now we have a fixed upper limit on the number we're 271135d489f9SAlex Elder * prepared to receive. 
271235d489f9SAlex Elder */ 271335d489f9SAlex Elder size = sizeof (__le64) + sizeof (__le32) + 271435d489f9SAlex Elder RBD_MAX_SNAP_COUNT * sizeof (__le64); 271535d489f9SAlex Elder reply_buf = kzalloc(size, GFP_KERNEL); 271635d489f9SAlex Elder if (!reply_buf) 271735d489f9SAlex Elder return -ENOMEM; 271835d489f9SAlex Elder 271935d489f9SAlex Elder ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name, 272035d489f9SAlex Elder "rbd", "get_snapcontext", 272135d489f9SAlex Elder NULL, 0, 272235d489f9SAlex Elder reply_buf, size, 27236e14b1a6SAlex Elder CEPH_OSD_FLAG_READ, ver); 272435d489f9SAlex Elder dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret); 272535d489f9SAlex Elder if (ret < 0) 272635d489f9SAlex Elder goto out; 272735d489f9SAlex Elder 272835d489f9SAlex Elder ret = -ERANGE; 272935d489f9SAlex Elder p = reply_buf; 273035d489f9SAlex Elder end = (char *) reply_buf + size; 273135d489f9SAlex Elder ceph_decode_64_safe(&p, end, seq, out); 273235d489f9SAlex Elder ceph_decode_32_safe(&p, end, snap_count, out); 273335d489f9SAlex Elder 273435d489f9SAlex Elder /* 273535d489f9SAlex Elder * Make sure the reported number of snapshot ids wouldn't go 273635d489f9SAlex Elder * beyond the end of our buffer. But before checking that, 273735d489f9SAlex Elder * make sure the computed size of the snapshot context we 273835d489f9SAlex Elder * allocate is representable in a size_t. 
273935d489f9SAlex Elder */ 274035d489f9SAlex Elder if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context)) 274135d489f9SAlex Elder / sizeof (u64)) { 274235d489f9SAlex Elder ret = -EINVAL; 274335d489f9SAlex Elder goto out; 274435d489f9SAlex Elder } 274535d489f9SAlex Elder if (!ceph_has_room(&p, end, snap_count * sizeof (__le64))) 274635d489f9SAlex Elder goto out; 274735d489f9SAlex Elder 274835d489f9SAlex Elder size = sizeof (struct ceph_snap_context) + 274935d489f9SAlex Elder snap_count * sizeof (snapc->snaps[0]); 275035d489f9SAlex Elder snapc = kmalloc(size, GFP_KERNEL); 275135d489f9SAlex Elder if (!snapc) { 275235d489f9SAlex Elder ret = -ENOMEM; 275335d489f9SAlex Elder goto out; 275435d489f9SAlex Elder } 275535d489f9SAlex Elder 275635d489f9SAlex Elder atomic_set(&snapc->nref, 1); 275735d489f9SAlex Elder snapc->seq = seq; 275835d489f9SAlex Elder snapc->num_snaps = snap_count; 275935d489f9SAlex Elder for (i = 0; i < snap_count; i++) 276035d489f9SAlex Elder snapc->snaps[i] = ceph_decode_64(&p); 276135d489f9SAlex Elder 276235d489f9SAlex Elder rbd_dev->header.snapc = snapc; 276335d489f9SAlex Elder 276435d489f9SAlex Elder dout(" snap context seq = %llu, snap_count = %u\n", 276535d489f9SAlex Elder (unsigned long long) seq, (unsigned int) snap_count); 276635d489f9SAlex Elder 276735d489f9SAlex Elder out: 276835d489f9SAlex Elder kfree(reply_buf); 276935d489f9SAlex Elder 277035d489f9SAlex Elder return 0; 277135d489f9SAlex Elder } 277235d489f9SAlex Elder 2773b8b1e2dbSAlex Elder static char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, u32 which) 2774b8b1e2dbSAlex Elder { 2775b8b1e2dbSAlex Elder size_t size; 2776b8b1e2dbSAlex Elder void *reply_buf; 2777b8b1e2dbSAlex Elder __le64 snap_id; 2778b8b1e2dbSAlex Elder int ret; 2779b8b1e2dbSAlex Elder void *p; 2780b8b1e2dbSAlex Elder void *end; 2781b8b1e2dbSAlex Elder char *snap_name; 2782b8b1e2dbSAlex Elder 2783b8b1e2dbSAlex Elder size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN; 2784b8b1e2dbSAlex Elder reply_buf = 
kmalloc(size, GFP_KERNEL); 2785b8b1e2dbSAlex Elder if (!reply_buf) 2786b8b1e2dbSAlex Elder return ERR_PTR(-ENOMEM); 2787b8b1e2dbSAlex Elder 2788b8b1e2dbSAlex Elder snap_id = cpu_to_le64(rbd_dev->header.snapc->snaps[which]); 2789b8b1e2dbSAlex Elder ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name, 2790b8b1e2dbSAlex Elder "rbd", "get_snapshot_name", 2791b8b1e2dbSAlex Elder (char *) &snap_id, sizeof (snap_id), 2792b8b1e2dbSAlex Elder reply_buf, size, 2793b8b1e2dbSAlex Elder CEPH_OSD_FLAG_READ, NULL); 2794b8b1e2dbSAlex Elder dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret); 2795b8b1e2dbSAlex Elder if (ret < 0) 2796b8b1e2dbSAlex Elder goto out; 2797b8b1e2dbSAlex Elder 2798b8b1e2dbSAlex Elder p = reply_buf; 2799b8b1e2dbSAlex Elder end = (char *) reply_buf + size; 2800e5c35534SAlex Elder snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL); 2801b8b1e2dbSAlex Elder if (IS_ERR(snap_name)) { 2802b8b1e2dbSAlex Elder ret = PTR_ERR(snap_name); 2803b8b1e2dbSAlex Elder goto out; 2804b8b1e2dbSAlex Elder } else { 2805b8b1e2dbSAlex Elder dout(" snap_id 0x%016llx snap_name = %s\n", 2806b8b1e2dbSAlex Elder (unsigned long long) le64_to_cpu(snap_id), snap_name); 2807b8b1e2dbSAlex Elder } 2808b8b1e2dbSAlex Elder kfree(reply_buf); 2809b8b1e2dbSAlex Elder 2810b8b1e2dbSAlex Elder return snap_name; 2811b8b1e2dbSAlex Elder out: 2812b8b1e2dbSAlex Elder kfree(reply_buf); 2813b8b1e2dbSAlex Elder 2814b8b1e2dbSAlex Elder return ERR_PTR(ret); 2815b8b1e2dbSAlex Elder } 2816b8b1e2dbSAlex Elder 2817b8b1e2dbSAlex Elder static char *rbd_dev_v2_snap_info(struct rbd_device *rbd_dev, u32 which, 2818b8b1e2dbSAlex Elder u64 *snap_size, u64 *snap_features) 2819b8b1e2dbSAlex Elder { 2820b8b1e2dbSAlex Elder __le64 snap_id; 2821b8b1e2dbSAlex Elder u8 order; 2822b8b1e2dbSAlex Elder int ret; 2823b8b1e2dbSAlex Elder 2824b8b1e2dbSAlex Elder snap_id = rbd_dev->header.snapc->snaps[which]; 2825b8b1e2dbSAlex Elder ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, &order, snap_size); 
2826b8b1e2dbSAlex Elder if (ret) 2827b8b1e2dbSAlex Elder return ERR_PTR(ret); 2828b8b1e2dbSAlex Elder ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, snap_features); 2829b8b1e2dbSAlex Elder if (ret) 2830b8b1e2dbSAlex Elder return ERR_PTR(ret); 2831b8b1e2dbSAlex Elder 2832b8b1e2dbSAlex Elder return rbd_dev_v2_snap_name(rbd_dev, which); 2833b8b1e2dbSAlex Elder } 2834b8b1e2dbSAlex Elder 2835b8b1e2dbSAlex Elder static char *rbd_dev_snap_info(struct rbd_device *rbd_dev, u32 which, 2836b8b1e2dbSAlex Elder u64 *snap_size, u64 *snap_features) 2837b8b1e2dbSAlex Elder { 2838b8b1e2dbSAlex Elder if (rbd_dev->image_format == 1) 2839b8b1e2dbSAlex Elder return rbd_dev_v1_snap_info(rbd_dev, which, 2840b8b1e2dbSAlex Elder snap_size, snap_features); 2841b8b1e2dbSAlex Elder if (rbd_dev->image_format == 2) 2842b8b1e2dbSAlex Elder return rbd_dev_v2_snap_info(rbd_dev, which, 2843b8b1e2dbSAlex Elder snap_size, snap_features); 2844b8b1e2dbSAlex Elder return ERR_PTR(-EINVAL); 2845b8b1e2dbSAlex Elder } 2846b8b1e2dbSAlex Elder 2847117973fbSAlex Elder static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver) 2848117973fbSAlex Elder { 2849117973fbSAlex Elder int ret; 2850117973fbSAlex Elder __u8 obj_order; 2851117973fbSAlex Elder 2852117973fbSAlex Elder down_write(&rbd_dev->header_rwsem); 2853117973fbSAlex Elder 2854117973fbSAlex Elder /* Grab old order first, to see if it changes */ 2855117973fbSAlex Elder 2856117973fbSAlex Elder obj_order = rbd_dev->header.obj_order, 2857117973fbSAlex Elder ret = rbd_dev_v2_image_size(rbd_dev); 2858117973fbSAlex Elder if (ret) 2859117973fbSAlex Elder goto out; 2860117973fbSAlex Elder if (rbd_dev->header.obj_order != obj_order) { 2861117973fbSAlex Elder ret = -EIO; 2862117973fbSAlex Elder goto out; 2863117973fbSAlex Elder } 2864117973fbSAlex Elder rbd_update_mapping_size(rbd_dev); 2865117973fbSAlex Elder 2866117973fbSAlex Elder ret = rbd_dev_v2_snap_context(rbd_dev, hver); 2867117973fbSAlex Elder dout("rbd_dev_v2_snap_context returned %d\n", 
ret); 2868117973fbSAlex Elder if (ret) 2869117973fbSAlex Elder goto out; 2870117973fbSAlex Elder ret = rbd_dev_snaps_update(rbd_dev); 2871117973fbSAlex Elder dout("rbd_dev_snaps_update returned %d\n", ret); 2872117973fbSAlex Elder if (ret) 2873117973fbSAlex Elder goto out; 2874117973fbSAlex Elder ret = rbd_dev_snaps_register(rbd_dev); 2875117973fbSAlex Elder dout("rbd_dev_snaps_register returned %d\n", ret); 2876117973fbSAlex Elder out: 2877117973fbSAlex Elder up_write(&rbd_dev->header_rwsem); 2878117973fbSAlex Elder 2879117973fbSAlex Elder return ret; 2880117973fbSAlex Elder } 2881117973fbSAlex Elder 28829d475de5SAlex Elder /* 288335938150SAlex Elder * Scan the rbd device's current snapshot list and compare it to the 288435938150SAlex Elder * newly-received snapshot context. Remove any existing snapshots 288535938150SAlex Elder * not present in the new snapshot context. Add a new snapshot for 288635938150SAlex Elder * any snaphots in the snapshot context not in the current list. 288735938150SAlex Elder * And verify there are no changes to snapshots we already know 288835938150SAlex Elder * about. 288935938150SAlex Elder * 289035938150SAlex Elder * Assumes the snapshots in the snapshot context are sorted by 289135938150SAlex Elder * snapshot id, highest id first. (Snapshots in the rbd_dev's list 289235938150SAlex Elder * are also maintained in that order.) 
2893dfc5606dSYehuda Sadeh */ 2894304f6808SAlex Elder static int rbd_dev_snaps_update(struct rbd_device *rbd_dev) 2895dfc5606dSYehuda Sadeh { 289635938150SAlex Elder struct ceph_snap_context *snapc = rbd_dev->header.snapc; 289735938150SAlex Elder const u32 snap_count = snapc->num_snaps; 289835938150SAlex Elder struct list_head *head = &rbd_dev->snaps; 289935938150SAlex Elder struct list_head *links = head->next; 290035938150SAlex Elder u32 index = 0; 2901dfc5606dSYehuda Sadeh 29029fcbb800SAlex Elder dout("%s: snap count is %u\n", __func__, (unsigned int) snap_count); 290335938150SAlex Elder while (index < snap_count || links != head) { 290435938150SAlex Elder u64 snap_id; 290535938150SAlex Elder struct rbd_snap *snap; 2906cd892126SAlex Elder char *snap_name; 2907cd892126SAlex Elder u64 snap_size = 0; 2908cd892126SAlex Elder u64 snap_features = 0; 2909dfc5606dSYehuda Sadeh 291035938150SAlex Elder snap_id = index < snap_count ? snapc->snaps[index] 291135938150SAlex Elder : CEPH_NOSNAP; 291235938150SAlex Elder snap = links != head ? list_entry(links, struct rbd_snap, node) 291335938150SAlex Elder : NULL; 2914aafb230eSAlex Elder rbd_assert(!snap || snap->id != CEPH_NOSNAP); 2915dfc5606dSYehuda Sadeh 291635938150SAlex Elder if (snap_id == CEPH_NOSNAP || (snap && snap->id > snap_id)) { 291735938150SAlex Elder struct list_head *next = links->next; 2918dfc5606dSYehuda Sadeh 291935938150SAlex Elder /* Existing snapshot not in the new snap context */ 2920dfc5606dSYehuda Sadeh 29210d7dbfceSAlex Elder if (rbd_dev->spec->snap_id == snap->id) 2922d78b650aSAlex Elder atomic_set(&rbd_dev->exists, 0); 292341f38c2bSAlex Elder rbd_remove_snap_dev(snap); 29249fcbb800SAlex Elder dout("%ssnap id %llu has been removed\n", 29250d7dbfceSAlex Elder rbd_dev->spec->snap_id == snap->id ? 
29260d7dbfceSAlex Elder "mapped " : "", 29279fcbb800SAlex Elder (unsigned long long) snap->id); 2928dfc5606dSYehuda Sadeh 292935938150SAlex Elder /* Done with this list entry; advance */ 293035938150SAlex Elder 293135938150SAlex Elder links = next; 293235938150SAlex Elder continue; 2933dfc5606dSYehuda Sadeh } 293435938150SAlex Elder 2935b8b1e2dbSAlex Elder snap_name = rbd_dev_snap_info(rbd_dev, index, 2936cd892126SAlex Elder &snap_size, &snap_features); 2937cd892126SAlex Elder if (IS_ERR(snap_name)) 2938cd892126SAlex Elder return PTR_ERR(snap_name); 2939cd892126SAlex Elder 29409fcbb800SAlex Elder dout("entry %u: snap_id = %llu\n", (unsigned int) snap_count, 29419fcbb800SAlex Elder (unsigned long long) snap_id); 294235938150SAlex Elder if (!snap || (snap_id != CEPH_NOSNAP && snap->id < snap_id)) { 294335938150SAlex Elder struct rbd_snap *new_snap; 294435938150SAlex Elder 294535938150SAlex Elder /* We haven't seen this snapshot before */ 294635938150SAlex Elder 2947c8d18425SAlex Elder new_snap = __rbd_add_snap_dev(rbd_dev, snap_name, 2948cd892126SAlex Elder snap_id, snap_size, snap_features); 29499fcbb800SAlex Elder if (IS_ERR(new_snap)) { 29509fcbb800SAlex Elder int err = PTR_ERR(new_snap); 29519fcbb800SAlex Elder 29529fcbb800SAlex Elder dout(" failed to add dev, error %d\n", err); 29539fcbb800SAlex Elder 29549fcbb800SAlex Elder return err; 29559fcbb800SAlex Elder } 295635938150SAlex Elder 295735938150SAlex Elder /* New goes before existing, or at end of list */ 295835938150SAlex Elder 29599fcbb800SAlex Elder dout(" added dev%s\n", snap ? 
"" : " at end\n"); 296035938150SAlex Elder if (snap) 296135938150SAlex Elder list_add_tail(&new_snap->node, &snap->node); 296235938150SAlex Elder else 2963523f3258SAlex Elder list_add_tail(&new_snap->node, head); 296435938150SAlex Elder } else { 296535938150SAlex Elder /* Already have this one */ 296635938150SAlex Elder 29679fcbb800SAlex Elder dout(" already present\n"); 29689fcbb800SAlex Elder 2969cd892126SAlex Elder rbd_assert(snap->size == snap_size); 2970aafb230eSAlex Elder rbd_assert(!strcmp(snap->name, snap_name)); 2971cd892126SAlex Elder rbd_assert(snap->features == snap_features); 297235938150SAlex Elder 297335938150SAlex Elder /* Done with this list entry; advance */ 297435938150SAlex Elder 297535938150SAlex Elder links = links->next; 2976dfc5606dSYehuda Sadeh } 297735938150SAlex Elder 297835938150SAlex Elder /* Advance to the next entry in the snapshot context */ 297935938150SAlex Elder 298035938150SAlex Elder index++; 2981dfc5606dSYehuda Sadeh } 29829fcbb800SAlex Elder dout("%s: done\n", __func__); 2983dfc5606dSYehuda Sadeh 2984dfc5606dSYehuda Sadeh return 0; 2985dfc5606dSYehuda Sadeh } 2986dfc5606dSYehuda Sadeh 2987304f6808SAlex Elder /* 2988304f6808SAlex Elder * Scan the list of snapshots and register the devices for any that 2989304f6808SAlex Elder * have not already been registered. 
2990304f6808SAlex Elder */ 2991304f6808SAlex Elder static int rbd_dev_snaps_register(struct rbd_device *rbd_dev) 2992304f6808SAlex Elder { 2993304f6808SAlex Elder struct rbd_snap *snap; 2994304f6808SAlex Elder int ret = 0; 2995304f6808SAlex Elder 2996304f6808SAlex Elder dout("%s called\n", __func__); 299786ff77bbSAlex Elder if (WARN_ON(!device_is_registered(&rbd_dev->dev))) 299886ff77bbSAlex Elder return -EIO; 2999304f6808SAlex Elder 3000304f6808SAlex Elder list_for_each_entry(snap, &rbd_dev->snaps, node) { 3001304f6808SAlex Elder if (!rbd_snap_registered(snap)) { 3002304f6808SAlex Elder ret = rbd_register_snap_dev(snap, &rbd_dev->dev); 3003304f6808SAlex Elder if (ret < 0) 3004304f6808SAlex Elder break; 3005304f6808SAlex Elder } 3006304f6808SAlex Elder } 3007304f6808SAlex Elder dout("%s: returning %d\n", __func__, ret); 3008304f6808SAlex Elder 3009304f6808SAlex Elder return ret; 3010304f6808SAlex Elder } 3011304f6808SAlex Elder 3012dfc5606dSYehuda Sadeh static int rbd_bus_add_dev(struct rbd_device *rbd_dev) 3013dfc5606dSYehuda Sadeh { 3014dfc5606dSYehuda Sadeh struct device *dev; 3015cd789ab9SAlex Elder int ret; 3016dfc5606dSYehuda Sadeh 3017dfc5606dSYehuda Sadeh mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); 3018dfc5606dSYehuda Sadeh 3019cd789ab9SAlex Elder dev = &rbd_dev->dev; 3020dfc5606dSYehuda Sadeh dev->bus = &rbd_bus_type; 3021dfc5606dSYehuda Sadeh dev->type = &rbd_device_type; 3022dfc5606dSYehuda Sadeh dev->parent = &rbd_root_dev; 3023dfc5606dSYehuda Sadeh dev->release = rbd_dev_release; 3024de71a297SAlex Elder dev_set_name(dev, "%d", rbd_dev->dev_id); 3025dfc5606dSYehuda Sadeh ret = device_register(dev); 3026dfc5606dSYehuda Sadeh 3027dfc5606dSYehuda Sadeh mutex_unlock(&ctl_mutex); 3028cd789ab9SAlex Elder 3029dfc5606dSYehuda Sadeh return ret; 3030602adf40SYehuda Sadeh } 3031602adf40SYehuda Sadeh 3032dfc5606dSYehuda Sadeh static void rbd_bus_del_dev(struct rbd_device *rbd_dev) 3033dfc5606dSYehuda Sadeh { 3034dfc5606dSYehuda Sadeh 
device_unregister(&rbd_dev->dev); 3035dfc5606dSYehuda Sadeh } 3036dfc5606dSYehuda Sadeh 303759c2be1eSYehuda Sadeh static int rbd_init_watch_dev(struct rbd_device *rbd_dev) 303859c2be1eSYehuda Sadeh { 303959c2be1eSYehuda Sadeh int ret, rc; 304059c2be1eSYehuda Sadeh 304159c2be1eSYehuda Sadeh do { 30420e6f322dSAlex Elder ret = rbd_req_sync_watch(rbd_dev); 304359c2be1eSYehuda Sadeh if (ret == -ERANGE) { 3044117973fbSAlex Elder rc = rbd_dev_refresh(rbd_dev, NULL); 304559c2be1eSYehuda Sadeh if (rc < 0) 304659c2be1eSYehuda Sadeh return rc; 304759c2be1eSYehuda Sadeh } 304859c2be1eSYehuda Sadeh } while (ret == -ERANGE); 304959c2be1eSYehuda Sadeh 305059c2be1eSYehuda Sadeh return ret; 305159c2be1eSYehuda Sadeh } 305259c2be1eSYehuda Sadeh 3053e2839308SAlex Elder static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0); 30541ddbe94eSAlex Elder 30551ddbe94eSAlex Elder /* 3056499afd5bSAlex Elder * Get a unique rbd identifier for the given new rbd_dev, and add 3057499afd5bSAlex Elder * the rbd_dev to the global list. The minimum rbd id is 1. 30581ddbe94eSAlex Elder */ 3059e2839308SAlex Elder static void rbd_dev_id_get(struct rbd_device *rbd_dev) 3060b7f23c36SAlex Elder { 3061e2839308SAlex Elder rbd_dev->dev_id = atomic64_inc_return(&rbd_dev_id_max); 3062499afd5bSAlex Elder 3063499afd5bSAlex Elder spin_lock(&rbd_dev_list_lock); 3064499afd5bSAlex Elder list_add_tail(&rbd_dev->node, &rbd_dev_list); 3065499afd5bSAlex Elder spin_unlock(&rbd_dev_list_lock); 3066e2839308SAlex Elder dout("rbd_dev %p given dev id %llu\n", rbd_dev, 3067e2839308SAlex Elder (unsigned long long) rbd_dev->dev_id); 3068b7f23c36SAlex Elder } 3069b7f23c36SAlex Elder 30701ddbe94eSAlex Elder /* 3071499afd5bSAlex Elder * Remove an rbd_dev from the global list, and record that its 3072499afd5bSAlex Elder * identifier is no longer in use. 
30731ddbe94eSAlex Elder */ 3074e2839308SAlex Elder static void rbd_dev_id_put(struct rbd_device *rbd_dev) 30751ddbe94eSAlex Elder { 3076d184f6bfSAlex Elder struct list_head *tmp; 3077de71a297SAlex Elder int rbd_id = rbd_dev->dev_id; 3078d184f6bfSAlex Elder int max_id; 3079d184f6bfSAlex Elder 3080aafb230eSAlex Elder rbd_assert(rbd_id > 0); 3081499afd5bSAlex Elder 3082e2839308SAlex Elder dout("rbd_dev %p released dev id %llu\n", rbd_dev, 3083e2839308SAlex Elder (unsigned long long) rbd_dev->dev_id); 3084499afd5bSAlex Elder spin_lock(&rbd_dev_list_lock); 3085499afd5bSAlex Elder list_del_init(&rbd_dev->node); 3086d184f6bfSAlex Elder 3087d184f6bfSAlex Elder /* 3088d184f6bfSAlex Elder * If the id being "put" is not the current maximum, there 3089d184f6bfSAlex Elder * is nothing special we need to do. 3090d184f6bfSAlex Elder */ 3091e2839308SAlex Elder if (rbd_id != atomic64_read(&rbd_dev_id_max)) { 3092d184f6bfSAlex Elder spin_unlock(&rbd_dev_list_lock); 3093d184f6bfSAlex Elder return; 3094d184f6bfSAlex Elder } 3095d184f6bfSAlex Elder 3096d184f6bfSAlex Elder /* 3097d184f6bfSAlex Elder * We need to update the current maximum id. Search the 3098d184f6bfSAlex Elder * list to find out what it is. We're more likely to find 3099d184f6bfSAlex Elder * the maximum at the end, so search the list backward. 3100d184f6bfSAlex Elder */ 3101d184f6bfSAlex Elder max_id = 0; 3102d184f6bfSAlex Elder list_for_each_prev(tmp, &rbd_dev_list) { 3103d184f6bfSAlex Elder struct rbd_device *rbd_dev; 3104d184f6bfSAlex Elder 3105d184f6bfSAlex Elder rbd_dev = list_entry(tmp, struct rbd_device, node); 3106b213e0b1SAlex Elder if (rbd_dev->dev_id > max_id) 3107b213e0b1SAlex Elder max_id = rbd_dev->dev_id; 3108d184f6bfSAlex Elder } 3109499afd5bSAlex Elder spin_unlock(&rbd_dev_list_lock); 31101ddbe94eSAlex Elder 31111ddbe94eSAlex Elder /* 3112e2839308SAlex Elder * The max id could have been updated by rbd_dev_id_get(), in 3113d184f6bfSAlex Elder * which case it now accurately reflects the new maximum. 
3114d184f6bfSAlex Elder * Be careful not to overwrite the maximum value in that 3115d184f6bfSAlex Elder * case. 31161ddbe94eSAlex Elder */ 3117e2839308SAlex Elder atomic64_cmpxchg(&rbd_dev_id_max, rbd_id, max_id); 3118e2839308SAlex Elder dout(" max dev id has been reset\n"); 3119b7f23c36SAlex Elder } 3120b7f23c36SAlex Elder 3121a725f65eSAlex Elder /* 3122e28fff26SAlex Elder * Skips over white space at *buf, and updates *buf to point to the 3123e28fff26SAlex Elder * first found non-space character (if any). Returns the length of 3124593a9e7bSAlex Elder * the token (string of non-white space characters) found. Note 3125593a9e7bSAlex Elder * that *buf must be terminated with '\0'. 3126e28fff26SAlex Elder */ 3127e28fff26SAlex Elder static inline size_t next_token(const char **buf) 3128e28fff26SAlex Elder { 3129e28fff26SAlex Elder /* 3130e28fff26SAlex Elder * These are the characters that produce nonzero for 3131e28fff26SAlex Elder * isspace() in the "C" and "POSIX" locales. 3132e28fff26SAlex Elder */ 3133e28fff26SAlex Elder const char *spaces = " \f\n\r\t\v"; 3134e28fff26SAlex Elder 3135e28fff26SAlex Elder *buf += strspn(*buf, spaces); /* Find start of token */ 3136e28fff26SAlex Elder 3137e28fff26SAlex Elder return strcspn(*buf, spaces); /* Return token length */ 3138e28fff26SAlex Elder } 3139e28fff26SAlex Elder 3140e28fff26SAlex Elder /* 3141e28fff26SAlex Elder * Finds the next token in *buf, and if the provided token buffer is 3142e28fff26SAlex Elder * big enough, copies the found token into it. The result, if 3143593a9e7bSAlex Elder * copied, is guaranteed to be terminated with '\0'. Note that *buf 3144593a9e7bSAlex Elder * must be terminated with '\0' on entry. 3145e28fff26SAlex Elder * 3146e28fff26SAlex Elder * Returns the length of the token found (not including the '\0'). 3147e28fff26SAlex Elder * Return value will be 0 if no token is found, and it will be >= 3148e28fff26SAlex Elder * token_size if the token would not fit. 
3149e28fff26SAlex Elder * 3150593a9e7bSAlex Elder * The *buf pointer will be updated to point beyond the end of the 3151e28fff26SAlex Elder * found token. Note that this occurs even if the token buffer is 3152e28fff26SAlex Elder * too small to hold it. 3153e28fff26SAlex Elder */ 3154e28fff26SAlex Elder static inline size_t copy_token(const char **buf, 3155e28fff26SAlex Elder char *token, 3156e28fff26SAlex Elder size_t token_size) 3157e28fff26SAlex Elder { 3158e28fff26SAlex Elder size_t len; 3159e28fff26SAlex Elder 3160e28fff26SAlex Elder len = next_token(buf); 3161e28fff26SAlex Elder if (len < token_size) { 3162e28fff26SAlex Elder memcpy(token, *buf, len); 3163e28fff26SAlex Elder *(token + len) = '\0'; 3164e28fff26SAlex Elder } 3165e28fff26SAlex Elder *buf += len; 3166e28fff26SAlex Elder 3167e28fff26SAlex Elder return len; 3168e28fff26SAlex Elder } 3169e28fff26SAlex Elder 3170e28fff26SAlex Elder /* 3171ea3352f4SAlex Elder * Finds the next token in *buf, dynamically allocates a buffer big 3172ea3352f4SAlex Elder * enough to hold a copy of it, and copies the token into the new 3173ea3352f4SAlex Elder * buffer. The copy is guaranteed to be terminated with '\0'. Note 3174ea3352f4SAlex Elder * that a duplicate buffer is created even for a zero-length token. 3175ea3352f4SAlex Elder * 3176ea3352f4SAlex Elder * Returns a pointer to the newly-allocated duplicate, or a null 3177ea3352f4SAlex Elder * pointer if memory for the duplicate was not available. If 3178ea3352f4SAlex Elder * the lenp argument is a non-null pointer, the length of the token 3179ea3352f4SAlex Elder * (not including the '\0') is returned in *lenp. 3180ea3352f4SAlex Elder * 3181ea3352f4SAlex Elder * If successful, the *buf pointer will be updated to point beyond 3182ea3352f4SAlex Elder * the end of the found token. 3183ea3352f4SAlex Elder * 3184ea3352f4SAlex Elder * Note: uses GFP_KERNEL for allocation. 
3185ea3352f4SAlex Elder */ 3186ea3352f4SAlex Elder static inline char *dup_token(const char **buf, size_t *lenp) 3187ea3352f4SAlex Elder { 3188ea3352f4SAlex Elder char *dup; 3189ea3352f4SAlex Elder size_t len; 3190ea3352f4SAlex Elder 3191ea3352f4SAlex Elder len = next_token(buf); 31924caf35f9SAlex Elder dup = kmemdup(*buf, len + 1, GFP_KERNEL); 3193ea3352f4SAlex Elder if (!dup) 3194ea3352f4SAlex Elder return NULL; 3195ea3352f4SAlex Elder *(dup + len) = '\0'; 3196ea3352f4SAlex Elder *buf += len; 3197ea3352f4SAlex Elder 3198ea3352f4SAlex Elder if (lenp) 3199ea3352f4SAlex Elder *lenp = len; 3200ea3352f4SAlex Elder 3201ea3352f4SAlex Elder return dup; 3202ea3352f4SAlex Elder } 3203ea3352f4SAlex Elder 3204ea3352f4SAlex Elder /* 3205859c31dfSAlex Elder * Parse the options provided for an "rbd add" (i.e., rbd image 3206859c31dfSAlex Elder * mapping) request. These arrive via a write to /sys/bus/rbd/add, 3207859c31dfSAlex Elder * and the data written is passed here via a NUL-terminated buffer. 3208859c31dfSAlex Elder * Returns 0 if successful or an error code otherwise. 3209d22f76e7SAlex Elder * 3210859c31dfSAlex Elder * The information extracted from these options is recorded in 3211859c31dfSAlex Elder * the other parameters which return dynamically-allocated 3212859c31dfSAlex Elder * structures: 3213859c31dfSAlex Elder * ceph_opts 3214859c31dfSAlex Elder * The address of a pointer that will refer to a ceph options 3215859c31dfSAlex Elder * structure. Caller must release the returned pointer using 3216859c31dfSAlex Elder * ceph_destroy_options() when it is no longer needed. 3217859c31dfSAlex Elder * rbd_opts 3218859c31dfSAlex Elder * Address of an rbd options pointer. Fully initialized by 3219859c31dfSAlex Elder * this function; caller must release with kfree(). 3220859c31dfSAlex Elder * spec 3221859c31dfSAlex Elder * Address of an rbd image specification pointer. Fully 3222859c31dfSAlex Elder * initialized by this function based on parsed options. 
*      Caller must release with rbd_spec_put().
 *
 * The options passed take this form:
 *  <mon_addrs> <options> <pool_name> <image_name> [<snap_name>]
 * where:
 *  <mon_addrs>
 *      A comma-separated list of one or more monitor addresses.
 *      A monitor address is an ip address, optionally followed
 *      by a port number (separated by a colon).
 *        I.e.:  ip1[:port1][,ip2[:port2]...]
 *  <options>
 *      A comma-separated list of ceph and/or rbd options.
 *  <pool_name>
 *      The name of the rados pool containing the rbd image.
 *  <image_name>
 *      The name of the image in that pool to map.
 *  <snap_name>
 *      An optional snapshot name.  If provided, the mapping will
 *      present data from the image at the time that snapshot was
 *      created.  The image head is used if no snapshot name is
 *      provided.  Snapshot mappings are always read-only.
3244a725f65eSAlex Elder */ 3245859c31dfSAlex Elder static int rbd_add_parse_args(const char *buf, 3246dc79b113SAlex Elder struct ceph_options **ceph_opts, 3247859c31dfSAlex Elder struct rbd_options **opts, 3248859c31dfSAlex Elder struct rbd_spec **rbd_spec) 3249a725f65eSAlex Elder { 3250e28fff26SAlex Elder size_t len; 3251859c31dfSAlex Elder char *options; 32520ddebc0cSAlex Elder const char *mon_addrs; 32530ddebc0cSAlex Elder size_t mon_addrs_size; 3254859c31dfSAlex Elder struct rbd_spec *spec = NULL; 32554e9afebaSAlex Elder struct rbd_options *rbd_opts = NULL; 3256859c31dfSAlex Elder struct ceph_options *copts; 3257dc79b113SAlex Elder int ret; 3258e28fff26SAlex Elder 3259e28fff26SAlex Elder /* The first four tokens are required */ 3260e28fff26SAlex Elder 32617ef3214aSAlex Elder len = next_token(&buf); 32624fb5d671SAlex Elder if (!len) { 32634fb5d671SAlex Elder rbd_warn(NULL, "no monitor address(es) provided"); 32644fb5d671SAlex Elder return -EINVAL; 32654fb5d671SAlex Elder } 32660ddebc0cSAlex Elder mon_addrs = buf; 3267f28e565aSAlex Elder mon_addrs_size = len + 1; 32687ef3214aSAlex Elder buf += len; 3269a725f65eSAlex Elder 3270dc79b113SAlex Elder ret = -EINVAL; 3271f28e565aSAlex Elder options = dup_token(&buf, NULL); 3272f28e565aSAlex Elder if (!options) 3273dc79b113SAlex Elder return -ENOMEM; 32744fb5d671SAlex Elder if (!*options) { 32754fb5d671SAlex Elder rbd_warn(NULL, "no options provided"); 32764fb5d671SAlex Elder goto out_err; 32774fb5d671SAlex Elder } 3278a725f65eSAlex Elder 3279859c31dfSAlex Elder spec = rbd_spec_alloc(); 3280859c31dfSAlex Elder if (!spec) 3281f28e565aSAlex Elder goto out_mem; 3282859c31dfSAlex Elder 3283859c31dfSAlex Elder spec->pool_name = dup_token(&buf, NULL); 3284859c31dfSAlex Elder if (!spec->pool_name) 3285859c31dfSAlex Elder goto out_mem; 32864fb5d671SAlex Elder if (!*spec->pool_name) { 32874fb5d671SAlex Elder rbd_warn(NULL, "no pool name provided"); 32884fb5d671SAlex Elder goto out_err; 32894fb5d671SAlex Elder } 3290e28fff26SAlex 
Elder 329169e7a02fSAlex Elder spec->image_name = dup_token(&buf, NULL); 3292859c31dfSAlex Elder if (!spec->image_name) 3293f28e565aSAlex Elder goto out_mem; 32944fb5d671SAlex Elder if (!*spec->image_name) { 32954fb5d671SAlex Elder rbd_warn(NULL, "no image name provided"); 32964fb5d671SAlex Elder goto out_err; 32974fb5d671SAlex Elder } 3298e28fff26SAlex Elder 3299f28e565aSAlex Elder /* 3300f28e565aSAlex Elder * Snapshot name is optional; default is to use "-" 3301f28e565aSAlex Elder * (indicating the head/no snapshot). 3302f28e565aSAlex Elder */ 33033feeb894SAlex Elder len = next_token(&buf); 3304820a5f3eSAlex Elder if (!len) { 33053feeb894SAlex Elder buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */ 33063feeb894SAlex Elder len = sizeof (RBD_SNAP_HEAD_NAME) - 1; 3307f28e565aSAlex Elder } else if (len > RBD_MAX_SNAP_NAME_LEN) { 3308dc79b113SAlex Elder ret = -ENAMETOOLONG; 3309f28e565aSAlex Elder goto out_err; 3310849b4260SAlex Elder } 33114caf35f9SAlex Elder spec->snap_name = kmemdup(buf, len + 1, GFP_KERNEL); 3312859c31dfSAlex Elder if (!spec->snap_name) 3313f28e565aSAlex Elder goto out_mem; 3314859c31dfSAlex Elder *(spec->snap_name + len) = '\0'; 3315e5c35534SAlex Elder 33160ddebc0cSAlex Elder /* Initialize all rbd options to the defaults */ 3317e28fff26SAlex Elder 33184e9afebaSAlex Elder rbd_opts = kzalloc(sizeof (*rbd_opts), GFP_KERNEL); 33194e9afebaSAlex Elder if (!rbd_opts) 33204e9afebaSAlex Elder goto out_mem; 33214e9afebaSAlex Elder 33224e9afebaSAlex Elder rbd_opts->read_only = RBD_READ_ONLY_DEFAULT; 3323d22f76e7SAlex Elder 3324859c31dfSAlex Elder copts = ceph_parse_options(options, mon_addrs, 33250ddebc0cSAlex Elder mon_addrs + mon_addrs_size - 1, 33264e9afebaSAlex Elder parse_rbd_opts_token, rbd_opts); 3327859c31dfSAlex Elder if (IS_ERR(copts)) { 3328859c31dfSAlex Elder ret = PTR_ERR(copts); 3329dc79b113SAlex Elder goto out_err; 3330dc79b113SAlex Elder } 3331859c31dfSAlex Elder kfree(options); 3332859c31dfSAlex Elder 3333859c31dfSAlex Elder *ceph_opts = 
copts; 33344e9afebaSAlex Elder *opts = rbd_opts; 3335859c31dfSAlex Elder *rbd_spec = spec; 33360ddebc0cSAlex Elder 3337dc79b113SAlex Elder return 0; 3338f28e565aSAlex Elder out_mem: 3339dc79b113SAlex Elder ret = -ENOMEM; 3340d22f76e7SAlex Elder out_err: 3341859c31dfSAlex Elder kfree(rbd_opts); 3342859c31dfSAlex Elder rbd_spec_put(spec); 3343f28e565aSAlex Elder kfree(options); 3344d22f76e7SAlex Elder 3345dc79b113SAlex Elder return ret; 3346a725f65eSAlex Elder } 3347a725f65eSAlex Elder 3348589d30e0SAlex Elder /* 3349589d30e0SAlex Elder * An rbd format 2 image has a unique identifier, distinct from the 3350589d30e0SAlex Elder * name given to it by the user. Internally, that identifier is 3351589d30e0SAlex Elder * what's used to specify the names of objects related to the image. 3352589d30e0SAlex Elder * 3353589d30e0SAlex Elder * A special "rbd id" object is used to map an rbd image name to its 3354589d30e0SAlex Elder * id. If that object doesn't exist, then there is no v2 rbd image 3355589d30e0SAlex Elder * with the supplied name. 3356589d30e0SAlex Elder * 3357589d30e0SAlex Elder * This function will record the given rbd_dev's image_id field if 3358589d30e0SAlex Elder * it can be determined, and in that case will return 0. If any 3359589d30e0SAlex Elder * errors occur a negative errno will be returned and the rbd_dev's 3360589d30e0SAlex Elder * image_id field will be unchanged (and should be NULL). 3361589d30e0SAlex Elder */ 3362589d30e0SAlex Elder static int rbd_dev_image_id(struct rbd_device *rbd_dev) 3363589d30e0SAlex Elder { 3364589d30e0SAlex Elder int ret; 3365589d30e0SAlex Elder size_t size; 3366589d30e0SAlex Elder char *object_name; 3367589d30e0SAlex Elder void *response; 3368589d30e0SAlex Elder void *p; 3369589d30e0SAlex Elder 3370589d30e0SAlex Elder /* 33712c0d0a10SAlex Elder * When probing a parent image, the image id is already 33722c0d0a10SAlex Elder * known (and the image name likely is not). 
There's no 33732c0d0a10SAlex Elder * need to fetch the image id again in this case. 33742c0d0a10SAlex Elder */ 33752c0d0a10SAlex Elder if (rbd_dev->spec->image_id) 33762c0d0a10SAlex Elder return 0; 33772c0d0a10SAlex Elder 33782c0d0a10SAlex Elder /* 3379589d30e0SAlex Elder * First, see if the format 2 image id file exists, and if 3380589d30e0SAlex Elder * so, get the image's persistent id from it. 3381589d30e0SAlex Elder */ 338269e7a02fSAlex Elder size = sizeof (RBD_ID_PREFIX) + strlen(rbd_dev->spec->image_name); 3383589d30e0SAlex Elder object_name = kmalloc(size, GFP_NOIO); 3384589d30e0SAlex Elder if (!object_name) 3385589d30e0SAlex Elder return -ENOMEM; 33860d7dbfceSAlex Elder sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->spec->image_name); 3387589d30e0SAlex Elder dout("rbd id object name is %s\n", object_name); 3388589d30e0SAlex Elder 3389589d30e0SAlex Elder /* Response will be an encoded string, which includes a length */ 3390589d30e0SAlex Elder 3391589d30e0SAlex Elder size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX; 3392589d30e0SAlex Elder response = kzalloc(size, GFP_NOIO); 3393589d30e0SAlex Elder if (!response) { 3394589d30e0SAlex Elder ret = -ENOMEM; 3395589d30e0SAlex Elder goto out; 3396589d30e0SAlex Elder } 3397589d30e0SAlex Elder 3398589d30e0SAlex Elder ret = rbd_req_sync_exec(rbd_dev, object_name, 3399589d30e0SAlex Elder "rbd", "get_id", 3400589d30e0SAlex Elder NULL, 0, 3401589d30e0SAlex Elder response, RBD_IMAGE_ID_LEN_MAX, 3402589d30e0SAlex Elder CEPH_OSD_FLAG_READ, NULL); 3403589d30e0SAlex Elder dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret); 3404589d30e0SAlex Elder if (ret < 0) 3405589d30e0SAlex Elder goto out; 3406a0ea3a40SAlex Elder ret = 0; /* rbd_req_sync_exec() can return positive */ 3407589d30e0SAlex Elder 3408589d30e0SAlex Elder p = response; 34090d7dbfceSAlex Elder rbd_dev->spec->image_id = ceph_extract_encoded_string(&p, 3410589d30e0SAlex Elder p + RBD_IMAGE_ID_LEN_MAX, 3411979ed480SAlex Elder NULL, GFP_NOIO); 
34120d7dbfceSAlex Elder if (IS_ERR(rbd_dev->spec->image_id)) { 34130d7dbfceSAlex Elder ret = PTR_ERR(rbd_dev->spec->image_id); 34140d7dbfceSAlex Elder rbd_dev->spec->image_id = NULL; 3415589d30e0SAlex Elder } else { 34160d7dbfceSAlex Elder dout("image_id is %s\n", rbd_dev->spec->image_id); 3417589d30e0SAlex Elder } 3418589d30e0SAlex Elder out: 3419589d30e0SAlex Elder kfree(response); 3420589d30e0SAlex Elder kfree(object_name); 3421589d30e0SAlex Elder 3422589d30e0SAlex Elder return ret; 3423589d30e0SAlex Elder } 3424589d30e0SAlex Elder

/*
 * Probe a format 1 image.
 *
 * Records an empty image id, builds the header object name from the
 * image name plus RBD_SUFFIX, and reads the on-disk header into
 * rbd_dev->header.
 *
 * Returns 0 on success.  On failure a negative errno is returned and
 * both header_name and spec->image_id are freed and reset to NULL.
 */
static int rbd_dev_v1_probe(struct rbd_device *rbd_dev)
{
	int ret;
	size_t size;

	/* Version 1 images have no id; empty string is used */

	rbd_dev->spec->image_id = kstrdup("", GFP_KERNEL);
	if (!rbd_dev->spec->image_id)
		return -ENOMEM;

	/*
	 * Record the header object name for this rbd image.
	 * sizeof (RBD_SUFFIX) accounts for the terminating NUL, so the
	 * buffer is exactly large enough for the sprintf() below.
	 */

	size = strlen(rbd_dev->spec->image_name) + sizeof (RBD_SUFFIX);
	rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
	if (!rbd_dev->header_name) {
		ret = -ENOMEM;
		goto out_err;
	}
	sprintf(rbd_dev->header_name, "%s%s",
		rbd_dev->spec->image_name, RBD_SUFFIX);

	/* Populate rbd image metadata */

	ret = rbd_read_header(rbd_dev, &rbd_dev->header);
	if (ret < 0)
		goto out_err;

	/* Version 1 images have no parent (no layering) */

	rbd_dev->parent_spec = NULL;
	rbd_dev->parent_overlap = 0;

	rbd_dev->image_format = 1;

	dout("discovered version 1 image, header name is %s\n",
		rbd_dev->header_name);

	return 0;

out_err:
	kfree(rbd_dev->header_name);
	rbd_dev->header_name = NULL;
	kfree(rbd_dev->spec->image_id);
	rbd_dev->spec->image_id = NULL;

	return ret;
}

/*
 * Probe a format 2 image.
 *
 * The caller has already filled in spec->image_id.  This builds the
 * header object name (RBD_HEADER_PREFIX followed by the image id) and
 * then fetches, in order: image size/object order, object prefix,
 * features, parent info (only when RBD_FEATURE_LAYERING is set) and
 * the snapshot context.
 *
 * Returns 0 on success.  On failure a negative errno is returned and
 * the parent spec, header name and object prefix are released.
 */
static int rbd_dev_v2_probe(struct rbd_device *rbd_dev)
{
	size_t size;
	int ret;
	u64 ver = 0;

	/*
	 * Image id was filled in by the caller.  Record the header
	 * object name for this rbd image.
	 */
	size = sizeof (RBD_HEADER_PREFIX) + strlen(rbd_dev->spec->image_id);
	rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
	if (!rbd_dev->header_name)
		return -ENOMEM;
	sprintf(rbd_dev->header_name, "%s%s",
		RBD_HEADER_PREFIX, rbd_dev->spec->image_id);

	/* Get the size and object order for the image */

	ret = rbd_dev_v2_image_size(rbd_dev);
	if (ret < 0)
		goto out_err;

	/* Get the object prefix (a.k.a. block_name) for the image */

	ret = rbd_dev_v2_object_prefix(rbd_dev);
	if (ret < 0)
		goto out_err;

	/* Get and check the features for the image */

	ret = rbd_dev_v2_features(rbd_dev);
	if (ret < 0)
		goto out_err;

	/* If the image supports layering, get the parent info */

	if (rbd_dev->header.features & RBD_FEATURE_LAYERING) {
		ret = rbd_dev_v2_parent_info(rbd_dev);
		if (ret < 0)
			goto out_err;
	}

	/* crypto and compression type aren't (yet) supported for v2 images */

	rbd_dev->header.crypt_type = 0;
	rbd_dev->header.comp_type = 0;

	/* Get the snapshot context, plus the header version */

	ret = rbd_dev_v2_snap_context(rbd_dev, &ver);
	if (ret)
		goto out_err;
	rbd_dev->header.obj_version = ver;

	rbd_dev->image_format = 2;

	dout("discovered version 2 image, header name is %s\n",
		rbd_dev->header_name);

	return 0;

out_err:
	rbd_dev->parent_overlap = 0;
	/* parent_spec may never have been set; presumably rbd_spec_put() accepts NULL -- verify */
	rbd_spec_put(rbd_dev->parent_spec);
	rbd_dev->parent_spec = NULL;
	kfree(rbd_dev->header_name);
	rbd_dev->header_name = NULL;
	kfree(rbd_dev->header.object_prefix);
	rbd_dev->header.object_prefix = NULL;

	return ret;
}

/*
 * Second half of probing, run once the image header has been read:
 * update snapshots and the spec, set up the mapping, allocate a
 * device id and name, register a block major, initialize the disk,
 * add the device to the rbd bus, register snapshots, set up the
 * header watch and finally announce the disk.
 *
 * Returns 0 on success or a negative errno.  Failures before the
 * device has been added to the bus are unwound here step by step;
 * later failures are handed to rbd_bus_del_dev().
 */
static int rbd_dev_probe_finish(struct rbd_device *rbd_dev)
{
	int ret;

	/* no need to lock here, as rbd_dev is not registered yet */
	ret = rbd_dev_snaps_update(rbd_dev);
	if (ret)
		return ret;

	ret = rbd_dev_probe_update_spec(rbd_dev);
	if (ret)
		goto err_out_snaps;

	ret = rbd_dev_set_mapping(rbd_dev);
	if (ret)
		goto err_out_snaps;

	/* generate unique id: find highest unique id, add one */
	rbd_dev_id_get(rbd_dev);

	/* Fill in the device name, now that we have its id. */
	BUILD_BUG_ON(DEV_NAME_LEN
			< sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
	sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);

	/* Get our block major device number. */

	ret = register_blkdev(0, rbd_dev->name);
	if (ret < 0)
		goto err_out_id;
	rbd_dev->major = ret;

	/* Set up the blkdev mapping. */

	ret = rbd_init_disk(rbd_dev);
	if (ret)
		goto err_out_blkdev;

	ret = rbd_bus_add_dev(rbd_dev);
	if (ret)
		goto err_out_disk;

	/*
	 * At this point cleanup in the event of an error is the job
	 * of the sysfs code (initiated by rbd_bus_del_dev()).
	 */
	down_write(&rbd_dev->header_rwsem);
	ret = rbd_dev_snaps_register(rbd_dev);
	up_write(&rbd_dev->header_rwsem);
	if (ret)
		goto err_out_bus;

	ret = rbd_init_watch_dev(rbd_dev);
	if (ret)
		goto err_out_bus;

	/* Everything's ready.  Announce the disk to the world. */

	add_disk(rbd_dev->disk);

	pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name,
		(unsigned long long) rbd_dev->mapping.size);

	return ret;

err_out_bus:
	/* this will also clean up rest of rbd_dev stuff */

	rbd_bus_del_dev(rbd_dev);

	return ret;

err_out_disk:
	rbd_free_disk(rbd_dev);
err_out_blkdev:
	unregister_blkdev(rbd_dev->major, rbd_dev->name);
err_out_id:
	rbd_dev_id_put(rbd_dev);
err_out_snaps:
	rbd_remove_all_snaps(rbd_dev);

	return ret;
}

/*
 * Probe for the existence of the header object for the given rbd
 * device.  For format 2 images this includes determining the image
 * id.
 *
 * Returns 0 on success or a negative errno.
 */
static int rbd_dev_probe(struct rbd_device *rbd_dev)
{
	int ret;

	/*
	 * Get the id from the image id object.  If it's not a
	 * format 2 image, we'll get ENOENT back, and we'll assume
	 * it's a format 1 image.
	 *
	 * NOTE(review): the test below treats *any* non-zero result
	 * (not only -ENOENT) as "format 1" -- confirm this is intended.
	 */
	ret = rbd_dev_image_id(rbd_dev);
	if (ret)
		ret = rbd_dev_v1_probe(rbd_dev);
	else
		ret = rbd_dev_v2_probe(rbd_dev);
	if (ret) {
		dout("probe failed, returning %d\n", ret);

		return ret;
	}

	/*
	 * NOTE(review): when rbd_dev_probe_finish() fails via its
	 * rbd_bus_del_dev() path, and that ends up invoking
	 * rbd_dev_release(), the header has already been freed and
	 * rbd_dev destroyed by the time we get here -- confirm whether
	 * this rbd_header_free() (and the caller's rbd_dev_destroy())
	 * can then run a second time.
	 */
	ret = rbd_dev_probe_finish(rbd_dev);
	if (ret)
		rbd_header_free(&rbd_dev->header);

	return ret;
}

/*
 * Map a new rbd image described by the text in @buf (presumably the
 * store handler for the bus "add" attribute; see
 * Documentation/ABI/testing/sysfs-bus-rbd).
 *
 * Takes a module reference, parses the arguments, obtains a client,
 * resolves the pool name to an id, creates the rbd_dev and probes it.
 *
 * Returns @count on success or a negative errno on failure.  On
 * failure everything still owned by this function is released and the
 * module reference is dropped.
 */
static ssize_t rbd_add(struct bus_type *bus,
		       const char *buf,
		       size_t count)
{
	struct rbd_device *rbd_dev = NULL;
	struct ceph_options *ceph_opts = NULL;
	struct rbd_options *rbd_opts = NULL;
	struct rbd_spec *spec = NULL;
	struct rbd_client *rbdc;
	struct ceph_osd_client *osdc;
	int rc;

	if (!try_module_get(THIS_MODULE))
		return -ENODEV;

	/* parse add command */
	rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec);
	if (rc < 0)
		goto err_out_module;

	rbdc = rbd_get_client(ceph_opts);
	if (IS_ERR(rbdc)) {
		rc = PTR_ERR(rbdc);
		goto err_out_args;
	}
	ceph_opts = NULL;	/* rbd_dev client now owns this */

	/* pick the pool */
	osdc = &rbdc->client->osdc;
	rc = ceph_pg_poolid_by_name(osdc->osdmap, spec->pool_name);
	if (rc < 0)
		goto err_out_client;
	spec->pool_id = (u64) rc;

	rbd_dev = rbd_dev_create(rbdc, spec);
	if (!rbd_dev) {
		/*
		 * rc still holds the (non-negative) pool id here; it
		 * must be replaced or the caller would be told the
		 * write succeeded.
		 */
		rc = -ENOMEM;
		goto err_out_client;
	}
	rbdc = NULL;		/* rbd_dev now owns this */
	spec = NULL;		/* rbd_dev now owns this */

	rbd_dev->mapping.read_only = rbd_opts->read_only;
	kfree(rbd_opts);
	rbd_opts = NULL;	/* done with this */

	rc = rbd_dev_probe(rbd_dev);
	if (rc < 0)
		goto err_out_rbd_dev;

	return count;

err_out_rbd_dev:
	rbd_dev_destroy(rbd_dev);
err_out_client:
	/*
	 * rbdc and spec are NULL when arriving from err_out_rbd_dev;
	 * presumably the put helpers accept NULL -- verify.
	 */
	rbd_put_client(rbdc);
err_out_args:
	if (ceph_opts)
		ceph_destroy_options(ceph_opts);
	kfree(rbd_opts);
	rbd_spec_put(spec);
err_out_module:
	module_put(THIS_MODULE);

	dout("Error adding device %s\n", buf);

	return (ssize_t) rc;
}

/*
 * Look up an rbd device by id on rbd_dev_list.
 *
 * The list is walked under rbd_dev_list_lock, which is dropped before
 * returning.  Returns the matching device, or NULL if none has
 * @dev_id.
 */
static struct rbd_device *__rbd_get_dev(unsigned long dev_id)
{
	struct rbd_device *rbd_dev;
	struct rbd_device *found = NULL;

	spin_lock(&rbd_dev_list_lock);
	list_for_each_entry(rbd_dev, &rbd_dev_list, node) {
		if (rbd_dev->dev_id == dev_id) {
			found = rbd_dev;
			break;
		}
	}
	spin_unlock(&rbd_dev_list_lock);

	return found;
}

/*
 * Release callback for an rbd device: tears down the header watch,
 * frees the disk and block major, frees the header fields, returns
 * the device id, destroys the rbd_dev and drops a module reference
 * (presumably the one taken in rbd_add() -- verify).
 */
static void rbd_dev_release(struct device *dev)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	if (rbd_dev->watch_request) {
		struct ceph_client *client = rbd_dev->rbd_client->client;

		ceph_osdc_unregister_linger_request(&client->osdc,
						    rbd_dev->watch_request);
	}
	if (rbd_dev->watch_event)
		rbd_req_sync_unwatch(rbd_dev);

	/* clean up and free blkdev */
	rbd_free_disk(rbd_dev);
	unregister_blkdev(rbd_dev->major, rbd_dev->name);

	/* release allocated disk header fields */
	rbd_header_free(&rbd_dev->header);

	/* done with the id, and with the rbd_dev */
	rbd_dev_id_put(rbd_dev);
	rbd_assert(rbd_dev->rbd_client != NULL);
	rbd_dev_destroy(rbd_dev);

	/* release module ref */
	module_put(THIS_MODULE);
}

/*
 * Unmap the rbd device whose decimal id is given in @buf.
 *
 * Returns @count on success, the strict_strtoul() error if @buf is
 * not a valid number, -EINVAL if the id does not fit in an int,
 * -ENOENT if no such device exists, or -EBUSY if the device is open.
 */
static ssize_t rbd_remove(struct bus_type *bus,
			  const char *buf,
			  size_t count)
{
	struct rbd_device *rbd_dev = NULL;
	int target_id, rc;
	unsigned long ul;
	ssize_t ret = count;	/* ssize_t, so count is not narrowed to int */

	rc = strict_strtoul(buf, 10, &ul);
	if (rc)
		return rc;

	/* convert to int; abort if we lost anything in the conversion */
	target_id = (int) ul;
	if (target_id != ul)
		return -EINVAL;

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);

	rbd_dev = __rbd_get_dev(target_id);
	if (!rbd_dev) {
		ret = -ENOENT;
		goto done;
	}

	if (rbd_dev->open_count) {
		ret = -EBUSY;
		goto done;
	}

	rbd_remove_all_snaps(rbd_dev);
	rbd_bus_del_dev(rbd_dev);

done:
	mutex_unlock(&ctl_mutex);

	return ret;
}

/*
 * create control files in sysfs
 * /sys/bus/rbd/...
 *
 * Registers the root device and then the bus type; if bus
 * registration fails the root device is unregistered again.
 * Returns 0 on success or a negative errno.
 */
static int rbd_sysfs_init(void)
{
	int ret;

	ret = device_register(&rbd_root_dev);
	if (ret < 0)
		return ret;

	ret = bus_register(&rbd_bus_type);
	if (ret < 0)
		device_unregister(&rbd_root_dev);

	return ret;
}

/* Undo rbd_sysfs_init(), in reverse order of registration. */
static void rbd_sysfs_cleanup(void)
{
	bus_unregister(&rbd_bus_type);
	device_unregister(&rbd_root_dev);
}

/* Module entry point: set up the sysfs interface and log the load. */
int __init rbd_init(void)
{
	int rc;

	rc = rbd_sysfs_init();
	if (rc)
		return rc;
	pr_info("loaded " RBD_DRV_NAME_LONG "\n");
	return 0;
}

/* Module exit point: tear down the sysfs interface. */
void __exit rbd_exit(void)
{
	rbd_sysfs_cleanup();
}

module_init(rbd_init);
module_exit(rbd_exit);

MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
MODULE_DESCRIPTION("rados block device");

/* following authorship retained from original osdblk.c */
MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");

MODULE_LICENSE("GPL");