1602adf40SYehuda Sadeh /* 2602adf40SYehuda Sadeh rbd.c -- Export ceph rados objects as a Linux block device 3602adf40SYehuda Sadeh 4602adf40SYehuda Sadeh 5602adf40SYehuda Sadeh based on drivers/block/osdblk.c: 6602adf40SYehuda Sadeh 7602adf40SYehuda Sadeh Copyright 2009 Red Hat, Inc. 8602adf40SYehuda Sadeh 9602adf40SYehuda Sadeh This program is free software; you can redistribute it and/or modify 10602adf40SYehuda Sadeh it under the terms of the GNU General Public License as published by 11602adf40SYehuda Sadeh the Free Software Foundation. 12602adf40SYehuda Sadeh 13602adf40SYehuda Sadeh This program is distributed in the hope that it will be useful, 14602adf40SYehuda Sadeh but WITHOUT ANY WARRANTY; without even the implied warranty of 15602adf40SYehuda Sadeh MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16602adf40SYehuda Sadeh GNU General Public License for more details. 17602adf40SYehuda Sadeh 18602adf40SYehuda Sadeh You should have received a copy of the GNU General Public License 19602adf40SYehuda Sadeh along with this program; see the file COPYING. If not, write to 20602adf40SYehuda Sadeh the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. 21602adf40SYehuda Sadeh 22602adf40SYehuda Sadeh 23602adf40SYehuda Sadeh 24dfc5606dSYehuda Sadeh For usage instructions, please refer to: 25602adf40SYehuda Sadeh 26dfc5606dSYehuda Sadeh Documentation/ABI/testing/sysfs-bus-rbd 27602adf40SYehuda Sadeh 28602adf40SYehuda Sadeh */ 29602adf40SYehuda Sadeh 30602adf40SYehuda Sadeh #include <linux/ceph/libceph.h> 31602adf40SYehuda Sadeh #include <linux/ceph/osd_client.h> 32602adf40SYehuda Sadeh #include <linux/ceph/mon_client.h> 33602adf40SYehuda Sadeh #include <linux/ceph/decode.h> 3459c2be1eSYehuda Sadeh #include <linux/parser.h> 35602adf40SYehuda Sadeh 36602adf40SYehuda Sadeh #include <linux/kernel.h> 37602adf40SYehuda Sadeh #include <linux/device.h> 38602adf40SYehuda Sadeh #include <linux/module.h> 39602adf40SYehuda Sadeh #include <linux/fs.h> 40602adf40SYehuda Sadeh #include <linux/blkdev.h> 41602adf40SYehuda Sadeh 42602adf40SYehuda Sadeh #include "rbd_types.h" 43602adf40SYehuda Sadeh 44aafb230eSAlex Elder #define RBD_DEBUG /* Activate rbd_assert() calls */ 45aafb230eSAlex Elder 46593a9e7bSAlex Elder /* 47593a9e7bSAlex Elder * The basic unit of block I/O is a sector. It is interpreted in a 48593a9e7bSAlex Elder * number of contexts in Linux (blk, bio, genhd), but the default is 49593a9e7bSAlex Elder * universally 512 bytes. These symbols are just slightly more 50593a9e7bSAlex Elder * meaningful than the bare numbers they represent. 51593a9e7bSAlex Elder */ 52593a9e7bSAlex Elder #define SECTOR_SHIFT 9 53593a9e7bSAlex Elder #define SECTOR_SIZE (1ULL << SECTOR_SHIFT) 54593a9e7bSAlex Elder 55df111be6SAlex Elder /* It might be useful to have this defined elsewhere too */ 56df111be6SAlex Elder 570ec8ce87SAlex Elder #define U32_MAX ((u32) (~0U)) 58df111be6SAlex Elder #define U64_MAX ((u64) (~0ULL)) 59df111be6SAlex Elder 60f0f8cef5SAlex Elder #define RBD_DRV_NAME "rbd" 61f0f8cef5SAlex Elder #define RBD_DRV_NAME_LONG "rbd (rados block device)" 62602adf40SYehuda Sadeh 63602adf40SYehuda Sadeh #define RBD_MINORS_PER_MAJOR 256 /* max minors per blkdev */ 64602adf40SYehuda Sadeh 65d4b125e9SAlex Elder #define RBD_SNAP_DEV_NAME_PREFIX "snap_" 66d4b125e9SAlex Elder #define RBD_MAX_SNAP_NAME_LEN \ 67d4b125e9SAlex Elder (NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1)) 68d4b125e9SAlex Elder 6935d489f9SAlex Elder #define RBD_MAX_SNAP_COUNT 510 /* allows max snapc to fit in 4KB */ 70602adf40SYehuda Sadeh #define RBD_MAX_OPT_LEN 1024 71602adf40SYehuda Sadeh 72602adf40SYehuda Sadeh #define RBD_SNAP_HEAD_NAME "-" 73602adf40SYehuda Sadeh 749e15b77dSAlex Elder /* This allows a single page to hold an image name sent by OSD */ 759e15b77dSAlex Elder #define RBD_IMAGE_NAME_LEN_MAX (PAGE_SIZE - sizeof (__le32) - 1) 76589d30e0SAlex Elder #define RBD_IMAGE_ID_LEN_MAX 64 779e15b77dSAlex Elder 781e130199SAlex Elder #define RBD_OBJ_PREFIX_LEN_MAX 64 79589d30e0SAlex Elder 80d889140cSAlex Elder /* Feature bits */ 81d889140cSAlex Elder 82d889140cSAlex Elder #define RBD_FEATURE_LAYERING 1 83d889140cSAlex Elder 84d889140cSAlex Elder /* Features supported by this (client software) implementation. */ 85d889140cSAlex Elder 86d889140cSAlex Elder #define RBD_FEATURES_ALL (0) 87d889140cSAlex Elder 8881a89793SAlex Elder /* 8981a89793SAlex Elder * An RBD device name will be "rbd#", where the "rbd" comes from 9081a89793SAlex Elder * RBD_DRV_NAME above, and # is a unique integer identifier. 9181a89793SAlex Elder * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big 9281a89793SAlex Elder * enough to hold all possible device names. 9381a89793SAlex Elder */ 94602adf40SYehuda Sadeh #define DEV_NAME_LEN 32 9581a89793SAlex Elder #define MAX_INT_FORMAT_WIDTH ((5 * sizeof (int)) / 2 + 1) 96602adf40SYehuda Sadeh 97cc0538b6SAlex Elder #define RBD_READ_ONLY_DEFAULT false 9859c2be1eSYehuda Sadeh 99602adf40SYehuda Sadeh /* 100602adf40SYehuda Sadeh * block device image metadata (in-memory version) 101602adf40SYehuda Sadeh */ 102602adf40SYehuda Sadeh struct rbd_image_header { 103f84344f3SAlex Elder /* These four fields never change for a given rbd image */ 104849b4260SAlex Elder char *object_prefix; 10534b13184SAlex Elder u64 features; 106602adf40SYehuda Sadeh __u8 obj_order; 107602adf40SYehuda Sadeh __u8 crypt_type; 108602adf40SYehuda Sadeh __u8 comp_type; 109602adf40SYehuda Sadeh 110f84344f3SAlex Elder /* The remaining fields need to be updated occasionally */ 111f84344f3SAlex Elder u64 image_size; 112f84344f3SAlex Elder struct ceph_snap_context *snapc; 113602adf40SYehuda Sadeh char *snap_names; 114602adf40SYehuda Sadeh u64 *snap_sizes; 11559c2be1eSYehuda Sadeh 11659c2be1eSYehuda Sadeh u64 obj_version; 11759c2be1eSYehuda Sadeh }; 11859c2be1eSYehuda Sadeh 1190d7dbfceSAlex Elder /* 1200d7dbfceSAlex Elder * An rbd image specification. 1210d7dbfceSAlex Elder * 1220d7dbfceSAlex Elder * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely 123c66c6e0cSAlex Elder * identify an image. Each rbd_dev structure includes a pointer to 124c66c6e0cSAlex Elder * an rbd_spec structure that encapsulates this identity. 125c66c6e0cSAlex Elder * 126c66c6e0cSAlex Elder * Each of the id's in an rbd_spec has an associated name. For a 127c66c6e0cSAlex Elder * user-mapped image, the names are supplied and the id's associated 128c66c6e0cSAlex Elder * with them are looked up. For a layered image, a parent image is 129c66c6e0cSAlex Elder * defined by the tuple, and the names are looked up. 130c66c6e0cSAlex Elder * 131c66c6e0cSAlex Elder * An rbd_dev structure contains a parent_spec pointer which is 132c66c6e0cSAlex Elder * non-null if the image it represents is a child in a layered 133c66c6e0cSAlex Elder * image. This pointer will refer to the rbd_spec structure used 134c66c6e0cSAlex Elder * by the parent rbd_dev for its own identity (i.e., the structure 135c66c6e0cSAlex Elder * is shared between the parent and child). 136c66c6e0cSAlex Elder * 137c66c6e0cSAlex Elder * Since these structures are populated once, during the discovery 138c66c6e0cSAlex Elder * phase of image construction, they are effectively immutable so 139c66c6e0cSAlex Elder * we make no effort to synchronize access to them. 140c66c6e0cSAlex Elder * 141c66c6e0cSAlex Elder * Note that code herein does not assume the image name is known (it 142c66c6e0cSAlex Elder * could be a null pointer). 1430d7dbfceSAlex Elder */ 1440d7dbfceSAlex Elder struct rbd_spec { 1450d7dbfceSAlex Elder u64 pool_id; 1460d7dbfceSAlex Elder char *pool_name; 1470d7dbfceSAlex Elder 1480d7dbfceSAlex Elder char *image_id; 1490d7dbfceSAlex Elder char *image_name; 1500d7dbfceSAlex Elder 1510d7dbfceSAlex Elder u64 snap_id; 1520d7dbfceSAlex Elder char *snap_name; 1530d7dbfceSAlex Elder 1540d7dbfceSAlex Elder struct kref kref; 1550d7dbfceSAlex Elder }; 1560d7dbfceSAlex Elder 15759c2be1eSYehuda Sadeh struct rbd_options { 158cc0538b6SAlex Elder bool read_only; 159602adf40SYehuda Sadeh }; 160602adf40SYehuda Sadeh 161602adf40SYehuda Sadeh /* 162f0f8cef5SAlex Elder * an instance of the client. multiple devices may share an rbd client. 163602adf40SYehuda Sadeh */ 164602adf40SYehuda Sadeh struct rbd_client { 165602adf40SYehuda Sadeh struct ceph_client *client; 166602adf40SYehuda Sadeh struct kref kref; 167602adf40SYehuda Sadeh struct list_head node; 168602adf40SYehuda Sadeh }; 169602adf40SYehuda Sadeh 170602adf40SYehuda Sadeh /* 171f0f8cef5SAlex Elder * a request completion status 172602adf40SYehuda Sadeh */ 1731fec7093SYehuda Sadeh struct rbd_req_status { 1741fec7093SYehuda Sadeh int done; 1758986cb37SAlex Elder s32 rc; 1761fec7093SYehuda Sadeh u64 bytes; 1771fec7093SYehuda Sadeh }; 1781fec7093SYehuda Sadeh 1791fec7093SYehuda Sadeh /* 1801fec7093SYehuda Sadeh * a collection of requests 1811fec7093SYehuda Sadeh */ 1821fec7093SYehuda Sadeh struct rbd_req_coll { 1831fec7093SYehuda Sadeh int total; 1841fec7093SYehuda Sadeh int num_done; 1851fec7093SYehuda Sadeh struct kref kref; 1861fec7093SYehuda Sadeh struct rbd_req_status status[0]; 187602adf40SYehuda Sadeh }; 188602adf40SYehuda Sadeh 189f0f8cef5SAlex Elder /* 190f0f8cef5SAlex Elder * a single io request 191f0f8cef5SAlex Elder */ 192f0f8cef5SAlex Elder struct rbd_request { 193f0f8cef5SAlex Elder struct request *rq; /* blk layer request */ 194f0f8cef5SAlex Elder struct bio *bio; /* cloned bio */ 195f0f8cef5SAlex Elder struct page **pages; /* list of used pages */ 196f0f8cef5SAlex Elder u64 len; 197f0f8cef5SAlex Elder int coll_index; 198f0f8cef5SAlex Elder struct rbd_req_coll *coll; 199f0f8cef5SAlex Elder }; 200f0f8cef5SAlex Elder 201dfc5606dSYehuda Sadeh struct rbd_snap { 202dfc5606dSYehuda Sadeh struct device dev; 203dfc5606dSYehuda Sadeh const char *name; 2043591538fSJosh Durgin u64 size; 205dfc5606dSYehuda Sadeh struct list_head node; 206dfc5606dSYehuda Sadeh u64 id; 20734b13184SAlex Elder u64 features; 208dfc5606dSYehuda Sadeh }; 209dfc5606dSYehuda Sadeh 210f84344f3SAlex Elder struct rbd_mapping { 21199c1f08fSAlex Elder u64 size; 21234b13184SAlex Elder u64 features; 213f84344f3SAlex Elder bool read_only; 214f84344f3SAlex Elder }; 215f84344f3SAlex Elder 216602adf40SYehuda Sadeh /* 217602adf40SYehuda Sadeh * a single device 218602adf40SYehuda Sadeh */ 219602adf40SYehuda Sadeh struct rbd_device { 220de71a297SAlex Elder int dev_id; /* blkdev unique id */ 221602adf40SYehuda Sadeh 222602adf40SYehuda Sadeh int major; /* blkdev assigned major */ 223602adf40SYehuda Sadeh struct gendisk *disk; /* blkdev's gendisk and rq */ 224602adf40SYehuda Sadeh 225a30b71b9SAlex Elder u32 image_format; /* Either 1 or 2 */ 226602adf40SYehuda Sadeh struct rbd_client *rbd_client; 227602adf40SYehuda Sadeh 228602adf40SYehuda Sadeh char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */ 229602adf40SYehuda Sadeh 230602adf40SYehuda Sadeh spinlock_t lock; /* queue lock */ 231602adf40SYehuda Sadeh 232602adf40SYehuda Sadeh struct rbd_image_header header; 233d78b650aSAlex Elder atomic_t exists; 2340d7dbfceSAlex Elder struct rbd_spec *spec; 235602adf40SYehuda Sadeh 2360d7dbfceSAlex Elder char *header_name; 237971f839aSAlex Elder 23859c2be1eSYehuda Sadeh struct ceph_osd_event *watch_event; 23959c2be1eSYehuda Sadeh struct ceph_osd_request *watch_request; 24059c2be1eSYehuda Sadeh 24186b00e0dSAlex Elder struct rbd_spec *parent_spec; 24286b00e0dSAlex Elder u64 parent_overlap; 24386b00e0dSAlex Elder 244c666601aSJosh Durgin /* protects updating the header */ 245c666601aSJosh Durgin struct rw_semaphore header_rwsem; 246f84344f3SAlex Elder 247f84344f3SAlex Elder struct rbd_mapping mapping; 248602adf40SYehuda Sadeh 249602adf40SYehuda Sadeh struct list_head node; 250dfc5606dSYehuda Sadeh 251dfc5606dSYehuda Sadeh /* list of snapshots */ 252dfc5606dSYehuda Sadeh struct list_head snaps; 253dfc5606dSYehuda Sadeh 254dfc5606dSYehuda Sadeh /* sysfs related */ 255dfc5606dSYehuda Sadeh struct device dev; 25642382b70SAlex Elder unsigned long open_count; 257dfc5606dSYehuda Sadeh }; 258dfc5606dSYehuda Sadeh 259602adf40SYehuda Sadeh static DEFINE_MUTEX(ctl_mutex); /* Serialize open/close/setup/teardown */ 260e124a82fSAlex Elder 261602adf40SYehuda Sadeh static LIST_HEAD(rbd_dev_list); /* devices */ 262e124a82fSAlex Elder static DEFINE_SPINLOCK(rbd_dev_list_lock); 263e124a82fSAlex Elder 264602adf40SYehuda Sadeh static LIST_HEAD(rbd_client_list); /* clients */ 265432b8587SAlex Elder static DEFINE_SPINLOCK(rbd_client_list_lock); 266602adf40SYehuda Sadeh 267304f6808SAlex Elder static int rbd_dev_snaps_update(struct rbd_device *rbd_dev); 268304f6808SAlex Elder static int rbd_dev_snaps_register(struct rbd_device *rbd_dev); 269304f6808SAlex Elder 270dfc5606dSYehuda Sadeh static void rbd_dev_release(struct device *dev); 27141f38c2bSAlex Elder static void rbd_remove_snap_dev(struct rbd_snap *snap); 272dfc5606dSYehuda Sadeh 273f0f8cef5SAlex Elder static ssize_t rbd_add(struct bus_type *bus, const char *buf, 274f0f8cef5SAlex Elder size_t count); 275f0f8cef5SAlex Elder static ssize_t rbd_remove(struct bus_type *bus, const char *buf, 276f0f8cef5SAlex Elder size_t count); 277f0f8cef5SAlex Elder 278f0f8cef5SAlex Elder static struct bus_attribute rbd_bus_attrs[] = { 279f0f8cef5SAlex Elder __ATTR(add, S_IWUSR, NULL, rbd_add), 280f0f8cef5SAlex Elder __ATTR(remove, S_IWUSR, NULL, rbd_remove), 281f0f8cef5SAlex Elder __ATTR_NULL 282f0f8cef5SAlex Elder }; 283f0f8cef5SAlex Elder 284f0f8cef5SAlex Elder static struct bus_type rbd_bus_type = { 285f0f8cef5SAlex Elder .name = "rbd", 286f0f8cef5SAlex Elder .bus_attrs = rbd_bus_attrs, 287f0f8cef5SAlex Elder }; 288f0f8cef5SAlex Elder 289f0f8cef5SAlex Elder static void rbd_root_dev_release(struct device *dev) 290f0f8cef5SAlex Elder { 291f0f8cef5SAlex Elder } 292f0f8cef5SAlex Elder 293f0f8cef5SAlex Elder static struct device rbd_root_dev = { 294f0f8cef5SAlex Elder .init_name = "rbd", 295f0f8cef5SAlex Elder .release = rbd_root_dev_release, 296f0f8cef5SAlex Elder }; 297f0f8cef5SAlex Elder 29806ecc6cbSAlex Elder static __printf(2, 3) 29906ecc6cbSAlex Elder void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...) 30006ecc6cbSAlex Elder { 30106ecc6cbSAlex Elder struct va_format vaf; 30206ecc6cbSAlex Elder va_list args; 30306ecc6cbSAlex Elder 30406ecc6cbSAlex Elder va_start(args, fmt); 30506ecc6cbSAlex Elder vaf.fmt = fmt; 30606ecc6cbSAlex Elder vaf.va = &args; 30706ecc6cbSAlex Elder 30806ecc6cbSAlex Elder if (!rbd_dev) 30906ecc6cbSAlex Elder printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf); 31006ecc6cbSAlex Elder else if (rbd_dev->disk) 31106ecc6cbSAlex Elder printk(KERN_WARNING "%s: %s: %pV\n", 31206ecc6cbSAlex Elder RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf); 31306ecc6cbSAlex Elder else if (rbd_dev->spec && rbd_dev->spec->image_name) 31406ecc6cbSAlex Elder printk(KERN_WARNING "%s: image %s: %pV\n", 31506ecc6cbSAlex Elder RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf); 31606ecc6cbSAlex Elder else if (rbd_dev->spec && rbd_dev->spec->image_id) 31706ecc6cbSAlex Elder printk(KERN_WARNING "%s: id %s: %pV\n", 31806ecc6cbSAlex Elder RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf); 31906ecc6cbSAlex Elder else /* punt */ 32006ecc6cbSAlex Elder printk(KERN_WARNING "%s: rbd_dev %p: %pV\n", 32106ecc6cbSAlex Elder RBD_DRV_NAME, rbd_dev, &vaf); 32206ecc6cbSAlex Elder va_end(args); 32306ecc6cbSAlex Elder } 32406ecc6cbSAlex Elder 325aafb230eSAlex Elder #ifdef RBD_DEBUG 326aafb230eSAlex Elder #define rbd_assert(expr) \ 327aafb230eSAlex Elder if (unlikely(!(expr))) { \ 328aafb230eSAlex Elder printk(KERN_ERR "\nAssertion failure in %s() " \ 329aafb230eSAlex Elder "at line %d:\n\n" \ 330aafb230eSAlex Elder "\trbd_assert(%s);\n\n", \ 331aafb230eSAlex Elder __func__, __LINE__, #expr); \ 332aafb230eSAlex Elder BUG(); \ 333aafb230eSAlex Elder } 334aafb230eSAlex Elder #else /* !RBD_DEBUG */ 335aafb230eSAlex Elder # define rbd_assert(expr) ((void) 0) 336aafb230eSAlex Elder #endif /* !RBD_DEBUG */ 337dfc5606dSYehuda Sadeh 338117973fbSAlex Elder static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver); 339117973fbSAlex Elder static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver); 34059c2be1eSYehuda Sadeh 341602adf40SYehuda Sadeh static int rbd_open(struct block_device *bdev, fmode_t mode) 342602adf40SYehuda Sadeh { 343f0f8cef5SAlex Elder struct rbd_device *rbd_dev = bdev->bd_disk->private_data; 344602adf40SYehuda Sadeh 345f84344f3SAlex Elder if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only) 346602adf40SYehuda Sadeh return -EROFS; 347602adf40SYehuda Sadeh 34842382b70SAlex Elder mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); 349c3e946ceSAlex Elder (void) get_device(&rbd_dev->dev); 350f84344f3SAlex Elder set_device_ro(bdev, rbd_dev->mapping.read_only); 35142382b70SAlex Elder rbd_dev->open_count++; 35242382b70SAlex Elder mutex_unlock(&ctl_mutex); 353340c7a2bSAlex Elder 354602adf40SYehuda Sadeh return 0; 355602adf40SYehuda Sadeh } 356602adf40SYehuda Sadeh 357dfc5606dSYehuda Sadeh static int rbd_release(struct gendisk *disk, fmode_t mode) 358dfc5606dSYehuda Sadeh { 359dfc5606dSYehuda Sadeh struct rbd_device *rbd_dev = disk->private_data; 360dfc5606dSYehuda Sadeh 36142382b70SAlex Elder mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); 36242382b70SAlex Elder rbd_assert(rbd_dev->open_count > 0); 36342382b70SAlex Elder rbd_dev->open_count--; 364c3e946ceSAlex Elder put_device(&rbd_dev->dev); 36542382b70SAlex Elder mutex_unlock(&ctl_mutex); 366dfc5606dSYehuda Sadeh 367dfc5606dSYehuda Sadeh return 0; 368dfc5606dSYehuda Sadeh } 369dfc5606dSYehuda Sadeh 370602adf40SYehuda Sadeh static const struct block_device_operations rbd_bd_ops = { 371602adf40SYehuda Sadeh .owner = THIS_MODULE, 372602adf40SYehuda Sadeh .open = rbd_open, 373dfc5606dSYehuda Sadeh .release = rbd_release, 374602adf40SYehuda Sadeh }; 375602adf40SYehuda Sadeh 376602adf40SYehuda Sadeh /* 377602adf40SYehuda Sadeh * Initialize an rbd client instance. 37843ae4701SAlex Elder * We own *ceph_opts. 379602adf40SYehuda Sadeh */ 380f8c38929SAlex Elder static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts) 381602adf40SYehuda Sadeh { 382602adf40SYehuda Sadeh struct rbd_client *rbdc; 383602adf40SYehuda Sadeh int ret = -ENOMEM; 384602adf40SYehuda Sadeh 385602adf40SYehuda Sadeh dout("rbd_client_create\n"); 386602adf40SYehuda Sadeh rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL); 387602adf40SYehuda Sadeh if (!rbdc) 388602adf40SYehuda Sadeh goto out_opt; 389602adf40SYehuda Sadeh 390602adf40SYehuda Sadeh kref_init(&rbdc->kref); 391602adf40SYehuda Sadeh INIT_LIST_HEAD(&rbdc->node); 392602adf40SYehuda Sadeh 393bc534d86SAlex Elder mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); 394bc534d86SAlex Elder 39543ae4701SAlex Elder rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0); 396602adf40SYehuda Sadeh if (IS_ERR(rbdc->client)) 397bc534d86SAlex Elder goto out_mutex; 39843ae4701SAlex Elder ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */ 399602adf40SYehuda Sadeh 400602adf40SYehuda Sadeh ret = ceph_open_session(rbdc->client); 401602adf40SYehuda Sadeh if (ret < 0) 402602adf40SYehuda Sadeh goto out_err; 403602adf40SYehuda Sadeh 404432b8587SAlex Elder spin_lock(&rbd_client_list_lock); 405602adf40SYehuda Sadeh list_add_tail(&rbdc->node, &rbd_client_list); 406432b8587SAlex Elder spin_unlock(&rbd_client_list_lock); 407602adf40SYehuda Sadeh 408bc534d86SAlex Elder mutex_unlock(&ctl_mutex); 409bc534d86SAlex Elder 410602adf40SYehuda Sadeh dout("rbd_client_create created %p\n", rbdc); 411602adf40SYehuda Sadeh return rbdc; 412602adf40SYehuda Sadeh 413602adf40SYehuda Sadeh out_err: 414602adf40SYehuda Sadeh ceph_destroy_client(rbdc->client); 415bc534d86SAlex Elder out_mutex: 416bc534d86SAlex Elder mutex_unlock(&ctl_mutex); 417602adf40SYehuda Sadeh kfree(rbdc); 418602adf40SYehuda Sadeh out_opt: 41943ae4701SAlex Elder if (ceph_opts) 42043ae4701SAlex Elder ceph_destroy_options(ceph_opts); 42128f259b7SVasiliy Kulikov return ERR_PTR(ret); 422602adf40SYehuda Sadeh } 423602adf40SYehuda Sadeh 424602adf40SYehuda Sadeh /* 4251f7ba331SAlex Elder * Find a ceph client with specific addr and configuration. If 4261f7ba331SAlex Elder * found, bump its reference count. 427602adf40SYehuda Sadeh */ 4281f7ba331SAlex Elder static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts) 429602adf40SYehuda Sadeh { 430602adf40SYehuda Sadeh struct rbd_client *client_node; 4311f7ba331SAlex Elder bool found = false; 432602adf40SYehuda Sadeh 43343ae4701SAlex Elder if (ceph_opts->flags & CEPH_OPT_NOSHARE) 434602adf40SYehuda Sadeh return NULL; 435602adf40SYehuda Sadeh 4361f7ba331SAlex Elder spin_lock(&rbd_client_list_lock); 4371f7ba331SAlex Elder list_for_each_entry(client_node, &rbd_client_list, node) { 4381f7ba331SAlex Elder if (!ceph_compare_options(ceph_opts, client_node->client)) { 4391f7ba331SAlex Elder kref_get(&client_node->kref); 4401f7ba331SAlex Elder found = true; 4411f7ba331SAlex Elder break; 4421f7ba331SAlex Elder } 4431f7ba331SAlex Elder } 4441f7ba331SAlex Elder spin_unlock(&rbd_client_list_lock); 4451f7ba331SAlex Elder 4461f7ba331SAlex Elder return found ? client_node : NULL; 447602adf40SYehuda Sadeh } 448602adf40SYehuda Sadeh 449602adf40SYehuda Sadeh /* 45059c2be1eSYehuda Sadeh * mount options 45159c2be1eSYehuda Sadeh */ 45259c2be1eSYehuda Sadeh enum { 45359c2be1eSYehuda Sadeh Opt_last_int, 45459c2be1eSYehuda Sadeh /* int args above */ 45559c2be1eSYehuda Sadeh Opt_last_string, 45659c2be1eSYehuda Sadeh /* string args above */ 457cc0538b6SAlex Elder Opt_read_only, 458cc0538b6SAlex Elder Opt_read_write, 459cc0538b6SAlex Elder /* Boolean args above */ 460cc0538b6SAlex Elder Opt_last_bool, 46159c2be1eSYehuda Sadeh }; 46259c2be1eSYehuda Sadeh 46343ae4701SAlex Elder static match_table_t rbd_opts_tokens = { 46459c2be1eSYehuda Sadeh /* int args above */ 46559c2be1eSYehuda Sadeh /* string args above */ 466be466c1cSAlex Elder {Opt_read_only, "read_only"}, 467cc0538b6SAlex Elder {Opt_read_only, "ro"}, /* Alternate spelling */ 468cc0538b6SAlex Elder {Opt_read_write, "read_write"}, 469cc0538b6SAlex Elder {Opt_read_write, "rw"}, /* Alternate spelling */ 470cc0538b6SAlex Elder /* Boolean args above */ 47159c2be1eSYehuda Sadeh {-1, NULL} 47259c2be1eSYehuda Sadeh }; 47359c2be1eSYehuda Sadeh 47459c2be1eSYehuda Sadeh static int parse_rbd_opts_token(char *c, void *private) 47559c2be1eSYehuda Sadeh { 47643ae4701SAlex Elder struct rbd_options *rbd_opts = private; 47759c2be1eSYehuda Sadeh substring_t argstr[MAX_OPT_ARGS]; 47859c2be1eSYehuda Sadeh int token, intval, ret; 47959c2be1eSYehuda Sadeh 48043ae4701SAlex Elder token = match_token(c, rbd_opts_tokens, argstr); 48159c2be1eSYehuda Sadeh if (token < 0) 48259c2be1eSYehuda Sadeh return -EINVAL; 48359c2be1eSYehuda Sadeh 48459c2be1eSYehuda Sadeh if (token < Opt_last_int) { 48559c2be1eSYehuda Sadeh ret = match_int(&argstr[0], &intval); 48659c2be1eSYehuda Sadeh if (ret < 0) { 48759c2be1eSYehuda Sadeh pr_err("bad mount option arg (not int) " 48859c2be1eSYehuda Sadeh "at '%s'\n", c); 48959c2be1eSYehuda Sadeh return ret; 49059c2be1eSYehuda Sadeh } 49159c2be1eSYehuda Sadeh dout("got int token %d val %d\n", token, intval); 49259c2be1eSYehuda Sadeh } else if (token > Opt_last_int && token < Opt_last_string) { 49359c2be1eSYehuda Sadeh dout("got string token %d val %s\n", token, 49459c2be1eSYehuda Sadeh argstr[0].from); 495cc0538b6SAlex Elder } else if (token > Opt_last_string && token < Opt_last_bool) { 496cc0538b6SAlex Elder dout("got Boolean token %d\n", token); 49759c2be1eSYehuda Sadeh } else { 49859c2be1eSYehuda Sadeh dout("got token %d\n", token); 49959c2be1eSYehuda Sadeh } 50059c2be1eSYehuda Sadeh 50159c2be1eSYehuda Sadeh switch (token) { 502cc0538b6SAlex Elder case Opt_read_only: 503cc0538b6SAlex Elder rbd_opts->read_only = true; 504cc0538b6SAlex Elder break; 505cc0538b6SAlex Elder case Opt_read_write: 506cc0538b6SAlex Elder rbd_opts->read_only = false; 507cc0538b6SAlex Elder break; 50859c2be1eSYehuda Sadeh default: 509aafb230eSAlex Elder rbd_assert(false); 510aafb230eSAlex Elder break; 51159c2be1eSYehuda Sadeh } 51259c2be1eSYehuda Sadeh return 0; 51359c2be1eSYehuda Sadeh } 51459c2be1eSYehuda Sadeh 51559c2be1eSYehuda Sadeh /* 516602adf40SYehuda Sadeh * Get a ceph client with specific addr and configuration, if one does 517602adf40SYehuda Sadeh * not exist create it. 518602adf40SYehuda Sadeh */ 5199d3997fdSAlex Elder static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts) 520602adf40SYehuda Sadeh { 521f8c38929SAlex Elder struct rbd_client *rbdc; 52259c2be1eSYehuda Sadeh 5231f7ba331SAlex Elder rbdc = rbd_client_find(ceph_opts); 5249d3997fdSAlex Elder if (rbdc) /* using an existing client */ 52543ae4701SAlex Elder ceph_destroy_options(ceph_opts); 5269d3997fdSAlex Elder else 527f8c38929SAlex Elder rbdc = rbd_client_create(ceph_opts); 528d720bcb0SAlex Elder 5299d3997fdSAlex Elder return rbdc; 530602adf40SYehuda Sadeh } 531602adf40SYehuda Sadeh 532602adf40SYehuda Sadeh /* 533602adf40SYehuda Sadeh * Destroy ceph client 534d23a4b3fSAlex Elder * 535432b8587SAlex Elder * Caller must hold rbd_client_list_lock. 536602adf40SYehuda Sadeh */ 537602adf40SYehuda Sadeh static void rbd_client_release(struct kref *kref) 538602adf40SYehuda Sadeh { 539602adf40SYehuda Sadeh struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref); 540602adf40SYehuda Sadeh 541602adf40SYehuda Sadeh dout("rbd_release_client %p\n", rbdc); 542cd9d9f5dSAlex Elder spin_lock(&rbd_client_list_lock); 543602adf40SYehuda Sadeh list_del(&rbdc->node); 544cd9d9f5dSAlex Elder spin_unlock(&rbd_client_list_lock); 545602adf40SYehuda Sadeh 546602adf40SYehuda Sadeh ceph_destroy_client(rbdc->client); 547602adf40SYehuda Sadeh kfree(rbdc); 548602adf40SYehuda Sadeh } 549602adf40SYehuda Sadeh 550602adf40SYehuda Sadeh /* 551602adf40SYehuda Sadeh * Drop reference to ceph client node. If it's not referenced anymore, release 552602adf40SYehuda Sadeh * it. 553602adf40SYehuda Sadeh */ 5549d3997fdSAlex Elder static void rbd_put_client(struct rbd_client *rbdc) 555602adf40SYehuda Sadeh { 556c53d5893SAlex Elder if (rbdc) 5579d3997fdSAlex Elder kref_put(&rbdc->kref, rbd_client_release); 558602adf40SYehuda Sadeh } 559602adf40SYehuda Sadeh 5601fec7093SYehuda Sadeh /* 5611fec7093SYehuda Sadeh * Destroy requests collection 5621fec7093SYehuda Sadeh */ 5631fec7093SYehuda Sadeh static void rbd_coll_release(struct kref *kref) 5641fec7093SYehuda Sadeh { 5651fec7093SYehuda Sadeh struct rbd_req_coll *coll = 5661fec7093SYehuda Sadeh container_of(kref, struct rbd_req_coll, kref); 5671fec7093SYehuda Sadeh 5681fec7093SYehuda Sadeh dout("rbd_coll_release %p\n", coll); 5691fec7093SYehuda Sadeh kfree(coll); 5701fec7093SYehuda Sadeh } 571602adf40SYehuda Sadeh 572a30b71b9SAlex Elder static bool rbd_image_format_valid(u32 image_format) 573a30b71b9SAlex Elder { 574a30b71b9SAlex Elder return image_format == 1 || image_format == 2; 575a30b71b9SAlex Elder } 576a30b71b9SAlex Elder 5778e94af8eSAlex Elder static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk) 5788e94af8eSAlex Elder { 579103a150fSAlex Elder size_t size; 580103a150fSAlex Elder u32 snap_count; 581103a150fSAlex Elder 582103a150fSAlex Elder /* The header has to start with the magic rbd header text */ 583103a150fSAlex Elder if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT))) 584103a150fSAlex Elder return false; 585103a150fSAlex Elder 586db2388b6SAlex Elder /* The bio layer requires at least sector-sized I/O */ 587db2388b6SAlex Elder 588db2388b6SAlex Elder if (ondisk->options.order < SECTOR_SHIFT) 589db2388b6SAlex Elder return false; 590db2388b6SAlex Elder 591db2388b6SAlex Elder /* If we use u64 in a few spots we may be able to loosen this */ 592db2388b6SAlex Elder 593db2388b6SAlex Elder if (ondisk->options.order > 8 * sizeof (int) - 1) 594db2388b6SAlex Elder return false; 595db2388b6SAlex Elder 596103a150fSAlex Elder /* 597103a150fSAlex Elder * The size of a snapshot header has to fit in a size_t, and 598103a150fSAlex Elder * that limits the number of snapshots. 599103a150fSAlex Elder */ 600103a150fSAlex Elder snap_count = le32_to_cpu(ondisk->snap_count); 601103a150fSAlex Elder size = SIZE_MAX - sizeof (struct ceph_snap_context); 602103a150fSAlex Elder if (snap_count > size / sizeof (__le64)) 603103a150fSAlex Elder return false; 604103a150fSAlex Elder 605103a150fSAlex Elder /* 606103a150fSAlex Elder * Not only that, but the size of the entire the snapshot 607103a150fSAlex Elder * header must also be representable in a size_t. 608103a150fSAlex Elder */ 609103a150fSAlex Elder size -= snap_count * sizeof (__le64); 610103a150fSAlex Elder if ((u64) size < le64_to_cpu(ondisk->snap_names_len)) 611103a150fSAlex Elder return false; 612103a150fSAlex Elder 613103a150fSAlex Elder return true; 6148e94af8eSAlex Elder } 6158e94af8eSAlex Elder 616602adf40SYehuda Sadeh /* 617602adf40SYehuda Sadeh * Create a new header structure, translate header format from the on-disk 618602adf40SYehuda Sadeh * header. 619602adf40SYehuda Sadeh */ 620602adf40SYehuda Sadeh static int rbd_header_from_disk(struct rbd_image_header *header, 6214156d998SAlex Elder struct rbd_image_header_ondisk *ondisk) 622602adf40SYehuda Sadeh { 623ccece235SAlex Elder u32 snap_count; 62458c17b0eSAlex Elder size_t len; 625d2bb24e5SAlex Elder size_t size; 626621901d6SAlex Elder u32 i; 627602adf40SYehuda Sadeh 6286a52325fSAlex Elder memset(header, 0, sizeof (*header)); 6296a52325fSAlex Elder 630103a150fSAlex Elder snap_count = le32_to_cpu(ondisk->snap_count); 631103a150fSAlex Elder 63258c17b0eSAlex Elder len = strnlen(ondisk->object_prefix, sizeof (ondisk->object_prefix)); 63358c17b0eSAlex Elder header->object_prefix = kmalloc(len + 1, GFP_KERNEL); 6346a52325fSAlex Elder if (!header->object_prefix) 635602adf40SYehuda Sadeh return -ENOMEM; 63658c17b0eSAlex Elder memcpy(header->object_prefix, ondisk->object_prefix, len); 63758c17b0eSAlex Elder header->object_prefix[len] = '\0'; 63800f1f36fSAlex Elder 639602adf40SYehuda Sadeh if (snap_count) { 640f785cc1dSAlex Elder u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len); 641f785cc1dSAlex Elder 642621901d6SAlex Elder /* Save a copy of the snapshot names */ 643621901d6SAlex Elder 644f785cc1dSAlex Elder if (snap_names_len > (u64) SIZE_MAX) 645f785cc1dSAlex Elder return -EIO; 646f785cc1dSAlex Elder header->snap_names = kmalloc(snap_names_len, GFP_KERNEL); 647602adf40SYehuda Sadeh if (!header->snap_names) 6486a52325fSAlex Elder goto out_err; 649f785cc1dSAlex Elder /* 650f785cc1dSAlex Elder * Note that rbd_dev_v1_header_read() guarantees 651f785cc1dSAlex Elder * the ondisk buffer we're working with has 652f785cc1dSAlex Elder * snap_names_len bytes beyond the end of the 653f785cc1dSAlex Elder * snapshot id array, this memcpy() is safe. 654f785cc1dSAlex Elder */ 655f785cc1dSAlex Elder memcpy(header->snap_names, &ondisk->snaps[snap_count], 656f785cc1dSAlex Elder snap_names_len); 6576a52325fSAlex Elder 658621901d6SAlex Elder /* Record each snapshot's size */ 659621901d6SAlex Elder 660d2bb24e5SAlex Elder size = snap_count * sizeof (*header->snap_sizes); 661d2bb24e5SAlex Elder header->snap_sizes = kmalloc(size, GFP_KERNEL); 662602adf40SYehuda Sadeh if (!header->snap_sizes) 6636a52325fSAlex Elder goto out_err; 664621901d6SAlex Elder for (i = 0; i < snap_count; i++) 665621901d6SAlex Elder header->snap_sizes[i] = 666621901d6SAlex Elder le64_to_cpu(ondisk->snaps[i].image_size); 667602adf40SYehuda Sadeh } else { 668ccece235SAlex Elder WARN_ON(ondisk->snap_names_len); 669602adf40SYehuda Sadeh header->snap_names = NULL; 670602adf40SYehuda Sadeh header->snap_sizes = NULL; 671602adf40SYehuda Sadeh } 672849b4260SAlex Elder 67334b13184SAlex Elder header->features = 0; /* No features support in v1 images */ 674602adf40SYehuda Sadeh header->obj_order = ondisk->options.order; 675602adf40SYehuda Sadeh header->crypt_type = ondisk->options.crypt_type; 676602adf40SYehuda Sadeh header->comp_type = ondisk->options.comp_type; 6776a52325fSAlex Elder 678621901d6SAlex Elder /* Allocate and fill in the snapshot context */ 679621901d6SAlex Elder 680f84344f3SAlex Elder header->image_size = le64_to_cpu(ondisk->image_size); 6816a52325fSAlex Elder size = sizeof (struct ceph_snap_context); 6826a52325fSAlex Elder size += snap_count * sizeof (header->snapc->snaps[0]); 6836a52325fSAlex Elder header->snapc = kzalloc(size, GFP_KERNEL); 6846a52325fSAlex Elder if (!header->snapc) 6856a52325fSAlex Elder goto out_err; 686602adf40SYehuda Sadeh 687602adf40SYehuda Sadeh atomic_set(&header->snapc->nref, 1); 688505cbb9bSAlex Elder header->snapc->seq = le64_to_cpu(ondisk->snap_seq); 689602adf40SYehuda Sadeh header->snapc->num_snaps = snap_count; 690621901d6SAlex Elder for (i = 0; i < snap_count; i++) 691602adf40SYehuda Sadeh header->snapc->snaps[i] = 692602adf40SYehuda Sadeh le64_to_cpu(ondisk->snaps[i].id); 693602adf40SYehuda Sadeh 694602adf40SYehuda Sadeh return 0; 695602adf40SYehuda Sadeh 6966a52325fSAlex Elder out_err: 697849b4260SAlex Elder kfree(header->snap_sizes); 698ccece235SAlex Elder header->snap_sizes = NULL; 699602adf40SYehuda Sadeh kfree(header->snap_names); 700ccece235SAlex Elder header->snap_names = NULL; 7016a52325fSAlex Elder kfree(header->object_prefix); 7026a52325fSAlex Elder header->object_prefix = NULL; 703ccece235SAlex Elder 70400f1f36fSAlex Elder return -ENOMEM; 705602adf40SYehuda Sadeh } 706602adf40SYehuda Sadeh 7079e15b77dSAlex Elder static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id) 7089e15b77dSAlex Elder { 7099e15b77dSAlex Elder struct rbd_snap *snap; 7109e15b77dSAlex Elder 7119e15b77dSAlex Elder if (snap_id == CEPH_NOSNAP) 7129e15b77dSAlex Elder return RBD_SNAP_HEAD_NAME; 7139e15b77dSAlex Elder 7149e15b77dSAlex Elder list_for_each_entry(snap, &rbd_dev->snaps, node) 7159e15b77dSAlex Elder if (snap_id == snap->id) 7169e15b77dSAlex Elder return snap->name; 7179e15b77dSAlex Elder 7189e15b77dSAlex Elder return NULL; 7199e15b77dSAlex Elder } 7209e15b77dSAlex Elder 7218836b995SAlex Elder static int snap_by_name(struct rbd_device *rbd_dev, const char *snap_name) 722602adf40SYehuda Sadeh { 723602adf40SYehuda Sadeh 724e86924a8SAlex Elder struct rbd_snap *snap; 72500f1f36fSAlex Elder 726e86924a8SAlex Elder list_for_each_entry(snap, &rbd_dev->snaps, node) { 727e86924a8SAlex Elder if (!strcmp(snap_name, snap->name)) { 7280d7dbfceSAlex Elder rbd_dev->spec->snap_id = snap->id; 729e86924a8SAlex Elder rbd_dev->mapping.size = snap->size; 73034b13184SAlex Elder rbd_dev->mapping.features = snap->features; 73100f1f36fSAlex Elder 732e86924a8SAlex Elder return 0; 733602adf40SYehuda Sadeh } 73400f1f36fSAlex Elder } 735e86924a8SAlex Elder 73600f1f36fSAlex Elder return -ENOENT; 73700f1f36fSAlex Elder } 738602adf40SYehuda Sadeh 739819d52bfSAlex Elder static int rbd_dev_set_mapping(struct rbd_device *rbd_dev) 740602adf40SYehuda Sadeh { 74178dc447dSAlex Elder int ret; 742602adf40SYehuda Sadeh 7430d7dbfceSAlex Elder if (!memcmp(rbd_dev->spec->snap_name, RBD_SNAP_HEAD_NAME, 744cc9d734cSJosh Durgin sizeof (RBD_SNAP_HEAD_NAME))) { 7450d7dbfceSAlex Elder rbd_dev->spec->snap_id = CEPH_NOSNAP; 74699c1f08fSAlex Elder rbd_dev->mapping.size = rbd_dev->header.image_size; 74734b13184SAlex Elder rbd_dev->mapping.features = rbd_dev->header.features; 748e86924a8SAlex Elder ret = 0; 749602adf40SYehuda Sadeh } else { 7500d7dbfceSAlex Elder ret = snap_by_name(rbd_dev, rbd_dev->spec->snap_name); 751602adf40SYehuda Sadeh if (ret < 0) 752602adf40SYehuda Sadeh goto done; 753f84344f3SAlex Elder rbd_dev->mapping.read_only = true; 754602adf40SYehuda Sadeh } 755d78b650aSAlex Elder atomic_set(&rbd_dev->exists, 1); 756602adf40SYehuda Sadeh done: 757602adf40SYehuda Sadeh return ret; 758602adf40SYehuda Sadeh } 759602adf40SYehuda Sadeh 760602adf40SYehuda Sadeh static void rbd_header_free(struct rbd_image_header *header) 761602adf40SYehuda Sadeh { 762849b4260SAlex Elder kfree(header->object_prefix); 763d78fd7aeSAlex Elder header->object_prefix = NULL; 764602adf40SYehuda Sadeh kfree(header->snap_sizes); 765d78fd7aeSAlex Elder header->snap_sizes = NULL; 766849b4260SAlex Elder kfree(header->snap_names); 767d78fd7aeSAlex Elder header->snap_names = NULL; 768d1d25646SJosh Durgin ceph_put_snap_context(header->snapc); 769d78fd7aeSAlex Elder header->snapc = NULL; 770602adf40SYehuda Sadeh } 771602adf40SYehuda Sadeh 77265ccfe21SAlex Elder static char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset) 773602adf40SYehuda Sadeh { 77465ccfe21SAlex Elder char *name; 77565ccfe21SAlex Elder u64 segment; 77665ccfe21SAlex Elder int ret; 777602adf40SYehuda Sadeh 7782fd82b9eSAlex Elder name = kmalloc(MAX_OBJ_NAME_SIZE + 1, GFP_NOIO); 77965ccfe21SAlex Elder if (!name) 78065ccfe21SAlex Elder return NULL; 78165ccfe21SAlex Elder segment = offset >> rbd_dev->header.obj_order; 7822fd82b9eSAlex Elder ret = snprintf(name, MAX_OBJ_NAME_SIZE + 1, "%s.%012llx", 78365ccfe21SAlex Elder rbd_dev->header.object_prefix, segment); 7842fd82b9eSAlex Elder if (ret < 0 || ret > MAX_OBJ_NAME_SIZE) { 78565ccfe21SAlex Elder pr_err("error formatting segment name for #%llu (%d)\n", 78665ccfe21SAlex Elder segment, ret); 78765ccfe21SAlex Elder kfree(name); 78865ccfe21SAlex Elder name = NULL; 78965ccfe21SAlex Elder } 790602adf40SYehuda Sadeh 79165ccfe21SAlex Elder return name; 79265ccfe21SAlex Elder } 793602adf40SYehuda Sadeh 79465ccfe21SAlex Elder static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset) 79565ccfe21SAlex Elder { 79665ccfe21SAlex Elder u64 segment_size = (u64) 1 << rbd_dev->header.obj_order; 797602adf40SYehuda Sadeh 79865ccfe21SAlex Elder return offset & (segment_size - 1); 79965ccfe21SAlex Elder } 80065ccfe21SAlex Elder 80165ccfe21SAlex Elder static u64 rbd_segment_length(struct rbd_device *rbd_dev, 80265ccfe21SAlex Elder u64 offset, u64 length) 80365ccfe21SAlex Elder { 80465ccfe21SAlex Elder u64 segment_size = (u64) 1 << rbd_dev->header.obj_order; 80565ccfe21SAlex Elder 80665ccfe21SAlex Elder offset &= segment_size - 1; 80765ccfe21SAlex Elder 808aafb230eSAlex Elder rbd_assert(length <= U64_MAX - offset); 80965ccfe21SAlex Elder if (offset + length > segment_size) 81065ccfe21SAlex Elder length = segment_size - offset; 81165ccfe21SAlex Elder 81265ccfe21SAlex Elder return length; 813602adf40SYehuda Sadeh } 814602adf40SYehuda Sadeh 8151fec7093SYehuda Sadeh static int rbd_get_num_segments(struct rbd_image_header *header, 8161fec7093SYehuda Sadeh u64 ofs, u64 len) 8171fec7093SYehuda Sadeh { 818df111be6SAlex Elder u64 start_seg; 819df111be6SAlex Elder u64 end_seg; 820df111be6SAlex Elder 821df111be6SAlex Elder if (!len) 822df111be6SAlex Elder return 0; 823df111be6SAlex Elder if (len - 1 > U64_MAX - ofs) 824df111be6SAlex Elder return -ERANGE; 825df111be6SAlex Elder 826df111be6SAlex Elder start_seg = ofs >> header->obj_order; 827df111be6SAlex Elder end_seg = (ofs + len - 1) >> header->obj_order; 828df111be6SAlex Elder 8291fec7093SYehuda Sadeh return end_seg - start_seg + 1; 8301fec7093SYehuda Sadeh } 8311fec7093SYehuda Sadeh 832602adf40SYehuda Sadeh /* 833029bcbd8SJosh Durgin * returns the size of an object in the image 834029bcbd8SJosh Durgin */ 835029bcbd8SJosh Durgin static u64 rbd_obj_bytes(struct rbd_image_header *header) 836029bcbd8SJosh Durgin { 837029bcbd8SJosh Durgin return 1 << header->obj_order; 838029bcbd8SJosh Durgin } 839029bcbd8SJosh Durgin 840029bcbd8SJosh Durgin /* 841602adf40SYehuda Sadeh * bio helpers 842602adf40SYehuda Sadeh */ 843602adf40SYehuda Sadeh 844602adf40SYehuda Sadeh static void bio_chain_put(struct bio *chain) 845602adf40SYehuda Sadeh { 846602adf40SYehuda Sadeh struct bio *tmp; 847602adf40SYehuda Sadeh 848602adf40SYehuda Sadeh while (chain) { 849602adf40SYehuda Sadeh tmp = chain; 850602adf40SYehuda Sadeh chain = chain->bi_next; 851602adf40SYehuda Sadeh bio_put(tmp); 852602adf40SYehuda Sadeh } 853602adf40SYehuda Sadeh } 854602adf40SYehuda Sadeh 855602adf40SYehuda Sadeh /* 856602adf40SYehuda Sadeh * zeros a bio chain, starting at specific offset 857602adf40SYehuda Sadeh */ 858602adf40SYehuda Sadeh static void zero_bio_chain(struct bio *chain, int start_ofs) 859602adf40SYehuda Sadeh { 860602adf40SYehuda Sadeh struct bio_vec *bv; 861602adf40SYehuda Sadeh unsigned long flags; 862602adf40SYehuda Sadeh void *buf; 863602adf40SYehuda Sadeh int i; 864602adf40SYehuda Sadeh int pos = 0; 865602adf40SYehuda Sadeh 866602adf40SYehuda Sadeh while (chain) { 867602adf40SYehuda Sadeh bio_for_each_segment(bv, chain, i) { 868602adf40SYehuda Sadeh if (pos + bv->bv_len > start_ofs) { 869602adf40SYehuda Sadeh int remainder = max(start_ofs - pos, 0); 870602adf40SYehuda Sadeh buf = bvec_kmap_irq(bv, &flags); 871602adf40SYehuda Sadeh memset(buf + remainder, 0, 872602adf40SYehuda Sadeh bv->bv_len - remainder); 87385b5aaa6SDan Carpenter bvec_kunmap_irq(buf, &flags); 874602adf40SYehuda Sadeh } 875602adf40SYehuda Sadeh pos += bv->bv_len; 876602adf40SYehuda Sadeh } 877602adf40SYehuda Sadeh 878602adf40SYehuda Sadeh chain = chain->bi_next; 879602adf40SYehuda Sadeh } 880602adf40SYehuda Sadeh } 881602adf40SYehuda Sadeh 882602adf40SYehuda Sadeh /* 883f7760dadSAlex Elder * Clone a portion of a bio, starting at the given byte offset 884f7760dadSAlex Elder * and continuing for the number of bytes indicated. 885602adf40SYehuda Sadeh */ 886f7760dadSAlex Elder static struct bio *bio_clone_range(struct bio *bio_src, 887f7760dadSAlex Elder unsigned int offset, 888f7760dadSAlex Elder unsigned int len, 889f7760dadSAlex Elder gfp_t gfpmask) 890602adf40SYehuda Sadeh { 891f7760dadSAlex Elder struct bio_vec *bv; 892f7760dadSAlex Elder unsigned int resid; 893f7760dadSAlex Elder unsigned short idx; 894f7760dadSAlex Elder unsigned int voff; 895f7760dadSAlex Elder unsigned short end_idx; 896f7760dadSAlex Elder unsigned short vcnt; 897f7760dadSAlex Elder struct bio *bio; 898602adf40SYehuda Sadeh 899f7760dadSAlex Elder /* Handle the easy case for the caller */ 900f7760dadSAlex Elder 901f7760dadSAlex Elder if (!offset && len == bio_src->bi_size) 902f7760dadSAlex Elder return bio_clone(bio_src, gfpmask); 903f7760dadSAlex Elder 904f7760dadSAlex Elder if (WARN_ON_ONCE(!len)) 905f7760dadSAlex Elder return NULL; 906f7760dadSAlex Elder if (WARN_ON_ONCE(len > bio_src->bi_size)) 907f7760dadSAlex Elder return NULL; 908f7760dadSAlex Elder if (WARN_ON_ONCE(offset > bio_src->bi_size - len)) 909f7760dadSAlex Elder return NULL; 910f7760dadSAlex Elder 911f7760dadSAlex Elder /* Find first affected segment... */ 912f7760dadSAlex Elder 913f7760dadSAlex Elder resid = offset; 914f7760dadSAlex Elder __bio_for_each_segment(bv, bio_src, idx, 0) { 915f7760dadSAlex Elder if (resid < bv->bv_len) 916f7760dadSAlex Elder break; 917f7760dadSAlex Elder resid -= bv->bv_len; 918602adf40SYehuda Sadeh } 919f7760dadSAlex Elder voff = resid; 920602adf40SYehuda Sadeh 921f7760dadSAlex Elder /* ...and the last affected segment */ 922542582fcSAlex Elder 923f7760dadSAlex Elder resid += len; 924f7760dadSAlex Elder __bio_for_each_segment(bv, bio_src, end_idx, idx) { 925f7760dadSAlex Elder if (resid <= bv->bv_len) 926f7760dadSAlex Elder break; 927f7760dadSAlex Elder resid -= bv->bv_len; 928f7760dadSAlex Elder } 929f7760dadSAlex Elder vcnt = end_idx - idx + 1; 930602adf40SYehuda Sadeh 931f7760dadSAlex Elder /* Build the clone */ 932f7760dadSAlex Elder 933f7760dadSAlex Elder bio = bio_alloc(gfpmask, (unsigned int) vcnt); 934f7760dadSAlex Elder if (!bio) 935f7760dadSAlex Elder return NULL; /* ENOMEM */ 936f7760dadSAlex Elder 937f7760dadSAlex Elder bio->bi_bdev = bio_src->bi_bdev; 938f7760dadSAlex Elder bio->bi_sector = bio_src->bi_sector + (offset >> SECTOR_SHIFT); 939f7760dadSAlex Elder bio->bi_rw = bio_src->bi_rw; 940f7760dadSAlex Elder bio->bi_flags |= 1 << BIO_CLONED; 941602adf40SYehuda Sadeh 942602adf40SYehuda Sadeh /* 943f7760dadSAlex Elder * Copy over our part of the bio_vec, then update the first 944f7760dadSAlex Elder * and last (or only) entries. 945602adf40SYehuda Sadeh */ 946f7760dadSAlex Elder memcpy(&bio->bi_io_vec[0], &bio_src->bi_io_vec[idx], 947f7760dadSAlex Elder vcnt * sizeof (struct bio_vec)); 948f7760dadSAlex Elder bio->bi_io_vec[0].bv_offset += voff; 949f7760dadSAlex Elder if (vcnt > 1) { 950f7760dadSAlex Elder bio->bi_io_vec[0].bv_len -= voff; 951f7760dadSAlex Elder bio->bi_io_vec[vcnt - 1].bv_len = resid; 952602adf40SYehuda Sadeh } else { 953f7760dadSAlex Elder bio->bi_io_vec[0].bv_len = len; 954602adf40SYehuda Sadeh } 955602adf40SYehuda Sadeh 956f7760dadSAlex Elder bio->bi_vcnt = vcnt; 957f7760dadSAlex Elder bio->bi_size = len; 958f7760dadSAlex Elder bio->bi_idx = 0; 959602adf40SYehuda Sadeh 960f7760dadSAlex Elder return bio; 961602adf40SYehuda Sadeh } 962602adf40SYehuda Sadeh 963f7760dadSAlex Elder /* 964f7760dadSAlex Elder * Clone a portion of a bio chain, starting at the given byte offset 965f7760dadSAlex Elder * into the first bio in the source chain and continuing for the 966f7760dadSAlex Elder * number of bytes indicated. The result is another bio chain of 967f7760dadSAlex Elder * exactly the given length, or a null pointer on error. 968f7760dadSAlex Elder * 969f7760dadSAlex Elder * The bio_src and offset parameters are both in-out. On entry they 970f7760dadSAlex Elder * refer to the first source bio and the offset into that bio where 971f7760dadSAlex Elder * the start of data to be cloned is located. 972f7760dadSAlex Elder * 973f7760dadSAlex Elder * On return, bio_src is updated to refer to the bio in the source 974f7760dadSAlex Elder * chain that contains first un-cloned byte, and *offset will 975f7760dadSAlex Elder * contain the offset of that byte within that bio. 976f7760dadSAlex Elder */ 977f7760dadSAlex Elder static struct bio *bio_chain_clone_range(struct bio **bio_src, 978f7760dadSAlex Elder unsigned int *offset, 979f7760dadSAlex Elder unsigned int len, 980f7760dadSAlex Elder gfp_t gfpmask) 981f7760dadSAlex Elder { 982f7760dadSAlex Elder struct bio *bi = *bio_src; 983f7760dadSAlex Elder unsigned int off = *offset; 984f7760dadSAlex Elder struct bio *chain = NULL; 985f7760dadSAlex Elder struct bio **end; 986602adf40SYehuda Sadeh 987f7760dadSAlex Elder /* Build up a chain of clone bios up to the limit */ 988602adf40SYehuda Sadeh 989f7760dadSAlex Elder if (!bi || off >= bi->bi_size || !len) 990f7760dadSAlex Elder return NULL; /* Nothing to clone */ 991602adf40SYehuda Sadeh 992f7760dadSAlex Elder end = &chain; 993f7760dadSAlex Elder while (len) { 994f7760dadSAlex Elder unsigned int bi_size; 995f7760dadSAlex Elder struct bio *bio; 996f7760dadSAlex Elder 997f5400b7aSAlex Elder if (!bi) { 998f5400b7aSAlex Elder rbd_warn(NULL, "bio_chain exhausted with %u left", len); 999f7760dadSAlex Elder goto out_err; /* EINVAL; ran out of bio's */ 1000f5400b7aSAlex Elder } 1001f7760dadSAlex Elder bi_size = min_t(unsigned int, bi->bi_size - off, len); 1002f7760dadSAlex Elder bio = bio_clone_range(bi, off, bi_size, gfpmask); 1003f7760dadSAlex Elder if (!bio) 1004f7760dadSAlex Elder goto out_err; /* ENOMEM */ 1005f7760dadSAlex Elder 1006f7760dadSAlex Elder *end = bio; 1007f7760dadSAlex Elder end = &bio->bi_next; 1008f7760dadSAlex Elder 1009f7760dadSAlex Elder off += bi_size; 1010f7760dadSAlex Elder if (off == bi->bi_size) { 1011f7760dadSAlex Elder bi = bi->bi_next; 1012f7760dadSAlex Elder off = 0; 1013f7760dadSAlex Elder } 1014f7760dadSAlex Elder len -= bi_size; 1015f7760dadSAlex Elder } 1016f7760dadSAlex Elder *bio_src = bi; 1017f7760dadSAlex Elder *offset = off; 1018f7760dadSAlex Elder 1019f7760dadSAlex Elder return chain; 1020f7760dadSAlex Elder out_err: 1021f7760dadSAlex Elder bio_chain_put(chain); 1022f7760dadSAlex Elder 1023602adf40SYehuda Sadeh return NULL; 1024602adf40SYehuda Sadeh } 1025602adf40SYehuda Sadeh 1026602adf40SYehuda Sadeh /* 1027602adf40SYehuda Sadeh * helpers for osd request op vectors. 1028602adf40SYehuda Sadeh */ 102957cfc106SAlex Elder static struct ceph_osd_req_op *rbd_create_rw_ops(int num_ops, 103057cfc106SAlex Elder int opcode, u32 payload_len) 1031602adf40SYehuda Sadeh { 103257cfc106SAlex Elder struct ceph_osd_req_op *ops; 103357cfc106SAlex Elder 103457cfc106SAlex Elder ops = kzalloc(sizeof (*ops) * (num_ops + 1), GFP_NOIO); 103557cfc106SAlex Elder if (!ops) 103657cfc106SAlex Elder return NULL; 103757cfc106SAlex Elder 103857cfc106SAlex Elder ops[0].op = opcode; 103957cfc106SAlex Elder 1040602adf40SYehuda Sadeh /* 1041602adf40SYehuda Sadeh * op extent offset and length will be set later on 1042602adf40SYehuda Sadeh * in calc_raw_layout() 1043602adf40SYehuda Sadeh */ 104457cfc106SAlex Elder ops[0].payload_len = payload_len; 104557cfc106SAlex Elder 104657cfc106SAlex Elder return ops; 1047602adf40SYehuda Sadeh } 1048602adf40SYehuda Sadeh 1049602adf40SYehuda Sadeh static void rbd_destroy_ops(struct ceph_osd_req_op *ops) 1050602adf40SYehuda Sadeh { 1051602adf40SYehuda Sadeh kfree(ops); 1052602adf40SYehuda Sadeh } 1053602adf40SYehuda Sadeh 10541fec7093SYehuda Sadeh static void rbd_coll_end_req_index(struct request *rq, 10551fec7093SYehuda Sadeh struct rbd_req_coll *coll, 10561fec7093SYehuda Sadeh int index, 10578986cb37SAlex Elder s32 ret, u64 len) 10581fec7093SYehuda Sadeh { 10591fec7093SYehuda Sadeh struct request_queue *q; 10601fec7093SYehuda Sadeh int min, max, i; 10611fec7093SYehuda Sadeh 1062bd919d45SAlex Elder dout("rbd_coll_end_req_index %p index %d ret %d len %llu\n", 10638986cb37SAlex Elder coll, index, (int)ret, (unsigned long long)len); 10641fec7093SYehuda Sadeh 10651fec7093SYehuda Sadeh if (!rq) 10661fec7093SYehuda Sadeh return; 10671fec7093SYehuda Sadeh 10681fec7093SYehuda Sadeh if (!coll) { 10691fec7093SYehuda Sadeh blk_end_request(rq, ret, len); 10701fec7093SYehuda Sadeh return; 10711fec7093SYehuda Sadeh } 10721fec7093SYehuda Sadeh 10731fec7093SYehuda Sadeh q = rq->q; 10741fec7093SYehuda Sadeh 10751fec7093SYehuda Sadeh spin_lock_irq(q->queue_lock); 10761fec7093SYehuda Sadeh coll->status[index].done = 1; 10771fec7093SYehuda Sadeh coll->status[index].rc = ret; 10781fec7093SYehuda Sadeh coll->status[index].bytes = len; 10791fec7093SYehuda Sadeh max = min = coll->num_done; 10801fec7093SYehuda Sadeh while (max < coll->total && coll->status[max].done) 10811fec7093SYehuda Sadeh max++; 10821fec7093SYehuda Sadeh 10831fec7093SYehuda Sadeh for (i = min; i<max; i++) { 10848986cb37SAlex Elder __blk_end_request(rq, (int)coll->status[i].rc, 10851fec7093SYehuda Sadeh coll->status[i].bytes); 10861fec7093SYehuda Sadeh coll->num_done++; 10871fec7093SYehuda Sadeh kref_put(&coll->kref, rbd_coll_release); 10881fec7093SYehuda Sadeh } 10891fec7093SYehuda Sadeh spin_unlock_irq(q->queue_lock); 10901fec7093SYehuda Sadeh } 10911fec7093SYehuda Sadeh 1092725afc97SAlex Elder static void rbd_coll_end_req(struct rbd_request *rbd_req, 10938986cb37SAlex Elder s32 ret, u64 len) 10941fec7093SYehuda Sadeh { 1095725afc97SAlex Elder rbd_coll_end_req_index(rbd_req->rq, 1096725afc97SAlex Elder rbd_req->coll, rbd_req->coll_index, 1097725afc97SAlex Elder ret, len); 10981fec7093SYehuda Sadeh } 10991fec7093SYehuda Sadeh 11000ec8ce87SAlex Elder static void rbd_layout_init(struct ceph_file_layout *layout, u64 pool_id) 11010ec8ce87SAlex Elder { 11020ec8ce87SAlex Elder memset(layout, 0, sizeof (*layout)); 11030ec8ce87SAlex Elder layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER); 11040ec8ce87SAlex Elder layout->fl_stripe_count = cpu_to_le32(1); 11050ec8ce87SAlex Elder layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER); 11060ec8ce87SAlex Elder rbd_assert(pool_id <= (u64) U32_MAX); 11070ec8ce87SAlex Elder layout->fl_pg_pool = cpu_to_le32((u32) pool_id); 11080ec8ce87SAlex Elder } 11090ec8ce87SAlex Elder 1110602adf40SYehuda Sadeh /* 1111602adf40SYehuda Sadeh * Send ceph osd request 1112602adf40SYehuda Sadeh */ 1113602adf40SYehuda Sadeh static int rbd_do_request(struct request *rq, 11140ce1a794SAlex Elder struct rbd_device *rbd_dev, 1115602adf40SYehuda Sadeh struct ceph_snap_context *snapc, 1116602adf40SYehuda Sadeh u64 snapid, 1117aded07eaSAlex Elder const char *object_name, u64 ofs, u64 len, 1118602adf40SYehuda Sadeh struct bio *bio, 1119602adf40SYehuda Sadeh struct page **pages, 1120602adf40SYehuda Sadeh int num_pages, 1121602adf40SYehuda Sadeh int flags, 1122d07c0958SAlex Elder unsigned int num_op, 1123602adf40SYehuda Sadeh struct ceph_osd_req_op *ops, 11241fec7093SYehuda Sadeh struct rbd_req_coll *coll, 11251fec7093SYehuda Sadeh int coll_index, 11265f29ddd4SAlex Elder void (*rbd_cb)(struct ceph_osd_request *, 11275f29ddd4SAlex Elder struct ceph_msg *), 112859c2be1eSYehuda Sadeh struct ceph_osd_request **linger_req, 112959c2be1eSYehuda Sadeh u64 *ver) 1130602adf40SYehuda Sadeh { 11315f29ddd4SAlex Elder struct ceph_osd_request *osd_req; 1132602adf40SYehuda Sadeh int ret; 1133602adf40SYehuda Sadeh u64 bno; 1134602adf40SYehuda Sadeh struct timespec mtime = CURRENT_TIME; 1135725afc97SAlex Elder struct rbd_request *rbd_req; 11361dbb4399SAlex Elder struct ceph_osd_client *osdc; 1137602adf40SYehuda Sadeh 1138725afc97SAlex Elder rbd_req = kzalloc(sizeof(*rbd_req), GFP_NOIO); 1139cd323ac0SAlex Elder if (!rbd_req) 11401fec7093SYehuda Sadeh return -ENOMEM; 1141602adf40SYehuda Sadeh 11421fec7093SYehuda Sadeh if (coll) { 1143725afc97SAlex Elder rbd_req->coll = coll; 1144725afc97SAlex Elder rbd_req->coll_index = coll_index; 11451fec7093SYehuda Sadeh } 11461fec7093SYehuda Sadeh 1147f7760dadSAlex Elder dout("rbd_do_request object_name=%s ofs=%llu len=%llu coll=%p[%d]\n", 1148f7760dadSAlex Elder object_name, (unsigned long long) ofs, 1149f7760dadSAlex Elder (unsigned long long) len, coll, coll_index); 1150602adf40SYehuda Sadeh 11510ce1a794SAlex Elder osdc = &rbd_dev->rbd_client->client->osdc; 115254a54007SAlex Elder osd_req = ceph_osdc_alloc_request(osdc, snapc, ops, false, GFP_NOIO); 11535f29ddd4SAlex Elder if (!osd_req) { 11544ad12621SSage Weil ret = -ENOMEM; 1155602adf40SYehuda Sadeh goto done_pages; 1156602adf40SYehuda Sadeh } 1157602adf40SYehuda Sadeh 1158d178a9e7SAlex Elder osd_req->r_flags = flags; 115954a54007SAlex Elder osd_req->r_pages = pages; 116054a54007SAlex Elder if (bio) { 116154a54007SAlex Elder osd_req->r_bio = bio; 116254a54007SAlex Elder bio_get(osd_req->r_bio); 116354a54007SAlex Elder } 11645f29ddd4SAlex Elder osd_req->r_callback = rbd_cb; 1165602adf40SYehuda Sadeh 1166725afc97SAlex Elder rbd_req->rq = rq; 1167725afc97SAlex Elder rbd_req->bio = bio; 1168725afc97SAlex Elder rbd_req->pages = pages; 1169725afc97SAlex Elder rbd_req->len = len; 1170602adf40SYehuda Sadeh 11715f29ddd4SAlex Elder osd_req->r_priv = rbd_req; 1172602adf40SYehuda Sadeh 11735f29ddd4SAlex Elder strncpy(osd_req->r_oid, object_name, sizeof(osd_req->r_oid)); 11745f29ddd4SAlex Elder osd_req->r_oid_len = strlen(osd_req->r_oid); 1175602adf40SYehuda Sadeh 11760ec8ce87SAlex Elder rbd_layout_init(&osd_req->r_file_layout, rbd_dev->spec->pool_id); 1177e75b45cfSAlex Elder ret = ceph_calc_raw_layout(&osd_req->r_file_layout, 11784d6b250bSAlex Elder ofs, &len, &bno, osd_req, ops); 11796cae3717SSage Weil rbd_assert(ret == 0); 1180602adf40SYehuda Sadeh 11814d6b250bSAlex Elder ceph_osdc_build_request(osd_req, ofs, len, ops, snapc, snapid, &mtime); 1182602adf40SYehuda Sadeh 118359c2be1eSYehuda Sadeh if (linger_req) { 11845f29ddd4SAlex Elder ceph_osdc_set_request_linger(osdc, osd_req); 11855f29ddd4SAlex Elder *linger_req = osd_req; 118659c2be1eSYehuda Sadeh } 118759c2be1eSYehuda Sadeh 11885f29ddd4SAlex Elder ret = ceph_osdc_start_request(osdc, osd_req, false); 1189602adf40SYehuda Sadeh if (ret < 0) 1190602adf40SYehuda Sadeh goto done_err; 1191602adf40SYehuda Sadeh 1192602adf40SYehuda Sadeh if (!rbd_cb) { 11935f29ddd4SAlex Elder u64 version; 11945f29ddd4SAlex Elder 11955f29ddd4SAlex Elder ret = ceph_osdc_wait_request(osdc, osd_req); 11965f29ddd4SAlex Elder version = le64_to_cpu(osd_req->r_reassert_version.version); 119759c2be1eSYehuda Sadeh if (ver) 11985f29ddd4SAlex Elder *ver = version; 11995f29ddd4SAlex Elder dout("reassert_ver=%llu\n", (unsigned long long) version); 12005f29ddd4SAlex Elder ceph_osdc_put_request(osd_req); 1201602adf40SYehuda Sadeh } 1202602adf40SYehuda Sadeh return ret; 1203602adf40SYehuda Sadeh 1204602adf40SYehuda Sadeh done_err: 1205725afc97SAlex Elder bio_chain_put(rbd_req->bio); 12065f29ddd4SAlex Elder ceph_osdc_put_request(osd_req); 1207602adf40SYehuda Sadeh done_pages: 1208725afc97SAlex Elder kfree(rbd_req); 1209602adf40SYehuda Sadeh return ret; 1210602adf40SYehuda Sadeh } 1211602adf40SYehuda Sadeh 1212602adf40SYehuda Sadeh /* 1213602adf40SYehuda Sadeh * Ceph osd op callback 1214602adf40SYehuda Sadeh */ 12155f29ddd4SAlex Elder static void rbd_req_cb(struct ceph_osd_request *osd_req, struct ceph_msg *msg) 1216602adf40SYehuda Sadeh { 12175f29ddd4SAlex Elder struct rbd_request *rbd_req = osd_req->r_priv; 1218602adf40SYehuda Sadeh struct ceph_osd_reply_head *replyhead; 1219602adf40SYehuda Sadeh struct ceph_osd_op *op; 12208986cb37SAlex Elder s32 rc; 1221602adf40SYehuda Sadeh u64 bytes; 1222602adf40SYehuda Sadeh int read_op; 1223602adf40SYehuda Sadeh 1224602adf40SYehuda Sadeh /* parse reply */ 1225602adf40SYehuda Sadeh replyhead = msg->front.iov_base; 1226602adf40SYehuda Sadeh WARN_ON(le32_to_cpu(replyhead->num_ops) == 0); 1227602adf40SYehuda Sadeh op = (void *)(replyhead + 1); 12288986cb37SAlex Elder rc = (s32)le32_to_cpu(replyhead->result); 1229602adf40SYehuda Sadeh bytes = le64_to_cpu(op->extent.length); 1230895cfcc8SDan Carpenter read_op = (le16_to_cpu(op->op) == CEPH_OSD_OP_READ); 1231602adf40SYehuda Sadeh 1232bd919d45SAlex Elder dout("rbd_req_cb bytes=%llu readop=%d rc=%d\n", 1233bd919d45SAlex Elder (unsigned long long) bytes, read_op, (int) rc); 1234602adf40SYehuda Sadeh 12358986cb37SAlex Elder if (rc == (s32)-ENOENT && read_op) { 1236725afc97SAlex Elder zero_bio_chain(rbd_req->bio, 0); 1237602adf40SYehuda Sadeh rc = 0; 1238725afc97SAlex Elder } else if (rc == 0 && read_op && bytes < rbd_req->len) { 1239725afc97SAlex Elder zero_bio_chain(rbd_req->bio, bytes); 1240725afc97SAlex Elder bytes = rbd_req->len; 1241602adf40SYehuda Sadeh } 1242602adf40SYehuda Sadeh 1243725afc97SAlex Elder rbd_coll_end_req(rbd_req, rc, bytes); 1244602adf40SYehuda Sadeh 1245725afc97SAlex Elder if (rbd_req->bio) 1246725afc97SAlex Elder bio_chain_put(rbd_req->bio); 1247602adf40SYehuda Sadeh 12485f29ddd4SAlex Elder ceph_osdc_put_request(osd_req); 1249725afc97SAlex Elder kfree(rbd_req); 1250602adf40SYehuda Sadeh } 1251602adf40SYehuda Sadeh 12525f29ddd4SAlex Elder static void rbd_simple_req_cb(struct ceph_osd_request *osd_req, 12535f29ddd4SAlex Elder struct ceph_msg *msg) 125459c2be1eSYehuda Sadeh { 12555f29ddd4SAlex Elder ceph_osdc_put_request(osd_req); 125659c2be1eSYehuda Sadeh } 125759c2be1eSYehuda Sadeh 1258602adf40SYehuda Sadeh /* 1259602adf40SYehuda Sadeh * Do a synchronous ceph osd operation 1260602adf40SYehuda Sadeh */ 12610ce1a794SAlex Elder static int rbd_req_sync_op(struct rbd_device *rbd_dev, 1262602adf40SYehuda Sadeh int flags, 1263d07c0958SAlex Elder unsigned int num_op, 1264913d2fdcSAlex Elder struct ceph_osd_req_op *ops, 1265aded07eaSAlex Elder const char *object_name, 1266f8d4de6eSAlex Elder u64 ofs, u64 inbound_size, 1267f8d4de6eSAlex Elder char *inbound, 126859c2be1eSYehuda Sadeh struct ceph_osd_request **linger_req, 126959c2be1eSYehuda Sadeh u64 *ver) 1270602adf40SYehuda Sadeh { 1271602adf40SYehuda Sadeh int ret; 1272602adf40SYehuda Sadeh struct page **pages; 1273602adf40SYehuda Sadeh int num_pages; 1274913d2fdcSAlex Elder 1275aafb230eSAlex Elder rbd_assert(ops != NULL); 1276602adf40SYehuda Sadeh 1277f8d4de6eSAlex Elder num_pages = calc_pages_for(ofs, inbound_size); 1278602adf40SYehuda Sadeh pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL); 1279b8d0638aSDan Carpenter if (IS_ERR(pages)) 1280b8d0638aSDan Carpenter return PTR_ERR(pages); 1281602adf40SYehuda Sadeh 128225704ac9SAlex Elder ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP, 1283f8d4de6eSAlex Elder object_name, ofs, inbound_size, NULL, 1284602adf40SYehuda Sadeh pages, num_pages, 1285602adf40SYehuda Sadeh flags, 1286d07c0958SAlex Elder num_op, ops, 12871fec7093SYehuda Sadeh NULL, 0, 128859c2be1eSYehuda Sadeh NULL, 128959c2be1eSYehuda Sadeh linger_req, ver); 1290602adf40SYehuda Sadeh if (ret < 0) 1291913d2fdcSAlex Elder goto done; 1292602adf40SYehuda Sadeh 1293f8d4de6eSAlex Elder if ((flags & CEPH_OSD_FLAG_READ) && inbound) 1294f8d4de6eSAlex Elder ret = ceph_copy_from_page_vector(pages, inbound, ofs, ret); 1295602adf40SYehuda Sadeh 1296602adf40SYehuda Sadeh done: 1297602adf40SYehuda Sadeh ceph_release_page_vector(pages, num_pages); 1298602adf40SYehuda Sadeh return ret; 1299602adf40SYehuda Sadeh } 1300602adf40SYehuda Sadeh 1301602adf40SYehuda Sadeh /* 1302602adf40SYehuda Sadeh * Do an asynchronous ceph osd operation 1303602adf40SYehuda Sadeh */ 1304602adf40SYehuda Sadeh static int rbd_do_op(struct request *rq, 1305602adf40SYehuda Sadeh struct rbd_device *rbd_dev, 1306602adf40SYehuda Sadeh struct ceph_snap_context *snapc, 1307602adf40SYehuda Sadeh u64 ofs, u64 len, 13081fec7093SYehuda Sadeh struct bio *bio, 13091fec7093SYehuda Sadeh struct rbd_req_coll *coll, 13101fec7093SYehuda Sadeh int coll_index) 1311602adf40SYehuda Sadeh { 1312602adf40SYehuda Sadeh char *seg_name; 1313602adf40SYehuda Sadeh u64 seg_ofs; 1314602adf40SYehuda Sadeh u64 seg_len; 1315602adf40SYehuda Sadeh int ret; 1316602adf40SYehuda Sadeh struct ceph_osd_req_op *ops; 1317602adf40SYehuda Sadeh u32 payload_len; 1318ff2e4bb5SAlex Elder int opcode; 1319ff2e4bb5SAlex Elder int flags; 13204634246dSAlex Elder u64 snapid; 1321602adf40SYehuda Sadeh 132265ccfe21SAlex Elder seg_name = rbd_segment_name(rbd_dev, ofs); 1323602adf40SYehuda Sadeh if (!seg_name) 1324602adf40SYehuda Sadeh return -ENOMEM; 132565ccfe21SAlex Elder seg_len = rbd_segment_length(rbd_dev, ofs, len); 132665ccfe21SAlex Elder seg_ofs = rbd_segment_offset(rbd_dev, ofs); 1327602adf40SYehuda Sadeh 1328ff2e4bb5SAlex Elder if (rq_data_dir(rq) == WRITE) { 1329ff2e4bb5SAlex Elder opcode = CEPH_OSD_OP_WRITE; 1330ff2e4bb5SAlex Elder flags = CEPH_OSD_FLAG_WRITE|CEPH_OSD_FLAG_ONDISK; 13314634246dSAlex Elder snapid = CEPH_NOSNAP; 1332ff2e4bb5SAlex Elder payload_len = seg_len; 1333ff2e4bb5SAlex Elder } else { 1334ff2e4bb5SAlex Elder opcode = CEPH_OSD_OP_READ; 1335ff2e4bb5SAlex Elder flags = CEPH_OSD_FLAG_READ; 1336a7b4c65fSAlex Elder rbd_assert(!snapc); 13370d7dbfceSAlex Elder snapid = rbd_dev->spec->snap_id; 1338ff2e4bb5SAlex Elder payload_len = 0; 1339ff2e4bb5SAlex Elder } 1340602adf40SYehuda Sadeh 134157cfc106SAlex Elder ret = -ENOMEM; 134257cfc106SAlex Elder ops = rbd_create_rw_ops(1, opcode, payload_len); 134357cfc106SAlex Elder if (!ops) 1344602adf40SYehuda Sadeh goto done; 1345602adf40SYehuda Sadeh 1346602adf40SYehuda Sadeh /* we've taken care of segment sizes earlier when we 1347602adf40SYehuda Sadeh cloned the bios. We should never have a segment 1348602adf40SYehuda Sadeh truncated at this point */ 1349aafb230eSAlex Elder rbd_assert(seg_len == len); 1350602adf40SYehuda Sadeh 1351602adf40SYehuda Sadeh ret = rbd_do_request(rq, rbd_dev, snapc, snapid, 1352602adf40SYehuda Sadeh seg_name, seg_ofs, seg_len, 1353602adf40SYehuda Sadeh bio, 1354602adf40SYehuda Sadeh NULL, 0, 1355602adf40SYehuda Sadeh flags, 1356d07c0958SAlex Elder 1, ops, 13571fec7093SYehuda Sadeh coll, coll_index, 135859c2be1eSYehuda Sadeh rbd_req_cb, 0, NULL); 1359cd323ac0SAlex Elder if (ret < 0) 1360cd323ac0SAlex Elder rbd_coll_end_req_index(rq, coll, coll_index, 1361cd323ac0SAlex Elder (s32)ret, seg_len); 136211f77002SSage Weil rbd_destroy_ops(ops); 1363602adf40SYehuda Sadeh done: 1364602adf40SYehuda Sadeh kfree(seg_name); 1365602adf40SYehuda Sadeh return ret; 1366602adf40SYehuda Sadeh } 1367602adf40SYehuda Sadeh 1368602adf40SYehuda Sadeh /* 1369602adf40SYehuda Sadeh * Request sync osd read 1370602adf40SYehuda Sadeh */ 13710ce1a794SAlex Elder static int rbd_req_sync_read(struct rbd_device *rbd_dev, 1372aded07eaSAlex Elder const char *object_name, 1373602adf40SYehuda Sadeh u64 ofs, u64 len, 137459c2be1eSYehuda Sadeh char *buf, 137559c2be1eSYehuda Sadeh u64 *ver) 1376602adf40SYehuda Sadeh { 1377913d2fdcSAlex Elder struct ceph_osd_req_op *ops; 1378913d2fdcSAlex Elder int ret; 1379913d2fdcSAlex Elder 1380913d2fdcSAlex Elder ops = rbd_create_rw_ops(1, CEPH_OSD_OP_READ, 0); 1381913d2fdcSAlex Elder if (!ops) 1382913d2fdcSAlex Elder return -ENOMEM; 1383913d2fdcSAlex Elder 138425704ac9SAlex Elder ret = rbd_req_sync_op(rbd_dev, CEPH_OSD_FLAG_READ, 1385d07c0958SAlex Elder 1, ops, object_name, ofs, len, buf, NULL, ver); 1386913d2fdcSAlex Elder rbd_destroy_ops(ops); 1387913d2fdcSAlex Elder 1388913d2fdcSAlex Elder return ret; 1389602adf40SYehuda Sadeh } 1390602adf40SYehuda Sadeh 1391602adf40SYehuda Sadeh /* 139259c2be1eSYehuda Sadeh * Request sync osd watch 139359c2be1eSYehuda Sadeh */ 13940ce1a794SAlex Elder static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev, 139559c2be1eSYehuda Sadeh u64 ver, 13967f0a24d8SAlex Elder u64 notify_id) 139759c2be1eSYehuda Sadeh { 139859c2be1eSYehuda Sadeh struct ceph_osd_req_op *ops; 139911f77002SSage Weil int ret; 140011f77002SSage Weil 140157cfc106SAlex Elder ops = rbd_create_rw_ops(1, CEPH_OSD_OP_NOTIFY_ACK, 0); 140257cfc106SAlex Elder if (!ops) 140357cfc106SAlex Elder return -ENOMEM; 140459c2be1eSYehuda Sadeh 1405a71b891bSJosh Durgin ops[0].watch.ver = cpu_to_le64(ver); 140659c2be1eSYehuda Sadeh ops[0].watch.cookie = notify_id; 140759c2be1eSYehuda Sadeh ops[0].watch.flag = 0; 140859c2be1eSYehuda Sadeh 14090ce1a794SAlex Elder ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP, 14107f0a24d8SAlex Elder rbd_dev->header_name, 0, 0, NULL, 1411ad4f232fSAlex Elder NULL, 0, 141259c2be1eSYehuda Sadeh CEPH_OSD_FLAG_READ, 1413d07c0958SAlex Elder 1, ops, 14141fec7093SYehuda Sadeh NULL, 0, 141559c2be1eSYehuda Sadeh rbd_simple_req_cb, 0, NULL); 141659c2be1eSYehuda Sadeh 141759c2be1eSYehuda Sadeh rbd_destroy_ops(ops); 141859c2be1eSYehuda Sadeh return ret; 141959c2be1eSYehuda Sadeh } 142059c2be1eSYehuda Sadeh 142159c2be1eSYehuda Sadeh static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data) 142259c2be1eSYehuda Sadeh { 14230ce1a794SAlex Elder struct rbd_device *rbd_dev = (struct rbd_device *)data; 1424a71b891bSJosh Durgin u64 hver; 142513143d2dSSage Weil int rc; 142613143d2dSSage Weil 14270ce1a794SAlex Elder if (!rbd_dev) 142859c2be1eSYehuda Sadeh return; 142959c2be1eSYehuda Sadeh 1430bd919d45SAlex Elder dout("rbd_watch_cb %s notify_id=%llu opcode=%u\n", 1431bd919d45SAlex Elder rbd_dev->header_name, (unsigned long long) notify_id, 1432bd919d45SAlex Elder (unsigned int) opcode); 1433117973fbSAlex Elder rc = rbd_dev_refresh(rbd_dev, &hver); 143413143d2dSSage Weil if (rc) 143506ecc6cbSAlex Elder rbd_warn(rbd_dev, "got notification but failed to " 143606ecc6cbSAlex Elder " update snaps: %d\n", rc); 143759c2be1eSYehuda Sadeh 14387f0a24d8SAlex Elder rbd_req_sync_notify_ack(rbd_dev, hver, notify_id); 143959c2be1eSYehuda Sadeh } 144059c2be1eSYehuda Sadeh 144159c2be1eSYehuda Sadeh /* 144259c2be1eSYehuda Sadeh * Request sync osd watch 144359c2be1eSYehuda Sadeh */ 14440e6f322dSAlex Elder static int rbd_req_sync_watch(struct rbd_device *rbd_dev) 144559c2be1eSYehuda Sadeh { 144659c2be1eSYehuda Sadeh struct ceph_osd_req_op *ops; 14470ce1a794SAlex Elder struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 144857cfc106SAlex Elder int ret; 144959c2be1eSYehuda Sadeh 145057cfc106SAlex Elder ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0); 145157cfc106SAlex Elder if (!ops) 145257cfc106SAlex Elder return -ENOMEM; 145359c2be1eSYehuda Sadeh 145459c2be1eSYehuda Sadeh ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0, 14550ce1a794SAlex Elder (void *)rbd_dev, &rbd_dev->watch_event); 145659c2be1eSYehuda Sadeh if (ret < 0) 145759c2be1eSYehuda Sadeh goto fail; 145859c2be1eSYehuda Sadeh 14590e6f322dSAlex Elder ops[0].watch.ver = cpu_to_le64(rbd_dev->header.obj_version); 14600ce1a794SAlex Elder ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie); 146159c2be1eSYehuda Sadeh ops[0].watch.flag = 1; 146259c2be1eSYehuda Sadeh 146325704ac9SAlex Elder ret = rbd_req_sync_op(rbd_dev, 146459c2be1eSYehuda Sadeh CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK, 1465d07c0958SAlex Elder 1, ops, 14660e6f322dSAlex Elder rbd_dev->header_name, 14670e6f322dSAlex Elder 0, 0, NULL, 14680ce1a794SAlex Elder &rbd_dev->watch_request, NULL); 146959c2be1eSYehuda Sadeh 147059c2be1eSYehuda Sadeh if (ret < 0) 147159c2be1eSYehuda Sadeh goto fail_event; 147259c2be1eSYehuda Sadeh 147359c2be1eSYehuda Sadeh rbd_destroy_ops(ops); 147459c2be1eSYehuda Sadeh return 0; 147559c2be1eSYehuda Sadeh 147659c2be1eSYehuda Sadeh fail_event: 14770ce1a794SAlex Elder ceph_osdc_cancel_event(rbd_dev->watch_event); 14780ce1a794SAlex Elder rbd_dev->watch_event = NULL; 147959c2be1eSYehuda Sadeh fail: 148059c2be1eSYehuda Sadeh rbd_destroy_ops(ops); 148159c2be1eSYehuda Sadeh return ret; 148259c2be1eSYehuda Sadeh } 148359c2be1eSYehuda Sadeh 148479e3057cSYehuda Sadeh /* 148579e3057cSYehuda Sadeh * Request sync osd unwatch 148679e3057cSYehuda Sadeh */ 1487070c633fSAlex Elder static int rbd_req_sync_unwatch(struct rbd_device *rbd_dev) 148879e3057cSYehuda Sadeh { 148979e3057cSYehuda Sadeh struct ceph_osd_req_op *ops; 149057cfc106SAlex Elder int ret; 149179e3057cSYehuda Sadeh 149257cfc106SAlex Elder ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0); 149357cfc106SAlex Elder if (!ops) 149457cfc106SAlex Elder return -ENOMEM; 149579e3057cSYehuda Sadeh 149679e3057cSYehuda Sadeh ops[0].watch.ver = 0; 14970ce1a794SAlex Elder ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie); 149879e3057cSYehuda Sadeh ops[0].watch.flag = 0; 149979e3057cSYehuda Sadeh 150025704ac9SAlex Elder ret = rbd_req_sync_op(rbd_dev, 150179e3057cSYehuda Sadeh CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK, 1502d07c0958SAlex Elder 1, ops, 1503070c633fSAlex Elder rbd_dev->header_name, 1504070c633fSAlex Elder 0, 0, NULL, NULL, NULL); 1505070c633fSAlex Elder 150679e3057cSYehuda Sadeh 150779e3057cSYehuda Sadeh rbd_destroy_ops(ops); 15080ce1a794SAlex Elder ceph_osdc_cancel_event(rbd_dev->watch_event); 15090ce1a794SAlex Elder rbd_dev->watch_event = NULL; 151079e3057cSYehuda Sadeh return ret; 151179e3057cSYehuda Sadeh } 151279e3057cSYehuda Sadeh 151359c2be1eSYehuda Sadeh /* 15143cb4a687SAlex Elder * Synchronous osd object method call 1515602adf40SYehuda Sadeh */ 15160ce1a794SAlex Elder static int rbd_req_sync_exec(struct rbd_device *rbd_dev, 1517aded07eaSAlex Elder const char *object_name, 1518aded07eaSAlex Elder const char *class_name, 1519aded07eaSAlex Elder const char *method_name, 15203cb4a687SAlex Elder const char *outbound, 15213cb4a687SAlex Elder size_t outbound_size, 1522f8d4de6eSAlex Elder char *inbound, 1523f8d4de6eSAlex Elder size_t inbound_size, 152459c2be1eSYehuda Sadeh u64 *ver) 1525602adf40SYehuda Sadeh { 1526602adf40SYehuda Sadeh struct ceph_osd_req_op *ops; 1527aded07eaSAlex Elder int class_name_len = strlen(class_name); 1528aded07eaSAlex Elder int method_name_len = strlen(method_name); 15293cb4a687SAlex Elder int payload_size; 153057cfc106SAlex Elder int ret; 153157cfc106SAlex Elder 15323cb4a687SAlex Elder /* 15333cb4a687SAlex Elder * Any input parameters required by the method we're calling 15343cb4a687SAlex Elder * will be sent along with the class and method names as 15353cb4a687SAlex Elder * part of the message payload. That data and its size are 15363cb4a687SAlex Elder * supplied via the indata and indata_len fields (named from 15373cb4a687SAlex Elder * the perspective of the server side) in the OSD request 15383cb4a687SAlex Elder * operation. 15393cb4a687SAlex Elder */ 15403cb4a687SAlex Elder payload_size = class_name_len + method_name_len + outbound_size; 15413cb4a687SAlex Elder ops = rbd_create_rw_ops(1, CEPH_OSD_OP_CALL, payload_size); 154257cfc106SAlex Elder if (!ops) 154357cfc106SAlex Elder return -ENOMEM; 1544602adf40SYehuda Sadeh 1545aded07eaSAlex Elder ops[0].cls.class_name = class_name; 1546aded07eaSAlex Elder ops[0].cls.class_len = (__u8) class_name_len; 1547aded07eaSAlex Elder ops[0].cls.method_name = method_name; 1548aded07eaSAlex Elder ops[0].cls.method_len = (__u8) method_name_len; 1549602adf40SYehuda Sadeh ops[0].cls.argc = 0; 15503cb4a687SAlex Elder ops[0].cls.indata = outbound; 15513cb4a687SAlex Elder ops[0].cls.indata_len = outbound_size; 1552602adf40SYehuda Sadeh 1553d07c0958SAlex Elder ret = rbd_req_sync_op(rbd_dev, CEPH_OSD_FLAG_READ, 1, ops, 1554f8d4de6eSAlex Elder object_name, 0, inbound_size, inbound, 1555f8d4de6eSAlex Elder NULL, ver); 1556602adf40SYehuda Sadeh 1557602adf40SYehuda Sadeh rbd_destroy_ops(ops); 1558602adf40SYehuda Sadeh 1559602adf40SYehuda Sadeh dout("cls_exec returned %d\n", ret); 1560602adf40SYehuda Sadeh return ret; 1561602adf40SYehuda Sadeh } 1562602adf40SYehuda Sadeh 15631fec7093SYehuda Sadeh static struct rbd_req_coll *rbd_alloc_coll(int num_reqs) 15641fec7093SYehuda Sadeh { 15651fec7093SYehuda Sadeh struct rbd_req_coll *coll = 15661fec7093SYehuda Sadeh kzalloc(sizeof(struct rbd_req_coll) + 15671fec7093SYehuda Sadeh sizeof(struct rbd_req_status) * num_reqs, 15681fec7093SYehuda Sadeh GFP_ATOMIC); 15691fec7093SYehuda Sadeh 15701fec7093SYehuda Sadeh if (!coll) 15711fec7093SYehuda Sadeh return NULL; 15721fec7093SYehuda Sadeh coll->total = num_reqs; 15731fec7093SYehuda Sadeh kref_init(&coll->kref); 15741fec7093SYehuda Sadeh return coll; 15751fec7093SYehuda Sadeh } 15761fec7093SYehuda Sadeh 15778295cda7SAlex Elder static int rbd_dev_do_request(struct request *rq, 15788295cda7SAlex Elder struct rbd_device *rbd_dev, 15798295cda7SAlex Elder struct ceph_snap_context *snapc, 15808295cda7SAlex Elder u64 ofs, unsigned int size, 15818295cda7SAlex Elder struct bio *bio_chain) 15828295cda7SAlex Elder { 15838295cda7SAlex Elder int num_segs; 15848295cda7SAlex Elder struct rbd_req_coll *coll; 15858295cda7SAlex Elder unsigned int bio_offset; 15868295cda7SAlex Elder int cur_seg = 0; 15878295cda7SAlex Elder 15888295cda7SAlex Elder dout("%s 0x%x bytes at 0x%llx\n", 15898295cda7SAlex Elder rq_data_dir(rq) == WRITE ? "write" : "read", 15908295cda7SAlex Elder size, (unsigned long long) blk_rq_pos(rq) * SECTOR_SIZE); 15918295cda7SAlex Elder 15928295cda7SAlex Elder num_segs = rbd_get_num_segments(&rbd_dev->header, ofs, size); 15938295cda7SAlex Elder if (num_segs <= 0) 15948295cda7SAlex Elder return num_segs; 15958295cda7SAlex Elder 15968295cda7SAlex Elder coll = rbd_alloc_coll(num_segs); 15978295cda7SAlex Elder if (!coll) 15988295cda7SAlex Elder return -ENOMEM; 15998295cda7SAlex Elder 16008295cda7SAlex Elder bio_offset = 0; 16018295cda7SAlex Elder do { 16028295cda7SAlex Elder u64 limit = rbd_segment_length(rbd_dev, ofs, size); 16038295cda7SAlex Elder unsigned int clone_size; 16048295cda7SAlex Elder struct bio *bio_clone; 16058295cda7SAlex Elder 16068295cda7SAlex Elder BUG_ON(limit > (u64)UINT_MAX); 16078295cda7SAlex Elder clone_size = (unsigned int)limit; 16088295cda7SAlex Elder dout("bio_chain->bi_vcnt=%hu\n", bio_chain->bi_vcnt); 16098295cda7SAlex Elder 16108295cda7SAlex Elder kref_get(&coll->kref); 16118295cda7SAlex Elder 16128295cda7SAlex Elder /* Pass a cloned bio chain via an osd request */ 16138295cda7SAlex Elder 16148295cda7SAlex Elder bio_clone = bio_chain_clone_range(&bio_chain, 16158295cda7SAlex Elder &bio_offset, clone_size, 16168295cda7SAlex Elder GFP_ATOMIC); 16178295cda7SAlex Elder if (bio_clone) 16188295cda7SAlex Elder (void)rbd_do_op(rq, rbd_dev, snapc, 16198295cda7SAlex Elder ofs, clone_size, 16208295cda7SAlex Elder bio_clone, coll, cur_seg); 16218295cda7SAlex Elder else 16228295cda7SAlex Elder rbd_coll_end_req_index(rq, coll, cur_seg, 16238295cda7SAlex Elder (s32)-ENOMEM, 16248295cda7SAlex Elder clone_size); 16258295cda7SAlex Elder size -= clone_size; 16268295cda7SAlex Elder ofs += clone_size; 16278295cda7SAlex Elder 16288295cda7SAlex Elder cur_seg++; 16298295cda7SAlex Elder } while (size > 0); 16308295cda7SAlex Elder kref_put(&coll->kref, rbd_coll_release); 16318295cda7SAlex Elder 16328295cda7SAlex Elder return 0; 16338295cda7SAlex Elder } 16348295cda7SAlex Elder 1635602adf40SYehuda Sadeh /* 1636602adf40SYehuda Sadeh * block device queue callback 1637602adf40SYehuda Sadeh */ 1638602adf40SYehuda Sadeh static void rbd_rq_fn(struct request_queue *q) 1639602adf40SYehuda Sadeh { 1640602adf40SYehuda Sadeh struct rbd_device *rbd_dev = q->queuedata; 1641b395e8b5SAlex Elder bool read_only = rbd_dev->mapping.read_only; 1642602adf40SYehuda Sadeh struct request *rq; 1643602adf40SYehuda Sadeh 164400f1f36fSAlex Elder while ((rq = blk_fetch_request(q))) { 1645b395e8b5SAlex Elder struct ceph_snap_context *snapc = NULL; 1646b395e8b5SAlex Elder unsigned int size = 0; 16478295cda7SAlex Elder int result; 1648602adf40SYehuda Sadeh 1649602adf40SYehuda Sadeh dout("fetched request\n"); 1650602adf40SYehuda Sadeh 1651b395e8b5SAlex Elder /* Filter out block requests we don't understand */ 1652b395e8b5SAlex Elder 1653602adf40SYehuda Sadeh if ((rq->cmd_type != REQ_TYPE_FS)) { 1654602adf40SYehuda Sadeh __blk_end_request_all(rq, 0); 165500f1f36fSAlex Elder continue; 1656602adf40SYehuda Sadeh } 1657602adf40SYehuda Sadeh spin_unlock_irq(q->queue_lock); 1658602adf40SYehuda Sadeh 1659a7b4c65fSAlex Elder /* Write requests need a reference to the snapshot context */ 1660e88a36ecSJosh Durgin 1661a7b4c65fSAlex Elder if (rq_data_dir(rq) == WRITE) { 1662b395e8b5SAlex Elder result = -EROFS; 1663a7b4c65fSAlex Elder if (read_only) /* Can't write to a read-only device */ 1664b395e8b5SAlex Elder goto out_end_request; 1665b395e8b5SAlex Elder 1666a7b4c65fSAlex Elder /* 1667a7b4c65fSAlex Elder * Note that each osd request will take its 1668a7b4c65fSAlex Elder * own reference to the snapshot context 1669a7b4c65fSAlex Elder * supplied. The reference we take here 1670a7b4c65fSAlex Elder * just guarantees the one we provide stays 1671a7b4c65fSAlex Elder * valid. 1672a7b4c65fSAlex Elder */ 1673b395e8b5SAlex Elder down_read(&rbd_dev->header_rwsem); 1674b395e8b5SAlex Elder snapc = ceph_get_snap_context(rbd_dev->header.snapc); 1675d1d25646SJosh Durgin up_read(&rbd_dev->header_rwsem); 1676a7b4c65fSAlex Elder rbd_assert(snapc != NULL); 1677a7b4c65fSAlex Elder } else if (!atomic_read(&rbd_dev->exists)) { 1678b395e8b5SAlex Elder rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP); 1679e88a36ecSJosh Durgin dout("request for non-existent snapshot"); 1680b395e8b5SAlex Elder result = -ENXIO; 1681b395e8b5SAlex Elder goto out_end_request; 1682e88a36ecSJosh Durgin } 1683d1d25646SJosh Durgin 1684f7760dadSAlex Elder size = blk_rq_bytes(rq); 1685b395e8b5SAlex Elder result = rbd_dev_do_request(rq, rbd_dev, snapc, 1686b395e8b5SAlex Elder blk_rq_pos(rq) * SECTOR_SIZE, 1687b395e8b5SAlex Elder size, rq->bio); 1688b395e8b5SAlex Elder out_end_request: 1689a7b4c65fSAlex Elder if (snapc) 1690df111be6SAlex Elder ceph_put_snap_context(snapc); 16911fec7093SYehuda Sadeh spin_lock_irq(q->queue_lock); 16928295cda7SAlex Elder if (!size || result < 0) 16938295cda7SAlex Elder __blk_end_request_all(rq, result); 1694602adf40SYehuda Sadeh } 1695602adf40SYehuda Sadeh } 1696602adf40SYehuda Sadeh 1697602adf40SYehuda Sadeh /* 1698602adf40SYehuda Sadeh * a queue callback. Makes sure that we don't create a bio that spans across 1699602adf40SYehuda Sadeh * multiple osd objects. One exception would be with a single page bios, 1700f7760dadSAlex Elder * which we handle later at bio_chain_clone_range() 1701602adf40SYehuda Sadeh */ 1702602adf40SYehuda Sadeh static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd, 1703602adf40SYehuda Sadeh struct bio_vec *bvec) 1704602adf40SYehuda Sadeh { 1705602adf40SYehuda Sadeh struct rbd_device *rbd_dev = q->queuedata; 1706e5cfeed2SAlex Elder sector_t sector_offset; 1707e5cfeed2SAlex Elder sector_t sectors_per_obj; 1708e5cfeed2SAlex Elder sector_t obj_sector_offset; 1709e5cfeed2SAlex Elder int ret; 1710602adf40SYehuda Sadeh 1711e5cfeed2SAlex Elder /* 1712e5cfeed2SAlex Elder * Find how far into its rbd object the partition-relative 1713e5cfeed2SAlex Elder * bio start sector is to offset relative to the enclosing 1714e5cfeed2SAlex Elder * device. 1715e5cfeed2SAlex Elder */ 1716e5cfeed2SAlex Elder sector_offset = get_start_sect(bmd->bi_bdev) + bmd->bi_sector; 1717e5cfeed2SAlex Elder sectors_per_obj = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT); 1718e5cfeed2SAlex Elder obj_sector_offset = sector_offset & (sectors_per_obj - 1); 1719593a9e7bSAlex Elder 1720e5cfeed2SAlex Elder /* 1721e5cfeed2SAlex Elder * Compute the number of bytes from that offset to the end 1722e5cfeed2SAlex Elder * of the object. Account for what's already used by the bio. 1723e5cfeed2SAlex Elder */ 1724e5cfeed2SAlex Elder ret = (int) (sectors_per_obj - obj_sector_offset) << SECTOR_SHIFT; 1725e5cfeed2SAlex Elder if (ret > bmd->bi_size) 1726e5cfeed2SAlex Elder ret -= bmd->bi_size; 1727e5cfeed2SAlex Elder else 1728e5cfeed2SAlex Elder ret = 0; 1729e5cfeed2SAlex Elder 1730e5cfeed2SAlex Elder /* 1731e5cfeed2SAlex Elder * Don't send back more than was asked for. And if the bio 1732e5cfeed2SAlex Elder * was empty, let the whole thing through because: "Note 1733e5cfeed2SAlex Elder * that a block device *must* allow a single page to be 1734e5cfeed2SAlex Elder * added to an empty bio." 1735e5cfeed2SAlex Elder */ 1736e5cfeed2SAlex Elder rbd_assert(bvec->bv_len <= PAGE_SIZE); 1737e5cfeed2SAlex Elder if (ret > (int) bvec->bv_len || !bmd->bi_size) 1738e5cfeed2SAlex Elder ret = (int) bvec->bv_len; 1739e5cfeed2SAlex Elder 1740e5cfeed2SAlex Elder return ret; 1741602adf40SYehuda Sadeh } 1742602adf40SYehuda Sadeh 1743602adf40SYehuda Sadeh static void rbd_free_disk(struct rbd_device *rbd_dev) 1744602adf40SYehuda Sadeh { 1745602adf40SYehuda Sadeh struct gendisk *disk = rbd_dev->disk; 1746602adf40SYehuda Sadeh 1747602adf40SYehuda Sadeh if (!disk) 1748602adf40SYehuda Sadeh return; 1749602adf40SYehuda Sadeh 1750602adf40SYehuda Sadeh if (disk->flags & GENHD_FL_UP) 1751602adf40SYehuda Sadeh del_gendisk(disk); 1752602adf40SYehuda Sadeh if (disk->queue) 1753602adf40SYehuda Sadeh blk_cleanup_queue(disk->queue); 1754602adf40SYehuda Sadeh put_disk(disk); 1755602adf40SYehuda Sadeh } 1756602adf40SYehuda Sadeh 1757602adf40SYehuda Sadeh /* 17584156d998SAlex Elder * Read the complete header for the given rbd device. 17594156d998SAlex Elder * 17604156d998SAlex Elder * Returns a pointer to a dynamically-allocated buffer containing 17614156d998SAlex Elder * the complete and validated header. Caller can pass the address 17624156d998SAlex Elder * of a variable that will be filled in with the version of the 17634156d998SAlex Elder * header object at the time it was read. 17644156d998SAlex Elder * 17654156d998SAlex Elder * Returns a pointer-coded errno if a failure occurs. 17664156d998SAlex Elder */ 17674156d998SAlex Elder static struct rbd_image_header_ondisk * 17684156d998SAlex Elder rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version) 17694156d998SAlex Elder { 17704156d998SAlex Elder struct rbd_image_header_ondisk *ondisk = NULL; 17714156d998SAlex Elder u32 snap_count = 0; 17724156d998SAlex Elder u64 names_size = 0; 17734156d998SAlex Elder u32 want_count; 17744156d998SAlex Elder int ret; 17754156d998SAlex Elder 17764156d998SAlex Elder /* 17774156d998SAlex Elder * The complete header will include an array of its 64-bit 17784156d998SAlex Elder * snapshot ids, followed by the names of those snapshots as 17794156d998SAlex Elder * a contiguous block of NUL-terminated strings. Note that 17804156d998SAlex Elder * the number of snapshots could change by the time we read 17814156d998SAlex Elder * it in, in which case we re-read it. 17824156d998SAlex Elder */ 17834156d998SAlex Elder do { 17844156d998SAlex Elder size_t size; 17854156d998SAlex Elder 17864156d998SAlex Elder kfree(ondisk); 17874156d998SAlex Elder 17884156d998SAlex Elder size = sizeof (*ondisk); 17894156d998SAlex Elder size += snap_count * sizeof (struct rbd_image_snap_ondisk); 17904156d998SAlex Elder size += names_size; 17914156d998SAlex Elder ondisk = kmalloc(size, GFP_KERNEL); 17924156d998SAlex Elder if (!ondisk) 17934156d998SAlex Elder return ERR_PTR(-ENOMEM); 17944156d998SAlex Elder 17954775618dSAlex Elder ret = rbd_req_sync_read(rbd_dev, rbd_dev->header_name, 17964156d998SAlex Elder 0, size, 17974156d998SAlex Elder (char *) ondisk, version); 17984156d998SAlex Elder 17994156d998SAlex Elder if (ret < 0) 18004156d998SAlex Elder goto out_err; 18014156d998SAlex Elder if (WARN_ON((size_t) ret < size)) { 18024156d998SAlex Elder ret = -ENXIO; 180306ecc6cbSAlex Elder rbd_warn(rbd_dev, "short header read (want %zd got %d)", 180406ecc6cbSAlex Elder size, ret); 18054156d998SAlex Elder goto out_err; 18064156d998SAlex Elder } 18074156d998SAlex Elder if (!rbd_dev_ondisk_valid(ondisk)) { 18084156d998SAlex Elder ret = -ENXIO; 180906ecc6cbSAlex Elder rbd_warn(rbd_dev, "invalid header"); 18104156d998SAlex Elder goto out_err; 18114156d998SAlex Elder } 18124156d998SAlex Elder 18134156d998SAlex Elder names_size = le64_to_cpu(ondisk->snap_names_len); 18144156d998SAlex Elder want_count = snap_count; 18154156d998SAlex Elder snap_count = le32_to_cpu(ondisk->snap_count); 18164156d998SAlex Elder } while (snap_count != want_count); 18174156d998SAlex Elder 18184156d998SAlex Elder return ondisk; 18194156d998SAlex Elder 18204156d998SAlex Elder out_err: 18214156d998SAlex Elder kfree(ondisk); 18224156d998SAlex Elder 18234156d998SAlex Elder return ERR_PTR(ret); 18244156d998SAlex Elder } 18254156d998SAlex Elder 18264156d998SAlex Elder /* 1827602adf40SYehuda Sadeh * reload the ondisk the header 1828602adf40SYehuda Sadeh */ 1829602adf40SYehuda Sadeh static int rbd_read_header(struct rbd_device *rbd_dev, 1830602adf40SYehuda Sadeh struct rbd_image_header *header) 1831602adf40SYehuda Sadeh { 18324156d998SAlex Elder struct rbd_image_header_ondisk *ondisk; 18334156d998SAlex Elder u64 ver = 0; 18344156d998SAlex Elder int ret; 1835602adf40SYehuda Sadeh 18364156d998SAlex Elder ondisk = rbd_dev_v1_header_read(rbd_dev, &ver); 18374156d998SAlex Elder if (IS_ERR(ondisk)) 18384156d998SAlex Elder return PTR_ERR(ondisk); 18394156d998SAlex Elder ret = rbd_header_from_disk(header, ondisk); 18404156d998SAlex Elder if (ret >= 0) 184159c2be1eSYehuda Sadeh header->obj_version = ver; 18424156d998SAlex Elder kfree(ondisk); 1843602adf40SYehuda Sadeh 18444156d998SAlex Elder return ret; 1845602adf40SYehuda Sadeh } 1846602adf40SYehuda Sadeh 184741f38c2bSAlex Elder static void rbd_remove_all_snaps(struct rbd_device *rbd_dev) 1848dfc5606dSYehuda Sadeh { 1849dfc5606dSYehuda Sadeh struct rbd_snap *snap; 1850a0593290SAlex Elder struct rbd_snap *next; 1851dfc5606dSYehuda Sadeh 1852a0593290SAlex Elder list_for_each_entry_safe(snap, next, &rbd_dev->snaps, node) 185341f38c2bSAlex Elder rbd_remove_snap_dev(snap); 1854dfc5606dSYehuda Sadeh } 1855dfc5606dSYehuda Sadeh 18569478554aSAlex Elder static void rbd_update_mapping_size(struct rbd_device *rbd_dev) 18579478554aSAlex Elder { 18589478554aSAlex Elder sector_t size; 18599478554aSAlex Elder 18600d7dbfceSAlex Elder if (rbd_dev->spec->snap_id != CEPH_NOSNAP) 18619478554aSAlex Elder return; 18629478554aSAlex Elder 18639478554aSAlex Elder size = (sector_t) rbd_dev->header.image_size / SECTOR_SIZE; 18649478554aSAlex Elder dout("setting size to %llu sectors", (unsigned long long) size); 18659478554aSAlex Elder rbd_dev->mapping.size = (u64) size; 18669478554aSAlex Elder set_capacity(rbd_dev->disk, size); 18679478554aSAlex Elder } 18689478554aSAlex Elder 1869602adf40SYehuda Sadeh /* 1870602adf40SYehuda Sadeh * only read the first part of the ondisk header, without the snaps info 1871602adf40SYehuda Sadeh */ 1872117973fbSAlex Elder static int rbd_dev_v1_refresh(struct rbd_device *rbd_dev, u64 *hver) 1873602adf40SYehuda Sadeh { 1874602adf40SYehuda Sadeh int ret; 1875602adf40SYehuda Sadeh struct rbd_image_header h; 1876602adf40SYehuda Sadeh 1877602adf40SYehuda Sadeh ret = rbd_read_header(rbd_dev, &h); 1878602adf40SYehuda Sadeh if (ret < 0) 1879602adf40SYehuda Sadeh return ret; 1880602adf40SYehuda Sadeh 1881a51aa0c0SJosh Durgin down_write(&rbd_dev->header_rwsem); 1882a51aa0c0SJosh Durgin 18839478554aSAlex Elder /* Update image size, and check for resize of mapped image */ 18849478554aSAlex Elder rbd_dev->header.image_size = h.image_size; 18859478554aSAlex Elder rbd_update_mapping_size(rbd_dev); 18869db4b3e3SSage Weil 1887849b4260SAlex Elder /* rbd_dev->header.object_prefix shouldn't change */ 1888602adf40SYehuda Sadeh kfree(rbd_dev->header.snap_sizes); 1889849b4260SAlex Elder kfree(rbd_dev->header.snap_names); 1890d1d25646SJosh Durgin /* osd requests may still refer to snapc */ 1891d1d25646SJosh Durgin ceph_put_snap_context(rbd_dev->header.snapc); 1892602adf40SYehuda Sadeh 1893b813623aSAlex Elder if (hver) 1894b813623aSAlex Elder *hver = h.obj_version; 1895a71b891bSJosh Durgin rbd_dev->header.obj_version = h.obj_version; 189693a24e08SJosh Durgin rbd_dev->header.image_size = h.image_size; 1897602adf40SYehuda Sadeh rbd_dev->header.snapc = h.snapc; 1898602adf40SYehuda Sadeh rbd_dev->header.snap_names = h.snap_names; 1899602adf40SYehuda Sadeh rbd_dev->header.snap_sizes = h.snap_sizes; 1900849b4260SAlex Elder /* Free the extra copy of the object prefix */ 1901849b4260SAlex Elder WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix)); 1902849b4260SAlex Elder kfree(h.object_prefix); 1903849b4260SAlex Elder 1904304f6808SAlex Elder ret = rbd_dev_snaps_update(rbd_dev); 1905304f6808SAlex Elder if (!ret) 1906304f6808SAlex Elder ret = rbd_dev_snaps_register(rbd_dev); 1907dfc5606dSYehuda Sadeh 1908c666601aSJosh Durgin up_write(&rbd_dev->header_rwsem); 1909602adf40SYehuda Sadeh 1910dfc5606dSYehuda Sadeh return ret; 1911602adf40SYehuda Sadeh } 1912602adf40SYehuda Sadeh 1913117973fbSAlex Elder static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver) 19141fe5e993SAlex Elder { 19151fe5e993SAlex Elder int ret; 19161fe5e993SAlex Elder 1917117973fbSAlex Elder rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); 19181fe5e993SAlex Elder mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); 1919117973fbSAlex Elder if (rbd_dev->image_format == 1) 1920117973fbSAlex Elder ret = rbd_dev_v1_refresh(rbd_dev, hver); 1921117973fbSAlex Elder else 1922117973fbSAlex Elder ret = rbd_dev_v2_refresh(rbd_dev, hver); 19231fe5e993SAlex Elder mutex_unlock(&ctl_mutex); 19241fe5e993SAlex Elder 19251fe5e993SAlex Elder return ret; 19261fe5e993SAlex Elder } 19271fe5e993SAlex Elder 1928602adf40SYehuda Sadeh static int rbd_init_disk(struct rbd_device *rbd_dev) 1929602adf40SYehuda Sadeh { 1930602adf40SYehuda Sadeh struct gendisk *disk; 1931602adf40SYehuda Sadeh struct request_queue *q; 1932593a9e7bSAlex Elder u64 segment_size; 1933602adf40SYehuda Sadeh 1934602adf40SYehuda Sadeh /* create gendisk info */ 1935602adf40SYehuda Sadeh disk = alloc_disk(RBD_MINORS_PER_MAJOR); 1936602adf40SYehuda Sadeh if (!disk) 19371fcdb8aaSAlex Elder return -ENOMEM; 1938602adf40SYehuda Sadeh 1939f0f8cef5SAlex Elder snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d", 1940de71a297SAlex Elder rbd_dev->dev_id); 1941602adf40SYehuda Sadeh disk->major = rbd_dev->major; 1942602adf40SYehuda Sadeh disk->first_minor = 0; 1943602adf40SYehuda Sadeh disk->fops = &rbd_bd_ops; 1944602adf40SYehuda Sadeh disk->private_data = rbd_dev; 1945602adf40SYehuda Sadeh 1946602adf40SYehuda Sadeh /* init rq */ 1947602adf40SYehuda Sadeh q = blk_init_queue(rbd_rq_fn, &rbd_dev->lock); 1948602adf40SYehuda Sadeh if (!q) 1949602adf40SYehuda Sadeh goto out_disk; 1950029bcbd8SJosh Durgin 1951593a9e7bSAlex Elder /* We use the default size, but let's be explicit about it. */ 1952593a9e7bSAlex Elder blk_queue_physical_block_size(q, SECTOR_SIZE); 1953593a9e7bSAlex Elder 1954029bcbd8SJosh Durgin /* set io sizes to object size */ 1955593a9e7bSAlex Elder segment_size = rbd_obj_bytes(&rbd_dev->header); 1956593a9e7bSAlex Elder blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE); 1957593a9e7bSAlex Elder blk_queue_max_segment_size(q, segment_size); 1958593a9e7bSAlex Elder blk_queue_io_min(q, segment_size); 1959593a9e7bSAlex Elder blk_queue_io_opt(q, segment_size); 1960029bcbd8SJosh Durgin 1961602adf40SYehuda Sadeh blk_queue_merge_bvec(q, rbd_merge_bvec); 1962602adf40SYehuda Sadeh disk->queue = q; 1963602adf40SYehuda Sadeh 1964602adf40SYehuda Sadeh q->queuedata = rbd_dev; 1965602adf40SYehuda Sadeh 1966602adf40SYehuda Sadeh rbd_dev->disk = disk; 1967602adf40SYehuda Sadeh 196812f02944SAlex Elder set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE); 196912f02944SAlex Elder 1970602adf40SYehuda Sadeh return 0; 1971602adf40SYehuda Sadeh out_disk: 1972602adf40SYehuda Sadeh put_disk(disk); 19731fcdb8aaSAlex Elder 19741fcdb8aaSAlex Elder return -ENOMEM; 1975602adf40SYehuda Sadeh } 1976602adf40SYehuda Sadeh 1977dfc5606dSYehuda Sadeh /* 1978dfc5606dSYehuda Sadeh sysfs 1979dfc5606dSYehuda Sadeh */ 1980602adf40SYehuda Sadeh 1981593a9e7bSAlex Elder static struct rbd_device *dev_to_rbd_dev(struct device *dev) 1982593a9e7bSAlex Elder { 1983593a9e7bSAlex Elder return container_of(dev, struct rbd_device, dev); 1984593a9e7bSAlex Elder } 1985593a9e7bSAlex Elder 1986dfc5606dSYehuda Sadeh static ssize_t rbd_size_show(struct device *dev, 1987dfc5606dSYehuda Sadeh struct device_attribute *attr, char *buf) 1988602adf40SYehuda Sadeh { 1989593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 1990a51aa0c0SJosh Durgin sector_t size; 1991dfc5606dSYehuda Sadeh 1992a51aa0c0SJosh Durgin down_read(&rbd_dev->header_rwsem); 1993a51aa0c0SJosh Durgin size = get_capacity(rbd_dev->disk); 1994a51aa0c0SJosh Durgin up_read(&rbd_dev->header_rwsem); 1995a51aa0c0SJosh Durgin 1996a51aa0c0SJosh Durgin return sprintf(buf, "%llu\n", (unsigned long long) size * SECTOR_SIZE); 1997602adf40SYehuda Sadeh } 1998602adf40SYehuda Sadeh 199934b13184SAlex Elder /* 200034b13184SAlex Elder * Note this shows the features for whatever's mapped, which is not 200134b13184SAlex Elder * necessarily the base image. 200234b13184SAlex Elder */ 200334b13184SAlex Elder static ssize_t rbd_features_show(struct device *dev, 200434b13184SAlex Elder struct device_attribute *attr, char *buf) 200534b13184SAlex Elder { 200634b13184SAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 200734b13184SAlex Elder 200834b13184SAlex Elder return sprintf(buf, "0x%016llx\n", 200934b13184SAlex Elder (unsigned long long) rbd_dev->mapping.features); 201034b13184SAlex Elder } 201134b13184SAlex Elder 2012dfc5606dSYehuda Sadeh static ssize_t rbd_major_show(struct device *dev, 2013dfc5606dSYehuda Sadeh struct device_attribute *attr, char *buf) 2014602adf40SYehuda Sadeh { 2015593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 2016dfc5606dSYehuda Sadeh 2017dfc5606dSYehuda Sadeh return sprintf(buf, "%d\n", rbd_dev->major); 2018dfc5606dSYehuda Sadeh } 2019dfc5606dSYehuda Sadeh 2020dfc5606dSYehuda Sadeh static ssize_t rbd_client_id_show(struct device *dev, 2021dfc5606dSYehuda Sadeh struct device_attribute *attr, char *buf) 2022dfc5606dSYehuda Sadeh { 2023593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 2024dfc5606dSYehuda Sadeh 20251dbb4399SAlex Elder return sprintf(buf, "client%lld\n", 20261dbb4399SAlex Elder ceph_client_id(rbd_dev->rbd_client->client)); 2027dfc5606dSYehuda Sadeh } 2028dfc5606dSYehuda Sadeh 2029dfc5606dSYehuda Sadeh static ssize_t rbd_pool_show(struct device *dev, 2030dfc5606dSYehuda Sadeh struct device_attribute *attr, char *buf) 2031dfc5606dSYehuda Sadeh { 2032593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 2033dfc5606dSYehuda Sadeh 20340d7dbfceSAlex Elder return sprintf(buf, "%s\n", rbd_dev->spec->pool_name); 2035dfc5606dSYehuda Sadeh } 2036dfc5606dSYehuda Sadeh 20379bb2f334SAlex Elder static ssize_t rbd_pool_id_show(struct device *dev, 20389bb2f334SAlex Elder struct device_attribute *attr, char *buf) 20399bb2f334SAlex Elder { 20409bb2f334SAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 20419bb2f334SAlex Elder 20420d7dbfceSAlex Elder return sprintf(buf, "%llu\n", 20430d7dbfceSAlex Elder (unsigned long long) rbd_dev->spec->pool_id); 20449bb2f334SAlex Elder } 20459bb2f334SAlex Elder 2046dfc5606dSYehuda Sadeh static ssize_t rbd_name_show(struct device *dev, 2047dfc5606dSYehuda Sadeh struct device_attribute *attr, char *buf) 2048dfc5606dSYehuda Sadeh { 2049593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 2050dfc5606dSYehuda Sadeh 2051a92ffdf8SAlex Elder if (rbd_dev->spec->image_name) 20520d7dbfceSAlex Elder return sprintf(buf, "%s\n", rbd_dev->spec->image_name); 2053a92ffdf8SAlex Elder 2054a92ffdf8SAlex Elder return sprintf(buf, "(unknown)\n"); 2055dfc5606dSYehuda Sadeh } 2056dfc5606dSYehuda Sadeh 2057589d30e0SAlex Elder static ssize_t rbd_image_id_show(struct device *dev, 2058589d30e0SAlex Elder struct device_attribute *attr, char *buf) 2059589d30e0SAlex Elder { 2060589d30e0SAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 2061589d30e0SAlex Elder 20620d7dbfceSAlex Elder return sprintf(buf, "%s\n", rbd_dev->spec->image_id); 2063589d30e0SAlex Elder } 2064589d30e0SAlex Elder 206534b13184SAlex Elder /* 206634b13184SAlex Elder * Shows the name of the currently-mapped snapshot (or 206734b13184SAlex Elder * RBD_SNAP_HEAD_NAME for the base image). 206834b13184SAlex Elder */ 2069dfc5606dSYehuda Sadeh static ssize_t rbd_snap_show(struct device *dev, 2070dfc5606dSYehuda Sadeh struct device_attribute *attr, 2071dfc5606dSYehuda Sadeh char *buf) 2072dfc5606dSYehuda Sadeh { 2073593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 2074dfc5606dSYehuda Sadeh 20750d7dbfceSAlex Elder return sprintf(buf, "%s\n", rbd_dev->spec->snap_name); 2076dfc5606dSYehuda Sadeh } 2077dfc5606dSYehuda Sadeh 207886b00e0dSAlex Elder /* 207986b00e0dSAlex Elder * For an rbd v2 image, shows the pool id, image id, and snapshot id 208086b00e0dSAlex Elder * for the parent image. If there is no parent, simply shows 208186b00e0dSAlex Elder * "(no parent image)". 208286b00e0dSAlex Elder */ 208386b00e0dSAlex Elder static ssize_t rbd_parent_show(struct device *dev, 208486b00e0dSAlex Elder struct device_attribute *attr, 208586b00e0dSAlex Elder char *buf) 208686b00e0dSAlex Elder { 208786b00e0dSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 208886b00e0dSAlex Elder struct rbd_spec *spec = rbd_dev->parent_spec; 208986b00e0dSAlex Elder int count; 209086b00e0dSAlex Elder char *bufp = buf; 209186b00e0dSAlex Elder 209286b00e0dSAlex Elder if (!spec) 209386b00e0dSAlex Elder return sprintf(buf, "(no parent image)\n"); 209486b00e0dSAlex Elder 209586b00e0dSAlex Elder count = sprintf(bufp, "pool_id %llu\npool_name %s\n", 209686b00e0dSAlex Elder (unsigned long long) spec->pool_id, spec->pool_name); 209786b00e0dSAlex Elder if (count < 0) 209886b00e0dSAlex Elder return count; 209986b00e0dSAlex Elder bufp += count; 210086b00e0dSAlex Elder 210186b00e0dSAlex Elder count = sprintf(bufp, "image_id %s\nimage_name %s\n", spec->image_id, 210286b00e0dSAlex Elder spec->image_name ? spec->image_name : "(unknown)"); 210386b00e0dSAlex Elder if (count < 0) 210486b00e0dSAlex Elder return count; 210586b00e0dSAlex Elder bufp += count; 210686b00e0dSAlex Elder 210786b00e0dSAlex Elder count = sprintf(bufp, "snap_id %llu\nsnap_name %s\n", 210886b00e0dSAlex Elder (unsigned long long) spec->snap_id, spec->snap_name); 210986b00e0dSAlex Elder if (count < 0) 211086b00e0dSAlex Elder return count; 211186b00e0dSAlex Elder bufp += count; 211286b00e0dSAlex Elder 211386b00e0dSAlex Elder count = sprintf(bufp, "overlap %llu\n", rbd_dev->parent_overlap); 211486b00e0dSAlex Elder if (count < 0) 211586b00e0dSAlex Elder return count; 211686b00e0dSAlex Elder bufp += count; 211786b00e0dSAlex Elder 211886b00e0dSAlex Elder return (ssize_t) (bufp - buf); 211986b00e0dSAlex Elder } 212086b00e0dSAlex Elder 2121dfc5606dSYehuda Sadeh static ssize_t rbd_image_refresh(struct device *dev, 2122dfc5606dSYehuda Sadeh struct device_attribute *attr, 2123dfc5606dSYehuda Sadeh const char *buf, 2124dfc5606dSYehuda Sadeh size_t size) 2125dfc5606dSYehuda Sadeh { 2126593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 2127b813623aSAlex Elder int ret; 2128602adf40SYehuda Sadeh 2129117973fbSAlex Elder ret = rbd_dev_refresh(rbd_dev, NULL); 2130b813623aSAlex Elder 2131b813623aSAlex Elder return ret < 0 ? ret : size; 2132dfc5606dSYehuda Sadeh } 2133602adf40SYehuda Sadeh 2134dfc5606dSYehuda Sadeh static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL); 213534b13184SAlex Elder static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL); 2136dfc5606dSYehuda Sadeh static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL); 2137dfc5606dSYehuda Sadeh static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL); 2138dfc5606dSYehuda Sadeh static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL); 21399bb2f334SAlex Elder static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL); 2140dfc5606dSYehuda Sadeh static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL); 2141589d30e0SAlex Elder static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL); 2142dfc5606dSYehuda Sadeh static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh); 2143dfc5606dSYehuda Sadeh static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL); 214486b00e0dSAlex Elder static DEVICE_ATTR(parent, S_IRUGO, rbd_parent_show, NULL); 2145dfc5606dSYehuda Sadeh 2146dfc5606dSYehuda Sadeh static struct attribute *rbd_attrs[] = { 2147dfc5606dSYehuda Sadeh &dev_attr_size.attr, 214834b13184SAlex Elder &dev_attr_features.attr, 2149dfc5606dSYehuda Sadeh &dev_attr_major.attr, 2150dfc5606dSYehuda Sadeh &dev_attr_client_id.attr, 2151dfc5606dSYehuda Sadeh &dev_attr_pool.attr, 21529bb2f334SAlex Elder &dev_attr_pool_id.attr, 2153dfc5606dSYehuda Sadeh &dev_attr_name.attr, 2154589d30e0SAlex Elder &dev_attr_image_id.attr, 2155dfc5606dSYehuda Sadeh &dev_attr_current_snap.attr, 215686b00e0dSAlex Elder &dev_attr_parent.attr, 2157dfc5606dSYehuda Sadeh &dev_attr_refresh.attr, 2158dfc5606dSYehuda Sadeh NULL 2159dfc5606dSYehuda Sadeh }; 2160dfc5606dSYehuda Sadeh 2161dfc5606dSYehuda Sadeh static struct attribute_group rbd_attr_group = { 2162dfc5606dSYehuda Sadeh .attrs = rbd_attrs, 2163dfc5606dSYehuda Sadeh }; 2164dfc5606dSYehuda Sadeh 2165dfc5606dSYehuda Sadeh static const struct attribute_group *rbd_attr_groups[] = { 2166dfc5606dSYehuda Sadeh &rbd_attr_group, 2167dfc5606dSYehuda Sadeh NULL 2168dfc5606dSYehuda Sadeh }; 2169dfc5606dSYehuda Sadeh 2170dfc5606dSYehuda Sadeh static void rbd_sysfs_dev_release(struct device *dev) 2171dfc5606dSYehuda Sadeh { 2172dfc5606dSYehuda Sadeh } 2173dfc5606dSYehuda Sadeh 2174dfc5606dSYehuda Sadeh static struct device_type rbd_device_type = { 2175dfc5606dSYehuda Sadeh .name = "rbd", 2176dfc5606dSYehuda Sadeh .groups = rbd_attr_groups, 2177dfc5606dSYehuda Sadeh .release = rbd_sysfs_dev_release, 2178dfc5606dSYehuda Sadeh }; 2179dfc5606dSYehuda Sadeh 2180dfc5606dSYehuda Sadeh 2181dfc5606dSYehuda Sadeh /* 2182dfc5606dSYehuda Sadeh sysfs - snapshots 2183dfc5606dSYehuda Sadeh */ 2184dfc5606dSYehuda Sadeh 2185dfc5606dSYehuda Sadeh static ssize_t rbd_snap_size_show(struct device *dev, 2186dfc5606dSYehuda Sadeh struct device_attribute *attr, 2187dfc5606dSYehuda Sadeh char *buf) 2188dfc5606dSYehuda Sadeh { 2189dfc5606dSYehuda Sadeh struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev); 2190dfc5606dSYehuda Sadeh 21913591538fSJosh Durgin return sprintf(buf, "%llu\n", (unsigned long long)snap->size); 2192dfc5606dSYehuda Sadeh } 2193dfc5606dSYehuda Sadeh 2194dfc5606dSYehuda Sadeh static ssize_t rbd_snap_id_show(struct device *dev, 2195dfc5606dSYehuda Sadeh struct device_attribute *attr, 2196dfc5606dSYehuda Sadeh char *buf) 2197dfc5606dSYehuda Sadeh { 2198dfc5606dSYehuda Sadeh struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev); 2199dfc5606dSYehuda Sadeh 2200593a9e7bSAlex Elder return sprintf(buf, "%llu\n", (unsigned long long)snap->id); 2201dfc5606dSYehuda Sadeh } 2202dfc5606dSYehuda Sadeh 220334b13184SAlex Elder static ssize_t rbd_snap_features_show(struct device *dev, 220434b13184SAlex Elder struct device_attribute *attr, 220534b13184SAlex Elder char *buf) 220634b13184SAlex Elder { 220734b13184SAlex Elder struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev); 220834b13184SAlex Elder 220934b13184SAlex Elder return sprintf(buf, "0x%016llx\n", 221034b13184SAlex Elder (unsigned long long) snap->features); 221134b13184SAlex Elder } 221234b13184SAlex Elder 2213dfc5606dSYehuda Sadeh static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL); 2214dfc5606dSYehuda Sadeh static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL); 221534b13184SAlex Elder static DEVICE_ATTR(snap_features, S_IRUGO, rbd_snap_features_show, NULL); 2216dfc5606dSYehuda Sadeh 2217dfc5606dSYehuda Sadeh static struct attribute *rbd_snap_attrs[] = { 2218dfc5606dSYehuda Sadeh &dev_attr_snap_size.attr, 2219dfc5606dSYehuda Sadeh &dev_attr_snap_id.attr, 222034b13184SAlex Elder &dev_attr_snap_features.attr, 2221dfc5606dSYehuda Sadeh NULL, 2222dfc5606dSYehuda Sadeh }; 2223dfc5606dSYehuda Sadeh 2224dfc5606dSYehuda Sadeh static struct attribute_group rbd_snap_attr_group = { 2225dfc5606dSYehuda Sadeh .attrs = rbd_snap_attrs, 2226dfc5606dSYehuda Sadeh }; 2227dfc5606dSYehuda Sadeh 2228dfc5606dSYehuda Sadeh static void rbd_snap_dev_release(struct device *dev) 2229dfc5606dSYehuda Sadeh { 2230dfc5606dSYehuda Sadeh struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev); 2231dfc5606dSYehuda Sadeh kfree(snap->name); 2232dfc5606dSYehuda Sadeh kfree(snap); 2233dfc5606dSYehuda Sadeh } 2234dfc5606dSYehuda Sadeh 2235dfc5606dSYehuda Sadeh static const struct attribute_group *rbd_snap_attr_groups[] = { 2236dfc5606dSYehuda Sadeh &rbd_snap_attr_group, 2237dfc5606dSYehuda Sadeh NULL 2238dfc5606dSYehuda Sadeh }; 2239dfc5606dSYehuda Sadeh 2240dfc5606dSYehuda Sadeh static struct device_type rbd_snap_device_type = { 2241dfc5606dSYehuda Sadeh .groups = rbd_snap_attr_groups, 2242dfc5606dSYehuda Sadeh .release = rbd_snap_dev_release, 2243dfc5606dSYehuda Sadeh }; 2244dfc5606dSYehuda Sadeh 22458b8fb99cSAlex Elder static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec) 22468b8fb99cSAlex Elder { 22478b8fb99cSAlex Elder kref_get(&spec->kref); 22488b8fb99cSAlex Elder 22498b8fb99cSAlex Elder return spec; 22508b8fb99cSAlex Elder } 22518b8fb99cSAlex Elder 22528b8fb99cSAlex Elder static void rbd_spec_free(struct kref *kref); 22538b8fb99cSAlex Elder static void rbd_spec_put(struct rbd_spec *spec) 22548b8fb99cSAlex Elder { 22558b8fb99cSAlex Elder if (spec) 22568b8fb99cSAlex Elder kref_put(&spec->kref, rbd_spec_free); 22578b8fb99cSAlex Elder } 22588b8fb99cSAlex Elder 22598b8fb99cSAlex Elder static struct rbd_spec *rbd_spec_alloc(void) 22608b8fb99cSAlex Elder { 22618b8fb99cSAlex Elder struct rbd_spec *spec; 22628b8fb99cSAlex Elder 22638b8fb99cSAlex Elder spec = kzalloc(sizeof (*spec), GFP_KERNEL); 22648b8fb99cSAlex Elder if (!spec) 22658b8fb99cSAlex Elder return NULL; 22668b8fb99cSAlex Elder kref_init(&spec->kref); 22678b8fb99cSAlex Elder 22688b8fb99cSAlex Elder rbd_spec_put(rbd_spec_get(spec)); /* TEMPORARY */ 22698b8fb99cSAlex Elder 22708b8fb99cSAlex Elder return spec; 22718b8fb99cSAlex Elder } 22728b8fb99cSAlex Elder 22738b8fb99cSAlex Elder static void rbd_spec_free(struct kref *kref) 22748b8fb99cSAlex Elder { 22758b8fb99cSAlex Elder struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref); 22768b8fb99cSAlex Elder 22778b8fb99cSAlex Elder kfree(spec->pool_name); 22788b8fb99cSAlex Elder kfree(spec->image_id); 22798b8fb99cSAlex Elder kfree(spec->image_name); 22808b8fb99cSAlex Elder kfree(spec->snap_name); 22818b8fb99cSAlex Elder kfree(spec); 22828b8fb99cSAlex Elder } 22838b8fb99cSAlex Elder 2284c53d5893SAlex Elder struct rbd_device *rbd_dev_create(struct rbd_client *rbdc, 2285c53d5893SAlex Elder struct rbd_spec *spec) 2286c53d5893SAlex Elder { 2287c53d5893SAlex Elder struct rbd_device *rbd_dev; 2288c53d5893SAlex Elder 2289c53d5893SAlex Elder rbd_dev = kzalloc(sizeof (*rbd_dev), GFP_KERNEL); 2290c53d5893SAlex Elder if (!rbd_dev) 2291c53d5893SAlex Elder return NULL; 2292c53d5893SAlex Elder 2293c53d5893SAlex Elder spin_lock_init(&rbd_dev->lock); 2294d78b650aSAlex Elder atomic_set(&rbd_dev->exists, 0); 2295c53d5893SAlex Elder INIT_LIST_HEAD(&rbd_dev->node); 2296c53d5893SAlex Elder INIT_LIST_HEAD(&rbd_dev->snaps); 2297c53d5893SAlex Elder init_rwsem(&rbd_dev->header_rwsem); 2298c53d5893SAlex Elder 2299c53d5893SAlex Elder rbd_dev->spec = spec; 2300c53d5893SAlex Elder rbd_dev->rbd_client = rbdc; 2301c53d5893SAlex Elder 2302c53d5893SAlex Elder return rbd_dev; 2303c53d5893SAlex Elder } 2304c53d5893SAlex Elder 2305c53d5893SAlex Elder static void rbd_dev_destroy(struct rbd_device *rbd_dev) 2306c53d5893SAlex Elder { 230786b00e0dSAlex Elder rbd_spec_put(rbd_dev->parent_spec); 2308c53d5893SAlex Elder kfree(rbd_dev->header_name); 2309c53d5893SAlex Elder rbd_put_client(rbd_dev->rbd_client); 2310c53d5893SAlex Elder rbd_spec_put(rbd_dev->spec); 2311c53d5893SAlex Elder kfree(rbd_dev); 2312c53d5893SAlex Elder } 2313c53d5893SAlex Elder 2314304f6808SAlex Elder static bool rbd_snap_registered(struct rbd_snap *snap) 2315304f6808SAlex Elder { 2316304f6808SAlex Elder bool ret = snap->dev.type == &rbd_snap_device_type; 2317304f6808SAlex Elder bool reg = device_is_registered(&snap->dev); 2318304f6808SAlex Elder 2319304f6808SAlex Elder rbd_assert(!ret ^ reg); 2320304f6808SAlex Elder 2321304f6808SAlex Elder return ret; 2322304f6808SAlex Elder } 2323304f6808SAlex Elder 232441f38c2bSAlex Elder static void rbd_remove_snap_dev(struct rbd_snap *snap) 2325dfc5606dSYehuda Sadeh { 2326dfc5606dSYehuda Sadeh list_del(&snap->node); 2327304f6808SAlex Elder if (device_is_registered(&snap->dev)) 2328dfc5606dSYehuda Sadeh device_unregister(&snap->dev); 2329dfc5606dSYehuda Sadeh } 2330dfc5606dSYehuda Sadeh 233114e7085dSAlex Elder static int rbd_register_snap_dev(struct rbd_snap *snap, 2332dfc5606dSYehuda Sadeh struct device *parent) 2333dfc5606dSYehuda Sadeh { 2334dfc5606dSYehuda Sadeh struct device *dev = &snap->dev; 2335dfc5606dSYehuda Sadeh int ret; 2336dfc5606dSYehuda Sadeh 2337dfc5606dSYehuda Sadeh dev->type = &rbd_snap_device_type; 2338dfc5606dSYehuda Sadeh dev->parent = parent; 2339dfc5606dSYehuda Sadeh dev->release = rbd_snap_dev_release; 2340d4b125e9SAlex Elder dev_set_name(dev, "%s%s", RBD_SNAP_DEV_NAME_PREFIX, snap->name); 2341304f6808SAlex Elder dout("%s: registering device for snapshot %s\n", __func__, snap->name); 2342304f6808SAlex Elder 2343dfc5606dSYehuda Sadeh ret = device_register(dev); 2344dfc5606dSYehuda Sadeh 2345dfc5606dSYehuda Sadeh return ret; 2346dfc5606dSYehuda Sadeh } 2347dfc5606dSYehuda Sadeh 23484e891e0aSAlex Elder static struct rbd_snap *__rbd_add_snap_dev(struct rbd_device *rbd_dev, 2349c8d18425SAlex Elder const char *snap_name, 235034b13184SAlex Elder u64 snap_id, u64 snap_size, 235134b13184SAlex Elder u64 snap_features) 2352dfc5606dSYehuda Sadeh { 23534e891e0aSAlex Elder struct rbd_snap *snap; 2354dfc5606dSYehuda Sadeh int ret; 23554e891e0aSAlex Elder 23564e891e0aSAlex Elder snap = kzalloc(sizeof (*snap), GFP_KERNEL); 2357dfc5606dSYehuda Sadeh if (!snap) 23584e891e0aSAlex Elder return ERR_PTR(-ENOMEM); 23594e891e0aSAlex Elder 23604e891e0aSAlex Elder ret = -ENOMEM; 2361c8d18425SAlex Elder snap->name = kstrdup(snap_name, GFP_KERNEL); 23624e891e0aSAlex Elder if (!snap->name) 23634e891e0aSAlex Elder goto err; 23644e891e0aSAlex Elder 2365c8d18425SAlex Elder snap->id = snap_id; 2366c8d18425SAlex Elder snap->size = snap_size; 236734b13184SAlex Elder snap->features = snap_features; 23684e891e0aSAlex Elder 23694e891e0aSAlex Elder return snap; 23704e891e0aSAlex Elder 2371dfc5606dSYehuda Sadeh err: 2372dfc5606dSYehuda Sadeh kfree(snap->name); 2373dfc5606dSYehuda Sadeh kfree(snap); 23744e891e0aSAlex Elder 23754e891e0aSAlex Elder return ERR_PTR(ret); 2376dfc5606dSYehuda Sadeh } 2377dfc5606dSYehuda Sadeh 2378cd892126SAlex Elder static char *rbd_dev_v1_snap_info(struct rbd_device *rbd_dev, u32 which, 2379cd892126SAlex Elder u64 *snap_size, u64 *snap_features) 2380cd892126SAlex Elder { 2381cd892126SAlex Elder char *snap_name; 2382cd892126SAlex Elder 2383cd892126SAlex Elder rbd_assert(which < rbd_dev->header.snapc->num_snaps); 2384cd892126SAlex Elder 2385cd892126SAlex Elder *snap_size = rbd_dev->header.snap_sizes[which]; 2386cd892126SAlex Elder *snap_features = 0; /* No features for v1 */ 2387cd892126SAlex Elder 2388cd892126SAlex Elder /* Skip over names until we find the one we are looking for */ 2389cd892126SAlex Elder 2390cd892126SAlex Elder snap_name = rbd_dev->header.snap_names; 2391cd892126SAlex Elder while (which--) 2392cd892126SAlex Elder snap_name += strlen(snap_name) + 1; 2393cd892126SAlex Elder 2394cd892126SAlex Elder return snap_name; 2395cd892126SAlex Elder } 2396cd892126SAlex Elder 2397dfc5606dSYehuda Sadeh /* 23989d475de5SAlex Elder * Get the size and object order for an image snapshot, or if 23999d475de5SAlex Elder * snap_id is CEPH_NOSNAP, gets this information for the base 24009d475de5SAlex Elder * image. 24019d475de5SAlex Elder */ 24029d475de5SAlex Elder static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id, 24039d475de5SAlex Elder u8 *order, u64 *snap_size) 24049d475de5SAlex Elder { 24059d475de5SAlex Elder __le64 snapid = cpu_to_le64(snap_id); 24069d475de5SAlex Elder int ret; 24079d475de5SAlex Elder struct { 24089d475de5SAlex Elder u8 order; 24099d475de5SAlex Elder __le64 size; 24109d475de5SAlex Elder } __attribute__ ((packed)) size_buf = { 0 }; 24119d475de5SAlex Elder 24129d475de5SAlex Elder ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name, 24139d475de5SAlex Elder "rbd", "get_size", 24149d475de5SAlex Elder (char *) &snapid, sizeof (snapid), 241507b2391fSAlex Elder (char *) &size_buf, sizeof (size_buf), NULL); 24169d475de5SAlex Elder dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret); 24179d475de5SAlex Elder if (ret < 0) 24189d475de5SAlex Elder return ret; 24199d475de5SAlex Elder 24209d475de5SAlex Elder *order = size_buf.order; 24219d475de5SAlex Elder *snap_size = le64_to_cpu(size_buf.size); 24229d475de5SAlex Elder 24239d475de5SAlex Elder dout(" snap_id 0x%016llx order = %u, snap_size = %llu\n", 24249d475de5SAlex Elder (unsigned long long) snap_id, (unsigned int) *order, 24259d475de5SAlex Elder (unsigned long long) *snap_size); 24269d475de5SAlex Elder 24279d475de5SAlex Elder return 0; 24289d475de5SAlex Elder } 24299d475de5SAlex Elder 24309d475de5SAlex Elder static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev) 24319d475de5SAlex Elder { 24329d475de5SAlex Elder return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP, 24339d475de5SAlex Elder &rbd_dev->header.obj_order, 24349d475de5SAlex Elder &rbd_dev->header.image_size); 24359d475de5SAlex Elder } 24369d475de5SAlex Elder 24371e130199SAlex Elder static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev) 24381e130199SAlex Elder { 24391e130199SAlex Elder void *reply_buf; 24401e130199SAlex Elder int ret; 24411e130199SAlex Elder void *p; 24421e130199SAlex Elder 24431e130199SAlex Elder reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL); 24441e130199SAlex Elder if (!reply_buf) 24451e130199SAlex Elder return -ENOMEM; 24461e130199SAlex Elder 24471e130199SAlex Elder ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name, 24481e130199SAlex Elder "rbd", "get_object_prefix", 24491e130199SAlex Elder NULL, 0, 245007b2391fSAlex Elder reply_buf, RBD_OBJ_PREFIX_LEN_MAX, NULL); 24511e130199SAlex Elder dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret); 24521e130199SAlex Elder if (ret < 0) 24531e130199SAlex Elder goto out; 2454a0ea3a40SAlex Elder ret = 0; /* rbd_req_sync_exec() can return positive */ 24551e130199SAlex Elder 24561e130199SAlex Elder p = reply_buf; 24571e130199SAlex Elder rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p, 24581e130199SAlex Elder p + RBD_OBJ_PREFIX_LEN_MAX, 24591e130199SAlex Elder NULL, GFP_NOIO); 24601e130199SAlex Elder 24611e130199SAlex Elder if (IS_ERR(rbd_dev->header.object_prefix)) { 24621e130199SAlex Elder ret = PTR_ERR(rbd_dev->header.object_prefix); 24631e130199SAlex Elder rbd_dev->header.object_prefix = NULL; 24641e130199SAlex Elder } else { 24651e130199SAlex Elder dout(" object_prefix = %s\n", rbd_dev->header.object_prefix); 24661e130199SAlex Elder } 24671e130199SAlex Elder 24681e130199SAlex Elder out: 24691e130199SAlex Elder kfree(reply_buf); 24701e130199SAlex Elder 24711e130199SAlex Elder return ret; 24721e130199SAlex Elder } 24731e130199SAlex Elder 2474b1b5402aSAlex Elder static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id, 2475b1b5402aSAlex Elder u64 *snap_features) 2476b1b5402aSAlex Elder { 2477b1b5402aSAlex Elder __le64 snapid = cpu_to_le64(snap_id); 2478b1b5402aSAlex Elder struct { 2479b1b5402aSAlex Elder __le64 features; 2480b1b5402aSAlex Elder __le64 incompat; 2481b1b5402aSAlex Elder } features_buf = { 0 }; 2482d889140cSAlex Elder u64 incompat; 2483b1b5402aSAlex Elder int ret; 2484b1b5402aSAlex Elder 2485b1b5402aSAlex Elder ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name, 2486b1b5402aSAlex Elder "rbd", "get_features", 2487b1b5402aSAlex Elder (char *) &snapid, sizeof (snapid), 2488b1b5402aSAlex Elder (char *) &features_buf, sizeof (features_buf), 248907b2391fSAlex Elder NULL); 2490b1b5402aSAlex Elder dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret); 2491b1b5402aSAlex Elder if (ret < 0) 2492b1b5402aSAlex Elder return ret; 2493d889140cSAlex Elder 2494d889140cSAlex Elder incompat = le64_to_cpu(features_buf.incompat); 2495d889140cSAlex Elder if (incompat & ~RBD_FEATURES_ALL) 2496b8f5c6edSAlex Elder return -ENXIO; 2497d889140cSAlex Elder 2498b1b5402aSAlex Elder *snap_features = le64_to_cpu(features_buf.features); 2499b1b5402aSAlex Elder 2500b1b5402aSAlex Elder dout(" snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n", 2501b1b5402aSAlex Elder (unsigned long long) snap_id, 2502b1b5402aSAlex Elder (unsigned long long) *snap_features, 2503b1b5402aSAlex Elder (unsigned long long) le64_to_cpu(features_buf.incompat)); 2504b1b5402aSAlex Elder 2505b1b5402aSAlex Elder return 0; 2506b1b5402aSAlex Elder } 2507b1b5402aSAlex Elder 2508b1b5402aSAlex Elder static int rbd_dev_v2_features(struct rbd_device *rbd_dev) 2509b1b5402aSAlex Elder { 2510b1b5402aSAlex Elder return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP, 2511b1b5402aSAlex Elder &rbd_dev->header.features); 2512b1b5402aSAlex Elder } 2513b1b5402aSAlex Elder 251486b00e0dSAlex Elder static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev) 251586b00e0dSAlex Elder { 251686b00e0dSAlex Elder struct rbd_spec *parent_spec; 251786b00e0dSAlex Elder size_t size; 251886b00e0dSAlex Elder void *reply_buf = NULL; 251986b00e0dSAlex Elder __le64 snapid; 252086b00e0dSAlex Elder void *p; 252186b00e0dSAlex Elder void *end; 252286b00e0dSAlex Elder char *image_id; 252386b00e0dSAlex Elder u64 overlap; 252486b00e0dSAlex Elder int ret; 252586b00e0dSAlex Elder 252686b00e0dSAlex Elder parent_spec = rbd_spec_alloc(); 252786b00e0dSAlex Elder if (!parent_spec) 252886b00e0dSAlex Elder return -ENOMEM; 252986b00e0dSAlex Elder 253086b00e0dSAlex Elder size = sizeof (__le64) + /* pool_id */ 253186b00e0dSAlex Elder sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX + /* image_id */ 253286b00e0dSAlex Elder sizeof (__le64) + /* snap_id */ 253386b00e0dSAlex Elder sizeof (__le64); /* overlap */ 253486b00e0dSAlex Elder reply_buf = kmalloc(size, GFP_KERNEL); 253586b00e0dSAlex Elder if (!reply_buf) { 253686b00e0dSAlex Elder ret = -ENOMEM; 253786b00e0dSAlex Elder goto out_err; 253886b00e0dSAlex Elder } 253986b00e0dSAlex Elder 254086b00e0dSAlex Elder snapid = cpu_to_le64(CEPH_NOSNAP); 254186b00e0dSAlex Elder ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name, 254286b00e0dSAlex Elder "rbd", "get_parent", 254386b00e0dSAlex Elder (char *) &snapid, sizeof (snapid), 254407b2391fSAlex Elder (char *) reply_buf, size, NULL); 254586b00e0dSAlex Elder dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret); 254686b00e0dSAlex Elder if (ret < 0) 254786b00e0dSAlex Elder goto out_err; 254886b00e0dSAlex Elder 254986b00e0dSAlex Elder ret = -ERANGE; 255086b00e0dSAlex Elder p = reply_buf; 255186b00e0dSAlex Elder end = (char *) reply_buf + size; 255286b00e0dSAlex Elder ceph_decode_64_safe(&p, end, parent_spec->pool_id, out_err); 255386b00e0dSAlex Elder if (parent_spec->pool_id == CEPH_NOPOOL) 255486b00e0dSAlex Elder goto out; /* No parent? No problem. */ 255586b00e0dSAlex Elder 2556979ed480SAlex Elder image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL); 255786b00e0dSAlex Elder if (IS_ERR(image_id)) { 255886b00e0dSAlex Elder ret = PTR_ERR(image_id); 255986b00e0dSAlex Elder goto out_err; 256086b00e0dSAlex Elder } 256186b00e0dSAlex Elder parent_spec->image_id = image_id; 256286b00e0dSAlex Elder ceph_decode_64_safe(&p, end, parent_spec->snap_id, out_err); 256386b00e0dSAlex Elder ceph_decode_64_safe(&p, end, overlap, out_err); 256486b00e0dSAlex Elder 256586b00e0dSAlex Elder rbd_dev->parent_overlap = overlap; 256686b00e0dSAlex Elder rbd_dev->parent_spec = parent_spec; 256786b00e0dSAlex Elder parent_spec = NULL; /* rbd_dev now owns this */ 256886b00e0dSAlex Elder out: 256986b00e0dSAlex Elder ret = 0; 257086b00e0dSAlex Elder out_err: 257186b00e0dSAlex Elder kfree(reply_buf); 257286b00e0dSAlex Elder rbd_spec_put(parent_spec); 257386b00e0dSAlex Elder 257486b00e0dSAlex Elder return ret; 257586b00e0dSAlex Elder } 257686b00e0dSAlex Elder 25779e15b77dSAlex Elder static char *rbd_dev_image_name(struct rbd_device *rbd_dev) 25789e15b77dSAlex Elder { 25799e15b77dSAlex Elder size_t image_id_size; 25809e15b77dSAlex Elder char *image_id; 25819e15b77dSAlex Elder void *p; 25829e15b77dSAlex Elder void *end; 25839e15b77dSAlex Elder size_t size; 25849e15b77dSAlex Elder void *reply_buf = NULL; 25859e15b77dSAlex Elder size_t len = 0; 25869e15b77dSAlex Elder char *image_name = NULL; 25879e15b77dSAlex Elder int ret; 25889e15b77dSAlex Elder 25899e15b77dSAlex Elder rbd_assert(!rbd_dev->spec->image_name); 25909e15b77dSAlex Elder 259169e7a02fSAlex Elder len = strlen(rbd_dev->spec->image_id); 259269e7a02fSAlex Elder image_id_size = sizeof (__le32) + len; 25939e15b77dSAlex Elder image_id = kmalloc(image_id_size, GFP_KERNEL); 25949e15b77dSAlex Elder if (!image_id) 25959e15b77dSAlex Elder return NULL; 25969e15b77dSAlex Elder 25979e15b77dSAlex Elder p = image_id; 25989e15b77dSAlex Elder end = (char *) image_id + image_id_size; 259969e7a02fSAlex Elder ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32) len); 26009e15b77dSAlex Elder 26019e15b77dSAlex Elder size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX; 26029e15b77dSAlex Elder reply_buf = kmalloc(size, GFP_KERNEL); 26039e15b77dSAlex Elder if (!reply_buf) 26049e15b77dSAlex Elder goto out; 26059e15b77dSAlex Elder 26069e15b77dSAlex Elder ret = rbd_req_sync_exec(rbd_dev, RBD_DIRECTORY, 26079e15b77dSAlex Elder "rbd", "dir_get_name", 26089e15b77dSAlex Elder image_id, image_id_size, 260907b2391fSAlex Elder (char *) reply_buf, size, NULL); 26109e15b77dSAlex Elder if (ret < 0) 26119e15b77dSAlex Elder goto out; 26129e15b77dSAlex Elder p = reply_buf; 26139e15b77dSAlex Elder end = (char *) reply_buf + size; 26149e15b77dSAlex Elder image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL); 26159e15b77dSAlex Elder if (IS_ERR(image_name)) 26169e15b77dSAlex Elder image_name = NULL; 26179e15b77dSAlex Elder else 26189e15b77dSAlex Elder dout("%s: name is %s len is %zd\n", __func__, image_name, len); 26199e15b77dSAlex Elder out: 26209e15b77dSAlex Elder kfree(reply_buf); 26219e15b77dSAlex Elder kfree(image_id); 26229e15b77dSAlex Elder 26239e15b77dSAlex Elder return image_name; 26249e15b77dSAlex Elder } 26259e15b77dSAlex Elder 26269e15b77dSAlex Elder /* 26279e15b77dSAlex Elder * When a parent image gets probed, we only have the pool, image, 26289e15b77dSAlex Elder * and snapshot ids but not the names of any of them. This call 26299e15b77dSAlex Elder * is made later to fill in those names. It has to be done after 26309e15b77dSAlex Elder * rbd_dev_snaps_update() has completed because some of the 26319e15b77dSAlex Elder * information (in particular, snapshot name) is not available 26329e15b77dSAlex Elder * until then. 26339e15b77dSAlex Elder */ 26349e15b77dSAlex Elder static int rbd_dev_probe_update_spec(struct rbd_device *rbd_dev) 26359e15b77dSAlex Elder { 26369e15b77dSAlex Elder struct ceph_osd_client *osdc; 26379e15b77dSAlex Elder const char *name; 26389e15b77dSAlex Elder void *reply_buf = NULL; 26399e15b77dSAlex Elder int ret; 26409e15b77dSAlex Elder 26419e15b77dSAlex Elder if (rbd_dev->spec->pool_name) 26429e15b77dSAlex Elder return 0; /* Already have the names */ 26439e15b77dSAlex Elder 26449e15b77dSAlex Elder /* Look up the pool name */ 26459e15b77dSAlex Elder 26469e15b77dSAlex Elder osdc = &rbd_dev->rbd_client->client->osdc; 26479e15b77dSAlex Elder name = ceph_pg_pool_name_by_id(osdc->osdmap, rbd_dev->spec->pool_id); 2648935dc89fSAlex Elder if (!name) { 2649935dc89fSAlex Elder rbd_warn(rbd_dev, "there is no pool with id %llu", 2650935dc89fSAlex Elder rbd_dev->spec->pool_id); /* Really a BUG() */ 2651935dc89fSAlex Elder return -EIO; 2652935dc89fSAlex Elder } 26539e15b77dSAlex Elder 26549e15b77dSAlex Elder rbd_dev->spec->pool_name = kstrdup(name, GFP_KERNEL); 26559e15b77dSAlex Elder if (!rbd_dev->spec->pool_name) 26569e15b77dSAlex Elder return -ENOMEM; 26579e15b77dSAlex Elder 26589e15b77dSAlex Elder /* Fetch the image name; tolerate failure here */ 26599e15b77dSAlex Elder 26609e15b77dSAlex Elder name = rbd_dev_image_name(rbd_dev); 266169e7a02fSAlex Elder if (name) 26629e15b77dSAlex Elder rbd_dev->spec->image_name = (char *) name; 266369e7a02fSAlex Elder else 266406ecc6cbSAlex Elder rbd_warn(rbd_dev, "unable to get image name"); 26659e15b77dSAlex Elder 26669e15b77dSAlex Elder /* Look up the snapshot name. */ 26679e15b77dSAlex Elder 26689e15b77dSAlex Elder name = rbd_snap_name(rbd_dev, rbd_dev->spec->snap_id); 26699e15b77dSAlex Elder if (!name) { 2670935dc89fSAlex Elder rbd_warn(rbd_dev, "no snapshot with id %llu", 2671935dc89fSAlex Elder rbd_dev->spec->snap_id); /* Really a BUG() */ 26729e15b77dSAlex Elder ret = -EIO; 26739e15b77dSAlex Elder goto out_err; 26749e15b77dSAlex Elder } 26759e15b77dSAlex Elder rbd_dev->spec->snap_name = kstrdup(name, GFP_KERNEL); 26769e15b77dSAlex Elder if(!rbd_dev->spec->snap_name) 26779e15b77dSAlex Elder goto out_err; 26789e15b77dSAlex Elder 26799e15b77dSAlex Elder return 0; 26809e15b77dSAlex Elder out_err: 26819e15b77dSAlex Elder kfree(reply_buf); 26829e15b77dSAlex Elder kfree(rbd_dev->spec->pool_name); 26839e15b77dSAlex Elder rbd_dev->spec->pool_name = NULL; 26849e15b77dSAlex Elder 26859e15b77dSAlex Elder return ret; 26869e15b77dSAlex Elder } 26879e15b77dSAlex Elder 26886e14b1a6SAlex Elder static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev, u64 *ver) 268935d489f9SAlex Elder { 269035d489f9SAlex Elder size_t size; 269135d489f9SAlex Elder int ret; 269235d489f9SAlex Elder void *reply_buf; 269335d489f9SAlex Elder void *p; 269435d489f9SAlex Elder void *end; 269535d489f9SAlex Elder u64 seq; 269635d489f9SAlex Elder u32 snap_count; 269735d489f9SAlex Elder struct ceph_snap_context *snapc; 269835d489f9SAlex Elder u32 i; 269935d489f9SAlex Elder 270035d489f9SAlex Elder /* 270135d489f9SAlex Elder * We'll need room for the seq value (maximum snapshot id), 270235d489f9SAlex Elder * snapshot count, and array of that many snapshot ids. 270335d489f9SAlex Elder * For now we have a fixed upper limit on the number we're 270435d489f9SAlex Elder * prepared to receive. 270535d489f9SAlex Elder */ 270635d489f9SAlex Elder size = sizeof (__le64) + sizeof (__le32) + 270735d489f9SAlex Elder RBD_MAX_SNAP_COUNT * sizeof (__le64); 270835d489f9SAlex Elder reply_buf = kzalloc(size, GFP_KERNEL); 270935d489f9SAlex Elder if (!reply_buf) 271035d489f9SAlex Elder return -ENOMEM; 271135d489f9SAlex Elder 271235d489f9SAlex Elder ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name, 271335d489f9SAlex Elder "rbd", "get_snapcontext", 271435d489f9SAlex Elder NULL, 0, 271507b2391fSAlex Elder reply_buf, size, ver); 271635d489f9SAlex Elder dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret); 271735d489f9SAlex Elder if (ret < 0) 271835d489f9SAlex Elder goto out; 271935d489f9SAlex Elder 272035d489f9SAlex Elder ret = -ERANGE; 272135d489f9SAlex Elder p = reply_buf; 272235d489f9SAlex Elder end = (char *) reply_buf + size; 272335d489f9SAlex Elder ceph_decode_64_safe(&p, end, seq, out); 272435d489f9SAlex Elder ceph_decode_32_safe(&p, end, snap_count, out); 272535d489f9SAlex Elder 272635d489f9SAlex Elder /* 272735d489f9SAlex Elder * Make sure the reported number of snapshot ids wouldn't go 272835d489f9SAlex Elder * beyond the end of our buffer. But before checking that, 272935d489f9SAlex Elder * make sure the computed size of the snapshot context we 273035d489f9SAlex Elder * allocate is representable in a size_t. 273135d489f9SAlex Elder */ 273235d489f9SAlex Elder if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context)) 273335d489f9SAlex Elder / sizeof (u64)) { 273435d489f9SAlex Elder ret = -EINVAL; 273535d489f9SAlex Elder goto out; 273635d489f9SAlex Elder } 273735d489f9SAlex Elder if (!ceph_has_room(&p, end, snap_count * sizeof (__le64))) 273835d489f9SAlex Elder goto out; 273935d489f9SAlex Elder 274035d489f9SAlex Elder size = sizeof (struct ceph_snap_context) + 274135d489f9SAlex Elder snap_count * sizeof (snapc->snaps[0]); 274235d489f9SAlex Elder snapc = kmalloc(size, GFP_KERNEL); 274335d489f9SAlex Elder if (!snapc) { 274435d489f9SAlex Elder ret = -ENOMEM; 274535d489f9SAlex Elder goto out; 274635d489f9SAlex Elder } 274735d489f9SAlex Elder 274835d489f9SAlex Elder atomic_set(&snapc->nref, 1); 274935d489f9SAlex Elder snapc->seq = seq; 275035d489f9SAlex Elder snapc->num_snaps = snap_count; 275135d489f9SAlex Elder for (i = 0; i < snap_count; i++) 275235d489f9SAlex Elder snapc->snaps[i] = ceph_decode_64(&p); 275335d489f9SAlex Elder 275435d489f9SAlex Elder rbd_dev->header.snapc = snapc; 275535d489f9SAlex Elder 275635d489f9SAlex Elder dout(" snap context seq = %llu, snap_count = %u\n", 275735d489f9SAlex Elder (unsigned long long) seq, (unsigned int) snap_count); 275835d489f9SAlex Elder 275935d489f9SAlex Elder out: 276035d489f9SAlex Elder kfree(reply_buf); 276135d489f9SAlex Elder 276235d489f9SAlex Elder return 0; 276335d489f9SAlex Elder } 276435d489f9SAlex Elder 2765b8b1e2dbSAlex Elder static char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, u32 which) 2766b8b1e2dbSAlex Elder { 2767b8b1e2dbSAlex Elder size_t size; 2768b8b1e2dbSAlex Elder void *reply_buf; 2769b8b1e2dbSAlex Elder __le64 snap_id; 2770b8b1e2dbSAlex Elder int ret; 2771b8b1e2dbSAlex Elder void *p; 2772b8b1e2dbSAlex Elder void *end; 2773b8b1e2dbSAlex Elder char *snap_name; 2774b8b1e2dbSAlex Elder 2775b8b1e2dbSAlex Elder size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN; 2776b8b1e2dbSAlex Elder reply_buf = kmalloc(size, GFP_KERNEL); 2777b8b1e2dbSAlex Elder if (!reply_buf) 2778b8b1e2dbSAlex Elder return ERR_PTR(-ENOMEM); 2779b8b1e2dbSAlex Elder 2780b8b1e2dbSAlex Elder snap_id = cpu_to_le64(rbd_dev->header.snapc->snaps[which]); 2781b8b1e2dbSAlex Elder ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name, 2782b8b1e2dbSAlex Elder "rbd", "get_snapshot_name", 2783b8b1e2dbSAlex Elder (char *) &snap_id, sizeof (snap_id), 278407b2391fSAlex Elder reply_buf, size, NULL); 2785b8b1e2dbSAlex Elder dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret); 2786b8b1e2dbSAlex Elder if (ret < 0) 2787b8b1e2dbSAlex Elder goto out; 2788b8b1e2dbSAlex Elder 2789b8b1e2dbSAlex Elder p = reply_buf; 2790b8b1e2dbSAlex Elder end = (char *) reply_buf + size; 2791e5c35534SAlex Elder snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL); 2792b8b1e2dbSAlex Elder if (IS_ERR(snap_name)) { 2793b8b1e2dbSAlex Elder ret = PTR_ERR(snap_name); 2794b8b1e2dbSAlex Elder goto out; 2795b8b1e2dbSAlex Elder } else { 2796b8b1e2dbSAlex Elder dout(" snap_id 0x%016llx snap_name = %s\n", 2797b8b1e2dbSAlex Elder (unsigned long long) le64_to_cpu(snap_id), snap_name); 2798b8b1e2dbSAlex Elder } 2799b8b1e2dbSAlex Elder kfree(reply_buf); 2800b8b1e2dbSAlex Elder 2801b8b1e2dbSAlex Elder return snap_name; 2802b8b1e2dbSAlex Elder out: 2803b8b1e2dbSAlex Elder kfree(reply_buf); 2804b8b1e2dbSAlex Elder 2805b8b1e2dbSAlex Elder return ERR_PTR(ret); 2806b8b1e2dbSAlex Elder } 2807b8b1e2dbSAlex Elder 2808b8b1e2dbSAlex Elder static char *rbd_dev_v2_snap_info(struct rbd_device *rbd_dev, u32 which, 2809b8b1e2dbSAlex Elder u64 *snap_size, u64 *snap_features) 2810b8b1e2dbSAlex Elder { 2811b8b1e2dbSAlex Elder __le64 snap_id; 2812b8b1e2dbSAlex Elder u8 order; 2813b8b1e2dbSAlex Elder int ret; 2814b8b1e2dbSAlex Elder 2815b8b1e2dbSAlex Elder snap_id = rbd_dev->header.snapc->snaps[which]; 2816b8b1e2dbSAlex Elder ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, &order, snap_size); 2817b8b1e2dbSAlex Elder if (ret) 2818b8b1e2dbSAlex Elder return ERR_PTR(ret); 2819b8b1e2dbSAlex Elder ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, snap_features); 2820b8b1e2dbSAlex Elder if (ret) 2821b8b1e2dbSAlex Elder return ERR_PTR(ret); 2822b8b1e2dbSAlex Elder 2823b8b1e2dbSAlex Elder return rbd_dev_v2_snap_name(rbd_dev, which); 2824b8b1e2dbSAlex Elder } 2825b8b1e2dbSAlex Elder 2826b8b1e2dbSAlex Elder static char *rbd_dev_snap_info(struct rbd_device *rbd_dev, u32 which, 2827b8b1e2dbSAlex Elder u64 *snap_size, u64 *snap_features) 2828b8b1e2dbSAlex Elder { 2829b8b1e2dbSAlex Elder if (rbd_dev->image_format == 1) 2830b8b1e2dbSAlex Elder return rbd_dev_v1_snap_info(rbd_dev, which, 2831b8b1e2dbSAlex Elder snap_size, snap_features); 2832b8b1e2dbSAlex Elder if (rbd_dev->image_format == 2) 2833b8b1e2dbSAlex Elder return rbd_dev_v2_snap_info(rbd_dev, which, 2834b8b1e2dbSAlex Elder snap_size, snap_features); 2835b8b1e2dbSAlex Elder return ERR_PTR(-EINVAL); 2836b8b1e2dbSAlex Elder } 2837b8b1e2dbSAlex Elder 2838117973fbSAlex Elder static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver) 2839117973fbSAlex Elder { 2840117973fbSAlex Elder int ret; 2841117973fbSAlex Elder __u8 obj_order; 2842117973fbSAlex Elder 2843117973fbSAlex Elder down_write(&rbd_dev->header_rwsem); 2844117973fbSAlex Elder 2845117973fbSAlex Elder /* Grab old order first, to see if it changes */ 2846117973fbSAlex Elder 2847117973fbSAlex Elder obj_order = rbd_dev->header.obj_order, 2848117973fbSAlex Elder ret = rbd_dev_v2_image_size(rbd_dev); 2849117973fbSAlex Elder if (ret) 2850117973fbSAlex Elder goto out; 2851117973fbSAlex Elder if (rbd_dev->header.obj_order != obj_order) { 2852117973fbSAlex Elder ret = -EIO; 2853117973fbSAlex Elder goto out; 2854117973fbSAlex Elder } 2855117973fbSAlex Elder rbd_update_mapping_size(rbd_dev); 2856117973fbSAlex Elder 2857117973fbSAlex Elder ret = rbd_dev_v2_snap_context(rbd_dev, hver); 2858117973fbSAlex Elder dout("rbd_dev_v2_snap_context returned %d\n", ret); 2859117973fbSAlex Elder if (ret) 2860117973fbSAlex Elder goto out; 2861117973fbSAlex Elder ret = rbd_dev_snaps_update(rbd_dev); 2862117973fbSAlex Elder dout("rbd_dev_snaps_update returned %d\n", ret); 2863117973fbSAlex Elder if (ret) 2864117973fbSAlex Elder goto out; 2865117973fbSAlex Elder ret = rbd_dev_snaps_register(rbd_dev); 2866117973fbSAlex Elder dout("rbd_dev_snaps_register returned %d\n", ret); 2867117973fbSAlex Elder out: 2868117973fbSAlex Elder up_write(&rbd_dev->header_rwsem); 2869117973fbSAlex Elder 2870117973fbSAlex Elder return ret; 2871117973fbSAlex Elder } 2872117973fbSAlex Elder 28739d475de5SAlex Elder /* 287435938150SAlex Elder * Scan the rbd device's current snapshot list and compare it to the 287535938150SAlex Elder * newly-received snapshot context. Remove any existing snapshots 287635938150SAlex Elder * not present in the new snapshot context. Add a new snapshot for 287735938150SAlex Elder * any snaphots in the snapshot context not in the current list. 287835938150SAlex Elder * And verify there are no changes to snapshots we already know 287935938150SAlex Elder * about. 288035938150SAlex Elder * 288135938150SAlex Elder * Assumes the snapshots in the snapshot context are sorted by 288235938150SAlex Elder * snapshot id, highest id first. (Snapshots in the rbd_dev's list 288335938150SAlex Elder * are also maintained in that order.) 2884dfc5606dSYehuda Sadeh */ 2885304f6808SAlex Elder static int rbd_dev_snaps_update(struct rbd_device *rbd_dev) 2886dfc5606dSYehuda Sadeh { 288735938150SAlex Elder struct ceph_snap_context *snapc = rbd_dev->header.snapc; 288835938150SAlex Elder const u32 snap_count = snapc->num_snaps; 288935938150SAlex Elder struct list_head *head = &rbd_dev->snaps; 289035938150SAlex Elder struct list_head *links = head->next; 289135938150SAlex Elder u32 index = 0; 2892dfc5606dSYehuda Sadeh 28939fcbb800SAlex Elder dout("%s: snap count is %u\n", __func__, (unsigned int) snap_count); 289435938150SAlex Elder while (index < snap_count || links != head) { 289535938150SAlex Elder u64 snap_id; 289635938150SAlex Elder struct rbd_snap *snap; 2897cd892126SAlex Elder char *snap_name; 2898cd892126SAlex Elder u64 snap_size = 0; 2899cd892126SAlex Elder u64 snap_features = 0; 2900dfc5606dSYehuda Sadeh 290135938150SAlex Elder snap_id = index < snap_count ? snapc->snaps[index] 290235938150SAlex Elder : CEPH_NOSNAP; 290335938150SAlex Elder snap = links != head ? list_entry(links, struct rbd_snap, node) 290435938150SAlex Elder : NULL; 2905aafb230eSAlex Elder rbd_assert(!snap || snap->id != CEPH_NOSNAP); 2906dfc5606dSYehuda Sadeh 290735938150SAlex Elder if (snap_id == CEPH_NOSNAP || (snap && snap->id > snap_id)) { 290835938150SAlex Elder struct list_head *next = links->next; 2909dfc5606dSYehuda Sadeh 291035938150SAlex Elder /* Existing snapshot not in the new snap context */ 2911dfc5606dSYehuda Sadeh 29120d7dbfceSAlex Elder if (rbd_dev->spec->snap_id == snap->id) 2913d78b650aSAlex Elder atomic_set(&rbd_dev->exists, 0); 291441f38c2bSAlex Elder rbd_remove_snap_dev(snap); 29159fcbb800SAlex Elder dout("%ssnap id %llu has been removed\n", 29160d7dbfceSAlex Elder rbd_dev->spec->snap_id == snap->id ? 29170d7dbfceSAlex Elder "mapped " : "", 29189fcbb800SAlex Elder (unsigned long long) snap->id); 2919dfc5606dSYehuda Sadeh 292035938150SAlex Elder /* Done with this list entry; advance */ 292135938150SAlex Elder 292235938150SAlex Elder links = next; 292335938150SAlex Elder continue; 2924dfc5606dSYehuda Sadeh } 292535938150SAlex Elder 2926b8b1e2dbSAlex Elder snap_name = rbd_dev_snap_info(rbd_dev, index, 2927cd892126SAlex Elder &snap_size, &snap_features); 2928cd892126SAlex Elder if (IS_ERR(snap_name)) 2929cd892126SAlex Elder return PTR_ERR(snap_name); 2930cd892126SAlex Elder 29319fcbb800SAlex Elder dout("entry %u: snap_id = %llu\n", (unsigned int) snap_count, 29329fcbb800SAlex Elder (unsigned long long) snap_id); 293335938150SAlex Elder if (!snap || (snap_id != CEPH_NOSNAP && snap->id < snap_id)) { 293435938150SAlex Elder struct rbd_snap *new_snap; 293535938150SAlex Elder 293635938150SAlex Elder /* We haven't seen this snapshot before */ 293735938150SAlex Elder 2938c8d18425SAlex Elder new_snap = __rbd_add_snap_dev(rbd_dev, snap_name, 2939cd892126SAlex Elder snap_id, snap_size, snap_features); 29409fcbb800SAlex Elder if (IS_ERR(new_snap)) { 29419fcbb800SAlex Elder int err = PTR_ERR(new_snap); 29429fcbb800SAlex Elder 29439fcbb800SAlex Elder dout(" failed to add dev, error %d\n", err); 29449fcbb800SAlex Elder 29459fcbb800SAlex Elder return err; 29469fcbb800SAlex Elder } 294735938150SAlex Elder 294835938150SAlex Elder /* New goes before existing, or at end of list */ 294935938150SAlex Elder 29509fcbb800SAlex Elder dout(" added dev%s\n", snap ? "" : " at end\n"); 295135938150SAlex Elder if (snap) 295235938150SAlex Elder list_add_tail(&new_snap->node, &snap->node); 295335938150SAlex Elder else 2954523f3258SAlex Elder list_add_tail(&new_snap->node, head); 295535938150SAlex Elder } else { 295635938150SAlex Elder /* Already have this one */ 295735938150SAlex Elder 29589fcbb800SAlex Elder dout(" already present\n"); 29599fcbb800SAlex Elder 2960cd892126SAlex Elder rbd_assert(snap->size == snap_size); 2961aafb230eSAlex Elder rbd_assert(!strcmp(snap->name, snap_name)); 2962cd892126SAlex Elder rbd_assert(snap->features == snap_features); 296335938150SAlex Elder 296435938150SAlex Elder /* Done with this list entry; advance */ 296535938150SAlex Elder 296635938150SAlex Elder links = links->next; 2967dfc5606dSYehuda Sadeh } 296835938150SAlex Elder 296935938150SAlex Elder /* Advance to the next entry in the snapshot context */ 297035938150SAlex Elder 297135938150SAlex Elder index++; 2972dfc5606dSYehuda Sadeh } 29739fcbb800SAlex Elder dout("%s: done\n", __func__); 2974dfc5606dSYehuda Sadeh 2975dfc5606dSYehuda Sadeh return 0; 2976dfc5606dSYehuda Sadeh } 2977dfc5606dSYehuda Sadeh 2978304f6808SAlex Elder /* 2979304f6808SAlex Elder * Scan the list of snapshots and register the devices for any that 2980304f6808SAlex Elder * have not already been registered. 2981304f6808SAlex Elder */ 2982304f6808SAlex Elder static int rbd_dev_snaps_register(struct rbd_device *rbd_dev) 2983304f6808SAlex Elder { 2984304f6808SAlex Elder struct rbd_snap *snap; 2985304f6808SAlex Elder int ret = 0; 2986304f6808SAlex Elder 2987304f6808SAlex Elder dout("%s called\n", __func__); 298886ff77bbSAlex Elder if (WARN_ON(!device_is_registered(&rbd_dev->dev))) 298986ff77bbSAlex Elder return -EIO; 2990304f6808SAlex Elder 2991304f6808SAlex Elder list_for_each_entry(snap, &rbd_dev->snaps, node) { 2992304f6808SAlex Elder if (!rbd_snap_registered(snap)) { 2993304f6808SAlex Elder ret = rbd_register_snap_dev(snap, &rbd_dev->dev); 2994304f6808SAlex Elder if (ret < 0) 2995304f6808SAlex Elder break; 2996304f6808SAlex Elder } 2997304f6808SAlex Elder } 2998304f6808SAlex Elder dout("%s: returning %d\n", __func__, ret); 2999304f6808SAlex Elder 3000304f6808SAlex Elder return ret; 3001304f6808SAlex Elder } 3002304f6808SAlex Elder 3003dfc5606dSYehuda Sadeh static int rbd_bus_add_dev(struct rbd_device *rbd_dev) 3004dfc5606dSYehuda Sadeh { 3005dfc5606dSYehuda Sadeh struct device *dev; 3006cd789ab9SAlex Elder int ret; 3007dfc5606dSYehuda Sadeh 3008dfc5606dSYehuda Sadeh mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); 3009dfc5606dSYehuda Sadeh 3010cd789ab9SAlex Elder dev = &rbd_dev->dev; 3011dfc5606dSYehuda Sadeh dev->bus = &rbd_bus_type; 3012dfc5606dSYehuda Sadeh dev->type = &rbd_device_type; 3013dfc5606dSYehuda Sadeh dev->parent = &rbd_root_dev; 3014dfc5606dSYehuda Sadeh dev->release = rbd_dev_release; 3015de71a297SAlex Elder dev_set_name(dev, "%d", rbd_dev->dev_id); 3016dfc5606dSYehuda Sadeh ret = device_register(dev); 3017dfc5606dSYehuda Sadeh 3018dfc5606dSYehuda Sadeh mutex_unlock(&ctl_mutex); 3019cd789ab9SAlex Elder 3020dfc5606dSYehuda Sadeh return ret; 3021602adf40SYehuda Sadeh } 3022602adf40SYehuda Sadeh 3023dfc5606dSYehuda Sadeh static void rbd_bus_del_dev(struct rbd_device *rbd_dev) 3024dfc5606dSYehuda Sadeh { 3025dfc5606dSYehuda Sadeh device_unregister(&rbd_dev->dev); 3026dfc5606dSYehuda Sadeh } 3027dfc5606dSYehuda Sadeh 302859c2be1eSYehuda Sadeh static int rbd_init_watch_dev(struct rbd_device *rbd_dev) 302959c2be1eSYehuda Sadeh { 303059c2be1eSYehuda Sadeh int ret, rc; 303159c2be1eSYehuda Sadeh 303259c2be1eSYehuda Sadeh do { 30330e6f322dSAlex Elder ret = rbd_req_sync_watch(rbd_dev); 303459c2be1eSYehuda Sadeh if (ret == -ERANGE) { 3035117973fbSAlex Elder rc = rbd_dev_refresh(rbd_dev, NULL); 303659c2be1eSYehuda Sadeh if (rc < 0) 303759c2be1eSYehuda Sadeh return rc; 303859c2be1eSYehuda Sadeh } 303959c2be1eSYehuda Sadeh } while (ret == -ERANGE); 304059c2be1eSYehuda Sadeh 304159c2be1eSYehuda Sadeh return ret; 304259c2be1eSYehuda Sadeh } 304359c2be1eSYehuda Sadeh 3044e2839308SAlex Elder static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0); 30451ddbe94eSAlex Elder 30461ddbe94eSAlex Elder /* 3047499afd5bSAlex Elder * Get a unique rbd identifier for the given new rbd_dev, and add 3048499afd5bSAlex Elder * the rbd_dev to the global list. The minimum rbd id is 1. 30491ddbe94eSAlex Elder */ 3050e2839308SAlex Elder static void rbd_dev_id_get(struct rbd_device *rbd_dev) 3051b7f23c36SAlex Elder { 3052e2839308SAlex Elder rbd_dev->dev_id = atomic64_inc_return(&rbd_dev_id_max); 3053499afd5bSAlex Elder 3054499afd5bSAlex Elder spin_lock(&rbd_dev_list_lock); 3055499afd5bSAlex Elder list_add_tail(&rbd_dev->node, &rbd_dev_list); 3056499afd5bSAlex Elder spin_unlock(&rbd_dev_list_lock); 3057e2839308SAlex Elder dout("rbd_dev %p given dev id %llu\n", rbd_dev, 3058e2839308SAlex Elder (unsigned long long) rbd_dev->dev_id); 3059b7f23c36SAlex Elder } 3060b7f23c36SAlex Elder 30611ddbe94eSAlex Elder /* 3062499afd5bSAlex Elder * Remove an rbd_dev from the global list, and record that its 3063499afd5bSAlex Elder * identifier is no longer in use. 30641ddbe94eSAlex Elder */ 3065e2839308SAlex Elder static void rbd_dev_id_put(struct rbd_device *rbd_dev) 30661ddbe94eSAlex Elder { 3067d184f6bfSAlex Elder struct list_head *tmp; 3068de71a297SAlex Elder int rbd_id = rbd_dev->dev_id; 3069d184f6bfSAlex Elder int max_id; 3070d184f6bfSAlex Elder 3071aafb230eSAlex Elder rbd_assert(rbd_id > 0); 3072499afd5bSAlex Elder 3073e2839308SAlex Elder dout("rbd_dev %p released dev id %llu\n", rbd_dev, 3074e2839308SAlex Elder (unsigned long long) rbd_dev->dev_id); 3075499afd5bSAlex Elder spin_lock(&rbd_dev_list_lock); 3076499afd5bSAlex Elder list_del_init(&rbd_dev->node); 3077d184f6bfSAlex Elder 3078d184f6bfSAlex Elder /* 3079d184f6bfSAlex Elder * If the id being "put" is not the current maximum, there 3080d184f6bfSAlex Elder * is nothing special we need to do. 3081d184f6bfSAlex Elder */ 3082e2839308SAlex Elder if (rbd_id != atomic64_read(&rbd_dev_id_max)) { 3083d184f6bfSAlex Elder spin_unlock(&rbd_dev_list_lock); 3084d184f6bfSAlex Elder return; 3085d184f6bfSAlex Elder } 3086d184f6bfSAlex Elder 3087d184f6bfSAlex Elder /* 3088d184f6bfSAlex Elder * We need to update the current maximum id. Search the 3089d184f6bfSAlex Elder * list to find out what it is. We're more likely to find 3090d184f6bfSAlex Elder * the maximum at the end, so search the list backward. 3091d184f6bfSAlex Elder */ 3092d184f6bfSAlex Elder max_id = 0; 3093d184f6bfSAlex Elder list_for_each_prev(tmp, &rbd_dev_list) { 3094d184f6bfSAlex Elder struct rbd_device *rbd_dev; 3095d184f6bfSAlex Elder 3096d184f6bfSAlex Elder rbd_dev = list_entry(tmp, struct rbd_device, node); 3097b213e0b1SAlex Elder if (rbd_dev->dev_id > max_id) 3098b213e0b1SAlex Elder max_id = rbd_dev->dev_id; 3099d184f6bfSAlex Elder } 3100499afd5bSAlex Elder spin_unlock(&rbd_dev_list_lock); 31011ddbe94eSAlex Elder 31021ddbe94eSAlex Elder /* 3103e2839308SAlex Elder * The max id could have been updated by rbd_dev_id_get(), in 3104d184f6bfSAlex Elder * which case it now accurately reflects the new maximum. 3105d184f6bfSAlex Elder * Be careful not to overwrite the maximum value in that 3106d184f6bfSAlex Elder * case. 31071ddbe94eSAlex Elder */ 3108e2839308SAlex Elder atomic64_cmpxchg(&rbd_dev_id_max, rbd_id, max_id); 3109e2839308SAlex Elder dout(" max dev id has been reset\n"); 3110b7f23c36SAlex Elder } 3111b7f23c36SAlex Elder 3112a725f65eSAlex Elder /* 3113e28fff26SAlex Elder * Skips over white space at *buf, and updates *buf to point to the 3114e28fff26SAlex Elder * first found non-space character (if any). Returns the length of 3115593a9e7bSAlex Elder * the token (string of non-white space characters) found. Note 3116593a9e7bSAlex Elder * that *buf must be terminated with '\0'. 3117e28fff26SAlex Elder */ 3118e28fff26SAlex Elder static inline size_t next_token(const char **buf) 3119e28fff26SAlex Elder { 3120e28fff26SAlex Elder /* 3121e28fff26SAlex Elder * These are the characters that produce nonzero for 3122e28fff26SAlex Elder * isspace() in the "C" and "POSIX" locales. 3123e28fff26SAlex Elder */ 3124e28fff26SAlex Elder const char *spaces = " \f\n\r\t\v"; 3125e28fff26SAlex Elder 3126e28fff26SAlex Elder *buf += strspn(*buf, spaces); /* Find start of token */ 3127e28fff26SAlex Elder 3128e28fff26SAlex Elder return strcspn(*buf, spaces); /* Return token length */ 3129e28fff26SAlex Elder } 3130e28fff26SAlex Elder 3131e28fff26SAlex Elder /* 3132e28fff26SAlex Elder * Finds the next token in *buf, and if the provided token buffer is 3133e28fff26SAlex Elder * big enough, copies the found token into it. The result, if 3134593a9e7bSAlex Elder * copied, is guaranteed to be terminated with '\0'. Note that *buf 3135593a9e7bSAlex Elder * must be terminated with '\0' on entry. 3136e28fff26SAlex Elder * 3137e28fff26SAlex Elder * Returns the length of the token found (not including the '\0'). 3138e28fff26SAlex Elder * Return value will be 0 if no token is found, and it will be >= 3139e28fff26SAlex Elder * token_size if the token would not fit. 3140e28fff26SAlex Elder * 3141593a9e7bSAlex Elder * The *buf pointer will be updated to point beyond the end of the 3142e28fff26SAlex Elder * found token. Note that this occurs even if the token buffer is 3143e28fff26SAlex Elder * too small to hold it. 3144e28fff26SAlex Elder */ 3145e28fff26SAlex Elder static inline size_t copy_token(const char **buf, 3146e28fff26SAlex Elder char *token, 3147e28fff26SAlex Elder size_t token_size) 3148e28fff26SAlex Elder { 3149e28fff26SAlex Elder size_t len; 3150e28fff26SAlex Elder 3151e28fff26SAlex Elder len = next_token(buf); 3152e28fff26SAlex Elder if (len < token_size) { 3153e28fff26SAlex Elder memcpy(token, *buf, len); 3154e28fff26SAlex Elder *(token + len) = '\0'; 3155e28fff26SAlex Elder } 3156e28fff26SAlex Elder *buf += len; 3157e28fff26SAlex Elder 3158e28fff26SAlex Elder return len; 3159e28fff26SAlex Elder } 3160e28fff26SAlex Elder 3161e28fff26SAlex Elder /* 3162ea3352f4SAlex Elder * Finds the next token in *buf, dynamically allocates a buffer big 3163ea3352f4SAlex Elder * enough to hold a copy of it, and copies the token into the new 3164ea3352f4SAlex Elder * buffer. The copy is guaranteed to be terminated with '\0'. Note 3165ea3352f4SAlex Elder * that a duplicate buffer is created even for a zero-length token. 3166ea3352f4SAlex Elder * 3167ea3352f4SAlex Elder * Returns a pointer to the newly-allocated duplicate, or a null 3168ea3352f4SAlex Elder * pointer if memory for the duplicate was not available. If 3169ea3352f4SAlex Elder * the lenp argument is a non-null pointer, the length of the token 3170ea3352f4SAlex Elder * (not including the '\0') is returned in *lenp. 3171ea3352f4SAlex Elder * 3172ea3352f4SAlex Elder * If successful, the *buf pointer will be updated to point beyond 3173ea3352f4SAlex Elder * the end of the found token. 3174ea3352f4SAlex Elder * 3175ea3352f4SAlex Elder * Note: uses GFP_KERNEL for allocation. 3176ea3352f4SAlex Elder */ 3177ea3352f4SAlex Elder static inline char *dup_token(const char **buf, size_t *lenp) 3178ea3352f4SAlex Elder { 3179ea3352f4SAlex Elder char *dup; 3180ea3352f4SAlex Elder size_t len; 3181ea3352f4SAlex Elder 3182ea3352f4SAlex Elder len = next_token(buf); 31834caf35f9SAlex Elder dup = kmemdup(*buf, len + 1, GFP_KERNEL); 3184ea3352f4SAlex Elder if (!dup) 3185ea3352f4SAlex Elder return NULL; 3186ea3352f4SAlex Elder *(dup + len) = '\0'; 3187ea3352f4SAlex Elder *buf += len; 3188ea3352f4SAlex Elder 3189ea3352f4SAlex Elder if (lenp) 3190ea3352f4SAlex Elder *lenp = len; 3191ea3352f4SAlex Elder 3192ea3352f4SAlex Elder return dup; 3193ea3352f4SAlex Elder } 3194ea3352f4SAlex Elder 3195ea3352f4SAlex Elder /* 3196859c31dfSAlex Elder * Parse the options provided for an "rbd add" (i.e., rbd image 3197859c31dfSAlex Elder * mapping) request. These arrive via a write to /sys/bus/rbd/add, 3198859c31dfSAlex Elder * and the data written is passed here via a NUL-terminated buffer. 3199859c31dfSAlex Elder * Returns 0 if successful or an error code otherwise. 3200d22f76e7SAlex Elder * 3201859c31dfSAlex Elder * The information extracted from these options is recorded in 3202859c31dfSAlex Elder * the other parameters which return dynamically-allocated 3203859c31dfSAlex Elder * structures: 3204859c31dfSAlex Elder * ceph_opts 3205859c31dfSAlex Elder * The address of a pointer that will refer to a ceph options 3206859c31dfSAlex Elder * structure. Caller must release the returned pointer using 3207859c31dfSAlex Elder * ceph_destroy_options() when it is no longer needed. 3208859c31dfSAlex Elder * rbd_opts 3209859c31dfSAlex Elder * Address of an rbd options pointer. Fully initialized by 3210859c31dfSAlex Elder * this function; caller must release with kfree(). 3211859c31dfSAlex Elder * spec 3212859c31dfSAlex Elder * Address of an rbd image specification pointer. Fully 3213859c31dfSAlex Elder * initialized by this function based on parsed options. 3214859c31dfSAlex Elder * Caller must release with rbd_spec_put(). 3215859c31dfSAlex Elder * 3216859c31dfSAlex Elder * The options passed take this form: 3217859c31dfSAlex Elder * <mon_addrs> <options> <pool_name> <image_name> [<snap_id>] 3218859c31dfSAlex Elder * where: 3219859c31dfSAlex Elder * <mon_addrs> 3220859c31dfSAlex Elder * A comma-separated list of one or more monitor addresses. 3221859c31dfSAlex Elder * A monitor address is an ip address, optionally followed 3222859c31dfSAlex Elder * by a port number (separated by a colon). 3223859c31dfSAlex Elder * I.e.: ip1[:port1][,ip2[:port2]...] 3224859c31dfSAlex Elder * <options> 3225859c31dfSAlex Elder * A comma-separated list of ceph and/or rbd options. 3226859c31dfSAlex Elder * <pool_name> 3227859c31dfSAlex Elder * The name of the rados pool containing the rbd image. 3228859c31dfSAlex Elder * <image_name> 3229859c31dfSAlex Elder * The name of the image in that pool to map. 3230859c31dfSAlex Elder * <snap_id> 3231859c31dfSAlex Elder * An optional snapshot id. If provided, the mapping will 3232859c31dfSAlex Elder * present data from the image at the time that snapshot was 3233859c31dfSAlex Elder * created. The image head is used if no snapshot id is 3234859c31dfSAlex Elder * provided. Snapshot mappings are always read-only. 3235a725f65eSAlex Elder */ 3236859c31dfSAlex Elder static int rbd_add_parse_args(const char *buf, 3237dc79b113SAlex Elder struct ceph_options **ceph_opts, 3238859c31dfSAlex Elder struct rbd_options **opts, 3239859c31dfSAlex Elder struct rbd_spec **rbd_spec) 3240a725f65eSAlex Elder { 3241e28fff26SAlex Elder size_t len; 3242859c31dfSAlex Elder char *options; 32430ddebc0cSAlex Elder const char *mon_addrs; 32440ddebc0cSAlex Elder size_t mon_addrs_size; 3245859c31dfSAlex Elder struct rbd_spec *spec = NULL; 32464e9afebaSAlex Elder struct rbd_options *rbd_opts = NULL; 3247859c31dfSAlex Elder struct ceph_options *copts; 3248dc79b113SAlex Elder int ret; 3249e28fff26SAlex Elder 3250e28fff26SAlex Elder /* The first four tokens are required */ 3251e28fff26SAlex Elder 32527ef3214aSAlex Elder len = next_token(&buf); 32534fb5d671SAlex Elder if (!len) { 32544fb5d671SAlex Elder rbd_warn(NULL, "no monitor address(es) provided"); 32554fb5d671SAlex Elder return -EINVAL; 32564fb5d671SAlex Elder } 32570ddebc0cSAlex Elder mon_addrs = buf; 3258f28e565aSAlex Elder mon_addrs_size = len + 1; 32597ef3214aSAlex Elder buf += len; 3260a725f65eSAlex Elder 3261dc79b113SAlex Elder ret = -EINVAL; 3262f28e565aSAlex Elder options = dup_token(&buf, NULL); 3263f28e565aSAlex Elder if (!options) 3264dc79b113SAlex Elder return -ENOMEM; 32654fb5d671SAlex Elder if (!*options) { 32664fb5d671SAlex Elder rbd_warn(NULL, "no options provided"); 32674fb5d671SAlex Elder goto out_err; 32684fb5d671SAlex Elder } 3269a725f65eSAlex Elder 3270859c31dfSAlex Elder spec = rbd_spec_alloc(); 3271859c31dfSAlex Elder if (!spec) 3272f28e565aSAlex Elder goto out_mem; 3273859c31dfSAlex Elder 3274859c31dfSAlex Elder spec->pool_name = dup_token(&buf, NULL); 3275859c31dfSAlex Elder if (!spec->pool_name) 3276859c31dfSAlex Elder goto out_mem; 32774fb5d671SAlex Elder if (!*spec->pool_name) { 32784fb5d671SAlex Elder rbd_warn(NULL, "no pool name provided"); 32794fb5d671SAlex Elder goto out_err; 32804fb5d671SAlex Elder } 3281e28fff26SAlex Elder 328269e7a02fSAlex Elder spec->image_name = dup_token(&buf, NULL); 3283859c31dfSAlex Elder if (!spec->image_name) 3284f28e565aSAlex Elder goto out_mem; 32854fb5d671SAlex Elder if (!*spec->image_name) { 32864fb5d671SAlex Elder rbd_warn(NULL, "no image name provided"); 32874fb5d671SAlex Elder goto out_err; 32884fb5d671SAlex Elder } 3289e28fff26SAlex Elder 3290f28e565aSAlex Elder /* 3291f28e565aSAlex Elder * Snapshot name is optional; default is to use "-" 3292f28e565aSAlex Elder * (indicating the head/no snapshot). 3293f28e565aSAlex Elder */ 32943feeb894SAlex Elder len = next_token(&buf); 3295820a5f3eSAlex Elder if (!len) { 32963feeb894SAlex Elder buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */ 32973feeb894SAlex Elder len = sizeof (RBD_SNAP_HEAD_NAME) - 1; 3298f28e565aSAlex Elder } else if (len > RBD_MAX_SNAP_NAME_LEN) { 3299dc79b113SAlex Elder ret = -ENAMETOOLONG; 3300f28e565aSAlex Elder goto out_err; 3301849b4260SAlex Elder } 33024caf35f9SAlex Elder spec->snap_name = kmemdup(buf, len + 1, GFP_KERNEL); 3303859c31dfSAlex Elder if (!spec->snap_name) 3304f28e565aSAlex Elder goto out_mem; 3305859c31dfSAlex Elder *(spec->snap_name + len) = '\0'; 3306e5c35534SAlex Elder 33070ddebc0cSAlex Elder /* Initialize all rbd options to the defaults */ 3308e28fff26SAlex Elder 33094e9afebaSAlex Elder rbd_opts = kzalloc(sizeof (*rbd_opts), GFP_KERNEL); 33104e9afebaSAlex Elder if (!rbd_opts) 33114e9afebaSAlex Elder goto out_mem; 33124e9afebaSAlex Elder 33134e9afebaSAlex Elder rbd_opts->read_only = RBD_READ_ONLY_DEFAULT; 3314d22f76e7SAlex Elder 3315859c31dfSAlex Elder copts = ceph_parse_options(options, mon_addrs, 33160ddebc0cSAlex Elder mon_addrs + mon_addrs_size - 1, 33174e9afebaSAlex Elder parse_rbd_opts_token, rbd_opts); 3318859c31dfSAlex Elder if (IS_ERR(copts)) { 3319859c31dfSAlex Elder ret = PTR_ERR(copts); 3320dc79b113SAlex Elder goto out_err; 3321dc79b113SAlex Elder } 3322859c31dfSAlex Elder kfree(options); 3323859c31dfSAlex Elder 3324859c31dfSAlex Elder *ceph_opts = copts; 33254e9afebaSAlex Elder *opts = rbd_opts; 3326859c31dfSAlex Elder *rbd_spec = spec; 33270ddebc0cSAlex Elder 3328dc79b113SAlex Elder return 0; 3329f28e565aSAlex Elder out_mem: 3330dc79b113SAlex Elder ret = -ENOMEM; 3331d22f76e7SAlex Elder out_err: 3332859c31dfSAlex Elder kfree(rbd_opts); 3333859c31dfSAlex Elder rbd_spec_put(spec); 3334f28e565aSAlex Elder kfree(options); 3335d22f76e7SAlex Elder 3336dc79b113SAlex Elder return ret; 3337a725f65eSAlex Elder } 3338a725f65eSAlex Elder 3339589d30e0SAlex Elder /* 3340589d30e0SAlex Elder * An rbd format 2 image has a unique identifier, distinct from the 3341589d30e0SAlex Elder * name given to it by the user. Internally, that identifier is 3342589d30e0SAlex Elder * what's used to specify the names of objects related to the image. 3343589d30e0SAlex Elder * 3344589d30e0SAlex Elder * A special "rbd id" object is used to map an rbd image name to its 3345589d30e0SAlex Elder * id. If that object doesn't exist, then there is no v2 rbd image 3346589d30e0SAlex Elder * with the supplied name. 3347589d30e0SAlex Elder * 3348589d30e0SAlex Elder * This function will record the given rbd_dev's image_id field if 3349589d30e0SAlex Elder * it can be determined, and in that case will return 0. If any 3350589d30e0SAlex Elder * errors occur a negative errno will be returned and the rbd_dev's 3351589d30e0SAlex Elder * image_id field will be unchanged (and should be NULL). 3352589d30e0SAlex Elder */ 3353589d30e0SAlex Elder static int rbd_dev_image_id(struct rbd_device *rbd_dev) 3354589d30e0SAlex Elder { 3355589d30e0SAlex Elder int ret; 3356589d30e0SAlex Elder size_t size; 3357589d30e0SAlex Elder char *object_name; 3358589d30e0SAlex Elder void *response; 3359589d30e0SAlex Elder void *p; 3360589d30e0SAlex Elder 3361589d30e0SAlex Elder /* 33622c0d0a10SAlex Elder * When probing a parent image, the image id is already 33632c0d0a10SAlex Elder * known (and the image name likely is not). There's no 33642c0d0a10SAlex Elder * need to fetch the image id again in this case. 33652c0d0a10SAlex Elder */ 33662c0d0a10SAlex Elder if (rbd_dev->spec->image_id) 33672c0d0a10SAlex Elder return 0; 33682c0d0a10SAlex Elder 33692c0d0a10SAlex Elder /* 3370589d30e0SAlex Elder * First, see if the format 2 image id file exists, and if 3371589d30e0SAlex Elder * so, get the image's persistent id from it. 3372589d30e0SAlex Elder */ 337369e7a02fSAlex Elder size = sizeof (RBD_ID_PREFIX) + strlen(rbd_dev->spec->image_name); 3374589d30e0SAlex Elder object_name = kmalloc(size, GFP_NOIO); 3375589d30e0SAlex Elder if (!object_name) 3376589d30e0SAlex Elder return -ENOMEM; 33770d7dbfceSAlex Elder sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->spec->image_name); 3378589d30e0SAlex Elder dout("rbd id object name is %s\n", object_name); 3379589d30e0SAlex Elder 3380589d30e0SAlex Elder /* Response will be an encoded string, which includes a length */ 3381589d30e0SAlex Elder 3382589d30e0SAlex Elder size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX; 3383589d30e0SAlex Elder response = kzalloc(size, GFP_NOIO); 3384589d30e0SAlex Elder if (!response) { 3385589d30e0SAlex Elder ret = -ENOMEM; 3386589d30e0SAlex Elder goto out; 3387589d30e0SAlex Elder } 3388589d30e0SAlex Elder 3389589d30e0SAlex Elder ret = rbd_req_sync_exec(rbd_dev, object_name, 3390589d30e0SAlex Elder "rbd", "get_id", 3391589d30e0SAlex Elder NULL, 0, 339207b2391fSAlex Elder response, RBD_IMAGE_ID_LEN_MAX, NULL); 3393589d30e0SAlex Elder dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret); 3394589d30e0SAlex Elder if (ret < 0) 3395589d30e0SAlex Elder goto out; 3396a0ea3a40SAlex Elder ret = 0; /* rbd_req_sync_exec() can return positive */ 3397589d30e0SAlex Elder 3398589d30e0SAlex Elder p = response; 33990d7dbfceSAlex Elder rbd_dev->spec->image_id = ceph_extract_encoded_string(&p, 3400589d30e0SAlex Elder p + RBD_IMAGE_ID_LEN_MAX, 3401979ed480SAlex Elder NULL, GFP_NOIO); 34020d7dbfceSAlex Elder if (IS_ERR(rbd_dev->spec->image_id)) { 34030d7dbfceSAlex Elder ret = PTR_ERR(rbd_dev->spec->image_id); 34040d7dbfceSAlex Elder rbd_dev->spec->image_id = NULL; 3405589d30e0SAlex Elder } else { 34060d7dbfceSAlex Elder dout("image_id is %s\n", rbd_dev->spec->image_id); 3407589d30e0SAlex Elder } 3408589d30e0SAlex Elder out: 3409589d30e0SAlex Elder kfree(response); 3410589d30e0SAlex Elder kfree(object_name); 3411589d30e0SAlex Elder 3412589d30e0SAlex Elder return ret; 3413589d30e0SAlex Elder } 3414589d30e0SAlex Elder 3415a30b71b9SAlex Elder static int rbd_dev_v1_probe(struct rbd_device *rbd_dev) 3416a30b71b9SAlex Elder { 3417a30b71b9SAlex Elder int ret; 3418a30b71b9SAlex Elder size_t size; 3419a30b71b9SAlex Elder 3420a30b71b9SAlex Elder /* Version 1 images have no id; empty string is used */ 3421a30b71b9SAlex Elder 34220d7dbfceSAlex Elder rbd_dev->spec->image_id = kstrdup("", GFP_KERNEL); 34230d7dbfceSAlex Elder if (!rbd_dev->spec->image_id) 3424a30b71b9SAlex Elder return -ENOMEM; 3425a30b71b9SAlex Elder 3426a30b71b9SAlex Elder /* Record the header object name for this rbd image. */ 3427a30b71b9SAlex Elder 342869e7a02fSAlex Elder size = strlen(rbd_dev->spec->image_name) + sizeof (RBD_SUFFIX); 3429a30b71b9SAlex Elder rbd_dev->header_name = kmalloc(size, GFP_KERNEL); 3430a30b71b9SAlex Elder if (!rbd_dev->header_name) { 3431a30b71b9SAlex Elder ret = -ENOMEM; 3432a30b71b9SAlex Elder goto out_err; 3433a30b71b9SAlex Elder } 34340d7dbfceSAlex Elder sprintf(rbd_dev->header_name, "%s%s", 34350d7dbfceSAlex Elder rbd_dev->spec->image_name, RBD_SUFFIX); 3436a30b71b9SAlex Elder 3437a30b71b9SAlex Elder /* Populate rbd image metadata */ 3438a30b71b9SAlex Elder 3439a30b71b9SAlex Elder ret = rbd_read_header(rbd_dev, &rbd_dev->header); 3440a30b71b9SAlex Elder if (ret < 0) 3441a30b71b9SAlex Elder goto out_err; 344286b00e0dSAlex Elder 344386b00e0dSAlex Elder /* Version 1 images have no parent (no layering) */ 344486b00e0dSAlex Elder 344586b00e0dSAlex Elder rbd_dev->parent_spec = NULL; 344686b00e0dSAlex Elder rbd_dev->parent_overlap = 0; 344786b00e0dSAlex Elder 3448a30b71b9SAlex Elder rbd_dev->image_format = 1; 3449a30b71b9SAlex Elder 3450a30b71b9SAlex Elder dout("discovered version 1 image, header name is %s\n", 3451a30b71b9SAlex Elder rbd_dev->header_name); 3452a30b71b9SAlex Elder 3453a30b71b9SAlex Elder return 0; 3454a30b71b9SAlex Elder 3455a30b71b9SAlex Elder out_err: 3456a30b71b9SAlex Elder kfree(rbd_dev->header_name); 3457a30b71b9SAlex Elder rbd_dev->header_name = NULL; 34580d7dbfceSAlex Elder kfree(rbd_dev->spec->image_id); 34590d7dbfceSAlex Elder rbd_dev->spec->image_id = NULL; 3460a30b71b9SAlex Elder 3461a30b71b9SAlex Elder return ret; 3462a30b71b9SAlex Elder } 3463a30b71b9SAlex Elder 3464a30b71b9SAlex Elder static int rbd_dev_v2_probe(struct rbd_device *rbd_dev) 3465a30b71b9SAlex Elder { 3466a30b71b9SAlex Elder size_t size; 34679d475de5SAlex Elder int ret; 34686e14b1a6SAlex Elder u64 ver = 0; 3469a30b71b9SAlex Elder 3470a30b71b9SAlex Elder /* 3471a30b71b9SAlex Elder * Image id was filled in by the caller. Record the header 3472a30b71b9SAlex Elder * object name for this rbd image. 3473a30b71b9SAlex Elder */ 3474979ed480SAlex Elder size = sizeof (RBD_HEADER_PREFIX) + strlen(rbd_dev->spec->image_id); 3475a30b71b9SAlex Elder rbd_dev->header_name = kmalloc(size, GFP_KERNEL); 3476a30b71b9SAlex Elder if (!rbd_dev->header_name) 3477a30b71b9SAlex Elder return -ENOMEM; 3478a30b71b9SAlex Elder sprintf(rbd_dev->header_name, "%s%s", 34790d7dbfceSAlex Elder RBD_HEADER_PREFIX, rbd_dev->spec->image_id); 34809d475de5SAlex Elder 34819d475de5SAlex Elder /* Get the size and object order for the image */ 34829d475de5SAlex Elder 34839d475de5SAlex Elder ret = rbd_dev_v2_image_size(rbd_dev); 34849d475de5SAlex Elder if (ret < 0) 34859d475de5SAlex Elder goto out_err; 34861e130199SAlex Elder 34871e130199SAlex Elder /* Get the object prefix (a.k.a. block_name) for the image */ 34881e130199SAlex Elder 34891e130199SAlex Elder ret = rbd_dev_v2_object_prefix(rbd_dev); 34901e130199SAlex Elder if (ret < 0) 34911e130199SAlex Elder goto out_err; 3492b1b5402aSAlex Elder 3493d889140cSAlex Elder /* Get the and check features for the image */ 3494b1b5402aSAlex Elder 3495b1b5402aSAlex Elder ret = rbd_dev_v2_features(rbd_dev); 3496b1b5402aSAlex Elder if (ret < 0) 3497b1b5402aSAlex Elder goto out_err; 349835d489f9SAlex Elder 349986b00e0dSAlex Elder /* If the image supports layering, get the parent info */ 350086b00e0dSAlex Elder 350186b00e0dSAlex Elder if (rbd_dev->header.features & RBD_FEATURE_LAYERING) { 350286b00e0dSAlex Elder ret = rbd_dev_v2_parent_info(rbd_dev); 350386b00e0dSAlex Elder if (ret < 0) 350486b00e0dSAlex Elder goto out_err; 350586b00e0dSAlex Elder } 350686b00e0dSAlex Elder 35076e14b1a6SAlex Elder /* crypto and compression type aren't (yet) supported for v2 images */ 350835d489f9SAlex Elder 35096e14b1a6SAlex Elder rbd_dev->header.crypt_type = 0; 35106e14b1a6SAlex Elder rbd_dev->header.comp_type = 0; 35116e14b1a6SAlex Elder 35126e14b1a6SAlex Elder /* Get the snapshot context, plus the header version */ 35136e14b1a6SAlex Elder 35146e14b1a6SAlex Elder ret = rbd_dev_v2_snap_context(rbd_dev, &ver); 351535d489f9SAlex Elder if (ret) 351635d489f9SAlex Elder goto out_err; 35176e14b1a6SAlex Elder rbd_dev->header.obj_version = ver; 35186e14b1a6SAlex Elder 3519a30b71b9SAlex Elder rbd_dev->image_format = 2; 3520a30b71b9SAlex Elder 3521a30b71b9SAlex Elder dout("discovered version 2 image, header name is %s\n", 3522a30b71b9SAlex Elder rbd_dev->header_name); 3523a30b71b9SAlex Elder 352435152979SAlex Elder return 0; 35259d475de5SAlex Elder out_err: 352686b00e0dSAlex Elder rbd_dev->parent_overlap = 0; 352786b00e0dSAlex Elder rbd_spec_put(rbd_dev->parent_spec); 352886b00e0dSAlex Elder rbd_dev->parent_spec = NULL; 35299d475de5SAlex Elder kfree(rbd_dev->header_name); 35309d475de5SAlex Elder rbd_dev->header_name = NULL; 35311e130199SAlex Elder kfree(rbd_dev->header.object_prefix); 35321e130199SAlex Elder rbd_dev->header.object_prefix = NULL; 35339d475de5SAlex Elder 35349d475de5SAlex Elder return ret; 3535a30b71b9SAlex Elder } 3536a30b71b9SAlex Elder 353783a06263SAlex Elder static int rbd_dev_probe_finish(struct rbd_device *rbd_dev) 353883a06263SAlex Elder { 353983a06263SAlex Elder int ret; 354083a06263SAlex Elder 354183a06263SAlex Elder /* no need to lock here, as rbd_dev is not registered yet */ 354283a06263SAlex Elder ret = rbd_dev_snaps_update(rbd_dev); 354383a06263SAlex Elder if (ret) 354483a06263SAlex Elder return ret; 354583a06263SAlex Elder 35469e15b77dSAlex Elder ret = rbd_dev_probe_update_spec(rbd_dev); 35479e15b77dSAlex Elder if (ret) 35489e15b77dSAlex Elder goto err_out_snaps; 35499e15b77dSAlex Elder 355083a06263SAlex Elder ret = rbd_dev_set_mapping(rbd_dev); 355183a06263SAlex Elder if (ret) 355283a06263SAlex Elder goto err_out_snaps; 355383a06263SAlex Elder 355483a06263SAlex Elder /* generate unique id: find highest unique id, add one */ 355583a06263SAlex Elder rbd_dev_id_get(rbd_dev); 355683a06263SAlex Elder 355783a06263SAlex Elder /* Fill in the device name, now that we have its id. */ 355883a06263SAlex Elder BUILD_BUG_ON(DEV_NAME_LEN 355983a06263SAlex Elder < sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH); 356083a06263SAlex Elder sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id); 356183a06263SAlex Elder 356283a06263SAlex Elder /* Get our block major device number. */ 356383a06263SAlex Elder 356483a06263SAlex Elder ret = register_blkdev(0, rbd_dev->name); 356583a06263SAlex Elder if (ret < 0) 356683a06263SAlex Elder goto err_out_id; 356783a06263SAlex Elder rbd_dev->major = ret; 356883a06263SAlex Elder 356983a06263SAlex Elder /* Set up the blkdev mapping. */ 357083a06263SAlex Elder 357183a06263SAlex Elder ret = rbd_init_disk(rbd_dev); 357283a06263SAlex Elder if (ret) 357383a06263SAlex Elder goto err_out_blkdev; 357483a06263SAlex Elder 357583a06263SAlex Elder ret = rbd_bus_add_dev(rbd_dev); 357683a06263SAlex Elder if (ret) 357783a06263SAlex Elder goto err_out_disk; 357883a06263SAlex Elder 357983a06263SAlex Elder /* 358083a06263SAlex Elder * At this point cleanup in the event of an error is the job 358183a06263SAlex Elder * of the sysfs code (initiated by rbd_bus_del_dev()). 358283a06263SAlex Elder */ 358383a06263SAlex Elder down_write(&rbd_dev->header_rwsem); 358483a06263SAlex Elder ret = rbd_dev_snaps_register(rbd_dev); 358583a06263SAlex Elder up_write(&rbd_dev->header_rwsem); 358683a06263SAlex Elder if (ret) 358783a06263SAlex Elder goto err_out_bus; 358883a06263SAlex Elder 358983a06263SAlex Elder ret = rbd_init_watch_dev(rbd_dev); 359083a06263SAlex Elder if (ret) 359183a06263SAlex Elder goto err_out_bus; 359283a06263SAlex Elder 359383a06263SAlex Elder /* Everything's ready. Announce the disk to the world. */ 359483a06263SAlex Elder 359583a06263SAlex Elder add_disk(rbd_dev->disk); 359683a06263SAlex Elder 359783a06263SAlex Elder pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name, 359883a06263SAlex Elder (unsigned long long) rbd_dev->mapping.size); 359983a06263SAlex Elder 360083a06263SAlex Elder return ret; 360183a06263SAlex Elder err_out_bus: 360283a06263SAlex Elder /* this will also clean up rest of rbd_dev stuff */ 360383a06263SAlex Elder 360483a06263SAlex Elder rbd_bus_del_dev(rbd_dev); 360583a06263SAlex Elder 360683a06263SAlex Elder return ret; 360783a06263SAlex Elder err_out_disk: 360883a06263SAlex Elder rbd_free_disk(rbd_dev); 360983a06263SAlex Elder err_out_blkdev: 361083a06263SAlex Elder unregister_blkdev(rbd_dev->major, rbd_dev->name); 361183a06263SAlex Elder err_out_id: 361283a06263SAlex Elder rbd_dev_id_put(rbd_dev); 361383a06263SAlex Elder err_out_snaps: 361483a06263SAlex Elder rbd_remove_all_snaps(rbd_dev); 361583a06263SAlex Elder 361683a06263SAlex Elder return ret; 361783a06263SAlex Elder } 361883a06263SAlex Elder 3619a30b71b9SAlex Elder /* 3620a30b71b9SAlex Elder * Probe for the existence of the header object for the given rbd 3621a30b71b9SAlex Elder * device. For format 2 images this includes determining the image 3622a30b71b9SAlex Elder * id. 3623a30b71b9SAlex Elder */ 3624a30b71b9SAlex Elder static int rbd_dev_probe(struct rbd_device *rbd_dev) 3625a30b71b9SAlex Elder { 3626a30b71b9SAlex Elder int ret; 3627a30b71b9SAlex Elder 3628a30b71b9SAlex Elder /* 3629a30b71b9SAlex Elder * Get the id from the image id object. If it's not a 3630a30b71b9SAlex Elder * format 2 image, we'll get ENOENT back, and we'll assume 3631a30b71b9SAlex Elder * it's a format 1 image. 3632a30b71b9SAlex Elder */ 3633a30b71b9SAlex Elder ret = rbd_dev_image_id(rbd_dev); 3634a30b71b9SAlex Elder if (ret) 3635a30b71b9SAlex Elder ret = rbd_dev_v1_probe(rbd_dev); 3636a30b71b9SAlex Elder else 3637a30b71b9SAlex Elder ret = rbd_dev_v2_probe(rbd_dev); 363883a06263SAlex Elder if (ret) { 3639a30b71b9SAlex Elder dout("probe failed, returning %d\n", ret); 3640a30b71b9SAlex Elder 3641a30b71b9SAlex Elder return ret; 3642a30b71b9SAlex Elder } 3643a30b71b9SAlex Elder 364483a06263SAlex Elder ret = rbd_dev_probe_finish(rbd_dev); 364583a06263SAlex Elder if (ret) 364683a06263SAlex Elder rbd_header_free(&rbd_dev->header); 364783a06263SAlex Elder 364883a06263SAlex Elder return ret; 364983a06263SAlex Elder } 365083a06263SAlex Elder 365159c2be1eSYehuda Sadeh static ssize_t rbd_add(struct bus_type *bus, 365259c2be1eSYehuda Sadeh const char *buf, 365359c2be1eSYehuda Sadeh size_t count) 3654602adf40SYehuda Sadeh { 3655cb8627c7SAlex Elder struct rbd_device *rbd_dev = NULL; 3656dc79b113SAlex Elder struct ceph_options *ceph_opts = NULL; 36574e9afebaSAlex Elder struct rbd_options *rbd_opts = NULL; 3658859c31dfSAlex Elder struct rbd_spec *spec = NULL; 36599d3997fdSAlex Elder struct rbd_client *rbdc; 366027cc2594SAlex Elder struct ceph_osd_client *osdc; 366127cc2594SAlex Elder int rc = -ENOMEM; 3662602adf40SYehuda Sadeh 3663602adf40SYehuda Sadeh if (!try_module_get(THIS_MODULE)) 3664602adf40SYehuda Sadeh return -ENODEV; 3665602adf40SYehuda Sadeh 3666a725f65eSAlex Elder /* parse add command */ 3667859c31dfSAlex Elder rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec); 3668dc79b113SAlex Elder if (rc < 0) 3669bd4ba655SAlex Elder goto err_out_module; 3670a725f65eSAlex Elder 36719d3997fdSAlex Elder rbdc = rbd_get_client(ceph_opts); 36729d3997fdSAlex Elder if (IS_ERR(rbdc)) { 36739d3997fdSAlex Elder rc = PTR_ERR(rbdc); 36740ddebc0cSAlex Elder goto err_out_args; 36759d3997fdSAlex Elder } 3676c53d5893SAlex Elder ceph_opts = NULL; /* rbd_dev client now owns this */ 3677602adf40SYehuda Sadeh 3678602adf40SYehuda Sadeh /* pick the pool */ 36799d3997fdSAlex Elder osdc = &rbdc->client->osdc; 3680859c31dfSAlex Elder rc = ceph_pg_poolid_by_name(osdc->osdmap, spec->pool_name); 3681602adf40SYehuda Sadeh if (rc < 0) 3682602adf40SYehuda Sadeh goto err_out_client; 3683859c31dfSAlex Elder spec->pool_id = (u64) rc; 3684859c31dfSAlex Elder 3685c53d5893SAlex Elder rbd_dev = rbd_dev_create(rbdc, spec); 3686bd4ba655SAlex Elder if (!rbd_dev) 3687bd4ba655SAlex Elder goto err_out_client; 3688c53d5893SAlex Elder rbdc = NULL; /* rbd_dev now owns this */ 3689c53d5893SAlex Elder spec = NULL; /* rbd_dev now owns this */ 3690602adf40SYehuda Sadeh 3691bd4ba655SAlex Elder rbd_dev->mapping.read_only = rbd_opts->read_only; 3692c53d5893SAlex Elder kfree(rbd_opts); 3693c53d5893SAlex Elder rbd_opts = NULL; /* done with this */ 3694bd4ba655SAlex Elder 3695a30b71b9SAlex Elder rc = rbd_dev_probe(rbd_dev); 3696a30b71b9SAlex Elder if (rc < 0) 3697c53d5893SAlex Elder goto err_out_rbd_dev; 369805fd6f6fSAlex Elder 3699602adf40SYehuda Sadeh return count; 3700c53d5893SAlex Elder err_out_rbd_dev: 3701c53d5893SAlex Elder rbd_dev_destroy(rbd_dev); 3702bd4ba655SAlex Elder err_out_client: 37039d3997fdSAlex Elder rbd_put_client(rbdc); 37040ddebc0cSAlex Elder err_out_args: 370578cea76eSAlex Elder if (ceph_opts) 370678cea76eSAlex Elder ceph_destroy_options(ceph_opts); 37074e9afebaSAlex Elder kfree(rbd_opts); 3708859c31dfSAlex Elder rbd_spec_put(spec); 3709bd4ba655SAlex Elder err_out_module: 3710bd4ba655SAlex Elder module_put(THIS_MODULE); 371127cc2594SAlex Elder 3712602adf40SYehuda Sadeh dout("Error adding device %s\n", buf); 371327cc2594SAlex Elder 371427cc2594SAlex Elder return (ssize_t) rc; 3715602adf40SYehuda Sadeh } 3716602adf40SYehuda Sadeh 3717de71a297SAlex Elder static struct rbd_device *__rbd_get_dev(unsigned long dev_id) 3718602adf40SYehuda Sadeh { 3719602adf40SYehuda Sadeh struct list_head *tmp; 3720602adf40SYehuda Sadeh struct rbd_device *rbd_dev; 3721602adf40SYehuda Sadeh 3722e124a82fSAlex Elder spin_lock(&rbd_dev_list_lock); 3723602adf40SYehuda Sadeh list_for_each(tmp, &rbd_dev_list) { 3724602adf40SYehuda Sadeh rbd_dev = list_entry(tmp, struct rbd_device, node); 3725de71a297SAlex Elder if (rbd_dev->dev_id == dev_id) { 3726e124a82fSAlex Elder spin_unlock(&rbd_dev_list_lock); 3727602adf40SYehuda Sadeh return rbd_dev; 3728602adf40SYehuda Sadeh } 3729e124a82fSAlex Elder } 3730e124a82fSAlex Elder spin_unlock(&rbd_dev_list_lock); 3731602adf40SYehuda Sadeh return NULL; 3732602adf40SYehuda Sadeh } 3733602adf40SYehuda Sadeh 3734dfc5606dSYehuda Sadeh static void rbd_dev_release(struct device *dev) 3735602adf40SYehuda Sadeh { 3736593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 3737602adf40SYehuda Sadeh 37381dbb4399SAlex Elder if (rbd_dev->watch_request) { 37391dbb4399SAlex Elder struct ceph_client *client = rbd_dev->rbd_client->client; 37401dbb4399SAlex Elder 37411dbb4399SAlex Elder ceph_osdc_unregister_linger_request(&client->osdc, 374259c2be1eSYehuda Sadeh rbd_dev->watch_request); 37431dbb4399SAlex Elder } 374459c2be1eSYehuda Sadeh if (rbd_dev->watch_event) 3745070c633fSAlex Elder rbd_req_sync_unwatch(rbd_dev); 374659c2be1eSYehuda Sadeh 3747602adf40SYehuda Sadeh 3748602adf40SYehuda Sadeh /* clean up and free blkdev */ 3749602adf40SYehuda Sadeh rbd_free_disk(rbd_dev); 3750602adf40SYehuda Sadeh unregister_blkdev(rbd_dev->major, rbd_dev->name); 375132eec68dSAlex Elder 37522ac4e75dSAlex Elder /* release allocated disk header fields */ 37532ac4e75dSAlex Elder rbd_header_free(&rbd_dev->header); 37542ac4e75dSAlex Elder 375532eec68dSAlex Elder /* done with the id, and with the rbd_dev */ 3756e2839308SAlex Elder rbd_dev_id_put(rbd_dev); 3757c53d5893SAlex Elder rbd_assert(rbd_dev->rbd_client != NULL); 3758c53d5893SAlex Elder rbd_dev_destroy(rbd_dev); 3759602adf40SYehuda Sadeh 3760602adf40SYehuda Sadeh /* release module ref */ 3761602adf40SYehuda Sadeh module_put(THIS_MODULE); 3762602adf40SYehuda Sadeh } 3763602adf40SYehuda Sadeh 3764dfc5606dSYehuda Sadeh static ssize_t rbd_remove(struct bus_type *bus, 3765602adf40SYehuda Sadeh const char *buf, 3766602adf40SYehuda Sadeh size_t count) 3767602adf40SYehuda Sadeh { 3768602adf40SYehuda Sadeh struct rbd_device *rbd_dev = NULL; 3769602adf40SYehuda Sadeh int target_id, rc; 3770602adf40SYehuda Sadeh unsigned long ul; 3771602adf40SYehuda Sadeh int ret = count; 3772602adf40SYehuda Sadeh 3773602adf40SYehuda Sadeh rc = strict_strtoul(buf, 10, &ul); 3774602adf40SYehuda Sadeh if (rc) 3775602adf40SYehuda Sadeh return rc; 3776602adf40SYehuda Sadeh 3777602adf40SYehuda Sadeh /* convert to int; abort if we lost anything in the conversion */ 3778602adf40SYehuda Sadeh target_id = (int) ul; 3779602adf40SYehuda Sadeh if (target_id != ul) 3780602adf40SYehuda Sadeh return -EINVAL; 3781602adf40SYehuda Sadeh 3782602adf40SYehuda Sadeh mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); 3783602adf40SYehuda Sadeh 3784602adf40SYehuda Sadeh rbd_dev = __rbd_get_dev(target_id); 3785602adf40SYehuda Sadeh if (!rbd_dev) { 3786602adf40SYehuda Sadeh ret = -ENOENT; 3787602adf40SYehuda Sadeh goto done; 3788602adf40SYehuda Sadeh } 3789602adf40SYehuda Sadeh 379042382b70SAlex Elder if (rbd_dev->open_count) { 379142382b70SAlex Elder ret = -EBUSY; 379242382b70SAlex Elder goto done; 379342382b70SAlex Elder } 379442382b70SAlex Elder 379541f38c2bSAlex Elder rbd_remove_all_snaps(rbd_dev); 3796dfc5606dSYehuda Sadeh rbd_bus_del_dev(rbd_dev); 3797602adf40SYehuda Sadeh 3798602adf40SYehuda Sadeh done: 3799602adf40SYehuda Sadeh mutex_unlock(&ctl_mutex); 3800aafb230eSAlex Elder 3801602adf40SYehuda Sadeh return ret; 3802602adf40SYehuda Sadeh } 3803602adf40SYehuda Sadeh 3804602adf40SYehuda Sadeh /* 3805602adf40SYehuda Sadeh * create control files in sysfs 3806dfc5606dSYehuda Sadeh * /sys/bus/rbd/... 3807602adf40SYehuda Sadeh */ 3808602adf40SYehuda Sadeh static int rbd_sysfs_init(void) 3809602adf40SYehuda Sadeh { 3810dfc5606dSYehuda Sadeh int ret; 3811602adf40SYehuda Sadeh 3812fed4c143SAlex Elder ret = device_register(&rbd_root_dev); 3813dfc5606dSYehuda Sadeh if (ret < 0) 3814dfc5606dSYehuda Sadeh return ret; 3815602adf40SYehuda Sadeh 3816fed4c143SAlex Elder ret = bus_register(&rbd_bus_type); 3817fed4c143SAlex Elder if (ret < 0) 3818fed4c143SAlex Elder device_unregister(&rbd_root_dev); 3819602adf40SYehuda Sadeh 3820602adf40SYehuda Sadeh return ret; 3821602adf40SYehuda Sadeh } 3822602adf40SYehuda Sadeh 3823602adf40SYehuda Sadeh static void rbd_sysfs_cleanup(void) 3824602adf40SYehuda Sadeh { 3825dfc5606dSYehuda Sadeh bus_unregister(&rbd_bus_type); 3826fed4c143SAlex Elder device_unregister(&rbd_root_dev); 3827602adf40SYehuda Sadeh } 3828602adf40SYehuda Sadeh 3829602adf40SYehuda Sadeh int __init rbd_init(void) 3830602adf40SYehuda Sadeh { 3831602adf40SYehuda Sadeh int rc; 3832602adf40SYehuda Sadeh 3833602adf40SYehuda Sadeh rc = rbd_sysfs_init(); 3834602adf40SYehuda Sadeh if (rc) 3835602adf40SYehuda Sadeh return rc; 3836f0f8cef5SAlex Elder pr_info("loaded " RBD_DRV_NAME_LONG "\n"); 3837602adf40SYehuda Sadeh return 0; 3838602adf40SYehuda Sadeh } 3839602adf40SYehuda Sadeh 3840602adf40SYehuda Sadeh void __exit rbd_exit(void) 3841602adf40SYehuda Sadeh { 3842602adf40SYehuda Sadeh rbd_sysfs_cleanup(); 3843602adf40SYehuda Sadeh } 3844602adf40SYehuda Sadeh 3845602adf40SYehuda Sadeh module_init(rbd_init); 3846602adf40SYehuda Sadeh module_exit(rbd_exit); 3847602adf40SYehuda Sadeh 3848602adf40SYehuda Sadeh MODULE_AUTHOR("Sage Weil <sage@newdream.net>"); 3849602adf40SYehuda Sadeh MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>"); 3850602adf40SYehuda Sadeh MODULE_DESCRIPTION("rados block device"); 3851602adf40SYehuda Sadeh 3852602adf40SYehuda Sadeh /* following authorship retained from original osdblk.c */ 3853602adf40SYehuda Sadeh MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>"); 3854602adf40SYehuda Sadeh 3855602adf40SYehuda Sadeh MODULE_LICENSE("GPL"); 3856