1602adf40SYehuda Sadeh /* 2602adf40SYehuda Sadeh rbd.c -- Export ceph rados objects as a Linux block device 3602adf40SYehuda Sadeh 4602adf40SYehuda Sadeh 5602adf40SYehuda Sadeh based on drivers/block/osdblk.c: 6602adf40SYehuda Sadeh 7602adf40SYehuda Sadeh Copyright 2009 Red Hat, Inc. 8602adf40SYehuda Sadeh 9602adf40SYehuda Sadeh This program is free software; you can redistribute it and/or modify 10602adf40SYehuda Sadeh it under the terms of the GNU General Public License as published by 11602adf40SYehuda Sadeh the Free Software Foundation. 12602adf40SYehuda Sadeh 13602adf40SYehuda Sadeh This program is distributed in the hope that it will be useful, 14602adf40SYehuda Sadeh but WITHOUT ANY WARRANTY; without even the implied warranty of 15602adf40SYehuda Sadeh MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16602adf40SYehuda Sadeh GNU General Public License for more details. 17602adf40SYehuda Sadeh 18602adf40SYehuda Sadeh You should have received a copy of the GNU General Public License 19602adf40SYehuda Sadeh along with this program; see the file COPYING. If not, write to 20602adf40SYehuda Sadeh the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. 21602adf40SYehuda Sadeh 22602adf40SYehuda Sadeh 23602adf40SYehuda Sadeh 24dfc5606dSYehuda Sadeh For usage instructions, please refer to: 25602adf40SYehuda Sadeh 26dfc5606dSYehuda Sadeh Documentation/ABI/testing/sysfs-bus-rbd 27602adf40SYehuda Sadeh 28602adf40SYehuda Sadeh */ 29602adf40SYehuda Sadeh 30602adf40SYehuda Sadeh #include <linux/ceph/libceph.h> 31602adf40SYehuda Sadeh #include <linux/ceph/osd_client.h> 32602adf40SYehuda Sadeh #include <linux/ceph/mon_client.h> 33602adf40SYehuda Sadeh #include <linux/ceph/decode.h> 3459c2be1eSYehuda Sadeh #include <linux/parser.h> 35602adf40SYehuda Sadeh 36602adf40SYehuda Sadeh #include <linux/kernel.h> 37602adf40SYehuda Sadeh #include <linux/device.h> 38602adf40SYehuda Sadeh #include <linux/module.h> 39602adf40SYehuda Sadeh #include <linux/fs.h> 40602adf40SYehuda Sadeh #include <linux/blkdev.h> 41602adf40SYehuda Sadeh 42602adf40SYehuda Sadeh #include "rbd_types.h" 43602adf40SYehuda Sadeh 44aafb230eSAlex Elder #define RBD_DEBUG /* Activate rbd_assert() calls */ 45aafb230eSAlex Elder 46593a9e7bSAlex Elder /* 47593a9e7bSAlex Elder * The basic unit of block I/O is a sector. It is interpreted in a 48593a9e7bSAlex Elder * number of contexts in Linux (blk, bio, genhd), but the default is 49593a9e7bSAlex Elder * universally 512 bytes. These symbols are just slightly more 50593a9e7bSAlex Elder * meaningful than the bare numbers they represent. 51593a9e7bSAlex Elder */ 52593a9e7bSAlex Elder #define SECTOR_SHIFT 9 53593a9e7bSAlex Elder #define SECTOR_SIZE (1ULL << SECTOR_SHIFT) 54593a9e7bSAlex Elder 552647ba38SAlex Elder /* It might be useful to have these defined elsewhere */ 56df111be6SAlex Elder 572647ba38SAlex Elder #define U8_MAX ((u8) (~0U)) 582647ba38SAlex Elder #define U16_MAX ((u16) (~0U)) 590ec8ce87SAlex Elder #define U32_MAX ((u32) (~0U)) 60df111be6SAlex Elder #define U64_MAX ((u64) (~0ULL)) 61df111be6SAlex Elder 62f0f8cef5SAlex Elder #define RBD_DRV_NAME "rbd" 63f0f8cef5SAlex Elder #define RBD_DRV_NAME_LONG "rbd (rados block device)" 64602adf40SYehuda Sadeh 65602adf40SYehuda Sadeh #define RBD_MINORS_PER_MAJOR 256 /* max minors per blkdev */ 66602adf40SYehuda Sadeh 67d4b125e9SAlex Elder #define RBD_SNAP_DEV_NAME_PREFIX "snap_" 68d4b125e9SAlex Elder #define RBD_MAX_SNAP_NAME_LEN \ 69d4b125e9SAlex Elder (NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1)) 70d4b125e9SAlex Elder 7135d489f9SAlex Elder #define RBD_MAX_SNAP_COUNT 510 /* allows max snapc to fit in 4KB */ 72602adf40SYehuda Sadeh #define RBD_MAX_OPT_LEN 1024 73602adf40SYehuda Sadeh 74602adf40SYehuda Sadeh #define RBD_SNAP_HEAD_NAME "-" 75602adf40SYehuda Sadeh 769e15b77dSAlex Elder /* This allows a single page to hold an image name sent by OSD */ 779e15b77dSAlex Elder #define RBD_IMAGE_NAME_LEN_MAX (PAGE_SIZE - sizeof (__le32) - 1) 78589d30e0SAlex Elder #define RBD_IMAGE_ID_LEN_MAX 64 799e15b77dSAlex Elder 801e130199SAlex Elder #define RBD_OBJ_PREFIX_LEN_MAX 64 81589d30e0SAlex Elder 82d889140cSAlex Elder /* Feature bits */ 83d889140cSAlex Elder 84d889140cSAlex Elder #define RBD_FEATURE_LAYERING 1 85d889140cSAlex Elder 86d889140cSAlex Elder /* Features supported by this (client software) implementation. */ 87d889140cSAlex Elder 88d889140cSAlex Elder #define RBD_FEATURES_ALL (0) 89d889140cSAlex Elder 9081a89793SAlex Elder /* 9181a89793SAlex Elder * An RBD device name will be "rbd#", where the "rbd" comes from 9281a89793SAlex Elder * RBD_DRV_NAME above, and # is a unique integer identifier. 9381a89793SAlex Elder * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big 9481a89793SAlex Elder * enough to hold all possible device names. 9581a89793SAlex Elder */ 96602adf40SYehuda Sadeh #define DEV_NAME_LEN 32 9781a89793SAlex Elder #define MAX_INT_FORMAT_WIDTH ((5 * sizeof (int)) / 2 + 1) 98602adf40SYehuda Sadeh 99cc0538b6SAlex Elder #define RBD_READ_ONLY_DEFAULT false 10059c2be1eSYehuda Sadeh 101602adf40SYehuda Sadeh /* 102602adf40SYehuda Sadeh * block device image metadata (in-memory version) 103602adf40SYehuda Sadeh */ 104602adf40SYehuda Sadeh struct rbd_image_header { 105f84344f3SAlex Elder /* These four fields never change for a given rbd image */ 106849b4260SAlex Elder char *object_prefix; 10734b13184SAlex Elder u64 features; 108602adf40SYehuda Sadeh __u8 obj_order; 109602adf40SYehuda Sadeh __u8 crypt_type; 110602adf40SYehuda Sadeh __u8 comp_type; 111602adf40SYehuda Sadeh 112f84344f3SAlex Elder /* The remaining fields need to be updated occasionally */ 113f84344f3SAlex Elder u64 image_size; 114f84344f3SAlex Elder struct ceph_snap_context *snapc; 115602adf40SYehuda Sadeh char *snap_names; 116602adf40SYehuda Sadeh u64 *snap_sizes; 11759c2be1eSYehuda Sadeh 11859c2be1eSYehuda Sadeh u64 obj_version; 11959c2be1eSYehuda Sadeh }; 12059c2be1eSYehuda Sadeh 1210d7dbfceSAlex Elder /* 1220d7dbfceSAlex Elder * An rbd image specification. 1230d7dbfceSAlex Elder * 1240d7dbfceSAlex Elder * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely 125c66c6e0cSAlex Elder * identify an image. Each rbd_dev structure includes a pointer to 126c66c6e0cSAlex Elder * an rbd_spec structure that encapsulates this identity. 127c66c6e0cSAlex Elder * 128c66c6e0cSAlex Elder * Each of the id's in an rbd_spec has an associated name. For a 129c66c6e0cSAlex Elder * user-mapped image, the names are supplied and the id's associated 130c66c6e0cSAlex Elder * with them are looked up. For a layered image, a parent image is 131c66c6e0cSAlex Elder * defined by the tuple, and the names are looked up. 132c66c6e0cSAlex Elder * 133c66c6e0cSAlex Elder * An rbd_dev structure contains a parent_spec pointer which is 134c66c6e0cSAlex Elder * non-null if the image it represents is a child in a layered 135c66c6e0cSAlex Elder * image. This pointer will refer to the rbd_spec structure used 136c66c6e0cSAlex Elder * by the parent rbd_dev for its own identity (i.e., the structure 137c66c6e0cSAlex Elder * is shared between the parent and child). 138c66c6e0cSAlex Elder * 139c66c6e0cSAlex Elder * Since these structures are populated once, during the discovery 140c66c6e0cSAlex Elder * phase of image construction, they are effectively immutable so 141c66c6e0cSAlex Elder * we make no effort to synchronize access to them. 142c66c6e0cSAlex Elder * 143c66c6e0cSAlex Elder * Note that code herein does not assume the image name is known (it 144c66c6e0cSAlex Elder * could be a null pointer). 1450d7dbfceSAlex Elder */ 1460d7dbfceSAlex Elder struct rbd_spec { 1470d7dbfceSAlex Elder u64 pool_id; 1480d7dbfceSAlex Elder char *pool_name; 1490d7dbfceSAlex Elder 1500d7dbfceSAlex Elder char *image_id; 1510d7dbfceSAlex Elder char *image_name; 1520d7dbfceSAlex Elder 1530d7dbfceSAlex Elder u64 snap_id; 1540d7dbfceSAlex Elder char *snap_name; 1550d7dbfceSAlex Elder 1560d7dbfceSAlex Elder struct kref kref; 1570d7dbfceSAlex Elder }; 1580d7dbfceSAlex Elder 15959c2be1eSYehuda Sadeh struct rbd_options { 160cc0538b6SAlex Elder bool read_only; 161602adf40SYehuda Sadeh }; 162602adf40SYehuda Sadeh 163602adf40SYehuda Sadeh /* 164f0f8cef5SAlex Elder * an instance of the client. multiple devices may share an rbd client. 165602adf40SYehuda Sadeh */ 166602adf40SYehuda Sadeh struct rbd_client { 167602adf40SYehuda Sadeh struct ceph_client *client; 168602adf40SYehuda Sadeh struct kref kref; 169602adf40SYehuda Sadeh struct list_head node; 170602adf40SYehuda Sadeh }; 171602adf40SYehuda Sadeh 172602adf40SYehuda Sadeh /* 173f0f8cef5SAlex Elder * a request completion status 174602adf40SYehuda Sadeh */ 1751fec7093SYehuda Sadeh struct rbd_req_status { 1761fec7093SYehuda Sadeh int done; 1778986cb37SAlex Elder s32 rc; 1781fec7093SYehuda Sadeh u64 bytes; 1791fec7093SYehuda Sadeh }; 1801fec7093SYehuda Sadeh 1811fec7093SYehuda Sadeh /* 1821fec7093SYehuda Sadeh * a collection of requests 1831fec7093SYehuda Sadeh */ 1841fec7093SYehuda Sadeh struct rbd_req_coll { 1851fec7093SYehuda Sadeh int total; 1861fec7093SYehuda Sadeh int num_done; 1871fec7093SYehuda Sadeh struct kref kref; 1881fec7093SYehuda Sadeh struct rbd_req_status status[0]; 189602adf40SYehuda Sadeh }; 190602adf40SYehuda Sadeh 191f0f8cef5SAlex Elder /* 192f0f8cef5SAlex Elder * a single io request 193f0f8cef5SAlex Elder */ 194f0f8cef5SAlex Elder struct rbd_request { 195f0f8cef5SAlex Elder struct request *rq; /* blk layer request */ 196f0f8cef5SAlex Elder struct bio *bio; /* cloned bio */ 197f0f8cef5SAlex Elder struct page **pages; /* list of used pages */ 198f0f8cef5SAlex Elder u64 len; 199f0f8cef5SAlex Elder int coll_index; 200f0f8cef5SAlex Elder struct rbd_req_coll *coll; 201f0f8cef5SAlex Elder }; 202f0f8cef5SAlex Elder 203dfc5606dSYehuda Sadeh struct rbd_snap { 204dfc5606dSYehuda Sadeh struct device dev; 205dfc5606dSYehuda Sadeh const char *name; 2063591538fSJosh Durgin u64 size; 207dfc5606dSYehuda Sadeh struct list_head node; 208dfc5606dSYehuda Sadeh u64 id; 20934b13184SAlex Elder u64 features; 210dfc5606dSYehuda Sadeh }; 211dfc5606dSYehuda Sadeh 212f84344f3SAlex Elder struct rbd_mapping { 21399c1f08fSAlex Elder u64 size; 21434b13184SAlex Elder u64 features; 215f84344f3SAlex Elder bool read_only; 216f84344f3SAlex Elder }; 217f84344f3SAlex Elder 218602adf40SYehuda Sadeh /* 219602adf40SYehuda Sadeh * a single device 220602adf40SYehuda Sadeh */ 221602adf40SYehuda Sadeh struct rbd_device { 222de71a297SAlex Elder int dev_id; /* blkdev unique id */ 223602adf40SYehuda Sadeh 224602adf40SYehuda Sadeh int major; /* blkdev assigned major */ 225602adf40SYehuda Sadeh struct gendisk *disk; /* blkdev's gendisk and rq */ 226602adf40SYehuda Sadeh 227a30b71b9SAlex Elder u32 image_format; /* Either 1 or 2 */ 228602adf40SYehuda Sadeh struct rbd_client *rbd_client; 229602adf40SYehuda Sadeh 230602adf40SYehuda Sadeh char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */ 231602adf40SYehuda Sadeh 232602adf40SYehuda Sadeh spinlock_t lock; /* queue lock */ 233602adf40SYehuda Sadeh 234602adf40SYehuda Sadeh struct rbd_image_header header; 235d78b650aSAlex Elder atomic_t exists; 2360d7dbfceSAlex Elder struct rbd_spec *spec; 237602adf40SYehuda Sadeh 2380d7dbfceSAlex Elder char *header_name; 239971f839aSAlex Elder 2400903e875SAlex Elder struct ceph_file_layout layout; 2410903e875SAlex Elder 24259c2be1eSYehuda Sadeh struct ceph_osd_event *watch_event; 24359c2be1eSYehuda Sadeh struct ceph_osd_request *watch_request; 24459c2be1eSYehuda Sadeh 24586b00e0dSAlex Elder struct rbd_spec *parent_spec; 24686b00e0dSAlex Elder u64 parent_overlap; 24786b00e0dSAlex Elder 248c666601aSJosh Durgin /* protects updating the header */ 249c666601aSJosh Durgin struct rw_semaphore header_rwsem; 250f84344f3SAlex Elder 251f84344f3SAlex Elder struct rbd_mapping mapping; 252602adf40SYehuda Sadeh 253602adf40SYehuda Sadeh struct list_head node; 254dfc5606dSYehuda Sadeh 255dfc5606dSYehuda Sadeh /* list of snapshots */ 256dfc5606dSYehuda Sadeh struct list_head snaps; 257dfc5606dSYehuda Sadeh 258dfc5606dSYehuda Sadeh /* sysfs related */ 259dfc5606dSYehuda Sadeh struct device dev; 26042382b70SAlex Elder unsigned long open_count; 261dfc5606dSYehuda Sadeh }; 262dfc5606dSYehuda Sadeh 263602adf40SYehuda Sadeh static DEFINE_MUTEX(ctl_mutex); /* Serialize open/close/setup/teardown */ 264e124a82fSAlex Elder 265602adf40SYehuda Sadeh static LIST_HEAD(rbd_dev_list); /* devices */ 266e124a82fSAlex Elder static DEFINE_SPINLOCK(rbd_dev_list_lock); 267e124a82fSAlex Elder 268602adf40SYehuda Sadeh static LIST_HEAD(rbd_client_list); /* clients */ 269432b8587SAlex Elder static DEFINE_SPINLOCK(rbd_client_list_lock); 270602adf40SYehuda Sadeh 271304f6808SAlex Elder static int rbd_dev_snaps_update(struct rbd_device *rbd_dev); 272304f6808SAlex Elder static int rbd_dev_snaps_register(struct rbd_device *rbd_dev); 273304f6808SAlex Elder 274dfc5606dSYehuda Sadeh static void rbd_dev_release(struct device *dev); 27541f38c2bSAlex Elder static void rbd_remove_snap_dev(struct rbd_snap *snap); 276dfc5606dSYehuda Sadeh 277f0f8cef5SAlex Elder static ssize_t rbd_add(struct bus_type *bus, const char *buf, 278f0f8cef5SAlex Elder size_t count); 279f0f8cef5SAlex Elder static ssize_t rbd_remove(struct bus_type *bus, const char *buf, 280f0f8cef5SAlex Elder size_t count); 281f0f8cef5SAlex Elder 282f0f8cef5SAlex Elder static struct bus_attribute rbd_bus_attrs[] = { 283f0f8cef5SAlex Elder __ATTR(add, S_IWUSR, NULL, rbd_add), 284f0f8cef5SAlex Elder __ATTR(remove, S_IWUSR, NULL, rbd_remove), 285f0f8cef5SAlex Elder __ATTR_NULL 286f0f8cef5SAlex Elder }; 287f0f8cef5SAlex Elder 288f0f8cef5SAlex Elder static struct bus_type rbd_bus_type = { 289f0f8cef5SAlex Elder .name = "rbd", 290f0f8cef5SAlex Elder .bus_attrs = rbd_bus_attrs, 291f0f8cef5SAlex Elder }; 292f0f8cef5SAlex Elder 293f0f8cef5SAlex Elder static void rbd_root_dev_release(struct device *dev) 294f0f8cef5SAlex Elder { 295f0f8cef5SAlex Elder } 296f0f8cef5SAlex Elder 297f0f8cef5SAlex Elder static struct device rbd_root_dev = { 298f0f8cef5SAlex Elder .init_name = "rbd", 299f0f8cef5SAlex Elder .release = rbd_root_dev_release, 300f0f8cef5SAlex Elder }; 301f0f8cef5SAlex Elder 30206ecc6cbSAlex Elder static __printf(2, 3) 30306ecc6cbSAlex Elder void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...) 30406ecc6cbSAlex Elder { 30506ecc6cbSAlex Elder struct va_format vaf; 30606ecc6cbSAlex Elder va_list args; 30706ecc6cbSAlex Elder 30806ecc6cbSAlex Elder va_start(args, fmt); 30906ecc6cbSAlex Elder vaf.fmt = fmt; 31006ecc6cbSAlex Elder vaf.va = &args; 31106ecc6cbSAlex Elder 31206ecc6cbSAlex Elder if (!rbd_dev) 31306ecc6cbSAlex Elder printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf); 31406ecc6cbSAlex Elder else if (rbd_dev->disk) 31506ecc6cbSAlex Elder printk(KERN_WARNING "%s: %s: %pV\n", 31606ecc6cbSAlex Elder RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf); 31706ecc6cbSAlex Elder else if (rbd_dev->spec && rbd_dev->spec->image_name) 31806ecc6cbSAlex Elder printk(KERN_WARNING "%s: image %s: %pV\n", 31906ecc6cbSAlex Elder RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf); 32006ecc6cbSAlex Elder else if (rbd_dev->spec && rbd_dev->spec->image_id) 32106ecc6cbSAlex Elder printk(KERN_WARNING "%s: id %s: %pV\n", 32206ecc6cbSAlex Elder RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf); 32306ecc6cbSAlex Elder else /* punt */ 32406ecc6cbSAlex Elder printk(KERN_WARNING "%s: rbd_dev %p: %pV\n", 32506ecc6cbSAlex Elder RBD_DRV_NAME, rbd_dev, &vaf); 32606ecc6cbSAlex Elder va_end(args); 32706ecc6cbSAlex Elder } 32806ecc6cbSAlex Elder 329aafb230eSAlex Elder #ifdef RBD_DEBUG 330aafb230eSAlex Elder #define rbd_assert(expr) \ 331aafb230eSAlex Elder if (unlikely(!(expr))) { \ 332aafb230eSAlex Elder printk(KERN_ERR "\nAssertion failure in %s() " \ 333aafb230eSAlex Elder "at line %d:\n\n" \ 334aafb230eSAlex Elder "\trbd_assert(%s);\n\n", \ 335aafb230eSAlex Elder __func__, __LINE__, #expr); \ 336aafb230eSAlex Elder BUG(); \ 337aafb230eSAlex Elder } 338aafb230eSAlex Elder #else /* !RBD_DEBUG */ 339aafb230eSAlex Elder # define rbd_assert(expr) ((void) 0) 340aafb230eSAlex Elder #endif /* !RBD_DEBUG */ 341dfc5606dSYehuda Sadeh 342117973fbSAlex Elder static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver); 343117973fbSAlex Elder static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver); 34459c2be1eSYehuda Sadeh 345602adf40SYehuda Sadeh static int rbd_open(struct block_device *bdev, fmode_t mode) 346602adf40SYehuda Sadeh { 347f0f8cef5SAlex Elder struct rbd_device *rbd_dev = bdev->bd_disk->private_data; 348602adf40SYehuda Sadeh 349f84344f3SAlex Elder if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only) 350602adf40SYehuda Sadeh return -EROFS; 351602adf40SYehuda Sadeh 35242382b70SAlex Elder mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); 353c3e946ceSAlex Elder (void) get_device(&rbd_dev->dev); 354f84344f3SAlex Elder set_device_ro(bdev, rbd_dev->mapping.read_only); 35542382b70SAlex Elder rbd_dev->open_count++; 35642382b70SAlex Elder mutex_unlock(&ctl_mutex); 357340c7a2bSAlex Elder 358602adf40SYehuda Sadeh return 0; 359602adf40SYehuda Sadeh } 360602adf40SYehuda Sadeh 361dfc5606dSYehuda Sadeh static int rbd_release(struct gendisk *disk, fmode_t mode) 362dfc5606dSYehuda Sadeh { 363dfc5606dSYehuda Sadeh struct rbd_device *rbd_dev = disk->private_data; 364dfc5606dSYehuda Sadeh 36542382b70SAlex Elder mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); 36642382b70SAlex Elder rbd_assert(rbd_dev->open_count > 0); 36742382b70SAlex Elder rbd_dev->open_count--; 368c3e946ceSAlex Elder put_device(&rbd_dev->dev); 36942382b70SAlex Elder mutex_unlock(&ctl_mutex); 370dfc5606dSYehuda Sadeh 371dfc5606dSYehuda Sadeh return 0; 372dfc5606dSYehuda Sadeh } 373dfc5606dSYehuda Sadeh 374602adf40SYehuda Sadeh static const struct block_device_operations rbd_bd_ops = { 375602adf40SYehuda Sadeh .owner = THIS_MODULE, 376602adf40SYehuda Sadeh .open = rbd_open, 377dfc5606dSYehuda Sadeh .release = rbd_release, 378602adf40SYehuda Sadeh }; 379602adf40SYehuda Sadeh 380602adf40SYehuda Sadeh /* 381602adf40SYehuda Sadeh * Initialize an rbd client instance. 38243ae4701SAlex Elder * We own *ceph_opts. 383602adf40SYehuda Sadeh */ 384f8c38929SAlex Elder static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts) 385602adf40SYehuda Sadeh { 386602adf40SYehuda Sadeh struct rbd_client *rbdc; 387602adf40SYehuda Sadeh int ret = -ENOMEM; 388602adf40SYehuda Sadeh 389602adf40SYehuda Sadeh dout("rbd_client_create\n"); 390602adf40SYehuda Sadeh rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL); 391602adf40SYehuda Sadeh if (!rbdc) 392602adf40SYehuda Sadeh goto out_opt; 393602adf40SYehuda Sadeh 394602adf40SYehuda Sadeh kref_init(&rbdc->kref); 395602adf40SYehuda Sadeh INIT_LIST_HEAD(&rbdc->node); 396602adf40SYehuda Sadeh 397bc534d86SAlex Elder mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); 398bc534d86SAlex Elder 39943ae4701SAlex Elder rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0); 400602adf40SYehuda Sadeh if (IS_ERR(rbdc->client)) 401bc534d86SAlex Elder goto out_mutex; 40243ae4701SAlex Elder ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */ 403602adf40SYehuda Sadeh 404602adf40SYehuda Sadeh ret = ceph_open_session(rbdc->client); 405602adf40SYehuda Sadeh if (ret < 0) 406602adf40SYehuda Sadeh goto out_err; 407602adf40SYehuda Sadeh 408432b8587SAlex Elder spin_lock(&rbd_client_list_lock); 409602adf40SYehuda Sadeh list_add_tail(&rbdc->node, &rbd_client_list); 410432b8587SAlex Elder spin_unlock(&rbd_client_list_lock); 411602adf40SYehuda Sadeh 412bc534d86SAlex Elder mutex_unlock(&ctl_mutex); 413bc534d86SAlex Elder 414602adf40SYehuda Sadeh dout("rbd_client_create created %p\n", rbdc); 415602adf40SYehuda Sadeh return rbdc; 416602adf40SYehuda Sadeh 417602adf40SYehuda Sadeh out_err: 418602adf40SYehuda Sadeh ceph_destroy_client(rbdc->client); 419bc534d86SAlex Elder out_mutex: 420bc534d86SAlex Elder mutex_unlock(&ctl_mutex); 421602adf40SYehuda Sadeh kfree(rbdc); 422602adf40SYehuda Sadeh out_opt: 42343ae4701SAlex Elder if (ceph_opts) 42443ae4701SAlex Elder ceph_destroy_options(ceph_opts); 42528f259b7SVasiliy Kulikov return ERR_PTR(ret); 426602adf40SYehuda Sadeh } 427602adf40SYehuda Sadeh 428602adf40SYehuda Sadeh /* 4291f7ba331SAlex Elder * Find a ceph client with specific addr and configuration. If 4301f7ba331SAlex Elder * found, bump its reference count. 431602adf40SYehuda Sadeh */ 4321f7ba331SAlex Elder static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts) 433602adf40SYehuda Sadeh { 434602adf40SYehuda Sadeh struct rbd_client *client_node; 4351f7ba331SAlex Elder bool found = false; 436602adf40SYehuda Sadeh 43743ae4701SAlex Elder if (ceph_opts->flags & CEPH_OPT_NOSHARE) 438602adf40SYehuda Sadeh return NULL; 439602adf40SYehuda Sadeh 4401f7ba331SAlex Elder spin_lock(&rbd_client_list_lock); 4411f7ba331SAlex Elder list_for_each_entry(client_node, &rbd_client_list, node) { 4421f7ba331SAlex Elder if (!ceph_compare_options(ceph_opts, client_node->client)) { 4431f7ba331SAlex Elder kref_get(&client_node->kref); 4441f7ba331SAlex Elder found = true; 4451f7ba331SAlex Elder break; 4461f7ba331SAlex Elder } 4471f7ba331SAlex Elder } 4481f7ba331SAlex Elder spin_unlock(&rbd_client_list_lock); 4491f7ba331SAlex Elder 4501f7ba331SAlex Elder return found ? client_node : NULL; 451602adf40SYehuda Sadeh } 452602adf40SYehuda Sadeh 453602adf40SYehuda Sadeh /* 45459c2be1eSYehuda Sadeh * mount options 45559c2be1eSYehuda Sadeh */ 45659c2be1eSYehuda Sadeh enum { 45759c2be1eSYehuda Sadeh Opt_last_int, 45859c2be1eSYehuda Sadeh /* int args above */ 45959c2be1eSYehuda Sadeh Opt_last_string, 46059c2be1eSYehuda Sadeh /* string args above */ 461cc0538b6SAlex Elder Opt_read_only, 462cc0538b6SAlex Elder Opt_read_write, 463cc0538b6SAlex Elder /* Boolean args above */ 464cc0538b6SAlex Elder Opt_last_bool, 46559c2be1eSYehuda Sadeh }; 46659c2be1eSYehuda Sadeh 46743ae4701SAlex Elder static match_table_t rbd_opts_tokens = { 46859c2be1eSYehuda Sadeh /* int args above */ 46959c2be1eSYehuda Sadeh /* string args above */ 470be466c1cSAlex Elder {Opt_read_only, "read_only"}, 471cc0538b6SAlex Elder {Opt_read_only, "ro"}, /* Alternate spelling */ 472cc0538b6SAlex Elder {Opt_read_write, "read_write"}, 473cc0538b6SAlex Elder {Opt_read_write, "rw"}, /* Alternate spelling */ 474cc0538b6SAlex Elder /* Boolean args above */ 47559c2be1eSYehuda Sadeh {-1, NULL} 47659c2be1eSYehuda Sadeh }; 47759c2be1eSYehuda Sadeh 47859c2be1eSYehuda Sadeh static int parse_rbd_opts_token(char *c, void *private) 47959c2be1eSYehuda Sadeh { 48043ae4701SAlex Elder struct rbd_options *rbd_opts = private; 48159c2be1eSYehuda Sadeh substring_t argstr[MAX_OPT_ARGS]; 48259c2be1eSYehuda Sadeh int token, intval, ret; 48359c2be1eSYehuda Sadeh 48443ae4701SAlex Elder token = match_token(c, rbd_opts_tokens, argstr); 48559c2be1eSYehuda Sadeh if (token < 0) 48659c2be1eSYehuda Sadeh return -EINVAL; 48759c2be1eSYehuda Sadeh 48859c2be1eSYehuda Sadeh if (token < Opt_last_int) { 48959c2be1eSYehuda Sadeh ret = match_int(&argstr[0], &intval); 49059c2be1eSYehuda Sadeh if (ret < 0) { 49159c2be1eSYehuda Sadeh pr_err("bad mount option arg (not int) " 49259c2be1eSYehuda Sadeh "at '%s'\n", c); 49359c2be1eSYehuda Sadeh return ret; 49459c2be1eSYehuda Sadeh } 49559c2be1eSYehuda Sadeh dout("got int token %d val %d\n", token, intval); 49659c2be1eSYehuda Sadeh } else if (token > Opt_last_int && token < Opt_last_string) { 49759c2be1eSYehuda Sadeh dout("got string token %d val %s\n", token, 49859c2be1eSYehuda Sadeh argstr[0].from); 499cc0538b6SAlex Elder } else if (token > Opt_last_string && token < Opt_last_bool) { 500cc0538b6SAlex Elder dout("got Boolean token %d\n", token); 50159c2be1eSYehuda Sadeh } else { 50259c2be1eSYehuda Sadeh dout("got token %d\n", token); 50359c2be1eSYehuda Sadeh } 50459c2be1eSYehuda Sadeh 50559c2be1eSYehuda Sadeh switch (token) { 506cc0538b6SAlex Elder case Opt_read_only: 507cc0538b6SAlex Elder rbd_opts->read_only = true; 508cc0538b6SAlex Elder break; 509cc0538b6SAlex Elder case Opt_read_write: 510cc0538b6SAlex Elder rbd_opts->read_only = false; 511cc0538b6SAlex Elder break; 51259c2be1eSYehuda Sadeh default: 513aafb230eSAlex Elder rbd_assert(false); 514aafb230eSAlex Elder break; 51559c2be1eSYehuda Sadeh } 51659c2be1eSYehuda Sadeh return 0; 51759c2be1eSYehuda Sadeh } 51859c2be1eSYehuda Sadeh 51959c2be1eSYehuda Sadeh /* 520602adf40SYehuda Sadeh * Get a ceph client with specific addr and configuration, if one does 521602adf40SYehuda Sadeh * not exist create it. 522602adf40SYehuda Sadeh */ 5239d3997fdSAlex Elder static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts) 524602adf40SYehuda Sadeh { 525f8c38929SAlex Elder struct rbd_client *rbdc; 52659c2be1eSYehuda Sadeh 5271f7ba331SAlex Elder rbdc = rbd_client_find(ceph_opts); 5289d3997fdSAlex Elder if (rbdc) /* using an existing client */ 52943ae4701SAlex Elder ceph_destroy_options(ceph_opts); 5309d3997fdSAlex Elder else 531f8c38929SAlex Elder rbdc = rbd_client_create(ceph_opts); 532d720bcb0SAlex Elder 5339d3997fdSAlex Elder return rbdc; 534602adf40SYehuda Sadeh } 535602adf40SYehuda Sadeh 536602adf40SYehuda Sadeh /* 537602adf40SYehuda Sadeh * Destroy ceph client 538d23a4b3fSAlex Elder * 539432b8587SAlex Elder * Caller must hold rbd_client_list_lock. 540602adf40SYehuda Sadeh */ 541602adf40SYehuda Sadeh static void rbd_client_release(struct kref *kref) 542602adf40SYehuda Sadeh { 543602adf40SYehuda Sadeh struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref); 544602adf40SYehuda Sadeh 545602adf40SYehuda Sadeh dout("rbd_release_client %p\n", rbdc); 546cd9d9f5dSAlex Elder spin_lock(&rbd_client_list_lock); 547602adf40SYehuda Sadeh list_del(&rbdc->node); 548cd9d9f5dSAlex Elder spin_unlock(&rbd_client_list_lock); 549602adf40SYehuda Sadeh 550602adf40SYehuda Sadeh ceph_destroy_client(rbdc->client); 551602adf40SYehuda Sadeh kfree(rbdc); 552602adf40SYehuda Sadeh } 553602adf40SYehuda Sadeh 554602adf40SYehuda Sadeh /* 555602adf40SYehuda Sadeh * Drop reference to ceph client node. If it's not referenced anymore, release 556602adf40SYehuda Sadeh * it. 557602adf40SYehuda Sadeh */ 5589d3997fdSAlex Elder static void rbd_put_client(struct rbd_client *rbdc) 559602adf40SYehuda Sadeh { 560c53d5893SAlex Elder if (rbdc) 5619d3997fdSAlex Elder kref_put(&rbdc->kref, rbd_client_release); 562602adf40SYehuda Sadeh } 563602adf40SYehuda Sadeh 5641fec7093SYehuda Sadeh /* 5651fec7093SYehuda Sadeh * Destroy requests collection 5661fec7093SYehuda Sadeh */ 5671fec7093SYehuda Sadeh static void rbd_coll_release(struct kref *kref) 5681fec7093SYehuda Sadeh { 5691fec7093SYehuda Sadeh struct rbd_req_coll *coll = 5701fec7093SYehuda Sadeh container_of(kref, struct rbd_req_coll, kref); 5711fec7093SYehuda Sadeh 5721fec7093SYehuda Sadeh dout("rbd_coll_release %p\n", coll); 5731fec7093SYehuda Sadeh kfree(coll); 5741fec7093SYehuda Sadeh } 575602adf40SYehuda Sadeh 576a30b71b9SAlex Elder static bool rbd_image_format_valid(u32 image_format) 577a30b71b9SAlex Elder { 578a30b71b9SAlex Elder return image_format == 1 || image_format == 2; 579a30b71b9SAlex Elder } 580a30b71b9SAlex Elder 5818e94af8eSAlex Elder static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk) 5828e94af8eSAlex Elder { 583103a150fSAlex Elder size_t size; 584103a150fSAlex Elder u32 snap_count; 585103a150fSAlex Elder 586103a150fSAlex Elder /* The header has to start with the magic rbd header text */ 587103a150fSAlex Elder if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT))) 588103a150fSAlex Elder return false; 589103a150fSAlex Elder 590db2388b6SAlex Elder /* The bio layer requires at least sector-sized I/O */ 591db2388b6SAlex Elder 592db2388b6SAlex Elder if (ondisk->options.order < SECTOR_SHIFT) 593db2388b6SAlex Elder return false; 594db2388b6SAlex Elder 595db2388b6SAlex Elder /* If we use u64 in a few spots we may be able to loosen this */ 596db2388b6SAlex Elder 597db2388b6SAlex Elder if (ondisk->options.order > 8 * sizeof (int) - 1) 598db2388b6SAlex Elder return false; 599db2388b6SAlex Elder 600103a150fSAlex Elder /* 601103a150fSAlex Elder * The size of a snapshot header has to fit in a size_t, and 602103a150fSAlex Elder * that limits the number of snapshots. 603103a150fSAlex Elder */ 604103a150fSAlex Elder snap_count = le32_to_cpu(ondisk->snap_count); 605103a150fSAlex Elder size = SIZE_MAX - sizeof (struct ceph_snap_context); 606103a150fSAlex Elder if (snap_count > size / sizeof (__le64)) 607103a150fSAlex Elder return false; 608103a150fSAlex Elder 609103a150fSAlex Elder /* 610103a150fSAlex Elder * Not only that, but the size of the entire the snapshot 611103a150fSAlex Elder * header must also be representable in a size_t. 612103a150fSAlex Elder */ 613103a150fSAlex Elder size -= snap_count * sizeof (__le64); 614103a150fSAlex Elder if ((u64) size < le64_to_cpu(ondisk->snap_names_len)) 615103a150fSAlex Elder return false; 616103a150fSAlex Elder 617103a150fSAlex Elder return true; 6188e94af8eSAlex Elder } 6198e94af8eSAlex Elder 620602adf40SYehuda Sadeh /* 621602adf40SYehuda Sadeh * Create a new header structure, translate header format from the on-disk 622602adf40SYehuda Sadeh * header. 623602adf40SYehuda Sadeh */ 624602adf40SYehuda Sadeh static int rbd_header_from_disk(struct rbd_image_header *header, 6254156d998SAlex Elder struct rbd_image_header_ondisk *ondisk) 626602adf40SYehuda Sadeh { 627ccece235SAlex Elder u32 snap_count; 62858c17b0eSAlex Elder size_t len; 629d2bb24e5SAlex Elder size_t size; 630621901d6SAlex Elder u32 i; 631602adf40SYehuda Sadeh 6326a52325fSAlex Elder memset(header, 0, sizeof (*header)); 6336a52325fSAlex Elder 634103a150fSAlex Elder snap_count = le32_to_cpu(ondisk->snap_count); 635103a150fSAlex Elder 63658c17b0eSAlex Elder len = strnlen(ondisk->object_prefix, sizeof (ondisk->object_prefix)); 63758c17b0eSAlex Elder header->object_prefix = kmalloc(len + 1, GFP_KERNEL); 6386a52325fSAlex Elder if (!header->object_prefix) 639602adf40SYehuda Sadeh return -ENOMEM; 64058c17b0eSAlex Elder memcpy(header->object_prefix, ondisk->object_prefix, len); 64158c17b0eSAlex Elder header->object_prefix[len] = '\0'; 64200f1f36fSAlex Elder 643602adf40SYehuda Sadeh if (snap_count) { 644f785cc1dSAlex Elder u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len); 645f785cc1dSAlex Elder 646621901d6SAlex Elder /* Save a copy of the snapshot names */ 647621901d6SAlex Elder 648f785cc1dSAlex Elder if (snap_names_len > (u64) SIZE_MAX) 649f785cc1dSAlex Elder return -EIO; 650f785cc1dSAlex Elder header->snap_names = kmalloc(snap_names_len, GFP_KERNEL); 651602adf40SYehuda Sadeh if (!header->snap_names) 6526a52325fSAlex Elder goto out_err; 653f785cc1dSAlex Elder /* 654f785cc1dSAlex Elder * Note that rbd_dev_v1_header_read() guarantees 655f785cc1dSAlex Elder * the ondisk buffer we're working with has 656f785cc1dSAlex Elder * snap_names_len bytes beyond the end of the 657f785cc1dSAlex Elder * snapshot id array, this memcpy() is safe. 658f785cc1dSAlex Elder */ 659f785cc1dSAlex Elder memcpy(header->snap_names, &ondisk->snaps[snap_count], 660f785cc1dSAlex Elder snap_names_len); 6616a52325fSAlex Elder 662621901d6SAlex Elder /* Record each snapshot's size */ 663621901d6SAlex Elder 664d2bb24e5SAlex Elder size = snap_count * sizeof (*header->snap_sizes); 665d2bb24e5SAlex Elder header->snap_sizes = kmalloc(size, GFP_KERNEL); 666602adf40SYehuda Sadeh if (!header->snap_sizes) 6676a52325fSAlex Elder goto out_err; 668621901d6SAlex Elder for (i = 0; i < snap_count; i++) 669621901d6SAlex Elder header->snap_sizes[i] = 670621901d6SAlex Elder le64_to_cpu(ondisk->snaps[i].image_size); 671602adf40SYehuda Sadeh } else { 672ccece235SAlex Elder WARN_ON(ondisk->snap_names_len); 673602adf40SYehuda Sadeh header->snap_names = NULL; 674602adf40SYehuda Sadeh header->snap_sizes = NULL; 675602adf40SYehuda Sadeh } 676849b4260SAlex Elder 67734b13184SAlex Elder header->features = 0; /* No features support in v1 images */ 678602adf40SYehuda Sadeh header->obj_order = ondisk->options.order; 679602adf40SYehuda Sadeh header->crypt_type = ondisk->options.crypt_type; 680602adf40SYehuda Sadeh header->comp_type = ondisk->options.comp_type; 6816a52325fSAlex Elder 682621901d6SAlex Elder /* Allocate and fill in the snapshot context */ 683621901d6SAlex Elder 684f84344f3SAlex Elder header->image_size = le64_to_cpu(ondisk->image_size); 6856a52325fSAlex Elder size = sizeof (struct ceph_snap_context); 6866a52325fSAlex Elder size += snap_count * sizeof (header->snapc->snaps[0]); 6876a52325fSAlex Elder header->snapc = kzalloc(size, GFP_KERNEL); 6886a52325fSAlex Elder if (!header->snapc) 6896a52325fSAlex Elder goto out_err; 690602adf40SYehuda Sadeh 691602adf40SYehuda Sadeh atomic_set(&header->snapc->nref, 1); 692505cbb9bSAlex Elder header->snapc->seq = le64_to_cpu(ondisk->snap_seq); 693602adf40SYehuda Sadeh header->snapc->num_snaps = snap_count; 694621901d6SAlex Elder for (i = 0; i < snap_count; i++) 695602adf40SYehuda Sadeh header->snapc->snaps[i] = 696602adf40SYehuda Sadeh le64_to_cpu(ondisk->snaps[i].id); 697602adf40SYehuda Sadeh 698602adf40SYehuda Sadeh return 0; 699602adf40SYehuda Sadeh 7006a52325fSAlex Elder out_err: 701849b4260SAlex Elder kfree(header->snap_sizes); 702ccece235SAlex Elder header->snap_sizes = NULL; 703602adf40SYehuda Sadeh kfree(header->snap_names); 704ccece235SAlex Elder header->snap_names = NULL; 7056a52325fSAlex Elder kfree(header->object_prefix); 7066a52325fSAlex Elder header->object_prefix = NULL; 707ccece235SAlex Elder 70800f1f36fSAlex Elder return -ENOMEM; 709602adf40SYehuda Sadeh } 710602adf40SYehuda Sadeh 7119e15b77dSAlex Elder static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id) 7129e15b77dSAlex Elder { 7139e15b77dSAlex Elder struct rbd_snap *snap; 7149e15b77dSAlex Elder 7159e15b77dSAlex Elder if (snap_id == CEPH_NOSNAP) 7169e15b77dSAlex Elder return RBD_SNAP_HEAD_NAME; 7179e15b77dSAlex Elder 7189e15b77dSAlex Elder list_for_each_entry(snap, &rbd_dev->snaps, node) 7199e15b77dSAlex Elder if (snap_id == snap->id) 7209e15b77dSAlex Elder return snap->name; 7219e15b77dSAlex Elder 7229e15b77dSAlex Elder return NULL; 7239e15b77dSAlex Elder } 7249e15b77dSAlex Elder 7258836b995SAlex Elder static int snap_by_name(struct rbd_device *rbd_dev, const char *snap_name) 726602adf40SYehuda Sadeh { 727602adf40SYehuda Sadeh 728e86924a8SAlex Elder struct rbd_snap *snap; 72900f1f36fSAlex Elder 730e86924a8SAlex Elder list_for_each_entry(snap, &rbd_dev->snaps, node) { 731e86924a8SAlex Elder if (!strcmp(snap_name, snap->name)) { 7320d7dbfceSAlex Elder rbd_dev->spec->snap_id = snap->id; 733e86924a8SAlex Elder rbd_dev->mapping.size = snap->size; 73434b13184SAlex Elder rbd_dev->mapping.features = snap->features; 73500f1f36fSAlex Elder 736e86924a8SAlex Elder return 0; 737602adf40SYehuda Sadeh } 73800f1f36fSAlex Elder } 739e86924a8SAlex Elder 74000f1f36fSAlex Elder return -ENOENT; 74100f1f36fSAlex Elder } 742602adf40SYehuda Sadeh 743819d52bfSAlex Elder static int rbd_dev_set_mapping(struct rbd_device *rbd_dev) 744602adf40SYehuda Sadeh { 74578dc447dSAlex Elder int ret; 746602adf40SYehuda Sadeh 7470d7dbfceSAlex Elder if (!memcmp(rbd_dev->spec->snap_name, RBD_SNAP_HEAD_NAME, 748cc9d734cSJosh Durgin sizeof (RBD_SNAP_HEAD_NAME))) { 7490d7dbfceSAlex Elder rbd_dev->spec->snap_id = CEPH_NOSNAP; 75099c1f08fSAlex Elder rbd_dev->mapping.size = rbd_dev->header.image_size; 75134b13184SAlex Elder rbd_dev->mapping.features = rbd_dev->header.features; 752e86924a8SAlex Elder ret = 0; 753602adf40SYehuda Sadeh } else { 7540d7dbfceSAlex Elder ret = snap_by_name(rbd_dev, rbd_dev->spec->snap_name); 755602adf40SYehuda Sadeh if (ret < 0) 756602adf40SYehuda Sadeh goto done; 757f84344f3SAlex Elder rbd_dev->mapping.read_only = true; 758602adf40SYehuda Sadeh } 759d78b650aSAlex Elder atomic_set(&rbd_dev->exists, 1); 760602adf40SYehuda Sadeh done: 761602adf40SYehuda Sadeh return ret; 762602adf40SYehuda Sadeh } 763602adf40SYehuda Sadeh 764602adf40SYehuda Sadeh static void rbd_header_free(struct rbd_image_header *header) 765602adf40SYehuda Sadeh { 766849b4260SAlex Elder kfree(header->object_prefix); 767d78fd7aeSAlex Elder header->object_prefix = NULL; 768602adf40SYehuda Sadeh kfree(header->snap_sizes); 769d78fd7aeSAlex Elder header->snap_sizes = NULL; 770849b4260SAlex Elder kfree(header->snap_names); 771d78fd7aeSAlex Elder header->snap_names = NULL; 772d1d25646SJosh Durgin ceph_put_snap_context(header->snapc); 773d78fd7aeSAlex Elder header->snapc = NULL; 774602adf40SYehuda Sadeh } 775602adf40SYehuda Sadeh 77665ccfe21SAlex Elder static char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset) 777602adf40SYehuda Sadeh { 77865ccfe21SAlex Elder char *name; 77965ccfe21SAlex Elder u64 segment; 78065ccfe21SAlex Elder int ret; 781602adf40SYehuda Sadeh 7822fd82b9eSAlex Elder name = kmalloc(MAX_OBJ_NAME_SIZE + 1, GFP_NOIO); 78365ccfe21SAlex Elder if (!name) 78465ccfe21SAlex Elder return NULL; 78565ccfe21SAlex Elder segment = offset >> rbd_dev->header.obj_order; 7862fd82b9eSAlex Elder ret = snprintf(name, MAX_OBJ_NAME_SIZE + 1, "%s.%012llx", 78765ccfe21SAlex Elder rbd_dev->header.object_prefix, segment); 7882fd82b9eSAlex Elder if (ret < 0 || ret > MAX_OBJ_NAME_SIZE) { 78965ccfe21SAlex Elder pr_err("error formatting segment name for #%llu (%d)\n", 79065ccfe21SAlex Elder segment, ret); 79165ccfe21SAlex Elder kfree(name); 79265ccfe21SAlex Elder name = NULL; 79365ccfe21SAlex Elder } 794602adf40SYehuda Sadeh 79565ccfe21SAlex Elder return name; 79665ccfe21SAlex Elder } 797602adf40SYehuda Sadeh 79865ccfe21SAlex Elder static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset) 79965ccfe21SAlex Elder { 80065ccfe21SAlex Elder u64 segment_size = (u64) 1 << rbd_dev->header.obj_order; 801602adf40SYehuda Sadeh 80265ccfe21SAlex Elder return offset & (segment_size - 1); 80365ccfe21SAlex Elder } 80465ccfe21SAlex Elder 80565ccfe21SAlex Elder static u64 rbd_segment_length(struct rbd_device *rbd_dev, 80665ccfe21SAlex Elder u64 offset, u64 length) 80765ccfe21SAlex Elder { 80865ccfe21SAlex Elder u64 segment_size = (u64) 1 << rbd_dev->header.obj_order; 80965ccfe21SAlex Elder 81065ccfe21SAlex Elder offset &= segment_size - 1; 81165ccfe21SAlex Elder 812aafb230eSAlex Elder rbd_assert(length <= U64_MAX - offset); 81365ccfe21SAlex Elder if (offset + length > segment_size) 81465ccfe21SAlex Elder length = segment_size - offset; 81565ccfe21SAlex Elder 81665ccfe21SAlex Elder return length; 817602adf40SYehuda Sadeh } 818602adf40SYehuda Sadeh 8191fec7093SYehuda Sadeh static int rbd_get_num_segments(struct rbd_image_header *header, 8201fec7093SYehuda Sadeh u64 ofs, u64 len) 8211fec7093SYehuda Sadeh { 822df111be6SAlex Elder u64 start_seg; 823df111be6SAlex Elder u64 end_seg; 824df111be6SAlex Elder 825df111be6SAlex Elder if (!len) 826df111be6SAlex Elder return 0; 827df111be6SAlex Elder if (len - 1 > U64_MAX - ofs) 828df111be6SAlex Elder return -ERANGE; 829df111be6SAlex Elder 830df111be6SAlex Elder start_seg = ofs >> header->obj_order; 831df111be6SAlex Elder end_seg = (ofs + len - 1) >> header->obj_order; 832df111be6SAlex Elder 8331fec7093SYehuda Sadeh return end_seg - start_seg + 1; 8341fec7093SYehuda Sadeh } 8351fec7093SYehuda Sadeh 836602adf40SYehuda Sadeh /* 837029bcbd8SJosh Durgin * returns the size of an object in the image 838029bcbd8SJosh Durgin */ 839029bcbd8SJosh Durgin static u64 rbd_obj_bytes(struct rbd_image_header *header) 840029bcbd8SJosh Durgin { 841029bcbd8SJosh Durgin return 1 << header->obj_order; 842029bcbd8SJosh Durgin } 843029bcbd8SJosh Durgin 844029bcbd8SJosh Durgin /* 845602adf40SYehuda Sadeh * bio helpers 846602adf40SYehuda Sadeh */ 847602adf40SYehuda Sadeh 848602adf40SYehuda Sadeh static void bio_chain_put(struct bio *chain) 849602adf40SYehuda Sadeh { 850602adf40SYehuda Sadeh struct bio *tmp; 851602adf40SYehuda Sadeh 852602adf40SYehuda Sadeh while (chain) { 853602adf40SYehuda Sadeh tmp = chain; 854602adf40SYehuda Sadeh chain = chain->bi_next; 855602adf40SYehuda Sadeh bio_put(tmp); 856602adf40SYehuda Sadeh } 857602adf40SYehuda Sadeh } 858602adf40SYehuda Sadeh 859602adf40SYehuda Sadeh /* 860602adf40SYehuda Sadeh * zeros a bio chain, starting at specific offset 861602adf40SYehuda Sadeh */ 862602adf40SYehuda Sadeh static void zero_bio_chain(struct bio *chain, int start_ofs) 863602adf40SYehuda Sadeh { 864602adf40SYehuda Sadeh struct bio_vec *bv; 865602adf40SYehuda Sadeh unsigned long flags; 866602adf40SYehuda Sadeh void *buf; 867602adf40SYehuda Sadeh int i; 868602adf40SYehuda Sadeh int pos = 0; 869602adf40SYehuda Sadeh 870602adf40SYehuda Sadeh while (chain) { 871602adf40SYehuda Sadeh bio_for_each_segment(bv, chain, i) { 872602adf40SYehuda Sadeh if (pos + bv->bv_len > start_ofs) { 873602adf40SYehuda Sadeh int remainder = max(start_ofs - pos, 0); 874602adf40SYehuda Sadeh buf = bvec_kmap_irq(bv, &flags); 875602adf40SYehuda Sadeh memset(buf + remainder, 0, 876602adf40SYehuda Sadeh bv->bv_len - remainder); 87785b5aaa6SDan Carpenter bvec_kunmap_irq(buf, &flags); 878602adf40SYehuda Sadeh } 879602adf40SYehuda Sadeh pos += bv->bv_len; 880602adf40SYehuda Sadeh } 881602adf40SYehuda Sadeh 882602adf40SYehuda Sadeh chain = chain->bi_next; 883602adf40SYehuda Sadeh } 884602adf40SYehuda Sadeh } 885602adf40SYehuda Sadeh 886602adf40SYehuda Sadeh /* 887f7760dadSAlex Elder * Clone a portion of a bio, starting at the given byte offset 888f7760dadSAlex Elder * and continuing for the number of bytes indicated. 889602adf40SYehuda Sadeh */ 890f7760dadSAlex Elder static struct bio *bio_clone_range(struct bio *bio_src, 891f7760dadSAlex Elder unsigned int offset, 892f7760dadSAlex Elder unsigned int len, 893f7760dadSAlex Elder gfp_t gfpmask) 894602adf40SYehuda Sadeh { 895f7760dadSAlex Elder struct bio_vec *bv; 896f7760dadSAlex Elder unsigned int resid; 897f7760dadSAlex Elder unsigned short idx; 898f7760dadSAlex Elder unsigned int voff; 899f7760dadSAlex Elder unsigned short end_idx; 900f7760dadSAlex Elder unsigned short vcnt; 901f7760dadSAlex Elder struct bio *bio; 902602adf40SYehuda Sadeh 903f7760dadSAlex Elder /* Handle the easy case for the caller */ 904f7760dadSAlex Elder 905f7760dadSAlex Elder if (!offset && len == bio_src->bi_size) 906f7760dadSAlex Elder return bio_clone(bio_src, gfpmask); 907f7760dadSAlex Elder 908f7760dadSAlex Elder if (WARN_ON_ONCE(!len)) 909f7760dadSAlex Elder return NULL; 910f7760dadSAlex Elder if (WARN_ON_ONCE(len > bio_src->bi_size)) 911f7760dadSAlex Elder return NULL; 912f7760dadSAlex Elder if (WARN_ON_ONCE(offset > bio_src->bi_size - len)) 913f7760dadSAlex Elder return NULL; 914f7760dadSAlex Elder 915f7760dadSAlex Elder /* Find first affected segment... */ 916f7760dadSAlex Elder 917f7760dadSAlex Elder resid = offset; 918f7760dadSAlex Elder __bio_for_each_segment(bv, bio_src, idx, 0) { 919f7760dadSAlex Elder if (resid < bv->bv_len) 920f7760dadSAlex Elder break; 921f7760dadSAlex Elder resid -= bv->bv_len; 922602adf40SYehuda Sadeh } 923f7760dadSAlex Elder voff = resid; 924602adf40SYehuda Sadeh 925f7760dadSAlex Elder /* ...and the last affected segment */ 926542582fcSAlex Elder 927f7760dadSAlex Elder resid += len; 928f7760dadSAlex Elder __bio_for_each_segment(bv, bio_src, end_idx, idx) { 929f7760dadSAlex Elder if (resid <= bv->bv_len) 930f7760dadSAlex Elder break; 931f7760dadSAlex Elder resid -= bv->bv_len; 932f7760dadSAlex Elder } 933f7760dadSAlex Elder vcnt = end_idx - idx + 1; 934602adf40SYehuda Sadeh 935f7760dadSAlex Elder /* Build the clone */ 936f7760dadSAlex Elder 937f7760dadSAlex Elder bio = bio_alloc(gfpmask, (unsigned int) vcnt); 938f7760dadSAlex Elder if (!bio) 939f7760dadSAlex Elder return NULL; /* ENOMEM */ 940f7760dadSAlex Elder 941f7760dadSAlex Elder bio->bi_bdev = bio_src->bi_bdev; 942f7760dadSAlex Elder bio->bi_sector = bio_src->bi_sector + (offset >> SECTOR_SHIFT); 943f7760dadSAlex Elder bio->bi_rw = bio_src->bi_rw; 944f7760dadSAlex Elder bio->bi_flags |= 1 << BIO_CLONED; 945602adf40SYehuda Sadeh 946602adf40SYehuda Sadeh /* 947f7760dadSAlex Elder * Copy over our part of the bio_vec, then update the first 948f7760dadSAlex Elder * and last (or only) entries. 949602adf40SYehuda Sadeh */ 950f7760dadSAlex Elder memcpy(&bio->bi_io_vec[0], &bio_src->bi_io_vec[idx], 951f7760dadSAlex Elder vcnt * sizeof (struct bio_vec)); 952f7760dadSAlex Elder bio->bi_io_vec[0].bv_offset += voff; 953f7760dadSAlex Elder if (vcnt > 1) { 954f7760dadSAlex Elder bio->bi_io_vec[0].bv_len -= voff; 955f7760dadSAlex Elder bio->bi_io_vec[vcnt - 1].bv_len = resid; 956602adf40SYehuda Sadeh } else { 957f7760dadSAlex Elder bio->bi_io_vec[0].bv_len = len; 958602adf40SYehuda Sadeh } 959602adf40SYehuda Sadeh 960f7760dadSAlex Elder bio->bi_vcnt = vcnt; 961f7760dadSAlex Elder bio->bi_size = len; 962f7760dadSAlex Elder bio->bi_idx = 0; 963602adf40SYehuda Sadeh 964f7760dadSAlex Elder return bio; 965602adf40SYehuda Sadeh } 966602adf40SYehuda Sadeh 967f7760dadSAlex Elder /* 968f7760dadSAlex Elder * Clone a portion of a bio chain, starting at the given byte offset 969f7760dadSAlex Elder * into the first bio in the source chain and continuing for the 970f7760dadSAlex Elder * number of bytes indicated. The result is another bio chain of 971f7760dadSAlex Elder * exactly the given length, or a null pointer on error. 972f7760dadSAlex Elder * 973f7760dadSAlex Elder * The bio_src and offset parameters are both in-out. On entry they 974f7760dadSAlex Elder * refer to the first source bio and the offset into that bio where 975f7760dadSAlex Elder * the start of data to be cloned is located. 976f7760dadSAlex Elder * 977f7760dadSAlex Elder * On return, bio_src is updated to refer to the bio in the source 978f7760dadSAlex Elder * chain that contains first un-cloned byte, and *offset will 979f7760dadSAlex Elder * contain the offset of that byte within that bio. 980f7760dadSAlex Elder */ 981f7760dadSAlex Elder static struct bio *bio_chain_clone_range(struct bio **bio_src, 982f7760dadSAlex Elder unsigned int *offset, 983f7760dadSAlex Elder unsigned int len, 984f7760dadSAlex Elder gfp_t gfpmask) 985f7760dadSAlex Elder { 986f7760dadSAlex Elder struct bio *bi = *bio_src; 987f7760dadSAlex Elder unsigned int off = *offset; 988f7760dadSAlex Elder struct bio *chain = NULL; 989f7760dadSAlex Elder struct bio **end; 990602adf40SYehuda Sadeh 991f7760dadSAlex Elder /* Build up a chain of clone bios up to the limit */ 992602adf40SYehuda Sadeh 993f7760dadSAlex Elder if (!bi || off >= bi->bi_size || !len) 994f7760dadSAlex Elder return NULL; /* Nothing to clone */ 995602adf40SYehuda Sadeh 996f7760dadSAlex Elder end = &chain; 997f7760dadSAlex Elder while (len) { 998f7760dadSAlex Elder unsigned int bi_size; 999f7760dadSAlex Elder struct bio *bio; 1000f7760dadSAlex Elder 1001f5400b7aSAlex Elder if (!bi) { 1002f5400b7aSAlex Elder rbd_warn(NULL, "bio_chain exhausted with %u left", len); 1003f7760dadSAlex Elder goto out_err; /* EINVAL; ran out of bio's */ 1004f5400b7aSAlex Elder } 1005f7760dadSAlex Elder bi_size = min_t(unsigned int, bi->bi_size - off, len); 1006f7760dadSAlex Elder bio = bio_clone_range(bi, off, bi_size, gfpmask); 1007f7760dadSAlex Elder if (!bio) 1008f7760dadSAlex Elder goto out_err; /* ENOMEM */ 1009f7760dadSAlex Elder 1010f7760dadSAlex Elder *end = bio; 1011f7760dadSAlex Elder end = &bio->bi_next; 1012f7760dadSAlex Elder 1013f7760dadSAlex Elder off += bi_size; 1014f7760dadSAlex Elder if (off == bi->bi_size) { 1015f7760dadSAlex Elder bi = bi->bi_next; 1016f7760dadSAlex Elder off = 0; 1017f7760dadSAlex Elder } 1018f7760dadSAlex Elder len -= bi_size; 1019f7760dadSAlex Elder } 1020f7760dadSAlex Elder *bio_src = bi; 1021f7760dadSAlex Elder *offset = off; 1022f7760dadSAlex Elder 1023f7760dadSAlex Elder return chain; 1024f7760dadSAlex Elder out_err: 1025f7760dadSAlex Elder bio_chain_put(chain); 1026f7760dadSAlex Elder 1027602adf40SYehuda Sadeh return NULL; 1028602adf40SYehuda Sadeh } 1029602adf40SYehuda Sadeh 10308d23bf29SAlex Elder struct ceph_osd_req_op *rbd_osd_req_op_create(u16 opcode, ...) 10318d23bf29SAlex Elder { 10328d23bf29SAlex Elder struct ceph_osd_req_op *op; 10338d23bf29SAlex Elder va_list args; 10342647ba38SAlex Elder size_t size; 10358d23bf29SAlex Elder 10368d23bf29SAlex Elder op = kzalloc(sizeof (*op), GFP_NOIO); 10378d23bf29SAlex Elder if (!op) 10388d23bf29SAlex Elder return NULL; 10398d23bf29SAlex Elder op->op = opcode; 10408d23bf29SAlex Elder va_start(args, opcode); 10418d23bf29SAlex Elder switch (opcode) { 10428d23bf29SAlex Elder case CEPH_OSD_OP_READ: 10438d23bf29SAlex Elder case CEPH_OSD_OP_WRITE: 10448d23bf29SAlex Elder /* rbd_osd_req_op_create(READ, offset, length) */ 10458d23bf29SAlex Elder /* rbd_osd_req_op_create(WRITE, offset, length) */ 10468d23bf29SAlex Elder op->extent.offset = va_arg(args, u64); 10478d23bf29SAlex Elder op->extent.length = va_arg(args, u64); 10488d23bf29SAlex Elder if (opcode == CEPH_OSD_OP_WRITE) 10498d23bf29SAlex Elder op->payload_len = op->extent.length; 10508d23bf29SAlex Elder break; 10512647ba38SAlex Elder case CEPH_OSD_OP_CALL: 10522647ba38SAlex Elder /* rbd_osd_req_op_create(CALL, class, method, data, datalen) */ 10532647ba38SAlex Elder op->cls.class_name = va_arg(args, char *); 10542647ba38SAlex Elder size = strlen(op->cls.class_name); 10552647ba38SAlex Elder rbd_assert(size <= (size_t) U8_MAX); 10562647ba38SAlex Elder op->cls.class_len = size; 10572647ba38SAlex Elder op->payload_len = size; 10582647ba38SAlex Elder 10592647ba38SAlex Elder op->cls.method_name = va_arg(args, char *); 10602647ba38SAlex Elder size = strlen(op->cls.method_name); 10612647ba38SAlex Elder rbd_assert(size <= (size_t) U8_MAX); 10622647ba38SAlex Elder op->cls.method_len = size; 10632647ba38SAlex Elder op->payload_len += size; 10642647ba38SAlex Elder 10652647ba38SAlex Elder op->cls.argc = 0; 10662647ba38SAlex Elder op->cls.indata = va_arg(args, void *); 10672647ba38SAlex Elder size = va_arg(args, size_t); 10682647ba38SAlex Elder rbd_assert(size <= (size_t) U32_MAX); 10692647ba38SAlex Elder op->cls.indata_len = (u32) size; 10702647ba38SAlex Elder op->payload_len += size; 10712647ba38SAlex Elder break; 10725efea49aSAlex Elder case CEPH_OSD_OP_NOTIFY_ACK: 10735efea49aSAlex Elder case CEPH_OSD_OP_WATCH: 10745efea49aSAlex Elder /* rbd_osd_req_op_create(NOTIFY_ACK, cookie, version) */ 10755efea49aSAlex Elder /* rbd_osd_req_op_create(WATCH, cookie, version, flag) */ 10765efea49aSAlex Elder op->watch.cookie = va_arg(args, u64); 10775efea49aSAlex Elder op->watch.ver = va_arg(args, u64); 10785efea49aSAlex Elder op->watch.ver = cpu_to_le64(op->watch.ver); 10795efea49aSAlex Elder if (opcode == CEPH_OSD_OP_WATCH && va_arg(args, int)) 10805efea49aSAlex Elder op->watch.flag = (u8) 1; 10815efea49aSAlex Elder break; 10828d23bf29SAlex Elder default: 10838d23bf29SAlex Elder rbd_warn(NULL, "unsupported opcode %hu\n", opcode); 10848d23bf29SAlex Elder kfree(op); 10858d23bf29SAlex Elder op = NULL; 10868d23bf29SAlex Elder break; 10878d23bf29SAlex Elder } 10888d23bf29SAlex Elder va_end(args); 10898d23bf29SAlex Elder 10908d23bf29SAlex Elder return op; 10918d23bf29SAlex Elder } 10928d23bf29SAlex Elder 10938d23bf29SAlex Elder static void rbd_osd_req_op_destroy(struct ceph_osd_req_op *op) 10948d23bf29SAlex Elder { 10958d23bf29SAlex Elder kfree(op); 10968d23bf29SAlex Elder } 10978d23bf29SAlex Elder 10981fec7093SYehuda Sadeh static void rbd_coll_end_req_index(struct request *rq, 10991fec7093SYehuda Sadeh struct rbd_req_coll *coll, 11001fec7093SYehuda Sadeh int index, 11018986cb37SAlex Elder s32 ret, u64 len) 11021fec7093SYehuda Sadeh { 11031fec7093SYehuda Sadeh struct request_queue *q; 11041fec7093SYehuda Sadeh int min, max, i; 11051fec7093SYehuda Sadeh 1106bd919d45SAlex Elder dout("rbd_coll_end_req_index %p index %d ret %d len %llu\n", 11078986cb37SAlex Elder coll, index, (int)ret, (unsigned long long)len); 11081fec7093SYehuda Sadeh 11091fec7093SYehuda Sadeh if (!rq) 11101fec7093SYehuda Sadeh return; 11111fec7093SYehuda Sadeh 11121fec7093SYehuda Sadeh if (!coll) { 11131fec7093SYehuda Sadeh blk_end_request(rq, ret, len); 11141fec7093SYehuda Sadeh return; 11151fec7093SYehuda Sadeh } 11161fec7093SYehuda Sadeh 11171fec7093SYehuda Sadeh q = rq->q; 11181fec7093SYehuda Sadeh 11191fec7093SYehuda Sadeh spin_lock_irq(q->queue_lock); 11201fec7093SYehuda Sadeh coll->status[index].done = 1; 11211fec7093SYehuda Sadeh coll->status[index].rc = ret; 11221fec7093SYehuda Sadeh coll->status[index].bytes = len; 11231fec7093SYehuda Sadeh max = min = coll->num_done; 11241fec7093SYehuda Sadeh while (max < coll->total && coll->status[max].done) 11251fec7093SYehuda Sadeh max++; 11261fec7093SYehuda Sadeh 11271fec7093SYehuda Sadeh for (i = min; i<max; i++) { 11288986cb37SAlex Elder __blk_end_request(rq, (int)coll->status[i].rc, 11291fec7093SYehuda Sadeh coll->status[i].bytes); 11301fec7093SYehuda Sadeh coll->num_done++; 11311fec7093SYehuda Sadeh kref_put(&coll->kref, rbd_coll_release); 11321fec7093SYehuda Sadeh } 11331fec7093SYehuda Sadeh spin_unlock_irq(q->queue_lock); 11341fec7093SYehuda Sadeh } 11351fec7093SYehuda Sadeh 1136725afc97SAlex Elder static void rbd_coll_end_req(struct rbd_request *rbd_req, 11378986cb37SAlex Elder s32 ret, u64 len) 11381fec7093SYehuda Sadeh { 1139725afc97SAlex Elder rbd_coll_end_req_index(rbd_req->rq, 1140725afc97SAlex Elder rbd_req->coll, rbd_req->coll_index, 1141725afc97SAlex Elder ret, len); 11421fec7093SYehuda Sadeh } 11431fec7093SYehuda Sadeh 1144602adf40SYehuda Sadeh /* 1145602adf40SYehuda Sadeh * Send ceph osd request 1146602adf40SYehuda Sadeh */ 1147602adf40SYehuda Sadeh static int rbd_do_request(struct request *rq, 11480ce1a794SAlex Elder struct rbd_device *rbd_dev, 1149602adf40SYehuda Sadeh struct ceph_snap_context *snapc, 1150602adf40SYehuda Sadeh u64 snapid, 1151aded07eaSAlex Elder const char *object_name, u64 ofs, u64 len, 1152602adf40SYehuda Sadeh struct bio *bio, 1153602adf40SYehuda Sadeh struct page **pages, 1154602adf40SYehuda Sadeh int num_pages, 1155602adf40SYehuda Sadeh int flags, 115630573d68SAlex Elder struct ceph_osd_req_op *op, 11571fec7093SYehuda Sadeh struct rbd_req_coll *coll, 11581fec7093SYehuda Sadeh int coll_index, 11595f29ddd4SAlex Elder void (*rbd_cb)(struct ceph_osd_request *, 11605f29ddd4SAlex Elder struct ceph_msg *), 116159c2be1eSYehuda Sadeh struct ceph_osd_request **linger_req, 116259c2be1eSYehuda Sadeh u64 *ver) 1163602adf40SYehuda Sadeh { 11641dbb4399SAlex Elder struct ceph_osd_client *osdc; 11652e53c6c3SAlex Elder struct ceph_osd_request *osd_req; 11662e53c6c3SAlex Elder struct rbd_request *rbd_req = NULL; 11672e53c6c3SAlex Elder struct timespec mtime = CURRENT_TIME; 11682e53c6c3SAlex Elder int ret; 11691fec7093SYehuda Sadeh 1170f7760dadSAlex Elder dout("rbd_do_request object_name=%s ofs=%llu len=%llu coll=%p[%d]\n", 1171f7760dadSAlex Elder object_name, (unsigned long long) ofs, 1172f7760dadSAlex Elder (unsigned long long) len, coll, coll_index); 1173602adf40SYehuda Sadeh 11740ce1a794SAlex Elder osdc = &rbd_dev->rbd_client->client->osdc; 117530573d68SAlex Elder osd_req = ceph_osdc_alloc_request(osdc, snapc, 1, false, GFP_NOIO); 11762e53c6c3SAlex Elder if (!osd_req) 11772e53c6c3SAlex Elder return -ENOMEM; 1178602adf40SYehuda Sadeh 1179d178a9e7SAlex Elder osd_req->r_flags = flags; 118054a54007SAlex Elder osd_req->r_pages = pages; 118154a54007SAlex Elder if (bio) { 118254a54007SAlex Elder osd_req->r_bio = bio; 118354a54007SAlex Elder bio_get(osd_req->r_bio); 118454a54007SAlex Elder } 11852e53c6c3SAlex Elder 118618216657SAlex Elder if (coll) { 11872e53c6c3SAlex Elder ret = -ENOMEM; 11882e53c6c3SAlex Elder rbd_req = kmalloc(sizeof(*rbd_req), GFP_NOIO); 11892e53c6c3SAlex Elder if (!rbd_req) 11902e53c6c3SAlex Elder goto done_osd_req; 1191602adf40SYehuda Sadeh 1192725afc97SAlex Elder rbd_req->rq = rq; 1193725afc97SAlex Elder rbd_req->bio = bio; 1194725afc97SAlex Elder rbd_req->pages = pages; 1195725afc97SAlex Elder rbd_req->len = len; 11962e53c6c3SAlex Elder rbd_req->coll = coll; 119718216657SAlex Elder rbd_req->coll_index = coll_index; 11982e53c6c3SAlex Elder } 1199602adf40SYehuda Sadeh 12002e53c6c3SAlex Elder osd_req->r_callback = rbd_cb; 12015f29ddd4SAlex Elder osd_req->r_priv = rbd_req; 1202602adf40SYehuda Sadeh 12035f29ddd4SAlex Elder strncpy(osd_req->r_oid, object_name, sizeof(osd_req->r_oid)); 12045f29ddd4SAlex Elder osd_req->r_oid_len = strlen(osd_req->r_oid); 1205602adf40SYehuda Sadeh 12060903e875SAlex Elder osd_req->r_file_layout = rbd_dev->layout; /* struct */ 1207e01e7927SAlex Elder osd_req->r_num_pages = calc_pages_for(ofs, len); 1208e01e7927SAlex Elder osd_req->r_page_alignment = ofs & ~PAGE_MASK; 1209602adf40SYehuda Sadeh 121030573d68SAlex Elder ceph_osdc_build_request(osd_req, ofs, len, 1, op, 1211ae7ca4a3SAlex Elder snapc, snapid, &mtime); 1212602adf40SYehuda Sadeh 121359c2be1eSYehuda Sadeh if (linger_req) { 12145f29ddd4SAlex Elder ceph_osdc_set_request_linger(osdc, osd_req); 12155f29ddd4SAlex Elder *linger_req = osd_req; 121659c2be1eSYehuda Sadeh } 121759c2be1eSYehuda Sadeh 12185f29ddd4SAlex Elder ret = ceph_osdc_start_request(osdc, osd_req, false); 1219602adf40SYehuda Sadeh if (ret < 0) 1220602adf40SYehuda Sadeh goto done_err; 1221602adf40SYehuda Sadeh 1222602adf40SYehuda Sadeh if (!rbd_cb) { 12235f29ddd4SAlex Elder u64 version; 12245f29ddd4SAlex Elder 12255f29ddd4SAlex Elder ret = ceph_osdc_wait_request(osdc, osd_req); 12265f29ddd4SAlex Elder version = le64_to_cpu(osd_req->r_reassert_version.version); 122759c2be1eSYehuda Sadeh if (ver) 12285f29ddd4SAlex Elder *ver = version; 12295f29ddd4SAlex Elder dout("reassert_ver=%llu\n", (unsigned long long) version); 12305f29ddd4SAlex Elder ceph_osdc_put_request(osd_req); 1231602adf40SYehuda Sadeh } 1232602adf40SYehuda Sadeh return ret; 1233602adf40SYehuda Sadeh 1234602adf40SYehuda Sadeh done_err: 12352e53c6c3SAlex Elder if (bio) 12362e53c6c3SAlex Elder bio_chain_put(osd_req->r_bio); 1237725afc97SAlex Elder kfree(rbd_req); 12382e53c6c3SAlex Elder done_osd_req: 12392e53c6c3SAlex Elder ceph_osdc_put_request(osd_req); 12402e53c6c3SAlex Elder 1241602adf40SYehuda Sadeh return ret; 1242602adf40SYehuda Sadeh } 1243602adf40SYehuda Sadeh 1244602adf40SYehuda Sadeh /* 1245602adf40SYehuda Sadeh * Ceph osd op callback 1246602adf40SYehuda Sadeh */ 12475f29ddd4SAlex Elder static void rbd_req_cb(struct ceph_osd_request *osd_req, struct ceph_msg *msg) 1248602adf40SYehuda Sadeh { 12495f29ddd4SAlex Elder struct rbd_request *rbd_req = osd_req->r_priv; 1250602adf40SYehuda Sadeh struct ceph_osd_reply_head *replyhead; 1251602adf40SYehuda Sadeh struct ceph_osd_op *op; 12528986cb37SAlex Elder s32 rc; 1253602adf40SYehuda Sadeh u64 bytes; 1254602adf40SYehuda Sadeh int read_op; 1255602adf40SYehuda Sadeh 1256602adf40SYehuda Sadeh /* parse reply */ 1257602adf40SYehuda Sadeh replyhead = msg->front.iov_base; 1258602adf40SYehuda Sadeh WARN_ON(le32_to_cpu(replyhead->num_ops) == 0); 1259602adf40SYehuda Sadeh op = (void *)(replyhead + 1); 12608986cb37SAlex Elder rc = (s32)le32_to_cpu(replyhead->result); 1261602adf40SYehuda Sadeh bytes = le64_to_cpu(op->extent.length); 1262895cfcc8SDan Carpenter read_op = (le16_to_cpu(op->op) == CEPH_OSD_OP_READ); 1263602adf40SYehuda Sadeh 1264bd919d45SAlex Elder dout("rbd_req_cb bytes=%llu readop=%d rc=%d\n", 1265bd919d45SAlex Elder (unsigned long long) bytes, read_op, (int) rc); 1266602adf40SYehuda Sadeh 12678986cb37SAlex Elder if (rc == (s32)-ENOENT && read_op) { 1268725afc97SAlex Elder zero_bio_chain(rbd_req->bio, 0); 1269602adf40SYehuda Sadeh rc = 0; 1270725afc97SAlex Elder } else if (rc == 0 && read_op && bytes < rbd_req->len) { 1271725afc97SAlex Elder zero_bio_chain(rbd_req->bio, bytes); 1272725afc97SAlex Elder bytes = rbd_req->len; 1273602adf40SYehuda Sadeh } 1274602adf40SYehuda Sadeh 1275725afc97SAlex Elder rbd_coll_end_req(rbd_req, rc, bytes); 1276602adf40SYehuda Sadeh 1277725afc97SAlex Elder if (rbd_req->bio) 1278725afc97SAlex Elder bio_chain_put(rbd_req->bio); 1279602adf40SYehuda Sadeh 12805f29ddd4SAlex Elder ceph_osdc_put_request(osd_req); 1281725afc97SAlex Elder kfree(rbd_req); 1282602adf40SYehuda Sadeh } 1283602adf40SYehuda Sadeh 12845f29ddd4SAlex Elder static void rbd_simple_req_cb(struct ceph_osd_request *osd_req, 12855f29ddd4SAlex Elder struct ceph_msg *msg) 128659c2be1eSYehuda Sadeh { 12875f29ddd4SAlex Elder ceph_osdc_put_request(osd_req); 128859c2be1eSYehuda Sadeh } 128959c2be1eSYehuda Sadeh 1290602adf40SYehuda Sadeh /* 1291602adf40SYehuda Sadeh * Do a synchronous ceph osd operation 1292602adf40SYehuda Sadeh */ 12930ce1a794SAlex Elder static int rbd_req_sync_op(struct rbd_device *rbd_dev, 1294602adf40SYehuda Sadeh int flags, 129530573d68SAlex Elder struct ceph_osd_req_op *op, 1296aded07eaSAlex Elder const char *object_name, 1297f8d4de6eSAlex Elder u64 ofs, u64 inbound_size, 1298f8d4de6eSAlex Elder char *inbound, 129959c2be1eSYehuda Sadeh struct ceph_osd_request **linger_req, 130059c2be1eSYehuda Sadeh u64 *ver) 1301602adf40SYehuda Sadeh { 1302602adf40SYehuda Sadeh int ret; 1303602adf40SYehuda Sadeh struct page **pages; 1304602adf40SYehuda Sadeh int num_pages; 1305913d2fdcSAlex Elder 130630573d68SAlex Elder rbd_assert(op != NULL); 1307602adf40SYehuda Sadeh 1308f8d4de6eSAlex Elder num_pages = calc_pages_for(ofs, inbound_size); 1309602adf40SYehuda Sadeh pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL); 1310b8d0638aSDan Carpenter if (IS_ERR(pages)) 1311b8d0638aSDan Carpenter return PTR_ERR(pages); 1312602adf40SYehuda Sadeh 131325704ac9SAlex Elder ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP, 1314f8d4de6eSAlex Elder object_name, ofs, inbound_size, NULL, 1315602adf40SYehuda Sadeh pages, num_pages, 1316602adf40SYehuda Sadeh flags, 131730573d68SAlex Elder op, 13181fec7093SYehuda Sadeh NULL, 0, 131959c2be1eSYehuda Sadeh NULL, 132059c2be1eSYehuda Sadeh linger_req, ver); 1321602adf40SYehuda Sadeh if (ret < 0) 1322913d2fdcSAlex Elder goto done; 1323602adf40SYehuda Sadeh 1324f8d4de6eSAlex Elder if ((flags & CEPH_OSD_FLAG_READ) && inbound) 1325f8d4de6eSAlex Elder ret = ceph_copy_from_page_vector(pages, inbound, ofs, ret); 1326602adf40SYehuda Sadeh 1327602adf40SYehuda Sadeh done: 1328602adf40SYehuda Sadeh ceph_release_page_vector(pages, num_pages); 1329602adf40SYehuda Sadeh return ret; 1330602adf40SYehuda Sadeh } 1331602adf40SYehuda Sadeh 1332602adf40SYehuda Sadeh /* 1333602adf40SYehuda Sadeh * Do an asynchronous ceph osd operation 1334602adf40SYehuda Sadeh */ 1335602adf40SYehuda Sadeh static int rbd_do_op(struct request *rq, 1336602adf40SYehuda Sadeh struct rbd_device *rbd_dev, 1337602adf40SYehuda Sadeh struct ceph_snap_context *snapc, 1338602adf40SYehuda Sadeh u64 ofs, u64 len, 13391fec7093SYehuda Sadeh struct bio *bio, 13401fec7093SYehuda Sadeh struct rbd_req_coll *coll, 13411fec7093SYehuda Sadeh int coll_index) 1342602adf40SYehuda Sadeh { 1343602adf40SYehuda Sadeh char *seg_name; 1344602adf40SYehuda Sadeh u64 seg_ofs; 1345602adf40SYehuda Sadeh u64 seg_len; 1346602adf40SYehuda Sadeh int ret; 1347139b4318SAlex Elder struct ceph_osd_req_op *op; 1348ff2e4bb5SAlex Elder int opcode; 1349ff2e4bb5SAlex Elder int flags; 13504634246dSAlex Elder u64 snapid; 1351602adf40SYehuda Sadeh 135265ccfe21SAlex Elder seg_name = rbd_segment_name(rbd_dev, ofs); 1353602adf40SYehuda Sadeh if (!seg_name) 1354602adf40SYehuda Sadeh return -ENOMEM; 135565ccfe21SAlex Elder seg_len = rbd_segment_length(rbd_dev, ofs, len); 135665ccfe21SAlex Elder seg_ofs = rbd_segment_offset(rbd_dev, ofs); 1357602adf40SYehuda Sadeh 1358ff2e4bb5SAlex Elder if (rq_data_dir(rq) == WRITE) { 1359ff2e4bb5SAlex Elder opcode = CEPH_OSD_OP_WRITE; 1360ff2e4bb5SAlex Elder flags = CEPH_OSD_FLAG_WRITE|CEPH_OSD_FLAG_ONDISK; 13614634246dSAlex Elder snapid = CEPH_NOSNAP; 1362ff2e4bb5SAlex Elder } else { 1363ff2e4bb5SAlex Elder opcode = CEPH_OSD_OP_READ; 1364ff2e4bb5SAlex Elder flags = CEPH_OSD_FLAG_READ; 1365a7b4c65fSAlex Elder rbd_assert(!snapc); 13660d7dbfceSAlex Elder snapid = rbd_dev->spec->snap_id; 1367ff2e4bb5SAlex Elder } 1368602adf40SYehuda Sadeh 136957cfc106SAlex Elder ret = -ENOMEM; 13708d23bf29SAlex Elder op = rbd_osd_req_op_create(opcode, seg_ofs, seg_len); 1371139b4318SAlex Elder if (!op) 1372602adf40SYehuda Sadeh goto done; 1373602adf40SYehuda Sadeh 1374602adf40SYehuda Sadeh /* we've taken care of segment sizes earlier when we 1375602adf40SYehuda Sadeh cloned the bios. We should never have a segment 1376602adf40SYehuda Sadeh truncated at this point */ 1377aafb230eSAlex Elder rbd_assert(seg_len == len); 1378602adf40SYehuda Sadeh 1379602adf40SYehuda Sadeh ret = rbd_do_request(rq, rbd_dev, snapc, snapid, 1380602adf40SYehuda Sadeh seg_name, seg_ofs, seg_len, 1381602adf40SYehuda Sadeh bio, 1382602adf40SYehuda Sadeh NULL, 0, 1383602adf40SYehuda Sadeh flags, 138430573d68SAlex Elder op, 13851fec7093SYehuda Sadeh coll, coll_index, 138659c2be1eSYehuda Sadeh rbd_req_cb, 0, NULL); 1387cd323ac0SAlex Elder if (ret < 0) 1388cd323ac0SAlex Elder rbd_coll_end_req_index(rq, coll, coll_index, 1389cd323ac0SAlex Elder (s32)ret, seg_len); 13908d23bf29SAlex Elder rbd_osd_req_op_destroy(op); 1391602adf40SYehuda Sadeh done: 1392602adf40SYehuda Sadeh kfree(seg_name); 1393602adf40SYehuda Sadeh return ret; 1394602adf40SYehuda Sadeh } 1395602adf40SYehuda Sadeh 1396602adf40SYehuda Sadeh /* 1397602adf40SYehuda Sadeh * Request sync osd read 1398602adf40SYehuda Sadeh */ 13990ce1a794SAlex Elder static int rbd_req_sync_read(struct rbd_device *rbd_dev, 1400aded07eaSAlex Elder const char *object_name, 1401602adf40SYehuda Sadeh u64 ofs, u64 len, 140259c2be1eSYehuda Sadeh char *buf, 140359c2be1eSYehuda Sadeh u64 *ver) 1404602adf40SYehuda Sadeh { 1405139b4318SAlex Elder struct ceph_osd_req_op *op; 1406913d2fdcSAlex Elder int ret; 1407913d2fdcSAlex Elder 14088d23bf29SAlex Elder op = rbd_osd_req_op_create(CEPH_OSD_OP_READ, ofs, len); 1409139b4318SAlex Elder if (!op) 1410913d2fdcSAlex Elder return -ENOMEM; 1411913d2fdcSAlex Elder 141225704ac9SAlex Elder ret = rbd_req_sync_op(rbd_dev, CEPH_OSD_FLAG_READ, 141330573d68SAlex Elder op, object_name, ofs, len, buf, NULL, ver); 14148d23bf29SAlex Elder rbd_osd_req_op_destroy(op); 1415913d2fdcSAlex Elder 1416913d2fdcSAlex Elder return ret; 1417602adf40SYehuda Sadeh } 1418602adf40SYehuda Sadeh 1419602adf40SYehuda Sadeh /* 142059c2be1eSYehuda Sadeh * Request sync osd watch 142159c2be1eSYehuda Sadeh */ 14220ce1a794SAlex Elder static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev, 142359c2be1eSYehuda Sadeh u64 ver, 14247f0a24d8SAlex Elder u64 notify_id) 142559c2be1eSYehuda Sadeh { 1426139b4318SAlex Elder struct ceph_osd_req_op *op; 142711f77002SSage Weil int ret; 142811f77002SSage Weil 14295efea49aSAlex Elder op = rbd_osd_req_op_create(CEPH_OSD_OP_NOTIFY_ACK, notify_id, ver); 1430139b4318SAlex Elder if (!op) 143157cfc106SAlex Elder return -ENOMEM; 143259c2be1eSYehuda Sadeh 14330ce1a794SAlex Elder ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP, 14347f0a24d8SAlex Elder rbd_dev->header_name, 0, 0, NULL, 1435ad4f232fSAlex Elder NULL, 0, 143659c2be1eSYehuda Sadeh CEPH_OSD_FLAG_READ, 143730573d68SAlex Elder op, 14381fec7093SYehuda Sadeh NULL, 0, 143959c2be1eSYehuda Sadeh rbd_simple_req_cb, 0, NULL); 144059c2be1eSYehuda Sadeh 14415efea49aSAlex Elder rbd_osd_req_op_destroy(op); 14425efea49aSAlex Elder 144359c2be1eSYehuda Sadeh return ret; 144459c2be1eSYehuda Sadeh } 144559c2be1eSYehuda Sadeh 144659c2be1eSYehuda Sadeh static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data) 144759c2be1eSYehuda Sadeh { 14480ce1a794SAlex Elder struct rbd_device *rbd_dev = (struct rbd_device *)data; 1449a71b891bSJosh Durgin u64 hver; 145013143d2dSSage Weil int rc; 145113143d2dSSage Weil 14520ce1a794SAlex Elder if (!rbd_dev) 145359c2be1eSYehuda Sadeh return; 145459c2be1eSYehuda Sadeh 1455bd919d45SAlex Elder dout("rbd_watch_cb %s notify_id=%llu opcode=%u\n", 1456bd919d45SAlex Elder rbd_dev->header_name, (unsigned long long) notify_id, 1457bd919d45SAlex Elder (unsigned int) opcode); 1458117973fbSAlex Elder rc = rbd_dev_refresh(rbd_dev, &hver); 145913143d2dSSage Weil if (rc) 146006ecc6cbSAlex Elder rbd_warn(rbd_dev, "got notification but failed to " 146106ecc6cbSAlex Elder " update snaps: %d\n", rc); 146259c2be1eSYehuda Sadeh 14637f0a24d8SAlex Elder rbd_req_sync_notify_ack(rbd_dev, hver, notify_id); 146459c2be1eSYehuda Sadeh } 146559c2be1eSYehuda Sadeh 146659c2be1eSYehuda Sadeh /* 1467907703d0SAlex Elder * Request sync osd watch/unwatch. The value of "start" determines 1468907703d0SAlex Elder * whether a watch request is being initiated or torn down. 146959c2be1eSYehuda Sadeh */ 1470907703d0SAlex Elder static int rbd_req_sync_watch(struct rbd_device *rbd_dev, int start) 147159c2be1eSYehuda Sadeh { 1472907703d0SAlex Elder struct ceph_osd_request **linger_req = NULL; 14735efea49aSAlex Elder struct ceph_osd_req_op *op; 14745efea49aSAlex Elder int ret = 0; 147559c2be1eSYehuda Sadeh 1476907703d0SAlex Elder if (start) { 1477907703d0SAlex Elder struct ceph_osd_client *osdc; 1478907703d0SAlex Elder 1479907703d0SAlex Elder osdc = &rbd_dev->rbd_client->client->osdc; 1480907703d0SAlex Elder ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0, rbd_dev, 1481907703d0SAlex Elder &rbd_dev->watch_event); 148259c2be1eSYehuda Sadeh if (ret < 0) 14835efea49aSAlex Elder return ret; 1484907703d0SAlex Elder linger_req = &rbd_dev->watch_request; 14855efea49aSAlex Elder } else { 14865efea49aSAlex Elder rbd_assert(rbd_dev->watch_request != NULL); 148759c2be1eSYehuda Sadeh } 148859c2be1eSYehuda Sadeh 14895efea49aSAlex Elder op = rbd_osd_req_op_create(CEPH_OSD_OP_WATCH, 14905efea49aSAlex Elder rbd_dev->watch_event->cookie, 14915efea49aSAlex Elder rbd_dev->header.obj_version, start); 14925efea49aSAlex Elder if (op) 149325704ac9SAlex Elder ret = rbd_req_sync_op(rbd_dev, 149479e3057cSYehuda Sadeh CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK, 1495907703d0SAlex Elder op, rbd_dev->header_name, 1496907703d0SAlex Elder 0, 0, NULL, linger_req, NULL); 1497070c633fSAlex Elder 14985efea49aSAlex Elder /* Cancel the event if we're tearing down, or on error */ 14995efea49aSAlex Elder 15005efea49aSAlex Elder if (!start || !op || ret < 0) { 15010ce1a794SAlex Elder ceph_osdc_cancel_event(rbd_dev->watch_event); 15020ce1a794SAlex Elder rbd_dev->watch_event = NULL; 1503907703d0SAlex Elder } 15045efea49aSAlex Elder rbd_osd_req_op_destroy(op); 1505907703d0SAlex Elder 150679e3057cSYehuda Sadeh return ret; 150779e3057cSYehuda Sadeh } 150879e3057cSYehuda Sadeh 150959c2be1eSYehuda Sadeh /* 15103cb4a687SAlex Elder * Synchronous osd object method call 1511602adf40SYehuda Sadeh */ 15120ce1a794SAlex Elder static int rbd_req_sync_exec(struct rbd_device *rbd_dev, 1513aded07eaSAlex Elder const char *object_name, 1514aded07eaSAlex Elder const char *class_name, 1515aded07eaSAlex Elder const char *method_name, 15163cb4a687SAlex Elder const char *outbound, 15173cb4a687SAlex Elder size_t outbound_size, 1518f8d4de6eSAlex Elder char *inbound, 1519f8d4de6eSAlex Elder size_t inbound_size, 152059c2be1eSYehuda Sadeh u64 *ver) 1521602adf40SYehuda Sadeh { 1522139b4318SAlex Elder struct ceph_osd_req_op *op; 152357cfc106SAlex Elder int ret; 152457cfc106SAlex Elder 15253cb4a687SAlex Elder /* 15263cb4a687SAlex Elder * Any input parameters required by the method we're calling 15273cb4a687SAlex Elder * will be sent along with the class and method names as 15283cb4a687SAlex Elder * part of the message payload. That data and its size are 15293cb4a687SAlex Elder * supplied via the indata and indata_len fields (named from 15303cb4a687SAlex Elder * the perspective of the server side) in the OSD request 15313cb4a687SAlex Elder * operation. 15323cb4a687SAlex Elder */ 15332647ba38SAlex Elder op = rbd_osd_req_op_create(CEPH_OSD_OP_CALL, class_name, 15342647ba38SAlex Elder method_name, outbound, outbound_size); 1535139b4318SAlex Elder if (!op) 153657cfc106SAlex Elder return -ENOMEM; 1537602adf40SYehuda Sadeh 153830573d68SAlex Elder ret = rbd_req_sync_op(rbd_dev, CEPH_OSD_FLAG_READ, op, 1539f8d4de6eSAlex Elder object_name, 0, inbound_size, inbound, 1540f8d4de6eSAlex Elder NULL, ver); 1541602adf40SYehuda Sadeh 15422647ba38SAlex Elder rbd_osd_req_op_destroy(op); 1543602adf40SYehuda Sadeh 1544602adf40SYehuda Sadeh dout("cls_exec returned %d\n", ret); 1545602adf40SYehuda Sadeh return ret; 1546602adf40SYehuda Sadeh } 1547602adf40SYehuda Sadeh 15481fec7093SYehuda Sadeh static struct rbd_req_coll *rbd_alloc_coll(int num_reqs) 15491fec7093SYehuda Sadeh { 15501fec7093SYehuda Sadeh struct rbd_req_coll *coll = 15511fec7093SYehuda Sadeh kzalloc(sizeof(struct rbd_req_coll) + 15521fec7093SYehuda Sadeh sizeof(struct rbd_req_status) * num_reqs, 15531fec7093SYehuda Sadeh GFP_ATOMIC); 15541fec7093SYehuda Sadeh 15551fec7093SYehuda Sadeh if (!coll) 15561fec7093SYehuda Sadeh return NULL; 15571fec7093SYehuda Sadeh coll->total = num_reqs; 15581fec7093SYehuda Sadeh kref_init(&coll->kref); 15591fec7093SYehuda Sadeh return coll; 15601fec7093SYehuda Sadeh } 15611fec7093SYehuda Sadeh 15628295cda7SAlex Elder static int rbd_dev_do_request(struct request *rq, 15638295cda7SAlex Elder struct rbd_device *rbd_dev, 15648295cda7SAlex Elder struct ceph_snap_context *snapc, 15658295cda7SAlex Elder u64 ofs, unsigned int size, 15668295cda7SAlex Elder struct bio *bio_chain) 15678295cda7SAlex Elder { 15688295cda7SAlex Elder int num_segs; 15698295cda7SAlex Elder struct rbd_req_coll *coll; 15708295cda7SAlex Elder unsigned int bio_offset; 15718295cda7SAlex Elder int cur_seg = 0; 15728295cda7SAlex Elder 15738295cda7SAlex Elder dout("%s 0x%x bytes at 0x%llx\n", 15748295cda7SAlex Elder rq_data_dir(rq) == WRITE ? "write" : "read", 15758295cda7SAlex Elder size, (unsigned long long) blk_rq_pos(rq) * SECTOR_SIZE); 15768295cda7SAlex Elder 15778295cda7SAlex Elder num_segs = rbd_get_num_segments(&rbd_dev->header, ofs, size); 15788295cda7SAlex Elder if (num_segs <= 0) 15798295cda7SAlex Elder return num_segs; 15808295cda7SAlex Elder 15818295cda7SAlex Elder coll = rbd_alloc_coll(num_segs); 15828295cda7SAlex Elder if (!coll) 15838295cda7SAlex Elder return -ENOMEM; 15848295cda7SAlex Elder 15858295cda7SAlex Elder bio_offset = 0; 15868295cda7SAlex Elder do { 15878295cda7SAlex Elder u64 limit = rbd_segment_length(rbd_dev, ofs, size); 15888295cda7SAlex Elder unsigned int clone_size; 15898295cda7SAlex Elder struct bio *bio_clone; 15908295cda7SAlex Elder 15918295cda7SAlex Elder BUG_ON(limit > (u64)UINT_MAX); 15928295cda7SAlex Elder clone_size = (unsigned int)limit; 15938295cda7SAlex Elder dout("bio_chain->bi_vcnt=%hu\n", bio_chain->bi_vcnt); 15948295cda7SAlex Elder 15958295cda7SAlex Elder kref_get(&coll->kref); 15968295cda7SAlex Elder 15978295cda7SAlex Elder /* Pass a cloned bio chain via an osd request */ 15988295cda7SAlex Elder 15998295cda7SAlex Elder bio_clone = bio_chain_clone_range(&bio_chain, 16008295cda7SAlex Elder &bio_offset, clone_size, 16018295cda7SAlex Elder GFP_ATOMIC); 16028295cda7SAlex Elder if (bio_clone) 16038295cda7SAlex Elder (void)rbd_do_op(rq, rbd_dev, snapc, 16048295cda7SAlex Elder ofs, clone_size, 16058295cda7SAlex Elder bio_clone, coll, cur_seg); 16068295cda7SAlex Elder else 16078295cda7SAlex Elder rbd_coll_end_req_index(rq, coll, cur_seg, 16088295cda7SAlex Elder (s32)-ENOMEM, 16098295cda7SAlex Elder clone_size); 16108295cda7SAlex Elder size -= clone_size; 16118295cda7SAlex Elder ofs += clone_size; 16128295cda7SAlex Elder 16138295cda7SAlex Elder cur_seg++; 16148295cda7SAlex Elder } while (size > 0); 16158295cda7SAlex Elder kref_put(&coll->kref, rbd_coll_release); 16168295cda7SAlex Elder 16178295cda7SAlex Elder return 0; 16188295cda7SAlex Elder } 16198295cda7SAlex Elder 1620602adf40SYehuda Sadeh /* 1621602adf40SYehuda Sadeh * block device queue callback 1622602adf40SYehuda Sadeh */ 1623602adf40SYehuda Sadeh static void rbd_rq_fn(struct request_queue *q) 1624602adf40SYehuda Sadeh { 1625602adf40SYehuda Sadeh struct rbd_device *rbd_dev = q->queuedata; 1626b395e8b5SAlex Elder bool read_only = rbd_dev->mapping.read_only; 1627602adf40SYehuda Sadeh struct request *rq; 1628602adf40SYehuda Sadeh 162900f1f36fSAlex Elder while ((rq = blk_fetch_request(q))) { 1630b395e8b5SAlex Elder struct ceph_snap_context *snapc = NULL; 1631b395e8b5SAlex Elder unsigned int size = 0; 16328295cda7SAlex Elder int result; 1633602adf40SYehuda Sadeh 1634602adf40SYehuda Sadeh dout("fetched request\n"); 1635602adf40SYehuda Sadeh 1636b395e8b5SAlex Elder /* Filter out block requests we don't understand */ 1637b395e8b5SAlex Elder 1638602adf40SYehuda Sadeh if ((rq->cmd_type != REQ_TYPE_FS)) { 1639602adf40SYehuda Sadeh __blk_end_request_all(rq, 0); 164000f1f36fSAlex Elder continue; 1641602adf40SYehuda Sadeh } 1642602adf40SYehuda Sadeh spin_unlock_irq(q->queue_lock); 1643602adf40SYehuda Sadeh 1644a7b4c65fSAlex Elder /* Write requests need a reference to the snapshot context */ 1645e88a36ecSJosh Durgin 1646a7b4c65fSAlex Elder if (rq_data_dir(rq) == WRITE) { 1647b395e8b5SAlex Elder result = -EROFS; 1648a7b4c65fSAlex Elder if (read_only) /* Can't write to a read-only device */ 1649b395e8b5SAlex Elder goto out_end_request; 1650b395e8b5SAlex Elder 1651a7b4c65fSAlex Elder /* 1652a7b4c65fSAlex Elder * Note that each osd request will take its 1653a7b4c65fSAlex Elder * own reference to the snapshot context 1654a7b4c65fSAlex Elder * supplied. The reference we take here 1655a7b4c65fSAlex Elder * just guarantees the one we provide stays 1656a7b4c65fSAlex Elder * valid. 1657a7b4c65fSAlex Elder */ 1658b395e8b5SAlex Elder down_read(&rbd_dev->header_rwsem); 1659b395e8b5SAlex Elder snapc = ceph_get_snap_context(rbd_dev->header.snapc); 1660d1d25646SJosh Durgin up_read(&rbd_dev->header_rwsem); 1661a7b4c65fSAlex Elder rbd_assert(snapc != NULL); 1662a7b4c65fSAlex Elder } else if (!atomic_read(&rbd_dev->exists)) { 1663b395e8b5SAlex Elder rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP); 1664e88a36ecSJosh Durgin dout("request for non-existent snapshot"); 1665b395e8b5SAlex Elder result = -ENXIO; 1666b395e8b5SAlex Elder goto out_end_request; 1667e88a36ecSJosh Durgin } 1668d1d25646SJosh Durgin 1669f7760dadSAlex Elder size = blk_rq_bytes(rq); 1670b395e8b5SAlex Elder result = rbd_dev_do_request(rq, rbd_dev, snapc, 1671b395e8b5SAlex Elder blk_rq_pos(rq) * SECTOR_SIZE, 1672b395e8b5SAlex Elder size, rq->bio); 1673b395e8b5SAlex Elder out_end_request: 1674a7b4c65fSAlex Elder if (snapc) 1675df111be6SAlex Elder ceph_put_snap_context(snapc); 16761fec7093SYehuda Sadeh spin_lock_irq(q->queue_lock); 16778295cda7SAlex Elder if (!size || result < 0) 16788295cda7SAlex Elder __blk_end_request_all(rq, result); 1679602adf40SYehuda Sadeh } 1680602adf40SYehuda Sadeh } 1681602adf40SYehuda Sadeh 1682602adf40SYehuda Sadeh /* 1683602adf40SYehuda Sadeh * a queue callback. Makes sure that we don't create a bio that spans across 1684602adf40SYehuda Sadeh * multiple osd objects. One exception would be with a single page bios, 1685f7760dadSAlex Elder * which we handle later at bio_chain_clone_range() 1686602adf40SYehuda Sadeh */ 1687602adf40SYehuda Sadeh static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd, 1688602adf40SYehuda Sadeh struct bio_vec *bvec) 1689602adf40SYehuda Sadeh { 1690602adf40SYehuda Sadeh struct rbd_device *rbd_dev = q->queuedata; 1691e5cfeed2SAlex Elder sector_t sector_offset; 1692e5cfeed2SAlex Elder sector_t sectors_per_obj; 1693e5cfeed2SAlex Elder sector_t obj_sector_offset; 1694e5cfeed2SAlex Elder int ret; 1695602adf40SYehuda Sadeh 1696e5cfeed2SAlex Elder /* 1697e5cfeed2SAlex Elder * Find how far into its rbd object the partition-relative 1698e5cfeed2SAlex Elder * bio start sector is to offset relative to the enclosing 1699e5cfeed2SAlex Elder * device. 1700e5cfeed2SAlex Elder */ 1701e5cfeed2SAlex Elder sector_offset = get_start_sect(bmd->bi_bdev) + bmd->bi_sector; 1702e5cfeed2SAlex Elder sectors_per_obj = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT); 1703e5cfeed2SAlex Elder obj_sector_offset = sector_offset & (sectors_per_obj - 1); 1704593a9e7bSAlex Elder 1705e5cfeed2SAlex Elder /* 1706e5cfeed2SAlex Elder * Compute the number of bytes from that offset to the end 1707e5cfeed2SAlex Elder * of the object. Account for what's already used by the bio. 1708e5cfeed2SAlex Elder */ 1709e5cfeed2SAlex Elder ret = (int) (sectors_per_obj - obj_sector_offset) << SECTOR_SHIFT; 1710e5cfeed2SAlex Elder if (ret > bmd->bi_size) 1711e5cfeed2SAlex Elder ret -= bmd->bi_size; 1712e5cfeed2SAlex Elder else 1713e5cfeed2SAlex Elder ret = 0; 1714e5cfeed2SAlex Elder 1715e5cfeed2SAlex Elder /* 1716e5cfeed2SAlex Elder * Don't send back more than was asked for. And if the bio 1717e5cfeed2SAlex Elder * was empty, let the whole thing through because: "Note 1718e5cfeed2SAlex Elder * that a block device *must* allow a single page to be 1719e5cfeed2SAlex Elder * added to an empty bio." 1720e5cfeed2SAlex Elder */ 1721e5cfeed2SAlex Elder rbd_assert(bvec->bv_len <= PAGE_SIZE); 1722e5cfeed2SAlex Elder if (ret > (int) bvec->bv_len || !bmd->bi_size) 1723e5cfeed2SAlex Elder ret = (int) bvec->bv_len; 1724e5cfeed2SAlex Elder 1725e5cfeed2SAlex Elder return ret; 1726602adf40SYehuda Sadeh } 1727602adf40SYehuda Sadeh 1728602adf40SYehuda Sadeh static void rbd_free_disk(struct rbd_device *rbd_dev) 1729602adf40SYehuda Sadeh { 1730602adf40SYehuda Sadeh struct gendisk *disk = rbd_dev->disk; 1731602adf40SYehuda Sadeh 1732602adf40SYehuda Sadeh if (!disk) 1733602adf40SYehuda Sadeh return; 1734602adf40SYehuda Sadeh 1735602adf40SYehuda Sadeh if (disk->flags & GENHD_FL_UP) 1736602adf40SYehuda Sadeh del_gendisk(disk); 1737602adf40SYehuda Sadeh if (disk->queue) 1738602adf40SYehuda Sadeh blk_cleanup_queue(disk->queue); 1739602adf40SYehuda Sadeh put_disk(disk); 1740602adf40SYehuda Sadeh } 1741602adf40SYehuda Sadeh 1742602adf40SYehuda Sadeh /* 17434156d998SAlex Elder * Read the complete header for the given rbd device. 17444156d998SAlex Elder * 17454156d998SAlex Elder * Returns a pointer to a dynamically-allocated buffer containing 17464156d998SAlex Elder * the complete and validated header. Caller can pass the address 17474156d998SAlex Elder * of a variable that will be filled in with the version of the 17484156d998SAlex Elder * header object at the time it was read. 17494156d998SAlex Elder * 17504156d998SAlex Elder * Returns a pointer-coded errno if a failure occurs. 17514156d998SAlex Elder */ 17524156d998SAlex Elder static struct rbd_image_header_ondisk * 17534156d998SAlex Elder rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version) 17544156d998SAlex Elder { 17554156d998SAlex Elder struct rbd_image_header_ondisk *ondisk = NULL; 17564156d998SAlex Elder u32 snap_count = 0; 17574156d998SAlex Elder u64 names_size = 0; 17584156d998SAlex Elder u32 want_count; 17594156d998SAlex Elder int ret; 17604156d998SAlex Elder 17614156d998SAlex Elder /* 17624156d998SAlex Elder * The complete header will include an array of its 64-bit 17634156d998SAlex Elder * snapshot ids, followed by the names of those snapshots as 17644156d998SAlex Elder * a contiguous block of NUL-terminated strings. Note that 17654156d998SAlex Elder * the number of snapshots could change by the time we read 17664156d998SAlex Elder * it in, in which case we re-read it. 17674156d998SAlex Elder */ 17684156d998SAlex Elder do { 17694156d998SAlex Elder size_t size; 17704156d998SAlex Elder 17714156d998SAlex Elder kfree(ondisk); 17724156d998SAlex Elder 17734156d998SAlex Elder size = sizeof (*ondisk); 17744156d998SAlex Elder size += snap_count * sizeof (struct rbd_image_snap_ondisk); 17754156d998SAlex Elder size += names_size; 17764156d998SAlex Elder ondisk = kmalloc(size, GFP_KERNEL); 17774156d998SAlex Elder if (!ondisk) 17784156d998SAlex Elder return ERR_PTR(-ENOMEM); 17794156d998SAlex Elder 17804775618dSAlex Elder ret = rbd_req_sync_read(rbd_dev, rbd_dev->header_name, 17814156d998SAlex Elder 0, size, 17824156d998SAlex Elder (char *) ondisk, version); 17834156d998SAlex Elder 17844156d998SAlex Elder if (ret < 0) 17854156d998SAlex Elder goto out_err; 17864156d998SAlex Elder if (WARN_ON((size_t) ret < size)) { 17874156d998SAlex Elder ret = -ENXIO; 178806ecc6cbSAlex Elder rbd_warn(rbd_dev, "short header read (want %zd got %d)", 178906ecc6cbSAlex Elder size, ret); 17904156d998SAlex Elder goto out_err; 17914156d998SAlex Elder } 17924156d998SAlex Elder if (!rbd_dev_ondisk_valid(ondisk)) { 17934156d998SAlex Elder ret = -ENXIO; 179406ecc6cbSAlex Elder rbd_warn(rbd_dev, "invalid header"); 17954156d998SAlex Elder goto out_err; 17964156d998SAlex Elder } 17974156d998SAlex Elder 17984156d998SAlex Elder names_size = le64_to_cpu(ondisk->snap_names_len); 17994156d998SAlex Elder want_count = snap_count; 18004156d998SAlex Elder snap_count = le32_to_cpu(ondisk->snap_count); 18014156d998SAlex Elder } while (snap_count != want_count); 18024156d998SAlex Elder 18034156d998SAlex Elder return ondisk; 18044156d998SAlex Elder 18054156d998SAlex Elder out_err: 18064156d998SAlex Elder kfree(ondisk); 18074156d998SAlex Elder 18084156d998SAlex Elder return ERR_PTR(ret); 18094156d998SAlex Elder } 18104156d998SAlex Elder 18114156d998SAlex Elder /* 1812602adf40SYehuda Sadeh * reload the ondisk the header 1813602adf40SYehuda Sadeh */ 1814602adf40SYehuda Sadeh static int rbd_read_header(struct rbd_device *rbd_dev, 1815602adf40SYehuda Sadeh struct rbd_image_header *header) 1816602adf40SYehuda Sadeh { 18174156d998SAlex Elder struct rbd_image_header_ondisk *ondisk; 18184156d998SAlex Elder u64 ver = 0; 18194156d998SAlex Elder int ret; 1820602adf40SYehuda Sadeh 18214156d998SAlex Elder ondisk = rbd_dev_v1_header_read(rbd_dev, &ver); 18224156d998SAlex Elder if (IS_ERR(ondisk)) 18234156d998SAlex Elder return PTR_ERR(ondisk); 18244156d998SAlex Elder ret = rbd_header_from_disk(header, ondisk); 18254156d998SAlex Elder if (ret >= 0) 182659c2be1eSYehuda Sadeh header->obj_version = ver; 18274156d998SAlex Elder kfree(ondisk); 1828602adf40SYehuda Sadeh 18294156d998SAlex Elder return ret; 1830602adf40SYehuda Sadeh } 1831602adf40SYehuda Sadeh 183241f38c2bSAlex Elder static void rbd_remove_all_snaps(struct rbd_device *rbd_dev) 1833dfc5606dSYehuda Sadeh { 1834dfc5606dSYehuda Sadeh struct rbd_snap *snap; 1835a0593290SAlex Elder struct rbd_snap *next; 1836dfc5606dSYehuda Sadeh 1837a0593290SAlex Elder list_for_each_entry_safe(snap, next, &rbd_dev->snaps, node) 183841f38c2bSAlex Elder rbd_remove_snap_dev(snap); 1839dfc5606dSYehuda Sadeh } 1840dfc5606dSYehuda Sadeh 18419478554aSAlex Elder static void rbd_update_mapping_size(struct rbd_device *rbd_dev) 18429478554aSAlex Elder { 18439478554aSAlex Elder sector_t size; 18449478554aSAlex Elder 18450d7dbfceSAlex Elder if (rbd_dev->spec->snap_id != CEPH_NOSNAP) 18469478554aSAlex Elder return; 18479478554aSAlex Elder 18489478554aSAlex Elder size = (sector_t) rbd_dev->header.image_size / SECTOR_SIZE; 18499478554aSAlex Elder dout("setting size to %llu sectors", (unsigned long long) size); 18509478554aSAlex Elder rbd_dev->mapping.size = (u64) size; 18519478554aSAlex Elder set_capacity(rbd_dev->disk, size); 18529478554aSAlex Elder } 18539478554aSAlex Elder 1854602adf40SYehuda Sadeh /* 1855602adf40SYehuda Sadeh * only read the first part of the ondisk header, without the snaps info 1856602adf40SYehuda Sadeh */ 1857117973fbSAlex Elder static int rbd_dev_v1_refresh(struct rbd_device *rbd_dev, u64 *hver) 1858602adf40SYehuda Sadeh { 1859602adf40SYehuda Sadeh int ret; 1860602adf40SYehuda Sadeh struct rbd_image_header h; 1861602adf40SYehuda Sadeh 1862602adf40SYehuda Sadeh ret = rbd_read_header(rbd_dev, &h); 1863602adf40SYehuda Sadeh if (ret < 0) 1864602adf40SYehuda Sadeh return ret; 1865602adf40SYehuda Sadeh 1866a51aa0c0SJosh Durgin down_write(&rbd_dev->header_rwsem); 1867a51aa0c0SJosh Durgin 18689478554aSAlex Elder /* Update image size, and check for resize of mapped image */ 18699478554aSAlex Elder rbd_dev->header.image_size = h.image_size; 18709478554aSAlex Elder rbd_update_mapping_size(rbd_dev); 18719db4b3e3SSage Weil 1872849b4260SAlex Elder /* rbd_dev->header.object_prefix shouldn't change */ 1873602adf40SYehuda Sadeh kfree(rbd_dev->header.snap_sizes); 1874849b4260SAlex Elder kfree(rbd_dev->header.snap_names); 1875d1d25646SJosh Durgin /* osd requests may still refer to snapc */ 1876d1d25646SJosh Durgin ceph_put_snap_context(rbd_dev->header.snapc); 1877602adf40SYehuda Sadeh 1878b813623aSAlex Elder if (hver) 1879b813623aSAlex Elder *hver = h.obj_version; 1880a71b891bSJosh Durgin rbd_dev->header.obj_version = h.obj_version; 188193a24e08SJosh Durgin rbd_dev->header.image_size = h.image_size; 1882602adf40SYehuda Sadeh rbd_dev->header.snapc = h.snapc; 1883602adf40SYehuda Sadeh rbd_dev->header.snap_names = h.snap_names; 1884602adf40SYehuda Sadeh rbd_dev->header.snap_sizes = h.snap_sizes; 1885849b4260SAlex Elder /* Free the extra copy of the object prefix */ 1886849b4260SAlex Elder WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix)); 1887849b4260SAlex Elder kfree(h.object_prefix); 1888849b4260SAlex Elder 1889304f6808SAlex Elder ret = rbd_dev_snaps_update(rbd_dev); 1890304f6808SAlex Elder if (!ret) 1891304f6808SAlex Elder ret = rbd_dev_snaps_register(rbd_dev); 1892dfc5606dSYehuda Sadeh 1893c666601aSJosh Durgin up_write(&rbd_dev->header_rwsem); 1894602adf40SYehuda Sadeh 1895dfc5606dSYehuda Sadeh return ret; 1896602adf40SYehuda Sadeh } 1897602adf40SYehuda Sadeh 1898117973fbSAlex Elder static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver) 18991fe5e993SAlex Elder { 19001fe5e993SAlex Elder int ret; 19011fe5e993SAlex Elder 1902117973fbSAlex Elder rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); 19031fe5e993SAlex Elder mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); 1904117973fbSAlex Elder if (rbd_dev->image_format == 1) 1905117973fbSAlex Elder ret = rbd_dev_v1_refresh(rbd_dev, hver); 1906117973fbSAlex Elder else 1907117973fbSAlex Elder ret = rbd_dev_v2_refresh(rbd_dev, hver); 19081fe5e993SAlex Elder mutex_unlock(&ctl_mutex); 19091fe5e993SAlex Elder 19101fe5e993SAlex Elder return ret; 19111fe5e993SAlex Elder } 19121fe5e993SAlex Elder 1913602adf40SYehuda Sadeh static int rbd_init_disk(struct rbd_device *rbd_dev) 1914602adf40SYehuda Sadeh { 1915602adf40SYehuda Sadeh struct gendisk *disk; 1916602adf40SYehuda Sadeh struct request_queue *q; 1917593a9e7bSAlex Elder u64 segment_size; 1918602adf40SYehuda Sadeh 1919602adf40SYehuda Sadeh /* create gendisk info */ 1920602adf40SYehuda Sadeh disk = alloc_disk(RBD_MINORS_PER_MAJOR); 1921602adf40SYehuda Sadeh if (!disk) 19221fcdb8aaSAlex Elder return -ENOMEM; 1923602adf40SYehuda Sadeh 1924f0f8cef5SAlex Elder snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d", 1925de71a297SAlex Elder rbd_dev->dev_id); 1926602adf40SYehuda Sadeh disk->major = rbd_dev->major; 1927602adf40SYehuda Sadeh disk->first_minor = 0; 1928602adf40SYehuda Sadeh disk->fops = &rbd_bd_ops; 1929602adf40SYehuda Sadeh disk->private_data = rbd_dev; 1930602adf40SYehuda Sadeh 1931602adf40SYehuda Sadeh /* init rq */ 1932602adf40SYehuda Sadeh q = blk_init_queue(rbd_rq_fn, &rbd_dev->lock); 1933602adf40SYehuda Sadeh if (!q) 1934602adf40SYehuda Sadeh goto out_disk; 1935029bcbd8SJosh Durgin 1936593a9e7bSAlex Elder /* We use the default size, but let's be explicit about it. */ 1937593a9e7bSAlex Elder blk_queue_physical_block_size(q, SECTOR_SIZE); 1938593a9e7bSAlex Elder 1939029bcbd8SJosh Durgin /* set io sizes to object size */ 1940593a9e7bSAlex Elder segment_size = rbd_obj_bytes(&rbd_dev->header); 1941593a9e7bSAlex Elder blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE); 1942593a9e7bSAlex Elder blk_queue_max_segment_size(q, segment_size); 1943593a9e7bSAlex Elder blk_queue_io_min(q, segment_size); 1944593a9e7bSAlex Elder blk_queue_io_opt(q, segment_size); 1945029bcbd8SJosh Durgin 1946602adf40SYehuda Sadeh blk_queue_merge_bvec(q, rbd_merge_bvec); 1947602adf40SYehuda Sadeh disk->queue = q; 1948602adf40SYehuda Sadeh 1949602adf40SYehuda Sadeh q->queuedata = rbd_dev; 1950602adf40SYehuda Sadeh 1951602adf40SYehuda Sadeh rbd_dev->disk = disk; 1952602adf40SYehuda Sadeh 195312f02944SAlex Elder set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE); 195412f02944SAlex Elder 1955602adf40SYehuda Sadeh return 0; 1956602adf40SYehuda Sadeh out_disk: 1957602adf40SYehuda Sadeh put_disk(disk); 19581fcdb8aaSAlex Elder 19591fcdb8aaSAlex Elder return -ENOMEM; 1960602adf40SYehuda Sadeh } 1961602adf40SYehuda Sadeh 1962dfc5606dSYehuda Sadeh /* 1963dfc5606dSYehuda Sadeh sysfs 1964dfc5606dSYehuda Sadeh */ 1965602adf40SYehuda Sadeh 1966593a9e7bSAlex Elder static struct rbd_device *dev_to_rbd_dev(struct device *dev) 1967593a9e7bSAlex Elder { 1968593a9e7bSAlex Elder return container_of(dev, struct rbd_device, dev); 1969593a9e7bSAlex Elder } 1970593a9e7bSAlex Elder 1971dfc5606dSYehuda Sadeh static ssize_t rbd_size_show(struct device *dev, 1972dfc5606dSYehuda Sadeh struct device_attribute *attr, char *buf) 1973602adf40SYehuda Sadeh { 1974593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 1975a51aa0c0SJosh Durgin sector_t size; 1976dfc5606dSYehuda Sadeh 1977a51aa0c0SJosh Durgin down_read(&rbd_dev->header_rwsem); 1978a51aa0c0SJosh Durgin size = get_capacity(rbd_dev->disk); 1979a51aa0c0SJosh Durgin up_read(&rbd_dev->header_rwsem); 1980a51aa0c0SJosh Durgin 1981a51aa0c0SJosh Durgin return sprintf(buf, "%llu\n", (unsigned long long) size * SECTOR_SIZE); 1982602adf40SYehuda Sadeh } 1983602adf40SYehuda Sadeh 198434b13184SAlex Elder /* 198534b13184SAlex Elder * Note this shows the features for whatever's mapped, which is not 198634b13184SAlex Elder * necessarily the base image. 198734b13184SAlex Elder */ 198834b13184SAlex Elder static ssize_t rbd_features_show(struct device *dev, 198934b13184SAlex Elder struct device_attribute *attr, char *buf) 199034b13184SAlex Elder { 199134b13184SAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 199234b13184SAlex Elder 199334b13184SAlex Elder return sprintf(buf, "0x%016llx\n", 199434b13184SAlex Elder (unsigned long long) rbd_dev->mapping.features); 199534b13184SAlex Elder } 199634b13184SAlex Elder 1997dfc5606dSYehuda Sadeh static ssize_t rbd_major_show(struct device *dev, 1998dfc5606dSYehuda Sadeh struct device_attribute *attr, char *buf) 1999602adf40SYehuda Sadeh { 2000593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 2001dfc5606dSYehuda Sadeh 2002dfc5606dSYehuda Sadeh return sprintf(buf, "%d\n", rbd_dev->major); 2003dfc5606dSYehuda Sadeh } 2004dfc5606dSYehuda Sadeh 2005dfc5606dSYehuda Sadeh static ssize_t rbd_client_id_show(struct device *dev, 2006dfc5606dSYehuda Sadeh struct device_attribute *attr, char *buf) 2007dfc5606dSYehuda Sadeh { 2008593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 2009dfc5606dSYehuda Sadeh 20101dbb4399SAlex Elder return sprintf(buf, "client%lld\n", 20111dbb4399SAlex Elder ceph_client_id(rbd_dev->rbd_client->client)); 2012dfc5606dSYehuda Sadeh } 2013dfc5606dSYehuda Sadeh 2014dfc5606dSYehuda Sadeh static ssize_t rbd_pool_show(struct device *dev, 2015dfc5606dSYehuda Sadeh struct device_attribute *attr, char *buf) 2016dfc5606dSYehuda Sadeh { 2017593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 2018dfc5606dSYehuda Sadeh 20190d7dbfceSAlex Elder return sprintf(buf, "%s\n", rbd_dev->spec->pool_name); 2020dfc5606dSYehuda Sadeh } 2021dfc5606dSYehuda Sadeh 20229bb2f334SAlex Elder static ssize_t rbd_pool_id_show(struct device *dev, 20239bb2f334SAlex Elder struct device_attribute *attr, char *buf) 20249bb2f334SAlex Elder { 20259bb2f334SAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 20269bb2f334SAlex Elder 20270d7dbfceSAlex Elder return sprintf(buf, "%llu\n", 20280d7dbfceSAlex Elder (unsigned long long) rbd_dev->spec->pool_id); 20299bb2f334SAlex Elder } 20309bb2f334SAlex Elder 2031dfc5606dSYehuda Sadeh static ssize_t rbd_name_show(struct device *dev, 2032dfc5606dSYehuda Sadeh struct device_attribute *attr, char *buf) 2033dfc5606dSYehuda Sadeh { 2034593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 2035dfc5606dSYehuda Sadeh 2036a92ffdf8SAlex Elder if (rbd_dev->spec->image_name) 20370d7dbfceSAlex Elder return sprintf(buf, "%s\n", rbd_dev->spec->image_name); 2038a92ffdf8SAlex Elder 2039a92ffdf8SAlex Elder return sprintf(buf, "(unknown)\n"); 2040dfc5606dSYehuda Sadeh } 2041dfc5606dSYehuda Sadeh 2042589d30e0SAlex Elder static ssize_t rbd_image_id_show(struct device *dev, 2043589d30e0SAlex Elder struct device_attribute *attr, char *buf) 2044589d30e0SAlex Elder { 2045589d30e0SAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 2046589d30e0SAlex Elder 20470d7dbfceSAlex Elder return sprintf(buf, "%s\n", rbd_dev->spec->image_id); 2048589d30e0SAlex Elder } 2049589d30e0SAlex Elder 205034b13184SAlex Elder /* 205134b13184SAlex Elder * Shows the name of the currently-mapped snapshot (or 205234b13184SAlex Elder * RBD_SNAP_HEAD_NAME for the base image). 205334b13184SAlex Elder */ 2054dfc5606dSYehuda Sadeh static ssize_t rbd_snap_show(struct device *dev, 2055dfc5606dSYehuda Sadeh struct device_attribute *attr, 2056dfc5606dSYehuda Sadeh char *buf) 2057dfc5606dSYehuda Sadeh { 2058593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 2059dfc5606dSYehuda Sadeh 20600d7dbfceSAlex Elder return sprintf(buf, "%s\n", rbd_dev->spec->snap_name); 2061dfc5606dSYehuda Sadeh } 2062dfc5606dSYehuda Sadeh 206386b00e0dSAlex Elder /* 206486b00e0dSAlex Elder * For an rbd v2 image, shows the pool id, image id, and snapshot id 206586b00e0dSAlex Elder * for the parent image. If there is no parent, simply shows 206686b00e0dSAlex Elder * "(no parent image)". 206786b00e0dSAlex Elder */ 206886b00e0dSAlex Elder static ssize_t rbd_parent_show(struct device *dev, 206986b00e0dSAlex Elder struct device_attribute *attr, 207086b00e0dSAlex Elder char *buf) 207186b00e0dSAlex Elder { 207286b00e0dSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 207386b00e0dSAlex Elder struct rbd_spec *spec = rbd_dev->parent_spec; 207486b00e0dSAlex Elder int count; 207586b00e0dSAlex Elder char *bufp = buf; 207686b00e0dSAlex Elder 207786b00e0dSAlex Elder if (!spec) 207886b00e0dSAlex Elder return sprintf(buf, "(no parent image)\n"); 207986b00e0dSAlex Elder 208086b00e0dSAlex Elder count = sprintf(bufp, "pool_id %llu\npool_name %s\n", 208186b00e0dSAlex Elder (unsigned long long) spec->pool_id, spec->pool_name); 208286b00e0dSAlex Elder if (count < 0) 208386b00e0dSAlex Elder return count; 208486b00e0dSAlex Elder bufp += count; 208586b00e0dSAlex Elder 208686b00e0dSAlex Elder count = sprintf(bufp, "image_id %s\nimage_name %s\n", spec->image_id, 208786b00e0dSAlex Elder spec->image_name ? spec->image_name : "(unknown)"); 208886b00e0dSAlex Elder if (count < 0) 208986b00e0dSAlex Elder return count; 209086b00e0dSAlex Elder bufp += count; 209186b00e0dSAlex Elder 209286b00e0dSAlex Elder count = sprintf(bufp, "snap_id %llu\nsnap_name %s\n", 209386b00e0dSAlex Elder (unsigned long long) spec->snap_id, spec->snap_name); 209486b00e0dSAlex Elder if (count < 0) 209586b00e0dSAlex Elder return count; 209686b00e0dSAlex Elder bufp += count; 209786b00e0dSAlex Elder 209886b00e0dSAlex Elder count = sprintf(bufp, "overlap %llu\n", rbd_dev->parent_overlap); 209986b00e0dSAlex Elder if (count < 0) 210086b00e0dSAlex Elder return count; 210186b00e0dSAlex Elder bufp += count; 210286b00e0dSAlex Elder 210386b00e0dSAlex Elder return (ssize_t) (bufp - buf); 210486b00e0dSAlex Elder } 210586b00e0dSAlex Elder 2106dfc5606dSYehuda Sadeh static ssize_t rbd_image_refresh(struct device *dev, 2107dfc5606dSYehuda Sadeh struct device_attribute *attr, 2108dfc5606dSYehuda Sadeh const char *buf, 2109dfc5606dSYehuda Sadeh size_t size) 2110dfc5606dSYehuda Sadeh { 2111593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 2112b813623aSAlex Elder int ret; 2113602adf40SYehuda Sadeh 2114117973fbSAlex Elder ret = rbd_dev_refresh(rbd_dev, NULL); 2115b813623aSAlex Elder 2116b813623aSAlex Elder return ret < 0 ? ret : size; 2117dfc5606dSYehuda Sadeh } 2118602adf40SYehuda Sadeh 2119dfc5606dSYehuda Sadeh static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL); 212034b13184SAlex Elder static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL); 2121dfc5606dSYehuda Sadeh static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL); 2122dfc5606dSYehuda Sadeh static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL); 2123dfc5606dSYehuda Sadeh static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL); 21249bb2f334SAlex Elder static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL); 2125dfc5606dSYehuda Sadeh static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL); 2126589d30e0SAlex Elder static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL); 2127dfc5606dSYehuda Sadeh static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh); 2128dfc5606dSYehuda Sadeh static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL); 212986b00e0dSAlex Elder static DEVICE_ATTR(parent, S_IRUGO, rbd_parent_show, NULL); 2130dfc5606dSYehuda Sadeh 2131dfc5606dSYehuda Sadeh static struct attribute *rbd_attrs[] = { 2132dfc5606dSYehuda Sadeh &dev_attr_size.attr, 213334b13184SAlex Elder &dev_attr_features.attr, 2134dfc5606dSYehuda Sadeh &dev_attr_major.attr, 2135dfc5606dSYehuda Sadeh &dev_attr_client_id.attr, 2136dfc5606dSYehuda Sadeh &dev_attr_pool.attr, 21379bb2f334SAlex Elder &dev_attr_pool_id.attr, 2138dfc5606dSYehuda Sadeh &dev_attr_name.attr, 2139589d30e0SAlex Elder &dev_attr_image_id.attr, 2140dfc5606dSYehuda Sadeh &dev_attr_current_snap.attr, 214186b00e0dSAlex Elder &dev_attr_parent.attr, 2142dfc5606dSYehuda Sadeh &dev_attr_refresh.attr, 2143dfc5606dSYehuda Sadeh NULL 2144dfc5606dSYehuda Sadeh }; 2145dfc5606dSYehuda Sadeh 2146dfc5606dSYehuda Sadeh static struct attribute_group rbd_attr_group = { 2147dfc5606dSYehuda Sadeh .attrs = rbd_attrs, 2148dfc5606dSYehuda Sadeh }; 2149dfc5606dSYehuda Sadeh 2150dfc5606dSYehuda Sadeh static const struct attribute_group *rbd_attr_groups[] = { 2151dfc5606dSYehuda Sadeh &rbd_attr_group, 2152dfc5606dSYehuda Sadeh NULL 2153dfc5606dSYehuda Sadeh }; 2154dfc5606dSYehuda Sadeh 2155dfc5606dSYehuda Sadeh static void rbd_sysfs_dev_release(struct device *dev) 2156dfc5606dSYehuda Sadeh { 2157dfc5606dSYehuda Sadeh } 2158dfc5606dSYehuda Sadeh 2159dfc5606dSYehuda Sadeh static struct device_type rbd_device_type = { 2160dfc5606dSYehuda Sadeh .name = "rbd", 2161dfc5606dSYehuda Sadeh .groups = rbd_attr_groups, 2162dfc5606dSYehuda Sadeh .release = rbd_sysfs_dev_release, 2163dfc5606dSYehuda Sadeh }; 2164dfc5606dSYehuda Sadeh 2165dfc5606dSYehuda Sadeh 2166dfc5606dSYehuda Sadeh /* 2167dfc5606dSYehuda Sadeh sysfs - snapshots 2168dfc5606dSYehuda Sadeh */ 2169dfc5606dSYehuda Sadeh 2170dfc5606dSYehuda Sadeh static ssize_t rbd_snap_size_show(struct device *dev, 2171dfc5606dSYehuda Sadeh struct device_attribute *attr, 2172dfc5606dSYehuda Sadeh char *buf) 2173dfc5606dSYehuda Sadeh { 2174dfc5606dSYehuda Sadeh struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev); 2175dfc5606dSYehuda Sadeh 21763591538fSJosh Durgin return sprintf(buf, "%llu\n", (unsigned long long)snap->size); 2177dfc5606dSYehuda Sadeh } 2178dfc5606dSYehuda Sadeh 2179dfc5606dSYehuda Sadeh static ssize_t rbd_snap_id_show(struct device *dev, 2180dfc5606dSYehuda Sadeh struct device_attribute *attr, 2181dfc5606dSYehuda Sadeh char *buf) 2182dfc5606dSYehuda Sadeh { 2183dfc5606dSYehuda Sadeh struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev); 2184dfc5606dSYehuda Sadeh 2185593a9e7bSAlex Elder return sprintf(buf, "%llu\n", (unsigned long long)snap->id); 2186dfc5606dSYehuda Sadeh } 2187dfc5606dSYehuda Sadeh 218834b13184SAlex Elder static ssize_t rbd_snap_features_show(struct device *dev, 218934b13184SAlex Elder struct device_attribute *attr, 219034b13184SAlex Elder char *buf) 219134b13184SAlex Elder { 219234b13184SAlex Elder struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev); 219334b13184SAlex Elder 219434b13184SAlex Elder return sprintf(buf, "0x%016llx\n", 219534b13184SAlex Elder (unsigned long long) snap->features); 219634b13184SAlex Elder } 219734b13184SAlex Elder 2198dfc5606dSYehuda Sadeh static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL); 2199dfc5606dSYehuda Sadeh static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL); 220034b13184SAlex Elder static DEVICE_ATTR(snap_features, S_IRUGO, rbd_snap_features_show, NULL); 2201dfc5606dSYehuda Sadeh 2202dfc5606dSYehuda Sadeh static struct attribute *rbd_snap_attrs[] = { 2203dfc5606dSYehuda Sadeh &dev_attr_snap_size.attr, 2204dfc5606dSYehuda Sadeh &dev_attr_snap_id.attr, 220534b13184SAlex Elder &dev_attr_snap_features.attr, 2206dfc5606dSYehuda Sadeh NULL, 2207dfc5606dSYehuda Sadeh }; 2208dfc5606dSYehuda Sadeh 2209dfc5606dSYehuda Sadeh static struct attribute_group rbd_snap_attr_group = { 2210dfc5606dSYehuda Sadeh .attrs = rbd_snap_attrs, 2211dfc5606dSYehuda Sadeh }; 2212dfc5606dSYehuda Sadeh 2213dfc5606dSYehuda Sadeh static void rbd_snap_dev_release(struct device *dev) 2214dfc5606dSYehuda Sadeh { 2215dfc5606dSYehuda Sadeh struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev); 2216dfc5606dSYehuda Sadeh kfree(snap->name); 2217dfc5606dSYehuda Sadeh kfree(snap); 2218dfc5606dSYehuda Sadeh } 2219dfc5606dSYehuda Sadeh 2220dfc5606dSYehuda Sadeh static const struct attribute_group *rbd_snap_attr_groups[] = { 2221dfc5606dSYehuda Sadeh &rbd_snap_attr_group, 2222dfc5606dSYehuda Sadeh NULL 2223dfc5606dSYehuda Sadeh }; 2224dfc5606dSYehuda Sadeh 2225dfc5606dSYehuda Sadeh static struct device_type rbd_snap_device_type = { 2226dfc5606dSYehuda Sadeh .groups = rbd_snap_attr_groups, 2227dfc5606dSYehuda Sadeh .release = rbd_snap_dev_release, 2228dfc5606dSYehuda Sadeh }; 2229dfc5606dSYehuda Sadeh 22308b8fb99cSAlex Elder static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec) 22318b8fb99cSAlex Elder { 22328b8fb99cSAlex Elder kref_get(&spec->kref); 22338b8fb99cSAlex Elder 22348b8fb99cSAlex Elder return spec; 22358b8fb99cSAlex Elder } 22368b8fb99cSAlex Elder 22378b8fb99cSAlex Elder static void rbd_spec_free(struct kref *kref); 22388b8fb99cSAlex Elder static void rbd_spec_put(struct rbd_spec *spec) 22398b8fb99cSAlex Elder { 22408b8fb99cSAlex Elder if (spec) 22418b8fb99cSAlex Elder kref_put(&spec->kref, rbd_spec_free); 22428b8fb99cSAlex Elder } 22438b8fb99cSAlex Elder 22448b8fb99cSAlex Elder static struct rbd_spec *rbd_spec_alloc(void) 22458b8fb99cSAlex Elder { 22468b8fb99cSAlex Elder struct rbd_spec *spec; 22478b8fb99cSAlex Elder 22488b8fb99cSAlex Elder spec = kzalloc(sizeof (*spec), GFP_KERNEL); 22498b8fb99cSAlex Elder if (!spec) 22508b8fb99cSAlex Elder return NULL; 22518b8fb99cSAlex Elder kref_init(&spec->kref); 22528b8fb99cSAlex Elder 22538b8fb99cSAlex Elder rbd_spec_put(rbd_spec_get(spec)); /* TEMPORARY */ 22548b8fb99cSAlex Elder 22558b8fb99cSAlex Elder return spec; 22568b8fb99cSAlex Elder } 22578b8fb99cSAlex Elder 22588b8fb99cSAlex Elder static void rbd_spec_free(struct kref *kref) 22598b8fb99cSAlex Elder { 22608b8fb99cSAlex Elder struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref); 22618b8fb99cSAlex Elder 22628b8fb99cSAlex Elder kfree(spec->pool_name); 22638b8fb99cSAlex Elder kfree(spec->image_id); 22648b8fb99cSAlex Elder kfree(spec->image_name); 22658b8fb99cSAlex Elder kfree(spec->snap_name); 22668b8fb99cSAlex Elder kfree(spec); 22678b8fb99cSAlex Elder } 22688b8fb99cSAlex Elder 2269c53d5893SAlex Elder struct rbd_device *rbd_dev_create(struct rbd_client *rbdc, 2270c53d5893SAlex Elder struct rbd_spec *spec) 2271c53d5893SAlex Elder { 2272c53d5893SAlex Elder struct rbd_device *rbd_dev; 2273c53d5893SAlex Elder 2274c53d5893SAlex Elder rbd_dev = kzalloc(sizeof (*rbd_dev), GFP_KERNEL); 2275c53d5893SAlex Elder if (!rbd_dev) 2276c53d5893SAlex Elder return NULL; 2277c53d5893SAlex Elder 2278c53d5893SAlex Elder spin_lock_init(&rbd_dev->lock); 2279d78b650aSAlex Elder atomic_set(&rbd_dev->exists, 0); 2280c53d5893SAlex Elder INIT_LIST_HEAD(&rbd_dev->node); 2281c53d5893SAlex Elder INIT_LIST_HEAD(&rbd_dev->snaps); 2282c53d5893SAlex Elder init_rwsem(&rbd_dev->header_rwsem); 2283c53d5893SAlex Elder 2284c53d5893SAlex Elder rbd_dev->spec = spec; 2285c53d5893SAlex Elder rbd_dev->rbd_client = rbdc; 2286c53d5893SAlex Elder 22870903e875SAlex Elder /* Initialize the layout used for all rbd requests */ 22880903e875SAlex Elder 22890903e875SAlex Elder rbd_dev->layout.fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER); 22900903e875SAlex Elder rbd_dev->layout.fl_stripe_count = cpu_to_le32(1); 22910903e875SAlex Elder rbd_dev->layout.fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER); 22920903e875SAlex Elder rbd_dev->layout.fl_pg_pool = cpu_to_le32((u32) spec->pool_id); 22930903e875SAlex Elder 2294c53d5893SAlex Elder return rbd_dev; 2295c53d5893SAlex Elder } 2296c53d5893SAlex Elder 2297c53d5893SAlex Elder static void rbd_dev_destroy(struct rbd_device *rbd_dev) 2298c53d5893SAlex Elder { 229986b00e0dSAlex Elder rbd_spec_put(rbd_dev->parent_spec); 2300c53d5893SAlex Elder kfree(rbd_dev->header_name); 2301c53d5893SAlex Elder rbd_put_client(rbd_dev->rbd_client); 2302c53d5893SAlex Elder rbd_spec_put(rbd_dev->spec); 2303c53d5893SAlex Elder kfree(rbd_dev); 2304c53d5893SAlex Elder } 2305c53d5893SAlex Elder 2306304f6808SAlex Elder static bool rbd_snap_registered(struct rbd_snap *snap) 2307304f6808SAlex Elder { 2308304f6808SAlex Elder bool ret = snap->dev.type == &rbd_snap_device_type; 2309304f6808SAlex Elder bool reg = device_is_registered(&snap->dev); 2310304f6808SAlex Elder 2311304f6808SAlex Elder rbd_assert(!ret ^ reg); 2312304f6808SAlex Elder 2313304f6808SAlex Elder return ret; 2314304f6808SAlex Elder } 2315304f6808SAlex Elder 231641f38c2bSAlex Elder static void rbd_remove_snap_dev(struct rbd_snap *snap) 2317dfc5606dSYehuda Sadeh { 2318dfc5606dSYehuda Sadeh list_del(&snap->node); 2319304f6808SAlex Elder if (device_is_registered(&snap->dev)) 2320dfc5606dSYehuda Sadeh device_unregister(&snap->dev); 2321dfc5606dSYehuda Sadeh } 2322dfc5606dSYehuda Sadeh 232314e7085dSAlex Elder static int rbd_register_snap_dev(struct rbd_snap *snap, 2324dfc5606dSYehuda Sadeh struct device *parent) 2325dfc5606dSYehuda Sadeh { 2326dfc5606dSYehuda Sadeh struct device *dev = &snap->dev; 2327dfc5606dSYehuda Sadeh int ret; 2328dfc5606dSYehuda Sadeh 2329dfc5606dSYehuda Sadeh dev->type = &rbd_snap_device_type; 2330dfc5606dSYehuda Sadeh dev->parent = parent; 2331dfc5606dSYehuda Sadeh dev->release = rbd_snap_dev_release; 2332d4b125e9SAlex Elder dev_set_name(dev, "%s%s", RBD_SNAP_DEV_NAME_PREFIX, snap->name); 2333304f6808SAlex Elder dout("%s: registering device for snapshot %s\n", __func__, snap->name); 2334304f6808SAlex Elder 2335dfc5606dSYehuda Sadeh ret = device_register(dev); 2336dfc5606dSYehuda Sadeh 2337dfc5606dSYehuda Sadeh return ret; 2338dfc5606dSYehuda Sadeh } 2339dfc5606dSYehuda Sadeh 23404e891e0aSAlex Elder static struct rbd_snap *__rbd_add_snap_dev(struct rbd_device *rbd_dev, 2341c8d18425SAlex Elder const char *snap_name, 234234b13184SAlex Elder u64 snap_id, u64 snap_size, 234334b13184SAlex Elder u64 snap_features) 2344dfc5606dSYehuda Sadeh { 23454e891e0aSAlex Elder struct rbd_snap *snap; 2346dfc5606dSYehuda Sadeh int ret; 23474e891e0aSAlex Elder 23484e891e0aSAlex Elder snap = kzalloc(sizeof (*snap), GFP_KERNEL); 2349dfc5606dSYehuda Sadeh if (!snap) 23504e891e0aSAlex Elder return ERR_PTR(-ENOMEM); 23514e891e0aSAlex Elder 23524e891e0aSAlex Elder ret = -ENOMEM; 2353c8d18425SAlex Elder snap->name = kstrdup(snap_name, GFP_KERNEL); 23544e891e0aSAlex Elder if (!snap->name) 23554e891e0aSAlex Elder goto err; 23564e891e0aSAlex Elder 2357c8d18425SAlex Elder snap->id = snap_id; 2358c8d18425SAlex Elder snap->size = snap_size; 235934b13184SAlex Elder snap->features = snap_features; 23604e891e0aSAlex Elder 23614e891e0aSAlex Elder return snap; 23624e891e0aSAlex Elder 2363dfc5606dSYehuda Sadeh err: 2364dfc5606dSYehuda Sadeh kfree(snap->name); 2365dfc5606dSYehuda Sadeh kfree(snap); 23664e891e0aSAlex Elder 23674e891e0aSAlex Elder return ERR_PTR(ret); 2368dfc5606dSYehuda Sadeh } 2369dfc5606dSYehuda Sadeh 2370cd892126SAlex Elder static char *rbd_dev_v1_snap_info(struct rbd_device *rbd_dev, u32 which, 2371cd892126SAlex Elder u64 *snap_size, u64 *snap_features) 2372cd892126SAlex Elder { 2373cd892126SAlex Elder char *snap_name; 2374cd892126SAlex Elder 2375cd892126SAlex Elder rbd_assert(which < rbd_dev->header.snapc->num_snaps); 2376cd892126SAlex Elder 2377cd892126SAlex Elder *snap_size = rbd_dev->header.snap_sizes[which]; 2378cd892126SAlex Elder *snap_features = 0; /* No features for v1 */ 2379cd892126SAlex Elder 2380cd892126SAlex Elder /* Skip over names until we find the one we are looking for */ 2381cd892126SAlex Elder 2382cd892126SAlex Elder snap_name = rbd_dev->header.snap_names; 2383cd892126SAlex Elder while (which--) 2384cd892126SAlex Elder snap_name += strlen(snap_name) + 1; 2385cd892126SAlex Elder 2386cd892126SAlex Elder return snap_name; 2387cd892126SAlex Elder } 2388cd892126SAlex Elder 2389dfc5606dSYehuda Sadeh /* 23909d475de5SAlex Elder * Get the size and object order for an image snapshot, or if 23919d475de5SAlex Elder * snap_id is CEPH_NOSNAP, gets this information for the base 23929d475de5SAlex Elder * image. 23939d475de5SAlex Elder */ 23949d475de5SAlex Elder static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id, 23959d475de5SAlex Elder u8 *order, u64 *snap_size) 23969d475de5SAlex Elder { 23979d475de5SAlex Elder __le64 snapid = cpu_to_le64(snap_id); 23989d475de5SAlex Elder int ret; 23999d475de5SAlex Elder struct { 24009d475de5SAlex Elder u8 order; 24019d475de5SAlex Elder __le64 size; 24029d475de5SAlex Elder } __attribute__ ((packed)) size_buf = { 0 }; 24039d475de5SAlex Elder 24049d475de5SAlex Elder ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name, 24059d475de5SAlex Elder "rbd", "get_size", 24069d475de5SAlex Elder (char *) &snapid, sizeof (snapid), 240707b2391fSAlex Elder (char *) &size_buf, sizeof (size_buf), NULL); 24089d475de5SAlex Elder dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret); 24099d475de5SAlex Elder if (ret < 0) 24109d475de5SAlex Elder return ret; 24119d475de5SAlex Elder 24129d475de5SAlex Elder *order = size_buf.order; 24139d475de5SAlex Elder *snap_size = le64_to_cpu(size_buf.size); 24149d475de5SAlex Elder 24159d475de5SAlex Elder dout(" snap_id 0x%016llx order = %u, snap_size = %llu\n", 24169d475de5SAlex Elder (unsigned long long) snap_id, (unsigned int) *order, 24179d475de5SAlex Elder (unsigned long long) *snap_size); 24189d475de5SAlex Elder 24199d475de5SAlex Elder return 0; 24209d475de5SAlex Elder } 24219d475de5SAlex Elder 24229d475de5SAlex Elder static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev) 24239d475de5SAlex Elder { 24249d475de5SAlex Elder return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP, 24259d475de5SAlex Elder &rbd_dev->header.obj_order, 24269d475de5SAlex Elder &rbd_dev->header.image_size); 24279d475de5SAlex Elder } 24289d475de5SAlex Elder 24291e130199SAlex Elder static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev) 24301e130199SAlex Elder { 24311e130199SAlex Elder void *reply_buf; 24321e130199SAlex Elder int ret; 24331e130199SAlex Elder void *p; 24341e130199SAlex Elder 24351e130199SAlex Elder reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL); 24361e130199SAlex Elder if (!reply_buf) 24371e130199SAlex Elder return -ENOMEM; 24381e130199SAlex Elder 24391e130199SAlex Elder ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name, 24401e130199SAlex Elder "rbd", "get_object_prefix", 24411e130199SAlex Elder NULL, 0, 244207b2391fSAlex Elder reply_buf, RBD_OBJ_PREFIX_LEN_MAX, NULL); 24431e130199SAlex Elder dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret); 24441e130199SAlex Elder if (ret < 0) 24451e130199SAlex Elder goto out; 2446a0ea3a40SAlex Elder ret = 0; /* rbd_req_sync_exec() can return positive */ 24471e130199SAlex Elder 24481e130199SAlex Elder p = reply_buf; 24491e130199SAlex Elder rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p, 24501e130199SAlex Elder p + RBD_OBJ_PREFIX_LEN_MAX, 24511e130199SAlex Elder NULL, GFP_NOIO); 24521e130199SAlex Elder 24531e130199SAlex Elder if (IS_ERR(rbd_dev->header.object_prefix)) { 24541e130199SAlex Elder ret = PTR_ERR(rbd_dev->header.object_prefix); 24551e130199SAlex Elder rbd_dev->header.object_prefix = NULL; 24561e130199SAlex Elder } else { 24571e130199SAlex Elder dout(" object_prefix = %s\n", rbd_dev->header.object_prefix); 24581e130199SAlex Elder } 24591e130199SAlex Elder 24601e130199SAlex Elder out: 24611e130199SAlex Elder kfree(reply_buf); 24621e130199SAlex Elder 24631e130199SAlex Elder return ret; 24641e130199SAlex Elder } 24651e130199SAlex Elder 2466b1b5402aSAlex Elder static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id, 2467b1b5402aSAlex Elder u64 *snap_features) 2468b1b5402aSAlex Elder { 2469b1b5402aSAlex Elder __le64 snapid = cpu_to_le64(snap_id); 2470b1b5402aSAlex Elder struct { 2471b1b5402aSAlex Elder __le64 features; 2472b1b5402aSAlex Elder __le64 incompat; 2473b1b5402aSAlex Elder } features_buf = { 0 }; 2474d889140cSAlex Elder u64 incompat; 2475b1b5402aSAlex Elder int ret; 2476b1b5402aSAlex Elder 2477b1b5402aSAlex Elder ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name, 2478b1b5402aSAlex Elder "rbd", "get_features", 2479b1b5402aSAlex Elder (char *) &snapid, sizeof (snapid), 2480b1b5402aSAlex Elder (char *) &features_buf, sizeof (features_buf), 248107b2391fSAlex Elder NULL); 2482b1b5402aSAlex Elder dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret); 2483b1b5402aSAlex Elder if (ret < 0) 2484b1b5402aSAlex Elder return ret; 2485d889140cSAlex Elder 2486d889140cSAlex Elder incompat = le64_to_cpu(features_buf.incompat); 2487d889140cSAlex Elder if (incompat & ~RBD_FEATURES_ALL) 2488b8f5c6edSAlex Elder return -ENXIO; 2489d889140cSAlex Elder 2490b1b5402aSAlex Elder *snap_features = le64_to_cpu(features_buf.features); 2491b1b5402aSAlex Elder 2492b1b5402aSAlex Elder dout(" snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n", 2493b1b5402aSAlex Elder (unsigned long long) snap_id, 2494b1b5402aSAlex Elder (unsigned long long) *snap_features, 2495b1b5402aSAlex Elder (unsigned long long) le64_to_cpu(features_buf.incompat)); 2496b1b5402aSAlex Elder 2497b1b5402aSAlex Elder return 0; 2498b1b5402aSAlex Elder } 2499b1b5402aSAlex Elder 2500b1b5402aSAlex Elder static int rbd_dev_v2_features(struct rbd_device *rbd_dev) 2501b1b5402aSAlex Elder { 2502b1b5402aSAlex Elder return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP, 2503b1b5402aSAlex Elder &rbd_dev->header.features); 2504b1b5402aSAlex Elder } 2505b1b5402aSAlex Elder 250686b00e0dSAlex Elder static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev) 250786b00e0dSAlex Elder { 250886b00e0dSAlex Elder struct rbd_spec *parent_spec; 250986b00e0dSAlex Elder size_t size; 251086b00e0dSAlex Elder void *reply_buf = NULL; 251186b00e0dSAlex Elder __le64 snapid; 251286b00e0dSAlex Elder void *p; 251386b00e0dSAlex Elder void *end; 251486b00e0dSAlex Elder char *image_id; 251586b00e0dSAlex Elder u64 overlap; 251686b00e0dSAlex Elder int ret; 251786b00e0dSAlex Elder 251886b00e0dSAlex Elder parent_spec = rbd_spec_alloc(); 251986b00e0dSAlex Elder if (!parent_spec) 252086b00e0dSAlex Elder return -ENOMEM; 252186b00e0dSAlex Elder 252286b00e0dSAlex Elder size = sizeof (__le64) + /* pool_id */ 252386b00e0dSAlex Elder sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX + /* image_id */ 252486b00e0dSAlex Elder sizeof (__le64) + /* snap_id */ 252586b00e0dSAlex Elder sizeof (__le64); /* overlap */ 252686b00e0dSAlex Elder reply_buf = kmalloc(size, GFP_KERNEL); 252786b00e0dSAlex Elder if (!reply_buf) { 252886b00e0dSAlex Elder ret = -ENOMEM; 252986b00e0dSAlex Elder goto out_err; 253086b00e0dSAlex Elder } 253186b00e0dSAlex Elder 253286b00e0dSAlex Elder snapid = cpu_to_le64(CEPH_NOSNAP); 253386b00e0dSAlex Elder ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name, 253486b00e0dSAlex Elder "rbd", "get_parent", 253586b00e0dSAlex Elder (char *) &snapid, sizeof (snapid), 253607b2391fSAlex Elder (char *) reply_buf, size, NULL); 253786b00e0dSAlex Elder dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret); 253886b00e0dSAlex Elder if (ret < 0) 253986b00e0dSAlex Elder goto out_err; 254086b00e0dSAlex Elder 254186b00e0dSAlex Elder ret = -ERANGE; 254286b00e0dSAlex Elder p = reply_buf; 254386b00e0dSAlex Elder end = (char *) reply_buf + size; 254486b00e0dSAlex Elder ceph_decode_64_safe(&p, end, parent_spec->pool_id, out_err); 254586b00e0dSAlex Elder if (parent_spec->pool_id == CEPH_NOPOOL) 254686b00e0dSAlex Elder goto out; /* No parent? No problem. */ 254786b00e0dSAlex Elder 25480903e875SAlex Elder /* The ceph file layout needs to fit pool id in 32 bits */ 25490903e875SAlex Elder 25500903e875SAlex Elder ret = -EIO; 25510903e875SAlex Elder if (WARN_ON(parent_spec->pool_id > (u64) U32_MAX)) 25520903e875SAlex Elder goto out; 25530903e875SAlex Elder 2554979ed480SAlex Elder image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL); 255586b00e0dSAlex Elder if (IS_ERR(image_id)) { 255686b00e0dSAlex Elder ret = PTR_ERR(image_id); 255786b00e0dSAlex Elder goto out_err; 255886b00e0dSAlex Elder } 255986b00e0dSAlex Elder parent_spec->image_id = image_id; 256086b00e0dSAlex Elder ceph_decode_64_safe(&p, end, parent_spec->snap_id, out_err); 256186b00e0dSAlex Elder ceph_decode_64_safe(&p, end, overlap, out_err); 256286b00e0dSAlex Elder 256386b00e0dSAlex Elder rbd_dev->parent_overlap = overlap; 256486b00e0dSAlex Elder rbd_dev->parent_spec = parent_spec; 256586b00e0dSAlex Elder parent_spec = NULL; /* rbd_dev now owns this */ 256686b00e0dSAlex Elder out: 256786b00e0dSAlex Elder ret = 0; 256886b00e0dSAlex Elder out_err: 256986b00e0dSAlex Elder kfree(reply_buf); 257086b00e0dSAlex Elder rbd_spec_put(parent_spec); 257186b00e0dSAlex Elder 257286b00e0dSAlex Elder return ret; 257386b00e0dSAlex Elder } 257486b00e0dSAlex Elder 25759e15b77dSAlex Elder static char *rbd_dev_image_name(struct rbd_device *rbd_dev) 25769e15b77dSAlex Elder { 25779e15b77dSAlex Elder size_t image_id_size; 25789e15b77dSAlex Elder char *image_id; 25799e15b77dSAlex Elder void *p; 25809e15b77dSAlex Elder void *end; 25819e15b77dSAlex Elder size_t size; 25829e15b77dSAlex Elder void *reply_buf = NULL; 25839e15b77dSAlex Elder size_t len = 0; 25849e15b77dSAlex Elder char *image_name = NULL; 25859e15b77dSAlex Elder int ret; 25869e15b77dSAlex Elder 25879e15b77dSAlex Elder rbd_assert(!rbd_dev->spec->image_name); 25889e15b77dSAlex Elder 258969e7a02fSAlex Elder len = strlen(rbd_dev->spec->image_id); 259069e7a02fSAlex Elder image_id_size = sizeof (__le32) + len; 25919e15b77dSAlex Elder image_id = kmalloc(image_id_size, GFP_KERNEL); 25929e15b77dSAlex Elder if (!image_id) 25939e15b77dSAlex Elder return NULL; 25949e15b77dSAlex Elder 25959e15b77dSAlex Elder p = image_id; 25969e15b77dSAlex Elder end = (char *) image_id + image_id_size; 259769e7a02fSAlex Elder ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32) len); 25989e15b77dSAlex Elder 25999e15b77dSAlex Elder size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX; 26009e15b77dSAlex Elder reply_buf = kmalloc(size, GFP_KERNEL); 26019e15b77dSAlex Elder if (!reply_buf) 26029e15b77dSAlex Elder goto out; 26039e15b77dSAlex Elder 26049e15b77dSAlex Elder ret = rbd_req_sync_exec(rbd_dev, RBD_DIRECTORY, 26059e15b77dSAlex Elder "rbd", "dir_get_name", 26069e15b77dSAlex Elder image_id, image_id_size, 260707b2391fSAlex Elder (char *) reply_buf, size, NULL); 26089e15b77dSAlex Elder if (ret < 0) 26099e15b77dSAlex Elder goto out; 26109e15b77dSAlex Elder p = reply_buf; 26119e15b77dSAlex Elder end = (char *) reply_buf + size; 26129e15b77dSAlex Elder image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL); 26139e15b77dSAlex Elder if (IS_ERR(image_name)) 26149e15b77dSAlex Elder image_name = NULL; 26159e15b77dSAlex Elder else 26169e15b77dSAlex Elder dout("%s: name is %s len is %zd\n", __func__, image_name, len); 26179e15b77dSAlex Elder out: 26189e15b77dSAlex Elder kfree(reply_buf); 26199e15b77dSAlex Elder kfree(image_id); 26209e15b77dSAlex Elder 26219e15b77dSAlex Elder return image_name; 26229e15b77dSAlex Elder } 26239e15b77dSAlex Elder 26249e15b77dSAlex Elder /* 26259e15b77dSAlex Elder * When a parent image gets probed, we only have the pool, image, 26269e15b77dSAlex Elder * and snapshot ids but not the names of any of them. This call 26279e15b77dSAlex Elder * is made later to fill in those names. It has to be done after 26289e15b77dSAlex Elder * rbd_dev_snaps_update() has completed because some of the 26299e15b77dSAlex Elder * information (in particular, snapshot name) is not available 26309e15b77dSAlex Elder * until then. 26319e15b77dSAlex Elder */ 26329e15b77dSAlex Elder static int rbd_dev_probe_update_spec(struct rbd_device *rbd_dev) 26339e15b77dSAlex Elder { 26349e15b77dSAlex Elder struct ceph_osd_client *osdc; 26359e15b77dSAlex Elder const char *name; 26369e15b77dSAlex Elder void *reply_buf = NULL; 26379e15b77dSAlex Elder int ret; 26389e15b77dSAlex Elder 26399e15b77dSAlex Elder if (rbd_dev->spec->pool_name) 26409e15b77dSAlex Elder return 0; /* Already have the names */ 26419e15b77dSAlex Elder 26429e15b77dSAlex Elder /* Look up the pool name */ 26439e15b77dSAlex Elder 26449e15b77dSAlex Elder osdc = &rbd_dev->rbd_client->client->osdc; 26459e15b77dSAlex Elder name = ceph_pg_pool_name_by_id(osdc->osdmap, rbd_dev->spec->pool_id); 2646935dc89fSAlex Elder if (!name) { 2647935dc89fSAlex Elder rbd_warn(rbd_dev, "there is no pool with id %llu", 2648935dc89fSAlex Elder rbd_dev->spec->pool_id); /* Really a BUG() */ 2649935dc89fSAlex Elder return -EIO; 2650935dc89fSAlex Elder } 26519e15b77dSAlex Elder 26529e15b77dSAlex Elder rbd_dev->spec->pool_name = kstrdup(name, GFP_KERNEL); 26539e15b77dSAlex Elder if (!rbd_dev->spec->pool_name) 26549e15b77dSAlex Elder return -ENOMEM; 26559e15b77dSAlex Elder 26569e15b77dSAlex Elder /* Fetch the image name; tolerate failure here */ 26579e15b77dSAlex Elder 26589e15b77dSAlex Elder name = rbd_dev_image_name(rbd_dev); 265969e7a02fSAlex Elder if (name) 26609e15b77dSAlex Elder rbd_dev->spec->image_name = (char *) name; 266169e7a02fSAlex Elder else 266206ecc6cbSAlex Elder rbd_warn(rbd_dev, "unable to get image name"); 26639e15b77dSAlex Elder 26649e15b77dSAlex Elder /* Look up the snapshot name. */ 26659e15b77dSAlex Elder 26669e15b77dSAlex Elder name = rbd_snap_name(rbd_dev, rbd_dev->spec->snap_id); 26679e15b77dSAlex Elder if (!name) { 2668935dc89fSAlex Elder rbd_warn(rbd_dev, "no snapshot with id %llu", 2669935dc89fSAlex Elder rbd_dev->spec->snap_id); /* Really a BUG() */ 26709e15b77dSAlex Elder ret = -EIO; 26719e15b77dSAlex Elder goto out_err; 26729e15b77dSAlex Elder } 26739e15b77dSAlex Elder rbd_dev->spec->snap_name = kstrdup(name, GFP_KERNEL); 26749e15b77dSAlex Elder if(!rbd_dev->spec->snap_name) 26759e15b77dSAlex Elder goto out_err; 26769e15b77dSAlex Elder 26779e15b77dSAlex Elder return 0; 26789e15b77dSAlex Elder out_err: 26799e15b77dSAlex Elder kfree(reply_buf); 26809e15b77dSAlex Elder kfree(rbd_dev->spec->pool_name); 26819e15b77dSAlex Elder rbd_dev->spec->pool_name = NULL; 26829e15b77dSAlex Elder 26839e15b77dSAlex Elder return ret; 26849e15b77dSAlex Elder } 26859e15b77dSAlex Elder 26866e14b1a6SAlex Elder static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev, u64 *ver) 268735d489f9SAlex Elder { 268835d489f9SAlex Elder size_t size; 268935d489f9SAlex Elder int ret; 269035d489f9SAlex Elder void *reply_buf; 269135d489f9SAlex Elder void *p; 269235d489f9SAlex Elder void *end; 269335d489f9SAlex Elder u64 seq; 269435d489f9SAlex Elder u32 snap_count; 269535d489f9SAlex Elder struct ceph_snap_context *snapc; 269635d489f9SAlex Elder u32 i; 269735d489f9SAlex Elder 269835d489f9SAlex Elder /* 269935d489f9SAlex Elder * We'll need room for the seq value (maximum snapshot id), 270035d489f9SAlex Elder * snapshot count, and array of that many snapshot ids. 270135d489f9SAlex Elder * For now we have a fixed upper limit on the number we're 270235d489f9SAlex Elder * prepared to receive. 270335d489f9SAlex Elder */ 270435d489f9SAlex Elder size = sizeof (__le64) + sizeof (__le32) + 270535d489f9SAlex Elder RBD_MAX_SNAP_COUNT * sizeof (__le64); 270635d489f9SAlex Elder reply_buf = kzalloc(size, GFP_KERNEL); 270735d489f9SAlex Elder if (!reply_buf) 270835d489f9SAlex Elder return -ENOMEM; 270935d489f9SAlex Elder 271035d489f9SAlex Elder ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name, 271135d489f9SAlex Elder "rbd", "get_snapcontext", 271235d489f9SAlex Elder NULL, 0, 271307b2391fSAlex Elder reply_buf, size, ver); 271435d489f9SAlex Elder dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret); 271535d489f9SAlex Elder if (ret < 0) 271635d489f9SAlex Elder goto out; 271735d489f9SAlex Elder 271835d489f9SAlex Elder ret = -ERANGE; 271935d489f9SAlex Elder p = reply_buf; 272035d489f9SAlex Elder end = (char *) reply_buf + size; 272135d489f9SAlex Elder ceph_decode_64_safe(&p, end, seq, out); 272235d489f9SAlex Elder ceph_decode_32_safe(&p, end, snap_count, out); 272335d489f9SAlex Elder 272435d489f9SAlex Elder /* 272535d489f9SAlex Elder * Make sure the reported number of snapshot ids wouldn't go 272635d489f9SAlex Elder * beyond the end of our buffer. But before checking that, 272735d489f9SAlex Elder * make sure the computed size of the snapshot context we 272835d489f9SAlex Elder * allocate is representable in a size_t. 272935d489f9SAlex Elder */ 273035d489f9SAlex Elder if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context)) 273135d489f9SAlex Elder / sizeof (u64)) { 273235d489f9SAlex Elder ret = -EINVAL; 273335d489f9SAlex Elder goto out; 273435d489f9SAlex Elder } 273535d489f9SAlex Elder if (!ceph_has_room(&p, end, snap_count * sizeof (__le64))) 273635d489f9SAlex Elder goto out; 273735d489f9SAlex Elder 273835d489f9SAlex Elder size = sizeof (struct ceph_snap_context) + 273935d489f9SAlex Elder snap_count * sizeof (snapc->snaps[0]); 274035d489f9SAlex Elder snapc = kmalloc(size, GFP_KERNEL); 274135d489f9SAlex Elder if (!snapc) { 274235d489f9SAlex Elder ret = -ENOMEM; 274335d489f9SAlex Elder goto out; 274435d489f9SAlex Elder } 274535d489f9SAlex Elder 274635d489f9SAlex Elder atomic_set(&snapc->nref, 1); 274735d489f9SAlex Elder snapc->seq = seq; 274835d489f9SAlex Elder snapc->num_snaps = snap_count; 274935d489f9SAlex Elder for (i = 0; i < snap_count; i++) 275035d489f9SAlex Elder snapc->snaps[i] = ceph_decode_64(&p); 275135d489f9SAlex Elder 275235d489f9SAlex Elder rbd_dev->header.snapc = snapc; 275335d489f9SAlex Elder 275435d489f9SAlex Elder dout(" snap context seq = %llu, snap_count = %u\n", 275535d489f9SAlex Elder (unsigned long long) seq, (unsigned int) snap_count); 275635d489f9SAlex Elder 275735d489f9SAlex Elder out: 275835d489f9SAlex Elder kfree(reply_buf); 275935d489f9SAlex Elder 276035d489f9SAlex Elder return 0; 276135d489f9SAlex Elder } 276235d489f9SAlex Elder 2763b8b1e2dbSAlex Elder static char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, u32 which) 2764b8b1e2dbSAlex Elder { 2765b8b1e2dbSAlex Elder size_t size; 2766b8b1e2dbSAlex Elder void *reply_buf; 2767b8b1e2dbSAlex Elder __le64 snap_id; 2768b8b1e2dbSAlex Elder int ret; 2769b8b1e2dbSAlex Elder void *p; 2770b8b1e2dbSAlex Elder void *end; 2771b8b1e2dbSAlex Elder char *snap_name; 2772b8b1e2dbSAlex Elder 2773b8b1e2dbSAlex Elder size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN; 2774b8b1e2dbSAlex Elder reply_buf = kmalloc(size, GFP_KERNEL); 2775b8b1e2dbSAlex Elder if (!reply_buf) 2776b8b1e2dbSAlex Elder return ERR_PTR(-ENOMEM); 2777b8b1e2dbSAlex Elder 2778b8b1e2dbSAlex Elder snap_id = cpu_to_le64(rbd_dev->header.snapc->snaps[which]); 2779b8b1e2dbSAlex Elder ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name, 2780b8b1e2dbSAlex Elder "rbd", "get_snapshot_name", 2781b8b1e2dbSAlex Elder (char *) &snap_id, sizeof (snap_id), 278207b2391fSAlex Elder reply_buf, size, NULL); 2783b8b1e2dbSAlex Elder dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret); 2784b8b1e2dbSAlex Elder if (ret < 0) 2785b8b1e2dbSAlex Elder goto out; 2786b8b1e2dbSAlex Elder 2787b8b1e2dbSAlex Elder p = reply_buf; 2788b8b1e2dbSAlex Elder end = (char *) reply_buf + size; 2789e5c35534SAlex Elder snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL); 2790b8b1e2dbSAlex Elder if (IS_ERR(snap_name)) { 2791b8b1e2dbSAlex Elder ret = PTR_ERR(snap_name); 2792b8b1e2dbSAlex Elder goto out; 2793b8b1e2dbSAlex Elder } else { 2794b8b1e2dbSAlex Elder dout(" snap_id 0x%016llx snap_name = %s\n", 2795b8b1e2dbSAlex Elder (unsigned long long) le64_to_cpu(snap_id), snap_name); 2796b8b1e2dbSAlex Elder } 2797b8b1e2dbSAlex Elder kfree(reply_buf); 2798b8b1e2dbSAlex Elder 2799b8b1e2dbSAlex Elder return snap_name; 2800b8b1e2dbSAlex Elder out: 2801b8b1e2dbSAlex Elder kfree(reply_buf); 2802b8b1e2dbSAlex Elder 2803b8b1e2dbSAlex Elder return ERR_PTR(ret); 2804b8b1e2dbSAlex Elder } 2805b8b1e2dbSAlex Elder 2806b8b1e2dbSAlex Elder static char *rbd_dev_v2_snap_info(struct rbd_device *rbd_dev, u32 which, 2807b8b1e2dbSAlex Elder u64 *snap_size, u64 *snap_features) 2808b8b1e2dbSAlex Elder { 2809b8b1e2dbSAlex Elder __le64 snap_id; 2810b8b1e2dbSAlex Elder u8 order; 2811b8b1e2dbSAlex Elder int ret; 2812b8b1e2dbSAlex Elder 2813b8b1e2dbSAlex Elder snap_id = rbd_dev->header.snapc->snaps[which]; 2814b8b1e2dbSAlex Elder ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, &order, snap_size); 2815b8b1e2dbSAlex Elder if (ret) 2816b8b1e2dbSAlex Elder return ERR_PTR(ret); 2817b8b1e2dbSAlex Elder ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, snap_features); 2818b8b1e2dbSAlex Elder if (ret) 2819b8b1e2dbSAlex Elder return ERR_PTR(ret); 2820b8b1e2dbSAlex Elder 2821b8b1e2dbSAlex Elder return rbd_dev_v2_snap_name(rbd_dev, which); 2822b8b1e2dbSAlex Elder } 2823b8b1e2dbSAlex Elder 2824b8b1e2dbSAlex Elder static char *rbd_dev_snap_info(struct rbd_device *rbd_dev, u32 which, 2825b8b1e2dbSAlex Elder u64 *snap_size, u64 *snap_features) 2826b8b1e2dbSAlex Elder { 2827b8b1e2dbSAlex Elder if (rbd_dev->image_format == 1) 2828b8b1e2dbSAlex Elder return rbd_dev_v1_snap_info(rbd_dev, which, 2829b8b1e2dbSAlex Elder snap_size, snap_features); 2830b8b1e2dbSAlex Elder if (rbd_dev->image_format == 2) 2831b8b1e2dbSAlex Elder return rbd_dev_v2_snap_info(rbd_dev, which, 2832b8b1e2dbSAlex Elder snap_size, snap_features); 2833b8b1e2dbSAlex Elder return ERR_PTR(-EINVAL); 2834b8b1e2dbSAlex Elder } 2835b8b1e2dbSAlex Elder 2836117973fbSAlex Elder static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver) 2837117973fbSAlex Elder { 2838117973fbSAlex Elder int ret; 2839117973fbSAlex Elder __u8 obj_order; 2840117973fbSAlex Elder 2841117973fbSAlex Elder down_write(&rbd_dev->header_rwsem); 2842117973fbSAlex Elder 2843117973fbSAlex Elder /* Grab old order first, to see if it changes */ 2844117973fbSAlex Elder 2845117973fbSAlex Elder obj_order = rbd_dev->header.obj_order, 2846117973fbSAlex Elder ret = rbd_dev_v2_image_size(rbd_dev); 2847117973fbSAlex Elder if (ret) 2848117973fbSAlex Elder goto out; 2849117973fbSAlex Elder if (rbd_dev->header.obj_order != obj_order) { 2850117973fbSAlex Elder ret = -EIO; 2851117973fbSAlex Elder goto out; 2852117973fbSAlex Elder } 2853117973fbSAlex Elder rbd_update_mapping_size(rbd_dev); 2854117973fbSAlex Elder 2855117973fbSAlex Elder ret = rbd_dev_v2_snap_context(rbd_dev, hver); 2856117973fbSAlex Elder dout("rbd_dev_v2_snap_context returned %d\n", ret); 2857117973fbSAlex Elder if (ret) 2858117973fbSAlex Elder goto out; 2859117973fbSAlex Elder ret = rbd_dev_snaps_update(rbd_dev); 2860117973fbSAlex Elder dout("rbd_dev_snaps_update returned %d\n", ret); 2861117973fbSAlex Elder if (ret) 2862117973fbSAlex Elder goto out; 2863117973fbSAlex Elder ret = rbd_dev_snaps_register(rbd_dev); 2864117973fbSAlex Elder dout("rbd_dev_snaps_register returned %d\n", ret); 2865117973fbSAlex Elder out: 2866117973fbSAlex Elder up_write(&rbd_dev->header_rwsem); 2867117973fbSAlex Elder 2868117973fbSAlex Elder return ret; 2869117973fbSAlex Elder } 2870117973fbSAlex Elder 28719d475de5SAlex Elder /* 287235938150SAlex Elder * Scan the rbd device's current snapshot list and compare it to the 287335938150SAlex Elder * newly-received snapshot context. Remove any existing snapshots 287435938150SAlex Elder * not present in the new snapshot context. Add a new snapshot for 287535938150SAlex Elder * any snaphots in the snapshot context not in the current list. 287635938150SAlex Elder * And verify there are no changes to snapshots we already know 287735938150SAlex Elder * about. 287835938150SAlex Elder * 287935938150SAlex Elder * Assumes the snapshots in the snapshot context are sorted by 288035938150SAlex Elder * snapshot id, highest id first. (Snapshots in the rbd_dev's list 288135938150SAlex Elder * are also maintained in that order.) 2882dfc5606dSYehuda Sadeh */ 2883304f6808SAlex Elder static int rbd_dev_snaps_update(struct rbd_device *rbd_dev) 2884dfc5606dSYehuda Sadeh { 288535938150SAlex Elder struct ceph_snap_context *snapc = rbd_dev->header.snapc; 288635938150SAlex Elder const u32 snap_count = snapc->num_snaps; 288735938150SAlex Elder struct list_head *head = &rbd_dev->snaps; 288835938150SAlex Elder struct list_head *links = head->next; 288935938150SAlex Elder u32 index = 0; 2890dfc5606dSYehuda Sadeh 28919fcbb800SAlex Elder dout("%s: snap count is %u\n", __func__, (unsigned int) snap_count); 289235938150SAlex Elder while (index < snap_count || links != head) { 289335938150SAlex Elder u64 snap_id; 289435938150SAlex Elder struct rbd_snap *snap; 2895cd892126SAlex Elder char *snap_name; 2896cd892126SAlex Elder u64 snap_size = 0; 2897cd892126SAlex Elder u64 snap_features = 0; 2898dfc5606dSYehuda Sadeh 289935938150SAlex Elder snap_id = index < snap_count ? snapc->snaps[index] 290035938150SAlex Elder : CEPH_NOSNAP; 290135938150SAlex Elder snap = links != head ? list_entry(links, struct rbd_snap, node) 290235938150SAlex Elder : NULL; 2903aafb230eSAlex Elder rbd_assert(!snap || snap->id != CEPH_NOSNAP); 2904dfc5606dSYehuda Sadeh 290535938150SAlex Elder if (snap_id == CEPH_NOSNAP || (snap && snap->id > snap_id)) { 290635938150SAlex Elder struct list_head *next = links->next; 2907dfc5606dSYehuda Sadeh 290835938150SAlex Elder /* Existing snapshot not in the new snap context */ 2909dfc5606dSYehuda Sadeh 29100d7dbfceSAlex Elder if (rbd_dev->spec->snap_id == snap->id) 2911d78b650aSAlex Elder atomic_set(&rbd_dev->exists, 0); 291241f38c2bSAlex Elder rbd_remove_snap_dev(snap); 29139fcbb800SAlex Elder dout("%ssnap id %llu has been removed\n", 29140d7dbfceSAlex Elder rbd_dev->spec->snap_id == snap->id ? 29150d7dbfceSAlex Elder "mapped " : "", 29169fcbb800SAlex Elder (unsigned long long) snap->id); 2917dfc5606dSYehuda Sadeh 291835938150SAlex Elder /* Done with this list entry; advance */ 291935938150SAlex Elder 292035938150SAlex Elder links = next; 292135938150SAlex Elder continue; 2922dfc5606dSYehuda Sadeh } 292335938150SAlex Elder 2924b8b1e2dbSAlex Elder snap_name = rbd_dev_snap_info(rbd_dev, index, 2925cd892126SAlex Elder &snap_size, &snap_features); 2926cd892126SAlex Elder if (IS_ERR(snap_name)) 2927cd892126SAlex Elder return PTR_ERR(snap_name); 2928cd892126SAlex Elder 29299fcbb800SAlex Elder dout("entry %u: snap_id = %llu\n", (unsigned int) snap_count, 29309fcbb800SAlex Elder (unsigned long long) snap_id); 293135938150SAlex Elder if (!snap || (snap_id != CEPH_NOSNAP && snap->id < snap_id)) { 293235938150SAlex Elder struct rbd_snap *new_snap; 293335938150SAlex Elder 293435938150SAlex Elder /* We haven't seen this snapshot before */ 293535938150SAlex Elder 2936c8d18425SAlex Elder new_snap = __rbd_add_snap_dev(rbd_dev, snap_name, 2937cd892126SAlex Elder snap_id, snap_size, snap_features); 29389fcbb800SAlex Elder if (IS_ERR(new_snap)) { 29399fcbb800SAlex Elder int err = PTR_ERR(new_snap); 29409fcbb800SAlex Elder 29419fcbb800SAlex Elder dout(" failed to add dev, error %d\n", err); 29429fcbb800SAlex Elder 29439fcbb800SAlex Elder return err; 29449fcbb800SAlex Elder } 294535938150SAlex Elder 294635938150SAlex Elder /* New goes before existing, or at end of list */ 294735938150SAlex Elder 29489fcbb800SAlex Elder dout(" added dev%s\n", snap ? "" : " at end\n"); 294935938150SAlex Elder if (snap) 295035938150SAlex Elder list_add_tail(&new_snap->node, &snap->node); 295135938150SAlex Elder else 2952523f3258SAlex Elder list_add_tail(&new_snap->node, head); 295335938150SAlex Elder } else { 295435938150SAlex Elder /* Already have this one */ 295535938150SAlex Elder 29569fcbb800SAlex Elder dout(" already present\n"); 29579fcbb800SAlex Elder 2958cd892126SAlex Elder rbd_assert(snap->size == snap_size); 2959aafb230eSAlex Elder rbd_assert(!strcmp(snap->name, snap_name)); 2960cd892126SAlex Elder rbd_assert(snap->features == snap_features); 296135938150SAlex Elder 296235938150SAlex Elder /* Done with this list entry; advance */ 296335938150SAlex Elder 296435938150SAlex Elder links = links->next; 2965dfc5606dSYehuda Sadeh } 296635938150SAlex Elder 296735938150SAlex Elder /* Advance to the next entry in the snapshot context */ 296835938150SAlex Elder 296935938150SAlex Elder index++; 2970dfc5606dSYehuda Sadeh } 29719fcbb800SAlex Elder dout("%s: done\n", __func__); 2972dfc5606dSYehuda Sadeh 2973dfc5606dSYehuda Sadeh return 0; 2974dfc5606dSYehuda Sadeh } 2975dfc5606dSYehuda Sadeh 2976304f6808SAlex Elder /* 2977304f6808SAlex Elder * Scan the list of snapshots and register the devices for any that 2978304f6808SAlex Elder * have not already been registered. 2979304f6808SAlex Elder */ 2980304f6808SAlex Elder static int rbd_dev_snaps_register(struct rbd_device *rbd_dev) 2981304f6808SAlex Elder { 2982304f6808SAlex Elder struct rbd_snap *snap; 2983304f6808SAlex Elder int ret = 0; 2984304f6808SAlex Elder 2985304f6808SAlex Elder dout("%s called\n", __func__); 298686ff77bbSAlex Elder if (WARN_ON(!device_is_registered(&rbd_dev->dev))) 298786ff77bbSAlex Elder return -EIO; 2988304f6808SAlex Elder 2989304f6808SAlex Elder list_for_each_entry(snap, &rbd_dev->snaps, node) { 2990304f6808SAlex Elder if (!rbd_snap_registered(snap)) { 2991304f6808SAlex Elder ret = rbd_register_snap_dev(snap, &rbd_dev->dev); 2992304f6808SAlex Elder if (ret < 0) 2993304f6808SAlex Elder break; 2994304f6808SAlex Elder } 2995304f6808SAlex Elder } 2996304f6808SAlex Elder dout("%s: returning %d\n", __func__, ret); 2997304f6808SAlex Elder 2998304f6808SAlex Elder return ret; 2999304f6808SAlex Elder } 3000304f6808SAlex Elder 3001dfc5606dSYehuda Sadeh static int rbd_bus_add_dev(struct rbd_device *rbd_dev) 3002dfc5606dSYehuda Sadeh { 3003dfc5606dSYehuda Sadeh struct device *dev; 3004cd789ab9SAlex Elder int ret; 3005dfc5606dSYehuda Sadeh 3006dfc5606dSYehuda Sadeh mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); 3007dfc5606dSYehuda Sadeh 3008cd789ab9SAlex Elder dev = &rbd_dev->dev; 3009dfc5606dSYehuda Sadeh dev->bus = &rbd_bus_type; 3010dfc5606dSYehuda Sadeh dev->type = &rbd_device_type; 3011dfc5606dSYehuda Sadeh dev->parent = &rbd_root_dev; 3012dfc5606dSYehuda Sadeh dev->release = rbd_dev_release; 3013de71a297SAlex Elder dev_set_name(dev, "%d", rbd_dev->dev_id); 3014dfc5606dSYehuda Sadeh ret = device_register(dev); 3015dfc5606dSYehuda Sadeh 3016dfc5606dSYehuda Sadeh mutex_unlock(&ctl_mutex); 3017cd789ab9SAlex Elder 3018dfc5606dSYehuda Sadeh return ret; 3019602adf40SYehuda Sadeh } 3020602adf40SYehuda Sadeh 3021dfc5606dSYehuda Sadeh static void rbd_bus_del_dev(struct rbd_device *rbd_dev) 3022dfc5606dSYehuda Sadeh { 3023dfc5606dSYehuda Sadeh device_unregister(&rbd_dev->dev); 3024dfc5606dSYehuda Sadeh } 3025dfc5606dSYehuda Sadeh 302659c2be1eSYehuda Sadeh static int rbd_init_watch_dev(struct rbd_device *rbd_dev) 302759c2be1eSYehuda Sadeh { 302859c2be1eSYehuda Sadeh int ret, rc; 302959c2be1eSYehuda Sadeh 303059c2be1eSYehuda Sadeh do { 3031907703d0SAlex Elder ret = rbd_req_sync_watch(rbd_dev, 1); 303259c2be1eSYehuda Sadeh if (ret == -ERANGE) { 3033117973fbSAlex Elder rc = rbd_dev_refresh(rbd_dev, NULL); 303459c2be1eSYehuda Sadeh if (rc < 0) 303559c2be1eSYehuda Sadeh return rc; 303659c2be1eSYehuda Sadeh } 303759c2be1eSYehuda Sadeh } while (ret == -ERANGE); 303859c2be1eSYehuda Sadeh 303959c2be1eSYehuda Sadeh return ret; 304059c2be1eSYehuda Sadeh } 304159c2be1eSYehuda Sadeh 3042e2839308SAlex Elder static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0); 30431ddbe94eSAlex Elder 30441ddbe94eSAlex Elder /* 3045499afd5bSAlex Elder * Get a unique rbd identifier for the given new rbd_dev, and add 3046499afd5bSAlex Elder * the rbd_dev to the global list. The minimum rbd id is 1. 30471ddbe94eSAlex Elder */ 3048e2839308SAlex Elder static void rbd_dev_id_get(struct rbd_device *rbd_dev) 3049b7f23c36SAlex Elder { 3050e2839308SAlex Elder rbd_dev->dev_id = atomic64_inc_return(&rbd_dev_id_max); 3051499afd5bSAlex Elder 3052499afd5bSAlex Elder spin_lock(&rbd_dev_list_lock); 3053499afd5bSAlex Elder list_add_tail(&rbd_dev->node, &rbd_dev_list); 3054499afd5bSAlex Elder spin_unlock(&rbd_dev_list_lock); 3055e2839308SAlex Elder dout("rbd_dev %p given dev id %llu\n", rbd_dev, 3056e2839308SAlex Elder (unsigned long long) rbd_dev->dev_id); 3057b7f23c36SAlex Elder } 3058b7f23c36SAlex Elder 30591ddbe94eSAlex Elder /* 3060499afd5bSAlex Elder * Remove an rbd_dev from the global list, and record that its 3061499afd5bSAlex Elder * identifier is no longer in use. 30621ddbe94eSAlex Elder */ 3063e2839308SAlex Elder static void rbd_dev_id_put(struct rbd_device *rbd_dev) 30641ddbe94eSAlex Elder { 3065d184f6bfSAlex Elder struct list_head *tmp; 3066de71a297SAlex Elder int rbd_id = rbd_dev->dev_id; 3067d184f6bfSAlex Elder int max_id; 3068d184f6bfSAlex Elder 3069aafb230eSAlex Elder rbd_assert(rbd_id > 0); 3070499afd5bSAlex Elder 3071e2839308SAlex Elder dout("rbd_dev %p released dev id %llu\n", rbd_dev, 3072e2839308SAlex Elder (unsigned long long) rbd_dev->dev_id); 3073499afd5bSAlex Elder spin_lock(&rbd_dev_list_lock); 3074499afd5bSAlex Elder list_del_init(&rbd_dev->node); 3075d184f6bfSAlex Elder 3076d184f6bfSAlex Elder /* 3077d184f6bfSAlex Elder * If the id being "put" is not the current maximum, there 3078d184f6bfSAlex Elder * is nothing special we need to do. 3079d184f6bfSAlex Elder */ 3080e2839308SAlex Elder if (rbd_id != atomic64_read(&rbd_dev_id_max)) { 3081d184f6bfSAlex Elder spin_unlock(&rbd_dev_list_lock); 3082d184f6bfSAlex Elder return; 3083d184f6bfSAlex Elder } 3084d184f6bfSAlex Elder 3085d184f6bfSAlex Elder /* 3086d184f6bfSAlex Elder * We need to update the current maximum id. Search the 3087d184f6bfSAlex Elder * list to find out what it is. We're more likely to find 3088d184f6bfSAlex Elder * the maximum at the end, so search the list backward. 3089d184f6bfSAlex Elder */ 3090d184f6bfSAlex Elder max_id = 0; 3091d184f6bfSAlex Elder list_for_each_prev(tmp, &rbd_dev_list) { 3092d184f6bfSAlex Elder struct rbd_device *rbd_dev; 3093d184f6bfSAlex Elder 3094d184f6bfSAlex Elder rbd_dev = list_entry(tmp, struct rbd_device, node); 3095b213e0b1SAlex Elder if (rbd_dev->dev_id > max_id) 3096b213e0b1SAlex Elder max_id = rbd_dev->dev_id; 3097d184f6bfSAlex Elder } 3098499afd5bSAlex Elder spin_unlock(&rbd_dev_list_lock); 30991ddbe94eSAlex Elder 31001ddbe94eSAlex Elder /* 3101e2839308SAlex Elder * The max id could have been updated by rbd_dev_id_get(), in 3102d184f6bfSAlex Elder * which case it now accurately reflects the new maximum. 3103d184f6bfSAlex Elder * Be careful not to overwrite the maximum value in that 3104d184f6bfSAlex Elder * case. 31051ddbe94eSAlex Elder */ 3106e2839308SAlex Elder atomic64_cmpxchg(&rbd_dev_id_max, rbd_id, max_id); 3107e2839308SAlex Elder dout(" max dev id has been reset\n"); 3108b7f23c36SAlex Elder } 3109b7f23c36SAlex Elder 3110a725f65eSAlex Elder /* 3111e28fff26SAlex Elder * Skips over white space at *buf, and updates *buf to point to the 3112e28fff26SAlex Elder * first found non-space character (if any). Returns the length of 3113593a9e7bSAlex Elder * the token (string of non-white space characters) found. Note 3114593a9e7bSAlex Elder * that *buf must be terminated with '\0'. 3115e28fff26SAlex Elder */ 3116e28fff26SAlex Elder static inline size_t next_token(const char **buf) 3117e28fff26SAlex Elder { 3118e28fff26SAlex Elder /* 3119e28fff26SAlex Elder * These are the characters that produce nonzero for 3120e28fff26SAlex Elder * isspace() in the "C" and "POSIX" locales. 3121e28fff26SAlex Elder */ 3122e28fff26SAlex Elder const char *spaces = " \f\n\r\t\v"; 3123e28fff26SAlex Elder 3124e28fff26SAlex Elder *buf += strspn(*buf, spaces); /* Find start of token */ 3125e28fff26SAlex Elder 3126e28fff26SAlex Elder return strcspn(*buf, spaces); /* Return token length */ 3127e28fff26SAlex Elder } 3128e28fff26SAlex Elder 3129e28fff26SAlex Elder /* 3130e28fff26SAlex Elder * Finds the next token in *buf, and if the provided token buffer is 3131e28fff26SAlex Elder * big enough, copies the found token into it. The result, if 3132593a9e7bSAlex Elder * copied, is guaranteed to be terminated with '\0'. Note that *buf 3133593a9e7bSAlex Elder * must be terminated with '\0' on entry. 3134e28fff26SAlex Elder * 3135e28fff26SAlex Elder * Returns the length of the token found (not including the '\0'). 3136e28fff26SAlex Elder * Return value will be 0 if no token is found, and it will be >= 3137e28fff26SAlex Elder * token_size if the token would not fit. 3138e28fff26SAlex Elder * 3139593a9e7bSAlex Elder * The *buf pointer will be updated to point beyond the end of the 3140e28fff26SAlex Elder * found token. Note that this occurs even if the token buffer is 3141e28fff26SAlex Elder * too small to hold it. 3142e28fff26SAlex Elder */ 3143e28fff26SAlex Elder static inline size_t copy_token(const char **buf, 3144e28fff26SAlex Elder char *token, 3145e28fff26SAlex Elder size_t token_size) 3146e28fff26SAlex Elder { 3147e28fff26SAlex Elder size_t len; 3148e28fff26SAlex Elder 3149e28fff26SAlex Elder len = next_token(buf); 3150e28fff26SAlex Elder if (len < token_size) { 3151e28fff26SAlex Elder memcpy(token, *buf, len); 3152e28fff26SAlex Elder *(token + len) = '\0'; 3153e28fff26SAlex Elder } 3154e28fff26SAlex Elder *buf += len; 3155e28fff26SAlex Elder 3156e28fff26SAlex Elder return len; 3157e28fff26SAlex Elder } 3158e28fff26SAlex Elder 3159e28fff26SAlex Elder /* 3160ea3352f4SAlex Elder * Finds the next token in *buf, dynamically allocates a buffer big 3161ea3352f4SAlex Elder * enough to hold a copy of it, and copies the token into the new 3162ea3352f4SAlex Elder * buffer. The copy is guaranteed to be terminated with '\0'. Note 3163ea3352f4SAlex Elder * that a duplicate buffer is created even for a zero-length token. 3164ea3352f4SAlex Elder * 3165ea3352f4SAlex Elder * Returns a pointer to the newly-allocated duplicate, or a null 3166ea3352f4SAlex Elder * pointer if memory for the duplicate was not available. If 3167ea3352f4SAlex Elder * the lenp argument is a non-null pointer, the length of the token 3168ea3352f4SAlex Elder * (not including the '\0') is returned in *lenp. 3169ea3352f4SAlex Elder * 3170ea3352f4SAlex Elder * If successful, the *buf pointer will be updated to point beyond 3171ea3352f4SAlex Elder * the end of the found token. 3172ea3352f4SAlex Elder * 3173ea3352f4SAlex Elder * Note: uses GFP_KERNEL for allocation. 3174ea3352f4SAlex Elder */ 3175ea3352f4SAlex Elder static inline char *dup_token(const char **buf, size_t *lenp) 3176ea3352f4SAlex Elder { 3177ea3352f4SAlex Elder char *dup; 3178ea3352f4SAlex Elder size_t len; 3179ea3352f4SAlex Elder 3180ea3352f4SAlex Elder len = next_token(buf); 31814caf35f9SAlex Elder dup = kmemdup(*buf, len + 1, GFP_KERNEL); 3182ea3352f4SAlex Elder if (!dup) 3183ea3352f4SAlex Elder return NULL; 3184ea3352f4SAlex Elder *(dup + len) = '\0'; 3185ea3352f4SAlex Elder *buf += len; 3186ea3352f4SAlex Elder 3187ea3352f4SAlex Elder if (lenp) 3188ea3352f4SAlex Elder *lenp = len; 3189ea3352f4SAlex Elder 3190ea3352f4SAlex Elder return dup; 3191ea3352f4SAlex Elder } 3192ea3352f4SAlex Elder 3193ea3352f4SAlex Elder /* 3194859c31dfSAlex Elder * Parse the options provided for an "rbd add" (i.e., rbd image 3195859c31dfSAlex Elder * mapping) request. These arrive via a write to /sys/bus/rbd/add, 3196859c31dfSAlex Elder * and the data written is passed here via a NUL-terminated buffer. 3197859c31dfSAlex Elder * Returns 0 if successful or an error code otherwise. 3198d22f76e7SAlex Elder * 3199859c31dfSAlex Elder * The information extracted from these options is recorded in 3200859c31dfSAlex Elder * the other parameters which return dynamically-allocated 3201859c31dfSAlex Elder * structures: 3202859c31dfSAlex Elder * ceph_opts 3203859c31dfSAlex Elder * The address of a pointer that will refer to a ceph options 3204859c31dfSAlex Elder * structure. Caller must release the returned pointer using 3205859c31dfSAlex Elder * ceph_destroy_options() when it is no longer needed. 3206859c31dfSAlex Elder * rbd_opts 3207859c31dfSAlex Elder * Address of an rbd options pointer. Fully initialized by 3208859c31dfSAlex Elder * this function; caller must release with kfree(). 3209859c31dfSAlex Elder * spec 3210859c31dfSAlex Elder * Address of an rbd image specification pointer. Fully 3211859c31dfSAlex Elder * initialized by this function based on parsed options. 3212859c31dfSAlex Elder * Caller must release with rbd_spec_put(). 3213859c31dfSAlex Elder * 3214859c31dfSAlex Elder * The options passed take this form: 3215859c31dfSAlex Elder * <mon_addrs> <options> <pool_name> <image_name> [<snap_id>] 3216859c31dfSAlex Elder * where: 3217859c31dfSAlex Elder * <mon_addrs> 3218859c31dfSAlex Elder * A comma-separated list of one or more monitor addresses. 3219859c31dfSAlex Elder * A monitor address is an ip address, optionally followed 3220859c31dfSAlex Elder * by a port number (separated by a colon). 3221859c31dfSAlex Elder * I.e.: ip1[:port1][,ip2[:port2]...] 3222859c31dfSAlex Elder * <options> 3223859c31dfSAlex Elder * A comma-separated list of ceph and/or rbd options. 3224859c31dfSAlex Elder * <pool_name> 3225859c31dfSAlex Elder * The name of the rados pool containing the rbd image. 3226859c31dfSAlex Elder * <image_name> 3227859c31dfSAlex Elder * The name of the image in that pool to map. 3228859c31dfSAlex Elder * <snap_id> 3229859c31dfSAlex Elder * An optional snapshot id. If provided, the mapping will 3230859c31dfSAlex Elder * present data from the image at the time that snapshot was 3231859c31dfSAlex Elder * created. The image head is used if no snapshot id is 3232859c31dfSAlex Elder * provided. Snapshot mappings are always read-only. 3233a725f65eSAlex Elder */ 3234859c31dfSAlex Elder static int rbd_add_parse_args(const char *buf, 3235dc79b113SAlex Elder struct ceph_options **ceph_opts, 3236859c31dfSAlex Elder struct rbd_options **opts, 3237859c31dfSAlex Elder struct rbd_spec **rbd_spec) 3238a725f65eSAlex Elder { 3239e28fff26SAlex Elder size_t len; 3240859c31dfSAlex Elder char *options; 32410ddebc0cSAlex Elder const char *mon_addrs; 32420ddebc0cSAlex Elder size_t mon_addrs_size; 3243859c31dfSAlex Elder struct rbd_spec *spec = NULL; 32444e9afebaSAlex Elder struct rbd_options *rbd_opts = NULL; 3245859c31dfSAlex Elder struct ceph_options *copts; 3246dc79b113SAlex Elder int ret; 3247e28fff26SAlex Elder 3248e28fff26SAlex Elder /* The first four tokens are required */ 3249e28fff26SAlex Elder 32507ef3214aSAlex Elder len = next_token(&buf); 32514fb5d671SAlex Elder if (!len) { 32524fb5d671SAlex Elder rbd_warn(NULL, "no monitor address(es) provided"); 32534fb5d671SAlex Elder return -EINVAL; 32544fb5d671SAlex Elder } 32550ddebc0cSAlex Elder mon_addrs = buf; 3256f28e565aSAlex Elder mon_addrs_size = len + 1; 32577ef3214aSAlex Elder buf += len; 3258a725f65eSAlex Elder 3259dc79b113SAlex Elder ret = -EINVAL; 3260f28e565aSAlex Elder options = dup_token(&buf, NULL); 3261f28e565aSAlex Elder if (!options) 3262dc79b113SAlex Elder return -ENOMEM; 32634fb5d671SAlex Elder if (!*options) { 32644fb5d671SAlex Elder rbd_warn(NULL, "no options provided"); 32654fb5d671SAlex Elder goto out_err; 32664fb5d671SAlex Elder } 3267a725f65eSAlex Elder 3268859c31dfSAlex Elder spec = rbd_spec_alloc(); 3269859c31dfSAlex Elder if (!spec) 3270f28e565aSAlex Elder goto out_mem; 3271859c31dfSAlex Elder 3272859c31dfSAlex Elder spec->pool_name = dup_token(&buf, NULL); 3273859c31dfSAlex Elder if (!spec->pool_name) 3274859c31dfSAlex Elder goto out_mem; 32754fb5d671SAlex Elder if (!*spec->pool_name) { 32764fb5d671SAlex Elder rbd_warn(NULL, "no pool name provided"); 32774fb5d671SAlex Elder goto out_err; 32784fb5d671SAlex Elder } 3279e28fff26SAlex Elder 328069e7a02fSAlex Elder spec->image_name = dup_token(&buf, NULL); 3281859c31dfSAlex Elder if (!spec->image_name) 3282f28e565aSAlex Elder goto out_mem; 32834fb5d671SAlex Elder if (!*spec->image_name) { 32844fb5d671SAlex Elder rbd_warn(NULL, "no image name provided"); 32854fb5d671SAlex Elder goto out_err; 32864fb5d671SAlex Elder } 3287e28fff26SAlex Elder 3288f28e565aSAlex Elder /* 3289f28e565aSAlex Elder * Snapshot name is optional; default is to use "-" 3290f28e565aSAlex Elder * (indicating the head/no snapshot). 3291f28e565aSAlex Elder */ 32923feeb894SAlex Elder len = next_token(&buf); 3293820a5f3eSAlex Elder if (!len) { 32943feeb894SAlex Elder buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */ 32953feeb894SAlex Elder len = sizeof (RBD_SNAP_HEAD_NAME) - 1; 3296f28e565aSAlex Elder } else if (len > RBD_MAX_SNAP_NAME_LEN) { 3297dc79b113SAlex Elder ret = -ENAMETOOLONG; 3298f28e565aSAlex Elder goto out_err; 3299849b4260SAlex Elder } 33004caf35f9SAlex Elder spec->snap_name = kmemdup(buf, len + 1, GFP_KERNEL); 3301859c31dfSAlex Elder if (!spec->snap_name) 3302f28e565aSAlex Elder goto out_mem; 3303859c31dfSAlex Elder *(spec->snap_name + len) = '\0'; 3304e5c35534SAlex Elder 33050ddebc0cSAlex Elder /* Initialize all rbd options to the defaults */ 3306e28fff26SAlex Elder 33074e9afebaSAlex Elder rbd_opts = kzalloc(sizeof (*rbd_opts), GFP_KERNEL); 33084e9afebaSAlex Elder if (!rbd_opts) 33094e9afebaSAlex Elder goto out_mem; 33104e9afebaSAlex Elder 33114e9afebaSAlex Elder rbd_opts->read_only = RBD_READ_ONLY_DEFAULT; 3312d22f76e7SAlex Elder 3313859c31dfSAlex Elder copts = ceph_parse_options(options, mon_addrs, 33140ddebc0cSAlex Elder mon_addrs + mon_addrs_size - 1, 33154e9afebaSAlex Elder parse_rbd_opts_token, rbd_opts); 3316859c31dfSAlex Elder if (IS_ERR(copts)) { 3317859c31dfSAlex Elder ret = PTR_ERR(copts); 3318dc79b113SAlex Elder goto out_err; 3319dc79b113SAlex Elder } 3320859c31dfSAlex Elder kfree(options); 3321859c31dfSAlex Elder 3322859c31dfSAlex Elder *ceph_opts = copts; 33234e9afebaSAlex Elder *opts = rbd_opts; 3324859c31dfSAlex Elder *rbd_spec = spec; 33250ddebc0cSAlex Elder 3326dc79b113SAlex Elder return 0; 3327f28e565aSAlex Elder out_mem: 3328dc79b113SAlex Elder ret = -ENOMEM; 3329d22f76e7SAlex Elder out_err: 3330859c31dfSAlex Elder kfree(rbd_opts); 3331859c31dfSAlex Elder rbd_spec_put(spec); 3332f28e565aSAlex Elder kfree(options); 3333d22f76e7SAlex Elder 3334dc79b113SAlex Elder return ret; 3335a725f65eSAlex Elder } 3336a725f65eSAlex Elder 3337589d30e0SAlex Elder /* 3338589d30e0SAlex Elder * An rbd format 2 image has a unique identifier, distinct from the 3339589d30e0SAlex Elder * name given to it by the user. Internally, that identifier is 3340589d30e0SAlex Elder * what's used to specify the names of objects related to the image. 3341589d30e0SAlex Elder * 3342589d30e0SAlex Elder * A special "rbd id" object is used to map an rbd image name to its 3343589d30e0SAlex Elder * id. If that object doesn't exist, then there is no v2 rbd image 3344589d30e0SAlex Elder * with the supplied name. 3345589d30e0SAlex Elder * 3346589d30e0SAlex Elder * This function will record the given rbd_dev's image_id field if 3347589d30e0SAlex Elder * it can be determined, and in that case will return 0. If any 3348589d30e0SAlex Elder * errors occur a negative errno will be returned and the rbd_dev's 3349589d30e0SAlex Elder * image_id field will be unchanged (and should be NULL). 3350589d30e0SAlex Elder */ 3351589d30e0SAlex Elder static int rbd_dev_image_id(struct rbd_device *rbd_dev) 3352589d30e0SAlex Elder { 3353589d30e0SAlex Elder int ret; 3354589d30e0SAlex Elder size_t size; 3355589d30e0SAlex Elder char *object_name; 3356589d30e0SAlex Elder void *response; 3357589d30e0SAlex Elder void *p; 3358589d30e0SAlex Elder 3359589d30e0SAlex Elder /* 33602c0d0a10SAlex Elder * When probing a parent image, the image id is already 33612c0d0a10SAlex Elder * known (and the image name likely is not). There's no 33622c0d0a10SAlex Elder * need to fetch the image id again in this case. 33632c0d0a10SAlex Elder */ 33642c0d0a10SAlex Elder if (rbd_dev->spec->image_id) 33652c0d0a10SAlex Elder return 0; 33662c0d0a10SAlex Elder 33672c0d0a10SAlex Elder /* 3368589d30e0SAlex Elder * First, see if the format 2 image id file exists, and if 3369589d30e0SAlex Elder * so, get the image's persistent id from it. 3370589d30e0SAlex Elder */ 337169e7a02fSAlex Elder size = sizeof (RBD_ID_PREFIX) + strlen(rbd_dev->spec->image_name); 3372589d30e0SAlex Elder object_name = kmalloc(size, GFP_NOIO); 3373589d30e0SAlex Elder if (!object_name) 3374589d30e0SAlex Elder return -ENOMEM; 33750d7dbfceSAlex Elder sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->spec->image_name); 3376589d30e0SAlex Elder dout("rbd id object name is %s\n", object_name); 3377589d30e0SAlex Elder 3378589d30e0SAlex Elder /* Response will be an encoded string, which includes a length */ 3379589d30e0SAlex Elder 3380589d30e0SAlex Elder size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX; 3381589d30e0SAlex Elder response = kzalloc(size, GFP_NOIO); 3382589d30e0SAlex Elder if (!response) { 3383589d30e0SAlex Elder ret = -ENOMEM; 3384589d30e0SAlex Elder goto out; 3385589d30e0SAlex Elder } 3386589d30e0SAlex Elder 3387589d30e0SAlex Elder ret = rbd_req_sync_exec(rbd_dev, object_name, 3388589d30e0SAlex Elder "rbd", "get_id", 3389589d30e0SAlex Elder NULL, 0, 339007b2391fSAlex Elder response, RBD_IMAGE_ID_LEN_MAX, NULL); 3391589d30e0SAlex Elder dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret); 3392589d30e0SAlex Elder if (ret < 0) 3393589d30e0SAlex Elder goto out; 3394a0ea3a40SAlex Elder ret = 0; /* rbd_req_sync_exec() can return positive */ 3395589d30e0SAlex Elder 3396589d30e0SAlex Elder p = response; 33970d7dbfceSAlex Elder rbd_dev->spec->image_id = ceph_extract_encoded_string(&p, 3398589d30e0SAlex Elder p + RBD_IMAGE_ID_LEN_MAX, 3399979ed480SAlex Elder NULL, GFP_NOIO); 34000d7dbfceSAlex Elder if (IS_ERR(rbd_dev->spec->image_id)) { 34010d7dbfceSAlex Elder ret = PTR_ERR(rbd_dev->spec->image_id); 34020d7dbfceSAlex Elder rbd_dev->spec->image_id = NULL; 3403589d30e0SAlex Elder } else { 34040d7dbfceSAlex Elder dout("image_id is %s\n", rbd_dev->spec->image_id); 3405589d30e0SAlex Elder } 3406589d30e0SAlex Elder out: 3407589d30e0SAlex Elder kfree(response); 3408589d30e0SAlex Elder kfree(object_name); 3409589d30e0SAlex Elder 3410589d30e0SAlex Elder return ret; 3411589d30e0SAlex Elder } 3412589d30e0SAlex Elder 3413a30b71b9SAlex Elder static int rbd_dev_v1_probe(struct rbd_device *rbd_dev) 3414a30b71b9SAlex Elder { 3415a30b71b9SAlex Elder int ret; 3416a30b71b9SAlex Elder size_t size; 3417a30b71b9SAlex Elder 3418a30b71b9SAlex Elder /* Version 1 images have no id; empty string is used */ 3419a30b71b9SAlex Elder 34200d7dbfceSAlex Elder rbd_dev->spec->image_id = kstrdup("", GFP_KERNEL); 34210d7dbfceSAlex Elder if (!rbd_dev->spec->image_id) 3422a30b71b9SAlex Elder return -ENOMEM; 3423a30b71b9SAlex Elder 3424a30b71b9SAlex Elder /* Record the header object name for this rbd image. */ 3425a30b71b9SAlex Elder 342669e7a02fSAlex Elder size = strlen(rbd_dev->spec->image_name) + sizeof (RBD_SUFFIX); 3427a30b71b9SAlex Elder rbd_dev->header_name = kmalloc(size, GFP_KERNEL); 3428a30b71b9SAlex Elder if (!rbd_dev->header_name) { 3429a30b71b9SAlex Elder ret = -ENOMEM; 3430a30b71b9SAlex Elder goto out_err; 3431a30b71b9SAlex Elder } 34320d7dbfceSAlex Elder sprintf(rbd_dev->header_name, "%s%s", 34330d7dbfceSAlex Elder rbd_dev->spec->image_name, RBD_SUFFIX); 3434a30b71b9SAlex Elder 3435a30b71b9SAlex Elder /* Populate rbd image metadata */ 3436a30b71b9SAlex Elder 3437a30b71b9SAlex Elder ret = rbd_read_header(rbd_dev, &rbd_dev->header); 3438a30b71b9SAlex Elder if (ret < 0) 3439a30b71b9SAlex Elder goto out_err; 344086b00e0dSAlex Elder 344186b00e0dSAlex Elder /* Version 1 images have no parent (no layering) */ 344286b00e0dSAlex Elder 344386b00e0dSAlex Elder rbd_dev->parent_spec = NULL; 344486b00e0dSAlex Elder rbd_dev->parent_overlap = 0; 344586b00e0dSAlex Elder 3446a30b71b9SAlex Elder rbd_dev->image_format = 1; 3447a30b71b9SAlex Elder 3448a30b71b9SAlex Elder dout("discovered version 1 image, header name is %s\n", 3449a30b71b9SAlex Elder rbd_dev->header_name); 3450a30b71b9SAlex Elder 3451a30b71b9SAlex Elder return 0; 3452a30b71b9SAlex Elder 3453a30b71b9SAlex Elder out_err: 3454a30b71b9SAlex Elder kfree(rbd_dev->header_name); 3455a30b71b9SAlex Elder rbd_dev->header_name = NULL; 34560d7dbfceSAlex Elder kfree(rbd_dev->spec->image_id); 34570d7dbfceSAlex Elder rbd_dev->spec->image_id = NULL; 3458a30b71b9SAlex Elder 3459a30b71b9SAlex Elder return ret; 3460a30b71b9SAlex Elder } 3461a30b71b9SAlex Elder 3462a30b71b9SAlex Elder static int rbd_dev_v2_probe(struct rbd_device *rbd_dev) 3463a30b71b9SAlex Elder { 3464a30b71b9SAlex Elder size_t size; 34659d475de5SAlex Elder int ret; 34666e14b1a6SAlex Elder u64 ver = 0; 3467a30b71b9SAlex Elder 3468a30b71b9SAlex Elder /* 3469a30b71b9SAlex Elder * Image id was filled in by the caller. Record the header 3470a30b71b9SAlex Elder * object name for this rbd image. 3471a30b71b9SAlex Elder */ 3472979ed480SAlex Elder size = sizeof (RBD_HEADER_PREFIX) + strlen(rbd_dev->spec->image_id); 3473a30b71b9SAlex Elder rbd_dev->header_name = kmalloc(size, GFP_KERNEL); 3474a30b71b9SAlex Elder if (!rbd_dev->header_name) 3475a30b71b9SAlex Elder return -ENOMEM; 3476a30b71b9SAlex Elder sprintf(rbd_dev->header_name, "%s%s", 34770d7dbfceSAlex Elder RBD_HEADER_PREFIX, rbd_dev->spec->image_id); 34789d475de5SAlex Elder 34799d475de5SAlex Elder /* Get the size and object order for the image */ 34809d475de5SAlex Elder 34819d475de5SAlex Elder ret = rbd_dev_v2_image_size(rbd_dev); 34829d475de5SAlex Elder if (ret < 0) 34839d475de5SAlex Elder goto out_err; 34841e130199SAlex Elder 34851e130199SAlex Elder /* Get the object prefix (a.k.a. block_name) for the image */ 34861e130199SAlex Elder 34871e130199SAlex Elder ret = rbd_dev_v2_object_prefix(rbd_dev); 34881e130199SAlex Elder if (ret < 0) 34891e130199SAlex Elder goto out_err; 3490b1b5402aSAlex Elder 3491d889140cSAlex Elder /* Get the and check features for the image */ 3492b1b5402aSAlex Elder 3493b1b5402aSAlex Elder ret = rbd_dev_v2_features(rbd_dev); 3494b1b5402aSAlex Elder if (ret < 0) 3495b1b5402aSAlex Elder goto out_err; 349635d489f9SAlex Elder 349786b00e0dSAlex Elder /* If the image supports layering, get the parent info */ 349886b00e0dSAlex Elder 349986b00e0dSAlex Elder if (rbd_dev->header.features & RBD_FEATURE_LAYERING) { 350086b00e0dSAlex Elder ret = rbd_dev_v2_parent_info(rbd_dev); 350186b00e0dSAlex Elder if (ret < 0) 350286b00e0dSAlex Elder goto out_err; 350386b00e0dSAlex Elder } 350486b00e0dSAlex Elder 35056e14b1a6SAlex Elder /* crypto and compression type aren't (yet) supported for v2 images */ 350635d489f9SAlex Elder 35076e14b1a6SAlex Elder rbd_dev->header.crypt_type = 0; 35086e14b1a6SAlex Elder rbd_dev->header.comp_type = 0; 35096e14b1a6SAlex Elder 35106e14b1a6SAlex Elder /* Get the snapshot context, plus the header version */ 35116e14b1a6SAlex Elder 35126e14b1a6SAlex Elder ret = rbd_dev_v2_snap_context(rbd_dev, &ver); 351335d489f9SAlex Elder if (ret) 351435d489f9SAlex Elder goto out_err; 35156e14b1a6SAlex Elder rbd_dev->header.obj_version = ver; 35166e14b1a6SAlex Elder 3517a30b71b9SAlex Elder rbd_dev->image_format = 2; 3518a30b71b9SAlex Elder 3519a30b71b9SAlex Elder dout("discovered version 2 image, header name is %s\n", 3520a30b71b9SAlex Elder rbd_dev->header_name); 3521a30b71b9SAlex Elder 352235152979SAlex Elder return 0; 35239d475de5SAlex Elder out_err: 352486b00e0dSAlex Elder rbd_dev->parent_overlap = 0; 352586b00e0dSAlex Elder rbd_spec_put(rbd_dev->parent_spec); 352686b00e0dSAlex Elder rbd_dev->parent_spec = NULL; 35279d475de5SAlex Elder kfree(rbd_dev->header_name); 35289d475de5SAlex Elder rbd_dev->header_name = NULL; 35291e130199SAlex Elder kfree(rbd_dev->header.object_prefix); 35301e130199SAlex Elder rbd_dev->header.object_prefix = NULL; 35319d475de5SAlex Elder 35329d475de5SAlex Elder return ret; 3533a30b71b9SAlex Elder } 3534a30b71b9SAlex Elder 353583a06263SAlex Elder static int rbd_dev_probe_finish(struct rbd_device *rbd_dev) 353683a06263SAlex Elder { 353783a06263SAlex Elder int ret; 353883a06263SAlex Elder 353983a06263SAlex Elder /* no need to lock here, as rbd_dev is not registered yet */ 354083a06263SAlex Elder ret = rbd_dev_snaps_update(rbd_dev); 354183a06263SAlex Elder if (ret) 354283a06263SAlex Elder return ret; 354383a06263SAlex Elder 35449e15b77dSAlex Elder ret = rbd_dev_probe_update_spec(rbd_dev); 35459e15b77dSAlex Elder if (ret) 35469e15b77dSAlex Elder goto err_out_snaps; 35479e15b77dSAlex Elder 354883a06263SAlex Elder ret = rbd_dev_set_mapping(rbd_dev); 354983a06263SAlex Elder if (ret) 355083a06263SAlex Elder goto err_out_snaps; 355183a06263SAlex Elder 355283a06263SAlex Elder /* generate unique id: find highest unique id, add one */ 355383a06263SAlex Elder rbd_dev_id_get(rbd_dev); 355483a06263SAlex Elder 355583a06263SAlex Elder /* Fill in the device name, now that we have its id. */ 355683a06263SAlex Elder BUILD_BUG_ON(DEV_NAME_LEN 355783a06263SAlex Elder < sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH); 355883a06263SAlex Elder sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id); 355983a06263SAlex Elder 356083a06263SAlex Elder /* Get our block major device number. */ 356183a06263SAlex Elder 356283a06263SAlex Elder ret = register_blkdev(0, rbd_dev->name); 356383a06263SAlex Elder if (ret < 0) 356483a06263SAlex Elder goto err_out_id; 356583a06263SAlex Elder rbd_dev->major = ret; 356683a06263SAlex Elder 356783a06263SAlex Elder /* Set up the blkdev mapping. */ 356883a06263SAlex Elder 356983a06263SAlex Elder ret = rbd_init_disk(rbd_dev); 357083a06263SAlex Elder if (ret) 357183a06263SAlex Elder goto err_out_blkdev; 357283a06263SAlex Elder 357383a06263SAlex Elder ret = rbd_bus_add_dev(rbd_dev); 357483a06263SAlex Elder if (ret) 357583a06263SAlex Elder goto err_out_disk; 357683a06263SAlex Elder 357783a06263SAlex Elder /* 357883a06263SAlex Elder * At this point cleanup in the event of an error is the job 357983a06263SAlex Elder * of the sysfs code (initiated by rbd_bus_del_dev()). 358083a06263SAlex Elder */ 358183a06263SAlex Elder down_write(&rbd_dev->header_rwsem); 358283a06263SAlex Elder ret = rbd_dev_snaps_register(rbd_dev); 358383a06263SAlex Elder up_write(&rbd_dev->header_rwsem); 358483a06263SAlex Elder if (ret) 358583a06263SAlex Elder goto err_out_bus; 358683a06263SAlex Elder 358783a06263SAlex Elder ret = rbd_init_watch_dev(rbd_dev); 358883a06263SAlex Elder if (ret) 358983a06263SAlex Elder goto err_out_bus; 359083a06263SAlex Elder 359183a06263SAlex Elder /* Everything's ready. Announce the disk to the world. */ 359283a06263SAlex Elder 359383a06263SAlex Elder add_disk(rbd_dev->disk); 359483a06263SAlex Elder 359583a06263SAlex Elder pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name, 359683a06263SAlex Elder (unsigned long long) rbd_dev->mapping.size); 359783a06263SAlex Elder 359883a06263SAlex Elder return ret; 359983a06263SAlex Elder err_out_bus: 360083a06263SAlex Elder /* this will also clean up rest of rbd_dev stuff */ 360183a06263SAlex Elder 360283a06263SAlex Elder rbd_bus_del_dev(rbd_dev); 360383a06263SAlex Elder 360483a06263SAlex Elder return ret; 360583a06263SAlex Elder err_out_disk: 360683a06263SAlex Elder rbd_free_disk(rbd_dev); 360783a06263SAlex Elder err_out_blkdev: 360883a06263SAlex Elder unregister_blkdev(rbd_dev->major, rbd_dev->name); 360983a06263SAlex Elder err_out_id: 361083a06263SAlex Elder rbd_dev_id_put(rbd_dev); 361183a06263SAlex Elder err_out_snaps: 361283a06263SAlex Elder rbd_remove_all_snaps(rbd_dev); 361383a06263SAlex Elder 361483a06263SAlex Elder return ret; 361583a06263SAlex Elder } 361683a06263SAlex Elder 3617a30b71b9SAlex Elder /* 3618a30b71b9SAlex Elder * Probe for the existence of the header object for the given rbd 3619a30b71b9SAlex Elder * device. For format 2 images this includes determining the image 3620a30b71b9SAlex Elder * id. 3621a30b71b9SAlex Elder */ 3622a30b71b9SAlex Elder static int rbd_dev_probe(struct rbd_device *rbd_dev) 3623a30b71b9SAlex Elder { 3624a30b71b9SAlex Elder int ret; 3625a30b71b9SAlex Elder 3626a30b71b9SAlex Elder /* 3627a30b71b9SAlex Elder * Get the id from the image id object. If it's not a 3628a30b71b9SAlex Elder * format 2 image, we'll get ENOENT back, and we'll assume 3629a30b71b9SAlex Elder * it's a format 1 image. 3630a30b71b9SAlex Elder */ 3631a30b71b9SAlex Elder ret = rbd_dev_image_id(rbd_dev); 3632a30b71b9SAlex Elder if (ret) 3633a30b71b9SAlex Elder ret = rbd_dev_v1_probe(rbd_dev); 3634a30b71b9SAlex Elder else 3635a30b71b9SAlex Elder ret = rbd_dev_v2_probe(rbd_dev); 363683a06263SAlex Elder if (ret) { 3637a30b71b9SAlex Elder dout("probe failed, returning %d\n", ret); 3638a30b71b9SAlex Elder 3639a30b71b9SAlex Elder return ret; 3640a30b71b9SAlex Elder } 3641a30b71b9SAlex Elder 364283a06263SAlex Elder ret = rbd_dev_probe_finish(rbd_dev); 364383a06263SAlex Elder if (ret) 364483a06263SAlex Elder rbd_header_free(&rbd_dev->header); 364583a06263SAlex Elder 364683a06263SAlex Elder return ret; 364783a06263SAlex Elder } 364883a06263SAlex Elder 364959c2be1eSYehuda Sadeh static ssize_t rbd_add(struct bus_type *bus, 365059c2be1eSYehuda Sadeh const char *buf, 365159c2be1eSYehuda Sadeh size_t count) 3652602adf40SYehuda Sadeh { 3653cb8627c7SAlex Elder struct rbd_device *rbd_dev = NULL; 3654dc79b113SAlex Elder struct ceph_options *ceph_opts = NULL; 36554e9afebaSAlex Elder struct rbd_options *rbd_opts = NULL; 3656859c31dfSAlex Elder struct rbd_spec *spec = NULL; 36579d3997fdSAlex Elder struct rbd_client *rbdc; 365827cc2594SAlex Elder struct ceph_osd_client *osdc; 365927cc2594SAlex Elder int rc = -ENOMEM; 3660602adf40SYehuda Sadeh 3661602adf40SYehuda Sadeh if (!try_module_get(THIS_MODULE)) 3662602adf40SYehuda Sadeh return -ENODEV; 3663602adf40SYehuda Sadeh 3664a725f65eSAlex Elder /* parse add command */ 3665859c31dfSAlex Elder rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec); 3666dc79b113SAlex Elder if (rc < 0) 3667bd4ba655SAlex Elder goto err_out_module; 3668a725f65eSAlex Elder 36699d3997fdSAlex Elder rbdc = rbd_get_client(ceph_opts); 36709d3997fdSAlex Elder if (IS_ERR(rbdc)) { 36719d3997fdSAlex Elder rc = PTR_ERR(rbdc); 36720ddebc0cSAlex Elder goto err_out_args; 36739d3997fdSAlex Elder } 3674c53d5893SAlex Elder ceph_opts = NULL; /* rbd_dev client now owns this */ 3675602adf40SYehuda Sadeh 3676602adf40SYehuda Sadeh /* pick the pool */ 36779d3997fdSAlex Elder osdc = &rbdc->client->osdc; 3678859c31dfSAlex Elder rc = ceph_pg_poolid_by_name(osdc->osdmap, spec->pool_name); 3679602adf40SYehuda Sadeh if (rc < 0) 3680602adf40SYehuda Sadeh goto err_out_client; 3681859c31dfSAlex Elder spec->pool_id = (u64) rc; 3682859c31dfSAlex Elder 36830903e875SAlex Elder /* The ceph file layout needs to fit pool id in 32 bits */ 36840903e875SAlex Elder 36850903e875SAlex Elder if (WARN_ON(spec->pool_id > (u64) U32_MAX)) { 36860903e875SAlex Elder rc = -EIO; 36870903e875SAlex Elder goto err_out_client; 36880903e875SAlex Elder } 36890903e875SAlex Elder 3690c53d5893SAlex Elder rbd_dev = rbd_dev_create(rbdc, spec); 3691bd4ba655SAlex Elder if (!rbd_dev) 3692bd4ba655SAlex Elder goto err_out_client; 3693c53d5893SAlex Elder rbdc = NULL; /* rbd_dev now owns this */ 3694c53d5893SAlex Elder spec = NULL; /* rbd_dev now owns this */ 3695602adf40SYehuda Sadeh 3696bd4ba655SAlex Elder rbd_dev->mapping.read_only = rbd_opts->read_only; 3697c53d5893SAlex Elder kfree(rbd_opts); 3698c53d5893SAlex Elder rbd_opts = NULL; /* done with this */ 3699bd4ba655SAlex Elder 3700a30b71b9SAlex Elder rc = rbd_dev_probe(rbd_dev); 3701a30b71b9SAlex Elder if (rc < 0) 3702c53d5893SAlex Elder goto err_out_rbd_dev; 370305fd6f6fSAlex Elder 3704602adf40SYehuda Sadeh return count; 3705c53d5893SAlex Elder err_out_rbd_dev: 3706c53d5893SAlex Elder rbd_dev_destroy(rbd_dev); 3707bd4ba655SAlex Elder err_out_client: 37089d3997fdSAlex Elder rbd_put_client(rbdc); 37090ddebc0cSAlex Elder err_out_args: 371078cea76eSAlex Elder if (ceph_opts) 371178cea76eSAlex Elder ceph_destroy_options(ceph_opts); 37124e9afebaSAlex Elder kfree(rbd_opts); 3713859c31dfSAlex Elder rbd_spec_put(spec); 3714bd4ba655SAlex Elder err_out_module: 3715bd4ba655SAlex Elder module_put(THIS_MODULE); 371627cc2594SAlex Elder 3717602adf40SYehuda Sadeh dout("Error adding device %s\n", buf); 371827cc2594SAlex Elder 371927cc2594SAlex Elder return (ssize_t) rc; 3720602adf40SYehuda Sadeh } 3721602adf40SYehuda Sadeh 3722de71a297SAlex Elder static struct rbd_device *__rbd_get_dev(unsigned long dev_id) 3723602adf40SYehuda Sadeh { 3724602adf40SYehuda Sadeh struct list_head *tmp; 3725602adf40SYehuda Sadeh struct rbd_device *rbd_dev; 3726602adf40SYehuda Sadeh 3727e124a82fSAlex Elder spin_lock(&rbd_dev_list_lock); 3728602adf40SYehuda Sadeh list_for_each(tmp, &rbd_dev_list) { 3729602adf40SYehuda Sadeh rbd_dev = list_entry(tmp, struct rbd_device, node); 3730de71a297SAlex Elder if (rbd_dev->dev_id == dev_id) { 3731e124a82fSAlex Elder spin_unlock(&rbd_dev_list_lock); 3732602adf40SYehuda Sadeh return rbd_dev; 3733602adf40SYehuda Sadeh } 3734e124a82fSAlex Elder } 3735e124a82fSAlex Elder spin_unlock(&rbd_dev_list_lock); 3736602adf40SYehuda Sadeh return NULL; 3737602adf40SYehuda Sadeh } 3738602adf40SYehuda Sadeh 3739dfc5606dSYehuda Sadeh static void rbd_dev_release(struct device *dev) 3740602adf40SYehuda Sadeh { 3741593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 3742602adf40SYehuda Sadeh 37431dbb4399SAlex Elder if (rbd_dev->watch_request) { 37441dbb4399SAlex Elder struct ceph_client *client = rbd_dev->rbd_client->client; 37451dbb4399SAlex Elder 37461dbb4399SAlex Elder ceph_osdc_unregister_linger_request(&client->osdc, 374759c2be1eSYehuda Sadeh rbd_dev->watch_request); 37481dbb4399SAlex Elder } 374959c2be1eSYehuda Sadeh if (rbd_dev->watch_event) 3750907703d0SAlex Elder rbd_req_sync_watch(rbd_dev, 0); 3751602adf40SYehuda Sadeh 3752602adf40SYehuda Sadeh /* clean up and free blkdev */ 3753602adf40SYehuda Sadeh rbd_free_disk(rbd_dev); 3754602adf40SYehuda Sadeh unregister_blkdev(rbd_dev->major, rbd_dev->name); 375532eec68dSAlex Elder 37562ac4e75dSAlex Elder /* release allocated disk header fields */ 37572ac4e75dSAlex Elder rbd_header_free(&rbd_dev->header); 37582ac4e75dSAlex Elder 375932eec68dSAlex Elder /* done with the id, and with the rbd_dev */ 3760e2839308SAlex Elder rbd_dev_id_put(rbd_dev); 3761c53d5893SAlex Elder rbd_assert(rbd_dev->rbd_client != NULL); 3762c53d5893SAlex Elder rbd_dev_destroy(rbd_dev); 3763602adf40SYehuda Sadeh 3764602adf40SYehuda Sadeh /* release module ref */ 3765602adf40SYehuda Sadeh module_put(THIS_MODULE); 3766602adf40SYehuda Sadeh } 3767602adf40SYehuda Sadeh 3768dfc5606dSYehuda Sadeh static ssize_t rbd_remove(struct bus_type *bus, 3769602adf40SYehuda Sadeh const char *buf, 3770602adf40SYehuda Sadeh size_t count) 3771602adf40SYehuda Sadeh { 3772602adf40SYehuda Sadeh struct rbd_device *rbd_dev = NULL; 3773602adf40SYehuda Sadeh int target_id, rc; 3774602adf40SYehuda Sadeh unsigned long ul; 3775602adf40SYehuda Sadeh int ret = count; 3776602adf40SYehuda Sadeh 3777602adf40SYehuda Sadeh rc = strict_strtoul(buf, 10, &ul); 3778602adf40SYehuda Sadeh if (rc) 3779602adf40SYehuda Sadeh return rc; 3780602adf40SYehuda Sadeh 3781602adf40SYehuda Sadeh /* convert to int; abort if we lost anything in the conversion */ 3782602adf40SYehuda Sadeh target_id = (int) ul; 3783602adf40SYehuda Sadeh if (target_id != ul) 3784602adf40SYehuda Sadeh return -EINVAL; 3785602adf40SYehuda Sadeh 3786602adf40SYehuda Sadeh mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); 3787602adf40SYehuda Sadeh 3788602adf40SYehuda Sadeh rbd_dev = __rbd_get_dev(target_id); 3789602adf40SYehuda Sadeh if (!rbd_dev) { 3790602adf40SYehuda Sadeh ret = -ENOENT; 3791602adf40SYehuda Sadeh goto done; 3792602adf40SYehuda Sadeh } 3793602adf40SYehuda Sadeh 379442382b70SAlex Elder if (rbd_dev->open_count) { 379542382b70SAlex Elder ret = -EBUSY; 379642382b70SAlex Elder goto done; 379742382b70SAlex Elder } 379842382b70SAlex Elder 379941f38c2bSAlex Elder rbd_remove_all_snaps(rbd_dev); 3800dfc5606dSYehuda Sadeh rbd_bus_del_dev(rbd_dev); 3801602adf40SYehuda Sadeh 3802602adf40SYehuda Sadeh done: 3803602adf40SYehuda Sadeh mutex_unlock(&ctl_mutex); 3804aafb230eSAlex Elder 3805602adf40SYehuda Sadeh return ret; 3806602adf40SYehuda Sadeh } 3807602adf40SYehuda Sadeh 3808602adf40SYehuda Sadeh /* 3809602adf40SYehuda Sadeh * create control files in sysfs 3810dfc5606dSYehuda Sadeh * /sys/bus/rbd/... 3811602adf40SYehuda Sadeh */ 3812602adf40SYehuda Sadeh static int rbd_sysfs_init(void) 3813602adf40SYehuda Sadeh { 3814dfc5606dSYehuda Sadeh int ret; 3815602adf40SYehuda Sadeh 3816fed4c143SAlex Elder ret = device_register(&rbd_root_dev); 3817dfc5606dSYehuda Sadeh if (ret < 0) 3818dfc5606dSYehuda Sadeh return ret; 3819602adf40SYehuda Sadeh 3820fed4c143SAlex Elder ret = bus_register(&rbd_bus_type); 3821fed4c143SAlex Elder if (ret < 0) 3822fed4c143SAlex Elder device_unregister(&rbd_root_dev); 3823602adf40SYehuda Sadeh 3824602adf40SYehuda Sadeh return ret; 3825602adf40SYehuda Sadeh } 3826602adf40SYehuda Sadeh 3827602adf40SYehuda Sadeh static void rbd_sysfs_cleanup(void) 3828602adf40SYehuda Sadeh { 3829dfc5606dSYehuda Sadeh bus_unregister(&rbd_bus_type); 3830fed4c143SAlex Elder device_unregister(&rbd_root_dev); 3831602adf40SYehuda Sadeh } 3832602adf40SYehuda Sadeh 3833602adf40SYehuda Sadeh int __init rbd_init(void) 3834602adf40SYehuda Sadeh { 3835602adf40SYehuda Sadeh int rc; 3836602adf40SYehuda Sadeh 3837602adf40SYehuda Sadeh rc = rbd_sysfs_init(); 3838602adf40SYehuda Sadeh if (rc) 3839602adf40SYehuda Sadeh return rc; 3840f0f8cef5SAlex Elder pr_info("loaded " RBD_DRV_NAME_LONG "\n"); 3841602adf40SYehuda Sadeh return 0; 3842602adf40SYehuda Sadeh } 3843602adf40SYehuda Sadeh 3844602adf40SYehuda Sadeh void __exit rbd_exit(void) 3845602adf40SYehuda Sadeh { 3846602adf40SYehuda Sadeh rbd_sysfs_cleanup(); 3847602adf40SYehuda Sadeh } 3848602adf40SYehuda Sadeh 3849602adf40SYehuda Sadeh module_init(rbd_init); 3850602adf40SYehuda Sadeh module_exit(rbd_exit); 3851602adf40SYehuda Sadeh 3852602adf40SYehuda Sadeh MODULE_AUTHOR("Sage Weil <sage@newdream.net>"); 3853602adf40SYehuda Sadeh MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>"); 3854602adf40SYehuda Sadeh MODULE_DESCRIPTION("rados block device"); 3855602adf40SYehuda Sadeh 3856602adf40SYehuda Sadeh /* following authorship retained from original osdblk.c */ 3857602adf40SYehuda Sadeh MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>"); 3858602adf40SYehuda Sadeh 3859602adf40SYehuda Sadeh MODULE_LICENSE("GPL"); 3860