1602adf40SYehuda Sadeh /* 2602adf40SYehuda Sadeh rbd.c -- Export ceph rados objects as a Linux block device 3602adf40SYehuda Sadeh 4602adf40SYehuda Sadeh 5602adf40SYehuda Sadeh based on drivers/block/osdblk.c: 6602adf40SYehuda Sadeh 7602adf40SYehuda Sadeh Copyright 2009 Red Hat, Inc. 8602adf40SYehuda Sadeh 9602adf40SYehuda Sadeh This program is free software; you can redistribute it and/or modify 10602adf40SYehuda Sadeh it under the terms of the GNU General Public License as published by 11602adf40SYehuda Sadeh the Free Software Foundation. 12602adf40SYehuda Sadeh 13602adf40SYehuda Sadeh This program is distributed in the hope that it will be useful, 14602adf40SYehuda Sadeh but WITHOUT ANY WARRANTY; without even the implied warranty of 15602adf40SYehuda Sadeh MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16602adf40SYehuda Sadeh GNU General Public License for more details. 17602adf40SYehuda Sadeh 18602adf40SYehuda Sadeh You should have received a copy of the GNU General Public License 19602adf40SYehuda Sadeh along with this program; see the file COPYING. If not, write to 20602adf40SYehuda Sadeh the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. 21602adf40SYehuda Sadeh 22602adf40SYehuda Sadeh 23602adf40SYehuda Sadeh 24dfc5606dSYehuda Sadeh For usage instructions, please refer to: 25602adf40SYehuda Sadeh 26dfc5606dSYehuda Sadeh Documentation/ABI/testing/sysfs-bus-rbd 27602adf40SYehuda Sadeh 28602adf40SYehuda Sadeh */ 29602adf40SYehuda Sadeh 30602adf40SYehuda Sadeh #include <linux/ceph/libceph.h> 31602adf40SYehuda Sadeh #include <linux/ceph/osd_client.h> 32602adf40SYehuda Sadeh #include <linux/ceph/mon_client.h> 33602adf40SYehuda Sadeh #include <linux/ceph/decode.h> 3459c2be1eSYehuda Sadeh #include <linux/parser.h> 35602adf40SYehuda Sadeh 36602adf40SYehuda Sadeh #include <linux/kernel.h> 37602adf40SYehuda Sadeh #include <linux/device.h> 38602adf40SYehuda Sadeh #include <linux/module.h> 39602adf40SYehuda Sadeh #include <linux/fs.h> 40602adf40SYehuda Sadeh #include <linux/blkdev.h> 41602adf40SYehuda Sadeh 42602adf40SYehuda Sadeh #include "rbd_types.h" 43602adf40SYehuda Sadeh 44aafb230eSAlex Elder #define RBD_DEBUG /* Activate rbd_assert() calls */ 45aafb230eSAlex Elder 46593a9e7bSAlex Elder /* 47593a9e7bSAlex Elder * The basic unit of block I/O is a sector. It is interpreted in a 48593a9e7bSAlex Elder * number of contexts in Linux (blk, bio, genhd), but the default is 49593a9e7bSAlex Elder * universally 512 bytes. These symbols are just slightly more 50593a9e7bSAlex Elder * meaningful than the bare numbers they represent. 51593a9e7bSAlex Elder */ 52593a9e7bSAlex Elder #define SECTOR_SHIFT 9 53593a9e7bSAlex Elder #define SECTOR_SIZE (1ULL << SECTOR_SHIFT) 54593a9e7bSAlex Elder 55df111be6SAlex Elder /* It might be useful to have this defined elsewhere too */ 56df111be6SAlex Elder 57df111be6SAlex Elder #define U64_MAX ((u64) (~0ULL)) 58df111be6SAlex Elder 59f0f8cef5SAlex Elder #define RBD_DRV_NAME "rbd" 60f0f8cef5SAlex Elder #define RBD_DRV_NAME_LONG "rbd (rados block device)" 61602adf40SYehuda Sadeh 62602adf40SYehuda Sadeh #define RBD_MINORS_PER_MAJOR 256 /* max minors per blkdev */ 63602adf40SYehuda Sadeh 64d4b125e9SAlex Elder #define RBD_SNAP_DEV_NAME_PREFIX "snap_" 65d4b125e9SAlex Elder #define RBD_MAX_SNAP_NAME_LEN \ 66d4b125e9SAlex Elder (NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1)) 67d4b125e9SAlex Elder 6835d489f9SAlex Elder #define RBD_MAX_SNAP_COUNT 510 /* allows max snapc to fit in 4KB */ 69602adf40SYehuda Sadeh #define RBD_MAX_OPT_LEN 1024 70602adf40SYehuda Sadeh 71602adf40SYehuda Sadeh #define RBD_SNAP_HEAD_NAME "-" 72602adf40SYehuda Sadeh 73589d30e0SAlex Elder #define RBD_IMAGE_ID_LEN_MAX 64 741e130199SAlex Elder #define RBD_OBJ_PREFIX_LEN_MAX 64 75589d30e0SAlex Elder 76d889140cSAlex Elder /* Feature bits */ 77d889140cSAlex Elder 78d889140cSAlex Elder #define RBD_FEATURE_LAYERING 1 79d889140cSAlex Elder 80d889140cSAlex Elder /* Features supported by this (client software) implementation. */ 81d889140cSAlex Elder 82d889140cSAlex Elder #define RBD_FEATURES_ALL (0) 83d889140cSAlex Elder 8481a89793SAlex Elder /* 8581a89793SAlex Elder * An RBD device name will be "rbd#", where the "rbd" comes from 8681a89793SAlex Elder * RBD_DRV_NAME above, and # is a unique integer identifier. 8781a89793SAlex Elder * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big 8881a89793SAlex Elder * enough to hold all possible device names. 8981a89793SAlex Elder */ 90602adf40SYehuda Sadeh #define DEV_NAME_LEN 32 9181a89793SAlex Elder #define MAX_INT_FORMAT_WIDTH ((5 * sizeof (int)) / 2 + 1) 92602adf40SYehuda Sadeh 93cc0538b6SAlex Elder #define RBD_READ_ONLY_DEFAULT false 9459c2be1eSYehuda Sadeh 95602adf40SYehuda Sadeh /* 96602adf40SYehuda Sadeh * block device image metadata (in-memory version) 97602adf40SYehuda Sadeh */ 98602adf40SYehuda Sadeh struct rbd_image_header { 99f84344f3SAlex Elder /* These four fields never change for a given rbd image */ 100849b4260SAlex Elder char *object_prefix; 10134b13184SAlex Elder u64 features; 102602adf40SYehuda Sadeh __u8 obj_order; 103602adf40SYehuda Sadeh __u8 crypt_type; 104602adf40SYehuda Sadeh __u8 comp_type; 105602adf40SYehuda Sadeh 106f84344f3SAlex Elder /* The remaining fields need to be updated occasionally */ 107f84344f3SAlex Elder u64 image_size; 108f84344f3SAlex Elder struct ceph_snap_context *snapc; 109602adf40SYehuda Sadeh char *snap_names; 110602adf40SYehuda Sadeh u64 *snap_sizes; 11159c2be1eSYehuda Sadeh 11259c2be1eSYehuda Sadeh u64 obj_version; 11359c2be1eSYehuda Sadeh }; 11459c2be1eSYehuda Sadeh 1150d7dbfceSAlex Elder /* 1160d7dbfceSAlex Elder * An rbd image specification. 1170d7dbfceSAlex Elder * 1180d7dbfceSAlex Elder * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely 1190d7dbfceSAlex Elder * identify an image. 1200d7dbfceSAlex Elder */ 1210d7dbfceSAlex Elder struct rbd_spec { 1220d7dbfceSAlex Elder u64 pool_id; 1230d7dbfceSAlex Elder char *pool_name; 1240d7dbfceSAlex Elder 1250d7dbfceSAlex Elder char *image_id; 1260d7dbfceSAlex Elder size_t image_id_len; 1270d7dbfceSAlex Elder char *image_name; 1280d7dbfceSAlex Elder size_t image_name_len; 1290d7dbfceSAlex Elder 1300d7dbfceSAlex Elder u64 snap_id; 1310d7dbfceSAlex Elder char *snap_name; 1320d7dbfceSAlex Elder 1330d7dbfceSAlex Elder struct kref kref; 1340d7dbfceSAlex Elder }; 1350d7dbfceSAlex Elder 13659c2be1eSYehuda Sadeh struct rbd_options { 137cc0538b6SAlex Elder bool read_only; 138602adf40SYehuda Sadeh }; 139602adf40SYehuda Sadeh 140602adf40SYehuda Sadeh /* 141f0f8cef5SAlex Elder * an instance of the client. multiple devices may share an rbd client. 142602adf40SYehuda Sadeh */ 143602adf40SYehuda Sadeh struct rbd_client { 144602adf40SYehuda Sadeh struct ceph_client *client; 145602adf40SYehuda Sadeh struct kref kref; 146602adf40SYehuda Sadeh struct list_head node; 147602adf40SYehuda Sadeh }; 148602adf40SYehuda Sadeh 149602adf40SYehuda Sadeh /* 150f0f8cef5SAlex Elder * a request completion status 151602adf40SYehuda Sadeh */ 1521fec7093SYehuda Sadeh struct rbd_req_status { 1531fec7093SYehuda Sadeh int done; 1541fec7093SYehuda Sadeh int rc; 1551fec7093SYehuda Sadeh u64 bytes; 1561fec7093SYehuda Sadeh }; 1571fec7093SYehuda Sadeh 1581fec7093SYehuda Sadeh /* 1591fec7093SYehuda Sadeh * a collection of requests 1601fec7093SYehuda Sadeh */ 1611fec7093SYehuda Sadeh struct rbd_req_coll { 1621fec7093SYehuda Sadeh int total; 1631fec7093SYehuda Sadeh int num_done; 1641fec7093SYehuda Sadeh struct kref kref; 1651fec7093SYehuda Sadeh struct rbd_req_status status[0]; 166602adf40SYehuda Sadeh }; 167602adf40SYehuda Sadeh 168f0f8cef5SAlex Elder /* 169f0f8cef5SAlex Elder * a single io request 170f0f8cef5SAlex Elder */ 171f0f8cef5SAlex Elder struct rbd_request { 172f0f8cef5SAlex Elder struct request *rq; /* blk layer request */ 173f0f8cef5SAlex Elder struct bio *bio; /* cloned bio */ 174f0f8cef5SAlex Elder struct page **pages; /* list of used pages */ 175f0f8cef5SAlex Elder u64 len; 176f0f8cef5SAlex Elder int coll_index; 177f0f8cef5SAlex Elder struct rbd_req_coll *coll; 178f0f8cef5SAlex Elder }; 179f0f8cef5SAlex Elder 180dfc5606dSYehuda Sadeh struct rbd_snap { 181dfc5606dSYehuda Sadeh struct device dev; 182dfc5606dSYehuda Sadeh const char *name; 1833591538fSJosh Durgin u64 size; 184dfc5606dSYehuda Sadeh struct list_head node; 185dfc5606dSYehuda Sadeh u64 id; 18634b13184SAlex Elder u64 features; 187dfc5606dSYehuda Sadeh }; 188dfc5606dSYehuda Sadeh 189f84344f3SAlex Elder struct rbd_mapping { 19099c1f08fSAlex Elder u64 size; 19134b13184SAlex Elder u64 features; 192f84344f3SAlex Elder bool read_only; 193f84344f3SAlex Elder }; 194f84344f3SAlex Elder 195602adf40SYehuda Sadeh /* 196602adf40SYehuda Sadeh * a single device 197602adf40SYehuda Sadeh */ 198602adf40SYehuda Sadeh struct rbd_device { 199de71a297SAlex Elder int dev_id; /* blkdev unique id */ 200602adf40SYehuda Sadeh 201602adf40SYehuda Sadeh int major; /* blkdev assigned major */ 202602adf40SYehuda Sadeh struct gendisk *disk; /* blkdev's gendisk and rq */ 203602adf40SYehuda Sadeh 204a30b71b9SAlex Elder u32 image_format; /* Either 1 or 2 */ 205602adf40SYehuda Sadeh struct rbd_client *rbd_client; 206602adf40SYehuda Sadeh 207602adf40SYehuda Sadeh char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */ 208602adf40SYehuda Sadeh 209602adf40SYehuda Sadeh spinlock_t lock; /* queue lock */ 210602adf40SYehuda Sadeh 211602adf40SYehuda Sadeh struct rbd_image_header header; 212daba5fdbSAlex Elder bool exists; 2130d7dbfceSAlex Elder struct rbd_spec *spec; 214602adf40SYehuda Sadeh 2150d7dbfceSAlex Elder char *header_name; 216971f839aSAlex Elder 21759c2be1eSYehuda Sadeh struct ceph_osd_event *watch_event; 21859c2be1eSYehuda Sadeh struct ceph_osd_request *watch_request; 21959c2be1eSYehuda Sadeh 22086b00e0dSAlex Elder struct rbd_spec *parent_spec; 22186b00e0dSAlex Elder u64 parent_overlap; 22286b00e0dSAlex Elder 223c666601aSJosh Durgin /* protects updating the header */ 224c666601aSJosh Durgin struct rw_semaphore header_rwsem; 225f84344f3SAlex Elder 226f84344f3SAlex Elder struct rbd_mapping mapping; 227602adf40SYehuda Sadeh 228602adf40SYehuda Sadeh struct list_head node; 229dfc5606dSYehuda Sadeh 230dfc5606dSYehuda Sadeh /* list of snapshots */ 231dfc5606dSYehuda Sadeh struct list_head snaps; 232dfc5606dSYehuda Sadeh 233dfc5606dSYehuda Sadeh /* sysfs related */ 234dfc5606dSYehuda Sadeh struct device dev; 235dfc5606dSYehuda Sadeh }; 236dfc5606dSYehuda Sadeh 237602adf40SYehuda Sadeh static DEFINE_MUTEX(ctl_mutex); /* Serialize open/close/setup/teardown */ 238e124a82fSAlex Elder 239602adf40SYehuda Sadeh static LIST_HEAD(rbd_dev_list); /* devices */ 240e124a82fSAlex Elder static DEFINE_SPINLOCK(rbd_dev_list_lock); 241e124a82fSAlex Elder 242602adf40SYehuda Sadeh static LIST_HEAD(rbd_client_list); /* clients */ 243432b8587SAlex Elder static DEFINE_SPINLOCK(rbd_client_list_lock); 244602adf40SYehuda Sadeh 245304f6808SAlex Elder static int rbd_dev_snaps_update(struct rbd_device *rbd_dev); 246304f6808SAlex Elder static int rbd_dev_snaps_register(struct rbd_device *rbd_dev); 247304f6808SAlex Elder 248dfc5606dSYehuda Sadeh static void rbd_dev_release(struct device *dev); 24941f38c2bSAlex Elder static void rbd_remove_snap_dev(struct rbd_snap *snap); 250dfc5606dSYehuda Sadeh 251f0f8cef5SAlex Elder static ssize_t rbd_add(struct bus_type *bus, const char *buf, 252f0f8cef5SAlex Elder size_t count); 253f0f8cef5SAlex Elder static ssize_t rbd_remove(struct bus_type *bus, const char *buf, 254f0f8cef5SAlex Elder size_t count); 255f0f8cef5SAlex Elder 256f0f8cef5SAlex Elder static struct bus_attribute rbd_bus_attrs[] = { 257f0f8cef5SAlex Elder __ATTR(add, S_IWUSR, NULL, rbd_add), 258f0f8cef5SAlex Elder __ATTR(remove, S_IWUSR, NULL, rbd_remove), 259f0f8cef5SAlex Elder __ATTR_NULL 260f0f8cef5SAlex Elder }; 261f0f8cef5SAlex Elder 262f0f8cef5SAlex Elder static struct bus_type rbd_bus_type = { 263f0f8cef5SAlex Elder .name = "rbd", 264f0f8cef5SAlex Elder .bus_attrs = rbd_bus_attrs, 265f0f8cef5SAlex Elder }; 266f0f8cef5SAlex Elder 267f0f8cef5SAlex Elder static void rbd_root_dev_release(struct device *dev) 268f0f8cef5SAlex Elder { 269f0f8cef5SAlex Elder } 270f0f8cef5SAlex Elder 271f0f8cef5SAlex Elder static struct device rbd_root_dev = { 272f0f8cef5SAlex Elder .init_name = "rbd", 273f0f8cef5SAlex Elder .release = rbd_root_dev_release, 274f0f8cef5SAlex Elder }; 275f0f8cef5SAlex Elder 276aafb230eSAlex Elder #ifdef RBD_DEBUG 277aafb230eSAlex Elder #define rbd_assert(expr) \ 278aafb230eSAlex Elder if (unlikely(!(expr))) { \ 279aafb230eSAlex Elder printk(KERN_ERR "\nAssertion failure in %s() " \ 280aafb230eSAlex Elder "at line %d:\n\n" \ 281aafb230eSAlex Elder "\trbd_assert(%s);\n\n", \ 282aafb230eSAlex Elder __func__, __LINE__, #expr); \ 283aafb230eSAlex Elder BUG(); \ 284aafb230eSAlex Elder } 285aafb230eSAlex Elder #else /* !RBD_DEBUG */ 286aafb230eSAlex Elder # define rbd_assert(expr) ((void) 0) 287aafb230eSAlex Elder #endif /* !RBD_DEBUG */ 288dfc5606dSYehuda Sadeh 289dfc5606dSYehuda Sadeh static struct device *rbd_get_dev(struct rbd_device *rbd_dev) 290dfc5606dSYehuda Sadeh { 291dfc5606dSYehuda Sadeh return get_device(&rbd_dev->dev); 292dfc5606dSYehuda Sadeh } 293dfc5606dSYehuda Sadeh 294dfc5606dSYehuda Sadeh static void rbd_put_dev(struct rbd_device *rbd_dev) 295dfc5606dSYehuda Sadeh { 296dfc5606dSYehuda Sadeh put_device(&rbd_dev->dev); 297dfc5606dSYehuda Sadeh } 298602adf40SYehuda Sadeh 299117973fbSAlex Elder static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver); 300117973fbSAlex Elder static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver); 30159c2be1eSYehuda Sadeh 302602adf40SYehuda Sadeh static int rbd_open(struct block_device *bdev, fmode_t mode) 303602adf40SYehuda Sadeh { 304f0f8cef5SAlex Elder struct rbd_device *rbd_dev = bdev->bd_disk->private_data; 305602adf40SYehuda Sadeh 306f84344f3SAlex Elder if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only) 307602adf40SYehuda Sadeh return -EROFS; 308602adf40SYehuda Sadeh 309340c7a2bSAlex Elder rbd_get_dev(rbd_dev); 310f84344f3SAlex Elder set_device_ro(bdev, rbd_dev->mapping.read_only); 311340c7a2bSAlex Elder 312602adf40SYehuda Sadeh return 0; 313602adf40SYehuda Sadeh } 314602adf40SYehuda Sadeh 315dfc5606dSYehuda Sadeh static int rbd_release(struct gendisk *disk, fmode_t mode) 316dfc5606dSYehuda Sadeh { 317dfc5606dSYehuda Sadeh struct rbd_device *rbd_dev = disk->private_data; 318dfc5606dSYehuda Sadeh 319dfc5606dSYehuda Sadeh rbd_put_dev(rbd_dev); 320dfc5606dSYehuda Sadeh 321dfc5606dSYehuda Sadeh return 0; 322dfc5606dSYehuda Sadeh } 323dfc5606dSYehuda Sadeh 324602adf40SYehuda Sadeh static const struct block_device_operations rbd_bd_ops = { 325602adf40SYehuda Sadeh .owner = THIS_MODULE, 326602adf40SYehuda Sadeh .open = rbd_open, 327dfc5606dSYehuda Sadeh .release = rbd_release, 328602adf40SYehuda Sadeh }; 329602adf40SYehuda Sadeh 330602adf40SYehuda Sadeh /* 331602adf40SYehuda Sadeh * Initialize an rbd client instance. 33243ae4701SAlex Elder * We own *ceph_opts. 333602adf40SYehuda Sadeh */ 334f8c38929SAlex Elder static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts) 335602adf40SYehuda Sadeh { 336602adf40SYehuda Sadeh struct rbd_client *rbdc; 337602adf40SYehuda Sadeh int ret = -ENOMEM; 338602adf40SYehuda Sadeh 339602adf40SYehuda Sadeh dout("rbd_client_create\n"); 340602adf40SYehuda Sadeh rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL); 341602adf40SYehuda Sadeh if (!rbdc) 342602adf40SYehuda Sadeh goto out_opt; 343602adf40SYehuda Sadeh 344602adf40SYehuda Sadeh kref_init(&rbdc->kref); 345602adf40SYehuda Sadeh INIT_LIST_HEAD(&rbdc->node); 346602adf40SYehuda Sadeh 347bc534d86SAlex Elder mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); 348bc534d86SAlex Elder 34943ae4701SAlex Elder rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0); 350602adf40SYehuda Sadeh if (IS_ERR(rbdc->client)) 351bc534d86SAlex Elder goto out_mutex; 35243ae4701SAlex Elder ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */ 353602adf40SYehuda Sadeh 354602adf40SYehuda Sadeh ret = ceph_open_session(rbdc->client); 355602adf40SYehuda Sadeh if (ret < 0) 356602adf40SYehuda Sadeh goto out_err; 357602adf40SYehuda Sadeh 358432b8587SAlex Elder spin_lock(&rbd_client_list_lock); 359602adf40SYehuda Sadeh list_add_tail(&rbdc->node, &rbd_client_list); 360432b8587SAlex Elder spin_unlock(&rbd_client_list_lock); 361602adf40SYehuda Sadeh 362bc534d86SAlex Elder mutex_unlock(&ctl_mutex); 363bc534d86SAlex Elder 364602adf40SYehuda Sadeh dout("rbd_client_create created %p\n", rbdc); 365602adf40SYehuda Sadeh return rbdc; 366602adf40SYehuda Sadeh 367602adf40SYehuda Sadeh out_err: 368602adf40SYehuda Sadeh ceph_destroy_client(rbdc->client); 369bc534d86SAlex Elder out_mutex: 370bc534d86SAlex Elder mutex_unlock(&ctl_mutex); 371602adf40SYehuda Sadeh kfree(rbdc); 372602adf40SYehuda Sadeh out_opt: 37343ae4701SAlex Elder if (ceph_opts) 37443ae4701SAlex Elder ceph_destroy_options(ceph_opts); 37528f259b7SVasiliy Kulikov return ERR_PTR(ret); 376602adf40SYehuda Sadeh } 377602adf40SYehuda Sadeh 378602adf40SYehuda Sadeh /* 3791f7ba331SAlex Elder * Find a ceph client with specific addr and configuration. If 3801f7ba331SAlex Elder * found, bump its reference count. 381602adf40SYehuda Sadeh */ 3821f7ba331SAlex Elder static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts) 383602adf40SYehuda Sadeh { 384602adf40SYehuda Sadeh struct rbd_client *client_node; 3851f7ba331SAlex Elder bool found = false; 386602adf40SYehuda Sadeh 38743ae4701SAlex Elder if (ceph_opts->flags & CEPH_OPT_NOSHARE) 388602adf40SYehuda Sadeh return NULL; 389602adf40SYehuda Sadeh 3901f7ba331SAlex Elder spin_lock(&rbd_client_list_lock); 3911f7ba331SAlex Elder list_for_each_entry(client_node, &rbd_client_list, node) { 3921f7ba331SAlex Elder if (!ceph_compare_options(ceph_opts, client_node->client)) { 3931f7ba331SAlex Elder kref_get(&client_node->kref); 3941f7ba331SAlex Elder found = true; 3951f7ba331SAlex Elder break; 3961f7ba331SAlex Elder } 3971f7ba331SAlex Elder } 3981f7ba331SAlex Elder spin_unlock(&rbd_client_list_lock); 3991f7ba331SAlex Elder 4001f7ba331SAlex Elder return found ? client_node : NULL; 401602adf40SYehuda Sadeh } 402602adf40SYehuda Sadeh 403602adf40SYehuda Sadeh /* 40459c2be1eSYehuda Sadeh * mount options 40559c2be1eSYehuda Sadeh */ 40659c2be1eSYehuda Sadeh enum { 40759c2be1eSYehuda Sadeh Opt_last_int, 40859c2be1eSYehuda Sadeh /* int args above */ 40959c2be1eSYehuda Sadeh Opt_last_string, 41059c2be1eSYehuda Sadeh /* string args above */ 411cc0538b6SAlex Elder Opt_read_only, 412cc0538b6SAlex Elder Opt_read_write, 413cc0538b6SAlex Elder /* Boolean args above */ 414cc0538b6SAlex Elder Opt_last_bool, 41559c2be1eSYehuda Sadeh }; 41659c2be1eSYehuda Sadeh 41743ae4701SAlex Elder static match_table_t rbd_opts_tokens = { 41859c2be1eSYehuda Sadeh /* int args above */ 41959c2be1eSYehuda Sadeh /* string args above */ 420be466c1cSAlex Elder {Opt_read_only, "read_only"}, 421cc0538b6SAlex Elder {Opt_read_only, "ro"}, /* Alternate spelling */ 422cc0538b6SAlex Elder {Opt_read_write, "read_write"}, 423cc0538b6SAlex Elder {Opt_read_write, "rw"}, /* Alternate spelling */ 424cc0538b6SAlex Elder /* Boolean args above */ 42559c2be1eSYehuda Sadeh {-1, NULL} 42659c2be1eSYehuda Sadeh }; 42759c2be1eSYehuda Sadeh 42859c2be1eSYehuda Sadeh static int parse_rbd_opts_token(char *c, void *private) 42959c2be1eSYehuda Sadeh { 43043ae4701SAlex Elder struct rbd_options *rbd_opts = private; 43159c2be1eSYehuda Sadeh substring_t argstr[MAX_OPT_ARGS]; 43259c2be1eSYehuda Sadeh int token, intval, ret; 43359c2be1eSYehuda Sadeh 43443ae4701SAlex Elder token = match_token(c, rbd_opts_tokens, argstr); 43559c2be1eSYehuda Sadeh if (token < 0) 43659c2be1eSYehuda Sadeh return -EINVAL; 43759c2be1eSYehuda Sadeh 43859c2be1eSYehuda Sadeh if (token < Opt_last_int) { 43959c2be1eSYehuda Sadeh ret = match_int(&argstr[0], &intval); 44059c2be1eSYehuda Sadeh if (ret < 0) { 44159c2be1eSYehuda Sadeh pr_err("bad mount option arg (not int) " 44259c2be1eSYehuda Sadeh "at '%s'\n", c); 44359c2be1eSYehuda Sadeh return ret; 44459c2be1eSYehuda Sadeh } 44559c2be1eSYehuda Sadeh dout("got int token %d val %d\n", token, intval); 44659c2be1eSYehuda Sadeh } else if (token > Opt_last_int && token < Opt_last_string) { 44759c2be1eSYehuda Sadeh dout("got string token %d val %s\n", token, 44859c2be1eSYehuda Sadeh argstr[0].from); 449cc0538b6SAlex Elder } else if (token > Opt_last_string && token < Opt_last_bool) { 450cc0538b6SAlex Elder dout("got Boolean token %d\n", token); 45159c2be1eSYehuda Sadeh } else { 45259c2be1eSYehuda Sadeh dout("got token %d\n", token); 45359c2be1eSYehuda Sadeh } 45459c2be1eSYehuda Sadeh 45559c2be1eSYehuda Sadeh switch (token) { 456cc0538b6SAlex Elder case Opt_read_only: 457cc0538b6SAlex Elder rbd_opts->read_only = true; 458cc0538b6SAlex Elder break; 459cc0538b6SAlex Elder case Opt_read_write: 460cc0538b6SAlex Elder rbd_opts->read_only = false; 461cc0538b6SAlex Elder break; 46259c2be1eSYehuda Sadeh default: 463aafb230eSAlex Elder rbd_assert(false); 464aafb230eSAlex Elder break; 46559c2be1eSYehuda Sadeh } 46659c2be1eSYehuda Sadeh return 0; 46759c2be1eSYehuda Sadeh } 46859c2be1eSYehuda Sadeh 46959c2be1eSYehuda Sadeh /* 470602adf40SYehuda Sadeh * Get a ceph client with specific addr and configuration, if one does 471602adf40SYehuda Sadeh * not exist create it. 472602adf40SYehuda Sadeh */ 4739d3997fdSAlex Elder static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts) 474602adf40SYehuda Sadeh { 475f8c38929SAlex Elder struct rbd_client *rbdc; 47659c2be1eSYehuda Sadeh 4771f7ba331SAlex Elder rbdc = rbd_client_find(ceph_opts); 4789d3997fdSAlex Elder if (rbdc) /* using an existing client */ 47943ae4701SAlex Elder ceph_destroy_options(ceph_opts); 4809d3997fdSAlex Elder else 481f8c38929SAlex Elder rbdc = rbd_client_create(ceph_opts); 482d720bcb0SAlex Elder 4839d3997fdSAlex Elder return rbdc; 484602adf40SYehuda Sadeh } 485602adf40SYehuda Sadeh 486602adf40SYehuda Sadeh /* 487602adf40SYehuda Sadeh * Destroy ceph client 488d23a4b3fSAlex Elder * 489432b8587SAlex Elder * Caller must hold rbd_client_list_lock. 490602adf40SYehuda Sadeh */ 491602adf40SYehuda Sadeh static void rbd_client_release(struct kref *kref) 492602adf40SYehuda Sadeh { 493602adf40SYehuda Sadeh struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref); 494602adf40SYehuda Sadeh 495602adf40SYehuda Sadeh dout("rbd_release_client %p\n", rbdc); 496cd9d9f5dSAlex Elder spin_lock(&rbd_client_list_lock); 497602adf40SYehuda Sadeh list_del(&rbdc->node); 498cd9d9f5dSAlex Elder spin_unlock(&rbd_client_list_lock); 499602adf40SYehuda Sadeh 500602adf40SYehuda Sadeh ceph_destroy_client(rbdc->client); 501602adf40SYehuda Sadeh kfree(rbdc); 502602adf40SYehuda Sadeh } 503602adf40SYehuda Sadeh 504602adf40SYehuda Sadeh /* 505602adf40SYehuda Sadeh * Drop reference to ceph client node. If it's not referenced anymore, release 506602adf40SYehuda Sadeh * it. 507602adf40SYehuda Sadeh */ 5089d3997fdSAlex Elder static void rbd_put_client(struct rbd_client *rbdc) 509602adf40SYehuda Sadeh { 510c53d5893SAlex Elder if (rbdc) 5119d3997fdSAlex Elder kref_put(&rbdc->kref, rbd_client_release); 512602adf40SYehuda Sadeh } 513602adf40SYehuda Sadeh 5141fec7093SYehuda Sadeh /* 5151fec7093SYehuda Sadeh * Destroy requests collection 5161fec7093SYehuda Sadeh */ 5171fec7093SYehuda Sadeh static void rbd_coll_release(struct kref *kref) 5181fec7093SYehuda Sadeh { 5191fec7093SYehuda Sadeh struct rbd_req_coll *coll = 5201fec7093SYehuda Sadeh container_of(kref, struct rbd_req_coll, kref); 5211fec7093SYehuda Sadeh 5221fec7093SYehuda Sadeh dout("rbd_coll_release %p\n", coll); 5231fec7093SYehuda Sadeh kfree(coll); 5241fec7093SYehuda Sadeh } 525602adf40SYehuda Sadeh 526a30b71b9SAlex Elder static bool rbd_image_format_valid(u32 image_format) 527a30b71b9SAlex Elder { 528a30b71b9SAlex Elder return image_format == 1 || image_format == 2; 529a30b71b9SAlex Elder } 530a30b71b9SAlex Elder 5318e94af8eSAlex Elder static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk) 5328e94af8eSAlex Elder { 533103a150fSAlex Elder size_t size; 534103a150fSAlex Elder u32 snap_count; 535103a150fSAlex Elder 536103a150fSAlex Elder /* The header has to start with the magic rbd header text */ 537103a150fSAlex Elder if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT))) 538103a150fSAlex Elder return false; 539103a150fSAlex Elder 540db2388b6SAlex Elder /* The bio layer requires at least sector-sized I/O */ 541db2388b6SAlex Elder 542db2388b6SAlex Elder if (ondisk->options.order < SECTOR_SHIFT) 543db2388b6SAlex Elder return false; 544db2388b6SAlex Elder 545db2388b6SAlex Elder /* If we use u64 in a few spots we may be able to loosen this */ 546db2388b6SAlex Elder 547db2388b6SAlex Elder if (ondisk->options.order > 8 * sizeof (int) - 1) 548db2388b6SAlex Elder return false; 549db2388b6SAlex Elder 550103a150fSAlex Elder /* 551103a150fSAlex Elder * The size of a snapshot header has to fit in a size_t, and 552103a150fSAlex Elder * that limits the number of snapshots. 553103a150fSAlex Elder */ 554103a150fSAlex Elder snap_count = le32_to_cpu(ondisk->snap_count); 555103a150fSAlex Elder size = SIZE_MAX - sizeof (struct ceph_snap_context); 556103a150fSAlex Elder if (snap_count > size / sizeof (__le64)) 557103a150fSAlex Elder return false; 558103a150fSAlex Elder 559103a150fSAlex Elder /* 560103a150fSAlex Elder * Not only that, but the size of the entire the snapshot 561103a150fSAlex Elder * header must also be representable in a size_t. 562103a150fSAlex Elder */ 563103a150fSAlex Elder size -= snap_count * sizeof (__le64); 564103a150fSAlex Elder if ((u64) size < le64_to_cpu(ondisk->snap_names_len)) 565103a150fSAlex Elder return false; 566103a150fSAlex Elder 567103a150fSAlex Elder return true; 5688e94af8eSAlex Elder } 5698e94af8eSAlex Elder 570602adf40SYehuda Sadeh /* 571602adf40SYehuda Sadeh * Create a new header structure, translate header format from the on-disk 572602adf40SYehuda Sadeh * header. 573602adf40SYehuda Sadeh */ 574602adf40SYehuda Sadeh static int rbd_header_from_disk(struct rbd_image_header *header, 5754156d998SAlex Elder struct rbd_image_header_ondisk *ondisk) 576602adf40SYehuda Sadeh { 577ccece235SAlex Elder u32 snap_count; 57858c17b0eSAlex Elder size_t len; 579d2bb24e5SAlex Elder size_t size; 580621901d6SAlex Elder u32 i; 581602adf40SYehuda Sadeh 5826a52325fSAlex Elder memset(header, 0, sizeof (*header)); 5836a52325fSAlex Elder 584103a150fSAlex Elder snap_count = le32_to_cpu(ondisk->snap_count); 585103a150fSAlex Elder 58658c17b0eSAlex Elder len = strnlen(ondisk->object_prefix, sizeof (ondisk->object_prefix)); 58758c17b0eSAlex Elder header->object_prefix = kmalloc(len + 1, GFP_KERNEL); 5886a52325fSAlex Elder if (!header->object_prefix) 589602adf40SYehuda Sadeh return -ENOMEM; 59058c17b0eSAlex Elder memcpy(header->object_prefix, ondisk->object_prefix, len); 59158c17b0eSAlex Elder header->object_prefix[len] = '\0'; 59200f1f36fSAlex Elder 593602adf40SYehuda Sadeh if (snap_count) { 594f785cc1dSAlex Elder u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len); 595f785cc1dSAlex Elder 596621901d6SAlex Elder /* Save a copy of the snapshot names */ 597621901d6SAlex Elder 598f785cc1dSAlex Elder if (snap_names_len > (u64) SIZE_MAX) 599f785cc1dSAlex Elder return -EIO; 600f785cc1dSAlex Elder header->snap_names = kmalloc(snap_names_len, GFP_KERNEL); 601602adf40SYehuda Sadeh if (!header->snap_names) 6026a52325fSAlex Elder goto out_err; 603f785cc1dSAlex Elder /* 604f785cc1dSAlex Elder * Note that rbd_dev_v1_header_read() guarantees 605f785cc1dSAlex Elder * the ondisk buffer we're working with has 606f785cc1dSAlex Elder * snap_names_len bytes beyond the end of the 607f785cc1dSAlex Elder * snapshot id array, this memcpy() is safe. 608f785cc1dSAlex Elder */ 609f785cc1dSAlex Elder memcpy(header->snap_names, &ondisk->snaps[snap_count], 610f785cc1dSAlex Elder snap_names_len); 6116a52325fSAlex Elder 612621901d6SAlex Elder /* Record each snapshot's size */ 613621901d6SAlex Elder 614d2bb24e5SAlex Elder size = snap_count * sizeof (*header->snap_sizes); 615d2bb24e5SAlex Elder header->snap_sizes = kmalloc(size, GFP_KERNEL); 616602adf40SYehuda Sadeh if (!header->snap_sizes) 6176a52325fSAlex Elder goto out_err; 618621901d6SAlex Elder for (i = 0; i < snap_count; i++) 619621901d6SAlex Elder header->snap_sizes[i] = 620621901d6SAlex Elder le64_to_cpu(ondisk->snaps[i].image_size); 621602adf40SYehuda Sadeh } else { 622ccece235SAlex Elder WARN_ON(ondisk->snap_names_len); 623602adf40SYehuda Sadeh header->snap_names = NULL; 624602adf40SYehuda Sadeh header->snap_sizes = NULL; 625602adf40SYehuda Sadeh } 626849b4260SAlex Elder 62734b13184SAlex Elder header->features = 0; /* No features support in v1 images */ 628602adf40SYehuda Sadeh header->obj_order = ondisk->options.order; 629602adf40SYehuda Sadeh header->crypt_type = ondisk->options.crypt_type; 630602adf40SYehuda Sadeh header->comp_type = ondisk->options.comp_type; 6316a52325fSAlex Elder 632621901d6SAlex Elder /* Allocate and fill in the snapshot context */ 633621901d6SAlex Elder 634f84344f3SAlex Elder header->image_size = le64_to_cpu(ondisk->image_size); 6356a52325fSAlex Elder size = sizeof (struct ceph_snap_context); 6366a52325fSAlex Elder size += snap_count * sizeof (header->snapc->snaps[0]); 6376a52325fSAlex Elder header->snapc = kzalloc(size, GFP_KERNEL); 6386a52325fSAlex Elder if (!header->snapc) 6396a52325fSAlex Elder goto out_err; 640602adf40SYehuda Sadeh 641602adf40SYehuda Sadeh atomic_set(&header->snapc->nref, 1); 642505cbb9bSAlex Elder header->snapc->seq = le64_to_cpu(ondisk->snap_seq); 643602adf40SYehuda Sadeh header->snapc->num_snaps = snap_count; 644621901d6SAlex Elder for (i = 0; i < snap_count; i++) 645602adf40SYehuda Sadeh header->snapc->snaps[i] = 646602adf40SYehuda Sadeh le64_to_cpu(ondisk->snaps[i].id); 647602adf40SYehuda Sadeh 648602adf40SYehuda Sadeh return 0; 649602adf40SYehuda Sadeh 6506a52325fSAlex Elder out_err: 651849b4260SAlex Elder kfree(header->snap_sizes); 652ccece235SAlex Elder header->snap_sizes = NULL; 653602adf40SYehuda Sadeh kfree(header->snap_names); 654ccece235SAlex Elder header->snap_names = NULL; 6556a52325fSAlex Elder kfree(header->object_prefix); 6566a52325fSAlex Elder header->object_prefix = NULL; 657ccece235SAlex Elder 65800f1f36fSAlex Elder return -ENOMEM; 659602adf40SYehuda Sadeh } 660602adf40SYehuda Sadeh 6618836b995SAlex Elder static int snap_by_name(struct rbd_device *rbd_dev, const char *snap_name) 662602adf40SYehuda Sadeh { 663602adf40SYehuda Sadeh 664e86924a8SAlex Elder struct rbd_snap *snap; 66500f1f36fSAlex Elder 666e86924a8SAlex Elder list_for_each_entry(snap, &rbd_dev->snaps, node) { 667e86924a8SAlex Elder if (!strcmp(snap_name, snap->name)) { 6680d7dbfceSAlex Elder rbd_dev->spec->snap_id = snap->id; 669e86924a8SAlex Elder rbd_dev->mapping.size = snap->size; 67034b13184SAlex Elder rbd_dev->mapping.features = snap->features; 67100f1f36fSAlex Elder 672e86924a8SAlex Elder return 0; 673602adf40SYehuda Sadeh } 67400f1f36fSAlex Elder } 675e86924a8SAlex Elder 67600f1f36fSAlex Elder return -ENOENT; 67700f1f36fSAlex Elder } 678602adf40SYehuda Sadeh 679819d52bfSAlex Elder static int rbd_dev_set_mapping(struct rbd_device *rbd_dev) 680602adf40SYehuda Sadeh { 68178dc447dSAlex Elder int ret; 682602adf40SYehuda Sadeh 6830d7dbfceSAlex Elder if (!memcmp(rbd_dev->spec->snap_name, RBD_SNAP_HEAD_NAME, 684cc9d734cSJosh Durgin sizeof (RBD_SNAP_HEAD_NAME))) { 6850d7dbfceSAlex Elder rbd_dev->spec->snap_id = CEPH_NOSNAP; 68699c1f08fSAlex Elder rbd_dev->mapping.size = rbd_dev->header.image_size; 68734b13184SAlex Elder rbd_dev->mapping.features = rbd_dev->header.features; 688e86924a8SAlex Elder ret = 0; 689602adf40SYehuda Sadeh } else { 6900d7dbfceSAlex Elder ret = snap_by_name(rbd_dev, rbd_dev->spec->snap_name); 691602adf40SYehuda Sadeh if (ret < 0) 692602adf40SYehuda Sadeh goto done; 693f84344f3SAlex Elder rbd_dev->mapping.read_only = true; 694602adf40SYehuda Sadeh } 695daba5fdbSAlex Elder rbd_dev->exists = true; 696602adf40SYehuda Sadeh done: 697602adf40SYehuda Sadeh return ret; 698602adf40SYehuda Sadeh } 699602adf40SYehuda Sadeh 700602adf40SYehuda Sadeh static void rbd_header_free(struct rbd_image_header *header) 701602adf40SYehuda Sadeh { 702849b4260SAlex Elder kfree(header->object_prefix); 703d78fd7aeSAlex Elder header->object_prefix = NULL; 704602adf40SYehuda Sadeh kfree(header->snap_sizes); 705d78fd7aeSAlex Elder header->snap_sizes = NULL; 706849b4260SAlex Elder kfree(header->snap_names); 707d78fd7aeSAlex Elder header->snap_names = NULL; 708d1d25646SJosh Durgin ceph_put_snap_context(header->snapc); 709d78fd7aeSAlex Elder header->snapc = NULL; 710602adf40SYehuda Sadeh } 711602adf40SYehuda Sadeh 71265ccfe21SAlex Elder static char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset) 713602adf40SYehuda Sadeh { 71465ccfe21SAlex Elder char *name; 71565ccfe21SAlex Elder u64 segment; 71665ccfe21SAlex Elder int ret; 717602adf40SYehuda Sadeh 71865ccfe21SAlex Elder name = kmalloc(RBD_MAX_SEG_NAME_LEN + 1, GFP_NOIO); 71965ccfe21SAlex Elder if (!name) 72065ccfe21SAlex Elder return NULL; 72165ccfe21SAlex Elder segment = offset >> rbd_dev->header.obj_order; 72265ccfe21SAlex Elder ret = snprintf(name, RBD_MAX_SEG_NAME_LEN, "%s.%012llx", 72365ccfe21SAlex Elder rbd_dev->header.object_prefix, segment); 72465ccfe21SAlex Elder if (ret < 0 || ret >= RBD_MAX_SEG_NAME_LEN) { 72565ccfe21SAlex Elder pr_err("error formatting segment name for #%llu (%d)\n", 72665ccfe21SAlex Elder segment, ret); 72765ccfe21SAlex Elder kfree(name); 72865ccfe21SAlex Elder name = NULL; 72965ccfe21SAlex Elder } 730602adf40SYehuda Sadeh 73165ccfe21SAlex Elder return name; 73265ccfe21SAlex Elder } 733602adf40SYehuda Sadeh 73465ccfe21SAlex Elder static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset) 73565ccfe21SAlex Elder { 73665ccfe21SAlex Elder u64 segment_size = (u64) 1 << rbd_dev->header.obj_order; 737602adf40SYehuda Sadeh 73865ccfe21SAlex Elder return offset & (segment_size - 1); 73965ccfe21SAlex Elder } 74065ccfe21SAlex Elder 74165ccfe21SAlex Elder static u64 rbd_segment_length(struct rbd_device *rbd_dev, 74265ccfe21SAlex Elder u64 offset, u64 length) 74365ccfe21SAlex Elder { 74465ccfe21SAlex Elder u64 segment_size = (u64) 1 << rbd_dev->header.obj_order; 74565ccfe21SAlex Elder 74665ccfe21SAlex Elder offset &= segment_size - 1; 74765ccfe21SAlex Elder 748aafb230eSAlex Elder rbd_assert(length <= U64_MAX - offset); 74965ccfe21SAlex Elder if (offset + length > segment_size) 75065ccfe21SAlex Elder length = segment_size - offset; 75165ccfe21SAlex Elder 75265ccfe21SAlex Elder return length; 753602adf40SYehuda Sadeh } 754602adf40SYehuda Sadeh 7551fec7093SYehuda Sadeh static int rbd_get_num_segments(struct rbd_image_header *header, 7561fec7093SYehuda Sadeh u64 ofs, u64 len) 7571fec7093SYehuda Sadeh { 758df111be6SAlex Elder u64 start_seg; 759df111be6SAlex Elder u64 end_seg; 760df111be6SAlex Elder 761df111be6SAlex Elder if (!len) 762df111be6SAlex Elder return 0; 763df111be6SAlex Elder if (len - 1 > U64_MAX - ofs) 764df111be6SAlex Elder return -ERANGE; 765df111be6SAlex Elder 766df111be6SAlex Elder start_seg = ofs >> header->obj_order; 767df111be6SAlex Elder end_seg = (ofs + len - 1) >> header->obj_order; 768df111be6SAlex Elder 7691fec7093SYehuda Sadeh return end_seg - start_seg + 1; 7701fec7093SYehuda Sadeh } 7711fec7093SYehuda Sadeh 772602adf40SYehuda Sadeh /* 773029bcbd8SJosh Durgin * returns the size of an object in the image 774029bcbd8SJosh Durgin */ 775029bcbd8SJosh Durgin static u64 rbd_obj_bytes(struct rbd_image_header *header) 776029bcbd8SJosh Durgin { 777029bcbd8SJosh Durgin return 1 << header->obj_order; 778029bcbd8SJosh Durgin } 779029bcbd8SJosh Durgin 780029bcbd8SJosh Durgin /* 781602adf40SYehuda Sadeh * bio helpers 782602adf40SYehuda Sadeh */ 783602adf40SYehuda Sadeh 784602adf40SYehuda Sadeh static void bio_chain_put(struct bio *chain) 785602adf40SYehuda Sadeh { 786602adf40SYehuda Sadeh struct bio *tmp; 787602adf40SYehuda Sadeh 788602adf40SYehuda Sadeh while (chain) { 789602adf40SYehuda Sadeh tmp = chain; 790602adf40SYehuda Sadeh chain = chain->bi_next; 791602adf40SYehuda Sadeh bio_put(tmp); 792602adf40SYehuda Sadeh } 793602adf40SYehuda Sadeh } 794602adf40SYehuda Sadeh 795602adf40SYehuda Sadeh /* 796602adf40SYehuda Sadeh * zeros a bio chain, starting at specific offset 797602adf40SYehuda Sadeh */ 798602adf40SYehuda Sadeh static void zero_bio_chain(struct bio *chain, int start_ofs) 799602adf40SYehuda Sadeh { 800602adf40SYehuda Sadeh struct bio_vec *bv; 801602adf40SYehuda Sadeh unsigned long flags; 802602adf40SYehuda Sadeh void *buf; 803602adf40SYehuda Sadeh int i; 804602adf40SYehuda Sadeh int pos = 0; 805602adf40SYehuda Sadeh 806602adf40SYehuda Sadeh while (chain) { 807602adf40SYehuda Sadeh bio_for_each_segment(bv, chain, i) { 808602adf40SYehuda Sadeh if (pos + bv->bv_len > start_ofs) { 809602adf40SYehuda Sadeh int remainder = max(start_ofs - pos, 0); 810602adf40SYehuda Sadeh buf = bvec_kmap_irq(bv, &flags); 811602adf40SYehuda Sadeh memset(buf + remainder, 0, 812602adf40SYehuda Sadeh bv->bv_len - remainder); 81385b5aaa6SDan Carpenter bvec_kunmap_irq(buf, &flags); 814602adf40SYehuda Sadeh } 815602adf40SYehuda Sadeh pos += bv->bv_len; 816602adf40SYehuda Sadeh } 817602adf40SYehuda Sadeh 818602adf40SYehuda Sadeh chain = chain->bi_next; 819602adf40SYehuda Sadeh } 820602adf40SYehuda Sadeh } 821602adf40SYehuda Sadeh 822602adf40SYehuda Sadeh /* 823f7760dadSAlex Elder * Clone a portion of a bio, starting at the given byte offset 824f7760dadSAlex Elder * and continuing for the number of bytes indicated. 825602adf40SYehuda Sadeh */ 826f7760dadSAlex Elder static struct bio *bio_clone_range(struct bio *bio_src, 827f7760dadSAlex Elder unsigned int offset, 828f7760dadSAlex Elder unsigned int len, 829f7760dadSAlex Elder gfp_t gfpmask) 830602adf40SYehuda Sadeh { 831f7760dadSAlex Elder struct bio_vec *bv; 832f7760dadSAlex Elder unsigned int resid; 833f7760dadSAlex Elder unsigned short idx; 834f7760dadSAlex Elder unsigned int voff; 835f7760dadSAlex Elder unsigned short end_idx; 836f7760dadSAlex Elder unsigned short vcnt; 837f7760dadSAlex Elder struct bio *bio; 838602adf40SYehuda Sadeh 839f7760dadSAlex Elder /* Handle the easy case for the caller */ 840f7760dadSAlex Elder 841f7760dadSAlex Elder if (!offset && len == bio_src->bi_size) 842f7760dadSAlex Elder return bio_clone(bio_src, gfpmask); 843f7760dadSAlex Elder 844f7760dadSAlex Elder if (WARN_ON_ONCE(!len)) 845f7760dadSAlex Elder return NULL; 846f7760dadSAlex Elder if (WARN_ON_ONCE(len > bio_src->bi_size)) 847f7760dadSAlex Elder return NULL; 848f7760dadSAlex Elder if (WARN_ON_ONCE(offset > bio_src->bi_size - len)) 849f7760dadSAlex Elder return NULL; 850f7760dadSAlex Elder 851f7760dadSAlex Elder /* Find first affected segment... */ 852f7760dadSAlex Elder 853f7760dadSAlex Elder resid = offset; 854f7760dadSAlex Elder __bio_for_each_segment(bv, bio_src, idx, 0) { 855f7760dadSAlex Elder if (resid < bv->bv_len) 856f7760dadSAlex Elder break; 857f7760dadSAlex Elder resid -= bv->bv_len; 858602adf40SYehuda Sadeh } 859f7760dadSAlex Elder voff = resid; 860602adf40SYehuda Sadeh 861f7760dadSAlex Elder /* ...and the last affected segment */ 862542582fcSAlex Elder 863f7760dadSAlex Elder resid += len; 864f7760dadSAlex Elder __bio_for_each_segment(bv, bio_src, end_idx, idx) { 865f7760dadSAlex Elder if (resid <= bv->bv_len) 866f7760dadSAlex Elder break; 867f7760dadSAlex Elder resid -= bv->bv_len; 868f7760dadSAlex Elder } 869f7760dadSAlex Elder vcnt = end_idx - idx + 1; 870602adf40SYehuda Sadeh 871f7760dadSAlex Elder /* Build the clone */ 872f7760dadSAlex Elder 873f7760dadSAlex Elder bio = bio_alloc(gfpmask, (unsigned int) vcnt); 874f7760dadSAlex Elder if (!bio) 875f7760dadSAlex Elder return NULL; /* ENOMEM */ 876f7760dadSAlex Elder 877f7760dadSAlex Elder bio->bi_bdev = bio_src->bi_bdev; 878f7760dadSAlex Elder bio->bi_sector = bio_src->bi_sector + (offset >> SECTOR_SHIFT); 879f7760dadSAlex Elder bio->bi_rw = bio_src->bi_rw; 880f7760dadSAlex Elder bio->bi_flags |= 1 << BIO_CLONED; 881602adf40SYehuda Sadeh 882602adf40SYehuda Sadeh /* 883f7760dadSAlex Elder * Copy over our part of the bio_vec, then update the first 884f7760dadSAlex Elder * and last (or only) entries. 885602adf40SYehuda Sadeh */ 886f7760dadSAlex Elder memcpy(&bio->bi_io_vec[0], &bio_src->bi_io_vec[idx], 887f7760dadSAlex Elder vcnt * sizeof (struct bio_vec)); 888f7760dadSAlex Elder bio->bi_io_vec[0].bv_offset += voff; 889f7760dadSAlex Elder if (vcnt > 1) { 890f7760dadSAlex Elder bio->bi_io_vec[0].bv_len -= voff; 891f7760dadSAlex Elder bio->bi_io_vec[vcnt - 1].bv_len = resid; 892602adf40SYehuda Sadeh } else { 893f7760dadSAlex Elder bio->bi_io_vec[0].bv_len = len; 894602adf40SYehuda Sadeh } 895602adf40SYehuda Sadeh 896f7760dadSAlex Elder bio->bi_vcnt = vcnt; 897f7760dadSAlex Elder bio->bi_size = len; 898f7760dadSAlex Elder bio->bi_idx = 0; 899602adf40SYehuda Sadeh 900f7760dadSAlex Elder return bio; 901602adf40SYehuda Sadeh } 902602adf40SYehuda Sadeh 903f7760dadSAlex Elder /* 904f7760dadSAlex Elder * Clone a portion of a bio chain, starting at the given byte offset 905f7760dadSAlex Elder * into the first bio in the source chain and continuing for the 906f7760dadSAlex Elder * number of bytes indicated. The result is another bio chain of 907f7760dadSAlex Elder * exactly the given length, or a null pointer on error. 908f7760dadSAlex Elder * 909f7760dadSAlex Elder * The bio_src and offset parameters are both in-out. On entry they 910f7760dadSAlex Elder * refer to the first source bio and the offset into that bio where 911f7760dadSAlex Elder * the start of data to be cloned is located. 912f7760dadSAlex Elder * 913f7760dadSAlex Elder * On return, bio_src is updated to refer to the bio in the source 914f7760dadSAlex Elder * chain that contains first un-cloned byte, and *offset will 915f7760dadSAlex Elder * contain the offset of that byte within that bio. 916f7760dadSAlex Elder */ 917f7760dadSAlex Elder static struct bio *bio_chain_clone_range(struct bio **bio_src, 918f7760dadSAlex Elder unsigned int *offset, 919f7760dadSAlex Elder unsigned int len, 920f7760dadSAlex Elder gfp_t gfpmask) 921f7760dadSAlex Elder { 922f7760dadSAlex Elder struct bio *bi = *bio_src; 923f7760dadSAlex Elder unsigned int off = *offset; 924f7760dadSAlex Elder struct bio *chain = NULL; 925f7760dadSAlex Elder struct bio **end; 926602adf40SYehuda Sadeh 927f7760dadSAlex Elder /* Build up a chain of clone bios up to the limit */ 928602adf40SYehuda Sadeh 929f7760dadSAlex Elder if (!bi || off >= bi->bi_size || !len) 930f7760dadSAlex Elder return NULL; /* Nothing to clone */ 931602adf40SYehuda Sadeh 932f7760dadSAlex Elder end = &chain; 933f7760dadSAlex Elder while (len) { 934f7760dadSAlex Elder unsigned int bi_size; 935f7760dadSAlex Elder struct bio *bio; 936f7760dadSAlex Elder 937f7760dadSAlex Elder if (!bi) 938f7760dadSAlex Elder goto out_err; /* EINVAL; ran out of bio's */ 939f7760dadSAlex Elder bi_size = min_t(unsigned int, bi->bi_size - off, len); 940f7760dadSAlex Elder bio = bio_clone_range(bi, off, bi_size, gfpmask); 941f7760dadSAlex Elder if (!bio) 942f7760dadSAlex Elder goto out_err; /* ENOMEM */ 943f7760dadSAlex Elder 944f7760dadSAlex Elder *end = bio; 945f7760dadSAlex Elder end = &bio->bi_next; 946f7760dadSAlex Elder 947f7760dadSAlex Elder off += bi_size; 948f7760dadSAlex Elder if (off == bi->bi_size) { 949f7760dadSAlex Elder bi = bi->bi_next; 950f7760dadSAlex Elder off = 0; 951f7760dadSAlex Elder } 952f7760dadSAlex Elder len -= bi_size; 953f7760dadSAlex Elder } 954f7760dadSAlex Elder *bio_src = bi; 955f7760dadSAlex Elder *offset = off; 956f7760dadSAlex Elder 957f7760dadSAlex Elder return chain; 958f7760dadSAlex Elder out_err: 959f7760dadSAlex Elder bio_chain_put(chain); 960f7760dadSAlex Elder 961602adf40SYehuda Sadeh return NULL; 962602adf40SYehuda Sadeh } 963602adf40SYehuda Sadeh 964602adf40SYehuda Sadeh /* 965602adf40SYehuda Sadeh * helpers for osd request op vectors. 966602adf40SYehuda Sadeh */ 96757cfc106SAlex Elder static struct ceph_osd_req_op *rbd_create_rw_ops(int num_ops, 96857cfc106SAlex Elder int opcode, u32 payload_len) 969602adf40SYehuda Sadeh { 97057cfc106SAlex Elder struct ceph_osd_req_op *ops; 97157cfc106SAlex Elder 97257cfc106SAlex Elder ops = kzalloc(sizeof (*ops) * (num_ops + 1), GFP_NOIO); 97357cfc106SAlex Elder if (!ops) 97457cfc106SAlex Elder return NULL; 97557cfc106SAlex Elder 97657cfc106SAlex Elder ops[0].op = opcode; 97757cfc106SAlex Elder 978602adf40SYehuda Sadeh /* 979602adf40SYehuda Sadeh * op extent offset and length will be set later on 980602adf40SYehuda Sadeh * in calc_raw_layout() 981602adf40SYehuda Sadeh */ 98257cfc106SAlex Elder ops[0].payload_len = payload_len; 98357cfc106SAlex Elder 98457cfc106SAlex Elder return ops; 985602adf40SYehuda Sadeh } 986602adf40SYehuda Sadeh 987602adf40SYehuda Sadeh static void rbd_destroy_ops(struct ceph_osd_req_op *ops) 988602adf40SYehuda Sadeh { 989602adf40SYehuda Sadeh kfree(ops); 990602adf40SYehuda Sadeh } 991602adf40SYehuda Sadeh 9921fec7093SYehuda Sadeh static void rbd_coll_end_req_index(struct request *rq, 9931fec7093SYehuda Sadeh struct rbd_req_coll *coll, 9941fec7093SYehuda Sadeh int index, 9951fec7093SYehuda Sadeh int ret, u64 len) 9961fec7093SYehuda Sadeh { 9971fec7093SYehuda Sadeh struct request_queue *q; 9981fec7093SYehuda Sadeh int min, max, i; 9991fec7093SYehuda Sadeh 1000bd919d45SAlex Elder dout("rbd_coll_end_req_index %p index %d ret %d len %llu\n", 1001bd919d45SAlex Elder coll, index, ret, (unsigned long long) len); 10021fec7093SYehuda Sadeh 10031fec7093SYehuda Sadeh if (!rq) 10041fec7093SYehuda Sadeh return; 10051fec7093SYehuda Sadeh 10061fec7093SYehuda Sadeh if (!coll) { 10071fec7093SYehuda Sadeh blk_end_request(rq, ret, len); 10081fec7093SYehuda Sadeh return; 10091fec7093SYehuda Sadeh } 10101fec7093SYehuda Sadeh 10111fec7093SYehuda Sadeh q = rq->q; 10121fec7093SYehuda Sadeh 10131fec7093SYehuda Sadeh spin_lock_irq(q->queue_lock); 10141fec7093SYehuda Sadeh coll->status[index].done = 1; 10151fec7093SYehuda Sadeh coll->status[index].rc = ret; 10161fec7093SYehuda Sadeh coll->status[index].bytes = len; 10171fec7093SYehuda Sadeh max = min = coll->num_done; 10181fec7093SYehuda Sadeh while (max < coll->total && coll->status[max].done) 10191fec7093SYehuda Sadeh max++; 10201fec7093SYehuda Sadeh 10211fec7093SYehuda Sadeh for (i = min; i<max; i++) { 10221fec7093SYehuda Sadeh __blk_end_request(rq, coll->status[i].rc, 10231fec7093SYehuda Sadeh coll->status[i].bytes); 10241fec7093SYehuda Sadeh coll->num_done++; 10251fec7093SYehuda Sadeh kref_put(&coll->kref, rbd_coll_release); 10261fec7093SYehuda Sadeh } 10271fec7093SYehuda Sadeh spin_unlock_irq(q->queue_lock); 10281fec7093SYehuda Sadeh } 10291fec7093SYehuda Sadeh 10301fec7093SYehuda Sadeh static void rbd_coll_end_req(struct rbd_request *req, 10311fec7093SYehuda Sadeh int ret, u64 len) 10321fec7093SYehuda Sadeh { 10331fec7093SYehuda Sadeh rbd_coll_end_req_index(req->rq, req->coll, req->coll_index, ret, len); 10341fec7093SYehuda Sadeh } 10351fec7093SYehuda Sadeh 1036602adf40SYehuda Sadeh /* 1037602adf40SYehuda Sadeh * Send ceph osd request 1038602adf40SYehuda Sadeh */ 1039602adf40SYehuda Sadeh static int rbd_do_request(struct request *rq, 10400ce1a794SAlex Elder struct rbd_device *rbd_dev, 1041602adf40SYehuda Sadeh struct ceph_snap_context *snapc, 1042602adf40SYehuda Sadeh u64 snapid, 1043aded07eaSAlex Elder const char *object_name, u64 ofs, u64 len, 1044602adf40SYehuda Sadeh struct bio *bio, 1045602adf40SYehuda Sadeh struct page **pages, 1046602adf40SYehuda Sadeh int num_pages, 1047602adf40SYehuda Sadeh int flags, 1048602adf40SYehuda Sadeh struct ceph_osd_req_op *ops, 10491fec7093SYehuda Sadeh struct rbd_req_coll *coll, 10501fec7093SYehuda Sadeh int coll_index, 1051602adf40SYehuda Sadeh void (*rbd_cb)(struct ceph_osd_request *req, 105259c2be1eSYehuda Sadeh struct ceph_msg *msg), 105359c2be1eSYehuda Sadeh struct ceph_osd_request **linger_req, 105459c2be1eSYehuda Sadeh u64 *ver) 1055602adf40SYehuda Sadeh { 1056602adf40SYehuda Sadeh struct ceph_osd_request *req; 1057602adf40SYehuda Sadeh struct ceph_file_layout *layout; 1058602adf40SYehuda Sadeh int ret; 1059602adf40SYehuda Sadeh u64 bno; 1060602adf40SYehuda Sadeh struct timespec mtime = CURRENT_TIME; 1061602adf40SYehuda Sadeh struct rbd_request *req_data; 1062602adf40SYehuda Sadeh struct ceph_osd_request_head *reqhead; 10631dbb4399SAlex Elder struct ceph_osd_client *osdc; 1064602adf40SYehuda Sadeh 1065602adf40SYehuda Sadeh req_data = kzalloc(sizeof(*req_data), GFP_NOIO); 10661fec7093SYehuda Sadeh if (!req_data) { 10671fec7093SYehuda Sadeh if (coll) 10681fec7093SYehuda Sadeh rbd_coll_end_req_index(rq, coll, coll_index, 10691fec7093SYehuda Sadeh -ENOMEM, len); 10701fec7093SYehuda Sadeh return -ENOMEM; 10711fec7093SYehuda Sadeh } 1072602adf40SYehuda Sadeh 10731fec7093SYehuda Sadeh if (coll) { 10741fec7093SYehuda Sadeh req_data->coll = coll; 10751fec7093SYehuda Sadeh req_data->coll_index = coll_index; 10761fec7093SYehuda Sadeh } 10771fec7093SYehuda Sadeh 1078f7760dadSAlex Elder dout("rbd_do_request object_name=%s ofs=%llu len=%llu coll=%p[%d]\n", 1079f7760dadSAlex Elder object_name, (unsigned long long) ofs, 1080f7760dadSAlex Elder (unsigned long long) len, coll, coll_index); 1081602adf40SYehuda Sadeh 10820ce1a794SAlex Elder osdc = &rbd_dev->rbd_client->client->osdc; 10831dbb4399SAlex Elder req = ceph_osdc_alloc_request(osdc, flags, snapc, ops, 10841dbb4399SAlex Elder false, GFP_NOIO, pages, bio); 10854ad12621SSage Weil if (!req) { 10864ad12621SSage Weil ret = -ENOMEM; 1087602adf40SYehuda Sadeh goto done_pages; 1088602adf40SYehuda Sadeh } 1089602adf40SYehuda Sadeh 1090602adf40SYehuda Sadeh req->r_callback = rbd_cb; 1091602adf40SYehuda Sadeh 1092602adf40SYehuda Sadeh req_data->rq = rq; 1093602adf40SYehuda Sadeh req_data->bio = bio; 1094602adf40SYehuda Sadeh req_data->pages = pages; 1095602adf40SYehuda Sadeh req_data->len = len; 1096602adf40SYehuda Sadeh 1097602adf40SYehuda Sadeh req->r_priv = req_data; 1098602adf40SYehuda Sadeh 1099602adf40SYehuda Sadeh reqhead = req->r_request->front.iov_base; 1100602adf40SYehuda Sadeh reqhead->snapid = cpu_to_le64(CEPH_NOSNAP); 1101602adf40SYehuda Sadeh 1102aded07eaSAlex Elder strncpy(req->r_oid, object_name, sizeof(req->r_oid)); 1103602adf40SYehuda Sadeh req->r_oid_len = strlen(req->r_oid); 1104602adf40SYehuda Sadeh 1105602adf40SYehuda Sadeh layout = &req->r_file_layout; 1106602adf40SYehuda Sadeh memset(layout, 0, sizeof(*layout)); 1107602adf40SYehuda Sadeh layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER); 1108602adf40SYehuda Sadeh layout->fl_stripe_count = cpu_to_le32(1); 1109602adf40SYehuda Sadeh layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER); 11100d7dbfceSAlex Elder layout->fl_pg_pool = cpu_to_le32((int) rbd_dev->spec->pool_id); 11116cae3717SSage Weil ret = ceph_calc_raw_layout(osdc, layout, snapid, ofs, &len, &bno, 11121dbb4399SAlex Elder req, ops); 11136cae3717SSage Weil rbd_assert(ret == 0); 1114602adf40SYehuda Sadeh 1115602adf40SYehuda Sadeh ceph_osdc_build_request(req, ofs, &len, 1116602adf40SYehuda Sadeh ops, 1117602adf40SYehuda Sadeh snapc, 1118602adf40SYehuda Sadeh &mtime, 1119602adf40SYehuda Sadeh req->r_oid, req->r_oid_len); 1120602adf40SYehuda Sadeh 112159c2be1eSYehuda Sadeh if (linger_req) { 11221dbb4399SAlex Elder ceph_osdc_set_request_linger(osdc, req); 112359c2be1eSYehuda Sadeh *linger_req = req; 112459c2be1eSYehuda Sadeh } 112559c2be1eSYehuda Sadeh 11261dbb4399SAlex Elder ret = ceph_osdc_start_request(osdc, req, false); 1127602adf40SYehuda Sadeh if (ret < 0) 1128602adf40SYehuda Sadeh goto done_err; 1129602adf40SYehuda Sadeh 1130602adf40SYehuda Sadeh if (!rbd_cb) { 11311dbb4399SAlex Elder ret = ceph_osdc_wait_request(osdc, req); 113259c2be1eSYehuda Sadeh if (ver) 113359c2be1eSYehuda Sadeh *ver = le64_to_cpu(req->r_reassert_version.version); 1134bd919d45SAlex Elder dout("reassert_ver=%llu\n", 1135bd919d45SAlex Elder (unsigned long long) 11361fec7093SYehuda Sadeh le64_to_cpu(req->r_reassert_version.version)); 1137602adf40SYehuda Sadeh ceph_osdc_put_request(req); 1138602adf40SYehuda Sadeh } 1139602adf40SYehuda Sadeh return ret; 1140602adf40SYehuda Sadeh 1141602adf40SYehuda Sadeh done_err: 1142602adf40SYehuda Sadeh bio_chain_put(req_data->bio); 1143602adf40SYehuda Sadeh ceph_osdc_put_request(req); 1144602adf40SYehuda Sadeh done_pages: 11451fec7093SYehuda Sadeh rbd_coll_end_req(req_data, ret, len); 1146602adf40SYehuda Sadeh kfree(req_data); 1147602adf40SYehuda Sadeh return ret; 1148602adf40SYehuda Sadeh } 1149602adf40SYehuda Sadeh 1150602adf40SYehuda Sadeh /* 1151602adf40SYehuda Sadeh * Ceph osd op callback 1152602adf40SYehuda Sadeh */ 1153602adf40SYehuda Sadeh static void rbd_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg) 1154602adf40SYehuda Sadeh { 1155602adf40SYehuda Sadeh struct rbd_request *req_data = req->r_priv; 1156602adf40SYehuda Sadeh struct ceph_osd_reply_head *replyhead; 1157602adf40SYehuda Sadeh struct ceph_osd_op *op; 1158602adf40SYehuda Sadeh __s32 rc; 1159602adf40SYehuda Sadeh u64 bytes; 1160602adf40SYehuda Sadeh int read_op; 1161602adf40SYehuda Sadeh 1162602adf40SYehuda Sadeh /* parse reply */ 1163602adf40SYehuda Sadeh replyhead = msg->front.iov_base; 1164602adf40SYehuda Sadeh WARN_ON(le32_to_cpu(replyhead->num_ops) == 0); 1165602adf40SYehuda Sadeh op = (void *)(replyhead + 1); 1166602adf40SYehuda Sadeh rc = le32_to_cpu(replyhead->result); 1167602adf40SYehuda Sadeh bytes = le64_to_cpu(op->extent.length); 1168895cfcc8SDan Carpenter read_op = (le16_to_cpu(op->op) == CEPH_OSD_OP_READ); 1169602adf40SYehuda Sadeh 1170bd919d45SAlex Elder dout("rbd_req_cb bytes=%llu readop=%d rc=%d\n", 1171bd919d45SAlex Elder (unsigned long long) bytes, read_op, (int) rc); 1172602adf40SYehuda Sadeh 1173602adf40SYehuda Sadeh if (rc == -ENOENT && read_op) { 1174602adf40SYehuda Sadeh zero_bio_chain(req_data->bio, 0); 1175602adf40SYehuda Sadeh rc = 0; 1176602adf40SYehuda Sadeh } else if (rc == 0 && read_op && bytes < req_data->len) { 1177602adf40SYehuda Sadeh zero_bio_chain(req_data->bio, bytes); 1178602adf40SYehuda Sadeh bytes = req_data->len; 1179602adf40SYehuda Sadeh } 1180602adf40SYehuda Sadeh 11811fec7093SYehuda Sadeh rbd_coll_end_req(req_data, rc, bytes); 1182602adf40SYehuda Sadeh 1183602adf40SYehuda Sadeh if (req_data->bio) 1184602adf40SYehuda Sadeh bio_chain_put(req_data->bio); 1185602adf40SYehuda Sadeh 1186602adf40SYehuda Sadeh ceph_osdc_put_request(req); 1187602adf40SYehuda Sadeh kfree(req_data); 1188602adf40SYehuda Sadeh } 1189602adf40SYehuda Sadeh 119059c2be1eSYehuda Sadeh static void rbd_simple_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg) 119159c2be1eSYehuda Sadeh { 119259c2be1eSYehuda Sadeh ceph_osdc_put_request(req); 119359c2be1eSYehuda Sadeh } 119459c2be1eSYehuda Sadeh 1195602adf40SYehuda Sadeh /* 1196602adf40SYehuda Sadeh * Do a synchronous ceph osd operation 1197602adf40SYehuda Sadeh */ 11980ce1a794SAlex Elder static int rbd_req_sync_op(struct rbd_device *rbd_dev, 1199602adf40SYehuda Sadeh struct ceph_snap_context *snapc, 1200602adf40SYehuda Sadeh u64 snapid, 1201602adf40SYehuda Sadeh int flags, 1202913d2fdcSAlex Elder struct ceph_osd_req_op *ops, 1203aded07eaSAlex Elder const char *object_name, 1204f8d4de6eSAlex Elder u64 ofs, u64 inbound_size, 1205f8d4de6eSAlex Elder char *inbound, 120659c2be1eSYehuda Sadeh struct ceph_osd_request **linger_req, 120759c2be1eSYehuda Sadeh u64 *ver) 1208602adf40SYehuda Sadeh { 1209602adf40SYehuda Sadeh int ret; 1210602adf40SYehuda Sadeh struct page **pages; 1211602adf40SYehuda Sadeh int num_pages; 1212913d2fdcSAlex Elder 1213aafb230eSAlex Elder rbd_assert(ops != NULL); 1214602adf40SYehuda Sadeh 1215f8d4de6eSAlex Elder num_pages = calc_pages_for(ofs, inbound_size); 1216602adf40SYehuda Sadeh pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL); 1217b8d0638aSDan Carpenter if (IS_ERR(pages)) 1218b8d0638aSDan Carpenter return PTR_ERR(pages); 1219602adf40SYehuda Sadeh 12200ce1a794SAlex Elder ret = rbd_do_request(NULL, rbd_dev, snapc, snapid, 1221f8d4de6eSAlex Elder object_name, ofs, inbound_size, NULL, 1222602adf40SYehuda Sadeh pages, num_pages, 1223602adf40SYehuda Sadeh flags, 1224602adf40SYehuda Sadeh ops, 12251fec7093SYehuda Sadeh NULL, 0, 122659c2be1eSYehuda Sadeh NULL, 122759c2be1eSYehuda Sadeh linger_req, ver); 1228602adf40SYehuda Sadeh if (ret < 0) 1229913d2fdcSAlex Elder goto done; 1230602adf40SYehuda Sadeh 1231f8d4de6eSAlex Elder if ((flags & CEPH_OSD_FLAG_READ) && inbound) 1232f8d4de6eSAlex Elder ret = ceph_copy_from_page_vector(pages, inbound, ofs, ret); 1233602adf40SYehuda Sadeh 1234602adf40SYehuda Sadeh done: 1235602adf40SYehuda Sadeh ceph_release_page_vector(pages, num_pages); 1236602adf40SYehuda Sadeh return ret; 1237602adf40SYehuda Sadeh } 1238602adf40SYehuda Sadeh 1239602adf40SYehuda Sadeh /* 1240602adf40SYehuda Sadeh * Do an asynchronous ceph osd operation 1241602adf40SYehuda Sadeh */ 1242602adf40SYehuda Sadeh static int rbd_do_op(struct request *rq, 1243602adf40SYehuda Sadeh struct rbd_device *rbd_dev, 1244602adf40SYehuda Sadeh struct ceph_snap_context *snapc, 1245602adf40SYehuda Sadeh u64 ofs, u64 len, 12461fec7093SYehuda Sadeh struct bio *bio, 12471fec7093SYehuda Sadeh struct rbd_req_coll *coll, 12481fec7093SYehuda Sadeh int coll_index) 1249602adf40SYehuda Sadeh { 1250602adf40SYehuda Sadeh char *seg_name; 1251602adf40SYehuda Sadeh u64 seg_ofs; 1252602adf40SYehuda Sadeh u64 seg_len; 1253602adf40SYehuda Sadeh int ret; 1254602adf40SYehuda Sadeh struct ceph_osd_req_op *ops; 1255602adf40SYehuda Sadeh u32 payload_len; 1256ff2e4bb5SAlex Elder int opcode; 1257ff2e4bb5SAlex Elder int flags; 12584634246dSAlex Elder u64 snapid; 1259602adf40SYehuda Sadeh 126065ccfe21SAlex Elder seg_name = rbd_segment_name(rbd_dev, ofs); 1261602adf40SYehuda Sadeh if (!seg_name) 1262602adf40SYehuda Sadeh return -ENOMEM; 126365ccfe21SAlex Elder seg_len = rbd_segment_length(rbd_dev, ofs, len); 126465ccfe21SAlex Elder seg_ofs = rbd_segment_offset(rbd_dev, ofs); 1265602adf40SYehuda Sadeh 1266ff2e4bb5SAlex Elder if (rq_data_dir(rq) == WRITE) { 1267ff2e4bb5SAlex Elder opcode = CEPH_OSD_OP_WRITE; 1268ff2e4bb5SAlex Elder flags = CEPH_OSD_FLAG_WRITE|CEPH_OSD_FLAG_ONDISK; 12694634246dSAlex Elder snapid = CEPH_NOSNAP; 1270ff2e4bb5SAlex Elder payload_len = seg_len; 1271ff2e4bb5SAlex Elder } else { 1272ff2e4bb5SAlex Elder opcode = CEPH_OSD_OP_READ; 1273ff2e4bb5SAlex Elder flags = CEPH_OSD_FLAG_READ; 12744634246dSAlex Elder snapc = NULL; 12750d7dbfceSAlex Elder snapid = rbd_dev->spec->snap_id; 1276ff2e4bb5SAlex Elder payload_len = 0; 1277ff2e4bb5SAlex Elder } 1278602adf40SYehuda Sadeh 127957cfc106SAlex Elder ret = -ENOMEM; 128057cfc106SAlex Elder ops = rbd_create_rw_ops(1, opcode, payload_len); 128157cfc106SAlex Elder if (!ops) 1282602adf40SYehuda Sadeh goto done; 1283602adf40SYehuda Sadeh 1284602adf40SYehuda Sadeh /* we've taken care of segment sizes earlier when we 1285602adf40SYehuda Sadeh cloned the bios. We should never have a segment 1286602adf40SYehuda Sadeh truncated at this point */ 1287aafb230eSAlex Elder rbd_assert(seg_len == len); 1288602adf40SYehuda Sadeh 1289602adf40SYehuda Sadeh ret = rbd_do_request(rq, rbd_dev, snapc, snapid, 1290602adf40SYehuda Sadeh seg_name, seg_ofs, seg_len, 1291602adf40SYehuda Sadeh bio, 1292602adf40SYehuda Sadeh NULL, 0, 1293602adf40SYehuda Sadeh flags, 1294602adf40SYehuda Sadeh ops, 12951fec7093SYehuda Sadeh coll, coll_index, 129659c2be1eSYehuda Sadeh rbd_req_cb, 0, NULL); 129711f77002SSage Weil 129811f77002SSage Weil rbd_destroy_ops(ops); 1299602adf40SYehuda Sadeh done: 1300602adf40SYehuda Sadeh kfree(seg_name); 1301602adf40SYehuda Sadeh return ret; 1302602adf40SYehuda Sadeh } 1303602adf40SYehuda Sadeh 1304602adf40SYehuda Sadeh /* 1305602adf40SYehuda Sadeh * Request sync osd read 1306602adf40SYehuda Sadeh */ 13070ce1a794SAlex Elder static int rbd_req_sync_read(struct rbd_device *rbd_dev, 1308602adf40SYehuda Sadeh u64 snapid, 1309aded07eaSAlex Elder const char *object_name, 1310602adf40SYehuda Sadeh u64 ofs, u64 len, 131159c2be1eSYehuda Sadeh char *buf, 131259c2be1eSYehuda Sadeh u64 *ver) 1313602adf40SYehuda Sadeh { 1314913d2fdcSAlex Elder struct ceph_osd_req_op *ops; 1315913d2fdcSAlex Elder int ret; 1316913d2fdcSAlex Elder 1317913d2fdcSAlex Elder ops = rbd_create_rw_ops(1, CEPH_OSD_OP_READ, 0); 1318913d2fdcSAlex Elder if (!ops) 1319913d2fdcSAlex Elder return -ENOMEM; 1320913d2fdcSAlex Elder 1321913d2fdcSAlex Elder ret = rbd_req_sync_op(rbd_dev, NULL, 1322b06e6a6bSJosh Durgin snapid, 1323602adf40SYehuda Sadeh CEPH_OSD_FLAG_READ, 1324913d2fdcSAlex Elder ops, object_name, ofs, len, buf, NULL, ver); 1325913d2fdcSAlex Elder rbd_destroy_ops(ops); 1326913d2fdcSAlex Elder 1327913d2fdcSAlex Elder return ret; 1328602adf40SYehuda Sadeh } 1329602adf40SYehuda Sadeh 1330602adf40SYehuda Sadeh /* 133159c2be1eSYehuda Sadeh * Request sync osd watch 133259c2be1eSYehuda Sadeh */ 13330ce1a794SAlex Elder static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev, 133459c2be1eSYehuda Sadeh u64 ver, 13357f0a24d8SAlex Elder u64 notify_id) 133659c2be1eSYehuda Sadeh { 133759c2be1eSYehuda Sadeh struct ceph_osd_req_op *ops; 133811f77002SSage Weil int ret; 133911f77002SSage Weil 134057cfc106SAlex Elder ops = rbd_create_rw_ops(1, CEPH_OSD_OP_NOTIFY_ACK, 0); 134157cfc106SAlex Elder if (!ops) 134257cfc106SAlex Elder return -ENOMEM; 134359c2be1eSYehuda Sadeh 1344a71b891bSJosh Durgin ops[0].watch.ver = cpu_to_le64(ver); 134559c2be1eSYehuda Sadeh ops[0].watch.cookie = notify_id; 134659c2be1eSYehuda Sadeh ops[0].watch.flag = 0; 134759c2be1eSYehuda Sadeh 13480ce1a794SAlex Elder ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP, 13497f0a24d8SAlex Elder rbd_dev->header_name, 0, 0, NULL, 1350ad4f232fSAlex Elder NULL, 0, 135159c2be1eSYehuda Sadeh CEPH_OSD_FLAG_READ, 135259c2be1eSYehuda Sadeh ops, 13531fec7093SYehuda Sadeh NULL, 0, 135459c2be1eSYehuda Sadeh rbd_simple_req_cb, 0, NULL); 135559c2be1eSYehuda Sadeh 135659c2be1eSYehuda Sadeh rbd_destroy_ops(ops); 135759c2be1eSYehuda Sadeh return ret; 135859c2be1eSYehuda Sadeh } 135959c2be1eSYehuda Sadeh 136059c2be1eSYehuda Sadeh static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data) 136159c2be1eSYehuda Sadeh { 13620ce1a794SAlex Elder struct rbd_device *rbd_dev = (struct rbd_device *)data; 1363a71b891bSJosh Durgin u64 hver; 136413143d2dSSage Weil int rc; 136513143d2dSSage Weil 13660ce1a794SAlex Elder if (!rbd_dev) 136759c2be1eSYehuda Sadeh return; 136859c2be1eSYehuda Sadeh 1369bd919d45SAlex Elder dout("rbd_watch_cb %s notify_id=%llu opcode=%u\n", 1370bd919d45SAlex Elder rbd_dev->header_name, (unsigned long long) notify_id, 1371bd919d45SAlex Elder (unsigned int) opcode); 1372117973fbSAlex Elder rc = rbd_dev_refresh(rbd_dev, &hver); 137313143d2dSSage Weil if (rc) 1374f0f8cef5SAlex Elder pr_warning(RBD_DRV_NAME "%d got notification but failed to " 13750ce1a794SAlex Elder " update snaps: %d\n", rbd_dev->major, rc); 137659c2be1eSYehuda Sadeh 13777f0a24d8SAlex Elder rbd_req_sync_notify_ack(rbd_dev, hver, notify_id); 137859c2be1eSYehuda Sadeh } 137959c2be1eSYehuda Sadeh 138059c2be1eSYehuda Sadeh /* 138159c2be1eSYehuda Sadeh * Request sync osd watch 138259c2be1eSYehuda Sadeh */ 13830e6f322dSAlex Elder static int rbd_req_sync_watch(struct rbd_device *rbd_dev) 138459c2be1eSYehuda Sadeh { 138559c2be1eSYehuda Sadeh struct ceph_osd_req_op *ops; 13860ce1a794SAlex Elder struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 138757cfc106SAlex Elder int ret; 138859c2be1eSYehuda Sadeh 138957cfc106SAlex Elder ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0); 139057cfc106SAlex Elder if (!ops) 139157cfc106SAlex Elder return -ENOMEM; 139259c2be1eSYehuda Sadeh 139359c2be1eSYehuda Sadeh ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0, 13940ce1a794SAlex Elder (void *)rbd_dev, &rbd_dev->watch_event); 139559c2be1eSYehuda Sadeh if (ret < 0) 139659c2be1eSYehuda Sadeh goto fail; 139759c2be1eSYehuda Sadeh 13980e6f322dSAlex Elder ops[0].watch.ver = cpu_to_le64(rbd_dev->header.obj_version); 13990ce1a794SAlex Elder ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie); 140059c2be1eSYehuda Sadeh ops[0].watch.flag = 1; 140159c2be1eSYehuda Sadeh 14020ce1a794SAlex Elder ret = rbd_req_sync_op(rbd_dev, NULL, 140359c2be1eSYehuda Sadeh CEPH_NOSNAP, 140459c2be1eSYehuda Sadeh CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK, 140559c2be1eSYehuda Sadeh ops, 14060e6f322dSAlex Elder rbd_dev->header_name, 14070e6f322dSAlex Elder 0, 0, NULL, 14080ce1a794SAlex Elder &rbd_dev->watch_request, NULL); 140959c2be1eSYehuda Sadeh 141059c2be1eSYehuda Sadeh if (ret < 0) 141159c2be1eSYehuda Sadeh goto fail_event; 141259c2be1eSYehuda Sadeh 141359c2be1eSYehuda Sadeh rbd_destroy_ops(ops); 141459c2be1eSYehuda Sadeh return 0; 141559c2be1eSYehuda Sadeh 141659c2be1eSYehuda Sadeh fail_event: 14170ce1a794SAlex Elder ceph_osdc_cancel_event(rbd_dev->watch_event); 14180ce1a794SAlex Elder rbd_dev->watch_event = NULL; 141959c2be1eSYehuda Sadeh fail: 142059c2be1eSYehuda Sadeh rbd_destroy_ops(ops); 142159c2be1eSYehuda Sadeh return ret; 142259c2be1eSYehuda Sadeh } 142359c2be1eSYehuda Sadeh 142479e3057cSYehuda Sadeh /* 142579e3057cSYehuda Sadeh * Request sync osd unwatch 142679e3057cSYehuda Sadeh */ 1427070c633fSAlex Elder static int rbd_req_sync_unwatch(struct rbd_device *rbd_dev) 142879e3057cSYehuda Sadeh { 142979e3057cSYehuda Sadeh struct ceph_osd_req_op *ops; 143057cfc106SAlex Elder int ret; 143179e3057cSYehuda Sadeh 143257cfc106SAlex Elder ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0); 143357cfc106SAlex Elder if (!ops) 143457cfc106SAlex Elder return -ENOMEM; 143579e3057cSYehuda Sadeh 143679e3057cSYehuda Sadeh ops[0].watch.ver = 0; 14370ce1a794SAlex Elder ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie); 143879e3057cSYehuda Sadeh ops[0].watch.flag = 0; 143979e3057cSYehuda Sadeh 14400ce1a794SAlex Elder ret = rbd_req_sync_op(rbd_dev, NULL, 144179e3057cSYehuda Sadeh CEPH_NOSNAP, 144279e3057cSYehuda Sadeh CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK, 144379e3057cSYehuda Sadeh ops, 1444070c633fSAlex Elder rbd_dev->header_name, 1445070c633fSAlex Elder 0, 0, NULL, NULL, NULL); 1446070c633fSAlex Elder 144779e3057cSYehuda Sadeh 144879e3057cSYehuda Sadeh rbd_destroy_ops(ops); 14490ce1a794SAlex Elder ceph_osdc_cancel_event(rbd_dev->watch_event); 14500ce1a794SAlex Elder rbd_dev->watch_event = NULL; 145179e3057cSYehuda Sadeh return ret; 145279e3057cSYehuda Sadeh } 145379e3057cSYehuda Sadeh 145459c2be1eSYehuda Sadeh /* 14553cb4a687SAlex Elder * Synchronous osd object method call 1456602adf40SYehuda Sadeh */ 14570ce1a794SAlex Elder static int rbd_req_sync_exec(struct rbd_device *rbd_dev, 1458aded07eaSAlex Elder const char *object_name, 1459aded07eaSAlex Elder const char *class_name, 1460aded07eaSAlex Elder const char *method_name, 14613cb4a687SAlex Elder const char *outbound, 14623cb4a687SAlex Elder size_t outbound_size, 1463f8d4de6eSAlex Elder char *inbound, 1464f8d4de6eSAlex Elder size_t inbound_size, 14653cb4a687SAlex Elder int flags, 146659c2be1eSYehuda Sadeh u64 *ver) 1467602adf40SYehuda Sadeh { 1468602adf40SYehuda Sadeh struct ceph_osd_req_op *ops; 1469aded07eaSAlex Elder int class_name_len = strlen(class_name); 1470aded07eaSAlex Elder int method_name_len = strlen(method_name); 14713cb4a687SAlex Elder int payload_size; 147257cfc106SAlex Elder int ret; 147357cfc106SAlex Elder 14743cb4a687SAlex Elder /* 14753cb4a687SAlex Elder * Any input parameters required by the method we're calling 14763cb4a687SAlex Elder * will be sent along with the class and method names as 14773cb4a687SAlex Elder * part of the message payload. That data and its size are 14783cb4a687SAlex Elder * supplied via the indata and indata_len fields (named from 14793cb4a687SAlex Elder * the perspective of the server side) in the OSD request 14803cb4a687SAlex Elder * operation. 14813cb4a687SAlex Elder */ 14823cb4a687SAlex Elder payload_size = class_name_len + method_name_len + outbound_size; 14833cb4a687SAlex Elder ops = rbd_create_rw_ops(1, CEPH_OSD_OP_CALL, payload_size); 148457cfc106SAlex Elder if (!ops) 148557cfc106SAlex Elder return -ENOMEM; 1486602adf40SYehuda Sadeh 1487aded07eaSAlex Elder ops[0].cls.class_name = class_name; 1488aded07eaSAlex Elder ops[0].cls.class_len = (__u8) class_name_len; 1489aded07eaSAlex Elder ops[0].cls.method_name = method_name; 1490aded07eaSAlex Elder ops[0].cls.method_len = (__u8) method_name_len; 1491602adf40SYehuda Sadeh ops[0].cls.argc = 0; 14923cb4a687SAlex Elder ops[0].cls.indata = outbound; 14933cb4a687SAlex Elder ops[0].cls.indata_len = outbound_size; 1494602adf40SYehuda Sadeh 14950ce1a794SAlex Elder ret = rbd_req_sync_op(rbd_dev, NULL, 1496602adf40SYehuda Sadeh CEPH_NOSNAP, 14973cb4a687SAlex Elder flags, ops, 1498f8d4de6eSAlex Elder object_name, 0, inbound_size, inbound, 1499f8d4de6eSAlex Elder NULL, ver); 1500602adf40SYehuda Sadeh 1501602adf40SYehuda Sadeh rbd_destroy_ops(ops); 1502602adf40SYehuda Sadeh 1503602adf40SYehuda Sadeh dout("cls_exec returned %d\n", ret); 1504602adf40SYehuda Sadeh return ret; 1505602adf40SYehuda Sadeh } 1506602adf40SYehuda Sadeh 15071fec7093SYehuda Sadeh static struct rbd_req_coll *rbd_alloc_coll(int num_reqs) 15081fec7093SYehuda Sadeh { 15091fec7093SYehuda Sadeh struct rbd_req_coll *coll = 15101fec7093SYehuda Sadeh kzalloc(sizeof(struct rbd_req_coll) + 15111fec7093SYehuda Sadeh sizeof(struct rbd_req_status) * num_reqs, 15121fec7093SYehuda Sadeh GFP_ATOMIC); 15131fec7093SYehuda Sadeh 15141fec7093SYehuda Sadeh if (!coll) 15151fec7093SYehuda Sadeh return NULL; 15161fec7093SYehuda Sadeh coll->total = num_reqs; 15171fec7093SYehuda Sadeh kref_init(&coll->kref); 15181fec7093SYehuda Sadeh return coll; 15191fec7093SYehuda Sadeh } 15201fec7093SYehuda Sadeh 1521602adf40SYehuda Sadeh /* 1522602adf40SYehuda Sadeh * block device queue callback 1523602adf40SYehuda Sadeh */ 1524602adf40SYehuda Sadeh static void rbd_rq_fn(struct request_queue *q) 1525602adf40SYehuda Sadeh { 1526602adf40SYehuda Sadeh struct rbd_device *rbd_dev = q->queuedata; 1527602adf40SYehuda Sadeh struct request *rq; 1528602adf40SYehuda Sadeh 152900f1f36fSAlex Elder while ((rq = blk_fetch_request(q))) { 1530602adf40SYehuda Sadeh struct bio *bio; 1531602adf40SYehuda Sadeh bool do_write; 1532bd919d45SAlex Elder unsigned int size; 1533602adf40SYehuda Sadeh u64 ofs; 15341fec7093SYehuda Sadeh int num_segs, cur_seg = 0; 15351fec7093SYehuda Sadeh struct rbd_req_coll *coll; 1536d1d25646SJosh Durgin struct ceph_snap_context *snapc; 1537f7760dadSAlex Elder unsigned int bio_offset; 1538602adf40SYehuda Sadeh 1539602adf40SYehuda Sadeh dout("fetched request\n"); 1540602adf40SYehuda Sadeh 1541602adf40SYehuda Sadeh /* filter out block requests we don't understand */ 1542602adf40SYehuda Sadeh if ((rq->cmd_type != REQ_TYPE_FS)) { 1543602adf40SYehuda Sadeh __blk_end_request_all(rq, 0); 154400f1f36fSAlex Elder continue; 1545602adf40SYehuda Sadeh } 1546602adf40SYehuda Sadeh 1547602adf40SYehuda Sadeh /* deduce our operation (read, write) */ 1548602adf40SYehuda Sadeh do_write = (rq_data_dir(rq) == WRITE); 1549f84344f3SAlex Elder if (do_write && rbd_dev->mapping.read_only) { 1550602adf40SYehuda Sadeh __blk_end_request_all(rq, -EROFS); 155100f1f36fSAlex Elder continue; 1552602adf40SYehuda Sadeh } 1553602adf40SYehuda Sadeh 1554602adf40SYehuda Sadeh spin_unlock_irq(q->queue_lock); 1555602adf40SYehuda Sadeh 1556e88a36ecSJosh Durgin down_read(&rbd_dev->header_rwsem); 1557e88a36ecSJosh Durgin 1558daba5fdbSAlex Elder if (!rbd_dev->exists) { 15590d7dbfceSAlex Elder rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP); 1560d1d25646SJosh Durgin up_read(&rbd_dev->header_rwsem); 1561e88a36ecSJosh Durgin dout("request for non-existent snapshot"); 1562e88a36ecSJosh Durgin spin_lock_irq(q->queue_lock); 1563e88a36ecSJosh Durgin __blk_end_request_all(rq, -ENXIO); 1564e88a36ecSJosh Durgin continue; 1565e88a36ecSJosh Durgin } 1566d1d25646SJosh Durgin 1567d1d25646SJosh Durgin snapc = ceph_get_snap_context(rbd_dev->header.snapc); 1568d1d25646SJosh Durgin 1569d1d25646SJosh Durgin up_read(&rbd_dev->header_rwsem); 1570e88a36ecSJosh Durgin 1571f7760dadSAlex Elder size = blk_rq_bytes(rq); 1572f7760dadSAlex Elder ofs = blk_rq_pos(rq) * SECTOR_SIZE; 1573f7760dadSAlex Elder bio = rq->bio; 1574f7760dadSAlex Elder 1575602adf40SYehuda Sadeh dout("%s 0x%x bytes at 0x%llx\n", 1576602adf40SYehuda Sadeh do_write ? "write" : "read", 1577bd919d45SAlex Elder size, (unsigned long long) blk_rq_pos(rq) * SECTOR_SIZE); 1578602adf40SYehuda Sadeh 15791fec7093SYehuda Sadeh num_segs = rbd_get_num_segments(&rbd_dev->header, ofs, size); 1580df111be6SAlex Elder if (num_segs <= 0) { 1581df111be6SAlex Elder spin_lock_irq(q->queue_lock); 1582df111be6SAlex Elder __blk_end_request_all(rq, num_segs); 1583df111be6SAlex Elder ceph_put_snap_context(snapc); 1584df111be6SAlex Elder continue; 1585df111be6SAlex Elder } 15861fec7093SYehuda Sadeh coll = rbd_alloc_coll(num_segs); 15871fec7093SYehuda Sadeh if (!coll) { 15881fec7093SYehuda Sadeh spin_lock_irq(q->queue_lock); 15891fec7093SYehuda Sadeh __blk_end_request_all(rq, -ENOMEM); 1590d1d25646SJosh Durgin ceph_put_snap_context(snapc); 159100f1f36fSAlex Elder continue; 15921fec7093SYehuda Sadeh } 15931fec7093SYehuda Sadeh 1594f7760dadSAlex Elder bio_offset = 0; 1595602adf40SYehuda Sadeh do { 1596f7760dadSAlex Elder u64 limit = rbd_segment_length(rbd_dev, ofs, size); 1597f7760dadSAlex Elder unsigned int chain_size; 1598f7760dadSAlex Elder struct bio *bio_chain; 1599f7760dadSAlex Elder 1600f7760dadSAlex Elder BUG_ON(limit > (u64) UINT_MAX); 1601f7760dadSAlex Elder chain_size = (unsigned int) limit; 1602bd919d45SAlex Elder dout("rq->bio->bi_vcnt=%hu\n", rq->bio->bi_vcnt); 1603f7760dadSAlex Elder 16041fec7093SYehuda Sadeh kref_get(&coll->kref); 1605f7760dadSAlex Elder 1606f7760dadSAlex Elder /* Pass a cloned bio chain via an osd request */ 1607f7760dadSAlex Elder 1608f7760dadSAlex Elder bio_chain = bio_chain_clone_range(&bio, 1609f7760dadSAlex Elder &bio_offset, chain_size, 1610f7760dadSAlex Elder GFP_ATOMIC); 1611f7760dadSAlex Elder if (bio_chain) 16124634246dSAlex Elder (void) rbd_do_op(rq, rbd_dev, snapc, 1613f7760dadSAlex Elder ofs, chain_size, 1614f7760dadSAlex Elder bio_chain, coll, cur_seg); 16154634246dSAlex Elder else 16161fec7093SYehuda Sadeh rbd_coll_end_req_index(rq, coll, cur_seg, 1617f7760dadSAlex Elder -ENOMEM, chain_size); 1618f7760dadSAlex Elder size -= chain_size; 1619f7760dadSAlex Elder ofs += chain_size; 1620602adf40SYehuda Sadeh 16211fec7093SYehuda Sadeh cur_seg++; 1622602adf40SYehuda Sadeh } while (size > 0); 16231fec7093SYehuda Sadeh kref_put(&coll->kref, rbd_coll_release); 1624602adf40SYehuda Sadeh 1625602adf40SYehuda Sadeh spin_lock_irq(q->queue_lock); 1626d1d25646SJosh Durgin 1627d1d25646SJosh Durgin ceph_put_snap_context(snapc); 1628602adf40SYehuda Sadeh } 1629602adf40SYehuda Sadeh } 1630602adf40SYehuda Sadeh 1631602adf40SYehuda Sadeh /* 1632602adf40SYehuda Sadeh * a queue callback. Makes sure that we don't create a bio that spans across 1633602adf40SYehuda Sadeh * multiple osd objects. One exception would be with a single page bios, 1634f7760dadSAlex Elder * which we handle later at bio_chain_clone_range() 1635602adf40SYehuda Sadeh */ 1636602adf40SYehuda Sadeh static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd, 1637602adf40SYehuda Sadeh struct bio_vec *bvec) 1638602adf40SYehuda Sadeh { 1639602adf40SYehuda Sadeh struct rbd_device *rbd_dev = q->queuedata; 1640e5cfeed2SAlex Elder sector_t sector_offset; 1641e5cfeed2SAlex Elder sector_t sectors_per_obj; 1642e5cfeed2SAlex Elder sector_t obj_sector_offset; 1643e5cfeed2SAlex Elder int ret; 1644602adf40SYehuda Sadeh 1645e5cfeed2SAlex Elder /* 1646e5cfeed2SAlex Elder * Find how far into its rbd object the partition-relative 1647e5cfeed2SAlex Elder * bio start sector is to offset relative to the enclosing 1648e5cfeed2SAlex Elder * device. 1649e5cfeed2SAlex Elder */ 1650e5cfeed2SAlex Elder sector_offset = get_start_sect(bmd->bi_bdev) + bmd->bi_sector; 1651e5cfeed2SAlex Elder sectors_per_obj = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT); 1652e5cfeed2SAlex Elder obj_sector_offset = sector_offset & (sectors_per_obj - 1); 1653593a9e7bSAlex Elder 1654e5cfeed2SAlex Elder /* 1655e5cfeed2SAlex Elder * Compute the number of bytes from that offset to the end 1656e5cfeed2SAlex Elder * of the object. Account for what's already used by the bio. 1657e5cfeed2SAlex Elder */ 1658e5cfeed2SAlex Elder ret = (int) (sectors_per_obj - obj_sector_offset) << SECTOR_SHIFT; 1659e5cfeed2SAlex Elder if (ret > bmd->bi_size) 1660e5cfeed2SAlex Elder ret -= bmd->bi_size; 1661e5cfeed2SAlex Elder else 1662e5cfeed2SAlex Elder ret = 0; 1663e5cfeed2SAlex Elder 1664e5cfeed2SAlex Elder /* 1665e5cfeed2SAlex Elder * Don't send back more than was asked for. And if the bio 1666e5cfeed2SAlex Elder * was empty, let the whole thing through because: "Note 1667e5cfeed2SAlex Elder * that a block device *must* allow a single page to be 1668e5cfeed2SAlex Elder * added to an empty bio." 1669e5cfeed2SAlex Elder */ 1670e5cfeed2SAlex Elder rbd_assert(bvec->bv_len <= PAGE_SIZE); 1671e5cfeed2SAlex Elder if (ret > (int) bvec->bv_len || !bmd->bi_size) 1672e5cfeed2SAlex Elder ret = (int) bvec->bv_len; 1673e5cfeed2SAlex Elder 1674e5cfeed2SAlex Elder return ret; 1675602adf40SYehuda Sadeh } 1676602adf40SYehuda Sadeh 1677602adf40SYehuda Sadeh static void rbd_free_disk(struct rbd_device *rbd_dev) 1678602adf40SYehuda Sadeh { 1679602adf40SYehuda Sadeh struct gendisk *disk = rbd_dev->disk; 1680602adf40SYehuda Sadeh 1681602adf40SYehuda Sadeh if (!disk) 1682602adf40SYehuda Sadeh return; 1683602adf40SYehuda Sadeh 1684602adf40SYehuda Sadeh if (disk->flags & GENHD_FL_UP) 1685602adf40SYehuda Sadeh del_gendisk(disk); 1686602adf40SYehuda Sadeh if (disk->queue) 1687602adf40SYehuda Sadeh blk_cleanup_queue(disk->queue); 1688602adf40SYehuda Sadeh put_disk(disk); 1689602adf40SYehuda Sadeh } 1690602adf40SYehuda Sadeh 1691602adf40SYehuda Sadeh /* 16924156d998SAlex Elder * Read the complete header for the given rbd device. 16934156d998SAlex Elder * 16944156d998SAlex Elder * Returns a pointer to a dynamically-allocated buffer containing 16954156d998SAlex Elder * the complete and validated header. Caller can pass the address 16964156d998SAlex Elder * of a variable that will be filled in with the version of the 16974156d998SAlex Elder * header object at the time it was read. 16984156d998SAlex Elder * 16994156d998SAlex Elder * Returns a pointer-coded errno if a failure occurs. 17004156d998SAlex Elder */ 17014156d998SAlex Elder static struct rbd_image_header_ondisk * 17024156d998SAlex Elder rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version) 17034156d998SAlex Elder { 17044156d998SAlex Elder struct rbd_image_header_ondisk *ondisk = NULL; 17054156d998SAlex Elder u32 snap_count = 0; 17064156d998SAlex Elder u64 names_size = 0; 17074156d998SAlex Elder u32 want_count; 17084156d998SAlex Elder int ret; 17094156d998SAlex Elder 17104156d998SAlex Elder /* 17114156d998SAlex Elder * The complete header will include an array of its 64-bit 17124156d998SAlex Elder * snapshot ids, followed by the names of those snapshots as 17134156d998SAlex Elder * a contiguous block of NUL-terminated strings. Note that 17144156d998SAlex Elder * the number of snapshots could change by the time we read 17154156d998SAlex Elder * it in, in which case we re-read it. 17164156d998SAlex Elder */ 17174156d998SAlex Elder do { 17184156d998SAlex Elder size_t size; 17194156d998SAlex Elder 17204156d998SAlex Elder kfree(ondisk); 17214156d998SAlex Elder 17224156d998SAlex Elder size = sizeof (*ondisk); 17234156d998SAlex Elder size += snap_count * sizeof (struct rbd_image_snap_ondisk); 17244156d998SAlex Elder size += names_size; 17254156d998SAlex Elder ondisk = kmalloc(size, GFP_KERNEL); 17264156d998SAlex Elder if (!ondisk) 17274156d998SAlex Elder return ERR_PTR(-ENOMEM); 17284156d998SAlex Elder 17294156d998SAlex Elder ret = rbd_req_sync_read(rbd_dev, CEPH_NOSNAP, 17304156d998SAlex Elder rbd_dev->header_name, 17314156d998SAlex Elder 0, size, 17324156d998SAlex Elder (char *) ondisk, version); 17334156d998SAlex Elder 17344156d998SAlex Elder if (ret < 0) 17354156d998SAlex Elder goto out_err; 17364156d998SAlex Elder if (WARN_ON((size_t) ret < size)) { 17374156d998SAlex Elder ret = -ENXIO; 17384156d998SAlex Elder pr_warning("short header read for image %s" 17394156d998SAlex Elder " (want %zd got %d)\n", 17400d7dbfceSAlex Elder rbd_dev->spec->image_name, size, ret); 17414156d998SAlex Elder goto out_err; 17424156d998SAlex Elder } 17434156d998SAlex Elder if (!rbd_dev_ondisk_valid(ondisk)) { 17444156d998SAlex Elder ret = -ENXIO; 17454156d998SAlex Elder pr_warning("invalid header for image %s\n", 17460d7dbfceSAlex Elder rbd_dev->spec->image_name); 17474156d998SAlex Elder goto out_err; 17484156d998SAlex Elder } 17494156d998SAlex Elder 17504156d998SAlex Elder names_size = le64_to_cpu(ondisk->snap_names_len); 17514156d998SAlex Elder want_count = snap_count; 17524156d998SAlex Elder snap_count = le32_to_cpu(ondisk->snap_count); 17534156d998SAlex Elder } while (snap_count != want_count); 17544156d998SAlex Elder 17554156d998SAlex Elder return ondisk; 17564156d998SAlex Elder 17574156d998SAlex Elder out_err: 17584156d998SAlex Elder kfree(ondisk); 17594156d998SAlex Elder 17604156d998SAlex Elder return ERR_PTR(ret); 17614156d998SAlex Elder } 17624156d998SAlex Elder 17634156d998SAlex Elder /* 1764602adf40SYehuda Sadeh * reload the ondisk the header 1765602adf40SYehuda Sadeh */ 1766602adf40SYehuda Sadeh static int rbd_read_header(struct rbd_device *rbd_dev, 1767602adf40SYehuda Sadeh struct rbd_image_header *header) 1768602adf40SYehuda Sadeh { 17694156d998SAlex Elder struct rbd_image_header_ondisk *ondisk; 17704156d998SAlex Elder u64 ver = 0; 17714156d998SAlex Elder int ret; 1772602adf40SYehuda Sadeh 17734156d998SAlex Elder ondisk = rbd_dev_v1_header_read(rbd_dev, &ver); 17744156d998SAlex Elder if (IS_ERR(ondisk)) 17754156d998SAlex Elder return PTR_ERR(ondisk); 17764156d998SAlex Elder ret = rbd_header_from_disk(header, ondisk); 17774156d998SAlex Elder if (ret >= 0) 177859c2be1eSYehuda Sadeh header->obj_version = ver; 17794156d998SAlex Elder kfree(ondisk); 1780602adf40SYehuda Sadeh 17814156d998SAlex Elder return ret; 1782602adf40SYehuda Sadeh } 1783602adf40SYehuda Sadeh 178441f38c2bSAlex Elder static void rbd_remove_all_snaps(struct rbd_device *rbd_dev) 1785dfc5606dSYehuda Sadeh { 1786dfc5606dSYehuda Sadeh struct rbd_snap *snap; 1787a0593290SAlex Elder struct rbd_snap *next; 1788dfc5606dSYehuda Sadeh 1789a0593290SAlex Elder list_for_each_entry_safe(snap, next, &rbd_dev->snaps, node) 179041f38c2bSAlex Elder rbd_remove_snap_dev(snap); 1791dfc5606dSYehuda Sadeh } 1792dfc5606dSYehuda Sadeh 17939478554aSAlex Elder static void rbd_update_mapping_size(struct rbd_device *rbd_dev) 17949478554aSAlex Elder { 17959478554aSAlex Elder sector_t size; 17969478554aSAlex Elder 17970d7dbfceSAlex Elder if (rbd_dev->spec->snap_id != CEPH_NOSNAP) 17989478554aSAlex Elder return; 17999478554aSAlex Elder 18009478554aSAlex Elder size = (sector_t) rbd_dev->header.image_size / SECTOR_SIZE; 18019478554aSAlex Elder dout("setting size to %llu sectors", (unsigned long long) size); 18029478554aSAlex Elder rbd_dev->mapping.size = (u64) size; 18039478554aSAlex Elder set_capacity(rbd_dev->disk, size); 18049478554aSAlex Elder } 18059478554aSAlex Elder 1806602adf40SYehuda Sadeh /* 1807602adf40SYehuda Sadeh * only read the first part of the ondisk header, without the snaps info 1808602adf40SYehuda Sadeh */ 1809117973fbSAlex Elder static int rbd_dev_v1_refresh(struct rbd_device *rbd_dev, u64 *hver) 1810602adf40SYehuda Sadeh { 1811602adf40SYehuda Sadeh int ret; 1812602adf40SYehuda Sadeh struct rbd_image_header h; 1813602adf40SYehuda Sadeh 1814602adf40SYehuda Sadeh ret = rbd_read_header(rbd_dev, &h); 1815602adf40SYehuda Sadeh if (ret < 0) 1816602adf40SYehuda Sadeh return ret; 1817602adf40SYehuda Sadeh 1818a51aa0c0SJosh Durgin down_write(&rbd_dev->header_rwsem); 1819a51aa0c0SJosh Durgin 18209478554aSAlex Elder /* Update image size, and check for resize of mapped image */ 18219478554aSAlex Elder rbd_dev->header.image_size = h.image_size; 18229478554aSAlex Elder rbd_update_mapping_size(rbd_dev); 18239db4b3e3SSage Weil 1824849b4260SAlex Elder /* rbd_dev->header.object_prefix shouldn't change */ 1825602adf40SYehuda Sadeh kfree(rbd_dev->header.snap_sizes); 1826849b4260SAlex Elder kfree(rbd_dev->header.snap_names); 1827d1d25646SJosh Durgin /* osd requests may still refer to snapc */ 1828d1d25646SJosh Durgin ceph_put_snap_context(rbd_dev->header.snapc); 1829602adf40SYehuda Sadeh 1830b813623aSAlex Elder if (hver) 1831b813623aSAlex Elder *hver = h.obj_version; 1832a71b891bSJosh Durgin rbd_dev->header.obj_version = h.obj_version; 183393a24e08SJosh Durgin rbd_dev->header.image_size = h.image_size; 1834602adf40SYehuda Sadeh rbd_dev->header.snapc = h.snapc; 1835602adf40SYehuda Sadeh rbd_dev->header.snap_names = h.snap_names; 1836602adf40SYehuda Sadeh rbd_dev->header.snap_sizes = h.snap_sizes; 1837849b4260SAlex Elder /* Free the extra copy of the object prefix */ 1838849b4260SAlex Elder WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix)); 1839849b4260SAlex Elder kfree(h.object_prefix); 1840849b4260SAlex Elder 1841304f6808SAlex Elder ret = rbd_dev_snaps_update(rbd_dev); 1842304f6808SAlex Elder if (!ret) 1843304f6808SAlex Elder ret = rbd_dev_snaps_register(rbd_dev); 1844dfc5606dSYehuda Sadeh 1845c666601aSJosh Durgin up_write(&rbd_dev->header_rwsem); 1846602adf40SYehuda Sadeh 1847dfc5606dSYehuda Sadeh return ret; 1848602adf40SYehuda Sadeh } 1849602adf40SYehuda Sadeh 1850117973fbSAlex Elder static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver) 18511fe5e993SAlex Elder { 18521fe5e993SAlex Elder int ret; 18531fe5e993SAlex Elder 1854117973fbSAlex Elder rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); 18551fe5e993SAlex Elder mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); 1856117973fbSAlex Elder if (rbd_dev->image_format == 1) 1857117973fbSAlex Elder ret = rbd_dev_v1_refresh(rbd_dev, hver); 1858117973fbSAlex Elder else 1859117973fbSAlex Elder ret = rbd_dev_v2_refresh(rbd_dev, hver); 18601fe5e993SAlex Elder mutex_unlock(&ctl_mutex); 18611fe5e993SAlex Elder 18621fe5e993SAlex Elder return ret; 18631fe5e993SAlex Elder } 18641fe5e993SAlex Elder 1865602adf40SYehuda Sadeh static int rbd_init_disk(struct rbd_device *rbd_dev) 1866602adf40SYehuda Sadeh { 1867602adf40SYehuda Sadeh struct gendisk *disk; 1868602adf40SYehuda Sadeh struct request_queue *q; 1869593a9e7bSAlex Elder u64 segment_size; 1870602adf40SYehuda Sadeh 1871602adf40SYehuda Sadeh /* create gendisk info */ 1872602adf40SYehuda Sadeh disk = alloc_disk(RBD_MINORS_PER_MAJOR); 1873602adf40SYehuda Sadeh if (!disk) 18741fcdb8aaSAlex Elder return -ENOMEM; 1875602adf40SYehuda Sadeh 1876f0f8cef5SAlex Elder snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d", 1877de71a297SAlex Elder rbd_dev->dev_id); 1878602adf40SYehuda Sadeh disk->major = rbd_dev->major; 1879602adf40SYehuda Sadeh disk->first_minor = 0; 1880602adf40SYehuda Sadeh disk->fops = &rbd_bd_ops; 1881602adf40SYehuda Sadeh disk->private_data = rbd_dev; 1882602adf40SYehuda Sadeh 1883602adf40SYehuda Sadeh /* init rq */ 1884602adf40SYehuda Sadeh q = blk_init_queue(rbd_rq_fn, &rbd_dev->lock); 1885602adf40SYehuda Sadeh if (!q) 1886602adf40SYehuda Sadeh goto out_disk; 1887029bcbd8SJosh Durgin 1888593a9e7bSAlex Elder /* We use the default size, but let's be explicit about it. */ 1889593a9e7bSAlex Elder blk_queue_physical_block_size(q, SECTOR_SIZE); 1890593a9e7bSAlex Elder 1891029bcbd8SJosh Durgin /* set io sizes to object size */ 1892593a9e7bSAlex Elder segment_size = rbd_obj_bytes(&rbd_dev->header); 1893593a9e7bSAlex Elder blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE); 1894593a9e7bSAlex Elder blk_queue_max_segment_size(q, segment_size); 1895593a9e7bSAlex Elder blk_queue_io_min(q, segment_size); 1896593a9e7bSAlex Elder blk_queue_io_opt(q, segment_size); 1897029bcbd8SJosh Durgin 1898602adf40SYehuda Sadeh blk_queue_merge_bvec(q, rbd_merge_bvec); 1899602adf40SYehuda Sadeh disk->queue = q; 1900602adf40SYehuda Sadeh 1901602adf40SYehuda Sadeh q->queuedata = rbd_dev; 1902602adf40SYehuda Sadeh 1903602adf40SYehuda Sadeh rbd_dev->disk = disk; 1904602adf40SYehuda Sadeh 190512f02944SAlex Elder set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE); 190612f02944SAlex Elder 1907602adf40SYehuda Sadeh return 0; 1908602adf40SYehuda Sadeh out_disk: 1909602adf40SYehuda Sadeh put_disk(disk); 19101fcdb8aaSAlex Elder 19111fcdb8aaSAlex Elder return -ENOMEM; 1912602adf40SYehuda Sadeh } 1913602adf40SYehuda Sadeh 1914dfc5606dSYehuda Sadeh /* 1915dfc5606dSYehuda Sadeh sysfs 1916dfc5606dSYehuda Sadeh */ 1917602adf40SYehuda Sadeh 1918593a9e7bSAlex Elder static struct rbd_device *dev_to_rbd_dev(struct device *dev) 1919593a9e7bSAlex Elder { 1920593a9e7bSAlex Elder return container_of(dev, struct rbd_device, dev); 1921593a9e7bSAlex Elder } 1922593a9e7bSAlex Elder 1923dfc5606dSYehuda Sadeh static ssize_t rbd_size_show(struct device *dev, 1924dfc5606dSYehuda Sadeh struct device_attribute *attr, char *buf) 1925602adf40SYehuda Sadeh { 1926593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 1927a51aa0c0SJosh Durgin sector_t size; 1928dfc5606dSYehuda Sadeh 1929a51aa0c0SJosh Durgin down_read(&rbd_dev->header_rwsem); 1930a51aa0c0SJosh Durgin size = get_capacity(rbd_dev->disk); 1931a51aa0c0SJosh Durgin up_read(&rbd_dev->header_rwsem); 1932a51aa0c0SJosh Durgin 1933a51aa0c0SJosh Durgin return sprintf(buf, "%llu\n", (unsigned long long) size * SECTOR_SIZE); 1934602adf40SYehuda Sadeh } 1935602adf40SYehuda Sadeh 193634b13184SAlex Elder /* 193734b13184SAlex Elder * Note this shows the features for whatever's mapped, which is not 193834b13184SAlex Elder * necessarily the base image. 193934b13184SAlex Elder */ 194034b13184SAlex Elder static ssize_t rbd_features_show(struct device *dev, 194134b13184SAlex Elder struct device_attribute *attr, char *buf) 194234b13184SAlex Elder { 194334b13184SAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 194434b13184SAlex Elder 194534b13184SAlex Elder return sprintf(buf, "0x%016llx\n", 194634b13184SAlex Elder (unsigned long long) rbd_dev->mapping.features); 194734b13184SAlex Elder } 194834b13184SAlex Elder 1949dfc5606dSYehuda Sadeh static ssize_t rbd_major_show(struct device *dev, 1950dfc5606dSYehuda Sadeh struct device_attribute *attr, char *buf) 1951602adf40SYehuda Sadeh { 1952593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 1953dfc5606dSYehuda Sadeh 1954dfc5606dSYehuda Sadeh return sprintf(buf, "%d\n", rbd_dev->major); 1955dfc5606dSYehuda Sadeh } 1956dfc5606dSYehuda Sadeh 1957dfc5606dSYehuda Sadeh static ssize_t rbd_client_id_show(struct device *dev, 1958dfc5606dSYehuda Sadeh struct device_attribute *attr, char *buf) 1959dfc5606dSYehuda Sadeh { 1960593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 1961dfc5606dSYehuda Sadeh 19621dbb4399SAlex Elder return sprintf(buf, "client%lld\n", 19631dbb4399SAlex Elder ceph_client_id(rbd_dev->rbd_client->client)); 1964dfc5606dSYehuda Sadeh } 1965dfc5606dSYehuda Sadeh 1966dfc5606dSYehuda Sadeh static ssize_t rbd_pool_show(struct device *dev, 1967dfc5606dSYehuda Sadeh struct device_attribute *attr, char *buf) 1968dfc5606dSYehuda Sadeh { 1969593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 1970dfc5606dSYehuda Sadeh 19710d7dbfceSAlex Elder return sprintf(buf, "%s\n", rbd_dev->spec->pool_name); 1972dfc5606dSYehuda Sadeh } 1973dfc5606dSYehuda Sadeh 19749bb2f334SAlex Elder static ssize_t rbd_pool_id_show(struct device *dev, 19759bb2f334SAlex Elder struct device_attribute *attr, char *buf) 19769bb2f334SAlex Elder { 19779bb2f334SAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 19789bb2f334SAlex Elder 19790d7dbfceSAlex Elder return sprintf(buf, "%llu\n", 19800d7dbfceSAlex Elder (unsigned long long) rbd_dev->spec->pool_id); 19819bb2f334SAlex Elder } 19829bb2f334SAlex Elder 1983dfc5606dSYehuda Sadeh static ssize_t rbd_name_show(struct device *dev, 1984dfc5606dSYehuda Sadeh struct device_attribute *attr, char *buf) 1985dfc5606dSYehuda Sadeh { 1986593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 1987dfc5606dSYehuda Sadeh 1988a92ffdf8SAlex Elder if (rbd_dev->spec->image_name) 19890d7dbfceSAlex Elder return sprintf(buf, "%s\n", rbd_dev->spec->image_name); 1990a92ffdf8SAlex Elder 1991a92ffdf8SAlex Elder return sprintf(buf, "(unknown)\n"); 1992dfc5606dSYehuda Sadeh } 1993dfc5606dSYehuda Sadeh 1994589d30e0SAlex Elder static ssize_t rbd_image_id_show(struct device *dev, 1995589d30e0SAlex Elder struct device_attribute *attr, char *buf) 1996589d30e0SAlex Elder { 1997589d30e0SAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 1998589d30e0SAlex Elder 19990d7dbfceSAlex Elder return sprintf(buf, "%s\n", rbd_dev->spec->image_id); 2000589d30e0SAlex Elder } 2001589d30e0SAlex Elder 200234b13184SAlex Elder /* 200334b13184SAlex Elder * Shows the name of the currently-mapped snapshot (or 200434b13184SAlex Elder * RBD_SNAP_HEAD_NAME for the base image). 200534b13184SAlex Elder */ 2006dfc5606dSYehuda Sadeh static ssize_t rbd_snap_show(struct device *dev, 2007dfc5606dSYehuda Sadeh struct device_attribute *attr, 2008dfc5606dSYehuda Sadeh char *buf) 2009dfc5606dSYehuda Sadeh { 2010593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 2011dfc5606dSYehuda Sadeh 20120d7dbfceSAlex Elder return sprintf(buf, "%s\n", rbd_dev->spec->snap_name); 2013dfc5606dSYehuda Sadeh } 2014dfc5606dSYehuda Sadeh 201586b00e0dSAlex Elder /* 201686b00e0dSAlex Elder * For an rbd v2 image, shows the pool id, image id, and snapshot id 201786b00e0dSAlex Elder * for the parent image. If there is no parent, simply shows 201886b00e0dSAlex Elder * "(no parent image)". 201986b00e0dSAlex Elder */ 202086b00e0dSAlex Elder static ssize_t rbd_parent_show(struct device *dev, 202186b00e0dSAlex Elder struct device_attribute *attr, 202286b00e0dSAlex Elder char *buf) 202386b00e0dSAlex Elder { 202486b00e0dSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 202586b00e0dSAlex Elder struct rbd_spec *spec = rbd_dev->parent_spec; 202686b00e0dSAlex Elder int count; 202786b00e0dSAlex Elder char *bufp = buf; 202886b00e0dSAlex Elder 202986b00e0dSAlex Elder if (!spec) 203086b00e0dSAlex Elder return sprintf(buf, "(no parent image)\n"); 203186b00e0dSAlex Elder 203286b00e0dSAlex Elder count = sprintf(bufp, "pool_id %llu\npool_name %s\n", 203386b00e0dSAlex Elder (unsigned long long) spec->pool_id, spec->pool_name); 203486b00e0dSAlex Elder if (count < 0) 203586b00e0dSAlex Elder return count; 203686b00e0dSAlex Elder bufp += count; 203786b00e0dSAlex Elder 203886b00e0dSAlex Elder count = sprintf(bufp, "image_id %s\nimage_name %s\n", spec->image_id, 203986b00e0dSAlex Elder spec->image_name ? spec->image_name : "(unknown)"); 204086b00e0dSAlex Elder if (count < 0) 204186b00e0dSAlex Elder return count; 204286b00e0dSAlex Elder bufp += count; 204386b00e0dSAlex Elder 204486b00e0dSAlex Elder count = sprintf(bufp, "snap_id %llu\nsnap_name %s\n", 204586b00e0dSAlex Elder (unsigned long long) spec->snap_id, spec->snap_name); 204686b00e0dSAlex Elder if (count < 0) 204786b00e0dSAlex Elder return count; 204886b00e0dSAlex Elder bufp += count; 204986b00e0dSAlex Elder 205086b00e0dSAlex Elder count = sprintf(bufp, "overlap %llu\n", rbd_dev->parent_overlap); 205186b00e0dSAlex Elder if (count < 0) 205286b00e0dSAlex Elder return count; 205386b00e0dSAlex Elder bufp += count; 205486b00e0dSAlex Elder 205586b00e0dSAlex Elder return (ssize_t) (bufp - buf); 205686b00e0dSAlex Elder } 205786b00e0dSAlex Elder 2058dfc5606dSYehuda Sadeh static ssize_t rbd_image_refresh(struct device *dev, 2059dfc5606dSYehuda Sadeh struct device_attribute *attr, 2060dfc5606dSYehuda Sadeh const char *buf, 2061dfc5606dSYehuda Sadeh size_t size) 2062dfc5606dSYehuda Sadeh { 2063593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 2064b813623aSAlex Elder int ret; 2065602adf40SYehuda Sadeh 2066117973fbSAlex Elder ret = rbd_dev_refresh(rbd_dev, NULL); 2067b813623aSAlex Elder 2068b813623aSAlex Elder return ret < 0 ? ret : size; 2069dfc5606dSYehuda Sadeh } 2070602adf40SYehuda Sadeh 2071dfc5606dSYehuda Sadeh static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL); 207234b13184SAlex Elder static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL); 2073dfc5606dSYehuda Sadeh static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL); 2074dfc5606dSYehuda Sadeh static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL); 2075dfc5606dSYehuda Sadeh static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL); 20769bb2f334SAlex Elder static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL); 2077dfc5606dSYehuda Sadeh static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL); 2078589d30e0SAlex Elder static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL); 2079dfc5606dSYehuda Sadeh static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh); 2080dfc5606dSYehuda Sadeh static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL); 208186b00e0dSAlex Elder static DEVICE_ATTR(parent, S_IRUGO, rbd_parent_show, NULL); 2082dfc5606dSYehuda Sadeh 2083dfc5606dSYehuda Sadeh static struct attribute *rbd_attrs[] = { 2084dfc5606dSYehuda Sadeh &dev_attr_size.attr, 208534b13184SAlex Elder &dev_attr_features.attr, 2086dfc5606dSYehuda Sadeh &dev_attr_major.attr, 2087dfc5606dSYehuda Sadeh &dev_attr_client_id.attr, 2088dfc5606dSYehuda Sadeh &dev_attr_pool.attr, 20899bb2f334SAlex Elder &dev_attr_pool_id.attr, 2090dfc5606dSYehuda Sadeh &dev_attr_name.attr, 2091589d30e0SAlex Elder &dev_attr_image_id.attr, 2092dfc5606dSYehuda Sadeh &dev_attr_current_snap.attr, 209386b00e0dSAlex Elder &dev_attr_parent.attr, 2094dfc5606dSYehuda Sadeh &dev_attr_refresh.attr, 2095dfc5606dSYehuda Sadeh NULL 2096dfc5606dSYehuda Sadeh }; 2097dfc5606dSYehuda Sadeh 2098dfc5606dSYehuda Sadeh static struct attribute_group rbd_attr_group = { 2099dfc5606dSYehuda Sadeh .attrs = rbd_attrs, 2100dfc5606dSYehuda Sadeh }; 2101dfc5606dSYehuda Sadeh 2102dfc5606dSYehuda Sadeh static const struct attribute_group *rbd_attr_groups[] = { 2103dfc5606dSYehuda Sadeh &rbd_attr_group, 2104dfc5606dSYehuda Sadeh NULL 2105dfc5606dSYehuda Sadeh }; 2106dfc5606dSYehuda Sadeh 2107dfc5606dSYehuda Sadeh static void rbd_sysfs_dev_release(struct device *dev) 2108dfc5606dSYehuda Sadeh { 2109dfc5606dSYehuda Sadeh } 2110dfc5606dSYehuda Sadeh 2111dfc5606dSYehuda Sadeh static struct device_type rbd_device_type = { 2112dfc5606dSYehuda Sadeh .name = "rbd", 2113dfc5606dSYehuda Sadeh .groups = rbd_attr_groups, 2114dfc5606dSYehuda Sadeh .release = rbd_sysfs_dev_release, 2115dfc5606dSYehuda Sadeh }; 2116dfc5606dSYehuda Sadeh 2117dfc5606dSYehuda Sadeh 2118dfc5606dSYehuda Sadeh /* 2119dfc5606dSYehuda Sadeh sysfs - snapshots 2120dfc5606dSYehuda Sadeh */ 2121dfc5606dSYehuda Sadeh 2122dfc5606dSYehuda Sadeh static ssize_t rbd_snap_size_show(struct device *dev, 2123dfc5606dSYehuda Sadeh struct device_attribute *attr, 2124dfc5606dSYehuda Sadeh char *buf) 2125dfc5606dSYehuda Sadeh { 2126dfc5606dSYehuda Sadeh struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev); 2127dfc5606dSYehuda Sadeh 21283591538fSJosh Durgin return sprintf(buf, "%llu\n", (unsigned long long)snap->size); 2129dfc5606dSYehuda Sadeh } 2130dfc5606dSYehuda Sadeh 2131dfc5606dSYehuda Sadeh static ssize_t rbd_snap_id_show(struct device *dev, 2132dfc5606dSYehuda Sadeh struct device_attribute *attr, 2133dfc5606dSYehuda Sadeh char *buf) 2134dfc5606dSYehuda Sadeh { 2135dfc5606dSYehuda Sadeh struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev); 2136dfc5606dSYehuda Sadeh 2137593a9e7bSAlex Elder return sprintf(buf, "%llu\n", (unsigned long long)snap->id); 2138dfc5606dSYehuda Sadeh } 2139dfc5606dSYehuda Sadeh 214034b13184SAlex Elder static ssize_t rbd_snap_features_show(struct device *dev, 214134b13184SAlex Elder struct device_attribute *attr, 214234b13184SAlex Elder char *buf) 214334b13184SAlex Elder { 214434b13184SAlex Elder struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev); 214534b13184SAlex Elder 214634b13184SAlex Elder return sprintf(buf, "0x%016llx\n", 214734b13184SAlex Elder (unsigned long long) snap->features); 214834b13184SAlex Elder } 214934b13184SAlex Elder 2150dfc5606dSYehuda Sadeh static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL); 2151dfc5606dSYehuda Sadeh static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL); 215234b13184SAlex Elder static DEVICE_ATTR(snap_features, S_IRUGO, rbd_snap_features_show, NULL); 2153dfc5606dSYehuda Sadeh 2154dfc5606dSYehuda Sadeh static struct attribute *rbd_snap_attrs[] = { 2155dfc5606dSYehuda Sadeh &dev_attr_snap_size.attr, 2156dfc5606dSYehuda Sadeh &dev_attr_snap_id.attr, 215734b13184SAlex Elder &dev_attr_snap_features.attr, 2158dfc5606dSYehuda Sadeh NULL, 2159dfc5606dSYehuda Sadeh }; 2160dfc5606dSYehuda Sadeh 2161dfc5606dSYehuda Sadeh static struct attribute_group rbd_snap_attr_group = { 2162dfc5606dSYehuda Sadeh .attrs = rbd_snap_attrs, 2163dfc5606dSYehuda Sadeh }; 2164dfc5606dSYehuda Sadeh 2165dfc5606dSYehuda Sadeh static void rbd_snap_dev_release(struct device *dev) 2166dfc5606dSYehuda Sadeh { 2167dfc5606dSYehuda Sadeh struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev); 2168dfc5606dSYehuda Sadeh kfree(snap->name); 2169dfc5606dSYehuda Sadeh kfree(snap); 2170dfc5606dSYehuda Sadeh } 2171dfc5606dSYehuda Sadeh 2172dfc5606dSYehuda Sadeh static const struct attribute_group *rbd_snap_attr_groups[] = { 2173dfc5606dSYehuda Sadeh &rbd_snap_attr_group, 2174dfc5606dSYehuda Sadeh NULL 2175dfc5606dSYehuda Sadeh }; 2176dfc5606dSYehuda Sadeh 2177dfc5606dSYehuda Sadeh static struct device_type rbd_snap_device_type = { 2178dfc5606dSYehuda Sadeh .groups = rbd_snap_attr_groups, 2179dfc5606dSYehuda Sadeh .release = rbd_snap_dev_release, 2180dfc5606dSYehuda Sadeh }; 2181dfc5606dSYehuda Sadeh 21828b8fb99cSAlex Elder static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec) 21838b8fb99cSAlex Elder { 21848b8fb99cSAlex Elder kref_get(&spec->kref); 21858b8fb99cSAlex Elder 21868b8fb99cSAlex Elder return spec; 21878b8fb99cSAlex Elder } 21888b8fb99cSAlex Elder 21898b8fb99cSAlex Elder static void rbd_spec_free(struct kref *kref); 21908b8fb99cSAlex Elder static void rbd_spec_put(struct rbd_spec *spec) 21918b8fb99cSAlex Elder { 21928b8fb99cSAlex Elder if (spec) 21938b8fb99cSAlex Elder kref_put(&spec->kref, rbd_spec_free); 21948b8fb99cSAlex Elder } 21958b8fb99cSAlex Elder 21968b8fb99cSAlex Elder static struct rbd_spec *rbd_spec_alloc(void) 21978b8fb99cSAlex Elder { 21988b8fb99cSAlex Elder struct rbd_spec *spec; 21998b8fb99cSAlex Elder 22008b8fb99cSAlex Elder spec = kzalloc(sizeof (*spec), GFP_KERNEL); 22018b8fb99cSAlex Elder if (!spec) 22028b8fb99cSAlex Elder return NULL; 22038b8fb99cSAlex Elder kref_init(&spec->kref); 22048b8fb99cSAlex Elder 22058b8fb99cSAlex Elder rbd_spec_put(rbd_spec_get(spec)); /* TEMPORARY */ 22068b8fb99cSAlex Elder 22078b8fb99cSAlex Elder return spec; 22088b8fb99cSAlex Elder } 22098b8fb99cSAlex Elder 22108b8fb99cSAlex Elder static void rbd_spec_free(struct kref *kref) 22118b8fb99cSAlex Elder { 22128b8fb99cSAlex Elder struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref); 22138b8fb99cSAlex Elder 22148b8fb99cSAlex Elder kfree(spec->pool_name); 22158b8fb99cSAlex Elder kfree(spec->image_id); 22168b8fb99cSAlex Elder kfree(spec->image_name); 22178b8fb99cSAlex Elder kfree(spec->snap_name); 22188b8fb99cSAlex Elder kfree(spec); 22198b8fb99cSAlex Elder } 22208b8fb99cSAlex Elder 2221c53d5893SAlex Elder struct rbd_device *rbd_dev_create(struct rbd_client *rbdc, 2222c53d5893SAlex Elder struct rbd_spec *spec) 2223c53d5893SAlex Elder { 2224c53d5893SAlex Elder struct rbd_device *rbd_dev; 2225c53d5893SAlex Elder 2226c53d5893SAlex Elder rbd_dev = kzalloc(sizeof (*rbd_dev), GFP_KERNEL); 2227c53d5893SAlex Elder if (!rbd_dev) 2228c53d5893SAlex Elder return NULL; 2229c53d5893SAlex Elder 2230c53d5893SAlex Elder spin_lock_init(&rbd_dev->lock); 2231c53d5893SAlex Elder INIT_LIST_HEAD(&rbd_dev->node); 2232c53d5893SAlex Elder INIT_LIST_HEAD(&rbd_dev->snaps); 2233c53d5893SAlex Elder init_rwsem(&rbd_dev->header_rwsem); 2234c53d5893SAlex Elder 2235c53d5893SAlex Elder rbd_dev->spec = spec; 2236c53d5893SAlex Elder rbd_dev->rbd_client = rbdc; 2237c53d5893SAlex Elder 2238c53d5893SAlex Elder return rbd_dev; 2239c53d5893SAlex Elder } 2240c53d5893SAlex Elder 2241c53d5893SAlex Elder static void rbd_dev_destroy(struct rbd_device *rbd_dev) 2242c53d5893SAlex Elder { 224386b00e0dSAlex Elder rbd_spec_put(rbd_dev->parent_spec); 2244c53d5893SAlex Elder kfree(rbd_dev->header_name); 2245c53d5893SAlex Elder rbd_put_client(rbd_dev->rbd_client); 2246c53d5893SAlex Elder rbd_spec_put(rbd_dev->spec); 2247c53d5893SAlex Elder kfree(rbd_dev); 2248c53d5893SAlex Elder } 2249c53d5893SAlex Elder 2250304f6808SAlex Elder static bool rbd_snap_registered(struct rbd_snap *snap) 2251304f6808SAlex Elder { 2252304f6808SAlex Elder bool ret = snap->dev.type == &rbd_snap_device_type; 2253304f6808SAlex Elder bool reg = device_is_registered(&snap->dev); 2254304f6808SAlex Elder 2255304f6808SAlex Elder rbd_assert(!ret ^ reg); 2256304f6808SAlex Elder 2257304f6808SAlex Elder return ret; 2258304f6808SAlex Elder } 2259304f6808SAlex Elder 226041f38c2bSAlex Elder static void rbd_remove_snap_dev(struct rbd_snap *snap) 2261dfc5606dSYehuda Sadeh { 2262dfc5606dSYehuda Sadeh list_del(&snap->node); 2263304f6808SAlex Elder if (device_is_registered(&snap->dev)) 2264dfc5606dSYehuda Sadeh device_unregister(&snap->dev); 2265dfc5606dSYehuda Sadeh } 2266dfc5606dSYehuda Sadeh 226714e7085dSAlex Elder static int rbd_register_snap_dev(struct rbd_snap *snap, 2268dfc5606dSYehuda Sadeh struct device *parent) 2269dfc5606dSYehuda Sadeh { 2270dfc5606dSYehuda Sadeh struct device *dev = &snap->dev; 2271dfc5606dSYehuda Sadeh int ret; 2272dfc5606dSYehuda Sadeh 2273dfc5606dSYehuda Sadeh dev->type = &rbd_snap_device_type; 2274dfc5606dSYehuda Sadeh dev->parent = parent; 2275dfc5606dSYehuda Sadeh dev->release = rbd_snap_dev_release; 2276d4b125e9SAlex Elder dev_set_name(dev, "%s%s", RBD_SNAP_DEV_NAME_PREFIX, snap->name); 2277304f6808SAlex Elder dout("%s: registering device for snapshot %s\n", __func__, snap->name); 2278304f6808SAlex Elder 2279dfc5606dSYehuda Sadeh ret = device_register(dev); 2280dfc5606dSYehuda Sadeh 2281dfc5606dSYehuda Sadeh return ret; 2282dfc5606dSYehuda Sadeh } 2283dfc5606dSYehuda Sadeh 22844e891e0aSAlex Elder static struct rbd_snap *__rbd_add_snap_dev(struct rbd_device *rbd_dev, 2285c8d18425SAlex Elder const char *snap_name, 228634b13184SAlex Elder u64 snap_id, u64 snap_size, 228734b13184SAlex Elder u64 snap_features) 2288dfc5606dSYehuda Sadeh { 22894e891e0aSAlex Elder struct rbd_snap *snap; 2290dfc5606dSYehuda Sadeh int ret; 22914e891e0aSAlex Elder 22924e891e0aSAlex Elder snap = kzalloc(sizeof (*snap), GFP_KERNEL); 2293dfc5606dSYehuda Sadeh if (!snap) 22944e891e0aSAlex Elder return ERR_PTR(-ENOMEM); 22954e891e0aSAlex Elder 22964e891e0aSAlex Elder ret = -ENOMEM; 2297c8d18425SAlex Elder snap->name = kstrdup(snap_name, GFP_KERNEL); 22984e891e0aSAlex Elder if (!snap->name) 22994e891e0aSAlex Elder goto err; 23004e891e0aSAlex Elder 2301c8d18425SAlex Elder snap->id = snap_id; 2302c8d18425SAlex Elder snap->size = snap_size; 230334b13184SAlex Elder snap->features = snap_features; 23044e891e0aSAlex Elder 23054e891e0aSAlex Elder return snap; 23064e891e0aSAlex Elder 2307dfc5606dSYehuda Sadeh err: 2308dfc5606dSYehuda Sadeh kfree(snap->name); 2309dfc5606dSYehuda Sadeh kfree(snap); 23104e891e0aSAlex Elder 23114e891e0aSAlex Elder return ERR_PTR(ret); 2312dfc5606dSYehuda Sadeh } 2313dfc5606dSYehuda Sadeh 2314cd892126SAlex Elder static char *rbd_dev_v1_snap_info(struct rbd_device *rbd_dev, u32 which, 2315cd892126SAlex Elder u64 *snap_size, u64 *snap_features) 2316cd892126SAlex Elder { 2317cd892126SAlex Elder char *snap_name; 2318cd892126SAlex Elder 2319cd892126SAlex Elder rbd_assert(which < rbd_dev->header.snapc->num_snaps); 2320cd892126SAlex Elder 2321cd892126SAlex Elder *snap_size = rbd_dev->header.snap_sizes[which]; 2322cd892126SAlex Elder *snap_features = 0; /* No features for v1 */ 2323cd892126SAlex Elder 2324cd892126SAlex Elder /* Skip over names until we find the one we are looking for */ 2325cd892126SAlex Elder 2326cd892126SAlex Elder snap_name = rbd_dev->header.snap_names; 2327cd892126SAlex Elder while (which--) 2328cd892126SAlex Elder snap_name += strlen(snap_name) + 1; 2329cd892126SAlex Elder 2330cd892126SAlex Elder return snap_name; 2331cd892126SAlex Elder } 2332cd892126SAlex Elder 2333dfc5606dSYehuda Sadeh /* 23349d475de5SAlex Elder * Get the size and object order for an image snapshot, or if 23359d475de5SAlex Elder * snap_id is CEPH_NOSNAP, gets this information for the base 23369d475de5SAlex Elder * image. 23379d475de5SAlex Elder */ 23389d475de5SAlex Elder static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id, 23399d475de5SAlex Elder u8 *order, u64 *snap_size) 23409d475de5SAlex Elder { 23419d475de5SAlex Elder __le64 snapid = cpu_to_le64(snap_id); 23429d475de5SAlex Elder int ret; 23439d475de5SAlex Elder struct { 23449d475de5SAlex Elder u8 order; 23459d475de5SAlex Elder __le64 size; 23469d475de5SAlex Elder } __attribute__ ((packed)) size_buf = { 0 }; 23479d475de5SAlex Elder 23489d475de5SAlex Elder ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name, 23499d475de5SAlex Elder "rbd", "get_size", 23509d475de5SAlex Elder (char *) &snapid, sizeof (snapid), 23519d475de5SAlex Elder (char *) &size_buf, sizeof (size_buf), 23529d475de5SAlex Elder CEPH_OSD_FLAG_READ, NULL); 23539d475de5SAlex Elder dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret); 23549d475de5SAlex Elder if (ret < 0) 23559d475de5SAlex Elder return ret; 23569d475de5SAlex Elder 23579d475de5SAlex Elder *order = size_buf.order; 23589d475de5SAlex Elder *snap_size = le64_to_cpu(size_buf.size); 23599d475de5SAlex Elder 23609d475de5SAlex Elder dout(" snap_id 0x%016llx order = %u, snap_size = %llu\n", 23619d475de5SAlex Elder (unsigned long long) snap_id, (unsigned int) *order, 23629d475de5SAlex Elder (unsigned long long) *snap_size); 23639d475de5SAlex Elder 23649d475de5SAlex Elder return 0; 23659d475de5SAlex Elder } 23669d475de5SAlex Elder 23679d475de5SAlex Elder static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev) 23689d475de5SAlex Elder { 23699d475de5SAlex Elder return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP, 23709d475de5SAlex Elder &rbd_dev->header.obj_order, 23719d475de5SAlex Elder &rbd_dev->header.image_size); 23729d475de5SAlex Elder } 23739d475de5SAlex Elder 23741e130199SAlex Elder static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev) 23751e130199SAlex Elder { 23761e130199SAlex Elder void *reply_buf; 23771e130199SAlex Elder int ret; 23781e130199SAlex Elder void *p; 23791e130199SAlex Elder 23801e130199SAlex Elder reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL); 23811e130199SAlex Elder if (!reply_buf) 23821e130199SAlex Elder return -ENOMEM; 23831e130199SAlex Elder 23841e130199SAlex Elder ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name, 23851e130199SAlex Elder "rbd", "get_object_prefix", 23861e130199SAlex Elder NULL, 0, 23871e130199SAlex Elder reply_buf, RBD_OBJ_PREFIX_LEN_MAX, 23881e130199SAlex Elder CEPH_OSD_FLAG_READ, NULL); 23891e130199SAlex Elder dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret); 23901e130199SAlex Elder if (ret < 0) 23911e130199SAlex Elder goto out; 2392a0ea3a40SAlex Elder ret = 0; /* rbd_req_sync_exec() can return positive */ 23931e130199SAlex Elder 23941e130199SAlex Elder p = reply_buf; 23951e130199SAlex Elder rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p, 23961e130199SAlex Elder p + RBD_OBJ_PREFIX_LEN_MAX, 23971e130199SAlex Elder NULL, GFP_NOIO); 23981e130199SAlex Elder 23991e130199SAlex Elder if (IS_ERR(rbd_dev->header.object_prefix)) { 24001e130199SAlex Elder ret = PTR_ERR(rbd_dev->header.object_prefix); 24011e130199SAlex Elder rbd_dev->header.object_prefix = NULL; 24021e130199SAlex Elder } else { 24031e130199SAlex Elder dout(" object_prefix = %s\n", rbd_dev->header.object_prefix); 24041e130199SAlex Elder } 24051e130199SAlex Elder 24061e130199SAlex Elder out: 24071e130199SAlex Elder kfree(reply_buf); 24081e130199SAlex Elder 24091e130199SAlex Elder return ret; 24101e130199SAlex Elder } 24111e130199SAlex Elder 2412b1b5402aSAlex Elder static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id, 2413b1b5402aSAlex Elder u64 *snap_features) 2414b1b5402aSAlex Elder { 2415b1b5402aSAlex Elder __le64 snapid = cpu_to_le64(snap_id); 2416b1b5402aSAlex Elder struct { 2417b1b5402aSAlex Elder __le64 features; 2418b1b5402aSAlex Elder __le64 incompat; 2419b1b5402aSAlex Elder } features_buf = { 0 }; 2420d889140cSAlex Elder u64 incompat; 2421b1b5402aSAlex Elder int ret; 2422b1b5402aSAlex Elder 2423b1b5402aSAlex Elder ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name, 2424b1b5402aSAlex Elder "rbd", "get_features", 2425b1b5402aSAlex Elder (char *) &snapid, sizeof (snapid), 2426b1b5402aSAlex Elder (char *) &features_buf, sizeof (features_buf), 2427b1b5402aSAlex Elder CEPH_OSD_FLAG_READ, NULL); 2428b1b5402aSAlex Elder dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret); 2429b1b5402aSAlex Elder if (ret < 0) 2430b1b5402aSAlex Elder return ret; 2431d889140cSAlex Elder 2432d889140cSAlex Elder incompat = le64_to_cpu(features_buf.incompat); 2433d889140cSAlex Elder if (incompat & ~RBD_FEATURES_ALL) 2434d889140cSAlex Elder return -ENOTSUPP; 2435d889140cSAlex Elder 2436b1b5402aSAlex Elder *snap_features = le64_to_cpu(features_buf.features); 2437b1b5402aSAlex Elder 2438b1b5402aSAlex Elder dout(" snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n", 2439b1b5402aSAlex Elder (unsigned long long) snap_id, 2440b1b5402aSAlex Elder (unsigned long long) *snap_features, 2441b1b5402aSAlex Elder (unsigned long long) le64_to_cpu(features_buf.incompat)); 2442b1b5402aSAlex Elder 2443b1b5402aSAlex Elder return 0; 2444b1b5402aSAlex Elder } 2445b1b5402aSAlex Elder 2446b1b5402aSAlex Elder static int rbd_dev_v2_features(struct rbd_device *rbd_dev) 2447b1b5402aSAlex Elder { 2448b1b5402aSAlex Elder return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP, 2449b1b5402aSAlex Elder &rbd_dev->header.features); 2450b1b5402aSAlex Elder } 2451b1b5402aSAlex Elder 245286b00e0dSAlex Elder static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev) 245386b00e0dSAlex Elder { 245486b00e0dSAlex Elder struct rbd_spec *parent_spec; 245586b00e0dSAlex Elder size_t size; 245686b00e0dSAlex Elder void *reply_buf = NULL; 245786b00e0dSAlex Elder __le64 snapid; 245886b00e0dSAlex Elder void *p; 245986b00e0dSAlex Elder void *end; 246086b00e0dSAlex Elder char *image_id; 246186b00e0dSAlex Elder u64 overlap; 246286b00e0dSAlex Elder size_t len = 0; 246386b00e0dSAlex Elder int ret; 246486b00e0dSAlex Elder 246586b00e0dSAlex Elder parent_spec = rbd_spec_alloc(); 246686b00e0dSAlex Elder if (!parent_spec) 246786b00e0dSAlex Elder return -ENOMEM; 246886b00e0dSAlex Elder 246986b00e0dSAlex Elder size = sizeof (__le64) + /* pool_id */ 247086b00e0dSAlex Elder sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX + /* image_id */ 247186b00e0dSAlex Elder sizeof (__le64) + /* snap_id */ 247286b00e0dSAlex Elder sizeof (__le64); /* overlap */ 247386b00e0dSAlex Elder reply_buf = kmalloc(size, GFP_KERNEL); 247486b00e0dSAlex Elder if (!reply_buf) { 247586b00e0dSAlex Elder ret = -ENOMEM; 247686b00e0dSAlex Elder goto out_err; 247786b00e0dSAlex Elder } 247886b00e0dSAlex Elder 247986b00e0dSAlex Elder snapid = cpu_to_le64(CEPH_NOSNAP); 248086b00e0dSAlex Elder ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name, 248186b00e0dSAlex Elder "rbd", "get_parent", 248286b00e0dSAlex Elder (char *) &snapid, sizeof (snapid), 248386b00e0dSAlex Elder (char *) reply_buf, size, 248486b00e0dSAlex Elder CEPH_OSD_FLAG_READ, NULL); 248586b00e0dSAlex Elder dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret); 248686b00e0dSAlex Elder if (ret < 0) 248786b00e0dSAlex Elder goto out_err; 248886b00e0dSAlex Elder 248986b00e0dSAlex Elder ret = -ERANGE; 249086b00e0dSAlex Elder p = reply_buf; 249186b00e0dSAlex Elder end = (char *) reply_buf + size; 249286b00e0dSAlex Elder ceph_decode_64_safe(&p, end, parent_spec->pool_id, out_err); 249386b00e0dSAlex Elder if (parent_spec->pool_id == CEPH_NOPOOL) 249486b00e0dSAlex Elder goto out; /* No parent? No problem. */ 249586b00e0dSAlex Elder 249686b00e0dSAlex Elder image_id = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL); 249786b00e0dSAlex Elder if (IS_ERR(image_id)) { 249886b00e0dSAlex Elder ret = PTR_ERR(image_id); 249986b00e0dSAlex Elder goto out_err; 250086b00e0dSAlex Elder } 250186b00e0dSAlex Elder parent_spec->image_id = image_id; 250286b00e0dSAlex Elder ceph_decode_64_safe(&p, end, parent_spec->snap_id, out_err); 250386b00e0dSAlex Elder ceph_decode_64_safe(&p, end, overlap, out_err); 250486b00e0dSAlex Elder 250586b00e0dSAlex Elder rbd_dev->parent_overlap = overlap; 250686b00e0dSAlex Elder rbd_dev->parent_spec = parent_spec; 250786b00e0dSAlex Elder parent_spec = NULL; /* rbd_dev now owns this */ 250886b00e0dSAlex Elder out: 250986b00e0dSAlex Elder ret = 0; 251086b00e0dSAlex Elder out_err: 251186b00e0dSAlex Elder kfree(reply_buf); 251286b00e0dSAlex Elder rbd_spec_put(parent_spec); 251386b00e0dSAlex Elder 251486b00e0dSAlex Elder return ret; 251586b00e0dSAlex Elder } 251686b00e0dSAlex Elder 25176e14b1a6SAlex Elder static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev, u64 *ver) 251835d489f9SAlex Elder { 251935d489f9SAlex Elder size_t size; 252035d489f9SAlex Elder int ret; 252135d489f9SAlex Elder void *reply_buf; 252235d489f9SAlex Elder void *p; 252335d489f9SAlex Elder void *end; 252435d489f9SAlex Elder u64 seq; 252535d489f9SAlex Elder u32 snap_count; 252635d489f9SAlex Elder struct ceph_snap_context *snapc; 252735d489f9SAlex Elder u32 i; 252835d489f9SAlex Elder 252935d489f9SAlex Elder /* 253035d489f9SAlex Elder * We'll need room for the seq value (maximum snapshot id), 253135d489f9SAlex Elder * snapshot count, and array of that many snapshot ids. 253235d489f9SAlex Elder * For now we have a fixed upper limit on the number we're 253335d489f9SAlex Elder * prepared to receive. 253435d489f9SAlex Elder */ 253535d489f9SAlex Elder size = sizeof (__le64) + sizeof (__le32) + 253635d489f9SAlex Elder RBD_MAX_SNAP_COUNT * sizeof (__le64); 253735d489f9SAlex Elder reply_buf = kzalloc(size, GFP_KERNEL); 253835d489f9SAlex Elder if (!reply_buf) 253935d489f9SAlex Elder return -ENOMEM; 254035d489f9SAlex Elder 254135d489f9SAlex Elder ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name, 254235d489f9SAlex Elder "rbd", "get_snapcontext", 254335d489f9SAlex Elder NULL, 0, 254435d489f9SAlex Elder reply_buf, size, 25456e14b1a6SAlex Elder CEPH_OSD_FLAG_READ, ver); 254635d489f9SAlex Elder dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret); 254735d489f9SAlex Elder if (ret < 0) 254835d489f9SAlex Elder goto out; 254935d489f9SAlex Elder 255035d489f9SAlex Elder ret = -ERANGE; 255135d489f9SAlex Elder p = reply_buf; 255235d489f9SAlex Elder end = (char *) reply_buf + size; 255335d489f9SAlex Elder ceph_decode_64_safe(&p, end, seq, out); 255435d489f9SAlex Elder ceph_decode_32_safe(&p, end, snap_count, out); 255535d489f9SAlex Elder 255635d489f9SAlex Elder /* 255735d489f9SAlex Elder * Make sure the reported number of snapshot ids wouldn't go 255835d489f9SAlex Elder * beyond the end of our buffer. But before checking that, 255935d489f9SAlex Elder * make sure the computed size of the snapshot context we 256035d489f9SAlex Elder * allocate is representable in a size_t. 256135d489f9SAlex Elder */ 256235d489f9SAlex Elder if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context)) 256335d489f9SAlex Elder / sizeof (u64)) { 256435d489f9SAlex Elder ret = -EINVAL; 256535d489f9SAlex Elder goto out; 256635d489f9SAlex Elder } 256735d489f9SAlex Elder if (!ceph_has_room(&p, end, snap_count * sizeof (__le64))) 256835d489f9SAlex Elder goto out; 256935d489f9SAlex Elder 257035d489f9SAlex Elder size = sizeof (struct ceph_snap_context) + 257135d489f9SAlex Elder snap_count * sizeof (snapc->snaps[0]); 257235d489f9SAlex Elder snapc = kmalloc(size, GFP_KERNEL); 257335d489f9SAlex Elder if (!snapc) { 257435d489f9SAlex Elder ret = -ENOMEM; 257535d489f9SAlex Elder goto out; 257635d489f9SAlex Elder } 257735d489f9SAlex Elder 257835d489f9SAlex Elder atomic_set(&snapc->nref, 1); 257935d489f9SAlex Elder snapc->seq = seq; 258035d489f9SAlex Elder snapc->num_snaps = snap_count; 258135d489f9SAlex Elder for (i = 0; i < snap_count; i++) 258235d489f9SAlex Elder snapc->snaps[i] = ceph_decode_64(&p); 258335d489f9SAlex Elder 258435d489f9SAlex Elder rbd_dev->header.snapc = snapc; 258535d489f9SAlex Elder 258635d489f9SAlex Elder dout(" snap context seq = %llu, snap_count = %u\n", 258735d489f9SAlex Elder (unsigned long long) seq, (unsigned int) snap_count); 258835d489f9SAlex Elder 258935d489f9SAlex Elder out: 259035d489f9SAlex Elder kfree(reply_buf); 259135d489f9SAlex Elder 259235d489f9SAlex Elder return 0; 259335d489f9SAlex Elder } 259435d489f9SAlex Elder 2595b8b1e2dbSAlex Elder static char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, u32 which) 2596b8b1e2dbSAlex Elder { 2597b8b1e2dbSAlex Elder size_t size; 2598b8b1e2dbSAlex Elder void *reply_buf; 2599b8b1e2dbSAlex Elder __le64 snap_id; 2600b8b1e2dbSAlex Elder int ret; 2601b8b1e2dbSAlex Elder void *p; 2602b8b1e2dbSAlex Elder void *end; 2603b8b1e2dbSAlex Elder char *snap_name; 2604b8b1e2dbSAlex Elder 2605b8b1e2dbSAlex Elder size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN; 2606b8b1e2dbSAlex Elder reply_buf = kmalloc(size, GFP_KERNEL); 2607b8b1e2dbSAlex Elder if (!reply_buf) 2608b8b1e2dbSAlex Elder return ERR_PTR(-ENOMEM); 2609b8b1e2dbSAlex Elder 2610b8b1e2dbSAlex Elder snap_id = cpu_to_le64(rbd_dev->header.snapc->snaps[which]); 2611b8b1e2dbSAlex Elder ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name, 2612b8b1e2dbSAlex Elder "rbd", "get_snapshot_name", 2613b8b1e2dbSAlex Elder (char *) &snap_id, sizeof (snap_id), 2614b8b1e2dbSAlex Elder reply_buf, size, 2615b8b1e2dbSAlex Elder CEPH_OSD_FLAG_READ, NULL); 2616b8b1e2dbSAlex Elder dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret); 2617b8b1e2dbSAlex Elder if (ret < 0) 2618b8b1e2dbSAlex Elder goto out; 2619b8b1e2dbSAlex Elder 2620b8b1e2dbSAlex Elder p = reply_buf; 2621b8b1e2dbSAlex Elder end = (char *) reply_buf + size; 2622e5c35534SAlex Elder snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL); 2623b8b1e2dbSAlex Elder if (IS_ERR(snap_name)) { 2624b8b1e2dbSAlex Elder ret = PTR_ERR(snap_name); 2625b8b1e2dbSAlex Elder goto out; 2626b8b1e2dbSAlex Elder } else { 2627b8b1e2dbSAlex Elder dout(" snap_id 0x%016llx snap_name = %s\n", 2628b8b1e2dbSAlex Elder (unsigned long long) le64_to_cpu(snap_id), snap_name); 2629b8b1e2dbSAlex Elder } 2630b8b1e2dbSAlex Elder kfree(reply_buf); 2631b8b1e2dbSAlex Elder 2632b8b1e2dbSAlex Elder return snap_name; 2633b8b1e2dbSAlex Elder out: 2634b8b1e2dbSAlex Elder kfree(reply_buf); 2635b8b1e2dbSAlex Elder 2636b8b1e2dbSAlex Elder return ERR_PTR(ret); 2637b8b1e2dbSAlex Elder } 2638b8b1e2dbSAlex Elder 2639b8b1e2dbSAlex Elder static char *rbd_dev_v2_snap_info(struct rbd_device *rbd_dev, u32 which, 2640b8b1e2dbSAlex Elder u64 *snap_size, u64 *snap_features) 2641b8b1e2dbSAlex Elder { 2642b8b1e2dbSAlex Elder __le64 snap_id; 2643b8b1e2dbSAlex Elder u8 order; 2644b8b1e2dbSAlex Elder int ret; 2645b8b1e2dbSAlex Elder 2646b8b1e2dbSAlex Elder snap_id = rbd_dev->header.snapc->snaps[which]; 2647b8b1e2dbSAlex Elder ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, &order, snap_size); 2648b8b1e2dbSAlex Elder if (ret) 2649b8b1e2dbSAlex Elder return ERR_PTR(ret); 2650b8b1e2dbSAlex Elder ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, snap_features); 2651b8b1e2dbSAlex Elder if (ret) 2652b8b1e2dbSAlex Elder return ERR_PTR(ret); 2653b8b1e2dbSAlex Elder 2654b8b1e2dbSAlex Elder return rbd_dev_v2_snap_name(rbd_dev, which); 2655b8b1e2dbSAlex Elder } 2656b8b1e2dbSAlex Elder 2657b8b1e2dbSAlex Elder static char *rbd_dev_snap_info(struct rbd_device *rbd_dev, u32 which, 2658b8b1e2dbSAlex Elder u64 *snap_size, u64 *snap_features) 2659b8b1e2dbSAlex Elder { 2660b8b1e2dbSAlex Elder if (rbd_dev->image_format == 1) 2661b8b1e2dbSAlex Elder return rbd_dev_v1_snap_info(rbd_dev, which, 2662b8b1e2dbSAlex Elder snap_size, snap_features); 2663b8b1e2dbSAlex Elder if (rbd_dev->image_format == 2) 2664b8b1e2dbSAlex Elder return rbd_dev_v2_snap_info(rbd_dev, which, 2665b8b1e2dbSAlex Elder snap_size, snap_features); 2666b8b1e2dbSAlex Elder return ERR_PTR(-EINVAL); 2667b8b1e2dbSAlex Elder } 2668b8b1e2dbSAlex Elder 2669117973fbSAlex Elder static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver) 2670117973fbSAlex Elder { 2671117973fbSAlex Elder int ret; 2672117973fbSAlex Elder __u8 obj_order; 2673117973fbSAlex Elder 2674117973fbSAlex Elder down_write(&rbd_dev->header_rwsem); 2675117973fbSAlex Elder 2676117973fbSAlex Elder /* Grab old order first, to see if it changes */ 2677117973fbSAlex Elder 2678117973fbSAlex Elder obj_order = rbd_dev->header.obj_order, 2679117973fbSAlex Elder ret = rbd_dev_v2_image_size(rbd_dev); 2680117973fbSAlex Elder if (ret) 2681117973fbSAlex Elder goto out; 2682117973fbSAlex Elder if (rbd_dev->header.obj_order != obj_order) { 2683117973fbSAlex Elder ret = -EIO; 2684117973fbSAlex Elder goto out; 2685117973fbSAlex Elder } 2686117973fbSAlex Elder rbd_update_mapping_size(rbd_dev); 2687117973fbSAlex Elder 2688117973fbSAlex Elder ret = rbd_dev_v2_snap_context(rbd_dev, hver); 2689117973fbSAlex Elder dout("rbd_dev_v2_snap_context returned %d\n", ret); 2690117973fbSAlex Elder if (ret) 2691117973fbSAlex Elder goto out; 2692117973fbSAlex Elder ret = rbd_dev_snaps_update(rbd_dev); 2693117973fbSAlex Elder dout("rbd_dev_snaps_update returned %d\n", ret); 2694117973fbSAlex Elder if (ret) 2695117973fbSAlex Elder goto out; 2696117973fbSAlex Elder ret = rbd_dev_snaps_register(rbd_dev); 2697117973fbSAlex Elder dout("rbd_dev_snaps_register returned %d\n", ret); 2698117973fbSAlex Elder out: 2699117973fbSAlex Elder up_write(&rbd_dev->header_rwsem); 2700117973fbSAlex Elder 2701117973fbSAlex Elder return ret; 2702117973fbSAlex Elder } 2703117973fbSAlex Elder 27049d475de5SAlex Elder /* 270535938150SAlex Elder * Scan the rbd device's current snapshot list and compare it to the 270635938150SAlex Elder * newly-received snapshot context. Remove any existing snapshots 270735938150SAlex Elder * not present in the new snapshot context. Add a new snapshot for 270835938150SAlex Elder * any snaphots in the snapshot context not in the current list. 270935938150SAlex Elder * And verify there are no changes to snapshots we already know 271035938150SAlex Elder * about. 271135938150SAlex Elder * 271235938150SAlex Elder * Assumes the snapshots in the snapshot context are sorted by 271335938150SAlex Elder * snapshot id, highest id first. (Snapshots in the rbd_dev's list 271435938150SAlex Elder * are also maintained in that order.) 2715dfc5606dSYehuda Sadeh */ 2716304f6808SAlex Elder static int rbd_dev_snaps_update(struct rbd_device *rbd_dev) 2717dfc5606dSYehuda Sadeh { 271835938150SAlex Elder struct ceph_snap_context *snapc = rbd_dev->header.snapc; 271935938150SAlex Elder const u32 snap_count = snapc->num_snaps; 272035938150SAlex Elder struct list_head *head = &rbd_dev->snaps; 272135938150SAlex Elder struct list_head *links = head->next; 272235938150SAlex Elder u32 index = 0; 2723dfc5606dSYehuda Sadeh 27249fcbb800SAlex Elder dout("%s: snap count is %u\n", __func__, (unsigned int) snap_count); 272535938150SAlex Elder while (index < snap_count || links != head) { 272635938150SAlex Elder u64 snap_id; 272735938150SAlex Elder struct rbd_snap *snap; 2728cd892126SAlex Elder char *snap_name; 2729cd892126SAlex Elder u64 snap_size = 0; 2730cd892126SAlex Elder u64 snap_features = 0; 2731dfc5606dSYehuda Sadeh 273235938150SAlex Elder snap_id = index < snap_count ? snapc->snaps[index] 273335938150SAlex Elder : CEPH_NOSNAP; 273435938150SAlex Elder snap = links != head ? list_entry(links, struct rbd_snap, node) 273535938150SAlex Elder : NULL; 2736aafb230eSAlex Elder rbd_assert(!snap || snap->id != CEPH_NOSNAP); 2737dfc5606dSYehuda Sadeh 273835938150SAlex Elder if (snap_id == CEPH_NOSNAP || (snap && snap->id > snap_id)) { 273935938150SAlex Elder struct list_head *next = links->next; 2740dfc5606dSYehuda Sadeh 274135938150SAlex Elder /* Existing snapshot not in the new snap context */ 2742dfc5606dSYehuda Sadeh 27430d7dbfceSAlex Elder if (rbd_dev->spec->snap_id == snap->id) 2744daba5fdbSAlex Elder rbd_dev->exists = false; 274541f38c2bSAlex Elder rbd_remove_snap_dev(snap); 27469fcbb800SAlex Elder dout("%ssnap id %llu has been removed\n", 27470d7dbfceSAlex Elder rbd_dev->spec->snap_id == snap->id ? 27480d7dbfceSAlex Elder "mapped " : "", 27499fcbb800SAlex Elder (unsigned long long) snap->id); 2750dfc5606dSYehuda Sadeh 275135938150SAlex Elder /* Done with this list entry; advance */ 275235938150SAlex Elder 275335938150SAlex Elder links = next; 275435938150SAlex Elder continue; 2755dfc5606dSYehuda Sadeh } 275635938150SAlex Elder 2757b8b1e2dbSAlex Elder snap_name = rbd_dev_snap_info(rbd_dev, index, 2758cd892126SAlex Elder &snap_size, &snap_features); 2759cd892126SAlex Elder if (IS_ERR(snap_name)) 2760cd892126SAlex Elder return PTR_ERR(snap_name); 2761cd892126SAlex Elder 27629fcbb800SAlex Elder dout("entry %u: snap_id = %llu\n", (unsigned int) snap_count, 27639fcbb800SAlex Elder (unsigned long long) snap_id); 276435938150SAlex Elder if (!snap || (snap_id != CEPH_NOSNAP && snap->id < snap_id)) { 276535938150SAlex Elder struct rbd_snap *new_snap; 276635938150SAlex Elder 276735938150SAlex Elder /* We haven't seen this snapshot before */ 276835938150SAlex Elder 2769c8d18425SAlex Elder new_snap = __rbd_add_snap_dev(rbd_dev, snap_name, 2770cd892126SAlex Elder snap_id, snap_size, snap_features); 27719fcbb800SAlex Elder if (IS_ERR(new_snap)) { 27729fcbb800SAlex Elder int err = PTR_ERR(new_snap); 27739fcbb800SAlex Elder 27749fcbb800SAlex Elder dout(" failed to add dev, error %d\n", err); 27759fcbb800SAlex Elder 27769fcbb800SAlex Elder return err; 27779fcbb800SAlex Elder } 277835938150SAlex Elder 277935938150SAlex Elder /* New goes before existing, or at end of list */ 278035938150SAlex Elder 27819fcbb800SAlex Elder dout(" added dev%s\n", snap ? "" : " at end\n"); 278235938150SAlex Elder if (snap) 278335938150SAlex Elder list_add_tail(&new_snap->node, &snap->node); 278435938150SAlex Elder else 2785523f3258SAlex Elder list_add_tail(&new_snap->node, head); 278635938150SAlex Elder } else { 278735938150SAlex Elder /* Already have this one */ 278835938150SAlex Elder 27899fcbb800SAlex Elder dout(" already present\n"); 27909fcbb800SAlex Elder 2791cd892126SAlex Elder rbd_assert(snap->size == snap_size); 2792aafb230eSAlex Elder rbd_assert(!strcmp(snap->name, snap_name)); 2793cd892126SAlex Elder rbd_assert(snap->features == snap_features); 279435938150SAlex Elder 279535938150SAlex Elder /* Done with this list entry; advance */ 279635938150SAlex Elder 279735938150SAlex Elder links = links->next; 2798dfc5606dSYehuda Sadeh } 279935938150SAlex Elder 280035938150SAlex Elder /* Advance to the next entry in the snapshot context */ 280135938150SAlex Elder 280235938150SAlex Elder index++; 2803dfc5606dSYehuda Sadeh } 28049fcbb800SAlex Elder dout("%s: done\n", __func__); 2805dfc5606dSYehuda Sadeh 2806dfc5606dSYehuda Sadeh return 0; 2807dfc5606dSYehuda Sadeh } 2808dfc5606dSYehuda Sadeh 2809304f6808SAlex Elder /* 2810304f6808SAlex Elder * Scan the list of snapshots and register the devices for any that 2811304f6808SAlex Elder * have not already been registered. 2812304f6808SAlex Elder */ 2813304f6808SAlex Elder static int rbd_dev_snaps_register(struct rbd_device *rbd_dev) 2814304f6808SAlex Elder { 2815304f6808SAlex Elder struct rbd_snap *snap; 2816304f6808SAlex Elder int ret = 0; 2817304f6808SAlex Elder 2818304f6808SAlex Elder dout("%s called\n", __func__); 281986ff77bbSAlex Elder if (WARN_ON(!device_is_registered(&rbd_dev->dev))) 282086ff77bbSAlex Elder return -EIO; 2821304f6808SAlex Elder 2822304f6808SAlex Elder list_for_each_entry(snap, &rbd_dev->snaps, node) { 2823304f6808SAlex Elder if (!rbd_snap_registered(snap)) { 2824304f6808SAlex Elder ret = rbd_register_snap_dev(snap, &rbd_dev->dev); 2825304f6808SAlex Elder if (ret < 0) 2826304f6808SAlex Elder break; 2827304f6808SAlex Elder } 2828304f6808SAlex Elder } 2829304f6808SAlex Elder dout("%s: returning %d\n", __func__, ret); 2830304f6808SAlex Elder 2831304f6808SAlex Elder return ret; 2832304f6808SAlex Elder } 2833304f6808SAlex Elder 2834dfc5606dSYehuda Sadeh static int rbd_bus_add_dev(struct rbd_device *rbd_dev) 2835dfc5606dSYehuda Sadeh { 2836dfc5606dSYehuda Sadeh struct device *dev; 2837cd789ab9SAlex Elder int ret; 2838dfc5606dSYehuda Sadeh 2839dfc5606dSYehuda Sadeh mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); 2840dfc5606dSYehuda Sadeh 2841cd789ab9SAlex Elder dev = &rbd_dev->dev; 2842dfc5606dSYehuda Sadeh dev->bus = &rbd_bus_type; 2843dfc5606dSYehuda Sadeh dev->type = &rbd_device_type; 2844dfc5606dSYehuda Sadeh dev->parent = &rbd_root_dev; 2845dfc5606dSYehuda Sadeh dev->release = rbd_dev_release; 2846de71a297SAlex Elder dev_set_name(dev, "%d", rbd_dev->dev_id); 2847dfc5606dSYehuda Sadeh ret = device_register(dev); 2848dfc5606dSYehuda Sadeh 2849dfc5606dSYehuda Sadeh mutex_unlock(&ctl_mutex); 2850cd789ab9SAlex Elder 2851dfc5606dSYehuda Sadeh return ret; 2852602adf40SYehuda Sadeh } 2853602adf40SYehuda Sadeh 2854dfc5606dSYehuda Sadeh static void rbd_bus_del_dev(struct rbd_device *rbd_dev) 2855dfc5606dSYehuda Sadeh { 2856dfc5606dSYehuda Sadeh device_unregister(&rbd_dev->dev); 2857dfc5606dSYehuda Sadeh } 2858dfc5606dSYehuda Sadeh 285959c2be1eSYehuda Sadeh static int rbd_init_watch_dev(struct rbd_device *rbd_dev) 286059c2be1eSYehuda Sadeh { 286159c2be1eSYehuda Sadeh int ret, rc; 286259c2be1eSYehuda Sadeh 286359c2be1eSYehuda Sadeh do { 28640e6f322dSAlex Elder ret = rbd_req_sync_watch(rbd_dev); 286559c2be1eSYehuda Sadeh if (ret == -ERANGE) { 2866117973fbSAlex Elder rc = rbd_dev_refresh(rbd_dev, NULL); 286759c2be1eSYehuda Sadeh if (rc < 0) 286859c2be1eSYehuda Sadeh return rc; 286959c2be1eSYehuda Sadeh } 287059c2be1eSYehuda Sadeh } while (ret == -ERANGE); 287159c2be1eSYehuda Sadeh 287259c2be1eSYehuda Sadeh return ret; 287359c2be1eSYehuda Sadeh } 287459c2be1eSYehuda Sadeh 2875e2839308SAlex Elder static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0); 28761ddbe94eSAlex Elder 28771ddbe94eSAlex Elder /* 2878499afd5bSAlex Elder * Get a unique rbd identifier for the given new rbd_dev, and add 2879499afd5bSAlex Elder * the rbd_dev to the global list. The minimum rbd id is 1. 28801ddbe94eSAlex Elder */ 2881e2839308SAlex Elder static void rbd_dev_id_get(struct rbd_device *rbd_dev) 2882b7f23c36SAlex Elder { 2883e2839308SAlex Elder rbd_dev->dev_id = atomic64_inc_return(&rbd_dev_id_max); 2884499afd5bSAlex Elder 2885499afd5bSAlex Elder spin_lock(&rbd_dev_list_lock); 2886499afd5bSAlex Elder list_add_tail(&rbd_dev->node, &rbd_dev_list); 2887499afd5bSAlex Elder spin_unlock(&rbd_dev_list_lock); 2888e2839308SAlex Elder dout("rbd_dev %p given dev id %llu\n", rbd_dev, 2889e2839308SAlex Elder (unsigned long long) rbd_dev->dev_id); 2890b7f23c36SAlex Elder } 2891b7f23c36SAlex Elder 28921ddbe94eSAlex Elder /* 2893499afd5bSAlex Elder * Remove an rbd_dev from the global list, and record that its 2894499afd5bSAlex Elder * identifier is no longer in use. 28951ddbe94eSAlex Elder */ 2896e2839308SAlex Elder static void rbd_dev_id_put(struct rbd_device *rbd_dev) 28971ddbe94eSAlex Elder { 2898d184f6bfSAlex Elder struct list_head *tmp; 2899de71a297SAlex Elder int rbd_id = rbd_dev->dev_id; 2900d184f6bfSAlex Elder int max_id; 2901d184f6bfSAlex Elder 2902aafb230eSAlex Elder rbd_assert(rbd_id > 0); 2903499afd5bSAlex Elder 2904e2839308SAlex Elder dout("rbd_dev %p released dev id %llu\n", rbd_dev, 2905e2839308SAlex Elder (unsigned long long) rbd_dev->dev_id); 2906499afd5bSAlex Elder spin_lock(&rbd_dev_list_lock); 2907499afd5bSAlex Elder list_del_init(&rbd_dev->node); 2908d184f6bfSAlex Elder 2909d184f6bfSAlex Elder /* 2910d184f6bfSAlex Elder * If the id being "put" is not the current maximum, there 2911d184f6bfSAlex Elder * is nothing special we need to do. 2912d184f6bfSAlex Elder */ 2913e2839308SAlex Elder if (rbd_id != atomic64_read(&rbd_dev_id_max)) { 2914d184f6bfSAlex Elder spin_unlock(&rbd_dev_list_lock); 2915d184f6bfSAlex Elder return; 2916d184f6bfSAlex Elder } 2917d184f6bfSAlex Elder 2918d184f6bfSAlex Elder /* 2919d184f6bfSAlex Elder * We need to update the current maximum id. Search the 2920d184f6bfSAlex Elder * list to find out what it is. We're more likely to find 2921d184f6bfSAlex Elder * the maximum at the end, so search the list backward. 2922d184f6bfSAlex Elder */ 2923d184f6bfSAlex Elder max_id = 0; 2924d184f6bfSAlex Elder list_for_each_prev(tmp, &rbd_dev_list) { 2925d184f6bfSAlex Elder struct rbd_device *rbd_dev; 2926d184f6bfSAlex Elder 2927d184f6bfSAlex Elder rbd_dev = list_entry(tmp, struct rbd_device, node); 2928b213e0b1SAlex Elder if (rbd_dev->dev_id > max_id) 2929b213e0b1SAlex Elder max_id = rbd_dev->dev_id; 2930d184f6bfSAlex Elder } 2931499afd5bSAlex Elder spin_unlock(&rbd_dev_list_lock); 29321ddbe94eSAlex Elder 29331ddbe94eSAlex Elder /* 2934e2839308SAlex Elder * The max id could have been updated by rbd_dev_id_get(), in 2935d184f6bfSAlex Elder * which case it now accurately reflects the new maximum. 2936d184f6bfSAlex Elder * Be careful not to overwrite the maximum value in that 2937d184f6bfSAlex Elder * case. 29381ddbe94eSAlex Elder */ 2939e2839308SAlex Elder atomic64_cmpxchg(&rbd_dev_id_max, rbd_id, max_id); 2940e2839308SAlex Elder dout(" max dev id has been reset\n"); 2941b7f23c36SAlex Elder } 2942b7f23c36SAlex Elder 2943a725f65eSAlex Elder /* 2944e28fff26SAlex Elder * Skips over white space at *buf, and updates *buf to point to the 2945e28fff26SAlex Elder * first found non-space character (if any). Returns the length of 2946593a9e7bSAlex Elder * the token (string of non-white space characters) found. Note 2947593a9e7bSAlex Elder * that *buf must be terminated with '\0'. 2948e28fff26SAlex Elder */ 2949e28fff26SAlex Elder static inline size_t next_token(const char **buf) 2950e28fff26SAlex Elder { 2951e28fff26SAlex Elder /* 2952e28fff26SAlex Elder * These are the characters that produce nonzero for 2953e28fff26SAlex Elder * isspace() in the "C" and "POSIX" locales. 2954e28fff26SAlex Elder */ 2955e28fff26SAlex Elder const char *spaces = " \f\n\r\t\v"; 2956e28fff26SAlex Elder 2957e28fff26SAlex Elder *buf += strspn(*buf, spaces); /* Find start of token */ 2958e28fff26SAlex Elder 2959e28fff26SAlex Elder return strcspn(*buf, spaces); /* Return token length */ 2960e28fff26SAlex Elder } 2961e28fff26SAlex Elder 2962e28fff26SAlex Elder /* 2963e28fff26SAlex Elder * Finds the next token in *buf, and if the provided token buffer is 2964e28fff26SAlex Elder * big enough, copies the found token into it. The result, if 2965593a9e7bSAlex Elder * copied, is guaranteed to be terminated with '\0'. Note that *buf 2966593a9e7bSAlex Elder * must be terminated with '\0' on entry. 2967e28fff26SAlex Elder * 2968e28fff26SAlex Elder * Returns the length of the token found (not including the '\0'). 2969e28fff26SAlex Elder * Return value will be 0 if no token is found, and it will be >= 2970e28fff26SAlex Elder * token_size if the token would not fit. 2971e28fff26SAlex Elder * 2972593a9e7bSAlex Elder * The *buf pointer will be updated to point beyond the end of the 2973e28fff26SAlex Elder * found token. Note that this occurs even if the token buffer is 2974e28fff26SAlex Elder * too small to hold it. 2975e28fff26SAlex Elder */ 2976e28fff26SAlex Elder static inline size_t copy_token(const char **buf, 2977e28fff26SAlex Elder char *token, 2978e28fff26SAlex Elder size_t token_size) 2979e28fff26SAlex Elder { 2980e28fff26SAlex Elder size_t len; 2981e28fff26SAlex Elder 2982e28fff26SAlex Elder len = next_token(buf); 2983e28fff26SAlex Elder if (len < token_size) { 2984e28fff26SAlex Elder memcpy(token, *buf, len); 2985e28fff26SAlex Elder *(token + len) = '\0'; 2986e28fff26SAlex Elder } 2987e28fff26SAlex Elder *buf += len; 2988e28fff26SAlex Elder 2989e28fff26SAlex Elder return len; 2990e28fff26SAlex Elder } 2991e28fff26SAlex Elder 2992e28fff26SAlex Elder /* 2993ea3352f4SAlex Elder * Finds the next token in *buf, dynamically allocates a buffer big 2994ea3352f4SAlex Elder * enough to hold a copy of it, and copies the token into the new 2995ea3352f4SAlex Elder * buffer. The copy is guaranteed to be terminated with '\0'. Note 2996ea3352f4SAlex Elder * that a duplicate buffer is created even for a zero-length token. 2997ea3352f4SAlex Elder * 2998ea3352f4SAlex Elder * Returns a pointer to the newly-allocated duplicate, or a null 2999ea3352f4SAlex Elder * pointer if memory for the duplicate was not available. If 3000ea3352f4SAlex Elder * the lenp argument is a non-null pointer, the length of the token 3001ea3352f4SAlex Elder * (not including the '\0') is returned in *lenp. 3002ea3352f4SAlex Elder * 3003ea3352f4SAlex Elder * If successful, the *buf pointer will be updated to point beyond 3004ea3352f4SAlex Elder * the end of the found token. 3005ea3352f4SAlex Elder * 3006ea3352f4SAlex Elder * Note: uses GFP_KERNEL for allocation. 3007ea3352f4SAlex Elder */ 3008ea3352f4SAlex Elder static inline char *dup_token(const char **buf, size_t *lenp) 3009ea3352f4SAlex Elder { 3010ea3352f4SAlex Elder char *dup; 3011ea3352f4SAlex Elder size_t len; 3012ea3352f4SAlex Elder 3013ea3352f4SAlex Elder len = next_token(buf); 3014ea3352f4SAlex Elder dup = kmalloc(len + 1, GFP_KERNEL); 3015ea3352f4SAlex Elder if (!dup) 3016ea3352f4SAlex Elder return NULL; 3017ea3352f4SAlex Elder 3018ea3352f4SAlex Elder memcpy(dup, *buf, len); 3019ea3352f4SAlex Elder *(dup + len) = '\0'; 3020ea3352f4SAlex Elder *buf += len; 3021ea3352f4SAlex Elder 3022ea3352f4SAlex Elder if (lenp) 3023ea3352f4SAlex Elder *lenp = len; 3024ea3352f4SAlex Elder 3025ea3352f4SAlex Elder return dup; 3026ea3352f4SAlex Elder } 3027ea3352f4SAlex Elder 3028ea3352f4SAlex Elder /* 3029859c31dfSAlex Elder * Parse the options provided for an "rbd add" (i.e., rbd image 3030859c31dfSAlex Elder * mapping) request. These arrive via a write to /sys/bus/rbd/add, 3031859c31dfSAlex Elder * and the data written is passed here via a NUL-terminated buffer. 3032859c31dfSAlex Elder * Returns 0 if successful or an error code otherwise. 3033d22f76e7SAlex Elder * 3034859c31dfSAlex Elder * The information extracted from these options is recorded in 3035859c31dfSAlex Elder * the other parameters which return dynamically-allocated 3036859c31dfSAlex Elder * structures: 3037859c31dfSAlex Elder * ceph_opts 3038859c31dfSAlex Elder * The address of a pointer that will refer to a ceph options 3039859c31dfSAlex Elder * structure. Caller must release the returned pointer using 3040859c31dfSAlex Elder * ceph_destroy_options() when it is no longer needed. 3041859c31dfSAlex Elder * rbd_opts 3042859c31dfSAlex Elder * Address of an rbd options pointer. Fully initialized by 3043859c31dfSAlex Elder * this function; caller must release with kfree(). 3044859c31dfSAlex Elder * spec 3045859c31dfSAlex Elder * Address of an rbd image specification pointer. Fully 3046859c31dfSAlex Elder * initialized by this function based on parsed options. 3047859c31dfSAlex Elder * Caller must release with rbd_spec_put(). 3048859c31dfSAlex Elder * 3049859c31dfSAlex Elder * The options passed take this form: 3050859c31dfSAlex Elder * <mon_addrs> <options> <pool_name> <image_name> [<snap_id>] 3051859c31dfSAlex Elder * where: 3052859c31dfSAlex Elder * <mon_addrs> 3053859c31dfSAlex Elder * A comma-separated list of one or more monitor addresses. 3054859c31dfSAlex Elder * A monitor address is an ip address, optionally followed 3055859c31dfSAlex Elder * by a port number (separated by a colon). 3056859c31dfSAlex Elder * I.e.: ip1[:port1][,ip2[:port2]...] 3057859c31dfSAlex Elder * <options> 3058859c31dfSAlex Elder * A comma-separated list of ceph and/or rbd options. 3059859c31dfSAlex Elder * <pool_name> 3060859c31dfSAlex Elder * The name of the rados pool containing the rbd image. 3061859c31dfSAlex Elder * <image_name> 3062859c31dfSAlex Elder * The name of the image in that pool to map. 3063859c31dfSAlex Elder * <snap_id> 3064859c31dfSAlex Elder * An optional snapshot id. If provided, the mapping will 3065859c31dfSAlex Elder * present data from the image at the time that snapshot was 3066859c31dfSAlex Elder * created. The image head is used if no snapshot id is 3067859c31dfSAlex Elder * provided. Snapshot mappings are always read-only. 3068a725f65eSAlex Elder */ 3069859c31dfSAlex Elder static int rbd_add_parse_args(const char *buf, 3070dc79b113SAlex Elder struct ceph_options **ceph_opts, 3071859c31dfSAlex Elder struct rbd_options **opts, 3072859c31dfSAlex Elder struct rbd_spec **rbd_spec) 3073a725f65eSAlex Elder { 3074e28fff26SAlex Elder size_t len; 3075859c31dfSAlex Elder char *options; 30760ddebc0cSAlex Elder const char *mon_addrs; 30770ddebc0cSAlex Elder size_t mon_addrs_size; 3078859c31dfSAlex Elder struct rbd_spec *spec = NULL; 30794e9afebaSAlex Elder struct rbd_options *rbd_opts = NULL; 3080859c31dfSAlex Elder struct ceph_options *copts; 3081dc79b113SAlex Elder int ret; 3082e28fff26SAlex Elder 3083e28fff26SAlex Elder /* The first four tokens are required */ 3084e28fff26SAlex Elder 30857ef3214aSAlex Elder len = next_token(&buf); 30867ef3214aSAlex Elder if (!len) 3087dc79b113SAlex Elder return -EINVAL; /* Missing monitor address(es) */ 30880ddebc0cSAlex Elder mon_addrs = buf; 3089f28e565aSAlex Elder mon_addrs_size = len + 1; 30907ef3214aSAlex Elder buf += len; 3091a725f65eSAlex Elder 3092dc79b113SAlex Elder ret = -EINVAL; 3093f28e565aSAlex Elder options = dup_token(&buf, NULL); 3094f28e565aSAlex Elder if (!options) 3095dc79b113SAlex Elder return -ENOMEM; 3096f28e565aSAlex Elder if (!*options) 3097f28e565aSAlex Elder goto out_err; /* Missing options */ 3098a725f65eSAlex Elder 3099859c31dfSAlex Elder spec = rbd_spec_alloc(); 3100859c31dfSAlex Elder if (!spec) 3101f28e565aSAlex Elder goto out_mem; 3102859c31dfSAlex Elder 3103859c31dfSAlex Elder spec->pool_name = dup_token(&buf, NULL); 3104859c31dfSAlex Elder if (!spec->pool_name) 3105859c31dfSAlex Elder goto out_mem; 3106859c31dfSAlex Elder if (!*spec->pool_name) 3107f28e565aSAlex Elder goto out_err; /* Missing pool name */ 3108e28fff26SAlex Elder 3109859c31dfSAlex Elder spec->image_name = dup_token(&buf, &spec->image_name_len); 3110859c31dfSAlex Elder if (!spec->image_name) 3111f28e565aSAlex Elder goto out_mem; 3112859c31dfSAlex Elder if (!*spec->image_name) 3113f28e565aSAlex Elder goto out_err; /* Missing image name */ 3114e28fff26SAlex Elder 3115f28e565aSAlex Elder /* 3116f28e565aSAlex Elder * Snapshot name is optional; default is to use "-" 3117f28e565aSAlex Elder * (indicating the head/no snapshot). 3118f28e565aSAlex Elder */ 31193feeb894SAlex Elder len = next_token(&buf); 3120820a5f3eSAlex Elder if (!len) { 31213feeb894SAlex Elder buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */ 31223feeb894SAlex Elder len = sizeof (RBD_SNAP_HEAD_NAME) - 1; 3123f28e565aSAlex Elder } else if (len > RBD_MAX_SNAP_NAME_LEN) { 3124dc79b113SAlex Elder ret = -ENAMETOOLONG; 3125f28e565aSAlex Elder goto out_err; 3126849b4260SAlex Elder } 3127859c31dfSAlex Elder spec->snap_name = kmalloc(len + 1, GFP_KERNEL); 3128859c31dfSAlex Elder if (!spec->snap_name) 3129f28e565aSAlex Elder goto out_mem; 3130859c31dfSAlex Elder memcpy(spec->snap_name, buf, len); 3131859c31dfSAlex Elder *(spec->snap_name + len) = '\0'; 3132e5c35534SAlex Elder 31330ddebc0cSAlex Elder /* Initialize all rbd options to the defaults */ 3134e28fff26SAlex Elder 31354e9afebaSAlex Elder rbd_opts = kzalloc(sizeof (*rbd_opts), GFP_KERNEL); 31364e9afebaSAlex Elder if (!rbd_opts) 31374e9afebaSAlex Elder goto out_mem; 31384e9afebaSAlex Elder 31394e9afebaSAlex Elder rbd_opts->read_only = RBD_READ_ONLY_DEFAULT; 3140d22f76e7SAlex Elder 3141859c31dfSAlex Elder copts = ceph_parse_options(options, mon_addrs, 31420ddebc0cSAlex Elder mon_addrs + mon_addrs_size - 1, 31434e9afebaSAlex Elder parse_rbd_opts_token, rbd_opts); 3144859c31dfSAlex Elder if (IS_ERR(copts)) { 3145859c31dfSAlex Elder ret = PTR_ERR(copts); 3146dc79b113SAlex Elder goto out_err; 3147dc79b113SAlex Elder } 3148859c31dfSAlex Elder kfree(options); 3149859c31dfSAlex Elder 3150859c31dfSAlex Elder *ceph_opts = copts; 31514e9afebaSAlex Elder *opts = rbd_opts; 3152859c31dfSAlex Elder *rbd_spec = spec; 31530ddebc0cSAlex Elder 3154dc79b113SAlex Elder return 0; 3155f28e565aSAlex Elder out_mem: 3156dc79b113SAlex Elder ret = -ENOMEM; 3157d22f76e7SAlex Elder out_err: 3158859c31dfSAlex Elder kfree(rbd_opts); 3159859c31dfSAlex Elder rbd_spec_put(spec); 3160f28e565aSAlex Elder kfree(options); 3161d22f76e7SAlex Elder 3162dc79b113SAlex Elder return ret; 3163a725f65eSAlex Elder } 3164a725f65eSAlex Elder 3165589d30e0SAlex Elder /* 3166589d30e0SAlex Elder * An rbd format 2 image has a unique identifier, distinct from the 3167589d30e0SAlex Elder * name given to it by the user. Internally, that identifier is 3168589d30e0SAlex Elder * what's used to specify the names of objects related to the image. 3169589d30e0SAlex Elder * 3170589d30e0SAlex Elder * A special "rbd id" object is used to map an rbd image name to its 3171589d30e0SAlex Elder * id. If that object doesn't exist, then there is no v2 rbd image 3172589d30e0SAlex Elder * with the supplied name. 3173589d30e0SAlex Elder * 3174589d30e0SAlex Elder * This function will record the given rbd_dev's image_id field if 3175589d30e0SAlex Elder * it can be determined, and in that case will return 0. If any 3176589d30e0SAlex Elder * errors occur a negative errno will be returned and the rbd_dev's 3177589d30e0SAlex Elder * image_id field will be unchanged (and should be NULL). 3178589d30e0SAlex Elder */ 3179589d30e0SAlex Elder static int rbd_dev_image_id(struct rbd_device *rbd_dev) 3180589d30e0SAlex Elder { 3181589d30e0SAlex Elder int ret; 3182589d30e0SAlex Elder size_t size; 3183589d30e0SAlex Elder char *object_name; 3184589d30e0SAlex Elder void *response; 3185589d30e0SAlex Elder void *p; 3186589d30e0SAlex Elder 3187589d30e0SAlex Elder /* 31882c0d0a10SAlex Elder * When probing a parent image, the image id is already 31892c0d0a10SAlex Elder * known (and the image name likely is not). There's no 31902c0d0a10SAlex Elder * need to fetch the image id again in this case. 31912c0d0a10SAlex Elder */ 31922c0d0a10SAlex Elder if (rbd_dev->spec->image_id) 31932c0d0a10SAlex Elder return 0; 31942c0d0a10SAlex Elder 31952c0d0a10SAlex Elder /* 3196589d30e0SAlex Elder * First, see if the format 2 image id file exists, and if 3197589d30e0SAlex Elder * so, get the image's persistent id from it. 3198589d30e0SAlex Elder */ 31990d7dbfceSAlex Elder size = sizeof (RBD_ID_PREFIX) + rbd_dev->spec->image_name_len; 3200589d30e0SAlex Elder object_name = kmalloc(size, GFP_NOIO); 3201589d30e0SAlex Elder if (!object_name) 3202589d30e0SAlex Elder return -ENOMEM; 32030d7dbfceSAlex Elder sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->spec->image_name); 3204589d30e0SAlex Elder dout("rbd id object name is %s\n", object_name); 3205589d30e0SAlex Elder 3206589d30e0SAlex Elder /* Response will be an encoded string, which includes a length */ 3207589d30e0SAlex Elder 3208589d30e0SAlex Elder size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX; 3209589d30e0SAlex Elder response = kzalloc(size, GFP_NOIO); 3210589d30e0SAlex Elder if (!response) { 3211589d30e0SAlex Elder ret = -ENOMEM; 3212589d30e0SAlex Elder goto out; 3213589d30e0SAlex Elder } 3214589d30e0SAlex Elder 3215589d30e0SAlex Elder ret = rbd_req_sync_exec(rbd_dev, object_name, 3216589d30e0SAlex Elder "rbd", "get_id", 3217589d30e0SAlex Elder NULL, 0, 3218589d30e0SAlex Elder response, RBD_IMAGE_ID_LEN_MAX, 3219589d30e0SAlex Elder CEPH_OSD_FLAG_READ, NULL); 3220589d30e0SAlex Elder dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret); 3221589d30e0SAlex Elder if (ret < 0) 3222589d30e0SAlex Elder goto out; 3223a0ea3a40SAlex Elder ret = 0; /* rbd_req_sync_exec() can return positive */ 3224589d30e0SAlex Elder 3225589d30e0SAlex Elder p = response; 32260d7dbfceSAlex Elder rbd_dev->spec->image_id = ceph_extract_encoded_string(&p, 3227589d30e0SAlex Elder p + RBD_IMAGE_ID_LEN_MAX, 32280d7dbfceSAlex Elder &rbd_dev->spec->image_id_len, 3229589d30e0SAlex Elder GFP_NOIO); 32300d7dbfceSAlex Elder if (IS_ERR(rbd_dev->spec->image_id)) { 32310d7dbfceSAlex Elder ret = PTR_ERR(rbd_dev->spec->image_id); 32320d7dbfceSAlex Elder rbd_dev->spec->image_id = NULL; 3233589d30e0SAlex Elder } else { 32340d7dbfceSAlex Elder dout("image_id is %s\n", rbd_dev->spec->image_id); 3235589d30e0SAlex Elder } 3236589d30e0SAlex Elder out: 3237589d30e0SAlex Elder kfree(response); 3238589d30e0SAlex Elder kfree(object_name); 3239589d30e0SAlex Elder 3240589d30e0SAlex Elder return ret; 3241589d30e0SAlex Elder } 3242589d30e0SAlex Elder 3243a30b71b9SAlex Elder static int rbd_dev_v1_probe(struct rbd_device *rbd_dev) 3244a30b71b9SAlex Elder { 3245a30b71b9SAlex Elder int ret; 3246a30b71b9SAlex Elder size_t size; 3247a30b71b9SAlex Elder 3248a30b71b9SAlex Elder /* Version 1 images have no id; empty string is used */ 3249a30b71b9SAlex Elder 32500d7dbfceSAlex Elder rbd_dev->spec->image_id = kstrdup("", GFP_KERNEL); 32510d7dbfceSAlex Elder if (!rbd_dev->spec->image_id) 3252a30b71b9SAlex Elder return -ENOMEM; 32530d7dbfceSAlex Elder rbd_dev->spec->image_id_len = 0; 3254a30b71b9SAlex Elder 3255a30b71b9SAlex Elder /* Record the header object name for this rbd image. */ 3256a30b71b9SAlex Elder 32570d7dbfceSAlex Elder size = rbd_dev->spec->image_name_len + sizeof (RBD_SUFFIX); 3258a30b71b9SAlex Elder rbd_dev->header_name = kmalloc(size, GFP_KERNEL); 3259a30b71b9SAlex Elder if (!rbd_dev->header_name) { 3260a30b71b9SAlex Elder ret = -ENOMEM; 3261a30b71b9SAlex Elder goto out_err; 3262a30b71b9SAlex Elder } 32630d7dbfceSAlex Elder sprintf(rbd_dev->header_name, "%s%s", 32640d7dbfceSAlex Elder rbd_dev->spec->image_name, RBD_SUFFIX); 3265a30b71b9SAlex Elder 3266a30b71b9SAlex Elder /* Populate rbd image metadata */ 3267a30b71b9SAlex Elder 3268a30b71b9SAlex Elder ret = rbd_read_header(rbd_dev, &rbd_dev->header); 3269a30b71b9SAlex Elder if (ret < 0) 3270a30b71b9SAlex Elder goto out_err; 327186b00e0dSAlex Elder 327286b00e0dSAlex Elder /* Version 1 images have no parent (no layering) */ 327386b00e0dSAlex Elder 327486b00e0dSAlex Elder rbd_dev->parent_spec = NULL; 327586b00e0dSAlex Elder rbd_dev->parent_overlap = 0; 327686b00e0dSAlex Elder 3277a30b71b9SAlex Elder rbd_dev->image_format = 1; 3278a30b71b9SAlex Elder 3279a30b71b9SAlex Elder dout("discovered version 1 image, header name is %s\n", 3280a30b71b9SAlex Elder rbd_dev->header_name); 3281a30b71b9SAlex Elder 3282a30b71b9SAlex Elder return 0; 3283a30b71b9SAlex Elder 3284a30b71b9SAlex Elder out_err: 3285a30b71b9SAlex Elder kfree(rbd_dev->header_name); 3286a30b71b9SAlex Elder rbd_dev->header_name = NULL; 32870d7dbfceSAlex Elder kfree(rbd_dev->spec->image_id); 32880d7dbfceSAlex Elder rbd_dev->spec->image_id = NULL; 3289a30b71b9SAlex Elder 3290a30b71b9SAlex Elder return ret; 3291a30b71b9SAlex Elder } 3292a30b71b9SAlex Elder 3293a30b71b9SAlex Elder static int rbd_dev_v2_probe(struct rbd_device *rbd_dev) 3294a30b71b9SAlex Elder { 3295a30b71b9SAlex Elder size_t size; 32969d475de5SAlex Elder int ret; 32976e14b1a6SAlex Elder u64 ver = 0; 3298a30b71b9SAlex Elder 3299a30b71b9SAlex Elder /* 3300a30b71b9SAlex Elder * Image id was filled in by the caller. Record the header 3301a30b71b9SAlex Elder * object name for this rbd image. 3302a30b71b9SAlex Elder */ 33030d7dbfceSAlex Elder size = sizeof (RBD_HEADER_PREFIX) + rbd_dev->spec->image_id_len; 3304a30b71b9SAlex Elder rbd_dev->header_name = kmalloc(size, GFP_KERNEL); 3305a30b71b9SAlex Elder if (!rbd_dev->header_name) 3306a30b71b9SAlex Elder return -ENOMEM; 3307a30b71b9SAlex Elder sprintf(rbd_dev->header_name, "%s%s", 33080d7dbfceSAlex Elder RBD_HEADER_PREFIX, rbd_dev->spec->image_id); 33099d475de5SAlex Elder 33109d475de5SAlex Elder /* Get the size and object order for the image */ 33119d475de5SAlex Elder 33129d475de5SAlex Elder ret = rbd_dev_v2_image_size(rbd_dev); 33139d475de5SAlex Elder if (ret < 0) 33149d475de5SAlex Elder goto out_err; 33151e130199SAlex Elder 33161e130199SAlex Elder /* Get the object prefix (a.k.a. block_name) for the image */ 33171e130199SAlex Elder 33181e130199SAlex Elder ret = rbd_dev_v2_object_prefix(rbd_dev); 33191e130199SAlex Elder if (ret < 0) 33201e130199SAlex Elder goto out_err; 3321b1b5402aSAlex Elder 3322d889140cSAlex Elder /* Get the and check features for the image */ 3323b1b5402aSAlex Elder 3324b1b5402aSAlex Elder ret = rbd_dev_v2_features(rbd_dev); 3325b1b5402aSAlex Elder if (ret < 0) 3326b1b5402aSAlex Elder goto out_err; 332735d489f9SAlex Elder 332886b00e0dSAlex Elder /* If the image supports layering, get the parent info */ 332986b00e0dSAlex Elder 333086b00e0dSAlex Elder if (rbd_dev->header.features & RBD_FEATURE_LAYERING) { 333186b00e0dSAlex Elder ret = rbd_dev_v2_parent_info(rbd_dev); 333286b00e0dSAlex Elder if (ret < 0) 333386b00e0dSAlex Elder goto out_err; 333486b00e0dSAlex Elder } 333586b00e0dSAlex Elder 33366e14b1a6SAlex Elder /* crypto and compression type aren't (yet) supported for v2 images */ 333735d489f9SAlex Elder 33386e14b1a6SAlex Elder rbd_dev->header.crypt_type = 0; 33396e14b1a6SAlex Elder rbd_dev->header.comp_type = 0; 33406e14b1a6SAlex Elder 33416e14b1a6SAlex Elder /* Get the snapshot context, plus the header version */ 33426e14b1a6SAlex Elder 33436e14b1a6SAlex Elder ret = rbd_dev_v2_snap_context(rbd_dev, &ver); 334435d489f9SAlex Elder if (ret) 334535d489f9SAlex Elder goto out_err; 33466e14b1a6SAlex Elder rbd_dev->header.obj_version = ver; 33476e14b1a6SAlex Elder 3348a30b71b9SAlex Elder rbd_dev->image_format = 2; 3349a30b71b9SAlex Elder 3350a30b71b9SAlex Elder dout("discovered version 2 image, header name is %s\n", 3351a30b71b9SAlex Elder rbd_dev->header_name); 3352a30b71b9SAlex Elder 335335152979SAlex Elder return 0; 33549d475de5SAlex Elder out_err: 335586b00e0dSAlex Elder rbd_dev->parent_overlap = 0; 335686b00e0dSAlex Elder rbd_spec_put(rbd_dev->parent_spec); 335786b00e0dSAlex Elder rbd_dev->parent_spec = NULL; 33589d475de5SAlex Elder kfree(rbd_dev->header_name); 33599d475de5SAlex Elder rbd_dev->header_name = NULL; 33601e130199SAlex Elder kfree(rbd_dev->header.object_prefix); 33611e130199SAlex Elder rbd_dev->header.object_prefix = NULL; 33629d475de5SAlex Elder 33639d475de5SAlex Elder return ret; 3364a30b71b9SAlex Elder } 3365a30b71b9SAlex Elder 336683a06263SAlex Elder static int rbd_dev_probe_finish(struct rbd_device *rbd_dev) 336783a06263SAlex Elder { 336883a06263SAlex Elder int ret; 336983a06263SAlex Elder 337083a06263SAlex Elder /* no need to lock here, as rbd_dev is not registered yet */ 337183a06263SAlex Elder ret = rbd_dev_snaps_update(rbd_dev); 337283a06263SAlex Elder if (ret) 337383a06263SAlex Elder return ret; 337483a06263SAlex Elder 337583a06263SAlex Elder ret = rbd_dev_set_mapping(rbd_dev); 337683a06263SAlex Elder if (ret) 337783a06263SAlex Elder goto err_out_snaps; 337883a06263SAlex Elder 337983a06263SAlex Elder /* generate unique id: find highest unique id, add one */ 338083a06263SAlex Elder rbd_dev_id_get(rbd_dev); 338183a06263SAlex Elder 338283a06263SAlex Elder /* Fill in the device name, now that we have its id. */ 338383a06263SAlex Elder BUILD_BUG_ON(DEV_NAME_LEN 338483a06263SAlex Elder < sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH); 338583a06263SAlex Elder sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id); 338683a06263SAlex Elder 338783a06263SAlex Elder /* Get our block major device number. */ 338883a06263SAlex Elder 338983a06263SAlex Elder ret = register_blkdev(0, rbd_dev->name); 339083a06263SAlex Elder if (ret < 0) 339183a06263SAlex Elder goto err_out_id; 339283a06263SAlex Elder rbd_dev->major = ret; 339383a06263SAlex Elder 339483a06263SAlex Elder /* Set up the blkdev mapping. */ 339583a06263SAlex Elder 339683a06263SAlex Elder ret = rbd_init_disk(rbd_dev); 339783a06263SAlex Elder if (ret) 339883a06263SAlex Elder goto err_out_blkdev; 339983a06263SAlex Elder 340083a06263SAlex Elder ret = rbd_bus_add_dev(rbd_dev); 340183a06263SAlex Elder if (ret) 340283a06263SAlex Elder goto err_out_disk; 340383a06263SAlex Elder 340483a06263SAlex Elder /* 340583a06263SAlex Elder * At this point cleanup in the event of an error is the job 340683a06263SAlex Elder * of the sysfs code (initiated by rbd_bus_del_dev()). 340783a06263SAlex Elder */ 340883a06263SAlex Elder down_write(&rbd_dev->header_rwsem); 340983a06263SAlex Elder ret = rbd_dev_snaps_register(rbd_dev); 341083a06263SAlex Elder up_write(&rbd_dev->header_rwsem); 341183a06263SAlex Elder if (ret) 341283a06263SAlex Elder goto err_out_bus; 341383a06263SAlex Elder 341483a06263SAlex Elder ret = rbd_init_watch_dev(rbd_dev); 341583a06263SAlex Elder if (ret) 341683a06263SAlex Elder goto err_out_bus; 341783a06263SAlex Elder 341883a06263SAlex Elder /* Everything's ready. Announce the disk to the world. */ 341983a06263SAlex Elder 342083a06263SAlex Elder add_disk(rbd_dev->disk); 342183a06263SAlex Elder 342283a06263SAlex Elder pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name, 342383a06263SAlex Elder (unsigned long long) rbd_dev->mapping.size); 342483a06263SAlex Elder 342583a06263SAlex Elder return ret; 342683a06263SAlex Elder err_out_bus: 342783a06263SAlex Elder /* this will also clean up rest of rbd_dev stuff */ 342883a06263SAlex Elder 342983a06263SAlex Elder rbd_bus_del_dev(rbd_dev); 343083a06263SAlex Elder 343183a06263SAlex Elder return ret; 343283a06263SAlex Elder err_out_disk: 343383a06263SAlex Elder rbd_free_disk(rbd_dev); 343483a06263SAlex Elder err_out_blkdev: 343583a06263SAlex Elder unregister_blkdev(rbd_dev->major, rbd_dev->name); 343683a06263SAlex Elder err_out_id: 343783a06263SAlex Elder rbd_dev_id_put(rbd_dev); 343883a06263SAlex Elder err_out_snaps: 343983a06263SAlex Elder rbd_remove_all_snaps(rbd_dev); 344083a06263SAlex Elder 344183a06263SAlex Elder return ret; 344283a06263SAlex Elder } 344383a06263SAlex Elder 3444a30b71b9SAlex Elder /* 3445a30b71b9SAlex Elder * Probe for the existence of the header object for the given rbd 3446a30b71b9SAlex Elder * device. For format 2 images this includes determining the image 3447a30b71b9SAlex Elder * id. 3448a30b71b9SAlex Elder */ 3449a30b71b9SAlex Elder static int rbd_dev_probe(struct rbd_device *rbd_dev) 3450a30b71b9SAlex Elder { 3451a30b71b9SAlex Elder int ret; 3452a30b71b9SAlex Elder 3453a30b71b9SAlex Elder /* 3454a30b71b9SAlex Elder * Get the id from the image id object. If it's not a 3455a30b71b9SAlex Elder * format 2 image, we'll get ENOENT back, and we'll assume 3456a30b71b9SAlex Elder * it's a format 1 image. 3457a30b71b9SAlex Elder */ 3458a30b71b9SAlex Elder ret = rbd_dev_image_id(rbd_dev); 3459a30b71b9SAlex Elder if (ret) 3460a30b71b9SAlex Elder ret = rbd_dev_v1_probe(rbd_dev); 3461a30b71b9SAlex Elder else 3462a30b71b9SAlex Elder ret = rbd_dev_v2_probe(rbd_dev); 346383a06263SAlex Elder if (ret) { 3464a30b71b9SAlex Elder dout("probe failed, returning %d\n", ret); 3465a30b71b9SAlex Elder 3466a30b71b9SAlex Elder return ret; 3467a30b71b9SAlex Elder } 3468a30b71b9SAlex Elder 346983a06263SAlex Elder ret = rbd_dev_probe_finish(rbd_dev); 347083a06263SAlex Elder if (ret) 347183a06263SAlex Elder rbd_header_free(&rbd_dev->header); 347283a06263SAlex Elder 347383a06263SAlex Elder return ret; 347483a06263SAlex Elder } 347583a06263SAlex Elder 347659c2be1eSYehuda Sadeh static ssize_t rbd_add(struct bus_type *bus, 347759c2be1eSYehuda Sadeh const char *buf, 347859c2be1eSYehuda Sadeh size_t count) 3479602adf40SYehuda Sadeh { 3480cb8627c7SAlex Elder struct rbd_device *rbd_dev = NULL; 3481dc79b113SAlex Elder struct ceph_options *ceph_opts = NULL; 34824e9afebaSAlex Elder struct rbd_options *rbd_opts = NULL; 3483859c31dfSAlex Elder struct rbd_spec *spec = NULL; 34849d3997fdSAlex Elder struct rbd_client *rbdc; 348527cc2594SAlex Elder struct ceph_osd_client *osdc; 348627cc2594SAlex Elder int rc = -ENOMEM; 3487602adf40SYehuda Sadeh 3488602adf40SYehuda Sadeh if (!try_module_get(THIS_MODULE)) 3489602adf40SYehuda Sadeh return -ENODEV; 3490602adf40SYehuda Sadeh 3491a725f65eSAlex Elder /* parse add command */ 3492859c31dfSAlex Elder rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec); 3493dc79b113SAlex Elder if (rc < 0) 3494bd4ba655SAlex Elder goto err_out_module; 3495a725f65eSAlex Elder 34969d3997fdSAlex Elder rbdc = rbd_get_client(ceph_opts); 34979d3997fdSAlex Elder if (IS_ERR(rbdc)) { 34989d3997fdSAlex Elder rc = PTR_ERR(rbdc); 34990ddebc0cSAlex Elder goto err_out_args; 35009d3997fdSAlex Elder } 3501c53d5893SAlex Elder ceph_opts = NULL; /* rbd_dev client now owns this */ 3502602adf40SYehuda Sadeh 3503602adf40SYehuda Sadeh /* pick the pool */ 35049d3997fdSAlex Elder osdc = &rbdc->client->osdc; 3505859c31dfSAlex Elder rc = ceph_pg_poolid_by_name(osdc->osdmap, spec->pool_name); 3506602adf40SYehuda Sadeh if (rc < 0) 3507602adf40SYehuda Sadeh goto err_out_client; 3508859c31dfSAlex Elder spec->pool_id = (u64) rc; 3509859c31dfSAlex Elder 3510c53d5893SAlex Elder rbd_dev = rbd_dev_create(rbdc, spec); 3511bd4ba655SAlex Elder if (!rbd_dev) 3512bd4ba655SAlex Elder goto err_out_client; 3513c53d5893SAlex Elder rbdc = NULL; /* rbd_dev now owns this */ 3514c53d5893SAlex Elder spec = NULL; /* rbd_dev now owns this */ 3515602adf40SYehuda Sadeh 3516bd4ba655SAlex Elder rbd_dev->mapping.read_only = rbd_opts->read_only; 3517c53d5893SAlex Elder kfree(rbd_opts); 3518c53d5893SAlex Elder rbd_opts = NULL; /* done with this */ 3519bd4ba655SAlex Elder 3520a30b71b9SAlex Elder rc = rbd_dev_probe(rbd_dev); 3521a30b71b9SAlex Elder if (rc < 0) 3522c53d5893SAlex Elder goto err_out_rbd_dev; 352305fd6f6fSAlex Elder 3524602adf40SYehuda Sadeh return count; 3525c53d5893SAlex Elder err_out_rbd_dev: 3526c53d5893SAlex Elder rbd_dev_destroy(rbd_dev); 3527bd4ba655SAlex Elder err_out_client: 35289d3997fdSAlex Elder rbd_put_client(rbdc); 35290ddebc0cSAlex Elder err_out_args: 353078cea76eSAlex Elder if (ceph_opts) 353178cea76eSAlex Elder ceph_destroy_options(ceph_opts); 35324e9afebaSAlex Elder kfree(rbd_opts); 3533859c31dfSAlex Elder rbd_spec_put(spec); 3534bd4ba655SAlex Elder err_out_module: 3535bd4ba655SAlex Elder module_put(THIS_MODULE); 353627cc2594SAlex Elder 3537602adf40SYehuda Sadeh dout("Error adding device %s\n", buf); 353827cc2594SAlex Elder 353927cc2594SAlex Elder return (ssize_t) rc; 3540602adf40SYehuda Sadeh } 3541602adf40SYehuda Sadeh 3542de71a297SAlex Elder static struct rbd_device *__rbd_get_dev(unsigned long dev_id) 3543602adf40SYehuda Sadeh { 3544602adf40SYehuda Sadeh struct list_head *tmp; 3545602adf40SYehuda Sadeh struct rbd_device *rbd_dev; 3546602adf40SYehuda Sadeh 3547e124a82fSAlex Elder spin_lock(&rbd_dev_list_lock); 3548602adf40SYehuda Sadeh list_for_each(tmp, &rbd_dev_list) { 3549602adf40SYehuda Sadeh rbd_dev = list_entry(tmp, struct rbd_device, node); 3550de71a297SAlex Elder if (rbd_dev->dev_id == dev_id) { 3551e124a82fSAlex Elder spin_unlock(&rbd_dev_list_lock); 3552602adf40SYehuda Sadeh return rbd_dev; 3553602adf40SYehuda Sadeh } 3554e124a82fSAlex Elder } 3555e124a82fSAlex Elder spin_unlock(&rbd_dev_list_lock); 3556602adf40SYehuda Sadeh return NULL; 3557602adf40SYehuda Sadeh } 3558602adf40SYehuda Sadeh 3559dfc5606dSYehuda Sadeh static void rbd_dev_release(struct device *dev) 3560602adf40SYehuda Sadeh { 3561593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 3562602adf40SYehuda Sadeh 35631dbb4399SAlex Elder if (rbd_dev->watch_request) { 35641dbb4399SAlex Elder struct ceph_client *client = rbd_dev->rbd_client->client; 35651dbb4399SAlex Elder 35661dbb4399SAlex Elder ceph_osdc_unregister_linger_request(&client->osdc, 356759c2be1eSYehuda Sadeh rbd_dev->watch_request); 35681dbb4399SAlex Elder } 356959c2be1eSYehuda Sadeh if (rbd_dev->watch_event) 3570070c633fSAlex Elder rbd_req_sync_unwatch(rbd_dev); 357159c2be1eSYehuda Sadeh 3572602adf40SYehuda Sadeh 3573602adf40SYehuda Sadeh /* clean up and free blkdev */ 3574602adf40SYehuda Sadeh rbd_free_disk(rbd_dev); 3575602adf40SYehuda Sadeh unregister_blkdev(rbd_dev->major, rbd_dev->name); 357632eec68dSAlex Elder 35772ac4e75dSAlex Elder /* release allocated disk header fields */ 35782ac4e75dSAlex Elder rbd_header_free(&rbd_dev->header); 35792ac4e75dSAlex Elder 358032eec68dSAlex Elder /* done with the id, and with the rbd_dev */ 3581e2839308SAlex Elder rbd_dev_id_put(rbd_dev); 3582c53d5893SAlex Elder rbd_assert(rbd_dev->rbd_client != NULL); 3583c53d5893SAlex Elder rbd_dev_destroy(rbd_dev); 3584602adf40SYehuda Sadeh 3585602adf40SYehuda Sadeh /* release module ref */ 3586602adf40SYehuda Sadeh module_put(THIS_MODULE); 3587602adf40SYehuda Sadeh } 3588602adf40SYehuda Sadeh 3589dfc5606dSYehuda Sadeh static ssize_t rbd_remove(struct bus_type *bus, 3590602adf40SYehuda Sadeh const char *buf, 3591602adf40SYehuda Sadeh size_t count) 3592602adf40SYehuda Sadeh { 3593602adf40SYehuda Sadeh struct rbd_device *rbd_dev = NULL; 3594602adf40SYehuda Sadeh int target_id, rc; 3595602adf40SYehuda Sadeh unsigned long ul; 3596602adf40SYehuda Sadeh int ret = count; 3597602adf40SYehuda Sadeh 3598602adf40SYehuda Sadeh rc = strict_strtoul(buf, 10, &ul); 3599602adf40SYehuda Sadeh if (rc) 3600602adf40SYehuda Sadeh return rc; 3601602adf40SYehuda Sadeh 3602602adf40SYehuda Sadeh /* convert to int; abort if we lost anything in the conversion */ 3603602adf40SYehuda Sadeh target_id = (int) ul; 3604602adf40SYehuda Sadeh if (target_id != ul) 3605602adf40SYehuda Sadeh return -EINVAL; 3606602adf40SYehuda Sadeh 3607602adf40SYehuda Sadeh mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); 3608602adf40SYehuda Sadeh 3609602adf40SYehuda Sadeh rbd_dev = __rbd_get_dev(target_id); 3610602adf40SYehuda Sadeh if (!rbd_dev) { 3611602adf40SYehuda Sadeh ret = -ENOENT; 3612602adf40SYehuda Sadeh goto done; 3613602adf40SYehuda Sadeh } 3614602adf40SYehuda Sadeh 361541f38c2bSAlex Elder rbd_remove_all_snaps(rbd_dev); 3616dfc5606dSYehuda Sadeh rbd_bus_del_dev(rbd_dev); 3617602adf40SYehuda Sadeh 3618602adf40SYehuda Sadeh done: 3619602adf40SYehuda Sadeh mutex_unlock(&ctl_mutex); 3620aafb230eSAlex Elder 3621602adf40SYehuda Sadeh return ret; 3622602adf40SYehuda Sadeh } 3623602adf40SYehuda Sadeh 3624602adf40SYehuda Sadeh /* 3625602adf40SYehuda Sadeh * create control files in sysfs 3626dfc5606dSYehuda Sadeh * /sys/bus/rbd/... 3627602adf40SYehuda Sadeh */ 3628602adf40SYehuda Sadeh static int rbd_sysfs_init(void) 3629602adf40SYehuda Sadeh { 3630dfc5606dSYehuda Sadeh int ret; 3631602adf40SYehuda Sadeh 3632fed4c143SAlex Elder ret = device_register(&rbd_root_dev); 3633dfc5606dSYehuda Sadeh if (ret < 0) 3634dfc5606dSYehuda Sadeh return ret; 3635602adf40SYehuda Sadeh 3636fed4c143SAlex Elder ret = bus_register(&rbd_bus_type); 3637fed4c143SAlex Elder if (ret < 0) 3638fed4c143SAlex Elder device_unregister(&rbd_root_dev); 3639602adf40SYehuda Sadeh 3640602adf40SYehuda Sadeh return ret; 3641602adf40SYehuda Sadeh } 3642602adf40SYehuda Sadeh 3643602adf40SYehuda Sadeh static void rbd_sysfs_cleanup(void) 3644602adf40SYehuda Sadeh { 3645dfc5606dSYehuda Sadeh bus_unregister(&rbd_bus_type); 3646fed4c143SAlex Elder device_unregister(&rbd_root_dev); 3647602adf40SYehuda Sadeh } 3648602adf40SYehuda Sadeh 3649602adf40SYehuda Sadeh int __init rbd_init(void) 3650602adf40SYehuda Sadeh { 3651602adf40SYehuda Sadeh int rc; 3652602adf40SYehuda Sadeh 3653602adf40SYehuda Sadeh rc = rbd_sysfs_init(); 3654602adf40SYehuda Sadeh if (rc) 3655602adf40SYehuda Sadeh return rc; 3656f0f8cef5SAlex Elder pr_info("loaded " RBD_DRV_NAME_LONG "\n"); 3657602adf40SYehuda Sadeh return 0; 3658602adf40SYehuda Sadeh } 3659602adf40SYehuda Sadeh 3660602adf40SYehuda Sadeh void __exit rbd_exit(void) 3661602adf40SYehuda Sadeh { 3662602adf40SYehuda Sadeh rbd_sysfs_cleanup(); 3663602adf40SYehuda Sadeh } 3664602adf40SYehuda Sadeh 3665602adf40SYehuda Sadeh module_init(rbd_init); 3666602adf40SYehuda Sadeh module_exit(rbd_exit); 3667602adf40SYehuda Sadeh 3668602adf40SYehuda Sadeh MODULE_AUTHOR("Sage Weil <sage@newdream.net>"); 3669602adf40SYehuda Sadeh MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>"); 3670602adf40SYehuda Sadeh MODULE_DESCRIPTION("rados block device"); 3671602adf40SYehuda Sadeh 3672602adf40SYehuda Sadeh /* following authorship retained from original osdblk.c */ 3673602adf40SYehuda Sadeh MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>"); 3674602adf40SYehuda Sadeh 3675602adf40SYehuda Sadeh MODULE_LICENSE("GPL"); 3676