1602adf40SYehuda Sadeh /* 2602adf40SYehuda Sadeh rbd.c -- Export ceph rados objects as a Linux block device 3602adf40SYehuda Sadeh 4602adf40SYehuda Sadeh 5602adf40SYehuda Sadeh based on drivers/block/osdblk.c: 6602adf40SYehuda Sadeh 7602adf40SYehuda Sadeh Copyright 2009 Red Hat, Inc. 8602adf40SYehuda Sadeh 9602adf40SYehuda Sadeh This program is free software; you can redistribute it and/or modify 10602adf40SYehuda Sadeh it under the terms of the GNU General Public License as published by 11602adf40SYehuda Sadeh the Free Software Foundation. 12602adf40SYehuda Sadeh 13602adf40SYehuda Sadeh This program is distributed in the hope that it will be useful, 14602adf40SYehuda Sadeh but WITHOUT ANY WARRANTY; without even the implied warranty of 15602adf40SYehuda Sadeh MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16602adf40SYehuda Sadeh GNU General Public License for more details. 17602adf40SYehuda Sadeh 18602adf40SYehuda Sadeh You should have received a copy of the GNU General Public License 19602adf40SYehuda Sadeh along with this program; see the file COPYING. If not, write to 20602adf40SYehuda Sadeh the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. 21602adf40SYehuda Sadeh 22602adf40SYehuda Sadeh 23602adf40SYehuda Sadeh 24dfc5606dSYehuda Sadeh For usage instructions, please refer to: 25602adf40SYehuda Sadeh 26dfc5606dSYehuda Sadeh Documentation/ABI/testing/sysfs-bus-rbd 27602adf40SYehuda Sadeh 28602adf40SYehuda Sadeh */ 29602adf40SYehuda Sadeh 30602adf40SYehuda Sadeh #include <linux/ceph/libceph.h> 31602adf40SYehuda Sadeh #include <linux/ceph/osd_client.h> 32602adf40SYehuda Sadeh #include <linux/ceph/mon_client.h> 33602adf40SYehuda Sadeh #include <linux/ceph/decode.h> 3459c2be1eSYehuda Sadeh #include <linux/parser.h> 35602adf40SYehuda Sadeh 36602adf40SYehuda Sadeh #include <linux/kernel.h> 37602adf40SYehuda Sadeh #include <linux/device.h> 38602adf40SYehuda Sadeh #include <linux/module.h> 39602adf40SYehuda Sadeh #include <linux/fs.h> 40602adf40SYehuda Sadeh #include <linux/blkdev.h> 41602adf40SYehuda Sadeh 42602adf40SYehuda Sadeh #include "rbd_types.h" 43602adf40SYehuda Sadeh 44aafb230eSAlex Elder #define RBD_DEBUG /* Activate rbd_assert() calls */ 45aafb230eSAlex Elder 46593a9e7bSAlex Elder /* 47593a9e7bSAlex Elder * The basic unit of block I/O is a sector. It is interpreted in a 48593a9e7bSAlex Elder * number of contexts in Linux (blk, bio, genhd), but the default is 49593a9e7bSAlex Elder * universally 512 bytes. These symbols are just slightly more 50593a9e7bSAlex Elder * meaningful than the bare numbers they represent. 51593a9e7bSAlex Elder */ 52593a9e7bSAlex Elder #define SECTOR_SHIFT 9 53593a9e7bSAlex Elder #define SECTOR_SIZE (1ULL << SECTOR_SHIFT) 54593a9e7bSAlex Elder 55df111be6SAlex Elder /* It might be useful to have this defined elsewhere too */ 56df111be6SAlex Elder 57df111be6SAlex Elder #define U64_MAX ((u64) (~0ULL)) 58df111be6SAlex Elder 59f0f8cef5SAlex Elder #define RBD_DRV_NAME "rbd" 60f0f8cef5SAlex Elder #define RBD_DRV_NAME_LONG "rbd (rados block device)" 61602adf40SYehuda Sadeh 62602adf40SYehuda Sadeh #define RBD_MINORS_PER_MAJOR 256 /* max minors per blkdev */ 63602adf40SYehuda Sadeh 64602adf40SYehuda Sadeh #define RBD_MAX_SNAP_NAME_LEN 32 65602adf40SYehuda Sadeh #define RBD_MAX_OPT_LEN 1024 66602adf40SYehuda Sadeh 67602adf40SYehuda Sadeh #define RBD_SNAP_HEAD_NAME "-" 68602adf40SYehuda Sadeh 69589d30e0SAlex Elder #define RBD_IMAGE_ID_LEN_MAX 64 701e130199SAlex Elder #define RBD_OBJ_PREFIX_LEN_MAX 64 71589d30e0SAlex Elder 7281a89793SAlex Elder /* 7381a89793SAlex Elder * An RBD device name will be "rbd#", where the "rbd" comes from 7481a89793SAlex Elder * RBD_DRV_NAME above, and # is a unique integer identifier. 7581a89793SAlex Elder * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big 7681a89793SAlex Elder * enough to hold all possible device names. 7781a89793SAlex Elder */ 78602adf40SYehuda Sadeh #define DEV_NAME_LEN 32 7981a89793SAlex Elder #define MAX_INT_FORMAT_WIDTH ((5 * sizeof (int)) / 2 + 1) 80602adf40SYehuda Sadeh 81cc0538b6SAlex Elder #define RBD_READ_ONLY_DEFAULT false 8259c2be1eSYehuda Sadeh 83602adf40SYehuda Sadeh /* 84602adf40SYehuda Sadeh * block device image metadata (in-memory version) 85602adf40SYehuda Sadeh */ 86602adf40SYehuda Sadeh struct rbd_image_header { 87f84344f3SAlex Elder /* These four fields never change for a given rbd image */ 88849b4260SAlex Elder char *object_prefix; 8934b13184SAlex Elder u64 features; 90602adf40SYehuda Sadeh __u8 obj_order; 91602adf40SYehuda Sadeh __u8 crypt_type; 92602adf40SYehuda Sadeh __u8 comp_type; 93602adf40SYehuda Sadeh 94f84344f3SAlex Elder /* The remaining fields need to be updated occasionally */ 95f84344f3SAlex Elder u64 image_size; 96f84344f3SAlex Elder struct ceph_snap_context *snapc; 97602adf40SYehuda Sadeh char *snap_names; 98602adf40SYehuda Sadeh u64 *snap_sizes; 9959c2be1eSYehuda Sadeh 10059c2be1eSYehuda Sadeh u64 obj_version; 10159c2be1eSYehuda Sadeh }; 10259c2be1eSYehuda Sadeh 10359c2be1eSYehuda Sadeh struct rbd_options { 104cc0538b6SAlex Elder bool read_only; 105602adf40SYehuda Sadeh }; 106602adf40SYehuda Sadeh 107602adf40SYehuda Sadeh /* 108f0f8cef5SAlex Elder * an instance of the client. multiple devices may share an rbd client. 109602adf40SYehuda Sadeh */ 110602adf40SYehuda Sadeh struct rbd_client { 111602adf40SYehuda Sadeh struct ceph_client *client; 112602adf40SYehuda Sadeh struct kref kref; 113602adf40SYehuda Sadeh struct list_head node; 114602adf40SYehuda Sadeh }; 115602adf40SYehuda Sadeh 116602adf40SYehuda Sadeh /* 117f0f8cef5SAlex Elder * a request completion status 118602adf40SYehuda Sadeh */ 1191fec7093SYehuda Sadeh struct rbd_req_status { 1201fec7093SYehuda Sadeh int done; 1211fec7093SYehuda Sadeh int rc; 1221fec7093SYehuda Sadeh u64 bytes; 1231fec7093SYehuda Sadeh }; 1241fec7093SYehuda Sadeh 1251fec7093SYehuda Sadeh /* 1261fec7093SYehuda Sadeh * a collection of requests 1271fec7093SYehuda Sadeh */ 1281fec7093SYehuda Sadeh struct rbd_req_coll { 1291fec7093SYehuda Sadeh int total; 1301fec7093SYehuda Sadeh int num_done; 1311fec7093SYehuda Sadeh struct kref kref; 1321fec7093SYehuda Sadeh struct rbd_req_status status[0]; 133602adf40SYehuda Sadeh }; 134602adf40SYehuda Sadeh 135f0f8cef5SAlex Elder /* 136f0f8cef5SAlex Elder * a single io request 137f0f8cef5SAlex Elder */ 138f0f8cef5SAlex Elder struct rbd_request { 139f0f8cef5SAlex Elder struct request *rq; /* blk layer request */ 140f0f8cef5SAlex Elder struct bio *bio; /* cloned bio */ 141f0f8cef5SAlex Elder struct page **pages; /* list of used pages */ 142f0f8cef5SAlex Elder u64 len; 143f0f8cef5SAlex Elder int coll_index; 144f0f8cef5SAlex Elder struct rbd_req_coll *coll; 145f0f8cef5SAlex Elder }; 146f0f8cef5SAlex Elder 147dfc5606dSYehuda Sadeh struct rbd_snap { 148dfc5606dSYehuda Sadeh struct device dev; 149dfc5606dSYehuda Sadeh const char *name; 1503591538fSJosh Durgin u64 size; 151dfc5606dSYehuda Sadeh struct list_head node; 152dfc5606dSYehuda Sadeh u64 id; 15334b13184SAlex Elder u64 features; 154dfc5606dSYehuda Sadeh }; 155dfc5606dSYehuda Sadeh 156f84344f3SAlex Elder struct rbd_mapping { 157f84344f3SAlex Elder char *snap_name; 158f84344f3SAlex Elder u64 snap_id; 15999c1f08fSAlex Elder u64 size; 16034b13184SAlex Elder u64 features; 161f84344f3SAlex Elder bool snap_exists; 162f84344f3SAlex Elder bool read_only; 163f84344f3SAlex Elder }; 164f84344f3SAlex Elder 165602adf40SYehuda Sadeh /* 166602adf40SYehuda Sadeh * a single device 167602adf40SYehuda Sadeh */ 168602adf40SYehuda Sadeh struct rbd_device { 169de71a297SAlex Elder int dev_id; /* blkdev unique id */ 170602adf40SYehuda Sadeh 171602adf40SYehuda Sadeh int major; /* blkdev assigned major */ 172602adf40SYehuda Sadeh struct gendisk *disk; /* blkdev's gendisk and rq */ 173602adf40SYehuda Sadeh 174a30b71b9SAlex Elder u32 image_format; /* Either 1 or 2 */ 175f8c38929SAlex Elder struct rbd_options rbd_opts; 176602adf40SYehuda Sadeh struct rbd_client *rbd_client; 177602adf40SYehuda Sadeh 178602adf40SYehuda Sadeh char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */ 179602adf40SYehuda Sadeh 180602adf40SYehuda Sadeh spinlock_t lock; /* queue lock */ 181602adf40SYehuda Sadeh 182602adf40SYehuda Sadeh struct rbd_image_header header; 183589d30e0SAlex Elder char *image_id; 184589d30e0SAlex Elder size_t image_id_len; 1850bed54dcSAlex Elder char *image_name; 1860bed54dcSAlex Elder size_t image_name_len; 1870bed54dcSAlex Elder char *header_name; 188d22f76e7SAlex Elder char *pool_name; 1899bb2f334SAlex Elder int pool_id; 190602adf40SYehuda Sadeh 19159c2be1eSYehuda Sadeh struct ceph_osd_event *watch_event; 19259c2be1eSYehuda Sadeh struct ceph_osd_request *watch_request; 19359c2be1eSYehuda Sadeh 194c666601aSJosh Durgin /* protects updating the header */ 195c666601aSJosh Durgin struct rw_semaphore header_rwsem; 196f84344f3SAlex Elder 197f84344f3SAlex Elder struct rbd_mapping mapping; 198602adf40SYehuda Sadeh 199602adf40SYehuda Sadeh struct list_head node; 200dfc5606dSYehuda Sadeh 201dfc5606dSYehuda Sadeh /* list of snapshots */ 202dfc5606dSYehuda Sadeh struct list_head snaps; 203dfc5606dSYehuda Sadeh 204dfc5606dSYehuda Sadeh /* sysfs related */ 205dfc5606dSYehuda Sadeh struct device dev; 206dfc5606dSYehuda Sadeh }; 207dfc5606dSYehuda Sadeh 208602adf40SYehuda Sadeh static DEFINE_MUTEX(ctl_mutex); /* Serialize open/close/setup/teardown */ 209e124a82fSAlex Elder 210602adf40SYehuda Sadeh static LIST_HEAD(rbd_dev_list); /* devices */ 211e124a82fSAlex Elder static DEFINE_SPINLOCK(rbd_dev_list_lock); 212e124a82fSAlex Elder 213602adf40SYehuda Sadeh static LIST_HEAD(rbd_client_list); /* clients */ 214432b8587SAlex Elder static DEFINE_SPINLOCK(rbd_client_list_lock); 215602adf40SYehuda Sadeh 216304f6808SAlex Elder static int rbd_dev_snaps_update(struct rbd_device *rbd_dev); 217304f6808SAlex Elder static int rbd_dev_snaps_register(struct rbd_device *rbd_dev); 218304f6808SAlex Elder 219dfc5606dSYehuda Sadeh static void rbd_dev_release(struct device *dev); 22014e7085dSAlex Elder static void __rbd_remove_snap_dev(struct rbd_snap *snap); 221dfc5606dSYehuda Sadeh 222f0f8cef5SAlex Elder static ssize_t rbd_add(struct bus_type *bus, const char *buf, 223f0f8cef5SAlex Elder size_t count); 224f0f8cef5SAlex Elder static ssize_t rbd_remove(struct bus_type *bus, const char *buf, 225f0f8cef5SAlex Elder size_t count); 226f0f8cef5SAlex Elder 227f0f8cef5SAlex Elder static struct bus_attribute rbd_bus_attrs[] = { 228f0f8cef5SAlex Elder __ATTR(add, S_IWUSR, NULL, rbd_add), 229f0f8cef5SAlex Elder __ATTR(remove, S_IWUSR, NULL, rbd_remove), 230f0f8cef5SAlex Elder __ATTR_NULL 231f0f8cef5SAlex Elder }; 232f0f8cef5SAlex Elder 233f0f8cef5SAlex Elder static struct bus_type rbd_bus_type = { 234f0f8cef5SAlex Elder .name = "rbd", 235f0f8cef5SAlex Elder .bus_attrs = rbd_bus_attrs, 236f0f8cef5SAlex Elder }; 237f0f8cef5SAlex Elder 238f0f8cef5SAlex Elder static void rbd_root_dev_release(struct device *dev) 239f0f8cef5SAlex Elder { 240f0f8cef5SAlex Elder } 241f0f8cef5SAlex Elder 242f0f8cef5SAlex Elder static struct device rbd_root_dev = { 243f0f8cef5SAlex Elder .init_name = "rbd", 244f0f8cef5SAlex Elder .release = rbd_root_dev_release, 245f0f8cef5SAlex Elder }; 246f0f8cef5SAlex Elder 247aafb230eSAlex Elder #ifdef RBD_DEBUG 248aafb230eSAlex Elder #define rbd_assert(expr) \ 249aafb230eSAlex Elder if (unlikely(!(expr))) { \ 250aafb230eSAlex Elder printk(KERN_ERR "\nAssertion failure in %s() " \ 251aafb230eSAlex Elder "at line %d:\n\n" \ 252aafb230eSAlex Elder "\trbd_assert(%s);\n\n", \ 253aafb230eSAlex Elder __func__, __LINE__, #expr); \ 254aafb230eSAlex Elder BUG(); \ 255aafb230eSAlex Elder } 256aafb230eSAlex Elder #else /* !RBD_DEBUG */ 257aafb230eSAlex Elder # define rbd_assert(expr) ((void) 0) 258aafb230eSAlex Elder #endif /* !RBD_DEBUG */ 259dfc5606dSYehuda Sadeh 260dfc5606dSYehuda Sadeh static struct device *rbd_get_dev(struct rbd_device *rbd_dev) 261dfc5606dSYehuda Sadeh { 262dfc5606dSYehuda Sadeh return get_device(&rbd_dev->dev); 263dfc5606dSYehuda Sadeh } 264dfc5606dSYehuda Sadeh 265dfc5606dSYehuda Sadeh static void rbd_put_dev(struct rbd_device *rbd_dev) 266dfc5606dSYehuda Sadeh { 267dfc5606dSYehuda Sadeh put_device(&rbd_dev->dev); 268dfc5606dSYehuda Sadeh } 269602adf40SYehuda Sadeh 2701fe5e993SAlex Elder static int rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver); 27159c2be1eSYehuda Sadeh 272602adf40SYehuda Sadeh static int rbd_open(struct block_device *bdev, fmode_t mode) 273602adf40SYehuda Sadeh { 274f0f8cef5SAlex Elder struct rbd_device *rbd_dev = bdev->bd_disk->private_data; 275602adf40SYehuda Sadeh 276f84344f3SAlex Elder if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only) 277602adf40SYehuda Sadeh return -EROFS; 278602adf40SYehuda Sadeh 279340c7a2bSAlex Elder rbd_get_dev(rbd_dev); 280f84344f3SAlex Elder set_device_ro(bdev, rbd_dev->mapping.read_only); 281340c7a2bSAlex Elder 282602adf40SYehuda Sadeh return 0; 283602adf40SYehuda Sadeh } 284602adf40SYehuda Sadeh 285dfc5606dSYehuda Sadeh static int rbd_release(struct gendisk *disk, fmode_t mode) 286dfc5606dSYehuda Sadeh { 287dfc5606dSYehuda Sadeh struct rbd_device *rbd_dev = disk->private_data; 288dfc5606dSYehuda Sadeh 289dfc5606dSYehuda Sadeh rbd_put_dev(rbd_dev); 290dfc5606dSYehuda Sadeh 291dfc5606dSYehuda Sadeh return 0; 292dfc5606dSYehuda Sadeh } 293dfc5606dSYehuda Sadeh 294602adf40SYehuda Sadeh static const struct block_device_operations rbd_bd_ops = { 295602adf40SYehuda Sadeh .owner = THIS_MODULE, 296602adf40SYehuda Sadeh .open = rbd_open, 297dfc5606dSYehuda Sadeh .release = rbd_release, 298602adf40SYehuda Sadeh }; 299602adf40SYehuda Sadeh 300602adf40SYehuda Sadeh /* 301602adf40SYehuda Sadeh * Initialize an rbd client instance. 30243ae4701SAlex Elder * We own *ceph_opts. 303602adf40SYehuda Sadeh */ 304f8c38929SAlex Elder static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts) 305602adf40SYehuda Sadeh { 306602adf40SYehuda Sadeh struct rbd_client *rbdc; 307602adf40SYehuda Sadeh int ret = -ENOMEM; 308602adf40SYehuda Sadeh 309602adf40SYehuda Sadeh dout("rbd_client_create\n"); 310602adf40SYehuda Sadeh rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL); 311602adf40SYehuda Sadeh if (!rbdc) 312602adf40SYehuda Sadeh goto out_opt; 313602adf40SYehuda Sadeh 314602adf40SYehuda Sadeh kref_init(&rbdc->kref); 315602adf40SYehuda Sadeh INIT_LIST_HEAD(&rbdc->node); 316602adf40SYehuda Sadeh 317bc534d86SAlex Elder mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); 318bc534d86SAlex Elder 31943ae4701SAlex Elder rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0); 320602adf40SYehuda Sadeh if (IS_ERR(rbdc->client)) 321bc534d86SAlex Elder goto out_mutex; 32243ae4701SAlex Elder ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */ 323602adf40SYehuda Sadeh 324602adf40SYehuda Sadeh ret = ceph_open_session(rbdc->client); 325602adf40SYehuda Sadeh if (ret < 0) 326602adf40SYehuda Sadeh goto out_err; 327602adf40SYehuda Sadeh 328432b8587SAlex Elder spin_lock(&rbd_client_list_lock); 329602adf40SYehuda Sadeh list_add_tail(&rbdc->node, &rbd_client_list); 330432b8587SAlex Elder spin_unlock(&rbd_client_list_lock); 331602adf40SYehuda Sadeh 332bc534d86SAlex Elder mutex_unlock(&ctl_mutex); 333bc534d86SAlex Elder 334602adf40SYehuda Sadeh dout("rbd_client_create created %p\n", rbdc); 335602adf40SYehuda Sadeh return rbdc; 336602adf40SYehuda Sadeh 337602adf40SYehuda Sadeh out_err: 338602adf40SYehuda Sadeh ceph_destroy_client(rbdc->client); 339bc534d86SAlex Elder out_mutex: 340bc534d86SAlex Elder mutex_unlock(&ctl_mutex); 341602adf40SYehuda Sadeh kfree(rbdc); 342602adf40SYehuda Sadeh out_opt: 34343ae4701SAlex Elder if (ceph_opts) 34443ae4701SAlex Elder ceph_destroy_options(ceph_opts); 34528f259b7SVasiliy Kulikov return ERR_PTR(ret); 346602adf40SYehuda Sadeh } 347602adf40SYehuda Sadeh 348602adf40SYehuda Sadeh /* 3491f7ba331SAlex Elder * Find a ceph client with specific addr and configuration. If 3501f7ba331SAlex Elder * found, bump its reference count. 351602adf40SYehuda Sadeh */ 3521f7ba331SAlex Elder static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts) 353602adf40SYehuda Sadeh { 354602adf40SYehuda Sadeh struct rbd_client *client_node; 3551f7ba331SAlex Elder bool found = false; 356602adf40SYehuda Sadeh 35743ae4701SAlex Elder if (ceph_opts->flags & CEPH_OPT_NOSHARE) 358602adf40SYehuda Sadeh return NULL; 359602adf40SYehuda Sadeh 3601f7ba331SAlex Elder spin_lock(&rbd_client_list_lock); 3611f7ba331SAlex Elder list_for_each_entry(client_node, &rbd_client_list, node) { 3621f7ba331SAlex Elder if (!ceph_compare_options(ceph_opts, client_node->client)) { 3631f7ba331SAlex Elder kref_get(&client_node->kref); 3641f7ba331SAlex Elder found = true; 3651f7ba331SAlex Elder break; 3661f7ba331SAlex Elder } 3671f7ba331SAlex Elder } 3681f7ba331SAlex Elder spin_unlock(&rbd_client_list_lock); 3691f7ba331SAlex Elder 3701f7ba331SAlex Elder return found ? client_node : NULL; 371602adf40SYehuda Sadeh } 372602adf40SYehuda Sadeh 373602adf40SYehuda Sadeh /* 37459c2be1eSYehuda Sadeh * mount options 37559c2be1eSYehuda Sadeh */ 37659c2be1eSYehuda Sadeh enum { 37759c2be1eSYehuda Sadeh Opt_last_int, 37859c2be1eSYehuda Sadeh /* int args above */ 37959c2be1eSYehuda Sadeh Opt_last_string, 38059c2be1eSYehuda Sadeh /* string args above */ 381cc0538b6SAlex Elder Opt_read_only, 382cc0538b6SAlex Elder Opt_read_write, 383cc0538b6SAlex Elder /* Boolean args above */ 384cc0538b6SAlex Elder Opt_last_bool, 38559c2be1eSYehuda Sadeh }; 38659c2be1eSYehuda Sadeh 38743ae4701SAlex Elder static match_table_t rbd_opts_tokens = { 38859c2be1eSYehuda Sadeh /* int args above */ 38959c2be1eSYehuda Sadeh /* string args above */ 390f84344f3SAlex Elder {Opt_read_only, "mapping.read_only"}, 391cc0538b6SAlex Elder {Opt_read_only, "ro"}, /* Alternate spelling */ 392cc0538b6SAlex Elder {Opt_read_write, "read_write"}, 393cc0538b6SAlex Elder {Opt_read_write, "rw"}, /* Alternate spelling */ 394cc0538b6SAlex Elder /* Boolean args above */ 39559c2be1eSYehuda Sadeh {-1, NULL} 39659c2be1eSYehuda Sadeh }; 39759c2be1eSYehuda Sadeh 39859c2be1eSYehuda Sadeh static int parse_rbd_opts_token(char *c, void *private) 39959c2be1eSYehuda Sadeh { 40043ae4701SAlex Elder struct rbd_options *rbd_opts = private; 40159c2be1eSYehuda Sadeh substring_t argstr[MAX_OPT_ARGS]; 40259c2be1eSYehuda Sadeh int token, intval, ret; 40359c2be1eSYehuda Sadeh 40443ae4701SAlex Elder token = match_token(c, rbd_opts_tokens, argstr); 40559c2be1eSYehuda Sadeh if (token < 0) 40659c2be1eSYehuda Sadeh return -EINVAL; 40759c2be1eSYehuda Sadeh 40859c2be1eSYehuda Sadeh if (token < Opt_last_int) { 40959c2be1eSYehuda Sadeh ret = match_int(&argstr[0], &intval); 41059c2be1eSYehuda Sadeh if (ret < 0) { 41159c2be1eSYehuda Sadeh pr_err("bad mount option arg (not int) " 41259c2be1eSYehuda Sadeh "at '%s'\n", c); 41359c2be1eSYehuda Sadeh return ret; 41459c2be1eSYehuda Sadeh } 41559c2be1eSYehuda Sadeh dout("got int token %d val %d\n", token, intval); 41659c2be1eSYehuda Sadeh } else if (token > Opt_last_int && token < Opt_last_string) { 41759c2be1eSYehuda Sadeh dout("got string token %d val %s\n", token, 41859c2be1eSYehuda Sadeh argstr[0].from); 419cc0538b6SAlex Elder } else if (token > Opt_last_string && token < Opt_last_bool) { 420cc0538b6SAlex Elder dout("got Boolean token %d\n", token); 42159c2be1eSYehuda Sadeh } else { 42259c2be1eSYehuda Sadeh dout("got token %d\n", token); 42359c2be1eSYehuda Sadeh } 42459c2be1eSYehuda Sadeh 42559c2be1eSYehuda Sadeh switch (token) { 426cc0538b6SAlex Elder case Opt_read_only: 427cc0538b6SAlex Elder rbd_opts->read_only = true; 428cc0538b6SAlex Elder break; 429cc0538b6SAlex Elder case Opt_read_write: 430cc0538b6SAlex Elder rbd_opts->read_only = false; 431cc0538b6SAlex Elder break; 43259c2be1eSYehuda Sadeh default: 433aafb230eSAlex Elder rbd_assert(false); 434aafb230eSAlex Elder break; 43559c2be1eSYehuda Sadeh } 43659c2be1eSYehuda Sadeh return 0; 43759c2be1eSYehuda Sadeh } 43859c2be1eSYehuda Sadeh 43959c2be1eSYehuda Sadeh /* 440602adf40SYehuda Sadeh * Get a ceph client with specific addr and configuration, if one does 441602adf40SYehuda Sadeh * not exist create it. 442602adf40SYehuda Sadeh */ 443f8c38929SAlex Elder static int rbd_get_client(struct rbd_device *rbd_dev, const char *mon_addr, 444f8c38929SAlex Elder size_t mon_addr_len, char *options) 445602adf40SYehuda Sadeh { 446f8c38929SAlex Elder struct rbd_options *rbd_opts = &rbd_dev->rbd_opts; 44743ae4701SAlex Elder struct ceph_options *ceph_opts; 448f8c38929SAlex Elder struct rbd_client *rbdc; 44959c2be1eSYehuda Sadeh 450cc0538b6SAlex Elder rbd_opts->read_only = RBD_READ_ONLY_DEFAULT; 451602adf40SYehuda Sadeh 45243ae4701SAlex Elder ceph_opts = ceph_parse_options(options, mon_addr, 4535214ecc4SAlex Elder mon_addr + mon_addr_len, 45421079786SAlex Elder parse_rbd_opts_token, rbd_opts); 455f8c38929SAlex Elder if (IS_ERR(ceph_opts)) 456f8c38929SAlex Elder return PTR_ERR(ceph_opts); 457602adf40SYehuda Sadeh 4581f7ba331SAlex Elder rbdc = rbd_client_find(ceph_opts); 459602adf40SYehuda Sadeh if (rbdc) { 460e6994d3dSAlex Elder /* using an existing client */ 46143ae4701SAlex Elder ceph_destroy_options(ceph_opts); 462f8c38929SAlex Elder } else { 463f8c38929SAlex Elder rbdc = rbd_client_create(ceph_opts); 464d720bcb0SAlex Elder if (IS_ERR(rbdc)) 465f8c38929SAlex Elder return PTR_ERR(rbdc); 466f8c38929SAlex Elder } 467f8c38929SAlex Elder rbd_dev->rbd_client = rbdc; 468d720bcb0SAlex Elder 469f8c38929SAlex Elder return 0; 470602adf40SYehuda Sadeh } 471602adf40SYehuda Sadeh 472602adf40SYehuda Sadeh /* 473602adf40SYehuda Sadeh * Destroy ceph client 474d23a4b3fSAlex Elder * 475432b8587SAlex Elder * Caller must hold rbd_client_list_lock. 476602adf40SYehuda Sadeh */ 477602adf40SYehuda Sadeh static void rbd_client_release(struct kref *kref) 478602adf40SYehuda Sadeh { 479602adf40SYehuda Sadeh struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref); 480602adf40SYehuda Sadeh 481602adf40SYehuda Sadeh dout("rbd_release_client %p\n", rbdc); 482cd9d9f5dSAlex Elder spin_lock(&rbd_client_list_lock); 483602adf40SYehuda Sadeh list_del(&rbdc->node); 484cd9d9f5dSAlex Elder spin_unlock(&rbd_client_list_lock); 485602adf40SYehuda Sadeh 486602adf40SYehuda Sadeh ceph_destroy_client(rbdc->client); 487602adf40SYehuda Sadeh kfree(rbdc); 488602adf40SYehuda Sadeh } 489602adf40SYehuda Sadeh 490602adf40SYehuda Sadeh /* 491602adf40SYehuda Sadeh * Drop reference to ceph client node. If it's not referenced anymore, release 492602adf40SYehuda Sadeh * it. 493602adf40SYehuda Sadeh */ 494602adf40SYehuda Sadeh static void rbd_put_client(struct rbd_device *rbd_dev) 495602adf40SYehuda Sadeh { 496602adf40SYehuda Sadeh kref_put(&rbd_dev->rbd_client->kref, rbd_client_release); 497602adf40SYehuda Sadeh rbd_dev->rbd_client = NULL; 498602adf40SYehuda Sadeh } 499602adf40SYehuda Sadeh 5001fec7093SYehuda Sadeh /* 5011fec7093SYehuda Sadeh * Destroy requests collection 5021fec7093SYehuda Sadeh */ 5031fec7093SYehuda Sadeh static void rbd_coll_release(struct kref *kref) 5041fec7093SYehuda Sadeh { 5051fec7093SYehuda Sadeh struct rbd_req_coll *coll = 5061fec7093SYehuda Sadeh container_of(kref, struct rbd_req_coll, kref); 5071fec7093SYehuda Sadeh 5081fec7093SYehuda Sadeh dout("rbd_coll_release %p\n", coll); 5091fec7093SYehuda Sadeh kfree(coll); 5101fec7093SYehuda Sadeh } 511602adf40SYehuda Sadeh 512a30b71b9SAlex Elder static bool rbd_image_format_valid(u32 image_format) 513a30b71b9SAlex Elder { 514a30b71b9SAlex Elder return image_format == 1 || image_format == 2; 515a30b71b9SAlex Elder } 516a30b71b9SAlex Elder 5178e94af8eSAlex Elder static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk) 5188e94af8eSAlex Elder { 519103a150fSAlex Elder size_t size; 520103a150fSAlex Elder u32 snap_count; 521103a150fSAlex Elder 522103a150fSAlex Elder /* The header has to start with the magic rbd header text */ 523103a150fSAlex Elder if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT))) 524103a150fSAlex Elder return false; 525103a150fSAlex Elder 526103a150fSAlex Elder /* 527103a150fSAlex Elder * The size of a snapshot header has to fit in a size_t, and 528103a150fSAlex Elder * that limits the number of snapshots. 529103a150fSAlex Elder */ 530103a150fSAlex Elder snap_count = le32_to_cpu(ondisk->snap_count); 531103a150fSAlex Elder size = SIZE_MAX - sizeof (struct ceph_snap_context); 532103a150fSAlex Elder if (snap_count > size / sizeof (__le64)) 533103a150fSAlex Elder return false; 534103a150fSAlex Elder 535103a150fSAlex Elder /* 536103a150fSAlex Elder * Not only that, but the size of the entire the snapshot 537103a150fSAlex Elder * header must also be representable in a size_t. 538103a150fSAlex Elder */ 539103a150fSAlex Elder size -= snap_count * sizeof (__le64); 540103a150fSAlex Elder if ((u64) size < le64_to_cpu(ondisk->snap_names_len)) 541103a150fSAlex Elder return false; 542103a150fSAlex Elder 543103a150fSAlex Elder return true; 5448e94af8eSAlex Elder } 5458e94af8eSAlex Elder 546602adf40SYehuda Sadeh /* 547602adf40SYehuda Sadeh * Create a new header structure, translate header format from the on-disk 548602adf40SYehuda Sadeh * header. 549602adf40SYehuda Sadeh */ 550602adf40SYehuda Sadeh static int rbd_header_from_disk(struct rbd_image_header *header, 5514156d998SAlex Elder struct rbd_image_header_ondisk *ondisk) 552602adf40SYehuda Sadeh { 553ccece235SAlex Elder u32 snap_count; 55458c17b0eSAlex Elder size_t len; 555d2bb24e5SAlex Elder size_t size; 556621901d6SAlex Elder u32 i; 557602adf40SYehuda Sadeh 5586a52325fSAlex Elder memset(header, 0, sizeof (*header)); 5596a52325fSAlex Elder 560103a150fSAlex Elder snap_count = le32_to_cpu(ondisk->snap_count); 561103a150fSAlex Elder 56258c17b0eSAlex Elder len = strnlen(ondisk->object_prefix, sizeof (ondisk->object_prefix)); 56358c17b0eSAlex Elder header->object_prefix = kmalloc(len + 1, GFP_KERNEL); 5646a52325fSAlex Elder if (!header->object_prefix) 565602adf40SYehuda Sadeh return -ENOMEM; 56658c17b0eSAlex Elder memcpy(header->object_prefix, ondisk->object_prefix, len); 56758c17b0eSAlex Elder header->object_prefix[len] = '\0'; 56800f1f36fSAlex Elder 569602adf40SYehuda Sadeh if (snap_count) { 570f785cc1dSAlex Elder u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len); 571f785cc1dSAlex Elder 572621901d6SAlex Elder /* Save a copy of the snapshot names */ 573621901d6SAlex Elder 574f785cc1dSAlex Elder if (snap_names_len > (u64) SIZE_MAX) 575f785cc1dSAlex Elder return -EIO; 576f785cc1dSAlex Elder header->snap_names = kmalloc(snap_names_len, GFP_KERNEL); 577602adf40SYehuda Sadeh if (!header->snap_names) 5786a52325fSAlex Elder goto out_err; 579f785cc1dSAlex Elder /* 580f785cc1dSAlex Elder * Note that rbd_dev_v1_header_read() guarantees 581f785cc1dSAlex Elder * the ondisk buffer we're working with has 582f785cc1dSAlex Elder * snap_names_len bytes beyond the end of the 583f785cc1dSAlex Elder * snapshot id array, this memcpy() is safe. 584f785cc1dSAlex Elder */ 585f785cc1dSAlex Elder memcpy(header->snap_names, &ondisk->snaps[snap_count], 586f785cc1dSAlex Elder snap_names_len); 5876a52325fSAlex Elder 588621901d6SAlex Elder /* Record each snapshot's size */ 589621901d6SAlex Elder 590d2bb24e5SAlex Elder size = snap_count * sizeof (*header->snap_sizes); 591d2bb24e5SAlex Elder header->snap_sizes = kmalloc(size, GFP_KERNEL); 592602adf40SYehuda Sadeh if (!header->snap_sizes) 5936a52325fSAlex Elder goto out_err; 594621901d6SAlex Elder for (i = 0; i < snap_count; i++) 595621901d6SAlex Elder header->snap_sizes[i] = 596621901d6SAlex Elder le64_to_cpu(ondisk->snaps[i].image_size); 597602adf40SYehuda Sadeh } else { 598ccece235SAlex Elder WARN_ON(ondisk->snap_names_len); 599602adf40SYehuda Sadeh header->snap_names = NULL; 600602adf40SYehuda Sadeh header->snap_sizes = NULL; 601602adf40SYehuda Sadeh } 602849b4260SAlex Elder 60334b13184SAlex Elder header->features = 0; /* No features support in v1 images */ 604602adf40SYehuda Sadeh header->obj_order = ondisk->options.order; 605602adf40SYehuda Sadeh header->crypt_type = ondisk->options.crypt_type; 606602adf40SYehuda Sadeh header->comp_type = ondisk->options.comp_type; 6076a52325fSAlex Elder 608621901d6SAlex Elder /* Allocate and fill in the snapshot context */ 609621901d6SAlex Elder 610f84344f3SAlex Elder header->image_size = le64_to_cpu(ondisk->image_size); 6116a52325fSAlex Elder size = sizeof (struct ceph_snap_context); 6126a52325fSAlex Elder size += snap_count * sizeof (header->snapc->snaps[0]); 6136a52325fSAlex Elder header->snapc = kzalloc(size, GFP_KERNEL); 6146a52325fSAlex Elder if (!header->snapc) 6156a52325fSAlex Elder goto out_err; 616602adf40SYehuda Sadeh 617602adf40SYehuda Sadeh atomic_set(&header->snapc->nref, 1); 618505cbb9bSAlex Elder header->snapc->seq = le64_to_cpu(ondisk->snap_seq); 619602adf40SYehuda Sadeh header->snapc->num_snaps = snap_count; 620621901d6SAlex Elder for (i = 0; i < snap_count; i++) 621602adf40SYehuda Sadeh header->snapc->snaps[i] = 622602adf40SYehuda Sadeh le64_to_cpu(ondisk->snaps[i].id); 623602adf40SYehuda Sadeh 624602adf40SYehuda Sadeh return 0; 625602adf40SYehuda Sadeh 6266a52325fSAlex Elder out_err: 627849b4260SAlex Elder kfree(header->snap_sizes); 628ccece235SAlex Elder header->snap_sizes = NULL; 629602adf40SYehuda Sadeh kfree(header->snap_names); 630ccece235SAlex Elder header->snap_names = NULL; 6316a52325fSAlex Elder kfree(header->object_prefix); 6326a52325fSAlex Elder header->object_prefix = NULL; 633ccece235SAlex Elder 63400f1f36fSAlex Elder return -ENOMEM; 635602adf40SYehuda Sadeh } 636602adf40SYehuda Sadeh 6378836b995SAlex Elder static int snap_by_name(struct rbd_device *rbd_dev, const char *snap_name) 638602adf40SYehuda Sadeh { 639602adf40SYehuda Sadeh 640e86924a8SAlex Elder struct rbd_snap *snap; 64100f1f36fSAlex Elder 642e86924a8SAlex Elder list_for_each_entry(snap, &rbd_dev->snaps, node) { 643e86924a8SAlex Elder if (!strcmp(snap_name, snap->name)) { 644e86924a8SAlex Elder rbd_dev->mapping.snap_id = snap->id; 645e86924a8SAlex Elder rbd_dev->mapping.size = snap->size; 64634b13184SAlex Elder rbd_dev->mapping.features = snap->features; 64700f1f36fSAlex Elder 648e86924a8SAlex Elder return 0; 649602adf40SYehuda Sadeh } 65000f1f36fSAlex Elder } 651e86924a8SAlex Elder 65200f1f36fSAlex Elder return -ENOENT; 65300f1f36fSAlex Elder } 654602adf40SYehuda Sadeh 6555ed16177SAlex Elder static int rbd_dev_set_mapping(struct rbd_device *rbd_dev, char *snap_name) 656602adf40SYehuda Sadeh { 65778dc447dSAlex Elder int ret; 658602adf40SYehuda Sadeh 6594e1105a2SAlex Elder if (!memcmp(snap_name, RBD_SNAP_HEAD_NAME, 660cc9d734cSJosh Durgin sizeof (RBD_SNAP_HEAD_NAME))) { 661f84344f3SAlex Elder rbd_dev->mapping.snap_id = CEPH_NOSNAP; 66299c1f08fSAlex Elder rbd_dev->mapping.size = rbd_dev->header.image_size; 66334b13184SAlex Elder rbd_dev->mapping.features = rbd_dev->header.features; 664f84344f3SAlex Elder rbd_dev->mapping.snap_exists = false; 665f84344f3SAlex Elder rbd_dev->mapping.read_only = rbd_dev->rbd_opts.read_only; 666e86924a8SAlex Elder ret = 0; 667602adf40SYehuda Sadeh } else { 6688836b995SAlex Elder ret = snap_by_name(rbd_dev, snap_name); 669602adf40SYehuda Sadeh if (ret < 0) 670602adf40SYehuda Sadeh goto done; 671f84344f3SAlex Elder rbd_dev->mapping.snap_exists = true; 672f84344f3SAlex Elder rbd_dev->mapping.read_only = true; 673602adf40SYehuda Sadeh } 6744e1105a2SAlex Elder rbd_dev->mapping.snap_name = snap_name; 675602adf40SYehuda Sadeh done: 676602adf40SYehuda Sadeh return ret; 677602adf40SYehuda Sadeh } 678602adf40SYehuda Sadeh 679602adf40SYehuda Sadeh static void rbd_header_free(struct rbd_image_header *header) 680602adf40SYehuda Sadeh { 681849b4260SAlex Elder kfree(header->object_prefix); 682d78fd7aeSAlex Elder header->object_prefix = NULL; 683602adf40SYehuda Sadeh kfree(header->snap_sizes); 684d78fd7aeSAlex Elder header->snap_sizes = NULL; 685849b4260SAlex Elder kfree(header->snap_names); 686d78fd7aeSAlex Elder header->snap_names = NULL; 687d1d25646SJosh Durgin ceph_put_snap_context(header->snapc); 688d78fd7aeSAlex Elder header->snapc = NULL; 689602adf40SYehuda Sadeh } 690602adf40SYehuda Sadeh 69165ccfe21SAlex Elder static char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset) 692602adf40SYehuda Sadeh { 69365ccfe21SAlex Elder char *name; 69465ccfe21SAlex Elder u64 segment; 69565ccfe21SAlex Elder int ret; 696602adf40SYehuda Sadeh 69765ccfe21SAlex Elder name = kmalloc(RBD_MAX_SEG_NAME_LEN + 1, GFP_NOIO); 69865ccfe21SAlex Elder if (!name) 69965ccfe21SAlex Elder return NULL; 70065ccfe21SAlex Elder segment = offset >> rbd_dev->header.obj_order; 70165ccfe21SAlex Elder ret = snprintf(name, RBD_MAX_SEG_NAME_LEN, "%s.%012llx", 70265ccfe21SAlex Elder rbd_dev->header.object_prefix, segment); 70365ccfe21SAlex Elder if (ret < 0 || ret >= RBD_MAX_SEG_NAME_LEN) { 70465ccfe21SAlex Elder pr_err("error formatting segment name for #%llu (%d)\n", 70565ccfe21SAlex Elder segment, ret); 70665ccfe21SAlex Elder kfree(name); 70765ccfe21SAlex Elder name = NULL; 70865ccfe21SAlex Elder } 709602adf40SYehuda Sadeh 71065ccfe21SAlex Elder return name; 71165ccfe21SAlex Elder } 712602adf40SYehuda Sadeh 71365ccfe21SAlex Elder static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset) 71465ccfe21SAlex Elder { 71565ccfe21SAlex Elder u64 segment_size = (u64) 1 << rbd_dev->header.obj_order; 716602adf40SYehuda Sadeh 71765ccfe21SAlex Elder return offset & (segment_size - 1); 71865ccfe21SAlex Elder } 71965ccfe21SAlex Elder 72065ccfe21SAlex Elder static u64 rbd_segment_length(struct rbd_device *rbd_dev, 72165ccfe21SAlex Elder u64 offset, u64 length) 72265ccfe21SAlex Elder { 72365ccfe21SAlex Elder u64 segment_size = (u64) 1 << rbd_dev->header.obj_order; 72465ccfe21SAlex Elder 72565ccfe21SAlex Elder offset &= segment_size - 1; 72665ccfe21SAlex Elder 727aafb230eSAlex Elder rbd_assert(length <= U64_MAX - offset); 72865ccfe21SAlex Elder if (offset + length > segment_size) 72965ccfe21SAlex Elder length = segment_size - offset; 73065ccfe21SAlex Elder 73165ccfe21SAlex Elder return length; 732602adf40SYehuda Sadeh } 733602adf40SYehuda Sadeh 7341fec7093SYehuda Sadeh static int rbd_get_num_segments(struct rbd_image_header *header, 7351fec7093SYehuda Sadeh u64 ofs, u64 len) 7361fec7093SYehuda Sadeh { 737df111be6SAlex Elder u64 start_seg; 738df111be6SAlex Elder u64 end_seg; 739df111be6SAlex Elder 740df111be6SAlex Elder if (!len) 741df111be6SAlex Elder return 0; 742df111be6SAlex Elder if (len - 1 > U64_MAX - ofs) 743df111be6SAlex Elder return -ERANGE; 744df111be6SAlex Elder 745df111be6SAlex Elder start_seg = ofs >> header->obj_order; 746df111be6SAlex Elder end_seg = (ofs + len - 1) >> header->obj_order; 747df111be6SAlex Elder 7481fec7093SYehuda Sadeh return end_seg - start_seg + 1; 7491fec7093SYehuda Sadeh } 7501fec7093SYehuda Sadeh 751602adf40SYehuda Sadeh /* 752029bcbd8SJosh Durgin * returns the size of an object in the image 753029bcbd8SJosh Durgin */ 754029bcbd8SJosh Durgin static u64 rbd_obj_bytes(struct rbd_image_header *header) 755029bcbd8SJosh Durgin { 756029bcbd8SJosh Durgin return 1 << header->obj_order; 757029bcbd8SJosh Durgin } 758029bcbd8SJosh Durgin 759029bcbd8SJosh Durgin /* 760602adf40SYehuda Sadeh * bio helpers 761602adf40SYehuda Sadeh */ 762602adf40SYehuda Sadeh 763602adf40SYehuda Sadeh static void bio_chain_put(struct bio *chain) 764602adf40SYehuda Sadeh { 765602adf40SYehuda Sadeh struct bio *tmp; 766602adf40SYehuda Sadeh 767602adf40SYehuda Sadeh while (chain) { 768602adf40SYehuda Sadeh tmp = chain; 769602adf40SYehuda Sadeh chain = chain->bi_next; 770602adf40SYehuda Sadeh bio_put(tmp); 771602adf40SYehuda Sadeh } 772602adf40SYehuda Sadeh } 773602adf40SYehuda Sadeh 774602adf40SYehuda Sadeh /* 775602adf40SYehuda Sadeh * zeros a bio chain, starting at specific offset 776602adf40SYehuda Sadeh */ 777602adf40SYehuda Sadeh static void zero_bio_chain(struct bio *chain, int start_ofs) 778602adf40SYehuda Sadeh { 779602adf40SYehuda Sadeh struct bio_vec *bv; 780602adf40SYehuda Sadeh unsigned long flags; 781602adf40SYehuda Sadeh void *buf; 782602adf40SYehuda Sadeh int i; 783602adf40SYehuda Sadeh int pos = 0; 784602adf40SYehuda Sadeh 785602adf40SYehuda Sadeh while (chain) { 786602adf40SYehuda Sadeh bio_for_each_segment(bv, chain, i) { 787602adf40SYehuda Sadeh if (pos + bv->bv_len > start_ofs) { 788602adf40SYehuda Sadeh int remainder = max(start_ofs - pos, 0); 789602adf40SYehuda Sadeh buf = bvec_kmap_irq(bv, &flags); 790602adf40SYehuda Sadeh memset(buf + remainder, 0, 791602adf40SYehuda Sadeh bv->bv_len - remainder); 79285b5aaa6SDan Carpenter bvec_kunmap_irq(buf, &flags); 793602adf40SYehuda Sadeh } 794602adf40SYehuda Sadeh pos += bv->bv_len; 795602adf40SYehuda Sadeh } 796602adf40SYehuda Sadeh 797602adf40SYehuda Sadeh chain = chain->bi_next; 798602adf40SYehuda Sadeh } 799602adf40SYehuda Sadeh } 800602adf40SYehuda Sadeh 801602adf40SYehuda Sadeh /* 802602adf40SYehuda Sadeh * bio_chain_clone - clone a chain of bios up to a certain length. 803602adf40SYehuda Sadeh * might return a bio_pair that will need to be released. 804602adf40SYehuda Sadeh */ 805602adf40SYehuda Sadeh static struct bio *bio_chain_clone(struct bio **old, struct bio **next, 806602adf40SYehuda Sadeh struct bio_pair **bp, 807602adf40SYehuda Sadeh int len, gfp_t gfpmask) 808602adf40SYehuda Sadeh { 809542582fcSAlex Elder struct bio *old_chain = *old; 810542582fcSAlex Elder struct bio *new_chain = NULL; 811542582fcSAlex Elder struct bio *tail; 812602adf40SYehuda Sadeh int total = 0; 813602adf40SYehuda Sadeh 814602adf40SYehuda Sadeh if (*bp) { 815602adf40SYehuda Sadeh bio_pair_release(*bp); 816602adf40SYehuda Sadeh *bp = NULL; 817602adf40SYehuda Sadeh } 818602adf40SYehuda Sadeh 819602adf40SYehuda Sadeh while (old_chain && (total < len)) { 820542582fcSAlex Elder struct bio *tmp; 821542582fcSAlex Elder 822602adf40SYehuda Sadeh tmp = bio_kmalloc(gfpmask, old_chain->bi_max_vecs); 823602adf40SYehuda Sadeh if (!tmp) 824602adf40SYehuda Sadeh goto err_out; 825542582fcSAlex Elder gfpmask &= ~__GFP_WAIT; /* can't wait after the first */ 826602adf40SYehuda Sadeh 827602adf40SYehuda Sadeh if (total + old_chain->bi_size > len) { 828602adf40SYehuda Sadeh struct bio_pair *bp; 829602adf40SYehuda Sadeh 830602adf40SYehuda Sadeh /* 831602adf40SYehuda Sadeh * this split can only happen with a single paged bio, 832602adf40SYehuda Sadeh * split_bio will BUG_ON if this is not the case 833602adf40SYehuda Sadeh */ 834602adf40SYehuda Sadeh dout("bio_chain_clone split! total=%d remaining=%d" 835bd919d45SAlex Elder "bi_size=%u\n", 836bd919d45SAlex Elder total, len - total, old_chain->bi_size); 837602adf40SYehuda Sadeh 838602adf40SYehuda Sadeh /* split the bio. We'll release it either in the next 839602adf40SYehuda Sadeh call, or it will have to be released outside */ 840593a9e7bSAlex Elder bp = bio_split(old_chain, (len - total) / SECTOR_SIZE); 841602adf40SYehuda Sadeh if (!bp) 842602adf40SYehuda Sadeh goto err_out; 843602adf40SYehuda Sadeh 844602adf40SYehuda Sadeh __bio_clone(tmp, &bp->bio1); 845602adf40SYehuda Sadeh 846602adf40SYehuda Sadeh *next = &bp->bio2; 847602adf40SYehuda Sadeh } else { 848602adf40SYehuda Sadeh __bio_clone(tmp, old_chain); 849602adf40SYehuda Sadeh *next = old_chain->bi_next; 850602adf40SYehuda Sadeh } 851602adf40SYehuda Sadeh 852602adf40SYehuda Sadeh tmp->bi_bdev = NULL; 853602adf40SYehuda Sadeh tmp->bi_next = NULL; 854542582fcSAlex Elder if (new_chain) 855602adf40SYehuda Sadeh tail->bi_next = tmp; 856542582fcSAlex Elder else 857542582fcSAlex Elder new_chain = tmp; 858602adf40SYehuda Sadeh tail = tmp; 859602adf40SYehuda Sadeh old_chain = old_chain->bi_next; 860602adf40SYehuda Sadeh 861602adf40SYehuda Sadeh total += tmp->bi_size; 862602adf40SYehuda Sadeh } 863602adf40SYehuda Sadeh 864aafb230eSAlex Elder rbd_assert(total == len); 865602adf40SYehuda Sadeh 866602adf40SYehuda Sadeh *old = old_chain; 867602adf40SYehuda Sadeh 868602adf40SYehuda Sadeh return new_chain; 869602adf40SYehuda Sadeh 870602adf40SYehuda Sadeh err_out: 871602adf40SYehuda Sadeh dout("bio_chain_clone with err\n"); 872602adf40SYehuda Sadeh bio_chain_put(new_chain); 873602adf40SYehuda Sadeh return NULL; 874602adf40SYehuda Sadeh } 875602adf40SYehuda Sadeh 876602adf40SYehuda Sadeh /* 877602adf40SYehuda Sadeh * helpers for osd request op vectors. 878602adf40SYehuda Sadeh */ 87957cfc106SAlex Elder static struct ceph_osd_req_op *rbd_create_rw_ops(int num_ops, 88057cfc106SAlex Elder int opcode, u32 payload_len) 881602adf40SYehuda Sadeh { 88257cfc106SAlex Elder struct ceph_osd_req_op *ops; 88357cfc106SAlex Elder 88457cfc106SAlex Elder ops = kzalloc(sizeof (*ops) * (num_ops + 1), GFP_NOIO); 88557cfc106SAlex Elder if (!ops) 88657cfc106SAlex Elder return NULL; 88757cfc106SAlex Elder 88857cfc106SAlex Elder ops[0].op = opcode; 88957cfc106SAlex Elder 890602adf40SYehuda Sadeh /* 891602adf40SYehuda Sadeh * op extent offset and length will be set later on 892602adf40SYehuda Sadeh * in calc_raw_layout() 893602adf40SYehuda Sadeh */ 89457cfc106SAlex Elder ops[0].payload_len = payload_len; 89557cfc106SAlex Elder 89657cfc106SAlex Elder return ops; 897602adf40SYehuda Sadeh } 898602adf40SYehuda Sadeh 899602adf40SYehuda Sadeh static void rbd_destroy_ops(struct ceph_osd_req_op *ops) 900602adf40SYehuda Sadeh { 901602adf40SYehuda Sadeh kfree(ops); 902602adf40SYehuda Sadeh } 903602adf40SYehuda Sadeh 9041fec7093SYehuda Sadeh static void rbd_coll_end_req_index(struct request *rq, 9051fec7093SYehuda Sadeh struct rbd_req_coll *coll, 9061fec7093SYehuda Sadeh int index, 9071fec7093SYehuda Sadeh int ret, u64 len) 9081fec7093SYehuda Sadeh { 9091fec7093SYehuda Sadeh struct request_queue *q; 9101fec7093SYehuda Sadeh int min, max, i; 9111fec7093SYehuda Sadeh 912bd919d45SAlex Elder dout("rbd_coll_end_req_index %p index %d ret %d len %llu\n", 913bd919d45SAlex Elder coll, index, ret, (unsigned long long) len); 9141fec7093SYehuda Sadeh 9151fec7093SYehuda Sadeh if (!rq) 9161fec7093SYehuda Sadeh return; 9171fec7093SYehuda Sadeh 9181fec7093SYehuda Sadeh if (!coll) { 9191fec7093SYehuda Sadeh blk_end_request(rq, ret, len); 9201fec7093SYehuda Sadeh return; 9211fec7093SYehuda Sadeh } 9221fec7093SYehuda Sadeh 9231fec7093SYehuda Sadeh q = rq->q; 9241fec7093SYehuda Sadeh 9251fec7093SYehuda Sadeh spin_lock_irq(q->queue_lock); 9261fec7093SYehuda Sadeh coll->status[index].done = 1; 9271fec7093SYehuda Sadeh coll->status[index].rc = ret; 9281fec7093SYehuda Sadeh coll->status[index].bytes = len; 9291fec7093SYehuda Sadeh max = min = coll->num_done; 9301fec7093SYehuda Sadeh while (max < coll->total && coll->status[max].done) 9311fec7093SYehuda Sadeh max++; 9321fec7093SYehuda Sadeh 9331fec7093SYehuda Sadeh for (i = min; i<max; i++) { 9341fec7093SYehuda Sadeh __blk_end_request(rq, coll->status[i].rc, 9351fec7093SYehuda Sadeh coll->status[i].bytes); 9361fec7093SYehuda Sadeh coll->num_done++; 9371fec7093SYehuda Sadeh kref_put(&coll->kref, rbd_coll_release); 9381fec7093SYehuda Sadeh } 9391fec7093SYehuda Sadeh spin_unlock_irq(q->queue_lock); 9401fec7093SYehuda Sadeh } 9411fec7093SYehuda Sadeh 9421fec7093SYehuda Sadeh static void rbd_coll_end_req(struct rbd_request *req, 9431fec7093SYehuda Sadeh int ret, u64 len) 9441fec7093SYehuda Sadeh { 9451fec7093SYehuda Sadeh rbd_coll_end_req_index(req->rq, req->coll, req->coll_index, ret, len); 9461fec7093SYehuda Sadeh } 9471fec7093SYehuda Sadeh 948602adf40SYehuda Sadeh /* 949602adf40SYehuda Sadeh * Send ceph osd request 950602adf40SYehuda Sadeh */ 951602adf40SYehuda Sadeh static int rbd_do_request(struct request *rq, 9520ce1a794SAlex Elder struct rbd_device *rbd_dev, 953602adf40SYehuda Sadeh struct ceph_snap_context *snapc, 954602adf40SYehuda Sadeh u64 snapid, 955aded07eaSAlex Elder const char *object_name, u64 ofs, u64 len, 956602adf40SYehuda Sadeh struct bio *bio, 957602adf40SYehuda Sadeh struct page **pages, 958602adf40SYehuda Sadeh int num_pages, 959602adf40SYehuda Sadeh int flags, 960602adf40SYehuda Sadeh struct ceph_osd_req_op *ops, 9611fec7093SYehuda Sadeh struct rbd_req_coll *coll, 9621fec7093SYehuda Sadeh int coll_index, 963602adf40SYehuda Sadeh void (*rbd_cb)(struct ceph_osd_request *req, 96459c2be1eSYehuda Sadeh struct ceph_msg *msg), 96559c2be1eSYehuda Sadeh struct ceph_osd_request **linger_req, 96659c2be1eSYehuda Sadeh u64 *ver) 967602adf40SYehuda Sadeh { 968602adf40SYehuda Sadeh struct ceph_osd_request *req; 969602adf40SYehuda Sadeh struct ceph_file_layout *layout; 970602adf40SYehuda Sadeh int ret; 971602adf40SYehuda Sadeh u64 bno; 972602adf40SYehuda Sadeh struct timespec mtime = CURRENT_TIME; 973602adf40SYehuda Sadeh struct rbd_request *req_data; 974602adf40SYehuda Sadeh struct ceph_osd_request_head *reqhead; 9751dbb4399SAlex Elder struct ceph_osd_client *osdc; 976602adf40SYehuda Sadeh 977602adf40SYehuda Sadeh req_data = kzalloc(sizeof(*req_data), GFP_NOIO); 9781fec7093SYehuda Sadeh if (!req_data) { 9791fec7093SYehuda Sadeh if (coll) 9801fec7093SYehuda Sadeh rbd_coll_end_req_index(rq, coll, coll_index, 9811fec7093SYehuda Sadeh -ENOMEM, len); 9821fec7093SYehuda Sadeh return -ENOMEM; 9831fec7093SYehuda Sadeh } 984602adf40SYehuda Sadeh 9851fec7093SYehuda Sadeh if (coll) { 9861fec7093SYehuda Sadeh req_data->coll = coll; 9871fec7093SYehuda Sadeh req_data->coll_index = coll_index; 9881fec7093SYehuda Sadeh } 9891fec7093SYehuda Sadeh 990bd919d45SAlex Elder dout("rbd_do_request object_name=%s ofs=%llu len=%llu\n", object_name, 991bd919d45SAlex Elder (unsigned long long) ofs, (unsigned long long) len); 992602adf40SYehuda Sadeh 9930ce1a794SAlex Elder osdc = &rbd_dev->rbd_client->client->osdc; 9941dbb4399SAlex Elder req = ceph_osdc_alloc_request(osdc, flags, snapc, ops, 9951dbb4399SAlex Elder false, GFP_NOIO, pages, bio); 9964ad12621SSage Weil if (!req) { 9974ad12621SSage Weil ret = -ENOMEM; 998602adf40SYehuda Sadeh goto done_pages; 999602adf40SYehuda Sadeh } 1000602adf40SYehuda Sadeh 1001602adf40SYehuda Sadeh req->r_callback = rbd_cb; 1002602adf40SYehuda Sadeh 1003602adf40SYehuda Sadeh req_data->rq = rq; 1004602adf40SYehuda Sadeh req_data->bio = bio; 1005602adf40SYehuda Sadeh req_data->pages = pages; 1006602adf40SYehuda Sadeh req_data->len = len; 1007602adf40SYehuda Sadeh 1008602adf40SYehuda Sadeh req->r_priv = req_data; 1009602adf40SYehuda Sadeh 1010602adf40SYehuda Sadeh reqhead = req->r_request->front.iov_base; 1011602adf40SYehuda Sadeh reqhead->snapid = cpu_to_le64(CEPH_NOSNAP); 1012602adf40SYehuda Sadeh 1013aded07eaSAlex Elder strncpy(req->r_oid, object_name, sizeof(req->r_oid)); 1014602adf40SYehuda Sadeh req->r_oid_len = strlen(req->r_oid); 1015602adf40SYehuda Sadeh 1016602adf40SYehuda Sadeh layout = &req->r_file_layout; 1017602adf40SYehuda Sadeh memset(layout, 0, sizeof(*layout)); 1018602adf40SYehuda Sadeh layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER); 1019602adf40SYehuda Sadeh layout->fl_stripe_count = cpu_to_le32(1); 1020602adf40SYehuda Sadeh layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER); 10210ce1a794SAlex Elder layout->fl_pg_pool = cpu_to_le32(rbd_dev->pool_id); 10221dbb4399SAlex Elder ceph_calc_raw_layout(osdc, layout, snapid, ofs, &len, &bno, 10231dbb4399SAlex Elder req, ops); 1024602adf40SYehuda Sadeh 1025602adf40SYehuda Sadeh ceph_osdc_build_request(req, ofs, &len, 1026602adf40SYehuda Sadeh ops, 1027602adf40SYehuda Sadeh snapc, 1028602adf40SYehuda Sadeh &mtime, 1029602adf40SYehuda Sadeh req->r_oid, req->r_oid_len); 1030602adf40SYehuda Sadeh 103159c2be1eSYehuda Sadeh if (linger_req) { 10321dbb4399SAlex Elder ceph_osdc_set_request_linger(osdc, req); 103359c2be1eSYehuda Sadeh *linger_req = req; 103459c2be1eSYehuda Sadeh } 103559c2be1eSYehuda Sadeh 10361dbb4399SAlex Elder ret = ceph_osdc_start_request(osdc, req, false); 1037602adf40SYehuda Sadeh if (ret < 0) 1038602adf40SYehuda Sadeh goto done_err; 1039602adf40SYehuda Sadeh 1040602adf40SYehuda Sadeh if (!rbd_cb) { 10411dbb4399SAlex Elder ret = ceph_osdc_wait_request(osdc, req); 104259c2be1eSYehuda Sadeh if (ver) 104359c2be1eSYehuda Sadeh *ver = le64_to_cpu(req->r_reassert_version.version); 1044bd919d45SAlex Elder dout("reassert_ver=%llu\n", 1045bd919d45SAlex Elder (unsigned long long) 10461fec7093SYehuda Sadeh le64_to_cpu(req->r_reassert_version.version)); 1047602adf40SYehuda Sadeh ceph_osdc_put_request(req); 1048602adf40SYehuda Sadeh } 1049602adf40SYehuda Sadeh return ret; 1050602adf40SYehuda Sadeh 1051602adf40SYehuda Sadeh done_err: 1052602adf40SYehuda Sadeh bio_chain_put(req_data->bio); 1053602adf40SYehuda Sadeh ceph_osdc_put_request(req); 1054602adf40SYehuda Sadeh done_pages: 10551fec7093SYehuda Sadeh rbd_coll_end_req(req_data, ret, len); 1056602adf40SYehuda Sadeh kfree(req_data); 1057602adf40SYehuda Sadeh return ret; 1058602adf40SYehuda Sadeh } 1059602adf40SYehuda Sadeh 1060602adf40SYehuda Sadeh /* 1061602adf40SYehuda Sadeh * Ceph osd op callback 1062602adf40SYehuda Sadeh */ 1063602adf40SYehuda Sadeh static void rbd_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg) 1064602adf40SYehuda Sadeh { 1065602adf40SYehuda Sadeh struct rbd_request *req_data = req->r_priv; 1066602adf40SYehuda Sadeh struct ceph_osd_reply_head *replyhead; 1067602adf40SYehuda Sadeh struct ceph_osd_op *op; 1068602adf40SYehuda Sadeh __s32 rc; 1069602adf40SYehuda Sadeh u64 bytes; 1070602adf40SYehuda Sadeh int read_op; 1071602adf40SYehuda Sadeh 1072602adf40SYehuda Sadeh /* parse reply */ 1073602adf40SYehuda Sadeh replyhead = msg->front.iov_base; 1074602adf40SYehuda Sadeh WARN_ON(le32_to_cpu(replyhead->num_ops) == 0); 1075602adf40SYehuda Sadeh op = (void *)(replyhead + 1); 1076602adf40SYehuda Sadeh rc = le32_to_cpu(replyhead->result); 1077602adf40SYehuda Sadeh bytes = le64_to_cpu(op->extent.length); 1078895cfcc8SDan Carpenter read_op = (le16_to_cpu(op->op) == CEPH_OSD_OP_READ); 1079602adf40SYehuda Sadeh 1080bd919d45SAlex Elder dout("rbd_req_cb bytes=%llu readop=%d rc=%d\n", 1081bd919d45SAlex Elder (unsigned long long) bytes, read_op, (int) rc); 1082602adf40SYehuda Sadeh 1083602adf40SYehuda Sadeh if (rc == -ENOENT && read_op) { 1084602adf40SYehuda Sadeh zero_bio_chain(req_data->bio, 0); 1085602adf40SYehuda Sadeh rc = 0; 1086602adf40SYehuda Sadeh } else if (rc == 0 && read_op && bytes < req_data->len) { 1087602adf40SYehuda Sadeh zero_bio_chain(req_data->bio, bytes); 1088602adf40SYehuda Sadeh bytes = req_data->len; 1089602adf40SYehuda Sadeh } 1090602adf40SYehuda Sadeh 10911fec7093SYehuda Sadeh rbd_coll_end_req(req_data, rc, bytes); 1092602adf40SYehuda Sadeh 1093602adf40SYehuda Sadeh if (req_data->bio) 1094602adf40SYehuda Sadeh bio_chain_put(req_data->bio); 1095602adf40SYehuda Sadeh 1096602adf40SYehuda Sadeh ceph_osdc_put_request(req); 1097602adf40SYehuda Sadeh kfree(req_data); 1098602adf40SYehuda Sadeh } 1099602adf40SYehuda Sadeh 110059c2be1eSYehuda Sadeh static void rbd_simple_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg) 110159c2be1eSYehuda Sadeh { 110259c2be1eSYehuda Sadeh ceph_osdc_put_request(req); 110359c2be1eSYehuda Sadeh } 110459c2be1eSYehuda Sadeh 1105602adf40SYehuda Sadeh /* 1106602adf40SYehuda Sadeh * Do a synchronous ceph osd operation 1107602adf40SYehuda Sadeh */ 11080ce1a794SAlex Elder static int rbd_req_sync_op(struct rbd_device *rbd_dev, 1109602adf40SYehuda Sadeh struct ceph_snap_context *snapc, 1110602adf40SYehuda Sadeh u64 snapid, 1111602adf40SYehuda Sadeh int flags, 1112913d2fdcSAlex Elder struct ceph_osd_req_op *ops, 1113aded07eaSAlex Elder const char *object_name, 1114f8d4de6eSAlex Elder u64 ofs, u64 inbound_size, 1115f8d4de6eSAlex Elder char *inbound, 111659c2be1eSYehuda Sadeh struct ceph_osd_request **linger_req, 111759c2be1eSYehuda Sadeh u64 *ver) 1118602adf40SYehuda Sadeh { 1119602adf40SYehuda Sadeh int ret; 1120602adf40SYehuda Sadeh struct page **pages; 1121602adf40SYehuda Sadeh int num_pages; 1122913d2fdcSAlex Elder 1123aafb230eSAlex Elder rbd_assert(ops != NULL); 1124602adf40SYehuda Sadeh 1125f8d4de6eSAlex Elder num_pages = calc_pages_for(ofs, inbound_size); 1126602adf40SYehuda Sadeh pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL); 1127b8d0638aSDan Carpenter if (IS_ERR(pages)) 1128b8d0638aSDan Carpenter return PTR_ERR(pages); 1129602adf40SYehuda Sadeh 11300ce1a794SAlex Elder ret = rbd_do_request(NULL, rbd_dev, snapc, snapid, 1131f8d4de6eSAlex Elder object_name, ofs, inbound_size, NULL, 1132602adf40SYehuda Sadeh pages, num_pages, 1133602adf40SYehuda Sadeh flags, 1134602adf40SYehuda Sadeh ops, 11351fec7093SYehuda Sadeh NULL, 0, 113659c2be1eSYehuda Sadeh NULL, 113759c2be1eSYehuda Sadeh linger_req, ver); 1138602adf40SYehuda Sadeh if (ret < 0) 1139913d2fdcSAlex Elder goto done; 1140602adf40SYehuda Sadeh 1141f8d4de6eSAlex Elder if ((flags & CEPH_OSD_FLAG_READ) && inbound) 1142f8d4de6eSAlex Elder ret = ceph_copy_from_page_vector(pages, inbound, ofs, ret); 1143602adf40SYehuda Sadeh 1144602adf40SYehuda Sadeh done: 1145602adf40SYehuda Sadeh ceph_release_page_vector(pages, num_pages); 1146602adf40SYehuda Sadeh return ret; 1147602adf40SYehuda Sadeh } 1148602adf40SYehuda Sadeh 1149602adf40SYehuda Sadeh /* 1150602adf40SYehuda Sadeh * Do an asynchronous ceph osd operation 1151602adf40SYehuda Sadeh */ 1152602adf40SYehuda Sadeh static int rbd_do_op(struct request *rq, 1153602adf40SYehuda Sadeh struct rbd_device *rbd_dev, 1154602adf40SYehuda Sadeh struct ceph_snap_context *snapc, 1155602adf40SYehuda Sadeh u64 snapid, 1156d1f57ea6SAlex Elder int opcode, int flags, 1157602adf40SYehuda Sadeh u64 ofs, u64 len, 11581fec7093SYehuda Sadeh struct bio *bio, 11591fec7093SYehuda Sadeh struct rbd_req_coll *coll, 11601fec7093SYehuda Sadeh int coll_index) 1161602adf40SYehuda Sadeh { 1162602adf40SYehuda Sadeh char *seg_name; 1163602adf40SYehuda Sadeh u64 seg_ofs; 1164602adf40SYehuda Sadeh u64 seg_len; 1165602adf40SYehuda Sadeh int ret; 1166602adf40SYehuda Sadeh struct ceph_osd_req_op *ops; 1167602adf40SYehuda Sadeh u32 payload_len; 1168602adf40SYehuda Sadeh 116965ccfe21SAlex Elder seg_name = rbd_segment_name(rbd_dev, ofs); 1170602adf40SYehuda Sadeh if (!seg_name) 1171602adf40SYehuda Sadeh return -ENOMEM; 117265ccfe21SAlex Elder seg_len = rbd_segment_length(rbd_dev, ofs, len); 117365ccfe21SAlex Elder seg_ofs = rbd_segment_offset(rbd_dev, ofs); 1174602adf40SYehuda Sadeh 1175602adf40SYehuda Sadeh payload_len = (flags & CEPH_OSD_FLAG_WRITE ? seg_len : 0); 1176602adf40SYehuda Sadeh 117757cfc106SAlex Elder ret = -ENOMEM; 117857cfc106SAlex Elder ops = rbd_create_rw_ops(1, opcode, payload_len); 117957cfc106SAlex Elder if (!ops) 1180602adf40SYehuda Sadeh goto done; 1181602adf40SYehuda Sadeh 1182602adf40SYehuda Sadeh /* we've taken care of segment sizes earlier when we 1183602adf40SYehuda Sadeh cloned the bios. We should never have a segment 1184602adf40SYehuda Sadeh truncated at this point */ 1185aafb230eSAlex Elder rbd_assert(seg_len == len); 1186602adf40SYehuda Sadeh 1187602adf40SYehuda Sadeh ret = rbd_do_request(rq, rbd_dev, snapc, snapid, 1188602adf40SYehuda Sadeh seg_name, seg_ofs, seg_len, 1189602adf40SYehuda Sadeh bio, 1190602adf40SYehuda Sadeh NULL, 0, 1191602adf40SYehuda Sadeh flags, 1192602adf40SYehuda Sadeh ops, 11931fec7093SYehuda Sadeh coll, coll_index, 119459c2be1eSYehuda Sadeh rbd_req_cb, 0, NULL); 119511f77002SSage Weil 119611f77002SSage Weil rbd_destroy_ops(ops); 1197602adf40SYehuda Sadeh done: 1198602adf40SYehuda Sadeh kfree(seg_name); 1199602adf40SYehuda Sadeh return ret; 1200602adf40SYehuda Sadeh } 1201602adf40SYehuda Sadeh 1202602adf40SYehuda Sadeh /* 1203602adf40SYehuda Sadeh * Request async osd write 1204602adf40SYehuda Sadeh */ 1205602adf40SYehuda Sadeh static int rbd_req_write(struct request *rq, 1206602adf40SYehuda Sadeh struct rbd_device *rbd_dev, 1207602adf40SYehuda Sadeh struct ceph_snap_context *snapc, 1208602adf40SYehuda Sadeh u64 ofs, u64 len, 12091fec7093SYehuda Sadeh struct bio *bio, 12101fec7093SYehuda Sadeh struct rbd_req_coll *coll, 12111fec7093SYehuda Sadeh int coll_index) 1212602adf40SYehuda Sadeh { 1213602adf40SYehuda Sadeh return rbd_do_op(rq, rbd_dev, snapc, CEPH_NOSNAP, 1214602adf40SYehuda Sadeh CEPH_OSD_OP_WRITE, 1215602adf40SYehuda Sadeh CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK, 12161fec7093SYehuda Sadeh ofs, len, bio, coll, coll_index); 1217602adf40SYehuda Sadeh } 1218602adf40SYehuda Sadeh 1219602adf40SYehuda Sadeh /* 1220602adf40SYehuda Sadeh * Request async osd read 1221602adf40SYehuda Sadeh */ 1222602adf40SYehuda Sadeh static int rbd_req_read(struct request *rq, 1223602adf40SYehuda Sadeh struct rbd_device *rbd_dev, 1224602adf40SYehuda Sadeh u64 snapid, 1225602adf40SYehuda Sadeh u64 ofs, u64 len, 12261fec7093SYehuda Sadeh struct bio *bio, 12271fec7093SYehuda Sadeh struct rbd_req_coll *coll, 12281fec7093SYehuda Sadeh int coll_index) 1229602adf40SYehuda Sadeh { 1230602adf40SYehuda Sadeh return rbd_do_op(rq, rbd_dev, NULL, 1231b06e6a6bSJosh Durgin snapid, 1232602adf40SYehuda Sadeh CEPH_OSD_OP_READ, 1233602adf40SYehuda Sadeh CEPH_OSD_FLAG_READ, 12341fec7093SYehuda Sadeh ofs, len, bio, coll, coll_index); 1235602adf40SYehuda Sadeh } 1236602adf40SYehuda Sadeh 1237602adf40SYehuda Sadeh /* 1238602adf40SYehuda Sadeh * Request sync osd read 1239602adf40SYehuda Sadeh */ 12400ce1a794SAlex Elder static int rbd_req_sync_read(struct rbd_device *rbd_dev, 1241602adf40SYehuda Sadeh u64 snapid, 1242aded07eaSAlex Elder const char *object_name, 1243602adf40SYehuda Sadeh u64 ofs, u64 len, 124459c2be1eSYehuda Sadeh char *buf, 124559c2be1eSYehuda Sadeh u64 *ver) 1246602adf40SYehuda Sadeh { 1247913d2fdcSAlex Elder struct ceph_osd_req_op *ops; 1248913d2fdcSAlex Elder int ret; 1249913d2fdcSAlex Elder 1250913d2fdcSAlex Elder ops = rbd_create_rw_ops(1, CEPH_OSD_OP_READ, 0); 1251913d2fdcSAlex Elder if (!ops) 1252913d2fdcSAlex Elder return -ENOMEM; 1253913d2fdcSAlex Elder 1254913d2fdcSAlex Elder ret = rbd_req_sync_op(rbd_dev, NULL, 1255b06e6a6bSJosh Durgin snapid, 1256602adf40SYehuda Sadeh CEPH_OSD_FLAG_READ, 1257913d2fdcSAlex Elder ops, object_name, ofs, len, buf, NULL, ver); 1258913d2fdcSAlex Elder rbd_destroy_ops(ops); 1259913d2fdcSAlex Elder 1260913d2fdcSAlex Elder return ret; 1261602adf40SYehuda Sadeh } 1262602adf40SYehuda Sadeh 1263602adf40SYehuda Sadeh /* 126459c2be1eSYehuda Sadeh * Request sync osd watch 126559c2be1eSYehuda Sadeh */ 12660ce1a794SAlex Elder static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev, 126759c2be1eSYehuda Sadeh u64 ver, 12687f0a24d8SAlex Elder u64 notify_id) 126959c2be1eSYehuda Sadeh { 127059c2be1eSYehuda Sadeh struct ceph_osd_req_op *ops; 127111f77002SSage Weil int ret; 127211f77002SSage Weil 127357cfc106SAlex Elder ops = rbd_create_rw_ops(1, CEPH_OSD_OP_NOTIFY_ACK, 0); 127457cfc106SAlex Elder if (!ops) 127557cfc106SAlex Elder return -ENOMEM; 127659c2be1eSYehuda Sadeh 1277a71b891bSJosh Durgin ops[0].watch.ver = cpu_to_le64(ver); 127859c2be1eSYehuda Sadeh ops[0].watch.cookie = notify_id; 127959c2be1eSYehuda Sadeh ops[0].watch.flag = 0; 128059c2be1eSYehuda Sadeh 12810ce1a794SAlex Elder ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP, 12827f0a24d8SAlex Elder rbd_dev->header_name, 0, 0, NULL, 1283ad4f232fSAlex Elder NULL, 0, 128459c2be1eSYehuda Sadeh CEPH_OSD_FLAG_READ, 128559c2be1eSYehuda Sadeh ops, 12861fec7093SYehuda Sadeh NULL, 0, 128759c2be1eSYehuda Sadeh rbd_simple_req_cb, 0, NULL); 128859c2be1eSYehuda Sadeh 128959c2be1eSYehuda Sadeh rbd_destroy_ops(ops); 129059c2be1eSYehuda Sadeh return ret; 129159c2be1eSYehuda Sadeh } 129259c2be1eSYehuda Sadeh 129359c2be1eSYehuda Sadeh static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data) 129459c2be1eSYehuda Sadeh { 12950ce1a794SAlex Elder struct rbd_device *rbd_dev = (struct rbd_device *)data; 1296a71b891bSJosh Durgin u64 hver; 129713143d2dSSage Weil int rc; 129813143d2dSSage Weil 12990ce1a794SAlex Elder if (!rbd_dev) 130059c2be1eSYehuda Sadeh return; 130159c2be1eSYehuda Sadeh 1302bd919d45SAlex Elder dout("rbd_watch_cb %s notify_id=%llu opcode=%u\n", 1303bd919d45SAlex Elder rbd_dev->header_name, (unsigned long long) notify_id, 1304bd919d45SAlex Elder (unsigned int) opcode); 13051fe5e993SAlex Elder rc = rbd_refresh_header(rbd_dev, &hver); 130613143d2dSSage Weil if (rc) 1307f0f8cef5SAlex Elder pr_warning(RBD_DRV_NAME "%d got notification but failed to " 13080ce1a794SAlex Elder " update snaps: %d\n", rbd_dev->major, rc); 130959c2be1eSYehuda Sadeh 13107f0a24d8SAlex Elder rbd_req_sync_notify_ack(rbd_dev, hver, notify_id); 131159c2be1eSYehuda Sadeh } 131259c2be1eSYehuda Sadeh 131359c2be1eSYehuda Sadeh /* 131459c2be1eSYehuda Sadeh * Request sync osd watch 131559c2be1eSYehuda Sadeh */ 13160e6f322dSAlex Elder static int rbd_req_sync_watch(struct rbd_device *rbd_dev) 131759c2be1eSYehuda Sadeh { 131859c2be1eSYehuda Sadeh struct ceph_osd_req_op *ops; 13190ce1a794SAlex Elder struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 132057cfc106SAlex Elder int ret; 132159c2be1eSYehuda Sadeh 132257cfc106SAlex Elder ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0); 132357cfc106SAlex Elder if (!ops) 132457cfc106SAlex Elder return -ENOMEM; 132559c2be1eSYehuda Sadeh 132659c2be1eSYehuda Sadeh ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0, 13270ce1a794SAlex Elder (void *)rbd_dev, &rbd_dev->watch_event); 132859c2be1eSYehuda Sadeh if (ret < 0) 132959c2be1eSYehuda Sadeh goto fail; 133059c2be1eSYehuda Sadeh 13310e6f322dSAlex Elder ops[0].watch.ver = cpu_to_le64(rbd_dev->header.obj_version); 13320ce1a794SAlex Elder ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie); 133359c2be1eSYehuda Sadeh ops[0].watch.flag = 1; 133459c2be1eSYehuda Sadeh 13350ce1a794SAlex Elder ret = rbd_req_sync_op(rbd_dev, NULL, 133659c2be1eSYehuda Sadeh CEPH_NOSNAP, 133759c2be1eSYehuda Sadeh CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK, 133859c2be1eSYehuda Sadeh ops, 13390e6f322dSAlex Elder rbd_dev->header_name, 13400e6f322dSAlex Elder 0, 0, NULL, 13410ce1a794SAlex Elder &rbd_dev->watch_request, NULL); 134259c2be1eSYehuda Sadeh 134359c2be1eSYehuda Sadeh if (ret < 0) 134459c2be1eSYehuda Sadeh goto fail_event; 134559c2be1eSYehuda Sadeh 134659c2be1eSYehuda Sadeh rbd_destroy_ops(ops); 134759c2be1eSYehuda Sadeh return 0; 134859c2be1eSYehuda Sadeh 134959c2be1eSYehuda Sadeh fail_event: 13500ce1a794SAlex Elder ceph_osdc_cancel_event(rbd_dev->watch_event); 13510ce1a794SAlex Elder rbd_dev->watch_event = NULL; 135259c2be1eSYehuda Sadeh fail: 135359c2be1eSYehuda Sadeh rbd_destroy_ops(ops); 135459c2be1eSYehuda Sadeh return ret; 135559c2be1eSYehuda Sadeh } 135659c2be1eSYehuda Sadeh 135779e3057cSYehuda Sadeh /* 135879e3057cSYehuda Sadeh * Request sync osd unwatch 135979e3057cSYehuda Sadeh */ 1360070c633fSAlex Elder static int rbd_req_sync_unwatch(struct rbd_device *rbd_dev) 136179e3057cSYehuda Sadeh { 136279e3057cSYehuda Sadeh struct ceph_osd_req_op *ops; 136357cfc106SAlex Elder int ret; 136479e3057cSYehuda Sadeh 136557cfc106SAlex Elder ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0); 136657cfc106SAlex Elder if (!ops) 136757cfc106SAlex Elder return -ENOMEM; 136879e3057cSYehuda Sadeh 136979e3057cSYehuda Sadeh ops[0].watch.ver = 0; 13700ce1a794SAlex Elder ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie); 137179e3057cSYehuda Sadeh ops[0].watch.flag = 0; 137279e3057cSYehuda Sadeh 13730ce1a794SAlex Elder ret = rbd_req_sync_op(rbd_dev, NULL, 137479e3057cSYehuda Sadeh CEPH_NOSNAP, 137579e3057cSYehuda Sadeh CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK, 137679e3057cSYehuda Sadeh ops, 1377070c633fSAlex Elder rbd_dev->header_name, 1378070c633fSAlex Elder 0, 0, NULL, NULL, NULL); 1379070c633fSAlex Elder 138079e3057cSYehuda Sadeh 138179e3057cSYehuda Sadeh rbd_destroy_ops(ops); 13820ce1a794SAlex Elder ceph_osdc_cancel_event(rbd_dev->watch_event); 13830ce1a794SAlex Elder rbd_dev->watch_event = NULL; 138479e3057cSYehuda Sadeh return ret; 138579e3057cSYehuda Sadeh } 138679e3057cSYehuda Sadeh 138759c2be1eSYehuda Sadeh /* 13883cb4a687SAlex Elder * Synchronous osd object method call 1389602adf40SYehuda Sadeh */ 13900ce1a794SAlex Elder static int rbd_req_sync_exec(struct rbd_device *rbd_dev, 1391aded07eaSAlex Elder const char *object_name, 1392aded07eaSAlex Elder const char *class_name, 1393aded07eaSAlex Elder const char *method_name, 13943cb4a687SAlex Elder const char *outbound, 13953cb4a687SAlex Elder size_t outbound_size, 1396f8d4de6eSAlex Elder char *inbound, 1397f8d4de6eSAlex Elder size_t inbound_size, 13983cb4a687SAlex Elder int flags, 139959c2be1eSYehuda Sadeh u64 *ver) 1400602adf40SYehuda Sadeh { 1401602adf40SYehuda Sadeh struct ceph_osd_req_op *ops; 1402aded07eaSAlex Elder int class_name_len = strlen(class_name); 1403aded07eaSAlex Elder int method_name_len = strlen(method_name); 14043cb4a687SAlex Elder int payload_size; 140557cfc106SAlex Elder int ret; 140657cfc106SAlex Elder 14073cb4a687SAlex Elder /* 14083cb4a687SAlex Elder * Any input parameters required by the method we're calling 14093cb4a687SAlex Elder * will be sent along with the class and method names as 14103cb4a687SAlex Elder * part of the message payload. That data and its size are 14113cb4a687SAlex Elder * supplied via the indata and indata_len fields (named from 14123cb4a687SAlex Elder * the perspective of the server side) in the OSD request 14133cb4a687SAlex Elder * operation. 14143cb4a687SAlex Elder */ 14153cb4a687SAlex Elder payload_size = class_name_len + method_name_len + outbound_size; 14163cb4a687SAlex Elder ops = rbd_create_rw_ops(1, CEPH_OSD_OP_CALL, payload_size); 141757cfc106SAlex Elder if (!ops) 141857cfc106SAlex Elder return -ENOMEM; 1419602adf40SYehuda Sadeh 1420aded07eaSAlex Elder ops[0].cls.class_name = class_name; 1421aded07eaSAlex Elder ops[0].cls.class_len = (__u8) class_name_len; 1422aded07eaSAlex Elder ops[0].cls.method_name = method_name; 1423aded07eaSAlex Elder ops[0].cls.method_len = (__u8) method_name_len; 1424602adf40SYehuda Sadeh ops[0].cls.argc = 0; 14253cb4a687SAlex Elder ops[0].cls.indata = outbound; 14263cb4a687SAlex Elder ops[0].cls.indata_len = outbound_size; 1427602adf40SYehuda Sadeh 14280ce1a794SAlex Elder ret = rbd_req_sync_op(rbd_dev, NULL, 1429602adf40SYehuda Sadeh CEPH_NOSNAP, 14303cb4a687SAlex Elder flags, ops, 1431f8d4de6eSAlex Elder object_name, 0, inbound_size, inbound, 1432f8d4de6eSAlex Elder NULL, ver); 1433602adf40SYehuda Sadeh 1434602adf40SYehuda Sadeh rbd_destroy_ops(ops); 1435602adf40SYehuda Sadeh 1436602adf40SYehuda Sadeh dout("cls_exec returned %d\n", ret); 1437602adf40SYehuda Sadeh return ret; 1438602adf40SYehuda Sadeh } 1439602adf40SYehuda Sadeh 14401fec7093SYehuda Sadeh static struct rbd_req_coll *rbd_alloc_coll(int num_reqs) 14411fec7093SYehuda Sadeh { 14421fec7093SYehuda Sadeh struct rbd_req_coll *coll = 14431fec7093SYehuda Sadeh kzalloc(sizeof(struct rbd_req_coll) + 14441fec7093SYehuda Sadeh sizeof(struct rbd_req_status) * num_reqs, 14451fec7093SYehuda Sadeh GFP_ATOMIC); 14461fec7093SYehuda Sadeh 14471fec7093SYehuda Sadeh if (!coll) 14481fec7093SYehuda Sadeh return NULL; 14491fec7093SYehuda Sadeh coll->total = num_reqs; 14501fec7093SYehuda Sadeh kref_init(&coll->kref); 14511fec7093SYehuda Sadeh return coll; 14521fec7093SYehuda Sadeh } 14531fec7093SYehuda Sadeh 1454602adf40SYehuda Sadeh /* 1455602adf40SYehuda Sadeh * block device queue callback 1456602adf40SYehuda Sadeh */ 1457602adf40SYehuda Sadeh static void rbd_rq_fn(struct request_queue *q) 1458602adf40SYehuda Sadeh { 1459602adf40SYehuda Sadeh struct rbd_device *rbd_dev = q->queuedata; 1460602adf40SYehuda Sadeh struct request *rq; 1461602adf40SYehuda Sadeh struct bio_pair *bp = NULL; 1462602adf40SYehuda Sadeh 146300f1f36fSAlex Elder while ((rq = blk_fetch_request(q))) { 1464602adf40SYehuda Sadeh struct bio *bio; 1465602adf40SYehuda Sadeh struct bio *rq_bio, *next_bio = NULL; 1466602adf40SYehuda Sadeh bool do_write; 1467bd919d45SAlex Elder unsigned int size; 1468bd919d45SAlex Elder u64 op_size = 0; 1469602adf40SYehuda Sadeh u64 ofs; 14701fec7093SYehuda Sadeh int num_segs, cur_seg = 0; 14711fec7093SYehuda Sadeh struct rbd_req_coll *coll; 1472d1d25646SJosh Durgin struct ceph_snap_context *snapc; 1473602adf40SYehuda Sadeh 1474602adf40SYehuda Sadeh dout("fetched request\n"); 1475602adf40SYehuda Sadeh 1476602adf40SYehuda Sadeh /* filter out block requests we don't understand */ 1477602adf40SYehuda Sadeh if ((rq->cmd_type != REQ_TYPE_FS)) { 1478602adf40SYehuda Sadeh __blk_end_request_all(rq, 0); 147900f1f36fSAlex Elder continue; 1480602adf40SYehuda Sadeh } 1481602adf40SYehuda Sadeh 1482602adf40SYehuda Sadeh /* deduce our operation (read, write) */ 1483602adf40SYehuda Sadeh do_write = (rq_data_dir(rq) == WRITE); 1484602adf40SYehuda Sadeh 1485602adf40SYehuda Sadeh size = blk_rq_bytes(rq); 1486593a9e7bSAlex Elder ofs = blk_rq_pos(rq) * SECTOR_SIZE; 1487602adf40SYehuda Sadeh rq_bio = rq->bio; 1488f84344f3SAlex Elder if (do_write && rbd_dev->mapping.read_only) { 1489602adf40SYehuda Sadeh __blk_end_request_all(rq, -EROFS); 149000f1f36fSAlex Elder continue; 1491602adf40SYehuda Sadeh } 1492602adf40SYehuda Sadeh 1493602adf40SYehuda Sadeh spin_unlock_irq(q->queue_lock); 1494602adf40SYehuda Sadeh 1495e88a36ecSJosh Durgin down_read(&rbd_dev->header_rwsem); 1496e88a36ecSJosh Durgin 1497f84344f3SAlex Elder if (rbd_dev->mapping.snap_id != CEPH_NOSNAP && 1498f84344f3SAlex Elder !rbd_dev->mapping.snap_exists) { 1499d1d25646SJosh Durgin up_read(&rbd_dev->header_rwsem); 1500e88a36ecSJosh Durgin dout("request for non-existent snapshot"); 1501e88a36ecSJosh Durgin spin_lock_irq(q->queue_lock); 1502e88a36ecSJosh Durgin __blk_end_request_all(rq, -ENXIO); 1503e88a36ecSJosh Durgin continue; 1504e88a36ecSJosh Durgin } 1505d1d25646SJosh Durgin 1506d1d25646SJosh Durgin snapc = ceph_get_snap_context(rbd_dev->header.snapc); 1507d1d25646SJosh Durgin 1508d1d25646SJosh Durgin up_read(&rbd_dev->header_rwsem); 1509e88a36ecSJosh Durgin 1510602adf40SYehuda Sadeh dout("%s 0x%x bytes at 0x%llx\n", 1511602adf40SYehuda Sadeh do_write ? "write" : "read", 1512bd919d45SAlex Elder size, (unsigned long long) blk_rq_pos(rq) * SECTOR_SIZE); 1513602adf40SYehuda Sadeh 15141fec7093SYehuda Sadeh num_segs = rbd_get_num_segments(&rbd_dev->header, ofs, size); 1515df111be6SAlex Elder if (num_segs <= 0) { 1516df111be6SAlex Elder spin_lock_irq(q->queue_lock); 1517df111be6SAlex Elder __blk_end_request_all(rq, num_segs); 1518df111be6SAlex Elder ceph_put_snap_context(snapc); 1519df111be6SAlex Elder continue; 1520df111be6SAlex Elder } 15211fec7093SYehuda Sadeh coll = rbd_alloc_coll(num_segs); 15221fec7093SYehuda Sadeh if (!coll) { 15231fec7093SYehuda Sadeh spin_lock_irq(q->queue_lock); 15241fec7093SYehuda Sadeh __blk_end_request_all(rq, -ENOMEM); 1525d1d25646SJosh Durgin ceph_put_snap_context(snapc); 152600f1f36fSAlex Elder continue; 15271fec7093SYehuda Sadeh } 15281fec7093SYehuda Sadeh 1529602adf40SYehuda Sadeh do { 1530602adf40SYehuda Sadeh /* a bio clone to be passed down to OSD req */ 1531bd919d45SAlex Elder dout("rq->bio->bi_vcnt=%hu\n", rq->bio->bi_vcnt); 153265ccfe21SAlex Elder op_size = rbd_segment_length(rbd_dev, ofs, size); 15331fec7093SYehuda Sadeh kref_get(&coll->kref); 1534602adf40SYehuda Sadeh bio = bio_chain_clone(&rq_bio, &next_bio, &bp, 1535602adf40SYehuda Sadeh op_size, GFP_ATOMIC); 1536602adf40SYehuda Sadeh if (!bio) { 15371fec7093SYehuda Sadeh rbd_coll_end_req_index(rq, coll, cur_seg, 15381fec7093SYehuda Sadeh -ENOMEM, op_size); 15391fec7093SYehuda Sadeh goto next_seg; 1540602adf40SYehuda Sadeh } 1541602adf40SYehuda Sadeh 15421fec7093SYehuda Sadeh 1543602adf40SYehuda Sadeh /* init OSD command: write or read */ 1544602adf40SYehuda Sadeh if (do_write) 1545602adf40SYehuda Sadeh rbd_req_write(rq, rbd_dev, 1546d1d25646SJosh Durgin snapc, 1547602adf40SYehuda Sadeh ofs, 15481fec7093SYehuda Sadeh op_size, bio, 15491fec7093SYehuda Sadeh coll, cur_seg); 1550602adf40SYehuda Sadeh else 1551602adf40SYehuda Sadeh rbd_req_read(rq, rbd_dev, 1552f84344f3SAlex Elder rbd_dev->mapping.snap_id, 1553602adf40SYehuda Sadeh ofs, 15541fec7093SYehuda Sadeh op_size, bio, 15551fec7093SYehuda Sadeh coll, cur_seg); 1556602adf40SYehuda Sadeh 15571fec7093SYehuda Sadeh next_seg: 1558602adf40SYehuda Sadeh size -= op_size; 1559602adf40SYehuda Sadeh ofs += op_size; 1560602adf40SYehuda Sadeh 15611fec7093SYehuda Sadeh cur_seg++; 1562602adf40SYehuda Sadeh rq_bio = next_bio; 1563602adf40SYehuda Sadeh } while (size > 0); 15641fec7093SYehuda Sadeh kref_put(&coll->kref, rbd_coll_release); 1565602adf40SYehuda Sadeh 1566602adf40SYehuda Sadeh if (bp) 1567602adf40SYehuda Sadeh bio_pair_release(bp); 1568602adf40SYehuda Sadeh spin_lock_irq(q->queue_lock); 1569d1d25646SJosh Durgin 1570d1d25646SJosh Durgin ceph_put_snap_context(snapc); 1571602adf40SYehuda Sadeh } 1572602adf40SYehuda Sadeh } 1573602adf40SYehuda Sadeh 1574602adf40SYehuda Sadeh /* 1575602adf40SYehuda Sadeh * a queue callback. Makes sure that we don't create a bio that spans across 1576602adf40SYehuda Sadeh * multiple osd objects. One exception would be with a single page bios, 1577602adf40SYehuda Sadeh * which we handle later at bio_chain_clone 1578602adf40SYehuda Sadeh */ 1579602adf40SYehuda Sadeh static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd, 1580602adf40SYehuda Sadeh struct bio_vec *bvec) 1581602adf40SYehuda Sadeh { 1582602adf40SYehuda Sadeh struct rbd_device *rbd_dev = q->queuedata; 1583593a9e7bSAlex Elder unsigned int chunk_sectors; 1584593a9e7bSAlex Elder sector_t sector; 1585593a9e7bSAlex Elder unsigned int bio_sectors; 1586602adf40SYehuda Sadeh int max; 1587602adf40SYehuda Sadeh 1588593a9e7bSAlex Elder chunk_sectors = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT); 1589593a9e7bSAlex Elder sector = bmd->bi_sector + get_start_sect(bmd->bi_bdev); 1590593a9e7bSAlex Elder bio_sectors = bmd->bi_size >> SECTOR_SHIFT; 1591593a9e7bSAlex Elder 1592602adf40SYehuda Sadeh max = (chunk_sectors - ((sector & (chunk_sectors - 1)) 1593593a9e7bSAlex Elder + bio_sectors)) << SECTOR_SHIFT; 1594602adf40SYehuda Sadeh if (max < 0) 1595602adf40SYehuda Sadeh max = 0; /* bio_add cannot handle a negative return */ 1596602adf40SYehuda Sadeh if (max <= bvec->bv_len && bio_sectors == 0) 1597602adf40SYehuda Sadeh return bvec->bv_len; 1598602adf40SYehuda Sadeh return max; 1599602adf40SYehuda Sadeh } 1600602adf40SYehuda Sadeh 1601602adf40SYehuda Sadeh static void rbd_free_disk(struct rbd_device *rbd_dev) 1602602adf40SYehuda Sadeh { 1603602adf40SYehuda Sadeh struct gendisk *disk = rbd_dev->disk; 1604602adf40SYehuda Sadeh 1605602adf40SYehuda Sadeh if (!disk) 1606602adf40SYehuda Sadeh return; 1607602adf40SYehuda Sadeh 1608602adf40SYehuda Sadeh if (disk->flags & GENHD_FL_UP) 1609602adf40SYehuda Sadeh del_gendisk(disk); 1610602adf40SYehuda Sadeh if (disk->queue) 1611602adf40SYehuda Sadeh blk_cleanup_queue(disk->queue); 1612602adf40SYehuda Sadeh put_disk(disk); 1613602adf40SYehuda Sadeh } 1614602adf40SYehuda Sadeh 1615602adf40SYehuda Sadeh /* 16164156d998SAlex Elder * Read the complete header for the given rbd device. 16174156d998SAlex Elder * 16184156d998SAlex Elder * Returns a pointer to a dynamically-allocated buffer containing 16194156d998SAlex Elder * the complete and validated header. Caller can pass the address 16204156d998SAlex Elder * of a variable that will be filled in with the version of the 16214156d998SAlex Elder * header object at the time it was read. 16224156d998SAlex Elder * 16234156d998SAlex Elder * Returns a pointer-coded errno if a failure occurs. 16244156d998SAlex Elder */ 16254156d998SAlex Elder static struct rbd_image_header_ondisk * 16264156d998SAlex Elder rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version) 16274156d998SAlex Elder { 16284156d998SAlex Elder struct rbd_image_header_ondisk *ondisk = NULL; 16294156d998SAlex Elder u32 snap_count = 0; 16304156d998SAlex Elder u64 names_size = 0; 16314156d998SAlex Elder u32 want_count; 16324156d998SAlex Elder int ret; 16334156d998SAlex Elder 16344156d998SAlex Elder /* 16354156d998SAlex Elder * The complete header will include an array of its 64-bit 16364156d998SAlex Elder * snapshot ids, followed by the names of those snapshots as 16374156d998SAlex Elder * a contiguous block of NUL-terminated strings. Note that 16384156d998SAlex Elder * the number of snapshots could change by the time we read 16394156d998SAlex Elder * it in, in which case we re-read it. 16404156d998SAlex Elder */ 16414156d998SAlex Elder do { 16424156d998SAlex Elder size_t size; 16434156d998SAlex Elder 16444156d998SAlex Elder kfree(ondisk); 16454156d998SAlex Elder 16464156d998SAlex Elder size = sizeof (*ondisk); 16474156d998SAlex Elder size += snap_count * sizeof (struct rbd_image_snap_ondisk); 16484156d998SAlex Elder size += names_size; 16494156d998SAlex Elder ondisk = kmalloc(size, GFP_KERNEL); 16504156d998SAlex Elder if (!ondisk) 16514156d998SAlex Elder return ERR_PTR(-ENOMEM); 16524156d998SAlex Elder 16534156d998SAlex Elder ret = rbd_req_sync_read(rbd_dev, CEPH_NOSNAP, 16544156d998SAlex Elder rbd_dev->header_name, 16554156d998SAlex Elder 0, size, 16564156d998SAlex Elder (char *) ondisk, version); 16574156d998SAlex Elder 16584156d998SAlex Elder if (ret < 0) 16594156d998SAlex Elder goto out_err; 16604156d998SAlex Elder if (WARN_ON((size_t) ret < size)) { 16614156d998SAlex Elder ret = -ENXIO; 16624156d998SAlex Elder pr_warning("short header read for image %s" 16634156d998SAlex Elder " (want %zd got %d)\n", 16644156d998SAlex Elder rbd_dev->image_name, size, ret); 16654156d998SAlex Elder goto out_err; 16664156d998SAlex Elder } 16674156d998SAlex Elder if (!rbd_dev_ondisk_valid(ondisk)) { 16684156d998SAlex Elder ret = -ENXIO; 16694156d998SAlex Elder pr_warning("invalid header for image %s\n", 16704156d998SAlex Elder rbd_dev->image_name); 16714156d998SAlex Elder goto out_err; 16724156d998SAlex Elder } 16734156d998SAlex Elder 16744156d998SAlex Elder names_size = le64_to_cpu(ondisk->snap_names_len); 16754156d998SAlex Elder want_count = snap_count; 16764156d998SAlex Elder snap_count = le32_to_cpu(ondisk->snap_count); 16774156d998SAlex Elder } while (snap_count != want_count); 16784156d998SAlex Elder 16794156d998SAlex Elder return ondisk; 16804156d998SAlex Elder 16814156d998SAlex Elder out_err: 16824156d998SAlex Elder kfree(ondisk); 16834156d998SAlex Elder 16844156d998SAlex Elder return ERR_PTR(ret); 16854156d998SAlex Elder } 16864156d998SAlex Elder 16874156d998SAlex Elder /* 1688602adf40SYehuda Sadeh * reload the ondisk the header 1689602adf40SYehuda Sadeh */ 1690602adf40SYehuda Sadeh static int rbd_read_header(struct rbd_device *rbd_dev, 1691602adf40SYehuda Sadeh struct rbd_image_header *header) 1692602adf40SYehuda Sadeh { 16934156d998SAlex Elder struct rbd_image_header_ondisk *ondisk; 16944156d998SAlex Elder u64 ver = 0; 16954156d998SAlex Elder int ret; 1696602adf40SYehuda Sadeh 16974156d998SAlex Elder ondisk = rbd_dev_v1_header_read(rbd_dev, &ver); 16984156d998SAlex Elder if (IS_ERR(ondisk)) 16994156d998SAlex Elder return PTR_ERR(ondisk); 17004156d998SAlex Elder ret = rbd_header_from_disk(header, ondisk); 17014156d998SAlex Elder if (ret >= 0) 170259c2be1eSYehuda Sadeh header->obj_version = ver; 17034156d998SAlex Elder kfree(ondisk); 1704602adf40SYehuda Sadeh 17054156d998SAlex Elder return ret; 1706602adf40SYehuda Sadeh } 1707602adf40SYehuda Sadeh 1708dfc5606dSYehuda Sadeh static void __rbd_remove_all_snaps(struct rbd_device *rbd_dev) 1709dfc5606dSYehuda Sadeh { 1710dfc5606dSYehuda Sadeh struct rbd_snap *snap; 1711a0593290SAlex Elder struct rbd_snap *next; 1712dfc5606dSYehuda Sadeh 1713a0593290SAlex Elder list_for_each_entry_safe(snap, next, &rbd_dev->snaps, node) 171414e7085dSAlex Elder __rbd_remove_snap_dev(snap); 1715dfc5606dSYehuda Sadeh } 1716dfc5606dSYehuda Sadeh 1717602adf40SYehuda Sadeh /* 1718602adf40SYehuda Sadeh * only read the first part of the ondisk header, without the snaps info 1719602adf40SYehuda Sadeh */ 1720b813623aSAlex Elder static int __rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver) 1721602adf40SYehuda Sadeh { 1722602adf40SYehuda Sadeh int ret; 1723602adf40SYehuda Sadeh struct rbd_image_header h; 1724602adf40SYehuda Sadeh 1725602adf40SYehuda Sadeh ret = rbd_read_header(rbd_dev, &h); 1726602adf40SYehuda Sadeh if (ret < 0) 1727602adf40SYehuda Sadeh return ret; 1728602adf40SYehuda Sadeh 1729a51aa0c0SJosh Durgin down_write(&rbd_dev->header_rwsem); 1730a51aa0c0SJosh Durgin 17319db4b3e3SSage Weil /* resized? */ 1732f84344f3SAlex Elder if (rbd_dev->mapping.snap_id == CEPH_NOSNAP) { 1733474ef7ceSJosh Durgin sector_t size = (sector_t) h.image_size / SECTOR_SIZE; 1734474ef7ceSJosh Durgin 173599c1f08fSAlex Elder if (size != (sector_t) rbd_dev->mapping.size) { 173699c1f08fSAlex Elder dout("setting size to %llu sectors", 173799c1f08fSAlex Elder (unsigned long long) size); 173899c1f08fSAlex Elder rbd_dev->mapping.size = (u64) size; 1739474ef7ceSJosh Durgin set_capacity(rbd_dev->disk, size); 1740474ef7ceSJosh Durgin } 174199c1f08fSAlex Elder } 17429db4b3e3SSage Weil 1743849b4260SAlex Elder /* rbd_dev->header.object_prefix shouldn't change */ 1744602adf40SYehuda Sadeh kfree(rbd_dev->header.snap_sizes); 1745849b4260SAlex Elder kfree(rbd_dev->header.snap_names); 1746d1d25646SJosh Durgin /* osd requests may still refer to snapc */ 1747d1d25646SJosh Durgin ceph_put_snap_context(rbd_dev->header.snapc); 1748602adf40SYehuda Sadeh 1749b813623aSAlex Elder if (hver) 1750b813623aSAlex Elder *hver = h.obj_version; 1751a71b891bSJosh Durgin rbd_dev->header.obj_version = h.obj_version; 175293a24e08SJosh Durgin rbd_dev->header.image_size = h.image_size; 1753602adf40SYehuda Sadeh rbd_dev->header.snapc = h.snapc; 1754602adf40SYehuda Sadeh rbd_dev->header.snap_names = h.snap_names; 1755602adf40SYehuda Sadeh rbd_dev->header.snap_sizes = h.snap_sizes; 1756849b4260SAlex Elder /* Free the extra copy of the object prefix */ 1757849b4260SAlex Elder WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix)); 1758849b4260SAlex Elder kfree(h.object_prefix); 1759849b4260SAlex Elder 1760304f6808SAlex Elder ret = rbd_dev_snaps_update(rbd_dev); 1761304f6808SAlex Elder if (!ret) 1762304f6808SAlex Elder ret = rbd_dev_snaps_register(rbd_dev); 1763dfc5606dSYehuda Sadeh 1764c666601aSJosh Durgin up_write(&rbd_dev->header_rwsem); 1765602adf40SYehuda Sadeh 1766dfc5606dSYehuda Sadeh return ret; 1767602adf40SYehuda Sadeh } 1768602adf40SYehuda Sadeh 17691fe5e993SAlex Elder static int rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver) 17701fe5e993SAlex Elder { 17711fe5e993SAlex Elder int ret; 17721fe5e993SAlex Elder 17731fe5e993SAlex Elder mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); 17741fe5e993SAlex Elder ret = __rbd_refresh_header(rbd_dev, hver); 17751fe5e993SAlex Elder mutex_unlock(&ctl_mutex); 17761fe5e993SAlex Elder 17771fe5e993SAlex Elder return ret; 17781fe5e993SAlex Elder } 17791fe5e993SAlex Elder 1780602adf40SYehuda Sadeh static int rbd_init_disk(struct rbd_device *rbd_dev) 1781602adf40SYehuda Sadeh { 1782602adf40SYehuda Sadeh struct gendisk *disk; 1783602adf40SYehuda Sadeh struct request_queue *q; 1784593a9e7bSAlex Elder u64 segment_size; 1785602adf40SYehuda Sadeh 1786602adf40SYehuda Sadeh /* create gendisk info */ 1787602adf40SYehuda Sadeh disk = alloc_disk(RBD_MINORS_PER_MAJOR); 1788602adf40SYehuda Sadeh if (!disk) 17891fcdb8aaSAlex Elder return -ENOMEM; 1790602adf40SYehuda Sadeh 1791f0f8cef5SAlex Elder snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d", 1792de71a297SAlex Elder rbd_dev->dev_id); 1793602adf40SYehuda Sadeh disk->major = rbd_dev->major; 1794602adf40SYehuda Sadeh disk->first_minor = 0; 1795602adf40SYehuda Sadeh disk->fops = &rbd_bd_ops; 1796602adf40SYehuda Sadeh disk->private_data = rbd_dev; 1797602adf40SYehuda Sadeh 1798602adf40SYehuda Sadeh /* init rq */ 1799602adf40SYehuda Sadeh q = blk_init_queue(rbd_rq_fn, &rbd_dev->lock); 1800602adf40SYehuda Sadeh if (!q) 1801602adf40SYehuda Sadeh goto out_disk; 1802029bcbd8SJosh Durgin 1803593a9e7bSAlex Elder /* We use the default size, but let's be explicit about it. */ 1804593a9e7bSAlex Elder blk_queue_physical_block_size(q, SECTOR_SIZE); 1805593a9e7bSAlex Elder 1806029bcbd8SJosh Durgin /* set io sizes to object size */ 1807593a9e7bSAlex Elder segment_size = rbd_obj_bytes(&rbd_dev->header); 1808593a9e7bSAlex Elder blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE); 1809593a9e7bSAlex Elder blk_queue_max_segment_size(q, segment_size); 1810593a9e7bSAlex Elder blk_queue_io_min(q, segment_size); 1811593a9e7bSAlex Elder blk_queue_io_opt(q, segment_size); 1812029bcbd8SJosh Durgin 1813602adf40SYehuda Sadeh blk_queue_merge_bvec(q, rbd_merge_bvec); 1814602adf40SYehuda Sadeh disk->queue = q; 1815602adf40SYehuda Sadeh 1816602adf40SYehuda Sadeh q->queuedata = rbd_dev; 1817602adf40SYehuda Sadeh 1818602adf40SYehuda Sadeh rbd_dev->disk = disk; 1819602adf40SYehuda Sadeh 182012f02944SAlex Elder set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE); 182112f02944SAlex Elder 1822602adf40SYehuda Sadeh return 0; 1823602adf40SYehuda Sadeh out_disk: 1824602adf40SYehuda Sadeh put_disk(disk); 18251fcdb8aaSAlex Elder 18261fcdb8aaSAlex Elder return -ENOMEM; 1827602adf40SYehuda Sadeh } 1828602adf40SYehuda Sadeh 1829dfc5606dSYehuda Sadeh /* 1830dfc5606dSYehuda Sadeh sysfs 1831dfc5606dSYehuda Sadeh */ 1832602adf40SYehuda Sadeh 1833593a9e7bSAlex Elder static struct rbd_device *dev_to_rbd_dev(struct device *dev) 1834593a9e7bSAlex Elder { 1835593a9e7bSAlex Elder return container_of(dev, struct rbd_device, dev); 1836593a9e7bSAlex Elder } 1837593a9e7bSAlex Elder 1838dfc5606dSYehuda Sadeh static ssize_t rbd_size_show(struct device *dev, 1839dfc5606dSYehuda Sadeh struct device_attribute *attr, char *buf) 1840602adf40SYehuda Sadeh { 1841593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 1842a51aa0c0SJosh Durgin sector_t size; 1843dfc5606dSYehuda Sadeh 1844a51aa0c0SJosh Durgin down_read(&rbd_dev->header_rwsem); 1845a51aa0c0SJosh Durgin size = get_capacity(rbd_dev->disk); 1846a51aa0c0SJosh Durgin up_read(&rbd_dev->header_rwsem); 1847a51aa0c0SJosh Durgin 1848a51aa0c0SJosh Durgin return sprintf(buf, "%llu\n", (unsigned long long) size * SECTOR_SIZE); 1849602adf40SYehuda Sadeh } 1850602adf40SYehuda Sadeh 185134b13184SAlex Elder /* 185234b13184SAlex Elder * Note this shows the features for whatever's mapped, which is not 185334b13184SAlex Elder * necessarily the base image. 185434b13184SAlex Elder */ 185534b13184SAlex Elder static ssize_t rbd_features_show(struct device *dev, 185634b13184SAlex Elder struct device_attribute *attr, char *buf) 185734b13184SAlex Elder { 185834b13184SAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 185934b13184SAlex Elder 186034b13184SAlex Elder return sprintf(buf, "0x%016llx\n", 186134b13184SAlex Elder (unsigned long long) rbd_dev->mapping.features); 186234b13184SAlex Elder } 186334b13184SAlex Elder 1864dfc5606dSYehuda Sadeh static ssize_t rbd_major_show(struct device *dev, 1865dfc5606dSYehuda Sadeh struct device_attribute *attr, char *buf) 1866602adf40SYehuda Sadeh { 1867593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 1868dfc5606dSYehuda Sadeh 1869dfc5606dSYehuda Sadeh return sprintf(buf, "%d\n", rbd_dev->major); 1870dfc5606dSYehuda Sadeh } 1871dfc5606dSYehuda Sadeh 1872dfc5606dSYehuda Sadeh static ssize_t rbd_client_id_show(struct device *dev, 1873dfc5606dSYehuda Sadeh struct device_attribute *attr, char *buf) 1874dfc5606dSYehuda Sadeh { 1875593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 1876dfc5606dSYehuda Sadeh 18771dbb4399SAlex Elder return sprintf(buf, "client%lld\n", 18781dbb4399SAlex Elder ceph_client_id(rbd_dev->rbd_client->client)); 1879dfc5606dSYehuda Sadeh } 1880dfc5606dSYehuda Sadeh 1881dfc5606dSYehuda Sadeh static ssize_t rbd_pool_show(struct device *dev, 1882dfc5606dSYehuda Sadeh struct device_attribute *attr, char *buf) 1883dfc5606dSYehuda Sadeh { 1884593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 1885dfc5606dSYehuda Sadeh 1886dfc5606dSYehuda Sadeh return sprintf(buf, "%s\n", rbd_dev->pool_name); 1887dfc5606dSYehuda Sadeh } 1888dfc5606dSYehuda Sadeh 18899bb2f334SAlex Elder static ssize_t rbd_pool_id_show(struct device *dev, 18909bb2f334SAlex Elder struct device_attribute *attr, char *buf) 18919bb2f334SAlex Elder { 18929bb2f334SAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 18939bb2f334SAlex Elder 18949bb2f334SAlex Elder return sprintf(buf, "%d\n", rbd_dev->pool_id); 18959bb2f334SAlex Elder } 18969bb2f334SAlex Elder 1897dfc5606dSYehuda Sadeh static ssize_t rbd_name_show(struct device *dev, 1898dfc5606dSYehuda Sadeh struct device_attribute *attr, char *buf) 1899dfc5606dSYehuda Sadeh { 1900593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 1901dfc5606dSYehuda Sadeh 19020bed54dcSAlex Elder return sprintf(buf, "%s\n", rbd_dev->image_name); 1903dfc5606dSYehuda Sadeh } 1904dfc5606dSYehuda Sadeh 1905589d30e0SAlex Elder static ssize_t rbd_image_id_show(struct device *dev, 1906589d30e0SAlex Elder struct device_attribute *attr, char *buf) 1907589d30e0SAlex Elder { 1908589d30e0SAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 1909589d30e0SAlex Elder 1910589d30e0SAlex Elder return sprintf(buf, "%s\n", rbd_dev->image_id); 1911589d30e0SAlex Elder } 1912589d30e0SAlex Elder 191334b13184SAlex Elder /* 191434b13184SAlex Elder * Shows the name of the currently-mapped snapshot (or 191534b13184SAlex Elder * RBD_SNAP_HEAD_NAME for the base image). 191634b13184SAlex Elder */ 1917dfc5606dSYehuda Sadeh static ssize_t rbd_snap_show(struct device *dev, 1918dfc5606dSYehuda Sadeh struct device_attribute *attr, 1919dfc5606dSYehuda Sadeh char *buf) 1920dfc5606dSYehuda Sadeh { 1921593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 1922dfc5606dSYehuda Sadeh 1923f84344f3SAlex Elder return sprintf(buf, "%s\n", rbd_dev->mapping.snap_name); 1924dfc5606dSYehuda Sadeh } 1925dfc5606dSYehuda Sadeh 1926dfc5606dSYehuda Sadeh static ssize_t rbd_image_refresh(struct device *dev, 1927dfc5606dSYehuda Sadeh struct device_attribute *attr, 1928dfc5606dSYehuda Sadeh const char *buf, 1929dfc5606dSYehuda Sadeh size_t size) 1930dfc5606dSYehuda Sadeh { 1931593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 1932b813623aSAlex Elder int ret; 1933602adf40SYehuda Sadeh 19341fe5e993SAlex Elder ret = rbd_refresh_header(rbd_dev, NULL); 1935b813623aSAlex Elder 1936b813623aSAlex Elder return ret < 0 ? ret : size; 1937dfc5606dSYehuda Sadeh } 1938602adf40SYehuda Sadeh 1939dfc5606dSYehuda Sadeh static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL); 194034b13184SAlex Elder static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL); 1941dfc5606dSYehuda Sadeh static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL); 1942dfc5606dSYehuda Sadeh static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL); 1943dfc5606dSYehuda Sadeh static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL); 19449bb2f334SAlex Elder static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL); 1945dfc5606dSYehuda Sadeh static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL); 1946589d30e0SAlex Elder static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL); 1947dfc5606dSYehuda Sadeh static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh); 1948dfc5606dSYehuda Sadeh static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL); 1949dfc5606dSYehuda Sadeh 1950dfc5606dSYehuda Sadeh static struct attribute *rbd_attrs[] = { 1951dfc5606dSYehuda Sadeh &dev_attr_size.attr, 195234b13184SAlex Elder &dev_attr_features.attr, 1953dfc5606dSYehuda Sadeh &dev_attr_major.attr, 1954dfc5606dSYehuda Sadeh &dev_attr_client_id.attr, 1955dfc5606dSYehuda Sadeh &dev_attr_pool.attr, 19569bb2f334SAlex Elder &dev_attr_pool_id.attr, 1957dfc5606dSYehuda Sadeh &dev_attr_name.attr, 1958589d30e0SAlex Elder &dev_attr_image_id.attr, 1959dfc5606dSYehuda Sadeh &dev_attr_current_snap.attr, 1960dfc5606dSYehuda Sadeh &dev_attr_refresh.attr, 1961dfc5606dSYehuda Sadeh NULL 1962dfc5606dSYehuda Sadeh }; 1963dfc5606dSYehuda Sadeh 1964dfc5606dSYehuda Sadeh static struct attribute_group rbd_attr_group = { 1965dfc5606dSYehuda Sadeh .attrs = rbd_attrs, 1966dfc5606dSYehuda Sadeh }; 1967dfc5606dSYehuda Sadeh 1968dfc5606dSYehuda Sadeh static const struct attribute_group *rbd_attr_groups[] = { 1969dfc5606dSYehuda Sadeh &rbd_attr_group, 1970dfc5606dSYehuda Sadeh NULL 1971dfc5606dSYehuda Sadeh }; 1972dfc5606dSYehuda Sadeh 1973dfc5606dSYehuda Sadeh static void rbd_sysfs_dev_release(struct device *dev) 1974dfc5606dSYehuda Sadeh { 1975dfc5606dSYehuda Sadeh } 1976dfc5606dSYehuda Sadeh 1977dfc5606dSYehuda Sadeh static struct device_type rbd_device_type = { 1978dfc5606dSYehuda Sadeh .name = "rbd", 1979dfc5606dSYehuda Sadeh .groups = rbd_attr_groups, 1980dfc5606dSYehuda Sadeh .release = rbd_sysfs_dev_release, 1981dfc5606dSYehuda Sadeh }; 1982dfc5606dSYehuda Sadeh 1983dfc5606dSYehuda Sadeh 1984dfc5606dSYehuda Sadeh /* 1985dfc5606dSYehuda Sadeh sysfs - snapshots 1986dfc5606dSYehuda Sadeh */ 1987dfc5606dSYehuda Sadeh 1988dfc5606dSYehuda Sadeh static ssize_t rbd_snap_size_show(struct device *dev, 1989dfc5606dSYehuda Sadeh struct device_attribute *attr, 1990dfc5606dSYehuda Sadeh char *buf) 1991dfc5606dSYehuda Sadeh { 1992dfc5606dSYehuda Sadeh struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev); 1993dfc5606dSYehuda Sadeh 19943591538fSJosh Durgin return sprintf(buf, "%llu\n", (unsigned long long)snap->size); 1995dfc5606dSYehuda Sadeh } 1996dfc5606dSYehuda Sadeh 1997dfc5606dSYehuda Sadeh static ssize_t rbd_snap_id_show(struct device *dev, 1998dfc5606dSYehuda Sadeh struct device_attribute *attr, 1999dfc5606dSYehuda Sadeh char *buf) 2000dfc5606dSYehuda Sadeh { 2001dfc5606dSYehuda Sadeh struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev); 2002dfc5606dSYehuda Sadeh 2003593a9e7bSAlex Elder return sprintf(buf, "%llu\n", (unsigned long long)snap->id); 2004dfc5606dSYehuda Sadeh } 2005dfc5606dSYehuda Sadeh 200634b13184SAlex Elder static ssize_t rbd_snap_features_show(struct device *dev, 200734b13184SAlex Elder struct device_attribute *attr, 200834b13184SAlex Elder char *buf) 200934b13184SAlex Elder { 201034b13184SAlex Elder struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev); 201134b13184SAlex Elder 201234b13184SAlex Elder return sprintf(buf, "0x%016llx\n", 201334b13184SAlex Elder (unsigned long long) snap->features); 201434b13184SAlex Elder } 201534b13184SAlex Elder 2016dfc5606dSYehuda Sadeh static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL); 2017dfc5606dSYehuda Sadeh static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL); 201834b13184SAlex Elder static DEVICE_ATTR(snap_features, S_IRUGO, rbd_snap_features_show, NULL); 2019dfc5606dSYehuda Sadeh 2020dfc5606dSYehuda Sadeh static struct attribute *rbd_snap_attrs[] = { 2021dfc5606dSYehuda Sadeh &dev_attr_snap_size.attr, 2022dfc5606dSYehuda Sadeh &dev_attr_snap_id.attr, 202334b13184SAlex Elder &dev_attr_snap_features.attr, 2024dfc5606dSYehuda Sadeh NULL, 2025dfc5606dSYehuda Sadeh }; 2026dfc5606dSYehuda Sadeh 2027dfc5606dSYehuda Sadeh static struct attribute_group rbd_snap_attr_group = { 2028dfc5606dSYehuda Sadeh .attrs = rbd_snap_attrs, 2029dfc5606dSYehuda Sadeh }; 2030dfc5606dSYehuda Sadeh 2031dfc5606dSYehuda Sadeh static void rbd_snap_dev_release(struct device *dev) 2032dfc5606dSYehuda Sadeh { 2033dfc5606dSYehuda Sadeh struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev); 2034dfc5606dSYehuda Sadeh kfree(snap->name); 2035dfc5606dSYehuda Sadeh kfree(snap); 2036dfc5606dSYehuda Sadeh } 2037dfc5606dSYehuda Sadeh 2038dfc5606dSYehuda Sadeh static const struct attribute_group *rbd_snap_attr_groups[] = { 2039dfc5606dSYehuda Sadeh &rbd_snap_attr_group, 2040dfc5606dSYehuda Sadeh NULL 2041dfc5606dSYehuda Sadeh }; 2042dfc5606dSYehuda Sadeh 2043dfc5606dSYehuda Sadeh static struct device_type rbd_snap_device_type = { 2044dfc5606dSYehuda Sadeh .groups = rbd_snap_attr_groups, 2045dfc5606dSYehuda Sadeh .release = rbd_snap_dev_release, 2046dfc5606dSYehuda Sadeh }; 2047dfc5606dSYehuda Sadeh 2048304f6808SAlex Elder static bool rbd_snap_registered(struct rbd_snap *snap) 2049304f6808SAlex Elder { 2050304f6808SAlex Elder bool ret = snap->dev.type == &rbd_snap_device_type; 2051304f6808SAlex Elder bool reg = device_is_registered(&snap->dev); 2052304f6808SAlex Elder 2053304f6808SAlex Elder rbd_assert(!ret ^ reg); 2054304f6808SAlex Elder 2055304f6808SAlex Elder return ret; 2056304f6808SAlex Elder } 2057304f6808SAlex Elder 205814e7085dSAlex Elder static void __rbd_remove_snap_dev(struct rbd_snap *snap) 2059dfc5606dSYehuda Sadeh { 2060dfc5606dSYehuda Sadeh list_del(&snap->node); 2061304f6808SAlex Elder if (device_is_registered(&snap->dev)) 2062dfc5606dSYehuda Sadeh device_unregister(&snap->dev); 2063dfc5606dSYehuda Sadeh } 2064dfc5606dSYehuda Sadeh 206514e7085dSAlex Elder static int rbd_register_snap_dev(struct rbd_snap *snap, 2066dfc5606dSYehuda Sadeh struct device *parent) 2067dfc5606dSYehuda Sadeh { 2068dfc5606dSYehuda Sadeh struct device *dev = &snap->dev; 2069dfc5606dSYehuda Sadeh int ret; 2070dfc5606dSYehuda Sadeh 2071dfc5606dSYehuda Sadeh dev->type = &rbd_snap_device_type; 2072dfc5606dSYehuda Sadeh dev->parent = parent; 2073dfc5606dSYehuda Sadeh dev->release = rbd_snap_dev_release; 2074dfc5606dSYehuda Sadeh dev_set_name(dev, "snap_%s", snap->name); 2075304f6808SAlex Elder dout("%s: registering device for snapshot %s\n", __func__, snap->name); 2076304f6808SAlex Elder 2077dfc5606dSYehuda Sadeh ret = device_register(dev); 2078dfc5606dSYehuda Sadeh 2079dfc5606dSYehuda Sadeh return ret; 2080dfc5606dSYehuda Sadeh } 2081dfc5606dSYehuda Sadeh 20824e891e0aSAlex Elder static struct rbd_snap *__rbd_add_snap_dev(struct rbd_device *rbd_dev, 2083c8d18425SAlex Elder const char *snap_name, 208434b13184SAlex Elder u64 snap_id, u64 snap_size, 208534b13184SAlex Elder u64 snap_features) 2086dfc5606dSYehuda Sadeh { 20874e891e0aSAlex Elder struct rbd_snap *snap; 2088dfc5606dSYehuda Sadeh int ret; 20894e891e0aSAlex Elder 20904e891e0aSAlex Elder snap = kzalloc(sizeof (*snap), GFP_KERNEL); 2091dfc5606dSYehuda Sadeh if (!snap) 20924e891e0aSAlex Elder return ERR_PTR(-ENOMEM); 20934e891e0aSAlex Elder 20944e891e0aSAlex Elder ret = -ENOMEM; 2095c8d18425SAlex Elder snap->name = kstrdup(snap_name, GFP_KERNEL); 20964e891e0aSAlex Elder if (!snap->name) 20974e891e0aSAlex Elder goto err; 20984e891e0aSAlex Elder 2099c8d18425SAlex Elder snap->id = snap_id; 2100c8d18425SAlex Elder snap->size = snap_size; 210134b13184SAlex Elder snap->features = snap_features; 21024e891e0aSAlex Elder 21034e891e0aSAlex Elder return snap; 21044e891e0aSAlex Elder 2105dfc5606dSYehuda Sadeh err: 2106dfc5606dSYehuda Sadeh kfree(snap->name); 2107dfc5606dSYehuda Sadeh kfree(snap); 21084e891e0aSAlex Elder 21094e891e0aSAlex Elder return ERR_PTR(ret); 2110dfc5606dSYehuda Sadeh } 2111dfc5606dSYehuda Sadeh 2112cd892126SAlex Elder static char *rbd_dev_v1_snap_info(struct rbd_device *rbd_dev, u32 which, 2113cd892126SAlex Elder u64 *snap_size, u64 *snap_features) 2114cd892126SAlex Elder { 2115cd892126SAlex Elder char *snap_name; 2116cd892126SAlex Elder 2117cd892126SAlex Elder rbd_assert(which < rbd_dev->header.snapc->num_snaps); 2118cd892126SAlex Elder 2119cd892126SAlex Elder *snap_size = rbd_dev->header.snap_sizes[which]; 2120cd892126SAlex Elder *snap_features = 0; /* No features for v1 */ 2121cd892126SAlex Elder 2122cd892126SAlex Elder /* Skip over names until we find the one we are looking for */ 2123cd892126SAlex Elder 2124cd892126SAlex Elder snap_name = rbd_dev->header.snap_names; 2125cd892126SAlex Elder while (which--) 2126cd892126SAlex Elder snap_name += strlen(snap_name) + 1; 2127cd892126SAlex Elder 2128cd892126SAlex Elder return snap_name; 2129cd892126SAlex Elder } 2130cd892126SAlex Elder 2131dfc5606dSYehuda Sadeh /* 21329d475de5SAlex Elder * Get the size and object order for an image snapshot, or if 21339d475de5SAlex Elder * snap_id is CEPH_NOSNAP, gets this information for the base 21349d475de5SAlex Elder * image. 21359d475de5SAlex Elder */ 21369d475de5SAlex Elder static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id, 21379d475de5SAlex Elder u8 *order, u64 *snap_size) 21389d475de5SAlex Elder { 21399d475de5SAlex Elder __le64 snapid = cpu_to_le64(snap_id); 21409d475de5SAlex Elder int ret; 21419d475de5SAlex Elder struct { 21429d475de5SAlex Elder u8 order; 21439d475de5SAlex Elder __le64 size; 21449d475de5SAlex Elder } __attribute__ ((packed)) size_buf = { 0 }; 21459d475de5SAlex Elder 21469d475de5SAlex Elder ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name, 21479d475de5SAlex Elder "rbd", "get_size", 21489d475de5SAlex Elder (char *) &snapid, sizeof (snapid), 21499d475de5SAlex Elder (char *) &size_buf, sizeof (size_buf), 21509d475de5SAlex Elder CEPH_OSD_FLAG_READ, NULL); 21519d475de5SAlex Elder dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret); 21529d475de5SAlex Elder if (ret < 0) 21539d475de5SAlex Elder return ret; 21549d475de5SAlex Elder 21559d475de5SAlex Elder *order = size_buf.order; 21569d475de5SAlex Elder *snap_size = le64_to_cpu(size_buf.size); 21579d475de5SAlex Elder 21589d475de5SAlex Elder dout(" snap_id 0x%016llx order = %u, snap_size = %llu\n", 21599d475de5SAlex Elder (unsigned long long) snap_id, (unsigned int) *order, 21609d475de5SAlex Elder (unsigned long long) *snap_size); 21619d475de5SAlex Elder 21629d475de5SAlex Elder return 0; 21639d475de5SAlex Elder } 21649d475de5SAlex Elder 21659d475de5SAlex Elder static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev) 21669d475de5SAlex Elder { 21679d475de5SAlex Elder return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP, 21689d475de5SAlex Elder &rbd_dev->header.obj_order, 21699d475de5SAlex Elder &rbd_dev->header.image_size); 21709d475de5SAlex Elder } 21719d475de5SAlex Elder 21721e130199SAlex Elder static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev) 21731e130199SAlex Elder { 21741e130199SAlex Elder void *reply_buf; 21751e130199SAlex Elder int ret; 21761e130199SAlex Elder void *p; 21771e130199SAlex Elder 21781e130199SAlex Elder reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL); 21791e130199SAlex Elder if (!reply_buf) 21801e130199SAlex Elder return -ENOMEM; 21811e130199SAlex Elder 21821e130199SAlex Elder ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name, 21831e130199SAlex Elder "rbd", "get_object_prefix", 21841e130199SAlex Elder NULL, 0, 21851e130199SAlex Elder reply_buf, RBD_OBJ_PREFIX_LEN_MAX, 21861e130199SAlex Elder CEPH_OSD_FLAG_READ, NULL); 21871e130199SAlex Elder dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret); 21881e130199SAlex Elder if (ret < 0) 21891e130199SAlex Elder goto out; 21901e130199SAlex Elder 21911e130199SAlex Elder p = reply_buf; 21921e130199SAlex Elder rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p, 21931e130199SAlex Elder p + RBD_OBJ_PREFIX_LEN_MAX, 21941e130199SAlex Elder NULL, GFP_NOIO); 21951e130199SAlex Elder 21961e130199SAlex Elder if (IS_ERR(rbd_dev->header.object_prefix)) { 21971e130199SAlex Elder ret = PTR_ERR(rbd_dev->header.object_prefix); 21981e130199SAlex Elder rbd_dev->header.object_prefix = NULL; 21991e130199SAlex Elder } else { 22001e130199SAlex Elder dout(" object_prefix = %s\n", rbd_dev->header.object_prefix); 22011e130199SAlex Elder } 22021e130199SAlex Elder 22031e130199SAlex Elder out: 22041e130199SAlex Elder kfree(reply_buf); 22051e130199SAlex Elder 22061e130199SAlex Elder return ret; 22071e130199SAlex Elder } 22081e130199SAlex Elder 22099d475de5SAlex Elder /* 221035938150SAlex Elder * Scan the rbd device's current snapshot list and compare it to the 221135938150SAlex Elder * newly-received snapshot context. Remove any existing snapshots 221235938150SAlex Elder * not present in the new snapshot context. Add a new snapshot for 221335938150SAlex Elder * any snaphots in the snapshot context not in the current list. 221435938150SAlex Elder * And verify there are no changes to snapshots we already know 221535938150SAlex Elder * about. 221635938150SAlex Elder * 221735938150SAlex Elder * Assumes the snapshots in the snapshot context are sorted by 221835938150SAlex Elder * snapshot id, highest id first. (Snapshots in the rbd_dev's list 221935938150SAlex Elder * are also maintained in that order.) 2220dfc5606dSYehuda Sadeh */ 2221304f6808SAlex Elder static int rbd_dev_snaps_update(struct rbd_device *rbd_dev) 2222dfc5606dSYehuda Sadeh { 222335938150SAlex Elder struct ceph_snap_context *snapc = rbd_dev->header.snapc; 222435938150SAlex Elder const u32 snap_count = snapc->num_snaps; 222535938150SAlex Elder struct list_head *head = &rbd_dev->snaps; 222635938150SAlex Elder struct list_head *links = head->next; 222735938150SAlex Elder u32 index = 0; 2228dfc5606dSYehuda Sadeh 22299fcbb800SAlex Elder dout("%s: snap count is %u\n", __func__, (unsigned int) snap_count); 223035938150SAlex Elder while (index < snap_count || links != head) { 223135938150SAlex Elder u64 snap_id; 223235938150SAlex Elder struct rbd_snap *snap; 2233cd892126SAlex Elder char *snap_name; 2234cd892126SAlex Elder u64 snap_size = 0; 2235cd892126SAlex Elder u64 snap_features = 0; 2236dfc5606dSYehuda Sadeh 223735938150SAlex Elder snap_id = index < snap_count ? snapc->snaps[index] 223835938150SAlex Elder : CEPH_NOSNAP; 223935938150SAlex Elder snap = links != head ? list_entry(links, struct rbd_snap, node) 224035938150SAlex Elder : NULL; 2241aafb230eSAlex Elder rbd_assert(!snap || snap->id != CEPH_NOSNAP); 2242dfc5606dSYehuda Sadeh 224335938150SAlex Elder if (snap_id == CEPH_NOSNAP || (snap && snap->id > snap_id)) { 224435938150SAlex Elder struct list_head *next = links->next; 2245dfc5606dSYehuda Sadeh 224635938150SAlex Elder /* Existing snapshot not in the new snap context */ 2247dfc5606dSYehuda Sadeh 2248f84344f3SAlex Elder if (rbd_dev->mapping.snap_id == snap->id) 2249f84344f3SAlex Elder rbd_dev->mapping.snap_exists = false; 225035938150SAlex Elder __rbd_remove_snap_dev(snap); 22519fcbb800SAlex Elder dout("%ssnap id %llu has been removed\n", 2252f84344f3SAlex Elder rbd_dev->mapping.snap_id == snap->id ? 2253f84344f3SAlex Elder "mapped " : "", 22549fcbb800SAlex Elder (unsigned long long) snap->id); 2255dfc5606dSYehuda Sadeh 225635938150SAlex Elder /* Done with this list entry; advance */ 225735938150SAlex Elder 225835938150SAlex Elder links = next; 225935938150SAlex Elder continue; 2260dfc5606dSYehuda Sadeh } 226135938150SAlex Elder 2262cd892126SAlex Elder snap_name = rbd_dev_v1_snap_info(rbd_dev, index, 2263cd892126SAlex Elder &snap_size, &snap_features); 2264cd892126SAlex Elder if (IS_ERR(snap_name)) 2265cd892126SAlex Elder return PTR_ERR(snap_name); 2266cd892126SAlex Elder 22679fcbb800SAlex Elder dout("entry %u: snap_id = %llu\n", (unsigned int) snap_count, 22689fcbb800SAlex Elder (unsigned long long) snap_id); 226935938150SAlex Elder if (!snap || (snap_id != CEPH_NOSNAP && snap->id < snap_id)) { 227035938150SAlex Elder struct rbd_snap *new_snap; 227135938150SAlex Elder 227235938150SAlex Elder /* We haven't seen this snapshot before */ 227335938150SAlex Elder 2274c8d18425SAlex Elder new_snap = __rbd_add_snap_dev(rbd_dev, snap_name, 2275cd892126SAlex Elder snap_id, snap_size, snap_features); 22769fcbb800SAlex Elder if (IS_ERR(new_snap)) { 22779fcbb800SAlex Elder int err = PTR_ERR(new_snap); 22789fcbb800SAlex Elder 22799fcbb800SAlex Elder dout(" failed to add dev, error %d\n", err); 22809fcbb800SAlex Elder 22819fcbb800SAlex Elder return err; 22829fcbb800SAlex Elder } 228335938150SAlex Elder 228435938150SAlex Elder /* New goes before existing, or at end of list */ 228535938150SAlex Elder 22869fcbb800SAlex Elder dout(" added dev%s\n", snap ? "" : " at end\n"); 228735938150SAlex Elder if (snap) 228835938150SAlex Elder list_add_tail(&new_snap->node, &snap->node); 228935938150SAlex Elder else 2290523f3258SAlex Elder list_add_tail(&new_snap->node, head); 229135938150SAlex Elder } else { 229235938150SAlex Elder /* Already have this one */ 229335938150SAlex Elder 22949fcbb800SAlex Elder dout(" already present\n"); 22959fcbb800SAlex Elder 2296cd892126SAlex Elder rbd_assert(snap->size == snap_size); 2297aafb230eSAlex Elder rbd_assert(!strcmp(snap->name, snap_name)); 2298cd892126SAlex Elder rbd_assert(snap->features == snap_features); 229935938150SAlex Elder 230035938150SAlex Elder /* Done with this list entry; advance */ 230135938150SAlex Elder 230235938150SAlex Elder links = links->next; 2303dfc5606dSYehuda Sadeh } 230435938150SAlex Elder 230535938150SAlex Elder /* Advance to the next entry in the snapshot context */ 230635938150SAlex Elder 230735938150SAlex Elder index++; 2308dfc5606dSYehuda Sadeh } 23099fcbb800SAlex Elder dout("%s: done\n", __func__); 2310dfc5606dSYehuda Sadeh 2311dfc5606dSYehuda Sadeh return 0; 2312dfc5606dSYehuda Sadeh } 2313dfc5606dSYehuda Sadeh 2314304f6808SAlex Elder /* 2315304f6808SAlex Elder * Scan the list of snapshots and register the devices for any that 2316304f6808SAlex Elder * have not already been registered. 2317304f6808SAlex Elder */ 2318304f6808SAlex Elder static int rbd_dev_snaps_register(struct rbd_device *rbd_dev) 2319304f6808SAlex Elder { 2320304f6808SAlex Elder struct rbd_snap *snap; 2321304f6808SAlex Elder int ret = 0; 2322304f6808SAlex Elder 2323304f6808SAlex Elder dout("%s called\n", __func__); 232486ff77bbSAlex Elder if (WARN_ON(!device_is_registered(&rbd_dev->dev))) 232586ff77bbSAlex Elder return -EIO; 2326304f6808SAlex Elder 2327304f6808SAlex Elder list_for_each_entry(snap, &rbd_dev->snaps, node) { 2328304f6808SAlex Elder if (!rbd_snap_registered(snap)) { 2329304f6808SAlex Elder ret = rbd_register_snap_dev(snap, &rbd_dev->dev); 2330304f6808SAlex Elder if (ret < 0) 2331304f6808SAlex Elder break; 2332304f6808SAlex Elder } 2333304f6808SAlex Elder } 2334304f6808SAlex Elder dout("%s: returning %d\n", __func__, ret); 2335304f6808SAlex Elder 2336304f6808SAlex Elder return ret; 2337304f6808SAlex Elder } 2338304f6808SAlex Elder 2339dfc5606dSYehuda Sadeh static int rbd_bus_add_dev(struct rbd_device *rbd_dev) 2340dfc5606dSYehuda Sadeh { 2341dfc5606dSYehuda Sadeh struct device *dev; 2342cd789ab9SAlex Elder int ret; 2343dfc5606dSYehuda Sadeh 2344dfc5606dSYehuda Sadeh mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); 2345dfc5606dSYehuda Sadeh 2346cd789ab9SAlex Elder dev = &rbd_dev->dev; 2347dfc5606dSYehuda Sadeh dev->bus = &rbd_bus_type; 2348dfc5606dSYehuda Sadeh dev->type = &rbd_device_type; 2349dfc5606dSYehuda Sadeh dev->parent = &rbd_root_dev; 2350dfc5606dSYehuda Sadeh dev->release = rbd_dev_release; 2351de71a297SAlex Elder dev_set_name(dev, "%d", rbd_dev->dev_id); 2352dfc5606dSYehuda Sadeh ret = device_register(dev); 2353dfc5606dSYehuda Sadeh 2354dfc5606dSYehuda Sadeh mutex_unlock(&ctl_mutex); 2355cd789ab9SAlex Elder 2356dfc5606dSYehuda Sadeh return ret; 2357602adf40SYehuda Sadeh } 2358602adf40SYehuda Sadeh 2359dfc5606dSYehuda Sadeh static void rbd_bus_del_dev(struct rbd_device *rbd_dev) 2360dfc5606dSYehuda Sadeh { 2361dfc5606dSYehuda Sadeh device_unregister(&rbd_dev->dev); 2362dfc5606dSYehuda Sadeh } 2363dfc5606dSYehuda Sadeh 236459c2be1eSYehuda Sadeh static int rbd_init_watch_dev(struct rbd_device *rbd_dev) 236559c2be1eSYehuda Sadeh { 236659c2be1eSYehuda Sadeh int ret, rc; 236759c2be1eSYehuda Sadeh 236859c2be1eSYehuda Sadeh do { 23690e6f322dSAlex Elder ret = rbd_req_sync_watch(rbd_dev); 237059c2be1eSYehuda Sadeh if (ret == -ERANGE) { 23711fe5e993SAlex Elder rc = rbd_refresh_header(rbd_dev, NULL); 237259c2be1eSYehuda Sadeh if (rc < 0) 237359c2be1eSYehuda Sadeh return rc; 237459c2be1eSYehuda Sadeh } 237559c2be1eSYehuda Sadeh } while (ret == -ERANGE); 237659c2be1eSYehuda Sadeh 237759c2be1eSYehuda Sadeh return ret; 237859c2be1eSYehuda Sadeh } 237959c2be1eSYehuda Sadeh 2380e2839308SAlex Elder static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0); 23811ddbe94eSAlex Elder 23821ddbe94eSAlex Elder /* 2383499afd5bSAlex Elder * Get a unique rbd identifier for the given new rbd_dev, and add 2384499afd5bSAlex Elder * the rbd_dev to the global list. The minimum rbd id is 1. 23851ddbe94eSAlex Elder */ 2386e2839308SAlex Elder static void rbd_dev_id_get(struct rbd_device *rbd_dev) 2387b7f23c36SAlex Elder { 2388e2839308SAlex Elder rbd_dev->dev_id = atomic64_inc_return(&rbd_dev_id_max); 2389499afd5bSAlex Elder 2390499afd5bSAlex Elder spin_lock(&rbd_dev_list_lock); 2391499afd5bSAlex Elder list_add_tail(&rbd_dev->node, &rbd_dev_list); 2392499afd5bSAlex Elder spin_unlock(&rbd_dev_list_lock); 2393e2839308SAlex Elder dout("rbd_dev %p given dev id %llu\n", rbd_dev, 2394e2839308SAlex Elder (unsigned long long) rbd_dev->dev_id); 2395b7f23c36SAlex Elder } 2396b7f23c36SAlex Elder 23971ddbe94eSAlex Elder /* 2398499afd5bSAlex Elder * Remove an rbd_dev from the global list, and record that its 2399499afd5bSAlex Elder * identifier is no longer in use. 24001ddbe94eSAlex Elder */ 2401e2839308SAlex Elder static void rbd_dev_id_put(struct rbd_device *rbd_dev) 24021ddbe94eSAlex Elder { 2403d184f6bfSAlex Elder struct list_head *tmp; 2404de71a297SAlex Elder int rbd_id = rbd_dev->dev_id; 2405d184f6bfSAlex Elder int max_id; 2406d184f6bfSAlex Elder 2407aafb230eSAlex Elder rbd_assert(rbd_id > 0); 2408499afd5bSAlex Elder 2409e2839308SAlex Elder dout("rbd_dev %p released dev id %llu\n", rbd_dev, 2410e2839308SAlex Elder (unsigned long long) rbd_dev->dev_id); 2411499afd5bSAlex Elder spin_lock(&rbd_dev_list_lock); 2412499afd5bSAlex Elder list_del_init(&rbd_dev->node); 2413d184f6bfSAlex Elder 2414d184f6bfSAlex Elder /* 2415d184f6bfSAlex Elder * If the id being "put" is not the current maximum, there 2416d184f6bfSAlex Elder * is nothing special we need to do. 2417d184f6bfSAlex Elder */ 2418e2839308SAlex Elder if (rbd_id != atomic64_read(&rbd_dev_id_max)) { 2419d184f6bfSAlex Elder spin_unlock(&rbd_dev_list_lock); 2420d184f6bfSAlex Elder return; 2421d184f6bfSAlex Elder } 2422d184f6bfSAlex Elder 2423d184f6bfSAlex Elder /* 2424d184f6bfSAlex Elder * We need to update the current maximum id. Search the 2425d184f6bfSAlex Elder * list to find out what it is. We're more likely to find 2426d184f6bfSAlex Elder * the maximum at the end, so search the list backward. 2427d184f6bfSAlex Elder */ 2428d184f6bfSAlex Elder max_id = 0; 2429d184f6bfSAlex Elder list_for_each_prev(tmp, &rbd_dev_list) { 2430d184f6bfSAlex Elder struct rbd_device *rbd_dev; 2431d184f6bfSAlex Elder 2432d184f6bfSAlex Elder rbd_dev = list_entry(tmp, struct rbd_device, node); 2433d184f6bfSAlex Elder if (rbd_id > max_id) 2434d184f6bfSAlex Elder max_id = rbd_id; 2435d184f6bfSAlex Elder } 2436499afd5bSAlex Elder spin_unlock(&rbd_dev_list_lock); 24371ddbe94eSAlex Elder 24381ddbe94eSAlex Elder /* 2439e2839308SAlex Elder * The max id could have been updated by rbd_dev_id_get(), in 2440d184f6bfSAlex Elder * which case it now accurately reflects the new maximum. 2441d184f6bfSAlex Elder * Be careful not to overwrite the maximum value in that 2442d184f6bfSAlex Elder * case. 24431ddbe94eSAlex Elder */ 2444e2839308SAlex Elder atomic64_cmpxchg(&rbd_dev_id_max, rbd_id, max_id); 2445e2839308SAlex Elder dout(" max dev id has been reset\n"); 2446b7f23c36SAlex Elder } 2447b7f23c36SAlex Elder 2448a725f65eSAlex Elder /* 2449e28fff26SAlex Elder * Skips over white space at *buf, and updates *buf to point to the 2450e28fff26SAlex Elder * first found non-space character (if any). Returns the length of 2451593a9e7bSAlex Elder * the token (string of non-white space characters) found. Note 2452593a9e7bSAlex Elder * that *buf must be terminated with '\0'. 2453e28fff26SAlex Elder */ 2454e28fff26SAlex Elder static inline size_t next_token(const char **buf) 2455e28fff26SAlex Elder { 2456e28fff26SAlex Elder /* 2457e28fff26SAlex Elder * These are the characters that produce nonzero for 2458e28fff26SAlex Elder * isspace() in the "C" and "POSIX" locales. 2459e28fff26SAlex Elder */ 2460e28fff26SAlex Elder const char *spaces = " \f\n\r\t\v"; 2461e28fff26SAlex Elder 2462e28fff26SAlex Elder *buf += strspn(*buf, spaces); /* Find start of token */ 2463e28fff26SAlex Elder 2464e28fff26SAlex Elder return strcspn(*buf, spaces); /* Return token length */ 2465e28fff26SAlex Elder } 2466e28fff26SAlex Elder 2467e28fff26SAlex Elder /* 2468e28fff26SAlex Elder * Finds the next token in *buf, and if the provided token buffer is 2469e28fff26SAlex Elder * big enough, copies the found token into it. The result, if 2470593a9e7bSAlex Elder * copied, is guaranteed to be terminated with '\0'. Note that *buf 2471593a9e7bSAlex Elder * must be terminated with '\0' on entry. 2472e28fff26SAlex Elder * 2473e28fff26SAlex Elder * Returns the length of the token found (not including the '\0'). 2474e28fff26SAlex Elder * Return value will be 0 if no token is found, and it will be >= 2475e28fff26SAlex Elder * token_size if the token would not fit. 2476e28fff26SAlex Elder * 2477593a9e7bSAlex Elder * The *buf pointer will be updated to point beyond the end of the 2478e28fff26SAlex Elder * found token. Note that this occurs even if the token buffer is 2479e28fff26SAlex Elder * too small to hold it. 2480e28fff26SAlex Elder */ 2481e28fff26SAlex Elder static inline size_t copy_token(const char **buf, 2482e28fff26SAlex Elder char *token, 2483e28fff26SAlex Elder size_t token_size) 2484e28fff26SAlex Elder { 2485e28fff26SAlex Elder size_t len; 2486e28fff26SAlex Elder 2487e28fff26SAlex Elder len = next_token(buf); 2488e28fff26SAlex Elder if (len < token_size) { 2489e28fff26SAlex Elder memcpy(token, *buf, len); 2490e28fff26SAlex Elder *(token + len) = '\0'; 2491e28fff26SAlex Elder } 2492e28fff26SAlex Elder *buf += len; 2493e28fff26SAlex Elder 2494e28fff26SAlex Elder return len; 2495e28fff26SAlex Elder } 2496e28fff26SAlex Elder 2497e28fff26SAlex Elder /* 2498ea3352f4SAlex Elder * Finds the next token in *buf, dynamically allocates a buffer big 2499ea3352f4SAlex Elder * enough to hold a copy of it, and copies the token into the new 2500ea3352f4SAlex Elder * buffer. The copy is guaranteed to be terminated with '\0'. Note 2501ea3352f4SAlex Elder * that a duplicate buffer is created even for a zero-length token. 2502ea3352f4SAlex Elder * 2503ea3352f4SAlex Elder * Returns a pointer to the newly-allocated duplicate, or a null 2504ea3352f4SAlex Elder * pointer if memory for the duplicate was not available. If 2505ea3352f4SAlex Elder * the lenp argument is a non-null pointer, the length of the token 2506ea3352f4SAlex Elder * (not including the '\0') is returned in *lenp. 2507ea3352f4SAlex Elder * 2508ea3352f4SAlex Elder * If successful, the *buf pointer will be updated to point beyond 2509ea3352f4SAlex Elder * the end of the found token. 2510ea3352f4SAlex Elder * 2511ea3352f4SAlex Elder * Note: uses GFP_KERNEL for allocation. 2512ea3352f4SAlex Elder */ 2513ea3352f4SAlex Elder static inline char *dup_token(const char **buf, size_t *lenp) 2514ea3352f4SAlex Elder { 2515ea3352f4SAlex Elder char *dup; 2516ea3352f4SAlex Elder size_t len; 2517ea3352f4SAlex Elder 2518ea3352f4SAlex Elder len = next_token(buf); 2519ea3352f4SAlex Elder dup = kmalloc(len + 1, GFP_KERNEL); 2520ea3352f4SAlex Elder if (!dup) 2521ea3352f4SAlex Elder return NULL; 2522ea3352f4SAlex Elder 2523ea3352f4SAlex Elder memcpy(dup, *buf, len); 2524ea3352f4SAlex Elder *(dup + len) = '\0'; 2525ea3352f4SAlex Elder *buf += len; 2526ea3352f4SAlex Elder 2527ea3352f4SAlex Elder if (lenp) 2528ea3352f4SAlex Elder *lenp = len; 2529ea3352f4SAlex Elder 2530ea3352f4SAlex Elder return dup; 2531ea3352f4SAlex Elder } 2532ea3352f4SAlex Elder 2533ea3352f4SAlex Elder /* 25343feeb894SAlex Elder * This fills in the pool_name, image_name, image_name_len, rbd_dev, 25353feeb894SAlex Elder * rbd_md_name, and name fields of the given rbd_dev, based on the 25363feeb894SAlex Elder * list of monitor addresses and other options provided via 25373feeb894SAlex Elder * /sys/bus/rbd/add. Returns a pointer to a dynamically-allocated 25383feeb894SAlex Elder * copy of the snapshot name to map if successful, or a 25393feeb894SAlex Elder * pointer-coded error otherwise. 2540d22f76e7SAlex Elder * 2541d22f76e7SAlex Elder * Note: rbd_dev is assumed to have been initially zero-filled. 2542a725f65eSAlex Elder */ 25433feeb894SAlex Elder static char *rbd_add_parse_args(struct rbd_device *rbd_dev, 2544a725f65eSAlex Elder const char *buf, 25457ef3214aSAlex Elder const char **mon_addrs, 25465214ecc4SAlex Elder size_t *mon_addrs_size, 2547e28fff26SAlex Elder char *options, 2548e28fff26SAlex Elder size_t options_size) 2549a725f65eSAlex Elder { 2550e28fff26SAlex Elder size_t len; 25513feeb894SAlex Elder char *err_ptr = ERR_PTR(-EINVAL); 25523feeb894SAlex Elder char *snap_name; 2553e28fff26SAlex Elder 2554e28fff26SAlex Elder /* The first four tokens are required */ 2555e28fff26SAlex Elder 25567ef3214aSAlex Elder len = next_token(&buf); 25577ef3214aSAlex Elder if (!len) 25583feeb894SAlex Elder return err_ptr; 25595214ecc4SAlex Elder *mon_addrs_size = len + 1; 25607ef3214aSAlex Elder *mon_addrs = buf; 25617ef3214aSAlex Elder 25627ef3214aSAlex Elder buf += len; 2563a725f65eSAlex Elder 2564e28fff26SAlex Elder len = copy_token(&buf, options, options_size); 2565e28fff26SAlex Elder if (!len || len >= options_size) 25663feeb894SAlex Elder return err_ptr; 2567a725f65eSAlex Elder 25683feeb894SAlex Elder err_ptr = ERR_PTR(-ENOMEM); 2569d22f76e7SAlex Elder rbd_dev->pool_name = dup_token(&buf, NULL); 2570d22f76e7SAlex Elder if (!rbd_dev->pool_name) 2571d22f76e7SAlex Elder goto out_err; 2572e28fff26SAlex Elder 25730bed54dcSAlex Elder rbd_dev->image_name = dup_token(&buf, &rbd_dev->image_name_len); 25740bed54dcSAlex Elder if (!rbd_dev->image_name) 2575bf3e5ae1SAlex Elder goto out_err; 2576e28fff26SAlex Elder 25773feeb894SAlex Elder /* Snapshot name is optional */ 25783feeb894SAlex Elder len = next_token(&buf); 2579820a5f3eSAlex Elder if (!len) { 25803feeb894SAlex Elder buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */ 25813feeb894SAlex Elder len = sizeof (RBD_SNAP_HEAD_NAME) - 1; 2582849b4260SAlex Elder } 25833feeb894SAlex Elder snap_name = kmalloc(len + 1, GFP_KERNEL); 25843feeb894SAlex Elder if (!snap_name) 25853feeb894SAlex Elder goto out_err; 25863feeb894SAlex Elder memcpy(snap_name, buf, len); 25873feeb894SAlex Elder *(snap_name + len) = '\0'; 2588e28fff26SAlex Elder 25893feeb894SAlex Elder dout(" SNAP_NAME is <%s>, len is %zd\n", snap_name, len); 25903feeb894SAlex Elder 25913feeb894SAlex Elder return snap_name; 2592d22f76e7SAlex Elder 2593d22f76e7SAlex Elder out_err: 25940bed54dcSAlex Elder kfree(rbd_dev->image_name); 2595d78fd7aeSAlex Elder rbd_dev->image_name = NULL; 2596d78fd7aeSAlex Elder rbd_dev->image_name_len = 0; 2597d22f76e7SAlex Elder kfree(rbd_dev->pool_name); 2598d22f76e7SAlex Elder rbd_dev->pool_name = NULL; 2599d22f76e7SAlex Elder 26003feeb894SAlex Elder return err_ptr; 2601a725f65eSAlex Elder } 2602a725f65eSAlex Elder 2603589d30e0SAlex Elder /* 2604589d30e0SAlex Elder * An rbd format 2 image has a unique identifier, distinct from the 2605589d30e0SAlex Elder * name given to it by the user. Internally, that identifier is 2606589d30e0SAlex Elder * what's used to specify the names of objects related to the image. 2607589d30e0SAlex Elder * 2608589d30e0SAlex Elder * A special "rbd id" object is used to map an rbd image name to its 2609589d30e0SAlex Elder * id. If that object doesn't exist, then there is no v2 rbd image 2610589d30e0SAlex Elder * with the supplied name. 2611589d30e0SAlex Elder * 2612589d30e0SAlex Elder * This function will record the given rbd_dev's image_id field if 2613589d30e0SAlex Elder * it can be determined, and in that case will return 0. If any 2614589d30e0SAlex Elder * errors occur a negative errno will be returned and the rbd_dev's 2615589d30e0SAlex Elder * image_id field will be unchanged (and should be NULL). 2616589d30e0SAlex Elder */ 2617589d30e0SAlex Elder static int rbd_dev_image_id(struct rbd_device *rbd_dev) 2618589d30e0SAlex Elder { 2619589d30e0SAlex Elder int ret; 2620589d30e0SAlex Elder size_t size; 2621589d30e0SAlex Elder char *object_name; 2622589d30e0SAlex Elder void *response; 2623589d30e0SAlex Elder void *p; 2624589d30e0SAlex Elder 2625589d30e0SAlex Elder /* 2626589d30e0SAlex Elder * First, see if the format 2 image id file exists, and if 2627589d30e0SAlex Elder * so, get the image's persistent id from it. 2628589d30e0SAlex Elder */ 2629589d30e0SAlex Elder size = sizeof (RBD_ID_PREFIX) + rbd_dev->image_name_len; 2630589d30e0SAlex Elder object_name = kmalloc(size, GFP_NOIO); 2631589d30e0SAlex Elder if (!object_name) 2632589d30e0SAlex Elder return -ENOMEM; 2633589d30e0SAlex Elder sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->image_name); 2634589d30e0SAlex Elder dout("rbd id object name is %s\n", object_name); 2635589d30e0SAlex Elder 2636589d30e0SAlex Elder /* Response will be an encoded string, which includes a length */ 2637589d30e0SAlex Elder 2638589d30e0SAlex Elder size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX; 2639589d30e0SAlex Elder response = kzalloc(size, GFP_NOIO); 2640589d30e0SAlex Elder if (!response) { 2641589d30e0SAlex Elder ret = -ENOMEM; 2642589d30e0SAlex Elder goto out; 2643589d30e0SAlex Elder } 2644589d30e0SAlex Elder 2645589d30e0SAlex Elder ret = rbd_req_sync_exec(rbd_dev, object_name, 2646589d30e0SAlex Elder "rbd", "get_id", 2647589d30e0SAlex Elder NULL, 0, 2648589d30e0SAlex Elder response, RBD_IMAGE_ID_LEN_MAX, 2649589d30e0SAlex Elder CEPH_OSD_FLAG_READ, NULL); 2650589d30e0SAlex Elder dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret); 2651589d30e0SAlex Elder if (ret < 0) 2652589d30e0SAlex Elder goto out; 2653589d30e0SAlex Elder 2654589d30e0SAlex Elder p = response; 2655589d30e0SAlex Elder rbd_dev->image_id = ceph_extract_encoded_string(&p, 2656589d30e0SAlex Elder p + RBD_IMAGE_ID_LEN_MAX, 2657589d30e0SAlex Elder &rbd_dev->image_id_len, 2658589d30e0SAlex Elder GFP_NOIO); 2659589d30e0SAlex Elder if (IS_ERR(rbd_dev->image_id)) { 2660589d30e0SAlex Elder ret = PTR_ERR(rbd_dev->image_id); 2661589d30e0SAlex Elder rbd_dev->image_id = NULL; 2662589d30e0SAlex Elder } else { 2663589d30e0SAlex Elder dout("image_id is %s\n", rbd_dev->image_id); 2664589d30e0SAlex Elder } 2665589d30e0SAlex Elder out: 2666589d30e0SAlex Elder kfree(response); 2667589d30e0SAlex Elder kfree(object_name); 2668589d30e0SAlex Elder 2669589d30e0SAlex Elder return ret; 2670589d30e0SAlex Elder } 2671589d30e0SAlex Elder 2672a30b71b9SAlex Elder static int rbd_dev_v1_probe(struct rbd_device *rbd_dev) 2673a30b71b9SAlex Elder { 2674a30b71b9SAlex Elder int ret; 2675a30b71b9SAlex Elder size_t size; 2676a30b71b9SAlex Elder 2677a30b71b9SAlex Elder /* Version 1 images have no id; empty string is used */ 2678a30b71b9SAlex Elder 2679a30b71b9SAlex Elder rbd_dev->image_id = kstrdup("", GFP_KERNEL); 2680a30b71b9SAlex Elder if (!rbd_dev->image_id) 2681a30b71b9SAlex Elder return -ENOMEM; 2682a30b71b9SAlex Elder rbd_dev->image_id_len = 0; 2683a30b71b9SAlex Elder 2684a30b71b9SAlex Elder /* Record the header object name for this rbd image. */ 2685a30b71b9SAlex Elder 2686a30b71b9SAlex Elder size = rbd_dev->image_name_len + sizeof (RBD_SUFFIX); 2687a30b71b9SAlex Elder rbd_dev->header_name = kmalloc(size, GFP_KERNEL); 2688a30b71b9SAlex Elder if (!rbd_dev->header_name) { 2689a30b71b9SAlex Elder ret = -ENOMEM; 2690a30b71b9SAlex Elder goto out_err; 2691a30b71b9SAlex Elder } 2692a30b71b9SAlex Elder sprintf(rbd_dev->header_name, "%s%s", rbd_dev->image_name, RBD_SUFFIX); 2693a30b71b9SAlex Elder 2694a30b71b9SAlex Elder /* Populate rbd image metadata */ 2695a30b71b9SAlex Elder 2696a30b71b9SAlex Elder ret = rbd_read_header(rbd_dev, &rbd_dev->header); 2697a30b71b9SAlex Elder if (ret < 0) 2698a30b71b9SAlex Elder goto out_err; 2699a30b71b9SAlex Elder rbd_dev->image_format = 1; 2700a30b71b9SAlex Elder 2701a30b71b9SAlex Elder dout("discovered version 1 image, header name is %s\n", 2702a30b71b9SAlex Elder rbd_dev->header_name); 2703a30b71b9SAlex Elder 2704a30b71b9SAlex Elder return 0; 2705a30b71b9SAlex Elder 2706a30b71b9SAlex Elder out_err: 2707a30b71b9SAlex Elder kfree(rbd_dev->header_name); 2708a30b71b9SAlex Elder rbd_dev->header_name = NULL; 2709a30b71b9SAlex Elder kfree(rbd_dev->image_id); 2710a30b71b9SAlex Elder rbd_dev->image_id = NULL; 2711a30b71b9SAlex Elder 2712a30b71b9SAlex Elder return ret; 2713a30b71b9SAlex Elder } 2714a30b71b9SAlex Elder 2715a30b71b9SAlex Elder static int rbd_dev_v2_probe(struct rbd_device *rbd_dev) 2716a30b71b9SAlex Elder { 2717a30b71b9SAlex Elder size_t size; 27189d475de5SAlex Elder int ret; 2719a30b71b9SAlex Elder 2720a30b71b9SAlex Elder /* 2721a30b71b9SAlex Elder * Image id was filled in by the caller. Record the header 2722a30b71b9SAlex Elder * object name for this rbd image. 2723a30b71b9SAlex Elder */ 2724a30b71b9SAlex Elder size = sizeof (RBD_HEADER_PREFIX) + rbd_dev->image_id_len; 2725a30b71b9SAlex Elder rbd_dev->header_name = kmalloc(size, GFP_KERNEL); 2726a30b71b9SAlex Elder if (!rbd_dev->header_name) 2727a30b71b9SAlex Elder return -ENOMEM; 2728a30b71b9SAlex Elder sprintf(rbd_dev->header_name, "%s%s", 2729a30b71b9SAlex Elder RBD_HEADER_PREFIX, rbd_dev->image_id); 27309d475de5SAlex Elder 27319d475de5SAlex Elder /* Get the size and object order for the image */ 27329d475de5SAlex Elder 27339d475de5SAlex Elder ret = rbd_dev_v2_image_size(rbd_dev); 27349d475de5SAlex Elder if (ret < 0) 27359d475de5SAlex Elder goto out_err; 27361e130199SAlex Elder 27371e130199SAlex Elder /* Get the object prefix (a.k.a. block_name) for the image */ 27381e130199SAlex Elder 27391e130199SAlex Elder ret = rbd_dev_v2_object_prefix(rbd_dev); 27401e130199SAlex Elder if (ret < 0) 27411e130199SAlex Elder goto out_err; 2742a30b71b9SAlex Elder rbd_dev->image_format = 2; 2743a30b71b9SAlex Elder 2744a30b71b9SAlex Elder dout("discovered version 2 image, header name is %s\n", 2745a30b71b9SAlex Elder rbd_dev->header_name); 2746a30b71b9SAlex Elder 2747a30b71b9SAlex Elder return -ENOTSUPP; 27489d475de5SAlex Elder out_err: 27499d475de5SAlex Elder kfree(rbd_dev->header_name); 27509d475de5SAlex Elder rbd_dev->header_name = NULL; 27511e130199SAlex Elder kfree(rbd_dev->header.object_prefix); 27521e130199SAlex Elder rbd_dev->header.object_prefix = NULL; 27539d475de5SAlex Elder 27549d475de5SAlex Elder return ret; 2755a30b71b9SAlex Elder } 2756a30b71b9SAlex Elder 2757a30b71b9SAlex Elder /* 2758a30b71b9SAlex Elder * Probe for the existence of the header object for the given rbd 2759a30b71b9SAlex Elder * device. For format 2 images this includes determining the image 2760a30b71b9SAlex Elder * id. 2761a30b71b9SAlex Elder */ 2762a30b71b9SAlex Elder static int rbd_dev_probe(struct rbd_device *rbd_dev) 2763a30b71b9SAlex Elder { 2764a30b71b9SAlex Elder int ret; 2765a30b71b9SAlex Elder 2766a30b71b9SAlex Elder /* 2767a30b71b9SAlex Elder * Get the id from the image id object. If it's not a 2768a30b71b9SAlex Elder * format 2 image, we'll get ENOENT back, and we'll assume 2769a30b71b9SAlex Elder * it's a format 1 image. 2770a30b71b9SAlex Elder */ 2771a30b71b9SAlex Elder ret = rbd_dev_image_id(rbd_dev); 2772a30b71b9SAlex Elder if (ret) 2773a30b71b9SAlex Elder ret = rbd_dev_v1_probe(rbd_dev); 2774a30b71b9SAlex Elder else 2775a30b71b9SAlex Elder ret = rbd_dev_v2_probe(rbd_dev); 2776a30b71b9SAlex Elder if (ret) 2777a30b71b9SAlex Elder dout("probe failed, returning %d\n", ret); 2778a30b71b9SAlex Elder 2779a30b71b9SAlex Elder return ret; 2780a30b71b9SAlex Elder } 2781a30b71b9SAlex Elder 278259c2be1eSYehuda Sadeh static ssize_t rbd_add(struct bus_type *bus, 278359c2be1eSYehuda Sadeh const char *buf, 278459c2be1eSYehuda Sadeh size_t count) 2785602adf40SYehuda Sadeh { 2786cb8627c7SAlex Elder char *options; 2787cb8627c7SAlex Elder struct rbd_device *rbd_dev = NULL; 27887ef3214aSAlex Elder const char *mon_addrs = NULL; 27897ef3214aSAlex Elder size_t mon_addrs_size = 0; 279027cc2594SAlex Elder struct ceph_osd_client *osdc; 279127cc2594SAlex Elder int rc = -ENOMEM; 27923feeb894SAlex Elder char *snap_name; 2793602adf40SYehuda Sadeh 2794602adf40SYehuda Sadeh if (!try_module_get(THIS_MODULE)) 2795602adf40SYehuda Sadeh return -ENODEV; 2796602adf40SYehuda Sadeh 279727cc2594SAlex Elder options = kmalloc(count, GFP_KERNEL); 279827cc2594SAlex Elder if (!options) 279985ae8926SAlex Elder goto err_out_mem; 2800cb8627c7SAlex Elder rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL); 2801cb8627c7SAlex Elder if (!rbd_dev) 280285ae8926SAlex Elder goto err_out_mem; 2803602adf40SYehuda Sadeh 2804602adf40SYehuda Sadeh /* static rbd_device initialization */ 2805602adf40SYehuda Sadeh spin_lock_init(&rbd_dev->lock); 2806602adf40SYehuda Sadeh INIT_LIST_HEAD(&rbd_dev->node); 2807dfc5606dSYehuda Sadeh INIT_LIST_HEAD(&rbd_dev->snaps); 2808c666601aSJosh Durgin init_rwsem(&rbd_dev->header_rwsem); 2809602adf40SYehuda Sadeh 2810a725f65eSAlex Elder /* parse add command */ 28113feeb894SAlex Elder snap_name = rbd_add_parse_args(rbd_dev, buf, 28123feeb894SAlex Elder &mon_addrs, &mon_addrs_size, options, count); 28133feeb894SAlex Elder if (IS_ERR(snap_name)) { 28143feeb894SAlex Elder rc = PTR_ERR(snap_name); 281585ae8926SAlex Elder goto err_out_mem; 28163feeb894SAlex Elder } 2817a725f65eSAlex Elder 2818f8c38929SAlex Elder rc = rbd_get_client(rbd_dev, mon_addrs, mon_addrs_size - 1, options); 2819f8c38929SAlex Elder if (rc < 0) 282085ae8926SAlex Elder goto err_out_args; 2821602adf40SYehuda Sadeh 2822602adf40SYehuda Sadeh /* pick the pool */ 28231dbb4399SAlex Elder osdc = &rbd_dev->rbd_client->client->osdc; 2824602adf40SYehuda Sadeh rc = ceph_pg_poolid_by_name(osdc->osdmap, rbd_dev->pool_name); 2825602adf40SYehuda Sadeh if (rc < 0) 2826602adf40SYehuda Sadeh goto err_out_client; 28279bb2f334SAlex Elder rbd_dev->pool_id = rc; 2828602adf40SYehuda Sadeh 2829a30b71b9SAlex Elder rc = rbd_dev_probe(rbd_dev); 2830a30b71b9SAlex Elder if (rc < 0) 2831589d30e0SAlex Elder goto err_out_client; 2832a30b71b9SAlex Elder rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); 283305fd6f6fSAlex Elder 283405fd6f6fSAlex Elder /* no need to lock here, as rbd_dev is not registered yet */ 283505fd6f6fSAlex Elder rc = rbd_dev_snaps_update(rbd_dev); 283605fd6f6fSAlex Elder if (rc) 283705fd6f6fSAlex Elder goto err_out_header; 283805fd6f6fSAlex Elder 283905fd6f6fSAlex Elder rc = rbd_dev_set_mapping(rbd_dev, snap_name); 284005fd6f6fSAlex Elder if (rc) 284105fd6f6fSAlex Elder goto err_out_header; 284205fd6f6fSAlex Elder 284385ae8926SAlex Elder /* generate unique id: find highest unique id, add one */ 284485ae8926SAlex Elder rbd_dev_id_get(rbd_dev); 284585ae8926SAlex Elder 284685ae8926SAlex Elder /* Fill in the device name, now that we have its id. */ 284785ae8926SAlex Elder BUILD_BUG_ON(DEV_NAME_LEN 284885ae8926SAlex Elder < sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH); 284985ae8926SAlex Elder sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id); 285085ae8926SAlex Elder 285185ae8926SAlex Elder /* Get our block major device number. */ 285285ae8926SAlex Elder 285327cc2594SAlex Elder rc = register_blkdev(0, rbd_dev->name); 285427cc2594SAlex Elder if (rc < 0) 285585ae8926SAlex Elder goto err_out_id; 285627cc2594SAlex Elder rbd_dev->major = rc; 2857602adf40SYehuda Sadeh 28580f308a31SAlex Elder /* Set up the blkdev mapping. */ 28590f308a31SAlex Elder 28600f308a31SAlex Elder rc = rbd_init_disk(rbd_dev); 2861dfc5606dSYehuda Sadeh if (rc) 2862766fc439SYehuda Sadeh goto err_out_blkdev; 2863766fc439SYehuda Sadeh 28640f308a31SAlex Elder rc = rbd_bus_add_dev(rbd_dev); 28650f308a31SAlex Elder if (rc) 28660f308a31SAlex Elder goto err_out_disk; 28670f308a31SAlex Elder 286832eec68dSAlex Elder /* 286932eec68dSAlex Elder * At this point cleanup in the event of an error is the job 287032eec68dSAlex Elder * of the sysfs code (initiated by rbd_bus_del_dev()). 287132eec68dSAlex Elder */ 28722ac4e75dSAlex Elder 28734bb1f1edSAlex Elder down_write(&rbd_dev->header_rwsem); 28745ed16177SAlex Elder rc = rbd_dev_snaps_register(rbd_dev); 28754bb1f1edSAlex Elder up_write(&rbd_dev->header_rwsem); 28762ac4e75dSAlex Elder if (rc) 28772ac4e75dSAlex Elder goto err_out_bus; 28782ac4e75dSAlex Elder 287959c2be1eSYehuda Sadeh rc = rbd_init_watch_dev(rbd_dev); 288059c2be1eSYehuda Sadeh if (rc) 288159c2be1eSYehuda Sadeh goto err_out_bus; 288259c2be1eSYehuda Sadeh 28833ee4001eSAlex Elder /* Everything's ready. Announce the disk to the world. */ 28843ee4001eSAlex Elder 28853ee4001eSAlex Elder add_disk(rbd_dev->disk); 28863ee4001eSAlex Elder 28873ee4001eSAlex Elder pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name, 28883ee4001eSAlex Elder (unsigned long long) rbd_dev->mapping.size); 28893ee4001eSAlex Elder 2890602adf40SYehuda Sadeh return count; 2891602adf40SYehuda Sadeh 2892766fc439SYehuda Sadeh err_out_bus: 2893766fc439SYehuda Sadeh /* this will also clean up rest of rbd_dev stuff */ 2894766fc439SYehuda Sadeh 2895766fc439SYehuda Sadeh rbd_bus_del_dev(rbd_dev); 2896766fc439SYehuda Sadeh kfree(options); 2897766fc439SYehuda Sadeh return rc; 2898766fc439SYehuda Sadeh 28990f308a31SAlex Elder err_out_disk: 29000f308a31SAlex Elder rbd_free_disk(rbd_dev); 2901602adf40SYehuda Sadeh err_out_blkdev: 2902602adf40SYehuda Sadeh unregister_blkdev(rbd_dev->major, rbd_dev->name); 290385ae8926SAlex Elder err_out_id: 290485ae8926SAlex Elder rbd_dev_id_put(rbd_dev); 290505fd6f6fSAlex Elder err_out_header: 290605fd6f6fSAlex Elder rbd_header_free(&rbd_dev->header); 2907602adf40SYehuda Sadeh err_out_client: 29083fcf2581SAlex Elder kfree(rbd_dev->header_name); 2909602adf40SYehuda Sadeh rbd_put_client(rbd_dev); 2910589d30e0SAlex Elder kfree(rbd_dev->image_id); 291185ae8926SAlex Elder err_out_args: 2912f84344f3SAlex Elder kfree(rbd_dev->mapping.snap_name); 29130bed54dcSAlex Elder kfree(rbd_dev->image_name); 2914d22f76e7SAlex Elder kfree(rbd_dev->pool_name); 291585ae8926SAlex Elder err_out_mem: 291627cc2594SAlex Elder kfree(rbd_dev); 2917cb8627c7SAlex Elder kfree(options); 291827cc2594SAlex Elder 2919602adf40SYehuda Sadeh dout("Error adding device %s\n", buf); 2920602adf40SYehuda Sadeh module_put(THIS_MODULE); 292127cc2594SAlex Elder 292227cc2594SAlex Elder return (ssize_t) rc; 2923602adf40SYehuda Sadeh } 2924602adf40SYehuda Sadeh 2925de71a297SAlex Elder static struct rbd_device *__rbd_get_dev(unsigned long dev_id) 2926602adf40SYehuda Sadeh { 2927602adf40SYehuda Sadeh struct list_head *tmp; 2928602adf40SYehuda Sadeh struct rbd_device *rbd_dev; 2929602adf40SYehuda Sadeh 2930e124a82fSAlex Elder spin_lock(&rbd_dev_list_lock); 2931602adf40SYehuda Sadeh list_for_each(tmp, &rbd_dev_list) { 2932602adf40SYehuda Sadeh rbd_dev = list_entry(tmp, struct rbd_device, node); 2933de71a297SAlex Elder if (rbd_dev->dev_id == dev_id) { 2934e124a82fSAlex Elder spin_unlock(&rbd_dev_list_lock); 2935602adf40SYehuda Sadeh return rbd_dev; 2936602adf40SYehuda Sadeh } 2937e124a82fSAlex Elder } 2938e124a82fSAlex Elder spin_unlock(&rbd_dev_list_lock); 2939602adf40SYehuda Sadeh return NULL; 2940602adf40SYehuda Sadeh } 2941602adf40SYehuda Sadeh 2942dfc5606dSYehuda Sadeh static void rbd_dev_release(struct device *dev) 2943602adf40SYehuda Sadeh { 2944593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 2945602adf40SYehuda Sadeh 29461dbb4399SAlex Elder if (rbd_dev->watch_request) { 29471dbb4399SAlex Elder struct ceph_client *client = rbd_dev->rbd_client->client; 29481dbb4399SAlex Elder 29491dbb4399SAlex Elder ceph_osdc_unregister_linger_request(&client->osdc, 295059c2be1eSYehuda Sadeh rbd_dev->watch_request); 29511dbb4399SAlex Elder } 295259c2be1eSYehuda Sadeh if (rbd_dev->watch_event) 2953070c633fSAlex Elder rbd_req_sync_unwatch(rbd_dev); 295459c2be1eSYehuda Sadeh 2955602adf40SYehuda Sadeh rbd_put_client(rbd_dev); 2956602adf40SYehuda Sadeh 2957602adf40SYehuda Sadeh /* clean up and free blkdev */ 2958602adf40SYehuda Sadeh rbd_free_disk(rbd_dev); 2959602adf40SYehuda Sadeh unregister_blkdev(rbd_dev->major, rbd_dev->name); 296032eec68dSAlex Elder 29612ac4e75dSAlex Elder /* release allocated disk header fields */ 29622ac4e75dSAlex Elder rbd_header_free(&rbd_dev->header); 29632ac4e75dSAlex Elder 296432eec68dSAlex Elder /* done with the id, and with the rbd_dev */ 2965f84344f3SAlex Elder kfree(rbd_dev->mapping.snap_name); 2966589d30e0SAlex Elder kfree(rbd_dev->image_id); 29670bed54dcSAlex Elder kfree(rbd_dev->header_name); 2968d22f76e7SAlex Elder kfree(rbd_dev->pool_name); 29690bed54dcSAlex Elder kfree(rbd_dev->image_name); 2970e2839308SAlex Elder rbd_dev_id_put(rbd_dev); 2971602adf40SYehuda Sadeh kfree(rbd_dev); 2972602adf40SYehuda Sadeh 2973602adf40SYehuda Sadeh /* release module ref */ 2974602adf40SYehuda Sadeh module_put(THIS_MODULE); 2975602adf40SYehuda Sadeh } 2976602adf40SYehuda Sadeh 2977dfc5606dSYehuda Sadeh static ssize_t rbd_remove(struct bus_type *bus, 2978602adf40SYehuda Sadeh const char *buf, 2979602adf40SYehuda Sadeh size_t count) 2980602adf40SYehuda Sadeh { 2981602adf40SYehuda Sadeh struct rbd_device *rbd_dev = NULL; 2982602adf40SYehuda Sadeh int target_id, rc; 2983602adf40SYehuda Sadeh unsigned long ul; 2984602adf40SYehuda Sadeh int ret = count; 2985602adf40SYehuda Sadeh 2986602adf40SYehuda Sadeh rc = strict_strtoul(buf, 10, &ul); 2987602adf40SYehuda Sadeh if (rc) 2988602adf40SYehuda Sadeh return rc; 2989602adf40SYehuda Sadeh 2990602adf40SYehuda Sadeh /* convert to int; abort if we lost anything in the conversion */ 2991602adf40SYehuda Sadeh target_id = (int) ul; 2992602adf40SYehuda Sadeh if (target_id != ul) 2993602adf40SYehuda Sadeh return -EINVAL; 2994602adf40SYehuda Sadeh 2995602adf40SYehuda Sadeh mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); 2996602adf40SYehuda Sadeh 2997602adf40SYehuda Sadeh rbd_dev = __rbd_get_dev(target_id); 2998602adf40SYehuda Sadeh if (!rbd_dev) { 2999602adf40SYehuda Sadeh ret = -ENOENT; 3000602adf40SYehuda Sadeh goto done; 3001602adf40SYehuda Sadeh } 3002602adf40SYehuda Sadeh 3003dfc5606dSYehuda Sadeh __rbd_remove_all_snaps(rbd_dev); 3004dfc5606dSYehuda Sadeh rbd_bus_del_dev(rbd_dev); 3005602adf40SYehuda Sadeh 3006602adf40SYehuda Sadeh done: 3007602adf40SYehuda Sadeh mutex_unlock(&ctl_mutex); 3008aafb230eSAlex Elder 3009602adf40SYehuda Sadeh return ret; 3010602adf40SYehuda Sadeh } 3011602adf40SYehuda Sadeh 3012602adf40SYehuda Sadeh /* 3013602adf40SYehuda Sadeh * create control files in sysfs 3014dfc5606dSYehuda Sadeh * /sys/bus/rbd/... 3015602adf40SYehuda Sadeh */ 3016602adf40SYehuda Sadeh static int rbd_sysfs_init(void) 3017602adf40SYehuda Sadeh { 3018dfc5606dSYehuda Sadeh int ret; 3019602adf40SYehuda Sadeh 3020fed4c143SAlex Elder ret = device_register(&rbd_root_dev); 3021dfc5606dSYehuda Sadeh if (ret < 0) 3022dfc5606dSYehuda Sadeh return ret; 3023602adf40SYehuda Sadeh 3024fed4c143SAlex Elder ret = bus_register(&rbd_bus_type); 3025fed4c143SAlex Elder if (ret < 0) 3026fed4c143SAlex Elder device_unregister(&rbd_root_dev); 3027602adf40SYehuda Sadeh 3028602adf40SYehuda Sadeh return ret; 3029602adf40SYehuda Sadeh } 3030602adf40SYehuda Sadeh 3031602adf40SYehuda Sadeh static void rbd_sysfs_cleanup(void) 3032602adf40SYehuda Sadeh { 3033dfc5606dSYehuda Sadeh bus_unregister(&rbd_bus_type); 3034fed4c143SAlex Elder device_unregister(&rbd_root_dev); 3035602adf40SYehuda Sadeh } 3036602adf40SYehuda Sadeh 3037602adf40SYehuda Sadeh int __init rbd_init(void) 3038602adf40SYehuda Sadeh { 3039602adf40SYehuda Sadeh int rc; 3040602adf40SYehuda Sadeh 3041602adf40SYehuda Sadeh rc = rbd_sysfs_init(); 3042602adf40SYehuda Sadeh if (rc) 3043602adf40SYehuda Sadeh return rc; 3044f0f8cef5SAlex Elder pr_info("loaded " RBD_DRV_NAME_LONG "\n"); 3045602adf40SYehuda Sadeh return 0; 3046602adf40SYehuda Sadeh } 3047602adf40SYehuda Sadeh 3048602adf40SYehuda Sadeh void __exit rbd_exit(void) 3049602adf40SYehuda Sadeh { 3050602adf40SYehuda Sadeh rbd_sysfs_cleanup(); 3051602adf40SYehuda Sadeh } 3052602adf40SYehuda Sadeh 3053602adf40SYehuda Sadeh module_init(rbd_init); 3054602adf40SYehuda Sadeh module_exit(rbd_exit); 3055602adf40SYehuda Sadeh 3056602adf40SYehuda Sadeh MODULE_AUTHOR("Sage Weil <sage@newdream.net>"); 3057602adf40SYehuda Sadeh MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>"); 3058602adf40SYehuda Sadeh MODULE_DESCRIPTION("rados block device"); 3059602adf40SYehuda Sadeh 3060602adf40SYehuda Sadeh /* following authorship retained from original osdblk.c */ 3061602adf40SYehuda Sadeh MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>"); 3062602adf40SYehuda Sadeh 3063602adf40SYehuda Sadeh MODULE_LICENSE("GPL"); 3064