1602adf40SYehuda Sadeh /* 2602adf40SYehuda Sadeh rbd.c -- Export ceph rados objects as a Linux block device 3602adf40SYehuda Sadeh 4602adf40SYehuda Sadeh 5602adf40SYehuda Sadeh based on drivers/block/osdblk.c: 6602adf40SYehuda Sadeh 7602adf40SYehuda Sadeh Copyright 2009 Red Hat, Inc. 8602adf40SYehuda Sadeh 9602adf40SYehuda Sadeh This program is free software; you can redistribute it and/or modify 10602adf40SYehuda Sadeh it under the terms of the GNU General Public License as published by 11602adf40SYehuda Sadeh the Free Software Foundation. 12602adf40SYehuda Sadeh 13602adf40SYehuda Sadeh This program is distributed in the hope that it will be useful, 14602adf40SYehuda Sadeh but WITHOUT ANY WARRANTY; without even the implied warranty of 15602adf40SYehuda Sadeh MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16602adf40SYehuda Sadeh GNU General Public License for more details. 17602adf40SYehuda Sadeh 18602adf40SYehuda Sadeh You should have received a copy of the GNU General Public License 19602adf40SYehuda Sadeh along with this program; see the file COPYING. If not, write to 20602adf40SYehuda Sadeh the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. 21602adf40SYehuda Sadeh 22602adf40SYehuda Sadeh 23602adf40SYehuda Sadeh 24dfc5606dSYehuda Sadeh For usage instructions, please refer to: 25602adf40SYehuda Sadeh 26dfc5606dSYehuda Sadeh Documentation/ABI/testing/sysfs-bus-rbd 27602adf40SYehuda Sadeh 28602adf40SYehuda Sadeh */ 29602adf40SYehuda Sadeh 30602adf40SYehuda Sadeh #include <linux/ceph/libceph.h> 31602adf40SYehuda Sadeh #include <linux/ceph/osd_client.h> 32602adf40SYehuda Sadeh #include <linux/ceph/mon_client.h> 33602adf40SYehuda Sadeh #include <linux/ceph/decode.h> 3459c2be1eSYehuda Sadeh #include <linux/parser.h> 35602adf40SYehuda Sadeh 36602adf40SYehuda Sadeh #include <linux/kernel.h> 37602adf40SYehuda Sadeh #include <linux/device.h> 38602adf40SYehuda Sadeh #include <linux/module.h> 39602adf40SYehuda Sadeh #include <linux/fs.h> 40602adf40SYehuda Sadeh #include <linux/blkdev.h> 41602adf40SYehuda Sadeh 42602adf40SYehuda Sadeh #include "rbd_types.h" 43602adf40SYehuda Sadeh 44aafb230eSAlex Elder #define RBD_DEBUG /* Activate rbd_assert() calls */ 45aafb230eSAlex Elder 46593a9e7bSAlex Elder /* 47593a9e7bSAlex Elder * The basic unit of block I/O is a sector. It is interpreted in a 48593a9e7bSAlex Elder * number of contexts in Linux (blk, bio, genhd), but the default is 49593a9e7bSAlex Elder * universally 512 bytes. These symbols are just slightly more 50593a9e7bSAlex Elder * meaningful than the bare numbers they represent. 51593a9e7bSAlex Elder */ 52593a9e7bSAlex Elder #define SECTOR_SHIFT 9 53593a9e7bSAlex Elder #define SECTOR_SIZE (1ULL << SECTOR_SHIFT) 54593a9e7bSAlex Elder 55df111be6SAlex Elder /* It might be useful to have this defined elsewhere too */ 56df111be6SAlex Elder 57df111be6SAlex Elder #define U64_MAX ((u64) (~0ULL)) 58df111be6SAlex Elder 59f0f8cef5SAlex Elder #define RBD_DRV_NAME "rbd" 60f0f8cef5SAlex Elder #define RBD_DRV_NAME_LONG "rbd (rados block device)" 61602adf40SYehuda Sadeh 62602adf40SYehuda Sadeh #define RBD_MINORS_PER_MAJOR 256 /* max minors per blkdev */ 63602adf40SYehuda Sadeh 64d4b125e9SAlex Elder #define RBD_SNAP_DEV_NAME_PREFIX "snap_" 65d4b125e9SAlex Elder #define RBD_MAX_SNAP_NAME_LEN \ 66d4b125e9SAlex Elder (NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1)) 67d4b125e9SAlex Elder 6835d489f9SAlex Elder #define RBD_MAX_SNAP_COUNT 510 /* allows max snapc to fit in 4KB */ 69602adf40SYehuda Sadeh #define RBD_MAX_OPT_LEN 1024 70602adf40SYehuda Sadeh 71602adf40SYehuda Sadeh #define RBD_SNAP_HEAD_NAME "-" 72602adf40SYehuda Sadeh 73589d30e0SAlex Elder #define RBD_IMAGE_ID_LEN_MAX 64 741e130199SAlex Elder #define RBD_OBJ_PREFIX_LEN_MAX 64 75589d30e0SAlex Elder 76d889140cSAlex Elder /* Feature bits */ 77d889140cSAlex Elder 78d889140cSAlex Elder #define RBD_FEATURE_LAYERING 1 79d889140cSAlex Elder 80d889140cSAlex Elder /* Features supported by this (client software) implementation. */ 81d889140cSAlex Elder 82d889140cSAlex Elder #define RBD_FEATURES_ALL (0) 83d889140cSAlex Elder 8481a89793SAlex Elder /* 8581a89793SAlex Elder * An RBD device name will be "rbd#", where the "rbd" comes from 8681a89793SAlex Elder * RBD_DRV_NAME above, and # is a unique integer identifier. 8781a89793SAlex Elder * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big 8881a89793SAlex Elder * enough to hold all possible device names. 8981a89793SAlex Elder */ 90602adf40SYehuda Sadeh #define DEV_NAME_LEN 32 9181a89793SAlex Elder #define MAX_INT_FORMAT_WIDTH ((5 * sizeof (int)) / 2 + 1) 92602adf40SYehuda Sadeh 93cc0538b6SAlex Elder #define RBD_READ_ONLY_DEFAULT false 9459c2be1eSYehuda Sadeh 95602adf40SYehuda Sadeh /* 96602adf40SYehuda Sadeh * block device image metadata (in-memory version) 97602adf40SYehuda Sadeh */ 98602adf40SYehuda Sadeh struct rbd_image_header { 99f84344f3SAlex Elder /* These four fields never change for a given rbd image */ 100849b4260SAlex Elder char *object_prefix; 10134b13184SAlex Elder u64 features; 102602adf40SYehuda Sadeh __u8 obj_order; 103602adf40SYehuda Sadeh __u8 crypt_type; 104602adf40SYehuda Sadeh __u8 comp_type; 105602adf40SYehuda Sadeh 106f84344f3SAlex Elder /* The remaining fields need to be updated occasionally */ 107f84344f3SAlex Elder u64 image_size; 108f84344f3SAlex Elder struct ceph_snap_context *snapc; 109602adf40SYehuda Sadeh char *snap_names; 110602adf40SYehuda Sadeh u64 *snap_sizes; 11159c2be1eSYehuda Sadeh 11259c2be1eSYehuda Sadeh u64 obj_version; 11359c2be1eSYehuda Sadeh }; 11459c2be1eSYehuda Sadeh 11559c2be1eSYehuda Sadeh struct rbd_options { 116cc0538b6SAlex Elder bool read_only; 117602adf40SYehuda Sadeh }; 118602adf40SYehuda Sadeh 119602adf40SYehuda Sadeh /* 120f0f8cef5SAlex Elder * an instance of the client. multiple devices may share an rbd client. 121602adf40SYehuda Sadeh */ 122602adf40SYehuda Sadeh struct rbd_client { 123602adf40SYehuda Sadeh struct ceph_client *client; 124602adf40SYehuda Sadeh struct kref kref; 125602adf40SYehuda Sadeh struct list_head node; 126602adf40SYehuda Sadeh }; 127602adf40SYehuda Sadeh 128602adf40SYehuda Sadeh /* 129f0f8cef5SAlex Elder * a request completion status 130602adf40SYehuda Sadeh */ 1311fec7093SYehuda Sadeh struct rbd_req_status { 1321fec7093SYehuda Sadeh int done; 1331fec7093SYehuda Sadeh int rc; 1341fec7093SYehuda Sadeh u64 bytes; 1351fec7093SYehuda Sadeh }; 1361fec7093SYehuda Sadeh 1371fec7093SYehuda Sadeh /* 1381fec7093SYehuda Sadeh * a collection of requests 1391fec7093SYehuda Sadeh */ 1401fec7093SYehuda Sadeh struct rbd_req_coll { 1411fec7093SYehuda Sadeh int total; 1421fec7093SYehuda Sadeh int num_done; 1431fec7093SYehuda Sadeh struct kref kref; 1441fec7093SYehuda Sadeh struct rbd_req_status status[0]; 145602adf40SYehuda Sadeh }; 146602adf40SYehuda Sadeh 147f0f8cef5SAlex Elder /* 148f0f8cef5SAlex Elder * a single io request 149f0f8cef5SAlex Elder */ 150f0f8cef5SAlex Elder struct rbd_request { 151f0f8cef5SAlex Elder struct request *rq; /* blk layer request */ 152f0f8cef5SAlex Elder struct bio *bio; /* cloned bio */ 153f0f8cef5SAlex Elder struct page **pages; /* list of used pages */ 154f0f8cef5SAlex Elder u64 len; 155f0f8cef5SAlex Elder int coll_index; 156f0f8cef5SAlex Elder struct rbd_req_coll *coll; 157f0f8cef5SAlex Elder }; 158f0f8cef5SAlex Elder 159dfc5606dSYehuda Sadeh struct rbd_snap { 160dfc5606dSYehuda Sadeh struct device dev; 161dfc5606dSYehuda Sadeh const char *name; 1623591538fSJosh Durgin u64 size; 163dfc5606dSYehuda Sadeh struct list_head node; 164dfc5606dSYehuda Sadeh u64 id; 16534b13184SAlex Elder u64 features; 166dfc5606dSYehuda Sadeh }; 167dfc5606dSYehuda Sadeh 168f84344f3SAlex Elder struct rbd_mapping { 169f84344f3SAlex Elder char *snap_name; 170f84344f3SAlex Elder u64 snap_id; 17199c1f08fSAlex Elder u64 size; 17234b13184SAlex Elder u64 features; 173f84344f3SAlex Elder bool snap_exists; 174f84344f3SAlex Elder bool read_only; 175f84344f3SAlex Elder }; 176f84344f3SAlex Elder 177602adf40SYehuda Sadeh /* 178602adf40SYehuda Sadeh * a single device 179602adf40SYehuda Sadeh */ 180602adf40SYehuda Sadeh struct rbd_device { 181de71a297SAlex Elder int dev_id; /* blkdev unique id */ 182602adf40SYehuda Sadeh 183602adf40SYehuda Sadeh int major; /* blkdev assigned major */ 184602adf40SYehuda Sadeh struct gendisk *disk; /* blkdev's gendisk and rq */ 185602adf40SYehuda Sadeh 186a30b71b9SAlex Elder u32 image_format; /* Either 1 or 2 */ 187602adf40SYehuda Sadeh struct rbd_client *rbd_client; 188602adf40SYehuda Sadeh 189602adf40SYehuda Sadeh char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */ 190602adf40SYehuda Sadeh 191602adf40SYehuda Sadeh spinlock_t lock; /* queue lock */ 192602adf40SYehuda Sadeh 193602adf40SYehuda Sadeh struct rbd_image_header header; 194589d30e0SAlex Elder char *image_id; 195589d30e0SAlex Elder size_t image_id_len; 1960bed54dcSAlex Elder char *image_name; 1970bed54dcSAlex Elder size_t image_name_len; 1980bed54dcSAlex Elder char *header_name; 199d22f76e7SAlex Elder char *pool_name; 2009bb2f334SAlex Elder int pool_id; 201602adf40SYehuda Sadeh 20259c2be1eSYehuda Sadeh struct ceph_osd_event *watch_event; 20359c2be1eSYehuda Sadeh struct ceph_osd_request *watch_request; 20459c2be1eSYehuda Sadeh 205c666601aSJosh Durgin /* protects updating the header */ 206c666601aSJosh Durgin struct rw_semaphore header_rwsem; 207f84344f3SAlex Elder 208f84344f3SAlex Elder struct rbd_mapping mapping; 209602adf40SYehuda Sadeh 210602adf40SYehuda Sadeh struct list_head node; 211dfc5606dSYehuda Sadeh 212dfc5606dSYehuda Sadeh /* list of snapshots */ 213dfc5606dSYehuda Sadeh struct list_head snaps; 214dfc5606dSYehuda Sadeh 215dfc5606dSYehuda Sadeh /* sysfs related */ 216dfc5606dSYehuda Sadeh struct device dev; 217dfc5606dSYehuda Sadeh }; 218dfc5606dSYehuda Sadeh 219602adf40SYehuda Sadeh static DEFINE_MUTEX(ctl_mutex); /* Serialize open/close/setup/teardown */ 220e124a82fSAlex Elder 221602adf40SYehuda Sadeh static LIST_HEAD(rbd_dev_list); /* devices */ 222e124a82fSAlex Elder static DEFINE_SPINLOCK(rbd_dev_list_lock); 223e124a82fSAlex Elder 224602adf40SYehuda Sadeh static LIST_HEAD(rbd_client_list); /* clients */ 225432b8587SAlex Elder static DEFINE_SPINLOCK(rbd_client_list_lock); 226602adf40SYehuda Sadeh 227304f6808SAlex Elder static int rbd_dev_snaps_update(struct rbd_device *rbd_dev); 228304f6808SAlex Elder static int rbd_dev_snaps_register(struct rbd_device *rbd_dev); 229304f6808SAlex Elder 230dfc5606dSYehuda Sadeh static void rbd_dev_release(struct device *dev); 23114e7085dSAlex Elder static void __rbd_remove_snap_dev(struct rbd_snap *snap); 232dfc5606dSYehuda Sadeh 233f0f8cef5SAlex Elder static ssize_t rbd_add(struct bus_type *bus, const char *buf, 234f0f8cef5SAlex Elder size_t count); 235f0f8cef5SAlex Elder static ssize_t rbd_remove(struct bus_type *bus, const char *buf, 236f0f8cef5SAlex Elder size_t count); 237f0f8cef5SAlex Elder 238f0f8cef5SAlex Elder static struct bus_attribute rbd_bus_attrs[] = { 239f0f8cef5SAlex Elder __ATTR(add, S_IWUSR, NULL, rbd_add), 240f0f8cef5SAlex Elder __ATTR(remove, S_IWUSR, NULL, rbd_remove), 241f0f8cef5SAlex Elder __ATTR_NULL 242f0f8cef5SAlex Elder }; 243f0f8cef5SAlex Elder 244f0f8cef5SAlex Elder static struct bus_type rbd_bus_type = { 245f0f8cef5SAlex Elder .name = "rbd", 246f0f8cef5SAlex Elder .bus_attrs = rbd_bus_attrs, 247f0f8cef5SAlex Elder }; 248f0f8cef5SAlex Elder 249f0f8cef5SAlex Elder static void rbd_root_dev_release(struct device *dev) 250f0f8cef5SAlex Elder { 251f0f8cef5SAlex Elder } 252f0f8cef5SAlex Elder 253f0f8cef5SAlex Elder static struct device rbd_root_dev = { 254f0f8cef5SAlex Elder .init_name = "rbd", 255f0f8cef5SAlex Elder .release = rbd_root_dev_release, 256f0f8cef5SAlex Elder }; 257f0f8cef5SAlex Elder 258aafb230eSAlex Elder #ifdef RBD_DEBUG 259aafb230eSAlex Elder #define rbd_assert(expr) \ 260aafb230eSAlex Elder if (unlikely(!(expr))) { \ 261aafb230eSAlex Elder printk(KERN_ERR "\nAssertion failure in %s() " \ 262aafb230eSAlex Elder "at line %d:\n\n" \ 263aafb230eSAlex Elder "\trbd_assert(%s);\n\n", \ 264aafb230eSAlex Elder __func__, __LINE__, #expr); \ 265aafb230eSAlex Elder BUG(); \ 266aafb230eSAlex Elder } 267aafb230eSAlex Elder #else /* !RBD_DEBUG */ 268aafb230eSAlex Elder # define rbd_assert(expr) ((void) 0) 269aafb230eSAlex Elder #endif /* !RBD_DEBUG */ 270dfc5606dSYehuda Sadeh 271dfc5606dSYehuda Sadeh static struct device *rbd_get_dev(struct rbd_device *rbd_dev) 272dfc5606dSYehuda Sadeh { 273dfc5606dSYehuda Sadeh return get_device(&rbd_dev->dev); 274dfc5606dSYehuda Sadeh } 275dfc5606dSYehuda Sadeh 276dfc5606dSYehuda Sadeh static void rbd_put_dev(struct rbd_device *rbd_dev) 277dfc5606dSYehuda Sadeh { 278dfc5606dSYehuda Sadeh put_device(&rbd_dev->dev); 279dfc5606dSYehuda Sadeh } 280602adf40SYehuda Sadeh 281117973fbSAlex Elder static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver); 282117973fbSAlex Elder static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver); 28359c2be1eSYehuda Sadeh 284602adf40SYehuda Sadeh static int rbd_open(struct block_device *bdev, fmode_t mode) 285602adf40SYehuda Sadeh { 286f0f8cef5SAlex Elder struct rbd_device *rbd_dev = bdev->bd_disk->private_data; 287602adf40SYehuda Sadeh 288f84344f3SAlex Elder if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only) 289602adf40SYehuda Sadeh return -EROFS; 290602adf40SYehuda Sadeh 291340c7a2bSAlex Elder rbd_get_dev(rbd_dev); 292f84344f3SAlex Elder set_device_ro(bdev, rbd_dev->mapping.read_only); 293340c7a2bSAlex Elder 294602adf40SYehuda Sadeh return 0; 295602adf40SYehuda Sadeh } 296602adf40SYehuda Sadeh 297dfc5606dSYehuda Sadeh static int rbd_release(struct gendisk *disk, fmode_t mode) 298dfc5606dSYehuda Sadeh { 299dfc5606dSYehuda Sadeh struct rbd_device *rbd_dev = disk->private_data; 300dfc5606dSYehuda Sadeh 301dfc5606dSYehuda Sadeh rbd_put_dev(rbd_dev); 302dfc5606dSYehuda Sadeh 303dfc5606dSYehuda Sadeh return 0; 304dfc5606dSYehuda Sadeh } 305dfc5606dSYehuda Sadeh 306602adf40SYehuda Sadeh static const struct block_device_operations rbd_bd_ops = { 307602adf40SYehuda Sadeh .owner = THIS_MODULE, 308602adf40SYehuda Sadeh .open = rbd_open, 309dfc5606dSYehuda Sadeh .release = rbd_release, 310602adf40SYehuda Sadeh }; 311602adf40SYehuda Sadeh 312602adf40SYehuda Sadeh /* 313602adf40SYehuda Sadeh * Initialize an rbd client instance. 31443ae4701SAlex Elder * We own *ceph_opts. 315602adf40SYehuda Sadeh */ 316f8c38929SAlex Elder static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts) 317602adf40SYehuda Sadeh { 318602adf40SYehuda Sadeh struct rbd_client *rbdc; 319602adf40SYehuda Sadeh int ret = -ENOMEM; 320602adf40SYehuda Sadeh 321602adf40SYehuda Sadeh dout("rbd_client_create\n"); 322602adf40SYehuda Sadeh rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL); 323602adf40SYehuda Sadeh if (!rbdc) 324602adf40SYehuda Sadeh goto out_opt; 325602adf40SYehuda Sadeh 326602adf40SYehuda Sadeh kref_init(&rbdc->kref); 327602adf40SYehuda Sadeh INIT_LIST_HEAD(&rbdc->node); 328602adf40SYehuda Sadeh 329bc534d86SAlex Elder mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); 330bc534d86SAlex Elder 33143ae4701SAlex Elder rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0); 332602adf40SYehuda Sadeh if (IS_ERR(rbdc->client)) 333bc534d86SAlex Elder goto out_mutex; 33443ae4701SAlex Elder ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */ 335602adf40SYehuda Sadeh 336602adf40SYehuda Sadeh ret = ceph_open_session(rbdc->client); 337602adf40SYehuda Sadeh if (ret < 0) 338602adf40SYehuda Sadeh goto out_err; 339602adf40SYehuda Sadeh 340432b8587SAlex Elder spin_lock(&rbd_client_list_lock); 341602adf40SYehuda Sadeh list_add_tail(&rbdc->node, &rbd_client_list); 342432b8587SAlex Elder spin_unlock(&rbd_client_list_lock); 343602adf40SYehuda Sadeh 344bc534d86SAlex Elder mutex_unlock(&ctl_mutex); 345bc534d86SAlex Elder 346602adf40SYehuda Sadeh dout("rbd_client_create created %p\n", rbdc); 347602adf40SYehuda Sadeh return rbdc; 348602adf40SYehuda Sadeh 349602adf40SYehuda Sadeh out_err: 350602adf40SYehuda Sadeh ceph_destroy_client(rbdc->client); 351bc534d86SAlex Elder out_mutex: 352bc534d86SAlex Elder mutex_unlock(&ctl_mutex); 353602adf40SYehuda Sadeh kfree(rbdc); 354602adf40SYehuda Sadeh out_opt: 35543ae4701SAlex Elder if (ceph_opts) 35643ae4701SAlex Elder ceph_destroy_options(ceph_opts); 35728f259b7SVasiliy Kulikov return ERR_PTR(ret); 358602adf40SYehuda Sadeh } 359602adf40SYehuda Sadeh 360602adf40SYehuda Sadeh /* 3611f7ba331SAlex Elder * Find a ceph client with specific addr and configuration. If 3621f7ba331SAlex Elder * found, bump its reference count. 363602adf40SYehuda Sadeh */ 3641f7ba331SAlex Elder static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts) 365602adf40SYehuda Sadeh { 366602adf40SYehuda Sadeh struct rbd_client *client_node; 3671f7ba331SAlex Elder bool found = false; 368602adf40SYehuda Sadeh 36943ae4701SAlex Elder if (ceph_opts->flags & CEPH_OPT_NOSHARE) 370602adf40SYehuda Sadeh return NULL; 371602adf40SYehuda Sadeh 3721f7ba331SAlex Elder spin_lock(&rbd_client_list_lock); 3731f7ba331SAlex Elder list_for_each_entry(client_node, &rbd_client_list, node) { 3741f7ba331SAlex Elder if (!ceph_compare_options(ceph_opts, client_node->client)) { 3751f7ba331SAlex Elder kref_get(&client_node->kref); 3761f7ba331SAlex Elder found = true; 3771f7ba331SAlex Elder break; 3781f7ba331SAlex Elder } 3791f7ba331SAlex Elder } 3801f7ba331SAlex Elder spin_unlock(&rbd_client_list_lock); 3811f7ba331SAlex Elder 3821f7ba331SAlex Elder return found ? client_node : NULL; 383602adf40SYehuda Sadeh } 384602adf40SYehuda Sadeh 385602adf40SYehuda Sadeh /* 38659c2be1eSYehuda Sadeh * mount options 38759c2be1eSYehuda Sadeh */ 38859c2be1eSYehuda Sadeh enum { 38959c2be1eSYehuda Sadeh Opt_last_int, 39059c2be1eSYehuda Sadeh /* int args above */ 39159c2be1eSYehuda Sadeh Opt_last_string, 39259c2be1eSYehuda Sadeh /* string args above */ 393cc0538b6SAlex Elder Opt_read_only, 394cc0538b6SAlex Elder Opt_read_write, 395cc0538b6SAlex Elder /* Boolean args above */ 396cc0538b6SAlex Elder Opt_last_bool, 39759c2be1eSYehuda Sadeh }; 39859c2be1eSYehuda Sadeh 39943ae4701SAlex Elder static match_table_t rbd_opts_tokens = { 40059c2be1eSYehuda Sadeh /* int args above */ 40159c2be1eSYehuda Sadeh /* string args above */ 402be466c1cSAlex Elder {Opt_read_only, "read_only"}, 403cc0538b6SAlex Elder {Opt_read_only, "ro"}, /* Alternate spelling */ 404cc0538b6SAlex Elder {Opt_read_write, "read_write"}, 405cc0538b6SAlex Elder {Opt_read_write, "rw"}, /* Alternate spelling */ 406cc0538b6SAlex Elder /* Boolean args above */ 40759c2be1eSYehuda Sadeh {-1, NULL} 40859c2be1eSYehuda Sadeh }; 40959c2be1eSYehuda Sadeh 41059c2be1eSYehuda Sadeh static int parse_rbd_opts_token(char *c, void *private) 41159c2be1eSYehuda Sadeh { 41243ae4701SAlex Elder struct rbd_options *rbd_opts = private; 41359c2be1eSYehuda Sadeh substring_t argstr[MAX_OPT_ARGS]; 41459c2be1eSYehuda Sadeh int token, intval, ret; 41559c2be1eSYehuda Sadeh 41643ae4701SAlex Elder token = match_token(c, rbd_opts_tokens, argstr); 41759c2be1eSYehuda Sadeh if (token < 0) 41859c2be1eSYehuda Sadeh return -EINVAL; 41959c2be1eSYehuda Sadeh 42059c2be1eSYehuda Sadeh if (token < Opt_last_int) { 42159c2be1eSYehuda Sadeh ret = match_int(&argstr[0], &intval); 42259c2be1eSYehuda Sadeh if (ret < 0) { 42359c2be1eSYehuda Sadeh pr_err("bad mount option arg (not int) " 42459c2be1eSYehuda Sadeh "at '%s'\n", c); 42559c2be1eSYehuda Sadeh return ret; 42659c2be1eSYehuda Sadeh } 42759c2be1eSYehuda Sadeh dout("got int token %d val %d\n", token, intval); 42859c2be1eSYehuda Sadeh } else if (token > Opt_last_int && token < Opt_last_string) { 42959c2be1eSYehuda Sadeh dout("got string token %d val %s\n", token, 43059c2be1eSYehuda Sadeh argstr[0].from); 431cc0538b6SAlex Elder } else if (token > Opt_last_string && token < Opt_last_bool) { 432cc0538b6SAlex Elder dout("got Boolean token %d\n", token); 43359c2be1eSYehuda Sadeh } else { 43459c2be1eSYehuda Sadeh dout("got token %d\n", token); 43559c2be1eSYehuda Sadeh } 43659c2be1eSYehuda Sadeh 43759c2be1eSYehuda Sadeh switch (token) { 438cc0538b6SAlex Elder case Opt_read_only: 439cc0538b6SAlex Elder rbd_opts->read_only = true; 440cc0538b6SAlex Elder break; 441cc0538b6SAlex Elder case Opt_read_write: 442cc0538b6SAlex Elder rbd_opts->read_only = false; 443cc0538b6SAlex Elder break; 44459c2be1eSYehuda Sadeh default: 445aafb230eSAlex Elder rbd_assert(false); 446aafb230eSAlex Elder break; 44759c2be1eSYehuda Sadeh } 44859c2be1eSYehuda Sadeh return 0; 44959c2be1eSYehuda Sadeh } 45059c2be1eSYehuda Sadeh 45159c2be1eSYehuda Sadeh /* 452602adf40SYehuda Sadeh * Get a ceph client with specific addr and configuration, if one does 453602adf40SYehuda Sadeh * not exist create it. 454602adf40SYehuda Sadeh */ 455f8c38929SAlex Elder static int rbd_get_client(struct rbd_device *rbd_dev, const char *mon_addr, 456f8c38929SAlex Elder size_t mon_addr_len, char *options) 457602adf40SYehuda Sadeh { 458069a4b56SAlex Elder struct rbd_options rbd_opts; 45943ae4701SAlex Elder struct ceph_options *ceph_opts; 460f8c38929SAlex Elder struct rbd_client *rbdc; 46159c2be1eSYehuda Sadeh 462069a4b56SAlex Elder /* Initialize all rbd options to the defaults */ 463069a4b56SAlex Elder 464069a4b56SAlex Elder rbd_opts.read_only = RBD_READ_ONLY_DEFAULT; 465602adf40SYehuda Sadeh 46643ae4701SAlex Elder ceph_opts = ceph_parse_options(options, mon_addr, 4675214ecc4SAlex Elder mon_addr + mon_addr_len, 468069a4b56SAlex Elder parse_rbd_opts_token, &rbd_opts); 469f8c38929SAlex Elder if (IS_ERR(ceph_opts)) 470f8c38929SAlex Elder return PTR_ERR(ceph_opts); 471602adf40SYehuda Sadeh 472069a4b56SAlex Elder /* Record the parsed rbd options */ 473069a4b56SAlex Elder 474069a4b56SAlex Elder rbd_dev->mapping.read_only = rbd_opts.read_only; 475069a4b56SAlex Elder 4761f7ba331SAlex Elder rbdc = rbd_client_find(ceph_opts); 477602adf40SYehuda Sadeh if (rbdc) { 478e6994d3dSAlex Elder /* using an existing client */ 47943ae4701SAlex Elder ceph_destroy_options(ceph_opts); 480f8c38929SAlex Elder } else { 481f8c38929SAlex Elder rbdc = rbd_client_create(ceph_opts); 482d720bcb0SAlex Elder if (IS_ERR(rbdc)) 483f8c38929SAlex Elder return PTR_ERR(rbdc); 484f8c38929SAlex Elder } 485f8c38929SAlex Elder rbd_dev->rbd_client = rbdc; 486d720bcb0SAlex Elder 487f8c38929SAlex Elder return 0; 488602adf40SYehuda Sadeh } 489602adf40SYehuda Sadeh 490602adf40SYehuda Sadeh /* 491602adf40SYehuda Sadeh * Destroy ceph client 492d23a4b3fSAlex Elder * 493432b8587SAlex Elder * Caller must hold rbd_client_list_lock. 494602adf40SYehuda Sadeh */ 495602adf40SYehuda Sadeh static void rbd_client_release(struct kref *kref) 496602adf40SYehuda Sadeh { 497602adf40SYehuda Sadeh struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref); 498602adf40SYehuda Sadeh 499602adf40SYehuda Sadeh dout("rbd_release_client %p\n", rbdc); 500cd9d9f5dSAlex Elder spin_lock(&rbd_client_list_lock); 501602adf40SYehuda Sadeh list_del(&rbdc->node); 502cd9d9f5dSAlex Elder spin_unlock(&rbd_client_list_lock); 503602adf40SYehuda Sadeh 504602adf40SYehuda Sadeh ceph_destroy_client(rbdc->client); 505602adf40SYehuda Sadeh kfree(rbdc); 506602adf40SYehuda Sadeh } 507602adf40SYehuda Sadeh 508602adf40SYehuda Sadeh /* 509602adf40SYehuda Sadeh * Drop reference to ceph client node. If it's not referenced anymore, release 510602adf40SYehuda Sadeh * it. 511602adf40SYehuda Sadeh */ 512602adf40SYehuda Sadeh static void rbd_put_client(struct rbd_device *rbd_dev) 513602adf40SYehuda Sadeh { 514602adf40SYehuda Sadeh kref_put(&rbd_dev->rbd_client->kref, rbd_client_release); 515602adf40SYehuda Sadeh rbd_dev->rbd_client = NULL; 516602adf40SYehuda Sadeh } 517602adf40SYehuda Sadeh 5181fec7093SYehuda Sadeh /* 5191fec7093SYehuda Sadeh * Destroy requests collection 5201fec7093SYehuda Sadeh */ 5211fec7093SYehuda Sadeh static void rbd_coll_release(struct kref *kref) 5221fec7093SYehuda Sadeh { 5231fec7093SYehuda Sadeh struct rbd_req_coll *coll = 5241fec7093SYehuda Sadeh container_of(kref, struct rbd_req_coll, kref); 5251fec7093SYehuda Sadeh 5261fec7093SYehuda Sadeh dout("rbd_coll_release %p\n", coll); 5271fec7093SYehuda Sadeh kfree(coll); 5281fec7093SYehuda Sadeh } 529602adf40SYehuda Sadeh 530a30b71b9SAlex Elder static bool rbd_image_format_valid(u32 image_format) 531a30b71b9SAlex Elder { 532a30b71b9SAlex Elder return image_format == 1 || image_format == 2; 533a30b71b9SAlex Elder } 534a30b71b9SAlex Elder 5358e94af8eSAlex Elder static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk) 5368e94af8eSAlex Elder { 537103a150fSAlex Elder size_t size; 538103a150fSAlex Elder u32 snap_count; 539103a150fSAlex Elder 540103a150fSAlex Elder /* The header has to start with the magic rbd header text */ 541103a150fSAlex Elder if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT))) 542103a150fSAlex Elder return false; 543103a150fSAlex Elder 544db2388b6SAlex Elder /* The bio layer requires at least sector-sized I/O */ 545db2388b6SAlex Elder 546db2388b6SAlex Elder if (ondisk->options.order < SECTOR_SHIFT) 547db2388b6SAlex Elder return false; 548db2388b6SAlex Elder 549db2388b6SAlex Elder /* If we use u64 in a few spots we may be able to loosen this */ 550db2388b6SAlex Elder 551db2388b6SAlex Elder if (ondisk->options.order > 8 * sizeof (int) - 1) 552db2388b6SAlex Elder return false; 553db2388b6SAlex Elder 554103a150fSAlex Elder /* 555103a150fSAlex Elder * The size of a snapshot header has to fit in a size_t, and 556103a150fSAlex Elder * that limits the number of snapshots. 557103a150fSAlex Elder */ 558103a150fSAlex Elder snap_count = le32_to_cpu(ondisk->snap_count); 559103a150fSAlex Elder size = SIZE_MAX - sizeof (struct ceph_snap_context); 560103a150fSAlex Elder if (snap_count > size / sizeof (__le64)) 561103a150fSAlex Elder return false; 562103a150fSAlex Elder 563103a150fSAlex Elder /* 564103a150fSAlex Elder * Not only that, but the size of the entire the snapshot 565103a150fSAlex Elder * header must also be representable in a size_t. 566103a150fSAlex Elder */ 567103a150fSAlex Elder size -= snap_count * sizeof (__le64); 568103a150fSAlex Elder if ((u64) size < le64_to_cpu(ondisk->snap_names_len)) 569103a150fSAlex Elder return false; 570103a150fSAlex Elder 571103a150fSAlex Elder return true; 5728e94af8eSAlex Elder } 5738e94af8eSAlex Elder 574602adf40SYehuda Sadeh /* 575602adf40SYehuda Sadeh * Create a new header structure, translate header format from the on-disk 576602adf40SYehuda Sadeh * header. 577602adf40SYehuda Sadeh */ 578602adf40SYehuda Sadeh static int rbd_header_from_disk(struct rbd_image_header *header, 5794156d998SAlex Elder struct rbd_image_header_ondisk *ondisk) 580602adf40SYehuda Sadeh { 581ccece235SAlex Elder u32 snap_count; 58258c17b0eSAlex Elder size_t len; 583d2bb24e5SAlex Elder size_t size; 584621901d6SAlex Elder u32 i; 585602adf40SYehuda Sadeh 5866a52325fSAlex Elder memset(header, 0, sizeof (*header)); 5876a52325fSAlex Elder 588103a150fSAlex Elder snap_count = le32_to_cpu(ondisk->snap_count); 589103a150fSAlex Elder 59058c17b0eSAlex Elder len = strnlen(ondisk->object_prefix, sizeof (ondisk->object_prefix)); 59158c17b0eSAlex Elder header->object_prefix = kmalloc(len + 1, GFP_KERNEL); 5926a52325fSAlex Elder if (!header->object_prefix) 593602adf40SYehuda Sadeh return -ENOMEM; 59458c17b0eSAlex Elder memcpy(header->object_prefix, ondisk->object_prefix, len); 59558c17b0eSAlex Elder header->object_prefix[len] = '\0'; 59600f1f36fSAlex Elder 597602adf40SYehuda Sadeh if (snap_count) { 598f785cc1dSAlex Elder u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len); 599f785cc1dSAlex Elder 600621901d6SAlex Elder /* Save a copy of the snapshot names */ 601621901d6SAlex Elder 602f785cc1dSAlex Elder if (snap_names_len > (u64) SIZE_MAX) 603f785cc1dSAlex Elder return -EIO; 604f785cc1dSAlex Elder header->snap_names = kmalloc(snap_names_len, GFP_KERNEL); 605602adf40SYehuda Sadeh if (!header->snap_names) 6066a52325fSAlex Elder goto out_err; 607f785cc1dSAlex Elder /* 608f785cc1dSAlex Elder * Note that rbd_dev_v1_header_read() guarantees 609f785cc1dSAlex Elder * the ondisk buffer we're working with has 610f785cc1dSAlex Elder * snap_names_len bytes beyond the end of the 611f785cc1dSAlex Elder * snapshot id array, this memcpy() is safe. 612f785cc1dSAlex Elder */ 613f785cc1dSAlex Elder memcpy(header->snap_names, &ondisk->snaps[snap_count], 614f785cc1dSAlex Elder snap_names_len); 6156a52325fSAlex Elder 616621901d6SAlex Elder /* Record each snapshot's size */ 617621901d6SAlex Elder 618d2bb24e5SAlex Elder size = snap_count * sizeof (*header->snap_sizes); 619d2bb24e5SAlex Elder header->snap_sizes = kmalloc(size, GFP_KERNEL); 620602adf40SYehuda Sadeh if (!header->snap_sizes) 6216a52325fSAlex Elder goto out_err; 622621901d6SAlex Elder for (i = 0; i < snap_count; i++) 623621901d6SAlex Elder header->snap_sizes[i] = 624621901d6SAlex Elder le64_to_cpu(ondisk->snaps[i].image_size); 625602adf40SYehuda Sadeh } else { 626ccece235SAlex Elder WARN_ON(ondisk->snap_names_len); 627602adf40SYehuda Sadeh header->snap_names = NULL; 628602adf40SYehuda Sadeh header->snap_sizes = NULL; 629602adf40SYehuda Sadeh } 630849b4260SAlex Elder 63134b13184SAlex Elder header->features = 0; /* No features support in v1 images */ 632602adf40SYehuda Sadeh header->obj_order = ondisk->options.order; 633602adf40SYehuda Sadeh header->crypt_type = ondisk->options.crypt_type; 634602adf40SYehuda Sadeh header->comp_type = ondisk->options.comp_type; 6356a52325fSAlex Elder 636621901d6SAlex Elder /* Allocate and fill in the snapshot context */ 637621901d6SAlex Elder 638f84344f3SAlex Elder header->image_size = le64_to_cpu(ondisk->image_size); 6396a52325fSAlex Elder size = sizeof (struct ceph_snap_context); 6406a52325fSAlex Elder size += snap_count * sizeof (header->snapc->snaps[0]); 6416a52325fSAlex Elder header->snapc = kzalloc(size, GFP_KERNEL); 6426a52325fSAlex Elder if (!header->snapc) 6436a52325fSAlex Elder goto out_err; 644602adf40SYehuda Sadeh 645602adf40SYehuda Sadeh atomic_set(&header->snapc->nref, 1); 646505cbb9bSAlex Elder header->snapc->seq = le64_to_cpu(ondisk->snap_seq); 647602adf40SYehuda Sadeh header->snapc->num_snaps = snap_count; 648621901d6SAlex Elder for (i = 0; i < snap_count; i++) 649602adf40SYehuda Sadeh header->snapc->snaps[i] = 650602adf40SYehuda Sadeh le64_to_cpu(ondisk->snaps[i].id); 651602adf40SYehuda Sadeh 652602adf40SYehuda Sadeh return 0; 653602adf40SYehuda Sadeh 6546a52325fSAlex Elder out_err: 655849b4260SAlex Elder kfree(header->snap_sizes); 656ccece235SAlex Elder header->snap_sizes = NULL; 657602adf40SYehuda Sadeh kfree(header->snap_names); 658ccece235SAlex Elder header->snap_names = NULL; 6596a52325fSAlex Elder kfree(header->object_prefix); 6606a52325fSAlex Elder header->object_prefix = NULL; 661ccece235SAlex Elder 66200f1f36fSAlex Elder return -ENOMEM; 663602adf40SYehuda Sadeh } 664602adf40SYehuda Sadeh 6658836b995SAlex Elder static int snap_by_name(struct rbd_device *rbd_dev, const char *snap_name) 666602adf40SYehuda Sadeh { 667602adf40SYehuda Sadeh 668e86924a8SAlex Elder struct rbd_snap *snap; 66900f1f36fSAlex Elder 670e86924a8SAlex Elder list_for_each_entry(snap, &rbd_dev->snaps, node) { 671e86924a8SAlex Elder if (!strcmp(snap_name, snap->name)) { 672e86924a8SAlex Elder rbd_dev->mapping.snap_id = snap->id; 673e86924a8SAlex Elder rbd_dev->mapping.size = snap->size; 67434b13184SAlex Elder rbd_dev->mapping.features = snap->features; 67500f1f36fSAlex Elder 676e86924a8SAlex Elder return 0; 677602adf40SYehuda Sadeh } 67800f1f36fSAlex Elder } 679e86924a8SAlex Elder 68000f1f36fSAlex Elder return -ENOENT; 68100f1f36fSAlex Elder } 682602adf40SYehuda Sadeh 6835ed16177SAlex Elder static int rbd_dev_set_mapping(struct rbd_device *rbd_dev, char *snap_name) 684602adf40SYehuda Sadeh { 68578dc447dSAlex Elder int ret; 686602adf40SYehuda Sadeh 6874e1105a2SAlex Elder if (!memcmp(snap_name, RBD_SNAP_HEAD_NAME, 688cc9d734cSJosh Durgin sizeof (RBD_SNAP_HEAD_NAME))) { 689f84344f3SAlex Elder rbd_dev->mapping.snap_id = CEPH_NOSNAP; 69099c1f08fSAlex Elder rbd_dev->mapping.size = rbd_dev->header.image_size; 69134b13184SAlex Elder rbd_dev->mapping.features = rbd_dev->header.features; 692f84344f3SAlex Elder rbd_dev->mapping.snap_exists = false; 693e86924a8SAlex Elder ret = 0; 694602adf40SYehuda Sadeh } else { 6958836b995SAlex Elder ret = snap_by_name(rbd_dev, snap_name); 696602adf40SYehuda Sadeh if (ret < 0) 697602adf40SYehuda Sadeh goto done; 698f84344f3SAlex Elder rbd_dev->mapping.snap_exists = true; 699f84344f3SAlex Elder rbd_dev->mapping.read_only = true; 700602adf40SYehuda Sadeh } 7014e1105a2SAlex Elder rbd_dev->mapping.snap_name = snap_name; 702602adf40SYehuda Sadeh done: 703602adf40SYehuda Sadeh return ret; 704602adf40SYehuda Sadeh } 705602adf40SYehuda Sadeh 706602adf40SYehuda Sadeh static void rbd_header_free(struct rbd_image_header *header) 707602adf40SYehuda Sadeh { 708849b4260SAlex Elder kfree(header->object_prefix); 709d78fd7aeSAlex Elder header->object_prefix = NULL; 710602adf40SYehuda Sadeh kfree(header->snap_sizes); 711d78fd7aeSAlex Elder header->snap_sizes = NULL; 712849b4260SAlex Elder kfree(header->snap_names); 713d78fd7aeSAlex Elder header->snap_names = NULL; 714d1d25646SJosh Durgin ceph_put_snap_context(header->snapc); 715d78fd7aeSAlex Elder header->snapc = NULL; 716602adf40SYehuda Sadeh } 717602adf40SYehuda Sadeh 71865ccfe21SAlex Elder static char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset) 719602adf40SYehuda Sadeh { 72065ccfe21SAlex Elder char *name; 72165ccfe21SAlex Elder u64 segment; 72265ccfe21SAlex Elder int ret; 723602adf40SYehuda Sadeh 72465ccfe21SAlex Elder name = kmalloc(RBD_MAX_SEG_NAME_LEN + 1, GFP_NOIO); 72565ccfe21SAlex Elder if (!name) 72665ccfe21SAlex Elder return NULL; 72765ccfe21SAlex Elder segment = offset >> rbd_dev->header.obj_order; 72865ccfe21SAlex Elder ret = snprintf(name, RBD_MAX_SEG_NAME_LEN, "%s.%012llx", 72965ccfe21SAlex Elder rbd_dev->header.object_prefix, segment); 73065ccfe21SAlex Elder if (ret < 0 || ret >= RBD_MAX_SEG_NAME_LEN) { 73165ccfe21SAlex Elder pr_err("error formatting segment name for #%llu (%d)\n", 73265ccfe21SAlex Elder segment, ret); 73365ccfe21SAlex Elder kfree(name); 73465ccfe21SAlex Elder name = NULL; 73565ccfe21SAlex Elder } 736602adf40SYehuda Sadeh 73765ccfe21SAlex Elder return name; 73865ccfe21SAlex Elder } 739602adf40SYehuda Sadeh 74065ccfe21SAlex Elder static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset) 74165ccfe21SAlex Elder { 74265ccfe21SAlex Elder u64 segment_size = (u64) 1 << rbd_dev->header.obj_order; 743602adf40SYehuda Sadeh 74465ccfe21SAlex Elder return offset & (segment_size - 1); 74565ccfe21SAlex Elder } 74665ccfe21SAlex Elder 74765ccfe21SAlex Elder static u64 rbd_segment_length(struct rbd_device *rbd_dev, 74865ccfe21SAlex Elder u64 offset, u64 length) 74965ccfe21SAlex Elder { 75065ccfe21SAlex Elder u64 segment_size = (u64) 1 << rbd_dev->header.obj_order; 75165ccfe21SAlex Elder 75265ccfe21SAlex Elder offset &= segment_size - 1; 75365ccfe21SAlex Elder 754aafb230eSAlex Elder rbd_assert(length <= U64_MAX - offset); 75565ccfe21SAlex Elder if (offset + length > segment_size) 75665ccfe21SAlex Elder length = segment_size - offset; 75765ccfe21SAlex Elder 75865ccfe21SAlex Elder return length; 759602adf40SYehuda Sadeh } 760602adf40SYehuda Sadeh 7611fec7093SYehuda Sadeh static int rbd_get_num_segments(struct rbd_image_header *header, 7621fec7093SYehuda Sadeh u64 ofs, u64 len) 7631fec7093SYehuda Sadeh { 764df111be6SAlex Elder u64 start_seg; 765df111be6SAlex Elder u64 end_seg; 766df111be6SAlex Elder 767df111be6SAlex Elder if (!len) 768df111be6SAlex Elder return 0; 769df111be6SAlex Elder if (len - 1 > U64_MAX - ofs) 770df111be6SAlex Elder return -ERANGE; 771df111be6SAlex Elder 772df111be6SAlex Elder start_seg = ofs >> header->obj_order; 773df111be6SAlex Elder end_seg = (ofs + len - 1) >> header->obj_order; 774df111be6SAlex Elder 7751fec7093SYehuda Sadeh return end_seg - start_seg + 1; 7761fec7093SYehuda Sadeh } 7771fec7093SYehuda Sadeh 778602adf40SYehuda Sadeh /* 779029bcbd8SJosh Durgin * returns the size of an object in the image 780029bcbd8SJosh Durgin */ 781029bcbd8SJosh Durgin static u64 rbd_obj_bytes(struct rbd_image_header *header) 782029bcbd8SJosh Durgin { 783029bcbd8SJosh Durgin return 1 << header->obj_order; 784029bcbd8SJosh Durgin } 785029bcbd8SJosh Durgin 786029bcbd8SJosh Durgin /* 787602adf40SYehuda Sadeh * bio helpers 788602adf40SYehuda Sadeh */ 789602adf40SYehuda Sadeh 790602adf40SYehuda Sadeh static void bio_chain_put(struct bio *chain) 791602adf40SYehuda Sadeh { 792602adf40SYehuda Sadeh struct bio *tmp; 793602adf40SYehuda Sadeh 794602adf40SYehuda Sadeh while (chain) { 795602adf40SYehuda Sadeh tmp = chain; 796602adf40SYehuda Sadeh chain = chain->bi_next; 797602adf40SYehuda Sadeh bio_put(tmp); 798602adf40SYehuda Sadeh } 799602adf40SYehuda Sadeh } 800602adf40SYehuda Sadeh 801602adf40SYehuda Sadeh /* 802602adf40SYehuda Sadeh * zeros a bio chain, starting at specific offset 803602adf40SYehuda Sadeh */ 804602adf40SYehuda Sadeh static void zero_bio_chain(struct bio *chain, int start_ofs) 805602adf40SYehuda Sadeh { 806602adf40SYehuda Sadeh struct bio_vec *bv; 807602adf40SYehuda Sadeh unsigned long flags; 808602adf40SYehuda Sadeh void *buf; 809602adf40SYehuda Sadeh int i; 810602adf40SYehuda Sadeh int pos = 0; 811602adf40SYehuda Sadeh 812602adf40SYehuda Sadeh while (chain) { 813602adf40SYehuda Sadeh bio_for_each_segment(bv, chain, i) { 814602adf40SYehuda Sadeh if (pos + bv->bv_len > start_ofs) { 815602adf40SYehuda Sadeh int remainder = max(start_ofs - pos, 0); 816602adf40SYehuda Sadeh buf = bvec_kmap_irq(bv, &flags); 817602adf40SYehuda Sadeh memset(buf + remainder, 0, 818602adf40SYehuda Sadeh bv->bv_len - remainder); 81985b5aaa6SDan Carpenter bvec_kunmap_irq(buf, &flags); 820602adf40SYehuda Sadeh } 821602adf40SYehuda Sadeh pos += bv->bv_len; 822602adf40SYehuda Sadeh } 823602adf40SYehuda Sadeh 824602adf40SYehuda Sadeh chain = chain->bi_next; 825602adf40SYehuda Sadeh } 826602adf40SYehuda Sadeh } 827602adf40SYehuda Sadeh 828602adf40SYehuda Sadeh /* 829f7760dadSAlex Elder * Clone a portion of a bio, starting at the given byte offset 830f7760dadSAlex Elder * and continuing for the number of bytes indicated. 831602adf40SYehuda Sadeh */ 832f7760dadSAlex Elder static struct bio *bio_clone_range(struct bio *bio_src, 833f7760dadSAlex Elder unsigned int offset, 834f7760dadSAlex Elder unsigned int len, 835f7760dadSAlex Elder gfp_t gfpmask) 836602adf40SYehuda Sadeh { 837f7760dadSAlex Elder struct bio_vec *bv; 838f7760dadSAlex Elder unsigned int resid; 839f7760dadSAlex Elder unsigned short idx; 840f7760dadSAlex Elder unsigned int voff; 841f7760dadSAlex Elder unsigned short end_idx; 842f7760dadSAlex Elder unsigned short vcnt; 843f7760dadSAlex Elder struct bio *bio; 844602adf40SYehuda Sadeh 845f7760dadSAlex Elder /* Handle the easy case for the caller */ 846f7760dadSAlex Elder 847f7760dadSAlex Elder if (!offset && len == bio_src->bi_size) 848f7760dadSAlex Elder return bio_clone(bio_src, gfpmask); 849f7760dadSAlex Elder 850f7760dadSAlex Elder if (WARN_ON_ONCE(!len)) 851f7760dadSAlex Elder return NULL; 852f7760dadSAlex Elder if (WARN_ON_ONCE(len > bio_src->bi_size)) 853f7760dadSAlex Elder return NULL; 854f7760dadSAlex Elder if (WARN_ON_ONCE(offset > bio_src->bi_size - len)) 855f7760dadSAlex Elder return NULL; 856f7760dadSAlex Elder 857f7760dadSAlex Elder /* Find first affected segment... */ 858f7760dadSAlex Elder 859f7760dadSAlex Elder resid = offset; 860f7760dadSAlex Elder __bio_for_each_segment(bv, bio_src, idx, 0) { 861f7760dadSAlex Elder if (resid < bv->bv_len) 862f7760dadSAlex Elder break; 863f7760dadSAlex Elder resid -= bv->bv_len; 864602adf40SYehuda Sadeh } 865f7760dadSAlex Elder voff = resid; 866602adf40SYehuda Sadeh 867f7760dadSAlex Elder /* ...and the last affected segment */ 868542582fcSAlex Elder 869f7760dadSAlex Elder resid += len; 870f7760dadSAlex Elder __bio_for_each_segment(bv, bio_src, end_idx, idx) { 871f7760dadSAlex Elder if (resid <= bv->bv_len) 872f7760dadSAlex Elder break; 873f7760dadSAlex Elder resid -= bv->bv_len; 874f7760dadSAlex Elder } 875f7760dadSAlex Elder vcnt = end_idx - idx + 1; 876602adf40SYehuda Sadeh 877f7760dadSAlex Elder /* Build the clone */ 878f7760dadSAlex Elder 879f7760dadSAlex Elder bio = bio_alloc(gfpmask, (unsigned int) vcnt); 880f7760dadSAlex Elder if (!bio) 881f7760dadSAlex Elder return NULL; /* ENOMEM */ 882f7760dadSAlex Elder 883f7760dadSAlex Elder bio->bi_bdev = bio_src->bi_bdev; 884f7760dadSAlex Elder bio->bi_sector = bio_src->bi_sector + (offset >> SECTOR_SHIFT); 885f7760dadSAlex Elder bio->bi_rw = bio_src->bi_rw; 886f7760dadSAlex Elder bio->bi_flags |= 1 << BIO_CLONED; 887602adf40SYehuda Sadeh 888602adf40SYehuda Sadeh /* 889f7760dadSAlex Elder * Copy over our part of the bio_vec, then update the first 890f7760dadSAlex Elder * and last (or only) entries. 891602adf40SYehuda Sadeh */ 892f7760dadSAlex Elder memcpy(&bio->bi_io_vec[0], &bio_src->bi_io_vec[idx], 893f7760dadSAlex Elder vcnt * sizeof (struct bio_vec)); 894f7760dadSAlex Elder bio->bi_io_vec[0].bv_offset += voff; 895f7760dadSAlex Elder if (vcnt > 1) { 896f7760dadSAlex Elder bio->bi_io_vec[0].bv_len -= voff; 897f7760dadSAlex Elder bio->bi_io_vec[vcnt - 1].bv_len = resid; 898602adf40SYehuda Sadeh } else { 899f7760dadSAlex Elder bio->bi_io_vec[0].bv_len = len; 900602adf40SYehuda Sadeh } 901602adf40SYehuda Sadeh 902f7760dadSAlex Elder bio->bi_vcnt = vcnt; 903f7760dadSAlex Elder bio->bi_size = len; 904f7760dadSAlex Elder bio->bi_idx = 0; 905602adf40SYehuda Sadeh 906f7760dadSAlex Elder return bio; 907602adf40SYehuda Sadeh } 908602adf40SYehuda Sadeh 909f7760dadSAlex Elder /* 910f7760dadSAlex Elder * Clone a portion of a bio chain, starting at the given byte offset 911f7760dadSAlex Elder * into the first bio in the source chain and continuing for the 912f7760dadSAlex Elder * number of bytes indicated. The result is another bio chain of 913f7760dadSAlex Elder * exactly the given length, or a null pointer on error. 914f7760dadSAlex Elder * 915f7760dadSAlex Elder * The bio_src and offset parameters are both in-out. On entry they 916f7760dadSAlex Elder * refer to the first source bio and the offset into that bio where 917f7760dadSAlex Elder * the start of data to be cloned is located. 918f7760dadSAlex Elder * 919f7760dadSAlex Elder * On return, bio_src is updated to refer to the bio in the source 920f7760dadSAlex Elder * chain that contains first un-cloned byte, and *offset will 921f7760dadSAlex Elder * contain the offset of that byte within that bio. 922f7760dadSAlex Elder */ 923f7760dadSAlex Elder static struct bio *bio_chain_clone_range(struct bio **bio_src, 924f7760dadSAlex Elder unsigned int *offset, 925f7760dadSAlex Elder unsigned int len, 926f7760dadSAlex Elder gfp_t gfpmask) 927f7760dadSAlex Elder { 928f7760dadSAlex Elder struct bio *bi = *bio_src; 929f7760dadSAlex Elder unsigned int off = *offset; 930f7760dadSAlex Elder struct bio *chain = NULL; 931f7760dadSAlex Elder struct bio **end; 932602adf40SYehuda Sadeh 933f7760dadSAlex Elder /* Build up a chain of clone bios up to the limit */ 934602adf40SYehuda Sadeh 935f7760dadSAlex Elder if (!bi || off >= bi->bi_size || !len) 936f7760dadSAlex Elder return NULL; /* Nothing to clone */ 937602adf40SYehuda Sadeh 938f7760dadSAlex Elder end = &chain; 939f7760dadSAlex Elder while (len) { 940f7760dadSAlex Elder unsigned int bi_size; 941f7760dadSAlex Elder struct bio *bio; 942f7760dadSAlex Elder 943f7760dadSAlex Elder if (!bi) 944f7760dadSAlex Elder goto out_err; /* EINVAL; ran out of bio's */ 945f7760dadSAlex Elder bi_size = min_t(unsigned int, bi->bi_size - off, len); 946f7760dadSAlex Elder bio = bio_clone_range(bi, off, bi_size, gfpmask); 947f7760dadSAlex Elder if (!bio) 948f7760dadSAlex Elder goto out_err; /* ENOMEM */ 949f7760dadSAlex Elder 950f7760dadSAlex Elder *end = bio; 951f7760dadSAlex Elder end = &bio->bi_next; 952f7760dadSAlex Elder 953f7760dadSAlex Elder off += bi_size; 954f7760dadSAlex Elder if (off == bi->bi_size) { 955f7760dadSAlex Elder bi = bi->bi_next; 956f7760dadSAlex Elder off = 0; 957f7760dadSAlex Elder } 958f7760dadSAlex Elder len -= bi_size; 959f7760dadSAlex Elder } 960f7760dadSAlex Elder *bio_src = bi; 961f7760dadSAlex Elder *offset = off; 962f7760dadSAlex Elder 963f7760dadSAlex Elder return chain; 964f7760dadSAlex Elder out_err: 965f7760dadSAlex Elder bio_chain_put(chain); 966f7760dadSAlex Elder 967602adf40SYehuda Sadeh return NULL; 968602adf40SYehuda Sadeh } 969602adf40SYehuda Sadeh 970602adf40SYehuda Sadeh /* 971602adf40SYehuda Sadeh * helpers for osd request op vectors. 972602adf40SYehuda Sadeh */ 97357cfc106SAlex Elder static struct ceph_osd_req_op *rbd_create_rw_ops(int num_ops, 97457cfc106SAlex Elder int opcode, u32 payload_len) 975602adf40SYehuda Sadeh { 97657cfc106SAlex Elder struct ceph_osd_req_op *ops; 97757cfc106SAlex Elder 97857cfc106SAlex Elder ops = kzalloc(sizeof (*ops) * (num_ops + 1), GFP_NOIO); 97957cfc106SAlex Elder if (!ops) 98057cfc106SAlex Elder return NULL; 98157cfc106SAlex Elder 98257cfc106SAlex Elder ops[0].op = opcode; 98357cfc106SAlex Elder 984602adf40SYehuda Sadeh /* 985602adf40SYehuda Sadeh * op extent offset and length will be set later on 986602adf40SYehuda Sadeh * in calc_raw_layout() 987602adf40SYehuda Sadeh */ 98857cfc106SAlex Elder ops[0].payload_len = payload_len; 98957cfc106SAlex Elder 99057cfc106SAlex Elder return ops; 991602adf40SYehuda Sadeh } 992602adf40SYehuda Sadeh 993602adf40SYehuda Sadeh static void rbd_destroy_ops(struct ceph_osd_req_op *ops) 994602adf40SYehuda Sadeh { 995602adf40SYehuda Sadeh kfree(ops); 996602adf40SYehuda Sadeh } 997602adf40SYehuda Sadeh 9981fec7093SYehuda Sadeh static void rbd_coll_end_req_index(struct request *rq, 9991fec7093SYehuda Sadeh struct rbd_req_coll *coll, 10001fec7093SYehuda Sadeh int index, 10011fec7093SYehuda Sadeh int ret, u64 len) 10021fec7093SYehuda Sadeh { 10031fec7093SYehuda Sadeh struct request_queue *q; 10041fec7093SYehuda Sadeh int min, max, i; 10051fec7093SYehuda Sadeh 1006bd919d45SAlex Elder dout("rbd_coll_end_req_index %p index %d ret %d len %llu\n", 1007bd919d45SAlex Elder coll, index, ret, (unsigned long long) len); 10081fec7093SYehuda Sadeh 10091fec7093SYehuda Sadeh if (!rq) 10101fec7093SYehuda Sadeh return; 10111fec7093SYehuda Sadeh 10121fec7093SYehuda Sadeh if (!coll) { 10131fec7093SYehuda Sadeh blk_end_request(rq, ret, len); 10141fec7093SYehuda Sadeh return; 10151fec7093SYehuda Sadeh } 10161fec7093SYehuda Sadeh 10171fec7093SYehuda Sadeh q = rq->q; 10181fec7093SYehuda Sadeh 10191fec7093SYehuda Sadeh spin_lock_irq(q->queue_lock); 10201fec7093SYehuda Sadeh coll->status[index].done = 1; 10211fec7093SYehuda Sadeh coll->status[index].rc = ret; 10221fec7093SYehuda Sadeh coll->status[index].bytes = len; 10231fec7093SYehuda Sadeh max = min = coll->num_done; 10241fec7093SYehuda Sadeh while (max < coll->total && coll->status[max].done) 10251fec7093SYehuda Sadeh max++; 10261fec7093SYehuda Sadeh 10271fec7093SYehuda Sadeh for (i = min; i<max; i++) { 10281fec7093SYehuda Sadeh __blk_end_request(rq, coll->status[i].rc, 10291fec7093SYehuda Sadeh coll->status[i].bytes); 10301fec7093SYehuda Sadeh coll->num_done++; 10311fec7093SYehuda Sadeh kref_put(&coll->kref, rbd_coll_release); 10321fec7093SYehuda Sadeh } 10331fec7093SYehuda Sadeh spin_unlock_irq(q->queue_lock); 10341fec7093SYehuda Sadeh } 10351fec7093SYehuda Sadeh 10361fec7093SYehuda Sadeh static void rbd_coll_end_req(struct rbd_request *req, 10371fec7093SYehuda Sadeh int ret, u64 len) 10381fec7093SYehuda Sadeh { 10391fec7093SYehuda Sadeh rbd_coll_end_req_index(req->rq, req->coll, req->coll_index, ret, len); 10401fec7093SYehuda Sadeh } 10411fec7093SYehuda Sadeh 1042602adf40SYehuda Sadeh /* 1043602adf40SYehuda Sadeh * Send ceph osd request 1044602adf40SYehuda Sadeh */ 1045602adf40SYehuda Sadeh static int rbd_do_request(struct request *rq, 10460ce1a794SAlex Elder struct rbd_device *rbd_dev, 1047602adf40SYehuda Sadeh struct ceph_snap_context *snapc, 1048602adf40SYehuda Sadeh u64 snapid, 1049aded07eaSAlex Elder const char *object_name, u64 ofs, u64 len, 1050602adf40SYehuda Sadeh struct bio *bio, 1051602adf40SYehuda Sadeh struct page **pages, 1052602adf40SYehuda Sadeh int num_pages, 1053602adf40SYehuda Sadeh int flags, 1054602adf40SYehuda Sadeh struct ceph_osd_req_op *ops, 10551fec7093SYehuda Sadeh struct rbd_req_coll *coll, 10561fec7093SYehuda Sadeh int coll_index, 1057602adf40SYehuda Sadeh void (*rbd_cb)(struct ceph_osd_request *req, 105859c2be1eSYehuda Sadeh struct ceph_msg *msg), 105959c2be1eSYehuda Sadeh struct ceph_osd_request **linger_req, 106059c2be1eSYehuda Sadeh u64 *ver) 1061602adf40SYehuda Sadeh { 1062602adf40SYehuda Sadeh struct ceph_osd_request *req; 1063602adf40SYehuda Sadeh struct ceph_file_layout *layout; 1064602adf40SYehuda Sadeh int ret; 1065602adf40SYehuda Sadeh u64 bno; 1066602adf40SYehuda Sadeh struct timespec mtime = CURRENT_TIME; 1067602adf40SYehuda Sadeh struct rbd_request *req_data; 1068602adf40SYehuda Sadeh struct ceph_osd_request_head *reqhead; 10691dbb4399SAlex Elder struct ceph_osd_client *osdc; 1070602adf40SYehuda Sadeh 1071602adf40SYehuda Sadeh req_data = kzalloc(sizeof(*req_data), GFP_NOIO); 10721fec7093SYehuda Sadeh if (!req_data) { 10731fec7093SYehuda Sadeh if (coll) 10741fec7093SYehuda Sadeh rbd_coll_end_req_index(rq, coll, coll_index, 10751fec7093SYehuda Sadeh -ENOMEM, len); 10761fec7093SYehuda Sadeh return -ENOMEM; 10771fec7093SYehuda Sadeh } 1078602adf40SYehuda Sadeh 10791fec7093SYehuda Sadeh if (coll) { 10801fec7093SYehuda Sadeh req_data->coll = coll; 10811fec7093SYehuda Sadeh req_data->coll_index = coll_index; 10821fec7093SYehuda Sadeh } 10831fec7093SYehuda Sadeh 1084f7760dadSAlex Elder dout("rbd_do_request object_name=%s ofs=%llu len=%llu coll=%p[%d]\n", 1085f7760dadSAlex Elder object_name, (unsigned long long) ofs, 1086f7760dadSAlex Elder (unsigned long long) len, coll, coll_index); 1087602adf40SYehuda Sadeh 10880ce1a794SAlex Elder osdc = &rbd_dev->rbd_client->client->osdc; 10891dbb4399SAlex Elder req = ceph_osdc_alloc_request(osdc, flags, snapc, ops, 10901dbb4399SAlex Elder false, GFP_NOIO, pages, bio); 10914ad12621SSage Weil if (!req) { 10924ad12621SSage Weil ret = -ENOMEM; 1093602adf40SYehuda Sadeh goto done_pages; 1094602adf40SYehuda Sadeh } 1095602adf40SYehuda Sadeh 1096602adf40SYehuda Sadeh req->r_callback = rbd_cb; 1097602adf40SYehuda Sadeh 1098602adf40SYehuda Sadeh req_data->rq = rq; 1099602adf40SYehuda Sadeh req_data->bio = bio; 1100602adf40SYehuda Sadeh req_data->pages = pages; 1101602adf40SYehuda Sadeh req_data->len = len; 1102602adf40SYehuda Sadeh 1103602adf40SYehuda Sadeh req->r_priv = req_data; 1104602adf40SYehuda Sadeh 1105602adf40SYehuda Sadeh reqhead = req->r_request->front.iov_base; 1106602adf40SYehuda Sadeh reqhead->snapid = cpu_to_le64(CEPH_NOSNAP); 1107602adf40SYehuda Sadeh 1108aded07eaSAlex Elder strncpy(req->r_oid, object_name, sizeof(req->r_oid)); 1109602adf40SYehuda Sadeh req->r_oid_len = strlen(req->r_oid); 1110602adf40SYehuda Sadeh 1111602adf40SYehuda Sadeh layout = &req->r_file_layout; 1112602adf40SYehuda Sadeh memset(layout, 0, sizeof(*layout)); 1113602adf40SYehuda Sadeh layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER); 1114602adf40SYehuda Sadeh layout->fl_stripe_count = cpu_to_le32(1); 1115602adf40SYehuda Sadeh layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER); 11160ce1a794SAlex Elder layout->fl_pg_pool = cpu_to_le32(rbd_dev->pool_id); 11176cae3717SSage Weil ret = ceph_calc_raw_layout(osdc, layout, snapid, ofs, &len, &bno, 11181dbb4399SAlex Elder req, ops); 11196cae3717SSage Weil rbd_assert(ret == 0); 1120602adf40SYehuda Sadeh 1121602adf40SYehuda Sadeh ceph_osdc_build_request(req, ofs, &len, 1122602adf40SYehuda Sadeh ops, 1123602adf40SYehuda Sadeh snapc, 1124602adf40SYehuda Sadeh &mtime, 1125602adf40SYehuda Sadeh req->r_oid, req->r_oid_len); 1126602adf40SYehuda Sadeh 112759c2be1eSYehuda Sadeh if (linger_req) { 11281dbb4399SAlex Elder ceph_osdc_set_request_linger(osdc, req); 112959c2be1eSYehuda Sadeh *linger_req = req; 113059c2be1eSYehuda Sadeh } 113159c2be1eSYehuda Sadeh 11321dbb4399SAlex Elder ret = ceph_osdc_start_request(osdc, req, false); 1133602adf40SYehuda Sadeh if (ret < 0) 1134602adf40SYehuda Sadeh goto done_err; 1135602adf40SYehuda Sadeh 1136602adf40SYehuda Sadeh if (!rbd_cb) { 11371dbb4399SAlex Elder ret = ceph_osdc_wait_request(osdc, req); 113859c2be1eSYehuda Sadeh if (ver) 113959c2be1eSYehuda Sadeh *ver = le64_to_cpu(req->r_reassert_version.version); 1140bd919d45SAlex Elder dout("reassert_ver=%llu\n", 1141bd919d45SAlex Elder (unsigned long long) 11421fec7093SYehuda Sadeh le64_to_cpu(req->r_reassert_version.version)); 1143602adf40SYehuda Sadeh ceph_osdc_put_request(req); 1144602adf40SYehuda Sadeh } 1145602adf40SYehuda Sadeh return ret; 1146602adf40SYehuda Sadeh 1147602adf40SYehuda Sadeh done_err: 1148602adf40SYehuda Sadeh bio_chain_put(req_data->bio); 1149602adf40SYehuda Sadeh ceph_osdc_put_request(req); 1150602adf40SYehuda Sadeh done_pages: 11511fec7093SYehuda Sadeh rbd_coll_end_req(req_data, ret, len); 1152602adf40SYehuda Sadeh kfree(req_data); 1153602adf40SYehuda Sadeh return ret; 1154602adf40SYehuda Sadeh } 1155602adf40SYehuda Sadeh 1156602adf40SYehuda Sadeh /* 1157602adf40SYehuda Sadeh * Ceph osd op callback 1158602adf40SYehuda Sadeh */ 1159602adf40SYehuda Sadeh static void rbd_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg) 1160602adf40SYehuda Sadeh { 1161602adf40SYehuda Sadeh struct rbd_request *req_data = req->r_priv; 1162602adf40SYehuda Sadeh struct ceph_osd_reply_head *replyhead; 1163602adf40SYehuda Sadeh struct ceph_osd_op *op; 1164602adf40SYehuda Sadeh __s32 rc; 1165602adf40SYehuda Sadeh u64 bytes; 1166602adf40SYehuda Sadeh int read_op; 1167602adf40SYehuda Sadeh 1168602adf40SYehuda Sadeh /* parse reply */ 1169602adf40SYehuda Sadeh replyhead = msg->front.iov_base; 1170602adf40SYehuda Sadeh WARN_ON(le32_to_cpu(replyhead->num_ops) == 0); 1171602adf40SYehuda Sadeh op = (void *)(replyhead + 1); 1172602adf40SYehuda Sadeh rc = le32_to_cpu(replyhead->result); 1173602adf40SYehuda Sadeh bytes = le64_to_cpu(op->extent.length); 1174895cfcc8SDan Carpenter read_op = (le16_to_cpu(op->op) == CEPH_OSD_OP_READ); 1175602adf40SYehuda Sadeh 1176bd919d45SAlex Elder dout("rbd_req_cb bytes=%llu readop=%d rc=%d\n", 1177bd919d45SAlex Elder (unsigned long long) bytes, read_op, (int) rc); 1178602adf40SYehuda Sadeh 1179602adf40SYehuda Sadeh if (rc == -ENOENT && read_op) { 1180602adf40SYehuda Sadeh zero_bio_chain(req_data->bio, 0); 1181602adf40SYehuda Sadeh rc = 0; 1182602adf40SYehuda Sadeh } else if (rc == 0 && read_op && bytes < req_data->len) { 1183602adf40SYehuda Sadeh zero_bio_chain(req_data->bio, bytes); 1184602adf40SYehuda Sadeh bytes = req_data->len; 1185602adf40SYehuda Sadeh } 1186602adf40SYehuda Sadeh 11871fec7093SYehuda Sadeh rbd_coll_end_req(req_data, rc, bytes); 1188602adf40SYehuda Sadeh 1189602adf40SYehuda Sadeh if (req_data->bio) 1190602adf40SYehuda Sadeh bio_chain_put(req_data->bio); 1191602adf40SYehuda Sadeh 1192602adf40SYehuda Sadeh ceph_osdc_put_request(req); 1193602adf40SYehuda Sadeh kfree(req_data); 1194602adf40SYehuda Sadeh } 1195602adf40SYehuda Sadeh 119659c2be1eSYehuda Sadeh static void rbd_simple_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg) 119759c2be1eSYehuda Sadeh { 119859c2be1eSYehuda Sadeh ceph_osdc_put_request(req); 119959c2be1eSYehuda Sadeh } 120059c2be1eSYehuda Sadeh 1201602adf40SYehuda Sadeh /* 1202602adf40SYehuda Sadeh * Do a synchronous ceph osd operation 1203602adf40SYehuda Sadeh */ 12040ce1a794SAlex Elder static int rbd_req_sync_op(struct rbd_device *rbd_dev, 1205602adf40SYehuda Sadeh struct ceph_snap_context *snapc, 1206602adf40SYehuda Sadeh u64 snapid, 1207602adf40SYehuda Sadeh int flags, 1208913d2fdcSAlex Elder struct ceph_osd_req_op *ops, 1209aded07eaSAlex Elder const char *object_name, 1210f8d4de6eSAlex Elder u64 ofs, u64 inbound_size, 1211f8d4de6eSAlex Elder char *inbound, 121259c2be1eSYehuda Sadeh struct ceph_osd_request **linger_req, 121359c2be1eSYehuda Sadeh u64 *ver) 1214602adf40SYehuda Sadeh { 1215602adf40SYehuda Sadeh int ret; 1216602adf40SYehuda Sadeh struct page **pages; 1217602adf40SYehuda Sadeh int num_pages; 1218913d2fdcSAlex Elder 1219aafb230eSAlex Elder rbd_assert(ops != NULL); 1220602adf40SYehuda Sadeh 1221f8d4de6eSAlex Elder num_pages = calc_pages_for(ofs, inbound_size); 1222602adf40SYehuda Sadeh pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL); 1223b8d0638aSDan Carpenter if (IS_ERR(pages)) 1224b8d0638aSDan Carpenter return PTR_ERR(pages); 1225602adf40SYehuda Sadeh 12260ce1a794SAlex Elder ret = rbd_do_request(NULL, rbd_dev, snapc, snapid, 1227f8d4de6eSAlex Elder object_name, ofs, inbound_size, NULL, 1228602adf40SYehuda Sadeh pages, num_pages, 1229602adf40SYehuda Sadeh flags, 1230602adf40SYehuda Sadeh ops, 12311fec7093SYehuda Sadeh NULL, 0, 123259c2be1eSYehuda Sadeh NULL, 123359c2be1eSYehuda Sadeh linger_req, ver); 1234602adf40SYehuda Sadeh if (ret < 0) 1235913d2fdcSAlex Elder goto done; 1236602adf40SYehuda Sadeh 1237f8d4de6eSAlex Elder if ((flags & CEPH_OSD_FLAG_READ) && inbound) 1238f8d4de6eSAlex Elder ret = ceph_copy_from_page_vector(pages, inbound, ofs, ret); 1239602adf40SYehuda Sadeh 1240602adf40SYehuda Sadeh done: 1241602adf40SYehuda Sadeh ceph_release_page_vector(pages, num_pages); 1242602adf40SYehuda Sadeh return ret; 1243602adf40SYehuda Sadeh } 1244602adf40SYehuda Sadeh 1245602adf40SYehuda Sadeh /* 1246602adf40SYehuda Sadeh * Do an asynchronous ceph osd operation 1247602adf40SYehuda Sadeh */ 1248602adf40SYehuda Sadeh static int rbd_do_op(struct request *rq, 1249602adf40SYehuda Sadeh struct rbd_device *rbd_dev, 1250602adf40SYehuda Sadeh struct ceph_snap_context *snapc, 1251602adf40SYehuda Sadeh u64 ofs, u64 len, 12521fec7093SYehuda Sadeh struct bio *bio, 12531fec7093SYehuda Sadeh struct rbd_req_coll *coll, 12541fec7093SYehuda Sadeh int coll_index) 1255602adf40SYehuda Sadeh { 1256602adf40SYehuda Sadeh char *seg_name; 1257602adf40SYehuda Sadeh u64 seg_ofs; 1258602adf40SYehuda Sadeh u64 seg_len; 1259602adf40SYehuda Sadeh int ret; 1260602adf40SYehuda Sadeh struct ceph_osd_req_op *ops; 1261602adf40SYehuda Sadeh u32 payload_len; 1262ff2e4bb5SAlex Elder int opcode; 1263ff2e4bb5SAlex Elder int flags; 12644634246dSAlex Elder u64 snapid; 1265602adf40SYehuda Sadeh 126665ccfe21SAlex Elder seg_name = rbd_segment_name(rbd_dev, ofs); 1267602adf40SYehuda Sadeh if (!seg_name) 1268602adf40SYehuda Sadeh return -ENOMEM; 126965ccfe21SAlex Elder seg_len = rbd_segment_length(rbd_dev, ofs, len); 127065ccfe21SAlex Elder seg_ofs = rbd_segment_offset(rbd_dev, ofs); 1271602adf40SYehuda Sadeh 1272ff2e4bb5SAlex Elder if (rq_data_dir(rq) == WRITE) { 1273ff2e4bb5SAlex Elder opcode = CEPH_OSD_OP_WRITE; 1274ff2e4bb5SAlex Elder flags = CEPH_OSD_FLAG_WRITE|CEPH_OSD_FLAG_ONDISK; 12754634246dSAlex Elder snapid = CEPH_NOSNAP; 1276ff2e4bb5SAlex Elder payload_len = seg_len; 1277ff2e4bb5SAlex Elder } else { 1278ff2e4bb5SAlex Elder opcode = CEPH_OSD_OP_READ; 1279ff2e4bb5SAlex Elder flags = CEPH_OSD_FLAG_READ; 12804634246dSAlex Elder snapc = NULL; 12814634246dSAlex Elder snapid = rbd_dev->mapping.snap_id; 1282ff2e4bb5SAlex Elder payload_len = 0; 1283ff2e4bb5SAlex Elder } 1284602adf40SYehuda Sadeh 128557cfc106SAlex Elder ret = -ENOMEM; 128657cfc106SAlex Elder ops = rbd_create_rw_ops(1, opcode, payload_len); 128757cfc106SAlex Elder if (!ops) 1288602adf40SYehuda Sadeh goto done; 1289602adf40SYehuda Sadeh 1290602adf40SYehuda Sadeh /* we've taken care of segment sizes earlier when we 1291602adf40SYehuda Sadeh cloned the bios. We should never have a segment 1292602adf40SYehuda Sadeh truncated at this point */ 1293aafb230eSAlex Elder rbd_assert(seg_len == len); 1294602adf40SYehuda Sadeh 1295602adf40SYehuda Sadeh ret = rbd_do_request(rq, rbd_dev, snapc, snapid, 1296602adf40SYehuda Sadeh seg_name, seg_ofs, seg_len, 1297602adf40SYehuda Sadeh bio, 1298602adf40SYehuda Sadeh NULL, 0, 1299602adf40SYehuda Sadeh flags, 1300602adf40SYehuda Sadeh ops, 13011fec7093SYehuda Sadeh coll, coll_index, 130259c2be1eSYehuda Sadeh rbd_req_cb, 0, NULL); 130311f77002SSage Weil 130411f77002SSage Weil rbd_destroy_ops(ops); 1305602adf40SYehuda Sadeh done: 1306602adf40SYehuda Sadeh kfree(seg_name); 1307602adf40SYehuda Sadeh return ret; 1308602adf40SYehuda Sadeh } 1309602adf40SYehuda Sadeh 1310602adf40SYehuda Sadeh /* 1311602adf40SYehuda Sadeh * Request sync osd read 1312602adf40SYehuda Sadeh */ 13130ce1a794SAlex Elder static int rbd_req_sync_read(struct rbd_device *rbd_dev, 1314602adf40SYehuda Sadeh u64 snapid, 1315aded07eaSAlex Elder const char *object_name, 1316602adf40SYehuda Sadeh u64 ofs, u64 len, 131759c2be1eSYehuda Sadeh char *buf, 131859c2be1eSYehuda Sadeh u64 *ver) 1319602adf40SYehuda Sadeh { 1320913d2fdcSAlex Elder struct ceph_osd_req_op *ops; 1321913d2fdcSAlex Elder int ret; 1322913d2fdcSAlex Elder 1323913d2fdcSAlex Elder ops = rbd_create_rw_ops(1, CEPH_OSD_OP_READ, 0); 1324913d2fdcSAlex Elder if (!ops) 1325913d2fdcSAlex Elder return -ENOMEM; 1326913d2fdcSAlex Elder 1327913d2fdcSAlex Elder ret = rbd_req_sync_op(rbd_dev, NULL, 1328b06e6a6bSJosh Durgin snapid, 1329602adf40SYehuda Sadeh CEPH_OSD_FLAG_READ, 1330913d2fdcSAlex Elder ops, object_name, ofs, len, buf, NULL, ver); 1331913d2fdcSAlex Elder rbd_destroy_ops(ops); 1332913d2fdcSAlex Elder 1333913d2fdcSAlex Elder return ret; 1334602adf40SYehuda Sadeh } 1335602adf40SYehuda Sadeh 1336602adf40SYehuda Sadeh /* 133759c2be1eSYehuda Sadeh * Request sync osd watch 133859c2be1eSYehuda Sadeh */ 13390ce1a794SAlex Elder static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev, 134059c2be1eSYehuda Sadeh u64 ver, 13417f0a24d8SAlex Elder u64 notify_id) 134259c2be1eSYehuda Sadeh { 134359c2be1eSYehuda Sadeh struct ceph_osd_req_op *ops; 134411f77002SSage Weil int ret; 134511f77002SSage Weil 134657cfc106SAlex Elder ops = rbd_create_rw_ops(1, CEPH_OSD_OP_NOTIFY_ACK, 0); 134757cfc106SAlex Elder if (!ops) 134857cfc106SAlex Elder return -ENOMEM; 134959c2be1eSYehuda Sadeh 1350a71b891bSJosh Durgin ops[0].watch.ver = cpu_to_le64(ver); 135159c2be1eSYehuda Sadeh ops[0].watch.cookie = notify_id; 135259c2be1eSYehuda Sadeh ops[0].watch.flag = 0; 135359c2be1eSYehuda Sadeh 13540ce1a794SAlex Elder ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP, 13557f0a24d8SAlex Elder rbd_dev->header_name, 0, 0, NULL, 1356ad4f232fSAlex Elder NULL, 0, 135759c2be1eSYehuda Sadeh CEPH_OSD_FLAG_READ, 135859c2be1eSYehuda Sadeh ops, 13591fec7093SYehuda Sadeh NULL, 0, 136059c2be1eSYehuda Sadeh rbd_simple_req_cb, 0, NULL); 136159c2be1eSYehuda Sadeh 136259c2be1eSYehuda Sadeh rbd_destroy_ops(ops); 136359c2be1eSYehuda Sadeh return ret; 136459c2be1eSYehuda Sadeh } 136559c2be1eSYehuda Sadeh 136659c2be1eSYehuda Sadeh static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data) 136759c2be1eSYehuda Sadeh { 13680ce1a794SAlex Elder struct rbd_device *rbd_dev = (struct rbd_device *)data; 1369a71b891bSJosh Durgin u64 hver; 137013143d2dSSage Weil int rc; 137113143d2dSSage Weil 13720ce1a794SAlex Elder if (!rbd_dev) 137359c2be1eSYehuda Sadeh return; 137459c2be1eSYehuda Sadeh 1375bd919d45SAlex Elder dout("rbd_watch_cb %s notify_id=%llu opcode=%u\n", 1376bd919d45SAlex Elder rbd_dev->header_name, (unsigned long long) notify_id, 1377bd919d45SAlex Elder (unsigned int) opcode); 1378117973fbSAlex Elder rc = rbd_dev_refresh(rbd_dev, &hver); 137913143d2dSSage Weil if (rc) 1380f0f8cef5SAlex Elder pr_warning(RBD_DRV_NAME "%d got notification but failed to " 13810ce1a794SAlex Elder " update snaps: %d\n", rbd_dev->major, rc); 138259c2be1eSYehuda Sadeh 13837f0a24d8SAlex Elder rbd_req_sync_notify_ack(rbd_dev, hver, notify_id); 138459c2be1eSYehuda Sadeh } 138559c2be1eSYehuda Sadeh 138659c2be1eSYehuda Sadeh /* 138759c2be1eSYehuda Sadeh * Request sync osd watch 138859c2be1eSYehuda Sadeh */ 13890e6f322dSAlex Elder static int rbd_req_sync_watch(struct rbd_device *rbd_dev) 139059c2be1eSYehuda Sadeh { 139159c2be1eSYehuda Sadeh struct ceph_osd_req_op *ops; 13920ce1a794SAlex Elder struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 139357cfc106SAlex Elder int ret; 139459c2be1eSYehuda Sadeh 139557cfc106SAlex Elder ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0); 139657cfc106SAlex Elder if (!ops) 139757cfc106SAlex Elder return -ENOMEM; 139859c2be1eSYehuda Sadeh 139959c2be1eSYehuda Sadeh ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0, 14000ce1a794SAlex Elder (void *)rbd_dev, &rbd_dev->watch_event); 140159c2be1eSYehuda Sadeh if (ret < 0) 140259c2be1eSYehuda Sadeh goto fail; 140359c2be1eSYehuda Sadeh 14040e6f322dSAlex Elder ops[0].watch.ver = cpu_to_le64(rbd_dev->header.obj_version); 14050ce1a794SAlex Elder ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie); 140659c2be1eSYehuda Sadeh ops[0].watch.flag = 1; 140759c2be1eSYehuda Sadeh 14080ce1a794SAlex Elder ret = rbd_req_sync_op(rbd_dev, NULL, 140959c2be1eSYehuda Sadeh CEPH_NOSNAP, 141059c2be1eSYehuda Sadeh CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK, 141159c2be1eSYehuda Sadeh ops, 14120e6f322dSAlex Elder rbd_dev->header_name, 14130e6f322dSAlex Elder 0, 0, NULL, 14140ce1a794SAlex Elder &rbd_dev->watch_request, NULL); 141559c2be1eSYehuda Sadeh 141659c2be1eSYehuda Sadeh if (ret < 0) 141759c2be1eSYehuda Sadeh goto fail_event; 141859c2be1eSYehuda Sadeh 141959c2be1eSYehuda Sadeh rbd_destroy_ops(ops); 142059c2be1eSYehuda Sadeh return 0; 142159c2be1eSYehuda Sadeh 142259c2be1eSYehuda Sadeh fail_event: 14230ce1a794SAlex Elder ceph_osdc_cancel_event(rbd_dev->watch_event); 14240ce1a794SAlex Elder rbd_dev->watch_event = NULL; 142559c2be1eSYehuda Sadeh fail: 142659c2be1eSYehuda Sadeh rbd_destroy_ops(ops); 142759c2be1eSYehuda Sadeh return ret; 142859c2be1eSYehuda Sadeh } 142959c2be1eSYehuda Sadeh 143079e3057cSYehuda Sadeh /* 143179e3057cSYehuda Sadeh * Request sync osd unwatch 143279e3057cSYehuda Sadeh */ 1433070c633fSAlex Elder static int rbd_req_sync_unwatch(struct rbd_device *rbd_dev) 143479e3057cSYehuda Sadeh { 143579e3057cSYehuda Sadeh struct ceph_osd_req_op *ops; 143657cfc106SAlex Elder int ret; 143779e3057cSYehuda Sadeh 143857cfc106SAlex Elder ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0); 143957cfc106SAlex Elder if (!ops) 144057cfc106SAlex Elder return -ENOMEM; 144179e3057cSYehuda Sadeh 144279e3057cSYehuda Sadeh ops[0].watch.ver = 0; 14430ce1a794SAlex Elder ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie); 144479e3057cSYehuda Sadeh ops[0].watch.flag = 0; 144579e3057cSYehuda Sadeh 14460ce1a794SAlex Elder ret = rbd_req_sync_op(rbd_dev, NULL, 144779e3057cSYehuda Sadeh CEPH_NOSNAP, 144879e3057cSYehuda Sadeh CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK, 144979e3057cSYehuda Sadeh ops, 1450070c633fSAlex Elder rbd_dev->header_name, 1451070c633fSAlex Elder 0, 0, NULL, NULL, NULL); 1452070c633fSAlex Elder 145379e3057cSYehuda Sadeh 145479e3057cSYehuda Sadeh rbd_destroy_ops(ops); 14550ce1a794SAlex Elder ceph_osdc_cancel_event(rbd_dev->watch_event); 14560ce1a794SAlex Elder rbd_dev->watch_event = NULL; 145779e3057cSYehuda Sadeh return ret; 145879e3057cSYehuda Sadeh } 145979e3057cSYehuda Sadeh 146059c2be1eSYehuda Sadeh /* 14613cb4a687SAlex Elder * Synchronous osd object method call 1462602adf40SYehuda Sadeh */ 14630ce1a794SAlex Elder static int rbd_req_sync_exec(struct rbd_device *rbd_dev, 1464aded07eaSAlex Elder const char *object_name, 1465aded07eaSAlex Elder const char *class_name, 1466aded07eaSAlex Elder const char *method_name, 14673cb4a687SAlex Elder const char *outbound, 14683cb4a687SAlex Elder size_t outbound_size, 1469f8d4de6eSAlex Elder char *inbound, 1470f8d4de6eSAlex Elder size_t inbound_size, 14713cb4a687SAlex Elder int flags, 147259c2be1eSYehuda Sadeh u64 *ver) 1473602adf40SYehuda Sadeh { 1474602adf40SYehuda Sadeh struct ceph_osd_req_op *ops; 1475aded07eaSAlex Elder int class_name_len = strlen(class_name); 1476aded07eaSAlex Elder int method_name_len = strlen(method_name); 14773cb4a687SAlex Elder int payload_size; 147857cfc106SAlex Elder int ret; 147957cfc106SAlex Elder 14803cb4a687SAlex Elder /* 14813cb4a687SAlex Elder * Any input parameters required by the method we're calling 14823cb4a687SAlex Elder * will be sent along with the class and method names as 14833cb4a687SAlex Elder * part of the message payload. That data and its size are 14843cb4a687SAlex Elder * supplied via the indata and indata_len fields (named from 14853cb4a687SAlex Elder * the perspective of the server side) in the OSD request 14863cb4a687SAlex Elder * operation. 14873cb4a687SAlex Elder */ 14883cb4a687SAlex Elder payload_size = class_name_len + method_name_len + outbound_size; 14893cb4a687SAlex Elder ops = rbd_create_rw_ops(1, CEPH_OSD_OP_CALL, payload_size); 149057cfc106SAlex Elder if (!ops) 149157cfc106SAlex Elder return -ENOMEM; 1492602adf40SYehuda Sadeh 1493aded07eaSAlex Elder ops[0].cls.class_name = class_name; 1494aded07eaSAlex Elder ops[0].cls.class_len = (__u8) class_name_len; 1495aded07eaSAlex Elder ops[0].cls.method_name = method_name; 1496aded07eaSAlex Elder ops[0].cls.method_len = (__u8) method_name_len; 1497602adf40SYehuda Sadeh ops[0].cls.argc = 0; 14983cb4a687SAlex Elder ops[0].cls.indata = outbound; 14993cb4a687SAlex Elder ops[0].cls.indata_len = outbound_size; 1500602adf40SYehuda Sadeh 15010ce1a794SAlex Elder ret = rbd_req_sync_op(rbd_dev, NULL, 1502602adf40SYehuda Sadeh CEPH_NOSNAP, 15033cb4a687SAlex Elder flags, ops, 1504f8d4de6eSAlex Elder object_name, 0, inbound_size, inbound, 1505f8d4de6eSAlex Elder NULL, ver); 1506602adf40SYehuda Sadeh 1507602adf40SYehuda Sadeh rbd_destroy_ops(ops); 1508602adf40SYehuda Sadeh 1509602adf40SYehuda Sadeh dout("cls_exec returned %d\n", ret); 1510602adf40SYehuda Sadeh return ret; 1511602adf40SYehuda Sadeh } 1512602adf40SYehuda Sadeh 15131fec7093SYehuda Sadeh static struct rbd_req_coll *rbd_alloc_coll(int num_reqs) 15141fec7093SYehuda Sadeh { 15151fec7093SYehuda Sadeh struct rbd_req_coll *coll = 15161fec7093SYehuda Sadeh kzalloc(sizeof(struct rbd_req_coll) + 15171fec7093SYehuda Sadeh sizeof(struct rbd_req_status) * num_reqs, 15181fec7093SYehuda Sadeh GFP_ATOMIC); 15191fec7093SYehuda Sadeh 15201fec7093SYehuda Sadeh if (!coll) 15211fec7093SYehuda Sadeh return NULL; 15221fec7093SYehuda Sadeh coll->total = num_reqs; 15231fec7093SYehuda Sadeh kref_init(&coll->kref); 15241fec7093SYehuda Sadeh return coll; 15251fec7093SYehuda Sadeh } 15261fec7093SYehuda Sadeh 1527602adf40SYehuda Sadeh /* 1528602adf40SYehuda Sadeh * block device queue callback 1529602adf40SYehuda Sadeh */ 1530602adf40SYehuda Sadeh static void rbd_rq_fn(struct request_queue *q) 1531602adf40SYehuda Sadeh { 1532602adf40SYehuda Sadeh struct rbd_device *rbd_dev = q->queuedata; 1533602adf40SYehuda Sadeh struct request *rq; 1534602adf40SYehuda Sadeh 153500f1f36fSAlex Elder while ((rq = blk_fetch_request(q))) { 1536602adf40SYehuda Sadeh struct bio *bio; 1537602adf40SYehuda Sadeh bool do_write; 1538bd919d45SAlex Elder unsigned int size; 1539602adf40SYehuda Sadeh u64 ofs; 15401fec7093SYehuda Sadeh int num_segs, cur_seg = 0; 15411fec7093SYehuda Sadeh struct rbd_req_coll *coll; 1542d1d25646SJosh Durgin struct ceph_snap_context *snapc; 1543f7760dadSAlex Elder unsigned int bio_offset; 1544602adf40SYehuda Sadeh 1545602adf40SYehuda Sadeh dout("fetched request\n"); 1546602adf40SYehuda Sadeh 1547602adf40SYehuda Sadeh /* filter out block requests we don't understand */ 1548602adf40SYehuda Sadeh if ((rq->cmd_type != REQ_TYPE_FS)) { 1549602adf40SYehuda Sadeh __blk_end_request_all(rq, 0); 155000f1f36fSAlex Elder continue; 1551602adf40SYehuda Sadeh } 1552602adf40SYehuda Sadeh 1553602adf40SYehuda Sadeh /* deduce our operation (read, write) */ 1554602adf40SYehuda Sadeh do_write = (rq_data_dir(rq) == WRITE); 1555f84344f3SAlex Elder if (do_write && rbd_dev->mapping.read_only) { 1556602adf40SYehuda Sadeh __blk_end_request_all(rq, -EROFS); 155700f1f36fSAlex Elder continue; 1558602adf40SYehuda Sadeh } 1559602adf40SYehuda Sadeh 1560602adf40SYehuda Sadeh spin_unlock_irq(q->queue_lock); 1561602adf40SYehuda Sadeh 1562e88a36ecSJosh Durgin down_read(&rbd_dev->header_rwsem); 1563e88a36ecSJosh Durgin 1564f84344f3SAlex Elder if (rbd_dev->mapping.snap_id != CEPH_NOSNAP && 1565f84344f3SAlex Elder !rbd_dev->mapping.snap_exists) { 1566d1d25646SJosh Durgin up_read(&rbd_dev->header_rwsem); 1567e88a36ecSJosh Durgin dout("request for non-existent snapshot"); 1568e88a36ecSJosh Durgin spin_lock_irq(q->queue_lock); 1569e88a36ecSJosh Durgin __blk_end_request_all(rq, -ENXIO); 1570e88a36ecSJosh Durgin continue; 1571e88a36ecSJosh Durgin } 1572d1d25646SJosh Durgin 1573d1d25646SJosh Durgin snapc = ceph_get_snap_context(rbd_dev->header.snapc); 1574d1d25646SJosh Durgin 1575d1d25646SJosh Durgin up_read(&rbd_dev->header_rwsem); 1576e88a36ecSJosh Durgin 1577f7760dadSAlex Elder size = blk_rq_bytes(rq); 1578f7760dadSAlex Elder ofs = blk_rq_pos(rq) * SECTOR_SIZE; 1579f7760dadSAlex Elder bio = rq->bio; 1580f7760dadSAlex Elder 1581602adf40SYehuda Sadeh dout("%s 0x%x bytes at 0x%llx\n", 1582602adf40SYehuda Sadeh do_write ? "write" : "read", 1583bd919d45SAlex Elder size, (unsigned long long) blk_rq_pos(rq) * SECTOR_SIZE); 1584602adf40SYehuda Sadeh 15851fec7093SYehuda Sadeh num_segs = rbd_get_num_segments(&rbd_dev->header, ofs, size); 1586df111be6SAlex Elder if (num_segs <= 0) { 1587df111be6SAlex Elder spin_lock_irq(q->queue_lock); 1588df111be6SAlex Elder __blk_end_request_all(rq, num_segs); 1589df111be6SAlex Elder ceph_put_snap_context(snapc); 1590df111be6SAlex Elder continue; 1591df111be6SAlex Elder } 15921fec7093SYehuda Sadeh coll = rbd_alloc_coll(num_segs); 15931fec7093SYehuda Sadeh if (!coll) { 15941fec7093SYehuda Sadeh spin_lock_irq(q->queue_lock); 15951fec7093SYehuda Sadeh __blk_end_request_all(rq, -ENOMEM); 1596d1d25646SJosh Durgin ceph_put_snap_context(snapc); 159700f1f36fSAlex Elder continue; 15981fec7093SYehuda Sadeh } 15991fec7093SYehuda Sadeh 1600f7760dadSAlex Elder bio_offset = 0; 1601602adf40SYehuda Sadeh do { 1602f7760dadSAlex Elder u64 limit = rbd_segment_length(rbd_dev, ofs, size); 1603f7760dadSAlex Elder unsigned int chain_size; 1604f7760dadSAlex Elder struct bio *bio_chain; 1605f7760dadSAlex Elder 1606f7760dadSAlex Elder BUG_ON(limit > (u64) UINT_MAX); 1607f7760dadSAlex Elder chain_size = (unsigned int) limit; 1608bd919d45SAlex Elder dout("rq->bio->bi_vcnt=%hu\n", rq->bio->bi_vcnt); 1609f7760dadSAlex Elder 16101fec7093SYehuda Sadeh kref_get(&coll->kref); 1611f7760dadSAlex Elder 1612f7760dadSAlex Elder /* Pass a cloned bio chain via an osd request */ 1613f7760dadSAlex Elder 1614f7760dadSAlex Elder bio_chain = bio_chain_clone_range(&bio, 1615f7760dadSAlex Elder &bio_offset, chain_size, 1616f7760dadSAlex Elder GFP_ATOMIC); 1617f7760dadSAlex Elder if (bio_chain) 16184634246dSAlex Elder (void) rbd_do_op(rq, rbd_dev, snapc, 1619f7760dadSAlex Elder ofs, chain_size, 1620f7760dadSAlex Elder bio_chain, coll, cur_seg); 16214634246dSAlex Elder else 16221fec7093SYehuda Sadeh rbd_coll_end_req_index(rq, coll, cur_seg, 1623f7760dadSAlex Elder -ENOMEM, chain_size); 1624f7760dadSAlex Elder size -= chain_size; 1625f7760dadSAlex Elder ofs += chain_size; 1626602adf40SYehuda Sadeh 16271fec7093SYehuda Sadeh cur_seg++; 1628602adf40SYehuda Sadeh } while (size > 0); 16291fec7093SYehuda Sadeh kref_put(&coll->kref, rbd_coll_release); 1630602adf40SYehuda Sadeh 1631602adf40SYehuda Sadeh spin_lock_irq(q->queue_lock); 1632d1d25646SJosh Durgin 1633d1d25646SJosh Durgin ceph_put_snap_context(snapc); 1634602adf40SYehuda Sadeh } 1635602adf40SYehuda Sadeh } 1636602adf40SYehuda Sadeh 1637602adf40SYehuda Sadeh /* 1638602adf40SYehuda Sadeh * a queue callback. Makes sure that we don't create a bio that spans across 1639602adf40SYehuda Sadeh * multiple osd objects. One exception would be with a single page bios, 1640f7760dadSAlex Elder * which we handle later at bio_chain_clone_range() 1641602adf40SYehuda Sadeh */ 1642602adf40SYehuda Sadeh static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd, 1643602adf40SYehuda Sadeh struct bio_vec *bvec) 1644602adf40SYehuda Sadeh { 1645602adf40SYehuda Sadeh struct rbd_device *rbd_dev = q->queuedata; 1646e5cfeed2SAlex Elder sector_t sector_offset; 1647e5cfeed2SAlex Elder sector_t sectors_per_obj; 1648e5cfeed2SAlex Elder sector_t obj_sector_offset; 1649e5cfeed2SAlex Elder int ret; 1650602adf40SYehuda Sadeh 1651e5cfeed2SAlex Elder /* 1652e5cfeed2SAlex Elder * Find how far into its rbd object the partition-relative 1653e5cfeed2SAlex Elder * bio start sector is to offset relative to the enclosing 1654e5cfeed2SAlex Elder * device. 1655e5cfeed2SAlex Elder */ 1656e5cfeed2SAlex Elder sector_offset = get_start_sect(bmd->bi_bdev) + bmd->bi_sector; 1657e5cfeed2SAlex Elder sectors_per_obj = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT); 1658e5cfeed2SAlex Elder obj_sector_offset = sector_offset & (sectors_per_obj - 1); 1659593a9e7bSAlex Elder 1660e5cfeed2SAlex Elder /* 1661e5cfeed2SAlex Elder * Compute the number of bytes from that offset to the end 1662e5cfeed2SAlex Elder * of the object. Account for what's already used by the bio. 1663e5cfeed2SAlex Elder */ 1664e5cfeed2SAlex Elder ret = (int) (sectors_per_obj - obj_sector_offset) << SECTOR_SHIFT; 1665e5cfeed2SAlex Elder if (ret > bmd->bi_size) 1666e5cfeed2SAlex Elder ret -= bmd->bi_size; 1667e5cfeed2SAlex Elder else 1668e5cfeed2SAlex Elder ret = 0; 1669e5cfeed2SAlex Elder 1670e5cfeed2SAlex Elder /* 1671e5cfeed2SAlex Elder * Don't send back more than was asked for. And if the bio 1672e5cfeed2SAlex Elder * was empty, let the whole thing through because: "Note 1673e5cfeed2SAlex Elder * that a block device *must* allow a single page to be 1674e5cfeed2SAlex Elder * added to an empty bio." 1675e5cfeed2SAlex Elder */ 1676e5cfeed2SAlex Elder rbd_assert(bvec->bv_len <= PAGE_SIZE); 1677e5cfeed2SAlex Elder if (ret > (int) bvec->bv_len || !bmd->bi_size) 1678e5cfeed2SAlex Elder ret = (int) bvec->bv_len; 1679e5cfeed2SAlex Elder 1680e5cfeed2SAlex Elder return ret; 1681602adf40SYehuda Sadeh } 1682602adf40SYehuda Sadeh 1683602adf40SYehuda Sadeh static void rbd_free_disk(struct rbd_device *rbd_dev) 1684602adf40SYehuda Sadeh { 1685602adf40SYehuda Sadeh struct gendisk *disk = rbd_dev->disk; 1686602adf40SYehuda Sadeh 1687602adf40SYehuda Sadeh if (!disk) 1688602adf40SYehuda Sadeh return; 1689602adf40SYehuda Sadeh 1690602adf40SYehuda Sadeh if (disk->flags & GENHD_FL_UP) 1691602adf40SYehuda Sadeh del_gendisk(disk); 1692602adf40SYehuda Sadeh if (disk->queue) 1693602adf40SYehuda Sadeh blk_cleanup_queue(disk->queue); 1694602adf40SYehuda Sadeh put_disk(disk); 1695602adf40SYehuda Sadeh } 1696602adf40SYehuda Sadeh 1697602adf40SYehuda Sadeh /* 16984156d998SAlex Elder * Read the complete header for the given rbd device. 16994156d998SAlex Elder * 17004156d998SAlex Elder * Returns a pointer to a dynamically-allocated buffer containing 17014156d998SAlex Elder * the complete and validated header. Caller can pass the address 17024156d998SAlex Elder * of a variable that will be filled in with the version of the 17034156d998SAlex Elder * header object at the time it was read. 17044156d998SAlex Elder * 17054156d998SAlex Elder * Returns a pointer-coded errno if a failure occurs. 17064156d998SAlex Elder */ 17074156d998SAlex Elder static struct rbd_image_header_ondisk * 17084156d998SAlex Elder rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version) 17094156d998SAlex Elder { 17104156d998SAlex Elder struct rbd_image_header_ondisk *ondisk = NULL; 17114156d998SAlex Elder u32 snap_count = 0; 17124156d998SAlex Elder u64 names_size = 0; 17134156d998SAlex Elder u32 want_count; 17144156d998SAlex Elder int ret; 17154156d998SAlex Elder 17164156d998SAlex Elder /* 17174156d998SAlex Elder * The complete header will include an array of its 64-bit 17184156d998SAlex Elder * snapshot ids, followed by the names of those snapshots as 17194156d998SAlex Elder * a contiguous block of NUL-terminated strings. Note that 17204156d998SAlex Elder * the number of snapshots could change by the time we read 17214156d998SAlex Elder * it in, in which case we re-read it. 17224156d998SAlex Elder */ 17234156d998SAlex Elder do { 17244156d998SAlex Elder size_t size; 17254156d998SAlex Elder 17264156d998SAlex Elder kfree(ondisk); 17274156d998SAlex Elder 17284156d998SAlex Elder size = sizeof (*ondisk); 17294156d998SAlex Elder size += snap_count * sizeof (struct rbd_image_snap_ondisk); 17304156d998SAlex Elder size += names_size; 17314156d998SAlex Elder ondisk = kmalloc(size, GFP_KERNEL); 17324156d998SAlex Elder if (!ondisk) 17334156d998SAlex Elder return ERR_PTR(-ENOMEM); 17344156d998SAlex Elder 17354156d998SAlex Elder ret = rbd_req_sync_read(rbd_dev, CEPH_NOSNAP, 17364156d998SAlex Elder rbd_dev->header_name, 17374156d998SAlex Elder 0, size, 17384156d998SAlex Elder (char *) ondisk, version); 17394156d998SAlex Elder 17404156d998SAlex Elder if (ret < 0) 17414156d998SAlex Elder goto out_err; 17424156d998SAlex Elder if (WARN_ON((size_t) ret < size)) { 17434156d998SAlex Elder ret = -ENXIO; 17444156d998SAlex Elder pr_warning("short header read for image %s" 17454156d998SAlex Elder " (want %zd got %d)\n", 17464156d998SAlex Elder rbd_dev->image_name, size, ret); 17474156d998SAlex Elder goto out_err; 17484156d998SAlex Elder } 17494156d998SAlex Elder if (!rbd_dev_ondisk_valid(ondisk)) { 17504156d998SAlex Elder ret = -ENXIO; 17514156d998SAlex Elder pr_warning("invalid header for image %s\n", 17524156d998SAlex Elder rbd_dev->image_name); 17534156d998SAlex Elder goto out_err; 17544156d998SAlex Elder } 17554156d998SAlex Elder 17564156d998SAlex Elder names_size = le64_to_cpu(ondisk->snap_names_len); 17574156d998SAlex Elder want_count = snap_count; 17584156d998SAlex Elder snap_count = le32_to_cpu(ondisk->snap_count); 17594156d998SAlex Elder } while (snap_count != want_count); 17604156d998SAlex Elder 17614156d998SAlex Elder return ondisk; 17624156d998SAlex Elder 17634156d998SAlex Elder out_err: 17644156d998SAlex Elder kfree(ondisk); 17654156d998SAlex Elder 17664156d998SAlex Elder return ERR_PTR(ret); 17674156d998SAlex Elder } 17684156d998SAlex Elder 17694156d998SAlex Elder /* 1770602adf40SYehuda Sadeh * reload the ondisk the header 1771602adf40SYehuda Sadeh */ 1772602adf40SYehuda Sadeh static int rbd_read_header(struct rbd_device *rbd_dev, 1773602adf40SYehuda Sadeh struct rbd_image_header *header) 1774602adf40SYehuda Sadeh { 17754156d998SAlex Elder struct rbd_image_header_ondisk *ondisk; 17764156d998SAlex Elder u64 ver = 0; 17774156d998SAlex Elder int ret; 1778602adf40SYehuda Sadeh 17794156d998SAlex Elder ondisk = rbd_dev_v1_header_read(rbd_dev, &ver); 17804156d998SAlex Elder if (IS_ERR(ondisk)) 17814156d998SAlex Elder return PTR_ERR(ondisk); 17824156d998SAlex Elder ret = rbd_header_from_disk(header, ondisk); 17834156d998SAlex Elder if (ret >= 0) 178459c2be1eSYehuda Sadeh header->obj_version = ver; 17854156d998SAlex Elder kfree(ondisk); 1786602adf40SYehuda Sadeh 17874156d998SAlex Elder return ret; 1788602adf40SYehuda Sadeh } 1789602adf40SYehuda Sadeh 1790dfc5606dSYehuda Sadeh static void __rbd_remove_all_snaps(struct rbd_device *rbd_dev) 1791dfc5606dSYehuda Sadeh { 1792dfc5606dSYehuda Sadeh struct rbd_snap *snap; 1793a0593290SAlex Elder struct rbd_snap *next; 1794dfc5606dSYehuda Sadeh 1795a0593290SAlex Elder list_for_each_entry_safe(snap, next, &rbd_dev->snaps, node) 179614e7085dSAlex Elder __rbd_remove_snap_dev(snap); 1797dfc5606dSYehuda Sadeh } 1798dfc5606dSYehuda Sadeh 17999478554aSAlex Elder static void rbd_update_mapping_size(struct rbd_device *rbd_dev) 18009478554aSAlex Elder { 18019478554aSAlex Elder sector_t size; 18029478554aSAlex Elder 18039478554aSAlex Elder if (rbd_dev->mapping.snap_id != CEPH_NOSNAP) 18049478554aSAlex Elder return; 18059478554aSAlex Elder 18069478554aSAlex Elder size = (sector_t) rbd_dev->header.image_size / SECTOR_SIZE; 18079478554aSAlex Elder dout("setting size to %llu sectors", (unsigned long long) size); 18089478554aSAlex Elder rbd_dev->mapping.size = (u64) size; 18099478554aSAlex Elder set_capacity(rbd_dev->disk, size); 18109478554aSAlex Elder } 18119478554aSAlex Elder 1812602adf40SYehuda Sadeh /* 1813602adf40SYehuda Sadeh * only read the first part of the ondisk header, without the snaps info 1814602adf40SYehuda Sadeh */ 1815117973fbSAlex Elder static int rbd_dev_v1_refresh(struct rbd_device *rbd_dev, u64 *hver) 1816602adf40SYehuda Sadeh { 1817602adf40SYehuda Sadeh int ret; 1818602adf40SYehuda Sadeh struct rbd_image_header h; 1819602adf40SYehuda Sadeh 1820602adf40SYehuda Sadeh ret = rbd_read_header(rbd_dev, &h); 1821602adf40SYehuda Sadeh if (ret < 0) 1822602adf40SYehuda Sadeh return ret; 1823602adf40SYehuda Sadeh 1824a51aa0c0SJosh Durgin down_write(&rbd_dev->header_rwsem); 1825a51aa0c0SJosh Durgin 18269478554aSAlex Elder /* Update image size, and check for resize of mapped image */ 18279478554aSAlex Elder rbd_dev->header.image_size = h.image_size; 18289478554aSAlex Elder rbd_update_mapping_size(rbd_dev); 18299db4b3e3SSage Weil 1830849b4260SAlex Elder /* rbd_dev->header.object_prefix shouldn't change */ 1831602adf40SYehuda Sadeh kfree(rbd_dev->header.snap_sizes); 1832849b4260SAlex Elder kfree(rbd_dev->header.snap_names); 1833d1d25646SJosh Durgin /* osd requests may still refer to snapc */ 1834d1d25646SJosh Durgin ceph_put_snap_context(rbd_dev->header.snapc); 1835602adf40SYehuda Sadeh 1836b813623aSAlex Elder if (hver) 1837b813623aSAlex Elder *hver = h.obj_version; 1838a71b891bSJosh Durgin rbd_dev->header.obj_version = h.obj_version; 183993a24e08SJosh Durgin rbd_dev->header.image_size = h.image_size; 1840602adf40SYehuda Sadeh rbd_dev->header.snapc = h.snapc; 1841602adf40SYehuda Sadeh rbd_dev->header.snap_names = h.snap_names; 1842602adf40SYehuda Sadeh rbd_dev->header.snap_sizes = h.snap_sizes; 1843849b4260SAlex Elder /* Free the extra copy of the object prefix */ 1844849b4260SAlex Elder WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix)); 1845849b4260SAlex Elder kfree(h.object_prefix); 1846849b4260SAlex Elder 1847304f6808SAlex Elder ret = rbd_dev_snaps_update(rbd_dev); 1848304f6808SAlex Elder if (!ret) 1849304f6808SAlex Elder ret = rbd_dev_snaps_register(rbd_dev); 1850dfc5606dSYehuda Sadeh 1851c666601aSJosh Durgin up_write(&rbd_dev->header_rwsem); 1852602adf40SYehuda Sadeh 1853dfc5606dSYehuda Sadeh return ret; 1854602adf40SYehuda Sadeh } 1855602adf40SYehuda Sadeh 1856117973fbSAlex Elder static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver) 18571fe5e993SAlex Elder { 18581fe5e993SAlex Elder int ret; 18591fe5e993SAlex Elder 1860117973fbSAlex Elder rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); 18611fe5e993SAlex Elder mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); 1862117973fbSAlex Elder if (rbd_dev->image_format == 1) 1863117973fbSAlex Elder ret = rbd_dev_v1_refresh(rbd_dev, hver); 1864117973fbSAlex Elder else 1865117973fbSAlex Elder ret = rbd_dev_v2_refresh(rbd_dev, hver); 18661fe5e993SAlex Elder mutex_unlock(&ctl_mutex); 18671fe5e993SAlex Elder 18681fe5e993SAlex Elder return ret; 18691fe5e993SAlex Elder } 18701fe5e993SAlex Elder 1871602adf40SYehuda Sadeh static int rbd_init_disk(struct rbd_device *rbd_dev) 1872602adf40SYehuda Sadeh { 1873602adf40SYehuda Sadeh struct gendisk *disk; 1874602adf40SYehuda Sadeh struct request_queue *q; 1875593a9e7bSAlex Elder u64 segment_size; 1876602adf40SYehuda Sadeh 1877602adf40SYehuda Sadeh /* create gendisk info */ 1878602adf40SYehuda Sadeh disk = alloc_disk(RBD_MINORS_PER_MAJOR); 1879602adf40SYehuda Sadeh if (!disk) 18801fcdb8aaSAlex Elder return -ENOMEM; 1881602adf40SYehuda Sadeh 1882f0f8cef5SAlex Elder snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d", 1883de71a297SAlex Elder rbd_dev->dev_id); 1884602adf40SYehuda Sadeh disk->major = rbd_dev->major; 1885602adf40SYehuda Sadeh disk->first_minor = 0; 1886602adf40SYehuda Sadeh disk->fops = &rbd_bd_ops; 1887602adf40SYehuda Sadeh disk->private_data = rbd_dev; 1888602adf40SYehuda Sadeh 1889602adf40SYehuda Sadeh /* init rq */ 1890602adf40SYehuda Sadeh q = blk_init_queue(rbd_rq_fn, &rbd_dev->lock); 1891602adf40SYehuda Sadeh if (!q) 1892602adf40SYehuda Sadeh goto out_disk; 1893029bcbd8SJosh Durgin 1894593a9e7bSAlex Elder /* We use the default size, but let's be explicit about it. */ 1895593a9e7bSAlex Elder blk_queue_physical_block_size(q, SECTOR_SIZE); 1896593a9e7bSAlex Elder 1897029bcbd8SJosh Durgin /* set io sizes to object size */ 1898593a9e7bSAlex Elder segment_size = rbd_obj_bytes(&rbd_dev->header); 1899593a9e7bSAlex Elder blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE); 1900593a9e7bSAlex Elder blk_queue_max_segment_size(q, segment_size); 1901593a9e7bSAlex Elder blk_queue_io_min(q, segment_size); 1902593a9e7bSAlex Elder blk_queue_io_opt(q, segment_size); 1903029bcbd8SJosh Durgin 1904602adf40SYehuda Sadeh blk_queue_merge_bvec(q, rbd_merge_bvec); 1905602adf40SYehuda Sadeh disk->queue = q; 1906602adf40SYehuda Sadeh 1907602adf40SYehuda Sadeh q->queuedata = rbd_dev; 1908602adf40SYehuda Sadeh 1909602adf40SYehuda Sadeh rbd_dev->disk = disk; 1910602adf40SYehuda Sadeh 191112f02944SAlex Elder set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE); 191212f02944SAlex Elder 1913602adf40SYehuda Sadeh return 0; 1914602adf40SYehuda Sadeh out_disk: 1915602adf40SYehuda Sadeh put_disk(disk); 19161fcdb8aaSAlex Elder 19171fcdb8aaSAlex Elder return -ENOMEM; 1918602adf40SYehuda Sadeh } 1919602adf40SYehuda Sadeh 1920dfc5606dSYehuda Sadeh /* 1921dfc5606dSYehuda Sadeh sysfs 1922dfc5606dSYehuda Sadeh */ 1923602adf40SYehuda Sadeh 1924593a9e7bSAlex Elder static struct rbd_device *dev_to_rbd_dev(struct device *dev) 1925593a9e7bSAlex Elder { 1926593a9e7bSAlex Elder return container_of(dev, struct rbd_device, dev); 1927593a9e7bSAlex Elder } 1928593a9e7bSAlex Elder 1929dfc5606dSYehuda Sadeh static ssize_t rbd_size_show(struct device *dev, 1930dfc5606dSYehuda Sadeh struct device_attribute *attr, char *buf) 1931602adf40SYehuda Sadeh { 1932593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 1933a51aa0c0SJosh Durgin sector_t size; 1934dfc5606dSYehuda Sadeh 1935a51aa0c0SJosh Durgin down_read(&rbd_dev->header_rwsem); 1936a51aa0c0SJosh Durgin size = get_capacity(rbd_dev->disk); 1937a51aa0c0SJosh Durgin up_read(&rbd_dev->header_rwsem); 1938a51aa0c0SJosh Durgin 1939a51aa0c0SJosh Durgin return sprintf(buf, "%llu\n", (unsigned long long) size * SECTOR_SIZE); 1940602adf40SYehuda Sadeh } 1941602adf40SYehuda Sadeh 194234b13184SAlex Elder /* 194334b13184SAlex Elder * Note this shows the features for whatever's mapped, which is not 194434b13184SAlex Elder * necessarily the base image. 194534b13184SAlex Elder */ 194634b13184SAlex Elder static ssize_t rbd_features_show(struct device *dev, 194734b13184SAlex Elder struct device_attribute *attr, char *buf) 194834b13184SAlex Elder { 194934b13184SAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 195034b13184SAlex Elder 195134b13184SAlex Elder return sprintf(buf, "0x%016llx\n", 195234b13184SAlex Elder (unsigned long long) rbd_dev->mapping.features); 195334b13184SAlex Elder } 195434b13184SAlex Elder 1955dfc5606dSYehuda Sadeh static ssize_t rbd_major_show(struct device *dev, 1956dfc5606dSYehuda Sadeh struct device_attribute *attr, char *buf) 1957602adf40SYehuda Sadeh { 1958593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 1959dfc5606dSYehuda Sadeh 1960dfc5606dSYehuda Sadeh return sprintf(buf, "%d\n", rbd_dev->major); 1961dfc5606dSYehuda Sadeh } 1962dfc5606dSYehuda Sadeh 1963dfc5606dSYehuda Sadeh static ssize_t rbd_client_id_show(struct device *dev, 1964dfc5606dSYehuda Sadeh struct device_attribute *attr, char *buf) 1965dfc5606dSYehuda Sadeh { 1966593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 1967dfc5606dSYehuda Sadeh 19681dbb4399SAlex Elder return sprintf(buf, "client%lld\n", 19691dbb4399SAlex Elder ceph_client_id(rbd_dev->rbd_client->client)); 1970dfc5606dSYehuda Sadeh } 1971dfc5606dSYehuda Sadeh 1972dfc5606dSYehuda Sadeh static ssize_t rbd_pool_show(struct device *dev, 1973dfc5606dSYehuda Sadeh struct device_attribute *attr, char *buf) 1974dfc5606dSYehuda Sadeh { 1975593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 1976dfc5606dSYehuda Sadeh 1977dfc5606dSYehuda Sadeh return sprintf(buf, "%s\n", rbd_dev->pool_name); 1978dfc5606dSYehuda Sadeh } 1979dfc5606dSYehuda Sadeh 19809bb2f334SAlex Elder static ssize_t rbd_pool_id_show(struct device *dev, 19819bb2f334SAlex Elder struct device_attribute *attr, char *buf) 19829bb2f334SAlex Elder { 19839bb2f334SAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 19849bb2f334SAlex Elder 19859bb2f334SAlex Elder return sprintf(buf, "%d\n", rbd_dev->pool_id); 19869bb2f334SAlex Elder } 19879bb2f334SAlex Elder 1988dfc5606dSYehuda Sadeh static ssize_t rbd_name_show(struct device *dev, 1989dfc5606dSYehuda Sadeh struct device_attribute *attr, char *buf) 1990dfc5606dSYehuda Sadeh { 1991593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 1992dfc5606dSYehuda Sadeh 19930bed54dcSAlex Elder return sprintf(buf, "%s\n", rbd_dev->image_name); 1994dfc5606dSYehuda Sadeh } 1995dfc5606dSYehuda Sadeh 1996589d30e0SAlex Elder static ssize_t rbd_image_id_show(struct device *dev, 1997589d30e0SAlex Elder struct device_attribute *attr, char *buf) 1998589d30e0SAlex Elder { 1999589d30e0SAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 2000589d30e0SAlex Elder 2001589d30e0SAlex Elder return sprintf(buf, "%s\n", rbd_dev->image_id); 2002589d30e0SAlex Elder } 2003589d30e0SAlex Elder 200434b13184SAlex Elder /* 200534b13184SAlex Elder * Shows the name of the currently-mapped snapshot (or 200634b13184SAlex Elder * RBD_SNAP_HEAD_NAME for the base image). 200734b13184SAlex Elder */ 2008dfc5606dSYehuda Sadeh static ssize_t rbd_snap_show(struct device *dev, 2009dfc5606dSYehuda Sadeh struct device_attribute *attr, 2010dfc5606dSYehuda Sadeh char *buf) 2011dfc5606dSYehuda Sadeh { 2012593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 2013dfc5606dSYehuda Sadeh 2014f84344f3SAlex Elder return sprintf(buf, "%s\n", rbd_dev->mapping.snap_name); 2015dfc5606dSYehuda Sadeh } 2016dfc5606dSYehuda Sadeh 2017dfc5606dSYehuda Sadeh static ssize_t rbd_image_refresh(struct device *dev, 2018dfc5606dSYehuda Sadeh struct device_attribute *attr, 2019dfc5606dSYehuda Sadeh const char *buf, 2020dfc5606dSYehuda Sadeh size_t size) 2021dfc5606dSYehuda Sadeh { 2022593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 2023b813623aSAlex Elder int ret; 2024602adf40SYehuda Sadeh 2025117973fbSAlex Elder ret = rbd_dev_refresh(rbd_dev, NULL); 2026b813623aSAlex Elder 2027b813623aSAlex Elder return ret < 0 ? ret : size; 2028dfc5606dSYehuda Sadeh } 2029602adf40SYehuda Sadeh 2030dfc5606dSYehuda Sadeh static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL); 203134b13184SAlex Elder static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL); 2032dfc5606dSYehuda Sadeh static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL); 2033dfc5606dSYehuda Sadeh static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL); 2034dfc5606dSYehuda Sadeh static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL); 20359bb2f334SAlex Elder static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL); 2036dfc5606dSYehuda Sadeh static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL); 2037589d30e0SAlex Elder static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL); 2038dfc5606dSYehuda Sadeh static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh); 2039dfc5606dSYehuda Sadeh static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL); 2040dfc5606dSYehuda Sadeh 2041dfc5606dSYehuda Sadeh static struct attribute *rbd_attrs[] = { 2042dfc5606dSYehuda Sadeh &dev_attr_size.attr, 204334b13184SAlex Elder &dev_attr_features.attr, 2044dfc5606dSYehuda Sadeh &dev_attr_major.attr, 2045dfc5606dSYehuda Sadeh &dev_attr_client_id.attr, 2046dfc5606dSYehuda Sadeh &dev_attr_pool.attr, 20479bb2f334SAlex Elder &dev_attr_pool_id.attr, 2048dfc5606dSYehuda Sadeh &dev_attr_name.attr, 2049589d30e0SAlex Elder &dev_attr_image_id.attr, 2050dfc5606dSYehuda Sadeh &dev_attr_current_snap.attr, 2051dfc5606dSYehuda Sadeh &dev_attr_refresh.attr, 2052dfc5606dSYehuda Sadeh NULL 2053dfc5606dSYehuda Sadeh }; 2054dfc5606dSYehuda Sadeh 2055dfc5606dSYehuda Sadeh static struct attribute_group rbd_attr_group = { 2056dfc5606dSYehuda Sadeh .attrs = rbd_attrs, 2057dfc5606dSYehuda Sadeh }; 2058dfc5606dSYehuda Sadeh 2059dfc5606dSYehuda Sadeh static const struct attribute_group *rbd_attr_groups[] = { 2060dfc5606dSYehuda Sadeh &rbd_attr_group, 2061dfc5606dSYehuda Sadeh NULL 2062dfc5606dSYehuda Sadeh }; 2063dfc5606dSYehuda Sadeh 2064dfc5606dSYehuda Sadeh static void rbd_sysfs_dev_release(struct device *dev) 2065dfc5606dSYehuda Sadeh { 2066dfc5606dSYehuda Sadeh } 2067dfc5606dSYehuda Sadeh 2068dfc5606dSYehuda Sadeh static struct device_type rbd_device_type = { 2069dfc5606dSYehuda Sadeh .name = "rbd", 2070dfc5606dSYehuda Sadeh .groups = rbd_attr_groups, 2071dfc5606dSYehuda Sadeh .release = rbd_sysfs_dev_release, 2072dfc5606dSYehuda Sadeh }; 2073dfc5606dSYehuda Sadeh 2074dfc5606dSYehuda Sadeh 2075dfc5606dSYehuda Sadeh /* 2076dfc5606dSYehuda Sadeh sysfs - snapshots 2077dfc5606dSYehuda Sadeh */ 2078dfc5606dSYehuda Sadeh 2079dfc5606dSYehuda Sadeh static ssize_t rbd_snap_size_show(struct device *dev, 2080dfc5606dSYehuda Sadeh struct device_attribute *attr, 2081dfc5606dSYehuda Sadeh char *buf) 2082dfc5606dSYehuda Sadeh { 2083dfc5606dSYehuda Sadeh struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev); 2084dfc5606dSYehuda Sadeh 20853591538fSJosh Durgin return sprintf(buf, "%llu\n", (unsigned long long)snap->size); 2086dfc5606dSYehuda Sadeh } 2087dfc5606dSYehuda Sadeh 2088dfc5606dSYehuda Sadeh static ssize_t rbd_snap_id_show(struct device *dev, 2089dfc5606dSYehuda Sadeh struct device_attribute *attr, 2090dfc5606dSYehuda Sadeh char *buf) 2091dfc5606dSYehuda Sadeh { 2092dfc5606dSYehuda Sadeh struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev); 2093dfc5606dSYehuda Sadeh 2094593a9e7bSAlex Elder return sprintf(buf, "%llu\n", (unsigned long long)snap->id); 2095dfc5606dSYehuda Sadeh } 2096dfc5606dSYehuda Sadeh 209734b13184SAlex Elder static ssize_t rbd_snap_features_show(struct device *dev, 209834b13184SAlex Elder struct device_attribute *attr, 209934b13184SAlex Elder char *buf) 210034b13184SAlex Elder { 210134b13184SAlex Elder struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev); 210234b13184SAlex Elder 210334b13184SAlex Elder return sprintf(buf, "0x%016llx\n", 210434b13184SAlex Elder (unsigned long long) snap->features); 210534b13184SAlex Elder } 210634b13184SAlex Elder 2107dfc5606dSYehuda Sadeh static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL); 2108dfc5606dSYehuda Sadeh static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL); 210934b13184SAlex Elder static DEVICE_ATTR(snap_features, S_IRUGO, rbd_snap_features_show, NULL); 2110dfc5606dSYehuda Sadeh 2111dfc5606dSYehuda Sadeh static struct attribute *rbd_snap_attrs[] = { 2112dfc5606dSYehuda Sadeh &dev_attr_snap_size.attr, 2113dfc5606dSYehuda Sadeh &dev_attr_snap_id.attr, 211434b13184SAlex Elder &dev_attr_snap_features.attr, 2115dfc5606dSYehuda Sadeh NULL, 2116dfc5606dSYehuda Sadeh }; 2117dfc5606dSYehuda Sadeh 2118dfc5606dSYehuda Sadeh static struct attribute_group rbd_snap_attr_group = { 2119dfc5606dSYehuda Sadeh .attrs = rbd_snap_attrs, 2120dfc5606dSYehuda Sadeh }; 2121dfc5606dSYehuda Sadeh 2122dfc5606dSYehuda Sadeh static void rbd_snap_dev_release(struct device *dev) 2123dfc5606dSYehuda Sadeh { 2124dfc5606dSYehuda Sadeh struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev); 2125dfc5606dSYehuda Sadeh kfree(snap->name); 2126dfc5606dSYehuda Sadeh kfree(snap); 2127dfc5606dSYehuda Sadeh } 2128dfc5606dSYehuda Sadeh 2129dfc5606dSYehuda Sadeh static const struct attribute_group *rbd_snap_attr_groups[] = { 2130dfc5606dSYehuda Sadeh &rbd_snap_attr_group, 2131dfc5606dSYehuda Sadeh NULL 2132dfc5606dSYehuda Sadeh }; 2133dfc5606dSYehuda Sadeh 2134dfc5606dSYehuda Sadeh static struct device_type rbd_snap_device_type = { 2135dfc5606dSYehuda Sadeh .groups = rbd_snap_attr_groups, 2136dfc5606dSYehuda Sadeh .release = rbd_snap_dev_release, 2137dfc5606dSYehuda Sadeh }; 2138dfc5606dSYehuda Sadeh 2139304f6808SAlex Elder static bool rbd_snap_registered(struct rbd_snap *snap) 2140304f6808SAlex Elder { 2141304f6808SAlex Elder bool ret = snap->dev.type == &rbd_snap_device_type; 2142304f6808SAlex Elder bool reg = device_is_registered(&snap->dev); 2143304f6808SAlex Elder 2144304f6808SAlex Elder rbd_assert(!ret ^ reg); 2145304f6808SAlex Elder 2146304f6808SAlex Elder return ret; 2147304f6808SAlex Elder } 2148304f6808SAlex Elder 214914e7085dSAlex Elder static void __rbd_remove_snap_dev(struct rbd_snap *snap) 2150dfc5606dSYehuda Sadeh { 2151dfc5606dSYehuda Sadeh list_del(&snap->node); 2152304f6808SAlex Elder if (device_is_registered(&snap->dev)) 2153dfc5606dSYehuda Sadeh device_unregister(&snap->dev); 2154dfc5606dSYehuda Sadeh } 2155dfc5606dSYehuda Sadeh 215614e7085dSAlex Elder static int rbd_register_snap_dev(struct rbd_snap *snap, 2157dfc5606dSYehuda Sadeh struct device *parent) 2158dfc5606dSYehuda Sadeh { 2159dfc5606dSYehuda Sadeh struct device *dev = &snap->dev; 2160dfc5606dSYehuda Sadeh int ret; 2161dfc5606dSYehuda Sadeh 2162dfc5606dSYehuda Sadeh dev->type = &rbd_snap_device_type; 2163dfc5606dSYehuda Sadeh dev->parent = parent; 2164dfc5606dSYehuda Sadeh dev->release = rbd_snap_dev_release; 2165d4b125e9SAlex Elder dev_set_name(dev, "%s%s", RBD_SNAP_DEV_NAME_PREFIX, snap->name); 2166304f6808SAlex Elder dout("%s: registering device for snapshot %s\n", __func__, snap->name); 2167304f6808SAlex Elder 2168dfc5606dSYehuda Sadeh ret = device_register(dev); 2169dfc5606dSYehuda Sadeh 2170dfc5606dSYehuda Sadeh return ret; 2171dfc5606dSYehuda Sadeh } 2172dfc5606dSYehuda Sadeh 21734e891e0aSAlex Elder static struct rbd_snap *__rbd_add_snap_dev(struct rbd_device *rbd_dev, 2174c8d18425SAlex Elder const char *snap_name, 217534b13184SAlex Elder u64 snap_id, u64 snap_size, 217634b13184SAlex Elder u64 snap_features) 2177dfc5606dSYehuda Sadeh { 21784e891e0aSAlex Elder struct rbd_snap *snap; 2179dfc5606dSYehuda Sadeh int ret; 21804e891e0aSAlex Elder 21814e891e0aSAlex Elder snap = kzalloc(sizeof (*snap), GFP_KERNEL); 2182dfc5606dSYehuda Sadeh if (!snap) 21834e891e0aSAlex Elder return ERR_PTR(-ENOMEM); 21844e891e0aSAlex Elder 21854e891e0aSAlex Elder ret = -ENOMEM; 2186c8d18425SAlex Elder snap->name = kstrdup(snap_name, GFP_KERNEL); 21874e891e0aSAlex Elder if (!snap->name) 21884e891e0aSAlex Elder goto err; 21894e891e0aSAlex Elder 2190c8d18425SAlex Elder snap->id = snap_id; 2191c8d18425SAlex Elder snap->size = snap_size; 219234b13184SAlex Elder snap->features = snap_features; 21934e891e0aSAlex Elder 21944e891e0aSAlex Elder return snap; 21954e891e0aSAlex Elder 2196dfc5606dSYehuda Sadeh err: 2197dfc5606dSYehuda Sadeh kfree(snap->name); 2198dfc5606dSYehuda Sadeh kfree(snap); 21994e891e0aSAlex Elder 22004e891e0aSAlex Elder return ERR_PTR(ret); 2201dfc5606dSYehuda Sadeh } 2202dfc5606dSYehuda Sadeh 2203cd892126SAlex Elder static char *rbd_dev_v1_snap_info(struct rbd_device *rbd_dev, u32 which, 2204cd892126SAlex Elder u64 *snap_size, u64 *snap_features) 2205cd892126SAlex Elder { 2206cd892126SAlex Elder char *snap_name; 2207cd892126SAlex Elder 2208cd892126SAlex Elder rbd_assert(which < rbd_dev->header.snapc->num_snaps); 2209cd892126SAlex Elder 2210cd892126SAlex Elder *snap_size = rbd_dev->header.snap_sizes[which]; 2211cd892126SAlex Elder *snap_features = 0; /* No features for v1 */ 2212cd892126SAlex Elder 2213cd892126SAlex Elder /* Skip over names until we find the one we are looking for */ 2214cd892126SAlex Elder 2215cd892126SAlex Elder snap_name = rbd_dev->header.snap_names; 2216cd892126SAlex Elder while (which--) 2217cd892126SAlex Elder snap_name += strlen(snap_name) + 1; 2218cd892126SAlex Elder 2219cd892126SAlex Elder return snap_name; 2220cd892126SAlex Elder } 2221cd892126SAlex Elder 2222dfc5606dSYehuda Sadeh /* 22239d475de5SAlex Elder * Get the size and object order for an image snapshot, or if 22249d475de5SAlex Elder * snap_id is CEPH_NOSNAP, gets this information for the base 22259d475de5SAlex Elder * image. 22269d475de5SAlex Elder */ 22279d475de5SAlex Elder static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id, 22289d475de5SAlex Elder u8 *order, u64 *snap_size) 22299d475de5SAlex Elder { 22309d475de5SAlex Elder __le64 snapid = cpu_to_le64(snap_id); 22319d475de5SAlex Elder int ret; 22329d475de5SAlex Elder struct { 22339d475de5SAlex Elder u8 order; 22349d475de5SAlex Elder __le64 size; 22359d475de5SAlex Elder } __attribute__ ((packed)) size_buf = { 0 }; 22369d475de5SAlex Elder 22379d475de5SAlex Elder ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name, 22389d475de5SAlex Elder "rbd", "get_size", 22399d475de5SAlex Elder (char *) &snapid, sizeof (snapid), 22409d475de5SAlex Elder (char *) &size_buf, sizeof (size_buf), 22419d475de5SAlex Elder CEPH_OSD_FLAG_READ, NULL); 22429d475de5SAlex Elder dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret); 22439d475de5SAlex Elder if (ret < 0) 22449d475de5SAlex Elder return ret; 22459d475de5SAlex Elder 22469d475de5SAlex Elder *order = size_buf.order; 22479d475de5SAlex Elder *snap_size = le64_to_cpu(size_buf.size); 22489d475de5SAlex Elder 22499d475de5SAlex Elder dout(" snap_id 0x%016llx order = %u, snap_size = %llu\n", 22509d475de5SAlex Elder (unsigned long long) snap_id, (unsigned int) *order, 22519d475de5SAlex Elder (unsigned long long) *snap_size); 22529d475de5SAlex Elder 22539d475de5SAlex Elder return 0; 22549d475de5SAlex Elder } 22559d475de5SAlex Elder 22569d475de5SAlex Elder static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev) 22579d475de5SAlex Elder { 22589d475de5SAlex Elder return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP, 22599d475de5SAlex Elder &rbd_dev->header.obj_order, 22609d475de5SAlex Elder &rbd_dev->header.image_size); 22619d475de5SAlex Elder } 22629d475de5SAlex Elder 22631e130199SAlex Elder static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev) 22641e130199SAlex Elder { 22651e130199SAlex Elder void *reply_buf; 22661e130199SAlex Elder int ret; 22671e130199SAlex Elder void *p; 22681e130199SAlex Elder 22691e130199SAlex Elder reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL); 22701e130199SAlex Elder if (!reply_buf) 22711e130199SAlex Elder return -ENOMEM; 22721e130199SAlex Elder 22731e130199SAlex Elder ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name, 22741e130199SAlex Elder "rbd", "get_object_prefix", 22751e130199SAlex Elder NULL, 0, 22761e130199SAlex Elder reply_buf, RBD_OBJ_PREFIX_LEN_MAX, 22771e130199SAlex Elder CEPH_OSD_FLAG_READ, NULL); 22781e130199SAlex Elder dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret); 22791e130199SAlex Elder if (ret < 0) 22801e130199SAlex Elder goto out; 2281a0ea3a40SAlex Elder ret = 0; /* rbd_req_sync_exec() can return positive */ 22821e130199SAlex Elder 22831e130199SAlex Elder p = reply_buf; 22841e130199SAlex Elder rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p, 22851e130199SAlex Elder p + RBD_OBJ_PREFIX_LEN_MAX, 22861e130199SAlex Elder NULL, GFP_NOIO); 22871e130199SAlex Elder 22881e130199SAlex Elder if (IS_ERR(rbd_dev->header.object_prefix)) { 22891e130199SAlex Elder ret = PTR_ERR(rbd_dev->header.object_prefix); 22901e130199SAlex Elder rbd_dev->header.object_prefix = NULL; 22911e130199SAlex Elder } else { 22921e130199SAlex Elder dout(" object_prefix = %s\n", rbd_dev->header.object_prefix); 22931e130199SAlex Elder } 22941e130199SAlex Elder 22951e130199SAlex Elder out: 22961e130199SAlex Elder kfree(reply_buf); 22971e130199SAlex Elder 22981e130199SAlex Elder return ret; 22991e130199SAlex Elder } 23001e130199SAlex Elder 2301b1b5402aSAlex Elder static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id, 2302b1b5402aSAlex Elder u64 *snap_features) 2303b1b5402aSAlex Elder { 2304b1b5402aSAlex Elder __le64 snapid = cpu_to_le64(snap_id); 2305b1b5402aSAlex Elder struct { 2306b1b5402aSAlex Elder __le64 features; 2307b1b5402aSAlex Elder __le64 incompat; 2308b1b5402aSAlex Elder } features_buf = { 0 }; 2309d889140cSAlex Elder u64 incompat; 2310b1b5402aSAlex Elder int ret; 2311b1b5402aSAlex Elder 2312b1b5402aSAlex Elder ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name, 2313b1b5402aSAlex Elder "rbd", "get_features", 2314b1b5402aSAlex Elder (char *) &snapid, sizeof (snapid), 2315b1b5402aSAlex Elder (char *) &features_buf, sizeof (features_buf), 2316b1b5402aSAlex Elder CEPH_OSD_FLAG_READ, NULL); 2317b1b5402aSAlex Elder dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret); 2318b1b5402aSAlex Elder if (ret < 0) 2319b1b5402aSAlex Elder return ret; 2320d889140cSAlex Elder 2321d889140cSAlex Elder incompat = le64_to_cpu(features_buf.incompat); 2322d889140cSAlex Elder if (incompat & ~RBD_FEATURES_ALL) 2323d889140cSAlex Elder return -ENOTSUPP; 2324d889140cSAlex Elder 2325b1b5402aSAlex Elder *snap_features = le64_to_cpu(features_buf.features); 2326b1b5402aSAlex Elder 2327b1b5402aSAlex Elder dout(" snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n", 2328b1b5402aSAlex Elder (unsigned long long) snap_id, 2329b1b5402aSAlex Elder (unsigned long long) *snap_features, 2330b1b5402aSAlex Elder (unsigned long long) le64_to_cpu(features_buf.incompat)); 2331b1b5402aSAlex Elder 2332b1b5402aSAlex Elder return 0; 2333b1b5402aSAlex Elder } 2334b1b5402aSAlex Elder 2335b1b5402aSAlex Elder static int rbd_dev_v2_features(struct rbd_device *rbd_dev) 2336b1b5402aSAlex Elder { 2337b1b5402aSAlex Elder return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP, 2338b1b5402aSAlex Elder &rbd_dev->header.features); 2339b1b5402aSAlex Elder } 2340b1b5402aSAlex Elder 23416e14b1a6SAlex Elder static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev, u64 *ver) 234235d489f9SAlex Elder { 234335d489f9SAlex Elder size_t size; 234435d489f9SAlex Elder int ret; 234535d489f9SAlex Elder void *reply_buf; 234635d489f9SAlex Elder void *p; 234735d489f9SAlex Elder void *end; 234835d489f9SAlex Elder u64 seq; 234935d489f9SAlex Elder u32 snap_count; 235035d489f9SAlex Elder struct ceph_snap_context *snapc; 235135d489f9SAlex Elder u32 i; 235235d489f9SAlex Elder 235335d489f9SAlex Elder /* 235435d489f9SAlex Elder * We'll need room for the seq value (maximum snapshot id), 235535d489f9SAlex Elder * snapshot count, and array of that many snapshot ids. 235635d489f9SAlex Elder * For now we have a fixed upper limit on the number we're 235735d489f9SAlex Elder * prepared to receive. 235835d489f9SAlex Elder */ 235935d489f9SAlex Elder size = sizeof (__le64) + sizeof (__le32) + 236035d489f9SAlex Elder RBD_MAX_SNAP_COUNT * sizeof (__le64); 236135d489f9SAlex Elder reply_buf = kzalloc(size, GFP_KERNEL); 236235d489f9SAlex Elder if (!reply_buf) 236335d489f9SAlex Elder return -ENOMEM; 236435d489f9SAlex Elder 236535d489f9SAlex Elder ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name, 236635d489f9SAlex Elder "rbd", "get_snapcontext", 236735d489f9SAlex Elder NULL, 0, 236835d489f9SAlex Elder reply_buf, size, 23696e14b1a6SAlex Elder CEPH_OSD_FLAG_READ, ver); 237035d489f9SAlex Elder dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret); 237135d489f9SAlex Elder if (ret < 0) 237235d489f9SAlex Elder goto out; 237335d489f9SAlex Elder 237435d489f9SAlex Elder ret = -ERANGE; 237535d489f9SAlex Elder p = reply_buf; 237635d489f9SAlex Elder end = (char *) reply_buf + size; 237735d489f9SAlex Elder ceph_decode_64_safe(&p, end, seq, out); 237835d489f9SAlex Elder ceph_decode_32_safe(&p, end, snap_count, out); 237935d489f9SAlex Elder 238035d489f9SAlex Elder /* 238135d489f9SAlex Elder * Make sure the reported number of snapshot ids wouldn't go 238235d489f9SAlex Elder * beyond the end of our buffer. But before checking that, 238335d489f9SAlex Elder * make sure the computed size of the snapshot context we 238435d489f9SAlex Elder * allocate is representable in a size_t. 238535d489f9SAlex Elder */ 238635d489f9SAlex Elder if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context)) 238735d489f9SAlex Elder / sizeof (u64)) { 238835d489f9SAlex Elder ret = -EINVAL; 238935d489f9SAlex Elder goto out; 239035d489f9SAlex Elder } 239135d489f9SAlex Elder if (!ceph_has_room(&p, end, snap_count * sizeof (__le64))) 239235d489f9SAlex Elder goto out; 239335d489f9SAlex Elder 239435d489f9SAlex Elder size = sizeof (struct ceph_snap_context) + 239535d489f9SAlex Elder snap_count * sizeof (snapc->snaps[0]); 239635d489f9SAlex Elder snapc = kmalloc(size, GFP_KERNEL); 239735d489f9SAlex Elder if (!snapc) { 239835d489f9SAlex Elder ret = -ENOMEM; 239935d489f9SAlex Elder goto out; 240035d489f9SAlex Elder } 240135d489f9SAlex Elder 240235d489f9SAlex Elder atomic_set(&snapc->nref, 1); 240335d489f9SAlex Elder snapc->seq = seq; 240435d489f9SAlex Elder snapc->num_snaps = snap_count; 240535d489f9SAlex Elder for (i = 0; i < snap_count; i++) 240635d489f9SAlex Elder snapc->snaps[i] = ceph_decode_64(&p); 240735d489f9SAlex Elder 240835d489f9SAlex Elder rbd_dev->header.snapc = snapc; 240935d489f9SAlex Elder 241035d489f9SAlex Elder dout(" snap context seq = %llu, snap_count = %u\n", 241135d489f9SAlex Elder (unsigned long long) seq, (unsigned int) snap_count); 241235d489f9SAlex Elder 241335d489f9SAlex Elder out: 241435d489f9SAlex Elder kfree(reply_buf); 241535d489f9SAlex Elder 241635d489f9SAlex Elder return 0; 241735d489f9SAlex Elder } 241835d489f9SAlex Elder 2419b8b1e2dbSAlex Elder static char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, u32 which) 2420b8b1e2dbSAlex Elder { 2421b8b1e2dbSAlex Elder size_t size; 2422b8b1e2dbSAlex Elder void *reply_buf; 2423b8b1e2dbSAlex Elder __le64 snap_id; 2424b8b1e2dbSAlex Elder int ret; 2425b8b1e2dbSAlex Elder void *p; 2426b8b1e2dbSAlex Elder void *end; 2427b8b1e2dbSAlex Elder size_t snap_name_len; 2428b8b1e2dbSAlex Elder char *snap_name; 2429b8b1e2dbSAlex Elder 2430b8b1e2dbSAlex Elder size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN; 2431b8b1e2dbSAlex Elder reply_buf = kmalloc(size, GFP_KERNEL); 2432b8b1e2dbSAlex Elder if (!reply_buf) 2433b8b1e2dbSAlex Elder return ERR_PTR(-ENOMEM); 2434b8b1e2dbSAlex Elder 2435b8b1e2dbSAlex Elder snap_id = cpu_to_le64(rbd_dev->header.snapc->snaps[which]); 2436b8b1e2dbSAlex Elder ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name, 2437b8b1e2dbSAlex Elder "rbd", "get_snapshot_name", 2438b8b1e2dbSAlex Elder (char *) &snap_id, sizeof (snap_id), 2439b8b1e2dbSAlex Elder reply_buf, size, 2440b8b1e2dbSAlex Elder CEPH_OSD_FLAG_READ, NULL); 2441b8b1e2dbSAlex Elder dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret); 2442b8b1e2dbSAlex Elder if (ret < 0) 2443b8b1e2dbSAlex Elder goto out; 2444b8b1e2dbSAlex Elder 2445b8b1e2dbSAlex Elder p = reply_buf; 2446b8b1e2dbSAlex Elder end = (char *) reply_buf + size; 2447b8b1e2dbSAlex Elder snap_name_len = 0; 2448b8b1e2dbSAlex Elder snap_name = ceph_extract_encoded_string(&p, end, &snap_name_len, 2449b8b1e2dbSAlex Elder GFP_KERNEL); 2450b8b1e2dbSAlex Elder if (IS_ERR(snap_name)) { 2451b8b1e2dbSAlex Elder ret = PTR_ERR(snap_name); 2452b8b1e2dbSAlex Elder goto out; 2453b8b1e2dbSAlex Elder } else { 2454b8b1e2dbSAlex Elder dout(" snap_id 0x%016llx snap_name = %s\n", 2455b8b1e2dbSAlex Elder (unsigned long long) le64_to_cpu(snap_id), snap_name); 2456b8b1e2dbSAlex Elder } 2457b8b1e2dbSAlex Elder kfree(reply_buf); 2458b8b1e2dbSAlex Elder 2459b8b1e2dbSAlex Elder return snap_name; 2460b8b1e2dbSAlex Elder out: 2461b8b1e2dbSAlex Elder kfree(reply_buf); 2462b8b1e2dbSAlex Elder 2463b8b1e2dbSAlex Elder return ERR_PTR(ret); 2464b8b1e2dbSAlex Elder } 2465b8b1e2dbSAlex Elder 2466b8b1e2dbSAlex Elder static char *rbd_dev_v2_snap_info(struct rbd_device *rbd_dev, u32 which, 2467b8b1e2dbSAlex Elder u64 *snap_size, u64 *snap_features) 2468b8b1e2dbSAlex Elder { 2469b8b1e2dbSAlex Elder __le64 snap_id; 2470b8b1e2dbSAlex Elder u8 order; 2471b8b1e2dbSAlex Elder int ret; 2472b8b1e2dbSAlex Elder 2473b8b1e2dbSAlex Elder snap_id = rbd_dev->header.snapc->snaps[which]; 2474b8b1e2dbSAlex Elder ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, &order, snap_size); 2475b8b1e2dbSAlex Elder if (ret) 2476b8b1e2dbSAlex Elder return ERR_PTR(ret); 2477b8b1e2dbSAlex Elder ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, snap_features); 2478b8b1e2dbSAlex Elder if (ret) 2479b8b1e2dbSAlex Elder return ERR_PTR(ret); 2480b8b1e2dbSAlex Elder 2481b8b1e2dbSAlex Elder return rbd_dev_v2_snap_name(rbd_dev, which); 2482b8b1e2dbSAlex Elder } 2483b8b1e2dbSAlex Elder 2484b8b1e2dbSAlex Elder static char *rbd_dev_snap_info(struct rbd_device *rbd_dev, u32 which, 2485b8b1e2dbSAlex Elder u64 *snap_size, u64 *snap_features) 2486b8b1e2dbSAlex Elder { 2487b8b1e2dbSAlex Elder if (rbd_dev->image_format == 1) 2488b8b1e2dbSAlex Elder return rbd_dev_v1_snap_info(rbd_dev, which, 2489b8b1e2dbSAlex Elder snap_size, snap_features); 2490b8b1e2dbSAlex Elder if (rbd_dev->image_format == 2) 2491b8b1e2dbSAlex Elder return rbd_dev_v2_snap_info(rbd_dev, which, 2492b8b1e2dbSAlex Elder snap_size, snap_features); 2493b8b1e2dbSAlex Elder return ERR_PTR(-EINVAL); 2494b8b1e2dbSAlex Elder } 2495b8b1e2dbSAlex Elder 2496117973fbSAlex Elder static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver) 2497117973fbSAlex Elder { 2498117973fbSAlex Elder int ret; 2499117973fbSAlex Elder __u8 obj_order; 2500117973fbSAlex Elder 2501117973fbSAlex Elder down_write(&rbd_dev->header_rwsem); 2502117973fbSAlex Elder 2503117973fbSAlex Elder /* Grab old order first, to see if it changes */ 2504117973fbSAlex Elder 2505117973fbSAlex Elder obj_order = rbd_dev->header.obj_order, 2506117973fbSAlex Elder ret = rbd_dev_v2_image_size(rbd_dev); 2507117973fbSAlex Elder if (ret) 2508117973fbSAlex Elder goto out; 2509117973fbSAlex Elder if (rbd_dev->header.obj_order != obj_order) { 2510117973fbSAlex Elder ret = -EIO; 2511117973fbSAlex Elder goto out; 2512117973fbSAlex Elder } 2513117973fbSAlex Elder rbd_update_mapping_size(rbd_dev); 2514117973fbSAlex Elder 2515117973fbSAlex Elder ret = rbd_dev_v2_snap_context(rbd_dev, hver); 2516117973fbSAlex Elder dout("rbd_dev_v2_snap_context returned %d\n", ret); 2517117973fbSAlex Elder if (ret) 2518117973fbSAlex Elder goto out; 2519117973fbSAlex Elder ret = rbd_dev_snaps_update(rbd_dev); 2520117973fbSAlex Elder dout("rbd_dev_snaps_update returned %d\n", ret); 2521117973fbSAlex Elder if (ret) 2522117973fbSAlex Elder goto out; 2523117973fbSAlex Elder ret = rbd_dev_snaps_register(rbd_dev); 2524117973fbSAlex Elder dout("rbd_dev_snaps_register returned %d\n", ret); 2525117973fbSAlex Elder out: 2526117973fbSAlex Elder up_write(&rbd_dev->header_rwsem); 2527117973fbSAlex Elder 2528117973fbSAlex Elder return ret; 2529117973fbSAlex Elder } 2530117973fbSAlex Elder 25319d475de5SAlex Elder /* 253235938150SAlex Elder * Scan the rbd device's current snapshot list and compare it to the 253335938150SAlex Elder * newly-received snapshot context. Remove any existing snapshots 253435938150SAlex Elder * not present in the new snapshot context. Add a new snapshot for 253535938150SAlex Elder * any snaphots in the snapshot context not in the current list. 253635938150SAlex Elder * And verify there are no changes to snapshots we already know 253735938150SAlex Elder * about. 253835938150SAlex Elder * 253935938150SAlex Elder * Assumes the snapshots in the snapshot context are sorted by 254035938150SAlex Elder * snapshot id, highest id first. (Snapshots in the rbd_dev's list 254135938150SAlex Elder * are also maintained in that order.) 2542dfc5606dSYehuda Sadeh */ 2543304f6808SAlex Elder static int rbd_dev_snaps_update(struct rbd_device *rbd_dev) 2544dfc5606dSYehuda Sadeh { 254535938150SAlex Elder struct ceph_snap_context *snapc = rbd_dev->header.snapc; 254635938150SAlex Elder const u32 snap_count = snapc->num_snaps; 254735938150SAlex Elder struct list_head *head = &rbd_dev->snaps; 254835938150SAlex Elder struct list_head *links = head->next; 254935938150SAlex Elder u32 index = 0; 2550dfc5606dSYehuda Sadeh 25519fcbb800SAlex Elder dout("%s: snap count is %u\n", __func__, (unsigned int) snap_count); 255235938150SAlex Elder while (index < snap_count || links != head) { 255335938150SAlex Elder u64 snap_id; 255435938150SAlex Elder struct rbd_snap *snap; 2555cd892126SAlex Elder char *snap_name; 2556cd892126SAlex Elder u64 snap_size = 0; 2557cd892126SAlex Elder u64 snap_features = 0; 2558dfc5606dSYehuda Sadeh 255935938150SAlex Elder snap_id = index < snap_count ? snapc->snaps[index] 256035938150SAlex Elder : CEPH_NOSNAP; 256135938150SAlex Elder snap = links != head ? list_entry(links, struct rbd_snap, node) 256235938150SAlex Elder : NULL; 2563aafb230eSAlex Elder rbd_assert(!snap || snap->id != CEPH_NOSNAP); 2564dfc5606dSYehuda Sadeh 256535938150SAlex Elder if (snap_id == CEPH_NOSNAP || (snap && snap->id > snap_id)) { 256635938150SAlex Elder struct list_head *next = links->next; 2567dfc5606dSYehuda Sadeh 256835938150SAlex Elder /* Existing snapshot not in the new snap context */ 2569dfc5606dSYehuda Sadeh 2570f84344f3SAlex Elder if (rbd_dev->mapping.snap_id == snap->id) 2571f84344f3SAlex Elder rbd_dev->mapping.snap_exists = false; 257235938150SAlex Elder __rbd_remove_snap_dev(snap); 25739fcbb800SAlex Elder dout("%ssnap id %llu has been removed\n", 2574f84344f3SAlex Elder rbd_dev->mapping.snap_id == snap->id ? 2575f84344f3SAlex Elder "mapped " : "", 25769fcbb800SAlex Elder (unsigned long long) snap->id); 2577dfc5606dSYehuda Sadeh 257835938150SAlex Elder /* Done with this list entry; advance */ 257935938150SAlex Elder 258035938150SAlex Elder links = next; 258135938150SAlex Elder continue; 2582dfc5606dSYehuda Sadeh } 258335938150SAlex Elder 2584b8b1e2dbSAlex Elder snap_name = rbd_dev_snap_info(rbd_dev, index, 2585cd892126SAlex Elder &snap_size, &snap_features); 2586cd892126SAlex Elder if (IS_ERR(snap_name)) 2587cd892126SAlex Elder return PTR_ERR(snap_name); 2588cd892126SAlex Elder 25899fcbb800SAlex Elder dout("entry %u: snap_id = %llu\n", (unsigned int) snap_count, 25909fcbb800SAlex Elder (unsigned long long) snap_id); 259135938150SAlex Elder if (!snap || (snap_id != CEPH_NOSNAP && snap->id < snap_id)) { 259235938150SAlex Elder struct rbd_snap *new_snap; 259335938150SAlex Elder 259435938150SAlex Elder /* We haven't seen this snapshot before */ 259535938150SAlex Elder 2596c8d18425SAlex Elder new_snap = __rbd_add_snap_dev(rbd_dev, snap_name, 2597cd892126SAlex Elder snap_id, snap_size, snap_features); 25989fcbb800SAlex Elder if (IS_ERR(new_snap)) { 25999fcbb800SAlex Elder int err = PTR_ERR(new_snap); 26009fcbb800SAlex Elder 26019fcbb800SAlex Elder dout(" failed to add dev, error %d\n", err); 26029fcbb800SAlex Elder 26039fcbb800SAlex Elder return err; 26049fcbb800SAlex Elder } 260535938150SAlex Elder 260635938150SAlex Elder /* New goes before existing, or at end of list */ 260735938150SAlex Elder 26089fcbb800SAlex Elder dout(" added dev%s\n", snap ? "" : " at end\n"); 260935938150SAlex Elder if (snap) 261035938150SAlex Elder list_add_tail(&new_snap->node, &snap->node); 261135938150SAlex Elder else 2612523f3258SAlex Elder list_add_tail(&new_snap->node, head); 261335938150SAlex Elder } else { 261435938150SAlex Elder /* Already have this one */ 261535938150SAlex Elder 26169fcbb800SAlex Elder dout(" already present\n"); 26179fcbb800SAlex Elder 2618cd892126SAlex Elder rbd_assert(snap->size == snap_size); 2619aafb230eSAlex Elder rbd_assert(!strcmp(snap->name, snap_name)); 2620cd892126SAlex Elder rbd_assert(snap->features == snap_features); 262135938150SAlex Elder 262235938150SAlex Elder /* Done with this list entry; advance */ 262335938150SAlex Elder 262435938150SAlex Elder links = links->next; 2625dfc5606dSYehuda Sadeh } 262635938150SAlex Elder 262735938150SAlex Elder /* Advance to the next entry in the snapshot context */ 262835938150SAlex Elder 262935938150SAlex Elder index++; 2630dfc5606dSYehuda Sadeh } 26319fcbb800SAlex Elder dout("%s: done\n", __func__); 2632dfc5606dSYehuda Sadeh 2633dfc5606dSYehuda Sadeh return 0; 2634dfc5606dSYehuda Sadeh } 2635dfc5606dSYehuda Sadeh 2636304f6808SAlex Elder /* 2637304f6808SAlex Elder * Scan the list of snapshots and register the devices for any that 2638304f6808SAlex Elder * have not already been registered. 2639304f6808SAlex Elder */ 2640304f6808SAlex Elder static int rbd_dev_snaps_register(struct rbd_device *rbd_dev) 2641304f6808SAlex Elder { 2642304f6808SAlex Elder struct rbd_snap *snap; 2643304f6808SAlex Elder int ret = 0; 2644304f6808SAlex Elder 2645304f6808SAlex Elder dout("%s called\n", __func__); 264686ff77bbSAlex Elder if (WARN_ON(!device_is_registered(&rbd_dev->dev))) 264786ff77bbSAlex Elder return -EIO; 2648304f6808SAlex Elder 2649304f6808SAlex Elder list_for_each_entry(snap, &rbd_dev->snaps, node) { 2650304f6808SAlex Elder if (!rbd_snap_registered(snap)) { 2651304f6808SAlex Elder ret = rbd_register_snap_dev(snap, &rbd_dev->dev); 2652304f6808SAlex Elder if (ret < 0) 2653304f6808SAlex Elder break; 2654304f6808SAlex Elder } 2655304f6808SAlex Elder } 2656304f6808SAlex Elder dout("%s: returning %d\n", __func__, ret); 2657304f6808SAlex Elder 2658304f6808SAlex Elder return ret; 2659304f6808SAlex Elder } 2660304f6808SAlex Elder 2661dfc5606dSYehuda Sadeh static int rbd_bus_add_dev(struct rbd_device *rbd_dev) 2662dfc5606dSYehuda Sadeh { 2663dfc5606dSYehuda Sadeh struct device *dev; 2664cd789ab9SAlex Elder int ret; 2665dfc5606dSYehuda Sadeh 2666dfc5606dSYehuda Sadeh mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); 2667dfc5606dSYehuda Sadeh 2668cd789ab9SAlex Elder dev = &rbd_dev->dev; 2669dfc5606dSYehuda Sadeh dev->bus = &rbd_bus_type; 2670dfc5606dSYehuda Sadeh dev->type = &rbd_device_type; 2671dfc5606dSYehuda Sadeh dev->parent = &rbd_root_dev; 2672dfc5606dSYehuda Sadeh dev->release = rbd_dev_release; 2673de71a297SAlex Elder dev_set_name(dev, "%d", rbd_dev->dev_id); 2674dfc5606dSYehuda Sadeh ret = device_register(dev); 2675dfc5606dSYehuda Sadeh 2676dfc5606dSYehuda Sadeh mutex_unlock(&ctl_mutex); 2677cd789ab9SAlex Elder 2678dfc5606dSYehuda Sadeh return ret; 2679602adf40SYehuda Sadeh } 2680602adf40SYehuda Sadeh 2681dfc5606dSYehuda Sadeh static void rbd_bus_del_dev(struct rbd_device *rbd_dev) 2682dfc5606dSYehuda Sadeh { 2683dfc5606dSYehuda Sadeh device_unregister(&rbd_dev->dev); 2684dfc5606dSYehuda Sadeh } 2685dfc5606dSYehuda Sadeh 268659c2be1eSYehuda Sadeh static int rbd_init_watch_dev(struct rbd_device *rbd_dev) 268759c2be1eSYehuda Sadeh { 268859c2be1eSYehuda Sadeh int ret, rc; 268959c2be1eSYehuda Sadeh 269059c2be1eSYehuda Sadeh do { 26910e6f322dSAlex Elder ret = rbd_req_sync_watch(rbd_dev); 269259c2be1eSYehuda Sadeh if (ret == -ERANGE) { 2693117973fbSAlex Elder rc = rbd_dev_refresh(rbd_dev, NULL); 269459c2be1eSYehuda Sadeh if (rc < 0) 269559c2be1eSYehuda Sadeh return rc; 269659c2be1eSYehuda Sadeh } 269759c2be1eSYehuda Sadeh } while (ret == -ERANGE); 269859c2be1eSYehuda Sadeh 269959c2be1eSYehuda Sadeh return ret; 270059c2be1eSYehuda Sadeh } 270159c2be1eSYehuda Sadeh 2702e2839308SAlex Elder static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0); 27031ddbe94eSAlex Elder 27041ddbe94eSAlex Elder /* 2705499afd5bSAlex Elder * Get a unique rbd identifier for the given new rbd_dev, and add 2706499afd5bSAlex Elder * the rbd_dev to the global list. The minimum rbd id is 1. 27071ddbe94eSAlex Elder */ 2708e2839308SAlex Elder static void rbd_dev_id_get(struct rbd_device *rbd_dev) 2709b7f23c36SAlex Elder { 2710e2839308SAlex Elder rbd_dev->dev_id = atomic64_inc_return(&rbd_dev_id_max); 2711499afd5bSAlex Elder 2712499afd5bSAlex Elder spin_lock(&rbd_dev_list_lock); 2713499afd5bSAlex Elder list_add_tail(&rbd_dev->node, &rbd_dev_list); 2714499afd5bSAlex Elder spin_unlock(&rbd_dev_list_lock); 2715e2839308SAlex Elder dout("rbd_dev %p given dev id %llu\n", rbd_dev, 2716e2839308SAlex Elder (unsigned long long) rbd_dev->dev_id); 2717b7f23c36SAlex Elder } 2718b7f23c36SAlex Elder 27191ddbe94eSAlex Elder /* 2720499afd5bSAlex Elder * Remove an rbd_dev from the global list, and record that its 2721499afd5bSAlex Elder * identifier is no longer in use. 27221ddbe94eSAlex Elder */ 2723e2839308SAlex Elder static void rbd_dev_id_put(struct rbd_device *rbd_dev) 27241ddbe94eSAlex Elder { 2725d184f6bfSAlex Elder struct list_head *tmp; 2726de71a297SAlex Elder int rbd_id = rbd_dev->dev_id; 2727d184f6bfSAlex Elder int max_id; 2728d184f6bfSAlex Elder 2729aafb230eSAlex Elder rbd_assert(rbd_id > 0); 2730499afd5bSAlex Elder 2731e2839308SAlex Elder dout("rbd_dev %p released dev id %llu\n", rbd_dev, 2732e2839308SAlex Elder (unsigned long long) rbd_dev->dev_id); 2733499afd5bSAlex Elder spin_lock(&rbd_dev_list_lock); 2734499afd5bSAlex Elder list_del_init(&rbd_dev->node); 2735d184f6bfSAlex Elder 2736d184f6bfSAlex Elder /* 2737d184f6bfSAlex Elder * If the id being "put" is not the current maximum, there 2738d184f6bfSAlex Elder * is nothing special we need to do. 2739d184f6bfSAlex Elder */ 2740e2839308SAlex Elder if (rbd_id != atomic64_read(&rbd_dev_id_max)) { 2741d184f6bfSAlex Elder spin_unlock(&rbd_dev_list_lock); 2742d184f6bfSAlex Elder return; 2743d184f6bfSAlex Elder } 2744d184f6bfSAlex Elder 2745d184f6bfSAlex Elder /* 2746d184f6bfSAlex Elder * We need to update the current maximum id. Search the 2747d184f6bfSAlex Elder * list to find out what it is. We're more likely to find 2748d184f6bfSAlex Elder * the maximum at the end, so search the list backward. 2749d184f6bfSAlex Elder */ 2750d184f6bfSAlex Elder max_id = 0; 2751d184f6bfSAlex Elder list_for_each_prev(tmp, &rbd_dev_list) { 2752d184f6bfSAlex Elder struct rbd_device *rbd_dev; 2753d184f6bfSAlex Elder 2754d184f6bfSAlex Elder rbd_dev = list_entry(tmp, struct rbd_device, node); 2755b213e0b1SAlex Elder if (rbd_dev->dev_id > max_id) 2756b213e0b1SAlex Elder max_id = rbd_dev->dev_id; 2757d184f6bfSAlex Elder } 2758499afd5bSAlex Elder spin_unlock(&rbd_dev_list_lock); 27591ddbe94eSAlex Elder 27601ddbe94eSAlex Elder /* 2761e2839308SAlex Elder * The max id could have been updated by rbd_dev_id_get(), in 2762d184f6bfSAlex Elder * which case it now accurately reflects the new maximum. 2763d184f6bfSAlex Elder * Be careful not to overwrite the maximum value in that 2764d184f6bfSAlex Elder * case. 27651ddbe94eSAlex Elder */ 2766e2839308SAlex Elder atomic64_cmpxchg(&rbd_dev_id_max, rbd_id, max_id); 2767e2839308SAlex Elder dout(" max dev id has been reset\n"); 2768b7f23c36SAlex Elder } 2769b7f23c36SAlex Elder 2770a725f65eSAlex Elder /* 2771e28fff26SAlex Elder * Skips over white space at *buf, and updates *buf to point to the 2772e28fff26SAlex Elder * first found non-space character (if any). Returns the length of 2773593a9e7bSAlex Elder * the token (string of non-white space characters) found. Note 2774593a9e7bSAlex Elder * that *buf must be terminated with '\0'. 2775e28fff26SAlex Elder */ 2776e28fff26SAlex Elder static inline size_t next_token(const char **buf) 2777e28fff26SAlex Elder { 2778e28fff26SAlex Elder /* 2779e28fff26SAlex Elder * These are the characters that produce nonzero for 2780e28fff26SAlex Elder * isspace() in the "C" and "POSIX" locales. 2781e28fff26SAlex Elder */ 2782e28fff26SAlex Elder const char *spaces = " \f\n\r\t\v"; 2783e28fff26SAlex Elder 2784e28fff26SAlex Elder *buf += strspn(*buf, spaces); /* Find start of token */ 2785e28fff26SAlex Elder 2786e28fff26SAlex Elder return strcspn(*buf, spaces); /* Return token length */ 2787e28fff26SAlex Elder } 2788e28fff26SAlex Elder 2789e28fff26SAlex Elder /* 2790e28fff26SAlex Elder * Finds the next token in *buf, and if the provided token buffer is 2791e28fff26SAlex Elder * big enough, copies the found token into it. The result, if 2792593a9e7bSAlex Elder * copied, is guaranteed to be terminated with '\0'. Note that *buf 2793593a9e7bSAlex Elder * must be terminated with '\0' on entry. 2794e28fff26SAlex Elder * 2795e28fff26SAlex Elder * Returns the length of the token found (not including the '\0'). 2796e28fff26SAlex Elder * Return value will be 0 if no token is found, and it will be >= 2797e28fff26SAlex Elder * token_size if the token would not fit. 2798e28fff26SAlex Elder * 2799593a9e7bSAlex Elder * The *buf pointer will be updated to point beyond the end of the 2800e28fff26SAlex Elder * found token. Note that this occurs even if the token buffer is 2801e28fff26SAlex Elder * too small to hold it. 2802e28fff26SAlex Elder */ 2803e28fff26SAlex Elder static inline size_t copy_token(const char **buf, 2804e28fff26SAlex Elder char *token, 2805e28fff26SAlex Elder size_t token_size) 2806e28fff26SAlex Elder { 2807e28fff26SAlex Elder size_t len; 2808e28fff26SAlex Elder 2809e28fff26SAlex Elder len = next_token(buf); 2810e28fff26SAlex Elder if (len < token_size) { 2811e28fff26SAlex Elder memcpy(token, *buf, len); 2812e28fff26SAlex Elder *(token + len) = '\0'; 2813e28fff26SAlex Elder } 2814e28fff26SAlex Elder *buf += len; 2815e28fff26SAlex Elder 2816e28fff26SAlex Elder return len; 2817e28fff26SAlex Elder } 2818e28fff26SAlex Elder 2819e28fff26SAlex Elder /* 2820ea3352f4SAlex Elder * Finds the next token in *buf, dynamically allocates a buffer big 2821ea3352f4SAlex Elder * enough to hold a copy of it, and copies the token into the new 2822ea3352f4SAlex Elder * buffer. The copy is guaranteed to be terminated with '\0'. Note 2823ea3352f4SAlex Elder * that a duplicate buffer is created even for a zero-length token. 2824ea3352f4SAlex Elder * 2825ea3352f4SAlex Elder * Returns a pointer to the newly-allocated duplicate, or a null 2826ea3352f4SAlex Elder * pointer if memory for the duplicate was not available. If 2827ea3352f4SAlex Elder * the lenp argument is a non-null pointer, the length of the token 2828ea3352f4SAlex Elder * (not including the '\0') is returned in *lenp. 2829ea3352f4SAlex Elder * 2830ea3352f4SAlex Elder * If successful, the *buf pointer will be updated to point beyond 2831ea3352f4SAlex Elder * the end of the found token. 2832ea3352f4SAlex Elder * 2833ea3352f4SAlex Elder * Note: uses GFP_KERNEL for allocation. 2834ea3352f4SAlex Elder */ 2835ea3352f4SAlex Elder static inline char *dup_token(const char **buf, size_t *lenp) 2836ea3352f4SAlex Elder { 2837ea3352f4SAlex Elder char *dup; 2838ea3352f4SAlex Elder size_t len; 2839ea3352f4SAlex Elder 2840ea3352f4SAlex Elder len = next_token(buf); 2841ea3352f4SAlex Elder dup = kmalloc(len + 1, GFP_KERNEL); 2842ea3352f4SAlex Elder if (!dup) 2843ea3352f4SAlex Elder return NULL; 2844ea3352f4SAlex Elder 2845ea3352f4SAlex Elder memcpy(dup, *buf, len); 2846ea3352f4SAlex Elder *(dup + len) = '\0'; 2847ea3352f4SAlex Elder *buf += len; 2848ea3352f4SAlex Elder 2849ea3352f4SAlex Elder if (lenp) 2850ea3352f4SAlex Elder *lenp = len; 2851ea3352f4SAlex Elder 2852ea3352f4SAlex Elder return dup; 2853ea3352f4SAlex Elder } 2854ea3352f4SAlex Elder 2855ea3352f4SAlex Elder /* 28563feeb894SAlex Elder * This fills in the pool_name, image_name, image_name_len, rbd_dev, 28573feeb894SAlex Elder * rbd_md_name, and name fields of the given rbd_dev, based on the 28583feeb894SAlex Elder * list of monitor addresses and other options provided via 28593feeb894SAlex Elder * /sys/bus/rbd/add. Returns a pointer to a dynamically-allocated 28603feeb894SAlex Elder * copy of the snapshot name to map if successful, or a 28613feeb894SAlex Elder * pointer-coded error otherwise. 2862d22f76e7SAlex Elder * 2863d22f76e7SAlex Elder * Note: rbd_dev is assumed to have been initially zero-filled. 2864a725f65eSAlex Elder */ 28653feeb894SAlex Elder static char *rbd_add_parse_args(struct rbd_device *rbd_dev, 2866a725f65eSAlex Elder const char *buf, 28677ef3214aSAlex Elder const char **mon_addrs, 28685214ecc4SAlex Elder size_t *mon_addrs_size, 2869e28fff26SAlex Elder char *options, 2870e28fff26SAlex Elder size_t options_size) 2871a725f65eSAlex Elder { 2872e28fff26SAlex Elder size_t len; 28733feeb894SAlex Elder char *err_ptr = ERR_PTR(-EINVAL); 28743feeb894SAlex Elder char *snap_name; 2875e28fff26SAlex Elder 2876e28fff26SAlex Elder /* The first four tokens are required */ 2877e28fff26SAlex Elder 28787ef3214aSAlex Elder len = next_token(&buf); 28797ef3214aSAlex Elder if (!len) 28803feeb894SAlex Elder return err_ptr; 28815214ecc4SAlex Elder *mon_addrs_size = len + 1; 28827ef3214aSAlex Elder *mon_addrs = buf; 28837ef3214aSAlex Elder 28847ef3214aSAlex Elder buf += len; 2885a725f65eSAlex Elder 2886e28fff26SAlex Elder len = copy_token(&buf, options, options_size); 2887e28fff26SAlex Elder if (!len || len >= options_size) 28883feeb894SAlex Elder return err_ptr; 2889a725f65eSAlex Elder 28903feeb894SAlex Elder err_ptr = ERR_PTR(-ENOMEM); 2891d22f76e7SAlex Elder rbd_dev->pool_name = dup_token(&buf, NULL); 2892d22f76e7SAlex Elder if (!rbd_dev->pool_name) 2893d22f76e7SAlex Elder goto out_err; 2894e28fff26SAlex Elder 28950bed54dcSAlex Elder rbd_dev->image_name = dup_token(&buf, &rbd_dev->image_name_len); 28960bed54dcSAlex Elder if (!rbd_dev->image_name) 2897bf3e5ae1SAlex Elder goto out_err; 2898e28fff26SAlex Elder 2899d4b125e9SAlex Elder /* Snapshot name is optional; default is to use "head" */ 2900d4b125e9SAlex Elder 29013feeb894SAlex Elder len = next_token(&buf); 2902d4b125e9SAlex Elder if (len > RBD_MAX_SNAP_NAME_LEN) { 2903d4b125e9SAlex Elder err_ptr = ERR_PTR(-ENAMETOOLONG); 2904d4b125e9SAlex Elder goto out_err; 2905d4b125e9SAlex Elder } 2906820a5f3eSAlex Elder if (!len) { 29073feeb894SAlex Elder buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */ 29083feeb894SAlex Elder len = sizeof (RBD_SNAP_HEAD_NAME) - 1; 2909849b4260SAlex Elder } 29103feeb894SAlex Elder snap_name = kmalloc(len + 1, GFP_KERNEL); 29113feeb894SAlex Elder if (!snap_name) 29123feeb894SAlex Elder goto out_err; 29133feeb894SAlex Elder memcpy(snap_name, buf, len); 29143feeb894SAlex Elder *(snap_name + len) = '\0'; 2915e28fff26SAlex Elder 29163feeb894SAlex Elder return snap_name; 2917d22f76e7SAlex Elder 2918d22f76e7SAlex Elder out_err: 29190bed54dcSAlex Elder kfree(rbd_dev->image_name); 2920d78fd7aeSAlex Elder rbd_dev->image_name = NULL; 2921d78fd7aeSAlex Elder rbd_dev->image_name_len = 0; 2922d22f76e7SAlex Elder kfree(rbd_dev->pool_name); 2923d22f76e7SAlex Elder rbd_dev->pool_name = NULL; 2924d22f76e7SAlex Elder 29253feeb894SAlex Elder return err_ptr; 2926a725f65eSAlex Elder } 2927a725f65eSAlex Elder 2928589d30e0SAlex Elder /* 2929589d30e0SAlex Elder * An rbd format 2 image has a unique identifier, distinct from the 2930589d30e0SAlex Elder * name given to it by the user. Internally, that identifier is 2931589d30e0SAlex Elder * what's used to specify the names of objects related to the image. 2932589d30e0SAlex Elder * 2933589d30e0SAlex Elder * A special "rbd id" object is used to map an rbd image name to its 2934589d30e0SAlex Elder * id. If that object doesn't exist, then there is no v2 rbd image 2935589d30e0SAlex Elder * with the supplied name. 2936589d30e0SAlex Elder * 2937589d30e0SAlex Elder * This function will record the given rbd_dev's image_id field if 2938589d30e0SAlex Elder * it can be determined, and in that case will return 0. If any 2939589d30e0SAlex Elder * errors occur a negative errno will be returned and the rbd_dev's 2940589d30e0SAlex Elder * image_id field will be unchanged (and should be NULL). 2941589d30e0SAlex Elder */ 2942589d30e0SAlex Elder static int rbd_dev_image_id(struct rbd_device *rbd_dev) 2943589d30e0SAlex Elder { 2944589d30e0SAlex Elder int ret; 2945589d30e0SAlex Elder size_t size; 2946589d30e0SAlex Elder char *object_name; 2947589d30e0SAlex Elder void *response; 2948589d30e0SAlex Elder void *p; 2949589d30e0SAlex Elder 2950589d30e0SAlex Elder /* 2951589d30e0SAlex Elder * First, see if the format 2 image id file exists, and if 2952589d30e0SAlex Elder * so, get the image's persistent id from it. 2953589d30e0SAlex Elder */ 2954589d30e0SAlex Elder size = sizeof (RBD_ID_PREFIX) + rbd_dev->image_name_len; 2955589d30e0SAlex Elder object_name = kmalloc(size, GFP_NOIO); 2956589d30e0SAlex Elder if (!object_name) 2957589d30e0SAlex Elder return -ENOMEM; 2958589d30e0SAlex Elder sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->image_name); 2959589d30e0SAlex Elder dout("rbd id object name is %s\n", object_name); 2960589d30e0SAlex Elder 2961589d30e0SAlex Elder /* Response will be an encoded string, which includes a length */ 2962589d30e0SAlex Elder 2963589d30e0SAlex Elder size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX; 2964589d30e0SAlex Elder response = kzalloc(size, GFP_NOIO); 2965589d30e0SAlex Elder if (!response) { 2966589d30e0SAlex Elder ret = -ENOMEM; 2967589d30e0SAlex Elder goto out; 2968589d30e0SAlex Elder } 2969589d30e0SAlex Elder 2970589d30e0SAlex Elder ret = rbd_req_sync_exec(rbd_dev, object_name, 2971589d30e0SAlex Elder "rbd", "get_id", 2972589d30e0SAlex Elder NULL, 0, 2973589d30e0SAlex Elder response, RBD_IMAGE_ID_LEN_MAX, 2974589d30e0SAlex Elder CEPH_OSD_FLAG_READ, NULL); 2975589d30e0SAlex Elder dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret); 2976589d30e0SAlex Elder if (ret < 0) 2977589d30e0SAlex Elder goto out; 2978a0ea3a40SAlex Elder ret = 0; /* rbd_req_sync_exec() can return positive */ 2979589d30e0SAlex Elder 2980589d30e0SAlex Elder p = response; 2981589d30e0SAlex Elder rbd_dev->image_id = ceph_extract_encoded_string(&p, 2982589d30e0SAlex Elder p + RBD_IMAGE_ID_LEN_MAX, 2983589d30e0SAlex Elder &rbd_dev->image_id_len, 2984589d30e0SAlex Elder GFP_NOIO); 2985589d30e0SAlex Elder if (IS_ERR(rbd_dev->image_id)) { 2986589d30e0SAlex Elder ret = PTR_ERR(rbd_dev->image_id); 2987589d30e0SAlex Elder rbd_dev->image_id = NULL; 2988589d30e0SAlex Elder } else { 2989589d30e0SAlex Elder dout("image_id is %s\n", rbd_dev->image_id); 2990589d30e0SAlex Elder } 2991589d30e0SAlex Elder out: 2992589d30e0SAlex Elder kfree(response); 2993589d30e0SAlex Elder kfree(object_name); 2994589d30e0SAlex Elder 2995589d30e0SAlex Elder return ret; 2996589d30e0SAlex Elder } 2997589d30e0SAlex Elder 2998a30b71b9SAlex Elder static int rbd_dev_v1_probe(struct rbd_device *rbd_dev) 2999a30b71b9SAlex Elder { 3000a30b71b9SAlex Elder int ret; 3001a30b71b9SAlex Elder size_t size; 3002a30b71b9SAlex Elder 3003a30b71b9SAlex Elder /* Version 1 images have no id; empty string is used */ 3004a30b71b9SAlex Elder 3005a30b71b9SAlex Elder rbd_dev->image_id = kstrdup("", GFP_KERNEL); 3006a30b71b9SAlex Elder if (!rbd_dev->image_id) 3007a30b71b9SAlex Elder return -ENOMEM; 3008a30b71b9SAlex Elder rbd_dev->image_id_len = 0; 3009a30b71b9SAlex Elder 3010a30b71b9SAlex Elder /* Record the header object name for this rbd image. */ 3011a30b71b9SAlex Elder 3012a30b71b9SAlex Elder size = rbd_dev->image_name_len + sizeof (RBD_SUFFIX); 3013a30b71b9SAlex Elder rbd_dev->header_name = kmalloc(size, GFP_KERNEL); 3014a30b71b9SAlex Elder if (!rbd_dev->header_name) { 3015a30b71b9SAlex Elder ret = -ENOMEM; 3016a30b71b9SAlex Elder goto out_err; 3017a30b71b9SAlex Elder } 3018a30b71b9SAlex Elder sprintf(rbd_dev->header_name, "%s%s", rbd_dev->image_name, RBD_SUFFIX); 3019a30b71b9SAlex Elder 3020a30b71b9SAlex Elder /* Populate rbd image metadata */ 3021a30b71b9SAlex Elder 3022a30b71b9SAlex Elder ret = rbd_read_header(rbd_dev, &rbd_dev->header); 3023a30b71b9SAlex Elder if (ret < 0) 3024a30b71b9SAlex Elder goto out_err; 3025a30b71b9SAlex Elder rbd_dev->image_format = 1; 3026a30b71b9SAlex Elder 3027a30b71b9SAlex Elder dout("discovered version 1 image, header name is %s\n", 3028a30b71b9SAlex Elder rbd_dev->header_name); 3029a30b71b9SAlex Elder 3030a30b71b9SAlex Elder return 0; 3031a30b71b9SAlex Elder 3032a30b71b9SAlex Elder out_err: 3033a30b71b9SAlex Elder kfree(rbd_dev->header_name); 3034a30b71b9SAlex Elder rbd_dev->header_name = NULL; 3035a30b71b9SAlex Elder kfree(rbd_dev->image_id); 3036a30b71b9SAlex Elder rbd_dev->image_id = NULL; 3037a30b71b9SAlex Elder 3038a30b71b9SAlex Elder return ret; 3039a30b71b9SAlex Elder } 3040a30b71b9SAlex Elder 3041a30b71b9SAlex Elder static int rbd_dev_v2_probe(struct rbd_device *rbd_dev) 3042a30b71b9SAlex Elder { 3043a30b71b9SAlex Elder size_t size; 30449d475de5SAlex Elder int ret; 30456e14b1a6SAlex Elder u64 ver = 0; 3046a30b71b9SAlex Elder 3047a30b71b9SAlex Elder /* 3048a30b71b9SAlex Elder * Image id was filled in by the caller. Record the header 3049a30b71b9SAlex Elder * object name for this rbd image. 3050a30b71b9SAlex Elder */ 3051a30b71b9SAlex Elder size = sizeof (RBD_HEADER_PREFIX) + rbd_dev->image_id_len; 3052a30b71b9SAlex Elder rbd_dev->header_name = kmalloc(size, GFP_KERNEL); 3053a30b71b9SAlex Elder if (!rbd_dev->header_name) 3054a30b71b9SAlex Elder return -ENOMEM; 3055a30b71b9SAlex Elder sprintf(rbd_dev->header_name, "%s%s", 3056a30b71b9SAlex Elder RBD_HEADER_PREFIX, rbd_dev->image_id); 30579d475de5SAlex Elder 30589d475de5SAlex Elder /* Get the size and object order for the image */ 30599d475de5SAlex Elder 30609d475de5SAlex Elder ret = rbd_dev_v2_image_size(rbd_dev); 30619d475de5SAlex Elder if (ret < 0) 30629d475de5SAlex Elder goto out_err; 30631e130199SAlex Elder 30641e130199SAlex Elder /* Get the object prefix (a.k.a. block_name) for the image */ 30651e130199SAlex Elder 30661e130199SAlex Elder ret = rbd_dev_v2_object_prefix(rbd_dev); 30671e130199SAlex Elder if (ret < 0) 30681e130199SAlex Elder goto out_err; 3069b1b5402aSAlex Elder 3070d889140cSAlex Elder /* Get the and check features for the image */ 3071b1b5402aSAlex Elder 3072b1b5402aSAlex Elder ret = rbd_dev_v2_features(rbd_dev); 3073b1b5402aSAlex Elder if (ret < 0) 3074b1b5402aSAlex Elder goto out_err; 307535d489f9SAlex Elder 30766e14b1a6SAlex Elder /* crypto and compression type aren't (yet) supported for v2 images */ 307735d489f9SAlex Elder 30786e14b1a6SAlex Elder rbd_dev->header.crypt_type = 0; 30796e14b1a6SAlex Elder rbd_dev->header.comp_type = 0; 30806e14b1a6SAlex Elder 30816e14b1a6SAlex Elder /* Get the snapshot context, plus the header version */ 30826e14b1a6SAlex Elder 30836e14b1a6SAlex Elder ret = rbd_dev_v2_snap_context(rbd_dev, &ver); 308435d489f9SAlex Elder if (ret) 308535d489f9SAlex Elder goto out_err; 30866e14b1a6SAlex Elder rbd_dev->header.obj_version = ver; 30876e14b1a6SAlex Elder 3088a30b71b9SAlex Elder rbd_dev->image_format = 2; 3089a30b71b9SAlex Elder 3090a30b71b9SAlex Elder dout("discovered version 2 image, header name is %s\n", 3091a30b71b9SAlex Elder rbd_dev->header_name); 3092a30b71b9SAlex Elder 309335152979SAlex Elder return 0; 30949d475de5SAlex Elder out_err: 30959d475de5SAlex Elder kfree(rbd_dev->header_name); 30969d475de5SAlex Elder rbd_dev->header_name = NULL; 30971e130199SAlex Elder kfree(rbd_dev->header.object_prefix); 30981e130199SAlex Elder rbd_dev->header.object_prefix = NULL; 30999d475de5SAlex Elder 31009d475de5SAlex Elder return ret; 3101a30b71b9SAlex Elder } 3102a30b71b9SAlex Elder 3103a30b71b9SAlex Elder /* 3104a30b71b9SAlex Elder * Probe for the existence of the header object for the given rbd 3105a30b71b9SAlex Elder * device. For format 2 images this includes determining the image 3106a30b71b9SAlex Elder * id. 3107a30b71b9SAlex Elder */ 3108a30b71b9SAlex Elder static int rbd_dev_probe(struct rbd_device *rbd_dev) 3109a30b71b9SAlex Elder { 3110a30b71b9SAlex Elder int ret; 3111a30b71b9SAlex Elder 3112a30b71b9SAlex Elder /* 3113a30b71b9SAlex Elder * Get the id from the image id object. If it's not a 3114a30b71b9SAlex Elder * format 2 image, we'll get ENOENT back, and we'll assume 3115a30b71b9SAlex Elder * it's a format 1 image. 3116a30b71b9SAlex Elder */ 3117a30b71b9SAlex Elder ret = rbd_dev_image_id(rbd_dev); 3118a30b71b9SAlex Elder if (ret) 3119a30b71b9SAlex Elder ret = rbd_dev_v1_probe(rbd_dev); 3120a30b71b9SAlex Elder else 3121a30b71b9SAlex Elder ret = rbd_dev_v2_probe(rbd_dev); 3122a30b71b9SAlex Elder if (ret) 3123a30b71b9SAlex Elder dout("probe failed, returning %d\n", ret); 3124a30b71b9SAlex Elder 3125a30b71b9SAlex Elder return ret; 3126a30b71b9SAlex Elder } 3127a30b71b9SAlex Elder 312859c2be1eSYehuda Sadeh static ssize_t rbd_add(struct bus_type *bus, 312959c2be1eSYehuda Sadeh const char *buf, 313059c2be1eSYehuda Sadeh size_t count) 3131602adf40SYehuda Sadeh { 3132cb8627c7SAlex Elder char *options; 3133cb8627c7SAlex Elder struct rbd_device *rbd_dev = NULL; 31347ef3214aSAlex Elder const char *mon_addrs = NULL; 31357ef3214aSAlex Elder size_t mon_addrs_size = 0; 313627cc2594SAlex Elder struct ceph_osd_client *osdc; 313727cc2594SAlex Elder int rc = -ENOMEM; 31383feeb894SAlex Elder char *snap_name; 3139602adf40SYehuda Sadeh 3140602adf40SYehuda Sadeh if (!try_module_get(THIS_MODULE)) 3141602adf40SYehuda Sadeh return -ENODEV; 3142602adf40SYehuda Sadeh 314327cc2594SAlex Elder options = kmalloc(count, GFP_KERNEL); 314427cc2594SAlex Elder if (!options) 314585ae8926SAlex Elder goto err_out_mem; 3146cb8627c7SAlex Elder rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL); 3147cb8627c7SAlex Elder if (!rbd_dev) 314885ae8926SAlex Elder goto err_out_mem; 3149602adf40SYehuda Sadeh 3150602adf40SYehuda Sadeh /* static rbd_device initialization */ 3151602adf40SYehuda Sadeh spin_lock_init(&rbd_dev->lock); 3152602adf40SYehuda Sadeh INIT_LIST_HEAD(&rbd_dev->node); 3153dfc5606dSYehuda Sadeh INIT_LIST_HEAD(&rbd_dev->snaps); 3154c666601aSJosh Durgin init_rwsem(&rbd_dev->header_rwsem); 3155602adf40SYehuda Sadeh 3156a725f65eSAlex Elder /* parse add command */ 31573feeb894SAlex Elder snap_name = rbd_add_parse_args(rbd_dev, buf, 31583feeb894SAlex Elder &mon_addrs, &mon_addrs_size, options, count); 31593feeb894SAlex Elder if (IS_ERR(snap_name)) { 31603feeb894SAlex Elder rc = PTR_ERR(snap_name); 316185ae8926SAlex Elder goto err_out_mem; 31623feeb894SAlex Elder } 3163a725f65eSAlex Elder 3164f8c38929SAlex Elder rc = rbd_get_client(rbd_dev, mon_addrs, mon_addrs_size - 1, options); 3165f8c38929SAlex Elder if (rc < 0) 316685ae8926SAlex Elder goto err_out_args; 3167602adf40SYehuda Sadeh 3168602adf40SYehuda Sadeh /* pick the pool */ 31691dbb4399SAlex Elder osdc = &rbd_dev->rbd_client->client->osdc; 3170602adf40SYehuda Sadeh rc = ceph_pg_poolid_by_name(osdc->osdmap, rbd_dev->pool_name); 3171602adf40SYehuda Sadeh if (rc < 0) 3172602adf40SYehuda Sadeh goto err_out_client; 31739bb2f334SAlex Elder rbd_dev->pool_id = rc; 3174602adf40SYehuda Sadeh 3175a30b71b9SAlex Elder rc = rbd_dev_probe(rbd_dev); 3176a30b71b9SAlex Elder if (rc < 0) 3177589d30e0SAlex Elder goto err_out_client; 317805fd6f6fSAlex Elder 317905fd6f6fSAlex Elder /* no need to lock here, as rbd_dev is not registered yet */ 318005fd6f6fSAlex Elder rc = rbd_dev_snaps_update(rbd_dev); 318105fd6f6fSAlex Elder if (rc) 318205fd6f6fSAlex Elder goto err_out_header; 318305fd6f6fSAlex Elder 318405fd6f6fSAlex Elder rc = rbd_dev_set_mapping(rbd_dev, snap_name); 318505fd6f6fSAlex Elder if (rc) 318605fd6f6fSAlex Elder goto err_out_header; 318705fd6f6fSAlex Elder 318885ae8926SAlex Elder /* generate unique id: find highest unique id, add one */ 318985ae8926SAlex Elder rbd_dev_id_get(rbd_dev); 319085ae8926SAlex Elder 319185ae8926SAlex Elder /* Fill in the device name, now that we have its id. */ 319285ae8926SAlex Elder BUILD_BUG_ON(DEV_NAME_LEN 319385ae8926SAlex Elder < sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH); 319485ae8926SAlex Elder sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id); 319585ae8926SAlex Elder 319685ae8926SAlex Elder /* Get our block major device number. */ 319785ae8926SAlex Elder 319827cc2594SAlex Elder rc = register_blkdev(0, rbd_dev->name); 319927cc2594SAlex Elder if (rc < 0) 320085ae8926SAlex Elder goto err_out_id; 320127cc2594SAlex Elder rbd_dev->major = rc; 3202602adf40SYehuda Sadeh 32030f308a31SAlex Elder /* Set up the blkdev mapping. */ 32040f308a31SAlex Elder 32050f308a31SAlex Elder rc = rbd_init_disk(rbd_dev); 3206dfc5606dSYehuda Sadeh if (rc) 3207766fc439SYehuda Sadeh goto err_out_blkdev; 3208766fc439SYehuda Sadeh 32090f308a31SAlex Elder rc = rbd_bus_add_dev(rbd_dev); 32100f308a31SAlex Elder if (rc) 32110f308a31SAlex Elder goto err_out_disk; 32120f308a31SAlex Elder 321332eec68dSAlex Elder /* 321432eec68dSAlex Elder * At this point cleanup in the event of an error is the job 321532eec68dSAlex Elder * of the sysfs code (initiated by rbd_bus_del_dev()). 321632eec68dSAlex Elder */ 32172ac4e75dSAlex Elder 32184bb1f1edSAlex Elder down_write(&rbd_dev->header_rwsem); 32195ed16177SAlex Elder rc = rbd_dev_snaps_register(rbd_dev); 32204bb1f1edSAlex Elder up_write(&rbd_dev->header_rwsem); 32212ac4e75dSAlex Elder if (rc) 32222ac4e75dSAlex Elder goto err_out_bus; 32232ac4e75dSAlex Elder 322459c2be1eSYehuda Sadeh rc = rbd_init_watch_dev(rbd_dev); 322559c2be1eSYehuda Sadeh if (rc) 322659c2be1eSYehuda Sadeh goto err_out_bus; 322759c2be1eSYehuda Sadeh 32283ee4001eSAlex Elder /* Everything's ready. Announce the disk to the world. */ 32293ee4001eSAlex Elder 32303ee4001eSAlex Elder add_disk(rbd_dev->disk); 32313ee4001eSAlex Elder 32323ee4001eSAlex Elder pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name, 32333ee4001eSAlex Elder (unsigned long long) rbd_dev->mapping.size); 32343ee4001eSAlex Elder 3235602adf40SYehuda Sadeh return count; 3236602adf40SYehuda Sadeh 3237766fc439SYehuda Sadeh err_out_bus: 3238766fc439SYehuda Sadeh /* this will also clean up rest of rbd_dev stuff */ 3239766fc439SYehuda Sadeh 3240766fc439SYehuda Sadeh rbd_bus_del_dev(rbd_dev); 3241766fc439SYehuda Sadeh kfree(options); 3242766fc439SYehuda Sadeh return rc; 3243766fc439SYehuda Sadeh 32440f308a31SAlex Elder err_out_disk: 32450f308a31SAlex Elder rbd_free_disk(rbd_dev); 3246602adf40SYehuda Sadeh err_out_blkdev: 3247602adf40SYehuda Sadeh unregister_blkdev(rbd_dev->major, rbd_dev->name); 324885ae8926SAlex Elder err_out_id: 324985ae8926SAlex Elder rbd_dev_id_put(rbd_dev); 325005fd6f6fSAlex Elder err_out_header: 325105fd6f6fSAlex Elder rbd_header_free(&rbd_dev->header); 3252602adf40SYehuda Sadeh err_out_client: 32533fcf2581SAlex Elder kfree(rbd_dev->header_name); 3254602adf40SYehuda Sadeh rbd_put_client(rbd_dev); 3255589d30e0SAlex Elder kfree(rbd_dev->image_id); 325685ae8926SAlex Elder err_out_args: 3257f84344f3SAlex Elder kfree(rbd_dev->mapping.snap_name); 32580bed54dcSAlex Elder kfree(rbd_dev->image_name); 3259d22f76e7SAlex Elder kfree(rbd_dev->pool_name); 326085ae8926SAlex Elder err_out_mem: 326127cc2594SAlex Elder kfree(rbd_dev); 3262cb8627c7SAlex Elder kfree(options); 326327cc2594SAlex Elder 3264602adf40SYehuda Sadeh dout("Error adding device %s\n", buf); 3265602adf40SYehuda Sadeh module_put(THIS_MODULE); 326627cc2594SAlex Elder 326727cc2594SAlex Elder return (ssize_t) rc; 3268602adf40SYehuda Sadeh } 3269602adf40SYehuda Sadeh 3270de71a297SAlex Elder static struct rbd_device *__rbd_get_dev(unsigned long dev_id) 3271602adf40SYehuda Sadeh { 3272602adf40SYehuda Sadeh struct list_head *tmp; 3273602adf40SYehuda Sadeh struct rbd_device *rbd_dev; 3274602adf40SYehuda Sadeh 3275e124a82fSAlex Elder spin_lock(&rbd_dev_list_lock); 3276602adf40SYehuda Sadeh list_for_each(tmp, &rbd_dev_list) { 3277602adf40SYehuda Sadeh rbd_dev = list_entry(tmp, struct rbd_device, node); 3278de71a297SAlex Elder if (rbd_dev->dev_id == dev_id) { 3279e124a82fSAlex Elder spin_unlock(&rbd_dev_list_lock); 3280602adf40SYehuda Sadeh return rbd_dev; 3281602adf40SYehuda Sadeh } 3282e124a82fSAlex Elder } 3283e124a82fSAlex Elder spin_unlock(&rbd_dev_list_lock); 3284602adf40SYehuda Sadeh return NULL; 3285602adf40SYehuda Sadeh } 3286602adf40SYehuda Sadeh 3287dfc5606dSYehuda Sadeh static void rbd_dev_release(struct device *dev) 3288602adf40SYehuda Sadeh { 3289593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 3290602adf40SYehuda Sadeh 32911dbb4399SAlex Elder if (rbd_dev->watch_request) { 32921dbb4399SAlex Elder struct ceph_client *client = rbd_dev->rbd_client->client; 32931dbb4399SAlex Elder 32941dbb4399SAlex Elder ceph_osdc_unregister_linger_request(&client->osdc, 329559c2be1eSYehuda Sadeh rbd_dev->watch_request); 32961dbb4399SAlex Elder } 329759c2be1eSYehuda Sadeh if (rbd_dev->watch_event) 3298070c633fSAlex Elder rbd_req_sync_unwatch(rbd_dev); 329959c2be1eSYehuda Sadeh 3300602adf40SYehuda Sadeh rbd_put_client(rbd_dev); 3301602adf40SYehuda Sadeh 3302602adf40SYehuda Sadeh /* clean up and free blkdev */ 3303602adf40SYehuda Sadeh rbd_free_disk(rbd_dev); 3304602adf40SYehuda Sadeh unregister_blkdev(rbd_dev->major, rbd_dev->name); 330532eec68dSAlex Elder 33062ac4e75dSAlex Elder /* release allocated disk header fields */ 33072ac4e75dSAlex Elder rbd_header_free(&rbd_dev->header); 33082ac4e75dSAlex Elder 330932eec68dSAlex Elder /* done with the id, and with the rbd_dev */ 3310f84344f3SAlex Elder kfree(rbd_dev->mapping.snap_name); 3311589d30e0SAlex Elder kfree(rbd_dev->image_id); 33120bed54dcSAlex Elder kfree(rbd_dev->header_name); 3313d22f76e7SAlex Elder kfree(rbd_dev->pool_name); 33140bed54dcSAlex Elder kfree(rbd_dev->image_name); 3315e2839308SAlex Elder rbd_dev_id_put(rbd_dev); 3316602adf40SYehuda Sadeh kfree(rbd_dev); 3317602adf40SYehuda Sadeh 3318602adf40SYehuda Sadeh /* release module ref */ 3319602adf40SYehuda Sadeh module_put(THIS_MODULE); 3320602adf40SYehuda Sadeh } 3321602adf40SYehuda Sadeh 3322dfc5606dSYehuda Sadeh static ssize_t rbd_remove(struct bus_type *bus, 3323602adf40SYehuda Sadeh const char *buf, 3324602adf40SYehuda Sadeh size_t count) 3325602adf40SYehuda Sadeh { 3326602adf40SYehuda Sadeh struct rbd_device *rbd_dev = NULL; 3327602adf40SYehuda Sadeh int target_id, rc; 3328602adf40SYehuda Sadeh unsigned long ul; 3329602adf40SYehuda Sadeh int ret = count; 3330602adf40SYehuda Sadeh 3331602adf40SYehuda Sadeh rc = strict_strtoul(buf, 10, &ul); 3332602adf40SYehuda Sadeh if (rc) 3333602adf40SYehuda Sadeh return rc; 3334602adf40SYehuda Sadeh 3335602adf40SYehuda Sadeh /* convert to int; abort if we lost anything in the conversion */ 3336602adf40SYehuda Sadeh target_id = (int) ul; 3337602adf40SYehuda Sadeh if (target_id != ul) 3338602adf40SYehuda Sadeh return -EINVAL; 3339602adf40SYehuda Sadeh 3340602adf40SYehuda Sadeh mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); 3341602adf40SYehuda Sadeh 3342602adf40SYehuda Sadeh rbd_dev = __rbd_get_dev(target_id); 3343602adf40SYehuda Sadeh if (!rbd_dev) { 3344602adf40SYehuda Sadeh ret = -ENOENT; 3345602adf40SYehuda Sadeh goto done; 3346602adf40SYehuda Sadeh } 3347602adf40SYehuda Sadeh 3348dfc5606dSYehuda Sadeh __rbd_remove_all_snaps(rbd_dev); 3349dfc5606dSYehuda Sadeh rbd_bus_del_dev(rbd_dev); 3350602adf40SYehuda Sadeh 3351602adf40SYehuda Sadeh done: 3352602adf40SYehuda Sadeh mutex_unlock(&ctl_mutex); 3353aafb230eSAlex Elder 3354602adf40SYehuda Sadeh return ret; 3355602adf40SYehuda Sadeh } 3356602adf40SYehuda Sadeh 3357602adf40SYehuda Sadeh /* 3358602adf40SYehuda Sadeh * create control files in sysfs 3359dfc5606dSYehuda Sadeh * /sys/bus/rbd/... 3360602adf40SYehuda Sadeh */ 3361602adf40SYehuda Sadeh static int rbd_sysfs_init(void) 3362602adf40SYehuda Sadeh { 3363dfc5606dSYehuda Sadeh int ret; 3364602adf40SYehuda Sadeh 3365fed4c143SAlex Elder ret = device_register(&rbd_root_dev); 3366dfc5606dSYehuda Sadeh if (ret < 0) 3367dfc5606dSYehuda Sadeh return ret; 3368602adf40SYehuda Sadeh 3369fed4c143SAlex Elder ret = bus_register(&rbd_bus_type); 3370fed4c143SAlex Elder if (ret < 0) 3371fed4c143SAlex Elder device_unregister(&rbd_root_dev); 3372602adf40SYehuda Sadeh 3373602adf40SYehuda Sadeh return ret; 3374602adf40SYehuda Sadeh } 3375602adf40SYehuda Sadeh 3376602adf40SYehuda Sadeh static void rbd_sysfs_cleanup(void) 3377602adf40SYehuda Sadeh { 3378dfc5606dSYehuda Sadeh bus_unregister(&rbd_bus_type); 3379fed4c143SAlex Elder device_unregister(&rbd_root_dev); 3380602adf40SYehuda Sadeh } 3381602adf40SYehuda Sadeh 3382602adf40SYehuda Sadeh int __init rbd_init(void) 3383602adf40SYehuda Sadeh { 3384602adf40SYehuda Sadeh int rc; 3385602adf40SYehuda Sadeh 3386602adf40SYehuda Sadeh rc = rbd_sysfs_init(); 3387602adf40SYehuda Sadeh if (rc) 3388602adf40SYehuda Sadeh return rc; 3389f0f8cef5SAlex Elder pr_info("loaded " RBD_DRV_NAME_LONG "\n"); 3390602adf40SYehuda Sadeh return 0; 3391602adf40SYehuda Sadeh } 3392602adf40SYehuda Sadeh 3393602adf40SYehuda Sadeh void __exit rbd_exit(void) 3394602adf40SYehuda Sadeh { 3395602adf40SYehuda Sadeh rbd_sysfs_cleanup(); 3396602adf40SYehuda Sadeh } 3397602adf40SYehuda Sadeh 3398602adf40SYehuda Sadeh module_init(rbd_init); 3399602adf40SYehuda Sadeh module_exit(rbd_exit); 3400602adf40SYehuda Sadeh 3401602adf40SYehuda Sadeh MODULE_AUTHOR("Sage Weil <sage@newdream.net>"); 3402602adf40SYehuda Sadeh MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>"); 3403602adf40SYehuda Sadeh MODULE_DESCRIPTION("rados block device"); 3404602adf40SYehuda Sadeh 3405602adf40SYehuda Sadeh /* following authorship retained from original osdblk.c */ 3406602adf40SYehuda Sadeh MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>"); 3407602adf40SYehuda Sadeh 3408602adf40SYehuda Sadeh MODULE_LICENSE("GPL"); 3409