1602adf40SYehuda Sadeh /* 2602adf40SYehuda Sadeh rbd.c -- Export ceph rados objects as a Linux block device 3602adf40SYehuda Sadeh 4602adf40SYehuda Sadeh 5602adf40SYehuda Sadeh based on drivers/block/osdblk.c: 6602adf40SYehuda Sadeh 7602adf40SYehuda Sadeh Copyright 2009 Red Hat, Inc. 8602adf40SYehuda Sadeh 9602adf40SYehuda Sadeh This program is free software; you can redistribute it and/or modify 10602adf40SYehuda Sadeh it under the terms of the GNU General Public License as published by 11602adf40SYehuda Sadeh the Free Software Foundation. 12602adf40SYehuda Sadeh 13602adf40SYehuda Sadeh This program is distributed in the hope that it will be useful, 14602adf40SYehuda Sadeh but WITHOUT ANY WARRANTY; without even the implied warranty of 15602adf40SYehuda Sadeh MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16602adf40SYehuda Sadeh GNU General Public License for more details. 17602adf40SYehuda Sadeh 18602adf40SYehuda Sadeh You should have received a copy of the GNU General Public License 19602adf40SYehuda Sadeh along with this program; see the file COPYING. If not, write to 20602adf40SYehuda Sadeh the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. 21602adf40SYehuda Sadeh 22602adf40SYehuda Sadeh 23602adf40SYehuda Sadeh 24dfc5606dSYehuda Sadeh For usage instructions, please refer to: 25602adf40SYehuda Sadeh 26dfc5606dSYehuda Sadeh Documentation/ABI/testing/sysfs-bus-rbd 27602adf40SYehuda Sadeh 28602adf40SYehuda Sadeh */ 29602adf40SYehuda Sadeh 30602adf40SYehuda Sadeh #include <linux/ceph/libceph.h> 31602adf40SYehuda Sadeh #include <linux/ceph/osd_client.h> 32602adf40SYehuda Sadeh #include <linux/ceph/mon_client.h> 33602adf40SYehuda Sadeh #include <linux/ceph/decode.h> 3459c2be1eSYehuda Sadeh #include <linux/parser.h> 35602adf40SYehuda Sadeh 36602adf40SYehuda Sadeh #include <linux/kernel.h> 37602adf40SYehuda Sadeh #include <linux/device.h> 38602adf40SYehuda Sadeh #include <linux/module.h> 39602adf40SYehuda Sadeh #include <linux/fs.h> 40602adf40SYehuda Sadeh #include <linux/blkdev.h> 41602adf40SYehuda Sadeh 42602adf40SYehuda Sadeh #include "rbd_types.h" 43602adf40SYehuda Sadeh 44aafb230eSAlex Elder #define RBD_DEBUG /* Activate rbd_assert() calls */ 45aafb230eSAlex Elder 46593a9e7bSAlex Elder /* 47593a9e7bSAlex Elder * The basic unit of block I/O is a sector. It is interpreted in a 48593a9e7bSAlex Elder * number of contexts in Linux (blk, bio, genhd), but the default is 49593a9e7bSAlex Elder * universally 512 bytes. These symbols are just slightly more 50593a9e7bSAlex Elder * meaningful than the bare numbers they represent. 51593a9e7bSAlex Elder */ 52593a9e7bSAlex Elder #define SECTOR_SHIFT 9 53593a9e7bSAlex Elder #define SECTOR_SIZE (1ULL << SECTOR_SHIFT) 54593a9e7bSAlex Elder 55f0f8cef5SAlex Elder #define RBD_DRV_NAME "rbd" 56f0f8cef5SAlex Elder #define RBD_DRV_NAME_LONG "rbd (rados block device)" 57602adf40SYehuda Sadeh 58602adf40SYehuda Sadeh #define RBD_MINORS_PER_MAJOR 256 /* max minors per blkdev */ 59602adf40SYehuda Sadeh 60d4b125e9SAlex Elder #define RBD_SNAP_DEV_NAME_PREFIX "snap_" 61d4b125e9SAlex Elder #define RBD_MAX_SNAP_NAME_LEN \ 62d4b125e9SAlex Elder (NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1)) 63d4b125e9SAlex Elder 6435d489f9SAlex Elder #define RBD_MAX_SNAP_COUNT 510 /* allows max snapc to fit in 4KB */ 65602adf40SYehuda Sadeh 66602adf40SYehuda Sadeh #define RBD_SNAP_HEAD_NAME "-" 67602adf40SYehuda Sadeh 689e15b77dSAlex Elder /* This allows a single page to hold an image name sent by OSD */ 699e15b77dSAlex Elder #define RBD_IMAGE_NAME_LEN_MAX (PAGE_SIZE - sizeof (__le32) - 1) 70589d30e0SAlex Elder #define RBD_IMAGE_ID_LEN_MAX 64 719e15b77dSAlex Elder 721e130199SAlex Elder #define RBD_OBJ_PREFIX_LEN_MAX 64 73589d30e0SAlex Elder 74d889140cSAlex Elder /* Feature bits */ 75d889140cSAlex Elder 765cbf6f12SAlex Elder #define RBD_FEATURE_LAYERING (1<<0) 775cbf6f12SAlex Elder #define RBD_FEATURE_STRIPINGV2 (1<<1) 785cbf6f12SAlex Elder #define RBD_FEATURES_ALL \ 795cbf6f12SAlex Elder (RBD_FEATURE_LAYERING | RBD_FEATURE_STRIPINGV2) 80d889140cSAlex Elder 81d889140cSAlex Elder /* Features supported by this (client software) implementation. */ 82d889140cSAlex Elder 83770eba6eSAlex Elder #define RBD_FEATURES_SUPPORTED (RBD_FEATURES_ALL) 84d889140cSAlex Elder 8581a89793SAlex Elder /* 8681a89793SAlex Elder * An RBD device name will be "rbd#", where the "rbd" comes from 8781a89793SAlex Elder * RBD_DRV_NAME above, and # is a unique integer identifier. 8881a89793SAlex Elder * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big 8981a89793SAlex Elder * enough to hold all possible device names. 9081a89793SAlex Elder */ 91602adf40SYehuda Sadeh #define DEV_NAME_LEN 32 9281a89793SAlex Elder #define MAX_INT_FORMAT_WIDTH ((5 * sizeof (int)) / 2 + 1) 93602adf40SYehuda Sadeh 94602adf40SYehuda Sadeh /* 95602adf40SYehuda Sadeh * block device image metadata (in-memory version) 96602adf40SYehuda Sadeh */ 97602adf40SYehuda Sadeh struct rbd_image_header { 98f84344f3SAlex Elder /* These four fields never change for a given rbd image */ 99849b4260SAlex Elder char *object_prefix; 10034b13184SAlex Elder u64 features; 101602adf40SYehuda Sadeh __u8 obj_order; 102602adf40SYehuda Sadeh __u8 crypt_type; 103602adf40SYehuda Sadeh __u8 comp_type; 104602adf40SYehuda Sadeh 105f84344f3SAlex Elder /* The remaining fields need to be updated occasionally */ 106f84344f3SAlex Elder u64 image_size; 107f84344f3SAlex Elder struct ceph_snap_context *snapc; 108602adf40SYehuda Sadeh char *snap_names; 109602adf40SYehuda Sadeh u64 *snap_sizes; 11059c2be1eSYehuda Sadeh 11159c2be1eSYehuda Sadeh u64 obj_version; 11259c2be1eSYehuda Sadeh }; 11359c2be1eSYehuda Sadeh 1140d7dbfceSAlex Elder /* 1150d7dbfceSAlex Elder * An rbd image specification. 1160d7dbfceSAlex Elder * 1170d7dbfceSAlex Elder * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely 118c66c6e0cSAlex Elder * identify an image. Each rbd_dev structure includes a pointer to 119c66c6e0cSAlex Elder * an rbd_spec structure that encapsulates this identity. 120c66c6e0cSAlex Elder * 121c66c6e0cSAlex Elder * Each of the id's in an rbd_spec has an associated name. For a 122c66c6e0cSAlex Elder * user-mapped image, the names are supplied and the id's associated 123c66c6e0cSAlex Elder * with them are looked up. For a layered image, a parent image is 124c66c6e0cSAlex Elder * defined by the tuple, and the names are looked up. 125c66c6e0cSAlex Elder * 126c66c6e0cSAlex Elder * An rbd_dev structure contains a parent_spec pointer which is 127c66c6e0cSAlex Elder * non-null if the image it represents is a child in a layered 128c66c6e0cSAlex Elder * image. This pointer will refer to the rbd_spec structure used 129c66c6e0cSAlex Elder * by the parent rbd_dev for its own identity (i.e., the structure 130c66c6e0cSAlex Elder * is shared between the parent and child). 131c66c6e0cSAlex Elder * 132c66c6e0cSAlex Elder * Since these structures are populated once, during the discovery 133c66c6e0cSAlex Elder * phase of image construction, they are effectively immutable so 134c66c6e0cSAlex Elder * we make no effort to synchronize access to them. 135c66c6e0cSAlex Elder * 136c66c6e0cSAlex Elder * Note that code herein does not assume the image name is known (it 137c66c6e0cSAlex Elder * could be a null pointer). 1380d7dbfceSAlex Elder */ 1390d7dbfceSAlex Elder struct rbd_spec { 1400d7dbfceSAlex Elder u64 pool_id; 1410d7dbfceSAlex Elder char *pool_name; 1420d7dbfceSAlex Elder 1430d7dbfceSAlex Elder char *image_id; 1440d7dbfceSAlex Elder char *image_name; 1450d7dbfceSAlex Elder 1460d7dbfceSAlex Elder u64 snap_id; 1470d7dbfceSAlex Elder char *snap_name; 1480d7dbfceSAlex Elder 1490d7dbfceSAlex Elder struct kref kref; 1500d7dbfceSAlex Elder }; 1510d7dbfceSAlex Elder 152602adf40SYehuda Sadeh /* 153f0f8cef5SAlex Elder * an instance of the client. multiple devices may share an rbd client. 154602adf40SYehuda Sadeh */ 155602adf40SYehuda Sadeh struct rbd_client { 156602adf40SYehuda Sadeh struct ceph_client *client; 157602adf40SYehuda Sadeh struct kref kref; 158602adf40SYehuda Sadeh struct list_head node; 159602adf40SYehuda Sadeh }; 160602adf40SYehuda Sadeh 161bf0d5f50SAlex Elder struct rbd_img_request; 162bf0d5f50SAlex Elder typedef void (*rbd_img_callback_t)(struct rbd_img_request *); 163bf0d5f50SAlex Elder 164bf0d5f50SAlex Elder #define BAD_WHICH U32_MAX /* Good which or bad which, which? */ 165bf0d5f50SAlex Elder 166bf0d5f50SAlex Elder struct rbd_obj_request; 167bf0d5f50SAlex Elder typedef void (*rbd_obj_callback_t)(struct rbd_obj_request *); 168bf0d5f50SAlex Elder 1699969ebc5SAlex Elder enum obj_request_type { 1709969ebc5SAlex Elder OBJ_REQUEST_NODATA, OBJ_REQUEST_BIO, OBJ_REQUEST_PAGES 1719969ebc5SAlex Elder }; 172bf0d5f50SAlex Elder 173926f9b3fSAlex Elder enum obj_req_flags { 174926f9b3fSAlex Elder OBJ_REQ_DONE, /* completion flag: not done = 0, done = 1 */ 1756365d33aSAlex Elder OBJ_REQ_IMG_DATA, /* object usage: standalone = 0, image = 1 */ 1765679c59fSAlex Elder OBJ_REQ_KNOWN, /* EXISTS flag valid: no = 0, yes = 1 */ 1775679c59fSAlex Elder OBJ_REQ_EXISTS, /* target exists: no = 0, yes = 1 */ 178926f9b3fSAlex Elder }; 179926f9b3fSAlex Elder 180bf0d5f50SAlex Elder struct rbd_obj_request { 181bf0d5f50SAlex Elder const char *object_name; 182bf0d5f50SAlex Elder u64 offset; /* object start byte */ 183bf0d5f50SAlex Elder u64 length; /* bytes from offset */ 184926f9b3fSAlex Elder unsigned long flags; 185bf0d5f50SAlex Elder 186c5b5ef6cSAlex Elder /* 187c5b5ef6cSAlex Elder * An object request associated with an image will have its 188c5b5ef6cSAlex Elder * img_data flag set; a standalone object request will not. 189c5b5ef6cSAlex Elder * 190c5b5ef6cSAlex Elder * A standalone object request will have which == BAD_WHICH 191c5b5ef6cSAlex Elder * and a null obj_request pointer. 192c5b5ef6cSAlex Elder * 193c5b5ef6cSAlex Elder * An object request initiated in support of a layered image 194c5b5ef6cSAlex Elder * object (to check for its existence before a write) will 195c5b5ef6cSAlex Elder * have which == BAD_WHICH and a non-null obj_request pointer. 196c5b5ef6cSAlex Elder * 197c5b5ef6cSAlex Elder * Finally, an object request for rbd image data will have 198c5b5ef6cSAlex Elder * which != BAD_WHICH, and will have a non-null img_request 199c5b5ef6cSAlex Elder * pointer. The value of which will be in the range 200c5b5ef6cSAlex Elder * 0..(img_request->obj_request_count-1). 201c5b5ef6cSAlex Elder */ 202c5b5ef6cSAlex Elder union { 203c5b5ef6cSAlex Elder struct rbd_obj_request *obj_request; /* STAT op */ 204c5b5ef6cSAlex Elder struct { 205bf0d5f50SAlex Elder struct rbd_img_request *img_request; 206c5b5ef6cSAlex Elder u64 img_offset; 207c5b5ef6cSAlex Elder /* links for img_request->obj_requests list */ 208c5b5ef6cSAlex Elder struct list_head links; 209c5b5ef6cSAlex Elder }; 210c5b5ef6cSAlex Elder }; 211bf0d5f50SAlex Elder u32 which; /* posn image request list */ 212bf0d5f50SAlex Elder 213bf0d5f50SAlex Elder enum obj_request_type type; 214788e2df3SAlex Elder union { 215bf0d5f50SAlex Elder struct bio *bio_list; 216788e2df3SAlex Elder struct { 217788e2df3SAlex Elder struct page **pages; 218788e2df3SAlex Elder u32 page_count; 219788e2df3SAlex Elder }; 220788e2df3SAlex Elder }; 2210eefd470SAlex Elder struct page **copyup_pages; 222bf0d5f50SAlex Elder 223bf0d5f50SAlex Elder struct ceph_osd_request *osd_req; 224bf0d5f50SAlex Elder 225bf0d5f50SAlex Elder u64 xferred; /* bytes transferred */ 226bf0d5f50SAlex Elder u64 version; 2271b83bef2SSage Weil int result; 228bf0d5f50SAlex Elder 229bf0d5f50SAlex Elder rbd_obj_callback_t callback; 230788e2df3SAlex Elder struct completion completion; 231bf0d5f50SAlex Elder 232bf0d5f50SAlex Elder struct kref kref; 233bf0d5f50SAlex Elder }; 234bf0d5f50SAlex Elder 2350c425248SAlex Elder enum img_req_flags { 2369849e986SAlex Elder IMG_REQ_WRITE, /* I/O direction: read = 0, write = 1 */ 2379849e986SAlex Elder IMG_REQ_CHILD, /* initiator: block = 0, child image = 1 */ 238d0b2e944SAlex Elder IMG_REQ_LAYERED, /* ENOENT handling: normal = 0, layered = 1 */ 2390c425248SAlex Elder }; 2400c425248SAlex Elder 241bf0d5f50SAlex Elder struct rbd_img_request { 242bf0d5f50SAlex Elder struct rbd_device *rbd_dev; 243bf0d5f50SAlex Elder u64 offset; /* starting image byte offset */ 244bf0d5f50SAlex Elder u64 length; /* byte count from offset */ 2450c425248SAlex Elder unsigned long flags; 246bf0d5f50SAlex Elder union { 247bf0d5f50SAlex Elder u64 snap_id; /* for reads */ 2489849e986SAlex Elder struct ceph_snap_context *snapc; /* for writes */ 2499849e986SAlex Elder }; 2509849e986SAlex Elder union { 2519849e986SAlex Elder struct request *rq; /* block request */ 2529849e986SAlex Elder struct rbd_obj_request *obj_request; /* obj req initiator */ 253bf0d5f50SAlex Elder }; 2543d7efd18SAlex Elder struct page **copyup_pages; 255bf0d5f50SAlex Elder spinlock_t completion_lock;/* protects next_completion */ 256bf0d5f50SAlex Elder u32 next_completion; 257bf0d5f50SAlex Elder rbd_img_callback_t callback; 25855f27e09SAlex Elder u64 xferred;/* aggregate bytes transferred */ 259a5a337d4SAlex Elder int result; /* first nonzero obj_request result */ 260bf0d5f50SAlex Elder 261bf0d5f50SAlex Elder u32 obj_request_count; 262bf0d5f50SAlex Elder struct list_head obj_requests; /* rbd_obj_request structs */ 263bf0d5f50SAlex Elder 264bf0d5f50SAlex Elder struct kref kref; 265bf0d5f50SAlex Elder }; 266bf0d5f50SAlex Elder 267bf0d5f50SAlex Elder #define for_each_obj_request(ireq, oreq) \ 268ef06f4d3SAlex Elder list_for_each_entry(oreq, &(ireq)->obj_requests, links) 269bf0d5f50SAlex Elder #define for_each_obj_request_from(ireq, oreq) \ 270ef06f4d3SAlex Elder list_for_each_entry_from(oreq, &(ireq)->obj_requests, links) 271bf0d5f50SAlex Elder #define for_each_obj_request_safe(ireq, oreq, n) \ 272ef06f4d3SAlex Elder list_for_each_entry_safe_reverse(oreq, n, &(ireq)->obj_requests, links) 273bf0d5f50SAlex Elder 274dfc5606dSYehuda Sadeh struct rbd_snap { 275dfc5606dSYehuda Sadeh const char *name; 2763591538fSJosh Durgin u64 size; 277dfc5606dSYehuda Sadeh struct list_head node; 278dfc5606dSYehuda Sadeh u64 id; 27934b13184SAlex Elder u64 features; 280dfc5606dSYehuda Sadeh }; 281dfc5606dSYehuda Sadeh 282f84344f3SAlex Elder struct rbd_mapping { 28399c1f08fSAlex Elder u64 size; 28434b13184SAlex Elder u64 features; 285f84344f3SAlex Elder bool read_only; 286f84344f3SAlex Elder }; 287f84344f3SAlex Elder 288602adf40SYehuda Sadeh /* 289602adf40SYehuda Sadeh * a single device 290602adf40SYehuda Sadeh */ 291602adf40SYehuda Sadeh struct rbd_device { 292de71a297SAlex Elder int dev_id; /* blkdev unique id */ 293602adf40SYehuda Sadeh 294602adf40SYehuda Sadeh int major; /* blkdev assigned major */ 295602adf40SYehuda Sadeh struct gendisk *disk; /* blkdev's gendisk and rq */ 296602adf40SYehuda Sadeh 297a30b71b9SAlex Elder u32 image_format; /* Either 1 or 2 */ 298602adf40SYehuda Sadeh struct rbd_client *rbd_client; 299602adf40SYehuda Sadeh 300602adf40SYehuda Sadeh char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */ 301602adf40SYehuda Sadeh 302b82d167bSAlex Elder spinlock_t lock; /* queue, flags, open_count */ 303602adf40SYehuda Sadeh 304602adf40SYehuda Sadeh struct rbd_image_header header; 305b82d167bSAlex Elder unsigned long flags; /* possibly lock protected */ 3060d7dbfceSAlex Elder struct rbd_spec *spec; 307602adf40SYehuda Sadeh 3080d7dbfceSAlex Elder char *header_name; 309971f839aSAlex Elder 3100903e875SAlex Elder struct ceph_file_layout layout; 3110903e875SAlex Elder 31259c2be1eSYehuda Sadeh struct ceph_osd_event *watch_event; 313975241afSAlex Elder struct rbd_obj_request *watch_request; 31459c2be1eSYehuda Sadeh 31586b00e0dSAlex Elder struct rbd_spec *parent_spec; 31686b00e0dSAlex Elder u64 parent_overlap; 3172f82ee54SAlex Elder struct rbd_device *parent; 31886b00e0dSAlex Elder 319cc070d59SAlex Elder u64 stripe_unit; 320cc070d59SAlex Elder u64 stripe_count; 321cc070d59SAlex Elder 322c666601aSJosh Durgin /* protects updating the header */ 323c666601aSJosh Durgin struct rw_semaphore header_rwsem; 324f84344f3SAlex Elder 325f84344f3SAlex Elder struct rbd_mapping mapping; 326602adf40SYehuda Sadeh 327602adf40SYehuda Sadeh struct list_head node; 328dfc5606dSYehuda Sadeh 329dfc5606dSYehuda Sadeh /* list of snapshots */ 330dfc5606dSYehuda Sadeh struct list_head snaps; 331dfc5606dSYehuda Sadeh 332dfc5606dSYehuda Sadeh /* sysfs related */ 333dfc5606dSYehuda Sadeh struct device dev; 334b82d167bSAlex Elder unsigned long open_count; /* protected by lock */ 335dfc5606dSYehuda Sadeh }; 336dfc5606dSYehuda Sadeh 337b82d167bSAlex Elder /* 338b82d167bSAlex Elder * Flag bits for rbd_dev->flags. If atomicity is required, 339b82d167bSAlex Elder * rbd_dev->lock is used to protect access. 340b82d167bSAlex Elder * 341b82d167bSAlex Elder * Currently, only the "removing" flag (which is coupled with the 342b82d167bSAlex Elder * "open_count" field) requires atomic access. 343b82d167bSAlex Elder */ 3446d292906SAlex Elder enum rbd_dev_flags { 3456d292906SAlex Elder RBD_DEV_FLAG_EXISTS, /* mapped snapshot has not been deleted */ 346b82d167bSAlex Elder RBD_DEV_FLAG_REMOVING, /* this mapping is being removed */ 3476d292906SAlex Elder }; 3486d292906SAlex Elder 349602adf40SYehuda Sadeh static DEFINE_MUTEX(ctl_mutex); /* Serialize open/close/setup/teardown */ 350e124a82fSAlex Elder 351602adf40SYehuda Sadeh static LIST_HEAD(rbd_dev_list); /* devices */ 352e124a82fSAlex Elder static DEFINE_SPINLOCK(rbd_dev_list_lock); 353e124a82fSAlex Elder 354602adf40SYehuda Sadeh static LIST_HEAD(rbd_client_list); /* clients */ 355432b8587SAlex Elder static DEFINE_SPINLOCK(rbd_client_list_lock); 356602adf40SYehuda Sadeh 3573d7efd18SAlex Elder static int rbd_img_request_submit(struct rbd_img_request *img_request); 3583d7efd18SAlex Elder 359304f6808SAlex Elder static int rbd_dev_snaps_update(struct rbd_device *rbd_dev); 360304f6808SAlex Elder 361dfc5606dSYehuda Sadeh static void rbd_dev_release(struct device *dev); 3626087b51bSAlex Elder static void rbd_snap_destroy(struct rbd_snap *snap); 363dfc5606dSYehuda Sadeh 364f0f8cef5SAlex Elder static ssize_t rbd_add(struct bus_type *bus, const char *buf, 365f0f8cef5SAlex Elder size_t count); 366f0f8cef5SAlex Elder static ssize_t rbd_remove(struct bus_type *bus, const char *buf, 367f0f8cef5SAlex Elder size_t count); 3682f82ee54SAlex Elder static int rbd_dev_probe(struct rbd_device *rbd_dev); 369f0f8cef5SAlex Elder 370f0f8cef5SAlex Elder static struct bus_attribute rbd_bus_attrs[] = { 371f0f8cef5SAlex Elder __ATTR(add, S_IWUSR, NULL, rbd_add), 372f0f8cef5SAlex Elder __ATTR(remove, S_IWUSR, NULL, rbd_remove), 373f0f8cef5SAlex Elder __ATTR_NULL 374f0f8cef5SAlex Elder }; 375f0f8cef5SAlex Elder 376f0f8cef5SAlex Elder static struct bus_type rbd_bus_type = { 377f0f8cef5SAlex Elder .name = "rbd", 378f0f8cef5SAlex Elder .bus_attrs = rbd_bus_attrs, 379f0f8cef5SAlex Elder }; 380f0f8cef5SAlex Elder 381f0f8cef5SAlex Elder static void rbd_root_dev_release(struct device *dev) 382f0f8cef5SAlex Elder { 383f0f8cef5SAlex Elder } 384f0f8cef5SAlex Elder 385f0f8cef5SAlex Elder static struct device rbd_root_dev = { 386f0f8cef5SAlex Elder .init_name = "rbd", 387f0f8cef5SAlex Elder .release = rbd_root_dev_release, 388f0f8cef5SAlex Elder }; 389f0f8cef5SAlex Elder 39006ecc6cbSAlex Elder static __printf(2, 3) 39106ecc6cbSAlex Elder void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...) 39206ecc6cbSAlex Elder { 39306ecc6cbSAlex Elder struct va_format vaf; 39406ecc6cbSAlex Elder va_list args; 39506ecc6cbSAlex Elder 39606ecc6cbSAlex Elder va_start(args, fmt); 39706ecc6cbSAlex Elder vaf.fmt = fmt; 39806ecc6cbSAlex Elder vaf.va = &args; 39906ecc6cbSAlex Elder 40006ecc6cbSAlex Elder if (!rbd_dev) 40106ecc6cbSAlex Elder printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf); 40206ecc6cbSAlex Elder else if (rbd_dev->disk) 40306ecc6cbSAlex Elder printk(KERN_WARNING "%s: %s: %pV\n", 40406ecc6cbSAlex Elder RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf); 40506ecc6cbSAlex Elder else if (rbd_dev->spec && rbd_dev->spec->image_name) 40606ecc6cbSAlex Elder printk(KERN_WARNING "%s: image %s: %pV\n", 40706ecc6cbSAlex Elder RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf); 40806ecc6cbSAlex Elder else if (rbd_dev->spec && rbd_dev->spec->image_id) 40906ecc6cbSAlex Elder printk(KERN_WARNING "%s: id %s: %pV\n", 41006ecc6cbSAlex Elder RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf); 41106ecc6cbSAlex Elder else /* punt */ 41206ecc6cbSAlex Elder printk(KERN_WARNING "%s: rbd_dev %p: %pV\n", 41306ecc6cbSAlex Elder RBD_DRV_NAME, rbd_dev, &vaf); 41406ecc6cbSAlex Elder va_end(args); 41506ecc6cbSAlex Elder } 41606ecc6cbSAlex Elder 417aafb230eSAlex Elder #ifdef RBD_DEBUG 418aafb230eSAlex Elder #define rbd_assert(expr) \ 419aafb230eSAlex Elder if (unlikely(!(expr))) { \ 420aafb230eSAlex Elder printk(KERN_ERR "\nAssertion failure in %s() " \ 421aafb230eSAlex Elder "at line %d:\n\n" \ 422aafb230eSAlex Elder "\trbd_assert(%s);\n\n", \ 423aafb230eSAlex Elder __func__, __LINE__, #expr); \ 424aafb230eSAlex Elder BUG(); \ 425aafb230eSAlex Elder } 426aafb230eSAlex Elder #else /* !RBD_DEBUG */ 427aafb230eSAlex Elder # define rbd_assert(expr) ((void) 0) 428aafb230eSAlex Elder #endif /* !RBD_DEBUG */ 429dfc5606dSYehuda Sadeh 4308b3e1a56SAlex Elder static void rbd_img_parent_read(struct rbd_obj_request *obj_request); 431b454e36dSAlex Elder static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request); 4328b3e1a56SAlex Elder 433117973fbSAlex Elder static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver); 434117973fbSAlex Elder static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver); 43559c2be1eSYehuda Sadeh 436602adf40SYehuda Sadeh static int rbd_open(struct block_device *bdev, fmode_t mode) 437602adf40SYehuda Sadeh { 438f0f8cef5SAlex Elder struct rbd_device *rbd_dev = bdev->bd_disk->private_data; 439b82d167bSAlex Elder bool removing = false; 440602adf40SYehuda Sadeh 441f84344f3SAlex Elder if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only) 442602adf40SYehuda Sadeh return -EROFS; 443602adf40SYehuda Sadeh 444a14ea269SAlex Elder spin_lock_irq(&rbd_dev->lock); 445b82d167bSAlex Elder if (test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags)) 446b82d167bSAlex Elder removing = true; 447b82d167bSAlex Elder else 448b82d167bSAlex Elder rbd_dev->open_count++; 449a14ea269SAlex Elder spin_unlock_irq(&rbd_dev->lock); 450b82d167bSAlex Elder if (removing) 451b82d167bSAlex Elder return -ENOENT; 452b82d167bSAlex Elder 45342382b70SAlex Elder mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); 454c3e946ceSAlex Elder (void) get_device(&rbd_dev->dev); 455f84344f3SAlex Elder set_device_ro(bdev, rbd_dev->mapping.read_only); 45642382b70SAlex Elder mutex_unlock(&ctl_mutex); 457340c7a2bSAlex Elder 458602adf40SYehuda Sadeh return 0; 459602adf40SYehuda Sadeh } 460602adf40SYehuda Sadeh 461dfc5606dSYehuda Sadeh static int rbd_release(struct gendisk *disk, fmode_t mode) 462dfc5606dSYehuda Sadeh { 463dfc5606dSYehuda Sadeh struct rbd_device *rbd_dev = disk->private_data; 464b82d167bSAlex Elder unsigned long open_count_before; 465b82d167bSAlex Elder 466a14ea269SAlex Elder spin_lock_irq(&rbd_dev->lock); 467b82d167bSAlex Elder open_count_before = rbd_dev->open_count--; 468a14ea269SAlex Elder spin_unlock_irq(&rbd_dev->lock); 469b82d167bSAlex Elder rbd_assert(open_count_before > 0); 470dfc5606dSYehuda Sadeh 47142382b70SAlex Elder mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); 472c3e946ceSAlex Elder put_device(&rbd_dev->dev); 47342382b70SAlex Elder mutex_unlock(&ctl_mutex); 474dfc5606dSYehuda Sadeh 475dfc5606dSYehuda Sadeh return 0; 476dfc5606dSYehuda Sadeh } 477dfc5606dSYehuda Sadeh 478602adf40SYehuda Sadeh static const struct block_device_operations rbd_bd_ops = { 479602adf40SYehuda Sadeh .owner = THIS_MODULE, 480602adf40SYehuda Sadeh .open = rbd_open, 481dfc5606dSYehuda Sadeh .release = rbd_release, 482602adf40SYehuda Sadeh }; 483602adf40SYehuda Sadeh 484602adf40SYehuda Sadeh /* 485602adf40SYehuda Sadeh * Initialize an rbd client instance. 48643ae4701SAlex Elder * We own *ceph_opts. 487602adf40SYehuda Sadeh */ 488f8c38929SAlex Elder static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts) 489602adf40SYehuda Sadeh { 490602adf40SYehuda Sadeh struct rbd_client *rbdc; 491602adf40SYehuda Sadeh int ret = -ENOMEM; 492602adf40SYehuda Sadeh 49337206ee5SAlex Elder dout("%s:\n", __func__); 494602adf40SYehuda Sadeh rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL); 495602adf40SYehuda Sadeh if (!rbdc) 496602adf40SYehuda Sadeh goto out_opt; 497602adf40SYehuda Sadeh 498602adf40SYehuda Sadeh kref_init(&rbdc->kref); 499602adf40SYehuda Sadeh INIT_LIST_HEAD(&rbdc->node); 500602adf40SYehuda Sadeh 501bc534d86SAlex Elder mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); 502bc534d86SAlex Elder 50343ae4701SAlex Elder rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0); 504602adf40SYehuda Sadeh if (IS_ERR(rbdc->client)) 505bc534d86SAlex Elder goto out_mutex; 50643ae4701SAlex Elder ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */ 507602adf40SYehuda Sadeh 508602adf40SYehuda Sadeh ret = ceph_open_session(rbdc->client); 509602adf40SYehuda Sadeh if (ret < 0) 510602adf40SYehuda Sadeh goto out_err; 511602adf40SYehuda Sadeh 512432b8587SAlex Elder spin_lock(&rbd_client_list_lock); 513602adf40SYehuda Sadeh list_add_tail(&rbdc->node, &rbd_client_list); 514432b8587SAlex Elder spin_unlock(&rbd_client_list_lock); 515602adf40SYehuda Sadeh 516bc534d86SAlex Elder mutex_unlock(&ctl_mutex); 51737206ee5SAlex Elder dout("%s: rbdc %p\n", __func__, rbdc); 518bc534d86SAlex Elder 519602adf40SYehuda Sadeh return rbdc; 520602adf40SYehuda Sadeh 521602adf40SYehuda Sadeh out_err: 522602adf40SYehuda Sadeh ceph_destroy_client(rbdc->client); 523bc534d86SAlex Elder out_mutex: 524bc534d86SAlex Elder mutex_unlock(&ctl_mutex); 525602adf40SYehuda Sadeh kfree(rbdc); 526602adf40SYehuda Sadeh out_opt: 52743ae4701SAlex Elder if (ceph_opts) 52843ae4701SAlex Elder ceph_destroy_options(ceph_opts); 52937206ee5SAlex Elder dout("%s: error %d\n", __func__, ret); 53037206ee5SAlex Elder 53128f259b7SVasiliy Kulikov return ERR_PTR(ret); 532602adf40SYehuda Sadeh } 533602adf40SYehuda Sadeh 5342f82ee54SAlex Elder static struct rbd_client *__rbd_get_client(struct rbd_client *rbdc) 5352f82ee54SAlex Elder { 5362f82ee54SAlex Elder kref_get(&rbdc->kref); 5372f82ee54SAlex Elder 5382f82ee54SAlex Elder return rbdc; 5392f82ee54SAlex Elder } 5402f82ee54SAlex Elder 541602adf40SYehuda Sadeh /* 5421f7ba331SAlex Elder * Find a ceph client with specific addr and configuration. If 5431f7ba331SAlex Elder * found, bump its reference count. 544602adf40SYehuda Sadeh */ 5451f7ba331SAlex Elder static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts) 546602adf40SYehuda Sadeh { 547602adf40SYehuda Sadeh struct rbd_client *client_node; 5481f7ba331SAlex Elder bool found = false; 549602adf40SYehuda Sadeh 55043ae4701SAlex Elder if (ceph_opts->flags & CEPH_OPT_NOSHARE) 551602adf40SYehuda Sadeh return NULL; 552602adf40SYehuda Sadeh 5531f7ba331SAlex Elder spin_lock(&rbd_client_list_lock); 5541f7ba331SAlex Elder list_for_each_entry(client_node, &rbd_client_list, node) { 5551f7ba331SAlex Elder if (!ceph_compare_options(ceph_opts, client_node->client)) { 5562f82ee54SAlex Elder __rbd_get_client(client_node); 5572f82ee54SAlex Elder 5581f7ba331SAlex Elder found = true; 5591f7ba331SAlex Elder break; 5601f7ba331SAlex Elder } 5611f7ba331SAlex Elder } 5621f7ba331SAlex Elder spin_unlock(&rbd_client_list_lock); 5631f7ba331SAlex Elder 5641f7ba331SAlex Elder return found ? client_node : NULL; 565602adf40SYehuda Sadeh } 566602adf40SYehuda Sadeh 567602adf40SYehuda Sadeh /* 56859c2be1eSYehuda Sadeh * mount options 56959c2be1eSYehuda Sadeh */ 57059c2be1eSYehuda Sadeh enum { 57159c2be1eSYehuda Sadeh Opt_last_int, 57259c2be1eSYehuda Sadeh /* int args above */ 57359c2be1eSYehuda Sadeh Opt_last_string, 57459c2be1eSYehuda Sadeh /* string args above */ 575cc0538b6SAlex Elder Opt_read_only, 576cc0538b6SAlex Elder Opt_read_write, 577cc0538b6SAlex Elder /* Boolean args above */ 578cc0538b6SAlex Elder Opt_last_bool, 57959c2be1eSYehuda Sadeh }; 58059c2be1eSYehuda Sadeh 58143ae4701SAlex Elder static match_table_t rbd_opts_tokens = { 58259c2be1eSYehuda Sadeh /* int args above */ 58359c2be1eSYehuda Sadeh /* string args above */ 584be466c1cSAlex Elder {Opt_read_only, "read_only"}, 585cc0538b6SAlex Elder {Opt_read_only, "ro"}, /* Alternate spelling */ 586cc0538b6SAlex Elder {Opt_read_write, "read_write"}, 587cc0538b6SAlex Elder {Opt_read_write, "rw"}, /* Alternate spelling */ 588cc0538b6SAlex Elder /* Boolean args above */ 58959c2be1eSYehuda Sadeh {-1, NULL} 59059c2be1eSYehuda Sadeh }; 59159c2be1eSYehuda Sadeh 59298571b5aSAlex Elder struct rbd_options { 59398571b5aSAlex Elder bool read_only; 59498571b5aSAlex Elder }; 59598571b5aSAlex Elder 59698571b5aSAlex Elder #define RBD_READ_ONLY_DEFAULT false 59798571b5aSAlex Elder 59859c2be1eSYehuda Sadeh static int parse_rbd_opts_token(char *c, void *private) 59959c2be1eSYehuda Sadeh { 60043ae4701SAlex Elder struct rbd_options *rbd_opts = private; 60159c2be1eSYehuda Sadeh substring_t argstr[MAX_OPT_ARGS]; 60259c2be1eSYehuda Sadeh int token, intval, ret; 60359c2be1eSYehuda Sadeh 60443ae4701SAlex Elder token = match_token(c, rbd_opts_tokens, argstr); 60559c2be1eSYehuda Sadeh if (token < 0) 60659c2be1eSYehuda Sadeh return -EINVAL; 60759c2be1eSYehuda Sadeh 60859c2be1eSYehuda Sadeh if (token < Opt_last_int) { 60959c2be1eSYehuda Sadeh ret = match_int(&argstr[0], &intval); 61059c2be1eSYehuda Sadeh if (ret < 0) { 61159c2be1eSYehuda Sadeh pr_err("bad mount option arg (not int) " 61259c2be1eSYehuda Sadeh "at '%s'\n", c); 61359c2be1eSYehuda Sadeh return ret; 61459c2be1eSYehuda Sadeh } 61559c2be1eSYehuda Sadeh dout("got int token %d val %d\n", token, intval); 61659c2be1eSYehuda Sadeh } else if (token > Opt_last_int && token < Opt_last_string) { 61759c2be1eSYehuda Sadeh dout("got string token %d val %s\n", token, 61859c2be1eSYehuda Sadeh argstr[0].from); 619cc0538b6SAlex Elder } else if (token > Opt_last_string && token < Opt_last_bool) { 620cc0538b6SAlex Elder dout("got Boolean token %d\n", token); 62159c2be1eSYehuda Sadeh } else { 62259c2be1eSYehuda Sadeh dout("got token %d\n", token); 62359c2be1eSYehuda Sadeh } 62459c2be1eSYehuda Sadeh 62559c2be1eSYehuda Sadeh switch (token) { 626cc0538b6SAlex Elder case Opt_read_only: 627cc0538b6SAlex Elder rbd_opts->read_only = true; 628cc0538b6SAlex Elder break; 629cc0538b6SAlex Elder case Opt_read_write: 630cc0538b6SAlex Elder rbd_opts->read_only = false; 631cc0538b6SAlex Elder break; 63259c2be1eSYehuda Sadeh default: 633aafb230eSAlex Elder rbd_assert(false); 634aafb230eSAlex Elder break; 63559c2be1eSYehuda Sadeh } 63659c2be1eSYehuda Sadeh return 0; 63759c2be1eSYehuda Sadeh } 63859c2be1eSYehuda Sadeh 63959c2be1eSYehuda Sadeh /* 640602adf40SYehuda Sadeh * Get a ceph client with specific addr and configuration, if one does 641602adf40SYehuda Sadeh * not exist create it. 642602adf40SYehuda Sadeh */ 6439d3997fdSAlex Elder static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts) 644602adf40SYehuda Sadeh { 645f8c38929SAlex Elder struct rbd_client *rbdc; 64659c2be1eSYehuda Sadeh 6471f7ba331SAlex Elder rbdc = rbd_client_find(ceph_opts); 6489d3997fdSAlex Elder if (rbdc) /* using an existing client */ 64943ae4701SAlex Elder ceph_destroy_options(ceph_opts); 6509d3997fdSAlex Elder else 651f8c38929SAlex Elder rbdc = rbd_client_create(ceph_opts); 652d720bcb0SAlex Elder 6539d3997fdSAlex Elder return rbdc; 654602adf40SYehuda Sadeh } 655602adf40SYehuda Sadeh 656602adf40SYehuda Sadeh /* 657602adf40SYehuda Sadeh * Destroy ceph client 658d23a4b3fSAlex Elder * 659432b8587SAlex Elder * Caller must hold rbd_client_list_lock. 660602adf40SYehuda Sadeh */ 661602adf40SYehuda Sadeh static void rbd_client_release(struct kref *kref) 662602adf40SYehuda Sadeh { 663602adf40SYehuda Sadeh struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref); 664602adf40SYehuda Sadeh 66537206ee5SAlex Elder dout("%s: rbdc %p\n", __func__, rbdc); 666cd9d9f5dSAlex Elder spin_lock(&rbd_client_list_lock); 667602adf40SYehuda Sadeh list_del(&rbdc->node); 668cd9d9f5dSAlex Elder spin_unlock(&rbd_client_list_lock); 669602adf40SYehuda Sadeh 670602adf40SYehuda Sadeh ceph_destroy_client(rbdc->client); 671602adf40SYehuda Sadeh kfree(rbdc); 672602adf40SYehuda Sadeh } 673602adf40SYehuda Sadeh 674602adf40SYehuda Sadeh /* 675602adf40SYehuda Sadeh * Drop reference to ceph client node. If it's not referenced anymore, release 676602adf40SYehuda Sadeh * it. 677602adf40SYehuda Sadeh */ 6789d3997fdSAlex Elder static void rbd_put_client(struct rbd_client *rbdc) 679602adf40SYehuda Sadeh { 680c53d5893SAlex Elder if (rbdc) 6819d3997fdSAlex Elder kref_put(&rbdc->kref, rbd_client_release); 682602adf40SYehuda Sadeh } 683602adf40SYehuda Sadeh 684a30b71b9SAlex Elder static bool rbd_image_format_valid(u32 image_format) 685a30b71b9SAlex Elder { 686a30b71b9SAlex Elder return image_format == 1 || image_format == 2; 687a30b71b9SAlex Elder } 688a30b71b9SAlex Elder 6898e94af8eSAlex Elder static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk) 6908e94af8eSAlex Elder { 691103a150fSAlex Elder size_t size; 692103a150fSAlex Elder u32 snap_count; 693103a150fSAlex Elder 694103a150fSAlex Elder /* The header has to start with the magic rbd header text */ 695103a150fSAlex Elder if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT))) 696103a150fSAlex Elder return false; 697103a150fSAlex Elder 698db2388b6SAlex Elder /* The bio layer requires at least sector-sized I/O */ 699db2388b6SAlex Elder 700db2388b6SAlex Elder if (ondisk->options.order < SECTOR_SHIFT) 701db2388b6SAlex Elder return false; 702db2388b6SAlex Elder 703db2388b6SAlex Elder /* If we use u64 in a few spots we may be able to loosen this */ 704db2388b6SAlex Elder 705db2388b6SAlex Elder if (ondisk->options.order > 8 * sizeof (int) - 1) 706db2388b6SAlex Elder return false; 707db2388b6SAlex Elder 708103a150fSAlex Elder /* 709103a150fSAlex Elder * The size of a snapshot header has to fit in a size_t, and 710103a150fSAlex Elder * that limits the number of snapshots. 711103a150fSAlex Elder */ 712103a150fSAlex Elder snap_count = le32_to_cpu(ondisk->snap_count); 713103a150fSAlex Elder size = SIZE_MAX - sizeof (struct ceph_snap_context); 714103a150fSAlex Elder if (snap_count > size / sizeof (__le64)) 715103a150fSAlex Elder return false; 716103a150fSAlex Elder 717103a150fSAlex Elder /* 718103a150fSAlex Elder * Not only that, but the size of the entire the snapshot 719103a150fSAlex Elder * header must also be representable in a size_t. 720103a150fSAlex Elder */ 721103a150fSAlex Elder size -= snap_count * sizeof (__le64); 722103a150fSAlex Elder if ((u64) size < le64_to_cpu(ondisk->snap_names_len)) 723103a150fSAlex Elder return false; 724103a150fSAlex Elder 725103a150fSAlex Elder return true; 7268e94af8eSAlex Elder } 7278e94af8eSAlex Elder 728602adf40SYehuda Sadeh /* 729602adf40SYehuda Sadeh * Create a new header structure, translate header format from the on-disk 730602adf40SYehuda Sadeh * header. 731602adf40SYehuda Sadeh */ 732602adf40SYehuda Sadeh static int rbd_header_from_disk(struct rbd_image_header *header, 7334156d998SAlex Elder struct rbd_image_header_ondisk *ondisk) 734602adf40SYehuda Sadeh { 735ccece235SAlex Elder u32 snap_count; 73658c17b0eSAlex Elder size_t len; 737d2bb24e5SAlex Elder size_t size; 738621901d6SAlex Elder u32 i; 739602adf40SYehuda Sadeh 7406a52325fSAlex Elder memset(header, 0, sizeof (*header)); 7416a52325fSAlex Elder 742103a150fSAlex Elder snap_count = le32_to_cpu(ondisk->snap_count); 743103a150fSAlex Elder 74458c17b0eSAlex Elder len = strnlen(ondisk->object_prefix, sizeof (ondisk->object_prefix)); 74558c17b0eSAlex Elder header->object_prefix = kmalloc(len + 1, GFP_KERNEL); 7466a52325fSAlex Elder if (!header->object_prefix) 747602adf40SYehuda Sadeh return -ENOMEM; 74858c17b0eSAlex Elder memcpy(header->object_prefix, ondisk->object_prefix, len); 74958c17b0eSAlex Elder header->object_prefix[len] = '\0'; 75000f1f36fSAlex Elder 751602adf40SYehuda Sadeh if (snap_count) { 752f785cc1dSAlex Elder u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len); 753f785cc1dSAlex Elder 754621901d6SAlex Elder /* Save a copy of the snapshot names */ 755621901d6SAlex Elder 756f785cc1dSAlex Elder if (snap_names_len > (u64) SIZE_MAX) 757f785cc1dSAlex Elder return -EIO; 758f785cc1dSAlex Elder header->snap_names = kmalloc(snap_names_len, GFP_KERNEL); 759602adf40SYehuda Sadeh if (!header->snap_names) 7606a52325fSAlex Elder goto out_err; 761f785cc1dSAlex Elder /* 762f785cc1dSAlex Elder * Note that rbd_dev_v1_header_read() guarantees 763f785cc1dSAlex Elder * the ondisk buffer we're working with has 764f785cc1dSAlex Elder * snap_names_len bytes beyond the end of the 765f785cc1dSAlex Elder * snapshot id array, this memcpy() is safe. 766f785cc1dSAlex Elder */ 767f785cc1dSAlex Elder memcpy(header->snap_names, &ondisk->snaps[snap_count], 768f785cc1dSAlex Elder snap_names_len); 7696a52325fSAlex Elder 770621901d6SAlex Elder /* Record each snapshot's size */ 771621901d6SAlex Elder 772d2bb24e5SAlex Elder size = snap_count * sizeof (*header->snap_sizes); 773d2bb24e5SAlex Elder header->snap_sizes = kmalloc(size, GFP_KERNEL); 774602adf40SYehuda Sadeh if (!header->snap_sizes) 7756a52325fSAlex Elder goto out_err; 776621901d6SAlex Elder for (i = 0; i < snap_count; i++) 777621901d6SAlex Elder header->snap_sizes[i] = 778621901d6SAlex Elder le64_to_cpu(ondisk->snaps[i].image_size); 779602adf40SYehuda Sadeh } else { 780ccece235SAlex Elder WARN_ON(ondisk->snap_names_len); 781602adf40SYehuda Sadeh header->snap_names = NULL; 782602adf40SYehuda Sadeh header->snap_sizes = NULL; 783602adf40SYehuda Sadeh } 784849b4260SAlex Elder 78534b13184SAlex Elder header->features = 0; /* No features support in v1 images */ 786602adf40SYehuda Sadeh header->obj_order = ondisk->options.order; 787602adf40SYehuda Sadeh header->crypt_type = ondisk->options.crypt_type; 788602adf40SYehuda Sadeh header->comp_type = ondisk->options.comp_type; 7896a52325fSAlex Elder 790621901d6SAlex Elder /* Allocate and fill in the snapshot context */ 791621901d6SAlex Elder 792f84344f3SAlex Elder header->image_size = le64_to_cpu(ondisk->image_size); 7936a52325fSAlex Elder size = sizeof (struct ceph_snap_context); 7946a52325fSAlex Elder size += snap_count * sizeof (header->snapc->snaps[0]); 7956a52325fSAlex Elder header->snapc = kzalloc(size, GFP_KERNEL); 7966a52325fSAlex Elder if (!header->snapc) 7976a52325fSAlex Elder goto out_err; 798602adf40SYehuda Sadeh 799602adf40SYehuda Sadeh atomic_set(&header->snapc->nref, 1); 800505cbb9bSAlex Elder header->snapc->seq = le64_to_cpu(ondisk->snap_seq); 801602adf40SYehuda Sadeh header->snapc->num_snaps = snap_count; 802621901d6SAlex Elder for (i = 0; i < snap_count; i++) 803602adf40SYehuda Sadeh header->snapc->snaps[i] = 804602adf40SYehuda Sadeh le64_to_cpu(ondisk->snaps[i].id); 805602adf40SYehuda Sadeh 806602adf40SYehuda Sadeh return 0; 807602adf40SYehuda Sadeh 8086a52325fSAlex Elder out_err: 809849b4260SAlex Elder kfree(header->snap_sizes); 810ccece235SAlex Elder header->snap_sizes = NULL; 811602adf40SYehuda Sadeh kfree(header->snap_names); 812ccece235SAlex Elder header->snap_names = NULL; 8136a52325fSAlex Elder kfree(header->object_prefix); 8146a52325fSAlex Elder header->object_prefix = NULL; 815ccece235SAlex Elder 81600f1f36fSAlex Elder return -ENOMEM; 817602adf40SYehuda Sadeh } 818602adf40SYehuda Sadeh 8199e15b77dSAlex Elder static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id) 8209e15b77dSAlex Elder { 8219e15b77dSAlex Elder struct rbd_snap *snap; 8229e15b77dSAlex Elder 8239e15b77dSAlex Elder if (snap_id == CEPH_NOSNAP) 8249e15b77dSAlex Elder return RBD_SNAP_HEAD_NAME; 8259e15b77dSAlex Elder 8269e15b77dSAlex Elder list_for_each_entry(snap, &rbd_dev->snaps, node) 8279e15b77dSAlex Elder if (snap_id == snap->id) 8289e15b77dSAlex Elder return snap->name; 8299e15b77dSAlex Elder 8309e15b77dSAlex Elder return NULL; 8319e15b77dSAlex Elder } 8329e15b77dSAlex Elder 8338836b995SAlex Elder static int snap_by_name(struct rbd_device *rbd_dev, const char *snap_name) 834602adf40SYehuda Sadeh { 835602adf40SYehuda Sadeh 836e86924a8SAlex Elder struct rbd_snap *snap; 83700f1f36fSAlex Elder 838e86924a8SAlex Elder list_for_each_entry(snap, &rbd_dev->snaps, node) { 839e86924a8SAlex Elder if (!strcmp(snap_name, snap->name)) { 8400d7dbfceSAlex Elder rbd_dev->spec->snap_id = snap->id; 841e86924a8SAlex Elder rbd_dev->mapping.size = snap->size; 84234b13184SAlex Elder rbd_dev->mapping.features = snap->features; 84300f1f36fSAlex Elder 844e86924a8SAlex Elder return 0; 845602adf40SYehuda Sadeh } 84600f1f36fSAlex Elder } 847e86924a8SAlex Elder 84800f1f36fSAlex Elder return -ENOENT; 84900f1f36fSAlex Elder } 850602adf40SYehuda Sadeh 851819d52bfSAlex Elder static int rbd_dev_set_mapping(struct rbd_device *rbd_dev) 852602adf40SYehuda Sadeh { 85378dc447dSAlex Elder int ret; 854602adf40SYehuda Sadeh 8550d7dbfceSAlex Elder if (!memcmp(rbd_dev->spec->snap_name, RBD_SNAP_HEAD_NAME, 856cc9d734cSJosh Durgin sizeof (RBD_SNAP_HEAD_NAME))) { 8570d7dbfceSAlex Elder rbd_dev->spec->snap_id = CEPH_NOSNAP; 85899c1f08fSAlex Elder rbd_dev->mapping.size = rbd_dev->header.image_size; 85934b13184SAlex Elder rbd_dev->mapping.features = rbd_dev->header.features; 860e86924a8SAlex Elder ret = 0; 861602adf40SYehuda Sadeh } else { 8620d7dbfceSAlex Elder ret = snap_by_name(rbd_dev, rbd_dev->spec->snap_name); 863602adf40SYehuda Sadeh if (ret < 0) 864602adf40SYehuda Sadeh goto done; 865f84344f3SAlex Elder rbd_dev->mapping.read_only = true; 866602adf40SYehuda Sadeh } 8676d292906SAlex Elder set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags); 8686d292906SAlex Elder 869602adf40SYehuda Sadeh done: 870602adf40SYehuda Sadeh return ret; 871602adf40SYehuda Sadeh } 872602adf40SYehuda Sadeh 873602adf40SYehuda Sadeh static void rbd_header_free(struct rbd_image_header *header) 874602adf40SYehuda Sadeh { 875849b4260SAlex Elder kfree(header->object_prefix); 876d78fd7aeSAlex Elder header->object_prefix = NULL; 877602adf40SYehuda Sadeh kfree(header->snap_sizes); 878d78fd7aeSAlex Elder header->snap_sizes = NULL; 879849b4260SAlex Elder kfree(header->snap_names); 880d78fd7aeSAlex Elder header->snap_names = NULL; 881d1d25646SJosh Durgin ceph_put_snap_context(header->snapc); 882d78fd7aeSAlex Elder header->snapc = NULL; 883602adf40SYehuda Sadeh } 884602adf40SYehuda Sadeh 88598571b5aSAlex Elder static const char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset) 886602adf40SYehuda Sadeh { 88765ccfe21SAlex Elder char *name; 88865ccfe21SAlex Elder u64 segment; 88965ccfe21SAlex Elder int ret; 890602adf40SYehuda Sadeh 8912fd82b9eSAlex Elder name = kmalloc(MAX_OBJ_NAME_SIZE + 1, GFP_NOIO); 89265ccfe21SAlex Elder if (!name) 89365ccfe21SAlex Elder return NULL; 89465ccfe21SAlex Elder segment = offset >> rbd_dev->header.obj_order; 8952fd82b9eSAlex Elder ret = snprintf(name, MAX_OBJ_NAME_SIZE + 1, "%s.%012llx", 89665ccfe21SAlex Elder rbd_dev->header.object_prefix, segment); 8972fd82b9eSAlex Elder if (ret < 0 || ret > MAX_OBJ_NAME_SIZE) { 89865ccfe21SAlex Elder pr_err("error formatting segment name for #%llu (%d)\n", 89965ccfe21SAlex Elder segment, ret); 90065ccfe21SAlex Elder kfree(name); 90165ccfe21SAlex Elder name = NULL; 90265ccfe21SAlex Elder } 903602adf40SYehuda Sadeh 90465ccfe21SAlex Elder return name; 90565ccfe21SAlex Elder } 906602adf40SYehuda Sadeh 90765ccfe21SAlex Elder static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset) 90865ccfe21SAlex Elder { 90965ccfe21SAlex Elder u64 segment_size = (u64) 1 << rbd_dev->header.obj_order; 910602adf40SYehuda Sadeh 91165ccfe21SAlex Elder return offset & (segment_size - 1); 91265ccfe21SAlex Elder } 91365ccfe21SAlex Elder 91465ccfe21SAlex Elder static u64 rbd_segment_length(struct rbd_device *rbd_dev, 91565ccfe21SAlex Elder u64 offset, u64 length) 91665ccfe21SAlex Elder { 91765ccfe21SAlex Elder u64 segment_size = (u64) 1 << rbd_dev->header.obj_order; 91865ccfe21SAlex Elder 91965ccfe21SAlex Elder offset &= segment_size - 1; 92065ccfe21SAlex Elder 921aafb230eSAlex Elder rbd_assert(length <= U64_MAX - offset); 92265ccfe21SAlex Elder if (offset + length > segment_size) 92365ccfe21SAlex Elder length = segment_size - offset; 92465ccfe21SAlex Elder 92565ccfe21SAlex Elder return length; 926602adf40SYehuda Sadeh } 927602adf40SYehuda Sadeh 928602adf40SYehuda Sadeh /* 929029bcbd8SJosh Durgin * returns the size of an object in the image 930029bcbd8SJosh Durgin */ 931029bcbd8SJosh Durgin static u64 rbd_obj_bytes(struct rbd_image_header *header) 932029bcbd8SJosh Durgin { 933029bcbd8SJosh Durgin return 1 << header->obj_order; 934029bcbd8SJosh Durgin } 935029bcbd8SJosh Durgin 936029bcbd8SJosh Durgin /* 937602adf40SYehuda Sadeh * bio helpers 938602adf40SYehuda Sadeh */ 939602adf40SYehuda Sadeh 940602adf40SYehuda Sadeh static void bio_chain_put(struct bio *chain) 941602adf40SYehuda Sadeh { 942602adf40SYehuda Sadeh struct bio *tmp; 943602adf40SYehuda Sadeh 944602adf40SYehuda Sadeh while (chain) { 945602adf40SYehuda Sadeh tmp = chain; 946602adf40SYehuda Sadeh chain = chain->bi_next; 947602adf40SYehuda Sadeh bio_put(tmp); 948602adf40SYehuda Sadeh } 949602adf40SYehuda Sadeh } 950602adf40SYehuda Sadeh 951602adf40SYehuda Sadeh /* 952602adf40SYehuda Sadeh * zeros a bio chain, starting at specific offset 953602adf40SYehuda Sadeh */ 954602adf40SYehuda Sadeh static void zero_bio_chain(struct bio *chain, int start_ofs) 955602adf40SYehuda Sadeh { 956602adf40SYehuda Sadeh struct bio_vec *bv; 957602adf40SYehuda Sadeh unsigned long flags; 958602adf40SYehuda Sadeh void *buf; 959602adf40SYehuda Sadeh int i; 960602adf40SYehuda Sadeh int pos = 0; 961602adf40SYehuda Sadeh 962602adf40SYehuda Sadeh while (chain) { 963602adf40SYehuda Sadeh bio_for_each_segment(bv, chain, i) { 964602adf40SYehuda Sadeh if (pos + bv->bv_len > start_ofs) { 965602adf40SYehuda Sadeh int remainder = max(start_ofs - pos, 0); 966602adf40SYehuda Sadeh buf = bvec_kmap_irq(bv, &flags); 967602adf40SYehuda Sadeh memset(buf + remainder, 0, 968602adf40SYehuda Sadeh bv->bv_len - remainder); 96985b5aaa6SDan Carpenter bvec_kunmap_irq(buf, &flags); 970602adf40SYehuda Sadeh } 971602adf40SYehuda Sadeh pos += bv->bv_len; 972602adf40SYehuda Sadeh } 973602adf40SYehuda Sadeh 974602adf40SYehuda Sadeh chain = chain->bi_next; 975602adf40SYehuda Sadeh } 976602adf40SYehuda Sadeh } 977602adf40SYehuda Sadeh 978602adf40SYehuda Sadeh /* 979b9434c5bSAlex Elder * similar to zero_bio_chain(), zeros data defined by a page array, 980b9434c5bSAlex Elder * starting at the given byte offset from the start of the array and 981b9434c5bSAlex Elder * continuing up to the given end offset. The pages array is 982b9434c5bSAlex Elder * assumed to be big enough to hold all bytes up to the end. 983b9434c5bSAlex Elder */ 984b9434c5bSAlex Elder static void zero_pages(struct page **pages, u64 offset, u64 end) 985b9434c5bSAlex Elder { 986b9434c5bSAlex Elder struct page **page = &pages[offset >> PAGE_SHIFT]; 987b9434c5bSAlex Elder 988b9434c5bSAlex Elder rbd_assert(end > offset); 989b9434c5bSAlex Elder rbd_assert(end - offset <= (u64)SIZE_MAX); 990b9434c5bSAlex Elder while (offset < end) { 991b9434c5bSAlex Elder size_t page_offset; 992b9434c5bSAlex Elder size_t length; 993b9434c5bSAlex Elder unsigned long flags; 994b9434c5bSAlex Elder void *kaddr; 995b9434c5bSAlex Elder 996b9434c5bSAlex Elder page_offset = (size_t)(offset & ~PAGE_MASK); 997b9434c5bSAlex Elder length = min(PAGE_SIZE - page_offset, (size_t)(end - offset)); 998b9434c5bSAlex Elder local_irq_save(flags); 999b9434c5bSAlex Elder kaddr = kmap_atomic(*page); 1000b9434c5bSAlex Elder memset(kaddr + page_offset, 0, length); 1001b9434c5bSAlex Elder kunmap_atomic(kaddr); 1002b9434c5bSAlex Elder local_irq_restore(flags); 1003b9434c5bSAlex Elder 1004b9434c5bSAlex Elder offset += length; 1005b9434c5bSAlex Elder page++; 1006b9434c5bSAlex Elder } 1007b9434c5bSAlex Elder } 1008b9434c5bSAlex Elder 1009b9434c5bSAlex Elder /* 1010f7760dadSAlex Elder * Clone a portion of a bio, starting at the given byte offset 1011f7760dadSAlex Elder * and continuing for the number of bytes indicated. 1012602adf40SYehuda Sadeh */ 1013f7760dadSAlex Elder static struct bio *bio_clone_range(struct bio *bio_src, 1014f7760dadSAlex Elder unsigned int offset, 1015f7760dadSAlex Elder unsigned int len, 1016f7760dadSAlex Elder gfp_t gfpmask) 1017602adf40SYehuda Sadeh { 1018f7760dadSAlex Elder struct bio_vec *bv; 1019f7760dadSAlex Elder unsigned int resid; 1020f7760dadSAlex Elder unsigned short idx; 1021f7760dadSAlex Elder unsigned int voff; 1022f7760dadSAlex Elder unsigned short end_idx; 1023f7760dadSAlex Elder unsigned short vcnt; 1024f7760dadSAlex Elder struct bio *bio; 1025602adf40SYehuda Sadeh 1026f7760dadSAlex Elder /* Handle the easy case for the caller */ 1027f7760dadSAlex Elder 1028f7760dadSAlex Elder if (!offset && len == bio_src->bi_size) 1029f7760dadSAlex Elder return bio_clone(bio_src, gfpmask); 1030f7760dadSAlex Elder 1031f7760dadSAlex Elder if (WARN_ON_ONCE(!len)) 1032f7760dadSAlex Elder return NULL; 1033f7760dadSAlex Elder if (WARN_ON_ONCE(len > bio_src->bi_size)) 1034f7760dadSAlex Elder return NULL; 1035f7760dadSAlex Elder if (WARN_ON_ONCE(offset > bio_src->bi_size - len)) 1036f7760dadSAlex Elder return NULL; 1037f7760dadSAlex Elder 1038f7760dadSAlex Elder /* Find first affected segment... */ 1039f7760dadSAlex Elder 1040f7760dadSAlex Elder resid = offset; 1041f7760dadSAlex Elder __bio_for_each_segment(bv, bio_src, idx, 0) { 1042f7760dadSAlex Elder if (resid < bv->bv_len) 1043f7760dadSAlex Elder break; 1044f7760dadSAlex Elder resid -= bv->bv_len; 1045602adf40SYehuda Sadeh } 1046f7760dadSAlex Elder voff = resid; 1047602adf40SYehuda Sadeh 1048f7760dadSAlex Elder /* ...and the last affected segment */ 1049542582fcSAlex Elder 1050f7760dadSAlex Elder resid += len; 1051f7760dadSAlex Elder __bio_for_each_segment(bv, bio_src, end_idx, idx) { 1052f7760dadSAlex Elder if (resid <= bv->bv_len) 1053f7760dadSAlex Elder break; 1054f7760dadSAlex Elder resid -= bv->bv_len; 1055f7760dadSAlex Elder } 1056f7760dadSAlex Elder vcnt = end_idx - idx + 1; 1057602adf40SYehuda Sadeh 1058f7760dadSAlex Elder /* Build the clone */ 1059f7760dadSAlex Elder 1060f7760dadSAlex Elder bio = bio_alloc(gfpmask, (unsigned int) vcnt); 1061f7760dadSAlex Elder if (!bio) 1062f7760dadSAlex Elder return NULL; /* ENOMEM */ 1063f7760dadSAlex Elder 1064f7760dadSAlex Elder bio->bi_bdev = bio_src->bi_bdev; 1065f7760dadSAlex Elder bio->bi_sector = bio_src->bi_sector + (offset >> SECTOR_SHIFT); 1066f7760dadSAlex Elder bio->bi_rw = bio_src->bi_rw; 1067f7760dadSAlex Elder bio->bi_flags |= 1 << BIO_CLONED; 1068602adf40SYehuda Sadeh 1069602adf40SYehuda Sadeh /* 1070f7760dadSAlex Elder * Copy over our part of the bio_vec, then update the first 1071f7760dadSAlex Elder * and last (or only) entries. 1072602adf40SYehuda Sadeh */ 1073f7760dadSAlex Elder memcpy(&bio->bi_io_vec[0], &bio_src->bi_io_vec[idx], 1074f7760dadSAlex Elder vcnt * sizeof (struct bio_vec)); 1075f7760dadSAlex Elder bio->bi_io_vec[0].bv_offset += voff; 1076f7760dadSAlex Elder if (vcnt > 1) { 1077f7760dadSAlex Elder bio->bi_io_vec[0].bv_len -= voff; 1078f7760dadSAlex Elder bio->bi_io_vec[vcnt - 1].bv_len = resid; 1079602adf40SYehuda Sadeh } else { 1080f7760dadSAlex Elder bio->bi_io_vec[0].bv_len = len; 1081602adf40SYehuda Sadeh } 1082602adf40SYehuda Sadeh 1083f7760dadSAlex Elder bio->bi_vcnt = vcnt; 1084f7760dadSAlex Elder bio->bi_size = len; 1085f7760dadSAlex Elder bio->bi_idx = 0; 1086602adf40SYehuda Sadeh 1087f7760dadSAlex Elder return bio; 1088602adf40SYehuda Sadeh } 1089602adf40SYehuda Sadeh 1090f7760dadSAlex Elder /* 1091f7760dadSAlex Elder * Clone a portion of a bio chain, starting at the given byte offset 1092f7760dadSAlex Elder * into the first bio in the source chain and continuing for the 1093f7760dadSAlex Elder * number of bytes indicated. The result is another bio chain of 1094f7760dadSAlex Elder * exactly the given length, or a null pointer on error. 1095f7760dadSAlex Elder * 1096f7760dadSAlex Elder * The bio_src and offset parameters are both in-out. On entry they 1097f7760dadSAlex Elder * refer to the first source bio and the offset into that bio where 1098f7760dadSAlex Elder * the start of data to be cloned is located. 1099f7760dadSAlex Elder * 1100f7760dadSAlex Elder * On return, bio_src is updated to refer to the bio in the source 1101f7760dadSAlex Elder * chain that contains first un-cloned byte, and *offset will 1102f7760dadSAlex Elder * contain the offset of that byte within that bio. 1103f7760dadSAlex Elder */ 1104f7760dadSAlex Elder static struct bio *bio_chain_clone_range(struct bio **bio_src, 1105f7760dadSAlex Elder unsigned int *offset, 1106f7760dadSAlex Elder unsigned int len, 1107f7760dadSAlex Elder gfp_t gfpmask) 1108f7760dadSAlex Elder { 1109f7760dadSAlex Elder struct bio *bi = *bio_src; 1110f7760dadSAlex Elder unsigned int off = *offset; 1111f7760dadSAlex Elder struct bio *chain = NULL; 1112f7760dadSAlex Elder struct bio **end; 1113602adf40SYehuda Sadeh 1114f7760dadSAlex Elder /* Build up a chain of clone bios up to the limit */ 1115602adf40SYehuda Sadeh 1116f7760dadSAlex Elder if (!bi || off >= bi->bi_size || !len) 1117f7760dadSAlex Elder return NULL; /* Nothing to clone */ 1118602adf40SYehuda Sadeh 1119f7760dadSAlex Elder end = &chain; 1120f7760dadSAlex Elder while (len) { 1121f7760dadSAlex Elder unsigned int bi_size; 1122f7760dadSAlex Elder struct bio *bio; 1123f7760dadSAlex Elder 1124f5400b7aSAlex Elder if (!bi) { 1125f5400b7aSAlex Elder rbd_warn(NULL, "bio_chain exhausted with %u left", len); 1126f7760dadSAlex Elder goto out_err; /* EINVAL; ran out of bio's */ 1127f5400b7aSAlex Elder } 1128f7760dadSAlex Elder bi_size = min_t(unsigned int, bi->bi_size - off, len); 1129f7760dadSAlex Elder bio = bio_clone_range(bi, off, bi_size, gfpmask); 1130f7760dadSAlex Elder if (!bio) 1131f7760dadSAlex Elder goto out_err; /* ENOMEM */ 1132f7760dadSAlex Elder 1133f7760dadSAlex Elder *end = bio; 1134f7760dadSAlex Elder end = &bio->bi_next; 1135f7760dadSAlex Elder 1136f7760dadSAlex Elder off += bi_size; 1137f7760dadSAlex Elder if (off == bi->bi_size) { 1138f7760dadSAlex Elder bi = bi->bi_next; 1139f7760dadSAlex Elder off = 0; 1140f7760dadSAlex Elder } 1141f7760dadSAlex Elder len -= bi_size; 1142f7760dadSAlex Elder } 1143f7760dadSAlex Elder *bio_src = bi; 1144f7760dadSAlex Elder *offset = off; 1145f7760dadSAlex Elder 1146f7760dadSAlex Elder return chain; 1147f7760dadSAlex Elder out_err: 1148f7760dadSAlex Elder bio_chain_put(chain); 1149f7760dadSAlex Elder 1150602adf40SYehuda Sadeh return NULL; 1151602adf40SYehuda Sadeh } 1152602adf40SYehuda Sadeh 1153926f9b3fSAlex Elder /* 1154926f9b3fSAlex Elder * The default/initial value for all object request flags is 0. For 1155926f9b3fSAlex Elder * each flag, once its value is set to 1 it is never reset to 0 1156926f9b3fSAlex Elder * again. 1157926f9b3fSAlex Elder */ 11586365d33aSAlex Elder static void obj_request_img_data_set(struct rbd_obj_request *obj_request) 11596365d33aSAlex Elder { 11606365d33aSAlex Elder if (test_and_set_bit(OBJ_REQ_IMG_DATA, &obj_request->flags)) { 11616365d33aSAlex Elder struct rbd_device *rbd_dev; 11626365d33aSAlex Elder 116357acbaa7SAlex Elder rbd_dev = obj_request->img_request->rbd_dev; 11646365d33aSAlex Elder rbd_warn(rbd_dev, "obj_request %p already marked img_data\n", 11656365d33aSAlex Elder obj_request); 11666365d33aSAlex Elder } 11676365d33aSAlex Elder } 11686365d33aSAlex Elder 11696365d33aSAlex Elder static bool obj_request_img_data_test(struct rbd_obj_request *obj_request) 11706365d33aSAlex Elder { 11716365d33aSAlex Elder smp_mb(); 11726365d33aSAlex Elder return test_bit(OBJ_REQ_IMG_DATA, &obj_request->flags) != 0; 11736365d33aSAlex Elder } 11746365d33aSAlex Elder 117557acbaa7SAlex Elder static void obj_request_done_set(struct rbd_obj_request *obj_request) 117657acbaa7SAlex Elder { 117757acbaa7SAlex Elder if (test_and_set_bit(OBJ_REQ_DONE, &obj_request->flags)) { 117857acbaa7SAlex Elder struct rbd_device *rbd_dev = NULL; 117957acbaa7SAlex Elder 118057acbaa7SAlex Elder if (obj_request_img_data_test(obj_request)) 118157acbaa7SAlex Elder rbd_dev = obj_request->img_request->rbd_dev; 118257acbaa7SAlex Elder rbd_warn(rbd_dev, "obj_request %p already marked done\n", 118357acbaa7SAlex Elder obj_request); 118457acbaa7SAlex Elder } 118557acbaa7SAlex Elder } 118657acbaa7SAlex Elder 118757acbaa7SAlex Elder static bool obj_request_done_test(struct rbd_obj_request *obj_request) 118857acbaa7SAlex Elder { 118957acbaa7SAlex Elder smp_mb(); 119057acbaa7SAlex Elder return test_bit(OBJ_REQ_DONE, &obj_request->flags) != 0; 119157acbaa7SAlex Elder } 119257acbaa7SAlex Elder 11935679c59fSAlex Elder /* 11945679c59fSAlex Elder * This sets the KNOWN flag after (possibly) setting the EXISTS 11955679c59fSAlex Elder * flag. The latter is set based on the "exists" value provided. 11965679c59fSAlex Elder * 11975679c59fSAlex Elder * Note that for our purposes once an object exists it never goes 11985679c59fSAlex Elder * away again. It's possible that the response from two existence 11995679c59fSAlex Elder * checks are separated by the creation of the target object, and 12005679c59fSAlex Elder * the first ("doesn't exist") response arrives *after* the second 12015679c59fSAlex Elder * ("does exist"). In that case we ignore the second one. 12025679c59fSAlex Elder */ 12035679c59fSAlex Elder static void obj_request_existence_set(struct rbd_obj_request *obj_request, 12045679c59fSAlex Elder bool exists) 12055679c59fSAlex Elder { 12065679c59fSAlex Elder if (exists) 12075679c59fSAlex Elder set_bit(OBJ_REQ_EXISTS, &obj_request->flags); 12085679c59fSAlex Elder set_bit(OBJ_REQ_KNOWN, &obj_request->flags); 12095679c59fSAlex Elder smp_mb(); 12105679c59fSAlex Elder } 12115679c59fSAlex Elder 12125679c59fSAlex Elder static bool obj_request_known_test(struct rbd_obj_request *obj_request) 12135679c59fSAlex Elder { 12145679c59fSAlex Elder smp_mb(); 12155679c59fSAlex Elder return test_bit(OBJ_REQ_KNOWN, &obj_request->flags) != 0; 12165679c59fSAlex Elder } 12175679c59fSAlex Elder 12185679c59fSAlex Elder static bool obj_request_exists_test(struct rbd_obj_request *obj_request) 12195679c59fSAlex Elder { 12205679c59fSAlex Elder smp_mb(); 12215679c59fSAlex Elder return test_bit(OBJ_REQ_EXISTS, &obj_request->flags) != 0; 12225679c59fSAlex Elder } 12235679c59fSAlex Elder 1224bf0d5f50SAlex Elder static void rbd_obj_request_get(struct rbd_obj_request *obj_request) 1225bf0d5f50SAlex Elder { 122637206ee5SAlex Elder dout("%s: obj %p (was %d)\n", __func__, obj_request, 122737206ee5SAlex Elder atomic_read(&obj_request->kref.refcount)); 1228bf0d5f50SAlex Elder kref_get(&obj_request->kref); 1229bf0d5f50SAlex Elder } 1230bf0d5f50SAlex Elder 1231bf0d5f50SAlex Elder static void rbd_obj_request_destroy(struct kref *kref); 1232bf0d5f50SAlex Elder static void rbd_obj_request_put(struct rbd_obj_request *obj_request) 1233bf0d5f50SAlex Elder { 1234bf0d5f50SAlex Elder rbd_assert(obj_request != NULL); 123537206ee5SAlex Elder dout("%s: obj %p (was %d)\n", __func__, obj_request, 123637206ee5SAlex Elder atomic_read(&obj_request->kref.refcount)); 1237bf0d5f50SAlex Elder kref_put(&obj_request->kref, rbd_obj_request_destroy); 1238bf0d5f50SAlex Elder } 1239bf0d5f50SAlex Elder 1240bf0d5f50SAlex Elder static void rbd_img_request_get(struct rbd_img_request *img_request) 1241bf0d5f50SAlex Elder { 124237206ee5SAlex Elder dout("%s: img %p (was %d)\n", __func__, img_request, 124337206ee5SAlex Elder atomic_read(&img_request->kref.refcount)); 1244bf0d5f50SAlex Elder kref_get(&img_request->kref); 1245bf0d5f50SAlex Elder } 1246bf0d5f50SAlex Elder 1247bf0d5f50SAlex Elder static void rbd_img_request_destroy(struct kref *kref); 1248bf0d5f50SAlex Elder static void rbd_img_request_put(struct rbd_img_request *img_request) 1249bf0d5f50SAlex Elder { 1250bf0d5f50SAlex Elder rbd_assert(img_request != NULL); 125137206ee5SAlex Elder dout("%s: img %p (was %d)\n", __func__, img_request, 125237206ee5SAlex Elder atomic_read(&img_request->kref.refcount)); 1253bf0d5f50SAlex Elder kref_put(&img_request->kref, rbd_img_request_destroy); 1254bf0d5f50SAlex Elder } 1255bf0d5f50SAlex Elder 1256bf0d5f50SAlex Elder static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request, 1257bf0d5f50SAlex Elder struct rbd_obj_request *obj_request) 1258bf0d5f50SAlex Elder { 125925dcf954SAlex Elder rbd_assert(obj_request->img_request == NULL); 126025dcf954SAlex Elder 1261b155e86cSAlex Elder /* Image request now owns object's original reference */ 1262bf0d5f50SAlex Elder obj_request->img_request = img_request; 126325dcf954SAlex Elder obj_request->which = img_request->obj_request_count; 12646365d33aSAlex Elder rbd_assert(!obj_request_img_data_test(obj_request)); 12656365d33aSAlex Elder obj_request_img_data_set(obj_request); 1266bf0d5f50SAlex Elder rbd_assert(obj_request->which != BAD_WHICH); 126725dcf954SAlex Elder img_request->obj_request_count++; 126825dcf954SAlex Elder list_add_tail(&obj_request->links, &img_request->obj_requests); 126937206ee5SAlex Elder dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request, 127037206ee5SAlex Elder obj_request->which); 1271bf0d5f50SAlex Elder } 1272bf0d5f50SAlex Elder 1273bf0d5f50SAlex Elder static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request, 1274bf0d5f50SAlex Elder struct rbd_obj_request *obj_request) 1275bf0d5f50SAlex Elder { 1276bf0d5f50SAlex Elder rbd_assert(obj_request->which != BAD_WHICH); 127725dcf954SAlex Elder 127837206ee5SAlex Elder dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request, 127937206ee5SAlex Elder obj_request->which); 1280bf0d5f50SAlex Elder list_del(&obj_request->links); 128125dcf954SAlex Elder rbd_assert(img_request->obj_request_count > 0); 128225dcf954SAlex Elder img_request->obj_request_count--; 128325dcf954SAlex Elder rbd_assert(obj_request->which == img_request->obj_request_count); 128425dcf954SAlex Elder obj_request->which = BAD_WHICH; 12856365d33aSAlex Elder rbd_assert(obj_request_img_data_test(obj_request)); 1286bf0d5f50SAlex Elder rbd_assert(obj_request->img_request == img_request); 1287bf0d5f50SAlex Elder obj_request->img_request = NULL; 128825dcf954SAlex Elder obj_request->callback = NULL; 1289bf0d5f50SAlex Elder rbd_obj_request_put(obj_request); 1290bf0d5f50SAlex Elder } 1291bf0d5f50SAlex Elder 1292bf0d5f50SAlex Elder static bool obj_request_type_valid(enum obj_request_type type) 1293bf0d5f50SAlex Elder { 1294bf0d5f50SAlex Elder switch (type) { 12959969ebc5SAlex Elder case OBJ_REQUEST_NODATA: 1296bf0d5f50SAlex Elder case OBJ_REQUEST_BIO: 1297788e2df3SAlex Elder case OBJ_REQUEST_PAGES: 1298bf0d5f50SAlex Elder return true; 1299bf0d5f50SAlex Elder default: 1300bf0d5f50SAlex Elder return false; 1301bf0d5f50SAlex Elder } 1302bf0d5f50SAlex Elder } 1303bf0d5f50SAlex Elder 1304bf0d5f50SAlex Elder static int rbd_obj_request_submit(struct ceph_osd_client *osdc, 1305bf0d5f50SAlex Elder struct rbd_obj_request *obj_request) 1306bf0d5f50SAlex Elder { 130737206ee5SAlex Elder dout("%s: osdc %p obj %p\n", __func__, osdc, obj_request); 130837206ee5SAlex Elder 1309bf0d5f50SAlex Elder return ceph_osdc_start_request(osdc, obj_request->osd_req, false); 1310bf0d5f50SAlex Elder } 1311bf0d5f50SAlex Elder 1312bf0d5f50SAlex Elder static void rbd_img_request_complete(struct rbd_img_request *img_request) 1313bf0d5f50SAlex Elder { 131455f27e09SAlex Elder 131537206ee5SAlex Elder dout("%s: img %p\n", __func__, img_request); 131655f27e09SAlex Elder 131755f27e09SAlex Elder /* 131855f27e09SAlex Elder * If no error occurred, compute the aggregate transfer 131955f27e09SAlex Elder * count for the image request. We could instead use 132055f27e09SAlex Elder * atomic64_cmpxchg() to update it as each object request 132155f27e09SAlex Elder * completes; not clear which way is better off hand. 132255f27e09SAlex Elder */ 132355f27e09SAlex Elder if (!img_request->result) { 132455f27e09SAlex Elder struct rbd_obj_request *obj_request; 132555f27e09SAlex Elder u64 xferred = 0; 132655f27e09SAlex Elder 132755f27e09SAlex Elder for_each_obj_request(img_request, obj_request) 132855f27e09SAlex Elder xferred += obj_request->xferred; 132955f27e09SAlex Elder img_request->xferred = xferred; 133055f27e09SAlex Elder } 133155f27e09SAlex Elder 1332bf0d5f50SAlex Elder if (img_request->callback) 1333bf0d5f50SAlex Elder img_request->callback(img_request); 1334bf0d5f50SAlex Elder else 1335bf0d5f50SAlex Elder rbd_img_request_put(img_request); 1336bf0d5f50SAlex Elder } 1337bf0d5f50SAlex Elder 1338788e2df3SAlex Elder /* Caller is responsible for rbd_obj_request_destroy(obj_request) */ 1339788e2df3SAlex Elder 1340788e2df3SAlex Elder static int rbd_obj_request_wait(struct rbd_obj_request *obj_request) 1341788e2df3SAlex Elder { 134237206ee5SAlex Elder dout("%s: obj %p\n", __func__, obj_request); 134337206ee5SAlex Elder 1344788e2df3SAlex Elder return wait_for_completion_interruptible(&obj_request->completion); 1345788e2df3SAlex Elder } 1346788e2df3SAlex Elder 13470c425248SAlex Elder /* 13480c425248SAlex Elder * The default/initial value for all image request flags is 0. Each 13490c425248SAlex Elder * is conditionally set to 1 at image request initialization time 13500c425248SAlex Elder * and currently never change thereafter. 13510c425248SAlex Elder */ 13520c425248SAlex Elder static void img_request_write_set(struct rbd_img_request *img_request) 13530c425248SAlex Elder { 13540c425248SAlex Elder set_bit(IMG_REQ_WRITE, &img_request->flags); 13550c425248SAlex Elder smp_mb(); 13560c425248SAlex Elder } 13570c425248SAlex Elder 13580c425248SAlex Elder static bool img_request_write_test(struct rbd_img_request *img_request) 13590c425248SAlex Elder { 13600c425248SAlex Elder smp_mb(); 13610c425248SAlex Elder return test_bit(IMG_REQ_WRITE, &img_request->flags) != 0; 13620c425248SAlex Elder } 13630c425248SAlex Elder 13649849e986SAlex Elder static void img_request_child_set(struct rbd_img_request *img_request) 13659849e986SAlex Elder { 13669849e986SAlex Elder set_bit(IMG_REQ_CHILD, &img_request->flags); 13679849e986SAlex Elder smp_mb(); 13689849e986SAlex Elder } 13699849e986SAlex Elder 13709849e986SAlex Elder static bool img_request_child_test(struct rbd_img_request *img_request) 13719849e986SAlex Elder { 13729849e986SAlex Elder smp_mb(); 13739849e986SAlex Elder return test_bit(IMG_REQ_CHILD, &img_request->flags) != 0; 13749849e986SAlex Elder } 13759849e986SAlex Elder 1376d0b2e944SAlex Elder static void img_request_layered_set(struct rbd_img_request *img_request) 1377d0b2e944SAlex Elder { 1378d0b2e944SAlex Elder set_bit(IMG_REQ_LAYERED, &img_request->flags); 1379d0b2e944SAlex Elder smp_mb(); 1380d0b2e944SAlex Elder } 1381d0b2e944SAlex Elder 1382d0b2e944SAlex Elder static bool img_request_layered_test(struct rbd_img_request *img_request) 1383d0b2e944SAlex Elder { 1384d0b2e944SAlex Elder smp_mb(); 1385d0b2e944SAlex Elder return test_bit(IMG_REQ_LAYERED, &img_request->flags) != 0; 1386d0b2e944SAlex Elder } 1387d0b2e944SAlex Elder 13886e2a4505SAlex Elder static void 13896e2a4505SAlex Elder rbd_img_obj_request_read_callback(struct rbd_obj_request *obj_request) 13906e2a4505SAlex Elder { 1391b9434c5bSAlex Elder u64 xferred = obj_request->xferred; 1392b9434c5bSAlex Elder u64 length = obj_request->length; 1393b9434c5bSAlex Elder 13946e2a4505SAlex Elder dout("%s: obj %p img %p result %d %llu/%llu\n", __func__, 13956e2a4505SAlex Elder obj_request, obj_request->img_request, obj_request->result, 1396b9434c5bSAlex Elder xferred, length); 13976e2a4505SAlex Elder /* 13986e2a4505SAlex Elder * ENOENT means a hole in the image. We zero-fill the 13996e2a4505SAlex Elder * entire length of the request. A short read also implies 14006e2a4505SAlex Elder * zero-fill to the end of the request. Either way we 14016e2a4505SAlex Elder * update the xferred count to indicate the whole request 14026e2a4505SAlex Elder * was satisfied. 14036e2a4505SAlex Elder */ 1404b9434c5bSAlex Elder rbd_assert(obj_request->type != OBJ_REQUEST_NODATA); 14056e2a4505SAlex Elder if (obj_request->result == -ENOENT) { 1406b9434c5bSAlex Elder if (obj_request->type == OBJ_REQUEST_BIO) 14076e2a4505SAlex Elder zero_bio_chain(obj_request->bio_list, 0); 1408b9434c5bSAlex Elder else 1409b9434c5bSAlex Elder zero_pages(obj_request->pages, 0, length); 14106e2a4505SAlex Elder obj_request->result = 0; 1411b9434c5bSAlex Elder obj_request->xferred = length; 1412b9434c5bSAlex Elder } else if (xferred < length && !obj_request->result) { 1413b9434c5bSAlex Elder if (obj_request->type == OBJ_REQUEST_BIO) 1414b9434c5bSAlex Elder zero_bio_chain(obj_request->bio_list, xferred); 1415b9434c5bSAlex Elder else 1416b9434c5bSAlex Elder zero_pages(obj_request->pages, xferred, length); 1417b9434c5bSAlex Elder obj_request->xferred = length; 14186e2a4505SAlex Elder } 14196e2a4505SAlex Elder obj_request_done_set(obj_request); 14206e2a4505SAlex Elder } 14216e2a4505SAlex Elder 1422bf0d5f50SAlex Elder static void rbd_obj_request_complete(struct rbd_obj_request *obj_request) 1423bf0d5f50SAlex Elder { 142437206ee5SAlex Elder dout("%s: obj %p cb %p\n", __func__, obj_request, 142537206ee5SAlex Elder obj_request->callback); 1426bf0d5f50SAlex Elder if (obj_request->callback) 1427bf0d5f50SAlex Elder obj_request->callback(obj_request); 1428788e2df3SAlex Elder else 1429788e2df3SAlex Elder complete_all(&obj_request->completion); 1430bf0d5f50SAlex Elder } 1431bf0d5f50SAlex Elder 1432c47f9371SAlex Elder static void rbd_osd_trivial_callback(struct rbd_obj_request *obj_request) 143339bf2c5dSAlex Elder { 143439bf2c5dSAlex Elder dout("%s: obj %p\n", __func__, obj_request); 143539bf2c5dSAlex Elder obj_request_done_set(obj_request); 143639bf2c5dSAlex Elder } 143739bf2c5dSAlex Elder 1438c47f9371SAlex Elder static void rbd_osd_read_callback(struct rbd_obj_request *obj_request) 1439bf0d5f50SAlex Elder { 144057acbaa7SAlex Elder struct rbd_img_request *img_request = NULL; 1441a9e8ba2cSAlex Elder struct rbd_device *rbd_dev = NULL; 144257acbaa7SAlex Elder bool layered = false; 144357acbaa7SAlex Elder 144457acbaa7SAlex Elder if (obj_request_img_data_test(obj_request)) { 144557acbaa7SAlex Elder img_request = obj_request->img_request; 144657acbaa7SAlex Elder layered = img_request && img_request_layered_test(img_request); 1447a9e8ba2cSAlex Elder rbd_dev = img_request->rbd_dev; 144857acbaa7SAlex Elder } 14498b3e1a56SAlex Elder 14508b3e1a56SAlex Elder dout("%s: obj %p img %p result %d %llu/%llu\n", __func__, 14518b3e1a56SAlex Elder obj_request, img_request, obj_request->result, 14528b3e1a56SAlex Elder obj_request->xferred, obj_request->length); 1453a9e8ba2cSAlex Elder if (layered && obj_request->result == -ENOENT && 1454a9e8ba2cSAlex Elder obj_request->img_offset < rbd_dev->parent_overlap) 14558b3e1a56SAlex Elder rbd_img_parent_read(obj_request); 14568b3e1a56SAlex Elder else if (img_request) 14576e2a4505SAlex Elder rbd_img_obj_request_read_callback(obj_request); 14586e2a4505SAlex Elder else 145907741308SAlex Elder obj_request_done_set(obj_request); 1460bf0d5f50SAlex Elder } 1461bf0d5f50SAlex Elder 1462c47f9371SAlex Elder static void rbd_osd_write_callback(struct rbd_obj_request *obj_request) 1463bf0d5f50SAlex Elder { 14641b83bef2SSage Weil dout("%s: obj %p result %d %llu\n", __func__, obj_request, 14651b83bef2SSage Weil obj_request->result, obj_request->length); 14661b83bef2SSage Weil /* 14678b3e1a56SAlex Elder * There is no such thing as a successful short write. Set 14688b3e1a56SAlex Elder * it to our originally-requested length. 14691b83bef2SSage Weil */ 14701b83bef2SSage Weil obj_request->xferred = obj_request->length; 147107741308SAlex Elder obj_request_done_set(obj_request); 1472bf0d5f50SAlex Elder } 1473bf0d5f50SAlex Elder 1474fbfab539SAlex Elder /* 1475fbfab539SAlex Elder * For a simple stat call there's nothing to do. We'll do more if 1476fbfab539SAlex Elder * this is part of a write sequence for a layered image. 1477fbfab539SAlex Elder */ 1478c47f9371SAlex Elder static void rbd_osd_stat_callback(struct rbd_obj_request *obj_request) 1479fbfab539SAlex Elder { 148037206ee5SAlex Elder dout("%s: obj %p\n", __func__, obj_request); 1481fbfab539SAlex Elder obj_request_done_set(obj_request); 1482fbfab539SAlex Elder } 1483fbfab539SAlex Elder 1484bf0d5f50SAlex Elder static void rbd_osd_req_callback(struct ceph_osd_request *osd_req, 1485bf0d5f50SAlex Elder struct ceph_msg *msg) 1486bf0d5f50SAlex Elder { 1487bf0d5f50SAlex Elder struct rbd_obj_request *obj_request = osd_req->r_priv; 1488bf0d5f50SAlex Elder u16 opcode; 1489bf0d5f50SAlex Elder 149037206ee5SAlex Elder dout("%s: osd_req %p msg %p\n", __func__, osd_req, msg); 1491bf0d5f50SAlex Elder rbd_assert(osd_req == obj_request->osd_req); 149257acbaa7SAlex Elder if (obj_request_img_data_test(obj_request)) { 149357acbaa7SAlex Elder rbd_assert(obj_request->img_request); 149457acbaa7SAlex Elder rbd_assert(obj_request->which != BAD_WHICH); 149557acbaa7SAlex Elder } else { 149657acbaa7SAlex Elder rbd_assert(obj_request->which == BAD_WHICH); 149757acbaa7SAlex Elder } 1498bf0d5f50SAlex Elder 14991b83bef2SSage Weil if (osd_req->r_result < 0) 15001b83bef2SSage Weil obj_request->result = osd_req->r_result; 1501bf0d5f50SAlex Elder obj_request->version = le64_to_cpu(osd_req->r_reassert_version.version); 1502bf0d5f50SAlex Elder 15030eefd470SAlex Elder BUG_ON(osd_req->r_num_ops > 2); 1504bf0d5f50SAlex Elder 1505c47f9371SAlex Elder /* 1506c47f9371SAlex Elder * We support a 64-bit length, but ultimately it has to be 1507c47f9371SAlex Elder * passed to blk_end_request(), which takes an unsigned int. 1508c47f9371SAlex Elder */ 15091b83bef2SSage Weil obj_request->xferred = osd_req->r_reply_op_len[0]; 1510c47f9371SAlex Elder rbd_assert(obj_request->xferred < (u64)UINT_MAX); 151179528734SAlex Elder opcode = osd_req->r_ops[0].op; 1512bf0d5f50SAlex Elder switch (opcode) { 1513bf0d5f50SAlex Elder case CEPH_OSD_OP_READ: 1514c47f9371SAlex Elder rbd_osd_read_callback(obj_request); 1515bf0d5f50SAlex Elder break; 1516bf0d5f50SAlex Elder case CEPH_OSD_OP_WRITE: 1517c47f9371SAlex Elder rbd_osd_write_callback(obj_request); 1518bf0d5f50SAlex Elder break; 1519fbfab539SAlex Elder case CEPH_OSD_OP_STAT: 1520c47f9371SAlex Elder rbd_osd_stat_callback(obj_request); 1521fbfab539SAlex Elder break; 152236be9a76SAlex Elder case CEPH_OSD_OP_CALL: 1523b8d70035SAlex Elder case CEPH_OSD_OP_NOTIFY_ACK: 15249969ebc5SAlex Elder case CEPH_OSD_OP_WATCH: 1525c47f9371SAlex Elder rbd_osd_trivial_callback(obj_request); 15269969ebc5SAlex Elder break; 1527bf0d5f50SAlex Elder default: 1528bf0d5f50SAlex Elder rbd_warn(NULL, "%s: unsupported op %hu\n", 1529bf0d5f50SAlex Elder obj_request->object_name, (unsigned short) opcode); 1530bf0d5f50SAlex Elder break; 1531bf0d5f50SAlex Elder } 1532bf0d5f50SAlex Elder 153307741308SAlex Elder if (obj_request_done_test(obj_request)) 1534bf0d5f50SAlex Elder rbd_obj_request_complete(obj_request); 1535bf0d5f50SAlex Elder } 1536bf0d5f50SAlex Elder 15379d4df01fSAlex Elder static void rbd_osd_req_format_read(struct rbd_obj_request *obj_request) 1538430c28c3SAlex Elder { 1539430c28c3SAlex Elder struct rbd_img_request *img_request = obj_request->img_request; 15408c042b0dSAlex Elder struct ceph_osd_request *osd_req = obj_request->osd_req; 15419d4df01fSAlex Elder u64 snap_id; 1542430c28c3SAlex Elder 15438c042b0dSAlex Elder rbd_assert(osd_req != NULL); 1544430c28c3SAlex Elder 15459d4df01fSAlex Elder snap_id = img_request ? img_request->snap_id : CEPH_NOSNAP; 15468c042b0dSAlex Elder ceph_osdc_build_request(osd_req, obj_request->offset, 15479d4df01fSAlex Elder NULL, snap_id, NULL); 15489d4df01fSAlex Elder } 15499d4df01fSAlex Elder 15509d4df01fSAlex Elder static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request) 15519d4df01fSAlex Elder { 15529d4df01fSAlex Elder struct rbd_img_request *img_request = obj_request->img_request; 15539d4df01fSAlex Elder struct ceph_osd_request *osd_req = obj_request->osd_req; 15549d4df01fSAlex Elder struct ceph_snap_context *snapc; 15559d4df01fSAlex Elder struct timespec mtime = CURRENT_TIME; 15569d4df01fSAlex Elder 15579d4df01fSAlex Elder rbd_assert(osd_req != NULL); 15589d4df01fSAlex Elder 15599d4df01fSAlex Elder snapc = img_request ? img_request->snapc : NULL; 15609d4df01fSAlex Elder ceph_osdc_build_request(osd_req, obj_request->offset, 15619d4df01fSAlex Elder snapc, CEPH_NOSNAP, &mtime); 1562430c28c3SAlex Elder } 1563430c28c3SAlex Elder 1564bf0d5f50SAlex Elder static struct ceph_osd_request *rbd_osd_req_create( 1565bf0d5f50SAlex Elder struct rbd_device *rbd_dev, 1566bf0d5f50SAlex Elder bool write_request, 1567430c28c3SAlex Elder struct rbd_obj_request *obj_request) 1568bf0d5f50SAlex Elder { 1569bf0d5f50SAlex Elder struct ceph_snap_context *snapc = NULL; 1570bf0d5f50SAlex Elder struct ceph_osd_client *osdc; 1571bf0d5f50SAlex Elder struct ceph_osd_request *osd_req; 1572bf0d5f50SAlex Elder 15736365d33aSAlex Elder if (obj_request_img_data_test(obj_request)) { 15746365d33aSAlex Elder struct rbd_img_request *img_request = obj_request->img_request; 15756365d33aSAlex Elder 15760c425248SAlex Elder rbd_assert(write_request == 15770c425248SAlex Elder img_request_write_test(img_request)); 15780c425248SAlex Elder if (write_request) 1579bf0d5f50SAlex Elder snapc = img_request->snapc; 1580bf0d5f50SAlex Elder } 1581bf0d5f50SAlex Elder 1582bf0d5f50SAlex Elder /* Allocate and initialize the request, for the single op */ 1583bf0d5f50SAlex Elder 1584bf0d5f50SAlex Elder osdc = &rbd_dev->rbd_client->client->osdc; 1585bf0d5f50SAlex Elder osd_req = ceph_osdc_alloc_request(osdc, snapc, 1, false, GFP_ATOMIC); 1586bf0d5f50SAlex Elder if (!osd_req) 1587bf0d5f50SAlex Elder return NULL; /* ENOMEM */ 1588bf0d5f50SAlex Elder 1589430c28c3SAlex Elder if (write_request) 1590bf0d5f50SAlex Elder osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK; 1591430c28c3SAlex Elder else 1592bf0d5f50SAlex Elder osd_req->r_flags = CEPH_OSD_FLAG_READ; 1593bf0d5f50SAlex Elder 1594bf0d5f50SAlex Elder osd_req->r_callback = rbd_osd_req_callback; 1595bf0d5f50SAlex Elder osd_req->r_priv = obj_request; 1596bf0d5f50SAlex Elder 1597bf0d5f50SAlex Elder osd_req->r_oid_len = strlen(obj_request->object_name); 1598bf0d5f50SAlex Elder rbd_assert(osd_req->r_oid_len < sizeof (osd_req->r_oid)); 1599bf0d5f50SAlex Elder memcpy(osd_req->r_oid, obj_request->object_name, osd_req->r_oid_len); 1600bf0d5f50SAlex Elder 1601bf0d5f50SAlex Elder osd_req->r_file_layout = rbd_dev->layout; /* struct */ 1602bf0d5f50SAlex Elder 1603bf0d5f50SAlex Elder return osd_req; 1604bf0d5f50SAlex Elder } 1605bf0d5f50SAlex Elder 16060eefd470SAlex Elder /* 16070eefd470SAlex Elder * Create a copyup osd request based on the information in the 16080eefd470SAlex Elder * object request supplied. A copyup request has two osd ops, 16090eefd470SAlex Elder * a copyup method call, and a "normal" write request. 16100eefd470SAlex Elder */ 16110eefd470SAlex Elder static struct ceph_osd_request * 16120eefd470SAlex Elder rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request) 16130eefd470SAlex Elder { 16140eefd470SAlex Elder struct rbd_img_request *img_request; 16150eefd470SAlex Elder struct ceph_snap_context *snapc; 16160eefd470SAlex Elder struct rbd_device *rbd_dev; 16170eefd470SAlex Elder struct ceph_osd_client *osdc; 16180eefd470SAlex Elder struct ceph_osd_request *osd_req; 16190eefd470SAlex Elder 16200eefd470SAlex Elder rbd_assert(obj_request_img_data_test(obj_request)); 16210eefd470SAlex Elder img_request = obj_request->img_request; 16220eefd470SAlex Elder rbd_assert(img_request); 16230eefd470SAlex Elder rbd_assert(img_request_write_test(img_request)); 16240eefd470SAlex Elder 16250eefd470SAlex Elder /* Allocate and initialize the request, for the two ops */ 16260eefd470SAlex Elder 16270eefd470SAlex Elder snapc = img_request->snapc; 16280eefd470SAlex Elder rbd_dev = img_request->rbd_dev; 16290eefd470SAlex Elder osdc = &rbd_dev->rbd_client->client->osdc; 16300eefd470SAlex Elder osd_req = ceph_osdc_alloc_request(osdc, snapc, 2, false, GFP_ATOMIC); 16310eefd470SAlex Elder if (!osd_req) 16320eefd470SAlex Elder return NULL; /* ENOMEM */ 16330eefd470SAlex Elder 16340eefd470SAlex Elder osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK; 16350eefd470SAlex Elder osd_req->r_callback = rbd_osd_req_callback; 16360eefd470SAlex Elder osd_req->r_priv = obj_request; 16370eefd470SAlex Elder 16380eefd470SAlex Elder osd_req->r_oid_len = strlen(obj_request->object_name); 16390eefd470SAlex Elder rbd_assert(osd_req->r_oid_len < sizeof (osd_req->r_oid)); 16400eefd470SAlex Elder memcpy(osd_req->r_oid, obj_request->object_name, osd_req->r_oid_len); 16410eefd470SAlex Elder 16420eefd470SAlex Elder osd_req->r_file_layout = rbd_dev->layout; /* struct */ 16430eefd470SAlex Elder 16440eefd470SAlex Elder return osd_req; 16450eefd470SAlex Elder } 16460eefd470SAlex Elder 16470eefd470SAlex Elder 1648bf0d5f50SAlex Elder static void rbd_osd_req_destroy(struct ceph_osd_request *osd_req) 1649bf0d5f50SAlex Elder { 1650bf0d5f50SAlex Elder ceph_osdc_put_request(osd_req); 1651bf0d5f50SAlex Elder } 1652bf0d5f50SAlex Elder 1653bf0d5f50SAlex Elder /* object_name is assumed to be a non-null pointer and NUL-terminated */ 1654bf0d5f50SAlex Elder 1655bf0d5f50SAlex Elder static struct rbd_obj_request *rbd_obj_request_create(const char *object_name, 1656bf0d5f50SAlex Elder u64 offset, u64 length, 1657bf0d5f50SAlex Elder enum obj_request_type type) 1658bf0d5f50SAlex Elder { 1659bf0d5f50SAlex Elder struct rbd_obj_request *obj_request; 1660bf0d5f50SAlex Elder size_t size; 1661bf0d5f50SAlex Elder char *name; 1662bf0d5f50SAlex Elder 1663bf0d5f50SAlex Elder rbd_assert(obj_request_type_valid(type)); 1664bf0d5f50SAlex Elder 1665bf0d5f50SAlex Elder size = strlen(object_name) + 1; 1666bf0d5f50SAlex Elder obj_request = kzalloc(sizeof (*obj_request) + size, GFP_KERNEL); 1667bf0d5f50SAlex Elder if (!obj_request) 1668bf0d5f50SAlex Elder return NULL; 1669bf0d5f50SAlex Elder 1670bf0d5f50SAlex Elder name = (char *)(obj_request + 1); 1671bf0d5f50SAlex Elder obj_request->object_name = memcpy(name, object_name, size); 1672bf0d5f50SAlex Elder obj_request->offset = offset; 1673bf0d5f50SAlex Elder obj_request->length = length; 1674926f9b3fSAlex Elder obj_request->flags = 0; 1675bf0d5f50SAlex Elder obj_request->which = BAD_WHICH; 1676bf0d5f50SAlex Elder obj_request->type = type; 1677bf0d5f50SAlex Elder INIT_LIST_HEAD(&obj_request->links); 1678788e2df3SAlex Elder init_completion(&obj_request->completion); 1679bf0d5f50SAlex Elder kref_init(&obj_request->kref); 1680bf0d5f50SAlex Elder 168137206ee5SAlex Elder dout("%s: \"%s\" %llu/%llu %d -> obj %p\n", __func__, object_name, 168237206ee5SAlex Elder offset, length, (int)type, obj_request); 168337206ee5SAlex Elder 1684bf0d5f50SAlex Elder return obj_request; 1685bf0d5f50SAlex Elder } 1686bf0d5f50SAlex Elder 1687bf0d5f50SAlex Elder static void rbd_obj_request_destroy(struct kref *kref) 1688bf0d5f50SAlex Elder { 1689bf0d5f50SAlex Elder struct rbd_obj_request *obj_request; 1690bf0d5f50SAlex Elder 1691bf0d5f50SAlex Elder obj_request = container_of(kref, struct rbd_obj_request, kref); 1692bf0d5f50SAlex Elder 169337206ee5SAlex Elder dout("%s: obj %p\n", __func__, obj_request); 169437206ee5SAlex Elder 1695bf0d5f50SAlex Elder rbd_assert(obj_request->img_request == NULL); 1696bf0d5f50SAlex Elder rbd_assert(obj_request->which == BAD_WHICH); 1697bf0d5f50SAlex Elder 1698bf0d5f50SAlex Elder if (obj_request->osd_req) 1699bf0d5f50SAlex Elder rbd_osd_req_destroy(obj_request->osd_req); 1700bf0d5f50SAlex Elder 1701bf0d5f50SAlex Elder rbd_assert(obj_request_type_valid(obj_request->type)); 1702bf0d5f50SAlex Elder switch (obj_request->type) { 17039969ebc5SAlex Elder case OBJ_REQUEST_NODATA: 17049969ebc5SAlex Elder break; /* Nothing to do */ 1705bf0d5f50SAlex Elder case OBJ_REQUEST_BIO: 1706bf0d5f50SAlex Elder if (obj_request->bio_list) 1707bf0d5f50SAlex Elder bio_chain_put(obj_request->bio_list); 1708bf0d5f50SAlex Elder break; 1709788e2df3SAlex Elder case OBJ_REQUEST_PAGES: 1710788e2df3SAlex Elder if (obj_request->pages) 1711788e2df3SAlex Elder ceph_release_page_vector(obj_request->pages, 1712788e2df3SAlex Elder obj_request->page_count); 1713788e2df3SAlex Elder break; 1714bf0d5f50SAlex Elder } 1715bf0d5f50SAlex Elder 1716bf0d5f50SAlex Elder kfree(obj_request); 1717bf0d5f50SAlex Elder } 1718bf0d5f50SAlex Elder 1719bf0d5f50SAlex Elder /* 1720bf0d5f50SAlex Elder * Caller is responsible for filling in the list of object requests 1721bf0d5f50SAlex Elder * that comprises the image request, and the Linux request pointer 1722bf0d5f50SAlex Elder * (if there is one). 1723bf0d5f50SAlex Elder */ 1724cc344fa1SAlex Elder static struct rbd_img_request *rbd_img_request_create( 1725cc344fa1SAlex Elder struct rbd_device *rbd_dev, 1726bf0d5f50SAlex Elder u64 offset, u64 length, 17279849e986SAlex Elder bool write_request, 17289849e986SAlex Elder bool child_request) 1729bf0d5f50SAlex Elder { 1730bf0d5f50SAlex Elder struct rbd_img_request *img_request; 1731bf0d5f50SAlex Elder struct ceph_snap_context *snapc = NULL; 1732bf0d5f50SAlex Elder 1733bf0d5f50SAlex Elder img_request = kmalloc(sizeof (*img_request), GFP_ATOMIC); 1734bf0d5f50SAlex Elder if (!img_request) 1735bf0d5f50SAlex Elder return NULL; 1736bf0d5f50SAlex Elder 1737bf0d5f50SAlex Elder if (write_request) { 1738bf0d5f50SAlex Elder down_read(&rbd_dev->header_rwsem); 1739bf0d5f50SAlex Elder snapc = ceph_get_snap_context(rbd_dev->header.snapc); 1740bf0d5f50SAlex Elder up_read(&rbd_dev->header_rwsem); 1741bf0d5f50SAlex Elder if (WARN_ON(!snapc)) { 1742bf0d5f50SAlex Elder kfree(img_request); 1743bf0d5f50SAlex Elder return NULL; /* Shouldn't happen */ 1744bf0d5f50SAlex Elder } 17450c425248SAlex Elder 1746bf0d5f50SAlex Elder } 1747bf0d5f50SAlex Elder 1748bf0d5f50SAlex Elder img_request->rq = NULL; 1749bf0d5f50SAlex Elder img_request->rbd_dev = rbd_dev; 1750bf0d5f50SAlex Elder img_request->offset = offset; 1751bf0d5f50SAlex Elder img_request->length = length; 17520c425248SAlex Elder img_request->flags = 0; 17530c425248SAlex Elder if (write_request) { 17540c425248SAlex Elder img_request_write_set(img_request); 1755bf0d5f50SAlex Elder img_request->snapc = snapc; 17560c425248SAlex Elder } else { 1757bf0d5f50SAlex Elder img_request->snap_id = rbd_dev->spec->snap_id; 17580c425248SAlex Elder } 17599849e986SAlex Elder if (child_request) 17609849e986SAlex Elder img_request_child_set(img_request); 1761d0b2e944SAlex Elder if (rbd_dev->parent_spec) 1762d0b2e944SAlex Elder img_request_layered_set(img_request); 1763bf0d5f50SAlex Elder spin_lock_init(&img_request->completion_lock); 1764bf0d5f50SAlex Elder img_request->next_completion = 0; 1765bf0d5f50SAlex Elder img_request->callback = NULL; 1766a5a337d4SAlex Elder img_request->result = 0; 1767bf0d5f50SAlex Elder img_request->obj_request_count = 0; 1768bf0d5f50SAlex Elder INIT_LIST_HEAD(&img_request->obj_requests); 1769bf0d5f50SAlex Elder kref_init(&img_request->kref); 1770bf0d5f50SAlex Elder 1771bf0d5f50SAlex Elder rbd_img_request_get(img_request); /* Avoid a warning */ 1772bf0d5f50SAlex Elder rbd_img_request_put(img_request); /* TEMPORARY */ 1773bf0d5f50SAlex Elder 177437206ee5SAlex Elder dout("%s: rbd_dev %p %s %llu/%llu -> img %p\n", __func__, rbd_dev, 177537206ee5SAlex Elder write_request ? "write" : "read", offset, length, 177637206ee5SAlex Elder img_request); 177737206ee5SAlex Elder 1778bf0d5f50SAlex Elder return img_request; 1779bf0d5f50SAlex Elder } 1780bf0d5f50SAlex Elder 1781bf0d5f50SAlex Elder static void rbd_img_request_destroy(struct kref *kref) 1782bf0d5f50SAlex Elder { 1783bf0d5f50SAlex Elder struct rbd_img_request *img_request; 1784bf0d5f50SAlex Elder struct rbd_obj_request *obj_request; 1785bf0d5f50SAlex Elder struct rbd_obj_request *next_obj_request; 1786bf0d5f50SAlex Elder 1787bf0d5f50SAlex Elder img_request = container_of(kref, struct rbd_img_request, kref); 1788bf0d5f50SAlex Elder 178937206ee5SAlex Elder dout("%s: img %p\n", __func__, img_request); 179037206ee5SAlex Elder 1791bf0d5f50SAlex Elder for_each_obj_request_safe(img_request, obj_request, next_obj_request) 1792bf0d5f50SAlex Elder rbd_img_obj_request_del(img_request, obj_request); 179325dcf954SAlex Elder rbd_assert(img_request->obj_request_count == 0); 1794bf0d5f50SAlex Elder 17950c425248SAlex Elder if (img_request_write_test(img_request)) 1796bf0d5f50SAlex Elder ceph_put_snap_context(img_request->snapc); 1797bf0d5f50SAlex Elder 17988b3e1a56SAlex Elder if (img_request_child_test(img_request)) 17998b3e1a56SAlex Elder rbd_obj_request_put(img_request->obj_request); 18008b3e1a56SAlex Elder 1801bf0d5f50SAlex Elder kfree(img_request); 1802bf0d5f50SAlex Elder } 1803bf0d5f50SAlex Elder 18041217857fSAlex Elder static bool rbd_img_obj_end_request(struct rbd_obj_request *obj_request) 18051217857fSAlex Elder { 18066365d33aSAlex Elder struct rbd_img_request *img_request; 18071217857fSAlex Elder unsigned int xferred; 18081217857fSAlex Elder int result; 18098b3e1a56SAlex Elder bool more; 18101217857fSAlex Elder 18116365d33aSAlex Elder rbd_assert(obj_request_img_data_test(obj_request)); 18126365d33aSAlex Elder img_request = obj_request->img_request; 18136365d33aSAlex Elder 18141217857fSAlex Elder rbd_assert(obj_request->xferred <= (u64)UINT_MAX); 18151217857fSAlex Elder xferred = (unsigned int)obj_request->xferred; 18161217857fSAlex Elder result = obj_request->result; 18171217857fSAlex Elder if (result) { 18181217857fSAlex Elder struct rbd_device *rbd_dev = img_request->rbd_dev; 18191217857fSAlex Elder 18201217857fSAlex Elder rbd_warn(rbd_dev, "%s %llx at %llx (%llx)\n", 18211217857fSAlex Elder img_request_write_test(img_request) ? "write" : "read", 18221217857fSAlex Elder obj_request->length, obj_request->img_offset, 18231217857fSAlex Elder obj_request->offset); 18241217857fSAlex Elder rbd_warn(rbd_dev, " result %d xferred %x\n", 18251217857fSAlex Elder result, xferred); 18261217857fSAlex Elder if (!img_request->result) 18271217857fSAlex Elder img_request->result = result; 18281217857fSAlex Elder } 18291217857fSAlex Elder 1830f1a4739fSAlex Elder /* Image object requests don't own their page array */ 1831f1a4739fSAlex Elder 1832f1a4739fSAlex Elder if (obj_request->type == OBJ_REQUEST_PAGES) { 1833f1a4739fSAlex Elder obj_request->pages = NULL; 1834f1a4739fSAlex Elder obj_request->page_count = 0; 1835f1a4739fSAlex Elder } 1836f1a4739fSAlex Elder 18378b3e1a56SAlex Elder if (img_request_child_test(img_request)) { 18388b3e1a56SAlex Elder rbd_assert(img_request->obj_request != NULL); 18398b3e1a56SAlex Elder more = obj_request->which < img_request->obj_request_count - 1; 18408b3e1a56SAlex Elder } else { 18418b3e1a56SAlex Elder rbd_assert(img_request->rq != NULL); 18428b3e1a56SAlex Elder more = blk_end_request(img_request->rq, result, xferred); 18438b3e1a56SAlex Elder } 18448b3e1a56SAlex Elder 18458b3e1a56SAlex Elder return more; 18461217857fSAlex Elder } 18471217857fSAlex Elder 18482169238dSAlex Elder static void rbd_img_obj_callback(struct rbd_obj_request *obj_request) 18492169238dSAlex Elder { 18502169238dSAlex Elder struct rbd_img_request *img_request; 18512169238dSAlex Elder u32 which = obj_request->which; 18522169238dSAlex Elder bool more = true; 18532169238dSAlex Elder 18546365d33aSAlex Elder rbd_assert(obj_request_img_data_test(obj_request)); 18552169238dSAlex Elder img_request = obj_request->img_request; 18562169238dSAlex Elder 18572169238dSAlex Elder dout("%s: img %p obj %p\n", __func__, img_request, obj_request); 18582169238dSAlex Elder rbd_assert(img_request != NULL); 18592169238dSAlex Elder rbd_assert(img_request->obj_request_count > 0); 18602169238dSAlex Elder rbd_assert(which != BAD_WHICH); 18612169238dSAlex Elder rbd_assert(which < img_request->obj_request_count); 18622169238dSAlex Elder rbd_assert(which >= img_request->next_completion); 18632169238dSAlex Elder 18642169238dSAlex Elder spin_lock_irq(&img_request->completion_lock); 18652169238dSAlex Elder if (which != img_request->next_completion) 18662169238dSAlex Elder goto out; 18672169238dSAlex Elder 18682169238dSAlex Elder for_each_obj_request_from(img_request, obj_request) { 18692169238dSAlex Elder rbd_assert(more); 18702169238dSAlex Elder rbd_assert(which < img_request->obj_request_count); 18712169238dSAlex Elder 18722169238dSAlex Elder if (!obj_request_done_test(obj_request)) 18732169238dSAlex Elder break; 18741217857fSAlex Elder more = rbd_img_obj_end_request(obj_request); 18752169238dSAlex Elder which++; 18762169238dSAlex Elder } 18772169238dSAlex Elder 18782169238dSAlex Elder rbd_assert(more ^ (which == img_request->obj_request_count)); 18792169238dSAlex Elder img_request->next_completion = which; 18802169238dSAlex Elder out: 18812169238dSAlex Elder spin_unlock_irq(&img_request->completion_lock); 18822169238dSAlex Elder 18832169238dSAlex Elder if (!more) 18842169238dSAlex Elder rbd_img_request_complete(img_request); 18852169238dSAlex Elder } 18862169238dSAlex Elder 1887f1a4739fSAlex Elder /* 1888f1a4739fSAlex Elder * Split up an image request into one or more object requests, each 1889f1a4739fSAlex Elder * to a different object. The "type" parameter indicates whether 1890f1a4739fSAlex Elder * "data_desc" is the pointer to the head of a list of bio 1891f1a4739fSAlex Elder * structures, or the base of a page array. In either case this 1892f1a4739fSAlex Elder * function assumes data_desc describes memory sufficient to hold 1893f1a4739fSAlex Elder * all data described by the image request. 1894f1a4739fSAlex Elder */ 1895f1a4739fSAlex Elder static int rbd_img_request_fill(struct rbd_img_request *img_request, 1896f1a4739fSAlex Elder enum obj_request_type type, 1897f1a4739fSAlex Elder void *data_desc) 1898bf0d5f50SAlex Elder { 1899bf0d5f50SAlex Elder struct rbd_device *rbd_dev = img_request->rbd_dev; 1900bf0d5f50SAlex Elder struct rbd_obj_request *obj_request = NULL; 1901bf0d5f50SAlex Elder struct rbd_obj_request *next_obj_request; 19020c425248SAlex Elder bool write_request = img_request_write_test(img_request); 1903f1a4739fSAlex Elder struct bio *bio_list; 1904f1a4739fSAlex Elder unsigned int bio_offset = 0; 1905f1a4739fSAlex Elder struct page **pages; 19067da22d29SAlex Elder u64 img_offset; 1907bf0d5f50SAlex Elder u64 resid; 1908bf0d5f50SAlex Elder u16 opcode; 1909bf0d5f50SAlex Elder 1910f1a4739fSAlex Elder dout("%s: img %p type %d data_desc %p\n", __func__, img_request, 1911f1a4739fSAlex Elder (int)type, data_desc); 191237206ee5SAlex Elder 1913430c28c3SAlex Elder opcode = write_request ? CEPH_OSD_OP_WRITE : CEPH_OSD_OP_READ; 19147da22d29SAlex Elder img_offset = img_request->offset; 1915bf0d5f50SAlex Elder resid = img_request->length; 19164dda41d3SAlex Elder rbd_assert(resid > 0); 1917f1a4739fSAlex Elder 1918f1a4739fSAlex Elder if (type == OBJ_REQUEST_BIO) { 1919f1a4739fSAlex Elder bio_list = data_desc; 1920f1a4739fSAlex Elder rbd_assert(img_offset == bio_list->bi_sector << SECTOR_SHIFT); 1921f1a4739fSAlex Elder } else { 1922f1a4739fSAlex Elder rbd_assert(type == OBJ_REQUEST_PAGES); 1923f1a4739fSAlex Elder pages = data_desc; 1924f1a4739fSAlex Elder } 1925f1a4739fSAlex Elder 1926bf0d5f50SAlex Elder while (resid) { 19272fa12320SAlex Elder struct ceph_osd_request *osd_req; 1928bf0d5f50SAlex Elder const char *object_name; 1929bf0d5f50SAlex Elder u64 offset; 1930bf0d5f50SAlex Elder u64 length; 1931bf0d5f50SAlex Elder 19327da22d29SAlex Elder object_name = rbd_segment_name(rbd_dev, img_offset); 1933bf0d5f50SAlex Elder if (!object_name) 1934bf0d5f50SAlex Elder goto out_unwind; 19357da22d29SAlex Elder offset = rbd_segment_offset(rbd_dev, img_offset); 19367da22d29SAlex Elder length = rbd_segment_length(rbd_dev, img_offset, resid); 1937bf0d5f50SAlex Elder obj_request = rbd_obj_request_create(object_name, 1938f1a4739fSAlex Elder offset, length, type); 1939bf0d5f50SAlex Elder kfree(object_name); /* object request has its own copy */ 1940bf0d5f50SAlex Elder if (!obj_request) 1941bf0d5f50SAlex Elder goto out_unwind; 1942bf0d5f50SAlex Elder 1943f1a4739fSAlex Elder if (type == OBJ_REQUEST_BIO) { 1944f1a4739fSAlex Elder unsigned int clone_size; 1945f1a4739fSAlex Elder 1946bf0d5f50SAlex Elder rbd_assert(length <= (u64)UINT_MAX); 1947bf0d5f50SAlex Elder clone_size = (unsigned int)length; 1948f1a4739fSAlex Elder obj_request->bio_list = 1949f1a4739fSAlex Elder bio_chain_clone_range(&bio_list, 1950f1a4739fSAlex Elder &bio_offset, 1951f1a4739fSAlex Elder clone_size, 1952bf0d5f50SAlex Elder GFP_ATOMIC); 1953bf0d5f50SAlex Elder if (!obj_request->bio_list) 1954bf0d5f50SAlex Elder goto out_partial; 1955f1a4739fSAlex Elder } else { 1956f1a4739fSAlex Elder unsigned int page_count; 1957f1a4739fSAlex Elder 1958f1a4739fSAlex Elder obj_request->pages = pages; 1959f1a4739fSAlex Elder page_count = (u32)calc_pages_for(offset, length); 1960f1a4739fSAlex Elder obj_request->page_count = page_count; 1961f1a4739fSAlex Elder if ((offset + length) & ~PAGE_MASK) 1962f1a4739fSAlex Elder page_count--; /* more on last page */ 1963f1a4739fSAlex Elder pages += page_count; 1964f1a4739fSAlex Elder } 1965bf0d5f50SAlex Elder 19662fa12320SAlex Elder osd_req = rbd_osd_req_create(rbd_dev, write_request, 19672fa12320SAlex Elder obj_request); 19682fa12320SAlex Elder if (!osd_req) 1969bf0d5f50SAlex Elder goto out_partial; 19702fa12320SAlex Elder obj_request->osd_req = osd_req; 19712169238dSAlex Elder obj_request->callback = rbd_img_obj_callback; 1972430c28c3SAlex Elder 19732fa12320SAlex Elder osd_req_op_extent_init(osd_req, 0, opcode, offset, length, 19742fa12320SAlex Elder 0, 0); 1975f1a4739fSAlex Elder if (type == OBJ_REQUEST_BIO) 1976406e2c9fSAlex Elder osd_req_op_extent_osd_data_bio(osd_req, 0, 1977f1a4739fSAlex Elder obj_request->bio_list, length); 1978f1a4739fSAlex Elder else 1979f1a4739fSAlex Elder osd_req_op_extent_osd_data_pages(osd_req, 0, 1980f1a4739fSAlex Elder obj_request->pages, length, 1981f1a4739fSAlex Elder offset & ~PAGE_MASK, false, false); 19829d4df01fSAlex Elder 19839d4df01fSAlex Elder if (write_request) 19849d4df01fSAlex Elder rbd_osd_req_format_write(obj_request); 19859d4df01fSAlex Elder else 19869d4df01fSAlex Elder rbd_osd_req_format_read(obj_request); 1987430c28c3SAlex Elder 19887da22d29SAlex Elder obj_request->img_offset = img_offset; 1989bf0d5f50SAlex Elder rbd_img_obj_request_add(img_request, obj_request); 1990bf0d5f50SAlex Elder 19917da22d29SAlex Elder img_offset += length; 1992bf0d5f50SAlex Elder resid -= length; 1993bf0d5f50SAlex Elder } 1994bf0d5f50SAlex Elder 1995bf0d5f50SAlex Elder return 0; 1996bf0d5f50SAlex Elder 1997bf0d5f50SAlex Elder out_partial: 1998bf0d5f50SAlex Elder rbd_obj_request_put(obj_request); 1999bf0d5f50SAlex Elder out_unwind: 2000bf0d5f50SAlex Elder for_each_obj_request_safe(img_request, obj_request, next_obj_request) 2001bf0d5f50SAlex Elder rbd_obj_request_put(obj_request); 2002bf0d5f50SAlex Elder 2003bf0d5f50SAlex Elder return -ENOMEM; 2004bf0d5f50SAlex Elder } 2005bf0d5f50SAlex Elder 20063d7efd18SAlex Elder static void 20070eefd470SAlex Elder rbd_img_obj_copyup_callback(struct rbd_obj_request *obj_request) 20080eefd470SAlex Elder { 20090eefd470SAlex Elder struct rbd_img_request *img_request; 20100eefd470SAlex Elder struct rbd_device *rbd_dev; 20110eefd470SAlex Elder u64 length; 20120eefd470SAlex Elder u32 page_count; 20130eefd470SAlex Elder 20140eefd470SAlex Elder rbd_assert(obj_request->type == OBJ_REQUEST_BIO); 20150eefd470SAlex Elder rbd_assert(obj_request_img_data_test(obj_request)); 20160eefd470SAlex Elder img_request = obj_request->img_request; 20170eefd470SAlex Elder rbd_assert(img_request); 20180eefd470SAlex Elder 20190eefd470SAlex Elder rbd_dev = img_request->rbd_dev; 20200eefd470SAlex Elder rbd_assert(rbd_dev); 20210eefd470SAlex Elder length = (u64)1 << rbd_dev->header.obj_order; 20220eefd470SAlex Elder page_count = (u32)calc_pages_for(0, length); 20230eefd470SAlex Elder 20240eefd470SAlex Elder rbd_assert(obj_request->copyup_pages); 20250eefd470SAlex Elder ceph_release_page_vector(obj_request->copyup_pages, page_count); 20260eefd470SAlex Elder obj_request->copyup_pages = NULL; 20270eefd470SAlex Elder 20280eefd470SAlex Elder /* 20290eefd470SAlex Elder * We want the transfer count to reflect the size of the 20300eefd470SAlex Elder * original write request. There is no such thing as a 20310eefd470SAlex Elder * successful short write, so if the request was successful 20320eefd470SAlex Elder * we can just set it to the originally-requested length. 20330eefd470SAlex Elder */ 20340eefd470SAlex Elder if (!obj_request->result) 20350eefd470SAlex Elder obj_request->xferred = obj_request->length; 20360eefd470SAlex Elder 20370eefd470SAlex Elder /* Finish up with the normal image object callback */ 20380eefd470SAlex Elder 20390eefd470SAlex Elder rbd_img_obj_callback(obj_request); 20400eefd470SAlex Elder } 20410eefd470SAlex Elder 20420eefd470SAlex Elder static void 20433d7efd18SAlex Elder rbd_img_obj_parent_read_full_callback(struct rbd_img_request *img_request) 20443d7efd18SAlex Elder { 20453d7efd18SAlex Elder struct rbd_obj_request *orig_request; 20460eefd470SAlex Elder struct ceph_osd_request *osd_req; 20470eefd470SAlex Elder struct ceph_osd_client *osdc; 20480eefd470SAlex Elder struct rbd_device *rbd_dev; 20493d7efd18SAlex Elder struct page **pages; 20503d7efd18SAlex Elder int result; 20513d7efd18SAlex Elder u64 obj_size; 20523d7efd18SAlex Elder u64 xferred; 20533d7efd18SAlex Elder 20543d7efd18SAlex Elder rbd_assert(img_request_child_test(img_request)); 20553d7efd18SAlex Elder 20563d7efd18SAlex Elder /* First get what we need from the image request */ 20573d7efd18SAlex Elder 20583d7efd18SAlex Elder pages = img_request->copyup_pages; 20593d7efd18SAlex Elder rbd_assert(pages != NULL); 20603d7efd18SAlex Elder img_request->copyup_pages = NULL; 20613d7efd18SAlex Elder 20623d7efd18SAlex Elder orig_request = img_request->obj_request; 20633d7efd18SAlex Elder rbd_assert(orig_request != NULL); 20640eefd470SAlex Elder rbd_assert(orig_request->type == OBJ_REQUEST_BIO); 20653d7efd18SAlex Elder result = img_request->result; 20663d7efd18SAlex Elder obj_size = img_request->length; 20673d7efd18SAlex Elder xferred = img_request->xferred; 20683d7efd18SAlex Elder 20690eefd470SAlex Elder rbd_dev = img_request->rbd_dev; 20700eefd470SAlex Elder rbd_assert(rbd_dev); 20710eefd470SAlex Elder rbd_assert(obj_size == (u64)1 << rbd_dev->header.obj_order); 20720eefd470SAlex Elder 20733d7efd18SAlex Elder rbd_img_request_put(img_request); 20743d7efd18SAlex Elder 20750eefd470SAlex Elder if (result) 20760eefd470SAlex Elder goto out_err; 20773d7efd18SAlex Elder 20780eefd470SAlex Elder /* Allocate the new copyup osd request for the original request */ 20793d7efd18SAlex Elder 20800eefd470SAlex Elder result = -ENOMEM; 20810eefd470SAlex Elder rbd_assert(!orig_request->osd_req); 20820eefd470SAlex Elder osd_req = rbd_osd_req_create_copyup(orig_request); 20830eefd470SAlex Elder if (!osd_req) 20840eefd470SAlex Elder goto out_err; 20850eefd470SAlex Elder orig_request->osd_req = osd_req; 20860eefd470SAlex Elder orig_request->copyup_pages = pages; 20873d7efd18SAlex Elder 20880eefd470SAlex Elder /* Initialize the copyup op */ 20890eefd470SAlex Elder 20900eefd470SAlex Elder osd_req_op_cls_init(osd_req, 0, CEPH_OSD_OP_CALL, "rbd", "copyup"); 20910eefd470SAlex Elder osd_req_op_cls_request_data_pages(osd_req, 0, pages, obj_size, 0, 20920eefd470SAlex Elder false, false); 20930eefd470SAlex Elder 20940eefd470SAlex Elder /* Then the original write request op */ 20950eefd470SAlex Elder 20960eefd470SAlex Elder osd_req_op_extent_init(osd_req, 1, CEPH_OSD_OP_WRITE, 20970eefd470SAlex Elder orig_request->offset, 20980eefd470SAlex Elder orig_request->length, 0, 0); 20990eefd470SAlex Elder osd_req_op_extent_osd_data_bio(osd_req, 1, orig_request->bio_list, 21000eefd470SAlex Elder orig_request->length); 21010eefd470SAlex Elder 21020eefd470SAlex Elder rbd_osd_req_format_write(orig_request); 21030eefd470SAlex Elder 21040eefd470SAlex Elder /* All set, send it off. */ 21050eefd470SAlex Elder 21060eefd470SAlex Elder orig_request->callback = rbd_img_obj_copyup_callback; 21070eefd470SAlex Elder osdc = &rbd_dev->rbd_client->client->osdc; 21080eefd470SAlex Elder result = rbd_obj_request_submit(osdc, orig_request); 21090eefd470SAlex Elder if (!result) 21100eefd470SAlex Elder return; 21110eefd470SAlex Elder out_err: 21120eefd470SAlex Elder /* Record the error code and complete the request */ 21130eefd470SAlex Elder 21140eefd470SAlex Elder orig_request->result = result; 21150eefd470SAlex Elder orig_request->xferred = 0; 21163d7efd18SAlex Elder obj_request_done_set(orig_request); 21173d7efd18SAlex Elder rbd_obj_request_complete(orig_request); 21183d7efd18SAlex Elder } 21193d7efd18SAlex Elder 21203d7efd18SAlex Elder /* 21213d7efd18SAlex Elder * Read from the parent image the range of data that covers the 21223d7efd18SAlex Elder * entire target of the given object request. This is used for 21233d7efd18SAlex Elder * satisfying a layered image write request when the target of an 21243d7efd18SAlex Elder * object request from the image request does not exist. 21253d7efd18SAlex Elder * 21263d7efd18SAlex Elder * A page array big enough to hold the returned data is allocated 21273d7efd18SAlex Elder * and supplied to rbd_img_request_fill() as the "data descriptor." 21283d7efd18SAlex Elder * When the read completes, this page array will be transferred to 21293d7efd18SAlex Elder * the original object request for the copyup operation. 21303d7efd18SAlex Elder * 21313d7efd18SAlex Elder * If an error occurs, record it as the result of the original 21323d7efd18SAlex Elder * object request and mark it done so it gets completed. 21333d7efd18SAlex Elder */ 21343d7efd18SAlex Elder static int rbd_img_obj_parent_read_full(struct rbd_obj_request *obj_request) 21353d7efd18SAlex Elder { 21363d7efd18SAlex Elder struct rbd_img_request *img_request = NULL; 21373d7efd18SAlex Elder struct rbd_img_request *parent_request = NULL; 21383d7efd18SAlex Elder struct rbd_device *rbd_dev; 21393d7efd18SAlex Elder u64 img_offset; 21403d7efd18SAlex Elder u64 length; 21413d7efd18SAlex Elder struct page **pages = NULL; 21423d7efd18SAlex Elder u32 page_count; 21433d7efd18SAlex Elder int result; 21443d7efd18SAlex Elder 21453d7efd18SAlex Elder rbd_assert(obj_request_img_data_test(obj_request)); 21463d7efd18SAlex Elder rbd_assert(obj_request->type == OBJ_REQUEST_BIO); 21473d7efd18SAlex Elder 21483d7efd18SAlex Elder img_request = obj_request->img_request; 21493d7efd18SAlex Elder rbd_assert(img_request != NULL); 21503d7efd18SAlex Elder rbd_dev = img_request->rbd_dev; 21513d7efd18SAlex Elder rbd_assert(rbd_dev->parent != NULL); 21523d7efd18SAlex Elder 21533d7efd18SAlex Elder /* 21540eefd470SAlex Elder * First things first. The original osd request is of no 21550eefd470SAlex Elder * use to use any more, we'll need a new one that can hold 21560eefd470SAlex Elder * the two ops in a copyup request. We'll get that later, 21570eefd470SAlex Elder * but for now we can release the old one. 21580eefd470SAlex Elder */ 21590eefd470SAlex Elder rbd_osd_req_destroy(obj_request->osd_req); 21600eefd470SAlex Elder obj_request->osd_req = NULL; 21610eefd470SAlex Elder 21620eefd470SAlex Elder /* 21633d7efd18SAlex Elder * Determine the byte range covered by the object in the 21643d7efd18SAlex Elder * child image to which the original request was to be sent. 21653d7efd18SAlex Elder */ 21663d7efd18SAlex Elder img_offset = obj_request->img_offset - obj_request->offset; 21673d7efd18SAlex Elder length = (u64)1 << rbd_dev->header.obj_order; 21683d7efd18SAlex Elder 21693d7efd18SAlex Elder /* 2170a9e8ba2cSAlex Elder * There is no defined parent data beyond the parent 2171a9e8ba2cSAlex Elder * overlap, so limit what we read at that boundary if 2172a9e8ba2cSAlex Elder * necessary. 2173a9e8ba2cSAlex Elder */ 2174a9e8ba2cSAlex Elder if (img_offset + length > rbd_dev->parent_overlap) { 2175a9e8ba2cSAlex Elder rbd_assert(img_offset < rbd_dev->parent_overlap); 2176a9e8ba2cSAlex Elder length = rbd_dev->parent_overlap - img_offset; 2177a9e8ba2cSAlex Elder } 2178a9e8ba2cSAlex Elder 2179a9e8ba2cSAlex Elder /* 21803d7efd18SAlex Elder * Allocate a page array big enough to receive the data read 21813d7efd18SAlex Elder * from the parent. 21823d7efd18SAlex Elder */ 21833d7efd18SAlex Elder page_count = (u32)calc_pages_for(0, length); 21843d7efd18SAlex Elder pages = ceph_alloc_page_vector(page_count, GFP_KERNEL); 21853d7efd18SAlex Elder if (IS_ERR(pages)) { 21863d7efd18SAlex Elder result = PTR_ERR(pages); 21873d7efd18SAlex Elder pages = NULL; 21883d7efd18SAlex Elder goto out_err; 21893d7efd18SAlex Elder } 21903d7efd18SAlex Elder 21913d7efd18SAlex Elder result = -ENOMEM; 21923d7efd18SAlex Elder parent_request = rbd_img_request_create(rbd_dev->parent, 21933d7efd18SAlex Elder img_offset, length, 21943d7efd18SAlex Elder false, true); 21953d7efd18SAlex Elder if (!parent_request) 21963d7efd18SAlex Elder goto out_err; 21973d7efd18SAlex Elder rbd_obj_request_get(obj_request); 21983d7efd18SAlex Elder parent_request->obj_request = obj_request; 21993d7efd18SAlex Elder 22003d7efd18SAlex Elder result = rbd_img_request_fill(parent_request, OBJ_REQUEST_PAGES, pages); 22013d7efd18SAlex Elder if (result) 22023d7efd18SAlex Elder goto out_err; 22033d7efd18SAlex Elder parent_request->copyup_pages = pages; 22043d7efd18SAlex Elder 22053d7efd18SAlex Elder parent_request->callback = rbd_img_obj_parent_read_full_callback; 22063d7efd18SAlex Elder result = rbd_img_request_submit(parent_request); 22073d7efd18SAlex Elder if (!result) 22083d7efd18SAlex Elder return 0; 22093d7efd18SAlex Elder 22103d7efd18SAlex Elder parent_request->copyup_pages = NULL; 22113d7efd18SAlex Elder parent_request->obj_request = NULL; 22123d7efd18SAlex Elder rbd_obj_request_put(obj_request); 22133d7efd18SAlex Elder out_err: 22143d7efd18SAlex Elder if (pages) 22153d7efd18SAlex Elder ceph_release_page_vector(pages, page_count); 22163d7efd18SAlex Elder if (parent_request) 22173d7efd18SAlex Elder rbd_img_request_put(parent_request); 22183d7efd18SAlex Elder obj_request->result = result; 22193d7efd18SAlex Elder obj_request->xferred = 0; 22203d7efd18SAlex Elder obj_request_done_set(obj_request); 22213d7efd18SAlex Elder 22223d7efd18SAlex Elder return result; 22233d7efd18SAlex Elder } 22243d7efd18SAlex Elder 2225c5b5ef6cSAlex Elder static void rbd_img_obj_exists_callback(struct rbd_obj_request *obj_request) 2226c5b5ef6cSAlex Elder { 2227c5b5ef6cSAlex Elder struct rbd_obj_request *orig_request; 2228c5b5ef6cSAlex Elder int result; 2229c5b5ef6cSAlex Elder 2230c5b5ef6cSAlex Elder rbd_assert(!obj_request_img_data_test(obj_request)); 2231c5b5ef6cSAlex Elder 2232c5b5ef6cSAlex Elder /* 2233c5b5ef6cSAlex Elder * All we need from the object request is the original 2234c5b5ef6cSAlex Elder * request and the result of the STAT op. Grab those, then 2235c5b5ef6cSAlex Elder * we're done with the request. 2236c5b5ef6cSAlex Elder */ 2237c5b5ef6cSAlex Elder orig_request = obj_request->obj_request; 2238c5b5ef6cSAlex Elder obj_request->obj_request = NULL; 2239c5b5ef6cSAlex Elder rbd_assert(orig_request); 2240c5b5ef6cSAlex Elder rbd_assert(orig_request->img_request); 2241c5b5ef6cSAlex Elder 2242c5b5ef6cSAlex Elder result = obj_request->result; 2243c5b5ef6cSAlex Elder obj_request->result = 0; 2244c5b5ef6cSAlex Elder 2245c5b5ef6cSAlex Elder dout("%s: obj %p for obj %p result %d %llu/%llu\n", __func__, 2246c5b5ef6cSAlex Elder obj_request, orig_request, result, 2247c5b5ef6cSAlex Elder obj_request->xferred, obj_request->length); 2248c5b5ef6cSAlex Elder rbd_obj_request_put(obj_request); 2249c5b5ef6cSAlex Elder 2250c5b5ef6cSAlex Elder rbd_assert(orig_request); 2251c5b5ef6cSAlex Elder rbd_assert(orig_request->img_request); 2252c5b5ef6cSAlex Elder 2253c5b5ef6cSAlex Elder /* 2254c5b5ef6cSAlex Elder * Our only purpose here is to determine whether the object 2255c5b5ef6cSAlex Elder * exists, and we don't want to treat the non-existence as 2256c5b5ef6cSAlex Elder * an error. If something else comes back, transfer the 2257c5b5ef6cSAlex Elder * error to the original request and complete it now. 2258c5b5ef6cSAlex Elder */ 2259c5b5ef6cSAlex Elder if (!result) { 2260c5b5ef6cSAlex Elder obj_request_existence_set(orig_request, true); 2261c5b5ef6cSAlex Elder } else if (result == -ENOENT) { 2262c5b5ef6cSAlex Elder obj_request_existence_set(orig_request, false); 2263c5b5ef6cSAlex Elder } else if (result) { 2264c5b5ef6cSAlex Elder orig_request->result = result; 22653d7efd18SAlex Elder goto out; 2266c5b5ef6cSAlex Elder } 2267c5b5ef6cSAlex Elder 2268c5b5ef6cSAlex Elder /* 2269c5b5ef6cSAlex Elder * Resubmit the original request now that we have recorded 2270c5b5ef6cSAlex Elder * whether the target object exists. 2271c5b5ef6cSAlex Elder */ 2272b454e36dSAlex Elder orig_request->result = rbd_img_obj_request_submit(orig_request); 22733d7efd18SAlex Elder out: 2274c5b5ef6cSAlex Elder if (orig_request->result) 2275c5b5ef6cSAlex Elder rbd_obj_request_complete(orig_request); 2276c5b5ef6cSAlex Elder rbd_obj_request_put(orig_request); 2277c5b5ef6cSAlex Elder } 2278c5b5ef6cSAlex Elder 2279c5b5ef6cSAlex Elder static int rbd_img_obj_exists_submit(struct rbd_obj_request *obj_request) 2280c5b5ef6cSAlex Elder { 2281c5b5ef6cSAlex Elder struct rbd_obj_request *stat_request; 2282c5b5ef6cSAlex Elder struct rbd_device *rbd_dev; 2283c5b5ef6cSAlex Elder struct ceph_osd_client *osdc; 2284c5b5ef6cSAlex Elder struct page **pages = NULL; 2285c5b5ef6cSAlex Elder u32 page_count; 2286c5b5ef6cSAlex Elder size_t size; 2287c5b5ef6cSAlex Elder int ret; 2288c5b5ef6cSAlex Elder 2289c5b5ef6cSAlex Elder /* 2290c5b5ef6cSAlex Elder * The response data for a STAT call consists of: 2291c5b5ef6cSAlex Elder * le64 length; 2292c5b5ef6cSAlex Elder * struct { 2293c5b5ef6cSAlex Elder * le32 tv_sec; 2294c5b5ef6cSAlex Elder * le32 tv_nsec; 2295c5b5ef6cSAlex Elder * } mtime; 2296c5b5ef6cSAlex Elder */ 2297c5b5ef6cSAlex Elder size = sizeof (__le64) + sizeof (__le32) + sizeof (__le32); 2298c5b5ef6cSAlex Elder page_count = (u32)calc_pages_for(0, size); 2299c5b5ef6cSAlex Elder pages = ceph_alloc_page_vector(page_count, GFP_KERNEL); 2300c5b5ef6cSAlex Elder if (IS_ERR(pages)) 2301c5b5ef6cSAlex Elder return PTR_ERR(pages); 2302c5b5ef6cSAlex Elder 2303c5b5ef6cSAlex Elder ret = -ENOMEM; 2304c5b5ef6cSAlex Elder stat_request = rbd_obj_request_create(obj_request->object_name, 0, 0, 2305c5b5ef6cSAlex Elder OBJ_REQUEST_PAGES); 2306c5b5ef6cSAlex Elder if (!stat_request) 2307c5b5ef6cSAlex Elder goto out; 2308c5b5ef6cSAlex Elder 2309c5b5ef6cSAlex Elder rbd_obj_request_get(obj_request); 2310c5b5ef6cSAlex Elder stat_request->obj_request = obj_request; 2311c5b5ef6cSAlex Elder stat_request->pages = pages; 2312c5b5ef6cSAlex Elder stat_request->page_count = page_count; 2313c5b5ef6cSAlex Elder 2314c5b5ef6cSAlex Elder rbd_assert(obj_request->img_request); 2315c5b5ef6cSAlex Elder rbd_dev = obj_request->img_request->rbd_dev; 2316c5b5ef6cSAlex Elder stat_request->osd_req = rbd_osd_req_create(rbd_dev, false, 2317c5b5ef6cSAlex Elder stat_request); 2318c5b5ef6cSAlex Elder if (!stat_request->osd_req) 2319c5b5ef6cSAlex Elder goto out; 2320c5b5ef6cSAlex Elder stat_request->callback = rbd_img_obj_exists_callback; 2321c5b5ef6cSAlex Elder 2322c5b5ef6cSAlex Elder osd_req_op_init(stat_request->osd_req, 0, CEPH_OSD_OP_STAT); 2323c5b5ef6cSAlex Elder osd_req_op_raw_data_in_pages(stat_request->osd_req, 0, pages, size, 0, 2324c5b5ef6cSAlex Elder false, false); 23259d4df01fSAlex Elder rbd_osd_req_format_read(stat_request); 2326c5b5ef6cSAlex Elder 2327c5b5ef6cSAlex Elder osdc = &rbd_dev->rbd_client->client->osdc; 2328c5b5ef6cSAlex Elder ret = rbd_obj_request_submit(osdc, stat_request); 2329c5b5ef6cSAlex Elder out: 2330c5b5ef6cSAlex Elder if (ret) 2331c5b5ef6cSAlex Elder rbd_obj_request_put(obj_request); 2332c5b5ef6cSAlex Elder 2333c5b5ef6cSAlex Elder return ret; 2334c5b5ef6cSAlex Elder } 2335c5b5ef6cSAlex Elder 2336b454e36dSAlex Elder static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request) 2337b454e36dSAlex Elder { 2338b454e36dSAlex Elder struct rbd_img_request *img_request; 2339a9e8ba2cSAlex Elder struct rbd_device *rbd_dev; 23403d7efd18SAlex Elder bool known; 2341b454e36dSAlex Elder 2342b454e36dSAlex Elder rbd_assert(obj_request_img_data_test(obj_request)); 2343b454e36dSAlex Elder 2344b454e36dSAlex Elder img_request = obj_request->img_request; 2345b454e36dSAlex Elder rbd_assert(img_request); 2346a9e8ba2cSAlex Elder rbd_dev = img_request->rbd_dev; 2347b454e36dSAlex Elder 2348b454e36dSAlex Elder /* 2349a9e8ba2cSAlex Elder * Only writes to layered images need special handling. 2350a9e8ba2cSAlex Elder * Reads and non-layered writes are simple object requests. 2351a9e8ba2cSAlex Elder * Layered writes that start beyond the end of the overlap 2352a9e8ba2cSAlex Elder * with the parent have no parent data, so they too are 2353a9e8ba2cSAlex Elder * simple object requests. Finally, if the target object is 2354a9e8ba2cSAlex Elder * known to already exist, its parent data has already been 2355a9e8ba2cSAlex Elder * copied, so a write to the object can also be handled as a 2356a9e8ba2cSAlex Elder * simple object request. 2357b454e36dSAlex Elder */ 2358b454e36dSAlex Elder if (!img_request_write_test(img_request) || 2359b454e36dSAlex Elder !img_request_layered_test(img_request) || 2360a9e8ba2cSAlex Elder rbd_dev->parent_overlap <= obj_request->img_offset || 23613d7efd18SAlex Elder ((known = obj_request_known_test(obj_request)) && 23623d7efd18SAlex Elder obj_request_exists_test(obj_request))) { 2363b454e36dSAlex Elder 2364b454e36dSAlex Elder struct rbd_device *rbd_dev; 2365b454e36dSAlex Elder struct ceph_osd_client *osdc; 2366b454e36dSAlex Elder 2367b454e36dSAlex Elder rbd_dev = obj_request->img_request->rbd_dev; 2368b454e36dSAlex Elder osdc = &rbd_dev->rbd_client->client->osdc; 2369b454e36dSAlex Elder 2370b454e36dSAlex Elder return rbd_obj_request_submit(osdc, obj_request); 2371b454e36dSAlex Elder } 2372b454e36dSAlex Elder 2373b454e36dSAlex Elder /* 23743d7efd18SAlex Elder * It's a layered write. The target object might exist but 23753d7efd18SAlex Elder * we may not know that yet. If we know it doesn't exist, 23763d7efd18SAlex Elder * start by reading the data for the full target object from 23773d7efd18SAlex Elder * the parent so we can use it for a copyup to the target. 2378b454e36dSAlex Elder */ 23793d7efd18SAlex Elder if (known) 23803d7efd18SAlex Elder return rbd_img_obj_parent_read_full(obj_request); 23813d7efd18SAlex Elder 23823d7efd18SAlex Elder /* We don't know whether the target exists. Go find out. */ 2383b454e36dSAlex Elder 2384b454e36dSAlex Elder return rbd_img_obj_exists_submit(obj_request); 2385b454e36dSAlex Elder } 2386b454e36dSAlex Elder 2387bf0d5f50SAlex Elder static int rbd_img_request_submit(struct rbd_img_request *img_request) 2388bf0d5f50SAlex Elder { 2389bf0d5f50SAlex Elder struct rbd_obj_request *obj_request; 239046faeed4SAlex Elder struct rbd_obj_request *next_obj_request; 2391bf0d5f50SAlex Elder 239237206ee5SAlex Elder dout("%s: img %p\n", __func__, img_request); 239346faeed4SAlex Elder for_each_obj_request_safe(img_request, obj_request, next_obj_request) { 2394bf0d5f50SAlex Elder int ret; 2395bf0d5f50SAlex Elder 2396b454e36dSAlex Elder ret = rbd_img_obj_request_submit(obj_request); 2397bf0d5f50SAlex Elder if (ret) 2398bf0d5f50SAlex Elder return ret; 2399bf0d5f50SAlex Elder } 2400bf0d5f50SAlex Elder 2401bf0d5f50SAlex Elder return 0; 2402bf0d5f50SAlex Elder } 2403bf0d5f50SAlex Elder 24048b3e1a56SAlex Elder static void rbd_img_parent_read_callback(struct rbd_img_request *img_request) 24058b3e1a56SAlex Elder { 24068b3e1a56SAlex Elder struct rbd_obj_request *obj_request; 2407a9e8ba2cSAlex Elder struct rbd_device *rbd_dev; 2408a9e8ba2cSAlex Elder u64 obj_end; 24098b3e1a56SAlex Elder 24108b3e1a56SAlex Elder rbd_assert(img_request_child_test(img_request)); 24118b3e1a56SAlex Elder 24128b3e1a56SAlex Elder obj_request = img_request->obj_request; 2413a9e8ba2cSAlex Elder rbd_assert(obj_request); 2414a9e8ba2cSAlex Elder rbd_assert(obj_request->img_request); 24158b3e1a56SAlex Elder 2416a9e8ba2cSAlex Elder obj_request->result = img_request->result; 2417a9e8ba2cSAlex Elder if (obj_request->result) 2418a9e8ba2cSAlex Elder goto out; 2419a9e8ba2cSAlex Elder 2420a9e8ba2cSAlex Elder /* 2421a9e8ba2cSAlex Elder * We need to zero anything beyond the parent overlap 2422a9e8ba2cSAlex Elder * boundary. Since rbd_img_obj_request_read_callback() 2423a9e8ba2cSAlex Elder * will zero anything beyond the end of a short read, an 2424a9e8ba2cSAlex Elder * easy way to do this is to pretend the data from the 2425a9e8ba2cSAlex Elder * parent came up short--ending at the overlap boundary. 2426a9e8ba2cSAlex Elder */ 2427a9e8ba2cSAlex Elder rbd_assert(obj_request->img_offset < U64_MAX - obj_request->length); 2428a9e8ba2cSAlex Elder obj_end = obj_request->img_offset + obj_request->length; 2429a9e8ba2cSAlex Elder rbd_dev = obj_request->img_request->rbd_dev; 2430a9e8ba2cSAlex Elder if (obj_end > rbd_dev->parent_overlap) { 2431a9e8ba2cSAlex Elder u64 xferred = 0; 2432a9e8ba2cSAlex Elder 2433a9e8ba2cSAlex Elder if (obj_request->img_offset < rbd_dev->parent_overlap) 2434a9e8ba2cSAlex Elder xferred = rbd_dev->parent_overlap - 2435a9e8ba2cSAlex Elder obj_request->img_offset; 2436a9e8ba2cSAlex Elder 2437a9e8ba2cSAlex Elder obj_request->xferred = min(img_request->xferred, xferred); 2438a9e8ba2cSAlex Elder } else { 2439a9e8ba2cSAlex Elder obj_request->xferred = img_request->xferred; 2440a9e8ba2cSAlex Elder } 2441a9e8ba2cSAlex Elder out: 24428b3e1a56SAlex Elder rbd_img_obj_request_read_callback(obj_request); 24438b3e1a56SAlex Elder rbd_obj_request_complete(obj_request); 24448b3e1a56SAlex Elder } 24458b3e1a56SAlex Elder 24468b3e1a56SAlex Elder static void rbd_img_parent_read(struct rbd_obj_request *obj_request) 24478b3e1a56SAlex Elder { 24488b3e1a56SAlex Elder struct rbd_device *rbd_dev; 24498b3e1a56SAlex Elder struct rbd_img_request *img_request; 24508b3e1a56SAlex Elder int result; 24518b3e1a56SAlex Elder 24528b3e1a56SAlex Elder rbd_assert(obj_request_img_data_test(obj_request)); 24538b3e1a56SAlex Elder rbd_assert(obj_request->img_request != NULL); 24548b3e1a56SAlex Elder rbd_assert(obj_request->result == (s32) -ENOENT); 24558b3e1a56SAlex Elder rbd_assert(obj_request->type == OBJ_REQUEST_BIO); 24568b3e1a56SAlex Elder 24578b3e1a56SAlex Elder rbd_dev = obj_request->img_request->rbd_dev; 24588b3e1a56SAlex Elder rbd_assert(rbd_dev->parent != NULL); 24598b3e1a56SAlex Elder /* rbd_read_finish(obj_request, obj_request->length); */ 24608b3e1a56SAlex Elder img_request = rbd_img_request_create(rbd_dev->parent, 24618b3e1a56SAlex Elder obj_request->img_offset, 24628b3e1a56SAlex Elder obj_request->length, 24638b3e1a56SAlex Elder false, true); 24648b3e1a56SAlex Elder result = -ENOMEM; 24658b3e1a56SAlex Elder if (!img_request) 24668b3e1a56SAlex Elder goto out_err; 24678b3e1a56SAlex Elder 24688b3e1a56SAlex Elder rbd_obj_request_get(obj_request); 24698b3e1a56SAlex Elder img_request->obj_request = obj_request; 24708b3e1a56SAlex Elder 2471f1a4739fSAlex Elder result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO, 2472f1a4739fSAlex Elder obj_request->bio_list); 24738b3e1a56SAlex Elder if (result) 24748b3e1a56SAlex Elder goto out_err; 24758b3e1a56SAlex Elder 24768b3e1a56SAlex Elder img_request->callback = rbd_img_parent_read_callback; 24778b3e1a56SAlex Elder result = rbd_img_request_submit(img_request); 24788b3e1a56SAlex Elder if (result) 24798b3e1a56SAlex Elder goto out_err; 24808b3e1a56SAlex Elder 24818b3e1a56SAlex Elder return; 24828b3e1a56SAlex Elder out_err: 24838b3e1a56SAlex Elder if (img_request) 24848b3e1a56SAlex Elder rbd_img_request_put(img_request); 24858b3e1a56SAlex Elder obj_request->result = result; 24868b3e1a56SAlex Elder obj_request->xferred = 0; 24878b3e1a56SAlex Elder obj_request_done_set(obj_request); 24888b3e1a56SAlex Elder } 24898b3e1a56SAlex Elder 2490cf81b60eSAlex Elder static int rbd_obj_notify_ack(struct rbd_device *rbd_dev, 2491b8d70035SAlex Elder u64 ver, u64 notify_id) 2492b8d70035SAlex Elder { 2493b8d70035SAlex Elder struct rbd_obj_request *obj_request; 24942169238dSAlex Elder struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 2495b8d70035SAlex Elder int ret; 2496b8d70035SAlex Elder 2497b8d70035SAlex Elder obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0, 2498b8d70035SAlex Elder OBJ_REQUEST_NODATA); 2499b8d70035SAlex Elder if (!obj_request) 2500b8d70035SAlex Elder return -ENOMEM; 2501b8d70035SAlex Elder 2502b8d70035SAlex Elder ret = -ENOMEM; 2503430c28c3SAlex Elder obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request); 2504b8d70035SAlex Elder if (!obj_request->osd_req) 2505b8d70035SAlex Elder goto out; 25062169238dSAlex Elder obj_request->callback = rbd_obj_request_put; 2507b8d70035SAlex Elder 2508c99d2d4aSAlex Elder osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_NOTIFY_ACK, 2509c99d2d4aSAlex Elder notify_id, ver, 0); 25109d4df01fSAlex Elder rbd_osd_req_format_read(obj_request); 2511430c28c3SAlex Elder 2512b8d70035SAlex Elder ret = rbd_obj_request_submit(osdc, obj_request); 2513b8d70035SAlex Elder out: 2514cf81b60eSAlex Elder if (ret) 2515b8d70035SAlex Elder rbd_obj_request_put(obj_request); 2516b8d70035SAlex Elder 2517b8d70035SAlex Elder return ret; 2518b8d70035SAlex Elder } 2519b8d70035SAlex Elder 2520b8d70035SAlex Elder static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data) 2521b8d70035SAlex Elder { 2522b8d70035SAlex Elder struct rbd_device *rbd_dev = (struct rbd_device *)data; 2523b8d70035SAlex Elder u64 hver; 2524b8d70035SAlex Elder 2525b8d70035SAlex Elder if (!rbd_dev) 2526b8d70035SAlex Elder return; 2527b8d70035SAlex Elder 252837206ee5SAlex Elder dout("%s: \"%s\" notify_id %llu opcode %u\n", __func__, 2529b8d70035SAlex Elder rbd_dev->header_name, (unsigned long long) notify_id, 2530b8d70035SAlex Elder (unsigned int) opcode); 2531522a0cc0SAlex Elder (void)rbd_dev_refresh(rbd_dev, &hver); 2532b8d70035SAlex Elder 2533cf81b60eSAlex Elder rbd_obj_notify_ack(rbd_dev, hver, notify_id); 2534b8d70035SAlex Elder } 2535b8d70035SAlex Elder 25369969ebc5SAlex Elder /* 25379969ebc5SAlex Elder * Request sync osd watch/unwatch. The value of "start" determines 25389969ebc5SAlex Elder * whether a watch request is being initiated or torn down. 25399969ebc5SAlex Elder */ 25409969ebc5SAlex Elder static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev, int start) 25419969ebc5SAlex Elder { 25429969ebc5SAlex Elder struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 25439969ebc5SAlex Elder struct rbd_obj_request *obj_request; 25449969ebc5SAlex Elder int ret; 25459969ebc5SAlex Elder 25469969ebc5SAlex Elder rbd_assert(start ^ !!rbd_dev->watch_event); 25479969ebc5SAlex Elder rbd_assert(start ^ !!rbd_dev->watch_request); 25489969ebc5SAlex Elder 25499969ebc5SAlex Elder if (start) { 25503c663bbdSAlex Elder ret = ceph_osdc_create_event(osdc, rbd_watch_cb, rbd_dev, 25519969ebc5SAlex Elder &rbd_dev->watch_event); 25529969ebc5SAlex Elder if (ret < 0) 25539969ebc5SAlex Elder return ret; 25548eb87565SAlex Elder rbd_assert(rbd_dev->watch_event != NULL); 25559969ebc5SAlex Elder } 25569969ebc5SAlex Elder 25579969ebc5SAlex Elder ret = -ENOMEM; 25589969ebc5SAlex Elder obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0, 25599969ebc5SAlex Elder OBJ_REQUEST_NODATA); 25609969ebc5SAlex Elder if (!obj_request) 25619969ebc5SAlex Elder goto out_cancel; 25629969ebc5SAlex Elder 2563430c28c3SAlex Elder obj_request->osd_req = rbd_osd_req_create(rbd_dev, true, obj_request); 2564430c28c3SAlex Elder if (!obj_request->osd_req) 2565430c28c3SAlex Elder goto out_cancel; 2566430c28c3SAlex Elder 25678eb87565SAlex Elder if (start) 2568975241afSAlex Elder ceph_osdc_set_request_linger(osdc, obj_request->osd_req); 25698eb87565SAlex Elder else 25706977c3f9SAlex Elder ceph_osdc_unregister_linger_request(osdc, 2571975241afSAlex Elder rbd_dev->watch_request->osd_req); 25722169238dSAlex Elder 25732169238dSAlex Elder osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_WATCH, 25742169238dSAlex Elder rbd_dev->watch_event->cookie, 25752169238dSAlex Elder rbd_dev->header.obj_version, start); 25769d4df01fSAlex Elder rbd_osd_req_format_write(obj_request); 25772169238dSAlex Elder 25789969ebc5SAlex Elder ret = rbd_obj_request_submit(osdc, obj_request); 25799969ebc5SAlex Elder if (ret) 25809969ebc5SAlex Elder goto out_cancel; 25819969ebc5SAlex Elder ret = rbd_obj_request_wait(obj_request); 25829969ebc5SAlex Elder if (ret) 25839969ebc5SAlex Elder goto out_cancel; 25849969ebc5SAlex Elder ret = obj_request->result; 25859969ebc5SAlex Elder if (ret) 25869969ebc5SAlex Elder goto out_cancel; 25879969ebc5SAlex Elder 25888eb87565SAlex Elder /* 25898eb87565SAlex Elder * A watch request is set to linger, so the underlying osd 25908eb87565SAlex Elder * request won't go away until we unregister it. We retain 25918eb87565SAlex Elder * a pointer to the object request during that time (in 25928eb87565SAlex Elder * rbd_dev->watch_request), so we'll keep a reference to 25938eb87565SAlex Elder * it. We'll drop that reference (below) after we've 25948eb87565SAlex Elder * unregistered it. 25958eb87565SAlex Elder */ 25968eb87565SAlex Elder if (start) { 25978eb87565SAlex Elder rbd_dev->watch_request = obj_request; 25988eb87565SAlex Elder 25998eb87565SAlex Elder return 0; 26008eb87565SAlex Elder } 26018eb87565SAlex Elder 26028eb87565SAlex Elder /* We have successfully torn down the watch request */ 26038eb87565SAlex Elder 26048eb87565SAlex Elder rbd_obj_request_put(rbd_dev->watch_request); 26058eb87565SAlex Elder rbd_dev->watch_request = NULL; 26069969ebc5SAlex Elder out_cancel: 26079969ebc5SAlex Elder /* Cancel the event if we're tearing down, or on error */ 26089969ebc5SAlex Elder ceph_osdc_cancel_event(rbd_dev->watch_event); 26099969ebc5SAlex Elder rbd_dev->watch_event = NULL; 26109969ebc5SAlex Elder if (obj_request) 26119969ebc5SAlex Elder rbd_obj_request_put(obj_request); 26129969ebc5SAlex Elder 26139969ebc5SAlex Elder return ret; 26149969ebc5SAlex Elder } 26159969ebc5SAlex Elder 261636be9a76SAlex Elder /* 2617f40eb349SAlex Elder * Synchronous osd object method call. Returns the number of bytes 2618f40eb349SAlex Elder * returned in the outbound buffer, or a negative error code. 261936be9a76SAlex Elder */ 262036be9a76SAlex Elder static int rbd_obj_method_sync(struct rbd_device *rbd_dev, 262136be9a76SAlex Elder const char *object_name, 262236be9a76SAlex Elder const char *class_name, 262336be9a76SAlex Elder const char *method_name, 26244157976bSAlex Elder const void *outbound, 262536be9a76SAlex Elder size_t outbound_size, 26264157976bSAlex Elder void *inbound, 262736be9a76SAlex Elder size_t inbound_size, 262836be9a76SAlex Elder u64 *version) 262936be9a76SAlex Elder { 26302169238dSAlex Elder struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 263136be9a76SAlex Elder struct rbd_obj_request *obj_request; 263236be9a76SAlex Elder struct page **pages; 263336be9a76SAlex Elder u32 page_count; 263436be9a76SAlex Elder int ret; 263536be9a76SAlex Elder 263636be9a76SAlex Elder /* 26376010a451SAlex Elder * Method calls are ultimately read operations. The result 26386010a451SAlex Elder * should placed into the inbound buffer provided. They 26396010a451SAlex Elder * also supply outbound data--parameters for the object 26406010a451SAlex Elder * method. Currently if this is present it will be a 26416010a451SAlex Elder * snapshot id. 264236be9a76SAlex Elder */ 264336be9a76SAlex Elder page_count = (u32)calc_pages_for(0, inbound_size); 264436be9a76SAlex Elder pages = ceph_alloc_page_vector(page_count, GFP_KERNEL); 264536be9a76SAlex Elder if (IS_ERR(pages)) 264636be9a76SAlex Elder return PTR_ERR(pages); 264736be9a76SAlex Elder 264836be9a76SAlex Elder ret = -ENOMEM; 26496010a451SAlex Elder obj_request = rbd_obj_request_create(object_name, 0, inbound_size, 265036be9a76SAlex Elder OBJ_REQUEST_PAGES); 265136be9a76SAlex Elder if (!obj_request) 265236be9a76SAlex Elder goto out; 265336be9a76SAlex Elder 265436be9a76SAlex Elder obj_request->pages = pages; 265536be9a76SAlex Elder obj_request->page_count = page_count; 265636be9a76SAlex Elder 2657430c28c3SAlex Elder obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request); 265836be9a76SAlex Elder if (!obj_request->osd_req) 265936be9a76SAlex Elder goto out; 266036be9a76SAlex Elder 2661c99d2d4aSAlex Elder osd_req_op_cls_init(obj_request->osd_req, 0, CEPH_OSD_OP_CALL, 266204017e29SAlex Elder class_name, method_name); 266304017e29SAlex Elder if (outbound_size) { 266404017e29SAlex Elder struct ceph_pagelist *pagelist; 266504017e29SAlex Elder 266604017e29SAlex Elder pagelist = kmalloc(sizeof (*pagelist), GFP_NOFS); 266704017e29SAlex Elder if (!pagelist) 266804017e29SAlex Elder goto out; 266904017e29SAlex Elder 267004017e29SAlex Elder ceph_pagelist_init(pagelist); 267104017e29SAlex Elder ceph_pagelist_append(pagelist, outbound, outbound_size); 267204017e29SAlex Elder osd_req_op_cls_request_data_pagelist(obj_request->osd_req, 0, 267304017e29SAlex Elder pagelist); 267404017e29SAlex Elder } 2675a4ce40a9SAlex Elder osd_req_op_cls_response_data_pages(obj_request->osd_req, 0, 2676a4ce40a9SAlex Elder obj_request->pages, inbound_size, 267744cd188dSAlex Elder 0, false, false); 26789d4df01fSAlex Elder rbd_osd_req_format_read(obj_request); 2679430c28c3SAlex Elder 268036be9a76SAlex Elder ret = rbd_obj_request_submit(osdc, obj_request); 268136be9a76SAlex Elder if (ret) 268236be9a76SAlex Elder goto out; 268336be9a76SAlex Elder ret = rbd_obj_request_wait(obj_request); 268436be9a76SAlex Elder if (ret) 268536be9a76SAlex Elder goto out; 268636be9a76SAlex Elder 268736be9a76SAlex Elder ret = obj_request->result; 268836be9a76SAlex Elder if (ret < 0) 268936be9a76SAlex Elder goto out; 269057385b51SAlex Elder 269157385b51SAlex Elder rbd_assert(obj_request->xferred < (u64)INT_MAX); 269257385b51SAlex Elder ret = (int)obj_request->xferred; 2693903bb32eSAlex Elder ceph_copy_from_page_vector(pages, inbound, 0, obj_request->xferred); 269436be9a76SAlex Elder if (version) 269536be9a76SAlex Elder *version = obj_request->version; 269636be9a76SAlex Elder out: 269736be9a76SAlex Elder if (obj_request) 269836be9a76SAlex Elder rbd_obj_request_put(obj_request); 269936be9a76SAlex Elder else 270036be9a76SAlex Elder ceph_release_page_vector(pages, page_count); 270136be9a76SAlex Elder 270236be9a76SAlex Elder return ret; 270336be9a76SAlex Elder } 270436be9a76SAlex Elder 2705bf0d5f50SAlex Elder static void rbd_request_fn(struct request_queue *q) 2706cc344fa1SAlex Elder __releases(q->queue_lock) __acquires(q->queue_lock) 2707bf0d5f50SAlex Elder { 2708bf0d5f50SAlex Elder struct rbd_device *rbd_dev = q->queuedata; 2709bf0d5f50SAlex Elder bool read_only = rbd_dev->mapping.read_only; 2710bf0d5f50SAlex Elder struct request *rq; 2711bf0d5f50SAlex Elder int result; 2712bf0d5f50SAlex Elder 2713bf0d5f50SAlex Elder while ((rq = blk_fetch_request(q))) { 2714bf0d5f50SAlex Elder bool write_request = rq_data_dir(rq) == WRITE; 2715bf0d5f50SAlex Elder struct rbd_img_request *img_request; 2716bf0d5f50SAlex Elder u64 offset; 2717bf0d5f50SAlex Elder u64 length; 2718bf0d5f50SAlex Elder 2719bf0d5f50SAlex Elder /* Ignore any non-FS requests that filter through. */ 2720bf0d5f50SAlex Elder 2721bf0d5f50SAlex Elder if (rq->cmd_type != REQ_TYPE_FS) { 27224dda41d3SAlex Elder dout("%s: non-fs request type %d\n", __func__, 27234dda41d3SAlex Elder (int) rq->cmd_type); 27244dda41d3SAlex Elder __blk_end_request_all(rq, 0); 27254dda41d3SAlex Elder continue; 27264dda41d3SAlex Elder } 27274dda41d3SAlex Elder 27284dda41d3SAlex Elder /* Ignore/skip any zero-length requests */ 27294dda41d3SAlex Elder 27304dda41d3SAlex Elder offset = (u64) blk_rq_pos(rq) << SECTOR_SHIFT; 27314dda41d3SAlex Elder length = (u64) blk_rq_bytes(rq); 27324dda41d3SAlex Elder 27334dda41d3SAlex Elder if (!length) { 27344dda41d3SAlex Elder dout("%s: zero-length request\n", __func__); 2735bf0d5f50SAlex Elder __blk_end_request_all(rq, 0); 2736bf0d5f50SAlex Elder continue; 2737bf0d5f50SAlex Elder } 2738bf0d5f50SAlex Elder 2739bf0d5f50SAlex Elder spin_unlock_irq(q->queue_lock); 2740bf0d5f50SAlex Elder 2741bf0d5f50SAlex Elder /* Disallow writes to a read-only device */ 2742bf0d5f50SAlex Elder 2743bf0d5f50SAlex Elder if (write_request) { 2744bf0d5f50SAlex Elder result = -EROFS; 2745bf0d5f50SAlex Elder if (read_only) 2746bf0d5f50SAlex Elder goto end_request; 2747bf0d5f50SAlex Elder rbd_assert(rbd_dev->spec->snap_id == CEPH_NOSNAP); 2748bf0d5f50SAlex Elder } 2749bf0d5f50SAlex Elder 27506d292906SAlex Elder /* 27516d292906SAlex Elder * Quit early if the mapped snapshot no longer 27526d292906SAlex Elder * exists. It's still possible the snapshot will 27536d292906SAlex Elder * have disappeared by the time our request arrives 27546d292906SAlex Elder * at the osd, but there's no sense in sending it if 27556d292906SAlex Elder * we already know. 27566d292906SAlex Elder */ 27576d292906SAlex Elder if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) { 2758bf0d5f50SAlex Elder dout("request for non-existent snapshot"); 2759bf0d5f50SAlex Elder rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP); 2760bf0d5f50SAlex Elder result = -ENXIO; 2761bf0d5f50SAlex Elder goto end_request; 2762bf0d5f50SAlex Elder } 2763bf0d5f50SAlex Elder 2764bf0d5f50SAlex Elder result = -EINVAL; 2765bf0d5f50SAlex Elder if (WARN_ON(offset && length > U64_MAX - offset + 1)) 2766bf0d5f50SAlex Elder goto end_request; /* Shouldn't happen */ 2767bf0d5f50SAlex Elder 2768bf0d5f50SAlex Elder result = -ENOMEM; 2769bf0d5f50SAlex Elder img_request = rbd_img_request_create(rbd_dev, offset, length, 27709849e986SAlex Elder write_request, false); 2771bf0d5f50SAlex Elder if (!img_request) 2772bf0d5f50SAlex Elder goto end_request; 2773bf0d5f50SAlex Elder 2774bf0d5f50SAlex Elder img_request->rq = rq; 2775bf0d5f50SAlex Elder 2776f1a4739fSAlex Elder result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO, 2777f1a4739fSAlex Elder rq->bio); 2778bf0d5f50SAlex Elder if (!result) 2779bf0d5f50SAlex Elder result = rbd_img_request_submit(img_request); 2780bf0d5f50SAlex Elder if (result) 2781bf0d5f50SAlex Elder rbd_img_request_put(img_request); 2782bf0d5f50SAlex Elder end_request: 2783bf0d5f50SAlex Elder spin_lock_irq(q->queue_lock); 2784bf0d5f50SAlex Elder if (result < 0) { 27857da22d29SAlex Elder rbd_warn(rbd_dev, "%s %llx at %llx result %d\n", 27867da22d29SAlex Elder write_request ? "write" : "read", 27877da22d29SAlex Elder length, offset, result); 27887da22d29SAlex Elder 2789bf0d5f50SAlex Elder __blk_end_request_all(rq, result); 2790bf0d5f50SAlex Elder } 2791bf0d5f50SAlex Elder } 2792bf0d5f50SAlex Elder } 2793bf0d5f50SAlex Elder 2794602adf40SYehuda Sadeh /* 2795602adf40SYehuda Sadeh * a queue callback. Makes sure that we don't create a bio that spans across 2796602adf40SYehuda Sadeh * multiple osd objects. One exception would be with a single page bios, 2797f7760dadSAlex Elder * which we handle later at bio_chain_clone_range() 2798602adf40SYehuda Sadeh */ 2799602adf40SYehuda Sadeh static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd, 2800602adf40SYehuda Sadeh struct bio_vec *bvec) 2801602adf40SYehuda Sadeh { 2802602adf40SYehuda Sadeh struct rbd_device *rbd_dev = q->queuedata; 2803e5cfeed2SAlex Elder sector_t sector_offset; 2804e5cfeed2SAlex Elder sector_t sectors_per_obj; 2805e5cfeed2SAlex Elder sector_t obj_sector_offset; 2806e5cfeed2SAlex Elder int ret; 2807602adf40SYehuda Sadeh 2808e5cfeed2SAlex Elder /* 2809e5cfeed2SAlex Elder * Find how far into its rbd object the partition-relative 2810e5cfeed2SAlex Elder * bio start sector is to offset relative to the enclosing 2811e5cfeed2SAlex Elder * device. 2812e5cfeed2SAlex Elder */ 2813e5cfeed2SAlex Elder sector_offset = get_start_sect(bmd->bi_bdev) + bmd->bi_sector; 2814e5cfeed2SAlex Elder sectors_per_obj = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT); 2815e5cfeed2SAlex Elder obj_sector_offset = sector_offset & (sectors_per_obj - 1); 2816593a9e7bSAlex Elder 2817e5cfeed2SAlex Elder /* 2818e5cfeed2SAlex Elder * Compute the number of bytes from that offset to the end 2819e5cfeed2SAlex Elder * of the object. Account for what's already used by the bio. 2820e5cfeed2SAlex Elder */ 2821e5cfeed2SAlex Elder ret = (int) (sectors_per_obj - obj_sector_offset) << SECTOR_SHIFT; 2822e5cfeed2SAlex Elder if (ret > bmd->bi_size) 2823e5cfeed2SAlex Elder ret -= bmd->bi_size; 2824e5cfeed2SAlex Elder else 2825e5cfeed2SAlex Elder ret = 0; 2826e5cfeed2SAlex Elder 2827e5cfeed2SAlex Elder /* 2828e5cfeed2SAlex Elder * Don't send back more than was asked for. And if the bio 2829e5cfeed2SAlex Elder * was empty, let the whole thing through because: "Note 2830e5cfeed2SAlex Elder * that a block device *must* allow a single page to be 2831e5cfeed2SAlex Elder * added to an empty bio." 2832e5cfeed2SAlex Elder */ 2833e5cfeed2SAlex Elder rbd_assert(bvec->bv_len <= PAGE_SIZE); 2834e5cfeed2SAlex Elder if (ret > (int) bvec->bv_len || !bmd->bi_size) 2835e5cfeed2SAlex Elder ret = (int) bvec->bv_len; 2836e5cfeed2SAlex Elder 2837e5cfeed2SAlex Elder return ret; 2838602adf40SYehuda Sadeh } 2839602adf40SYehuda Sadeh 2840602adf40SYehuda Sadeh static void rbd_free_disk(struct rbd_device *rbd_dev) 2841602adf40SYehuda Sadeh { 2842602adf40SYehuda Sadeh struct gendisk *disk = rbd_dev->disk; 2843602adf40SYehuda Sadeh 2844602adf40SYehuda Sadeh if (!disk) 2845602adf40SYehuda Sadeh return; 2846602adf40SYehuda Sadeh 2847602adf40SYehuda Sadeh if (disk->flags & GENHD_FL_UP) 2848602adf40SYehuda Sadeh del_gendisk(disk); 2849602adf40SYehuda Sadeh if (disk->queue) 2850602adf40SYehuda Sadeh blk_cleanup_queue(disk->queue); 2851602adf40SYehuda Sadeh put_disk(disk); 2852602adf40SYehuda Sadeh } 2853602adf40SYehuda Sadeh 2854788e2df3SAlex Elder static int rbd_obj_read_sync(struct rbd_device *rbd_dev, 2855788e2df3SAlex Elder const char *object_name, 2856788e2df3SAlex Elder u64 offset, u64 length, 285780ef15bfSAlex Elder void *buf, u64 *version) 2858788e2df3SAlex Elder 2859788e2df3SAlex Elder { 28602169238dSAlex Elder struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 2861788e2df3SAlex Elder struct rbd_obj_request *obj_request; 2862788e2df3SAlex Elder struct page **pages = NULL; 2863788e2df3SAlex Elder u32 page_count; 28641ceae7efSAlex Elder size_t size; 2865788e2df3SAlex Elder int ret; 2866788e2df3SAlex Elder 2867788e2df3SAlex Elder page_count = (u32) calc_pages_for(offset, length); 2868788e2df3SAlex Elder pages = ceph_alloc_page_vector(page_count, GFP_KERNEL); 2869788e2df3SAlex Elder if (IS_ERR(pages)) 2870788e2df3SAlex Elder ret = PTR_ERR(pages); 2871788e2df3SAlex Elder 2872788e2df3SAlex Elder ret = -ENOMEM; 2873788e2df3SAlex Elder obj_request = rbd_obj_request_create(object_name, offset, length, 2874788e2df3SAlex Elder OBJ_REQUEST_PAGES); 2875788e2df3SAlex Elder if (!obj_request) 2876788e2df3SAlex Elder goto out; 2877788e2df3SAlex Elder 2878788e2df3SAlex Elder obj_request->pages = pages; 2879788e2df3SAlex Elder obj_request->page_count = page_count; 2880788e2df3SAlex Elder 2881430c28c3SAlex Elder obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request); 2882788e2df3SAlex Elder if (!obj_request->osd_req) 2883788e2df3SAlex Elder goto out; 2884788e2df3SAlex Elder 2885c99d2d4aSAlex Elder osd_req_op_extent_init(obj_request->osd_req, 0, CEPH_OSD_OP_READ, 2886c99d2d4aSAlex Elder offset, length, 0, 0); 2887406e2c9fSAlex Elder osd_req_op_extent_osd_data_pages(obj_request->osd_req, 0, 2888a4ce40a9SAlex Elder obj_request->pages, 288944cd188dSAlex Elder obj_request->length, 289044cd188dSAlex Elder obj_request->offset & ~PAGE_MASK, 289144cd188dSAlex Elder false, false); 28929d4df01fSAlex Elder rbd_osd_req_format_read(obj_request); 2893430c28c3SAlex Elder 2894788e2df3SAlex Elder ret = rbd_obj_request_submit(osdc, obj_request); 2895788e2df3SAlex Elder if (ret) 2896788e2df3SAlex Elder goto out; 2897788e2df3SAlex Elder ret = rbd_obj_request_wait(obj_request); 2898788e2df3SAlex Elder if (ret) 2899788e2df3SAlex Elder goto out; 2900788e2df3SAlex Elder 2901788e2df3SAlex Elder ret = obj_request->result; 2902788e2df3SAlex Elder if (ret < 0) 2903788e2df3SAlex Elder goto out; 29041ceae7efSAlex Elder 29051ceae7efSAlex Elder rbd_assert(obj_request->xferred <= (u64) SIZE_MAX); 29061ceae7efSAlex Elder size = (size_t) obj_request->xferred; 2907903bb32eSAlex Elder ceph_copy_from_page_vector(pages, buf, 0, size); 290823ed6e13SAlex Elder rbd_assert(size <= (size_t) INT_MAX); 290923ed6e13SAlex Elder ret = (int) size; 2910788e2df3SAlex Elder if (version) 2911788e2df3SAlex Elder *version = obj_request->version; 2912788e2df3SAlex Elder out: 2913788e2df3SAlex Elder if (obj_request) 2914788e2df3SAlex Elder rbd_obj_request_put(obj_request); 2915788e2df3SAlex Elder else 2916788e2df3SAlex Elder ceph_release_page_vector(pages, page_count); 2917788e2df3SAlex Elder 2918788e2df3SAlex Elder return ret; 2919788e2df3SAlex Elder } 2920788e2df3SAlex Elder 2921602adf40SYehuda Sadeh /* 29224156d998SAlex Elder * Read the complete header for the given rbd device. 29234156d998SAlex Elder * 29244156d998SAlex Elder * Returns a pointer to a dynamically-allocated buffer containing 29254156d998SAlex Elder * the complete and validated header. Caller can pass the address 29264156d998SAlex Elder * of a variable that will be filled in with the version of the 29274156d998SAlex Elder * header object at the time it was read. 29284156d998SAlex Elder * 29294156d998SAlex Elder * Returns a pointer-coded errno if a failure occurs. 29304156d998SAlex Elder */ 29314156d998SAlex Elder static struct rbd_image_header_ondisk * 29324156d998SAlex Elder rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version) 29334156d998SAlex Elder { 29344156d998SAlex Elder struct rbd_image_header_ondisk *ondisk = NULL; 29354156d998SAlex Elder u32 snap_count = 0; 29364156d998SAlex Elder u64 names_size = 0; 29374156d998SAlex Elder u32 want_count; 29384156d998SAlex Elder int ret; 29394156d998SAlex Elder 29404156d998SAlex Elder /* 29414156d998SAlex Elder * The complete header will include an array of its 64-bit 29424156d998SAlex Elder * snapshot ids, followed by the names of those snapshots as 29434156d998SAlex Elder * a contiguous block of NUL-terminated strings. Note that 29444156d998SAlex Elder * the number of snapshots could change by the time we read 29454156d998SAlex Elder * it in, in which case we re-read it. 29464156d998SAlex Elder */ 29474156d998SAlex Elder do { 29484156d998SAlex Elder size_t size; 29494156d998SAlex Elder 29504156d998SAlex Elder kfree(ondisk); 29514156d998SAlex Elder 29524156d998SAlex Elder size = sizeof (*ondisk); 29534156d998SAlex Elder size += snap_count * sizeof (struct rbd_image_snap_ondisk); 29544156d998SAlex Elder size += names_size; 29554156d998SAlex Elder ondisk = kmalloc(size, GFP_KERNEL); 29564156d998SAlex Elder if (!ondisk) 29574156d998SAlex Elder return ERR_PTR(-ENOMEM); 29584156d998SAlex Elder 2959788e2df3SAlex Elder ret = rbd_obj_read_sync(rbd_dev, rbd_dev->header_name, 296080ef15bfSAlex Elder 0, size, ondisk, version); 29614156d998SAlex Elder if (ret < 0) 29624156d998SAlex Elder goto out_err; 29634156d998SAlex Elder if (WARN_ON((size_t) ret < size)) { 29644156d998SAlex Elder ret = -ENXIO; 296506ecc6cbSAlex Elder rbd_warn(rbd_dev, "short header read (want %zd got %d)", 296606ecc6cbSAlex Elder size, ret); 29674156d998SAlex Elder goto out_err; 29684156d998SAlex Elder } 29694156d998SAlex Elder if (!rbd_dev_ondisk_valid(ondisk)) { 29704156d998SAlex Elder ret = -ENXIO; 297106ecc6cbSAlex Elder rbd_warn(rbd_dev, "invalid header"); 29724156d998SAlex Elder goto out_err; 29734156d998SAlex Elder } 29744156d998SAlex Elder 29754156d998SAlex Elder names_size = le64_to_cpu(ondisk->snap_names_len); 29764156d998SAlex Elder want_count = snap_count; 29774156d998SAlex Elder snap_count = le32_to_cpu(ondisk->snap_count); 29784156d998SAlex Elder } while (snap_count != want_count); 29794156d998SAlex Elder 29804156d998SAlex Elder return ondisk; 29814156d998SAlex Elder 29824156d998SAlex Elder out_err: 29834156d998SAlex Elder kfree(ondisk); 29844156d998SAlex Elder 29854156d998SAlex Elder return ERR_PTR(ret); 29864156d998SAlex Elder } 29874156d998SAlex Elder 29884156d998SAlex Elder /* 2989602adf40SYehuda Sadeh * reload the ondisk the header 2990602adf40SYehuda Sadeh */ 2991602adf40SYehuda Sadeh static int rbd_read_header(struct rbd_device *rbd_dev, 2992602adf40SYehuda Sadeh struct rbd_image_header *header) 2993602adf40SYehuda Sadeh { 29944156d998SAlex Elder struct rbd_image_header_ondisk *ondisk; 29954156d998SAlex Elder u64 ver = 0; 29964156d998SAlex Elder int ret; 2997602adf40SYehuda Sadeh 29984156d998SAlex Elder ondisk = rbd_dev_v1_header_read(rbd_dev, &ver); 29994156d998SAlex Elder if (IS_ERR(ondisk)) 30004156d998SAlex Elder return PTR_ERR(ondisk); 30014156d998SAlex Elder ret = rbd_header_from_disk(header, ondisk); 30024156d998SAlex Elder if (ret >= 0) 300359c2be1eSYehuda Sadeh header->obj_version = ver; 30044156d998SAlex Elder kfree(ondisk); 3005602adf40SYehuda Sadeh 30064156d998SAlex Elder return ret; 3007602adf40SYehuda Sadeh } 3008602adf40SYehuda Sadeh 300941f38c2bSAlex Elder static void rbd_remove_all_snaps(struct rbd_device *rbd_dev) 3010dfc5606dSYehuda Sadeh { 3011dfc5606dSYehuda Sadeh struct rbd_snap *snap; 3012a0593290SAlex Elder struct rbd_snap *next; 3013dfc5606dSYehuda Sadeh 30146087b51bSAlex Elder list_for_each_entry_safe(snap, next, &rbd_dev->snaps, node) { 30156087b51bSAlex Elder list_del(&snap->node); 30166087b51bSAlex Elder rbd_snap_destroy(snap); 30176087b51bSAlex Elder } 3018dfc5606dSYehuda Sadeh } 3019dfc5606dSYehuda Sadeh 30209478554aSAlex Elder static void rbd_update_mapping_size(struct rbd_device *rbd_dev) 30219478554aSAlex Elder { 30229478554aSAlex Elder sector_t size; 30239478554aSAlex Elder 30240d7dbfceSAlex Elder if (rbd_dev->spec->snap_id != CEPH_NOSNAP) 30259478554aSAlex Elder return; 30269478554aSAlex Elder 30279478554aSAlex Elder size = (sector_t) rbd_dev->header.image_size / SECTOR_SIZE; 30289478554aSAlex Elder dout("setting size to %llu sectors", (unsigned long long) size); 30299478554aSAlex Elder rbd_dev->mapping.size = (u64) size; 30309478554aSAlex Elder set_capacity(rbd_dev->disk, size); 30319478554aSAlex Elder } 30329478554aSAlex Elder 3033602adf40SYehuda Sadeh /* 3034602adf40SYehuda Sadeh * only read the first part of the ondisk header, without the snaps info 3035602adf40SYehuda Sadeh */ 3036117973fbSAlex Elder static int rbd_dev_v1_refresh(struct rbd_device *rbd_dev, u64 *hver) 3037602adf40SYehuda Sadeh { 3038602adf40SYehuda Sadeh int ret; 3039602adf40SYehuda Sadeh struct rbd_image_header h; 3040602adf40SYehuda Sadeh 3041602adf40SYehuda Sadeh ret = rbd_read_header(rbd_dev, &h); 3042602adf40SYehuda Sadeh if (ret < 0) 3043602adf40SYehuda Sadeh return ret; 3044602adf40SYehuda Sadeh 3045a51aa0c0SJosh Durgin down_write(&rbd_dev->header_rwsem); 3046a51aa0c0SJosh Durgin 30479478554aSAlex Elder /* Update image size, and check for resize of mapped image */ 30489478554aSAlex Elder rbd_dev->header.image_size = h.image_size; 30499478554aSAlex Elder rbd_update_mapping_size(rbd_dev); 30509db4b3e3SSage Weil 3051849b4260SAlex Elder /* rbd_dev->header.object_prefix shouldn't change */ 3052602adf40SYehuda Sadeh kfree(rbd_dev->header.snap_sizes); 3053849b4260SAlex Elder kfree(rbd_dev->header.snap_names); 3054d1d25646SJosh Durgin /* osd requests may still refer to snapc */ 3055d1d25646SJosh Durgin ceph_put_snap_context(rbd_dev->header.snapc); 3056602adf40SYehuda Sadeh 3057b813623aSAlex Elder if (hver) 3058b813623aSAlex Elder *hver = h.obj_version; 3059a71b891bSJosh Durgin rbd_dev->header.obj_version = h.obj_version; 306093a24e08SJosh Durgin rbd_dev->header.image_size = h.image_size; 3061602adf40SYehuda Sadeh rbd_dev->header.snapc = h.snapc; 3062602adf40SYehuda Sadeh rbd_dev->header.snap_names = h.snap_names; 3063602adf40SYehuda Sadeh rbd_dev->header.snap_sizes = h.snap_sizes; 3064849b4260SAlex Elder /* Free the extra copy of the object prefix */ 3065849b4260SAlex Elder WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix)); 3066849b4260SAlex Elder kfree(h.object_prefix); 3067849b4260SAlex Elder 3068304f6808SAlex Elder ret = rbd_dev_snaps_update(rbd_dev); 3069dfc5606dSYehuda Sadeh 3070c666601aSJosh Durgin up_write(&rbd_dev->header_rwsem); 3071602adf40SYehuda Sadeh 3072dfc5606dSYehuda Sadeh return ret; 3073602adf40SYehuda Sadeh } 3074602adf40SYehuda Sadeh 3075117973fbSAlex Elder static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver) 30761fe5e993SAlex Elder { 30771fe5e993SAlex Elder int ret; 30781fe5e993SAlex Elder 3079117973fbSAlex Elder rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); 30801fe5e993SAlex Elder mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); 3081117973fbSAlex Elder if (rbd_dev->image_format == 1) 3082117973fbSAlex Elder ret = rbd_dev_v1_refresh(rbd_dev, hver); 3083117973fbSAlex Elder else 3084117973fbSAlex Elder ret = rbd_dev_v2_refresh(rbd_dev, hver); 30851fe5e993SAlex Elder mutex_unlock(&ctl_mutex); 3086d98df63eSLaurent Barbe revalidate_disk(rbd_dev->disk); 3087522a0cc0SAlex Elder if (ret) 3088522a0cc0SAlex Elder rbd_warn(rbd_dev, "got notification but failed to " 3089522a0cc0SAlex Elder " update snaps: %d\n", ret); 30901fe5e993SAlex Elder 30911fe5e993SAlex Elder return ret; 30921fe5e993SAlex Elder } 30931fe5e993SAlex Elder 3094602adf40SYehuda Sadeh static int rbd_init_disk(struct rbd_device *rbd_dev) 3095602adf40SYehuda Sadeh { 3096602adf40SYehuda Sadeh struct gendisk *disk; 3097602adf40SYehuda Sadeh struct request_queue *q; 3098593a9e7bSAlex Elder u64 segment_size; 3099602adf40SYehuda Sadeh 3100602adf40SYehuda Sadeh /* create gendisk info */ 3101602adf40SYehuda Sadeh disk = alloc_disk(RBD_MINORS_PER_MAJOR); 3102602adf40SYehuda Sadeh if (!disk) 31031fcdb8aaSAlex Elder return -ENOMEM; 3104602adf40SYehuda Sadeh 3105f0f8cef5SAlex Elder snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d", 3106de71a297SAlex Elder rbd_dev->dev_id); 3107602adf40SYehuda Sadeh disk->major = rbd_dev->major; 3108602adf40SYehuda Sadeh disk->first_minor = 0; 3109602adf40SYehuda Sadeh disk->fops = &rbd_bd_ops; 3110602adf40SYehuda Sadeh disk->private_data = rbd_dev; 3111602adf40SYehuda Sadeh 3112bf0d5f50SAlex Elder q = blk_init_queue(rbd_request_fn, &rbd_dev->lock); 3113602adf40SYehuda Sadeh if (!q) 3114602adf40SYehuda Sadeh goto out_disk; 3115029bcbd8SJosh Durgin 3116593a9e7bSAlex Elder /* We use the default size, but let's be explicit about it. */ 3117593a9e7bSAlex Elder blk_queue_physical_block_size(q, SECTOR_SIZE); 3118593a9e7bSAlex Elder 3119029bcbd8SJosh Durgin /* set io sizes to object size */ 3120593a9e7bSAlex Elder segment_size = rbd_obj_bytes(&rbd_dev->header); 3121593a9e7bSAlex Elder blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE); 3122593a9e7bSAlex Elder blk_queue_max_segment_size(q, segment_size); 3123593a9e7bSAlex Elder blk_queue_io_min(q, segment_size); 3124593a9e7bSAlex Elder blk_queue_io_opt(q, segment_size); 3125029bcbd8SJosh Durgin 3126602adf40SYehuda Sadeh blk_queue_merge_bvec(q, rbd_merge_bvec); 3127602adf40SYehuda Sadeh disk->queue = q; 3128602adf40SYehuda Sadeh 3129602adf40SYehuda Sadeh q->queuedata = rbd_dev; 3130602adf40SYehuda Sadeh 3131602adf40SYehuda Sadeh rbd_dev->disk = disk; 3132602adf40SYehuda Sadeh 313312f02944SAlex Elder set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE); 313412f02944SAlex Elder 3135602adf40SYehuda Sadeh return 0; 3136602adf40SYehuda Sadeh out_disk: 3137602adf40SYehuda Sadeh put_disk(disk); 31381fcdb8aaSAlex Elder 31391fcdb8aaSAlex Elder return -ENOMEM; 3140602adf40SYehuda Sadeh } 3141602adf40SYehuda Sadeh 3142dfc5606dSYehuda Sadeh /* 3143dfc5606dSYehuda Sadeh sysfs 3144dfc5606dSYehuda Sadeh */ 3145602adf40SYehuda Sadeh 3146593a9e7bSAlex Elder static struct rbd_device *dev_to_rbd_dev(struct device *dev) 3147593a9e7bSAlex Elder { 3148593a9e7bSAlex Elder return container_of(dev, struct rbd_device, dev); 3149593a9e7bSAlex Elder } 3150593a9e7bSAlex Elder 3151dfc5606dSYehuda Sadeh static ssize_t rbd_size_show(struct device *dev, 3152dfc5606dSYehuda Sadeh struct device_attribute *attr, char *buf) 3153602adf40SYehuda Sadeh { 3154593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 3155a51aa0c0SJosh Durgin sector_t size; 3156dfc5606dSYehuda Sadeh 3157a51aa0c0SJosh Durgin down_read(&rbd_dev->header_rwsem); 3158a51aa0c0SJosh Durgin size = get_capacity(rbd_dev->disk); 3159a51aa0c0SJosh Durgin up_read(&rbd_dev->header_rwsem); 3160a51aa0c0SJosh Durgin 3161a51aa0c0SJosh Durgin return sprintf(buf, "%llu\n", (unsigned long long) size * SECTOR_SIZE); 3162602adf40SYehuda Sadeh } 3163602adf40SYehuda Sadeh 316434b13184SAlex Elder /* 316534b13184SAlex Elder * Note this shows the features for whatever's mapped, which is not 316634b13184SAlex Elder * necessarily the base image. 316734b13184SAlex Elder */ 316834b13184SAlex Elder static ssize_t rbd_features_show(struct device *dev, 316934b13184SAlex Elder struct device_attribute *attr, char *buf) 317034b13184SAlex Elder { 317134b13184SAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 317234b13184SAlex Elder 317334b13184SAlex Elder return sprintf(buf, "0x%016llx\n", 317434b13184SAlex Elder (unsigned long long) rbd_dev->mapping.features); 317534b13184SAlex Elder } 317634b13184SAlex Elder 3177dfc5606dSYehuda Sadeh static ssize_t rbd_major_show(struct device *dev, 3178dfc5606dSYehuda Sadeh struct device_attribute *attr, char *buf) 3179602adf40SYehuda Sadeh { 3180593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 3181dfc5606dSYehuda Sadeh 3182dfc5606dSYehuda Sadeh return sprintf(buf, "%d\n", rbd_dev->major); 3183dfc5606dSYehuda Sadeh } 3184dfc5606dSYehuda Sadeh 3185dfc5606dSYehuda Sadeh static ssize_t rbd_client_id_show(struct device *dev, 3186dfc5606dSYehuda Sadeh struct device_attribute *attr, char *buf) 3187dfc5606dSYehuda Sadeh { 3188593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 3189dfc5606dSYehuda Sadeh 31901dbb4399SAlex Elder return sprintf(buf, "client%lld\n", 31911dbb4399SAlex Elder ceph_client_id(rbd_dev->rbd_client->client)); 3192dfc5606dSYehuda Sadeh } 3193dfc5606dSYehuda Sadeh 3194dfc5606dSYehuda Sadeh static ssize_t rbd_pool_show(struct device *dev, 3195dfc5606dSYehuda Sadeh struct device_attribute *attr, char *buf) 3196dfc5606dSYehuda Sadeh { 3197593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 3198dfc5606dSYehuda Sadeh 31990d7dbfceSAlex Elder return sprintf(buf, "%s\n", rbd_dev->spec->pool_name); 3200dfc5606dSYehuda Sadeh } 3201dfc5606dSYehuda Sadeh 32029bb2f334SAlex Elder static ssize_t rbd_pool_id_show(struct device *dev, 32039bb2f334SAlex Elder struct device_attribute *attr, char *buf) 32049bb2f334SAlex Elder { 32059bb2f334SAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 32069bb2f334SAlex Elder 32070d7dbfceSAlex Elder return sprintf(buf, "%llu\n", 32080d7dbfceSAlex Elder (unsigned long long) rbd_dev->spec->pool_id); 32099bb2f334SAlex Elder } 32109bb2f334SAlex Elder 3211dfc5606dSYehuda Sadeh static ssize_t rbd_name_show(struct device *dev, 3212dfc5606dSYehuda Sadeh struct device_attribute *attr, char *buf) 3213dfc5606dSYehuda Sadeh { 3214593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 3215dfc5606dSYehuda Sadeh 3216a92ffdf8SAlex Elder if (rbd_dev->spec->image_name) 32170d7dbfceSAlex Elder return sprintf(buf, "%s\n", rbd_dev->spec->image_name); 3218a92ffdf8SAlex Elder 3219a92ffdf8SAlex Elder return sprintf(buf, "(unknown)\n"); 3220dfc5606dSYehuda Sadeh } 3221dfc5606dSYehuda Sadeh 3222589d30e0SAlex Elder static ssize_t rbd_image_id_show(struct device *dev, 3223589d30e0SAlex Elder struct device_attribute *attr, char *buf) 3224589d30e0SAlex Elder { 3225589d30e0SAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 3226589d30e0SAlex Elder 32270d7dbfceSAlex Elder return sprintf(buf, "%s\n", rbd_dev->spec->image_id); 3228589d30e0SAlex Elder } 3229589d30e0SAlex Elder 323034b13184SAlex Elder /* 323134b13184SAlex Elder * Shows the name of the currently-mapped snapshot (or 323234b13184SAlex Elder * RBD_SNAP_HEAD_NAME for the base image). 323334b13184SAlex Elder */ 3234dfc5606dSYehuda Sadeh static ssize_t rbd_snap_show(struct device *dev, 3235dfc5606dSYehuda Sadeh struct device_attribute *attr, 3236dfc5606dSYehuda Sadeh char *buf) 3237dfc5606dSYehuda Sadeh { 3238593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 3239dfc5606dSYehuda Sadeh 32400d7dbfceSAlex Elder return sprintf(buf, "%s\n", rbd_dev->spec->snap_name); 3241dfc5606dSYehuda Sadeh } 3242dfc5606dSYehuda Sadeh 324386b00e0dSAlex Elder /* 324486b00e0dSAlex Elder * For an rbd v2 image, shows the pool id, image id, and snapshot id 324586b00e0dSAlex Elder * for the parent image. If there is no parent, simply shows 324686b00e0dSAlex Elder * "(no parent image)". 324786b00e0dSAlex Elder */ 324886b00e0dSAlex Elder static ssize_t rbd_parent_show(struct device *dev, 324986b00e0dSAlex Elder struct device_attribute *attr, 325086b00e0dSAlex Elder char *buf) 325186b00e0dSAlex Elder { 325286b00e0dSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 325386b00e0dSAlex Elder struct rbd_spec *spec = rbd_dev->parent_spec; 325486b00e0dSAlex Elder int count; 325586b00e0dSAlex Elder char *bufp = buf; 325686b00e0dSAlex Elder 325786b00e0dSAlex Elder if (!spec) 325886b00e0dSAlex Elder return sprintf(buf, "(no parent image)\n"); 325986b00e0dSAlex Elder 326086b00e0dSAlex Elder count = sprintf(bufp, "pool_id %llu\npool_name %s\n", 326186b00e0dSAlex Elder (unsigned long long) spec->pool_id, spec->pool_name); 326286b00e0dSAlex Elder if (count < 0) 326386b00e0dSAlex Elder return count; 326486b00e0dSAlex Elder bufp += count; 326586b00e0dSAlex Elder 326686b00e0dSAlex Elder count = sprintf(bufp, "image_id %s\nimage_name %s\n", spec->image_id, 326786b00e0dSAlex Elder spec->image_name ? spec->image_name : "(unknown)"); 326886b00e0dSAlex Elder if (count < 0) 326986b00e0dSAlex Elder return count; 327086b00e0dSAlex Elder bufp += count; 327186b00e0dSAlex Elder 327286b00e0dSAlex Elder count = sprintf(bufp, "snap_id %llu\nsnap_name %s\n", 327386b00e0dSAlex Elder (unsigned long long) spec->snap_id, spec->snap_name); 327486b00e0dSAlex Elder if (count < 0) 327586b00e0dSAlex Elder return count; 327686b00e0dSAlex Elder bufp += count; 327786b00e0dSAlex Elder 327886b00e0dSAlex Elder count = sprintf(bufp, "overlap %llu\n", rbd_dev->parent_overlap); 327986b00e0dSAlex Elder if (count < 0) 328086b00e0dSAlex Elder return count; 328186b00e0dSAlex Elder bufp += count; 328286b00e0dSAlex Elder 328386b00e0dSAlex Elder return (ssize_t) (bufp - buf); 328486b00e0dSAlex Elder } 328586b00e0dSAlex Elder 3286dfc5606dSYehuda Sadeh static ssize_t rbd_image_refresh(struct device *dev, 3287dfc5606dSYehuda Sadeh struct device_attribute *attr, 3288dfc5606dSYehuda Sadeh const char *buf, 3289dfc5606dSYehuda Sadeh size_t size) 3290dfc5606dSYehuda Sadeh { 3291593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 3292b813623aSAlex Elder int ret; 3293602adf40SYehuda Sadeh 3294117973fbSAlex Elder ret = rbd_dev_refresh(rbd_dev, NULL); 3295b813623aSAlex Elder 3296b813623aSAlex Elder return ret < 0 ? ret : size; 3297dfc5606dSYehuda Sadeh } 3298602adf40SYehuda Sadeh 3299dfc5606dSYehuda Sadeh static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL); 330034b13184SAlex Elder static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL); 3301dfc5606dSYehuda Sadeh static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL); 3302dfc5606dSYehuda Sadeh static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL); 3303dfc5606dSYehuda Sadeh static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL); 33049bb2f334SAlex Elder static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL); 3305dfc5606dSYehuda Sadeh static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL); 3306589d30e0SAlex Elder static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL); 3307dfc5606dSYehuda Sadeh static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh); 3308dfc5606dSYehuda Sadeh static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL); 330986b00e0dSAlex Elder static DEVICE_ATTR(parent, S_IRUGO, rbd_parent_show, NULL); 3310dfc5606dSYehuda Sadeh 3311dfc5606dSYehuda Sadeh static struct attribute *rbd_attrs[] = { 3312dfc5606dSYehuda Sadeh &dev_attr_size.attr, 331334b13184SAlex Elder &dev_attr_features.attr, 3314dfc5606dSYehuda Sadeh &dev_attr_major.attr, 3315dfc5606dSYehuda Sadeh &dev_attr_client_id.attr, 3316dfc5606dSYehuda Sadeh &dev_attr_pool.attr, 33179bb2f334SAlex Elder &dev_attr_pool_id.attr, 3318dfc5606dSYehuda Sadeh &dev_attr_name.attr, 3319589d30e0SAlex Elder &dev_attr_image_id.attr, 3320dfc5606dSYehuda Sadeh &dev_attr_current_snap.attr, 332186b00e0dSAlex Elder &dev_attr_parent.attr, 3322dfc5606dSYehuda Sadeh &dev_attr_refresh.attr, 3323dfc5606dSYehuda Sadeh NULL 3324dfc5606dSYehuda Sadeh }; 3325dfc5606dSYehuda Sadeh 3326dfc5606dSYehuda Sadeh static struct attribute_group rbd_attr_group = { 3327dfc5606dSYehuda Sadeh .attrs = rbd_attrs, 3328dfc5606dSYehuda Sadeh }; 3329dfc5606dSYehuda Sadeh 3330dfc5606dSYehuda Sadeh static const struct attribute_group *rbd_attr_groups[] = { 3331dfc5606dSYehuda Sadeh &rbd_attr_group, 3332dfc5606dSYehuda Sadeh NULL 3333dfc5606dSYehuda Sadeh }; 3334dfc5606dSYehuda Sadeh 3335dfc5606dSYehuda Sadeh static void rbd_sysfs_dev_release(struct device *dev) 3336dfc5606dSYehuda Sadeh { 3337dfc5606dSYehuda Sadeh } 3338dfc5606dSYehuda Sadeh 3339dfc5606dSYehuda Sadeh static struct device_type rbd_device_type = { 3340dfc5606dSYehuda Sadeh .name = "rbd", 3341dfc5606dSYehuda Sadeh .groups = rbd_attr_groups, 3342dfc5606dSYehuda Sadeh .release = rbd_sysfs_dev_release, 3343dfc5606dSYehuda Sadeh }; 3344dfc5606dSYehuda Sadeh 33458b8fb99cSAlex Elder static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec) 33468b8fb99cSAlex Elder { 33478b8fb99cSAlex Elder kref_get(&spec->kref); 33488b8fb99cSAlex Elder 33498b8fb99cSAlex Elder return spec; 33508b8fb99cSAlex Elder } 33518b8fb99cSAlex Elder 33528b8fb99cSAlex Elder static void rbd_spec_free(struct kref *kref); 33538b8fb99cSAlex Elder static void rbd_spec_put(struct rbd_spec *spec) 33548b8fb99cSAlex Elder { 33558b8fb99cSAlex Elder if (spec) 33568b8fb99cSAlex Elder kref_put(&spec->kref, rbd_spec_free); 33578b8fb99cSAlex Elder } 33588b8fb99cSAlex Elder 33598b8fb99cSAlex Elder static struct rbd_spec *rbd_spec_alloc(void) 33608b8fb99cSAlex Elder { 33618b8fb99cSAlex Elder struct rbd_spec *spec; 33628b8fb99cSAlex Elder 33638b8fb99cSAlex Elder spec = kzalloc(sizeof (*spec), GFP_KERNEL); 33648b8fb99cSAlex Elder if (!spec) 33658b8fb99cSAlex Elder return NULL; 33668b8fb99cSAlex Elder kref_init(&spec->kref); 33678b8fb99cSAlex Elder 33688b8fb99cSAlex Elder return spec; 33698b8fb99cSAlex Elder } 33708b8fb99cSAlex Elder 33718b8fb99cSAlex Elder static void rbd_spec_free(struct kref *kref) 33728b8fb99cSAlex Elder { 33738b8fb99cSAlex Elder struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref); 33748b8fb99cSAlex Elder 33758b8fb99cSAlex Elder kfree(spec->pool_name); 33768b8fb99cSAlex Elder kfree(spec->image_id); 33778b8fb99cSAlex Elder kfree(spec->image_name); 33788b8fb99cSAlex Elder kfree(spec->snap_name); 33798b8fb99cSAlex Elder kfree(spec); 33808b8fb99cSAlex Elder } 33818b8fb99cSAlex Elder 3382cc344fa1SAlex Elder static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc, 3383c53d5893SAlex Elder struct rbd_spec *spec) 3384c53d5893SAlex Elder { 3385c53d5893SAlex Elder struct rbd_device *rbd_dev; 3386c53d5893SAlex Elder 3387c53d5893SAlex Elder rbd_dev = kzalloc(sizeof (*rbd_dev), GFP_KERNEL); 3388c53d5893SAlex Elder if (!rbd_dev) 3389c53d5893SAlex Elder return NULL; 3390c53d5893SAlex Elder 3391c53d5893SAlex Elder spin_lock_init(&rbd_dev->lock); 33926d292906SAlex Elder rbd_dev->flags = 0; 3393c53d5893SAlex Elder INIT_LIST_HEAD(&rbd_dev->node); 3394c53d5893SAlex Elder INIT_LIST_HEAD(&rbd_dev->snaps); 3395c53d5893SAlex Elder init_rwsem(&rbd_dev->header_rwsem); 3396c53d5893SAlex Elder 3397c53d5893SAlex Elder rbd_dev->spec = spec; 3398c53d5893SAlex Elder rbd_dev->rbd_client = rbdc; 3399c53d5893SAlex Elder 34000903e875SAlex Elder /* Initialize the layout used for all rbd requests */ 34010903e875SAlex Elder 34020903e875SAlex Elder rbd_dev->layout.fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER); 34030903e875SAlex Elder rbd_dev->layout.fl_stripe_count = cpu_to_le32(1); 34040903e875SAlex Elder rbd_dev->layout.fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER); 34050903e875SAlex Elder rbd_dev->layout.fl_pg_pool = cpu_to_le32((u32) spec->pool_id); 34060903e875SAlex Elder 3407c53d5893SAlex Elder return rbd_dev; 3408c53d5893SAlex Elder } 3409c53d5893SAlex Elder 3410c53d5893SAlex Elder static void rbd_dev_destroy(struct rbd_device *rbd_dev) 3411c53d5893SAlex Elder { 341286b00e0dSAlex Elder rbd_spec_put(rbd_dev->parent_spec); 3413c53d5893SAlex Elder kfree(rbd_dev->header_name); 3414c53d5893SAlex Elder rbd_put_client(rbd_dev->rbd_client); 3415c53d5893SAlex Elder rbd_spec_put(rbd_dev->spec); 3416c53d5893SAlex Elder kfree(rbd_dev); 3417c53d5893SAlex Elder } 3418c53d5893SAlex Elder 34196087b51bSAlex Elder static void rbd_snap_destroy(struct rbd_snap *snap) 3420dfc5606dSYehuda Sadeh { 34213e83b65bSAlex Elder kfree(snap->name); 34223e83b65bSAlex Elder kfree(snap); 3423dfc5606dSYehuda Sadeh } 3424dfc5606dSYehuda Sadeh 34256087b51bSAlex Elder static struct rbd_snap *rbd_snap_create(struct rbd_device *rbd_dev, 3426c8d18425SAlex Elder const char *snap_name, 342734b13184SAlex Elder u64 snap_id, u64 snap_size, 342834b13184SAlex Elder u64 snap_features) 3429dfc5606dSYehuda Sadeh { 34304e891e0aSAlex Elder struct rbd_snap *snap; 34314e891e0aSAlex Elder 34324e891e0aSAlex Elder snap = kzalloc(sizeof (*snap), GFP_KERNEL); 3433dfc5606dSYehuda Sadeh if (!snap) 34344e891e0aSAlex Elder return ERR_PTR(-ENOMEM); 34354e891e0aSAlex Elder 34366e584f52SAlex Elder snap->name = snap_name; 3437c8d18425SAlex Elder snap->id = snap_id; 3438c8d18425SAlex Elder snap->size = snap_size; 343934b13184SAlex Elder snap->features = snap_features; 34404e891e0aSAlex Elder 34414e891e0aSAlex Elder return snap; 3442dfc5606dSYehuda Sadeh } 3443dfc5606dSYehuda Sadeh 34446e584f52SAlex Elder /* 34456e584f52SAlex Elder * Returns a dynamically-allocated snapshot name if successful, or a 34466e584f52SAlex Elder * pointer-coded error otherwise. 34476e584f52SAlex Elder */ 3448cd892126SAlex Elder static char *rbd_dev_v1_snap_info(struct rbd_device *rbd_dev, u32 which, 3449cd892126SAlex Elder u64 *snap_size, u64 *snap_features) 3450cd892126SAlex Elder { 3451cd892126SAlex Elder char *snap_name; 34526e584f52SAlex Elder int i; 3453cd892126SAlex Elder 3454cd892126SAlex Elder rbd_assert(which < rbd_dev->header.snapc->num_snaps); 3455cd892126SAlex Elder 3456cd892126SAlex Elder /* Skip over names until we find the one we are looking for */ 3457cd892126SAlex Elder 3458cd892126SAlex Elder snap_name = rbd_dev->header.snap_names; 34596e584f52SAlex Elder for (i = 0; i < which; i++) 3460cd892126SAlex Elder snap_name += strlen(snap_name) + 1; 3461cd892126SAlex Elder 34626e584f52SAlex Elder snap_name = kstrdup(snap_name, GFP_KERNEL); 34636e584f52SAlex Elder if (!snap_name) 34646e584f52SAlex Elder return ERR_PTR(-ENOMEM); 34656e584f52SAlex Elder 34666e584f52SAlex Elder *snap_size = rbd_dev->header.snap_sizes[which]; 34676e584f52SAlex Elder *snap_features = 0; /* No features for v1 */ 34686e584f52SAlex Elder 3469cd892126SAlex Elder return snap_name; 3470cd892126SAlex Elder } 3471cd892126SAlex Elder 3472dfc5606dSYehuda Sadeh /* 34739d475de5SAlex Elder * Get the size and object order for an image snapshot, or if 34749d475de5SAlex Elder * snap_id is CEPH_NOSNAP, gets this information for the base 34759d475de5SAlex Elder * image. 34769d475de5SAlex Elder */ 34779d475de5SAlex Elder static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id, 34789d475de5SAlex Elder u8 *order, u64 *snap_size) 34799d475de5SAlex Elder { 34809d475de5SAlex Elder __le64 snapid = cpu_to_le64(snap_id); 34819d475de5SAlex Elder int ret; 34829d475de5SAlex Elder struct { 34839d475de5SAlex Elder u8 order; 34849d475de5SAlex Elder __le64 size; 34859d475de5SAlex Elder } __attribute__ ((packed)) size_buf = { 0 }; 34869d475de5SAlex Elder 348736be9a76SAlex Elder ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, 34889d475de5SAlex Elder "rbd", "get_size", 34894157976bSAlex Elder &snapid, sizeof (snapid), 34904157976bSAlex Elder &size_buf, sizeof (size_buf), NULL); 349136be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 34929d475de5SAlex Elder if (ret < 0) 34939d475de5SAlex Elder return ret; 349457385b51SAlex Elder if (ret < sizeof (size_buf)) 349557385b51SAlex Elder return -ERANGE; 34969d475de5SAlex Elder 3497c86f86e9SAlex Elder if (order) 34989d475de5SAlex Elder *order = size_buf.order; 34999d475de5SAlex Elder *snap_size = le64_to_cpu(size_buf.size); 35009d475de5SAlex Elder 35019d475de5SAlex Elder dout(" snap_id 0x%016llx order = %u, snap_size = %llu\n", 35029d475de5SAlex Elder (unsigned long long)snap_id, (unsigned int)*order, 35039d475de5SAlex Elder (unsigned long long)*snap_size); 35049d475de5SAlex Elder 35059d475de5SAlex Elder return 0; 35069d475de5SAlex Elder } 35079d475de5SAlex Elder 35089d475de5SAlex Elder static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev) 35099d475de5SAlex Elder { 35109d475de5SAlex Elder return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP, 35119d475de5SAlex Elder &rbd_dev->header.obj_order, 35129d475de5SAlex Elder &rbd_dev->header.image_size); 35139d475de5SAlex Elder } 35149d475de5SAlex Elder 35151e130199SAlex Elder static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev) 35161e130199SAlex Elder { 35171e130199SAlex Elder void *reply_buf; 35181e130199SAlex Elder int ret; 35191e130199SAlex Elder void *p; 35201e130199SAlex Elder 35211e130199SAlex Elder reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL); 35221e130199SAlex Elder if (!reply_buf) 35231e130199SAlex Elder return -ENOMEM; 35241e130199SAlex Elder 352536be9a76SAlex Elder ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, 35264157976bSAlex Elder "rbd", "get_object_prefix", NULL, 0, 352707b2391fSAlex Elder reply_buf, RBD_OBJ_PREFIX_LEN_MAX, NULL); 352836be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 35291e130199SAlex Elder if (ret < 0) 35301e130199SAlex Elder goto out; 35311e130199SAlex Elder 35321e130199SAlex Elder p = reply_buf; 35331e130199SAlex Elder rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p, 353457385b51SAlex Elder p + ret, NULL, GFP_NOIO); 353557385b51SAlex Elder ret = 0; 35361e130199SAlex Elder 35371e130199SAlex Elder if (IS_ERR(rbd_dev->header.object_prefix)) { 35381e130199SAlex Elder ret = PTR_ERR(rbd_dev->header.object_prefix); 35391e130199SAlex Elder rbd_dev->header.object_prefix = NULL; 35401e130199SAlex Elder } else { 35411e130199SAlex Elder dout(" object_prefix = %s\n", rbd_dev->header.object_prefix); 35421e130199SAlex Elder } 35431e130199SAlex Elder out: 35441e130199SAlex Elder kfree(reply_buf); 35451e130199SAlex Elder 35461e130199SAlex Elder return ret; 35471e130199SAlex Elder } 35481e130199SAlex Elder 3549b1b5402aSAlex Elder static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id, 3550b1b5402aSAlex Elder u64 *snap_features) 3551b1b5402aSAlex Elder { 3552b1b5402aSAlex Elder __le64 snapid = cpu_to_le64(snap_id); 3553b1b5402aSAlex Elder struct { 3554b1b5402aSAlex Elder __le64 features; 3555b1b5402aSAlex Elder __le64 incompat; 35564157976bSAlex Elder } __attribute__ ((packed)) features_buf = { 0 }; 3557d889140cSAlex Elder u64 incompat; 3558b1b5402aSAlex Elder int ret; 3559b1b5402aSAlex Elder 356036be9a76SAlex Elder ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, 3561b1b5402aSAlex Elder "rbd", "get_features", 35624157976bSAlex Elder &snapid, sizeof (snapid), 35634157976bSAlex Elder &features_buf, sizeof (features_buf), NULL); 356436be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 3565b1b5402aSAlex Elder if (ret < 0) 3566b1b5402aSAlex Elder return ret; 356757385b51SAlex Elder if (ret < sizeof (features_buf)) 356857385b51SAlex Elder return -ERANGE; 3569d889140cSAlex Elder 3570d889140cSAlex Elder incompat = le64_to_cpu(features_buf.incompat); 35715cbf6f12SAlex Elder if (incompat & ~RBD_FEATURES_SUPPORTED) 3572b8f5c6edSAlex Elder return -ENXIO; 3573d889140cSAlex Elder 3574b1b5402aSAlex Elder *snap_features = le64_to_cpu(features_buf.features); 3575b1b5402aSAlex Elder 3576b1b5402aSAlex Elder dout(" snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n", 3577b1b5402aSAlex Elder (unsigned long long)snap_id, 3578b1b5402aSAlex Elder (unsigned long long)*snap_features, 3579b1b5402aSAlex Elder (unsigned long long)le64_to_cpu(features_buf.incompat)); 3580b1b5402aSAlex Elder 3581b1b5402aSAlex Elder return 0; 3582b1b5402aSAlex Elder } 3583b1b5402aSAlex Elder 3584b1b5402aSAlex Elder static int rbd_dev_v2_features(struct rbd_device *rbd_dev) 3585b1b5402aSAlex Elder { 3586b1b5402aSAlex Elder return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP, 3587b1b5402aSAlex Elder &rbd_dev->header.features); 3588b1b5402aSAlex Elder } 3589b1b5402aSAlex Elder 359086b00e0dSAlex Elder static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev) 359186b00e0dSAlex Elder { 359286b00e0dSAlex Elder struct rbd_spec *parent_spec; 359386b00e0dSAlex Elder size_t size; 359486b00e0dSAlex Elder void *reply_buf = NULL; 359586b00e0dSAlex Elder __le64 snapid; 359686b00e0dSAlex Elder void *p; 359786b00e0dSAlex Elder void *end; 359886b00e0dSAlex Elder char *image_id; 359986b00e0dSAlex Elder u64 overlap; 360086b00e0dSAlex Elder int ret; 360186b00e0dSAlex Elder 360286b00e0dSAlex Elder parent_spec = rbd_spec_alloc(); 360386b00e0dSAlex Elder if (!parent_spec) 360486b00e0dSAlex Elder return -ENOMEM; 360586b00e0dSAlex Elder 360686b00e0dSAlex Elder size = sizeof (__le64) + /* pool_id */ 360786b00e0dSAlex Elder sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX + /* image_id */ 360886b00e0dSAlex Elder sizeof (__le64) + /* snap_id */ 360986b00e0dSAlex Elder sizeof (__le64); /* overlap */ 361086b00e0dSAlex Elder reply_buf = kmalloc(size, GFP_KERNEL); 361186b00e0dSAlex Elder if (!reply_buf) { 361286b00e0dSAlex Elder ret = -ENOMEM; 361386b00e0dSAlex Elder goto out_err; 361486b00e0dSAlex Elder } 361586b00e0dSAlex Elder 361686b00e0dSAlex Elder snapid = cpu_to_le64(CEPH_NOSNAP); 361736be9a76SAlex Elder ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, 361886b00e0dSAlex Elder "rbd", "get_parent", 36194157976bSAlex Elder &snapid, sizeof (snapid), 36204157976bSAlex Elder reply_buf, size, NULL); 362136be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 362286b00e0dSAlex Elder if (ret < 0) 362386b00e0dSAlex Elder goto out_err; 362486b00e0dSAlex Elder 362586b00e0dSAlex Elder p = reply_buf; 362657385b51SAlex Elder end = reply_buf + ret; 362757385b51SAlex Elder ret = -ERANGE; 362886b00e0dSAlex Elder ceph_decode_64_safe(&p, end, parent_spec->pool_id, out_err); 362986b00e0dSAlex Elder if (parent_spec->pool_id == CEPH_NOPOOL) 363086b00e0dSAlex Elder goto out; /* No parent? No problem. */ 363186b00e0dSAlex Elder 36320903e875SAlex Elder /* The ceph file layout needs to fit pool id in 32 bits */ 36330903e875SAlex Elder 36340903e875SAlex Elder ret = -EIO; 36350903e875SAlex Elder if (WARN_ON(parent_spec->pool_id > (u64)U32_MAX)) 363657385b51SAlex Elder goto out_err; 36370903e875SAlex Elder 3638979ed480SAlex Elder image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL); 363986b00e0dSAlex Elder if (IS_ERR(image_id)) { 364086b00e0dSAlex Elder ret = PTR_ERR(image_id); 364186b00e0dSAlex Elder goto out_err; 364286b00e0dSAlex Elder } 364386b00e0dSAlex Elder parent_spec->image_id = image_id; 364486b00e0dSAlex Elder ceph_decode_64_safe(&p, end, parent_spec->snap_id, out_err); 364586b00e0dSAlex Elder ceph_decode_64_safe(&p, end, overlap, out_err); 364686b00e0dSAlex Elder 364786b00e0dSAlex Elder rbd_dev->parent_overlap = overlap; 364886b00e0dSAlex Elder rbd_dev->parent_spec = parent_spec; 364986b00e0dSAlex Elder parent_spec = NULL; /* rbd_dev now owns this */ 365086b00e0dSAlex Elder out: 365186b00e0dSAlex Elder ret = 0; 365286b00e0dSAlex Elder out_err: 365386b00e0dSAlex Elder kfree(reply_buf); 365486b00e0dSAlex Elder rbd_spec_put(parent_spec); 365586b00e0dSAlex Elder 365686b00e0dSAlex Elder return ret; 365786b00e0dSAlex Elder } 365886b00e0dSAlex Elder 3659cc070d59SAlex Elder static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev) 3660cc070d59SAlex Elder { 3661cc070d59SAlex Elder struct { 3662cc070d59SAlex Elder __le64 stripe_unit; 3663cc070d59SAlex Elder __le64 stripe_count; 3664cc070d59SAlex Elder } __attribute__ ((packed)) striping_info_buf = { 0 }; 3665cc070d59SAlex Elder size_t size = sizeof (striping_info_buf); 3666cc070d59SAlex Elder void *p; 3667cc070d59SAlex Elder u64 obj_size; 3668cc070d59SAlex Elder u64 stripe_unit; 3669cc070d59SAlex Elder u64 stripe_count; 3670cc070d59SAlex Elder int ret; 3671cc070d59SAlex Elder 3672cc070d59SAlex Elder ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, 3673cc070d59SAlex Elder "rbd", "get_stripe_unit_count", NULL, 0, 3674cc070d59SAlex Elder (char *)&striping_info_buf, size, NULL); 3675cc070d59SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 3676cc070d59SAlex Elder if (ret < 0) 3677cc070d59SAlex Elder return ret; 3678cc070d59SAlex Elder if (ret < size) 3679cc070d59SAlex Elder return -ERANGE; 3680cc070d59SAlex Elder 3681cc070d59SAlex Elder /* 3682cc070d59SAlex Elder * We don't actually support the "fancy striping" feature 3683cc070d59SAlex Elder * (STRIPINGV2) yet, but if the striping sizes are the 3684cc070d59SAlex Elder * defaults the behavior is the same as before. So find 3685cc070d59SAlex Elder * out, and only fail if the image has non-default values. 3686cc070d59SAlex Elder */ 3687cc070d59SAlex Elder ret = -EINVAL; 3688cc070d59SAlex Elder obj_size = (u64)1 << rbd_dev->header.obj_order; 3689cc070d59SAlex Elder p = &striping_info_buf; 3690cc070d59SAlex Elder stripe_unit = ceph_decode_64(&p); 3691cc070d59SAlex Elder if (stripe_unit != obj_size) { 3692cc070d59SAlex Elder rbd_warn(rbd_dev, "unsupported stripe unit " 3693cc070d59SAlex Elder "(got %llu want %llu)", 3694cc070d59SAlex Elder stripe_unit, obj_size); 3695cc070d59SAlex Elder return -EINVAL; 3696cc070d59SAlex Elder } 3697cc070d59SAlex Elder stripe_count = ceph_decode_64(&p); 3698cc070d59SAlex Elder if (stripe_count != 1) { 3699cc070d59SAlex Elder rbd_warn(rbd_dev, "unsupported stripe count " 3700cc070d59SAlex Elder "(got %llu want 1)", stripe_count); 3701cc070d59SAlex Elder return -EINVAL; 3702cc070d59SAlex Elder } 3703cc070d59SAlex Elder rbd_dev->stripe_unit = stripe_unit; 3704cc070d59SAlex Elder rbd_dev->stripe_count = stripe_count; 3705cc070d59SAlex Elder 3706cc070d59SAlex Elder return 0; 3707cc070d59SAlex Elder } 3708cc070d59SAlex Elder 37099e15b77dSAlex Elder static char *rbd_dev_image_name(struct rbd_device *rbd_dev) 37109e15b77dSAlex Elder { 37119e15b77dSAlex Elder size_t image_id_size; 37129e15b77dSAlex Elder char *image_id; 37139e15b77dSAlex Elder void *p; 37149e15b77dSAlex Elder void *end; 37159e15b77dSAlex Elder size_t size; 37169e15b77dSAlex Elder void *reply_buf = NULL; 37179e15b77dSAlex Elder size_t len = 0; 37189e15b77dSAlex Elder char *image_name = NULL; 37199e15b77dSAlex Elder int ret; 37209e15b77dSAlex Elder 37219e15b77dSAlex Elder rbd_assert(!rbd_dev->spec->image_name); 37229e15b77dSAlex Elder 372369e7a02fSAlex Elder len = strlen(rbd_dev->spec->image_id); 372469e7a02fSAlex Elder image_id_size = sizeof (__le32) + len; 37259e15b77dSAlex Elder image_id = kmalloc(image_id_size, GFP_KERNEL); 37269e15b77dSAlex Elder if (!image_id) 37279e15b77dSAlex Elder return NULL; 37289e15b77dSAlex Elder 37299e15b77dSAlex Elder p = image_id; 37304157976bSAlex Elder end = image_id + image_id_size; 373169e7a02fSAlex Elder ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32)len); 37329e15b77dSAlex Elder 37339e15b77dSAlex Elder size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX; 37349e15b77dSAlex Elder reply_buf = kmalloc(size, GFP_KERNEL); 37359e15b77dSAlex Elder if (!reply_buf) 37369e15b77dSAlex Elder goto out; 37379e15b77dSAlex Elder 373836be9a76SAlex Elder ret = rbd_obj_method_sync(rbd_dev, RBD_DIRECTORY, 37399e15b77dSAlex Elder "rbd", "dir_get_name", 37409e15b77dSAlex Elder image_id, image_id_size, 37414157976bSAlex Elder reply_buf, size, NULL); 37429e15b77dSAlex Elder if (ret < 0) 37439e15b77dSAlex Elder goto out; 37449e15b77dSAlex Elder p = reply_buf; 3745f40eb349SAlex Elder end = reply_buf + ret; 3746f40eb349SAlex Elder 37479e15b77dSAlex Elder image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL); 37489e15b77dSAlex Elder if (IS_ERR(image_name)) 37499e15b77dSAlex Elder image_name = NULL; 37509e15b77dSAlex Elder else 37519e15b77dSAlex Elder dout("%s: name is %s len is %zd\n", __func__, image_name, len); 37529e15b77dSAlex Elder out: 37539e15b77dSAlex Elder kfree(reply_buf); 37549e15b77dSAlex Elder kfree(image_id); 37559e15b77dSAlex Elder 37569e15b77dSAlex Elder return image_name; 37579e15b77dSAlex Elder } 37589e15b77dSAlex Elder 37599e15b77dSAlex Elder /* 37609e15b77dSAlex Elder * When a parent image gets probed, we only have the pool, image, 37619e15b77dSAlex Elder * and snapshot ids but not the names of any of them. This call 37629e15b77dSAlex Elder * is made later to fill in those names. It has to be done after 37639e15b77dSAlex Elder * rbd_dev_snaps_update() has completed because some of the 37649e15b77dSAlex Elder * information (in particular, snapshot name) is not available 37659e15b77dSAlex Elder * until then. 37669e15b77dSAlex Elder */ 37679e15b77dSAlex Elder static int rbd_dev_probe_update_spec(struct rbd_device *rbd_dev) 37689e15b77dSAlex Elder { 37699e15b77dSAlex Elder struct ceph_osd_client *osdc; 37709e15b77dSAlex Elder const char *name; 37719e15b77dSAlex Elder void *reply_buf = NULL; 37729e15b77dSAlex Elder int ret; 37739e15b77dSAlex Elder 37749e15b77dSAlex Elder if (rbd_dev->spec->pool_name) 37759e15b77dSAlex Elder return 0; /* Already have the names */ 37769e15b77dSAlex Elder 37779e15b77dSAlex Elder /* Look up the pool name */ 37789e15b77dSAlex Elder 37799e15b77dSAlex Elder osdc = &rbd_dev->rbd_client->client->osdc; 37809e15b77dSAlex Elder name = ceph_pg_pool_name_by_id(osdc->osdmap, rbd_dev->spec->pool_id); 3781935dc89fSAlex Elder if (!name) { 3782935dc89fSAlex Elder rbd_warn(rbd_dev, "there is no pool with id %llu", 3783935dc89fSAlex Elder rbd_dev->spec->pool_id); /* Really a BUG() */ 3784935dc89fSAlex Elder return -EIO; 3785935dc89fSAlex Elder } 37869e15b77dSAlex Elder 37879e15b77dSAlex Elder rbd_dev->spec->pool_name = kstrdup(name, GFP_KERNEL); 37889e15b77dSAlex Elder if (!rbd_dev->spec->pool_name) 37899e15b77dSAlex Elder return -ENOMEM; 37909e15b77dSAlex Elder 37919e15b77dSAlex Elder /* Fetch the image name; tolerate failure here */ 37929e15b77dSAlex Elder 37939e15b77dSAlex Elder name = rbd_dev_image_name(rbd_dev); 379469e7a02fSAlex Elder if (name) 37959e15b77dSAlex Elder rbd_dev->spec->image_name = (char *)name; 379669e7a02fSAlex Elder else 379706ecc6cbSAlex Elder rbd_warn(rbd_dev, "unable to get image name"); 37989e15b77dSAlex Elder 37999e15b77dSAlex Elder /* Look up the snapshot name. */ 38009e15b77dSAlex Elder 38019e15b77dSAlex Elder name = rbd_snap_name(rbd_dev, rbd_dev->spec->snap_id); 38029e15b77dSAlex Elder if (!name) { 3803935dc89fSAlex Elder rbd_warn(rbd_dev, "no snapshot with id %llu", 3804935dc89fSAlex Elder rbd_dev->spec->snap_id); /* Really a BUG() */ 38059e15b77dSAlex Elder ret = -EIO; 38069e15b77dSAlex Elder goto out_err; 38079e15b77dSAlex Elder } 38089e15b77dSAlex Elder rbd_dev->spec->snap_name = kstrdup(name, GFP_KERNEL); 38099e15b77dSAlex Elder if(!rbd_dev->spec->snap_name) 38109e15b77dSAlex Elder goto out_err; 38119e15b77dSAlex Elder 38129e15b77dSAlex Elder return 0; 38139e15b77dSAlex Elder out_err: 38149e15b77dSAlex Elder kfree(reply_buf); 38159e15b77dSAlex Elder kfree(rbd_dev->spec->pool_name); 38169e15b77dSAlex Elder rbd_dev->spec->pool_name = NULL; 38179e15b77dSAlex Elder 38189e15b77dSAlex Elder return ret; 38199e15b77dSAlex Elder } 38209e15b77dSAlex Elder 38216e14b1a6SAlex Elder static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev, u64 *ver) 382235d489f9SAlex Elder { 382335d489f9SAlex Elder size_t size; 382435d489f9SAlex Elder int ret; 382535d489f9SAlex Elder void *reply_buf; 382635d489f9SAlex Elder void *p; 382735d489f9SAlex Elder void *end; 382835d489f9SAlex Elder u64 seq; 382935d489f9SAlex Elder u32 snap_count; 383035d489f9SAlex Elder struct ceph_snap_context *snapc; 383135d489f9SAlex Elder u32 i; 383235d489f9SAlex Elder 383335d489f9SAlex Elder /* 383435d489f9SAlex Elder * We'll need room for the seq value (maximum snapshot id), 383535d489f9SAlex Elder * snapshot count, and array of that many snapshot ids. 383635d489f9SAlex Elder * For now we have a fixed upper limit on the number we're 383735d489f9SAlex Elder * prepared to receive. 383835d489f9SAlex Elder */ 383935d489f9SAlex Elder size = sizeof (__le64) + sizeof (__le32) + 384035d489f9SAlex Elder RBD_MAX_SNAP_COUNT * sizeof (__le64); 384135d489f9SAlex Elder reply_buf = kzalloc(size, GFP_KERNEL); 384235d489f9SAlex Elder if (!reply_buf) 384335d489f9SAlex Elder return -ENOMEM; 384435d489f9SAlex Elder 384536be9a76SAlex Elder ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, 38464157976bSAlex Elder "rbd", "get_snapcontext", NULL, 0, 384707b2391fSAlex Elder reply_buf, size, ver); 384836be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 384935d489f9SAlex Elder if (ret < 0) 385035d489f9SAlex Elder goto out; 385135d489f9SAlex Elder 385235d489f9SAlex Elder p = reply_buf; 385357385b51SAlex Elder end = reply_buf + ret; 385457385b51SAlex Elder ret = -ERANGE; 385535d489f9SAlex Elder ceph_decode_64_safe(&p, end, seq, out); 385635d489f9SAlex Elder ceph_decode_32_safe(&p, end, snap_count, out); 385735d489f9SAlex Elder 385835d489f9SAlex Elder /* 385935d489f9SAlex Elder * Make sure the reported number of snapshot ids wouldn't go 386035d489f9SAlex Elder * beyond the end of our buffer. But before checking that, 386135d489f9SAlex Elder * make sure the computed size of the snapshot context we 386235d489f9SAlex Elder * allocate is representable in a size_t. 386335d489f9SAlex Elder */ 386435d489f9SAlex Elder if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context)) 386535d489f9SAlex Elder / sizeof (u64)) { 386635d489f9SAlex Elder ret = -EINVAL; 386735d489f9SAlex Elder goto out; 386835d489f9SAlex Elder } 386935d489f9SAlex Elder if (!ceph_has_room(&p, end, snap_count * sizeof (__le64))) 387035d489f9SAlex Elder goto out; 387135d489f9SAlex Elder 387235d489f9SAlex Elder size = sizeof (struct ceph_snap_context) + 387335d489f9SAlex Elder snap_count * sizeof (snapc->snaps[0]); 387435d489f9SAlex Elder snapc = kmalloc(size, GFP_KERNEL); 387535d489f9SAlex Elder if (!snapc) { 387635d489f9SAlex Elder ret = -ENOMEM; 387735d489f9SAlex Elder goto out; 387835d489f9SAlex Elder } 387957385b51SAlex Elder ret = 0; 388035d489f9SAlex Elder 388135d489f9SAlex Elder atomic_set(&snapc->nref, 1); 388235d489f9SAlex Elder snapc->seq = seq; 388335d489f9SAlex Elder snapc->num_snaps = snap_count; 388435d489f9SAlex Elder for (i = 0; i < snap_count; i++) 388535d489f9SAlex Elder snapc->snaps[i] = ceph_decode_64(&p); 388635d489f9SAlex Elder 388735d489f9SAlex Elder rbd_dev->header.snapc = snapc; 388835d489f9SAlex Elder 388935d489f9SAlex Elder dout(" snap context seq = %llu, snap_count = %u\n", 389035d489f9SAlex Elder (unsigned long long)seq, (unsigned int)snap_count); 389135d489f9SAlex Elder out: 389235d489f9SAlex Elder kfree(reply_buf); 389335d489f9SAlex Elder 389457385b51SAlex Elder return ret; 389535d489f9SAlex Elder } 389635d489f9SAlex Elder 3897b8b1e2dbSAlex Elder static char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, u32 which) 3898b8b1e2dbSAlex Elder { 3899b8b1e2dbSAlex Elder size_t size; 3900b8b1e2dbSAlex Elder void *reply_buf; 3901b8b1e2dbSAlex Elder __le64 snap_id; 3902b8b1e2dbSAlex Elder int ret; 3903b8b1e2dbSAlex Elder void *p; 3904b8b1e2dbSAlex Elder void *end; 3905b8b1e2dbSAlex Elder char *snap_name; 3906b8b1e2dbSAlex Elder 3907b8b1e2dbSAlex Elder size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN; 3908b8b1e2dbSAlex Elder reply_buf = kmalloc(size, GFP_KERNEL); 3909b8b1e2dbSAlex Elder if (!reply_buf) 3910b8b1e2dbSAlex Elder return ERR_PTR(-ENOMEM); 3911b8b1e2dbSAlex Elder 3912acb1b6caSAlex Elder rbd_assert(which < rbd_dev->header.snapc->num_snaps); 3913b8b1e2dbSAlex Elder snap_id = cpu_to_le64(rbd_dev->header.snapc->snaps[which]); 391436be9a76SAlex Elder ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, 3915b8b1e2dbSAlex Elder "rbd", "get_snapshot_name", 39164157976bSAlex Elder &snap_id, sizeof (snap_id), 391707b2391fSAlex Elder reply_buf, size, NULL); 391836be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 3919f40eb349SAlex Elder if (ret < 0) { 3920f40eb349SAlex Elder snap_name = ERR_PTR(ret); 3921b8b1e2dbSAlex Elder goto out; 3922f40eb349SAlex Elder } 3923b8b1e2dbSAlex Elder 3924b8b1e2dbSAlex Elder p = reply_buf; 3925f40eb349SAlex Elder end = reply_buf + ret; 3926e5c35534SAlex Elder snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL); 3927f40eb349SAlex Elder if (IS_ERR(snap_name)) 3928b8b1e2dbSAlex Elder goto out; 3929f40eb349SAlex Elder 3930b8b1e2dbSAlex Elder dout(" snap_id 0x%016llx snap_name = %s\n", 3931b8b1e2dbSAlex Elder (unsigned long long)le64_to_cpu(snap_id), snap_name); 3932b8b1e2dbSAlex Elder out: 3933b8b1e2dbSAlex Elder kfree(reply_buf); 3934b8b1e2dbSAlex Elder 3935f40eb349SAlex Elder return snap_name; 3936b8b1e2dbSAlex Elder } 3937b8b1e2dbSAlex Elder 3938b8b1e2dbSAlex Elder static char *rbd_dev_v2_snap_info(struct rbd_device *rbd_dev, u32 which, 3939b8b1e2dbSAlex Elder u64 *snap_size, u64 *snap_features) 3940b8b1e2dbSAlex Elder { 3941e0b49868SAlex Elder u64 snap_id; 3942acb1b6caSAlex Elder u64 size; 3943acb1b6caSAlex Elder u64 features; 3944acb1b6caSAlex Elder char *snap_name; 3945b8b1e2dbSAlex Elder int ret; 3946b8b1e2dbSAlex Elder 3947acb1b6caSAlex Elder rbd_assert(which < rbd_dev->header.snapc->num_snaps); 3948b8b1e2dbSAlex Elder snap_id = rbd_dev->header.snapc->snaps[which]; 3949acb1b6caSAlex Elder ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, NULL, &size); 3950b8b1e2dbSAlex Elder if (ret) 3951acb1b6caSAlex Elder goto out_err; 3952b8b1e2dbSAlex Elder 3953acb1b6caSAlex Elder ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, &features); 3954acb1b6caSAlex Elder if (ret) 3955acb1b6caSAlex Elder goto out_err; 3956acb1b6caSAlex Elder 3957acb1b6caSAlex Elder snap_name = rbd_dev_v2_snap_name(rbd_dev, which); 3958acb1b6caSAlex Elder if (!IS_ERR(snap_name)) { 3959acb1b6caSAlex Elder *snap_size = size; 3960acb1b6caSAlex Elder *snap_features = features; 3961acb1b6caSAlex Elder } 3962acb1b6caSAlex Elder 3963acb1b6caSAlex Elder return snap_name; 3964acb1b6caSAlex Elder out_err: 3965acb1b6caSAlex Elder return ERR_PTR(ret); 3966b8b1e2dbSAlex Elder } 3967b8b1e2dbSAlex Elder 3968b8b1e2dbSAlex Elder static char *rbd_dev_snap_info(struct rbd_device *rbd_dev, u32 which, 3969b8b1e2dbSAlex Elder u64 *snap_size, u64 *snap_features) 3970b8b1e2dbSAlex Elder { 3971b8b1e2dbSAlex Elder if (rbd_dev->image_format == 1) 3972b8b1e2dbSAlex Elder return rbd_dev_v1_snap_info(rbd_dev, which, 3973b8b1e2dbSAlex Elder snap_size, snap_features); 3974b8b1e2dbSAlex Elder if (rbd_dev->image_format == 2) 3975b8b1e2dbSAlex Elder return rbd_dev_v2_snap_info(rbd_dev, which, 3976b8b1e2dbSAlex Elder snap_size, snap_features); 3977b8b1e2dbSAlex Elder return ERR_PTR(-EINVAL); 3978b8b1e2dbSAlex Elder } 3979b8b1e2dbSAlex Elder 3980117973fbSAlex Elder static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver) 3981117973fbSAlex Elder { 3982117973fbSAlex Elder int ret; 3983117973fbSAlex Elder __u8 obj_order; 3984117973fbSAlex Elder 3985117973fbSAlex Elder down_write(&rbd_dev->header_rwsem); 3986117973fbSAlex Elder 3987117973fbSAlex Elder /* Grab old order first, to see if it changes */ 3988117973fbSAlex Elder 3989117973fbSAlex Elder obj_order = rbd_dev->header.obj_order, 3990117973fbSAlex Elder ret = rbd_dev_v2_image_size(rbd_dev); 3991117973fbSAlex Elder if (ret) 3992117973fbSAlex Elder goto out; 3993117973fbSAlex Elder if (rbd_dev->header.obj_order != obj_order) { 3994117973fbSAlex Elder ret = -EIO; 3995117973fbSAlex Elder goto out; 3996117973fbSAlex Elder } 3997117973fbSAlex Elder rbd_update_mapping_size(rbd_dev); 3998117973fbSAlex Elder 3999117973fbSAlex Elder ret = rbd_dev_v2_snap_context(rbd_dev, hver); 4000117973fbSAlex Elder dout("rbd_dev_v2_snap_context returned %d\n", ret); 4001117973fbSAlex Elder if (ret) 4002117973fbSAlex Elder goto out; 4003117973fbSAlex Elder ret = rbd_dev_snaps_update(rbd_dev); 4004117973fbSAlex Elder dout("rbd_dev_snaps_update returned %d\n", ret); 4005117973fbSAlex Elder if (ret) 4006117973fbSAlex Elder goto out; 4007117973fbSAlex Elder out: 4008117973fbSAlex Elder up_write(&rbd_dev->header_rwsem); 4009117973fbSAlex Elder 4010117973fbSAlex Elder return ret; 4011117973fbSAlex Elder } 4012117973fbSAlex Elder 40139d475de5SAlex Elder /* 401435938150SAlex Elder * Scan the rbd device's current snapshot list and compare it to the 401535938150SAlex Elder * newly-received snapshot context. Remove any existing snapshots 401635938150SAlex Elder * not present in the new snapshot context. Add a new snapshot for 401735938150SAlex Elder * any snaphots in the snapshot context not in the current list. 401835938150SAlex Elder * And verify there are no changes to snapshots we already know 401935938150SAlex Elder * about. 402035938150SAlex Elder * 402135938150SAlex Elder * Assumes the snapshots in the snapshot context are sorted by 402235938150SAlex Elder * snapshot id, highest id first. (Snapshots in the rbd_dev's list 402335938150SAlex Elder * are also maintained in that order.) 4024522a0cc0SAlex Elder * 4025522a0cc0SAlex Elder * Note that any error occurs while updating the snapshot list 4026522a0cc0SAlex Elder * aborts the update, and the entire list is cleared. The snapshot 4027522a0cc0SAlex Elder * list becomes inconsistent at that point anyway, so it might as 4028522a0cc0SAlex Elder * well be empty. 4029dfc5606dSYehuda Sadeh */ 4030304f6808SAlex Elder static int rbd_dev_snaps_update(struct rbd_device *rbd_dev) 4031dfc5606dSYehuda Sadeh { 403235938150SAlex Elder struct ceph_snap_context *snapc = rbd_dev->header.snapc; 403335938150SAlex Elder const u32 snap_count = snapc->num_snaps; 403435938150SAlex Elder struct list_head *head = &rbd_dev->snaps; 403535938150SAlex Elder struct list_head *links = head->next; 403635938150SAlex Elder u32 index = 0; 4037522a0cc0SAlex Elder int ret = 0; 4038dfc5606dSYehuda Sadeh 40399fcbb800SAlex Elder dout("%s: snap count is %u\n", __func__, (unsigned int)snap_count); 404035938150SAlex Elder while (index < snap_count || links != head) { 404135938150SAlex Elder u64 snap_id; 404235938150SAlex Elder struct rbd_snap *snap; 4043cd892126SAlex Elder char *snap_name; 4044cd892126SAlex Elder u64 snap_size = 0; 4045cd892126SAlex Elder u64 snap_features = 0; 4046dfc5606dSYehuda Sadeh 404735938150SAlex Elder snap_id = index < snap_count ? snapc->snaps[index] 404835938150SAlex Elder : CEPH_NOSNAP; 404935938150SAlex Elder snap = links != head ? list_entry(links, struct rbd_snap, node) 405035938150SAlex Elder : NULL; 4051aafb230eSAlex Elder rbd_assert(!snap || snap->id != CEPH_NOSNAP); 4052dfc5606dSYehuda Sadeh 405335938150SAlex Elder if (snap_id == CEPH_NOSNAP || (snap && snap->id > snap_id)) { 405435938150SAlex Elder struct list_head *next = links->next; 4055dfc5606dSYehuda Sadeh 40566d292906SAlex Elder /* 40576d292906SAlex Elder * A previously-existing snapshot is not in 40586d292906SAlex Elder * the new snap context. 40596d292906SAlex Elder * 4060522a0cc0SAlex Elder * If the now-missing snapshot is the one 4061522a0cc0SAlex Elder * the image represents, clear its existence 4062522a0cc0SAlex Elder * flag so we can avoid sending any more 4063522a0cc0SAlex Elder * requests to it. 40646d292906SAlex Elder */ 40650d7dbfceSAlex Elder if (rbd_dev->spec->snap_id == snap->id) 40666d292906SAlex Elder clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags); 40673e83b65bSAlex Elder dout("removing %ssnap id %llu\n", 40680d7dbfceSAlex Elder rbd_dev->spec->snap_id == snap->id ? 40690d7dbfceSAlex Elder "mapped " : "", 40709fcbb800SAlex Elder (unsigned long long)snap->id); 40716087b51bSAlex Elder 40726087b51bSAlex Elder list_del(&snap->node); 40736087b51bSAlex Elder rbd_snap_destroy(snap); 4074dfc5606dSYehuda Sadeh 407535938150SAlex Elder /* Done with this list entry; advance */ 407635938150SAlex Elder 407735938150SAlex Elder links = next; 407835938150SAlex Elder continue; 4079dfc5606dSYehuda Sadeh } 408035938150SAlex Elder 4081b8b1e2dbSAlex Elder snap_name = rbd_dev_snap_info(rbd_dev, index, 4082cd892126SAlex Elder &snap_size, &snap_features); 4083522a0cc0SAlex Elder if (IS_ERR(snap_name)) { 4084522a0cc0SAlex Elder ret = PTR_ERR(snap_name); 4085522a0cc0SAlex Elder dout("failed to get snap info, error %d\n", ret); 4086522a0cc0SAlex Elder goto out_err; 4087522a0cc0SAlex Elder } 4088cd892126SAlex Elder 40899fcbb800SAlex Elder dout("entry %u: snap_id = %llu\n", (unsigned int)snap_count, 40909fcbb800SAlex Elder (unsigned long long)snap_id); 409135938150SAlex Elder if (!snap || (snap_id != CEPH_NOSNAP && snap->id < snap_id)) { 409235938150SAlex Elder struct rbd_snap *new_snap; 409335938150SAlex Elder 409435938150SAlex Elder /* We haven't seen this snapshot before */ 409535938150SAlex Elder 40966087b51bSAlex Elder new_snap = rbd_snap_create(rbd_dev, snap_name, 4097cd892126SAlex Elder snap_id, snap_size, snap_features); 40989fcbb800SAlex Elder if (IS_ERR(new_snap)) { 4099522a0cc0SAlex Elder ret = PTR_ERR(new_snap); 4100522a0cc0SAlex Elder dout(" failed to add dev, error %d\n", ret); 4101522a0cc0SAlex Elder goto out_err; 41029fcbb800SAlex Elder } 410335938150SAlex Elder 410435938150SAlex Elder /* New goes before existing, or at end of list */ 410535938150SAlex Elder 41069fcbb800SAlex Elder dout(" added dev%s\n", snap ? "" : " at end\n"); 410735938150SAlex Elder if (snap) 410835938150SAlex Elder list_add_tail(&new_snap->node, &snap->node); 410935938150SAlex Elder else 4110523f3258SAlex Elder list_add_tail(&new_snap->node, head); 411135938150SAlex Elder } else { 411235938150SAlex Elder /* Already have this one */ 411335938150SAlex Elder 41149fcbb800SAlex Elder dout(" already present\n"); 41159fcbb800SAlex Elder 4116cd892126SAlex Elder rbd_assert(snap->size == snap_size); 4117aafb230eSAlex Elder rbd_assert(!strcmp(snap->name, snap_name)); 4118cd892126SAlex Elder rbd_assert(snap->features == snap_features); 411935938150SAlex Elder 412035938150SAlex Elder /* Done with this list entry; advance */ 412135938150SAlex Elder 412235938150SAlex Elder links = links->next; 4123dfc5606dSYehuda Sadeh } 412435938150SAlex Elder 412535938150SAlex Elder /* Advance to the next entry in the snapshot context */ 412635938150SAlex Elder 412735938150SAlex Elder index++; 4128dfc5606dSYehuda Sadeh } 41299fcbb800SAlex Elder dout("%s: done\n", __func__); 4130dfc5606dSYehuda Sadeh 4131dfc5606dSYehuda Sadeh return 0; 4132522a0cc0SAlex Elder out_err: 4133522a0cc0SAlex Elder rbd_remove_all_snaps(rbd_dev); 4134522a0cc0SAlex Elder 4135522a0cc0SAlex Elder return ret; 4136dfc5606dSYehuda Sadeh } 4137dfc5606dSYehuda Sadeh 4138dfc5606dSYehuda Sadeh static int rbd_bus_add_dev(struct rbd_device *rbd_dev) 4139dfc5606dSYehuda Sadeh { 4140dfc5606dSYehuda Sadeh struct device *dev; 4141cd789ab9SAlex Elder int ret; 4142dfc5606dSYehuda Sadeh 4143dfc5606dSYehuda Sadeh mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); 4144dfc5606dSYehuda Sadeh 4145cd789ab9SAlex Elder dev = &rbd_dev->dev; 4146dfc5606dSYehuda Sadeh dev->bus = &rbd_bus_type; 4147dfc5606dSYehuda Sadeh dev->type = &rbd_device_type; 4148dfc5606dSYehuda Sadeh dev->parent = &rbd_root_dev; 4149dfc5606dSYehuda Sadeh dev->release = rbd_dev_release; 4150de71a297SAlex Elder dev_set_name(dev, "%d", rbd_dev->dev_id); 4151dfc5606dSYehuda Sadeh ret = device_register(dev); 4152dfc5606dSYehuda Sadeh 4153dfc5606dSYehuda Sadeh mutex_unlock(&ctl_mutex); 4154cd789ab9SAlex Elder 4155dfc5606dSYehuda Sadeh return ret; 4156602adf40SYehuda Sadeh } 4157602adf40SYehuda Sadeh 4158dfc5606dSYehuda Sadeh static void rbd_bus_del_dev(struct rbd_device *rbd_dev) 4159dfc5606dSYehuda Sadeh { 4160dfc5606dSYehuda Sadeh device_unregister(&rbd_dev->dev); 4161dfc5606dSYehuda Sadeh } 4162dfc5606dSYehuda Sadeh 4163e2839308SAlex Elder static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0); 41641ddbe94eSAlex Elder 41651ddbe94eSAlex Elder /* 4166499afd5bSAlex Elder * Get a unique rbd identifier for the given new rbd_dev, and add 4167499afd5bSAlex Elder * the rbd_dev to the global list. The minimum rbd id is 1. 41681ddbe94eSAlex Elder */ 4169e2839308SAlex Elder static void rbd_dev_id_get(struct rbd_device *rbd_dev) 4170b7f23c36SAlex Elder { 4171e2839308SAlex Elder rbd_dev->dev_id = atomic64_inc_return(&rbd_dev_id_max); 4172499afd5bSAlex Elder 4173499afd5bSAlex Elder spin_lock(&rbd_dev_list_lock); 4174499afd5bSAlex Elder list_add_tail(&rbd_dev->node, &rbd_dev_list); 4175499afd5bSAlex Elder spin_unlock(&rbd_dev_list_lock); 4176e2839308SAlex Elder dout("rbd_dev %p given dev id %llu\n", rbd_dev, 4177e2839308SAlex Elder (unsigned long long) rbd_dev->dev_id); 4178b7f23c36SAlex Elder } 4179b7f23c36SAlex Elder 41801ddbe94eSAlex Elder /* 4181499afd5bSAlex Elder * Remove an rbd_dev from the global list, and record that its 4182499afd5bSAlex Elder * identifier is no longer in use. 41831ddbe94eSAlex Elder */ 4184e2839308SAlex Elder static void rbd_dev_id_put(struct rbd_device *rbd_dev) 41851ddbe94eSAlex Elder { 4186d184f6bfSAlex Elder struct list_head *tmp; 4187de71a297SAlex Elder int rbd_id = rbd_dev->dev_id; 4188d184f6bfSAlex Elder int max_id; 4189d184f6bfSAlex Elder 4190aafb230eSAlex Elder rbd_assert(rbd_id > 0); 4191499afd5bSAlex Elder 4192e2839308SAlex Elder dout("rbd_dev %p released dev id %llu\n", rbd_dev, 4193e2839308SAlex Elder (unsigned long long) rbd_dev->dev_id); 4194499afd5bSAlex Elder spin_lock(&rbd_dev_list_lock); 4195499afd5bSAlex Elder list_del_init(&rbd_dev->node); 4196d184f6bfSAlex Elder 4197d184f6bfSAlex Elder /* 4198d184f6bfSAlex Elder * If the id being "put" is not the current maximum, there 4199d184f6bfSAlex Elder * is nothing special we need to do. 4200d184f6bfSAlex Elder */ 4201e2839308SAlex Elder if (rbd_id != atomic64_read(&rbd_dev_id_max)) { 4202d184f6bfSAlex Elder spin_unlock(&rbd_dev_list_lock); 4203d184f6bfSAlex Elder return; 4204d184f6bfSAlex Elder } 4205d184f6bfSAlex Elder 4206d184f6bfSAlex Elder /* 4207d184f6bfSAlex Elder * We need to update the current maximum id. Search the 4208d184f6bfSAlex Elder * list to find out what it is. We're more likely to find 4209d184f6bfSAlex Elder * the maximum at the end, so search the list backward. 4210d184f6bfSAlex Elder */ 4211d184f6bfSAlex Elder max_id = 0; 4212d184f6bfSAlex Elder list_for_each_prev(tmp, &rbd_dev_list) { 4213d184f6bfSAlex Elder struct rbd_device *rbd_dev; 4214d184f6bfSAlex Elder 4215d184f6bfSAlex Elder rbd_dev = list_entry(tmp, struct rbd_device, node); 4216b213e0b1SAlex Elder if (rbd_dev->dev_id > max_id) 4217b213e0b1SAlex Elder max_id = rbd_dev->dev_id; 4218d184f6bfSAlex Elder } 4219499afd5bSAlex Elder spin_unlock(&rbd_dev_list_lock); 42201ddbe94eSAlex Elder 42211ddbe94eSAlex Elder /* 4222e2839308SAlex Elder * The max id could have been updated by rbd_dev_id_get(), in 4223d184f6bfSAlex Elder * which case it now accurately reflects the new maximum. 4224d184f6bfSAlex Elder * Be careful not to overwrite the maximum value in that 4225d184f6bfSAlex Elder * case. 42261ddbe94eSAlex Elder */ 4227e2839308SAlex Elder atomic64_cmpxchg(&rbd_dev_id_max, rbd_id, max_id); 4228e2839308SAlex Elder dout(" max dev id has been reset\n"); 4229b7f23c36SAlex Elder } 4230b7f23c36SAlex Elder 4231a725f65eSAlex Elder /* 4232e28fff26SAlex Elder * Skips over white space at *buf, and updates *buf to point to the 4233e28fff26SAlex Elder * first found non-space character (if any). Returns the length of 4234593a9e7bSAlex Elder * the token (string of non-white space characters) found. Note 4235593a9e7bSAlex Elder * that *buf must be terminated with '\0'. 4236e28fff26SAlex Elder */ 4237e28fff26SAlex Elder static inline size_t next_token(const char **buf) 4238e28fff26SAlex Elder { 4239e28fff26SAlex Elder /* 4240e28fff26SAlex Elder * These are the characters that produce nonzero for 4241e28fff26SAlex Elder * isspace() in the "C" and "POSIX" locales. 4242e28fff26SAlex Elder */ 4243e28fff26SAlex Elder const char *spaces = " \f\n\r\t\v"; 4244e28fff26SAlex Elder 4245e28fff26SAlex Elder *buf += strspn(*buf, spaces); /* Find start of token */ 4246e28fff26SAlex Elder 4247e28fff26SAlex Elder return strcspn(*buf, spaces); /* Return token length */ 4248e28fff26SAlex Elder } 4249e28fff26SAlex Elder 4250e28fff26SAlex Elder /* 4251e28fff26SAlex Elder * Finds the next token in *buf, and if the provided token buffer is 4252e28fff26SAlex Elder * big enough, copies the found token into it. The result, if 4253593a9e7bSAlex Elder * copied, is guaranteed to be terminated with '\0'. Note that *buf 4254593a9e7bSAlex Elder * must be terminated with '\0' on entry. 4255e28fff26SAlex Elder * 4256e28fff26SAlex Elder * Returns the length of the token found (not including the '\0'). 4257e28fff26SAlex Elder * Return value will be 0 if no token is found, and it will be >= 4258e28fff26SAlex Elder * token_size if the token would not fit. 4259e28fff26SAlex Elder * 4260593a9e7bSAlex Elder * The *buf pointer will be updated to point beyond the end of the 4261e28fff26SAlex Elder * found token. Note that this occurs even if the token buffer is 4262e28fff26SAlex Elder * too small to hold it. 4263e28fff26SAlex Elder */ 4264e28fff26SAlex Elder static inline size_t copy_token(const char **buf, 4265e28fff26SAlex Elder char *token, 4266e28fff26SAlex Elder size_t token_size) 4267e28fff26SAlex Elder { 4268e28fff26SAlex Elder size_t len; 4269e28fff26SAlex Elder 4270e28fff26SAlex Elder len = next_token(buf); 4271e28fff26SAlex Elder if (len < token_size) { 4272e28fff26SAlex Elder memcpy(token, *buf, len); 4273e28fff26SAlex Elder *(token + len) = '\0'; 4274e28fff26SAlex Elder } 4275e28fff26SAlex Elder *buf += len; 4276e28fff26SAlex Elder 4277e28fff26SAlex Elder return len; 4278e28fff26SAlex Elder } 4279e28fff26SAlex Elder 4280e28fff26SAlex Elder /* 4281ea3352f4SAlex Elder * Finds the next token in *buf, dynamically allocates a buffer big 4282ea3352f4SAlex Elder * enough to hold a copy of it, and copies the token into the new 4283ea3352f4SAlex Elder * buffer. The copy is guaranteed to be terminated with '\0'. Note 4284ea3352f4SAlex Elder * that a duplicate buffer is created even for a zero-length token. 4285ea3352f4SAlex Elder * 4286ea3352f4SAlex Elder * Returns a pointer to the newly-allocated duplicate, or a null 4287ea3352f4SAlex Elder * pointer if memory for the duplicate was not available. If 4288ea3352f4SAlex Elder * the lenp argument is a non-null pointer, the length of the token 4289ea3352f4SAlex Elder * (not including the '\0') is returned in *lenp. 4290ea3352f4SAlex Elder * 4291ea3352f4SAlex Elder * If successful, the *buf pointer will be updated to point beyond 4292ea3352f4SAlex Elder * the end of the found token. 4293ea3352f4SAlex Elder * 4294ea3352f4SAlex Elder * Note: uses GFP_KERNEL for allocation. 4295ea3352f4SAlex Elder */ 4296ea3352f4SAlex Elder static inline char *dup_token(const char **buf, size_t *lenp) 4297ea3352f4SAlex Elder { 4298ea3352f4SAlex Elder char *dup; 4299ea3352f4SAlex Elder size_t len; 4300ea3352f4SAlex Elder 4301ea3352f4SAlex Elder len = next_token(buf); 43024caf35f9SAlex Elder dup = kmemdup(*buf, len + 1, GFP_KERNEL); 4303ea3352f4SAlex Elder if (!dup) 4304ea3352f4SAlex Elder return NULL; 4305ea3352f4SAlex Elder *(dup + len) = '\0'; 4306ea3352f4SAlex Elder *buf += len; 4307ea3352f4SAlex Elder 4308ea3352f4SAlex Elder if (lenp) 4309ea3352f4SAlex Elder *lenp = len; 4310ea3352f4SAlex Elder 4311ea3352f4SAlex Elder return dup; 4312ea3352f4SAlex Elder } 4313ea3352f4SAlex Elder 4314ea3352f4SAlex Elder /* 4315859c31dfSAlex Elder * Parse the options provided for an "rbd add" (i.e., rbd image 4316859c31dfSAlex Elder * mapping) request. These arrive via a write to /sys/bus/rbd/add, 4317859c31dfSAlex Elder * and the data written is passed here via a NUL-terminated buffer. 4318859c31dfSAlex Elder * Returns 0 if successful or an error code otherwise. 4319d22f76e7SAlex Elder * 4320859c31dfSAlex Elder * The information extracted from these options is recorded in 4321859c31dfSAlex Elder * the other parameters which return dynamically-allocated 4322859c31dfSAlex Elder * structures: 4323859c31dfSAlex Elder * ceph_opts 4324859c31dfSAlex Elder * The address of a pointer that will refer to a ceph options 4325859c31dfSAlex Elder * structure. Caller must release the returned pointer using 4326859c31dfSAlex Elder * ceph_destroy_options() when it is no longer needed. 4327859c31dfSAlex Elder * rbd_opts 4328859c31dfSAlex Elder * Address of an rbd options pointer. Fully initialized by 4329859c31dfSAlex Elder * this function; caller must release with kfree(). 4330859c31dfSAlex Elder * spec 4331859c31dfSAlex Elder * Address of an rbd image specification pointer. Fully 4332859c31dfSAlex Elder * initialized by this function based on parsed options. 4333859c31dfSAlex Elder * Caller must release with rbd_spec_put(). 4334859c31dfSAlex Elder * 4335859c31dfSAlex Elder * The options passed take this form: 4336859c31dfSAlex Elder * <mon_addrs> <options> <pool_name> <image_name> [<snap_id>] 4337859c31dfSAlex Elder * where: 4338859c31dfSAlex Elder * <mon_addrs> 4339859c31dfSAlex Elder * A comma-separated list of one or more monitor addresses. 4340859c31dfSAlex Elder * A monitor address is an ip address, optionally followed 4341859c31dfSAlex Elder * by a port number (separated by a colon). 4342859c31dfSAlex Elder * I.e.: ip1[:port1][,ip2[:port2]...] 4343859c31dfSAlex Elder * <options> 4344859c31dfSAlex Elder * A comma-separated list of ceph and/or rbd options. 4345859c31dfSAlex Elder * <pool_name> 4346859c31dfSAlex Elder * The name of the rados pool containing the rbd image. 4347859c31dfSAlex Elder * <image_name> 4348859c31dfSAlex Elder * The name of the image in that pool to map. 4349859c31dfSAlex Elder * <snap_id> 4350859c31dfSAlex Elder * An optional snapshot id. If provided, the mapping will 4351859c31dfSAlex Elder * present data from the image at the time that snapshot was 4352859c31dfSAlex Elder * created. The image head is used if no snapshot id is 4353859c31dfSAlex Elder * provided. Snapshot mappings are always read-only. 4354a725f65eSAlex Elder */ 4355859c31dfSAlex Elder static int rbd_add_parse_args(const char *buf, 4356dc79b113SAlex Elder struct ceph_options **ceph_opts, 4357859c31dfSAlex Elder struct rbd_options **opts, 4358859c31dfSAlex Elder struct rbd_spec **rbd_spec) 4359a725f65eSAlex Elder { 4360e28fff26SAlex Elder size_t len; 4361859c31dfSAlex Elder char *options; 43620ddebc0cSAlex Elder const char *mon_addrs; 43630ddebc0cSAlex Elder size_t mon_addrs_size; 4364859c31dfSAlex Elder struct rbd_spec *spec = NULL; 43654e9afebaSAlex Elder struct rbd_options *rbd_opts = NULL; 4366859c31dfSAlex Elder struct ceph_options *copts; 4367dc79b113SAlex Elder int ret; 4368e28fff26SAlex Elder 4369e28fff26SAlex Elder /* The first four tokens are required */ 4370e28fff26SAlex Elder 43717ef3214aSAlex Elder len = next_token(&buf); 43724fb5d671SAlex Elder if (!len) { 43734fb5d671SAlex Elder rbd_warn(NULL, "no monitor address(es) provided"); 43744fb5d671SAlex Elder return -EINVAL; 43754fb5d671SAlex Elder } 43760ddebc0cSAlex Elder mon_addrs = buf; 4377f28e565aSAlex Elder mon_addrs_size = len + 1; 43787ef3214aSAlex Elder buf += len; 4379a725f65eSAlex Elder 4380dc79b113SAlex Elder ret = -EINVAL; 4381f28e565aSAlex Elder options = dup_token(&buf, NULL); 4382f28e565aSAlex Elder if (!options) 4383dc79b113SAlex Elder return -ENOMEM; 43844fb5d671SAlex Elder if (!*options) { 43854fb5d671SAlex Elder rbd_warn(NULL, "no options provided"); 43864fb5d671SAlex Elder goto out_err; 43874fb5d671SAlex Elder } 4388a725f65eSAlex Elder 4389859c31dfSAlex Elder spec = rbd_spec_alloc(); 4390859c31dfSAlex Elder if (!spec) 4391f28e565aSAlex Elder goto out_mem; 4392859c31dfSAlex Elder 4393859c31dfSAlex Elder spec->pool_name = dup_token(&buf, NULL); 4394859c31dfSAlex Elder if (!spec->pool_name) 4395859c31dfSAlex Elder goto out_mem; 43964fb5d671SAlex Elder if (!*spec->pool_name) { 43974fb5d671SAlex Elder rbd_warn(NULL, "no pool name provided"); 43984fb5d671SAlex Elder goto out_err; 43994fb5d671SAlex Elder } 4400e28fff26SAlex Elder 440169e7a02fSAlex Elder spec->image_name = dup_token(&buf, NULL); 4402859c31dfSAlex Elder if (!spec->image_name) 4403f28e565aSAlex Elder goto out_mem; 44044fb5d671SAlex Elder if (!*spec->image_name) { 44054fb5d671SAlex Elder rbd_warn(NULL, "no image name provided"); 44064fb5d671SAlex Elder goto out_err; 44074fb5d671SAlex Elder } 4408e28fff26SAlex Elder 4409f28e565aSAlex Elder /* 4410f28e565aSAlex Elder * Snapshot name is optional; default is to use "-" 4411f28e565aSAlex Elder * (indicating the head/no snapshot). 4412f28e565aSAlex Elder */ 44133feeb894SAlex Elder len = next_token(&buf); 4414820a5f3eSAlex Elder if (!len) { 44153feeb894SAlex Elder buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */ 44163feeb894SAlex Elder len = sizeof (RBD_SNAP_HEAD_NAME) - 1; 4417f28e565aSAlex Elder } else if (len > RBD_MAX_SNAP_NAME_LEN) { 4418dc79b113SAlex Elder ret = -ENAMETOOLONG; 4419f28e565aSAlex Elder goto out_err; 4420849b4260SAlex Elder } 44214caf35f9SAlex Elder spec->snap_name = kmemdup(buf, len + 1, GFP_KERNEL); 4422859c31dfSAlex Elder if (!spec->snap_name) 4423f28e565aSAlex Elder goto out_mem; 4424859c31dfSAlex Elder *(spec->snap_name + len) = '\0'; 4425e5c35534SAlex Elder 44260ddebc0cSAlex Elder /* Initialize all rbd options to the defaults */ 4427e28fff26SAlex Elder 44284e9afebaSAlex Elder rbd_opts = kzalloc(sizeof (*rbd_opts), GFP_KERNEL); 44294e9afebaSAlex Elder if (!rbd_opts) 44304e9afebaSAlex Elder goto out_mem; 44314e9afebaSAlex Elder 44324e9afebaSAlex Elder rbd_opts->read_only = RBD_READ_ONLY_DEFAULT; 4433d22f76e7SAlex Elder 4434859c31dfSAlex Elder copts = ceph_parse_options(options, mon_addrs, 44350ddebc0cSAlex Elder mon_addrs + mon_addrs_size - 1, 44364e9afebaSAlex Elder parse_rbd_opts_token, rbd_opts); 4437859c31dfSAlex Elder if (IS_ERR(copts)) { 4438859c31dfSAlex Elder ret = PTR_ERR(copts); 4439dc79b113SAlex Elder goto out_err; 4440dc79b113SAlex Elder } 4441859c31dfSAlex Elder kfree(options); 4442859c31dfSAlex Elder 4443859c31dfSAlex Elder *ceph_opts = copts; 44444e9afebaSAlex Elder *opts = rbd_opts; 4445859c31dfSAlex Elder *rbd_spec = spec; 44460ddebc0cSAlex Elder 4447dc79b113SAlex Elder return 0; 4448f28e565aSAlex Elder out_mem: 4449dc79b113SAlex Elder ret = -ENOMEM; 4450d22f76e7SAlex Elder out_err: 4451859c31dfSAlex Elder kfree(rbd_opts); 4452859c31dfSAlex Elder rbd_spec_put(spec); 4453f28e565aSAlex Elder kfree(options); 4454d22f76e7SAlex Elder 4455dc79b113SAlex Elder return ret; 4456a725f65eSAlex Elder } 4457a725f65eSAlex Elder 4458589d30e0SAlex Elder /* 4459589d30e0SAlex Elder * An rbd format 2 image has a unique identifier, distinct from the 4460589d30e0SAlex Elder * name given to it by the user. Internally, that identifier is 4461589d30e0SAlex Elder * what's used to specify the names of objects related to the image. 4462589d30e0SAlex Elder * 4463589d30e0SAlex Elder * A special "rbd id" object is used to map an rbd image name to its 4464589d30e0SAlex Elder * id. If that object doesn't exist, then there is no v2 rbd image 4465589d30e0SAlex Elder * with the supplied name. 4466589d30e0SAlex Elder * 4467589d30e0SAlex Elder * This function will record the given rbd_dev's image_id field if 4468589d30e0SAlex Elder * it can be determined, and in that case will return 0. If any 4469589d30e0SAlex Elder * errors occur a negative errno will be returned and the rbd_dev's 4470589d30e0SAlex Elder * image_id field will be unchanged (and should be NULL). 4471589d30e0SAlex Elder */ 4472589d30e0SAlex Elder static int rbd_dev_image_id(struct rbd_device *rbd_dev) 4473589d30e0SAlex Elder { 4474589d30e0SAlex Elder int ret; 4475589d30e0SAlex Elder size_t size; 4476589d30e0SAlex Elder char *object_name; 4477589d30e0SAlex Elder void *response; 4478589d30e0SAlex Elder void *p; 4479589d30e0SAlex Elder 44802f82ee54SAlex Elder /* If we already have it we don't need to look it up */ 44812f82ee54SAlex Elder 44822f82ee54SAlex Elder if (rbd_dev->spec->image_id) 44832f82ee54SAlex Elder return 0; 44842f82ee54SAlex Elder 4485589d30e0SAlex Elder /* 44862c0d0a10SAlex Elder * When probing a parent image, the image id is already 44872c0d0a10SAlex Elder * known (and the image name likely is not). There's no 44882c0d0a10SAlex Elder * need to fetch the image id again in this case. 44892c0d0a10SAlex Elder */ 44902c0d0a10SAlex Elder if (rbd_dev->spec->image_id) 44912c0d0a10SAlex Elder return 0; 44922c0d0a10SAlex Elder 44932c0d0a10SAlex Elder /* 4494589d30e0SAlex Elder * First, see if the format 2 image id file exists, and if 4495589d30e0SAlex Elder * so, get the image's persistent id from it. 4496589d30e0SAlex Elder */ 449769e7a02fSAlex Elder size = sizeof (RBD_ID_PREFIX) + strlen(rbd_dev->spec->image_name); 4498589d30e0SAlex Elder object_name = kmalloc(size, GFP_NOIO); 4499589d30e0SAlex Elder if (!object_name) 4500589d30e0SAlex Elder return -ENOMEM; 45010d7dbfceSAlex Elder sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->spec->image_name); 4502589d30e0SAlex Elder dout("rbd id object name is %s\n", object_name); 4503589d30e0SAlex Elder 4504589d30e0SAlex Elder /* Response will be an encoded string, which includes a length */ 4505589d30e0SAlex Elder 4506589d30e0SAlex Elder size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX; 4507589d30e0SAlex Elder response = kzalloc(size, GFP_NOIO); 4508589d30e0SAlex Elder if (!response) { 4509589d30e0SAlex Elder ret = -ENOMEM; 4510589d30e0SAlex Elder goto out; 4511589d30e0SAlex Elder } 4512589d30e0SAlex Elder 451336be9a76SAlex Elder ret = rbd_obj_method_sync(rbd_dev, object_name, 45144157976bSAlex Elder "rbd", "get_id", NULL, 0, 451507b2391fSAlex Elder response, RBD_IMAGE_ID_LEN_MAX, NULL); 451636be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 4517589d30e0SAlex Elder if (ret < 0) 4518589d30e0SAlex Elder goto out; 4519589d30e0SAlex Elder 4520589d30e0SAlex Elder p = response; 45210d7dbfceSAlex Elder rbd_dev->spec->image_id = ceph_extract_encoded_string(&p, 452257385b51SAlex Elder p + ret, 4523979ed480SAlex Elder NULL, GFP_NOIO); 452457385b51SAlex Elder ret = 0; 452557385b51SAlex Elder 45260d7dbfceSAlex Elder if (IS_ERR(rbd_dev->spec->image_id)) { 45270d7dbfceSAlex Elder ret = PTR_ERR(rbd_dev->spec->image_id); 45280d7dbfceSAlex Elder rbd_dev->spec->image_id = NULL; 4529589d30e0SAlex Elder } else { 45300d7dbfceSAlex Elder dout("image_id is %s\n", rbd_dev->spec->image_id); 4531589d30e0SAlex Elder } 4532589d30e0SAlex Elder out: 4533589d30e0SAlex Elder kfree(response); 4534589d30e0SAlex Elder kfree(object_name); 4535589d30e0SAlex Elder 4536589d30e0SAlex Elder return ret; 4537589d30e0SAlex Elder } 4538589d30e0SAlex Elder 4539a30b71b9SAlex Elder static int rbd_dev_v1_probe(struct rbd_device *rbd_dev) 4540a30b71b9SAlex Elder { 4541a30b71b9SAlex Elder int ret; 4542a30b71b9SAlex Elder size_t size; 4543a30b71b9SAlex Elder 4544a30b71b9SAlex Elder /* Version 1 images have no id; empty string is used */ 4545a30b71b9SAlex Elder 45460d7dbfceSAlex Elder rbd_dev->spec->image_id = kstrdup("", GFP_KERNEL); 45470d7dbfceSAlex Elder if (!rbd_dev->spec->image_id) 4548a30b71b9SAlex Elder return -ENOMEM; 4549a30b71b9SAlex Elder 4550a30b71b9SAlex Elder /* Record the header object name for this rbd image. */ 4551a30b71b9SAlex Elder 455269e7a02fSAlex Elder size = strlen(rbd_dev->spec->image_name) + sizeof (RBD_SUFFIX); 4553a30b71b9SAlex Elder rbd_dev->header_name = kmalloc(size, GFP_KERNEL); 4554a30b71b9SAlex Elder if (!rbd_dev->header_name) { 4555a30b71b9SAlex Elder ret = -ENOMEM; 4556a30b71b9SAlex Elder goto out_err; 4557a30b71b9SAlex Elder } 45580d7dbfceSAlex Elder sprintf(rbd_dev->header_name, "%s%s", 45590d7dbfceSAlex Elder rbd_dev->spec->image_name, RBD_SUFFIX); 4560a30b71b9SAlex Elder 4561a30b71b9SAlex Elder /* Populate rbd image metadata */ 4562a30b71b9SAlex Elder 4563a30b71b9SAlex Elder ret = rbd_read_header(rbd_dev, &rbd_dev->header); 4564a30b71b9SAlex Elder if (ret < 0) 4565a30b71b9SAlex Elder goto out_err; 456686b00e0dSAlex Elder 456786b00e0dSAlex Elder /* Version 1 images have no parent (no layering) */ 456886b00e0dSAlex Elder 456986b00e0dSAlex Elder rbd_dev->parent_spec = NULL; 457086b00e0dSAlex Elder rbd_dev->parent_overlap = 0; 457186b00e0dSAlex Elder 4572a30b71b9SAlex Elder rbd_dev->image_format = 1; 4573a30b71b9SAlex Elder 4574a30b71b9SAlex Elder dout("discovered version 1 image, header name is %s\n", 4575a30b71b9SAlex Elder rbd_dev->header_name); 4576a30b71b9SAlex Elder 4577a30b71b9SAlex Elder return 0; 4578a30b71b9SAlex Elder 4579a30b71b9SAlex Elder out_err: 4580a30b71b9SAlex Elder kfree(rbd_dev->header_name); 4581a30b71b9SAlex Elder rbd_dev->header_name = NULL; 45820d7dbfceSAlex Elder kfree(rbd_dev->spec->image_id); 45830d7dbfceSAlex Elder rbd_dev->spec->image_id = NULL; 4584a30b71b9SAlex Elder 4585a30b71b9SAlex Elder return ret; 4586a30b71b9SAlex Elder } 4587a30b71b9SAlex Elder 4588a30b71b9SAlex Elder static int rbd_dev_v2_probe(struct rbd_device *rbd_dev) 4589a30b71b9SAlex Elder { 4590a30b71b9SAlex Elder size_t size; 45919d475de5SAlex Elder int ret; 45926e14b1a6SAlex Elder u64 ver = 0; 4593a30b71b9SAlex Elder 4594a30b71b9SAlex Elder /* 4595a30b71b9SAlex Elder * Image id was filled in by the caller. Record the header 4596a30b71b9SAlex Elder * object name for this rbd image. 4597a30b71b9SAlex Elder */ 4598979ed480SAlex Elder size = sizeof (RBD_HEADER_PREFIX) + strlen(rbd_dev->spec->image_id); 4599a30b71b9SAlex Elder rbd_dev->header_name = kmalloc(size, GFP_KERNEL); 4600a30b71b9SAlex Elder if (!rbd_dev->header_name) 4601a30b71b9SAlex Elder return -ENOMEM; 4602a30b71b9SAlex Elder sprintf(rbd_dev->header_name, "%s%s", 46030d7dbfceSAlex Elder RBD_HEADER_PREFIX, rbd_dev->spec->image_id); 46049d475de5SAlex Elder 46059d475de5SAlex Elder /* Get the size and object order for the image */ 46069d475de5SAlex Elder ret = rbd_dev_v2_image_size(rbd_dev); 460757385b51SAlex Elder if (ret) 46089d475de5SAlex Elder goto out_err; 46091e130199SAlex Elder 46101e130199SAlex Elder /* Get the object prefix (a.k.a. block_name) for the image */ 46111e130199SAlex Elder 46121e130199SAlex Elder ret = rbd_dev_v2_object_prefix(rbd_dev); 461357385b51SAlex Elder if (ret) 46141e130199SAlex Elder goto out_err; 4615b1b5402aSAlex Elder 4616d889140cSAlex Elder /* Get the and check features for the image */ 4617b1b5402aSAlex Elder 4618b1b5402aSAlex Elder ret = rbd_dev_v2_features(rbd_dev); 461957385b51SAlex Elder if (ret) 4620b1b5402aSAlex Elder goto out_err; 462135d489f9SAlex Elder 462286b00e0dSAlex Elder /* If the image supports layering, get the parent info */ 462386b00e0dSAlex Elder 462486b00e0dSAlex Elder if (rbd_dev->header.features & RBD_FEATURE_LAYERING) { 462586b00e0dSAlex Elder ret = rbd_dev_v2_parent_info(rbd_dev); 462657385b51SAlex Elder if (ret) 462786b00e0dSAlex Elder goto out_err; 4628770eba6eSAlex Elder rbd_warn(rbd_dev, "WARNING: kernel support for " 4629770eba6eSAlex Elder "layered rbd images is EXPERIMENTAL!"); 463086b00e0dSAlex Elder } 463186b00e0dSAlex Elder 4632cc070d59SAlex Elder /* If the image supports fancy striping, get its parameters */ 4633cc070d59SAlex Elder 4634cc070d59SAlex Elder if (rbd_dev->header.features & RBD_FEATURE_STRIPINGV2) { 4635cc070d59SAlex Elder ret = rbd_dev_v2_striping_info(rbd_dev); 4636cc070d59SAlex Elder if (ret < 0) 4637cc070d59SAlex Elder goto out_err; 4638cc070d59SAlex Elder } 4639cc070d59SAlex Elder 46406e14b1a6SAlex Elder /* crypto and compression type aren't (yet) supported for v2 images */ 464135d489f9SAlex Elder 46426e14b1a6SAlex Elder rbd_dev->header.crypt_type = 0; 46436e14b1a6SAlex Elder rbd_dev->header.comp_type = 0; 46446e14b1a6SAlex Elder 46456e14b1a6SAlex Elder /* Get the snapshot context, plus the header version */ 46466e14b1a6SAlex Elder 46476e14b1a6SAlex Elder ret = rbd_dev_v2_snap_context(rbd_dev, &ver); 464835d489f9SAlex Elder if (ret) 464935d489f9SAlex Elder goto out_err; 46506e14b1a6SAlex Elder rbd_dev->header.obj_version = ver; 46516e14b1a6SAlex Elder 4652a30b71b9SAlex Elder rbd_dev->image_format = 2; 4653a30b71b9SAlex Elder 4654a30b71b9SAlex Elder dout("discovered version 2 image, header name is %s\n", 4655a30b71b9SAlex Elder rbd_dev->header_name); 4656a30b71b9SAlex Elder 465735152979SAlex Elder return 0; 46589d475de5SAlex Elder out_err: 465986b00e0dSAlex Elder rbd_dev->parent_overlap = 0; 466086b00e0dSAlex Elder rbd_spec_put(rbd_dev->parent_spec); 466186b00e0dSAlex Elder rbd_dev->parent_spec = NULL; 46629d475de5SAlex Elder kfree(rbd_dev->header_name); 46639d475de5SAlex Elder rbd_dev->header_name = NULL; 46641e130199SAlex Elder kfree(rbd_dev->header.object_prefix); 46651e130199SAlex Elder rbd_dev->header.object_prefix = NULL; 46669d475de5SAlex Elder 46679d475de5SAlex Elder return ret; 4668a30b71b9SAlex Elder } 4669a30b71b9SAlex Elder 467083a06263SAlex Elder static int rbd_dev_probe_finish(struct rbd_device *rbd_dev) 467183a06263SAlex Elder { 46722f82ee54SAlex Elder struct rbd_device *parent = NULL; 46732f82ee54SAlex Elder struct rbd_spec *parent_spec = NULL; 46742f82ee54SAlex Elder struct rbd_client *rbdc = NULL; 467583a06263SAlex Elder int ret; 467683a06263SAlex Elder 467783a06263SAlex Elder /* no need to lock here, as rbd_dev is not registered yet */ 467883a06263SAlex Elder ret = rbd_dev_snaps_update(rbd_dev); 467983a06263SAlex Elder if (ret) 468083a06263SAlex Elder return ret; 468183a06263SAlex Elder 46829e15b77dSAlex Elder ret = rbd_dev_probe_update_spec(rbd_dev); 46839e15b77dSAlex Elder if (ret) 46849e15b77dSAlex Elder goto err_out_snaps; 46859e15b77dSAlex Elder 468683a06263SAlex Elder ret = rbd_dev_set_mapping(rbd_dev); 468783a06263SAlex Elder if (ret) 468883a06263SAlex Elder goto err_out_snaps; 468983a06263SAlex Elder 469083a06263SAlex Elder /* generate unique id: find highest unique id, add one */ 469183a06263SAlex Elder rbd_dev_id_get(rbd_dev); 469283a06263SAlex Elder 469383a06263SAlex Elder /* Fill in the device name, now that we have its id. */ 469483a06263SAlex Elder BUILD_BUG_ON(DEV_NAME_LEN 469583a06263SAlex Elder < sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH); 469683a06263SAlex Elder sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id); 469783a06263SAlex Elder 469883a06263SAlex Elder /* Get our block major device number. */ 469983a06263SAlex Elder 470083a06263SAlex Elder ret = register_blkdev(0, rbd_dev->name); 470183a06263SAlex Elder if (ret < 0) 470283a06263SAlex Elder goto err_out_id; 470383a06263SAlex Elder rbd_dev->major = ret; 470483a06263SAlex Elder 470583a06263SAlex Elder /* Set up the blkdev mapping. */ 470683a06263SAlex Elder 470783a06263SAlex Elder ret = rbd_init_disk(rbd_dev); 470883a06263SAlex Elder if (ret) 470983a06263SAlex Elder goto err_out_blkdev; 471083a06263SAlex Elder 471183a06263SAlex Elder ret = rbd_bus_add_dev(rbd_dev); 471283a06263SAlex Elder if (ret) 471383a06263SAlex Elder goto err_out_disk; 471483a06263SAlex Elder 471583a06263SAlex Elder /* 471683a06263SAlex Elder * At this point cleanup in the event of an error is the job 471783a06263SAlex Elder * of the sysfs code (initiated by rbd_bus_del_dev()). 471883a06263SAlex Elder */ 47192f82ee54SAlex Elder /* Probe the parent if there is one */ 47202f82ee54SAlex Elder 47212f82ee54SAlex Elder if (rbd_dev->parent_spec) { 47222f82ee54SAlex Elder /* 47232f82ee54SAlex Elder * We need to pass a reference to the client and the 47242f82ee54SAlex Elder * parent spec when creating the parent rbd_dev. 47252f82ee54SAlex Elder * Images related by parent/child relationships 47262f82ee54SAlex Elder * always share both. 47272f82ee54SAlex Elder */ 47282f82ee54SAlex Elder parent_spec = rbd_spec_get(rbd_dev->parent_spec); 47292f82ee54SAlex Elder rbdc = __rbd_get_client(rbd_dev->rbd_client); 47302f82ee54SAlex Elder 47312f82ee54SAlex Elder parent = rbd_dev_create(rbdc, parent_spec); 47322f82ee54SAlex Elder if (!parent) { 47332f82ee54SAlex Elder ret = -ENOMEM; 47342f82ee54SAlex Elder goto err_out_spec; 47352f82ee54SAlex Elder } 47362f82ee54SAlex Elder rbdc = NULL; /* parent now owns reference */ 47372f82ee54SAlex Elder parent_spec = NULL; /* parent now owns reference */ 47382f82ee54SAlex Elder ret = rbd_dev_probe(parent); 47392f82ee54SAlex Elder if (ret < 0) 47402f82ee54SAlex Elder goto err_out_parent; 47412f82ee54SAlex Elder rbd_dev->parent = parent; 47422f82ee54SAlex Elder } 47432f82ee54SAlex Elder 47449969ebc5SAlex Elder ret = rbd_dev_header_watch_sync(rbd_dev, 1); 474583a06263SAlex Elder if (ret) 474683a06263SAlex Elder goto err_out_bus; 474783a06263SAlex Elder 474883a06263SAlex Elder /* Everything's ready. Announce the disk to the world. */ 474983a06263SAlex Elder 475083a06263SAlex Elder add_disk(rbd_dev->disk); 475183a06263SAlex Elder 475283a06263SAlex Elder pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name, 475383a06263SAlex Elder (unsigned long long) rbd_dev->mapping.size); 475483a06263SAlex Elder 475583a06263SAlex Elder return ret; 47562f82ee54SAlex Elder 47572f82ee54SAlex Elder err_out_parent: 47582f82ee54SAlex Elder rbd_dev_destroy(parent); 47592f82ee54SAlex Elder err_out_spec: 47602f82ee54SAlex Elder rbd_spec_put(parent_spec); 47612f82ee54SAlex Elder rbd_put_client(rbdc); 476283a06263SAlex Elder err_out_bus: 476383a06263SAlex Elder /* this will also clean up rest of rbd_dev stuff */ 476483a06263SAlex Elder 476583a06263SAlex Elder rbd_bus_del_dev(rbd_dev); 476683a06263SAlex Elder 476783a06263SAlex Elder return ret; 476883a06263SAlex Elder err_out_disk: 476983a06263SAlex Elder rbd_free_disk(rbd_dev); 477083a06263SAlex Elder err_out_blkdev: 477183a06263SAlex Elder unregister_blkdev(rbd_dev->major, rbd_dev->name); 477283a06263SAlex Elder err_out_id: 477383a06263SAlex Elder rbd_dev_id_put(rbd_dev); 477483a06263SAlex Elder err_out_snaps: 477583a06263SAlex Elder rbd_remove_all_snaps(rbd_dev); 477683a06263SAlex Elder 477783a06263SAlex Elder return ret; 477883a06263SAlex Elder } 477983a06263SAlex Elder 4780a30b71b9SAlex Elder /* 4781a30b71b9SAlex Elder * Probe for the existence of the header object for the given rbd 4782a30b71b9SAlex Elder * device. For format 2 images this includes determining the image 4783a30b71b9SAlex Elder * id. 4784a30b71b9SAlex Elder */ 4785a30b71b9SAlex Elder static int rbd_dev_probe(struct rbd_device *rbd_dev) 4786a30b71b9SAlex Elder { 4787a30b71b9SAlex Elder int ret; 4788a30b71b9SAlex Elder 4789a30b71b9SAlex Elder /* 4790a30b71b9SAlex Elder * Get the id from the image id object. If it's not a 4791a30b71b9SAlex Elder * format 2 image, we'll get ENOENT back, and we'll assume 4792a30b71b9SAlex Elder * it's a format 1 image. 4793a30b71b9SAlex Elder */ 4794a30b71b9SAlex Elder ret = rbd_dev_image_id(rbd_dev); 4795a30b71b9SAlex Elder if (ret) 4796a30b71b9SAlex Elder ret = rbd_dev_v1_probe(rbd_dev); 4797a30b71b9SAlex Elder else 4798a30b71b9SAlex Elder ret = rbd_dev_v2_probe(rbd_dev); 479983a06263SAlex Elder if (ret) { 4800a30b71b9SAlex Elder dout("probe failed, returning %d\n", ret); 4801a30b71b9SAlex Elder 4802a30b71b9SAlex Elder return ret; 4803a30b71b9SAlex Elder } 4804a30b71b9SAlex Elder 480583a06263SAlex Elder ret = rbd_dev_probe_finish(rbd_dev); 480683a06263SAlex Elder if (ret) 480783a06263SAlex Elder rbd_header_free(&rbd_dev->header); 480883a06263SAlex Elder 480983a06263SAlex Elder return ret; 481083a06263SAlex Elder } 481183a06263SAlex Elder 481259c2be1eSYehuda Sadeh static ssize_t rbd_add(struct bus_type *bus, 481359c2be1eSYehuda Sadeh const char *buf, 481459c2be1eSYehuda Sadeh size_t count) 4815602adf40SYehuda Sadeh { 4816cb8627c7SAlex Elder struct rbd_device *rbd_dev = NULL; 4817dc79b113SAlex Elder struct ceph_options *ceph_opts = NULL; 48184e9afebaSAlex Elder struct rbd_options *rbd_opts = NULL; 4819859c31dfSAlex Elder struct rbd_spec *spec = NULL; 48209d3997fdSAlex Elder struct rbd_client *rbdc; 482127cc2594SAlex Elder struct ceph_osd_client *osdc; 482227cc2594SAlex Elder int rc = -ENOMEM; 4823602adf40SYehuda Sadeh 4824602adf40SYehuda Sadeh if (!try_module_get(THIS_MODULE)) 4825602adf40SYehuda Sadeh return -ENODEV; 4826602adf40SYehuda Sadeh 4827a725f65eSAlex Elder /* parse add command */ 4828859c31dfSAlex Elder rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec); 4829dc79b113SAlex Elder if (rc < 0) 4830bd4ba655SAlex Elder goto err_out_module; 4831a725f65eSAlex Elder 48329d3997fdSAlex Elder rbdc = rbd_get_client(ceph_opts); 48339d3997fdSAlex Elder if (IS_ERR(rbdc)) { 48349d3997fdSAlex Elder rc = PTR_ERR(rbdc); 48350ddebc0cSAlex Elder goto err_out_args; 48369d3997fdSAlex Elder } 4837c53d5893SAlex Elder ceph_opts = NULL; /* rbd_dev client now owns this */ 4838602adf40SYehuda Sadeh 4839602adf40SYehuda Sadeh /* pick the pool */ 48409d3997fdSAlex Elder osdc = &rbdc->client->osdc; 4841859c31dfSAlex Elder rc = ceph_pg_poolid_by_name(osdc->osdmap, spec->pool_name); 4842602adf40SYehuda Sadeh if (rc < 0) 4843602adf40SYehuda Sadeh goto err_out_client; 4844859c31dfSAlex Elder spec->pool_id = (u64) rc; 4845859c31dfSAlex Elder 48460903e875SAlex Elder /* The ceph file layout needs to fit pool id in 32 bits */ 48470903e875SAlex Elder 48480903e875SAlex Elder if (WARN_ON(spec->pool_id > (u64) U32_MAX)) { 48490903e875SAlex Elder rc = -EIO; 48500903e875SAlex Elder goto err_out_client; 48510903e875SAlex Elder } 48520903e875SAlex Elder 4853c53d5893SAlex Elder rbd_dev = rbd_dev_create(rbdc, spec); 4854bd4ba655SAlex Elder if (!rbd_dev) 4855bd4ba655SAlex Elder goto err_out_client; 4856c53d5893SAlex Elder rbdc = NULL; /* rbd_dev now owns this */ 4857c53d5893SAlex Elder spec = NULL; /* rbd_dev now owns this */ 4858602adf40SYehuda Sadeh 4859bd4ba655SAlex Elder rbd_dev->mapping.read_only = rbd_opts->read_only; 4860c53d5893SAlex Elder kfree(rbd_opts); 4861c53d5893SAlex Elder rbd_opts = NULL; /* done with this */ 4862bd4ba655SAlex Elder 4863a30b71b9SAlex Elder rc = rbd_dev_probe(rbd_dev); 4864a30b71b9SAlex Elder if (rc < 0) 4865c53d5893SAlex Elder goto err_out_rbd_dev; 486605fd6f6fSAlex Elder 4867602adf40SYehuda Sadeh return count; 4868c53d5893SAlex Elder err_out_rbd_dev: 4869c53d5893SAlex Elder rbd_dev_destroy(rbd_dev); 4870bd4ba655SAlex Elder err_out_client: 48719d3997fdSAlex Elder rbd_put_client(rbdc); 48720ddebc0cSAlex Elder err_out_args: 487378cea76eSAlex Elder if (ceph_opts) 487478cea76eSAlex Elder ceph_destroy_options(ceph_opts); 48754e9afebaSAlex Elder kfree(rbd_opts); 4876859c31dfSAlex Elder rbd_spec_put(spec); 4877bd4ba655SAlex Elder err_out_module: 4878bd4ba655SAlex Elder module_put(THIS_MODULE); 487927cc2594SAlex Elder 4880602adf40SYehuda Sadeh dout("Error adding device %s\n", buf); 488127cc2594SAlex Elder 488227cc2594SAlex Elder return (ssize_t) rc; 4883602adf40SYehuda Sadeh } 4884602adf40SYehuda Sadeh 4885de71a297SAlex Elder static struct rbd_device *__rbd_get_dev(unsigned long dev_id) 4886602adf40SYehuda Sadeh { 4887602adf40SYehuda Sadeh struct list_head *tmp; 4888602adf40SYehuda Sadeh struct rbd_device *rbd_dev; 4889602adf40SYehuda Sadeh 4890e124a82fSAlex Elder spin_lock(&rbd_dev_list_lock); 4891602adf40SYehuda Sadeh list_for_each(tmp, &rbd_dev_list) { 4892602adf40SYehuda Sadeh rbd_dev = list_entry(tmp, struct rbd_device, node); 4893de71a297SAlex Elder if (rbd_dev->dev_id == dev_id) { 4894e124a82fSAlex Elder spin_unlock(&rbd_dev_list_lock); 4895602adf40SYehuda Sadeh return rbd_dev; 4896602adf40SYehuda Sadeh } 4897e124a82fSAlex Elder } 4898e124a82fSAlex Elder spin_unlock(&rbd_dev_list_lock); 4899602adf40SYehuda Sadeh return NULL; 4900602adf40SYehuda Sadeh } 4901602adf40SYehuda Sadeh 4902dfc5606dSYehuda Sadeh static void rbd_dev_release(struct device *dev) 4903602adf40SYehuda Sadeh { 4904593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 4905602adf40SYehuda Sadeh 490659c2be1eSYehuda Sadeh if (rbd_dev->watch_event) 49079969ebc5SAlex Elder rbd_dev_header_watch_sync(rbd_dev, 0); 4908602adf40SYehuda Sadeh 4909602adf40SYehuda Sadeh /* clean up and free blkdev */ 4910602adf40SYehuda Sadeh rbd_free_disk(rbd_dev); 4911602adf40SYehuda Sadeh unregister_blkdev(rbd_dev->major, rbd_dev->name); 491232eec68dSAlex Elder 49132ac4e75dSAlex Elder /* release allocated disk header fields */ 49142ac4e75dSAlex Elder rbd_header_free(&rbd_dev->header); 49152ac4e75dSAlex Elder 491632eec68dSAlex Elder /* done with the id, and with the rbd_dev */ 4917e2839308SAlex Elder rbd_dev_id_put(rbd_dev); 4918c53d5893SAlex Elder rbd_assert(rbd_dev->rbd_client != NULL); 4919c53d5893SAlex Elder rbd_dev_destroy(rbd_dev); 4920602adf40SYehuda Sadeh 4921602adf40SYehuda Sadeh /* release module ref */ 4922602adf40SYehuda Sadeh module_put(THIS_MODULE); 4923602adf40SYehuda Sadeh } 4924602adf40SYehuda Sadeh 49252f82ee54SAlex Elder static void __rbd_remove(struct rbd_device *rbd_dev) 49262f82ee54SAlex Elder { 49272f82ee54SAlex Elder rbd_remove_all_snaps(rbd_dev); 49282f82ee54SAlex Elder rbd_bus_del_dev(rbd_dev); 49292f82ee54SAlex Elder } 49302f82ee54SAlex Elder 4931dfc5606dSYehuda Sadeh static ssize_t rbd_remove(struct bus_type *bus, 4932602adf40SYehuda Sadeh const char *buf, 4933602adf40SYehuda Sadeh size_t count) 4934602adf40SYehuda Sadeh { 4935602adf40SYehuda Sadeh struct rbd_device *rbd_dev = NULL; 4936602adf40SYehuda Sadeh int target_id, rc; 4937602adf40SYehuda Sadeh unsigned long ul; 4938602adf40SYehuda Sadeh int ret = count; 4939602adf40SYehuda Sadeh 4940602adf40SYehuda Sadeh rc = strict_strtoul(buf, 10, &ul); 4941602adf40SYehuda Sadeh if (rc) 4942602adf40SYehuda Sadeh return rc; 4943602adf40SYehuda Sadeh 4944602adf40SYehuda Sadeh /* convert to int; abort if we lost anything in the conversion */ 4945602adf40SYehuda Sadeh target_id = (int) ul; 4946602adf40SYehuda Sadeh if (target_id != ul) 4947602adf40SYehuda Sadeh return -EINVAL; 4948602adf40SYehuda Sadeh 4949602adf40SYehuda Sadeh mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); 4950602adf40SYehuda Sadeh 4951602adf40SYehuda Sadeh rbd_dev = __rbd_get_dev(target_id); 4952602adf40SYehuda Sadeh if (!rbd_dev) { 4953602adf40SYehuda Sadeh ret = -ENOENT; 4954602adf40SYehuda Sadeh goto done; 4955602adf40SYehuda Sadeh } 4956602adf40SYehuda Sadeh 4957a14ea269SAlex Elder spin_lock_irq(&rbd_dev->lock); 4958b82d167bSAlex Elder if (rbd_dev->open_count) 495942382b70SAlex Elder ret = -EBUSY; 4960b82d167bSAlex Elder else 4961b82d167bSAlex Elder set_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags); 4962a14ea269SAlex Elder spin_unlock_irq(&rbd_dev->lock); 4963b82d167bSAlex Elder if (ret < 0) 496442382b70SAlex Elder goto done; 496542382b70SAlex Elder 49662f82ee54SAlex Elder while (rbd_dev->parent_spec) { 49672f82ee54SAlex Elder struct rbd_device *first = rbd_dev; 49682f82ee54SAlex Elder struct rbd_device *second = first->parent; 49692f82ee54SAlex Elder struct rbd_device *third; 49702f82ee54SAlex Elder 49712f82ee54SAlex Elder /* 49722f82ee54SAlex Elder * Follow to the parent with no grandparent and 49732f82ee54SAlex Elder * remove it. 49742f82ee54SAlex Elder */ 49752f82ee54SAlex Elder while (second && (third = second->parent)) { 49762f82ee54SAlex Elder first = second; 49772f82ee54SAlex Elder second = third; 49782f82ee54SAlex Elder } 49792f82ee54SAlex Elder __rbd_remove(second); 49802f82ee54SAlex Elder rbd_spec_put(first->parent_spec); 49812f82ee54SAlex Elder first->parent_spec = NULL; 49822f82ee54SAlex Elder first->parent_overlap = 0; 49832f82ee54SAlex Elder first->parent = NULL; 49842f82ee54SAlex Elder } 49852f82ee54SAlex Elder __rbd_remove(rbd_dev); 4986602adf40SYehuda Sadeh 4987602adf40SYehuda Sadeh done: 4988602adf40SYehuda Sadeh mutex_unlock(&ctl_mutex); 4989aafb230eSAlex Elder 4990602adf40SYehuda Sadeh return ret; 4991602adf40SYehuda Sadeh } 4992602adf40SYehuda Sadeh 4993602adf40SYehuda Sadeh /* 4994602adf40SYehuda Sadeh * create control files in sysfs 4995dfc5606dSYehuda Sadeh * /sys/bus/rbd/... 4996602adf40SYehuda Sadeh */ 4997602adf40SYehuda Sadeh static int rbd_sysfs_init(void) 4998602adf40SYehuda Sadeh { 4999dfc5606dSYehuda Sadeh int ret; 5000602adf40SYehuda Sadeh 5001fed4c143SAlex Elder ret = device_register(&rbd_root_dev); 5002dfc5606dSYehuda Sadeh if (ret < 0) 5003dfc5606dSYehuda Sadeh return ret; 5004602adf40SYehuda Sadeh 5005fed4c143SAlex Elder ret = bus_register(&rbd_bus_type); 5006fed4c143SAlex Elder if (ret < 0) 5007fed4c143SAlex Elder device_unregister(&rbd_root_dev); 5008602adf40SYehuda Sadeh 5009602adf40SYehuda Sadeh return ret; 5010602adf40SYehuda Sadeh } 5011602adf40SYehuda Sadeh 5012602adf40SYehuda Sadeh static void rbd_sysfs_cleanup(void) 5013602adf40SYehuda Sadeh { 5014dfc5606dSYehuda Sadeh bus_unregister(&rbd_bus_type); 5015fed4c143SAlex Elder device_unregister(&rbd_root_dev); 5016602adf40SYehuda Sadeh } 5017602adf40SYehuda Sadeh 5018cc344fa1SAlex Elder static int __init rbd_init(void) 5019602adf40SYehuda Sadeh { 5020602adf40SYehuda Sadeh int rc; 5021602adf40SYehuda Sadeh 50221e32d34cSAlex Elder if (!libceph_compatible(NULL)) { 50231e32d34cSAlex Elder rbd_warn(NULL, "libceph incompatibility (quitting)"); 50241e32d34cSAlex Elder 50251e32d34cSAlex Elder return -EINVAL; 50261e32d34cSAlex Elder } 5027602adf40SYehuda Sadeh rc = rbd_sysfs_init(); 5028602adf40SYehuda Sadeh if (rc) 5029602adf40SYehuda Sadeh return rc; 5030f0f8cef5SAlex Elder pr_info("loaded " RBD_DRV_NAME_LONG "\n"); 5031602adf40SYehuda Sadeh return 0; 5032602adf40SYehuda Sadeh } 5033602adf40SYehuda Sadeh 5034cc344fa1SAlex Elder static void __exit rbd_exit(void) 5035602adf40SYehuda Sadeh { 5036602adf40SYehuda Sadeh rbd_sysfs_cleanup(); 5037602adf40SYehuda Sadeh } 5038602adf40SYehuda Sadeh 5039602adf40SYehuda Sadeh module_init(rbd_init); 5040602adf40SYehuda Sadeh module_exit(rbd_exit); 5041602adf40SYehuda Sadeh 5042602adf40SYehuda Sadeh MODULE_AUTHOR("Sage Weil <sage@newdream.net>"); 5043602adf40SYehuda Sadeh MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>"); 5044602adf40SYehuda Sadeh MODULE_DESCRIPTION("rados block device"); 5045602adf40SYehuda Sadeh 5046602adf40SYehuda Sadeh /* following authorship retained from original osdblk.c */ 5047602adf40SYehuda Sadeh MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>"); 5048602adf40SYehuda Sadeh 5049602adf40SYehuda Sadeh MODULE_LICENSE("GPL"); 5050