1602adf40SYehuda Sadeh /* 2602adf40SYehuda Sadeh rbd.c -- Export ceph rados objects as a Linux block device 3602adf40SYehuda Sadeh 4602adf40SYehuda Sadeh 5602adf40SYehuda Sadeh based on drivers/block/osdblk.c: 6602adf40SYehuda Sadeh 7602adf40SYehuda Sadeh Copyright 2009 Red Hat, Inc. 8602adf40SYehuda Sadeh 9602adf40SYehuda Sadeh This program is free software; you can redistribute it and/or modify 10602adf40SYehuda Sadeh it under the terms of the GNU General Public License as published by 11602adf40SYehuda Sadeh the Free Software Foundation. 12602adf40SYehuda Sadeh 13602adf40SYehuda Sadeh This program is distributed in the hope that it will be useful, 14602adf40SYehuda Sadeh but WITHOUT ANY WARRANTY; without even the implied warranty of 15602adf40SYehuda Sadeh MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16602adf40SYehuda Sadeh GNU General Public License for more details. 17602adf40SYehuda Sadeh 18602adf40SYehuda Sadeh You should have received a copy of the GNU General Public License 19602adf40SYehuda Sadeh along with this program; see the file COPYING. If not, write to 20602adf40SYehuda Sadeh the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. 21602adf40SYehuda Sadeh 22602adf40SYehuda Sadeh 23602adf40SYehuda Sadeh 24dfc5606dSYehuda Sadeh For usage instructions, please refer to: 25602adf40SYehuda Sadeh 26dfc5606dSYehuda Sadeh Documentation/ABI/testing/sysfs-bus-rbd 27602adf40SYehuda Sadeh 28602adf40SYehuda Sadeh */ 29602adf40SYehuda Sadeh 30602adf40SYehuda Sadeh #include <linux/ceph/libceph.h> 31602adf40SYehuda Sadeh #include <linux/ceph/osd_client.h> 32602adf40SYehuda Sadeh #include <linux/ceph/mon_client.h> 33602adf40SYehuda Sadeh #include <linux/ceph/decode.h> 3459c2be1eSYehuda Sadeh #include <linux/parser.h> 35602adf40SYehuda Sadeh 36602adf40SYehuda Sadeh #include <linux/kernel.h> 37602adf40SYehuda Sadeh #include <linux/device.h> 38602adf40SYehuda Sadeh #include <linux/module.h> 39602adf40SYehuda Sadeh #include <linux/fs.h> 40602adf40SYehuda Sadeh #include <linux/blkdev.h> 41602adf40SYehuda Sadeh 42602adf40SYehuda Sadeh #include "rbd_types.h" 43602adf40SYehuda Sadeh 44aafb230eSAlex Elder #define RBD_DEBUG /* Activate rbd_assert() calls */ 45aafb230eSAlex Elder 46593a9e7bSAlex Elder /* 47593a9e7bSAlex Elder * The basic unit of block I/O is a sector. It is interpreted in a 48593a9e7bSAlex Elder * number of contexts in Linux (blk, bio, genhd), but the default is 49593a9e7bSAlex Elder * universally 512 bytes. These symbols are just slightly more 50593a9e7bSAlex Elder * meaningful than the bare numbers they represent. 51593a9e7bSAlex Elder */ 52593a9e7bSAlex Elder #define SECTOR_SHIFT 9 53593a9e7bSAlex Elder #define SECTOR_SIZE (1ULL << SECTOR_SHIFT) 54593a9e7bSAlex Elder 55f0f8cef5SAlex Elder #define RBD_DRV_NAME "rbd" 56f0f8cef5SAlex Elder #define RBD_DRV_NAME_LONG "rbd (rados block device)" 57602adf40SYehuda Sadeh 58602adf40SYehuda Sadeh #define RBD_MINORS_PER_MAJOR 256 /* max minors per blkdev */ 59602adf40SYehuda Sadeh 60d4b125e9SAlex Elder #define RBD_SNAP_DEV_NAME_PREFIX "snap_" 61d4b125e9SAlex Elder #define RBD_MAX_SNAP_NAME_LEN \ 62d4b125e9SAlex Elder (NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1)) 63d4b125e9SAlex Elder 6435d489f9SAlex Elder #define RBD_MAX_SNAP_COUNT 510 /* allows max snapc to fit in 4KB */ 65602adf40SYehuda Sadeh 66602adf40SYehuda Sadeh #define RBD_SNAP_HEAD_NAME "-" 67602adf40SYehuda Sadeh 689e15b77dSAlex Elder /* This allows a single page to hold an image name sent by OSD */ 699e15b77dSAlex Elder #define RBD_IMAGE_NAME_LEN_MAX (PAGE_SIZE - sizeof (__le32) - 1) 70589d30e0SAlex Elder #define RBD_IMAGE_ID_LEN_MAX 64 719e15b77dSAlex Elder 721e130199SAlex Elder #define RBD_OBJ_PREFIX_LEN_MAX 64 73589d30e0SAlex Elder 74d889140cSAlex Elder /* Feature bits */ 75d889140cSAlex Elder 765cbf6f12SAlex Elder #define RBD_FEATURE_LAYERING (1<<0) 775cbf6f12SAlex Elder #define RBD_FEATURE_STRIPINGV2 (1<<1) 785cbf6f12SAlex Elder #define RBD_FEATURES_ALL \ 795cbf6f12SAlex Elder (RBD_FEATURE_LAYERING | RBD_FEATURE_STRIPINGV2) 80d889140cSAlex Elder 81d889140cSAlex Elder /* Features supported by this (client software) implementation. */ 82d889140cSAlex Elder 835cbf6f12SAlex Elder #define RBD_FEATURES_SUPPORTED (0) 84d889140cSAlex Elder 8581a89793SAlex Elder /* 8681a89793SAlex Elder * An RBD device name will be "rbd#", where the "rbd" comes from 8781a89793SAlex Elder * RBD_DRV_NAME above, and # is a unique integer identifier. 8881a89793SAlex Elder * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big 8981a89793SAlex Elder * enough to hold all possible device names. 9081a89793SAlex Elder */ 91602adf40SYehuda Sadeh #define DEV_NAME_LEN 32 9281a89793SAlex Elder #define MAX_INT_FORMAT_WIDTH ((5 * sizeof (int)) / 2 + 1) 93602adf40SYehuda Sadeh 94602adf40SYehuda Sadeh /* 95602adf40SYehuda Sadeh * block device image metadata (in-memory version) 96602adf40SYehuda Sadeh */ 97602adf40SYehuda Sadeh struct rbd_image_header { 98f84344f3SAlex Elder /* These four fields never change for a given rbd image */ 99849b4260SAlex Elder char *object_prefix; 10034b13184SAlex Elder u64 features; 101602adf40SYehuda Sadeh __u8 obj_order; 102602adf40SYehuda Sadeh __u8 crypt_type; 103602adf40SYehuda Sadeh __u8 comp_type; 104602adf40SYehuda Sadeh 105f84344f3SAlex Elder /* The remaining fields need to be updated occasionally */ 106f84344f3SAlex Elder u64 image_size; 107f84344f3SAlex Elder struct ceph_snap_context *snapc; 108602adf40SYehuda Sadeh char *snap_names; 109602adf40SYehuda Sadeh u64 *snap_sizes; 11059c2be1eSYehuda Sadeh 11159c2be1eSYehuda Sadeh u64 obj_version; 11259c2be1eSYehuda Sadeh }; 11359c2be1eSYehuda Sadeh 1140d7dbfceSAlex Elder /* 1150d7dbfceSAlex Elder * An rbd image specification. 1160d7dbfceSAlex Elder * 1170d7dbfceSAlex Elder * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely 118c66c6e0cSAlex Elder * identify an image. Each rbd_dev structure includes a pointer to 119c66c6e0cSAlex Elder * an rbd_spec structure that encapsulates this identity. 120c66c6e0cSAlex Elder * 121c66c6e0cSAlex Elder * Each of the id's in an rbd_spec has an associated name. For a 122c66c6e0cSAlex Elder * user-mapped image, the names are supplied and the id's associated 123c66c6e0cSAlex Elder * with them are looked up. For a layered image, a parent image is 124c66c6e0cSAlex Elder * defined by the tuple, and the names are looked up. 125c66c6e0cSAlex Elder * 126c66c6e0cSAlex Elder * An rbd_dev structure contains a parent_spec pointer which is 127c66c6e0cSAlex Elder * non-null if the image it represents is a child in a layered 128c66c6e0cSAlex Elder * image. This pointer will refer to the rbd_spec structure used 129c66c6e0cSAlex Elder * by the parent rbd_dev for its own identity (i.e., the structure 130c66c6e0cSAlex Elder * is shared between the parent and child). 131c66c6e0cSAlex Elder * 132c66c6e0cSAlex Elder * Since these structures are populated once, during the discovery 133c66c6e0cSAlex Elder * phase of image construction, they are effectively immutable so 134c66c6e0cSAlex Elder * we make no effort to synchronize access to them. 135c66c6e0cSAlex Elder * 136c66c6e0cSAlex Elder * Note that code herein does not assume the image name is known (it 137c66c6e0cSAlex Elder * could be a null pointer). 1380d7dbfceSAlex Elder */ 1390d7dbfceSAlex Elder struct rbd_spec { 1400d7dbfceSAlex Elder u64 pool_id; 1410d7dbfceSAlex Elder char *pool_name; 1420d7dbfceSAlex Elder 1430d7dbfceSAlex Elder char *image_id; 1440d7dbfceSAlex Elder char *image_name; 1450d7dbfceSAlex Elder 1460d7dbfceSAlex Elder u64 snap_id; 1470d7dbfceSAlex Elder char *snap_name; 1480d7dbfceSAlex Elder 1490d7dbfceSAlex Elder struct kref kref; 1500d7dbfceSAlex Elder }; 1510d7dbfceSAlex Elder 152602adf40SYehuda Sadeh /* 153f0f8cef5SAlex Elder * an instance of the client. multiple devices may share an rbd client. 154602adf40SYehuda Sadeh */ 155602adf40SYehuda Sadeh struct rbd_client { 156602adf40SYehuda Sadeh struct ceph_client *client; 157602adf40SYehuda Sadeh struct kref kref; 158602adf40SYehuda Sadeh struct list_head node; 159602adf40SYehuda Sadeh }; 160602adf40SYehuda Sadeh 161bf0d5f50SAlex Elder struct rbd_img_request; 162bf0d5f50SAlex Elder typedef void (*rbd_img_callback_t)(struct rbd_img_request *); 163bf0d5f50SAlex Elder 164bf0d5f50SAlex Elder #define BAD_WHICH U32_MAX /* Good which or bad which, which? */ 165bf0d5f50SAlex Elder 166bf0d5f50SAlex Elder struct rbd_obj_request; 167bf0d5f50SAlex Elder typedef void (*rbd_obj_callback_t)(struct rbd_obj_request *); 168bf0d5f50SAlex Elder 1699969ebc5SAlex Elder enum obj_request_type { 1709969ebc5SAlex Elder OBJ_REQUEST_NODATA, OBJ_REQUEST_BIO, OBJ_REQUEST_PAGES 1719969ebc5SAlex Elder }; 172bf0d5f50SAlex Elder 173926f9b3fSAlex Elder enum obj_req_flags { 174926f9b3fSAlex Elder OBJ_REQ_DONE, /* completion flag: not done = 0, done = 1 */ 1756365d33aSAlex Elder OBJ_REQ_IMG_DATA, /* object usage: standalone = 0, image = 1 */ 1765679c59fSAlex Elder OBJ_REQ_KNOWN, /* EXISTS flag valid: no = 0, yes = 1 */ 1775679c59fSAlex Elder OBJ_REQ_EXISTS, /* target exists: no = 0, yes = 1 */ 178926f9b3fSAlex Elder }; 179926f9b3fSAlex Elder 180bf0d5f50SAlex Elder struct rbd_obj_request { 181bf0d5f50SAlex Elder const char *object_name; 182bf0d5f50SAlex Elder u64 offset; /* object start byte */ 183bf0d5f50SAlex Elder u64 length; /* bytes from offset */ 184926f9b3fSAlex Elder unsigned long flags; 185bf0d5f50SAlex Elder 186c5b5ef6cSAlex Elder /* 187c5b5ef6cSAlex Elder * An object request associated with an image will have its 188c5b5ef6cSAlex Elder * img_data flag set; a standalone object request will not. 189c5b5ef6cSAlex Elder * 190c5b5ef6cSAlex Elder * A standalone object request will have which == BAD_WHICH 191c5b5ef6cSAlex Elder * and a null obj_request pointer. 192c5b5ef6cSAlex Elder * 193c5b5ef6cSAlex Elder * An object request initiated in support of a layered image 194c5b5ef6cSAlex Elder * object (to check for its existence before a write) will 195c5b5ef6cSAlex Elder * have which == BAD_WHICH and a non-null obj_request pointer. 196c5b5ef6cSAlex Elder * 197c5b5ef6cSAlex Elder * Finally, an object request for rbd image data will have 198c5b5ef6cSAlex Elder * which != BAD_WHICH, and will have a non-null img_request 199c5b5ef6cSAlex Elder * pointer. The value of which will be in the range 200c5b5ef6cSAlex Elder * 0..(img_request->obj_request_count-1). 201c5b5ef6cSAlex Elder */ 202c5b5ef6cSAlex Elder union { 203c5b5ef6cSAlex Elder struct rbd_obj_request *obj_request; /* STAT op */ 204c5b5ef6cSAlex Elder struct { 205bf0d5f50SAlex Elder struct rbd_img_request *img_request; 206c5b5ef6cSAlex Elder u64 img_offset; 207c5b5ef6cSAlex Elder /* links for img_request->obj_requests list */ 208c5b5ef6cSAlex Elder struct list_head links; 209c5b5ef6cSAlex Elder }; 210c5b5ef6cSAlex Elder }; 211bf0d5f50SAlex Elder u32 which; /* posn image request list */ 212bf0d5f50SAlex Elder 213bf0d5f50SAlex Elder enum obj_request_type type; 214788e2df3SAlex Elder union { 215bf0d5f50SAlex Elder struct bio *bio_list; 216788e2df3SAlex Elder struct { 217788e2df3SAlex Elder struct page **pages; 218788e2df3SAlex Elder u32 page_count; 219788e2df3SAlex Elder }; 220788e2df3SAlex Elder }; 221bf0d5f50SAlex Elder 222bf0d5f50SAlex Elder struct ceph_osd_request *osd_req; 223bf0d5f50SAlex Elder 224bf0d5f50SAlex Elder u64 xferred; /* bytes transferred */ 225bf0d5f50SAlex Elder u64 version; 2261b83bef2SSage Weil int result; 227bf0d5f50SAlex Elder 228bf0d5f50SAlex Elder rbd_obj_callback_t callback; 229788e2df3SAlex Elder struct completion completion; 230bf0d5f50SAlex Elder 231bf0d5f50SAlex Elder struct kref kref; 232bf0d5f50SAlex Elder }; 233bf0d5f50SAlex Elder 2340c425248SAlex Elder enum img_req_flags { 2359849e986SAlex Elder IMG_REQ_WRITE, /* I/O direction: read = 0, write = 1 */ 2369849e986SAlex Elder IMG_REQ_CHILD, /* initiator: block = 0, child image = 1 */ 237d0b2e944SAlex Elder IMG_REQ_LAYERED, /* ENOENT handling: normal = 0, layered = 1 */ 2380c425248SAlex Elder }; 2390c425248SAlex Elder 240bf0d5f50SAlex Elder struct rbd_img_request { 241bf0d5f50SAlex Elder struct rbd_device *rbd_dev; 242bf0d5f50SAlex Elder u64 offset; /* starting image byte offset */ 243bf0d5f50SAlex Elder u64 length; /* byte count from offset */ 2440c425248SAlex Elder unsigned long flags; 245bf0d5f50SAlex Elder union { 246bf0d5f50SAlex Elder u64 snap_id; /* for reads */ 2479849e986SAlex Elder struct ceph_snap_context *snapc; /* for writes */ 2489849e986SAlex Elder }; 2499849e986SAlex Elder union { 2509849e986SAlex Elder struct request *rq; /* block request */ 2519849e986SAlex Elder struct rbd_obj_request *obj_request; /* obj req initiator */ 252bf0d5f50SAlex Elder }; 253bf0d5f50SAlex Elder spinlock_t completion_lock;/* protects next_completion */ 254bf0d5f50SAlex Elder u32 next_completion; 255bf0d5f50SAlex Elder rbd_img_callback_t callback; 25655f27e09SAlex Elder u64 xferred;/* aggregate bytes transferred */ 257a5a337d4SAlex Elder int result; /* first nonzero obj_request result */ 258bf0d5f50SAlex Elder 259bf0d5f50SAlex Elder u32 obj_request_count; 260bf0d5f50SAlex Elder struct list_head obj_requests; /* rbd_obj_request structs */ 261bf0d5f50SAlex Elder 262bf0d5f50SAlex Elder struct kref kref; 263bf0d5f50SAlex Elder }; 264bf0d5f50SAlex Elder 265bf0d5f50SAlex Elder #define for_each_obj_request(ireq, oreq) \ 266ef06f4d3SAlex Elder list_for_each_entry(oreq, &(ireq)->obj_requests, links) 267bf0d5f50SAlex Elder #define for_each_obj_request_from(ireq, oreq) \ 268ef06f4d3SAlex Elder list_for_each_entry_from(oreq, &(ireq)->obj_requests, links) 269bf0d5f50SAlex Elder #define for_each_obj_request_safe(ireq, oreq, n) \ 270ef06f4d3SAlex Elder list_for_each_entry_safe_reverse(oreq, n, &(ireq)->obj_requests, links) 271bf0d5f50SAlex Elder 272dfc5606dSYehuda Sadeh struct rbd_snap { 273dfc5606dSYehuda Sadeh struct device dev; 274dfc5606dSYehuda Sadeh const char *name; 2753591538fSJosh Durgin u64 size; 276dfc5606dSYehuda Sadeh struct list_head node; 277dfc5606dSYehuda Sadeh u64 id; 27834b13184SAlex Elder u64 features; 279dfc5606dSYehuda Sadeh }; 280dfc5606dSYehuda Sadeh 281f84344f3SAlex Elder struct rbd_mapping { 28299c1f08fSAlex Elder u64 size; 28334b13184SAlex Elder u64 features; 284f84344f3SAlex Elder bool read_only; 285f84344f3SAlex Elder }; 286f84344f3SAlex Elder 287602adf40SYehuda Sadeh /* 288602adf40SYehuda Sadeh * a single device 289602adf40SYehuda Sadeh */ 290602adf40SYehuda Sadeh struct rbd_device { 291de71a297SAlex Elder int dev_id; /* blkdev unique id */ 292602adf40SYehuda Sadeh 293602adf40SYehuda Sadeh int major; /* blkdev assigned major */ 294602adf40SYehuda Sadeh struct gendisk *disk; /* blkdev's gendisk and rq */ 295602adf40SYehuda Sadeh 296a30b71b9SAlex Elder u32 image_format; /* Either 1 or 2 */ 297602adf40SYehuda Sadeh struct rbd_client *rbd_client; 298602adf40SYehuda Sadeh 299602adf40SYehuda Sadeh char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */ 300602adf40SYehuda Sadeh 301b82d167bSAlex Elder spinlock_t lock; /* queue, flags, open_count */ 302602adf40SYehuda Sadeh 303602adf40SYehuda Sadeh struct rbd_image_header header; 304b82d167bSAlex Elder unsigned long flags; /* possibly lock protected */ 3050d7dbfceSAlex Elder struct rbd_spec *spec; 306602adf40SYehuda Sadeh 3070d7dbfceSAlex Elder char *header_name; 308971f839aSAlex Elder 3090903e875SAlex Elder struct ceph_file_layout layout; 3100903e875SAlex Elder 31159c2be1eSYehuda Sadeh struct ceph_osd_event *watch_event; 312975241afSAlex Elder struct rbd_obj_request *watch_request; 31359c2be1eSYehuda Sadeh 31486b00e0dSAlex Elder struct rbd_spec *parent_spec; 31586b00e0dSAlex Elder u64 parent_overlap; 3162f82ee54SAlex Elder struct rbd_device *parent; 31786b00e0dSAlex Elder 318c666601aSJosh Durgin /* protects updating the header */ 319c666601aSJosh Durgin struct rw_semaphore header_rwsem; 320f84344f3SAlex Elder 321f84344f3SAlex Elder struct rbd_mapping mapping; 322602adf40SYehuda Sadeh 323602adf40SYehuda Sadeh struct list_head node; 324dfc5606dSYehuda Sadeh 325dfc5606dSYehuda Sadeh /* list of snapshots */ 326dfc5606dSYehuda Sadeh struct list_head snaps; 327dfc5606dSYehuda Sadeh 328dfc5606dSYehuda Sadeh /* sysfs related */ 329dfc5606dSYehuda Sadeh struct device dev; 330b82d167bSAlex Elder unsigned long open_count; /* protected by lock */ 331dfc5606dSYehuda Sadeh }; 332dfc5606dSYehuda Sadeh 333b82d167bSAlex Elder /* 334b82d167bSAlex Elder * Flag bits for rbd_dev->flags. If atomicity is required, 335b82d167bSAlex Elder * rbd_dev->lock is used to protect access. 336b82d167bSAlex Elder * 337b82d167bSAlex Elder * Currently, only the "removing" flag (which is coupled with the 338b82d167bSAlex Elder * "open_count" field) requires atomic access. 339b82d167bSAlex Elder */ 3406d292906SAlex Elder enum rbd_dev_flags { 3416d292906SAlex Elder RBD_DEV_FLAG_EXISTS, /* mapped snapshot has not been deleted */ 342b82d167bSAlex Elder RBD_DEV_FLAG_REMOVING, /* this mapping is being removed */ 3436d292906SAlex Elder }; 3446d292906SAlex Elder 345602adf40SYehuda Sadeh static DEFINE_MUTEX(ctl_mutex); /* Serialize open/close/setup/teardown */ 346e124a82fSAlex Elder 347602adf40SYehuda Sadeh static LIST_HEAD(rbd_dev_list); /* devices */ 348e124a82fSAlex Elder static DEFINE_SPINLOCK(rbd_dev_list_lock); 349e124a82fSAlex Elder 350602adf40SYehuda Sadeh static LIST_HEAD(rbd_client_list); /* clients */ 351432b8587SAlex Elder static DEFINE_SPINLOCK(rbd_client_list_lock); 352602adf40SYehuda Sadeh 353304f6808SAlex Elder static int rbd_dev_snaps_update(struct rbd_device *rbd_dev); 354304f6808SAlex Elder static int rbd_dev_snaps_register(struct rbd_device *rbd_dev); 355304f6808SAlex Elder 356dfc5606dSYehuda Sadeh static void rbd_dev_release(struct device *dev); 35741f38c2bSAlex Elder static void rbd_remove_snap_dev(struct rbd_snap *snap); 358dfc5606dSYehuda Sadeh 359f0f8cef5SAlex Elder static ssize_t rbd_add(struct bus_type *bus, const char *buf, 360f0f8cef5SAlex Elder size_t count); 361f0f8cef5SAlex Elder static ssize_t rbd_remove(struct bus_type *bus, const char *buf, 362f0f8cef5SAlex Elder size_t count); 3632f82ee54SAlex Elder static int rbd_dev_probe(struct rbd_device *rbd_dev); 364f0f8cef5SAlex Elder 365f0f8cef5SAlex Elder static struct bus_attribute rbd_bus_attrs[] = { 366f0f8cef5SAlex Elder __ATTR(add, S_IWUSR, NULL, rbd_add), 367f0f8cef5SAlex Elder __ATTR(remove, S_IWUSR, NULL, rbd_remove), 368f0f8cef5SAlex Elder __ATTR_NULL 369f0f8cef5SAlex Elder }; 370f0f8cef5SAlex Elder 371f0f8cef5SAlex Elder static struct bus_type rbd_bus_type = { 372f0f8cef5SAlex Elder .name = "rbd", 373f0f8cef5SAlex Elder .bus_attrs = rbd_bus_attrs, 374f0f8cef5SAlex Elder }; 375f0f8cef5SAlex Elder 376f0f8cef5SAlex Elder static void rbd_root_dev_release(struct device *dev) 377f0f8cef5SAlex Elder { 378f0f8cef5SAlex Elder } 379f0f8cef5SAlex Elder 380f0f8cef5SAlex Elder static struct device rbd_root_dev = { 381f0f8cef5SAlex Elder .init_name = "rbd", 382f0f8cef5SAlex Elder .release = rbd_root_dev_release, 383f0f8cef5SAlex Elder }; 384f0f8cef5SAlex Elder 38506ecc6cbSAlex Elder static __printf(2, 3) 38606ecc6cbSAlex Elder void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...) 38706ecc6cbSAlex Elder { 38806ecc6cbSAlex Elder struct va_format vaf; 38906ecc6cbSAlex Elder va_list args; 39006ecc6cbSAlex Elder 39106ecc6cbSAlex Elder va_start(args, fmt); 39206ecc6cbSAlex Elder vaf.fmt = fmt; 39306ecc6cbSAlex Elder vaf.va = &args; 39406ecc6cbSAlex Elder 39506ecc6cbSAlex Elder if (!rbd_dev) 39606ecc6cbSAlex Elder printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf); 39706ecc6cbSAlex Elder else if (rbd_dev->disk) 39806ecc6cbSAlex Elder printk(KERN_WARNING "%s: %s: %pV\n", 39906ecc6cbSAlex Elder RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf); 40006ecc6cbSAlex Elder else if (rbd_dev->spec && rbd_dev->spec->image_name) 40106ecc6cbSAlex Elder printk(KERN_WARNING "%s: image %s: %pV\n", 40206ecc6cbSAlex Elder RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf); 40306ecc6cbSAlex Elder else if (rbd_dev->spec && rbd_dev->spec->image_id) 40406ecc6cbSAlex Elder printk(KERN_WARNING "%s: id %s: %pV\n", 40506ecc6cbSAlex Elder RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf); 40606ecc6cbSAlex Elder else /* punt */ 40706ecc6cbSAlex Elder printk(KERN_WARNING "%s: rbd_dev %p: %pV\n", 40806ecc6cbSAlex Elder RBD_DRV_NAME, rbd_dev, &vaf); 40906ecc6cbSAlex Elder va_end(args); 41006ecc6cbSAlex Elder } 41106ecc6cbSAlex Elder 412aafb230eSAlex Elder #ifdef RBD_DEBUG 413aafb230eSAlex Elder #define rbd_assert(expr) \ 414aafb230eSAlex Elder if (unlikely(!(expr))) { \ 415aafb230eSAlex Elder printk(KERN_ERR "\nAssertion failure in %s() " \ 416aafb230eSAlex Elder "at line %d:\n\n" \ 417aafb230eSAlex Elder "\trbd_assert(%s);\n\n", \ 418aafb230eSAlex Elder __func__, __LINE__, #expr); \ 419aafb230eSAlex Elder BUG(); \ 420aafb230eSAlex Elder } 421aafb230eSAlex Elder #else /* !RBD_DEBUG */ 422aafb230eSAlex Elder # define rbd_assert(expr) ((void) 0) 423aafb230eSAlex Elder #endif /* !RBD_DEBUG */ 424dfc5606dSYehuda Sadeh 4258b3e1a56SAlex Elder static void rbd_img_parent_read(struct rbd_obj_request *obj_request); 426b454e36dSAlex Elder static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request); 4278b3e1a56SAlex Elder 428117973fbSAlex Elder static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver); 429117973fbSAlex Elder static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver); 43059c2be1eSYehuda Sadeh 431602adf40SYehuda Sadeh static int rbd_open(struct block_device *bdev, fmode_t mode) 432602adf40SYehuda Sadeh { 433f0f8cef5SAlex Elder struct rbd_device *rbd_dev = bdev->bd_disk->private_data; 434b82d167bSAlex Elder bool removing = false; 435602adf40SYehuda Sadeh 436f84344f3SAlex Elder if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only) 437602adf40SYehuda Sadeh return -EROFS; 438602adf40SYehuda Sadeh 439a14ea269SAlex Elder spin_lock_irq(&rbd_dev->lock); 440b82d167bSAlex Elder if (test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags)) 441b82d167bSAlex Elder removing = true; 442b82d167bSAlex Elder else 443b82d167bSAlex Elder rbd_dev->open_count++; 444a14ea269SAlex Elder spin_unlock_irq(&rbd_dev->lock); 445b82d167bSAlex Elder if (removing) 446b82d167bSAlex Elder return -ENOENT; 447b82d167bSAlex Elder 44842382b70SAlex Elder mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); 449c3e946ceSAlex Elder (void) get_device(&rbd_dev->dev); 450f84344f3SAlex Elder set_device_ro(bdev, rbd_dev->mapping.read_only); 45142382b70SAlex Elder mutex_unlock(&ctl_mutex); 452340c7a2bSAlex Elder 453602adf40SYehuda Sadeh return 0; 454602adf40SYehuda Sadeh } 455602adf40SYehuda Sadeh 456dfc5606dSYehuda Sadeh static int rbd_release(struct gendisk *disk, fmode_t mode) 457dfc5606dSYehuda Sadeh { 458dfc5606dSYehuda Sadeh struct rbd_device *rbd_dev = disk->private_data; 459b82d167bSAlex Elder unsigned long open_count_before; 460b82d167bSAlex Elder 461a14ea269SAlex Elder spin_lock_irq(&rbd_dev->lock); 462b82d167bSAlex Elder open_count_before = rbd_dev->open_count--; 463a14ea269SAlex Elder spin_unlock_irq(&rbd_dev->lock); 464b82d167bSAlex Elder rbd_assert(open_count_before > 0); 465dfc5606dSYehuda Sadeh 46642382b70SAlex Elder mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); 467c3e946ceSAlex Elder put_device(&rbd_dev->dev); 46842382b70SAlex Elder mutex_unlock(&ctl_mutex); 469dfc5606dSYehuda Sadeh 470dfc5606dSYehuda Sadeh return 0; 471dfc5606dSYehuda Sadeh } 472dfc5606dSYehuda Sadeh 473602adf40SYehuda Sadeh static const struct block_device_operations rbd_bd_ops = { 474602adf40SYehuda Sadeh .owner = THIS_MODULE, 475602adf40SYehuda Sadeh .open = rbd_open, 476dfc5606dSYehuda Sadeh .release = rbd_release, 477602adf40SYehuda Sadeh }; 478602adf40SYehuda Sadeh 479602adf40SYehuda Sadeh /* 480602adf40SYehuda Sadeh * Initialize an rbd client instance. 48143ae4701SAlex Elder * We own *ceph_opts. 482602adf40SYehuda Sadeh */ 483f8c38929SAlex Elder static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts) 484602adf40SYehuda Sadeh { 485602adf40SYehuda Sadeh struct rbd_client *rbdc; 486602adf40SYehuda Sadeh int ret = -ENOMEM; 487602adf40SYehuda Sadeh 48837206ee5SAlex Elder dout("%s:\n", __func__); 489602adf40SYehuda Sadeh rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL); 490602adf40SYehuda Sadeh if (!rbdc) 491602adf40SYehuda Sadeh goto out_opt; 492602adf40SYehuda Sadeh 493602adf40SYehuda Sadeh kref_init(&rbdc->kref); 494602adf40SYehuda Sadeh INIT_LIST_HEAD(&rbdc->node); 495602adf40SYehuda Sadeh 496bc534d86SAlex Elder mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); 497bc534d86SAlex Elder 49843ae4701SAlex Elder rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0); 499602adf40SYehuda Sadeh if (IS_ERR(rbdc->client)) 500bc534d86SAlex Elder goto out_mutex; 50143ae4701SAlex Elder ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */ 502602adf40SYehuda Sadeh 503602adf40SYehuda Sadeh ret = ceph_open_session(rbdc->client); 504602adf40SYehuda Sadeh if (ret < 0) 505602adf40SYehuda Sadeh goto out_err; 506602adf40SYehuda Sadeh 507432b8587SAlex Elder spin_lock(&rbd_client_list_lock); 508602adf40SYehuda Sadeh list_add_tail(&rbdc->node, &rbd_client_list); 509432b8587SAlex Elder spin_unlock(&rbd_client_list_lock); 510602adf40SYehuda Sadeh 511bc534d86SAlex Elder mutex_unlock(&ctl_mutex); 51237206ee5SAlex Elder dout("%s: rbdc %p\n", __func__, rbdc); 513bc534d86SAlex Elder 514602adf40SYehuda Sadeh return rbdc; 515602adf40SYehuda Sadeh 516602adf40SYehuda Sadeh out_err: 517602adf40SYehuda Sadeh ceph_destroy_client(rbdc->client); 518bc534d86SAlex Elder out_mutex: 519bc534d86SAlex Elder mutex_unlock(&ctl_mutex); 520602adf40SYehuda Sadeh kfree(rbdc); 521602adf40SYehuda Sadeh out_opt: 52243ae4701SAlex Elder if (ceph_opts) 52343ae4701SAlex Elder ceph_destroy_options(ceph_opts); 52437206ee5SAlex Elder dout("%s: error %d\n", __func__, ret); 52537206ee5SAlex Elder 52628f259b7SVasiliy Kulikov return ERR_PTR(ret); 527602adf40SYehuda Sadeh } 528602adf40SYehuda Sadeh 5292f82ee54SAlex Elder static struct rbd_client *__rbd_get_client(struct rbd_client *rbdc) 5302f82ee54SAlex Elder { 5312f82ee54SAlex Elder kref_get(&rbdc->kref); 5322f82ee54SAlex Elder 5332f82ee54SAlex Elder return rbdc; 5342f82ee54SAlex Elder } 5352f82ee54SAlex Elder 536602adf40SYehuda Sadeh /* 5371f7ba331SAlex Elder * Find a ceph client with specific addr and configuration. If 5381f7ba331SAlex Elder * found, bump its reference count. 539602adf40SYehuda Sadeh */ 5401f7ba331SAlex Elder static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts) 541602adf40SYehuda Sadeh { 542602adf40SYehuda Sadeh struct rbd_client *client_node; 5431f7ba331SAlex Elder bool found = false; 544602adf40SYehuda Sadeh 54543ae4701SAlex Elder if (ceph_opts->flags & CEPH_OPT_NOSHARE) 546602adf40SYehuda Sadeh return NULL; 547602adf40SYehuda Sadeh 5481f7ba331SAlex Elder spin_lock(&rbd_client_list_lock); 5491f7ba331SAlex Elder list_for_each_entry(client_node, &rbd_client_list, node) { 5501f7ba331SAlex Elder if (!ceph_compare_options(ceph_opts, client_node->client)) { 5512f82ee54SAlex Elder __rbd_get_client(client_node); 5522f82ee54SAlex Elder 5531f7ba331SAlex Elder found = true; 5541f7ba331SAlex Elder break; 5551f7ba331SAlex Elder } 5561f7ba331SAlex Elder } 5571f7ba331SAlex Elder spin_unlock(&rbd_client_list_lock); 5581f7ba331SAlex Elder 5591f7ba331SAlex Elder return found ? client_node : NULL; 560602adf40SYehuda Sadeh } 561602adf40SYehuda Sadeh 562602adf40SYehuda Sadeh /* 56359c2be1eSYehuda Sadeh * mount options 56459c2be1eSYehuda Sadeh */ 56559c2be1eSYehuda Sadeh enum { 56659c2be1eSYehuda Sadeh Opt_last_int, 56759c2be1eSYehuda Sadeh /* int args above */ 56859c2be1eSYehuda Sadeh Opt_last_string, 56959c2be1eSYehuda Sadeh /* string args above */ 570cc0538b6SAlex Elder Opt_read_only, 571cc0538b6SAlex Elder Opt_read_write, 572cc0538b6SAlex Elder /* Boolean args above */ 573cc0538b6SAlex Elder Opt_last_bool, 57459c2be1eSYehuda Sadeh }; 57559c2be1eSYehuda Sadeh 57643ae4701SAlex Elder static match_table_t rbd_opts_tokens = { 57759c2be1eSYehuda Sadeh /* int args above */ 57859c2be1eSYehuda Sadeh /* string args above */ 579be466c1cSAlex Elder {Opt_read_only, "read_only"}, 580cc0538b6SAlex Elder {Opt_read_only, "ro"}, /* Alternate spelling */ 581cc0538b6SAlex Elder {Opt_read_write, "read_write"}, 582cc0538b6SAlex Elder {Opt_read_write, "rw"}, /* Alternate spelling */ 583cc0538b6SAlex Elder /* Boolean args above */ 58459c2be1eSYehuda Sadeh {-1, NULL} 58559c2be1eSYehuda Sadeh }; 58659c2be1eSYehuda Sadeh 58798571b5aSAlex Elder struct rbd_options { 58898571b5aSAlex Elder bool read_only; 58998571b5aSAlex Elder }; 59098571b5aSAlex Elder 59198571b5aSAlex Elder #define RBD_READ_ONLY_DEFAULT false 59298571b5aSAlex Elder 59359c2be1eSYehuda Sadeh static int parse_rbd_opts_token(char *c, void *private) 59459c2be1eSYehuda Sadeh { 59543ae4701SAlex Elder struct rbd_options *rbd_opts = private; 59659c2be1eSYehuda Sadeh substring_t argstr[MAX_OPT_ARGS]; 59759c2be1eSYehuda Sadeh int token, intval, ret; 59859c2be1eSYehuda Sadeh 59943ae4701SAlex Elder token = match_token(c, rbd_opts_tokens, argstr); 60059c2be1eSYehuda Sadeh if (token < 0) 60159c2be1eSYehuda Sadeh return -EINVAL; 60259c2be1eSYehuda Sadeh 60359c2be1eSYehuda Sadeh if (token < Opt_last_int) { 60459c2be1eSYehuda Sadeh ret = match_int(&argstr[0], &intval); 60559c2be1eSYehuda Sadeh if (ret < 0) { 60659c2be1eSYehuda Sadeh pr_err("bad mount option arg (not int) " 60759c2be1eSYehuda Sadeh "at '%s'\n", c); 60859c2be1eSYehuda Sadeh return ret; 60959c2be1eSYehuda Sadeh } 61059c2be1eSYehuda Sadeh dout("got int token %d val %d\n", token, intval); 61159c2be1eSYehuda Sadeh } else if (token > Opt_last_int && token < Opt_last_string) { 61259c2be1eSYehuda Sadeh dout("got string token %d val %s\n", token, 61359c2be1eSYehuda Sadeh argstr[0].from); 614cc0538b6SAlex Elder } else if (token > Opt_last_string && token < Opt_last_bool) { 615cc0538b6SAlex Elder dout("got Boolean token %d\n", token); 61659c2be1eSYehuda Sadeh } else { 61759c2be1eSYehuda Sadeh dout("got token %d\n", token); 61859c2be1eSYehuda Sadeh } 61959c2be1eSYehuda Sadeh 62059c2be1eSYehuda Sadeh switch (token) { 621cc0538b6SAlex Elder case Opt_read_only: 622cc0538b6SAlex Elder rbd_opts->read_only = true; 623cc0538b6SAlex Elder break; 624cc0538b6SAlex Elder case Opt_read_write: 625cc0538b6SAlex Elder rbd_opts->read_only = false; 626cc0538b6SAlex Elder break; 62759c2be1eSYehuda Sadeh default: 628aafb230eSAlex Elder rbd_assert(false); 629aafb230eSAlex Elder break; 63059c2be1eSYehuda Sadeh } 63159c2be1eSYehuda Sadeh return 0; 63259c2be1eSYehuda Sadeh } 63359c2be1eSYehuda Sadeh 63459c2be1eSYehuda Sadeh /* 635602adf40SYehuda Sadeh * Get a ceph client with specific addr and configuration, if one does 636602adf40SYehuda Sadeh * not exist create it. 637602adf40SYehuda Sadeh */ 6389d3997fdSAlex Elder static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts) 639602adf40SYehuda Sadeh { 640f8c38929SAlex Elder struct rbd_client *rbdc; 64159c2be1eSYehuda Sadeh 6421f7ba331SAlex Elder rbdc = rbd_client_find(ceph_opts); 6439d3997fdSAlex Elder if (rbdc) /* using an existing client */ 64443ae4701SAlex Elder ceph_destroy_options(ceph_opts); 6459d3997fdSAlex Elder else 646f8c38929SAlex Elder rbdc = rbd_client_create(ceph_opts); 647d720bcb0SAlex Elder 6489d3997fdSAlex Elder return rbdc; 649602adf40SYehuda Sadeh } 650602adf40SYehuda Sadeh 651602adf40SYehuda Sadeh /* 652602adf40SYehuda Sadeh * Destroy ceph client 653d23a4b3fSAlex Elder * 654432b8587SAlex Elder * Caller must hold rbd_client_list_lock. 655602adf40SYehuda Sadeh */ 656602adf40SYehuda Sadeh static void rbd_client_release(struct kref *kref) 657602adf40SYehuda Sadeh { 658602adf40SYehuda Sadeh struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref); 659602adf40SYehuda Sadeh 66037206ee5SAlex Elder dout("%s: rbdc %p\n", __func__, rbdc); 661cd9d9f5dSAlex Elder spin_lock(&rbd_client_list_lock); 662602adf40SYehuda Sadeh list_del(&rbdc->node); 663cd9d9f5dSAlex Elder spin_unlock(&rbd_client_list_lock); 664602adf40SYehuda Sadeh 665602adf40SYehuda Sadeh ceph_destroy_client(rbdc->client); 666602adf40SYehuda Sadeh kfree(rbdc); 667602adf40SYehuda Sadeh } 668602adf40SYehuda Sadeh 669602adf40SYehuda Sadeh /* 670602adf40SYehuda Sadeh * Drop reference to ceph client node. If it's not referenced anymore, release 671602adf40SYehuda Sadeh * it. 672602adf40SYehuda Sadeh */ 6739d3997fdSAlex Elder static void rbd_put_client(struct rbd_client *rbdc) 674602adf40SYehuda Sadeh { 675c53d5893SAlex Elder if (rbdc) 6769d3997fdSAlex Elder kref_put(&rbdc->kref, rbd_client_release); 677602adf40SYehuda Sadeh } 678602adf40SYehuda Sadeh 679a30b71b9SAlex Elder static bool rbd_image_format_valid(u32 image_format) 680a30b71b9SAlex Elder { 681a30b71b9SAlex Elder return image_format == 1 || image_format == 2; 682a30b71b9SAlex Elder } 683a30b71b9SAlex Elder 6848e94af8eSAlex Elder static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk) 6858e94af8eSAlex Elder { 686103a150fSAlex Elder size_t size; 687103a150fSAlex Elder u32 snap_count; 688103a150fSAlex Elder 689103a150fSAlex Elder /* The header has to start with the magic rbd header text */ 690103a150fSAlex Elder if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT))) 691103a150fSAlex Elder return false; 692103a150fSAlex Elder 693db2388b6SAlex Elder /* The bio layer requires at least sector-sized I/O */ 694db2388b6SAlex Elder 695db2388b6SAlex Elder if (ondisk->options.order < SECTOR_SHIFT) 696db2388b6SAlex Elder return false; 697db2388b6SAlex Elder 698db2388b6SAlex Elder /* If we use u64 in a few spots we may be able to loosen this */ 699db2388b6SAlex Elder 700db2388b6SAlex Elder if (ondisk->options.order > 8 * sizeof (int) - 1) 701db2388b6SAlex Elder return false; 702db2388b6SAlex Elder 703103a150fSAlex Elder /* 704103a150fSAlex Elder * The size of a snapshot header has to fit in a size_t, and 705103a150fSAlex Elder * that limits the number of snapshots. 706103a150fSAlex Elder */ 707103a150fSAlex Elder snap_count = le32_to_cpu(ondisk->snap_count); 708103a150fSAlex Elder size = SIZE_MAX - sizeof (struct ceph_snap_context); 709103a150fSAlex Elder if (snap_count > size / sizeof (__le64)) 710103a150fSAlex Elder return false; 711103a150fSAlex Elder 712103a150fSAlex Elder /* 713103a150fSAlex Elder * Not only that, but the size of the entire the snapshot 714103a150fSAlex Elder * header must also be representable in a size_t. 715103a150fSAlex Elder */ 716103a150fSAlex Elder size -= snap_count * sizeof (__le64); 717103a150fSAlex Elder if ((u64) size < le64_to_cpu(ondisk->snap_names_len)) 718103a150fSAlex Elder return false; 719103a150fSAlex Elder 720103a150fSAlex Elder return true; 7218e94af8eSAlex Elder } 7228e94af8eSAlex Elder 723602adf40SYehuda Sadeh /* 724602adf40SYehuda Sadeh * Create a new header structure, translate header format from the on-disk 725602adf40SYehuda Sadeh * header. 726602adf40SYehuda Sadeh */ 727602adf40SYehuda Sadeh static int rbd_header_from_disk(struct rbd_image_header *header, 7284156d998SAlex Elder struct rbd_image_header_ondisk *ondisk) 729602adf40SYehuda Sadeh { 730ccece235SAlex Elder u32 snap_count; 73158c17b0eSAlex Elder size_t len; 732d2bb24e5SAlex Elder size_t size; 733621901d6SAlex Elder u32 i; 734602adf40SYehuda Sadeh 7356a52325fSAlex Elder memset(header, 0, sizeof (*header)); 7366a52325fSAlex Elder 737103a150fSAlex Elder snap_count = le32_to_cpu(ondisk->snap_count); 738103a150fSAlex Elder 73958c17b0eSAlex Elder len = strnlen(ondisk->object_prefix, sizeof (ondisk->object_prefix)); 74058c17b0eSAlex Elder header->object_prefix = kmalloc(len + 1, GFP_KERNEL); 7416a52325fSAlex Elder if (!header->object_prefix) 742602adf40SYehuda Sadeh return -ENOMEM; 74358c17b0eSAlex Elder memcpy(header->object_prefix, ondisk->object_prefix, len); 74458c17b0eSAlex Elder header->object_prefix[len] = '\0'; 74500f1f36fSAlex Elder 746602adf40SYehuda Sadeh if (snap_count) { 747f785cc1dSAlex Elder u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len); 748f785cc1dSAlex Elder 749621901d6SAlex Elder /* Save a copy of the snapshot names */ 750621901d6SAlex Elder 751f785cc1dSAlex Elder if (snap_names_len > (u64) SIZE_MAX) 752f785cc1dSAlex Elder return -EIO; 753f785cc1dSAlex Elder header->snap_names = kmalloc(snap_names_len, GFP_KERNEL); 754602adf40SYehuda Sadeh if (!header->snap_names) 7556a52325fSAlex Elder goto out_err; 756f785cc1dSAlex Elder /* 757f785cc1dSAlex Elder * Note that rbd_dev_v1_header_read() guarantees 758f785cc1dSAlex Elder * the ondisk buffer we're working with has 759f785cc1dSAlex Elder * snap_names_len bytes beyond the end of the 760f785cc1dSAlex Elder * snapshot id array, this memcpy() is safe. 761f785cc1dSAlex Elder */ 762f785cc1dSAlex Elder memcpy(header->snap_names, &ondisk->snaps[snap_count], 763f785cc1dSAlex Elder snap_names_len); 7646a52325fSAlex Elder 765621901d6SAlex Elder /* Record each snapshot's size */ 766621901d6SAlex Elder 767d2bb24e5SAlex Elder size = snap_count * sizeof (*header->snap_sizes); 768d2bb24e5SAlex Elder header->snap_sizes = kmalloc(size, GFP_KERNEL); 769602adf40SYehuda Sadeh if (!header->snap_sizes) 7706a52325fSAlex Elder goto out_err; 771621901d6SAlex Elder for (i = 0; i < snap_count; i++) 772621901d6SAlex Elder header->snap_sizes[i] = 773621901d6SAlex Elder le64_to_cpu(ondisk->snaps[i].image_size); 774602adf40SYehuda Sadeh } else { 775ccece235SAlex Elder WARN_ON(ondisk->snap_names_len); 776602adf40SYehuda Sadeh header->snap_names = NULL; 777602adf40SYehuda Sadeh header->snap_sizes = NULL; 778602adf40SYehuda Sadeh } 779849b4260SAlex Elder 78034b13184SAlex Elder header->features = 0; /* No features support in v1 images */ 781602adf40SYehuda Sadeh header->obj_order = ondisk->options.order; 782602adf40SYehuda Sadeh header->crypt_type = ondisk->options.crypt_type; 783602adf40SYehuda Sadeh header->comp_type = ondisk->options.comp_type; 7846a52325fSAlex Elder 785621901d6SAlex Elder /* Allocate and fill in the snapshot context */ 786621901d6SAlex Elder 787f84344f3SAlex Elder header->image_size = le64_to_cpu(ondisk->image_size); 7886a52325fSAlex Elder size = sizeof (struct ceph_snap_context); 7896a52325fSAlex Elder size += snap_count * sizeof (header->snapc->snaps[0]); 7906a52325fSAlex Elder header->snapc = kzalloc(size, GFP_KERNEL); 7916a52325fSAlex Elder if (!header->snapc) 7926a52325fSAlex Elder goto out_err; 793602adf40SYehuda Sadeh 794602adf40SYehuda Sadeh atomic_set(&header->snapc->nref, 1); 795505cbb9bSAlex Elder header->snapc->seq = le64_to_cpu(ondisk->snap_seq); 796602adf40SYehuda Sadeh header->snapc->num_snaps = snap_count; 797621901d6SAlex Elder for (i = 0; i < snap_count; i++) 798602adf40SYehuda Sadeh header->snapc->snaps[i] = 799602adf40SYehuda Sadeh le64_to_cpu(ondisk->snaps[i].id); 800602adf40SYehuda Sadeh 801602adf40SYehuda Sadeh return 0; 802602adf40SYehuda Sadeh 8036a52325fSAlex Elder out_err: 804849b4260SAlex Elder kfree(header->snap_sizes); 805ccece235SAlex Elder header->snap_sizes = NULL; 806602adf40SYehuda Sadeh kfree(header->snap_names); 807ccece235SAlex Elder header->snap_names = NULL; 8086a52325fSAlex Elder kfree(header->object_prefix); 8096a52325fSAlex Elder header->object_prefix = NULL; 810ccece235SAlex Elder 81100f1f36fSAlex Elder return -ENOMEM; 812602adf40SYehuda Sadeh } 813602adf40SYehuda Sadeh 8149e15b77dSAlex Elder static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id) 8159e15b77dSAlex Elder { 8169e15b77dSAlex Elder struct rbd_snap *snap; 8179e15b77dSAlex Elder 8189e15b77dSAlex Elder if (snap_id == CEPH_NOSNAP) 8199e15b77dSAlex Elder return RBD_SNAP_HEAD_NAME; 8209e15b77dSAlex Elder 8219e15b77dSAlex Elder list_for_each_entry(snap, &rbd_dev->snaps, node) 8229e15b77dSAlex Elder if (snap_id == snap->id) 8239e15b77dSAlex Elder return snap->name; 8249e15b77dSAlex Elder 8259e15b77dSAlex Elder return NULL; 8269e15b77dSAlex Elder } 8279e15b77dSAlex Elder 8288836b995SAlex Elder static int snap_by_name(struct rbd_device *rbd_dev, const char *snap_name) 829602adf40SYehuda Sadeh { 830602adf40SYehuda Sadeh 831e86924a8SAlex Elder struct rbd_snap *snap; 83200f1f36fSAlex Elder 833e86924a8SAlex Elder list_for_each_entry(snap, &rbd_dev->snaps, node) { 834e86924a8SAlex Elder if (!strcmp(snap_name, snap->name)) { 8350d7dbfceSAlex Elder rbd_dev->spec->snap_id = snap->id; 836e86924a8SAlex Elder rbd_dev->mapping.size = snap->size; 83734b13184SAlex Elder rbd_dev->mapping.features = snap->features; 83800f1f36fSAlex Elder 839e86924a8SAlex Elder return 0; 840602adf40SYehuda Sadeh } 84100f1f36fSAlex Elder } 842e86924a8SAlex Elder 84300f1f36fSAlex Elder return -ENOENT; 84400f1f36fSAlex Elder } 845602adf40SYehuda Sadeh 846819d52bfSAlex Elder static int rbd_dev_set_mapping(struct rbd_device *rbd_dev) 847602adf40SYehuda Sadeh { 84878dc447dSAlex Elder int ret; 849602adf40SYehuda Sadeh 8500d7dbfceSAlex Elder if (!memcmp(rbd_dev->spec->snap_name, RBD_SNAP_HEAD_NAME, 851cc9d734cSJosh Durgin sizeof (RBD_SNAP_HEAD_NAME))) { 8520d7dbfceSAlex Elder rbd_dev->spec->snap_id = CEPH_NOSNAP; 85399c1f08fSAlex Elder rbd_dev->mapping.size = rbd_dev->header.image_size; 85434b13184SAlex Elder rbd_dev->mapping.features = rbd_dev->header.features; 855e86924a8SAlex Elder ret = 0; 856602adf40SYehuda Sadeh } else { 8570d7dbfceSAlex Elder ret = snap_by_name(rbd_dev, rbd_dev->spec->snap_name); 858602adf40SYehuda Sadeh if (ret < 0) 859602adf40SYehuda Sadeh goto done; 860f84344f3SAlex Elder rbd_dev->mapping.read_only = true; 861602adf40SYehuda Sadeh } 8626d292906SAlex Elder set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags); 8636d292906SAlex Elder 864602adf40SYehuda Sadeh done: 865602adf40SYehuda Sadeh return ret; 866602adf40SYehuda Sadeh } 867602adf40SYehuda Sadeh 868602adf40SYehuda Sadeh static void rbd_header_free(struct rbd_image_header *header) 869602adf40SYehuda Sadeh { 870849b4260SAlex Elder kfree(header->object_prefix); 871d78fd7aeSAlex Elder header->object_prefix = NULL; 872602adf40SYehuda Sadeh kfree(header->snap_sizes); 873d78fd7aeSAlex Elder header->snap_sizes = NULL; 874849b4260SAlex Elder kfree(header->snap_names); 875d78fd7aeSAlex Elder header->snap_names = NULL; 876d1d25646SJosh Durgin ceph_put_snap_context(header->snapc); 877d78fd7aeSAlex Elder header->snapc = NULL; 878602adf40SYehuda Sadeh } 879602adf40SYehuda Sadeh 88098571b5aSAlex Elder static const char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset) 881602adf40SYehuda Sadeh { 88265ccfe21SAlex Elder char *name; 88365ccfe21SAlex Elder u64 segment; 88465ccfe21SAlex Elder int ret; 885602adf40SYehuda Sadeh 8862fd82b9eSAlex Elder name = kmalloc(MAX_OBJ_NAME_SIZE + 1, GFP_NOIO); 88765ccfe21SAlex Elder if (!name) 88865ccfe21SAlex Elder return NULL; 88965ccfe21SAlex Elder segment = offset >> rbd_dev->header.obj_order; 8902fd82b9eSAlex Elder ret = snprintf(name, MAX_OBJ_NAME_SIZE + 1, "%s.%012llx", 89165ccfe21SAlex Elder rbd_dev->header.object_prefix, segment); 8922fd82b9eSAlex Elder if (ret < 0 || ret > MAX_OBJ_NAME_SIZE) { 89365ccfe21SAlex Elder pr_err("error formatting segment name for #%llu (%d)\n", 89465ccfe21SAlex Elder segment, ret); 89565ccfe21SAlex Elder kfree(name); 89665ccfe21SAlex Elder name = NULL; 89765ccfe21SAlex Elder } 898602adf40SYehuda Sadeh 89965ccfe21SAlex Elder return name; 90065ccfe21SAlex Elder } 901602adf40SYehuda Sadeh 90265ccfe21SAlex Elder static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset) 90365ccfe21SAlex Elder { 90465ccfe21SAlex Elder u64 segment_size = (u64) 1 << rbd_dev->header.obj_order; 905602adf40SYehuda Sadeh 90665ccfe21SAlex Elder return offset & (segment_size - 1); 90765ccfe21SAlex Elder } 90865ccfe21SAlex Elder 90965ccfe21SAlex Elder static u64 rbd_segment_length(struct rbd_device *rbd_dev, 91065ccfe21SAlex Elder u64 offset, u64 length) 91165ccfe21SAlex Elder { 91265ccfe21SAlex Elder u64 segment_size = (u64) 1 << rbd_dev->header.obj_order; 91365ccfe21SAlex Elder 91465ccfe21SAlex Elder offset &= segment_size - 1; 91565ccfe21SAlex Elder 916aafb230eSAlex Elder rbd_assert(length <= U64_MAX - offset); 91765ccfe21SAlex Elder if (offset + length > segment_size) 91865ccfe21SAlex Elder length = segment_size - offset; 91965ccfe21SAlex Elder 92065ccfe21SAlex Elder return length; 921602adf40SYehuda Sadeh } 922602adf40SYehuda Sadeh 923602adf40SYehuda Sadeh /* 924029bcbd8SJosh Durgin * returns the size of an object in the image 925029bcbd8SJosh Durgin */ 926029bcbd8SJosh Durgin static u64 rbd_obj_bytes(struct rbd_image_header *header) 927029bcbd8SJosh Durgin { 928029bcbd8SJosh Durgin return 1 << header->obj_order; 929029bcbd8SJosh Durgin } 930029bcbd8SJosh Durgin 931029bcbd8SJosh Durgin /* 932602adf40SYehuda Sadeh * bio helpers 933602adf40SYehuda Sadeh */ 934602adf40SYehuda Sadeh 935602adf40SYehuda Sadeh static void bio_chain_put(struct bio *chain) 936602adf40SYehuda Sadeh { 937602adf40SYehuda Sadeh struct bio *tmp; 938602adf40SYehuda Sadeh 939602adf40SYehuda Sadeh while (chain) { 940602adf40SYehuda Sadeh tmp = chain; 941602adf40SYehuda Sadeh chain = chain->bi_next; 942602adf40SYehuda Sadeh bio_put(tmp); 943602adf40SYehuda Sadeh } 944602adf40SYehuda Sadeh } 945602adf40SYehuda Sadeh 946602adf40SYehuda Sadeh /* 947602adf40SYehuda Sadeh * zeros a bio chain, starting at specific offset 948602adf40SYehuda Sadeh */ 949602adf40SYehuda Sadeh static void zero_bio_chain(struct bio *chain, int start_ofs) 950602adf40SYehuda Sadeh { 951602adf40SYehuda Sadeh struct bio_vec *bv; 952602adf40SYehuda Sadeh unsigned long flags; 953602adf40SYehuda Sadeh void *buf; 954602adf40SYehuda Sadeh int i; 955602adf40SYehuda Sadeh int pos = 0; 956602adf40SYehuda Sadeh 957602adf40SYehuda Sadeh while (chain) { 958602adf40SYehuda Sadeh bio_for_each_segment(bv, chain, i) { 959602adf40SYehuda Sadeh if (pos + bv->bv_len > start_ofs) { 960602adf40SYehuda Sadeh int remainder = max(start_ofs - pos, 0); 961602adf40SYehuda Sadeh buf = bvec_kmap_irq(bv, &flags); 962602adf40SYehuda Sadeh memset(buf + remainder, 0, 963602adf40SYehuda Sadeh bv->bv_len - remainder); 96485b5aaa6SDan Carpenter bvec_kunmap_irq(buf, &flags); 965602adf40SYehuda Sadeh } 966602adf40SYehuda Sadeh pos += bv->bv_len; 967602adf40SYehuda Sadeh } 968602adf40SYehuda Sadeh 969602adf40SYehuda Sadeh chain = chain->bi_next; 970602adf40SYehuda Sadeh } 971602adf40SYehuda Sadeh } 972602adf40SYehuda Sadeh 973602adf40SYehuda Sadeh /* 974f7760dadSAlex Elder * Clone a portion of a bio, starting at the given byte offset 975f7760dadSAlex Elder * and continuing for the number of bytes indicated. 976602adf40SYehuda Sadeh */ 977f7760dadSAlex Elder static struct bio *bio_clone_range(struct bio *bio_src, 978f7760dadSAlex Elder unsigned int offset, 979f7760dadSAlex Elder unsigned int len, 980f7760dadSAlex Elder gfp_t gfpmask) 981602adf40SYehuda Sadeh { 982f7760dadSAlex Elder struct bio_vec *bv; 983f7760dadSAlex Elder unsigned int resid; 984f7760dadSAlex Elder unsigned short idx; 985f7760dadSAlex Elder unsigned int voff; 986f7760dadSAlex Elder unsigned short end_idx; 987f7760dadSAlex Elder unsigned short vcnt; 988f7760dadSAlex Elder struct bio *bio; 989602adf40SYehuda Sadeh 990f7760dadSAlex Elder /* Handle the easy case for the caller */ 991f7760dadSAlex Elder 992f7760dadSAlex Elder if (!offset && len == bio_src->bi_size) 993f7760dadSAlex Elder return bio_clone(bio_src, gfpmask); 994f7760dadSAlex Elder 995f7760dadSAlex Elder if (WARN_ON_ONCE(!len)) 996f7760dadSAlex Elder return NULL; 997f7760dadSAlex Elder if (WARN_ON_ONCE(len > bio_src->bi_size)) 998f7760dadSAlex Elder return NULL; 999f7760dadSAlex Elder if (WARN_ON_ONCE(offset > bio_src->bi_size - len)) 1000f7760dadSAlex Elder return NULL; 1001f7760dadSAlex Elder 1002f7760dadSAlex Elder /* Find first affected segment... */ 1003f7760dadSAlex Elder 1004f7760dadSAlex Elder resid = offset; 1005f7760dadSAlex Elder __bio_for_each_segment(bv, bio_src, idx, 0) { 1006f7760dadSAlex Elder if (resid < bv->bv_len) 1007f7760dadSAlex Elder break; 1008f7760dadSAlex Elder resid -= bv->bv_len; 1009602adf40SYehuda Sadeh } 1010f7760dadSAlex Elder voff = resid; 1011602adf40SYehuda Sadeh 1012f7760dadSAlex Elder /* ...and the last affected segment */ 1013542582fcSAlex Elder 1014f7760dadSAlex Elder resid += len; 1015f7760dadSAlex Elder __bio_for_each_segment(bv, bio_src, end_idx, idx) { 1016f7760dadSAlex Elder if (resid <= bv->bv_len) 1017f7760dadSAlex Elder break; 1018f7760dadSAlex Elder resid -= bv->bv_len; 1019f7760dadSAlex Elder } 1020f7760dadSAlex Elder vcnt = end_idx - idx + 1; 1021602adf40SYehuda Sadeh 1022f7760dadSAlex Elder /* Build the clone */ 1023f7760dadSAlex Elder 1024f7760dadSAlex Elder bio = bio_alloc(gfpmask, (unsigned int) vcnt); 1025f7760dadSAlex Elder if (!bio) 1026f7760dadSAlex Elder return NULL; /* ENOMEM */ 1027f7760dadSAlex Elder 1028f7760dadSAlex Elder bio->bi_bdev = bio_src->bi_bdev; 1029f7760dadSAlex Elder bio->bi_sector = bio_src->bi_sector + (offset >> SECTOR_SHIFT); 1030f7760dadSAlex Elder bio->bi_rw = bio_src->bi_rw; 1031f7760dadSAlex Elder bio->bi_flags |= 1 << BIO_CLONED; 1032602adf40SYehuda Sadeh 1033602adf40SYehuda Sadeh /* 1034f7760dadSAlex Elder * Copy over our part of the bio_vec, then update the first 1035f7760dadSAlex Elder * and last (or only) entries. 1036602adf40SYehuda Sadeh */ 1037f7760dadSAlex Elder memcpy(&bio->bi_io_vec[0], &bio_src->bi_io_vec[idx], 1038f7760dadSAlex Elder vcnt * sizeof (struct bio_vec)); 1039f7760dadSAlex Elder bio->bi_io_vec[0].bv_offset += voff; 1040f7760dadSAlex Elder if (vcnt > 1) { 1041f7760dadSAlex Elder bio->bi_io_vec[0].bv_len -= voff; 1042f7760dadSAlex Elder bio->bi_io_vec[vcnt - 1].bv_len = resid; 1043602adf40SYehuda Sadeh } else { 1044f7760dadSAlex Elder bio->bi_io_vec[0].bv_len = len; 1045602adf40SYehuda Sadeh } 1046602adf40SYehuda Sadeh 1047f7760dadSAlex Elder bio->bi_vcnt = vcnt; 1048f7760dadSAlex Elder bio->bi_size = len; 1049f7760dadSAlex Elder bio->bi_idx = 0; 1050602adf40SYehuda Sadeh 1051f7760dadSAlex Elder return bio; 1052602adf40SYehuda Sadeh } 1053602adf40SYehuda Sadeh 1054f7760dadSAlex Elder /* 1055f7760dadSAlex Elder * Clone a portion of a bio chain, starting at the given byte offset 1056f7760dadSAlex Elder * into the first bio in the source chain and continuing for the 1057f7760dadSAlex Elder * number of bytes indicated. The result is another bio chain of 1058f7760dadSAlex Elder * exactly the given length, or a null pointer on error. 1059f7760dadSAlex Elder * 1060f7760dadSAlex Elder * The bio_src and offset parameters are both in-out. On entry they 1061f7760dadSAlex Elder * refer to the first source bio and the offset into that bio where 1062f7760dadSAlex Elder * the start of data to be cloned is located. 1063f7760dadSAlex Elder * 1064f7760dadSAlex Elder * On return, bio_src is updated to refer to the bio in the source 1065f7760dadSAlex Elder * chain that contains first un-cloned byte, and *offset will 1066f7760dadSAlex Elder * contain the offset of that byte within that bio. 1067f7760dadSAlex Elder */ 1068f7760dadSAlex Elder static struct bio *bio_chain_clone_range(struct bio **bio_src, 1069f7760dadSAlex Elder unsigned int *offset, 1070f7760dadSAlex Elder unsigned int len, 1071f7760dadSAlex Elder gfp_t gfpmask) 1072f7760dadSAlex Elder { 1073f7760dadSAlex Elder struct bio *bi = *bio_src; 1074f7760dadSAlex Elder unsigned int off = *offset; 1075f7760dadSAlex Elder struct bio *chain = NULL; 1076f7760dadSAlex Elder struct bio **end; 1077602adf40SYehuda Sadeh 1078f7760dadSAlex Elder /* Build up a chain of clone bios up to the limit */ 1079602adf40SYehuda Sadeh 1080f7760dadSAlex Elder if (!bi || off >= bi->bi_size || !len) 1081f7760dadSAlex Elder return NULL; /* Nothing to clone */ 1082602adf40SYehuda Sadeh 1083f7760dadSAlex Elder end = &chain; 1084f7760dadSAlex Elder while (len) { 1085f7760dadSAlex Elder unsigned int bi_size; 1086f7760dadSAlex Elder struct bio *bio; 1087f7760dadSAlex Elder 1088f5400b7aSAlex Elder if (!bi) { 1089f5400b7aSAlex Elder rbd_warn(NULL, "bio_chain exhausted with %u left", len); 1090f7760dadSAlex Elder goto out_err; /* EINVAL; ran out of bio's */ 1091f5400b7aSAlex Elder } 1092f7760dadSAlex Elder bi_size = min_t(unsigned int, bi->bi_size - off, len); 1093f7760dadSAlex Elder bio = bio_clone_range(bi, off, bi_size, gfpmask); 1094f7760dadSAlex Elder if (!bio) 1095f7760dadSAlex Elder goto out_err; /* ENOMEM */ 1096f7760dadSAlex Elder 1097f7760dadSAlex Elder *end = bio; 1098f7760dadSAlex Elder end = &bio->bi_next; 1099f7760dadSAlex Elder 1100f7760dadSAlex Elder off += bi_size; 1101f7760dadSAlex Elder if (off == bi->bi_size) { 1102f7760dadSAlex Elder bi = bi->bi_next; 1103f7760dadSAlex Elder off = 0; 1104f7760dadSAlex Elder } 1105f7760dadSAlex Elder len -= bi_size; 1106f7760dadSAlex Elder } 1107f7760dadSAlex Elder *bio_src = bi; 1108f7760dadSAlex Elder *offset = off; 1109f7760dadSAlex Elder 1110f7760dadSAlex Elder return chain; 1111f7760dadSAlex Elder out_err: 1112f7760dadSAlex Elder bio_chain_put(chain); 1113f7760dadSAlex Elder 1114602adf40SYehuda Sadeh return NULL; 1115602adf40SYehuda Sadeh } 1116602adf40SYehuda Sadeh 1117926f9b3fSAlex Elder /* 1118926f9b3fSAlex Elder * The default/initial value for all object request flags is 0. For 1119926f9b3fSAlex Elder * each flag, once its value is set to 1 it is never reset to 0 1120926f9b3fSAlex Elder * again. 1121926f9b3fSAlex Elder */ 11226365d33aSAlex Elder static void obj_request_img_data_set(struct rbd_obj_request *obj_request) 11236365d33aSAlex Elder { 11246365d33aSAlex Elder if (test_and_set_bit(OBJ_REQ_IMG_DATA, &obj_request->flags)) { 11256365d33aSAlex Elder struct rbd_device *rbd_dev; 11266365d33aSAlex Elder 112757acbaa7SAlex Elder rbd_dev = obj_request->img_request->rbd_dev; 11286365d33aSAlex Elder rbd_warn(rbd_dev, "obj_request %p already marked img_data\n", 11296365d33aSAlex Elder obj_request); 11306365d33aSAlex Elder } 11316365d33aSAlex Elder } 11326365d33aSAlex Elder 11336365d33aSAlex Elder static bool obj_request_img_data_test(struct rbd_obj_request *obj_request) 11346365d33aSAlex Elder { 11356365d33aSAlex Elder smp_mb(); 11366365d33aSAlex Elder return test_bit(OBJ_REQ_IMG_DATA, &obj_request->flags) != 0; 11376365d33aSAlex Elder } 11386365d33aSAlex Elder 113957acbaa7SAlex Elder static void obj_request_done_set(struct rbd_obj_request *obj_request) 114057acbaa7SAlex Elder { 114157acbaa7SAlex Elder if (test_and_set_bit(OBJ_REQ_DONE, &obj_request->flags)) { 114257acbaa7SAlex Elder struct rbd_device *rbd_dev = NULL; 114357acbaa7SAlex Elder 114457acbaa7SAlex Elder if (obj_request_img_data_test(obj_request)) 114557acbaa7SAlex Elder rbd_dev = obj_request->img_request->rbd_dev; 114657acbaa7SAlex Elder rbd_warn(rbd_dev, "obj_request %p already marked done\n", 114757acbaa7SAlex Elder obj_request); 114857acbaa7SAlex Elder } 114957acbaa7SAlex Elder } 115057acbaa7SAlex Elder 115157acbaa7SAlex Elder static bool obj_request_done_test(struct rbd_obj_request *obj_request) 115257acbaa7SAlex Elder { 115357acbaa7SAlex Elder smp_mb(); 115457acbaa7SAlex Elder return test_bit(OBJ_REQ_DONE, &obj_request->flags) != 0; 115557acbaa7SAlex Elder } 115657acbaa7SAlex Elder 11575679c59fSAlex Elder /* 11585679c59fSAlex Elder * This sets the KNOWN flag after (possibly) setting the EXISTS 11595679c59fSAlex Elder * flag. The latter is set based on the "exists" value provided. 11605679c59fSAlex Elder * 11615679c59fSAlex Elder * Note that for our purposes once an object exists it never goes 11625679c59fSAlex Elder * away again. It's possible that the response from two existence 11635679c59fSAlex Elder * checks are separated by the creation of the target object, and 11645679c59fSAlex Elder * the first ("doesn't exist") response arrives *after* the second 11655679c59fSAlex Elder * ("does exist"). In that case we ignore the second one. 11665679c59fSAlex Elder */ 11675679c59fSAlex Elder static void obj_request_existence_set(struct rbd_obj_request *obj_request, 11685679c59fSAlex Elder bool exists) 11695679c59fSAlex Elder { 11705679c59fSAlex Elder if (exists) 11715679c59fSAlex Elder set_bit(OBJ_REQ_EXISTS, &obj_request->flags); 11725679c59fSAlex Elder set_bit(OBJ_REQ_KNOWN, &obj_request->flags); 11735679c59fSAlex Elder smp_mb(); 11745679c59fSAlex Elder } 11755679c59fSAlex Elder 11765679c59fSAlex Elder static bool obj_request_known_test(struct rbd_obj_request *obj_request) 11775679c59fSAlex Elder { 11785679c59fSAlex Elder smp_mb(); 11795679c59fSAlex Elder return test_bit(OBJ_REQ_KNOWN, &obj_request->flags) != 0; 11805679c59fSAlex Elder } 11815679c59fSAlex Elder 11825679c59fSAlex Elder static bool obj_request_exists_test(struct rbd_obj_request *obj_request) 11835679c59fSAlex Elder { 11845679c59fSAlex Elder smp_mb(); 11855679c59fSAlex Elder return test_bit(OBJ_REQ_EXISTS, &obj_request->flags) != 0; 11865679c59fSAlex Elder } 11875679c59fSAlex Elder 1188bf0d5f50SAlex Elder static void rbd_obj_request_get(struct rbd_obj_request *obj_request) 1189bf0d5f50SAlex Elder { 119037206ee5SAlex Elder dout("%s: obj %p (was %d)\n", __func__, obj_request, 119137206ee5SAlex Elder atomic_read(&obj_request->kref.refcount)); 1192bf0d5f50SAlex Elder kref_get(&obj_request->kref); 1193bf0d5f50SAlex Elder } 1194bf0d5f50SAlex Elder 1195bf0d5f50SAlex Elder static void rbd_obj_request_destroy(struct kref *kref); 1196bf0d5f50SAlex Elder static void rbd_obj_request_put(struct rbd_obj_request *obj_request) 1197bf0d5f50SAlex Elder { 1198bf0d5f50SAlex Elder rbd_assert(obj_request != NULL); 119937206ee5SAlex Elder dout("%s: obj %p (was %d)\n", __func__, obj_request, 120037206ee5SAlex Elder atomic_read(&obj_request->kref.refcount)); 1201bf0d5f50SAlex Elder kref_put(&obj_request->kref, rbd_obj_request_destroy); 1202bf0d5f50SAlex Elder } 1203bf0d5f50SAlex Elder 1204bf0d5f50SAlex Elder static void rbd_img_request_get(struct rbd_img_request *img_request) 1205bf0d5f50SAlex Elder { 120637206ee5SAlex Elder dout("%s: img %p (was %d)\n", __func__, img_request, 120737206ee5SAlex Elder atomic_read(&img_request->kref.refcount)); 1208bf0d5f50SAlex Elder kref_get(&img_request->kref); 1209bf0d5f50SAlex Elder } 1210bf0d5f50SAlex Elder 1211bf0d5f50SAlex Elder static void rbd_img_request_destroy(struct kref *kref); 1212bf0d5f50SAlex Elder static void rbd_img_request_put(struct rbd_img_request *img_request) 1213bf0d5f50SAlex Elder { 1214bf0d5f50SAlex Elder rbd_assert(img_request != NULL); 121537206ee5SAlex Elder dout("%s: img %p (was %d)\n", __func__, img_request, 121637206ee5SAlex Elder atomic_read(&img_request->kref.refcount)); 1217bf0d5f50SAlex Elder kref_put(&img_request->kref, rbd_img_request_destroy); 1218bf0d5f50SAlex Elder } 1219bf0d5f50SAlex Elder 1220bf0d5f50SAlex Elder static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request, 1221bf0d5f50SAlex Elder struct rbd_obj_request *obj_request) 1222bf0d5f50SAlex Elder { 122325dcf954SAlex Elder rbd_assert(obj_request->img_request == NULL); 122425dcf954SAlex Elder 1225b155e86cSAlex Elder /* Image request now owns object's original reference */ 1226bf0d5f50SAlex Elder obj_request->img_request = img_request; 122725dcf954SAlex Elder obj_request->which = img_request->obj_request_count; 12286365d33aSAlex Elder rbd_assert(!obj_request_img_data_test(obj_request)); 12296365d33aSAlex Elder obj_request_img_data_set(obj_request); 1230bf0d5f50SAlex Elder rbd_assert(obj_request->which != BAD_WHICH); 123125dcf954SAlex Elder img_request->obj_request_count++; 123225dcf954SAlex Elder list_add_tail(&obj_request->links, &img_request->obj_requests); 123337206ee5SAlex Elder dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request, 123437206ee5SAlex Elder obj_request->which); 1235bf0d5f50SAlex Elder } 1236bf0d5f50SAlex Elder 1237bf0d5f50SAlex Elder static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request, 1238bf0d5f50SAlex Elder struct rbd_obj_request *obj_request) 1239bf0d5f50SAlex Elder { 1240bf0d5f50SAlex Elder rbd_assert(obj_request->which != BAD_WHICH); 124125dcf954SAlex Elder 124237206ee5SAlex Elder dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request, 124337206ee5SAlex Elder obj_request->which); 1244bf0d5f50SAlex Elder list_del(&obj_request->links); 124525dcf954SAlex Elder rbd_assert(img_request->obj_request_count > 0); 124625dcf954SAlex Elder img_request->obj_request_count--; 124725dcf954SAlex Elder rbd_assert(obj_request->which == img_request->obj_request_count); 124825dcf954SAlex Elder obj_request->which = BAD_WHICH; 12496365d33aSAlex Elder rbd_assert(obj_request_img_data_test(obj_request)); 1250bf0d5f50SAlex Elder rbd_assert(obj_request->img_request == img_request); 1251bf0d5f50SAlex Elder obj_request->img_request = NULL; 125225dcf954SAlex Elder obj_request->callback = NULL; 1253bf0d5f50SAlex Elder rbd_obj_request_put(obj_request); 1254bf0d5f50SAlex Elder } 1255bf0d5f50SAlex Elder 1256bf0d5f50SAlex Elder static bool obj_request_type_valid(enum obj_request_type type) 1257bf0d5f50SAlex Elder { 1258bf0d5f50SAlex Elder switch (type) { 12599969ebc5SAlex Elder case OBJ_REQUEST_NODATA: 1260bf0d5f50SAlex Elder case OBJ_REQUEST_BIO: 1261788e2df3SAlex Elder case OBJ_REQUEST_PAGES: 1262bf0d5f50SAlex Elder return true; 1263bf0d5f50SAlex Elder default: 1264bf0d5f50SAlex Elder return false; 1265bf0d5f50SAlex Elder } 1266bf0d5f50SAlex Elder } 1267bf0d5f50SAlex Elder 1268bf0d5f50SAlex Elder static int rbd_obj_request_submit(struct ceph_osd_client *osdc, 1269bf0d5f50SAlex Elder struct rbd_obj_request *obj_request) 1270bf0d5f50SAlex Elder { 127137206ee5SAlex Elder dout("%s: osdc %p obj %p\n", __func__, osdc, obj_request); 127237206ee5SAlex Elder 1273bf0d5f50SAlex Elder return ceph_osdc_start_request(osdc, obj_request->osd_req, false); 1274bf0d5f50SAlex Elder } 1275bf0d5f50SAlex Elder 1276bf0d5f50SAlex Elder static void rbd_img_request_complete(struct rbd_img_request *img_request) 1277bf0d5f50SAlex Elder { 127855f27e09SAlex Elder 127937206ee5SAlex Elder dout("%s: img %p\n", __func__, img_request); 128055f27e09SAlex Elder 128155f27e09SAlex Elder /* 128255f27e09SAlex Elder * If no error occurred, compute the aggregate transfer 128355f27e09SAlex Elder * count for the image request. We could instead use 128455f27e09SAlex Elder * atomic64_cmpxchg() to update it as each object request 128555f27e09SAlex Elder * completes; not clear which way is better off hand. 128655f27e09SAlex Elder */ 128755f27e09SAlex Elder if (!img_request->result) { 128855f27e09SAlex Elder struct rbd_obj_request *obj_request; 128955f27e09SAlex Elder u64 xferred = 0; 129055f27e09SAlex Elder 129155f27e09SAlex Elder for_each_obj_request(img_request, obj_request) 129255f27e09SAlex Elder xferred += obj_request->xferred; 129355f27e09SAlex Elder img_request->xferred = xferred; 129455f27e09SAlex Elder } 129555f27e09SAlex Elder 1296bf0d5f50SAlex Elder if (img_request->callback) 1297bf0d5f50SAlex Elder img_request->callback(img_request); 1298bf0d5f50SAlex Elder else 1299bf0d5f50SAlex Elder rbd_img_request_put(img_request); 1300bf0d5f50SAlex Elder } 1301bf0d5f50SAlex Elder 1302788e2df3SAlex Elder /* Caller is responsible for rbd_obj_request_destroy(obj_request) */ 1303788e2df3SAlex Elder 1304788e2df3SAlex Elder static int rbd_obj_request_wait(struct rbd_obj_request *obj_request) 1305788e2df3SAlex Elder { 130637206ee5SAlex Elder dout("%s: obj %p\n", __func__, obj_request); 130737206ee5SAlex Elder 1308788e2df3SAlex Elder return wait_for_completion_interruptible(&obj_request->completion); 1309788e2df3SAlex Elder } 1310788e2df3SAlex Elder 13110c425248SAlex Elder /* 13120c425248SAlex Elder * The default/initial value for all image request flags is 0. Each 13130c425248SAlex Elder * is conditionally set to 1 at image request initialization time 13140c425248SAlex Elder * and currently never change thereafter. 13150c425248SAlex Elder */ 13160c425248SAlex Elder static void img_request_write_set(struct rbd_img_request *img_request) 13170c425248SAlex Elder { 13180c425248SAlex Elder set_bit(IMG_REQ_WRITE, &img_request->flags); 13190c425248SAlex Elder smp_mb(); 13200c425248SAlex Elder } 13210c425248SAlex Elder 13220c425248SAlex Elder static bool img_request_write_test(struct rbd_img_request *img_request) 13230c425248SAlex Elder { 13240c425248SAlex Elder smp_mb(); 13250c425248SAlex Elder return test_bit(IMG_REQ_WRITE, &img_request->flags) != 0; 13260c425248SAlex Elder } 13270c425248SAlex Elder 13289849e986SAlex Elder static void img_request_child_set(struct rbd_img_request *img_request) 13299849e986SAlex Elder { 13309849e986SAlex Elder set_bit(IMG_REQ_CHILD, &img_request->flags); 13319849e986SAlex Elder smp_mb(); 13329849e986SAlex Elder } 13339849e986SAlex Elder 13349849e986SAlex Elder static bool img_request_child_test(struct rbd_img_request *img_request) 13359849e986SAlex Elder { 13369849e986SAlex Elder smp_mb(); 13379849e986SAlex Elder return test_bit(IMG_REQ_CHILD, &img_request->flags) != 0; 13389849e986SAlex Elder } 13399849e986SAlex Elder 1340d0b2e944SAlex Elder static void img_request_layered_set(struct rbd_img_request *img_request) 1341d0b2e944SAlex Elder { 1342d0b2e944SAlex Elder set_bit(IMG_REQ_LAYERED, &img_request->flags); 1343d0b2e944SAlex Elder smp_mb(); 1344d0b2e944SAlex Elder } 1345d0b2e944SAlex Elder 1346d0b2e944SAlex Elder static bool img_request_layered_test(struct rbd_img_request *img_request) 1347d0b2e944SAlex Elder { 1348d0b2e944SAlex Elder smp_mb(); 1349d0b2e944SAlex Elder return test_bit(IMG_REQ_LAYERED, &img_request->flags) != 0; 1350d0b2e944SAlex Elder } 1351d0b2e944SAlex Elder 13526e2a4505SAlex Elder static void 13536e2a4505SAlex Elder rbd_img_obj_request_read_callback(struct rbd_obj_request *obj_request) 13546e2a4505SAlex Elder { 13556e2a4505SAlex Elder dout("%s: obj %p img %p result %d %llu/%llu\n", __func__, 13566e2a4505SAlex Elder obj_request, obj_request->img_request, obj_request->result, 13576e2a4505SAlex Elder obj_request->xferred, obj_request->length); 13586e2a4505SAlex Elder /* 13596e2a4505SAlex Elder * ENOENT means a hole in the image. We zero-fill the 13606e2a4505SAlex Elder * entire length of the request. A short read also implies 13616e2a4505SAlex Elder * zero-fill to the end of the request. Either way we 13626e2a4505SAlex Elder * update the xferred count to indicate the whole request 13636e2a4505SAlex Elder * was satisfied. 13646e2a4505SAlex Elder */ 13656e2a4505SAlex Elder BUG_ON(obj_request->type != OBJ_REQUEST_BIO); 13666e2a4505SAlex Elder if (obj_request->result == -ENOENT) { 13676e2a4505SAlex Elder zero_bio_chain(obj_request->bio_list, 0); 13686e2a4505SAlex Elder obj_request->result = 0; 13696e2a4505SAlex Elder obj_request->xferred = obj_request->length; 13706e2a4505SAlex Elder } else if (obj_request->xferred < obj_request->length && 13716e2a4505SAlex Elder !obj_request->result) { 13726e2a4505SAlex Elder zero_bio_chain(obj_request->bio_list, obj_request->xferred); 13736e2a4505SAlex Elder obj_request->xferred = obj_request->length; 13746e2a4505SAlex Elder } 13756e2a4505SAlex Elder obj_request_done_set(obj_request); 13766e2a4505SAlex Elder } 13776e2a4505SAlex Elder 1378bf0d5f50SAlex Elder static void rbd_obj_request_complete(struct rbd_obj_request *obj_request) 1379bf0d5f50SAlex Elder { 138037206ee5SAlex Elder dout("%s: obj %p cb %p\n", __func__, obj_request, 138137206ee5SAlex Elder obj_request->callback); 1382bf0d5f50SAlex Elder if (obj_request->callback) 1383bf0d5f50SAlex Elder obj_request->callback(obj_request); 1384788e2df3SAlex Elder else 1385788e2df3SAlex Elder complete_all(&obj_request->completion); 1386bf0d5f50SAlex Elder } 1387bf0d5f50SAlex Elder 1388c47f9371SAlex Elder static void rbd_osd_trivial_callback(struct rbd_obj_request *obj_request) 138939bf2c5dSAlex Elder { 139039bf2c5dSAlex Elder dout("%s: obj %p\n", __func__, obj_request); 139139bf2c5dSAlex Elder obj_request_done_set(obj_request); 139239bf2c5dSAlex Elder } 139339bf2c5dSAlex Elder 1394c47f9371SAlex Elder static void rbd_osd_read_callback(struct rbd_obj_request *obj_request) 1395bf0d5f50SAlex Elder { 139657acbaa7SAlex Elder struct rbd_img_request *img_request = NULL; 139757acbaa7SAlex Elder bool layered = false; 139857acbaa7SAlex Elder 139957acbaa7SAlex Elder if (obj_request_img_data_test(obj_request)) { 140057acbaa7SAlex Elder img_request = obj_request->img_request; 140157acbaa7SAlex Elder layered = img_request && img_request_layered_test(img_request); 140257acbaa7SAlex Elder } else { 140357acbaa7SAlex Elder img_request = NULL; 140457acbaa7SAlex Elder layered = false; 140557acbaa7SAlex Elder } 14068b3e1a56SAlex Elder 14078b3e1a56SAlex Elder dout("%s: obj %p img %p result %d %llu/%llu\n", __func__, 14088b3e1a56SAlex Elder obj_request, img_request, obj_request->result, 14098b3e1a56SAlex Elder obj_request->xferred, obj_request->length); 14108b3e1a56SAlex Elder if (layered && obj_request->result == -ENOENT) 14118b3e1a56SAlex Elder rbd_img_parent_read(obj_request); 14128b3e1a56SAlex Elder else if (img_request) 14136e2a4505SAlex Elder rbd_img_obj_request_read_callback(obj_request); 14146e2a4505SAlex Elder else 141507741308SAlex Elder obj_request_done_set(obj_request); 1416bf0d5f50SAlex Elder } 1417bf0d5f50SAlex Elder 1418c47f9371SAlex Elder static void rbd_osd_write_callback(struct rbd_obj_request *obj_request) 1419bf0d5f50SAlex Elder { 14201b83bef2SSage Weil dout("%s: obj %p result %d %llu\n", __func__, obj_request, 14211b83bef2SSage Weil obj_request->result, obj_request->length); 14221b83bef2SSage Weil /* 14238b3e1a56SAlex Elder * There is no such thing as a successful short write. Set 14248b3e1a56SAlex Elder * it to our originally-requested length. 14251b83bef2SSage Weil */ 14261b83bef2SSage Weil obj_request->xferred = obj_request->length; 142707741308SAlex Elder obj_request_done_set(obj_request); 1428bf0d5f50SAlex Elder } 1429bf0d5f50SAlex Elder 1430fbfab539SAlex Elder /* 1431fbfab539SAlex Elder * For a simple stat call there's nothing to do. We'll do more if 1432fbfab539SAlex Elder * this is part of a write sequence for a layered image. 1433fbfab539SAlex Elder */ 1434c47f9371SAlex Elder static void rbd_osd_stat_callback(struct rbd_obj_request *obj_request) 1435fbfab539SAlex Elder { 143637206ee5SAlex Elder dout("%s: obj %p\n", __func__, obj_request); 1437fbfab539SAlex Elder obj_request_done_set(obj_request); 1438fbfab539SAlex Elder } 1439fbfab539SAlex Elder 1440bf0d5f50SAlex Elder static void rbd_osd_req_callback(struct ceph_osd_request *osd_req, 1441bf0d5f50SAlex Elder struct ceph_msg *msg) 1442bf0d5f50SAlex Elder { 1443bf0d5f50SAlex Elder struct rbd_obj_request *obj_request = osd_req->r_priv; 1444bf0d5f50SAlex Elder u16 opcode; 1445bf0d5f50SAlex Elder 144637206ee5SAlex Elder dout("%s: osd_req %p msg %p\n", __func__, osd_req, msg); 1447bf0d5f50SAlex Elder rbd_assert(osd_req == obj_request->osd_req); 144857acbaa7SAlex Elder if (obj_request_img_data_test(obj_request)) { 144957acbaa7SAlex Elder rbd_assert(obj_request->img_request); 145057acbaa7SAlex Elder rbd_assert(obj_request->which != BAD_WHICH); 145157acbaa7SAlex Elder } else { 145257acbaa7SAlex Elder rbd_assert(obj_request->which == BAD_WHICH); 145357acbaa7SAlex Elder } 1454bf0d5f50SAlex Elder 14551b83bef2SSage Weil if (osd_req->r_result < 0) 14561b83bef2SSage Weil obj_request->result = osd_req->r_result; 1457bf0d5f50SAlex Elder obj_request->version = le64_to_cpu(osd_req->r_reassert_version.version); 1458bf0d5f50SAlex Elder 14591b83bef2SSage Weil WARN_ON(osd_req->r_num_ops != 1); /* For now */ 1460bf0d5f50SAlex Elder 1461c47f9371SAlex Elder /* 1462c47f9371SAlex Elder * We support a 64-bit length, but ultimately it has to be 1463c47f9371SAlex Elder * passed to blk_end_request(), which takes an unsigned int. 1464c47f9371SAlex Elder */ 14651b83bef2SSage Weil obj_request->xferred = osd_req->r_reply_op_len[0]; 1466c47f9371SAlex Elder rbd_assert(obj_request->xferred < (u64)UINT_MAX); 146779528734SAlex Elder opcode = osd_req->r_ops[0].op; 1468bf0d5f50SAlex Elder switch (opcode) { 1469bf0d5f50SAlex Elder case CEPH_OSD_OP_READ: 1470c47f9371SAlex Elder rbd_osd_read_callback(obj_request); 1471bf0d5f50SAlex Elder break; 1472bf0d5f50SAlex Elder case CEPH_OSD_OP_WRITE: 1473c47f9371SAlex Elder rbd_osd_write_callback(obj_request); 1474bf0d5f50SAlex Elder break; 1475fbfab539SAlex Elder case CEPH_OSD_OP_STAT: 1476c47f9371SAlex Elder rbd_osd_stat_callback(obj_request); 1477fbfab539SAlex Elder break; 147836be9a76SAlex Elder case CEPH_OSD_OP_CALL: 1479b8d70035SAlex Elder case CEPH_OSD_OP_NOTIFY_ACK: 14809969ebc5SAlex Elder case CEPH_OSD_OP_WATCH: 1481c47f9371SAlex Elder rbd_osd_trivial_callback(obj_request); 14829969ebc5SAlex Elder break; 1483bf0d5f50SAlex Elder default: 1484bf0d5f50SAlex Elder rbd_warn(NULL, "%s: unsupported op %hu\n", 1485bf0d5f50SAlex Elder obj_request->object_name, (unsigned short) opcode); 1486bf0d5f50SAlex Elder break; 1487bf0d5f50SAlex Elder } 1488bf0d5f50SAlex Elder 148907741308SAlex Elder if (obj_request_done_test(obj_request)) 1490bf0d5f50SAlex Elder rbd_obj_request_complete(obj_request); 1491bf0d5f50SAlex Elder } 1492bf0d5f50SAlex Elder 14939d4df01fSAlex Elder static void rbd_osd_req_format_read(struct rbd_obj_request *obj_request) 1494430c28c3SAlex Elder { 1495430c28c3SAlex Elder struct rbd_img_request *img_request = obj_request->img_request; 14968c042b0dSAlex Elder struct ceph_osd_request *osd_req = obj_request->osd_req; 14979d4df01fSAlex Elder u64 snap_id; 1498430c28c3SAlex Elder 14998c042b0dSAlex Elder rbd_assert(osd_req != NULL); 1500430c28c3SAlex Elder 15019d4df01fSAlex Elder snap_id = img_request ? img_request->snap_id : CEPH_NOSNAP; 15028c042b0dSAlex Elder ceph_osdc_build_request(osd_req, obj_request->offset, 15039d4df01fSAlex Elder NULL, snap_id, NULL); 15049d4df01fSAlex Elder } 15059d4df01fSAlex Elder 15069d4df01fSAlex Elder static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request) 15079d4df01fSAlex Elder { 15089d4df01fSAlex Elder struct rbd_img_request *img_request = obj_request->img_request; 15099d4df01fSAlex Elder struct ceph_osd_request *osd_req = obj_request->osd_req; 15109d4df01fSAlex Elder struct ceph_snap_context *snapc; 15119d4df01fSAlex Elder struct timespec mtime = CURRENT_TIME; 15129d4df01fSAlex Elder 15139d4df01fSAlex Elder rbd_assert(osd_req != NULL); 15149d4df01fSAlex Elder 15159d4df01fSAlex Elder snapc = img_request ? img_request->snapc : NULL; 15169d4df01fSAlex Elder ceph_osdc_build_request(osd_req, obj_request->offset, 15179d4df01fSAlex Elder snapc, CEPH_NOSNAP, &mtime); 1518430c28c3SAlex Elder } 1519430c28c3SAlex Elder 1520bf0d5f50SAlex Elder static struct ceph_osd_request *rbd_osd_req_create( 1521bf0d5f50SAlex Elder struct rbd_device *rbd_dev, 1522bf0d5f50SAlex Elder bool write_request, 1523430c28c3SAlex Elder struct rbd_obj_request *obj_request) 1524bf0d5f50SAlex Elder { 1525bf0d5f50SAlex Elder struct ceph_snap_context *snapc = NULL; 1526bf0d5f50SAlex Elder struct ceph_osd_client *osdc; 1527bf0d5f50SAlex Elder struct ceph_osd_request *osd_req; 1528bf0d5f50SAlex Elder 15296365d33aSAlex Elder if (obj_request_img_data_test(obj_request)) { 15306365d33aSAlex Elder struct rbd_img_request *img_request = obj_request->img_request; 15316365d33aSAlex Elder 15320c425248SAlex Elder rbd_assert(write_request == 15330c425248SAlex Elder img_request_write_test(img_request)); 15340c425248SAlex Elder if (write_request) 1535bf0d5f50SAlex Elder snapc = img_request->snapc; 1536bf0d5f50SAlex Elder } 1537bf0d5f50SAlex Elder 1538bf0d5f50SAlex Elder /* Allocate and initialize the request, for the single op */ 1539bf0d5f50SAlex Elder 1540bf0d5f50SAlex Elder osdc = &rbd_dev->rbd_client->client->osdc; 1541bf0d5f50SAlex Elder osd_req = ceph_osdc_alloc_request(osdc, snapc, 1, false, GFP_ATOMIC); 1542bf0d5f50SAlex Elder if (!osd_req) 1543bf0d5f50SAlex Elder return NULL; /* ENOMEM */ 1544bf0d5f50SAlex Elder 1545430c28c3SAlex Elder if (write_request) 1546bf0d5f50SAlex Elder osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK; 1547430c28c3SAlex Elder else 1548bf0d5f50SAlex Elder osd_req->r_flags = CEPH_OSD_FLAG_READ; 1549bf0d5f50SAlex Elder 1550bf0d5f50SAlex Elder osd_req->r_callback = rbd_osd_req_callback; 1551bf0d5f50SAlex Elder osd_req->r_priv = obj_request; 1552bf0d5f50SAlex Elder 1553bf0d5f50SAlex Elder osd_req->r_oid_len = strlen(obj_request->object_name); 1554bf0d5f50SAlex Elder rbd_assert(osd_req->r_oid_len < sizeof (osd_req->r_oid)); 1555bf0d5f50SAlex Elder memcpy(osd_req->r_oid, obj_request->object_name, osd_req->r_oid_len); 1556bf0d5f50SAlex Elder 1557bf0d5f50SAlex Elder osd_req->r_file_layout = rbd_dev->layout; /* struct */ 1558bf0d5f50SAlex Elder 1559bf0d5f50SAlex Elder return osd_req; 1560bf0d5f50SAlex Elder } 1561bf0d5f50SAlex Elder 1562bf0d5f50SAlex Elder static void rbd_osd_req_destroy(struct ceph_osd_request *osd_req) 1563bf0d5f50SAlex Elder { 1564bf0d5f50SAlex Elder ceph_osdc_put_request(osd_req); 1565bf0d5f50SAlex Elder } 1566bf0d5f50SAlex Elder 1567bf0d5f50SAlex Elder /* object_name is assumed to be a non-null pointer and NUL-terminated */ 1568bf0d5f50SAlex Elder 1569bf0d5f50SAlex Elder static struct rbd_obj_request *rbd_obj_request_create(const char *object_name, 1570bf0d5f50SAlex Elder u64 offset, u64 length, 1571bf0d5f50SAlex Elder enum obj_request_type type) 1572bf0d5f50SAlex Elder { 1573bf0d5f50SAlex Elder struct rbd_obj_request *obj_request; 1574bf0d5f50SAlex Elder size_t size; 1575bf0d5f50SAlex Elder char *name; 1576bf0d5f50SAlex Elder 1577bf0d5f50SAlex Elder rbd_assert(obj_request_type_valid(type)); 1578bf0d5f50SAlex Elder 1579bf0d5f50SAlex Elder size = strlen(object_name) + 1; 1580bf0d5f50SAlex Elder obj_request = kzalloc(sizeof (*obj_request) + size, GFP_KERNEL); 1581bf0d5f50SAlex Elder if (!obj_request) 1582bf0d5f50SAlex Elder return NULL; 1583bf0d5f50SAlex Elder 1584bf0d5f50SAlex Elder name = (char *)(obj_request + 1); 1585bf0d5f50SAlex Elder obj_request->object_name = memcpy(name, object_name, size); 1586bf0d5f50SAlex Elder obj_request->offset = offset; 1587bf0d5f50SAlex Elder obj_request->length = length; 1588926f9b3fSAlex Elder obj_request->flags = 0; 1589bf0d5f50SAlex Elder obj_request->which = BAD_WHICH; 1590bf0d5f50SAlex Elder obj_request->type = type; 1591bf0d5f50SAlex Elder INIT_LIST_HEAD(&obj_request->links); 1592788e2df3SAlex Elder init_completion(&obj_request->completion); 1593bf0d5f50SAlex Elder kref_init(&obj_request->kref); 1594bf0d5f50SAlex Elder 159537206ee5SAlex Elder dout("%s: \"%s\" %llu/%llu %d -> obj %p\n", __func__, object_name, 159637206ee5SAlex Elder offset, length, (int)type, obj_request); 159737206ee5SAlex Elder 1598bf0d5f50SAlex Elder return obj_request; 1599bf0d5f50SAlex Elder } 1600bf0d5f50SAlex Elder 1601bf0d5f50SAlex Elder static void rbd_obj_request_destroy(struct kref *kref) 1602bf0d5f50SAlex Elder { 1603bf0d5f50SAlex Elder struct rbd_obj_request *obj_request; 1604bf0d5f50SAlex Elder 1605bf0d5f50SAlex Elder obj_request = container_of(kref, struct rbd_obj_request, kref); 1606bf0d5f50SAlex Elder 160737206ee5SAlex Elder dout("%s: obj %p\n", __func__, obj_request); 160837206ee5SAlex Elder 1609bf0d5f50SAlex Elder rbd_assert(obj_request->img_request == NULL); 1610bf0d5f50SAlex Elder rbd_assert(obj_request->which == BAD_WHICH); 1611bf0d5f50SAlex Elder 1612bf0d5f50SAlex Elder if (obj_request->osd_req) 1613bf0d5f50SAlex Elder rbd_osd_req_destroy(obj_request->osd_req); 1614bf0d5f50SAlex Elder 1615bf0d5f50SAlex Elder rbd_assert(obj_request_type_valid(obj_request->type)); 1616bf0d5f50SAlex Elder switch (obj_request->type) { 16179969ebc5SAlex Elder case OBJ_REQUEST_NODATA: 16189969ebc5SAlex Elder break; /* Nothing to do */ 1619bf0d5f50SAlex Elder case OBJ_REQUEST_BIO: 1620bf0d5f50SAlex Elder if (obj_request->bio_list) 1621bf0d5f50SAlex Elder bio_chain_put(obj_request->bio_list); 1622bf0d5f50SAlex Elder break; 1623788e2df3SAlex Elder case OBJ_REQUEST_PAGES: 1624788e2df3SAlex Elder if (obj_request->pages) 1625788e2df3SAlex Elder ceph_release_page_vector(obj_request->pages, 1626788e2df3SAlex Elder obj_request->page_count); 1627788e2df3SAlex Elder break; 1628bf0d5f50SAlex Elder } 1629bf0d5f50SAlex Elder 1630bf0d5f50SAlex Elder kfree(obj_request); 1631bf0d5f50SAlex Elder } 1632bf0d5f50SAlex Elder 1633bf0d5f50SAlex Elder /* 1634bf0d5f50SAlex Elder * Caller is responsible for filling in the list of object requests 1635bf0d5f50SAlex Elder * that comprises the image request, and the Linux request pointer 1636bf0d5f50SAlex Elder * (if there is one). 1637bf0d5f50SAlex Elder */ 1638cc344fa1SAlex Elder static struct rbd_img_request *rbd_img_request_create( 1639cc344fa1SAlex Elder struct rbd_device *rbd_dev, 1640bf0d5f50SAlex Elder u64 offset, u64 length, 16419849e986SAlex Elder bool write_request, 16429849e986SAlex Elder bool child_request) 1643bf0d5f50SAlex Elder { 1644bf0d5f50SAlex Elder struct rbd_img_request *img_request; 1645bf0d5f50SAlex Elder struct ceph_snap_context *snapc = NULL; 1646bf0d5f50SAlex Elder 1647bf0d5f50SAlex Elder img_request = kmalloc(sizeof (*img_request), GFP_ATOMIC); 1648bf0d5f50SAlex Elder if (!img_request) 1649bf0d5f50SAlex Elder return NULL; 1650bf0d5f50SAlex Elder 1651bf0d5f50SAlex Elder if (write_request) { 1652bf0d5f50SAlex Elder down_read(&rbd_dev->header_rwsem); 1653bf0d5f50SAlex Elder snapc = ceph_get_snap_context(rbd_dev->header.snapc); 1654bf0d5f50SAlex Elder up_read(&rbd_dev->header_rwsem); 1655bf0d5f50SAlex Elder if (WARN_ON(!snapc)) { 1656bf0d5f50SAlex Elder kfree(img_request); 1657bf0d5f50SAlex Elder return NULL; /* Shouldn't happen */ 1658bf0d5f50SAlex Elder } 16590c425248SAlex Elder 1660bf0d5f50SAlex Elder } 1661bf0d5f50SAlex Elder 1662bf0d5f50SAlex Elder img_request->rq = NULL; 1663bf0d5f50SAlex Elder img_request->rbd_dev = rbd_dev; 1664bf0d5f50SAlex Elder img_request->offset = offset; 1665bf0d5f50SAlex Elder img_request->length = length; 16660c425248SAlex Elder img_request->flags = 0; 16670c425248SAlex Elder if (write_request) { 16680c425248SAlex Elder img_request_write_set(img_request); 1669bf0d5f50SAlex Elder img_request->snapc = snapc; 16700c425248SAlex Elder } else { 1671bf0d5f50SAlex Elder img_request->snap_id = rbd_dev->spec->snap_id; 16720c425248SAlex Elder } 16739849e986SAlex Elder if (child_request) 16749849e986SAlex Elder img_request_child_set(img_request); 1675d0b2e944SAlex Elder if (rbd_dev->parent_spec) 1676d0b2e944SAlex Elder img_request_layered_set(img_request); 1677bf0d5f50SAlex Elder spin_lock_init(&img_request->completion_lock); 1678bf0d5f50SAlex Elder img_request->next_completion = 0; 1679bf0d5f50SAlex Elder img_request->callback = NULL; 1680a5a337d4SAlex Elder img_request->result = 0; 1681bf0d5f50SAlex Elder img_request->obj_request_count = 0; 1682bf0d5f50SAlex Elder INIT_LIST_HEAD(&img_request->obj_requests); 1683bf0d5f50SAlex Elder kref_init(&img_request->kref); 1684bf0d5f50SAlex Elder 1685bf0d5f50SAlex Elder rbd_img_request_get(img_request); /* Avoid a warning */ 1686bf0d5f50SAlex Elder rbd_img_request_put(img_request); /* TEMPORARY */ 1687bf0d5f50SAlex Elder 168837206ee5SAlex Elder dout("%s: rbd_dev %p %s %llu/%llu -> img %p\n", __func__, rbd_dev, 168937206ee5SAlex Elder write_request ? "write" : "read", offset, length, 169037206ee5SAlex Elder img_request); 169137206ee5SAlex Elder 1692bf0d5f50SAlex Elder return img_request; 1693bf0d5f50SAlex Elder } 1694bf0d5f50SAlex Elder 1695bf0d5f50SAlex Elder static void rbd_img_request_destroy(struct kref *kref) 1696bf0d5f50SAlex Elder { 1697bf0d5f50SAlex Elder struct rbd_img_request *img_request; 1698bf0d5f50SAlex Elder struct rbd_obj_request *obj_request; 1699bf0d5f50SAlex Elder struct rbd_obj_request *next_obj_request; 1700bf0d5f50SAlex Elder 1701bf0d5f50SAlex Elder img_request = container_of(kref, struct rbd_img_request, kref); 1702bf0d5f50SAlex Elder 170337206ee5SAlex Elder dout("%s: img %p\n", __func__, img_request); 170437206ee5SAlex Elder 1705bf0d5f50SAlex Elder for_each_obj_request_safe(img_request, obj_request, next_obj_request) 1706bf0d5f50SAlex Elder rbd_img_obj_request_del(img_request, obj_request); 170725dcf954SAlex Elder rbd_assert(img_request->obj_request_count == 0); 1708bf0d5f50SAlex Elder 17090c425248SAlex Elder if (img_request_write_test(img_request)) 1710bf0d5f50SAlex Elder ceph_put_snap_context(img_request->snapc); 1711bf0d5f50SAlex Elder 17128b3e1a56SAlex Elder if (img_request_child_test(img_request)) 17138b3e1a56SAlex Elder rbd_obj_request_put(img_request->obj_request); 17148b3e1a56SAlex Elder 1715bf0d5f50SAlex Elder kfree(img_request); 1716bf0d5f50SAlex Elder } 1717bf0d5f50SAlex Elder 17181217857fSAlex Elder static bool rbd_img_obj_end_request(struct rbd_obj_request *obj_request) 17191217857fSAlex Elder { 17206365d33aSAlex Elder struct rbd_img_request *img_request; 17211217857fSAlex Elder unsigned int xferred; 17221217857fSAlex Elder int result; 17238b3e1a56SAlex Elder bool more; 17241217857fSAlex Elder 17256365d33aSAlex Elder rbd_assert(obj_request_img_data_test(obj_request)); 17266365d33aSAlex Elder img_request = obj_request->img_request; 17276365d33aSAlex Elder 17281217857fSAlex Elder rbd_assert(obj_request->xferred <= (u64)UINT_MAX); 17291217857fSAlex Elder xferred = (unsigned int)obj_request->xferred; 17301217857fSAlex Elder result = obj_request->result; 17311217857fSAlex Elder if (result) { 17321217857fSAlex Elder struct rbd_device *rbd_dev = img_request->rbd_dev; 17331217857fSAlex Elder 17341217857fSAlex Elder rbd_warn(rbd_dev, "%s %llx at %llx (%llx)\n", 17351217857fSAlex Elder img_request_write_test(img_request) ? "write" : "read", 17361217857fSAlex Elder obj_request->length, obj_request->img_offset, 17371217857fSAlex Elder obj_request->offset); 17381217857fSAlex Elder rbd_warn(rbd_dev, " result %d xferred %x\n", 17391217857fSAlex Elder result, xferred); 17401217857fSAlex Elder if (!img_request->result) 17411217857fSAlex Elder img_request->result = result; 17421217857fSAlex Elder } 17431217857fSAlex Elder 17448b3e1a56SAlex Elder if (img_request_child_test(img_request)) { 17458b3e1a56SAlex Elder rbd_assert(img_request->obj_request != NULL); 17468b3e1a56SAlex Elder more = obj_request->which < img_request->obj_request_count - 1; 17478b3e1a56SAlex Elder } else { 17488b3e1a56SAlex Elder rbd_assert(img_request->rq != NULL); 17498b3e1a56SAlex Elder more = blk_end_request(img_request->rq, result, xferred); 17508b3e1a56SAlex Elder } 17518b3e1a56SAlex Elder 17528b3e1a56SAlex Elder return more; 17531217857fSAlex Elder } 17541217857fSAlex Elder 17552169238dSAlex Elder static void rbd_img_obj_callback(struct rbd_obj_request *obj_request) 17562169238dSAlex Elder { 17572169238dSAlex Elder struct rbd_img_request *img_request; 17582169238dSAlex Elder u32 which = obj_request->which; 17592169238dSAlex Elder bool more = true; 17602169238dSAlex Elder 17616365d33aSAlex Elder rbd_assert(obj_request_img_data_test(obj_request)); 17622169238dSAlex Elder img_request = obj_request->img_request; 17632169238dSAlex Elder 17642169238dSAlex Elder dout("%s: img %p obj %p\n", __func__, img_request, obj_request); 17652169238dSAlex Elder rbd_assert(img_request != NULL); 17662169238dSAlex Elder rbd_assert(img_request->obj_request_count > 0); 17672169238dSAlex Elder rbd_assert(which != BAD_WHICH); 17682169238dSAlex Elder rbd_assert(which < img_request->obj_request_count); 17692169238dSAlex Elder rbd_assert(which >= img_request->next_completion); 17702169238dSAlex Elder 17712169238dSAlex Elder spin_lock_irq(&img_request->completion_lock); 17722169238dSAlex Elder if (which != img_request->next_completion) 17732169238dSAlex Elder goto out; 17742169238dSAlex Elder 17752169238dSAlex Elder for_each_obj_request_from(img_request, obj_request) { 17762169238dSAlex Elder rbd_assert(more); 17772169238dSAlex Elder rbd_assert(which < img_request->obj_request_count); 17782169238dSAlex Elder 17792169238dSAlex Elder if (!obj_request_done_test(obj_request)) 17802169238dSAlex Elder break; 17811217857fSAlex Elder more = rbd_img_obj_end_request(obj_request); 17822169238dSAlex Elder which++; 17832169238dSAlex Elder } 17842169238dSAlex Elder 17852169238dSAlex Elder rbd_assert(more ^ (which == img_request->obj_request_count)); 17862169238dSAlex Elder img_request->next_completion = which; 17872169238dSAlex Elder out: 17882169238dSAlex Elder spin_unlock_irq(&img_request->completion_lock); 17892169238dSAlex Elder 17902169238dSAlex Elder if (!more) 17912169238dSAlex Elder rbd_img_request_complete(img_request); 17922169238dSAlex Elder } 17932169238dSAlex Elder 1794bf0d5f50SAlex Elder static int rbd_img_request_fill_bio(struct rbd_img_request *img_request, 1795bf0d5f50SAlex Elder struct bio *bio_list) 1796bf0d5f50SAlex Elder { 1797bf0d5f50SAlex Elder struct rbd_device *rbd_dev = img_request->rbd_dev; 1798bf0d5f50SAlex Elder struct rbd_obj_request *obj_request = NULL; 1799bf0d5f50SAlex Elder struct rbd_obj_request *next_obj_request; 18000c425248SAlex Elder bool write_request = img_request_write_test(img_request); 1801bf0d5f50SAlex Elder unsigned int bio_offset; 18027da22d29SAlex Elder u64 img_offset; 1803bf0d5f50SAlex Elder u64 resid; 1804bf0d5f50SAlex Elder u16 opcode; 1805bf0d5f50SAlex Elder 180637206ee5SAlex Elder dout("%s: img %p bio %p\n", __func__, img_request, bio_list); 180737206ee5SAlex Elder 1808430c28c3SAlex Elder opcode = write_request ? CEPH_OSD_OP_WRITE : CEPH_OSD_OP_READ; 1809bf0d5f50SAlex Elder bio_offset = 0; 18107da22d29SAlex Elder img_offset = img_request->offset; 18117da22d29SAlex Elder rbd_assert(img_offset == bio_list->bi_sector << SECTOR_SHIFT); 1812bf0d5f50SAlex Elder resid = img_request->length; 18134dda41d3SAlex Elder rbd_assert(resid > 0); 1814bf0d5f50SAlex Elder while (resid) { 18152fa12320SAlex Elder struct ceph_osd_request *osd_req; 1816bf0d5f50SAlex Elder const char *object_name; 1817bf0d5f50SAlex Elder unsigned int clone_size; 1818bf0d5f50SAlex Elder u64 offset; 1819bf0d5f50SAlex Elder u64 length; 1820bf0d5f50SAlex Elder 18217da22d29SAlex Elder object_name = rbd_segment_name(rbd_dev, img_offset); 1822bf0d5f50SAlex Elder if (!object_name) 1823bf0d5f50SAlex Elder goto out_unwind; 18247da22d29SAlex Elder offset = rbd_segment_offset(rbd_dev, img_offset); 18257da22d29SAlex Elder length = rbd_segment_length(rbd_dev, img_offset, resid); 1826bf0d5f50SAlex Elder obj_request = rbd_obj_request_create(object_name, 1827bf0d5f50SAlex Elder offset, length, 1828bf0d5f50SAlex Elder OBJ_REQUEST_BIO); 1829bf0d5f50SAlex Elder kfree(object_name); /* object request has its own copy */ 1830bf0d5f50SAlex Elder if (!obj_request) 1831bf0d5f50SAlex Elder goto out_unwind; 1832bf0d5f50SAlex Elder 1833bf0d5f50SAlex Elder rbd_assert(length <= (u64) UINT_MAX); 1834bf0d5f50SAlex Elder clone_size = (unsigned int) length; 1835bf0d5f50SAlex Elder obj_request->bio_list = bio_chain_clone_range(&bio_list, 1836bf0d5f50SAlex Elder &bio_offset, clone_size, 1837bf0d5f50SAlex Elder GFP_ATOMIC); 1838bf0d5f50SAlex Elder if (!obj_request->bio_list) 1839bf0d5f50SAlex Elder goto out_partial; 1840bf0d5f50SAlex Elder 18412fa12320SAlex Elder osd_req = rbd_osd_req_create(rbd_dev, write_request, 18422fa12320SAlex Elder obj_request); 18432fa12320SAlex Elder if (!osd_req) 1844bf0d5f50SAlex Elder goto out_partial; 18452fa12320SAlex Elder obj_request->osd_req = osd_req; 18462169238dSAlex Elder obj_request->callback = rbd_img_obj_callback; 1847430c28c3SAlex Elder 18482fa12320SAlex Elder osd_req_op_extent_init(osd_req, 0, opcode, offset, length, 18492fa12320SAlex Elder 0, 0); 1850406e2c9fSAlex Elder osd_req_op_extent_osd_data_bio(osd_req, 0, 1851a4ce40a9SAlex Elder obj_request->bio_list, obj_request->length); 18529d4df01fSAlex Elder 18539d4df01fSAlex Elder if (write_request) 18549d4df01fSAlex Elder rbd_osd_req_format_write(obj_request); 18559d4df01fSAlex Elder else 18569d4df01fSAlex Elder rbd_osd_req_format_read(obj_request); 1857430c28c3SAlex Elder 18587da22d29SAlex Elder obj_request->img_offset = img_offset; 1859bf0d5f50SAlex Elder rbd_img_obj_request_add(img_request, obj_request); 1860bf0d5f50SAlex Elder 18617da22d29SAlex Elder img_offset += length; 1862bf0d5f50SAlex Elder resid -= length; 1863bf0d5f50SAlex Elder } 1864bf0d5f50SAlex Elder 1865bf0d5f50SAlex Elder return 0; 1866bf0d5f50SAlex Elder 1867bf0d5f50SAlex Elder out_partial: 1868bf0d5f50SAlex Elder rbd_obj_request_put(obj_request); 1869bf0d5f50SAlex Elder out_unwind: 1870bf0d5f50SAlex Elder for_each_obj_request_safe(img_request, obj_request, next_obj_request) 1871bf0d5f50SAlex Elder rbd_obj_request_put(obj_request); 1872bf0d5f50SAlex Elder 1873bf0d5f50SAlex Elder return -ENOMEM; 1874bf0d5f50SAlex Elder } 1875bf0d5f50SAlex Elder 1876c5b5ef6cSAlex Elder static void rbd_img_obj_exists_callback(struct rbd_obj_request *obj_request) 1877c5b5ef6cSAlex Elder { 1878c5b5ef6cSAlex Elder struct rbd_obj_request *orig_request; 1879c5b5ef6cSAlex Elder int result; 1880c5b5ef6cSAlex Elder 1881c5b5ef6cSAlex Elder rbd_assert(!obj_request_img_data_test(obj_request)); 1882c5b5ef6cSAlex Elder 1883c5b5ef6cSAlex Elder /* 1884c5b5ef6cSAlex Elder * All we need from the object request is the original 1885c5b5ef6cSAlex Elder * request and the result of the STAT op. Grab those, then 1886c5b5ef6cSAlex Elder * we're done with the request. 1887c5b5ef6cSAlex Elder */ 1888c5b5ef6cSAlex Elder orig_request = obj_request->obj_request; 1889c5b5ef6cSAlex Elder obj_request->obj_request = NULL; 1890c5b5ef6cSAlex Elder rbd_assert(orig_request); 1891c5b5ef6cSAlex Elder rbd_assert(orig_request->img_request); 1892c5b5ef6cSAlex Elder 1893c5b5ef6cSAlex Elder result = obj_request->result; 1894c5b5ef6cSAlex Elder obj_request->result = 0; 1895c5b5ef6cSAlex Elder 1896c5b5ef6cSAlex Elder dout("%s: obj %p for obj %p result %d %llu/%llu\n", __func__, 1897c5b5ef6cSAlex Elder obj_request, orig_request, result, 1898c5b5ef6cSAlex Elder obj_request->xferred, obj_request->length); 1899c5b5ef6cSAlex Elder rbd_obj_request_put(obj_request); 1900c5b5ef6cSAlex Elder 1901c5b5ef6cSAlex Elder rbd_assert(orig_request); 1902c5b5ef6cSAlex Elder rbd_assert(orig_request->img_request); 1903c5b5ef6cSAlex Elder 1904c5b5ef6cSAlex Elder /* 1905c5b5ef6cSAlex Elder * Our only purpose here is to determine whether the object 1906c5b5ef6cSAlex Elder * exists, and we don't want to treat the non-existence as 1907c5b5ef6cSAlex Elder * an error. If something else comes back, transfer the 1908c5b5ef6cSAlex Elder * error to the original request and complete it now. 1909c5b5ef6cSAlex Elder */ 1910c5b5ef6cSAlex Elder if (!result) { 1911c5b5ef6cSAlex Elder obj_request_existence_set(orig_request, true); 1912c5b5ef6cSAlex Elder } else if (result == -ENOENT) { 1913c5b5ef6cSAlex Elder obj_request_existence_set(orig_request, false); 1914c5b5ef6cSAlex Elder } else if (result) { 1915c5b5ef6cSAlex Elder orig_request->result = result; 1916c5b5ef6cSAlex Elder goto out_err; 1917c5b5ef6cSAlex Elder } 1918c5b5ef6cSAlex Elder 1919c5b5ef6cSAlex Elder /* 1920c5b5ef6cSAlex Elder * Resubmit the original request now that we have recorded 1921c5b5ef6cSAlex Elder * whether the target object exists. 1922c5b5ef6cSAlex Elder */ 1923b454e36dSAlex Elder orig_request->result = rbd_img_obj_request_submit(orig_request); 1924c5b5ef6cSAlex Elder out_err: 1925c5b5ef6cSAlex Elder if (orig_request->result) 1926c5b5ef6cSAlex Elder rbd_obj_request_complete(orig_request); 1927c5b5ef6cSAlex Elder rbd_obj_request_put(orig_request); 1928c5b5ef6cSAlex Elder } 1929c5b5ef6cSAlex Elder 1930c5b5ef6cSAlex Elder static int rbd_img_obj_exists_submit(struct rbd_obj_request *obj_request) 1931c5b5ef6cSAlex Elder { 1932c5b5ef6cSAlex Elder struct rbd_obj_request *stat_request; 1933c5b5ef6cSAlex Elder struct rbd_device *rbd_dev; 1934c5b5ef6cSAlex Elder struct ceph_osd_client *osdc; 1935c5b5ef6cSAlex Elder struct page **pages = NULL; 1936c5b5ef6cSAlex Elder u32 page_count; 1937c5b5ef6cSAlex Elder size_t size; 1938c5b5ef6cSAlex Elder int ret; 1939c5b5ef6cSAlex Elder 1940c5b5ef6cSAlex Elder /* 1941c5b5ef6cSAlex Elder * The response data for a STAT call consists of: 1942c5b5ef6cSAlex Elder * le64 length; 1943c5b5ef6cSAlex Elder * struct { 1944c5b5ef6cSAlex Elder * le32 tv_sec; 1945c5b5ef6cSAlex Elder * le32 tv_nsec; 1946c5b5ef6cSAlex Elder * } mtime; 1947c5b5ef6cSAlex Elder */ 1948c5b5ef6cSAlex Elder size = sizeof (__le64) + sizeof (__le32) + sizeof (__le32); 1949c5b5ef6cSAlex Elder page_count = (u32)calc_pages_for(0, size); 1950c5b5ef6cSAlex Elder pages = ceph_alloc_page_vector(page_count, GFP_KERNEL); 1951c5b5ef6cSAlex Elder if (IS_ERR(pages)) 1952c5b5ef6cSAlex Elder return PTR_ERR(pages); 1953c5b5ef6cSAlex Elder 1954c5b5ef6cSAlex Elder ret = -ENOMEM; 1955c5b5ef6cSAlex Elder stat_request = rbd_obj_request_create(obj_request->object_name, 0, 0, 1956c5b5ef6cSAlex Elder OBJ_REQUEST_PAGES); 1957c5b5ef6cSAlex Elder if (!stat_request) 1958c5b5ef6cSAlex Elder goto out; 1959c5b5ef6cSAlex Elder 1960c5b5ef6cSAlex Elder rbd_obj_request_get(obj_request); 1961c5b5ef6cSAlex Elder stat_request->obj_request = obj_request; 1962c5b5ef6cSAlex Elder stat_request->pages = pages; 1963c5b5ef6cSAlex Elder stat_request->page_count = page_count; 1964c5b5ef6cSAlex Elder 1965c5b5ef6cSAlex Elder rbd_assert(obj_request->img_request); 1966c5b5ef6cSAlex Elder rbd_dev = obj_request->img_request->rbd_dev; 1967c5b5ef6cSAlex Elder stat_request->osd_req = rbd_osd_req_create(rbd_dev, false, 1968c5b5ef6cSAlex Elder stat_request); 1969c5b5ef6cSAlex Elder if (!stat_request->osd_req) 1970c5b5ef6cSAlex Elder goto out; 1971c5b5ef6cSAlex Elder stat_request->callback = rbd_img_obj_exists_callback; 1972c5b5ef6cSAlex Elder 1973c5b5ef6cSAlex Elder osd_req_op_init(stat_request->osd_req, 0, CEPH_OSD_OP_STAT); 1974c5b5ef6cSAlex Elder osd_req_op_raw_data_in_pages(stat_request->osd_req, 0, pages, size, 0, 1975c5b5ef6cSAlex Elder false, false); 19769d4df01fSAlex Elder rbd_osd_req_format_read(stat_request); 1977c5b5ef6cSAlex Elder 1978c5b5ef6cSAlex Elder osdc = &rbd_dev->rbd_client->client->osdc; 1979c5b5ef6cSAlex Elder ret = rbd_obj_request_submit(osdc, stat_request); 1980c5b5ef6cSAlex Elder out: 1981c5b5ef6cSAlex Elder if (ret) 1982c5b5ef6cSAlex Elder rbd_obj_request_put(obj_request); 1983c5b5ef6cSAlex Elder 1984c5b5ef6cSAlex Elder return ret; 1985c5b5ef6cSAlex Elder } 1986c5b5ef6cSAlex Elder 1987b454e36dSAlex Elder static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request) 1988b454e36dSAlex Elder { 1989b454e36dSAlex Elder struct rbd_img_request *img_request; 1990b454e36dSAlex Elder 1991b454e36dSAlex Elder rbd_assert(obj_request_img_data_test(obj_request)); 1992b454e36dSAlex Elder 1993b454e36dSAlex Elder img_request = obj_request->img_request; 1994b454e36dSAlex Elder rbd_assert(img_request); 1995b454e36dSAlex Elder 1996b454e36dSAlex Elder /* (At the moment we don't care whether it exists or not...) */ 1997b454e36dSAlex Elder (void) obj_request_exists_test; 1998b454e36dSAlex Elder 1999b454e36dSAlex Elder /* 2000b454e36dSAlex Elder * Only layered writes need special handling. If it's not a 2001b454e36dSAlex Elder * layered write, or it is a layered write but we know the 2002b454e36dSAlex Elder * target object exists, it's no different from any other 2003b454e36dSAlex Elder * object request. 2004b454e36dSAlex Elder */ 2005b454e36dSAlex Elder if (!img_request_write_test(img_request) || 2006b454e36dSAlex Elder !img_request_layered_test(img_request) || 2007b454e36dSAlex Elder obj_request_known_test(obj_request)) { 2008b454e36dSAlex Elder 2009b454e36dSAlex Elder struct rbd_device *rbd_dev; 2010b454e36dSAlex Elder struct ceph_osd_client *osdc; 2011b454e36dSAlex Elder 2012b454e36dSAlex Elder rbd_dev = obj_request->img_request->rbd_dev; 2013b454e36dSAlex Elder osdc = &rbd_dev->rbd_client->client->osdc; 2014b454e36dSAlex Elder 2015b454e36dSAlex Elder return rbd_obj_request_submit(osdc, obj_request); 2016b454e36dSAlex Elder } 2017b454e36dSAlex Elder 2018b454e36dSAlex Elder /* 2019b454e36dSAlex Elder * It's a layered write and we don't know whether the target 2020b454e36dSAlex Elder * exists. Issue existence check; once that completes the 2021b454e36dSAlex Elder * original request will be submitted again. 2022b454e36dSAlex Elder */ 2023b454e36dSAlex Elder 2024b454e36dSAlex Elder return rbd_img_obj_exists_submit(obj_request); 2025b454e36dSAlex Elder } 2026b454e36dSAlex Elder 2027bf0d5f50SAlex Elder static int rbd_img_request_submit(struct rbd_img_request *img_request) 2028bf0d5f50SAlex Elder { 2029bf0d5f50SAlex Elder struct rbd_obj_request *obj_request; 203046faeed4SAlex Elder struct rbd_obj_request *next_obj_request; 2031bf0d5f50SAlex Elder 203237206ee5SAlex Elder dout("%s: img %p\n", __func__, img_request); 203346faeed4SAlex Elder for_each_obj_request_safe(img_request, obj_request, next_obj_request) { 2034bf0d5f50SAlex Elder int ret; 2035bf0d5f50SAlex Elder 2036b454e36dSAlex Elder ret = rbd_img_obj_request_submit(obj_request); 2037bf0d5f50SAlex Elder if (ret) 2038bf0d5f50SAlex Elder return ret; 2039bf0d5f50SAlex Elder } 2040bf0d5f50SAlex Elder 2041bf0d5f50SAlex Elder return 0; 2042bf0d5f50SAlex Elder } 2043bf0d5f50SAlex Elder 20448b3e1a56SAlex Elder static void rbd_img_parent_read_callback(struct rbd_img_request *img_request) 20458b3e1a56SAlex Elder { 20468b3e1a56SAlex Elder struct rbd_obj_request *obj_request; 20478b3e1a56SAlex Elder 20488b3e1a56SAlex Elder rbd_assert(img_request_child_test(img_request)); 20498b3e1a56SAlex Elder 20508b3e1a56SAlex Elder obj_request = img_request->obj_request; 20518b3e1a56SAlex Elder rbd_assert(obj_request != NULL); 20528b3e1a56SAlex Elder obj_request->result = img_request->result; 20538b3e1a56SAlex Elder obj_request->xferred = img_request->xferred; 20548b3e1a56SAlex Elder 20558b3e1a56SAlex Elder rbd_img_obj_request_read_callback(obj_request); 20568b3e1a56SAlex Elder rbd_obj_request_complete(obj_request); 20578b3e1a56SAlex Elder } 20588b3e1a56SAlex Elder 20598b3e1a56SAlex Elder static void rbd_img_parent_read(struct rbd_obj_request *obj_request) 20608b3e1a56SAlex Elder { 20618b3e1a56SAlex Elder struct rbd_device *rbd_dev; 20628b3e1a56SAlex Elder struct rbd_img_request *img_request; 20638b3e1a56SAlex Elder int result; 20648b3e1a56SAlex Elder 20658b3e1a56SAlex Elder rbd_assert(obj_request_img_data_test(obj_request)); 20668b3e1a56SAlex Elder rbd_assert(obj_request->img_request != NULL); 20678b3e1a56SAlex Elder rbd_assert(obj_request->result == (s32) -ENOENT); 20688b3e1a56SAlex Elder rbd_assert(obj_request->type == OBJ_REQUEST_BIO); 20698b3e1a56SAlex Elder 20708b3e1a56SAlex Elder rbd_dev = obj_request->img_request->rbd_dev; 20718b3e1a56SAlex Elder rbd_assert(rbd_dev->parent != NULL); 20728b3e1a56SAlex Elder /* rbd_read_finish(obj_request, obj_request->length); */ 20738b3e1a56SAlex Elder img_request = rbd_img_request_create(rbd_dev->parent, 20748b3e1a56SAlex Elder obj_request->img_offset, 20758b3e1a56SAlex Elder obj_request->length, 20768b3e1a56SAlex Elder false, true); 20778b3e1a56SAlex Elder result = -ENOMEM; 20788b3e1a56SAlex Elder if (!img_request) 20798b3e1a56SAlex Elder goto out_err; 20808b3e1a56SAlex Elder 20818b3e1a56SAlex Elder rbd_obj_request_get(obj_request); 20828b3e1a56SAlex Elder img_request->obj_request = obj_request; 20838b3e1a56SAlex Elder 20848b3e1a56SAlex Elder result = rbd_img_request_fill_bio(img_request, obj_request->bio_list); 20858b3e1a56SAlex Elder if (result) 20868b3e1a56SAlex Elder goto out_err; 20878b3e1a56SAlex Elder 20888b3e1a56SAlex Elder img_request->callback = rbd_img_parent_read_callback; 20898b3e1a56SAlex Elder result = rbd_img_request_submit(img_request); 20908b3e1a56SAlex Elder if (result) 20918b3e1a56SAlex Elder goto out_err; 20928b3e1a56SAlex Elder 20938b3e1a56SAlex Elder return; 20948b3e1a56SAlex Elder out_err: 20958b3e1a56SAlex Elder if (img_request) 20968b3e1a56SAlex Elder rbd_img_request_put(img_request); 20978b3e1a56SAlex Elder obj_request->result = result; 20988b3e1a56SAlex Elder obj_request->xferred = 0; 20998b3e1a56SAlex Elder obj_request_done_set(obj_request); 21008b3e1a56SAlex Elder } 21018b3e1a56SAlex Elder 2102cf81b60eSAlex Elder static int rbd_obj_notify_ack(struct rbd_device *rbd_dev, 2103b8d70035SAlex Elder u64 ver, u64 notify_id) 2104b8d70035SAlex Elder { 2105b8d70035SAlex Elder struct rbd_obj_request *obj_request; 21062169238dSAlex Elder struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 2107b8d70035SAlex Elder int ret; 2108b8d70035SAlex Elder 2109b8d70035SAlex Elder obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0, 2110b8d70035SAlex Elder OBJ_REQUEST_NODATA); 2111b8d70035SAlex Elder if (!obj_request) 2112b8d70035SAlex Elder return -ENOMEM; 2113b8d70035SAlex Elder 2114b8d70035SAlex Elder ret = -ENOMEM; 2115430c28c3SAlex Elder obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request); 2116b8d70035SAlex Elder if (!obj_request->osd_req) 2117b8d70035SAlex Elder goto out; 21182169238dSAlex Elder obj_request->callback = rbd_obj_request_put; 2119b8d70035SAlex Elder 2120c99d2d4aSAlex Elder osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_NOTIFY_ACK, 2121c99d2d4aSAlex Elder notify_id, ver, 0); 21229d4df01fSAlex Elder rbd_osd_req_format_read(obj_request); 2123430c28c3SAlex Elder 2124b8d70035SAlex Elder ret = rbd_obj_request_submit(osdc, obj_request); 2125b8d70035SAlex Elder out: 2126cf81b60eSAlex Elder if (ret) 2127b8d70035SAlex Elder rbd_obj_request_put(obj_request); 2128b8d70035SAlex Elder 2129b8d70035SAlex Elder return ret; 2130b8d70035SAlex Elder } 2131b8d70035SAlex Elder 2132b8d70035SAlex Elder static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data) 2133b8d70035SAlex Elder { 2134b8d70035SAlex Elder struct rbd_device *rbd_dev = (struct rbd_device *)data; 2135b8d70035SAlex Elder u64 hver; 2136b8d70035SAlex Elder int rc; 2137b8d70035SAlex Elder 2138b8d70035SAlex Elder if (!rbd_dev) 2139b8d70035SAlex Elder return; 2140b8d70035SAlex Elder 214137206ee5SAlex Elder dout("%s: \"%s\" notify_id %llu opcode %u\n", __func__, 2142b8d70035SAlex Elder rbd_dev->header_name, (unsigned long long) notify_id, 2143b8d70035SAlex Elder (unsigned int) opcode); 2144b8d70035SAlex Elder rc = rbd_dev_refresh(rbd_dev, &hver); 2145b8d70035SAlex Elder if (rc) 2146b8d70035SAlex Elder rbd_warn(rbd_dev, "got notification but failed to " 2147b8d70035SAlex Elder " update snaps: %d\n", rc); 2148b8d70035SAlex Elder 2149cf81b60eSAlex Elder rbd_obj_notify_ack(rbd_dev, hver, notify_id); 2150b8d70035SAlex Elder } 2151b8d70035SAlex Elder 21529969ebc5SAlex Elder /* 21539969ebc5SAlex Elder * Request sync osd watch/unwatch. The value of "start" determines 21549969ebc5SAlex Elder * whether a watch request is being initiated or torn down. 21559969ebc5SAlex Elder */ 21569969ebc5SAlex Elder static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev, int start) 21579969ebc5SAlex Elder { 21589969ebc5SAlex Elder struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 21599969ebc5SAlex Elder struct rbd_obj_request *obj_request; 21609969ebc5SAlex Elder int ret; 21619969ebc5SAlex Elder 21629969ebc5SAlex Elder rbd_assert(start ^ !!rbd_dev->watch_event); 21639969ebc5SAlex Elder rbd_assert(start ^ !!rbd_dev->watch_request); 21649969ebc5SAlex Elder 21659969ebc5SAlex Elder if (start) { 21663c663bbdSAlex Elder ret = ceph_osdc_create_event(osdc, rbd_watch_cb, rbd_dev, 21679969ebc5SAlex Elder &rbd_dev->watch_event); 21689969ebc5SAlex Elder if (ret < 0) 21699969ebc5SAlex Elder return ret; 21708eb87565SAlex Elder rbd_assert(rbd_dev->watch_event != NULL); 21719969ebc5SAlex Elder } 21729969ebc5SAlex Elder 21739969ebc5SAlex Elder ret = -ENOMEM; 21749969ebc5SAlex Elder obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0, 21759969ebc5SAlex Elder OBJ_REQUEST_NODATA); 21769969ebc5SAlex Elder if (!obj_request) 21779969ebc5SAlex Elder goto out_cancel; 21789969ebc5SAlex Elder 2179430c28c3SAlex Elder obj_request->osd_req = rbd_osd_req_create(rbd_dev, true, obj_request); 2180430c28c3SAlex Elder if (!obj_request->osd_req) 2181430c28c3SAlex Elder goto out_cancel; 2182430c28c3SAlex Elder 21838eb87565SAlex Elder if (start) 2184975241afSAlex Elder ceph_osdc_set_request_linger(osdc, obj_request->osd_req); 21858eb87565SAlex Elder else 21866977c3f9SAlex Elder ceph_osdc_unregister_linger_request(osdc, 2187975241afSAlex Elder rbd_dev->watch_request->osd_req); 21882169238dSAlex Elder 21892169238dSAlex Elder osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_WATCH, 21902169238dSAlex Elder rbd_dev->watch_event->cookie, 21912169238dSAlex Elder rbd_dev->header.obj_version, start); 21929d4df01fSAlex Elder rbd_osd_req_format_write(obj_request); 21932169238dSAlex Elder 21949969ebc5SAlex Elder ret = rbd_obj_request_submit(osdc, obj_request); 21959969ebc5SAlex Elder if (ret) 21969969ebc5SAlex Elder goto out_cancel; 21979969ebc5SAlex Elder ret = rbd_obj_request_wait(obj_request); 21989969ebc5SAlex Elder if (ret) 21999969ebc5SAlex Elder goto out_cancel; 22009969ebc5SAlex Elder ret = obj_request->result; 22019969ebc5SAlex Elder if (ret) 22029969ebc5SAlex Elder goto out_cancel; 22039969ebc5SAlex Elder 22048eb87565SAlex Elder /* 22058eb87565SAlex Elder * A watch request is set to linger, so the underlying osd 22068eb87565SAlex Elder * request won't go away until we unregister it. We retain 22078eb87565SAlex Elder * a pointer to the object request during that time (in 22088eb87565SAlex Elder * rbd_dev->watch_request), so we'll keep a reference to 22098eb87565SAlex Elder * it. We'll drop that reference (below) after we've 22108eb87565SAlex Elder * unregistered it. 22118eb87565SAlex Elder */ 22128eb87565SAlex Elder if (start) { 22138eb87565SAlex Elder rbd_dev->watch_request = obj_request; 22148eb87565SAlex Elder 22158eb87565SAlex Elder return 0; 22168eb87565SAlex Elder } 22178eb87565SAlex Elder 22188eb87565SAlex Elder /* We have successfully torn down the watch request */ 22198eb87565SAlex Elder 22208eb87565SAlex Elder rbd_obj_request_put(rbd_dev->watch_request); 22218eb87565SAlex Elder rbd_dev->watch_request = NULL; 22229969ebc5SAlex Elder out_cancel: 22239969ebc5SAlex Elder /* Cancel the event if we're tearing down, or on error */ 22249969ebc5SAlex Elder ceph_osdc_cancel_event(rbd_dev->watch_event); 22259969ebc5SAlex Elder rbd_dev->watch_event = NULL; 22269969ebc5SAlex Elder if (obj_request) 22279969ebc5SAlex Elder rbd_obj_request_put(obj_request); 22289969ebc5SAlex Elder 22299969ebc5SAlex Elder return ret; 22309969ebc5SAlex Elder } 22319969ebc5SAlex Elder 223236be9a76SAlex Elder /* 223336be9a76SAlex Elder * Synchronous osd object method call 223436be9a76SAlex Elder */ 223536be9a76SAlex Elder static int rbd_obj_method_sync(struct rbd_device *rbd_dev, 223636be9a76SAlex Elder const char *object_name, 223736be9a76SAlex Elder const char *class_name, 223836be9a76SAlex Elder const char *method_name, 223936be9a76SAlex Elder const char *outbound, 224036be9a76SAlex Elder size_t outbound_size, 224136be9a76SAlex Elder char *inbound, 224236be9a76SAlex Elder size_t inbound_size, 224336be9a76SAlex Elder u64 *version) 224436be9a76SAlex Elder { 22452169238dSAlex Elder struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 224636be9a76SAlex Elder struct rbd_obj_request *obj_request; 224736be9a76SAlex Elder struct page **pages; 224836be9a76SAlex Elder u32 page_count; 224936be9a76SAlex Elder int ret; 225036be9a76SAlex Elder 225136be9a76SAlex Elder /* 22526010a451SAlex Elder * Method calls are ultimately read operations. The result 22536010a451SAlex Elder * should placed into the inbound buffer provided. They 22546010a451SAlex Elder * also supply outbound data--parameters for the object 22556010a451SAlex Elder * method. Currently if this is present it will be a 22566010a451SAlex Elder * snapshot id. 225736be9a76SAlex Elder */ 225836be9a76SAlex Elder page_count = (u32) calc_pages_for(0, inbound_size); 225936be9a76SAlex Elder pages = ceph_alloc_page_vector(page_count, GFP_KERNEL); 226036be9a76SAlex Elder if (IS_ERR(pages)) 226136be9a76SAlex Elder return PTR_ERR(pages); 226236be9a76SAlex Elder 226336be9a76SAlex Elder ret = -ENOMEM; 22646010a451SAlex Elder obj_request = rbd_obj_request_create(object_name, 0, inbound_size, 226536be9a76SAlex Elder OBJ_REQUEST_PAGES); 226636be9a76SAlex Elder if (!obj_request) 226736be9a76SAlex Elder goto out; 226836be9a76SAlex Elder 226936be9a76SAlex Elder obj_request->pages = pages; 227036be9a76SAlex Elder obj_request->page_count = page_count; 227136be9a76SAlex Elder 2272430c28c3SAlex Elder obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request); 227336be9a76SAlex Elder if (!obj_request->osd_req) 227436be9a76SAlex Elder goto out; 227536be9a76SAlex Elder 2276c99d2d4aSAlex Elder osd_req_op_cls_init(obj_request->osd_req, 0, CEPH_OSD_OP_CALL, 227704017e29SAlex Elder class_name, method_name); 227804017e29SAlex Elder if (outbound_size) { 227904017e29SAlex Elder struct ceph_pagelist *pagelist; 228004017e29SAlex Elder 228104017e29SAlex Elder pagelist = kmalloc(sizeof (*pagelist), GFP_NOFS); 228204017e29SAlex Elder if (!pagelist) 228304017e29SAlex Elder goto out; 228404017e29SAlex Elder 228504017e29SAlex Elder ceph_pagelist_init(pagelist); 228604017e29SAlex Elder ceph_pagelist_append(pagelist, outbound, outbound_size); 228704017e29SAlex Elder osd_req_op_cls_request_data_pagelist(obj_request->osd_req, 0, 228804017e29SAlex Elder pagelist); 228904017e29SAlex Elder } 2290a4ce40a9SAlex Elder osd_req_op_cls_response_data_pages(obj_request->osd_req, 0, 2291a4ce40a9SAlex Elder obj_request->pages, inbound_size, 229244cd188dSAlex Elder 0, false, false); 22939d4df01fSAlex Elder rbd_osd_req_format_read(obj_request); 2294430c28c3SAlex Elder 229536be9a76SAlex Elder ret = rbd_obj_request_submit(osdc, obj_request); 229636be9a76SAlex Elder if (ret) 229736be9a76SAlex Elder goto out; 229836be9a76SAlex Elder ret = rbd_obj_request_wait(obj_request); 229936be9a76SAlex Elder if (ret) 230036be9a76SAlex Elder goto out; 230136be9a76SAlex Elder 230236be9a76SAlex Elder ret = obj_request->result; 230336be9a76SAlex Elder if (ret < 0) 230436be9a76SAlex Elder goto out; 230523ed6e13SAlex Elder ret = 0; 2306903bb32eSAlex Elder ceph_copy_from_page_vector(pages, inbound, 0, obj_request->xferred); 230736be9a76SAlex Elder if (version) 230836be9a76SAlex Elder *version = obj_request->version; 230936be9a76SAlex Elder out: 231036be9a76SAlex Elder if (obj_request) 231136be9a76SAlex Elder rbd_obj_request_put(obj_request); 231236be9a76SAlex Elder else 231336be9a76SAlex Elder ceph_release_page_vector(pages, page_count); 231436be9a76SAlex Elder 231536be9a76SAlex Elder return ret; 231636be9a76SAlex Elder } 231736be9a76SAlex Elder 2318bf0d5f50SAlex Elder static void rbd_request_fn(struct request_queue *q) 2319cc344fa1SAlex Elder __releases(q->queue_lock) __acquires(q->queue_lock) 2320bf0d5f50SAlex Elder { 2321bf0d5f50SAlex Elder struct rbd_device *rbd_dev = q->queuedata; 2322bf0d5f50SAlex Elder bool read_only = rbd_dev->mapping.read_only; 2323bf0d5f50SAlex Elder struct request *rq; 2324bf0d5f50SAlex Elder int result; 2325bf0d5f50SAlex Elder 2326bf0d5f50SAlex Elder while ((rq = blk_fetch_request(q))) { 2327bf0d5f50SAlex Elder bool write_request = rq_data_dir(rq) == WRITE; 2328bf0d5f50SAlex Elder struct rbd_img_request *img_request; 2329bf0d5f50SAlex Elder u64 offset; 2330bf0d5f50SAlex Elder u64 length; 2331bf0d5f50SAlex Elder 2332bf0d5f50SAlex Elder /* Ignore any non-FS requests that filter through. */ 2333bf0d5f50SAlex Elder 2334bf0d5f50SAlex Elder if (rq->cmd_type != REQ_TYPE_FS) { 23354dda41d3SAlex Elder dout("%s: non-fs request type %d\n", __func__, 23364dda41d3SAlex Elder (int) rq->cmd_type); 23374dda41d3SAlex Elder __blk_end_request_all(rq, 0); 23384dda41d3SAlex Elder continue; 23394dda41d3SAlex Elder } 23404dda41d3SAlex Elder 23414dda41d3SAlex Elder /* Ignore/skip any zero-length requests */ 23424dda41d3SAlex Elder 23434dda41d3SAlex Elder offset = (u64) blk_rq_pos(rq) << SECTOR_SHIFT; 23444dda41d3SAlex Elder length = (u64) blk_rq_bytes(rq); 23454dda41d3SAlex Elder 23464dda41d3SAlex Elder if (!length) { 23474dda41d3SAlex Elder dout("%s: zero-length request\n", __func__); 2348bf0d5f50SAlex Elder __blk_end_request_all(rq, 0); 2349bf0d5f50SAlex Elder continue; 2350bf0d5f50SAlex Elder } 2351bf0d5f50SAlex Elder 2352bf0d5f50SAlex Elder spin_unlock_irq(q->queue_lock); 2353bf0d5f50SAlex Elder 2354bf0d5f50SAlex Elder /* Disallow writes to a read-only device */ 2355bf0d5f50SAlex Elder 2356bf0d5f50SAlex Elder if (write_request) { 2357bf0d5f50SAlex Elder result = -EROFS; 2358bf0d5f50SAlex Elder if (read_only) 2359bf0d5f50SAlex Elder goto end_request; 2360bf0d5f50SAlex Elder rbd_assert(rbd_dev->spec->snap_id == CEPH_NOSNAP); 2361bf0d5f50SAlex Elder } 2362bf0d5f50SAlex Elder 23636d292906SAlex Elder /* 23646d292906SAlex Elder * Quit early if the mapped snapshot no longer 23656d292906SAlex Elder * exists. It's still possible the snapshot will 23666d292906SAlex Elder * have disappeared by the time our request arrives 23676d292906SAlex Elder * at the osd, but there's no sense in sending it if 23686d292906SAlex Elder * we already know. 23696d292906SAlex Elder */ 23706d292906SAlex Elder if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) { 2371bf0d5f50SAlex Elder dout("request for non-existent snapshot"); 2372bf0d5f50SAlex Elder rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP); 2373bf0d5f50SAlex Elder result = -ENXIO; 2374bf0d5f50SAlex Elder goto end_request; 2375bf0d5f50SAlex Elder } 2376bf0d5f50SAlex Elder 2377bf0d5f50SAlex Elder result = -EINVAL; 2378bf0d5f50SAlex Elder if (WARN_ON(offset && length > U64_MAX - offset + 1)) 2379bf0d5f50SAlex Elder goto end_request; /* Shouldn't happen */ 2380bf0d5f50SAlex Elder 2381bf0d5f50SAlex Elder result = -ENOMEM; 2382bf0d5f50SAlex Elder img_request = rbd_img_request_create(rbd_dev, offset, length, 23839849e986SAlex Elder write_request, false); 2384bf0d5f50SAlex Elder if (!img_request) 2385bf0d5f50SAlex Elder goto end_request; 2386bf0d5f50SAlex Elder 2387bf0d5f50SAlex Elder img_request->rq = rq; 2388bf0d5f50SAlex Elder 2389bf0d5f50SAlex Elder result = rbd_img_request_fill_bio(img_request, rq->bio); 2390bf0d5f50SAlex Elder if (!result) 2391bf0d5f50SAlex Elder result = rbd_img_request_submit(img_request); 2392bf0d5f50SAlex Elder if (result) 2393bf0d5f50SAlex Elder rbd_img_request_put(img_request); 2394bf0d5f50SAlex Elder end_request: 2395bf0d5f50SAlex Elder spin_lock_irq(q->queue_lock); 2396bf0d5f50SAlex Elder if (result < 0) { 23977da22d29SAlex Elder rbd_warn(rbd_dev, "%s %llx at %llx result %d\n", 23987da22d29SAlex Elder write_request ? "write" : "read", 23997da22d29SAlex Elder length, offset, result); 24007da22d29SAlex Elder 2401bf0d5f50SAlex Elder __blk_end_request_all(rq, result); 2402bf0d5f50SAlex Elder } 2403bf0d5f50SAlex Elder } 2404bf0d5f50SAlex Elder } 2405bf0d5f50SAlex Elder 2406602adf40SYehuda Sadeh /* 2407602adf40SYehuda Sadeh * a queue callback. Makes sure that we don't create a bio that spans across 2408602adf40SYehuda Sadeh * multiple osd objects. One exception would be with a single page bios, 2409f7760dadSAlex Elder * which we handle later at bio_chain_clone_range() 2410602adf40SYehuda Sadeh */ 2411602adf40SYehuda Sadeh static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd, 2412602adf40SYehuda Sadeh struct bio_vec *bvec) 2413602adf40SYehuda Sadeh { 2414602adf40SYehuda Sadeh struct rbd_device *rbd_dev = q->queuedata; 2415e5cfeed2SAlex Elder sector_t sector_offset; 2416e5cfeed2SAlex Elder sector_t sectors_per_obj; 2417e5cfeed2SAlex Elder sector_t obj_sector_offset; 2418e5cfeed2SAlex Elder int ret; 2419602adf40SYehuda Sadeh 2420e5cfeed2SAlex Elder /* 2421e5cfeed2SAlex Elder * Find how far into its rbd object the partition-relative 2422e5cfeed2SAlex Elder * bio start sector is to offset relative to the enclosing 2423e5cfeed2SAlex Elder * device. 2424e5cfeed2SAlex Elder */ 2425e5cfeed2SAlex Elder sector_offset = get_start_sect(bmd->bi_bdev) + bmd->bi_sector; 2426e5cfeed2SAlex Elder sectors_per_obj = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT); 2427e5cfeed2SAlex Elder obj_sector_offset = sector_offset & (sectors_per_obj - 1); 2428593a9e7bSAlex Elder 2429e5cfeed2SAlex Elder /* 2430e5cfeed2SAlex Elder * Compute the number of bytes from that offset to the end 2431e5cfeed2SAlex Elder * of the object. Account for what's already used by the bio. 2432e5cfeed2SAlex Elder */ 2433e5cfeed2SAlex Elder ret = (int) (sectors_per_obj - obj_sector_offset) << SECTOR_SHIFT; 2434e5cfeed2SAlex Elder if (ret > bmd->bi_size) 2435e5cfeed2SAlex Elder ret -= bmd->bi_size; 2436e5cfeed2SAlex Elder else 2437e5cfeed2SAlex Elder ret = 0; 2438e5cfeed2SAlex Elder 2439e5cfeed2SAlex Elder /* 2440e5cfeed2SAlex Elder * Don't send back more than was asked for. And if the bio 2441e5cfeed2SAlex Elder * was empty, let the whole thing through because: "Note 2442e5cfeed2SAlex Elder * that a block device *must* allow a single page to be 2443e5cfeed2SAlex Elder * added to an empty bio." 2444e5cfeed2SAlex Elder */ 2445e5cfeed2SAlex Elder rbd_assert(bvec->bv_len <= PAGE_SIZE); 2446e5cfeed2SAlex Elder if (ret > (int) bvec->bv_len || !bmd->bi_size) 2447e5cfeed2SAlex Elder ret = (int) bvec->bv_len; 2448e5cfeed2SAlex Elder 2449e5cfeed2SAlex Elder return ret; 2450602adf40SYehuda Sadeh } 2451602adf40SYehuda Sadeh 2452602adf40SYehuda Sadeh static void rbd_free_disk(struct rbd_device *rbd_dev) 2453602adf40SYehuda Sadeh { 2454602adf40SYehuda Sadeh struct gendisk *disk = rbd_dev->disk; 2455602adf40SYehuda Sadeh 2456602adf40SYehuda Sadeh if (!disk) 2457602adf40SYehuda Sadeh return; 2458602adf40SYehuda Sadeh 2459602adf40SYehuda Sadeh if (disk->flags & GENHD_FL_UP) 2460602adf40SYehuda Sadeh del_gendisk(disk); 2461602adf40SYehuda Sadeh if (disk->queue) 2462602adf40SYehuda Sadeh blk_cleanup_queue(disk->queue); 2463602adf40SYehuda Sadeh put_disk(disk); 2464602adf40SYehuda Sadeh } 2465602adf40SYehuda Sadeh 2466788e2df3SAlex Elder static int rbd_obj_read_sync(struct rbd_device *rbd_dev, 2467788e2df3SAlex Elder const char *object_name, 2468788e2df3SAlex Elder u64 offset, u64 length, 2469788e2df3SAlex Elder char *buf, u64 *version) 2470788e2df3SAlex Elder 2471788e2df3SAlex Elder { 24722169238dSAlex Elder struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 2473788e2df3SAlex Elder struct rbd_obj_request *obj_request; 2474788e2df3SAlex Elder struct page **pages = NULL; 2475788e2df3SAlex Elder u32 page_count; 24761ceae7efSAlex Elder size_t size; 2477788e2df3SAlex Elder int ret; 2478788e2df3SAlex Elder 2479788e2df3SAlex Elder page_count = (u32) calc_pages_for(offset, length); 2480788e2df3SAlex Elder pages = ceph_alloc_page_vector(page_count, GFP_KERNEL); 2481788e2df3SAlex Elder if (IS_ERR(pages)) 2482788e2df3SAlex Elder ret = PTR_ERR(pages); 2483788e2df3SAlex Elder 2484788e2df3SAlex Elder ret = -ENOMEM; 2485788e2df3SAlex Elder obj_request = rbd_obj_request_create(object_name, offset, length, 2486788e2df3SAlex Elder OBJ_REQUEST_PAGES); 2487788e2df3SAlex Elder if (!obj_request) 2488788e2df3SAlex Elder goto out; 2489788e2df3SAlex Elder 2490788e2df3SAlex Elder obj_request->pages = pages; 2491788e2df3SAlex Elder obj_request->page_count = page_count; 2492788e2df3SAlex Elder 2493430c28c3SAlex Elder obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request); 2494788e2df3SAlex Elder if (!obj_request->osd_req) 2495788e2df3SAlex Elder goto out; 2496788e2df3SAlex Elder 2497c99d2d4aSAlex Elder osd_req_op_extent_init(obj_request->osd_req, 0, CEPH_OSD_OP_READ, 2498c99d2d4aSAlex Elder offset, length, 0, 0); 2499406e2c9fSAlex Elder osd_req_op_extent_osd_data_pages(obj_request->osd_req, 0, 2500a4ce40a9SAlex Elder obj_request->pages, 250144cd188dSAlex Elder obj_request->length, 250244cd188dSAlex Elder obj_request->offset & ~PAGE_MASK, 250344cd188dSAlex Elder false, false); 25049d4df01fSAlex Elder rbd_osd_req_format_read(obj_request); 2505430c28c3SAlex Elder 2506788e2df3SAlex Elder ret = rbd_obj_request_submit(osdc, obj_request); 2507788e2df3SAlex Elder if (ret) 2508788e2df3SAlex Elder goto out; 2509788e2df3SAlex Elder ret = rbd_obj_request_wait(obj_request); 2510788e2df3SAlex Elder if (ret) 2511788e2df3SAlex Elder goto out; 2512788e2df3SAlex Elder 2513788e2df3SAlex Elder ret = obj_request->result; 2514788e2df3SAlex Elder if (ret < 0) 2515788e2df3SAlex Elder goto out; 25161ceae7efSAlex Elder 25171ceae7efSAlex Elder rbd_assert(obj_request->xferred <= (u64) SIZE_MAX); 25181ceae7efSAlex Elder size = (size_t) obj_request->xferred; 2519903bb32eSAlex Elder ceph_copy_from_page_vector(pages, buf, 0, size); 252023ed6e13SAlex Elder rbd_assert(size <= (size_t) INT_MAX); 252123ed6e13SAlex Elder ret = (int) size; 2522788e2df3SAlex Elder if (version) 2523788e2df3SAlex Elder *version = obj_request->version; 2524788e2df3SAlex Elder out: 2525788e2df3SAlex Elder if (obj_request) 2526788e2df3SAlex Elder rbd_obj_request_put(obj_request); 2527788e2df3SAlex Elder else 2528788e2df3SAlex Elder ceph_release_page_vector(pages, page_count); 2529788e2df3SAlex Elder 2530788e2df3SAlex Elder return ret; 2531788e2df3SAlex Elder } 2532788e2df3SAlex Elder 2533602adf40SYehuda Sadeh /* 25344156d998SAlex Elder * Read the complete header for the given rbd device. 25354156d998SAlex Elder * 25364156d998SAlex Elder * Returns a pointer to a dynamically-allocated buffer containing 25374156d998SAlex Elder * the complete and validated header. Caller can pass the address 25384156d998SAlex Elder * of a variable that will be filled in with the version of the 25394156d998SAlex Elder * header object at the time it was read. 25404156d998SAlex Elder * 25414156d998SAlex Elder * Returns a pointer-coded errno if a failure occurs. 25424156d998SAlex Elder */ 25434156d998SAlex Elder static struct rbd_image_header_ondisk * 25444156d998SAlex Elder rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version) 25454156d998SAlex Elder { 25464156d998SAlex Elder struct rbd_image_header_ondisk *ondisk = NULL; 25474156d998SAlex Elder u32 snap_count = 0; 25484156d998SAlex Elder u64 names_size = 0; 25494156d998SAlex Elder u32 want_count; 25504156d998SAlex Elder int ret; 25514156d998SAlex Elder 25524156d998SAlex Elder /* 25534156d998SAlex Elder * The complete header will include an array of its 64-bit 25544156d998SAlex Elder * snapshot ids, followed by the names of those snapshots as 25554156d998SAlex Elder * a contiguous block of NUL-terminated strings. Note that 25564156d998SAlex Elder * the number of snapshots could change by the time we read 25574156d998SAlex Elder * it in, in which case we re-read it. 25584156d998SAlex Elder */ 25594156d998SAlex Elder do { 25604156d998SAlex Elder size_t size; 25614156d998SAlex Elder 25624156d998SAlex Elder kfree(ondisk); 25634156d998SAlex Elder 25644156d998SAlex Elder size = sizeof (*ondisk); 25654156d998SAlex Elder size += snap_count * sizeof (struct rbd_image_snap_ondisk); 25664156d998SAlex Elder size += names_size; 25674156d998SAlex Elder ondisk = kmalloc(size, GFP_KERNEL); 25684156d998SAlex Elder if (!ondisk) 25694156d998SAlex Elder return ERR_PTR(-ENOMEM); 25704156d998SAlex Elder 2571788e2df3SAlex Elder ret = rbd_obj_read_sync(rbd_dev, rbd_dev->header_name, 25724156d998SAlex Elder 0, size, 25734156d998SAlex Elder (char *) ondisk, version); 25744156d998SAlex Elder if (ret < 0) 25754156d998SAlex Elder goto out_err; 25764156d998SAlex Elder if (WARN_ON((size_t) ret < size)) { 25774156d998SAlex Elder ret = -ENXIO; 257806ecc6cbSAlex Elder rbd_warn(rbd_dev, "short header read (want %zd got %d)", 257906ecc6cbSAlex Elder size, ret); 25804156d998SAlex Elder goto out_err; 25814156d998SAlex Elder } 25824156d998SAlex Elder if (!rbd_dev_ondisk_valid(ondisk)) { 25834156d998SAlex Elder ret = -ENXIO; 258406ecc6cbSAlex Elder rbd_warn(rbd_dev, "invalid header"); 25854156d998SAlex Elder goto out_err; 25864156d998SAlex Elder } 25874156d998SAlex Elder 25884156d998SAlex Elder names_size = le64_to_cpu(ondisk->snap_names_len); 25894156d998SAlex Elder want_count = snap_count; 25904156d998SAlex Elder snap_count = le32_to_cpu(ondisk->snap_count); 25914156d998SAlex Elder } while (snap_count != want_count); 25924156d998SAlex Elder 25934156d998SAlex Elder return ondisk; 25944156d998SAlex Elder 25954156d998SAlex Elder out_err: 25964156d998SAlex Elder kfree(ondisk); 25974156d998SAlex Elder 25984156d998SAlex Elder return ERR_PTR(ret); 25994156d998SAlex Elder } 26004156d998SAlex Elder 26014156d998SAlex Elder /* 2602602adf40SYehuda Sadeh * reload the ondisk the header 2603602adf40SYehuda Sadeh */ 2604602adf40SYehuda Sadeh static int rbd_read_header(struct rbd_device *rbd_dev, 2605602adf40SYehuda Sadeh struct rbd_image_header *header) 2606602adf40SYehuda Sadeh { 26074156d998SAlex Elder struct rbd_image_header_ondisk *ondisk; 26084156d998SAlex Elder u64 ver = 0; 26094156d998SAlex Elder int ret; 2610602adf40SYehuda Sadeh 26114156d998SAlex Elder ondisk = rbd_dev_v1_header_read(rbd_dev, &ver); 26124156d998SAlex Elder if (IS_ERR(ondisk)) 26134156d998SAlex Elder return PTR_ERR(ondisk); 26144156d998SAlex Elder ret = rbd_header_from_disk(header, ondisk); 26154156d998SAlex Elder if (ret >= 0) 261659c2be1eSYehuda Sadeh header->obj_version = ver; 26174156d998SAlex Elder kfree(ondisk); 2618602adf40SYehuda Sadeh 26194156d998SAlex Elder return ret; 2620602adf40SYehuda Sadeh } 2621602adf40SYehuda Sadeh 262241f38c2bSAlex Elder static void rbd_remove_all_snaps(struct rbd_device *rbd_dev) 2623dfc5606dSYehuda Sadeh { 2624dfc5606dSYehuda Sadeh struct rbd_snap *snap; 2625a0593290SAlex Elder struct rbd_snap *next; 2626dfc5606dSYehuda Sadeh 2627a0593290SAlex Elder list_for_each_entry_safe(snap, next, &rbd_dev->snaps, node) 262841f38c2bSAlex Elder rbd_remove_snap_dev(snap); 2629dfc5606dSYehuda Sadeh } 2630dfc5606dSYehuda Sadeh 26319478554aSAlex Elder static void rbd_update_mapping_size(struct rbd_device *rbd_dev) 26329478554aSAlex Elder { 26339478554aSAlex Elder sector_t size; 26349478554aSAlex Elder 26350d7dbfceSAlex Elder if (rbd_dev->spec->snap_id != CEPH_NOSNAP) 26369478554aSAlex Elder return; 26379478554aSAlex Elder 26389478554aSAlex Elder size = (sector_t) rbd_dev->header.image_size / SECTOR_SIZE; 26399478554aSAlex Elder dout("setting size to %llu sectors", (unsigned long long) size); 26409478554aSAlex Elder rbd_dev->mapping.size = (u64) size; 26419478554aSAlex Elder set_capacity(rbd_dev->disk, size); 26429478554aSAlex Elder } 26439478554aSAlex Elder 2644602adf40SYehuda Sadeh /* 2645602adf40SYehuda Sadeh * only read the first part of the ondisk header, without the snaps info 2646602adf40SYehuda Sadeh */ 2647117973fbSAlex Elder static int rbd_dev_v1_refresh(struct rbd_device *rbd_dev, u64 *hver) 2648602adf40SYehuda Sadeh { 2649602adf40SYehuda Sadeh int ret; 2650602adf40SYehuda Sadeh struct rbd_image_header h; 2651602adf40SYehuda Sadeh 2652602adf40SYehuda Sadeh ret = rbd_read_header(rbd_dev, &h); 2653602adf40SYehuda Sadeh if (ret < 0) 2654602adf40SYehuda Sadeh return ret; 2655602adf40SYehuda Sadeh 2656a51aa0c0SJosh Durgin down_write(&rbd_dev->header_rwsem); 2657a51aa0c0SJosh Durgin 26589478554aSAlex Elder /* Update image size, and check for resize of mapped image */ 26599478554aSAlex Elder rbd_dev->header.image_size = h.image_size; 26609478554aSAlex Elder rbd_update_mapping_size(rbd_dev); 26619db4b3e3SSage Weil 2662849b4260SAlex Elder /* rbd_dev->header.object_prefix shouldn't change */ 2663602adf40SYehuda Sadeh kfree(rbd_dev->header.snap_sizes); 2664849b4260SAlex Elder kfree(rbd_dev->header.snap_names); 2665d1d25646SJosh Durgin /* osd requests may still refer to snapc */ 2666d1d25646SJosh Durgin ceph_put_snap_context(rbd_dev->header.snapc); 2667602adf40SYehuda Sadeh 2668b813623aSAlex Elder if (hver) 2669b813623aSAlex Elder *hver = h.obj_version; 2670a71b891bSJosh Durgin rbd_dev->header.obj_version = h.obj_version; 267193a24e08SJosh Durgin rbd_dev->header.image_size = h.image_size; 2672602adf40SYehuda Sadeh rbd_dev->header.snapc = h.snapc; 2673602adf40SYehuda Sadeh rbd_dev->header.snap_names = h.snap_names; 2674602adf40SYehuda Sadeh rbd_dev->header.snap_sizes = h.snap_sizes; 2675849b4260SAlex Elder /* Free the extra copy of the object prefix */ 2676849b4260SAlex Elder WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix)); 2677849b4260SAlex Elder kfree(h.object_prefix); 2678849b4260SAlex Elder 2679304f6808SAlex Elder ret = rbd_dev_snaps_update(rbd_dev); 2680304f6808SAlex Elder if (!ret) 2681304f6808SAlex Elder ret = rbd_dev_snaps_register(rbd_dev); 2682dfc5606dSYehuda Sadeh 2683c666601aSJosh Durgin up_write(&rbd_dev->header_rwsem); 2684602adf40SYehuda Sadeh 2685dfc5606dSYehuda Sadeh return ret; 2686602adf40SYehuda Sadeh } 2687602adf40SYehuda Sadeh 2688117973fbSAlex Elder static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver) 26891fe5e993SAlex Elder { 26901fe5e993SAlex Elder int ret; 26911fe5e993SAlex Elder 2692117973fbSAlex Elder rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); 26931fe5e993SAlex Elder mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); 2694117973fbSAlex Elder if (rbd_dev->image_format == 1) 2695117973fbSAlex Elder ret = rbd_dev_v1_refresh(rbd_dev, hver); 2696117973fbSAlex Elder else 2697117973fbSAlex Elder ret = rbd_dev_v2_refresh(rbd_dev, hver); 26981fe5e993SAlex Elder mutex_unlock(&ctl_mutex); 26991fe5e993SAlex Elder 27001fe5e993SAlex Elder return ret; 27011fe5e993SAlex Elder } 27021fe5e993SAlex Elder 2703602adf40SYehuda Sadeh static int rbd_init_disk(struct rbd_device *rbd_dev) 2704602adf40SYehuda Sadeh { 2705602adf40SYehuda Sadeh struct gendisk *disk; 2706602adf40SYehuda Sadeh struct request_queue *q; 2707593a9e7bSAlex Elder u64 segment_size; 2708602adf40SYehuda Sadeh 2709602adf40SYehuda Sadeh /* create gendisk info */ 2710602adf40SYehuda Sadeh disk = alloc_disk(RBD_MINORS_PER_MAJOR); 2711602adf40SYehuda Sadeh if (!disk) 27121fcdb8aaSAlex Elder return -ENOMEM; 2713602adf40SYehuda Sadeh 2714f0f8cef5SAlex Elder snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d", 2715de71a297SAlex Elder rbd_dev->dev_id); 2716602adf40SYehuda Sadeh disk->major = rbd_dev->major; 2717602adf40SYehuda Sadeh disk->first_minor = 0; 2718602adf40SYehuda Sadeh disk->fops = &rbd_bd_ops; 2719602adf40SYehuda Sadeh disk->private_data = rbd_dev; 2720602adf40SYehuda Sadeh 2721bf0d5f50SAlex Elder q = blk_init_queue(rbd_request_fn, &rbd_dev->lock); 2722602adf40SYehuda Sadeh if (!q) 2723602adf40SYehuda Sadeh goto out_disk; 2724029bcbd8SJosh Durgin 2725593a9e7bSAlex Elder /* We use the default size, but let's be explicit about it. */ 2726593a9e7bSAlex Elder blk_queue_physical_block_size(q, SECTOR_SIZE); 2727593a9e7bSAlex Elder 2728029bcbd8SJosh Durgin /* set io sizes to object size */ 2729593a9e7bSAlex Elder segment_size = rbd_obj_bytes(&rbd_dev->header); 2730593a9e7bSAlex Elder blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE); 2731593a9e7bSAlex Elder blk_queue_max_segment_size(q, segment_size); 2732593a9e7bSAlex Elder blk_queue_io_min(q, segment_size); 2733593a9e7bSAlex Elder blk_queue_io_opt(q, segment_size); 2734029bcbd8SJosh Durgin 2735602adf40SYehuda Sadeh blk_queue_merge_bvec(q, rbd_merge_bvec); 2736602adf40SYehuda Sadeh disk->queue = q; 2737602adf40SYehuda Sadeh 2738602adf40SYehuda Sadeh q->queuedata = rbd_dev; 2739602adf40SYehuda Sadeh 2740602adf40SYehuda Sadeh rbd_dev->disk = disk; 2741602adf40SYehuda Sadeh 274212f02944SAlex Elder set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE); 274312f02944SAlex Elder 2744602adf40SYehuda Sadeh return 0; 2745602adf40SYehuda Sadeh out_disk: 2746602adf40SYehuda Sadeh put_disk(disk); 27471fcdb8aaSAlex Elder 27481fcdb8aaSAlex Elder return -ENOMEM; 2749602adf40SYehuda Sadeh } 2750602adf40SYehuda Sadeh 2751dfc5606dSYehuda Sadeh /* 2752dfc5606dSYehuda Sadeh sysfs 2753dfc5606dSYehuda Sadeh */ 2754602adf40SYehuda Sadeh 2755593a9e7bSAlex Elder static struct rbd_device *dev_to_rbd_dev(struct device *dev) 2756593a9e7bSAlex Elder { 2757593a9e7bSAlex Elder return container_of(dev, struct rbd_device, dev); 2758593a9e7bSAlex Elder } 2759593a9e7bSAlex Elder 2760dfc5606dSYehuda Sadeh static ssize_t rbd_size_show(struct device *dev, 2761dfc5606dSYehuda Sadeh struct device_attribute *attr, char *buf) 2762602adf40SYehuda Sadeh { 2763593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 2764a51aa0c0SJosh Durgin sector_t size; 2765dfc5606dSYehuda Sadeh 2766a51aa0c0SJosh Durgin down_read(&rbd_dev->header_rwsem); 2767a51aa0c0SJosh Durgin size = get_capacity(rbd_dev->disk); 2768a51aa0c0SJosh Durgin up_read(&rbd_dev->header_rwsem); 2769a51aa0c0SJosh Durgin 2770a51aa0c0SJosh Durgin return sprintf(buf, "%llu\n", (unsigned long long) size * SECTOR_SIZE); 2771602adf40SYehuda Sadeh } 2772602adf40SYehuda Sadeh 277334b13184SAlex Elder /* 277434b13184SAlex Elder * Note this shows the features for whatever's mapped, which is not 277534b13184SAlex Elder * necessarily the base image. 277634b13184SAlex Elder */ 277734b13184SAlex Elder static ssize_t rbd_features_show(struct device *dev, 277834b13184SAlex Elder struct device_attribute *attr, char *buf) 277934b13184SAlex Elder { 278034b13184SAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 278134b13184SAlex Elder 278234b13184SAlex Elder return sprintf(buf, "0x%016llx\n", 278334b13184SAlex Elder (unsigned long long) rbd_dev->mapping.features); 278434b13184SAlex Elder } 278534b13184SAlex Elder 2786dfc5606dSYehuda Sadeh static ssize_t rbd_major_show(struct device *dev, 2787dfc5606dSYehuda Sadeh struct device_attribute *attr, char *buf) 2788602adf40SYehuda Sadeh { 2789593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 2790dfc5606dSYehuda Sadeh 2791dfc5606dSYehuda Sadeh return sprintf(buf, "%d\n", rbd_dev->major); 2792dfc5606dSYehuda Sadeh } 2793dfc5606dSYehuda Sadeh 2794dfc5606dSYehuda Sadeh static ssize_t rbd_client_id_show(struct device *dev, 2795dfc5606dSYehuda Sadeh struct device_attribute *attr, char *buf) 2796dfc5606dSYehuda Sadeh { 2797593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 2798dfc5606dSYehuda Sadeh 27991dbb4399SAlex Elder return sprintf(buf, "client%lld\n", 28001dbb4399SAlex Elder ceph_client_id(rbd_dev->rbd_client->client)); 2801dfc5606dSYehuda Sadeh } 2802dfc5606dSYehuda Sadeh 2803dfc5606dSYehuda Sadeh static ssize_t rbd_pool_show(struct device *dev, 2804dfc5606dSYehuda Sadeh struct device_attribute *attr, char *buf) 2805dfc5606dSYehuda Sadeh { 2806593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 2807dfc5606dSYehuda Sadeh 28080d7dbfceSAlex Elder return sprintf(buf, "%s\n", rbd_dev->spec->pool_name); 2809dfc5606dSYehuda Sadeh } 2810dfc5606dSYehuda Sadeh 28119bb2f334SAlex Elder static ssize_t rbd_pool_id_show(struct device *dev, 28129bb2f334SAlex Elder struct device_attribute *attr, char *buf) 28139bb2f334SAlex Elder { 28149bb2f334SAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 28159bb2f334SAlex Elder 28160d7dbfceSAlex Elder return sprintf(buf, "%llu\n", 28170d7dbfceSAlex Elder (unsigned long long) rbd_dev->spec->pool_id); 28189bb2f334SAlex Elder } 28199bb2f334SAlex Elder 2820dfc5606dSYehuda Sadeh static ssize_t rbd_name_show(struct device *dev, 2821dfc5606dSYehuda Sadeh struct device_attribute *attr, char *buf) 2822dfc5606dSYehuda Sadeh { 2823593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 2824dfc5606dSYehuda Sadeh 2825a92ffdf8SAlex Elder if (rbd_dev->spec->image_name) 28260d7dbfceSAlex Elder return sprintf(buf, "%s\n", rbd_dev->spec->image_name); 2827a92ffdf8SAlex Elder 2828a92ffdf8SAlex Elder return sprintf(buf, "(unknown)\n"); 2829dfc5606dSYehuda Sadeh } 2830dfc5606dSYehuda Sadeh 2831589d30e0SAlex Elder static ssize_t rbd_image_id_show(struct device *dev, 2832589d30e0SAlex Elder struct device_attribute *attr, char *buf) 2833589d30e0SAlex Elder { 2834589d30e0SAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 2835589d30e0SAlex Elder 28360d7dbfceSAlex Elder return sprintf(buf, "%s\n", rbd_dev->spec->image_id); 2837589d30e0SAlex Elder } 2838589d30e0SAlex Elder 283934b13184SAlex Elder /* 284034b13184SAlex Elder * Shows the name of the currently-mapped snapshot (or 284134b13184SAlex Elder * RBD_SNAP_HEAD_NAME for the base image). 284234b13184SAlex Elder */ 2843dfc5606dSYehuda Sadeh static ssize_t rbd_snap_show(struct device *dev, 2844dfc5606dSYehuda Sadeh struct device_attribute *attr, 2845dfc5606dSYehuda Sadeh char *buf) 2846dfc5606dSYehuda Sadeh { 2847593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 2848dfc5606dSYehuda Sadeh 28490d7dbfceSAlex Elder return sprintf(buf, "%s\n", rbd_dev->spec->snap_name); 2850dfc5606dSYehuda Sadeh } 2851dfc5606dSYehuda Sadeh 285286b00e0dSAlex Elder /* 285386b00e0dSAlex Elder * For an rbd v2 image, shows the pool id, image id, and snapshot id 285486b00e0dSAlex Elder * for the parent image. If there is no parent, simply shows 285586b00e0dSAlex Elder * "(no parent image)". 285686b00e0dSAlex Elder */ 285786b00e0dSAlex Elder static ssize_t rbd_parent_show(struct device *dev, 285886b00e0dSAlex Elder struct device_attribute *attr, 285986b00e0dSAlex Elder char *buf) 286086b00e0dSAlex Elder { 286186b00e0dSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 286286b00e0dSAlex Elder struct rbd_spec *spec = rbd_dev->parent_spec; 286386b00e0dSAlex Elder int count; 286486b00e0dSAlex Elder char *bufp = buf; 286586b00e0dSAlex Elder 286686b00e0dSAlex Elder if (!spec) 286786b00e0dSAlex Elder return sprintf(buf, "(no parent image)\n"); 286886b00e0dSAlex Elder 286986b00e0dSAlex Elder count = sprintf(bufp, "pool_id %llu\npool_name %s\n", 287086b00e0dSAlex Elder (unsigned long long) spec->pool_id, spec->pool_name); 287186b00e0dSAlex Elder if (count < 0) 287286b00e0dSAlex Elder return count; 287386b00e0dSAlex Elder bufp += count; 287486b00e0dSAlex Elder 287586b00e0dSAlex Elder count = sprintf(bufp, "image_id %s\nimage_name %s\n", spec->image_id, 287686b00e0dSAlex Elder spec->image_name ? spec->image_name : "(unknown)"); 287786b00e0dSAlex Elder if (count < 0) 287886b00e0dSAlex Elder return count; 287986b00e0dSAlex Elder bufp += count; 288086b00e0dSAlex Elder 288186b00e0dSAlex Elder count = sprintf(bufp, "snap_id %llu\nsnap_name %s\n", 288286b00e0dSAlex Elder (unsigned long long) spec->snap_id, spec->snap_name); 288386b00e0dSAlex Elder if (count < 0) 288486b00e0dSAlex Elder return count; 288586b00e0dSAlex Elder bufp += count; 288686b00e0dSAlex Elder 288786b00e0dSAlex Elder count = sprintf(bufp, "overlap %llu\n", rbd_dev->parent_overlap); 288886b00e0dSAlex Elder if (count < 0) 288986b00e0dSAlex Elder return count; 289086b00e0dSAlex Elder bufp += count; 289186b00e0dSAlex Elder 289286b00e0dSAlex Elder return (ssize_t) (bufp - buf); 289386b00e0dSAlex Elder } 289486b00e0dSAlex Elder 2895dfc5606dSYehuda Sadeh static ssize_t rbd_image_refresh(struct device *dev, 2896dfc5606dSYehuda Sadeh struct device_attribute *attr, 2897dfc5606dSYehuda Sadeh const char *buf, 2898dfc5606dSYehuda Sadeh size_t size) 2899dfc5606dSYehuda Sadeh { 2900593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 2901b813623aSAlex Elder int ret; 2902602adf40SYehuda Sadeh 2903117973fbSAlex Elder ret = rbd_dev_refresh(rbd_dev, NULL); 2904b813623aSAlex Elder 2905b813623aSAlex Elder return ret < 0 ? ret : size; 2906dfc5606dSYehuda Sadeh } 2907602adf40SYehuda Sadeh 2908dfc5606dSYehuda Sadeh static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL); 290934b13184SAlex Elder static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL); 2910dfc5606dSYehuda Sadeh static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL); 2911dfc5606dSYehuda Sadeh static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL); 2912dfc5606dSYehuda Sadeh static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL); 29139bb2f334SAlex Elder static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL); 2914dfc5606dSYehuda Sadeh static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL); 2915589d30e0SAlex Elder static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL); 2916dfc5606dSYehuda Sadeh static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh); 2917dfc5606dSYehuda Sadeh static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL); 291886b00e0dSAlex Elder static DEVICE_ATTR(parent, S_IRUGO, rbd_parent_show, NULL); 2919dfc5606dSYehuda Sadeh 2920dfc5606dSYehuda Sadeh static struct attribute *rbd_attrs[] = { 2921dfc5606dSYehuda Sadeh &dev_attr_size.attr, 292234b13184SAlex Elder &dev_attr_features.attr, 2923dfc5606dSYehuda Sadeh &dev_attr_major.attr, 2924dfc5606dSYehuda Sadeh &dev_attr_client_id.attr, 2925dfc5606dSYehuda Sadeh &dev_attr_pool.attr, 29269bb2f334SAlex Elder &dev_attr_pool_id.attr, 2927dfc5606dSYehuda Sadeh &dev_attr_name.attr, 2928589d30e0SAlex Elder &dev_attr_image_id.attr, 2929dfc5606dSYehuda Sadeh &dev_attr_current_snap.attr, 293086b00e0dSAlex Elder &dev_attr_parent.attr, 2931dfc5606dSYehuda Sadeh &dev_attr_refresh.attr, 2932dfc5606dSYehuda Sadeh NULL 2933dfc5606dSYehuda Sadeh }; 2934dfc5606dSYehuda Sadeh 2935dfc5606dSYehuda Sadeh static struct attribute_group rbd_attr_group = { 2936dfc5606dSYehuda Sadeh .attrs = rbd_attrs, 2937dfc5606dSYehuda Sadeh }; 2938dfc5606dSYehuda Sadeh 2939dfc5606dSYehuda Sadeh static const struct attribute_group *rbd_attr_groups[] = { 2940dfc5606dSYehuda Sadeh &rbd_attr_group, 2941dfc5606dSYehuda Sadeh NULL 2942dfc5606dSYehuda Sadeh }; 2943dfc5606dSYehuda Sadeh 2944dfc5606dSYehuda Sadeh static void rbd_sysfs_dev_release(struct device *dev) 2945dfc5606dSYehuda Sadeh { 2946dfc5606dSYehuda Sadeh } 2947dfc5606dSYehuda Sadeh 2948dfc5606dSYehuda Sadeh static struct device_type rbd_device_type = { 2949dfc5606dSYehuda Sadeh .name = "rbd", 2950dfc5606dSYehuda Sadeh .groups = rbd_attr_groups, 2951dfc5606dSYehuda Sadeh .release = rbd_sysfs_dev_release, 2952dfc5606dSYehuda Sadeh }; 2953dfc5606dSYehuda Sadeh 2954dfc5606dSYehuda Sadeh 2955dfc5606dSYehuda Sadeh /* 2956dfc5606dSYehuda Sadeh sysfs - snapshots 2957dfc5606dSYehuda Sadeh */ 2958dfc5606dSYehuda Sadeh 2959dfc5606dSYehuda Sadeh static ssize_t rbd_snap_size_show(struct device *dev, 2960dfc5606dSYehuda Sadeh struct device_attribute *attr, 2961dfc5606dSYehuda Sadeh char *buf) 2962dfc5606dSYehuda Sadeh { 2963dfc5606dSYehuda Sadeh struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev); 2964dfc5606dSYehuda Sadeh 29653591538fSJosh Durgin return sprintf(buf, "%llu\n", (unsigned long long)snap->size); 2966dfc5606dSYehuda Sadeh } 2967dfc5606dSYehuda Sadeh 2968dfc5606dSYehuda Sadeh static ssize_t rbd_snap_id_show(struct device *dev, 2969dfc5606dSYehuda Sadeh struct device_attribute *attr, 2970dfc5606dSYehuda Sadeh char *buf) 2971dfc5606dSYehuda Sadeh { 2972dfc5606dSYehuda Sadeh struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev); 2973dfc5606dSYehuda Sadeh 2974593a9e7bSAlex Elder return sprintf(buf, "%llu\n", (unsigned long long)snap->id); 2975dfc5606dSYehuda Sadeh } 2976dfc5606dSYehuda Sadeh 297734b13184SAlex Elder static ssize_t rbd_snap_features_show(struct device *dev, 297834b13184SAlex Elder struct device_attribute *attr, 297934b13184SAlex Elder char *buf) 298034b13184SAlex Elder { 298134b13184SAlex Elder struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev); 298234b13184SAlex Elder 298334b13184SAlex Elder return sprintf(buf, "0x%016llx\n", 298434b13184SAlex Elder (unsigned long long) snap->features); 298534b13184SAlex Elder } 298634b13184SAlex Elder 2987dfc5606dSYehuda Sadeh static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL); 2988dfc5606dSYehuda Sadeh static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL); 298934b13184SAlex Elder static DEVICE_ATTR(snap_features, S_IRUGO, rbd_snap_features_show, NULL); 2990dfc5606dSYehuda Sadeh 2991dfc5606dSYehuda Sadeh static struct attribute *rbd_snap_attrs[] = { 2992dfc5606dSYehuda Sadeh &dev_attr_snap_size.attr, 2993dfc5606dSYehuda Sadeh &dev_attr_snap_id.attr, 299434b13184SAlex Elder &dev_attr_snap_features.attr, 2995dfc5606dSYehuda Sadeh NULL, 2996dfc5606dSYehuda Sadeh }; 2997dfc5606dSYehuda Sadeh 2998dfc5606dSYehuda Sadeh static struct attribute_group rbd_snap_attr_group = { 2999dfc5606dSYehuda Sadeh .attrs = rbd_snap_attrs, 3000dfc5606dSYehuda Sadeh }; 3001dfc5606dSYehuda Sadeh 3002dfc5606dSYehuda Sadeh static void rbd_snap_dev_release(struct device *dev) 3003dfc5606dSYehuda Sadeh { 3004dfc5606dSYehuda Sadeh struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev); 3005dfc5606dSYehuda Sadeh kfree(snap->name); 3006dfc5606dSYehuda Sadeh kfree(snap); 3007dfc5606dSYehuda Sadeh } 3008dfc5606dSYehuda Sadeh 3009dfc5606dSYehuda Sadeh static const struct attribute_group *rbd_snap_attr_groups[] = { 3010dfc5606dSYehuda Sadeh &rbd_snap_attr_group, 3011dfc5606dSYehuda Sadeh NULL 3012dfc5606dSYehuda Sadeh }; 3013dfc5606dSYehuda Sadeh 3014dfc5606dSYehuda Sadeh static struct device_type rbd_snap_device_type = { 3015dfc5606dSYehuda Sadeh .groups = rbd_snap_attr_groups, 3016dfc5606dSYehuda Sadeh .release = rbd_snap_dev_release, 3017dfc5606dSYehuda Sadeh }; 3018dfc5606dSYehuda Sadeh 30198b8fb99cSAlex Elder static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec) 30208b8fb99cSAlex Elder { 30218b8fb99cSAlex Elder kref_get(&spec->kref); 30228b8fb99cSAlex Elder 30238b8fb99cSAlex Elder return spec; 30248b8fb99cSAlex Elder } 30258b8fb99cSAlex Elder 30268b8fb99cSAlex Elder static void rbd_spec_free(struct kref *kref); 30278b8fb99cSAlex Elder static void rbd_spec_put(struct rbd_spec *spec) 30288b8fb99cSAlex Elder { 30298b8fb99cSAlex Elder if (spec) 30308b8fb99cSAlex Elder kref_put(&spec->kref, rbd_spec_free); 30318b8fb99cSAlex Elder } 30328b8fb99cSAlex Elder 30338b8fb99cSAlex Elder static struct rbd_spec *rbd_spec_alloc(void) 30348b8fb99cSAlex Elder { 30358b8fb99cSAlex Elder struct rbd_spec *spec; 30368b8fb99cSAlex Elder 30378b8fb99cSAlex Elder spec = kzalloc(sizeof (*spec), GFP_KERNEL); 30388b8fb99cSAlex Elder if (!spec) 30398b8fb99cSAlex Elder return NULL; 30408b8fb99cSAlex Elder kref_init(&spec->kref); 30418b8fb99cSAlex Elder 30428b8fb99cSAlex Elder return spec; 30438b8fb99cSAlex Elder } 30448b8fb99cSAlex Elder 30458b8fb99cSAlex Elder static void rbd_spec_free(struct kref *kref) 30468b8fb99cSAlex Elder { 30478b8fb99cSAlex Elder struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref); 30488b8fb99cSAlex Elder 30498b8fb99cSAlex Elder kfree(spec->pool_name); 30508b8fb99cSAlex Elder kfree(spec->image_id); 30518b8fb99cSAlex Elder kfree(spec->image_name); 30528b8fb99cSAlex Elder kfree(spec->snap_name); 30538b8fb99cSAlex Elder kfree(spec); 30548b8fb99cSAlex Elder } 30558b8fb99cSAlex Elder 3056cc344fa1SAlex Elder static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc, 3057c53d5893SAlex Elder struct rbd_spec *spec) 3058c53d5893SAlex Elder { 3059c53d5893SAlex Elder struct rbd_device *rbd_dev; 3060c53d5893SAlex Elder 3061c53d5893SAlex Elder rbd_dev = kzalloc(sizeof (*rbd_dev), GFP_KERNEL); 3062c53d5893SAlex Elder if (!rbd_dev) 3063c53d5893SAlex Elder return NULL; 3064c53d5893SAlex Elder 3065c53d5893SAlex Elder spin_lock_init(&rbd_dev->lock); 30666d292906SAlex Elder rbd_dev->flags = 0; 3067c53d5893SAlex Elder INIT_LIST_HEAD(&rbd_dev->node); 3068c53d5893SAlex Elder INIT_LIST_HEAD(&rbd_dev->snaps); 3069c53d5893SAlex Elder init_rwsem(&rbd_dev->header_rwsem); 3070c53d5893SAlex Elder 3071c53d5893SAlex Elder rbd_dev->spec = spec; 3072c53d5893SAlex Elder rbd_dev->rbd_client = rbdc; 3073c53d5893SAlex Elder 30740903e875SAlex Elder /* Initialize the layout used for all rbd requests */ 30750903e875SAlex Elder 30760903e875SAlex Elder rbd_dev->layout.fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER); 30770903e875SAlex Elder rbd_dev->layout.fl_stripe_count = cpu_to_le32(1); 30780903e875SAlex Elder rbd_dev->layout.fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER); 30790903e875SAlex Elder rbd_dev->layout.fl_pg_pool = cpu_to_le32((u32) spec->pool_id); 30800903e875SAlex Elder 3081c53d5893SAlex Elder return rbd_dev; 3082c53d5893SAlex Elder } 3083c53d5893SAlex Elder 3084c53d5893SAlex Elder static void rbd_dev_destroy(struct rbd_device *rbd_dev) 3085c53d5893SAlex Elder { 308686b00e0dSAlex Elder rbd_spec_put(rbd_dev->parent_spec); 3087c53d5893SAlex Elder kfree(rbd_dev->header_name); 3088c53d5893SAlex Elder rbd_put_client(rbd_dev->rbd_client); 3089c53d5893SAlex Elder rbd_spec_put(rbd_dev->spec); 3090c53d5893SAlex Elder kfree(rbd_dev); 3091c53d5893SAlex Elder } 3092c53d5893SAlex Elder 3093304f6808SAlex Elder static bool rbd_snap_registered(struct rbd_snap *snap) 3094304f6808SAlex Elder { 3095304f6808SAlex Elder bool ret = snap->dev.type == &rbd_snap_device_type; 3096304f6808SAlex Elder bool reg = device_is_registered(&snap->dev); 3097304f6808SAlex Elder 3098304f6808SAlex Elder rbd_assert(!ret ^ reg); 3099304f6808SAlex Elder 3100304f6808SAlex Elder return ret; 3101304f6808SAlex Elder } 3102304f6808SAlex Elder 310341f38c2bSAlex Elder static void rbd_remove_snap_dev(struct rbd_snap *snap) 3104dfc5606dSYehuda Sadeh { 3105dfc5606dSYehuda Sadeh list_del(&snap->node); 3106304f6808SAlex Elder if (device_is_registered(&snap->dev)) 3107dfc5606dSYehuda Sadeh device_unregister(&snap->dev); 3108dfc5606dSYehuda Sadeh } 3109dfc5606dSYehuda Sadeh 311014e7085dSAlex Elder static int rbd_register_snap_dev(struct rbd_snap *snap, 3111dfc5606dSYehuda Sadeh struct device *parent) 3112dfc5606dSYehuda Sadeh { 3113dfc5606dSYehuda Sadeh struct device *dev = &snap->dev; 3114dfc5606dSYehuda Sadeh int ret; 3115dfc5606dSYehuda Sadeh 3116dfc5606dSYehuda Sadeh dev->type = &rbd_snap_device_type; 3117dfc5606dSYehuda Sadeh dev->parent = parent; 3118dfc5606dSYehuda Sadeh dev->release = rbd_snap_dev_release; 3119d4b125e9SAlex Elder dev_set_name(dev, "%s%s", RBD_SNAP_DEV_NAME_PREFIX, snap->name); 3120304f6808SAlex Elder dout("%s: registering device for snapshot %s\n", __func__, snap->name); 3121304f6808SAlex Elder 3122dfc5606dSYehuda Sadeh ret = device_register(dev); 3123dfc5606dSYehuda Sadeh 3124dfc5606dSYehuda Sadeh return ret; 3125dfc5606dSYehuda Sadeh } 3126dfc5606dSYehuda Sadeh 31274e891e0aSAlex Elder static struct rbd_snap *__rbd_add_snap_dev(struct rbd_device *rbd_dev, 3128c8d18425SAlex Elder const char *snap_name, 312934b13184SAlex Elder u64 snap_id, u64 snap_size, 313034b13184SAlex Elder u64 snap_features) 3131dfc5606dSYehuda Sadeh { 31324e891e0aSAlex Elder struct rbd_snap *snap; 3133dfc5606dSYehuda Sadeh int ret; 31344e891e0aSAlex Elder 31354e891e0aSAlex Elder snap = kzalloc(sizeof (*snap), GFP_KERNEL); 3136dfc5606dSYehuda Sadeh if (!snap) 31374e891e0aSAlex Elder return ERR_PTR(-ENOMEM); 31384e891e0aSAlex Elder 31394e891e0aSAlex Elder ret = -ENOMEM; 3140c8d18425SAlex Elder snap->name = kstrdup(snap_name, GFP_KERNEL); 31414e891e0aSAlex Elder if (!snap->name) 31424e891e0aSAlex Elder goto err; 31434e891e0aSAlex Elder 3144c8d18425SAlex Elder snap->id = snap_id; 3145c8d18425SAlex Elder snap->size = snap_size; 314634b13184SAlex Elder snap->features = snap_features; 31474e891e0aSAlex Elder 31484e891e0aSAlex Elder return snap; 31494e891e0aSAlex Elder 3150dfc5606dSYehuda Sadeh err: 3151dfc5606dSYehuda Sadeh kfree(snap->name); 3152dfc5606dSYehuda Sadeh kfree(snap); 31534e891e0aSAlex Elder 31544e891e0aSAlex Elder return ERR_PTR(ret); 3155dfc5606dSYehuda Sadeh } 3156dfc5606dSYehuda Sadeh 3157cd892126SAlex Elder static char *rbd_dev_v1_snap_info(struct rbd_device *rbd_dev, u32 which, 3158cd892126SAlex Elder u64 *snap_size, u64 *snap_features) 3159cd892126SAlex Elder { 3160cd892126SAlex Elder char *snap_name; 3161cd892126SAlex Elder 3162cd892126SAlex Elder rbd_assert(which < rbd_dev->header.snapc->num_snaps); 3163cd892126SAlex Elder 3164cd892126SAlex Elder *snap_size = rbd_dev->header.snap_sizes[which]; 3165cd892126SAlex Elder *snap_features = 0; /* No features for v1 */ 3166cd892126SAlex Elder 3167cd892126SAlex Elder /* Skip over names until we find the one we are looking for */ 3168cd892126SAlex Elder 3169cd892126SAlex Elder snap_name = rbd_dev->header.snap_names; 3170cd892126SAlex Elder while (which--) 3171cd892126SAlex Elder snap_name += strlen(snap_name) + 1; 3172cd892126SAlex Elder 3173cd892126SAlex Elder return snap_name; 3174cd892126SAlex Elder } 3175cd892126SAlex Elder 3176dfc5606dSYehuda Sadeh /* 31779d475de5SAlex Elder * Get the size and object order for an image snapshot, or if 31789d475de5SAlex Elder * snap_id is CEPH_NOSNAP, gets this information for the base 31799d475de5SAlex Elder * image. 31809d475de5SAlex Elder */ 31819d475de5SAlex Elder static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id, 31829d475de5SAlex Elder u8 *order, u64 *snap_size) 31839d475de5SAlex Elder { 31849d475de5SAlex Elder __le64 snapid = cpu_to_le64(snap_id); 31859d475de5SAlex Elder int ret; 31869d475de5SAlex Elder struct { 31879d475de5SAlex Elder u8 order; 31889d475de5SAlex Elder __le64 size; 31899d475de5SAlex Elder } __attribute__ ((packed)) size_buf = { 0 }; 31909d475de5SAlex Elder 319136be9a76SAlex Elder ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, 31929d475de5SAlex Elder "rbd", "get_size", 31939d475de5SAlex Elder (char *) &snapid, sizeof (snapid), 319407b2391fSAlex Elder (char *) &size_buf, sizeof (size_buf), NULL); 319536be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 31969d475de5SAlex Elder if (ret < 0) 31979d475de5SAlex Elder return ret; 31989d475de5SAlex Elder 31999d475de5SAlex Elder *order = size_buf.order; 32009d475de5SAlex Elder *snap_size = le64_to_cpu(size_buf.size); 32019d475de5SAlex Elder 32029d475de5SAlex Elder dout(" snap_id 0x%016llx order = %u, snap_size = %llu\n", 32039d475de5SAlex Elder (unsigned long long) snap_id, (unsigned int) *order, 32049d475de5SAlex Elder (unsigned long long) *snap_size); 32059d475de5SAlex Elder 32069d475de5SAlex Elder return 0; 32079d475de5SAlex Elder } 32089d475de5SAlex Elder 32099d475de5SAlex Elder static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev) 32109d475de5SAlex Elder { 32119d475de5SAlex Elder return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP, 32129d475de5SAlex Elder &rbd_dev->header.obj_order, 32139d475de5SAlex Elder &rbd_dev->header.image_size); 32149d475de5SAlex Elder } 32159d475de5SAlex Elder 32161e130199SAlex Elder static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev) 32171e130199SAlex Elder { 32181e130199SAlex Elder void *reply_buf; 32191e130199SAlex Elder int ret; 32201e130199SAlex Elder void *p; 32211e130199SAlex Elder 32221e130199SAlex Elder reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL); 32231e130199SAlex Elder if (!reply_buf) 32241e130199SAlex Elder return -ENOMEM; 32251e130199SAlex Elder 322636be9a76SAlex Elder ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, 32271e130199SAlex Elder "rbd", "get_object_prefix", 32281e130199SAlex Elder NULL, 0, 322907b2391fSAlex Elder reply_buf, RBD_OBJ_PREFIX_LEN_MAX, NULL); 323036be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 32311e130199SAlex Elder if (ret < 0) 32321e130199SAlex Elder goto out; 32331e130199SAlex Elder 32341e130199SAlex Elder p = reply_buf; 32351e130199SAlex Elder rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p, 32361e130199SAlex Elder p + RBD_OBJ_PREFIX_LEN_MAX, 32371e130199SAlex Elder NULL, GFP_NOIO); 32381e130199SAlex Elder 32391e130199SAlex Elder if (IS_ERR(rbd_dev->header.object_prefix)) { 32401e130199SAlex Elder ret = PTR_ERR(rbd_dev->header.object_prefix); 32411e130199SAlex Elder rbd_dev->header.object_prefix = NULL; 32421e130199SAlex Elder } else { 32431e130199SAlex Elder dout(" object_prefix = %s\n", rbd_dev->header.object_prefix); 32441e130199SAlex Elder } 32451e130199SAlex Elder 32461e130199SAlex Elder out: 32471e130199SAlex Elder kfree(reply_buf); 32481e130199SAlex Elder 32491e130199SAlex Elder return ret; 32501e130199SAlex Elder } 32511e130199SAlex Elder 3252b1b5402aSAlex Elder static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id, 3253b1b5402aSAlex Elder u64 *snap_features) 3254b1b5402aSAlex Elder { 3255b1b5402aSAlex Elder __le64 snapid = cpu_to_le64(snap_id); 3256b1b5402aSAlex Elder struct { 3257b1b5402aSAlex Elder __le64 features; 3258b1b5402aSAlex Elder __le64 incompat; 3259b1b5402aSAlex Elder } features_buf = { 0 }; 3260d889140cSAlex Elder u64 incompat; 3261b1b5402aSAlex Elder int ret; 3262b1b5402aSAlex Elder 326336be9a76SAlex Elder ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, 3264b1b5402aSAlex Elder "rbd", "get_features", 3265b1b5402aSAlex Elder (char *) &snapid, sizeof (snapid), 3266b1b5402aSAlex Elder (char *) &features_buf, sizeof (features_buf), 326707b2391fSAlex Elder NULL); 326836be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 3269b1b5402aSAlex Elder if (ret < 0) 3270b1b5402aSAlex Elder return ret; 3271d889140cSAlex Elder 3272d889140cSAlex Elder incompat = le64_to_cpu(features_buf.incompat); 32735cbf6f12SAlex Elder if (incompat & ~RBD_FEATURES_SUPPORTED) 3274b8f5c6edSAlex Elder return -ENXIO; 3275d889140cSAlex Elder 3276b1b5402aSAlex Elder *snap_features = le64_to_cpu(features_buf.features); 3277b1b5402aSAlex Elder 3278b1b5402aSAlex Elder dout(" snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n", 3279b1b5402aSAlex Elder (unsigned long long) snap_id, 3280b1b5402aSAlex Elder (unsigned long long) *snap_features, 3281b1b5402aSAlex Elder (unsigned long long) le64_to_cpu(features_buf.incompat)); 3282b1b5402aSAlex Elder 3283b1b5402aSAlex Elder return 0; 3284b1b5402aSAlex Elder } 3285b1b5402aSAlex Elder 3286b1b5402aSAlex Elder static int rbd_dev_v2_features(struct rbd_device *rbd_dev) 3287b1b5402aSAlex Elder { 3288b1b5402aSAlex Elder return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP, 3289b1b5402aSAlex Elder &rbd_dev->header.features); 3290b1b5402aSAlex Elder } 3291b1b5402aSAlex Elder 329286b00e0dSAlex Elder static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev) 329386b00e0dSAlex Elder { 329486b00e0dSAlex Elder struct rbd_spec *parent_spec; 329586b00e0dSAlex Elder size_t size; 329686b00e0dSAlex Elder void *reply_buf = NULL; 329786b00e0dSAlex Elder __le64 snapid; 329886b00e0dSAlex Elder void *p; 329986b00e0dSAlex Elder void *end; 330086b00e0dSAlex Elder char *image_id; 330186b00e0dSAlex Elder u64 overlap; 330286b00e0dSAlex Elder int ret; 330386b00e0dSAlex Elder 330486b00e0dSAlex Elder parent_spec = rbd_spec_alloc(); 330586b00e0dSAlex Elder if (!parent_spec) 330686b00e0dSAlex Elder return -ENOMEM; 330786b00e0dSAlex Elder 330886b00e0dSAlex Elder size = sizeof (__le64) + /* pool_id */ 330986b00e0dSAlex Elder sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX + /* image_id */ 331086b00e0dSAlex Elder sizeof (__le64) + /* snap_id */ 331186b00e0dSAlex Elder sizeof (__le64); /* overlap */ 331286b00e0dSAlex Elder reply_buf = kmalloc(size, GFP_KERNEL); 331386b00e0dSAlex Elder if (!reply_buf) { 331486b00e0dSAlex Elder ret = -ENOMEM; 331586b00e0dSAlex Elder goto out_err; 331686b00e0dSAlex Elder } 331786b00e0dSAlex Elder 331886b00e0dSAlex Elder snapid = cpu_to_le64(CEPH_NOSNAP); 331936be9a76SAlex Elder ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, 332086b00e0dSAlex Elder "rbd", "get_parent", 332186b00e0dSAlex Elder (char *) &snapid, sizeof (snapid), 332207b2391fSAlex Elder (char *) reply_buf, size, NULL); 332336be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 332486b00e0dSAlex Elder if (ret < 0) 332586b00e0dSAlex Elder goto out_err; 332686b00e0dSAlex Elder 332786b00e0dSAlex Elder ret = -ERANGE; 332886b00e0dSAlex Elder p = reply_buf; 332986b00e0dSAlex Elder end = (char *) reply_buf + size; 333086b00e0dSAlex Elder ceph_decode_64_safe(&p, end, parent_spec->pool_id, out_err); 333186b00e0dSAlex Elder if (parent_spec->pool_id == CEPH_NOPOOL) 333286b00e0dSAlex Elder goto out; /* No parent? No problem. */ 333386b00e0dSAlex Elder 33340903e875SAlex Elder /* The ceph file layout needs to fit pool id in 32 bits */ 33350903e875SAlex Elder 33360903e875SAlex Elder ret = -EIO; 33370903e875SAlex Elder if (WARN_ON(parent_spec->pool_id > (u64) U32_MAX)) 33380903e875SAlex Elder goto out; 33390903e875SAlex Elder 3340979ed480SAlex Elder image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL); 334186b00e0dSAlex Elder if (IS_ERR(image_id)) { 334286b00e0dSAlex Elder ret = PTR_ERR(image_id); 334386b00e0dSAlex Elder goto out_err; 334486b00e0dSAlex Elder } 334586b00e0dSAlex Elder parent_spec->image_id = image_id; 334686b00e0dSAlex Elder ceph_decode_64_safe(&p, end, parent_spec->snap_id, out_err); 334786b00e0dSAlex Elder ceph_decode_64_safe(&p, end, overlap, out_err); 334886b00e0dSAlex Elder 334986b00e0dSAlex Elder rbd_dev->parent_overlap = overlap; 335086b00e0dSAlex Elder rbd_dev->parent_spec = parent_spec; 335186b00e0dSAlex Elder parent_spec = NULL; /* rbd_dev now owns this */ 335286b00e0dSAlex Elder out: 335386b00e0dSAlex Elder ret = 0; 335486b00e0dSAlex Elder out_err: 335586b00e0dSAlex Elder kfree(reply_buf); 335686b00e0dSAlex Elder rbd_spec_put(parent_spec); 335786b00e0dSAlex Elder 335886b00e0dSAlex Elder return ret; 335986b00e0dSAlex Elder } 336086b00e0dSAlex Elder 33619e15b77dSAlex Elder static char *rbd_dev_image_name(struct rbd_device *rbd_dev) 33629e15b77dSAlex Elder { 33639e15b77dSAlex Elder size_t image_id_size; 33649e15b77dSAlex Elder char *image_id; 33659e15b77dSAlex Elder void *p; 33669e15b77dSAlex Elder void *end; 33679e15b77dSAlex Elder size_t size; 33689e15b77dSAlex Elder void *reply_buf = NULL; 33699e15b77dSAlex Elder size_t len = 0; 33709e15b77dSAlex Elder char *image_name = NULL; 33719e15b77dSAlex Elder int ret; 33729e15b77dSAlex Elder 33739e15b77dSAlex Elder rbd_assert(!rbd_dev->spec->image_name); 33749e15b77dSAlex Elder 337569e7a02fSAlex Elder len = strlen(rbd_dev->spec->image_id); 337669e7a02fSAlex Elder image_id_size = sizeof (__le32) + len; 33779e15b77dSAlex Elder image_id = kmalloc(image_id_size, GFP_KERNEL); 33789e15b77dSAlex Elder if (!image_id) 33799e15b77dSAlex Elder return NULL; 33809e15b77dSAlex Elder 33819e15b77dSAlex Elder p = image_id; 33829e15b77dSAlex Elder end = (char *) image_id + image_id_size; 338369e7a02fSAlex Elder ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32) len); 33849e15b77dSAlex Elder 33859e15b77dSAlex Elder size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX; 33869e15b77dSAlex Elder reply_buf = kmalloc(size, GFP_KERNEL); 33879e15b77dSAlex Elder if (!reply_buf) 33889e15b77dSAlex Elder goto out; 33899e15b77dSAlex Elder 339036be9a76SAlex Elder ret = rbd_obj_method_sync(rbd_dev, RBD_DIRECTORY, 33919e15b77dSAlex Elder "rbd", "dir_get_name", 33929e15b77dSAlex Elder image_id, image_id_size, 339307b2391fSAlex Elder (char *) reply_buf, size, NULL); 33949e15b77dSAlex Elder if (ret < 0) 33959e15b77dSAlex Elder goto out; 33969e15b77dSAlex Elder p = reply_buf; 33979e15b77dSAlex Elder end = (char *) reply_buf + size; 33989e15b77dSAlex Elder image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL); 33999e15b77dSAlex Elder if (IS_ERR(image_name)) 34009e15b77dSAlex Elder image_name = NULL; 34019e15b77dSAlex Elder else 34029e15b77dSAlex Elder dout("%s: name is %s len is %zd\n", __func__, image_name, len); 34039e15b77dSAlex Elder out: 34049e15b77dSAlex Elder kfree(reply_buf); 34059e15b77dSAlex Elder kfree(image_id); 34069e15b77dSAlex Elder 34079e15b77dSAlex Elder return image_name; 34089e15b77dSAlex Elder } 34099e15b77dSAlex Elder 34109e15b77dSAlex Elder /* 34119e15b77dSAlex Elder * When a parent image gets probed, we only have the pool, image, 34129e15b77dSAlex Elder * and snapshot ids but not the names of any of them. This call 34139e15b77dSAlex Elder * is made later to fill in those names. It has to be done after 34149e15b77dSAlex Elder * rbd_dev_snaps_update() has completed because some of the 34159e15b77dSAlex Elder * information (in particular, snapshot name) is not available 34169e15b77dSAlex Elder * until then. 34179e15b77dSAlex Elder */ 34189e15b77dSAlex Elder static int rbd_dev_probe_update_spec(struct rbd_device *rbd_dev) 34199e15b77dSAlex Elder { 34209e15b77dSAlex Elder struct ceph_osd_client *osdc; 34219e15b77dSAlex Elder const char *name; 34229e15b77dSAlex Elder void *reply_buf = NULL; 34239e15b77dSAlex Elder int ret; 34249e15b77dSAlex Elder 34259e15b77dSAlex Elder if (rbd_dev->spec->pool_name) 34269e15b77dSAlex Elder return 0; /* Already have the names */ 34279e15b77dSAlex Elder 34289e15b77dSAlex Elder /* Look up the pool name */ 34299e15b77dSAlex Elder 34309e15b77dSAlex Elder osdc = &rbd_dev->rbd_client->client->osdc; 34319e15b77dSAlex Elder name = ceph_pg_pool_name_by_id(osdc->osdmap, rbd_dev->spec->pool_id); 3432935dc89fSAlex Elder if (!name) { 3433935dc89fSAlex Elder rbd_warn(rbd_dev, "there is no pool with id %llu", 3434935dc89fSAlex Elder rbd_dev->spec->pool_id); /* Really a BUG() */ 3435935dc89fSAlex Elder return -EIO; 3436935dc89fSAlex Elder } 34379e15b77dSAlex Elder 34389e15b77dSAlex Elder rbd_dev->spec->pool_name = kstrdup(name, GFP_KERNEL); 34399e15b77dSAlex Elder if (!rbd_dev->spec->pool_name) 34409e15b77dSAlex Elder return -ENOMEM; 34419e15b77dSAlex Elder 34429e15b77dSAlex Elder /* Fetch the image name; tolerate failure here */ 34439e15b77dSAlex Elder 34449e15b77dSAlex Elder name = rbd_dev_image_name(rbd_dev); 344569e7a02fSAlex Elder if (name) 34469e15b77dSAlex Elder rbd_dev->spec->image_name = (char *) name; 344769e7a02fSAlex Elder else 344806ecc6cbSAlex Elder rbd_warn(rbd_dev, "unable to get image name"); 34499e15b77dSAlex Elder 34509e15b77dSAlex Elder /* Look up the snapshot name. */ 34519e15b77dSAlex Elder 34529e15b77dSAlex Elder name = rbd_snap_name(rbd_dev, rbd_dev->spec->snap_id); 34539e15b77dSAlex Elder if (!name) { 3454935dc89fSAlex Elder rbd_warn(rbd_dev, "no snapshot with id %llu", 3455935dc89fSAlex Elder rbd_dev->spec->snap_id); /* Really a BUG() */ 34569e15b77dSAlex Elder ret = -EIO; 34579e15b77dSAlex Elder goto out_err; 34589e15b77dSAlex Elder } 34599e15b77dSAlex Elder rbd_dev->spec->snap_name = kstrdup(name, GFP_KERNEL); 34609e15b77dSAlex Elder if(!rbd_dev->spec->snap_name) 34619e15b77dSAlex Elder goto out_err; 34629e15b77dSAlex Elder 34639e15b77dSAlex Elder return 0; 34649e15b77dSAlex Elder out_err: 34659e15b77dSAlex Elder kfree(reply_buf); 34669e15b77dSAlex Elder kfree(rbd_dev->spec->pool_name); 34679e15b77dSAlex Elder rbd_dev->spec->pool_name = NULL; 34689e15b77dSAlex Elder 34699e15b77dSAlex Elder return ret; 34709e15b77dSAlex Elder } 34719e15b77dSAlex Elder 34726e14b1a6SAlex Elder static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev, u64 *ver) 347335d489f9SAlex Elder { 347435d489f9SAlex Elder size_t size; 347535d489f9SAlex Elder int ret; 347635d489f9SAlex Elder void *reply_buf; 347735d489f9SAlex Elder void *p; 347835d489f9SAlex Elder void *end; 347935d489f9SAlex Elder u64 seq; 348035d489f9SAlex Elder u32 snap_count; 348135d489f9SAlex Elder struct ceph_snap_context *snapc; 348235d489f9SAlex Elder u32 i; 348335d489f9SAlex Elder 348435d489f9SAlex Elder /* 348535d489f9SAlex Elder * We'll need room for the seq value (maximum snapshot id), 348635d489f9SAlex Elder * snapshot count, and array of that many snapshot ids. 348735d489f9SAlex Elder * For now we have a fixed upper limit on the number we're 348835d489f9SAlex Elder * prepared to receive. 348935d489f9SAlex Elder */ 349035d489f9SAlex Elder size = sizeof (__le64) + sizeof (__le32) + 349135d489f9SAlex Elder RBD_MAX_SNAP_COUNT * sizeof (__le64); 349235d489f9SAlex Elder reply_buf = kzalloc(size, GFP_KERNEL); 349335d489f9SAlex Elder if (!reply_buf) 349435d489f9SAlex Elder return -ENOMEM; 349535d489f9SAlex Elder 349636be9a76SAlex Elder ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, 349735d489f9SAlex Elder "rbd", "get_snapcontext", 349835d489f9SAlex Elder NULL, 0, 349907b2391fSAlex Elder reply_buf, size, ver); 350036be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 350135d489f9SAlex Elder if (ret < 0) 350235d489f9SAlex Elder goto out; 350335d489f9SAlex Elder 350435d489f9SAlex Elder ret = -ERANGE; 350535d489f9SAlex Elder p = reply_buf; 350635d489f9SAlex Elder end = (char *) reply_buf + size; 350735d489f9SAlex Elder ceph_decode_64_safe(&p, end, seq, out); 350835d489f9SAlex Elder ceph_decode_32_safe(&p, end, snap_count, out); 350935d489f9SAlex Elder 351035d489f9SAlex Elder /* 351135d489f9SAlex Elder * Make sure the reported number of snapshot ids wouldn't go 351235d489f9SAlex Elder * beyond the end of our buffer. But before checking that, 351335d489f9SAlex Elder * make sure the computed size of the snapshot context we 351435d489f9SAlex Elder * allocate is representable in a size_t. 351535d489f9SAlex Elder */ 351635d489f9SAlex Elder if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context)) 351735d489f9SAlex Elder / sizeof (u64)) { 351835d489f9SAlex Elder ret = -EINVAL; 351935d489f9SAlex Elder goto out; 352035d489f9SAlex Elder } 352135d489f9SAlex Elder if (!ceph_has_room(&p, end, snap_count * sizeof (__le64))) 352235d489f9SAlex Elder goto out; 352335d489f9SAlex Elder 352435d489f9SAlex Elder size = sizeof (struct ceph_snap_context) + 352535d489f9SAlex Elder snap_count * sizeof (snapc->snaps[0]); 352635d489f9SAlex Elder snapc = kmalloc(size, GFP_KERNEL); 352735d489f9SAlex Elder if (!snapc) { 352835d489f9SAlex Elder ret = -ENOMEM; 352935d489f9SAlex Elder goto out; 353035d489f9SAlex Elder } 353135d489f9SAlex Elder 353235d489f9SAlex Elder atomic_set(&snapc->nref, 1); 353335d489f9SAlex Elder snapc->seq = seq; 353435d489f9SAlex Elder snapc->num_snaps = snap_count; 353535d489f9SAlex Elder for (i = 0; i < snap_count; i++) 353635d489f9SAlex Elder snapc->snaps[i] = ceph_decode_64(&p); 353735d489f9SAlex Elder 353835d489f9SAlex Elder rbd_dev->header.snapc = snapc; 353935d489f9SAlex Elder 354035d489f9SAlex Elder dout(" snap context seq = %llu, snap_count = %u\n", 354135d489f9SAlex Elder (unsigned long long) seq, (unsigned int) snap_count); 354235d489f9SAlex Elder 354335d489f9SAlex Elder out: 354435d489f9SAlex Elder kfree(reply_buf); 354535d489f9SAlex Elder 354635d489f9SAlex Elder return 0; 354735d489f9SAlex Elder } 354835d489f9SAlex Elder 3549b8b1e2dbSAlex Elder static char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, u32 which) 3550b8b1e2dbSAlex Elder { 3551b8b1e2dbSAlex Elder size_t size; 3552b8b1e2dbSAlex Elder void *reply_buf; 3553b8b1e2dbSAlex Elder __le64 snap_id; 3554b8b1e2dbSAlex Elder int ret; 3555b8b1e2dbSAlex Elder void *p; 3556b8b1e2dbSAlex Elder void *end; 3557b8b1e2dbSAlex Elder char *snap_name; 3558b8b1e2dbSAlex Elder 3559b8b1e2dbSAlex Elder size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN; 3560b8b1e2dbSAlex Elder reply_buf = kmalloc(size, GFP_KERNEL); 3561b8b1e2dbSAlex Elder if (!reply_buf) 3562b8b1e2dbSAlex Elder return ERR_PTR(-ENOMEM); 3563b8b1e2dbSAlex Elder 3564b8b1e2dbSAlex Elder snap_id = cpu_to_le64(rbd_dev->header.snapc->snaps[which]); 356536be9a76SAlex Elder ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, 3566b8b1e2dbSAlex Elder "rbd", "get_snapshot_name", 3567b8b1e2dbSAlex Elder (char *) &snap_id, sizeof (snap_id), 356807b2391fSAlex Elder reply_buf, size, NULL); 356936be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 3570b8b1e2dbSAlex Elder if (ret < 0) 3571b8b1e2dbSAlex Elder goto out; 3572b8b1e2dbSAlex Elder 3573b8b1e2dbSAlex Elder p = reply_buf; 3574b8b1e2dbSAlex Elder end = (char *) reply_buf + size; 3575e5c35534SAlex Elder snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL); 3576b8b1e2dbSAlex Elder if (IS_ERR(snap_name)) { 3577b8b1e2dbSAlex Elder ret = PTR_ERR(snap_name); 3578b8b1e2dbSAlex Elder goto out; 3579b8b1e2dbSAlex Elder } else { 3580b8b1e2dbSAlex Elder dout(" snap_id 0x%016llx snap_name = %s\n", 3581b8b1e2dbSAlex Elder (unsigned long long) le64_to_cpu(snap_id), snap_name); 3582b8b1e2dbSAlex Elder } 3583b8b1e2dbSAlex Elder kfree(reply_buf); 3584b8b1e2dbSAlex Elder 3585b8b1e2dbSAlex Elder return snap_name; 3586b8b1e2dbSAlex Elder out: 3587b8b1e2dbSAlex Elder kfree(reply_buf); 3588b8b1e2dbSAlex Elder 3589b8b1e2dbSAlex Elder return ERR_PTR(ret); 3590b8b1e2dbSAlex Elder } 3591b8b1e2dbSAlex Elder 3592b8b1e2dbSAlex Elder static char *rbd_dev_v2_snap_info(struct rbd_device *rbd_dev, u32 which, 3593b8b1e2dbSAlex Elder u64 *snap_size, u64 *snap_features) 3594b8b1e2dbSAlex Elder { 3595e0b49868SAlex Elder u64 snap_id; 3596b8b1e2dbSAlex Elder u8 order; 3597b8b1e2dbSAlex Elder int ret; 3598b8b1e2dbSAlex Elder 3599b8b1e2dbSAlex Elder snap_id = rbd_dev->header.snapc->snaps[which]; 3600b8b1e2dbSAlex Elder ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, &order, snap_size); 3601b8b1e2dbSAlex Elder if (ret) 3602b8b1e2dbSAlex Elder return ERR_PTR(ret); 3603b8b1e2dbSAlex Elder ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, snap_features); 3604b8b1e2dbSAlex Elder if (ret) 3605b8b1e2dbSAlex Elder return ERR_PTR(ret); 3606b8b1e2dbSAlex Elder 3607b8b1e2dbSAlex Elder return rbd_dev_v2_snap_name(rbd_dev, which); 3608b8b1e2dbSAlex Elder } 3609b8b1e2dbSAlex Elder 3610b8b1e2dbSAlex Elder static char *rbd_dev_snap_info(struct rbd_device *rbd_dev, u32 which, 3611b8b1e2dbSAlex Elder u64 *snap_size, u64 *snap_features) 3612b8b1e2dbSAlex Elder { 3613b8b1e2dbSAlex Elder if (rbd_dev->image_format == 1) 3614b8b1e2dbSAlex Elder return rbd_dev_v1_snap_info(rbd_dev, which, 3615b8b1e2dbSAlex Elder snap_size, snap_features); 3616b8b1e2dbSAlex Elder if (rbd_dev->image_format == 2) 3617b8b1e2dbSAlex Elder return rbd_dev_v2_snap_info(rbd_dev, which, 3618b8b1e2dbSAlex Elder snap_size, snap_features); 3619b8b1e2dbSAlex Elder return ERR_PTR(-EINVAL); 3620b8b1e2dbSAlex Elder } 3621b8b1e2dbSAlex Elder 3622117973fbSAlex Elder static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver) 3623117973fbSAlex Elder { 3624117973fbSAlex Elder int ret; 3625117973fbSAlex Elder __u8 obj_order; 3626117973fbSAlex Elder 3627117973fbSAlex Elder down_write(&rbd_dev->header_rwsem); 3628117973fbSAlex Elder 3629117973fbSAlex Elder /* Grab old order first, to see if it changes */ 3630117973fbSAlex Elder 3631117973fbSAlex Elder obj_order = rbd_dev->header.obj_order, 3632117973fbSAlex Elder ret = rbd_dev_v2_image_size(rbd_dev); 3633117973fbSAlex Elder if (ret) 3634117973fbSAlex Elder goto out; 3635117973fbSAlex Elder if (rbd_dev->header.obj_order != obj_order) { 3636117973fbSAlex Elder ret = -EIO; 3637117973fbSAlex Elder goto out; 3638117973fbSAlex Elder } 3639117973fbSAlex Elder rbd_update_mapping_size(rbd_dev); 3640117973fbSAlex Elder 3641117973fbSAlex Elder ret = rbd_dev_v2_snap_context(rbd_dev, hver); 3642117973fbSAlex Elder dout("rbd_dev_v2_snap_context returned %d\n", ret); 3643117973fbSAlex Elder if (ret) 3644117973fbSAlex Elder goto out; 3645117973fbSAlex Elder ret = rbd_dev_snaps_update(rbd_dev); 3646117973fbSAlex Elder dout("rbd_dev_snaps_update returned %d\n", ret); 3647117973fbSAlex Elder if (ret) 3648117973fbSAlex Elder goto out; 3649117973fbSAlex Elder ret = rbd_dev_snaps_register(rbd_dev); 3650117973fbSAlex Elder dout("rbd_dev_snaps_register returned %d\n", ret); 3651117973fbSAlex Elder out: 3652117973fbSAlex Elder up_write(&rbd_dev->header_rwsem); 3653117973fbSAlex Elder 3654117973fbSAlex Elder return ret; 3655117973fbSAlex Elder } 3656117973fbSAlex Elder 36579d475de5SAlex Elder /* 365835938150SAlex Elder * Scan the rbd device's current snapshot list and compare it to the 365935938150SAlex Elder * newly-received snapshot context. Remove any existing snapshots 366035938150SAlex Elder * not present in the new snapshot context. Add a new snapshot for 366135938150SAlex Elder * any snaphots in the snapshot context not in the current list. 366235938150SAlex Elder * And verify there are no changes to snapshots we already know 366335938150SAlex Elder * about. 366435938150SAlex Elder * 366535938150SAlex Elder * Assumes the snapshots in the snapshot context are sorted by 366635938150SAlex Elder * snapshot id, highest id first. (Snapshots in the rbd_dev's list 366735938150SAlex Elder * are also maintained in that order.) 3668dfc5606dSYehuda Sadeh */ 3669304f6808SAlex Elder static int rbd_dev_snaps_update(struct rbd_device *rbd_dev) 3670dfc5606dSYehuda Sadeh { 367135938150SAlex Elder struct ceph_snap_context *snapc = rbd_dev->header.snapc; 367235938150SAlex Elder const u32 snap_count = snapc->num_snaps; 367335938150SAlex Elder struct list_head *head = &rbd_dev->snaps; 367435938150SAlex Elder struct list_head *links = head->next; 367535938150SAlex Elder u32 index = 0; 3676dfc5606dSYehuda Sadeh 36779fcbb800SAlex Elder dout("%s: snap count is %u\n", __func__, (unsigned int) snap_count); 367835938150SAlex Elder while (index < snap_count || links != head) { 367935938150SAlex Elder u64 snap_id; 368035938150SAlex Elder struct rbd_snap *snap; 3681cd892126SAlex Elder char *snap_name; 3682cd892126SAlex Elder u64 snap_size = 0; 3683cd892126SAlex Elder u64 snap_features = 0; 3684dfc5606dSYehuda Sadeh 368535938150SAlex Elder snap_id = index < snap_count ? snapc->snaps[index] 368635938150SAlex Elder : CEPH_NOSNAP; 368735938150SAlex Elder snap = links != head ? list_entry(links, struct rbd_snap, node) 368835938150SAlex Elder : NULL; 3689aafb230eSAlex Elder rbd_assert(!snap || snap->id != CEPH_NOSNAP); 3690dfc5606dSYehuda Sadeh 369135938150SAlex Elder if (snap_id == CEPH_NOSNAP || (snap && snap->id > snap_id)) { 369235938150SAlex Elder struct list_head *next = links->next; 3693dfc5606dSYehuda Sadeh 36946d292906SAlex Elder /* 36956d292906SAlex Elder * A previously-existing snapshot is not in 36966d292906SAlex Elder * the new snap context. 36976d292906SAlex Elder * 36986d292906SAlex Elder * If the now missing snapshot is the one the 36996d292906SAlex Elder * image is mapped to, clear its exists flag 37006d292906SAlex Elder * so we can avoid sending any more requests 37016d292906SAlex Elder * to it. 37026d292906SAlex Elder */ 37030d7dbfceSAlex Elder if (rbd_dev->spec->snap_id == snap->id) 37046d292906SAlex Elder clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags); 370541f38c2bSAlex Elder rbd_remove_snap_dev(snap); 37069fcbb800SAlex Elder dout("%ssnap id %llu has been removed\n", 37070d7dbfceSAlex Elder rbd_dev->spec->snap_id == snap->id ? 37080d7dbfceSAlex Elder "mapped " : "", 37099fcbb800SAlex Elder (unsigned long long) snap->id); 3710dfc5606dSYehuda Sadeh 371135938150SAlex Elder /* Done with this list entry; advance */ 371235938150SAlex Elder 371335938150SAlex Elder links = next; 371435938150SAlex Elder continue; 3715dfc5606dSYehuda Sadeh } 371635938150SAlex Elder 3717b8b1e2dbSAlex Elder snap_name = rbd_dev_snap_info(rbd_dev, index, 3718cd892126SAlex Elder &snap_size, &snap_features); 3719cd892126SAlex Elder if (IS_ERR(snap_name)) 3720cd892126SAlex Elder return PTR_ERR(snap_name); 3721cd892126SAlex Elder 37229fcbb800SAlex Elder dout("entry %u: snap_id = %llu\n", (unsigned int) snap_count, 37239fcbb800SAlex Elder (unsigned long long) snap_id); 372435938150SAlex Elder if (!snap || (snap_id != CEPH_NOSNAP && snap->id < snap_id)) { 372535938150SAlex Elder struct rbd_snap *new_snap; 372635938150SAlex Elder 372735938150SAlex Elder /* We haven't seen this snapshot before */ 372835938150SAlex Elder 3729c8d18425SAlex Elder new_snap = __rbd_add_snap_dev(rbd_dev, snap_name, 3730cd892126SAlex Elder snap_id, snap_size, snap_features); 37319fcbb800SAlex Elder if (IS_ERR(new_snap)) { 37329fcbb800SAlex Elder int err = PTR_ERR(new_snap); 37339fcbb800SAlex Elder 37349fcbb800SAlex Elder dout(" failed to add dev, error %d\n", err); 37359fcbb800SAlex Elder 37369fcbb800SAlex Elder return err; 37379fcbb800SAlex Elder } 373835938150SAlex Elder 373935938150SAlex Elder /* New goes before existing, or at end of list */ 374035938150SAlex Elder 37419fcbb800SAlex Elder dout(" added dev%s\n", snap ? "" : " at end\n"); 374235938150SAlex Elder if (snap) 374335938150SAlex Elder list_add_tail(&new_snap->node, &snap->node); 374435938150SAlex Elder else 3745523f3258SAlex Elder list_add_tail(&new_snap->node, head); 374635938150SAlex Elder } else { 374735938150SAlex Elder /* Already have this one */ 374835938150SAlex Elder 37499fcbb800SAlex Elder dout(" already present\n"); 37509fcbb800SAlex Elder 3751cd892126SAlex Elder rbd_assert(snap->size == snap_size); 3752aafb230eSAlex Elder rbd_assert(!strcmp(snap->name, snap_name)); 3753cd892126SAlex Elder rbd_assert(snap->features == snap_features); 375435938150SAlex Elder 375535938150SAlex Elder /* Done with this list entry; advance */ 375635938150SAlex Elder 375735938150SAlex Elder links = links->next; 3758dfc5606dSYehuda Sadeh } 375935938150SAlex Elder 376035938150SAlex Elder /* Advance to the next entry in the snapshot context */ 376135938150SAlex Elder 376235938150SAlex Elder index++; 3763dfc5606dSYehuda Sadeh } 37649fcbb800SAlex Elder dout("%s: done\n", __func__); 3765dfc5606dSYehuda Sadeh 3766dfc5606dSYehuda Sadeh return 0; 3767dfc5606dSYehuda Sadeh } 3768dfc5606dSYehuda Sadeh 3769304f6808SAlex Elder /* 3770304f6808SAlex Elder * Scan the list of snapshots and register the devices for any that 3771304f6808SAlex Elder * have not already been registered. 3772304f6808SAlex Elder */ 3773304f6808SAlex Elder static int rbd_dev_snaps_register(struct rbd_device *rbd_dev) 3774304f6808SAlex Elder { 3775304f6808SAlex Elder struct rbd_snap *snap; 3776304f6808SAlex Elder int ret = 0; 3777304f6808SAlex Elder 377837206ee5SAlex Elder dout("%s:\n", __func__); 377986ff77bbSAlex Elder if (WARN_ON(!device_is_registered(&rbd_dev->dev))) 378086ff77bbSAlex Elder return -EIO; 3781304f6808SAlex Elder 3782304f6808SAlex Elder list_for_each_entry(snap, &rbd_dev->snaps, node) { 3783304f6808SAlex Elder if (!rbd_snap_registered(snap)) { 3784304f6808SAlex Elder ret = rbd_register_snap_dev(snap, &rbd_dev->dev); 3785304f6808SAlex Elder if (ret < 0) 3786304f6808SAlex Elder break; 3787304f6808SAlex Elder } 3788304f6808SAlex Elder } 3789304f6808SAlex Elder dout("%s: returning %d\n", __func__, ret); 3790304f6808SAlex Elder 3791304f6808SAlex Elder return ret; 3792304f6808SAlex Elder } 3793304f6808SAlex Elder 3794dfc5606dSYehuda Sadeh static int rbd_bus_add_dev(struct rbd_device *rbd_dev) 3795dfc5606dSYehuda Sadeh { 3796dfc5606dSYehuda Sadeh struct device *dev; 3797cd789ab9SAlex Elder int ret; 3798dfc5606dSYehuda Sadeh 3799dfc5606dSYehuda Sadeh mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); 3800dfc5606dSYehuda Sadeh 3801cd789ab9SAlex Elder dev = &rbd_dev->dev; 3802dfc5606dSYehuda Sadeh dev->bus = &rbd_bus_type; 3803dfc5606dSYehuda Sadeh dev->type = &rbd_device_type; 3804dfc5606dSYehuda Sadeh dev->parent = &rbd_root_dev; 3805dfc5606dSYehuda Sadeh dev->release = rbd_dev_release; 3806de71a297SAlex Elder dev_set_name(dev, "%d", rbd_dev->dev_id); 3807dfc5606dSYehuda Sadeh ret = device_register(dev); 3808dfc5606dSYehuda Sadeh 3809dfc5606dSYehuda Sadeh mutex_unlock(&ctl_mutex); 3810cd789ab9SAlex Elder 3811dfc5606dSYehuda Sadeh return ret; 3812602adf40SYehuda Sadeh } 3813602adf40SYehuda Sadeh 3814dfc5606dSYehuda Sadeh static void rbd_bus_del_dev(struct rbd_device *rbd_dev) 3815dfc5606dSYehuda Sadeh { 3816dfc5606dSYehuda Sadeh device_unregister(&rbd_dev->dev); 3817dfc5606dSYehuda Sadeh } 3818dfc5606dSYehuda Sadeh 3819e2839308SAlex Elder static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0); 38201ddbe94eSAlex Elder 38211ddbe94eSAlex Elder /* 3822499afd5bSAlex Elder * Get a unique rbd identifier for the given new rbd_dev, and add 3823499afd5bSAlex Elder * the rbd_dev to the global list. The minimum rbd id is 1. 38241ddbe94eSAlex Elder */ 3825e2839308SAlex Elder static void rbd_dev_id_get(struct rbd_device *rbd_dev) 3826b7f23c36SAlex Elder { 3827e2839308SAlex Elder rbd_dev->dev_id = atomic64_inc_return(&rbd_dev_id_max); 3828499afd5bSAlex Elder 3829499afd5bSAlex Elder spin_lock(&rbd_dev_list_lock); 3830499afd5bSAlex Elder list_add_tail(&rbd_dev->node, &rbd_dev_list); 3831499afd5bSAlex Elder spin_unlock(&rbd_dev_list_lock); 3832e2839308SAlex Elder dout("rbd_dev %p given dev id %llu\n", rbd_dev, 3833e2839308SAlex Elder (unsigned long long) rbd_dev->dev_id); 3834b7f23c36SAlex Elder } 3835b7f23c36SAlex Elder 38361ddbe94eSAlex Elder /* 3837499afd5bSAlex Elder * Remove an rbd_dev from the global list, and record that its 3838499afd5bSAlex Elder * identifier is no longer in use. 38391ddbe94eSAlex Elder */ 3840e2839308SAlex Elder static void rbd_dev_id_put(struct rbd_device *rbd_dev) 38411ddbe94eSAlex Elder { 3842d184f6bfSAlex Elder struct list_head *tmp; 3843de71a297SAlex Elder int rbd_id = rbd_dev->dev_id; 3844d184f6bfSAlex Elder int max_id; 3845d184f6bfSAlex Elder 3846aafb230eSAlex Elder rbd_assert(rbd_id > 0); 3847499afd5bSAlex Elder 3848e2839308SAlex Elder dout("rbd_dev %p released dev id %llu\n", rbd_dev, 3849e2839308SAlex Elder (unsigned long long) rbd_dev->dev_id); 3850499afd5bSAlex Elder spin_lock(&rbd_dev_list_lock); 3851499afd5bSAlex Elder list_del_init(&rbd_dev->node); 3852d184f6bfSAlex Elder 3853d184f6bfSAlex Elder /* 3854d184f6bfSAlex Elder * If the id being "put" is not the current maximum, there 3855d184f6bfSAlex Elder * is nothing special we need to do. 3856d184f6bfSAlex Elder */ 3857e2839308SAlex Elder if (rbd_id != atomic64_read(&rbd_dev_id_max)) { 3858d184f6bfSAlex Elder spin_unlock(&rbd_dev_list_lock); 3859d184f6bfSAlex Elder return; 3860d184f6bfSAlex Elder } 3861d184f6bfSAlex Elder 3862d184f6bfSAlex Elder /* 3863d184f6bfSAlex Elder * We need to update the current maximum id. Search the 3864d184f6bfSAlex Elder * list to find out what it is. We're more likely to find 3865d184f6bfSAlex Elder * the maximum at the end, so search the list backward. 3866d184f6bfSAlex Elder */ 3867d184f6bfSAlex Elder max_id = 0; 3868d184f6bfSAlex Elder list_for_each_prev(tmp, &rbd_dev_list) { 3869d184f6bfSAlex Elder struct rbd_device *rbd_dev; 3870d184f6bfSAlex Elder 3871d184f6bfSAlex Elder rbd_dev = list_entry(tmp, struct rbd_device, node); 3872b213e0b1SAlex Elder if (rbd_dev->dev_id > max_id) 3873b213e0b1SAlex Elder max_id = rbd_dev->dev_id; 3874d184f6bfSAlex Elder } 3875499afd5bSAlex Elder spin_unlock(&rbd_dev_list_lock); 38761ddbe94eSAlex Elder 38771ddbe94eSAlex Elder /* 3878e2839308SAlex Elder * The max id could have been updated by rbd_dev_id_get(), in 3879d184f6bfSAlex Elder * which case it now accurately reflects the new maximum. 3880d184f6bfSAlex Elder * Be careful not to overwrite the maximum value in that 3881d184f6bfSAlex Elder * case. 38821ddbe94eSAlex Elder */ 3883e2839308SAlex Elder atomic64_cmpxchg(&rbd_dev_id_max, rbd_id, max_id); 3884e2839308SAlex Elder dout(" max dev id has been reset\n"); 3885b7f23c36SAlex Elder } 3886b7f23c36SAlex Elder 3887a725f65eSAlex Elder /* 3888e28fff26SAlex Elder * Skips over white space at *buf, and updates *buf to point to the 3889e28fff26SAlex Elder * first found non-space character (if any). Returns the length of 3890593a9e7bSAlex Elder * the token (string of non-white space characters) found. Note 3891593a9e7bSAlex Elder * that *buf must be terminated with '\0'. 3892e28fff26SAlex Elder */ 3893e28fff26SAlex Elder static inline size_t next_token(const char **buf) 3894e28fff26SAlex Elder { 3895e28fff26SAlex Elder /* 3896e28fff26SAlex Elder * These are the characters that produce nonzero for 3897e28fff26SAlex Elder * isspace() in the "C" and "POSIX" locales. 3898e28fff26SAlex Elder */ 3899e28fff26SAlex Elder const char *spaces = " \f\n\r\t\v"; 3900e28fff26SAlex Elder 3901e28fff26SAlex Elder *buf += strspn(*buf, spaces); /* Find start of token */ 3902e28fff26SAlex Elder 3903e28fff26SAlex Elder return strcspn(*buf, spaces); /* Return token length */ 3904e28fff26SAlex Elder } 3905e28fff26SAlex Elder 3906e28fff26SAlex Elder /* 3907e28fff26SAlex Elder * Finds the next token in *buf, and if the provided token buffer is 3908e28fff26SAlex Elder * big enough, copies the found token into it. The result, if 3909593a9e7bSAlex Elder * copied, is guaranteed to be terminated with '\0'. Note that *buf 3910593a9e7bSAlex Elder * must be terminated with '\0' on entry. 3911e28fff26SAlex Elder * 3912e28fff26SAlex Elder * Returns the length of the token found (not including the '\0'). 3913e28fff26SAlex Elder * Return value will be 0 if no token is found, and it will be >= 3914e28fff26SAlex Elder * token_size if the token would not fit. 3915e28fff26SAlex Elder * 3916593a9e7bSAlex Elder * The *buf pointer will be updated to point beyond the end of the 3917e28fff26SAlex Elder * found token. Note that this occurs even if the token buffer is 3918e28fff26SAlex Elder * too small to hold it. 3919e28fff26SAlex Elder */ 3920e28fff26SAlex Elder static inline size_t copy_token(const char **buf, 3921e28fff26SAlex Elder char *token, 3922e28fff26SAlex Elder size_t token_size) 3923e28fff26SAlex Elder { 3924e28fff26SAlex Elder size_t len; 3925e28fff26SAlex Elder 3926e28fff26SAlex Elder len = next_token(buf); 3927e28fff26SAlex Elder if (len < token_size) { 3928e28fff26SAlex Elder memcpy(token, *buf, len); 3929e28fff26SAlex Elder *(token + len) = '\0'; 3930e28fff26SAlex Elder } 3931e28fff26SAlex Elder *buf += len; 3932e28fff26SAlex Elder 3933e28fff26SAlex Elder return len; 3934e28fff26SAlex Elder } 3935e28fff26SAlex Elder 3936e28fff26SAlex Elder /* 3937ea3352f4SAlex Elder * Finds the next token in *buf, dynamically allocates a buffer big 3938ea3352f4SAlex Elder * enough to hold a copy of it, and copies the token into the new 3939ea3352f4SAlex Elder * buffer. The copy is guaranteed to be terminated with '\0'. Note 3940ea3352f4SAlex Elder * that a duplicate buffer is created even for a zero-length token. 3941ea3352f4SAlex Elder * 3942ea3352f4SAlex Elder * Returns a pointer to the newly-allocated duplicate, or a null 3943ea3352f4SAlex Elder * pointer if memory for the duplicate was not available. If 3944ea3352f4SAlex Elder * the lenp argument is a non-null pointer, the length of the token 3945ea3352f4SAlex Elder * (not including the '\0') is returned in *lenp. 3946ea3352f4SAlex Elder * 3947ea3352f4SAlex Elder * If successful, the *buf pointer will be updated to point beyond 3948ea3352f4SAlex Elder * the end of the found token. 3949ea3352f4SAlex Elder * 3950ea3352f4SAlex Elder * Note: uses GFP_KERNEL for allocation. 3951ea3352f4SAlex Elder */ 3952ea3352f4SAlex Elder static inline char *dup_token(const char **buf, size_t *lenp) 3953ea3352f4SAlex Elder { 3954ea3352f4SAlex Elder char *dup; 3955ea3352f4SAlex Elder size_t len; 3956ea3352f4SAlex Elder 3957ea3352f4SAlex Elder len = next_token(buf); 39584caf35f9SAlex Elder dup = kmemdup(*buf, len + 1, GFP_KERNEL); 3959ea3352f4SAlex Elder if (!dup) 3960ea3352f4SAlex Elder return NULL; 3961ea3352f4SAlex Elder *(dup + len) = '\0'; 3962ea3352f4SAlex Elder *buf += len; 3963ea3352f4SAlex Elder 3964ea3352f4SAlex Elder if (lenp) 3965ea3352f4SAlex Elder *lenp = len; 3966ea3352f4SAlex Elder 3967ea3352f4SAlex Elder return dup; 3968ea3352f4SAlex Elder } 3969ea3352f4SAlex Elder 3970ea3352f4SAlex Elder /* 3971859c31dfSAlex Elder * Parse the options provided for an "rbd add" (i.e., rbd image 3972859c31dfSAlex Elder * mapping) request. These arrive via a write to /sys/bus/rbd/add, 3973859c31dfSAlex Elder * and the data written is passed here via a NUL-terminated buffer. 3974859c31dfSAlex Elder * Returns 0 if successful or an error code otherwise. 3975d22f76e7SAlex Elder * 3976859c31dfSAlex Elder * The information extracted from these options is recorded in 3977859c31dfSAlex Elder * the other parameters which return dynamically-allocated 3978859c31dfSAlex Elder * structures: 3979859c31dfSAlex Elder * ceph_opts 3980859c31dfSAlex Elder * The address of a pointer that will refer to a ceph options 3981859c31dfSAlex Elder * structure. Caller must release the returned pointer using 3982859c31dfSAlex Elder * ceph_destroy_options() when it is no longer needed. 3983859c31dfSAlex Elder * rbd_opts 3984859c31dfSAlex Elder * Address of an rbd options pointer. Fully initialized by 3985859c31dfSAlex Elder * this function; caller must release with kfree(). 3986859c31dfSAlex Elder * spec 3987859c31dfSAlex Elder * Address of an rbd image specification pointer. Fully 3988859c31dfSAlex Elder * initialized by this function based on parsed options. 3989859c31dfSAlex Elder * Caller must release with rbd_spec_put(). 3990859c31dfSAlex Elder * 3991859c31dfSAlex Elder * The options passed take this form: 3992859c31dfSAlex Elder * <mon_addrs> <options> <pool_name> <image_name> [<snap_id>] 3993859c31dfSAlex Elder * where: 3994859c31dfSAlex Elder * <mon_addrs> 3995859c31dfSAlex Elder * A comma-separated list of one or more monitor addresses. 3996859c31dfSAlex Elder * A monitor address is an ip address, optionally followed 3997859c31dfSAlex Elder * by a port number (separated by a colon). 3998859c31dfSAlex Elder * I.e.: ip1[:port1][,ip2[:port2]...] 3999859c31dfSAlex Elder * <options> 4000859c31dfSAlex Elder * A comma-separated list of ceph and/or rbd options. 4001859c31dfSAlex Elder * <pool_name> 4002859c31dfSAlex Elder * The name of the rados pool containing the rbd image. 4003859c31dfSAlex Elder * <image_name> 4004859c31dfSAlex Elder * The name of the image in that pool to map. 4005859c31dfSAlex Elder * <snap_id> 4006859c31dfSAlex Elder * An optional snapshot id. If provided, the mapping will 4007859c31dfSAlex Elder * present data from the image at the time that snapshot was 4008859c31dfSAlex Elder * created. The image head is used if no snapshot id is 4009859c31dfSAlex Elder * provided. Snapshot mappings are always read-only. 4010a725f65eSAlex Elder */ 4011859c31dfSAlex Elder static int rbd_add_parse_args(const char *buf, 4012dc79b113SAlex Elder struct ceph_options **ceph_opts, 4013859c31dfSAlex Elder struct rbd_options **opts, 4014859c31dfSAlex Elder struct rbd_spec **rbd_spec) 4015a725f65eSAlex Elder { 4016e28fff26SAlex Elder size_t len; 4017859c31dfSAlex Elder char *options; 40180ddebc0cSAlex Elder const char *mon_addrs; 40190ddebc0cSAlex Elder size_t mon_addrs_size; 4020859c31dfSAlex Elder struct rbd_spec *spec = NULL; 40214e9afebaSAlex Elder struct rbd_options *rbd_opts = NULL; 4022859c31dfSAlex Elder struct ceph_options *copts; 4023dc79b113SAlex Elder int ret; 4024e28fff26SAlex Elder 4025e28fff26SAlex Elder /* The first four tokens are required */ 4026e28fff26SAlex Elder 40277ef3214aSAlex Elder len = next_token(&buf); 40284fb5d671SAlex Elder if (!len) { 40294fb5d671SAlex Elder rbd_warn(NULL, "no monitor address(es) provided"); 40304fb5d671SAlex Elder return -EINVAL; 40314fb5d671SAlex Elder } 40320ddebc0cSAlex Elder mon_addrs = buf; 4033f28e565aSAlex Elder mon_addrs_size = len + 1; 40347ef3214aSAlex Elder buf += len; 4035a725f65eSAlex Elder 4036dc79b113SAlex Elder ret = -EINVAL; 4037f28e565aSAlex Elder options = dup_token(&buf, NULL); 4038f28e565aSAlex Elder if (!options) 4039dc79b113SAlex Elder return -ENOMEM; 40404fb5d671SAlex Elder if (!*options) { 40414fb5d671SAlex Elder rbd_warn(NULL, "no options provided"); 40424fb5d671SAlex Elder goto out_err; 40434fb5d671SAlex Elder } 4044a725f65eSAlex Elder 4045859c31dfSAlex Elder spec = rbd_spec_alloc(); 4046859c31dfSAlex Elder if (!spec) 4047f28e565aSAlex Elder goto out_mem; 4048859c31dfSAlex Elder 4049859c31dfSAlex Elder spec->pool_name = dup_token(&buf, NULL); 4050859c31dfSAlex Elder if (!spec->pool_name) 4051859c31dfSAlex Elder goto out_mem; 40524fb5d671SAlex Elder if (!*spec->pool_name) { 40534fb5d671SAlex Elder rbd_warn(NULL, "no pool name provided"); 40544fb5d671SAlex Elder goto out_err; 40554fb5d671SAlex Elder } 4056e28fff26SAlex Elder 405769e7a02fSAlex Elder spec->image_name = dup_token(&buf, NULL); 4058859c31dfSAlex Elder if (!spec->image_name) 4059f28e565aSAlex Elder goto out_mem; 40604fb5d671SAlex Elder if (!*spec->image_name) { 40614fb5d671SAlex Elder rbd_warn(NULL, "no image name provided"); 40624fb5d671SAlex Elder goto out_err; 40634fb5d671SAlex Elder } 4064e28fff26SAlex Elder 4065f28e565aSAlex Elder /* 4066f28e565aSAlex Elder * Snapshot name is optional; default is to use "-" 4067f28e565aSAlex Elder * (indicating the head/no snapshot). 4068f28e565aSAlex Elder */ 40693feeb894SAlex Elder len = next_token(&buf); 4070820a5f3eSAlex Elder if (!len) { 40713feeb894SAlex Elder buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */ 40723feeb894SAlex Elder len = sizeof (RBD_SNAP_HEAD_NAME) - 1; 4073f28e565aSAlex Elder } else if (len > RBD_MAX_SNAP_NAME_LEN) { 4074dc79b113SAlex Elder ret = -ENAMETOOLONG; 4075f28e565aSAlex Elder goto out_err; 4076849b4260SAlex Elder } 40774caf35f9SAlex Elder spec->snap_name = kmemdup(buf, len + 1, GFP_KERNEL); 4078859c31dfSAlex Elder if (!spec->snap_name) 4079f28e565aSAlex Elder goto out_mem; 4080859c31dfSAlex Elder *(spec->snap_name + len) = '\0'; 4081e5c35534SAlex Elder 40820ddebc0cSAlex Elder /* Initialize all rbd options to the defaults */ 4083e28fff26SAlex Elder 40844e9afebaSAlex Elder rbd_opts = kzalloc(sizeof (*rbd_opts), GFP_KERNEL); 40854e9afebaSAlex Elder if (!rbd_opts) 40864e9afebaSAlex Elder goto out_mem; 40874e9afebaSAlex Elder 40884e9afebaSAlex Elder rbd_opts->read_only = RBD_READ_ONLY_DEFAULT; 4089d22f76e7SAlex Elder 4090859c31dfSAlex Elder copts = ceph_parse_options(options, mon_addrs, 40910ddebc0cSAlex Elder mon_addrs + mon_addrs_size - 1, 40924e9afebaSAlex Elder parse_rbd_opts_token, rbd_opts); 4093859c31dfSAlex Elder if (IS_ERR(copts)) { 4094859c31dfSAlex Elder ret = PTR_ERR(copts); 4095dc79b113SAlex Elder goto out_err; 4096dc79b113SAlex Elder } 4097859c31dfSAlex Elder kfree(options); 4098859c31dfSAlex Elder 4099859c31dfSAlex Elder *ceph_opts = copts; 41004e9afebaSAlex Elder *opts = rbd_opts; 4101859c31dfSAlex Elder *rbd_spec = spec; 41020ddebc0cSAlex Elder 4103dc79b113SAlex Elder return 0; 4104f28e565aSAlex Elder out_mem: 4105dc79b113SAlex Elder ret = -ENOMEM; 4106d22f76e7SAlex Elder out_err: 4107859c31dfSAlex Elder kfree(rbd_opts); 4108859c31dfSAlex Elder rbd_spec_put(spec); 4109f28e565aSAlex Elder kfree(options); 4110d22f76e7SAlex Elder 4111dc79b113SAlex Elder return ret; 4112a725f65eSAlex Elder } 4113a725f65eSAlex Elder 4114589d30e0SAlex Elder /* 4115589d30e0SAlex Elder * An rbd format 2 image has a unique identifier, distinct from the 4116589d30e0SAlex Elder * name given to it by the user. Internally, that identifier is 4117589d30e0SAlex Elder * what's used to specify the names of objects related to the image. 4118589d30e0SAlex Elder * 4119589d30e0SAlex Elder * A special "rbd id" object is used to map an rbd image name to its 4120589d30e0SAlex Elder * id. If that object doesn't exist, then there is no v2 rbd image 4121589d30e0SAlex Elder * with the supplied name. 4122589d30e0SAlex Elder * 4123589d30e0SAlex Elder * This function will record the given rbd_dev's image_id field if 4124589d30e0SAlex Elder * it can be determined, and in that case will return 0. If any 4125589d30e0SAlex Elder * errors occur a negative errno will be returned and the rbd_dev's 4126589d30e0SAlex Elder * image_id field will be unchanged (and should be NULL). 4127589d30e0SAlex Elder */ 4128589d30e0SAlex Elder static int rbd_dev_image_id(struct rbd_device *rbd_dev) 4129589d30e0SAlex Elder { 4130589d30e0SAlex Elder int ret; 4131589d30e0SAlex Elder size_t size; 4132589d30e0SAlex Elder char *object_name; 4133589d30e0SAlex Elder void *response; 4134589d30e0SAlex Elder void *p; 4135589d30e0SAlex Elder 41362f82ee54SAlex Elder /* If we already have it we don't need to look it up */ 41372f82ee54SAlex Elder 41382f82ee54SAlex Elder if (rbd_dev->spec->image_id) 41392f82ee54SAlex Elder return 0; 41402f82ee54SAlex Elder 4141589d30e0SAlex Elder /* 41422c0d0a10SAlex Elder * When probing a parent image, the image id is already 41432c0d0a10SAlex Elder * known (and the image name likely is not). There's no 41442c0d0a10SAlex Elder * need to fetch the image id again in this case. 41452c0d0a10SAlex Elder */ 41462c0d0a10SAlex Elder if (rbd_dev->spec->image_id) 41472c0d0a10SAlex Elder return 0; 41482c0d0a10SAlex Elder 41492c0d0a10SAlex Elder /* 4150589d30e0SAlex Elder * First, see if the format 2 image id file exists, and if 4151589d30e0SAlex Elder * so, get the image's persistent id from it. 4152589d30e0SAlex Elder */ 415369e7a02fSAlex Elder size = sizeof (RBD_ID_PREFIX) + strlen(rbd_dev->spec->image_name); 4154589d30e0SAlex Elder object_name = kmalloc(size, GFP_NOIO); 4155589d30e0SAlex Elder if (!object_name) 4156589d30e0SAlex Elder return -ENOMEM; 41570d7dbfceSAlex Elder sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->spec->image_name); 4158589d30e0SAlex Elder dout("rbd id object name is %s\n", object_name); 4159589d30e0SAlex Elder 4160589d30e0SAlex Elder /* Response will be an encoded string, which includes a length */ 4161589d30e0SAlex Elder 4162589d30e0SAlex Elder size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX; 4163589d30e0SAlex Elder response = kzalloc(size, GFP_NOIO); 4164589d30e0SAlex Elder if (!response) { 4165589d30e0SAlex Elder ret = -ENOMEM; 4166589d30e0SAlex Elder goto out; 4167589d30e0SAlex Elder } 4168589d30e0SAlex Elder 416936be9a76SAlex Elder ret = rbd_obj_method_sync(rbd_dev, object_name, 4170589d30e0SAlex Elder "rbd", "get_id", 4171589d30e0SAlex Elder NULL, 0, 417207b2391fSAlex Elder response, RBD_IMAGE_ID_LEN_MAX, NULL); 417336be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 4174589d30e0SAlex Elder if (ret < 0) 4175589d30e0SAlex Elder goto out; 4176589d30e0SAlex Elder 4177589d30e0SAlex Elder p = response; 41780d7dbfceSAlex Elder rbd_dev->spec->image_id = ceph_extract_encoded_string(&p, 4179589d30e0SAlex Elder p + RBD_IMAGE_ID_LEN_MAX, 4180979ed480SAlex Elder NULL, GFP_NOIO); 41810d7dbfceSAlex Elder if (IS_ERR(rbd_dev->spec->image_id)) { 41820d7dbfceSAlex Elder ret = PTR_ERR(rbd_dev->spec->image_id); 41830d7dbfceSAlex Elder rbd_dev->spec->image_id = NULL; 4184589d30e0SAlex Elder } else { 41850d7dbfceSAlex Elder dout("image_id is %s\n", rbd_dev->spec->image_id); 4186589d30e0SAlex Elder } 4187589d30e0SAlex Elder out: 4188589d30e0SAlex Elder kfree(response); 4189589d30e0SAlex Elder kfree(object_name); 4190589d30e0SAlex Elder 4191589d30e0SAlex Elder return ret; 4192589d30e0SAlex Elder } 4193589d30e0SAlex Elder 4194a30b71b9SAlex Elder static int rbd_dev_v1_probe(struct rbd_device *rbd_dev) 4195a30b71b9SAlex Elder { 4196a30b71b9SAlex Elder int ret; 4197a30b71b9SAlex Elder size_t size; 4198a30b71b9SAlex Elder 4199a30b71b9SAlex Elder /* Version 1 images have no id; empty string is used */ 4200a30b71b9SAlex Elder 42010d7dbfceSAlex Elder rbd_dev->spec->image_id = kstrdup("", GFP_KERNEL); 42020d7dbfceSAlex Elder if (!rbd_dev->spec->image_id) 4203a30b71b9SAlex Elder return -ENOMEM; 4204a30b71b9SAlex Elder 4205a30b71b9SAlex Elder /* Record the header object name for this rbd image. */ 4206a30b71b9SAlex Elder 420769e7a02fSAlex Elder size = strlen(rbd_dev->spec->image_name) + sizeof (RBD_SUFFIX); 4208a30b71b9SAlex Elder rbd_dev->header_name = kmalloc(size, GFP_KERNEL); 4209a30b71b9SAlex Elder if (!rbd_dev->header_name) { 4210a30b71b9SAlex Elder ret = -ENOMEM; 4211a30b71b9SAlex Elder goto out_err; 4212a30b71b9SAlex Elder } 42130d7dbfceSAlex Elder sprintf(rbd_dev->header_name, "%s%s", 42140d7dbfceSAlex Elder rbd_dev->spec->image_name, RBD_SUFFIX); 4215a30b71b9SAlex Elder 4216a30b71b9SAlex Elder /* Populate rbd image metadata */ 4217a30b71b9SAlex Elder 4218a30b71b9SAlex Elder ret = rbd_read_header(rbd_dev, &rbd_dev->header); 4219a30b71b9SAlex Elder if (ret < 0) 4220a30b71b9SAlex Elder goto out_err; 422186b00e0dSAlex Elder 422286b00e0dSAlex Elder /* Version 1 images have no parent (no layering) */ 422386b00e0dSAlex Elder 422486b00e0dSAlex Elder rbd_dev->parent_spec = NULL; 422586b00e0dSAlex Elder rbd_dev->parent_overlap = 0; 422686b00e0dSAlex Elder 4227a30b71b9SAlex Elder rbd_dev->image_format = 1; 4228a30b71b9SAlex Elder 4229a30b71b9SAlex Elder dout("discovered version 1 image, header name is %s\n", 4230a30b71b9SAlex Elder rbd_dev->header_name); 4231a30b71b9SAlex Elder 4232a30b71b9SAlex Elder return 0; 4233a30b71b9SAlex Elder 4234a30b71b9SAlex Elder out_err: 4235a30b71b9SAlex Elder kfree(rbd_dev->header_name); 4236a30b71b9SAlex Elder rbd_dev->header_name = NULL; 42370d7dbfceSAlex Elder kfree(rbd_dev->spec->image_id); 42380d7dbfceSAlex Elder rbd_dev->spec->image_id = NULL; 4239a30b71b9SAlex Elder 4240a30b71b9SAlex Elder return ret; 4241a30b71b9SAlex Elder } 4242a30b71b9SAlex Elder 4243a30b71b9SAlex Elder static int rbd_dev_v2_probe(struct rbd_device *rbd_dev) 4244a30b71b9SAlex Elder { 4245a30b71b9SAlex Elder size_t size; 42469d475de5SAlex Elder int ret; 42476e14b1a6SAlex Elder u64 ver = 0; 4248a30b71b9SAlex Elder 4249a30b71b9SAlex Elder /* 4250a30b71b9SAlex Elder * Image id was filled in by the caller. Record the header 4251a30b71b9SAlex Elder * object name for this rbd image. 4252a30b71b9SAlex Elder */ 4253979ed480SAlex Elder size = sizeof (RBD_HEADER_PREFIX) + strlen(rbd_dev->spec->image_id); 4254a30b71b9SAlex Elder rbd_dev->header_name = kmalloc(size, GFP_KERNEL); 4255a30b71b9SAlex Elder if (!rbd_dev->header_name) 4256a30b71b9SAlex Elder return -ENOMEM; 4257a30b71b9SAlex Elder sprintf(rbd_dev->header_name, "%s%s", 42580d7dbfceSAlex Elder RBD_HEADER_PREFIX, rbd_dev->spec->image_id); 42599d475de5SAlex Elder 42609d475de5SAlex Elder /* Get the size and object order for the image */ 42619d475de5SAlex Elder 42629d475de5SAlex Elder ret = rbd_dev_v2_image_size(rbd_dev); 42639d475de5SAlex Elder if (ret < 0) 42649d475de5SAlex Elder goto out_err; 42651e130199SAlex Elder 42661e130199SAlex Elder /* Get the object prefix (a.k.a. block_name) for the image */ 42671e130199SAlex Elder 42681e130199SAlex Elder ret = rbd_dev_v2_object_prefix(rbd_dev); 42691e130199SAlex Elder if (ret < 0) 42701e130199SAlex Elder goto out_err; 4271b1b5402aSAlex Elder 4272d889140cSAlex Elder /* Get the and check features for the image */ 4273b1b5402aSAlex Elder 4274b1b5402aSAlex Elder ret = rbd_dev_v2_features(rbd_dev); 4275b1b5402aSAlex Elder if (ret < 0) 4276b1b5402aSAlex Elder goto out_err; 427735d489f9SAlex Elder 427886b00e0dSAlex Elder /* If the image supports layering, get the parent info */ 427986b00e0dSAlex Elder 428086b00e0dSAlex Elder if (rbd_dev->header.features & RBD_FEATURE_LAYERING) { 428186b00e0dSAlex Elder ret = rbd_dev_v2_parent_info(rbd_dev); 428286b00e0dSAlex Elder if (ret < 0) 428386b00e0dSAlex Elder goto out_err; 428486b00e0dSAlex Elder } 428586b00e0dSAlex Elder 42866e14b1a6SAlex Elder /* crypto and compression type aren't (yet) supported for v2 images */ 428735d489f9SAlex Elder 42886e14b1a6SAlex Elder rbd_dev->header.crypt_type = 0; 42896e14b1a6SAlex Elder rbd_dev->header.comp_type = 0; 42906e14b1a6SAlex Elder 42916e14b1a6SAlex Elder /* Get the snapshot context, plus the header version */ 42926e14b1a6SAlex Elder 42936e14b1a6SAlex Elder ret = rbd_dev_v2_snap_context(rbd_dev, &ver); 429435d489f9SAlex Elder if (ret) 429535d489f9SAlex Elder goto out_err; 42966e14b1a6SAlex Elder rbd_dev->header.obj_version = ver; 42976e14b1a6SAlex Elder 4298a30b71b9SAlex Elder rbd_dev->image_format = 2; 4299a30b71b9SAlex Elder 4300a30b71b9SAlex Elder dout("discovered version 2 image, header name is %s\n", 4301a30b71b9SAlex Elder rbd_dev->header_name); 4302a30b71b9SAlex Elder 430335152979SAlex Elder return 0; 43049d475de5SAlex Elder out_err: 430586b00e0dSAlex Elder rbd_dev->parent_overlap = 0; 430686b00e0dSAlex Elder rbd_spec_put(rbd_dev->parent_spec); 430786b00e0dSAlex Elder rbd_dev->parent_spec = NULL; 43089d475de5SAlex Elder kfree(rbd_dev->header_name); 43099d475de5SAlex Elder rbd_dev->header_name = NULL; 43101e130199SAlex Elder kfree(rbd_dev->header.object_prefix); 43111e130199SAlex Elder rbd_dev->header.object_prefix = NULL; 43129d475de5SAlex Elder 43139d475de5SAlex Elder return ret; 4314a30b71b9SAlex Elder } 4315a30b71b9SAlex Elder 431683a06263SAlex Elder static int rbd_dev_probe_finish(struct rbd_device *rbd_dev) 431783a06263SAlex Elder { 43182f82ee54SAlex Elder struct rbd_device *parent = NULL; 43192f82ee54SAlex Elder struct rbd_spec *parent_spec = NULL; 43202f82ee54SAlex Elder struct rbd_client *rbdc = NULL; 432183a06263SAlex Elder int ret; 432283a06263SAlex Elder 432383a06263SAlex Elder /* no need to lock here, as rbd_dev is not registered yet */ 432483a06263SAlex Elder ret = rbd_dev_snaps_update(rbd_dev); 432583a06263SAlex Elder if (ret) 432683a06263SAlex Elder return ret; 432783a06263SAlex Elder 43289e15b77dSAlex Elder ret = rbd_dev_probe_update_spec(rbd_dev); 43299e15b77dSAlex Elder if (ret) 43309e15b77dSAlex Elder goto err_out_snaps; 43319e15b77dSAlex Elder 433283a06263SAlex Elder ret = rbd_dev_set_mapping(rbd_dev); 433383a06263SAlex Elder if (ret) 433483a06263SAlex Elder goto err_out_snaps; 433583a06263SAlex Elder 433683a06263SAlex Elder /* generate unique id: find highest unique id, add one */ 433783a06263SAlex Elder rbd_dev_id_get(rbd_dev); 433883a06263SAlex Elder 433983a06263SAlex Elder /* Fill in the device name, now that we have its id. */ 434083a06263SAlex Elder BUILD_BUG_ON(DEV_NAME_LEN 434183a06263SAlex Elder < sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH); 434283a06263SAlex Elder sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id); 434383a06263SAlex Elder 434483a06263SAlex Elder /* Get our block major device number. */ 434583a06263SAlex Elder 434683a06263SAlex Elder ret = register_blkdev(0, rbd_dev->name); 434783a06263SAlex Elder if (ret < 0) 434883a06263SAlex Elder goto err_out_id; 434983a06263SAlex Elder rbd_dev->major = ret; 435083a06263SAlex Elder 435183a06263SAlex Elder /* Set up the blkdev mapping. */ 435283a06263SAlex Elder 435383a06263SAlex Elder ret = rbd_init_disk(rbd_dev); 435483a06263SAlex Elder if (ret) 435583a06263SAlex Elder goto err_out_blkdev; 435683a06263SAlex Elder 435783a06263SAlex Elder ret = rbd_bus_add_dev(rbd_dev); 435883a06263SAlex Elder if (ret) 435983a06263SAlex Elder goto err_out_disk; 436083a06263SAlex Elder 436183a06263SAlex Elder /* 436283a06263SAlex Elder * At this point cleanup in the event of an error is the job 436383a06263SAlex Elder * of the sysfs code (initiated by rbd_bus_del_dev()). 436483a06263SAlex Elder */ 43652f82ee54SAlex Elder /* Probe the parent if there is one */ 43662f82ee54SAlex Elder 43672f82ee54SAlex Elder if (rbd_dev->parent_spec) { 43682f82ee54SAlex Elder /* 43692f82ee54SAlex Elder * We need to pass a reference to the client and the 43702f82ee54SAlex Elder * parent spec when creating the parent rbd_dev. 43712f82ee54SAlex Elder * Images related by parent/child relationships 43722f82ee54SAlex Elder * always share both. 43732f82ee54SAlex Elder */ 43742f82ee54SAlex Elder parent_spec = rbd_spec_get(rbd_dev->parent_spec); 43752f82ee54SAlex Elder rbdc = __rbd_get_client(rbd_dev->rbd_client); 43762f82ee54SAlex Elder 43772f82ee54SAlex Elder parent = rbd_dev_create(rbdc, parent_spec); 43782f82ee54SAlex Elder if (!parent) { 43792f82ee54SAlex Elder ret = -ENOMEM; 43802f82ee54SAlex Elder goto err_out_spec; 43812f82ee54SAlex Elder } 43822f82ee54SAlex Elder rbdc = NULL; /* parent now owns reference */ 43832f82ee54SAlex Elder parent_spec = NULL; /* parent now owns reference */ 43842f82ee54SAlex Elder ret = rbd_dev_probe(parent); 43852f82ee54SAlex Elder if (ret < 0) 43862f82ee54SAlex Elder goto err_out_parent; 43872f82ee54SAlex Elder rbd_dev->parent = parent; 43882f82ee54SAlex Elder } 43892f82ee54SAlex Elder 439083a06263SAlex Elder down_write(&rbd_dev->header_rwsem); 439183a06263SAlex Elder ret = rbd_dev_snaps_register(rbd_dev); 439283a06263SAlex Elder up_write(&rbd_dev->header_rwsem); 439383a06263SAlex Elder if (ret) 439483a06263SAlex Elder goto err_out_bus; 439583a06263SAlex Elder 43969969ebc5SAlex Elder ret = rbd_dev_header_watch_sync(rbd_dev, 1); 439783a06263SAlex Elder if (ret) 439883a06263SAlex Elder goto err_out_bus; 439983a06263SAlex Elder 440083a06263SAlex Elder /* Everything's ready. Announce the disk to the world. */ 440183a06263SAlex Elder 440283a06263SAlex Elder add_disk(rbd_dev->disk); 440383a06263SAlex Elder 440483a06263SAlex Elder pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name, 440583a06263SAlex Elder (unsigned long long) rbd_dev->mapping.size); 440683a06263SAlex Elder 440783a06263SAlex Elder return ret; 44082f82ee54SAlex Elder 44092f82ee54SAlex Elder err_out_parent: 44102f82ee54SAlex Elder rbd_dev_destroy(parent); 44112f82ee54SAlex Elder err_out_spec: 44122f82ee54SAlex Elder rbd_spec_put(parent_spec); 44132f82ee54SAlex Elder rbd_put_client(rbdc); 441483a06263SAlex Elder err_out_bus: 441583a06263SAlex Elder /* this will also clean up rest of rbd_dev stuff */ 441683a06263SAlex Elder 441783a06263SAlex Elder rbd_bus_del_dev(rbd_dev); 441883a06263SAlex Elder 441983a06263SAlex Elder return ret; 442083a06263SAlex Elder err_out_disk: 442183a06263SAlex Elder rbd_free_disk(rbd_dev); 442283a06263SAlex Elder err_out_blkdev: 442383a06263SAlex Elder unregister_blkdev(rbd_dev->major, rbd_dev->name); 442483a06263SAlex Elder err_out_id: 442583a06263SAlex Elder rbd_dev_id_put(rbd_dev); 442683a06263SAlex Elder err_out_snaps: 442783a06263SAlex Elder rbd_remove_all_snaps(rbd_dev); 442883a06263SAlex Elder 442983a06263SAlex Elder return ret; 443083a06263SAlex Elder } 443183a06263SAlex Elder 4432a30b71b9SAlex Elder /* 4433a30b71b9SAlex Elder * Probe for the existence of the header object for the given rbd 4434a30b71b9SAlex Elder * device. For format 2 images this includes determining the image 4435a30b71b9SAlex Elder * id. 4436a30b71b9SAlex Elder */ 4437a30b71b9SAlex Elder static int rbd_dev_probe(struct rbd_device *rbd_dev) 4438a30b71b9SAlex Elder { 4439a30b71b9SAlex Elder int ret; 4440a30b71b9SAlex Elder 4441a30b71b9SAlex Elder /* 4442a30b71b9SAlex Elder * Get the id from the image id object. If it's not a 4443a30b71b9SAlex Elder * format 2 image, we'll get ENOENT back, and we'll assume 4444a30b71b9SAlex Elder * it's a format 1 image. 4445a30b71b9SAlex Elder */ 4446a30b71b9SAlex Elder ret = rbd_dev_image_id(rbd_dev); 4447a30b71b9SAlex Elder if (ret) 4448a30b71b9SAlex Elder ret = rbd_dev_v1_probe(rbd_dev); 4449a30b71b9SAlex Elder else 4450a30b71b9SAlex Elder ret = rbd_dev_v2_probe(rbd_dev); 445183a06263SAlex Elder if (ret) { 4452a30b71b9SAlex Elder dout("probe failed, returning %d\n", ret); 4453a30b71b9SAlex Elder 4454a30b71b9SAlex Elder return ret; 4455a30b71b9SAlex Elder } 4456a30b71b9SAlex Elder 445783a06263SAlex Elder ret = rbd_dev_probe_finish(rbd_dev); 445883a06263SAlex Elder if (ret) 445983a06263SAlex Elder rbd_header_free(&rbd_dev->header); 446083a06263SAlex Elder 446183a06263SAlex Elder return ret; 446283a06263SAlex Elder } 446383a06263SAlex Elder 446459c2be1eSYehuda Sadeh static ssize_t rbd_add(struct bus_type *bus, 446559c2be1eSYehuda Sadeh const char *buf, 446659c2be1eSYehuda Sadeh size_t count) 4467602adf40SYehuda Sadeh { 4468cb8627c7SAlex Elder struct rbd_device *rbd_dev = NULL; 4469dc79b113SAlex Elder struct ceph_options *ceph_opts = NULL; 44704e9afebaSAlex Elder struct rbd_options *rbd_opts = NULL; 4471859c31dfSAlex Elder struct rbd_spec *spec = NULL; 44729d3997fdSAlex Elder struct rbd_client *rbdc; 447327cc2594SAlex Elder struct ceph_osd_client *osdc; 447427cc2594SAlex Elder int rc = -ENOMEM; 4475602adf40SYehuda Sadeh 4476602adf40SYehuda Sadeh if (!try_module_get(THIS_MODULE)) 4477602adf40SYehuda Sadeh return -ENODEV; 4478602adf40SYehuda Sadeh 4479a725f65eSAlex Elder /* parse add command */ 4480859c31dfSAlex Elder rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec); 4481dc79b113SAlex Elder if (rc < 0) 4482bd4ba655SAlex Elder goto err_out_module; 4483a725f65eSAlex Elder 44849d3997fdSAlex Elder rbdc = rbd_get_client(ceph_opts); 44859d3997fdSAlex Elder if (IS_ERR(rbdc)) { 44869d3997fdSAlex Elder rc = PTR_ERR(rbdc); 44870ddebc0cSAlex Elder goto err_out_args; 44889d3997fdSAlex Elder } 4489c53d5893SAlex Elder ceph_opts = NULL; /* rbd_dev client now owns this */ 4490602adf40SYehuda Sadeh 4491602adf40SYehuda Sadeh /* pick the pool */ 44929d3997fdSAlex Elder osdc = &rbdc->client->osdc; 4493859c31dfSAlex Elder rc = ceph_pg_poolid_by_name(osdc->osdmap, spec->pool_name); 4494602adf40SYehuda Sadeh if (rc < 0) 4495602adf40SYehuda Sadeh goto err_out_client; 4496859c31dfSAlex Elder spec->pool_id = (u64) rc; 4497859c31dfSAlex Elder 44980903e875SAlex Elder /* The ceph file layout needs to fit pool id in 32 bits */ 44990903e875SAlex Elder 45000903e875SAlex Elder if (WARN_ON(spec->pool_id > (u64) U32_MAX)) { 45010903e875SAlex Elder rc = -EIO; 45020903e875SAlex Elder goto err_out_client; 45030903e875SAlex Elder } 45040903e875SAlex Elder 4505c53d5893SAlex Elder rbd_dev = rbd_dev_create(rbdc, spec); 4506bd4ba655SAlex Elder if (!rbd_dev) 4507bd4ba655SAlex Elder goto err_out_client; 4508c53d5893SAlex Elder rbdc = NULL; /* rbd_dev now owns this */ 4509c53d5893SAlex Elder spec = NULL; /* rbd_dev now owns this */ 4510602adf40SYehuda Sadeh 4511bd4ba655SAlex Elder rbd_dev->mapping.read_only = rbd_opts->read_only; 4512c53d5893SAlex Elder kfree(rbd_opts); 4513c53d5893SAlex Elder rbd_opts = NULL; /* done with this */ 4514bd4ba655SAlex Elder 4515a30b71b9SAlex Elder rc = rbd_dev_probe(rbd_dev); 4516a30b71b9SAlex Elder if (rc < 0) 4517c53d5893SAlex Elder goto err_out_rbd_dev; 451805fd6f6fSAlex Elder 4519602adf40SYehuda Sadeh return count; 4520c53d5893SAlex Elder err_out_rbd_dev: 4521c53d5893SAlex Elder rbd_dev_destroy(rbd_dev); 4522bd4ba655SAlex Elder err_out_client: 45239d3997fdSAlex Elder rbd_put_client(rbdc); 45240ddebc0cSAlex Elder err_out_args: 452578cea76eSAlex Elder if (ceph_opts) 452678cea76eSAlex Elder ceph_destroy_options(ceph_opts); 45274e9afebaSAlex Elder kfree(rbd_opts); 4528859c31dfSAlex Elder rbd_spec_put(spec); 4529bd4ba655SAlex Elder err_out_module: 4530bd4ba655SAlex Elder module_put(THIS_MODULE); 453127cc2594SAlex Elder 4532602adf40SYehuda Sadeh dout("Error adding device %s\n", buf); 453327cc2594SAlex Elder 453427cc2594SAlex Elder return (ssize_t) rc; 4535602adf40SYehuda Sadeh } 4536602adf40SYehuda Sadeh 4537de71a297SAlex Elder static struct rbd_device *__rbd_get_dev(unsigned long dev_id) 4538602adf40SYehuda Sadeh { 4539602adf40SYehuda Sadeh struct list_head *tmp; 4540602adf40SYehuda Sadeh struct rbd_device *rbd_dev; 4541602adf40SYehuda Sadeh 4542e124a82fSAlex Elder spin_lock(&rbd_dev_list_lock); 4543602adf40SYehuda Sadeh list_for_each(tmp, &rbd_dev_list) { 4544602adf40SYehuda Sadeh rbd_dev = list_entry(tmp, struct rbd_device, node); 4545de71a297SAlex Elder if (rbd_dev->dev_id == dev_id) { 4546e124a82fSAlex Elder spin_unlock(&rbd_dev_list_lock); 4547602adf40SYehuda Sadeh return rbd_dev; 4548602adf40SYehuda Sadeh } 4549e124a82fSAlex Elder } 4550e124a82fSAlex Elder spin_unlock(&rbd_dev_list_lock); 4551602adf40SYehuda Sadeh return NULL; 4552602adf40SYehuda Sadeh } 4553602adf40SYehuda Sadeh 4554dfc5606dSYehuda Sadeh static void rbd_dev_release(struct device *dev) 4555602adf40SYehuda Sadeh { 4556593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 4557602adf40SYehuda Sadeh 455859c2be1eSYehuda Sadeh if (rbd_dev->watch_event) 45599969ebc5SAlex Elder rbd_dev_header_watch_sync(rbd_dev, 0); 4560602adf40SYehuda Sadeh 4561602adf40SYehuda Sadeh /* clean up and free blkdev */ 4562602adf40SYehuda Sadeh rbd_free_disk(rbd_dev); 4563602adf40SYehuda Sadeh unregister_blkdev(rbd_dev->major, rbd_dev->name); 456432eec68dSAlex Elder 45652ac4e75dSAlex Elder /* release allocated disk header fields */ 45662ac4e75dSAlex Elder rbd_header_free(&rbd_dev->header); 45672ac4e75dSAlex Elder 456832eec68dSAlex Elder /* done with the id, and with the rbd_dev */ 4569e2839308SAlex Elder rbd_dev_id_put(rbd_dev); 4570c53d5893SAlex Elder rbd_assert(rbd_dev->rbd_client != NULL); 4571c53d5893SAlex Elder rbd_dev_destroy(rbd_dev); 4572602adf40SYehuda Sadeh 4573602adf40SYehuda Sadeh /* release module ref */ 4574602adf40SYehuda Sadeh module_put(THIS_MODULE); 4575602adf40SYehuda Sadeh } 4576602adf40SYehuda Sadeh 45772f82ee54SAlex Elder static void __rbd_remove(struct rbd_device *rbd_dev) 45782f82ee54SAlex Elder { 45792f82ee54SAlex Elder rbd_remove_all_snaps(rbd_dev); 45802f82ee54SAlex Elder rbd_bus_del_dev(rbd_dev); 45812f82ee54SAlex Elder } 45822f82ee54SAlex Elder 4583dfc5606dSYehuda Sadeh static ssize_t rbd_remove(struct bus_type *bus, 4584602adf40SYehuda Sadeh const char *buf, 4585602adf40SYehuda Sadeh size_t count) 4586602adf40SYehuda Sadeh { 4587602adf40SYehuda Sadeh struct rbd_device *rbd_dev = NULL; 4588602adf40SYehuda Sadeh int target_id, rc; 4589602adf40SYehuda Sadeh unsigned long ul; 4590602adf40SYehuda Sadeh int ret = count; 4591602adf40SYehuda Sadeh 4592602adf40SYehuda Sadeh rc = strict_strtoul(buf, 10, &ul); 4593602adf40SYehuda Sadeh if (rc) 4594602adf40SYehuda Sadeh return rc; 4595602adf40SYehuda Sadeh 4596602adf40SYehuda Sadeh /* convert to int; abort if we lost anything in the conversion */ 4597602adf40SYehuda Sadeh target_id = (int) ul; 4598602adf40SYehuda Sadeh if (target_id != ul) 4599602adf40SYehuda Sadeh return -EINVAL; 4600602adf40SYehuda Sadeh 4601602adf40SYehuda Sadeh mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); 4602602adf40SYehuda Sadeh 4603602adf40SYehuda Sadeh rbd_dev = __rbd_get_dev(target_id); 4604602adf40SYehuda Sadeh if (!rbd_dev) { 4605602adf40SYehuda Sadeh ret = -ENOENT; 4606602adf40SYehuda Sadeh goto done; 4607602adf40SYehuda Sadeh } 4608602adf40SYehuda Sadeh 4609a14ea269SAlex Elder spin_lock_irq(&rbd_dev->lock); 4610b82d167bSAlex Elder if (rbd_dev->open_count) 461142382b70SAlex Elder ret = -EBUSY; 4612b82d167bSAlex Elder else 4613b82d167bSAlex Elder set_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags); 4614a14ea269SAlex Elder spin_unlock_irq(&rbd_dev->lock); 4615b82d167bSAlex Elder if (ret < 0) 461642382b70SAlex Elder goto done; 461742382b70SAlex Elder 46182f82ee54SAlex Elder while (rbd_dev->parent_spec) { 46192f82ee54SAlex Elder struct rbd_device *first = rbd_dev; 46202f82ee54SAlex Elder struct rbd_device *second = first->parent; 46212f82ee54SAlex Elder struct rbd_device *third; 46222f82ee54SAlex Elder 46232f82ee54SAlex Elder /* 46242f82ee54SAlex Elder * Follow to the parent with no grandparent and 46252f82ee54SAlex Elder * remove it. 46262f82ee54SAlex Elder */ 46272f82ee54SAlex Elder while (second && (third = second->parent)) { 46282f82ee54SAlex Elder first = second; 46292f82ee54SAlex Elder second = third; 46302f82ee54SAlex Elder } 46312f82ee54SAlex Elder __rbd_remove(second); 46322f82ee54SAlex Elder rbd_spec_put(first->parent_spec); 46332f82ee54SAlex Elder first->parent_spec = NULL; 46342f82ee54SAlex Elder first->parent_overlap = 0; 46352f82ee54SAlex Elder first->parent = NULL; 46362f82ee54SAlex Elder } 46372f82ee54SAlex Elder __rbd_remove(rbd_dev); 4638602adf40SYehuda Sadeh 4639602adf40SYehuda Sadeh done: 4640602adf40SYehuda Sadeh mutex_unlock(&ctl_mutex); 4641aafb230eSAlex Elder 4642602adf40SYehuda Sadeh return ret; 4643602adf40SYehuda Sadeh } 4644602adf40SYehuda Sadeh 4645602adf40SYehuda Sadeh /* 4646602adf40SYehuda Sadeh * create control files in sysfs 4647dfc5606dSYehuda Sadeh * /sys/bus/rbd/... 4648602adf40SYehuda Sadeh */ 4649602adf40SYehuda Sadeh static int rbd_sysfs_init(void) 4650602adf40SYehuda Sadeh { 4651dfc5606dSYehuda Sadeh int ret; 4652602adf40SYehuda Sadeh 4653fed4c143SAlex Elder ret = device_register(&rbd_root_dev); 4654dfc5606dSYehuda Sadeh if (ret < 0) 4655dfc5606dSYehuda Sadeh return ret; 4656602adf40SYehuda Sadeh 4657fed4c143SAlex Elder ret = bus_register(&rbd_bus_type); 4658fed4c143SAlex Elder if (ret < 0) 4659fed4c143SAlex Elder device_unregister(&rbd_root_dev); 4660602adf40SYehuda Sadeh 4661602adf40SYehuda Sadeh return ret; 4662602adf40SYehuda Sadeh } 4663602adf40SYehuda Sadeh 4664602adf40SYehuda Sadeh static void rbd_sysfs_cleanup(void) 4665602adf40SYehuda Sadeh { 4666dfc5606dSYehuda Sadeh bus_unregister(&rbd_bus_type); 4667fed4c143SAlex Elder device_unregister(&rbd_root_dev); 4668602adf40SYehuda Sadeh } 4669602adf40SYehuda Sadeh 4670cc344fa1SAlex Elder static int __init rbd_init(void) 4671602adf40SYehuda Sadeh { 4672602adf40SYehuda Sadeh int rc; 4673602adf40SYehuda Sadeh 46741e32d34cSAlex Elder if (!libceph_compatible(NULL)) { 46751e32d34cSAlex Elder rbd_warn(NULL, "libceph incompatibility (quitting)"); 46761e32d34cSAlex Elder 46771e32d34cSAlex Elder return -EINVAL; 46781e32d34cSAlex Elder } 4679602adf40SYehuda Sadeh rc = rbd_sysfs_init(); 4680602adf40SYehuda Sadeh if (rc) 4681602adf40SYehuda Sadeh return rc; 4682f0f8cef5SAlex Elder pr_info("loaded " RBD_DRV_NAME_LONG "\n"); 4683602adf40SYehuda Sadeh return 0; 4684602adf40SYehuda Sadeh } 4685602adf40SYehuda Sadeh 4686cc344fa1SAlex Elder static void __exit rbd_exit(void) 4687602adf40SYehuda Sadeh { 4688602adf40SYehuda Sadeh rbd_sysfs_cleanup(); 4689602adf40SYehuda Sadeh } 4690602adf40SYehuda Sadeh 4691602adf40SYehuda Sadeh module_init(rbd_init); 4692602adf40SYehuda Sadeh module_exit(rbd_exit); 4693602adf40SYehuda Sadeh 4694602adf40SYehuda Sadeh MODULE_AUTHOR("Sage Weil <sage@newdream.net>"); 4695602adf40SYehuda Sadeh MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>"); 4696602adf40SYehuda Sadeh MODULE_DESCRIPTION("rados block device"); 4697602adf40SYehuda Sadeh 4698602adf40SYehuda Sadeh /* following authorship retained from original osdblk.c */ 4699602adf40SYehuda Sadeh MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>"); 4700602adf40SYehuda Sadeh 4701602adf40SYehuda Sadeh MODULE_LICENSE("GPL"); 4702