xref: /openbmc/linux/drivers/block/rbd.c (revision 2f82ee54)
1602adf40SYehuda Sadeh /*
2602adf40SYehuda Sadeh    rbd.c -- Export ceph rados objects as a Linux block device
3602adf40SYehuda Sadeh 
4602adf40SYehuda Sadeh 
5602adf40SYehuda Sadeh    based on drivers/block/osdblk.c:
6602adf40SYehuda Sadeh 
7602adf40SYehuda Sadeh    Copyright 2009 Red Hat, Inc.
8602adf40SYehuda Sadeh 
9602adf40SYehuda Sadeh    This program is free software; you can redistribute it and/or modify
10602adf40SYehuda Sadeh    it under the terms of the GNU General Public License as published by
11602adf40SYehuda Sadeh    the Free Software Foundation.
12602adf40SYehuda Sadeh 
13602adf40SYehuda Sadeh    This program is distributed in the hope that it will be useful,
14602adf40SYehuda Sadeh    but WITHOUT ANY WARRANTY; without even the implied warranty of
15602adf40SYehuda Sadeh    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16602adf40SYehuda Sadeh    GNU General Public License for more details.
17602adf40SYehuda Sadeh 
18602adf40SYehuda Sadeh    You should have received a copy of the GNU General Public License
19602adf40SYehuda Sadeh    along with this program; see the file COPYING.  If not, write to
20602adf40SYehuda Sadeh    the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
21602adf40SYehuda Sadeh 
22602adf40SYehuda Sadeh 
23602adf40SYehuda Sadeh 
24dfc5606dSYehuda Sadeh    For usage instructions, please refer to:
25602adf40SYehuda Sadeh 
26dfc5606dSYehuda Sadeh                  Documentation/ABI/testing/sysfs-bus-rbd
27602adf40SYehuda Sadeh 
28602adf40SYehuda Sadeh  */
29602adf40SYehuda Sadeh 
30602adf40SYehuda Sadeh #include <linux/ceph/libceph.h>
31602adf40SYehuda Sadeh #include <linux/ceph/osd_client.h>
32602adf40SYehuda Sadeh #include <linux/ceph/mon_client.h>
33602adf40SYehuda Sadeh #include <linux/ceph/decode.h>
3459c2be1eSYehuda Sadeh #include <linux/parser.h>
35602adf40SYehuda Sadeh 
36602adf40SYehuda Sadeh #include <linux/kernel.h>
37602adf40SYehuda Sadeh #include <linux/device.h>
38602adf40SYehuda Sadeh #include <linux/module.h>
39602adf40SYehuda Sadeh #include <linux/fs.h>
40602adf40SYehuda Sadeh #include <linux/blkdev.h>
41602adf40SYehuda Sadeh 
42602adf40SYehuda Sadeh #include "rbd_types.h"
43602adf40SYehuda Sadeh 
#define RBD_DEBUG	/* Activate rbd_assert() calls */

/*
 * Block I/O is counted in sectors.  Several Linux layers (blk, bio,
 * genhd) interpret that unit, but the default everywhere is 512
 * bytes.  The symbols below only give the bare numbers a name.
 */
#define SECTOR_SHIFT	9
#define SECTOR_SIZE	(1ULL << SECTOR_SHIFT)

#define RBD_DRV_NAME		"rbd"
#define RBD_DRV_NAME_LONG	"rbd (rados block device)"

#define RBD_MINORS_PER_MAJOR	256	/* max minors per blkdev */

/*
 * Longest snapshot name: NAME_MAX less the length of the snapshot
 * device name prefix (its terminating NUL is not counted).
 */
#define RBD_SNAP_DEV_NAME_PREFIX	"snap_"
#define RBD_MAX_SNAP_NAME_LEN \
	(NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))

#define RBD_MAX_SNAP_COUNT	510	/* allows max snapc to fit in 4KB */

#define RBD_SNAP_HEAD_NAME	"-"

/* This allows a single page to hold an image name sent by OSD */
#define RBD_IMAGE_NAME_LEN_MAX	(PAGE_SIZE - sizeof (__le32) - 1)
#define RBD_IMAGE_ID_LEN_MAX	64

#define RBD_OBJ_PREFIX_LEN_MAX	64
73589d30e0SAlex Elder 
/* Feature bits an rbd image may advertise */

#define RBD_FEATURE_LAYERING	(1 << 0)
#define RBD_FEATURE_STRIPINGV2	(1 << 1)
#define RBD_FEATURES_ALL \
	(RBD_FEATURE_LAYERING | RBD_FEATURE_STRIPINGV2)

/* Subset of the above implemented by this (client software) code */

#define RBD_FEATURES_SUPPORTED	(0)
84d889140cSAlex Elder 
/*
 * RBD device names have the form "rbd#": the "rbd" part is
 * RBD_DRV_NAME (above) and # is a unique integer identifier.
 * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is large
 * enough for every possible device name.
 */
#define DEV_NAME_LEN		32
#define MAX_INT_FORMAT_WIDTH	((5 * sizeof (int)) / 2 + 1)
93602adf40SYehuda Sadeh 
94602adf40SYehuda Sadeh /*
95602adf40SYehuda Sadeh  * block device image metadata (in-memory version)
96602adf40SYehuda Sadeh  */
97602adf40SYehuda Sadeh struct rbd_image_header {
98f84344f3SAlex Elder 	/* These four fields never change for a given rbd image */
99849b4260SAlex Elder 	char *object_prefix;
10034b13184SAlex Elder 	u64 features;
101602adf40SYehuda Sadeh 	__u8 obj_order;
102602adf40SYehuda Sadeh 	__u8 crypt_type;
103602adf40SYehuda Sadeh 	__u8 comp_type;
104602adf40SYehuda Sadeh 
105f84344f3SAlex Elder 	/* The remaining fields need to be updated occasionally */
106f84344f3SAlex Elder 	u64 image_size;
107f84344f3SAlex Elder 	struct ceph_snap_context *snapc;
108602adf40SYehuda Sadeh 	char *snap_names;
109602adf40SYehuda Sadeh 	u64 *snap_sizes;
11059c2be1eSYehuda Sadeh 
11159c2be1eSYehuda Sadeh 	u64 obj_version;
11259c2be1eSYehuda Sadeh };
11359c2be1eSYehuda Sadeh 
1140d7dbfceSAlex Elder /*
1150d7dbfceSAlex Elder  * An rbd image specification.
1160d7dbfceSAlex Elder  *
1170d7dbfceSAlex Elder  * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely
118c66c6e0cSAlex Elder  * identify an image.  Each rbd_dev structure includes a pointer to
119c66c6e0cSAlex Elder  * an rbd_spec structure that encapsulates this identity.
120c66c6e0cSAlex Elder  *
121c66c6e0cSAlex Elder  * Each of the id's in an rbd_spec has an associated name.  For a
122c66c6e0cSAlex Elder  * user-mapped image, the names are supplied and the id's associated
123c66c6e0cSAlex Elder  * with them are looked up.  For a layered image, a parent image is
124c66c6e0cSAlex Elder  * defined by the tuple, and the names are looked up.
125c66c6e0cSAlex Elder  *
126c66c6e0cSAlex Elder  * An rbd_dev structure contains a parent_spec pointer which is
127c66c6e0cSAlex Elder  * non-null if the image it represents is a child in a layered
128c66c6e0cSAlex Elder  * image.  This pointer will refer to the rbd_spec structure used
129c66c6e0cSAlex Elder  * by the parent rbd_dev for its own identity (i.e., the structure
130c66c6e0cSAlex Elder  * is shared between the parent and child).
131c66c6e0cSAlex Elder  *
132c66c6e0cSAlex Elder  * Since these structures are populated once, during the discovery
133c66c6e0cSAlex Elder  * phase of image construction, they are effectively immutable so
134c66c6e0cSAlex Elder  * we make no effort to synchronize access to them.
135c66c6e0cSAlex Elder  *
136c66c6e0cSAlex Elder  * Note that code herein does not assume the image name is known (it
137c66c6e0cSAlex Elder  * could be a null pointer).
1380d7dbfceSAlex Elder  */
1390d7dbfceSAlex Elder struct rbd_spec {
1400d7dbfceSAlex Elder 	u64		pool_id;
1410d7dbfceSAlex Elder 	char		*pool_name;
1420d7dbfceSAlex Elder 
1430d7dbfceSAlex Elder 	char		*image_id;
1440d7dbfceSAlex Elder 	char		*image_name;
1450d7dbfceSAlex Elder 
1460d7dbfceSAlex Elder 	u64		snap_id;
1470d7dbfceSAlex Elder 	char		*snap_name;
1480d7dbfceSAlex Elder 
1490d7dbfceSAlex Elder 	struct kref	kref;
1500d7dbfceSAlex Elder };
1510d7dbfceSAlex Elder 
152602adf40SYehuda Sadeh /*
153f0f8cef5SAlex Elder  * an instance of the client.  multiple devices may share an rbd client.
154602adf40SYehuda Sadeh  */
155602adf40SYehuda Sadeh struct rbd_client {
156602adf40SYehuda Sadeh 	struct ceph_client	*client;
157602adf40SYehuda Sadeh 	struct kref		kref;
158602adf40SYehuda Sadeh 	struct list_head	node;
159602adf40SYehuda Sadeh };
160602adf40SYehuda Sadeh 
/* Forward declarations so the callback types can name the requests */
struct rbd_img_request;
struct rbd_obj_request;

/* Callback signatures for image requests and object requests */
typedef void (*rbd_img_callback_t)(struct rbd_img_request *);
typedef void (*rbd_obj_callback_t)(struct rbd_obj_request *);

#define	BAD_WHICH	U32_MAX		/* Good which or bad which, which? */
168bf0d5f50SAlex Elder 
/* Kind of data attached to an object request */
enum obj_request_type {
	OBJ_REQUEST_NODATA,
	OBJ_REQUEST_BIO,
	OBJ_REQUEST_PAGES
};
172bf0d5f50SAlex Elder 
/* Flags describing an object request; each comment gives the 0/1 meaning */
enum obj_req_flags {
	OBJ_REQ_DONE,		/* completion flag: not done = 0, done = 1 */
	OBJ_REQ_IMG_DATA,	/* object usage: standalone = 0, image = 1 */
};
177926f9b3fSAlex Elder 
178bf0d5f50SAlex Elder struct rbd_obj_request {
179bf0d5f50SAlex Elder 	const char		*object_name;
180bf0d5f50SAlex Elder 	u64			offset;		/* object start byte */
181bf0d5f50SAlex Elder 	u64			length;		/* bytes from offset */
182926f9b3fSAlex Elder 	unsigned long		flags;
183bf0d5f50SAlex Elder 
184bf0d5f50SAlex Elder 	struct rbd_img_request	*img_request;
1857da22d29SAlex Elder 	u64			img_offset;	/* image relative offset */
186bf0d5f50SAlex Elder 	struct list_head	links;		/* img_request->obj_requests */
187bf0d5f50SAlex Elder 	u32			which;		/* posn image request list */
188bf0d5f50SAlex Elder 
189bf0d5f50SAlex Elder 	enum obj_request_type	type;
190788e2df3SAlex Elder 	union {
191bf0d5f50SAlex Elder 		struct bio	*bio_list;
192788e2df3SAlex Elder 		struct {
193788e2df3SAlex Elder 			struct page	**pages;
194788e2df3SAlex Elder 			u32		page_count;
195788e2df3SAlex Elder 		};
196788e2df3SAlex Elder 	};
197bf0d5f50SAlex Elder 
198bf0d5f50SAlex Elder 	struct ceph_osd_request	*osd_req;
199bf0d5f50SAlex Elder 
200bf0d5f50SAlex Elder 	u64			xferred;	/* bytes transferred */
201bf0d5f50SAlex Elder 	u64			version;
2021b83bef2SSage Weil 	int			result;
203bf0d5f50SAlex Elder 
204bf0d5f50SAlex Elder 	rbd_obj_callback_t	callback;
205788e2df3SAlex Elder 	struct completion	completion;
206bf0d5f50SAlex Elder 
207bf0d5f50SAlex Elder 	struct kref		kref;
208bf0d5f50SAlex Elder };
209bf0d5f50SAlex Elder 
/* Flags describing an image request; each comment gives the 0/1 meaning */
enum img_req_flags {
	IMG_REQ_WRITE,		/* I/O direction: read = 0, write = 1 */
	IMG_REQ_CHILD,		/* initiator: block = 0, child image = 1 */
	IMG_REQ_LAYERED,	/* ENOENT handling: normal = 0, layered = 1 */
};
2150c425248SAlex Elder 
216bf0d5f50SAlex Elder struct rbd_img_request {
217bf0d5f50SAlex Elder 	struct rbd_device	*rbd_dev;
218bf0d5f50SAlex Elder 	u64			offset;	/* starting image byte offset */
219bf0d5f50SAlex Elder 	u64			length;	/* byte count from offset */
2200c425248SAlex Elder 	unsigned long		flags;
221bf0d5f50SAlex Elder 	union {
222bf0d5f50SAlex Elder 		u64			snap_id;	/* for reads */
2239849e986SAlex Elder 		struct ceph_snap_context *snapc;	/* for writes */
2249849e986SAlex Elder 	};
2259849e986SAlex Elder 	union {
2269849e986SAlex Elder 		struct request		*rq;		/* block request */
2279849e986SAlex Elder 		struct rbd_obj_request	*obj_request;	/* obj req initiator */
228bf0d5f50SAlex Elder 	};
229bf0d5f50SAlex Elder 	spinlock_t		completion_lock;/* protects next_completion */
230bf0d5f50SAlex Elder 	u32			next_completion;
231bf0d5f50SAlex Elder 	rbd_img_callback_t	callback;
23255f27e09SAlex Elder 	u64			xferred;/* aggregate bytes transferred */
233a5a337d4SAlex Elder 	int			result;	/* first nonzero obj_request result */
234bf0d5f50SAlex Elder 
235bf0d5f50SAlex Elder 	u32			obj_request_count;
236bf0d5f50SAlex Elder 	struct list_head	obj_requests;	/* rbd_obj_request structs */
237bf0d5f50SAlex Elder 
238bf0d5f50SAlex Elder 	struct kref		kref;
239bf0d5f50SAlex Elder };
240bf0d5f50SAlex Elder 
/*
 * Iterators over the object requests of an image request, i.e. the
 * entries linked on (ireq)->obj_requests through their "links"
 * member.  Note that the _safe variant walks the list in REVERSE.
 */
#define for_each_obj_request(ireq, oreq)				\
	list_for_each_entry(oreq, &(ireq)->obj_requests, links)
#define for_each_obj_request_from(ireq, oreq)				\
	list_for_each_entry_from(oreq, &(ireq)->obj_requests, links)
#define for_each_obj_request_safe(ireq, oreq, n)			\
	list_for_each_entry_safe_reverse(oreq, n, &(ireq)->obj_requests, links)
247bf0d5f50SAlex Elder 
248dfc5606dSYehuda Sadeh struct rbd_snap {
249dfc5606dSYehuda Sadeh 	struct	device		dev;
250dfc5606dSYehuda Sadeh 	const char		*name;
2513591538fSJosh Durgin 	u64			size;
252dfc5606dSYehuda Sadeh 	struct list_head	node;
253dfc5606dSYehuda Sadeh 	u64			id;
25434b13184SAlex Elder 	u64			features;
255dfc5606dSYehuda Sadeh };
256dfc5606dSYehuda Sadeh 
257f84344f3SAlex Elder struct rbd_mapping {
25899c1f08fSAlex Elder 	u64                     size;
25934b13184SAlex Elder 	u64                     features;
260f84344f3SAlex Elder 	bool			read_only;
261f84344f3SAlex Elder };
262f84344f3SAlex Elder 
263602adf40SYehuda Sadeh /*
264602adf40SYehuda Sadeh  * a single device
265602adf40SYehuda Sadeh  */
266602adf40SYehuda Sadeh struct rbd_device {
267de71a297SAlex Elder 	int			dev_id;		/* blkdev unique id */
268602adf40SYehuda Sadeh 
269602adf40SYehuda Sadeh 	int			major;		/* blkdev assigned major */
270602adf40SYehuda Sadeh 	struct gendisk		*disk;		/* blkdev's gendisk and rq */
271602adf40SYehuda Sadeh 
272a30b71b9SAlex Elder 	u32			image_format;	/* Either 1 or 2 */
273602adf40SYehuda Sadeh 	struct rbd_client	*rbd_client;
274602adf40SYehuda Sadeh 
275602adf40SYehuda Sadeh 	char			name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */
276602adf40SYehuda Sadeh 
277b82d167bSAlex Elder 	spinlock_t		lock;		/* queue, flags, open_count */
278602adf40SYehuda Sadeh 
279602adf40SYehuda Sadeh 	struct rbd_image_header	header;
280b82d167bSAlex Elder 	unsigned long		flags;		/* possibly lock protected */
2810d7dbfceSAlex Elder 	struct rbd_spec		*spec;
282602adf40SYehuda Sadeh 
2830d7dbfceSAlex Elder 	char			*header_name;
284971f839aSAlex Elder 
2850903e875SAlex Elder 	struct ceph_file_layout	layout;
2860903e875SAlex Elder 
28759c2be1eSYehuda Sadeh 	struct ceph_osd_event   *watch_event;
288975241afSAlex Elder 	struct rbd_obj_request	*watch_request;
28959c2be1eSYehuda Sadeh 
29086b00e0dSAlex Elder 	struct rbd_spec		*parent_spec;
29186b00e0dSAlex Elder 	u64			parent_overlap;
292*2f82ee54SAlex Elder 	struct rbd_device	*parent;
29386b00e0dSAlex Elder 
294c666601aSJosh Durgin 	/* protects updating the header */
295c666601aSJosh Durgin 	struct rw_semaphore     header_rwsem;
296f84344f3SAlex Elder 
297f84344f3SAlex Elder 	struct rbd_mapping	mapping;
298602adf40SYehuda Sadeh 
299602adf40SYehuda Sadeh 	struct list_head	node;
300dfc5606dSYehuda Sadeh 
301dfc5606dSYehuda Sadeh 	/* list of snapshots */
302dfc5606dSYehuda Sadeh 	struct list_head	snaps;
303dfc5606dSYehuda Sadeh 
304dfc5606dSYehuda Sadeh 	/* sysfs related */
305dfc5606dSYehuda Sadeh 	struct device		dev;
306b82d167bSAlex Elder 	unsigned long		open_count;	/* protected by lock */
307dfc5606dSYehuda Sadeh };
308dfc5606dSYehuda Sadeh 
/*
 * Bit numbers used in rbd_dev->flags.  Where atomicity matters,
 * access is protected by rbd_dev->lock.
 *
 * At present only the "removing" flag, which is coupled with the
 * "open_count" field, requires atomic access.
 */
enum rbd_dev_flags {
	RBD_DEV_FLAG_EXISTS,	/* mapped snapshot has not been deleted */
	RBD_DEV_FLAG_REMOVING,	/* this mapping is being removed */
};
3206d292906SAlex Elder 
/* Serialize open/close/setup/teardown */
static DEFINE_MUTEX(ctl_mutex);

/* devices -- NOTE(review): presumably guarded by rbd_dev_list_lock */
static LIST_HEAD(rbd_dev_list);
static DEFINE_SPINLOCK(rbd_dev_list_lock);

/* clients; list updates are done under rbd_client_list_lock */
static LIST_HEAD(rbd_client_list);
static DEFINE_SPINLOCK(rbd_client_list_lock);
328602adf40SYehuda Sadeh 
329304f6808SAlex Elder static int rbd_dev_snaps_update(struct rbd_device *rbd_dev);
330304f6808SAlex Elder static int rbd_dev_snaps_register(struct rbd_device *rbd_dev);
331304f6808SAlex Elder 
332dfc5606dSYehuda Sadeh static void rbd_dev_release(struct device *dev);
33341f38c2bSAlex Elder static void rbd_remove_snap_dev(struct rbd_snap *snap);
334dfc5606dSYehuda Sadeh 
335f0f8cef5SAlex Elder static ssize_t rbd_add(struct bus_type *bus, const char *buf,
336f0f8cef5SAlex Elder 		       size_t count);
337f0f8cef5SAlex Elder static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
338f0f8cef5SAlex Elder 			  size_t count);
339*2f82ee54SAlex Elder static int rbd_dev_probe(struct rbd_device *rbd_dev);
340f0f8cef5SAlex Elder 
341f0f8cef5SAlex Elder static struct bus_attribute rbd_bus_attrs[] = {
342f0f8cef5SAlex Elder 	__ATTR(add, S_IWUSR, NULL, rbd_add),
343f0f8cef5SAlex Elder 	__ATTR(remove, S_IWUSR, NULL, rbd_remove),
344f0f8cef5SAlex Elder 	__ATTR_NULL
345f0f8cef5SAlex Elder };
346f0f8cef5SAlex Elder 
347f0f8cef5SAlex Elder static struct bus_type rbd_bus_type = {
348f0f8cef5SAlex Elder 	.name		= "rbd",
349f0f8cef5SAlex Elder 	.bus_attrs	= rbd_bus_attrs,
350f0f8cef5SAlex Elder };
351f0f8cef5SAlex Elder 
352f0f8cef5SAlex Elder static void rbd_root_dev_release(struct device *dev)
353f0f8cef5SAlex Elder {
354f0f8cef5SAlex Elder }
355f0f8cef5SAlex Elder 
356f0f8cef5SAlex Elder static struct device rbd_root_dev = {
357f0f8cef5SAlex Elder 	.init_name =    "rbd",
358f0f8cef5SAlex Elder 	.release =      rbd_root_dev_release,
359f0f8cef5SAlex Elder };
360f0f8cef5SAlex Elder 
36106ecc6cbSAlex Elder static __printf(2, 3)
36206ecc6cbSAlex Elder void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...)
36306ecc6cbSAlex Elder {
36406ecc6cbSAlex Elder 	struct va_format vaf;
36506ecc6cbSAlex Elder 	va_list args;
36606ecc6cbSAlex Elder 
36706ecc6cbSAlex Elder 	va_start(args, fmt);
36806ecc6cbSAlex Elder 	vaf.fmt = fmt;
36906ecc6cbSAlex Elder 	vaf.va = &args;
37006ecc6cbSAlex Elder 
37106ecc6cbSAlex Elder 	if (!rbd_dev)
37206ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf);
37306ecc6cbSAlex Elder 	else if (rbd_dev->disk)
37406ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: %s: %pV\n",
37506ecc6cbSAlex Elder 			RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf);
37606ecc6cbSAlex Elder 	else if (rbd_dev->spec && rbd_dev->spec->image_name)
37706ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: image %s: %pV\n",
37806ecc6cbSAlex Elder 			RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf);
37906ecc6cbSAlex Elder 	else if (rbd_dev->spec && rbd_dev->spec->image_id)
38006ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: id %s: %pV\n",
38106ecc6cbSAlex Elder 			RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf);
38206ecc6cbSAlex Elder 	else	/* punt */
38306ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: rbd_dev %p: %pV\n",
38406ecc6cbSAlex Elder 			RBD_DRV_NAME, rbd_dev, &vaf);
38506ecc6cbSAlex Elder 	va_end(args);
38606ecc6cbSAlex Elder }
38706ecc6cbSAlex Elder 
/*
 * rbd_assert() - check an invariant when RBD_DEBUG is defined.
 *
 * If @expr evaluates false, the enclosing function name, the line
 * number and the text of the expression are logged at KERN_ERR and
 * BUG() is invoked.  Without RBD_DEBUG the macro is a no-op and
 * @expr is not evaluated at all, so it must not have side effects.
 *
 * The debug variant is wrapped in do { } while (0) so that it
 * expands to exactly one statement; a bare "if" here would break
 * (or silently re-bind the "else" of) an unbraced if/else around
 * the call.
 */
#ifdef RBD_DEBUG
#define rbd_assert(expr)						\
	do {								\
		if (unlikely(!(expr))) {				\
			printk(KERN_ERR "\nAssertion failure in %s() "	\
						"at line %d:\n\n"	\
					"\trbd_assert(%s);\n\n",	\
					__func__, __LINE__, #expr);	\
			BUG();						\
		}							\
	} while (0)
#else /* !RBD_DEBUG */
#  define rbd_assert(expr)	((void) 0)
#endif /* !RBD_DEBUG */
400dfc5606dSYehuda Sadeh 
401117973fbSAlex Elder static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver);
402117973fbSAlex Elder static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver);
40359c2be1eSYehuda Sadeh 
404602adf40SYehuda Sadeh static int rbd_open(struct block_device *bdev, fmode_t mode)
405602adf40SYehuda Sadeh {
406f0f8cef5SAlex Elder 	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
407b82d167bSAlex Elder 	bool removing = false;
408602adf40SYehuda Sadeh 
409f84344f3SAlex Elder 	if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only)
410602adf40SYehuda Sadeh 		return -EROFS;
411602adf40SYehuda Sadeh 
412a14ea269SAlex Elder 	spin_lock_irq(&rbd_dev->lock);
413b82d167bSAlex Elder 	if (test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags))
414b82d167bSAlex Elder 		removing = true;
415b82d167bSAlex Elder 	else
416b82d167bSAlex Elder 		rbd_dev->open_count++;
417a14ea269SAlex Elder 	spin_unlock_irq(&rbd_dev->lock);
418b82d167bSAlex Elder 	if (removing)
419b82d167bSAlex Elder 		return -ENOENT;
420b82d167bSAlex Elder 
42142382b70SAlex Elder 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
422c3e946ceSAlex Elder 	(void) get_device(&rbd_dev->dev);
423f84344f3SAlex Elder 	set_device_ro(bdev, rbd_dev->mapping.read_only);
42442382b70SAlex Elder 	mutex_unlock(&ctl_mutex);
425340c7a2bSAlex Elder 
426602adf40SYehuda Sadeh 	return 0;
427602adf40SYehuda Sadeh }
428602adf40SYehuda Sadeh 
429dfc5606dSYehuda Sadeh static int rbd_release(struct gendisk *disk, fmode_t mode)
430dfc5606dSYehuda Sadeh {
431dfc5606dSYehuda Sadeh 	struct rbd_device *rbd_dev = disk->private_data;
432b82d167bSAlex Elder 	unsigned long open_count_before;
433b82d167bSAlex Elder 
434a14ea269SAlex Elder 	spin_lock_irq(&rbd_dev->lock);
435b82d167bSAlex Elder 	open_count_before = rbd_dev->open_count--;
436a14ea269SAlex Elder 	spin_unlock_irq(&rbd_dev->lock);
437b82d167bSAlex Elder 	rbd_assert(open_count_before > 0);
438dfc5606dSYehuda Sadeh 
43942382b70SAlex Elder 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
440c3e946ceSAlex Elder 	put_device(&rbd_dev->dev);
44142382b70SAlex Elder 	mutex_unlock(&ctl_mutex);
442dfc5606dSYehuda Sadeh 
443dfc5606dSYehuda Sadeh 	return 0;
444dfc5606dSYehuda Sadeh }
445dfc5606dSYehuda Sadeh 
446602adf40SYehuda Sadeh static const struct block_device_operations rbd_bd_ops = {
447602adf40SYehuda Sadeh 	.owner			= THIS_MODULE,
448602adf40SYehuda Sadeh 	.open			= rbd_open,
449dfc5606dSYehuda Sadeh 	.release		= rbd_release,
450602adf40SYehuda Sadeh };
451602adf40SYehuda Sadeh 
452602adf40SYehuda Sadeh /*
453602adf40SYehuda Sadeh  * Initialize an rbd client instance.
45443ae4701SAlex Elder  * We own *ceph_opts.
455602adf40SYehuda Sadeh  */
456f8c38929SAlex Elder static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
457602adf40SYehuda Sadeh {
458602adf40SYehuda Sadeh 	struct rbd_client *rbdc;
459602adf40SYehuda Sadeh 	int ret = -ENOMEM;
460602adf40SYehuda Sadeh 
46137206ee5SAlex Elder 	dout("%s:\n", __func__);
462602adf40SYehuda Sadeh 	rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
463602adf40SYehuda Sadeh 	if (!rbdc)
464602adf40SYehuda Sadeh 		goto out_opt;
465602adf40SYehuda Sadeh 
466602adf40SYehuda Sadeh 	kref_init(&rbdc->kref);
467602adf40SYehuda Sadeh 	INIT_LIST_HEAD(&rbdc->node);
468602adf40SYehuda Sadeh 
469bc534d86SAlex Elder 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
470bc534d86SAlex Elder 
47143ae4701SAlex Elder 	rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
472602adf40SYehuda Sadeh 	if (IS_ERR(rbdc->client))
473bc534d86SAlex Elder 		goto out_mutex;
47443ae4701SAlex Elder 	ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
475602adf40SYehuda Sadeh 
476602adf40SYehuda Sadeh 	ret = ceph_open_session(rbdc->client);
477602adf40SYehuda Sadeh 	if (ret < 0)
478602adf40SYehuda Sadeh 		goto out_err;
479602adf40SYehuda Sadeh 
480432b8587SAlex Elder 	spin_lock(&rbd_client_list_lock);
481602adf40SYehuda Sadeh 	list_add_tail(&rbdc->node, &rbd_client_list);
482432b8587SAlex Elder 	spin_unlock(&rbd_client_list_lock);
483602adf40SYehuda Sadeh 
484bc534d86SAlex Elder 	mutex_unlock(&ctl_mutex);
48537206ee5SAlex Elder 	dout("%s: rbdc %p\n", __func__, rbdc);
486bc534d86SAlex Elder 
487602adf40SYehuda Sadeh 	return rbdc;
488602adf40SYehuda Sadeh 
489602adf40SYehuda Sadeh out_err:
490602adf40SYehuda Sadeh 	ceph_destroy_client(rbdc->client);
491bc534d86SAlex Elder out_mutex:
492bc534d86SAlex Elder 	mutex_unlock(&ctl_mutex);
493602adf40SYehuda Sadeh 	kfree(rbdc);
494602adf40SYehuda Sadeh out_opt:
49543ae4701SAlex Elder 	if (ceph_opts)
49643ae4701SAlex Elder 		ceph_destroy_options(ceph_opts);
49737206ee5SAlex Elder 	dout("%s: error %d\n", __func__, ret);
49837206ee5SAlex Elder 
49928f259b7SVasiliy Kulikov 	return ERR_PTR(ret);
500602adf40SYehuda Sadeh }
501602adf40SYehuda Sadeh 
502*2f82ee54SAlex Elder static struct rbd_client *__rbd_get_client(struct rbd_client *rbdc)
503*2f82ee54SAlex Elder {
504*2f82ee54SAlex Elder 	kref_get(&rbdc->kref);
505*2f82ee54SAlex Elder 
506*2f82ee54SAlex Elder 	return rbdc;
507*2f82ee54SAlex Elder }
508*2f82ee54SAlex Elder 
509602adf40SYehuda Sadeh /*
5101f7ba331SAlex Elder  * Find a ceph client with specific addr and configuration.  If
5111f7ba331SAlex Elder  * found, bump its reference count.
512602adf40SYehuda Sadeh  */
5131f7ba331SAlex Elder static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
514602adf40SYehuda Sadeh {
515602adf40SYehuda Sadeh 	struct rbd_client *client_node;
5161f7ba331SAlex Elder 	bool found = false;
517602adf40SYehuda Sadeh 
51843ae4701SAlex Elder 	if (ceph_opts->flags & CEPH_OPT_NOSHARE)
519602adf40SYehuda Sadeh 		return NULL;
520602adf40SYehuda Sadeh 
5211f7ba331SAlex Elder 	spin_lock(&rbd_client_list_lock);
5221f7ba331SAlex Elder 	list_for_each_entry(client_node, &rbd_client_list, node) {
5231f7ba331SAlex Elder 		if (!ceph_compare_options(ceph_opts, client_node->client)) {
524*2f82ee54SAlex Elder 			__rbd_get_client(client_node);
525*2f82ee54SAlex Elder 
5261f7ba331SAlex Elder 			found = true;
5271f7ba331SAlex Elder 			break;
5281f7ba331SAlex Elder 		}
5291f7ba331SAlex Elder 	}
5301f7ba331SAlex Elder 	spin_unlock(&rbd_client_list_lock);
5311f7ba331SAlex Elder 
5321f7ba331SAlex Elder 	return found ? client_node : NULL;
533602adf40SYehuda Sadeh }
534602adf40SYehuda Sadeh 
/*
 * Mount option tokens.  The Opt_last_* markers separate the tokens
 * by argument class (int, string, Boolean); at present only Boolean
 * options exist.
 */
enum {
	Opt_last_int,
	/* int args above */
	Opt_last_string,
	/* string args above */
	Opt_read_only,
	Opt_read_write,
	/* Boolean args above */
	Opt_last_bool,
};
54859c2be1eSYehuda Sadeh 
54943ae4701SAlex Elder static match_table_t rbd_opts_tokens = {
55059c2be1eSYehuda Sadeh 	/* int args above */
55159c2be1eSYehuda Sadeh 	/* string args above */
552be466c1cSAlex Elder 	{Opt_read_only, "read_only"},
553cc0538b6SAlex Elder 	{Opt_read_only, "ro"},		/* Alternate spelling */
554cc0538b6SAlex Elder 	{Opt_read_write, "read_write"},
555cc0538b6SAlex Elder 	{Opt_read_write, "rw"},		/* Alternate spelling */
556cc0538b6SAlex Elder 	/* Boolean args above */
55759c2be1eSYehuda Sadeh 	{-1, NULL}
55859c2be1eSYehuda Sadeh };
55959c2be1eSYehuda Sadeh 
/* rbd-specific mapping options filled in by parse_rbd_opts_token() */
struct rbd_options {
	bool read_only;
};

#define RBD_READ_ONLY_DEFAULT	false
56598571b5aSAlex Elder 
56659c2be1eSYehuda Sadeh static int parse_rbd_opts_token(char *c, void *private)
56759c2be1eSYehuda Sadeh {
56843ae4701SAlex Elder 	struct rbd_options *rbd_opts = private;
56959c2be1eSYehuda Sadeh 	substring_t argstr[MAX_OPT_ARGS];
57059c2be1eSYehuda Sadeh 	int token, intval, ret;
57159c2be1eSYehuda Sadeh 
57243ae4701SAlex Elder 	token = match_token(c, rbd_opts_tokens, argstr);
57359c2be1eSYehuda Sadeh 	if (token < 0)
57459c2be1eSYehuda Sadeh 		return -EINVAL;
57559c2be1eSYehuda Sadeh 
57659c2be1eSYehuda Sadeh 	if (token < Opt_last_int) {
57759c2be1eSYehuda Sadeh 		ret = match_int(&argstr[0], &intval);
57859c2be1eSYehuda Sadeh 		if (ret < 0) {
57959c2be1eSYehuda Sadeh 			pr_err("bad mount option arg (not int) "
58059c2be1eSYehuda Sadeh 			       "at '%s'\n", c);
58159c2be1eSYehuda Sadeh 			return ret;
58259c2be1eSYehuda Sadeh 		}
58359c2be1eSYehuda Sadeh 		dout("got int token %d val %d\n", token, intval);
58459c2be1eSYehuda Sadeh 	} else if (token > Opt_last_int && token < Opt_last_string) {
58559c2be1eSYehuda Sadeh 		dout("got string token %d val %s\n", token,
58659c2be1eSYehuda Sadeh 		     argstr[0].from);
587cc0538b6SAlex Elder 	} else if (token > Opt_last_string && token < Opt_last_bool) {
588cc0538b6SAlex Elder 		dout("got Boolean token %d\n", token);
58959c2be1eSYehuda Sadeh 	} else {
59059c2be1eSYehuda Sadeh 		dout("got token %d\n", token);
59159c2be1eSYehuda Sadeh 	}
59259c2be1eSYehuda Sadeh 
59359c2be1eSYehuda Sadeh 	switch (token) {
594cc0538b6SAlex Elder 	case Opt_read_only:
595cc0538b6SAlex Elder 		rbd_opts->read_only = true;
596cc0538b6SAlex Elder 		break;
597cc0538b6SAlex Elder 	case Opt_read_write:
598cc0538b6SAlex Elder 		rbd_opts->read_only = false;
599cc0538b6SAlex Elder 		break;
60059c2be1eSYehuda Sadeh 	default:
601aafb230eSAlex Elder 		rbd_assert(false);
602aafb230eSAlex Elder 		break;
60359c2be1eSYehuda Sadeh 	}
60459c2be1eSYehuda Sadeh 	return 0;
60559c2be1eSYehuda Sadeh }
60659c2be1eSYehuda Sadeh 
/*
 * Return a ceph client matching @ceph_opts, creating one if no
 * existing client matches.  @ceph_opts is consumed either way:
 * destroyed here when an existing client is reused, otherwise handed
 * to rbd_client_create().  The result of rbd_client_create() is
 * returned unchanged, so it may be an ERR_PTR().
 */
static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts)
{
	struct rbd_client *rbdc = rbd_client_find(ceph_opts);

	if (!rbdc)
		return rbd_client_create(ceph_opts);

	/* using an existing client */
	ceph_destroy_options(ceph_opts);

	return rbdc;
}
623602adf40SYehuda Sadeh 
624602adf40SYehuda Sadeh /*
625602adf40SYehuda Sadeh  * Destroy ceph client
626d23a4b3fSAlex Elder  *
627432b8587SAlex Elder  * Caller must hold rbd_client_list_lock.
628602adf40SYehuda Sadeh  */
629602adf40SYehuda Sadeh static void rbd_client_release(struct kref *kref)
630602adf40SYehuda Sadeh {
631602adf40SYehuda Sadeh 	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);
632602adf40SYehuda Sadeh 
63337206ee5SAlex Elder 	dout("%s: rbdc %p\n", __func__, rbdc);
634cd9d9f5dSAlex Elder 	spin_lock(&rbd_client_list_lock);
635602adf40SYehuda Sadeh 	list_del(&rbdc->node);
636cd9d9f5dSAlex Elder 	spin_unlock(&rbd_client_list_lock);
637602adf40SYehuda Sadeh 
638602adf40SYehuda Sadeh 	ceph_destroy_client(rbdc->client);
639602adf40SYehuda Sadeh 	kfree(rbdc);
640602adf40SYehuda Sadeh }
641602adf40SYehuda Sadeh 
642602adf40SYehuda Sadeh /*
643602adf40SYehuda Sadeh  * Drop reference to ceph client node. If it's not referenced anymore, release
644602adf40SYehuda Sadeh  * it.
645602adf40SYehuda Sadeh  */
6469d3997fdSAlex Elder static void rbd_put_client(struct rbd_client *rbdc)
647602adf40SYehuda Sadeh {
648c53d5893SAlex Elder 	if (rbdc)
6499d3997fdSAlex Elder 		kref_put(&rbdc->kref, rbd_client_release);
650602adf40SYehuda Sadeh }
651602adf40SYehuda Sadeh 
652a30b71b9SAlex Elder static bool rbd_image_format_valid(u32 image_format)
653a30b71b9SAlex Elder {
654a30b71b9SAlex Elder 	return image_format == 1 || image_format == 2;
655a30b71b9SAlex Elder }
656a30b71b9SAlex Elder 
6578e94af8eSAlex Elder static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
6588e94af8eSAlex Elder {
659103a150fSAlex Elder 	size_t size;
660103a150fSAlex Elder 	u32 snap_count;
661103a150fSAlex Elder 
662103a150fSAlex Elder 	/* The header has to start with the magic rbd header text */
663103a150fSAlex Elder 	if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
664103a150fSAlex Elder 		return false;
665103a150fSAlex Elder 
666db2388b6SAlex Elder 	/* The bio layer requires at least sector-sized I/O */
667db2388b6SAlex Elder 
668db2388b6SAlex Elder 	if (ondisk->options.order < SECTOR_SHIFT)
669db2388b6SAlex Elder 		return false;
670db2388b6SAlex Elder 
671db2388b6SAlex Elder 	/* If we use u64 in a few spots we may be able to loosen this */
672db2388b6SAlex Elder 
673db2388b6SAlex Elder 	if (ondisk->options.order > 8 * sizeof (int) - 1)
674db2388b6SAlex Elder 		return false;
675db2388b6SAlex Elder 
676103a150fSAlex Elder 	/*
677103a150fSAlex Elder 	 * The size of a snapshot header has to fit in a size_t, and
678103a150fSAlex Elder 	 * that limits the number of snapshots.
679103a150fSAlex Elder 	 */
680103a150fSAlex Elder 	snap_count = le32_to_cpu(ondisk->snap_count);
681103a150fSAlex Elder 	size = SIZE_MAX - sizeof (struct ceph_snap_context);
682103a150fSAlex Elder 	if (snap_count > size / sizeof (__le64))
683103a150fSAlex Elder 		return false;
684103a150fSAlex Elder 
685103a150fSAlex Elder 	/*
686103a150fSAlex Elder 	 * Not only that, but the size of the entire the snapshot
687103a150fSAlex Elder 	 * header must also be representable in a size_t.
688103a150fSAlex Elder 	 */
689103a150fSAlex Elder 	size -= snap_count * sizeof (__le64);
690103a150fSAlex Elder 	if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
691103a150fSAlex Elder 		return false;
692103a150fSAlex Elder 
693103a150fSAlex Elder 	return true;
6948e94af8eSAlex Elder }
6958e94af8eSAlex Elder 
696602adf40SYehuda Sadeh /*
697602adf40SYehuda Sadeh  * Create a new header structure, translate header format from the on-disk
698602adf40SYehuda Sadeh  * header.
699602adf40SYehuda Sadeh  */
700602adf40SYehuda Sadeh static int rbd_header_from_disk(struct rbd_image_header *header,
7014156d998SAlex Elder 				 struct rbd_image_header_ondisk *ondisk)
702602adf40SYehuda Sadeh {
703ccece235SAlex Elder 	u32 snap_count;
70458c17b0eSAlex Elder 	size_t len;
705d2bb24e5SAlex Elder 	size_t size;
706621901d6SAlex Elder 	u32 i;
707602adf40SYehuda Sadeh 
7086a52325fSAlex Elder 	memset(header, 0, sizeof (*header));
7096a52325fSAlex Elder 
710103a150fSAlex Elder 	snap_count = le32_to_cpu(ondisk->snap_count);
711103a150fSAlex Elder 
71258c17b0eSAlex Elder 	len = strnlen(ondisk->object_prefix, sizeof (ondisk->object_prefix));
71358c17b0eSAlex Elder 	header->object_prefix = kmalloc(len + 1, GFP_KERNEL);
7146a52325fSAlex Elder 	if (!header->object_prefix)
715602adf40SYehuda Sadeh 		return -ENOMEM;
71658c17b0eSAlex Elder 	memcpy(header->object_prefix, ondisk->object_prefix, len);
71758c17b0eSAlex Elder 	header->object_prefix[len] = '\0';
71800f1f36fSAlex Elder 
719602adf40SYehuda Sadeh 	if (snap_count) {
720f785cc1dSAlex Elder 		u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);
721f785cc1dSAlex Elder 
722621901d6SAlex Elder 		/* Save a copy of the snapshot names */
723621901d6SAlex Elder 
724f785cc1dSAlex Elder 		if (snap_names_len > (u64) SIZE_MAX)
725f785cc1dSAlex Elder 			return -EIO;
726f785cc1dSAlex Elder 		header->snap_names = kmalloc(snap_names_len, GFP_KERNEL);
727602adf40SYehuda Sadeh 		if (!header->snap_names)
7286a52325fSAlex Elder 			goto out_err;
729f785cc1dSAlex Elder 		/*
730f785cc1dSAlex Elder 		 * Note that rbd_dev_v1_header_read() guarantees
731f785cc1dSAlex Elder 		 * the ondisk buffer we're working with has
732f785cc1dSAlex Elder 		 * snap_names_len bytes beyond the end of the
733f785cc1dSAlex Elder 		 * snapshot id array, this memcpy() is safe.
734f785cc1dSAlex Elder 		 */
735f785cc1dSAlex Elder 		memcpy(header->snap_names, &ondisk->snaps[snap_count],
736f785cc1dSAlex Elder 			snap_names_len);
7376a52325fSAlex Elder 
738621901d6SAlex Elder 		/* Record each snapshot's size */
739621901d6SAlex Elder 
740d2bb24e5SAlex Elder 		size = snap_count * sizeof (*header->snap_sizes);
741d2bb24e5SAlex Elder 		header->snap_sizes = kmalloc(size, GFP_KERNEL);
742602adf40SYehuda Sadeh 		if (!header->snap_sizes)
7436a52325fSAlex Elder 			goto out_err;
744621901d6SAlex Elder 		for (i = 0; i < snap_count; i++)
745621901d6SAlex Elder 			header->snap_sizes[i] =
746621901d6SAlex Elder 				le64_to_cpu(ondisk->snaps[i].image_size);
747602adf40SYehuda Sadeh 	} else {
748ccece235SAlex Elder 		WARN_ON(ondisk->snap_names_len);
749602adf40SYehuda Sadeh 		header->snap_names = NULL;
750602adf40SYehuda Sadeh 		header->snap_sizes = NULL;
751602adf40SYehuda Sadeh 	}
752849b4260SAlex Elder 
75334b13184SAlex Elder 	header->features = 0;	/* No features support in v1 images */
754602adf40SYehuda Sadeh 	header->obj_order = ondisk->options.order;
755602adf40SYehuda Sadeh 	header->crypt_type = ondisk->options.crypt_type;
756602adf40SYehuda Sadeh 	header->comp_type = ondisk->options.comp_type;
7576a52325fSAlex Elder 
758621901d6SAlex Elder 	/* Allocate and fill in the snapshot context */
759621901d6SAlex Elder 
760f84344f3SAlex Elder 	header->image_size = le64_to_cpu(ondisk->image_size);
7616a52325fSAlex Elder 	size = sizeof (struct ceph_snap_context);
7626a52325fSAlex Elder 	size += snap_count * sizeof (header->snapc->snaps[0]);
7636a52325fSAlex Elder 	header->snapc = kzalloc(size, GFP_KERNEL);
7646a52325fSAlex Elder 	if (!header->snapc)
7656a52325fSAlex Elder 		goto out_err;
766602adf40SYehuda Sadeh 
767602adf40SYehuda Sadeh 	atomic_set(&header->snapc->nref, 1);
768505cbb9bSAlex Elder 	header->snapc->seq = le64_to_cpu(ondisk->snap_seq);
769602adf40SYehuda Sadeh 	header->snapc->num_snaps = snap_count;
770621901d6SAlex Elder 	for (i = 0; i < snap_count; i++)
771602adf40SYehuda Sadeh 		header->snapc->snaps[i] =
772602adf40SYehuda Sadeh 			le64_to_cpu(ondisk->snaps[i].id);
773602adf40SYehuda Sadeh 
774602adf40SYehuda Sadeh 	return 0;
775602adf40SYehuda Sadeh 
7766a52325fSAlex Elder out_err:
777849b4260SAlex Elder 	kfree(header->snap_sizes);
778ccece235SAlex Elder 	header->snap_sizes = NULL;
779602adf40SYehuda Sadeh 	kfree(header->snap_names);
780ccece235SAlex Elder 	header->snap_names = NULL;
7816a52325fSAlex Elder 	kfree(header->object_prefix);
7826a52325fSAlex Elder 	header->object_prefix = NULL;
783ccece235SAlex Elder 
78400f1f36fSAlex Elder 	return -ENOMEM;
785602adf40SYehuda Sadeh }
786602adf40SYehuda Sadeh 
7879e15b77dSAlex Elder static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id)
7889e15b77dSAlex Elder {
7899e15b77dSAlex Elder 	struct rbd_snap *snap;
7909e15b77dSAlex Elder 
7919e15b77dSAlex Elder 	if (snap_id == CEPH_NOSNAP)
7929e15b77dSAlex Elder 		return RBD_SNAP_HEAD_NAME;
7939e15b77dSAlex Elder 
7949e15b77dSAlex Elder 	list_for_each_entry(snap, &rbd_dev->snaps, node)
7959e15b77dSAlex Elder 		if (snap_id == snap->id)
7969e15b77dSAlex Elder 			return snap->name;
7979e15b77dSAlex Elder 
7989e15b77dSAlex Elder 	return NULL;
7999e15b77dSAlex Elder }
8009e15b77dSAlex Elder 
8018836b995SAlex Elder static int snap_by_name(struct rbd_device *rbd_dev, const char *snap_name)
802602adf40SYehuda Sadeh {
803602adf40SYehuda Sadeh 
804e86924a8SAlex Elder 	struct rbd_snap *snap;
80500f1f36fSAlex Elder 
806e86924a8SAlex Elder 	list_for_each_entry(snap, &rbd_dev->snaps, node) {
807e86924a8SAlex Elder 		if (!strcmp(snap_name, snap->name)) {
8080d7dbfceSAlex Elder 			rbd_dev->spec->snap_id = snap->id;
809e86924a8SAlex Elder 			rbd_dev->mapping.size = snap->size;
81034b13184SAlex Elder 			rbd_dev->mapping.features = snap->features;
81100f1f36fSAlex Elder 
812e86924a8SAlex Elder 			return 0;
813602adf40SYehuda Sadeh 		}
81400f1f36fSAlex Elder 	}
815e86924a8SAlex Elder 
81600f1f36fSAlex Elder 	return -ENOENT;
81700f1f36fSAlex Elder }
818602adf40SYehuda Sadeh 
819819d52bfSAlex Elder static int rbd_dev_set_mapping(struct rbd_device *rbd_dev)
820602adf40SYehuda Sadeh {
82178dc447dSAlex Elder 	int ret;
822602adf40SYehuda Sadeh 
8230d7dbfceSAlex Elder 	if (!memcmp(rbd_dev->spec->snap_name, RBD_SNAP_HEAD_NAME,
824cc9d734cSJosh Durgin 		    sizeof (RBD_SNAP_HEAD_NAME))) {
8250d7dbfceSAlex Elder 		rbd_dev->spec->snap_id = CEPH_NOSNAP;
82699c1f08fSAlex Elder 		rbd_dev->mapping.size = rbd_dev->header.image_size;
82734b13184SAlex Elder 		rbd_dev->mapping.features = rbd_dev->header.features;
828e86924a8SAlex Elder 		ret = 0;
829602adf40SYehuda Sadeh 	} else {
8300d7dbfceSAlex Elder 		ret = snap_by_name(rbd_dev, rbd_dev->spec->snap_name);
831602adf40SYehuda Sadeh 		if (ret < 0)
832602adf40SYehuda Sadeh 			goto done;
833f84344f3SAlex Elder 		rbd_dev->mapping.read_only = true;
834602adf40SYehuda Sadeh 	}
8356d292906SAlex Elder 	set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
8366d292906SAlex Elder 
837602adf40SYehuda Sadeh done:
838602adf40SYehuda Sadeh 	return ret;
839602adf40SYehuda Sadeh }
840602adf40SYehuda Sadeh 
841602adf40SYehuda Sadeh static void rbd_header_free(struct rbd_image_header *header)
842602adf40SYehuda Sadeh {
843849b4260SAlex Elder 	kfree(header->object_prefix);
844d78fd7aeSAlex Elder 	header->object_prefix = NULL;
845602adf40SYehuda Sadeh 	kfree(header->snap_sizes);
846d78fd7aeSAlex Elder 	header->snap_sizes = NULL;
847849b4260SAlex Elder 	kfree(header->snap_names);
848d78fd7aeSAlex Elder 	header->snap_names = NULL;
849d1d25646SJosh Durgin 	ceph_put_snap_context(header->snapc);
850d78fd7aeSAlex Elder 	header->snapc = NULL;
851602adf40SYehuda Sadeh }
852602adf40SYehuda Sadeh 
85398571b5aSAlex Elder static const char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
854602adf40SYehuda Sadeh {
85565ccfe21SAlex Elder 	char *name;
85665ccfe21SAlex Elder 	u64 segment;
85765ccfe21SAlex Elder 	int ret;
858602adf40SYehuda Sadeh 
8592fd82b9eSAlex Elder 	name = kmalloc(MAX_OBJ_NAME_SIZE + 1, GFP_NOIO);
86065ccfe21SAlex Elder 	if (!name)
86165ccfe21SAlex Elder 		return NULL;
86265ccfe21SAlex Elder 	segment = offset >> rbd_dev->header.obj_order;
8632fd82b9eSAlex Elder 	ret = snprintf(name, MAX_OBJ_NAME_SIZE + 1, "%s.%012llx",
86465ccfe21SAlex Elder 			rbd_dev->header.object_prefix, segment);
8652fd82b9eSAlex Elder 	if (ret < 0 || ret > MAX_OBJ_NAME_SIZE) {
86665ccfe21SAlex Elder 		pr_err("error formatting segment name for #%llu (%d)\n",
86765ccfe21SAlex Elder 			segment, ret);
86865ccfe21SAlex Elder 		kfree(name);
86965ccfe21SAlex Elder 		name = NULL;
87065ccfe21SAlex Elder 	}
871602adf40SYehuda Sadeh 
87265ccfe21SAlex Elder 	return name;
87365ccfe21SAlex Elder }
874602adf40SYehuda Sadeh 
87565ccfe21SAlex Elder static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
87665ccfe21SAlex Elder {
87765ccfe21SAlex Elder 	u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
878602adf40SYehuda Sadeh 
87965ccfe21SAlex Elder 	return offset & (segment_size - 1);
88065ccfe21SAlex Elder }
88165ccfe21SAlex Elder 
88265ccfe21SAlex Elder static u64 rbd_segment_length(struct rbd_device *rbd_dev,
88365ccfe21SAlex Elder 				u64 offset, u64 length)
88465ccfe21SAlex Elder {
88565ccfe21SAlex Elder 	u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
88665ccfe21SAlex Elder 
88765ccfe21SAlex Elder 	offset &= segment_size - 1;
88865ccfe21SAlex Elder 
889aafb230eSAlex Elder 	rbd_assert(length <= U64_MAX - offset);
89065ccfe21SAlex Elder 	if (offset + length > segment_size)
89165ccfe21SAlex Elder 		length = segment_size - offset;
89265ccfe21SAlex Elder 
89365ccfe21SAlex Elder 	return length;
894602adf40SYehuda Sadeh }
895602adf40SYehuda Sadeh 
896602adf40SYehuda Sadeh /*
897029bcbd8SJosh Durgin  * returns the size of an object in the image
898029bcbd8SJosh Durgin  */
899029bcbd8SJosh Durgin static u64 rbd_obj_bytes(struct rbd_image_header *header)
900029bcbd8SJosh Durgin {
901029bcbd8SJosh Durgin 	return 1 << header->obj_order;
902029bcbd8SJosh Durgin }
903029bcbd8SJosh Durgin 
904029bcbd8SJosh Durgin /*
905602adf40SYehuda Sadeh  * bio helpers
906602adf40SYehuda Sadeh  */
907602adf40SYehuda Sadeh 
908602adf40SYehuda Sadeh static void bio_chain_put(struct bio *chain)
909602adf40SYehuda Sadeh {
910602adf40SYehuda Sadeh 	struct bio *tmp;
911602adf40SYehuda Sadeh 
912602adf40SYehuda Sadeh 	while (chain) {
913602adf40SYehuda Sadeh 		tmp = chain;
914602adf40SYehuda Sadeh 		chain = chain->bi_next;
915602adf40SYehuda Sadeh 		bio_put(tmp);
916602adf40SYehuda Sadeh 	}
917602adf40SYehuda Sadeh }
918602adf40SYehuda Sadeh 
919602adf40SYehuda Sadeh /*
920602adf40SYehuda Sadeh  * zeros a bio chain, starting at specific offset
921602adf40SYehuda Sadeh  */
922602adf40SYehuda Sadeh static void zero_bio_chain(struct bio *chain, int start_ofs)
923602adf40SYehuda Sadeh {
924602adf40SYehuda Sadeh 	struct bio_vec *bv;
925602adf40SYehuda Sadeh 	unsigned long flags;
926602adf40SYehuda Sadeh 	void *buf;
927602adf40SYehuda Sadeh 	int i;
928602adf40SYehuda Sadeh 	int pos = 0;
929602adf40SYehuda Sadeh 
930602adf40SYehuda Sadeh 	while (chain) {
931602adf40SYehuda Sadeh 		bio_for_each_segment(bv, chain, i) {
932602adf40SYehuda Sadeh 			if (pos + bv->bv_len > start_ofs) {
933602adf40SYehuda Sadeh 				int remainder = max(start_ofs - pos, 0);
934602adf40SYehuda Sadeh 				buf = bvec_kmap_irq(bv, &flags);
935602adf40SYehuda Sadeh 				memset(buf + remainder, 0,
936602adf40SYehuda Sadeh 				       bv->bv_len - remainder);
93785b5aaa6SDan Carpenter 				bvec_kunmap_irq(buf, &flags);
938602adf40SYehuda Sadeh 			}
939602adf40SYehuda Sadeh 			pos += bv->bv_len;
940602adf40SYehuda Sadeh 		}
941602adf40SYehuda Sadeh 
942602adf40SYehuda Sadeh 		chain = chain->bi_next;
943602adf40SYehuda Sadeh 	}
944602adf40SYehuda Sadeh }
945602adf40SYehuda Sadeh 
946602adf40SYehuda Sadeh /*
947f7760dadSAlex Elder  * Clone a portion of a bio, starting at the given byte offset
948f7760dadSAlex Elder  * and continuing for the number of bytes indicated.
949602adf40SYehuda Sadeh  */
950f7760dadSAlex Elder static struct bio *bio_clone_range(struct bio *bio_src,
951f7760dadSAlex Elder 					unsigned int offset,
952f7760dadSAlex Elder 					unsigned int len,
953f7760dadSAlex Elder 					gfp_t gfpmask)
954602adf40SYehuda Sadeh {
955f7760dadSAlex Elder 	struct bio_vec *bv;
956f7760dadSAlex Elder 	unsigned int resid;
957f7760dadSAlex Elder 	unsigned short idx;
958f7760dadSAlex Elder 	unsigned int voff;
959f7760dadSAlex Elder 	unsigned short end_idx;
960f7760dadSAlex Elder 	unsigned short vcnt;
961f7760dadSAlex Elder 	struct bio *bio;
962602adf40SYehuda Sadeh 
963f7760dadSAlex Elder 	/* Handle the easy case for the caller */
964f7760dadSAlex Elder 
965f7760dadSAlex Elder 	if (!offset && len == bio_src->bi_size)
966f7760dadSAlex Elder 		return bio_clone(bio_src, gfpmask);
967f7760dadSAlex Elder 
968f7760dadSAlex Elder 	if (WARN_ON_ONCE(!len))
969f7760dadSAlex Elder 		return NULL;
970f7760dadSAlex Elder 	if (WARN_ON_ONCE(len > bio_src->bi_size))
971f7760dadSAlex Elder 		return NULL;
972f7760dadSAlex Elder 	if (WARN_ON_ONCE(offset > bio_src->bi_size - len))
973f7760dadSAlex Elder 		return NULL;
974f7760dadSAlex Elder 
975f7760dadSAlex Elder 	/* Find first affected segment... */
976f7760dadSAlex Elder 
977f7760dadSAlex Elder 	resid = offset;
978f7760dadSAlex Elder 	__bio_for_each_segment(bv, bio_src, idx, 0) {
979f7760dadSAlex Elder 		if (resid < bv->bv_len)
980f7760dadSAlex Elder 			break;
981f7760dadSAlex Elder 		resid -= bv->bv_len;
982602adf40SYehuda Sadeh 	}
983f7760dadSAlex Elder 	voff = resid;
984602adf40SYehuda Sadeh 
985f7760dadSAlex Elder 	/* ...and the last affected segment */
986542582fcSAlex Elder 
987f7760dadSAlex Elder 	resid += len;
988f7760dadSAlex Elder 	__bio_for_each_segment(bv, bio_src, end_idx, idx) {
989f7760dadSAlex Elder 		if (resid <= bv->bv_len)
990f7760dadSAlex Elder 			break;
991f7760dadSAlex Elder 		resid -= bv->bv_len;
992f7760dadSAlex Elder 	}
993f7760dadSAlex Elder 	vcnt = end_idx - idx + 1;
994602adf40SYehuda Sadeh 
995f7760dadSAlex Elder 	/* Build the clone */
996f7760dadSAlex Elder 
997f7760dadSAlex Elder 	bio = bio_alloc(gfpmask, (unsigned int) vcnt);
998f7760dadSAlex Elder 	if (!bio)
999f7760dadSAlex Elder 		return NULL;	/* ENOMEM */
1000f7760dadSAlex Elder 
1001f7760dadSAlex Elder 	bio->bi_bdev = bio_src->bi_bdev;
1002f7760dadSAlex Elder 	bio->bi_sector = bio_src->bi_sector + (offset >> SECTOR_SHIFT);
1003f7760dadSAlex Elder 	bio->bi_rw = bio_src->bi_rw;
1004f7760dadSAlex Elder 	bio->bi_flags |= 1 << BIO_CLONED;
1005602adf40SYehuda Sadeh 
1006602adf40SYehuda Sadeh 	/*
1007f7760dadSAlex Elder 	 * Copy over our part of the bio_vec, then update the first
1008f7760dadSAlex Elder 	 * and last (or only) entries.
1009602adf40SYehuda Sadeh 	 */
1010f7760dadSAlex Elder 	memcpy(&bio->bi_io_vec[0], &bio_src->bi_io_vec[idx],
1011f7760dadSAlex Elder 			vcnt * sizeof (struct bio_vec));
1012f7760dadSAlex Elder 	bio->bi_io_vec[0].bv_offset += voff;
1013f7760dadSAlex Elder 	if (vcnt > 1) {
1014f7760dadSAlex Elder 		bio->bi_io_vec[0].bv_len -= voff;
1015f7760dadSAlex Elder 		bio->bi_io_vec[vcnt - 1].bv_len = resid;
1016602adf40SYehuda Sadeh 	} else {
1017f7760dadSAlex Elder 		bio->bi_io_vec[0].bv_len = len;
1018602adf40SYehuda Sadeh 	}
1019602adf40SYehuda Sadeh 
1020f7760dadSAlex Elder 	bio->bi_vcnt = vcnt;
1021f7760dadSAlex Elder 	bio->bi_size = len;
1022f7760dadSAlex Elder 	bio->bi_idx = 0;
1023602adf40SYehuda Sadeh 
1024f7760dadSAlex Elder 	return bio;
1025602adf40SYehuda Sadeh }
1026602adf40SYehuda Sadeh 
1027f7760dadSAlex Elder /*
1028f7760dadSAlex Elder  * Clone a portion of a bio chain, starting at the given byte offset
1029f7760dadSAlex Elder  * into the first bio in the source chain and continuing for the
1030f7760dadSAlex Elder  * number of bytes indicated.  The result is another bio chain of
1031f7760dadSAlex Elder  * exactly the given length, or a null pointer on error.
1032f7760dadSAlex Elder  *
1033f7760dadSAlex Elder  * The bio_src and offset parameters are both in-out.  On entry they
1034f7760dadSAlex Elder  * refer to the first source bio and the offset into that bio where
1035f7760dadSAlex Elder  * the start of data to be cloned is located.
1036f7760dadSAlex Elder  *
1037f7760dadSAlex Elder  * On return, bio_src is updated to refer to the bio in the source
1038f7760dadSAlex Elder  * chain that contains first un-cloned byte, and *offset will
1039f7760dadSAlex Elder  * contain the offset of that byte within that bio.
1040f7760dadSAlex Elder  */
1041f7760dadSAlex Elder static struct bio *bio_chain_clone_range(struct bio **bio_src,
1042f7760dadSAlex Elder 					unsigned int *offset,
1043f7760dadSAlex Elder 					unsigned int len,
1044f7760dadSAlex Elder 					gfp_t gfpmask)
1045f7760dadSAlex Elder {
1046f7760dadSAlex Elder 	struct bio *bi = *bio_src;
1047f7760dadSAlex Elder 	unsigned int off = *offset;
1048f7760dadSAlex Elder 	struct bio *chain = NULL;
1049f7760dadSAlex Elder 	struct bio **end;
1050602adf40SYehuda Sadeh 
1051f7760dadSAlex Elder 	/* Build up a chain of clone bios up to the limit */
1052602adf40SYehuda Sadeh 
1053f7760dadSAlex Elder 	if (!bi || off >= bi->bi_size || !len)
1054f7760dadSAlex Elder 		return NULL;		/* Nothing to clone */
1055602adf40SYehuda Sadeh 
1056f7760dadSAlex Elder 	end = &chain;
1057f7760dadSAlex Elder 	while (len) {
1058f7760dadSAlex Elder 		unsigned int bi_size;
1059f7760dadSAlex Elder 		struct bio *bio;
1060f7760dadSAlex Elder 
1061f5400b7aSAlex Elder 		if (!bi) {
1062f5400b7aSAlex Elder 			rbd_warn(NULL, "bio_chain exhausted with %u left", len);
1063f7760dadSAlex Elder 			goto out_err;	/* EINVAL; ran out of bio's */
1064f5400b7aSAlex Elder 		}
1065f7760dadSAlex Elder 		bi_size = min_t(unsigned int, bi->bi_size - off, len);
1066f7760dadSAlex Elder 		bio = bio_clone_range(bi, off, bi_size, gfpmask);
1067f7760dadSAlex Elder 		if (!bio)
1068f7760dadSAlex Elder 			goto out_err;	/* ENOMEM */
1069f7760dadSAlex Elder 
1070f7760dadSAlex Elder 		*end = bio;
1071f7760dadSAlex Elder 		end = &bio->bi_next;
1072f7760dadSAlex Elder 
1073f7760dadSAlex Elder 		off += bi_size;
1074f7760dadSAlex Elder 		if (off == bi->bi_size) {
1075f7760dadSAlex Elder 			bi = bi->bi_next;
1076f7760dadSAlex Elder 			off = 0;
1077f7760dadSAlex Elder 		}
1078f7760dadSAlex Elder 		len -= bi_size;
1079f7760dadSAlex Elder 	}
1080f7760dadSAlex Elder 	*bio_src = bi;
1081f7760dadSAlex Elder 	*offset = off;
1082f7760dadSAlex Elder 
1083f7760dadSAlex Elder 	return chain;
1084f7760dadSAlex Elder out_err:
1085f7760dadSAlex Elder 	bio_chain_put(chain);
1086f7760dadSAlex Elder 
1087602adf40SYehuda Sadeh 	return NULL;
1088602adf40SYehuda Sadeh }
1089602adf40SYehuda Sadeh 
1090926f9b3fSAlex Elder /*
1091926f9b3fSAlex Elder  * The default/initial value for all object request flags is 0.  For
1092926f9b3fSAlex Elder  * each flag, once its value is set to 1 it is never reset to 0
1093926f9b3fSAlex Elder  * again.
1094926f9b3fSAlex Elder  */
1095926f9b3fSAlex Elder static void obj_request_done_set(struct rbd_obj_request *obj_request)
1096926f9b3fSAlex Elder {
1097926f9b3fSAlex Elder 	if (test_and_set_bit(OBJ_REQ_DONE, &obj_request->flags)) {
1098926f9b3fSAlex Elder 		struct rbd_img_request *img_request = obj_request->img_request;
1099926f9b3fSAlex Elder 		struct rbd_device *rbd_dev;
1100926f9b3fSAlex Elder 
1101926f9b3fSAlex Elder 		rbd_dev = img_request ? img_request->rbd_dev : NULL;
1102926f9b3fSAlex Elder 		rbd_warn(rbd_dev, "obj_request %p already marked done\n",
1103926f9b3fSAlex Elder 			obj_request);
1104926f9b3fSAlex Elder 	}
1105926f9b3fSAlex Elder }
1106926f9b3fSAlex Elder 
1107926f9b3fSAlex Elder static bool obj_request_done_test(struct rbd_obj_request *obj_request)
1108926f9b3fSAlex Elder {
1109926f9b3fSAlex Elder 	smp_mb();
1110926f9b3fSAlex Elder 	return test_bit(OBJ_REQ_DONE, &obj_request->flags) != 0;
1111926f9b3fSAlex Elder }
1112926f9b3fSAlex Elder 
11136365d33aSAlex Elder static void obj_request_img_data_set(struct rbd_obj_request *obj_request)
11146365d33aSAlex Elder {
11156365d33aSAlex Elder 	if (test_and_set_bit(OBJ_REQ_IMG_DATA, &obj_request->flags)) {
11166365d33aSAlex Elder 		struct rbd_img_request *img_request = obj_request->img_request;
11176365d33aSAlex Elder 		struct rbd_device *rbd_dev;
11186365d33aSAlex Elder 
11196365d33aSAlex Elder 		rbd_dev = img_request ? img_request->rbd_dev : NULL;
11206365d33aSAlex Elder 		rbd_warn(rbd_dev, "obj_request %p already marked img_data\n",
11216365d33aSAlex Elder 			obj_request);
11226365d33aSAlex Elder 	}
11236365d33aSAlex Elder }
11246365d33aSAlex Elder 
11256365d33aSAlex Elder static bool obj_request_img_data_test(struct rbd_obj_request *obj_request)
11266365d33aSAlex Elder {
11276365d33aSAlex Elder 	smp_mb();
11286365d33aSAlex Elder 	return test_bit(OBJ_REQ_IMG_DATA, &obj_request->flags) != 0;
11296365d33aSAlex Elder }
11306365d33aSAlex Elder 
1131bf0d5f50SAlex Elder static void rbd_obj_request_get(struct rbd_obj_request *obj_request)
1132bf0d5f50SAlex Elder {
113337206ee5SAlex Elder 	dout("%s: obj %p (was %d)\n", __func__, obj_request,
113437206ee5SAlex Elder 		atomic_read(&obj_request->kref.refcount));
1135bf0d5f50SAlex Elder 	kref_get(&obj_request->kref);
1136bf0d5f50SAlex Elder }
1137bf0d5f50SAlex Elder 
1138bf0d5f50SAlex Elder static void rbd_obj_request_destroy(struct kref *kref);
1139bf0d5f50SAlex Elder static void rbd_obj_request_put(struct rbd_obj_request *obj_request)
1140bf0d5f50SAlex Elder {
1141bf0d5f50SAlex Elder 	rbd_assert(obj_request != NULL);
114237206ee5SAlex Elder 	dout("%s: obj %p (was %d)\n", __func__, obj_request,
114337206ee5SAlex Elder 		atomic_read(&obj_request->kref.refcount));
1144bf0d5f50SAlex Elder 	kref_put(&obj_request->kref, rbd_obj_request_destroy);
1145bf0d5f50SAlex Elder }
1146bf0d5f50SAlex Elder 
1147bf0d5f50SAlex Elder static void rbd_img_request_get(struct rbd_img_request *img_request)
1148bf0d5f50SAlex Elder {
114937206ee5SAlex Elder 	dout("%s: img %p (was %d)\n", __func__, img_request,
115037206ee5SAlex Elder 		atomic_read(&img_request->kref.refcount));
1151bf0d5f50SAlex Elder 	kref_get(&img_request->kref);
1152bf0d5f50SAlex Elder }
1153bf0d5f50SAlex Elder 
1154bf0d5f50SAlex Elder static void rbd_img_request_destroy(struct kref *kref);
1155bf0d5f50SAlex Elder static void rbd_img_request_put(struct rbd_img_request *img_request)
1156bf0d5f50SAlex Elder {
1157bf0d5f50SAlex Elder 	rbd_assert(img_request != NULL);
115837206ee5SAlex Elder 	dout("%s: img %p (was %d)\n", __func__, img_request,
115937206ee5SAlex Elder 		atomic_read(&img_request->kref.refcount));
1160bf0d5f50SAlex Elder 	kref_put(&img_request->kref, rbd_img_request_destroy);
1161bf0d5f50SAlex Elder }
1162bf0d5f50SAlex Elder 
1163bf0d5f50SAlex Elder static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request,
1164bf0d5f50SAlex Elder 					struct rbd_obj_request *obj_request)
1165bf0d5f50SAlex Elder {
116625dcf954SAlex Elder 	rbd_assert(obj_request->img_request == NULL);
116725dcf954SAlex Elder 
1168bf0d5f50SAlex Elder 	rbd_obj_request_get(obj_request);
1169bf0d5f50SAlex Elder 	obj_request->img_request = img_request;
117025dcf954SAlex Elder 	obj_request->which = img_request->obj_request_count;
11716365d33aSAlex Elder 	rbd_assert(!obj_request_img_data_test(obj_request));
11726365d33aSAlex Elder 	obj_request_img_data_set(obj_request);
1173bf0d5f50SAlex Elder 	rbd_assert(obj_request->which != BAD_WHICH);
117425dcf954SAlex Elder 	img_request->obj_request_count++;
117525dcf954SAlex Elder 	list_add_tail(&obj_request->links, &img_request->obj_requests);
117637206ee5SAlex Elder 	dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request,
117737206ee5SAlex Elder 		obj_request->which);
1178bf0d5f50SAlex Elder }
1179bf0d5f50SAlex Elder 
1180bf0d5f50SAlex Elder static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request,
1181bf0d5f50SAlex Elder 					struct rbd_obj_request *obj_request)
1182bf0d5f50SAlex Elder {
1183bf0d5f50SAlex Elder 	rbd_assert(obj_request->which != BAD_WHICH);
118425dcf954SAlex Elder 
118537206ee5SAlex Elder 	dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request,
118637206ee5SAlex Elder 		obj_request->which);
1187bf0d5f50SAlex Elder 	list_del(&obj_request->links);
118825dcf954SAlex Elder 	rbd_assert(img_request->obj_request_count > 0);
118925dcf954SAlex Elder 	img_request->obj_request_count--;
119025dcf954SAlex Elder 	rbd_assert(obj_request->which == img_request->obj_request_count);
119125dcf954SAlex Elder 	obj_request->which = BAD_WHICH;
11926365d33aSAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
1193bf0d5f50SAlex Elder 	rbd_assert(obj_request->img_request == img_request);
1194bf0d5f50SAlex Elder 	obj_request->img_request = NULL;
119525dcf954SAlex Elder 	obj_request->callback = NULL;
1196bf0d5f50SAlex Elder 	rbd_obj_request_put(obj_request);
1197bf0d5f50SAlex Elder }
1198bf0d5f50SAlex Elder 
1199bf0d5f50SAlex Elder static bool obj_request_type_valid(enum obj_request_type type)
1200bf0d5f50SAlex Elder {
1201bf0d5f50SAlex Elder 	switch (type) {
12029969ebc5SAlex Elder 	case OBJ_REQUEST_NODATA:
1203bf0d5f50SAlex Elder 	case OBJ_REQUEST_BIO:
1204788e2df3SAlex Elder 	case OBJ_REQUEST_PAGES:
1205bf0d5f50SAlex Elder 		return true;
1206bf0d5f50SAlex Elder 	default:
1207bf0d5f50SAlex Elder 		return false;
1208bf0d5f50SAlex Elder 	}
1209bf0d5f50SAlex Elder }
1210bf0d5f50SAlex Elder 
1211bf0d5f50SAlex Elder static int rbd_obj_request_submit(struct ceph_osd_client *osdc,
1212bf0d5f50SAlex Elder 				struct rbd_obj_request *obj_request)
1213bf0d5f50SAlex Elder {
121437206ee5SAlex Elder 	dout("%s: osdc %p obj %p\n", __func__, osdc, obj_request);
121537206ee5SAlex Elder 
1216bf0d5f50SAlex Elder 	return ceph_osdc_start_request(osdc, obj_request->osd_req, false);
1217bf0d5f50SAlex Elder }
1218bf0d5f50SAlex Elder 
1219bf0d5f50SAlex Elder static void rbd_img_request_complete(struct rbd_img_request *img_request)
1220bf0d5f50SAlex Elder {
122155f27e09SAlex Elder 
122237206ee5SAlex Elder 	dout("%s: img %p\n", __func__, img_request);
122355f27e09SAlex Elder 
122455f27e09SAlex Elder 	/*
122555f27e09SAlex Elder 	 * If no error occurred, compute the aggregate transfer
122655f27e09SAlex Elder 	 * count for the image request.  We could instead use
122755f27e09SAlex Elder 	 * atomic64_cmpxchg() to update it as each object request
122855f27e09SAlex Elder 	 * completes; not clear which way is better off hand.
122955f27e09SAlex Elder 	 */
123055f27e09SAlex Elder 	if (!img_request->result) {
123155f27e09SAlex Elder 		struct rbd_obj_request *obj_request;
123255f27e09SAlex Elder 		u64 xferred = 0;
123355f27e09SAlex Elder 
123455f27e09SAlex Elder 		for_each_obj_request(img_request, obj_request)
123555f27e09SAlex Elder 			xferred += obj_request->xferred;
123655f27e09SAlex Elder 		img_request->xferred = xferred;
123755f27e09SAlex Elder 	}
123855f27e09SAlex Elder 
1239bf0d5f50SAlex Elder 	if (img_request->callback)
1240bf0d5f50SAlex Elder 		img_request->callback(img_request);
1241bf0d5f50SAlex Elder 	else
1242bf0d5f50SAlex Elder 		rbd_img_request_put(img_request);
1243bf0d5f50SAlex Elder }
1244bf0d5f50SAlex Elder 
1245788e2df3SAlex Elder /* Caller is responsible for rbd_obj_request_destroy(obj_request) */
1246788e2df3SAlex Elder 
1247788e2df3SAlex Elder static int rbd_obj_request_wait(struct rbd_obj_request *obj_request)
1248788e2df3SAlex Elder {
124937206ee5SAlex Elder 	dout("%s: obj %p\n", __func__, obj_request);
125037206ee5SAlex Elder 
1251788e2df3SAlex Elder 	return wait_for_completion_interruptible(&obj_request->completion);
1252788e2df3SAlex Elder }
1253788e2df3SAlex Elder 
12540c425248SAlex Elder /*
12550c425248SAlex Elder  * The default/initial value for all image request flags is 0.  Each
12560c425248SAlex Elder  * is conditionally set to 1 at image request initialization time
12570c425248SAlex Elder  * and currently never change thereafter.
12580c425248SAlex Elder  */
12590c425248SAlex Elder static void img_request_write_set(struct rbd_img_request *img_request)
12600c425248SAlex Elder {
12610c425248SAlex Elder 	set_bit(IMG_REQ_WRITE, &img_request->flags);
12620c425248SAlex Elder 	smp_mb();
12630c425248SAlex Elder }
12640c425248SAlex Elder 
12650c425248SAlex Elder static bool img_request_write_test(struct rbd_img_request *img_request)
12660c425248SAlex Elder {
12670c425248SAlex Elder 	smp_mb();
12680c425248SAlex Elder 	return test_bit(IMG_REQ_WRITE, &img_request->flags) != 0;
12690c425248SAlex Elder }
12700c425248SAlex Elder 
12719849e986SAlex Elder static void img_request_child_set(struct rbd_img_request *img_request)
12729849e986SAlex Elder {
12739849e986SAlex Elder 	set_bit(IMG_REQ_CHILD, &img_request->flags);
12749849e986SAlex Elder 	smp_mb();
12759849e986SAlex Elder }
12769849e986SAlex Elder 
12779849e986SAlex Elder static bool img_request_child_test(struct rbd_img_request *img_request)
12789849e986SAlex Elder {
12799849e986SAlex Elder 	smp_mb();
12809849e986SAlex Elder 	return test_bit(IMG_REQ_CHILD, &img_request->flags) != 0;
12819849e986SAlex Elder }
12829849e986SAlex Elder 
1283d0b2e944SAlex Elder static void img_request_layered_set(struct rbd_img_request *img_request)
1284d0b2e944SAlex Elder {
1285d0b2e944SAlex Elder 	set_bit(IMG_REQ_LAYERED, &img_request->flags);
1286d0b2e944SAlex Elder 	smp_mb();
1287d0b2e944SAlex Elder }
1288d0b2e944SAlex Elder 
1289d0b2e944SAlex Elder static bool img_request_layered_test(struct rbd_img_request *img_request)
1290d0b2e944SAlex Elder {
1291d0b2e944SAlex Elder 	smp_mb();
1292d0b2e944SAlex Elder 	return test_bit(IMG_REQ_LAYERED, &img_request->flags) != 0;
1293d0b2e944SAlex Elder }
1294d0b2e944SAlex Elder 
12956e2a4505SAlex Elder static void
12966e2a4505SAlex Elder rbd_img_obj_request_read_callback(struct rbd_obj_request *obj_request)
12976e2a4505SAlex Elder {
12986e2a4505SAlex Elder 	dout("%s: obj %p img %p result %d %llu/%llu\n", __func__,
12996e2a4505SAlex Elder 		obj_request, obj_request->img_request, obj_request->result,
13006e2a4505SAlex Elder 		obj_request->xferred, obj_request->length);
13016e2a4505SAlex Elder 	/*
13026e2a4505SAlex Elder 	 * ENOENT means a hole in the image.  We zero-fill the
13036e2a4505SAlex Elder 	 * entire length of the request.  A short read also implies
13046e2a4505SAlex Elder 	 * zero-fill to the end of the request.  Either way we
13056e2a4505SAlex Elder 	 * update the xferred count to indicate the whole request
13066e2a4505SAlex Elder 	 * was satisfied.
13076e2a4505SAlex Elder 	 */
13086e2a4505SAlex Elder 	BUG_ON(obj_request->type != OBJ_REQUEST_BIO);
13096e2a4505SAlex Elder 	if (obj_request->result == -ENOENT) {
13106e2a4505SAlex Elder 		zero_bio_chain(obj_request->bio_list, 0);
13116e2a4505SAlex Elder 		obj_request->result = 0;
13126e2a4505SAlex Elder 		obj_request->xferred = obj_request->length;
13136e2a4505SAlex Elder 	} else if (obj_request->xferred < obj_request->length &&
13146e2a4505SAlex Elder 			!obj_request->result) {
13156e2a4505SAlex Elder 		zero_bio_chain(obj_request->bio_list, obj_request->xferred);
13166e2a4505SAlex Elder 		obj_request->xferred = obj_request->length;
13176e2a4505SAlex Elder 	}
13186e2a4505SAlex Elder 	obj_request_done_set(obj_request);
13196e2a4505SAlex Elder }
13206e2a4505SAlex Elder 
1321bf0d5f50SAlex Elder static void rbd_obj_request_complete(struct rbd_obj_request *obj_request)
1322bf0d5f50SAlex Elder {
132337206ee5SAlex Elder 	dout("%s: obj %p cb %p\n", __func__, obj_request,
132437206ee5SAlex Elder 		obj_request->callback);
1325bf0d5f50SAlex Elder 	if (obj_request->callback)
1326bf0d5f50SAlex Elder 		obj_request->callback(obj_request);
1327788e2df3SAlex Elder 	else
1328788e2df3SAlex Elder 		complete_all(&obj_request->completion);
1329bf0d5f50SAlex Elder }
1330bf0d5f50SAlex Elder 
/*
 * Completion handling for ops that need no post-processing: the
 * object request is simply marked done.
 */
static void rbd_osd_trivial_callback(struct rbd_obj_request *obj_request)
{
	dout("%s: obj %p\n", __func__, obj_request);

	obj_request_done_set(obj_request);
}
133639bf2c5dSAlex Elder 
1337c47f9371SAlex Elder static void rbd_osd_read_callback(struct rbd_obj_request *obj_request)
1338bf0d5f50SAlex Elder {
133937206ee5SAlex Elder 	dout("%s: obj %p result %d %llu/%llu\n", __func__, obj_request,
1340c47f9371SAlex Elder 		obj_request->result, obj_request->xferred, obj_request->length);
13416e2a4505SAlex Elder 	if (obj_request->img_request)
13426e2a4505SAlex Elder 		rbd_img_obj_request_read_callback(obj_request);
13436e2a4505SAlex Elder 	else
134407741308SAlex Elder 		obj_request_done_set(obj_request);
1345bf0d5f50SAlex Elder }
1346bf0d5f50SAlex Elder 
1347c47f9371SAlex Elder static void rbd_osd_write_callback(struct rbd_obj_request *obj_request)
1348bf0d5f50SAlex Elder {
13491b83bef2SSage Weil 	dout("%s: obj %p result %d %llu\n", __func__, obj_request,
13501b83bef2SSage Weil 		obj_request->result, obj_request->length);
13511b83bef2SSage Weil 	/*
13521b83bef2SSage Weil 	 * There is no such thing as a successful short write.
13531b83bef2SSage Weil 	 * Our xferred value is the number of bytes transferred
13541b83bef2SSage Weil 	 * back.  Set it to our originally-requested length.
13551b83bef2SSage Weil 	 */
13561b83bef2SSage Weil 	obj_request->xferred = obj_request->length;
135707741308SAlex Elder 	obj_request_done_set(obj_request);
1358bf0d5f50SAlex Elder }
1359bf0d5f50SAlex Elder 
/*
 * Completion handling for a stat op.  A plain stat needs nothing
 * beyond marking the object request done.  More work is expected
 * here once stat becomes part of the write sequence for a layered
 * image.
 */
static void rbd_osd_stat_callback(struct rbd_obj_request *obj_request)
{
	dout("%s: obj %p\n", __func__, obj_request);

	obj_request_done_set(obj_request);
}
1369fbfab539SAlex Elder 
1370bf0d5f50SAlex Elder static void rbd_osd_req_callback(struct ceph_osd_request *osd_req,
1371bf0d5f50SAlex Elder 				struct ceph_msg *msg)
1372bf0d5f50SAlex Elder {
1373bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request = osd_req->r_priv;
1374bf0d5f50SAlex Elder 	u16 opcode;
1375bf0d5f50SAlex Elder 
137637206ee5SAlex Elder 	dout("%s: osd_req %p msg %p\n", __func__, osd_req, msg);
1377bf0d5f50SAlex Elder 	rbd_assert(osd_req == obj_request->osd_req);
13786365d33aSAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request) ^
13796365d33aSAlex Elder 				!obj_request->img_request);
13806365d33aSAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request) ^
1381bf0d5f50SAlex Elder 				(obj_request->which == BAD_WHICH));
1382bf0d5f50SAlex Elder 
13831b83bef2SSage Weil 	if (osd_req->r_result < 0)
13841b83bef2SSage Weil 		obj_request->result = osd_req->r_result;
1385bf0d5f50SAlex Elder 	obj_request->version = le64_to_cpu(osd_req->r_reassert_version.version);
1386bf0d5f50SAlex Elder 
13871b83bef2SSage Weil 	WARN_ON(osd_req->r_num_ops != 1);	/* For now */
1388bf0d5f50SAlex Elder 
1389c47f9371SAlex Elder 	/*
1390c47f9371SAlex Elder 	 * We support a 64-bit length, but ultimately it has to be
1391c47f9371SAlex Elder 	 * passed to blk_end_request(), which takes an unsigned int.
1392c47f9371SAlex Elder 	 */
13931b83bef2SSage Weil 	obj_request->xferred = osd_req->r_reply_op_len[0];
1394c47f9371SAlex Elder 	rbd_assert(obj_request->xferred < (u64) UINT_MAX);
139579528734SAlex Elder 	opcode = osd_req->r_ops[0].op;
1396bf0d5f50SAlex Elder 	switch (opcode) {
1397bf0d5f50SAlex Elder 	case CEPH_OSD_OP_READ:
1398c47f9371SAlex Elder 		rbd_osd_read_callback(obj_request);
1399bf0d5f50SAlex Elder 		break;
1400bf0d5f50SAlex Elder 	case CEPH_OSD_OP_WRITE:
1401c47f9371SAlex Elder 		rbd_osd_write_callback(obj_request);
1402bf0d5f50SAlex Elder 		break;
1403fbfab539SAlex Elder 	case CEPH_OSD_OP_STAT:
1404c47f9371SAlex Elder 		rbd_osd_stat_callback(obj_request);
1405fbfab539SAlex Elder 		break;
140636be9a76SAlex Elder 	case CEPH_OSD_OP_CALL:
1407b8d70035SAlex Elder 	case CEPH_OSD_OP_NOTIFY_ACK:
14089969ebc5SAlex Elder 	case CEPH_OSD_OP_WATCH:
1409c47f9371SAlex Elder 		rbd_osd_trivial_callback(obj_request);
14109969ebc5SAlex Elder 		break;
1411bf0d5f50SAlex Elder 	default:
1412bf0d5f50SAlex Elder 		rbd_warn(NULL, "%s: unsupported op %hu\n",
1413bf0d5f50SAlex Elder 			obj_request->object_name, (unsigned short) opcode);
1414bf0d5f50SAlex Elder 		break;
1415bf0d5f50SAlex Elder 	}
1416bf0d5f50SAlex Elder 
141707741308SAlex Elder 	if (obj_request_done_test(obj_request))
1418bf0d5f50SAlex Elder 		rbd_obj_request_complete(obj_request);
1419bf0d5f50SAlex Elder }
1420bf0d5f50SAlex Elder 
14212fa12320SAlex Elder static void rbd_osd_req_format(struct rbd_obj_request *obj_request,
142279528734SAlex Elder 					bool write_request)
1423430c28c3SAlex Elder {
1424430c28c3SAlex Elder 	struct rbd_img_request *img_request = obj_request->img_request;
14258c042b0dSAlex Elder 	struct ceph_osd_request *osd_req = obj_request->osd_req;
1426430c28c3SAlex Elder 	struct ceph_snap_context *snapc = NULL;
1427430c28c3SAlex Elder 	u64 snap_id = CEPH_NOSNAP;
1428430c28c3SAlex Elder 	struct timespec *mtime = NULL;
1429430c28c3SAlex Elder 	struct timespec now;
1430430c28c3SAlex Elder 
14318c042b0dSAlex Elder 	rbd_assert(osd_req != NULL);
1432430c28c3SAlex Elder 
1433430c28c3SAlex Elder 	if (write_request) {
1434430c28c3SAlex Elder 		now = CURRENT_TIME;
1435430c28c3SAlex Elder 		mtime = &now;
1436430c28c3SAlex Elder 		if (img_request)
1437430c28c3SAlex Elder 			snapc = img_request->snapc;
14382fa12320SAlex Elder 	} else if (img_request) {
1439430c28c3SAlex Elder 		snap_id = img_request->snap_id;
1440430c28c3SAlex Elder 	}
14418c042b0dSAlex Elder 	ceph_osdc_build_request(osd_req, obj_request->offset,
144279528734SAlex Elder 			snapc, snap_id, mtime);
1443430c28c3SAlex Elder }
1444430c28c3SAlex Elder 
1445bf0d5f50SAlex Elder static struct ceph_osd_request *rbd_osd_req_create(
1446bf0d5f50SAlex Elder 					struct rbd_device *rbd_dev,
1447bf0d5f50SAlex Elder 					bool write_request,
1448430c28c3SAlex Elder 					struct rbd_obj_request *obj_request)
1449bf0d5f50SAlex Elder {
1450bf0d5f50SAlex Elder 	struct ceph_snap_context *snapc = NULL;
1451bf0d5f50SAlex Elder 	struct ceph_osd_client *osdc;
1452bf0d5f50SAlex Elder 	struct ceph_osd_request *osd_req;
1453bf0d5f50SAlex Elder 
14546365d33aSAlex Elder 	if (obj_request_img_data_test(obj_request)) {
14556365d33aSAlex Elder 		struct rbd_img_request *img_request = obj_request->img_request;
14566365d33aSAlex Elder 
14570c425248SAlex Elder 		rbd_assert(write_request ==
14580c425248SAlex Elder 				img_request_write_test(img_request));
14590c425248SAlex Elder 		if (write_request)
1460bf0d5f50SAlex Elder 			snapc = img_request->snapc;
1461bf0d5f50SAlex Elder 	}
1462bf0d5f50SAlex Elder 
1463bf0d5f50SAlex Elder 	/* Allocate and initialize the request, for the single op */
1464bf0d5f50SAlex Elder 
1465bf0d5f50SAlex Elder 	osdc = &rbd_dev->rbd_client->client->osdc;
1466bf0d5f50SAlex Elder 	osd_req = ceph_osdc_alloc_request(osdc, snapc, 1, false, GFP_ATOMIC);
1467bf0d5f50SAlex Elder 	if (!osd_req)
1468bf0d5f50SAlex Elder 		return NULL;	/* ENOMEM */
1469bf0d5f50SAlex Elder 
1470430c28c3SAlex Elder 	if (write_request)
1471bf0d5f50SAlex Elder 		osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
1472430c28c3SAlex Elder 	else
1473bf0d5f50SAlex Elder 		osd_req->r_flags = CEPH_OSD_FLAG_READ;
1474bf0d5f50SAlex Elder 
1475bf0d5f50SAlex Elder 	osd_req->r_callback = rbd_osd_req_callback;
1476bf0d5f50SAlex Elder 	osd_req->r_priv = obj_request;
1477bf0d5f50SAlex Elder 
1478bf0d5f50SAlex Elder 	osd_req->r_oid_len = strlen(obj_request->object_name);
1479bf0d5f50SAlex Elder 	rbd_assert(osd_req->r_oid_len < sizeof (osd_req->r_oid));
1480bf0d5f50SAlex Elder 	memcpy(osd_req->r_oid, obj_request->object_name, osd_req->r_oid_len);
1481bf0d5f50SAlex Elder 
1482bf0d5f50SAlex Elder 	osd_req->r_file_layout = rbd_dev->layout;	/* struct */
1483bf0d5f50SAlex Elder 
1484bf0d5f50SAlex Elder 	return osd_req;
1485bf0d5f50SAlex Elder }
1486bf0d5f50SAlex Elder 
/* Drop a reference to an osd request created by rbd_osd_req_create() */
static void rbd_osd_req_destroy(struct ceph_osd_request *osd_req)
{
	ceph_osdc_put_request(osd_req);
}
1491bf0d5f50SAlex Elder 
1492bf0d5f50SAlex Elder /* object_name is assumed to be a non-null pointer and NUL-terminated */
1493bf0d5f50SAlex Elder 
1494bf0d5f50SAlex Elder static struct rbd_obj_request *rbd_obj_request_create(const char *object_name,
1495bf0d5f50SAlex Elder 						u64 offset, u64 length,
1496bf0d5f50SAlex Elder 						enum obj_request_type type)
1497bf0d5f50SAlex Elder {
1498bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request;
1499bf0d5f50SAlex Elder 	size_t size;
1500bf0d5f50SAlex Elder 	char *name;
1501bf0d5f50SAlex Elder 
1502bf0d5f50SAlex Elder 	rbd_assert(obj_request_type_valid(type));
1503bf0d5f50SAlex Elder 
1504bf0d5f50SAlex Elder 	size = strlen(object_name) + 1;
1505bf0d5f50SAlex Elder 	obj_request = kzalloc(sizeof (*obj_request) + size, GFP_KERNEL);
1506bf0d5f50SAlex Elder 	if (!obj_request)
1507bf0d5f50SAlex Elder 		return NULL;
1508bf0d5f50SAlex Elder 
1509bf0d5f50SAlex Elder 	name = (char *)(obj_request + 1);
1510bf0d5f50SAlex Elder 	obj_request->object_name = memcpy(name, object_name, size);
1511bf0d5f50SAlex Elder 	obj_request->offset = offset;
1512bf0d5f50SAlex Elder 	obj_request->length = length;
1513926f9b3fSAlex Elder 	obj_request->flags = 0;
1514bf0d5f50SAlex Elder 	obj_request->which = BAD_WHICH;
1515bf0d5f50SAlex Elder 	obj_request->type = type;
1516bf0d5f50SAlex Elder 	INIT_LIST_HEAD(&obj_request->links);
1517788e2df3SAlex Elder 	init_completion(&obj_request->completion);
1518bf0d5f50SAlex Elder 	kref_init(&obj_request->kref);
1519bf0d5f50SAlex Elder 
152037206ee5SAlex Elder 	dout("%s: \"%s\" %llu/%llu %d -> obj %p\n", __func__, object_name,
152137206ee5SAlex Elder 		offset, length, (int)type, obj_request);
152237206ee5SAlex Elder 
1523bf0d5f50SAlex Elder 	return obj_request;
1524bf0d5f50SAlex Elder }
1525bf0d5f50SAlex Elder 
1526bf0d5f50SAlex Elder static void rbd_obj_request_destroy(struct kref *kref)
1527bf0d5f50SAlex Elder {
1528bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request;
1529bf0d5f50SAlex Elder 
1530bf0d5f50SAlex Elder 	obj_request = container_of(kref, struct rbd_obj_request, kref);
1531bf0d5f50SAlex Elder 
153237206ee5SAlex Elder 	dout("%s: obj %p\n", __func__, obj_request);
153337206ee5SAlex Elder 
1534bf0d5f50SAlex Elder 	rbd_assert(obj_request->img_request == NULL);
1535bf0d5f50SAlex Elder 	rbd_assert(obj_request->which == BAD_WHICH);
1536bf0d5f50SAlex Elder 
1537bf0d5f50SAlex Elder 	if (obj_request->osd_req)
1538bf0d5f50SAlex Elder 		rbd_osd_req_destroy(obj_request->osd_req);
1539bf0d5f50SAlex Elder 
1540bf0d5f50SAlex Elder 	rbd_assert(obj_request_type_valid(obj_request->type));
1541bf0d5f50SAlex Elder 	switch (obj_request->type) {
15429969ebc5SAlex Elder 	case OBJ_REQUEST_NODATA:
15439969ebc5SAlex Elder 		break;		/* Nothing to do */
1544bf0d5f50SAlex Elder 	case OBJ_REQUEST_BIO:
1545bf0d5f50SAlex Elder 		if (obj_request->bio_list)
1546bf0d5f50SAlex Elder 			bio_chain_put(obj_request->bio_list);
1547bf0d5f50SAlex Elder 		break;
1548788e2df3SAlex Elder 	case OBJ_REQUEST_PAGES:
1549788e2df3SAlex Elder 		if (obj_request->pages)
1550788e2df3SAlex Elder 			ceph_release_page_vector(obj_request->pages,
1551788e2df3SAlex Elder 						obj_request->page_count);
1552788e2df3SAlex Elder 		break;
1553bf0d5f50SAlex Elder 	}
1554bf0d5f50SAlex Elder 
1555bf0d5f50SAlex Elder 	kfree(obj_request);
1556bf0d5f50SAlex Elder }
1557bf0d5f50SAlex Elder 
1558bf0d5f50SAlex Elder /*
1559bf0d5f50SAlex Elder  * Caller is responsible for filling in the list of object requests
1560bf0d5f50SAlex Elder  * that comprises the image request, and the Linux request pointer
1561bf0d5f50SAlex Elder  * (if there is one).
1562bf0d5f50SAlex Elder  */
1563cc344fa1SAlex Elder static struct rbd_img_request *rbd_img_request_create(
1564cc344fa1SAlex Elder 					struct rbd_device *rbd_dev,
1565bf0d5f50SAlex Elder 					u64 offset, u64 length,
15669849e986SAlex Elder 					bool write_request,
15679849e986SAlex Elder 					bool child_request)
1568bf0d5f50SAlex Elder {
1569bf0d5f50SAlex Elder 	struct rbd_img_request *img_request;
1570bf0d5f50SAlex Elder 	struct ceph_snap_context *snapc = NULL;
1571bf0d5f50SAlex Elder 
1572bf0d5f50SAlex Elder 	img_request = kmalloc(sizeof (*img_request), GFP_ATOMIC);
1573bf0d5f50SAlex Elder 	if (!img_request)
1574bf0d5f50SAlex Elder 		return NULL;
1575bf0d5f50SAlex Elder 
1576bf0d5f50SAlex Elder 	if (write_request) {
1577bf0d5f50SAlex Elder 		down_read(&rbd_dev->header_rwsem);
1578bf0d5f50SAlex Elder 		snapc = ceph_get_snap_context(rbd_dev->header.snapc);
1579bf0d5f50SAlex Elder 		up_read(&rbd_dev->header_rwsem);
1580bf0d5f50SAlex Elder 		if (WARN_ON(!snapc)) {
1581bf0d5f50SAlex Elder 			kfree(img_request);
1582bf0d5f50SAlex Elder 			return NULL;	/* Shouldn't happen */
1583bf0d5f50SAlex Elder 		}
15840c425248SAlex Elder 
1585bf0d5f50SAlex Elder 	}
1586bf0d5f50SAlex Elder 
1587bf0d5f50SAlex Elder 	img_request->rq = NULL;
1588bf0d5f50SAlex Elder 	img_request->rbd_dev = rbd_dev;
1589bf0d5f50SAlex Elder 	img_request->offset = offset;
1590bf0d5f50SAlex Elder 	img_request->length = length;
15910c425248SAlex Elder 	img_request->flags = 0;
15920c425248SAlex Elder 	if (write_request) {
15930c425248SAlex Elder 		img_request_write_set(img_request);
1594bf0d5f50SAlex Elder 		img_request->snapc = snapc;
15950c425248SAlex Elder 	} else {
1596bf0d5f50SAlex Elder 		img_request->snap_id = rbd_dev->spec->snap_id;
15970c425248SAlex Elder 	}
15989849e986SAlex Elder 	if (child_request)
15999849e986SAlex Elder 		img_request_child_set(img_request);
1600d0b2e944SAlex Elder 	if (rbd_dev->parent_spec)
1601d0b2e944SAlex Elder 		img_request_layered_set(img_request);
1602bf0d5f50SAlex Elder 	spin_lock_init(&img_request->completion_lock);
1603bf0d5f50SAlex Elder 	img_request->next_completion = 0;
1604bf0d5f50SAlex Elder 	img_request->callback = NULL;
1605a5a337d4SAlex Elder 	img_request->result = 0;
1606bf0d5f50SAlex Elder 	img_request->obj_request_count = 0;
1607bf0d5f50SAlex Elder 	INIT_LIST_HEAD(&img_request->obj_requests);
1608bf0d5f50SAlex Elder 	kref_init(&img_request->kref);
1609bf0d5f50SAlex Elder 
1610d0b2e944SAlex Elder 	(void) img_request_layered_test(img_request);	/* Avoid a warning */
1611bf0d5f50SAlex Elder 	rbd_img_request_get(img_request);	/* Avoid a warning */
1612bf0d5f50SAlex Elder 	rbd_img_request_put(img_request);	/* TEMPORARY */
1613bf0d5f50SAlex Elder 
161437206ee5SAlex Elder 	dout("%s: rbd_dev %p %s %llu/%llu -> img %p\n", __func__, rbd_dev,
161537206ee5SAlex Elder 		write_request ? "write" : "read", offset, length,
161637206ee5SAlex Elder 		img_request);
161737206ee5SAlex Elder 
1618bf0d5f50SAlex Elder 	return img_request;
1619bf0d5f50SAlex Elder }
1620bf0d5f50SAlex Elder 
1621bf0d5f50SAlex Elder static void rbd_img_request_destroy(struct kref *kref)
1622bf0d5f50SAlex Elder {
1623bf0d5f50SAlex Elder 	struct rbd_img_request *img_request;
1624bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request;
1625bf0d5f50SAlex Elder 	struct rbd_obj_request *next_obj_request;
1626bf0d5f50SAlex Elder 
1627bf0d5f50SAlex Elder 	img_request = container_of(kref, struct rbd_img_request, kref);
1628bf0d5f50SAlex Elder 
162937206ee5SAlex Elder 	dout("%s: img %p\n", __func__, img_request);
163037206ee5SAlex Elder 
1631bf0d5f50SAlex Elder 	for_each_obj_request_safe(img_request, obj_request, next_obj_request)
1632bf0d5f50SAlex Elder 		rbd_img_obj_request_del(img_request, obj_request);
163325dcf954SAlex Elder 	rbd_assert(img_request->obj_request_count == 0);
1634bf0d5f50SAlex Elder 
16350c425248SAlex Elder 	if (img_request_write_test(img_request))
1636bf0d5f50SAlex Elder 		ceph_put_snap_context(img_request->snapc);
1637bf0d5f50SAlex Elder 
1638bf0d5f50SAlex Elder 	kfree(img_request);
1639bf0d5f50SAlex Elder }
1640bf0d5f50SAlex Elder 
16411217857fSAlex Elder static bool rbd_img_obj_end_request(struct rbd_obj_request *obj_request)
16421217857fSAlex Elder {
16436365d33aSAlex Elder 	struct rbd_img_request *img_request;
16441217857fSAlex Elder 	unsigned int xferred;
16451217857fSAlex Elder 	int result;
16461217857fSAlex Elder 
16476365d33aSAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
16486365d33aSAlex Elder 	img_request = obj_request->img_request;
16496365d33aSAlex Elder 
16501217857fSAlex Elder 	rbd_assert(!img_request_child_test(img_request));
16511217857fSAlex Elder 	rbd_assert(img_request->rq != NULL);
16521217857fSAlex Elder 
16531217857fSAlex Elder 	rbd_assert(obj_request->xferred <= (u64)UINT_MAX);
16541217857fSAlex Elder 	xferred = (unsigned int)obj_request->xferred;
16551217857fSAlex Elder 	result = obj_request->result;
16561217857fSAlex Elder 	if (result) {
16571217857fSAlex Elder 		struct rbd_device *rbd_dev = img_request->rbd_dev;
16581217857fSAlex Elder 
16591217857fSAlex Elder 		rbd_warn(rbd_dev, "%s %llx at %llx (%llx)\n",
16601217857fSAlex Elder 			img_request_write_test(img_request) ? "write" : "read",
16611217857fSAlex Elder 			obj_request->length, obj_request->img_offset,
16621217857fSAlex Elder 			obj_request->offset);
16631217857fSAlex Elder 		rbd_warn(rbd_dev, "  result %d xferred %x\n",
16641217857fSAlex Elder 			result, xferred);
16651217857fSAlex Elder 		if (!img_request->result)
16661217857fSAlex Elder 			img_request->result = result;
16671217857fSAlex Elder 	}
16681217857fSAlex Elder 
16691217857fSAlex Elder 	return blk_end_request(img_request->rq, result, xferred);
16701217857fSAlex Elder }
16711217857fSAlex Elder 
16722169238dSAlex Elder static void rbd_img_obj_callback(struct rbd_obj_request *obj_request)
16732169238dSAlex Elder {
16742169238dSAlex Elder 	struct rbd_img_request *img_request;
16752169238dSAlex Elder 	u32 which = obj_request->which;
16762169238dSAlex Elder 	bool more = true;
16772169238dSAlex Elder 
16786365d33aSAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
16792169238dSAlex Elder 	img_request = obj_request->img_request;
16802169238dSAlex Elder 
16812169238dSAlex Elder 	dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
16822169238dSAlex Elder 	rbd_assert(img_request != NULL);
16832169238dSAlex Elder 	rbd_assert(img_request->obj_request_count > 0);
16842169238dSAlex Elder 	rbd_assert(which != BAD_WHICH);
16852169238dSAlex Elder 	rbd_assert(which < img_request->obj_request_count);
16862169238dSAlex Elder 	rbd_assert(which >= img_request->next_completion);
16872169238dSAlex Elder 
16882169238dSAlex Elder 	spin_lock_irq(&img_request->completion_lock);
16892169238dSAlex Elder 	if (which != img_request->next_completion)
16902169238dSAlex Elder 		goto out;
16912169238dSAlex Elder 
16922169238dSAlex Elder 	for_each_obj_request_from(img_request, obj_request) {
16932169238dSAlex Elder 		rbd_assert(more);
16942169238dSAlex Elder 		rbd_assert(which < img_request->obj_request_count);
16952169238dSAlex Elder 
16962169238dSAlex Elder 		if (!obj_request_done_test(obj_request))
16972169238dSAlex Elder 			break;
16981217857fSAlex Elder 		more = rbd_img_obj_end_request(obj_request);
16992169238dSAlex Elder 		which++;
17002169238dSAlex Elder 	}
17012169238dSAlex Elder 
17022169238dSAlex Elder 	rbd_assert(more ^ (which == img_request->obj_request_count));
17032169238dSAlex Elder 	img_request->next_completion = which;
17042169238dSAlex Elder out:
17052169238dSAlex Elder 	spin_unlock_irq(&img_request->completion_lock);
17062169238dSAlex Elder 
17072169238dSAlex Elder 	if (!more)
17082169238dSAlex Elder 		rbd_img_request_complete(img_request);
17092169238dSAlex Elder }
17102169238dSAlex Elder 
1711bf0d5f50SAlex Elder static int rbd_img_request_fill_bio(struct rbd_img_request *img_request,
1712bf0d5f50SAlex Elder 					struct bio *bio_list)
1713bf0d5f50SAlex Elder {
1714bf0d5f50SAlex Elder 	struct rbd_device *rbd_dev = img_request->rbd_dev;
1715bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request = NULL;
1716bf0d5f50SAlex Elder 	struct rbd_obj_request *next_obj_request;
17170c425248SAlex Elder 	bool write_request = img_request_write_test(img_request);
1718bf0d5f50SAlex Elder 	unsigned int bio_offset;
17197da22d29SAlex Elder 	u64 img_offset;
1720bf0d5f50SAlex Elder 	u64 resid;
1721bf0d5f50SAlex Elder 	u16 opcode;
1722bf0d5f50SAlex Elder 
172337206ee5SAlex Elder 	dout("%s: img %p bio %p\n", __func__, img_request, bio_list);
172437206ee5SAlex Elder 
1725430c28c3SAlex Elder 	opcode = write_request ? CEPH_OSD_OP_WRITE : CEPH_OSD_OP_READ;
1726bf0d5f50SAlex Elder 	bio_offset = 0;
17277da22d29SAlex Elder 	img_offset = img_request->offset;
17287da22d29SAlex Elder 	rbd_assert(img_offset == bio_list->bi_sector << SECTOR_SHIFT);
1729bf0d5f50SAlex Elder 	resid = img_request->length;
17304dda41d3SAlex Elder 	rbd_assert(resid > 0);
1731bf0d5f50SAlex Elder 	while (resid) {
17322fa12320SAlex Elder 		struct ceph_osd_request *osd_req;
1733bf0d5f50SAlex Elder 		const char *object_name;
1734bf0d5f50SAlex Elder 		unsigned int clone_size;
1735bf0d5f50SAlex Elder 		u64 offset;
1736bf0d5f50SAlex Elder 		u64 length;
1737bf0d5f50SAlex Elder 
17387da22d29SAlex Elder 		object_name = rbd_segment_name(rbd_dev, img_offset);
1739bf0d5f50SAlex Elder 		if (!object_name)
1740bf0d5f50SAlex Elder 			goto out_unwind;
17417da22d29SAlex Elder 		offset = rbd_segment_offset(rbd_dev, img_offset);
17427da22d29SAlex Elder 		length = rbd_segment_length(rbd_dev, img_offset, resid);
1743bf0d5f50SAlex Elder 		obj_request = rbd_obj_request_create(object_name,
1744bf0d5f50SAlex Elder 						offset, length,
1745bf0d5f50SAlex Elder 						OBJ_REQUEST_BIO);
1746bf0d5f50SAlex Elder 		kfree(object_name);	/* object request has its own copy */
1747bf0d5f50SAlex Elder 		if (!obj_request)
1748bf0d5f50SAlex Elder 			goto out_unwind;
1749bf0d5f50SAlex Elder 
1750bf0d5f50SAlex Elder 		rbd_assert(length <= (u64) UINT_MAX);
1751bf0d5f50SAlex Elder 		clone_size = (unsigned int) length;
1752bf0d5f50SAlex Elder 		obj_request->bio_list = bio_chain_clone_range(&bio_list,
1753bf0d5f50SAlex Elder 						&bio_offset, clone_size,
1754bf0d5f50SAlex Elder 						GFP_ATOMIC);
1755bf0d5f50SAlex Elder 		if (!obj_request->bio_list)
1756bf0d5f50SAlex Elder 			goto out_partial;
1757bf0d5f50SAlex Elder 
17582fa12320SAlex Elder 		osd_req = rbd_osd_req_create(rbd_dev, write_request,
17592fa12320SAlex Elder 						obj_request);
17602fa12320SAlex Elder 		if (!osd_req)
1761bf0d5f50SAlex Elder 			goto out_partial;
17622fa12320SAlex Elder 		obj_request->osd_req = osd_req;
17632169238dSAlex Elder 		obj_request->callback = rbd_img_obj_callback;
1764430c28c3SAlex Elder 
17652fa12320SAlex Elder 		osd_req_op_extent_init(osd_req, 0, opcode, offset, length,
17662fa12320SAlex Elder 						0, 0);
1767a4ce40a9SAlex Elder 		osd_req_op_extent_osd_data_bio(osd_req, 0, write_request,
1768a4ce40a9SAlex Elder 				obj_request->bio_list, obj_request->length);
17692fa12320SAlex Elder 		rbd_osd_req_format(obj_request, write_request);
1770430c28c3SAlex Elder 
17717da22d29SAlex Elder 		obj_request->img_offset = img_offset;
1772bf0d5f50SAlex Elder 		rbd_img_obj_request_add(img_request, obj_request);
1773bf0d5f50SAlex Elder 
17747da22d29SAlex Elder 		img_offset += length;
1775bf0d5f50SAlex Elder 		resid -= length;
1776bf0d5f50SAlex Elder 	}
1777bf0d5f50SAlex Elder 
1778bf0d5f50SAlex Elder 	return 0;
1779bf0d5f50SAlex Elder 
1780bf0d5f50SAlex Elder out_partial:
1781bf0d5f50SAlex Elder 	rbd_obj_request_put(obj_request);
1782bf0d5f50SAlex Elder out_unwind:
1783bf0d5f50SAlex Elder 	for_each_obj_request_safe(img_request, obj_request, next_obj_request)
1784bf0d5f50SAlex Elder 		rbd_obj_request_put(obj_request);
1785bf0d5f50SAlex Elder 
1786bf0d5f50SAlex Elder 	return -ENOMEM;
1787bf0d5f50SAlex Elder }
1788bf0d5f50SAlex Elder 
1789bf0d5f50SAlex Elder static int rbd_img_request_submit(struct rbd_img_request *img_request)
1790bf0d5f50SAlex Elder {
1791bf0d5f50SAlex Elder 	struct rbd_device *rbd_dev = img_request->rbd_dev;
1792bf0d5f50SAlex Elder 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
1793bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request;
179446faeed4SAlex Elder 	struct rbd_obj_request *next_obj_request;
1795bf0d5f50SAlex Elder 
179637206ee5SAlex Elder 	dout("%s: img %p\n", __func__, img_request);
179746faeed4SAlex Elder 	for_each_obj_request_safe(img_request, obj_request, next_obj_request) {
1798bf0d5f50SAlex Elder 		int ret;
1799bf0d5f50SAlex Elder 
1800bf0d5f50SAlex Elder 		ret = rbd_obj_request_submit(osdc, obj_request);
1801bf0d5f50SAlex Elder 		if (ret)
1802bf0d5f50SAlex Elder 			return ret;
1803bf0d5f50SAlex Elder 		/*
1804bf0d5f50SAlex Elder 		 * The image request has its own reference to each
1805bf0d5f50SAlex Elder 		 * of its object requests, so we can safely drop the
1806bf0d5f50SAlex Elder 		 * initial one here.
1807bf0d5f50SAlex Elder 		 */
1808bf0d5f50SAlex Elder 		rbd_obj_request_put(obj_request);
1809bf0d5f50SAlex Elder 	}
1810bf0d5f50SAlex Elder 
1811bf0d5f50SAlex Elder 	return 0;
1812bf0d5f50SAlex Elder }
1813bf0d5f50SAlex Elder 
1814cf81b60eSAlex Elder static int rbd_obj_notify_ack(struct rbd_device *rbd_dev,
1815b8d70035SAlex Elder 				   u64 ver, u64 notify_id)
1816b8d70035SAlex Elder {
1817b8d70035SAlex Elder 	struct rbd_obj_request *obj_request;
18182169238dSAlex Elder 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
1819b8d70035SAlex Elder 	int ret;
1820b8d70035SAlex Elder 
1821b8d70035SAlex Elder 	obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
1822b8d70035SAlex Elder 							OBJ_REQUEST_NODATA);
1823b8d70035SAlex Elder 	if (!obj_request)
1824b8d70035SAlex Elder 		return -ENOMEM;
1825b8d70035SAlex Elder 
1826b8d70035SAlex Elder 	ret = -ENOMEM;
1827430c28c3SAlex Elder 	obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request);
1828b8d70035SAlex Elder 	if (!obj_request->osd_req)
1829b8d70035SAlex Elder 		goto out;
18302169238dSAlex Elder 	obj_request->callback = rbd_obj_request_put;
1831b8d70035SAlex Elder 
1832c99d2d4aSAlex Elder 	osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_NOTIFY_ACK,
1833c99d2d4aSAlex Elder 					notify_id, ver, 0);
18342fa12320SAlex Elder 	rbd_osd_req_format(obj_request, false);
1835430c28c3SAlex Elder 
1836b8d70035SAlex Elder 	ret = rbd_obj_request_submit(osdc, obj_request);
1837b8d70035SAlex Elder out:
1838cf81b60eSAlex Elder 	if (ret)
1839b8d70035SAlex Elder 		rbd_obj_request_put(obj_request);
1840b8d70035SAlex Elder 
1841b8d70035SAlex Elder 	return ret;
1842b8d70035SAlex Elder }
1843b8d70035SAlex Elder 
1844b8d70035SAlex Elder static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
1845b8d70035SAlex Elder {
1846b8d70035SAlex Elder 	struct rbd_device *rbd_dev = (struct rbd_device *)data;
1847b8d70035SAlex Elder 	u64 hver;
1848b8d70035SAlex Elder 	int rc;
1849b8d70035SAlex Elder 
1850b8d70035SAlex Elder 	if (!rbd_dev)
1851b8d70035SAlex Elder 		return;
1852b8d70035SAlex Elder 
185337206ee5SAlex Elder 	dout("%s: \"%s\" notify_id %llu opcode %u\n", __func__,
1854b8d70035SAlex Elder 		rbd_dev->header_name, (unsigned long long) notify_id,
1855b8d70035SAlex Elder 		(unsigned int) opcode);
1856b8d70035SAlex Elder 	rc = rbd_dev_refresh(rbd_dev, &hver);
1857b8d70035SAlex Elder 	if (rc)
1858b8d70035SAlex Elder 		rbd_warn(rbd_dev, "got notification but failed to "
1859b8d70035SAlex Elder 			   " update snaps: %d\n", rc);
1860b8d70035SAlex Elder 
1861cf81b60eSAlex Elder 	rbd_obj_notify_ack(rbd_dev, hver, notify_id);
1862b8d70035SAlex Elder }
1863b8d70035SAlex Elder 
18649969ebc5SAlex Elder /*
18659969ebc5SAlex Elder  * Request sync osd watch/unwatch.  The value of "start" determines
18669969ebc5SAlex Elder  * whether a watch request is being initiated or torn down.
18679969ebc5SAlex Elder  */
18689969ebc5SAlex Elder static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev, int start)
18699969ebc5SAlex Elder {
18709969ebc5SAlex Elder 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
18719969ebc5SAlex Elder 	struct rbd_obj_request *obj_request;
18729969ebc5SAlex Elder 	int ret;
18739969ebc5SAlex Elder 
18749969ebc5SAlex Elder 	rbd_assert(start ^ !!rbd_dev->watch_event);
18759969ebc5SAlex Elder 	rbd_assert(start ^ !!rbd_dev->watch_request);
18769969ebc5SAlex Elder 
18779969ebc5SAlex Elder 	if (start) {
18783c663bbdSAlex Elder 		ret = ceph_osdc_create_event(osdc, rbd_watch_cb, rbd_dev,
18799969ebc5SAlex Elder 						&rbd_dev->watch_event);
18809969ebc5SAlex Elder 		if (ret < 0)
18819969ebc5SAlex Elder 			return ret;
18828eb87565SAlex Elder 		rbd_assert(rbd_dev->watch_event != NULL);
18839969ebc5SAlex Elder 	}
18849969ebc5SAlex Elder 
18859969ebc5SAlex Elder 	ret = -ENOMEM;
18869969ebc5SAlex Elder 	obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
18879969ebc5SAlex Elder 							OBJ_REQUEST_NODATA);
18889969ebc5SAlex Elder 	if (!obj_request)
18899969ebc5SAlex Elder 		goto out_cancel;
18909969ebc5SAlex Elder 
1891430c28c3SAlex Elder 	obj_request->osd_req = rbd_osd_req_create(rbd_dev, true, obj_request);
1892430c28c3SAlex Elder 	if (!obj_request->osd_req)
1893430c28c3SAlex Elder 		goto out_cancel;
1894430c28c3SAlex Elder 
18958eb87565SAlex Elder 	if (start)
1896975241afSAlex Elder 		ceph_osdc_set_request_linger(osdc, obj_request->osd_req);
18978eb87565SAlex Elder 	else
18986977c3f9SAlex Elder 		ceph_osdc_unregister_linger_request(osdc,
1899975241afSAlex Elder 					rbd_dev->watch_request->osd_req);
19002169238dSAlex Elder 
19012169238dSAlex Elder 	osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_WATCH,
19022169238dSAlex Elder 				rbd_dev->watch_event->cookie,
19032169238dSAlex Elder 				rbd_dev->header.obj_version, start);
19042169238dSAlex Elder 	rbd_osd_req_format(obj_request, true);
19052169238dSAlex Elder 
19069969ebc5SAlex Elder 	ret = rbd_obj_request_submit(osdc, obj_request);
19079969ebc5SAlex Elder 	if (ret)
19089969ebc5SAlex Elder 		goto out_cancel;
19099969ebc5SAlex Elder 	ret = rbd_obj_request_wait(obj_request);
19109969ebc5SAlex Elder 	if (ret)
19119969ebc5SAlex Elder 		goto out_cancel;
19129969ebc5SAlex Elder 	ret = obj_request->result;
19139969ebc5SAlex Elder 	if (ret)
19149969ebc5SAlex Elder 		goto out_cancel;
19159969ebc5SAlex Elder 
19168eb87565SAlex Elder 	/*
19178eb87565SAlex Elder 	 * A watch request is set to linger, so the underlying osd
19188eb87565SAlex Elder 	 * request won't go away until we unregister it.  We retain
19198eb87565SAlex Elder 	 * a pointer to the object request during that time (in
19208eb87565SAlex Elder 	 * rbd_dev->watch_request), so we'll keep a reference to
19218eb87565SAlex Elder 	 * it.  We'll drop that reference (below) after we've
19228eb87565SAlex Elder 	 * unregistered it.
19238eb87565SAlex Elder 	 */
19248eb87565SAlex Elder 	if (start) {
19258eb87565SAlex Elder 		rbd_dev->watch_request = obj_request;
19268eb87565SAlex Elder 
19278eb87565SAlex Elder 		return 0;
19288eb87565SAlex Elder 	}
19298eb87565SAlex Elder 
19308eb87565SAlex Elder 	/* We have successfully torn down the watch request */
19318eb87565SAlex Elder 
19328eb87565SAlex Elder 	rbd_obj_request_put(rbd_dev->watch_request);
19338eb87565SAlex Elder 	rbd_dev->watch_request = NULL;
19349969ebc5SAlex Elder out_cancel:
19359969ebc5SAlex Elder 	/* Cancel the event if we're tearing down, or on error */
19369969ebc5SAlex Elder 	ceph_osdc_cancel_event(rbd_dev->watch_event);
19379969ebc5SAlex Elder 	rbd_dev->watch_event = NULL;
19389969ebc5SAlex Elder 	if (obj_request)
19399969ebc5SAlex Elder 		rbd_obj_request_put(obj_request);
19409969ebc5SAlex Elder 
19419969ebc5SAlex Elder 	return ret;
19429969ebc5SAlex Elder }
19439969ebc5SAlex Elder 
194436be9a76SAlex Elder /*
194536be9a76SAlex Elder  * Synchronous osd object method call
194636be9a76SAlex Elder  */
194736be9a76SAlex Elder static int rbd_obj_method_sync(struct rbd_device *rbd_dev,
194836be9a76SAlex Elder 			     const char *object_name,
194936be9a76SAlex Elder 			     const char *class_name,
195036be9a76SAlex Elder 			     const char *method_name,
195136be9a76SAlex Elder 			     const char *outbound,
195236be9a76SAlex Elder 			     size_t outbound_size,
195336be9a76SAlex Elder 			     char *inbound,
195436be9a76SAlex Elder 			     size_t inbound_size,
195536be9a76SAlex Elder 			     u64 *version)
195636be9a76SAlex Elder {
19572169238dSAlex Elder 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
195836be9a76SAlex Elder 	struct rbd_obj_request *obj_request;
195936be9a76SAlex Elder 	struct page **pages;
196036be9a76SAlex Elder 	u32 page_count;
196136be9a76SAlex Elder 	int ret;
196236be9a76SAlex Elder 
196336be9a76SAlex Elder 	/*
19646010a451SAlex Elder 	 * Method calls are ultimately read operations.  The result
19656010a451SAlex Elder 	 * should placed into the inbound buffer provided.  They
19666010a451SAlex Elder 	 * also supply outbound data--parameters for the object
19676010a451SAlex Elder 	 * method.  Currently if this is present it will be a
19686010a451SAlex Elder 	 * snapshot id.
196936be9a76SAlex Elder 	 */
197036be9a76SAlex Elder 	page_count = (u32) calc_pages_for(0, inbound_size);
197136be9a76SAlex Elder 	pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
197236be9a76SAlex Elder 	if (IS_ERR(pages))
197336be9a76SAlex Elder 		return PTR_ERR(pages);
197436be9a76SAlex Elder 
197536be9a76SAlex Elder 	ret = -ENOMEM;
19766010a451SAlex Elder 	obj_request = rbd_obj_request_create(object_name, 0, inbound_size,
197736be9a76SAlex Elder 							OBJ_REQUEST_PAGES);
197836be9a76SAlex Elder 	if (!obj_request)
197936be9a76SAlex Elder 		goto out;
198036be9a76SAlex Elder 
198136be9a76SAlex Elder 	obj_request->pages = pages;
198236be9a76SAlex Elder 	obj_request->page_count = page_count;
198336be9a76SAlex Elder 
1984430c28c3SAlex Elder 	obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request);
198536be9a76SAlex Elder 	if (!obj_request->osd_req)
198636be9a76SAlex Elder 		goto out;
198736be9a76SAlex Elder 
1988c99d2d4aSAlex Elder 	osd_req_op_cls_init(obj_request->osd_req, 0, CEPH_OSD_OP_CALL,
198904017e29SAlex Elder 					class_name, method_name);
199004017e29SAlex Elder 	if (outbound_size) {
199104017e29SAlex Elder 		struct ceph_pagelist *pagelist;
199204017e29SAlex Elder 
199304017e29SAlex Elder 		pagelist = kmalloc(sizeof (*pagelist), GFP_NOFS);
199404017e29SAlex Elder 		if (!pagelist)
199504017e29SAlex Elder 			goto out;
199604017e29SAlex Elder 
199704017e29SAlex Elder 		ceph_pagelist_init(pagelist);
199804017e29SAlex Elder 		ceph_pagelist_append(pagelist, outbound, outbound_size);
199904017e29SAlex Elder 		osd_req_op_cls_request_data_pagelist(obj_request->osd_req, 0,
200004017e29SAlex Elder 						pagelist);
200104017e29SAlex Elder 	}
2002a4ce40a9SAlex Elder 	osd_req_op_cls_response_data_pages(obj_request->osd_req, 0,
2003a4ce40a9SAlex Elder 					obj_request->pages, inbound_size,
200444cd188dSAlex Elder 					0, false, false);
20052fa12320SAlex Elder 	rbd_osd_req_format(obj_request, false);
2006430c28c3SAlex Elder 
200736be9a76SAlex Elder 	ret = rbd_obj_request_submit(osdc, obj_request);
200836be9a76SAlex Elder 	if (ret)
200936be9a76SAlex Elder 		goto out;
201036be9a76SAlex Elder 	ret = rbd_obj_request_wait(obj_request);
201136be9a76SAlex Elder 	if (ret)
201236be9a76SAlex Elder 		goto out;
201336be9a76SAlex Elder 
201436be9a76SAlex Elder 	ret = obj_request->result;
201536be9a76SAlex Elder 	if (ret < 0)
201636be9a76SAlex Elder 		goto out;
201723ed6e13SAlex Elder 	ret = 0;
2018903bb32eSAlex Elder 	ceph_copy_from_page_vector(pages, inbound, 0, obj_request->xferred);
201936be9a76SAlex Elder 	if (version)
202036be9a76SAlex Elder 		*version = obj_request->version;
202136be9a76SAlex Elder out:
202236be9a76SAlex Elder 	if (obj_request)
202336be9a76SAlex Elder 		rbd_obj_request_put(obj_request);
202436be9a76SAlex Elder 	else
202536be9a76SAlex Elder 		ceph_release_page_vector(pages, page_count);
202636be9a76SAlex Elder 
202736be9a76SAlex Elder 	return ret;
202836be9a76SAlex Elder }
202936be9a76SAlex Elder 
2030bf0d5f50SAlex Elder static void rbd_request_fn(struct request_queue *q)
2031cc344fa1SAlex Elder 		__releases(q->queue_lock) __acquires(q->queue_lock)
2032bf0d5f50SAlex Elder {
2033bf0d5f50SAlex Elder 	struct rbd_device *rbd_dev = q->queuedata;
2034bf0d5f50SAlex Elder 	bool read_only = rbd_dev->mapping.read_only;
2035bf0d5f50SAlex Elder 	struct request *rq;
2036bf0d5f50SAlex Elder 	int result;
2037bf0d5f50SAlex Elder 
2038bf0d5f50SAlex Elder 	while ((rq = blk_fetch_request(q))) {
2039bf0d5f50SAlex Elder 		bool write_request = rq_data_dir(rq) == WRITE;
2040bf0d5f50SAlex Elder 		struct rbd_img_request *img_request;
2041bf0d5f50SAlex Elder 		u64 offset;
2042bf0d5f50SAlex Elder 		u64 length;
2043bf0d5f50SAlex Elder 
2044bf0d5f50SAlex Elder 		/* Ignore any non-FS requests that filter through. */
2045bf0d5f50SAlex Elder 
2046bf0d5f50SAlex Elder 		if (rq->cmd_type != REQ_TYPE_FS) {
20474dda41d3SAlex Elder 			dout("%s: non-fs request type %d\n", __func__,
20484dda41d3SAlex Elder 				(int) rq->cmd_type);
20494dda41d3SAlex Elder 			__blk_end_request_all(rq, 0);
20504dda41d3SAlex Elder 			continue;
20514dda41d3SAlex Elder 		}
20524dda41d3SAlex Elder 
20534dda41d3SAlex Elder 		/* Ignore/skip any zero-length requests */
20544dda41d3SAlex Elder 
20554dda41d3SAlex Elder 		offset = (u64) blk_rq_pos(rq) << SECTOR_SHIFT;
20564dda41d3SAlex Elder 		length = (u64) blk_rq_bytes(rq);
20574dda41d3SAlex Elder 
20584dda41d3SAlex Elder 		if (!length) {
20594dda41d3SAlex Elder 			dout("%s: zero-length request\n", __func__);
2060bf0d5f50SAlex Elder 			__blk_end_request_all(rq, 0);
2061bf0d5f50SAlex Elder 			continue;
2062bf0d5f50SAlex Elder 		}
2063bf0d5f50SAlex Elder 
2064bf0d5f50SAlex Elder 		spin_unlock_irq(q->queue_lock);
2065bf0d5f50SAlex Elder 
2066bf0d5f50SAlex Elder 		/* Disallow writes to a read-only device */
2067bf0d5f50SAlex Elder 
2068bf0d5f50SAlex Elder 		if (write_request) {
2069bf0d5f50SAlex Elder 			result = -EROFS;
2070bf0d5f50SAlex Elder 			if (read_only)
2071bf0d5f50SAlex Elder 				goto end_request;
2072bf0d5f50SAlex Elder 			rbd_assert(rbd_dev->spec->snap_id == CEPH_NOSNAP);
2073bf0d5f50SAlex Elder 		}
2074bf0d5f50SAlex Elder 
20756d292906SAlex Elder 		/*
20766d292906SAlex Elder 		 * Quit early if the mapped snapshot no longer
20776d292906SAlex Elder 		 * exists.  It's still possible the snapshot will
20786d292906SAlex Elder 		 * have disappeared by the time our request arrives
20796d292906SAlex Elder 		 * at the osd, but there's no sense in sending it if
20806d292906SAlex Elder 		 * we already know.
20816d292906SAlex Elder 		 */
20826d292906SAlex Elder 		if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) {
2083bf0d5f50SAlex Elder 			dout("request for non-existent snapshot");
2084bf0d5f50SAlex Elder 			rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP);
2085bf0d5f50SAlex Elder 			result = -ENXIO;
2086bf0d5f50SAlex Elder 			goto end_request;
2087bf0d5f50SAlex Elder 		}
2088bf0d5f50SAlex Elder 
2089bf0d5f50SAlex Elder 		result = -EINVAL;
2090bf0d5f50SAlex Elder 		if (WARN_ON(offset && length > U64_MAX - offset + 1))
2091bf0d5f50SAlex Elder 			goto end_request;	/* Shouldn't happen */
2092bf0d5f50SAlex Elder 
2093bf0d5f50SAlex Elder 		result = -ENOMEM;
2094bf0d5f50SAlex Elder 		img_request = rbd_img_request_create(rbd_dev, offset, length,
20959849e986SAlex Elder 							write_request, false);
2096bf0d5f50SAlex Elder 		if (!img_request)
2097bf0d5f50SAlex Elder 			goto end_request;
2098bf0d5f50SAlex Elder 
2099bf0d5f50SAlex Elder 		img_request->rq = rq;
2100bf0d5f50SAlex Elder 
2101bf0d5f50SAlex Elder 		result = rbd_img_request_fill_bio(img_request, rq->bio);
2102bf0d5f50SAlex Elder 		if (!result)
2103bf0d5f50SAlex Elder 			result = rbd_img_request_submit(img_request);
2104bf0d5f50SAlex Elder 		if (result)
2105bf0d5f50SAlex Elder 			rbd_img_request_put(img_request);
2106bf0d5f50SAlex Elder end_request:
2107bf0d5f50SAlex Elder 		spin_lock_irq(q->queue_lock);
2108bf0d5f50SAlex Elder 		if (result < 0) {
21097da22d29SAlex Elder 			rbd_warn(rbd_dev, "%s %llx at %llx result %d\n",
21107da22d29SAlex Elder 				write_request ? "write" : "read",
21117da22d29SAlex Elder 				length, offset, result);
21127da22d29SAlex Elder 
2113bf0d5f50SAlex Elder 			__blk_end_request_all(rq, result);
2114bf0d5f50SAlex Elder 		}
2115bf0d5f50SAlex Elder 	}
2116bf0d5f50SAlex Elder }
2117bf0d5f50SAlex Elder 
2118602adf40SYehuda Sadeh /*
2119602adf40SYehuda Sadeh  * a queue callback. Makes sure that we don't create a bio that spans across
2120602adf40SYehuda Sadeh  * multiple osd objects. One exception would be with a single page bios,
2121f7760dadSAlex Elder  * which we handle later at bio_chain_clone_range()
2122602adf40SYehuda Sadeh  */
2123602adf40SYehuda Sadeh static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
2124602adf40SYehuda Sadeh 			  struct bio_vec *bvec)
2125602adf40SYehuda Sadeh {
2126602adf40SYehuda Sadeh 	struct rbd_device *rbd_dev = q->queuedata;
2127e5cfeed2SAlex Elder 	sector_t sector_offset;
2128e5cfeed2SAlex Elder 	sector_t sectors_per_obj;
2129e5cfeed2SAlex Elder 	sector_t obj_sector_offset;
2130e5cfeed2SAlex Elder 	int ret;
2131602adf40SYehuda Sadeh 
2132e5cfeed2SAlex Elder 	/*
2133e5cfeed2SAlex Elder 	 * Find how far into its rbd object the partition-relative
2134e5cfeed2SAlex Elder 	 * bio start sector is to offset relative to the enclosing
2135e5cfeed2SAlex Elder 	 * device.
2136e5cfeed2SAlex Elder 	 */
2137e5cfeed2SAlex Elder 	sector_offset = get_start_sect(bmd->bi_bdev) + bmd->bi_sector;
2138e5cfeed2SAlex Elder 	sectors_per_obj = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
2139e5cfeed2SAlex Elder 	obj_sector_offset = sector_offset & (sectors_per_obj - 1);
2140593a9e7bSAlex Elder 
2141e5cfeed2SAlex Elder 	/*
2142e5cfeed2SAlex Elder 	 * Compute the number of bytes from that offset to the end
2143e5cfeed2SAlex Elder 	 * of the object.  Account for what's already used by the bio.
2144e5cfeed2SAlex Elder 	 */
2145e5cfeed2SAlex Elder 	ret = (int) (sectors_per_obj - obj_sector_offset) << SECTOR_SHIFT;
2146e5cfeed2SAlex Elder 	if (ret > bmd->bi_size)
2147e5cfeed2SAlex Elder 		ret -= bmd->bi_size;
2148e5cfeed2SAlex Elder 	else
2149e5cfeed2SAlex Elder 		ret = 0;
2150e5cfeed2SAlex Elder 
2151e5cfeed2SAlex Elder 	/*
2152e5cfeed2SAlex Elder 	 * Don't send back more than was asked for.  And if the bio
2153e5cfeed2SAlex Elder 	 * was empty, let the whole thing through because:  "Note
2154e5cfeed2SAlex Elder 	 * that a block device *must* allow a single page to be
2155e5cfeed2SAlex Elder 	 * added to an empty bio."
2156e5cfeed2SAlex Elder 	 */
2157e5cfeed2SAlex Elder 	rbd_assert(bvec->bv_len <= PAGE_SIZE);
2158e5cfeed2SAlex Elder 	if (ret > (int) bvec->bv_len || !bmd->bi_size)
2159e5cfeed2SAlex Elder 		ret = (int) bvec->bv_len;
2160e5cfeed2SAlex Elder 
2161e5cfeed2SAlex Elder 	return ret;
2162602adf40SYehuda Sadeh }
2163602adf40SYehuda Sadeh 
2164602adf40SYehuda Sadeh static void rbd_free_disk(struct rbd_device *rbd_dev)
2165602adf40SYehuda Sadeh {
2166602adf40SYehuda Sadeh 	struct gendisk *disk = rbd_dev->disk;
2167602adf40SYehuda Sadeh 
2168602adf40SYehuda Sadeh 	if (!disk)
2169602adf40SYehuda Sadeh 		return;
2170602adf40SYehuda Sadeh 
2171602adf40SYehuda Sadeh 	if (disk->flags & GENHD_FL_UP)
2172602adf40SYehuda Sadeh 		del_gendisk(disk);
2173602adf40SYehuda Sadeh 	if (disk->queue)
2174602adf40SYehuda Sadeh 		blk_cleanup_queue(disk->queue);
2175602adf40SYehuda Sadeh 	put_disk(disk);
2176602adf40SYehuda Sadeh }
2177602adf40SYehuda Sadeh 
2178788e2df3SAlex Elder static int rbd_obj_read_sync(struct rbd_device *rbd_dev,
2179788e2df3SAlex Elder 				const char *object_name,
2180788e2df3SAlex Elder 				u64 offset, u64 length,
2181788e2df3SAlex Elder 				char *buf, u64 *version)
2182788e2df3SAlex Elder 
2183788e2df3SAlex Elder {
21842169238dSAlex Elder 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
2185788e2df3SAlex Elder 	struct rbd_obj_request *obj_request;
2186788e2df3SAlex Elder 	struct page **pages = NULL;
2187788e2df3SAlex Elder 	u32 page_count;
21881ceae7efSAlex Elder 	size_t size;
2189788e2df3SAlex Elder 	int ret;
2190788e2df3SAlex Elder 
2191788e2df3SAlex Elder 	page_count = (u32) calc_pages_for(offset, length);
2192788e2df3SAlex Elder 	pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
2193788e2df3SAlex Elder 	if (IS_ERR(pages))
2194788e2df3SAlex Elder 		ret = PTR_ERR(pages);
2195788e2df3SAlex Elder 
2196788e2df3SAlex Elder 	ret = -ENOMEM;
2197788e2df3SAlex Elder 	obj_request = rbd_obj_request_create(object_name, offset, length,
2198788e2df3SAlex Elder 							OBJ_REQUEST_PAGES);
2199788e2df3SAlex Elder 	if (!obj_request)
2200788e2df3SAlex Elder 		goto out;
2201788e2df3SAlex Elder 
2202788e2df3SAlex Elder 	obj_request->pages = pages;
2203788e2df3SAlex Elder 	obj_request->page_count = page_count;
2204788e2df3SAlex Elder 
2205430c28c3SAlex Elder 	obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request);
2206788e2df3SAlex Elder 	if (!obj_request->osd_req)
2207788e2df3SAlex Elder 		goto out;
2208788e2df3SAlex Elder 
2209c99d2d4aSAlex Elder 	osd_req_op_extent_init(obj_request->osd_req, 0, CEPH_OSD_OP_READ,
2210c99d2d4aSAlex Elder 					offset, length, 0, 0);
2211a4ce40a9SAlex Elder 	osd_req_op_extent_osd_data_pages(obj_request->osd_req, 0, false,
2212a4ce40a9SAlex Elder 					obj_request->pages,
221344cd188dSAlex Elder 					obj_request->length,
221444cd188dSAlex Elder 					obj_request->offset & ~PAGE_MASK,
221544cd188dSAlex Elder 					false, false);
22162fa12320SAlex Elder 	rbd_osd_req_format(obj_request, false);
2217430c28c3SAlex Elder 
2218788e2df3SAlex Elder 	ret = rbd_obj_request_submit(osdc, obj_request);
2219788e2df3SAlex Elder 	if (ret)
2220788e2df3SAlex Elder 		goto out;
2221788e2df3SAlex Elder 	ret = rbd_obj_request_wait(obj_request);
2222788e2df3SAlex Elder 	if (ret)
2223788e2df3SAlex Elder 		goto out;
2224788e2df3SAlex Elder 
2225788e2df3SAlex Elder 	ret = obj_request->result;
2226788e2df3SAlex Elder 	if (ret < 0)
2227788e2df3SAlex Elder 		goto out;
22281ceae7efSAlex Elder 
22291ceae7efSAlex Elder 	rbd_assert(obj_request->xferred <= (u64) SIZE_MAX);
22301ceae7efSAlex Elder 	size = (size_t) obj_request->xferred;
2231903bb32eSAlex Elder 	ceph_copy_from_page_vector(pages, buf, 0, size);
223223ed6e13SAlex Elder 	rbd_assert(size <= (size_t) INT_MAX);
223323ed6e13SAlex Elder 	ret = (int) size;
2234788e2df3SAlex Elder 	if (version)
2235788e2df3SAlex Elder 		*version = obj_request->version;
2236788e2df3SAlex Elder out:
2237788e2df3SAlex Elder 	if (obj_request)
2238788e2df3SAlex Elder 		rbd_obj_request_put(obj_request);
2239788e2df3SAlex Elder 	else
2240788e2df3SAlex Elder 		ceph_release_page_vector(pages, page_count);
2241788e2df3SAlex Elder 
2242788e2df3SAlex Elder 	return ret;
2243788e2df3SAlex Elder }
2244788e2df3SAlex Elder 
2245602adf40SYehuda Sadeh /*
22464156d998SAlex Elder  * Read the complete header for the given rbd device.
22474156d998SAlex Elder  *
22484156d998SAlex Elder  * Returns a pointer to a dynamically-allocated buffer containing
22494156d998SAlex Elder  * the complete and validated header.  Caller can pass the address
22504156d998SAlex Elder  * of a variable that will be filled in with the version of the
22514156d998SAlex Elder  * header object at the time it was read.
22524156d998SAlex Elder  *
22534156d998SAlex Elder  * Returns a pointer-coded errno if a failure occurs.
22544156d998SAlex Elder  */
22554156d998SAlex Elder static struct rbd_image_header_ondisk *
22564156d998SAlex Elder rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version)
22574156d998SAlex Elder {
22584156d998SAlex Elder 	struct rbd_image_header_ondisk *ondisk = NULL;
22594156d998SAlex Elder 	u32 snap_count = 0;
22604156d998SAlex Elder 	u64 names_size = 0;
22614156d998SAlex Elder 	u32 want_count;
22624156d998SAlex Elder 	int ret;
22634156d998SAlex Elder 
22644156d998SAlex Elder 	/*
22654156d998SAlex Elder 	 * The complete header will include an array of its 64-bit
22664156d998SAlex Elder 	 * snapshot ids, followed by the names of those snapshots as
22674156d998SAlex Elder 	 * a contiguous block of NUL-terminated strings.  Note that
22684156d998SAlex Elder 	 * the number of snapshots could change by the time we read
22694156d998SAlex Elder 	 * it in, in which case we re-read it.
22704156d998SAlex Elder 	 */
22714156d998SAlex Elder 	do {
22724156d998SAlex Elder 		size_t size;
22734156d998SAlex Elder 
22744156d998SAlex Elder 		kfree(ondisk);
22754156d998SAlex Elder 
22764156d998SAlex Elder 		size = sizeof (*ondisk);
22774156d998SAlex Elder 		size += snap_count * sizeof (struct rbd_image_snap_ondisk);
22784156d998SAlex Elder 		size += names_size;
22794156d998SAlex Elder 		ondisk = kmalloc(size, GFP_KERNEL);
22804156d998SAlex Elder 		if (!ondisk)
22814156d998SAlex Elder 			return ERR_PTR(-ENOMEM);
22824156d998SAlex Elder 
2283788e2df3SAlex Elder 		ret = rbd_obj_read_sync(rbd_dev, rbd_dev->header_name,
22844156d998SAlex Elder 				       0, size,
22854156d998SAlex Elder 				       (char *) ondisk, version);
22864156d998SAlex Elder 		if (ret < 0)
22874156d998SAlex Elder 			goto out_err;
22884156d998SAlex Elder 		if (WARN_ON((size_t) ret < size)) {
22894156d998SAlex Elder 			ret = -ENXIO;
229006ecc6cbSAlex Elder 			rbd_warn(rbd_dev, "short header read (want %zd got %d)",
229106ecc6cbSAlex Elder 				size, ret);
22924156d998SAlex Elder 			goto out_err;
22934156d998SAlex Elder 		}
22944156d998SAlex Elder 		if (!rbd_dev_ondisk_valid(ondisk)) {
22954156d998SAlex Elder 			ret = -ENXIO;
229606ecc6cbSAlex Elder 			rbd_warn(rbd_dev, "invalid header");
22974156d998SAlex Elder 			goto out_err;
22984156d998SAlex Elder 		}
22994156d998SAlex Elder 
23004156d998SAlex Elder 		names_size = le64_to_cpu(ondisk->snap_names_len);
23014156d998SAlex Elder 		want_count = snap_count;
23024156d998SAlex Elder 		snap_count = le32_to_cpu(ondisk->snap_count);
23034156d998SAlex Elder 	} while (snap_count != want_count);
23044156d998SAlex Elder 
23054156d998SAlex Elder 	return ondisk;
23064156d998SAlex Elder 
23074156d998SAlex Elder out_err:
23084156d998SAlex Elder 	kfree(ondisk);
23094156d998SAlex Elder 
23104156d998SAlex Elder 	return ERR_PTR(ret);
23114156d998SAlex Elder }
23124156d998SAlex Elder 
23134156d998SAlex Elder /*
2314602adf40SYehuda Sadeh  * reload the ondisk the header
2315602adf40SYehuda Sadeh  */
2316602adf40SYehuda Sadeh static int rbd_read_header(struct rbd_device *rbd_dev,
2317602adf40SYehuda Sadeh 			   struct rbd_image_header *header)
2318602adf40SYehuda Sadeh {
23194156d998SAlex Elder 	struct rbd_image_header_ondisk *ondisk;
23204156d998SAlex Elder 	u64 ver = 0;
23214156d998SAlex Elder 	int ret;
2322602adf40SYehuda Sadeh 
23234156d998SAlex Elder 	ondisk = rbd_dev_v1_header_read(rbd_dev, &ver);
23244156d998SAlex Elder 	if (IS_ERR(ondisk))
23254156d998SAlex Elder 		return PTR_ERR(ondisk);
23264156d998SAlex Elder 	ret = rbd_header_from_disk(header, ondisk);
23274156d998SAlex Elder 	if (ret >= 0)
232859c2be1eSYehuda Sadeh 		header->obj_version = ver;
23294156d998SAlex Elder 	kfree(ondisk);
2330602adf40SYehuda Sadeh 
23314156d998SAlex Elder 	return ret;
2332602adf40SYehuda Sadeh }
2333602adf40SYehuda Sadeh 
233441f38c2bSAlex Elder static void rbd_remove_all_snaps(struct rbd_device *rbd_dev)
2335dfc5606dSYehuda Sadeh {
2336dfc5606dSYehuda Sadeh 	struct rbd_snap *snap;
2337a0593290SAlex Elder 	struct rbd_snap *next;
2338dfc5606dSYehuda Sadeh 
2339a0593290SAlex Elder 	list_for_each_entry_safe(snap, next, &rbd_dev->snaps, node)
234041f38c2bSAlex Elder 		rbd_remove_snap_dev(snap);
2341dfc5606dSYehuda Sadeh }
2342dfc5606dSYehuda Sadeh 
23439478554aSAlex Elder static void rbd_update_mapping_size(struct rbd_device *rbd_dev)
23449478554aSAlex Elder {
23459478554aSAlex Elder 	sector_t size;
23469478554aSAlex Elder 
23470d7dbfceSAlex Elder 	if (rbd_dev->spec->snap_id != CEPH_NOSNAP)
23489478554aSAlex Elder 		return;
23499478554aSAlex Elder 
23509478554aSAlex Elder 	size = (sector_t) rbd_dev->header.image_size / SECTOR_SIZE;
23519478554aSAlex Elder 	dout("setting size to %llu sectors", (unsigned long long) size);
23529478554aSAlex Elder 	rbd_dev->mapping.size = (u64) size;
23539478554aSAlex Elder 	set_capacity(rbd_dev->disk, size);
23549478554aSAlex Elder }
23559478554aSAlex Elder 
2356602adf40SYehuda Sadeh /*
2357602adf40SYehuda Sadeh  * only read the first part of the ondisk header, without the snaps info
2358602adf40SYehuda Sadeh  */
2359117973fbSAlex Elder static int rbd_dev_v1_refresh(struct rbd_device *rbd_dev, u64 *hver)
2360602adf40SYehuda Sadeh {
2361602adf40SYehuda Sadeh 	int ret;
2362602adf40SYehuda Sadeh 	struct rbd_image_header h;
2363602adf40SYehuda Sadeh 
2364602adf40SYehuda Sadeh 	ret = rbd_read_header(rbd_dev, &h);
2365602adf40SYehuda Sadeh 	if (ret < 0)
2366602adf40SYehuda Sadeh 		return ret;
2367602adf40SYehuda Sadeh 
2368a51aa0c0SJosh Durgin 	down_write(&rbd_dev->header_rwsem);
2369a51aa0c0SJosh Durgin 
23709478554aSAlex Elder 	/* Update image size, and check for resize of mapped image */
23719478554aSAlex Elder 	rbd_dev->header.image_size = h.image_size;
23729478554aSAlex Elder 	rbd_update_mapping_size(rbd_dev);
23739db4b3e3SSage Weil 
2374849b4260SAlex Elder 	/* rbd_dev->header.object_prefix shouldn't change */
2375602adf40SYehuda Sadeh 	kfree(rbd_dev->header.snap_sizes);
2376849b4260SAlex Elder 	kfree(rbd_dev->header.snap_names);
2377d1d25646SJosh Durgin 	/* osd requests may still refer to snapc */
2378d1d25646SJosh Durgin 	ceph_put_snap_context(rbd_dev->header.snapc);
2379602adf40SYehuda Sadeh 
2380b813623aSAlex Elder 	if (hver)
2381b813623aSAlex Elder 		*hver = h.obj_version;
2382a71b891bSJosh Durgin 	rbd_dev->header.obj_version = h.obj_version;
238393a24e08SJosh Durgin 	rbd_dev->header.image_size = h.image_size;
2384602adf40SYehuda Sadeh 	rbd_dev->header.snapc = h.snapc;
2385602adf40SYehuda Sadeh 	rbd_dev->header.snap_names = h.snap_names;
2386602adf40SYehuda Sadeh 	rbd_dev->header.snap_sizes = h.snap_sizes;
2387849b4260SAlex Elder 	/* Free the extra copy of the object prefix */
2388849b4260SAlex Elder 	WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix));
2389849b4260SAlex Elder 	kfree(h.object_prefix);
2390849b4260SAlex Elder 
2391304f6808SAlex Elder 	ret = rbd_dev_snaps_update(rbd_dev);
2392304f6808SAlex Elder 	if (!ret)
2393304f6808SAlex Elder 		ret = rbd_dev_snaps_register(rbd_dev);
2394dfc5606dSYehuda Sadeh 
2395c666601aSJosh Durgin 	up_write(&rbd_dev->header_rwsem);
2396602adf40SYehuda Sadeh 
2397dfc5606dSYehuda Sadeh 	return ret;
2398602adf40SYehuda Sadeh }
2399602adf40SYehuda Sadeh 
2400117973fbSAlex Elder static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver)
24011fe5e993SAlex Elder {
24021fe5e993SAlex Elder 	int ret;
24031fe5e993SAlex Elder 
2404117973fbSAlex Elder 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
24051fe5e993SAlex Elder 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2406117973fbSAlex Elder 	if (rbd_dev->image_format == 1)
2407117973fbSAlex Elder 		ret = rbd_dev_v1_refresh(rbd_dev, hver);
2408117973fbSAlex Elder 	else
2409117973fbSAlex Elder 		ret = rbd_dev_v2_refresh(rbd_dev, hver);
24101fe5e993SAlex Elder 	mutex_unlock(&ctl_mutex);
24111fe5e993SAlex Elder 
24121fe5e993SAlex Elder 	return ret;
24131fe5e993SAlex Elder }
24141fe5e993SAlex Elder 
2415602adf40SYehuda Sadeh static int rbd_init_disk(struct rbd_device *rbd_dev)
2416602adf40SYehuda Sadeh {
2417602adf40SYehuda Sadeh 	struct gendisk *disk;
2418602adf40SYehuda Sadeh 	struct request_queue *q;
2419593a9e7bSAlex Elder 	u64 segment_size;
2420602adf40SYehuda Sadeh 
2421602adf40SYehuda Sadeh 	/* create gendisk info */
2422602adf40SYehuda Sadeh 	disk = alloc_disk(RBD_MINORS_PER_MAJOR);
2423602adf40SYehuda Sadeh 	if (!disk)
24241fcdb8aaSAlex Elder 		return -ENOMEM;
2425602adf40SYehuda Sadeh 
2426f0f8cef5SAlex Elder 	snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
2427de71a297SAlex Elder 		 rbd_dev->dev_id);
2428602adf40SYehuda Sadeh 	disk->major = rbd_dev->major;
2429602adf40SYehuda Sadeh 	disk->first_minor = 0;
2430602adf40SYehuda Sadeh 	disk->fops = &rbd_bd_ops;
2431602adf40SYehuda Sadeh 	disk->private_data = rbd_dev;
2432602adf40SYehuda Sadeh 
2433bf0d5f50SAlex Elder 	q = blk_init_queue(rbd_request_fn, &rbd_dev->lock);
2434602adf40SYehuda Sadeh 	if (!q)
2435602adf40SYehuda Sadeh 		goto out_disk;
2436029bcbd8SJosh Durgin 
2437593a9e7bSAlex Elder 	/* We use the default size, but let's be explicit about it. */
2438593a9e7bSAlex Elder 	blk_queue_physical_block_size(q, SECTOR_SIZE);
2439593a9e7bSAlex Elder 
2440029bcbd8SJosh Durgin 	/* set io sizes to object size */
2441593a9e7bSAlex Elder 	segment_size = rbd_obj_bytes(&rbd_dev->header);
2442593a9e7bSAlex Elder 	blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
2443593a9e7bSAlex Elder 	blk_queue_max_segment_size(q, segment_size);
2444593a9e7bSAlex Elder 	blk_queue_io_min(q, segment_size);
2445593a9e7bSAlex Elder 	blk_queue_io_opt(q, segment_size);
2446029bcbd8SJosh Durgin 
2447602adf40SYehuda Sadeh 	blk_queue_merge_bvec(q, rbd_merge_bvec);
2448602adf40SYehuda Sadeh 	disk->queue = q;
2449602adf40SYehuda Sadeh 
2450602adf40SYehuda Sadeh 	q->queuedata = rbd_dev;
2451602adf40SYehuda Sadeh 
2452602adf40SYehuda Sadeh 	rbd_dev->disk = disk;
2453602adf40SYehuda Sadeh 
245412f02944SAlex Elder 	set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
245512f02944SAlex Elder 
2456602adf40SYehuda Sadeh 	return 0;
2457602adf40SYehuda Sadeh out_disk:
2458602adf40SYehuda Sadeh 	put_disk(disk);
24591fcdb8aaSAlex Elder 
24601fcdb8aaSAlex Elder 	return -ENOMEM;
2461602adf40SYehuda Sadeh }
2462602adf40SYehuda Sadeh 
2463dfc5606dSYehuda Sadeh /*
2464dfc5606dSYehuda Sadeh   sysfs
2465dfc5606dSYehuda Sadeh */
2466602adf40SYehuda Sadeh 
2467593a9e7bSAlex Elder static struct rbd_device *dev_to_rbd_dev(struct device *dev)
2468593a9e7bSAlex Elder {
2469593a9e7bSAlex Elder 	return container_of(dev, struct rbd_device, dev);
2470593a9e7bSAlex Elder }
2471593a9e7bSAlex Elder 
2472dfc5606dSYehuda Sadeh static ssize_t rbd_size_show(struct device *dev,
2473dfc5606dSYehuda Sadeh 			     struct device_attribute *attr, char *buf)
2474602adf40SYehuda Sadeh {
2475593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2476a51aa0c0SJosh Durgin 	sector_t size;
2477dfc5606dSYehuda Sadeh 
2478a51aa0c0SJosh Durgin 	down_read(&rbd_dev->header_rwsem);
2479a51aa0c0SJosh Durgin 	size = get_capacity(rbd_dev->disk);
2480a51aa0c0SJosh Durgin 	up_read(&rbd_dev->header_rwsem);
2481a51aa0c0SJosh Durgin 
2482a51aa0c0SJosh Durgin 	return sprintf(buf, "%llu\n", (unsigned long long) size * SECTOR_SIZE);
2483602adf40SYehuda Sadeh }
2484602adf40SYehuda Sadeh 
248534b13184SAlex Elder /*
248634b13184SAlex Elder  * Note this shows the features for whatever's mapped, which is not
248734b13184SAlex Elder  * necessarily the base image.
248834b13184SAlex Elder  */
248934b13184SAlex Elder static ssize_t rbd_features_show(struct device *dev,
249034b13184SAlex Elder 			     struct device_attribute *attr, char *buf)
249134b13184SAlex Elder {
249234b13184SAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
249334b13184SAlex Elder 
249434b13184SAlex Elder 	return sprintf(buf, "0x%016llx\n",
249534b13184SAlex Elder 			(unsigned long long) rbd_dev->mapping.features);
249634b13184SAlex Elder }
249734b13184SAlex Elder 
2498dfc5606dSYehuda Sadeh static ssize_t rbd_major_show(struct device *dev,
2499dfc5606dSYehuda Sadeh 			      struct device_attribute *attr, char *buf)
2500602adf40SYehuda Sadeh {
2501593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2502dfc5606dSYehuda Sadeh 
2503dfc5606dSYehuda Sadeh 	return sprintf(buf, "%d\n", rbd_dev->major);
2504dfc5606dSYehuda Sadeh }
2505dfc5606dSYehuda Sadeh 
2506dfc5606dSYehuda Sadeh static ssize_t rbd_client_id_show(struct device *dev,
2507dfc5606dSYehuda Sadeh 				  struct device_attribute *attr, char *buf)
2508dfc5606dSYehuda Sadeh {
2509593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2510dfc5606dSYehuda Sadeh 
25111dbb4399SAlex Elder 	return sprintf(buf, "client%lld\n",
25121dbb4399SAlex Elder 			ceph_client_id(rbd_dev->rbd_client->client));
2513dfc5606dSYehuda Sadeh }
2514dfc5606dSYehuda Sadeh 
2515dfc5606dSYehuda Sadeh static ssize_t rbd_pool_show(struct device *dev,
2516dfc5606dSYehuda Sadeh 			     struct device_attribute *attr, char *buf)
2517dfc5606dSYehuda Sadeh {
2518593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2519dfc5606dSYehuda Sadeh 
25200d7dbfceSAlex Elder 	return sprintf(buf, "%s\n", rbd_dev->spec->pool_name);
2521dfc5606dSYehuda Sadeh }
2522dfc5606dSYehuda Sadeh 
25239bb2f334SAlex Elder static ssize_t rbd_pool_id_show(struct device *dev,
25249bb2f334SAlex Elder 			     struct device_attribute *attr, char *buf)
25259bb2f334SAlex Elder {
25269bb2f334SAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
25279bb2f334SAlex Elder 
25280d7dbfceSAlex Elder 	return sprintf(buf, "%llu\n",
25290d7dbfceSAlex Elder 		(unsigned long long) rbd_dev->spec->pool_id);
25309bb2f334SAlex Elder }
25319bb2f334SAlex Elder 
2532dfc5606dSYehuda Sadeh static ssize_t rbd_name_show(struct device *dev,
2533dfc5606dSYehuda Sadeh 			     struct device_attribute *attr, char *buf)
2534dfc5606dSYehuda Sadeh {
2535593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2536dfc5606dSYehuda Sadeh 
2537a92ffdf8SAlex Elder 	if (rbd_dev->spec->image_name)
25380d7dbfceSAlex Elder 		return sprintf(buf, "%s\n", rbd_dev->spec->image_name);
2539a92ffdf8SAlex Elder 
2540a92ffdf8SAlex Elder 	return sprintf(buf, "(unknown)\n");
2541dfc5606dSYehuda Sadeh }
2542dfc5606dSYehuda Sadeh 
2543589d30e0SAlex Elder static ssize_t rbd_image_id_show(struct device *dev,
2544589d30e0SAlex Elder 			     struct device_attribute *attr, char *buf)
2545589d30e0SAlex Elder {
2546589d30e0SAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2547589d30e0SAlex Elder 
25480d7dbfceSAlex Elder 	return sprintf(buf, "%s\n", rbd_dev->spec->image_id);
2549589d30e0SAlex Elder }
2550589d30e0SAlex Elder 
255134b13184SAlex Elder /*
255234b13184SAlex Elder  * Shows the name of the currently-mapped snapshot (or
255334b13184SAlex Elder  * RBD_SNAP_HEAD_NAME for the base image).
255434b13184SAlex Elder  */
2555dfc5606dSYehuda Sadeh static ssize_t rbd_snap_show(struct device *dev,
2556dfc5606dSYehuda Sadeh 			     struct device_attribute *attr,
2557dfc5606dSYehuda Sadeh 			     char *buf)
2558dfc5606dSYehuda Sadeh {
2559593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2560dfc5606dSYehuda Sadeh 
25610d7dbfceSAlex Elder 	return sprintf(buf, "%s\n", rbd_dev->spec->snap_name);
2562dfc5606dSYehuda Sadeh }
2563dfc5606dSYehuda Sadeh 
256486b00e0dSAlex Elder /*
256586b00e0dSAlex Elder  * For an rbd v2 image, shows the pool id, image id, and snapshot id
256686b00e0dSAlex Elder  * for the parent image.  If there is no parent, simply shows
256786b00e0dSAlex Elder  * "(no parent image)".
256886b00e0dSAlex Elder  */
256986b00e0dSAlex Elder static ssize_t rbd_parent_show(struct device *dev,
257086b00e0dSAlex Elder 			     struct device_attribute *attr,
257186b00e0dSAlex Elder 			     char *buf)
257286b00e0dSAlex Elder {
257386b00e0dSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
257486b00e0dSAlex Elder 	struct rbd_spec *spec = rbd_dev->parent_spec;
257586b00e0dSAlex Elder 	int count;
257686b00e0dSAlex Elder 	char *bufp = buf;
257786b00e0dSAlex Elder 
257886b00e0dSAlex Elder 	if (!spec)
257986b00e0dSAlex Elder 		return sprintf(buf, "(no parent image)\n");
258086b00e0dSAlex Elder 
258186b00e0dSAlex Elder 	count = sprintf(bufp, "pool_id %llu\npool_name %s\n",
258286b00e0dSAlex Elder 			(unsigned long long) spec->pool_id, spec->pool_name);
258386b00e0dSAlex Elder 	if (count < 0)
258486b00e0dSAlex Elder 		return count;
258586b00e0dSAlex Elder 	bufp += count;
258686b00e0dSAlex Elder 
258786b00e0dSAlex Elder 	count = sprintf(bufp, "image_id %s\nimage_name %s\n", spec->image_id,
258886b00e0dSAlex Elder 			spec->image_name ? spec->image_name : "(unknown)");
258986b00e0dSAlex Elder 	if (count < 0)
259086b00e0dSAlex Elder 		return count;
259186b00e0dSAlex Elder 	bufp += count;
259286b00e0dSAlex Elder 
259386b00e0dSAlex Elder 	count = sprintf(bufp, "snap_id %llu\nsnap_name %s\n",
259486b00e0dSAlex Elder 			(unsigned long long) spec->snap_id, spec->snap_name);
259586b00e0dSAlex Elder 	if (count < 0)
259686b00e0dSAlex Elder 		return count;
259786b00e0dSAlex Elder 	bufp += count;
259886b00e0dSAlex Elder 
259986b00e0dSAlex Elder 	count = sprintf(bufp, "overlap %llu\n", rbd_dev->parent_overlap);
260086b00e0dSAlex Elder 	if (count < 0)
260186b00e0dSAlex Elder 		return count;
260286b00e0dSAlex Elder 	bufp += count;
260386b00e0dSAlex Elder 
260486b00e0dSAlex Elder 	return (ssize_t) (bufp - buf);
260586b00e0dSAlex Elder }
260686b00e0dSAlex Elder 
2607dfc5606dSYehuda Sadeh static ssize_t rbd_image_refresh(struct device *dev,
2608dfc5606dSYehuda Sadeh 				 struct device_attribute *attr,
2609dfc5606dSYehuda Sadeh 				 const char *buf,
2610dfc5606dSYehuda Sadeh 				 size_t size)
2611dfc5606dSYehuda Sadeh {
2612593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2613b813623aSAlex Elder 	int ret;
2614602adf40SYehuda Sadeh 
2615117973fbSAlex Elder 	ret = rbd_dev_refresh(rbd_dev, NULL);
2616b813623aSAlex Elder 
2617b813623aSAlex Elder 	return ret < 0 ? ret : size;
2618dfc5606dSYehuda Sadeh }
2619602adf40SYehuda Sadeh 
2620dfc5606dSYehuda Sadeh static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
262134b13184SAlex Elder static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL);
2622dfc5606dSYehuda Sadeh static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
2623dfc5606dSYehuda Sadeh static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
2624dfc5606dSYehuda Sadeh static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
26259bb2f334SAlex Elder static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
2626dfc5606dSYehuda Sadeh static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
2627589d30e0SAlex Elder static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL);
2628dfc5606dSYehuda Sadeh static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
2629dfc5606dSYehuda Sadeh static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
263086b00e0dSAlex Elder static DEVICE_ATTR(parent, S_IRUGO, rbd_parent_show, NULL);
2631dfc5606dSYehuda Sadeh 
2632dfc5606dSYehuda Sadeh static struct attribute *rbd_attrs[] = {
2633dfc5606dSYehuda Sadeh 	&dev_attr_size.attr,
263434b13184SAlex Elder 	&dev_attr_features.attr,
2635dfc5606dSYehuda Sadeh 	&dev_attr_major.attr,
2636dfc5606dSYehuda Sadeh 	&dev_attr_client_id.attr,
2637dfc5606dSYehuda Sadeh 	&dev_attr_pool.attr,
26389bb2f334SAlex Elder 	&dev_attr_pool_id.attr,
2639dfc5606dSYehuda Sadeh 	&dev_attr_name.attr,
2640589d30e0SAlex Elder 	&dev_attr_image_id.attr,
2641dfc5606dSYehuda Sadeh 	&dev_attr_current_snap.attr,
264286b00e0dSAlex Elder 	&dev_attr_parent.attr,
2643dfc5606dSYehuda Sadeh 	&dev_attr_refresh.attr,
2644dfc5606dSYehuda Sadeh 	NULL
2645dfc5606dSYehuda Sadeh };
2646dfc5606dSYehuda Sadeh 
2647dfc5606dSYehuda Sadeh static struct attribute_group rbd_attr_group = {
2648dfc5606dSYehuda Sadeh 	.attrs = rbd_attrs,
2649dfc5606dSYehuda Sadeh };
2650dfc5606dSYehuda Sadeh 
2651dfc5606dSYehuda Sadeh static const struct attribute_group *rbd_attr_groups[] = {
2652dfc5606dSYehuda Sadeh 	&rbd_attr_group,
2653dfc5606dSYehuda Sadeh 	NULL
2654dfc5606dSYehuda Sadeh };
2655dfc5606dSYehuda Sadeh 
/*
 * Release callback installed in rbd_device_type.  It deliberately
 * does nothing.
 * NOTE(review): presumably the rbd_device itself is torn down
 * elsewhere (see rbd_dev_destroy()) -- confirm against the callers.
 */
static void rbd_sysfs_dev_release(struct device *dev)
{
	/* nothing to do */
}
2659dfc5606dSYehuda Sadeh 
2660dfc5606dSYehuda Sadeh static struct device_type rbd_device_type = {
2661dfc5606dSYehuda Sadeh 	.name		= "rbd",
2662dfc5606dSYehuda Sadeh 	.groups		= rbd_attr_groups,
2663dfc5606dSYehuda Sadeh 	.release	= rbd_sysfs_dev_release,
2664dfc5606dSYehuda Sadeh };
2665dfc5606dSYehuda Sadeh 
2666dfc5606dSYehuda Sadeh 
2667dfc5606dSYehuda Sadeh /*
2668dfc5606dSYehuda Sadeh   sysfs - snapshots
2669dfc5606dSYehuda Sadeh */
2670dfc5606dSYehuda Sadeh 
2671dfc5606dSYehuda Sadeh static ssize_t rbd_snap_size_show(struct device *dev,
2672dfc5606dSYehuda Sadeh 				  struct device_attribute *attr,
2673dfc5606dSYehuda Sadeh 				  char *buf)
2674dfc5606dSYehuda Sadeh {
2675dfc5606dSYehuda Sadeh 	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2676dfc5606dSYehuda Sadeh 
26773591538fSJosh Durgin 	return sprintf(buf, "%llu\n", (unsigned long long)snap->size);
2678dfc5606dSYehuda Sadeh }
2679dfc5606dSYehuda Sadeh 
2680dfc5606dSYehuda Sadeh static ssize_t rbd_snap_id_show(struct device *dev,
2681dfc5606dSYehuda Sadeh 				struct device_attribute *attr,
2682dfc5606dSYehuda Sadeh 				char *buf)
2683dfc5606dSYehuda Sadeh {
2684dfc5606dSYehuda Sadeh 	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2685dfc5606dSYehuda Sadeh 
2686593a9e7bSAlex Elder 	return sprintf(buf, "%llu\n", (unsigned long long)snap->id);
2687dfc5606dSYehuda Sadeh }
2688dfc5606dSYehuda Sadeh 
268934b13184SAlex Elder static ssize_t rbd_snap_features_show(struct device *dev,
269034b13184SAlex Elder 				struct device_attribute *attr,
269134b13184SAlex Elder 				char *buf)
269234b13184SAlex Elder {
269334b13184SAlex Elder 	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
269434b13184SAlex Elder 
269534b13184SAlex Elder 	return sprintf(buf, "0x%016llx\n",
269634b13184SAlex Elder 			(unsigned long long) snap->features);
269734b13184SAlex Elder }
269834b13184SAlex Elder 
2699dfc5606dSYehuda Sadeh static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL);
2700dfc5606dSYehuda Sadeh static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);
270134b13184SAlex Elder static DEVICE_ATTR(snap_features, S_IRUGO, rbd_snap_features_show, NULL);
2702dfc5606dSYehuda Sadeh 
2703dfc5606dSYehuda Sadeh static struct attribute *rbd_snap_attrs[] = {
2704dfc5606dSYehuda Sadeh 	&dev_attr_snap_size.attr,
2705dfc5606dSYehuda Sadeh 	&dev_attr_snap_id.attr,
270634b13184SAlex Elder 	&dev_attr_snap_features.attr,
2707dfc5606dSYehuda Sadeh 	NULL,
2708dfc5606dSYehuda Sadeh };
2709dfc5606dSYehuda Sadeh 
2710dfc5606dSYehuda Sadeh static struct attribute_group rbd_snap_attr_group = {
2711dfc5606dSYehuda Sadeh 	.attrs = rbd_snap_attrs,
2712dfc5606dSYehuda Sadeh };
2713dfc5606dSYehuda Sadeh 
2714dfc5606dSYehuda Sadeh static void rbd_snap_dev_release(struct device *dev)
2715dfc5606dSYehuda Sadeh {
2716dfc5606dSYehuda Sadeh 	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2717dfc5606dSYehuda Sadeh 	kfree(snap->name);
2718dfc5606dSYehuda Sadeh 	kfree(snap);
2719dfc5606dSYehuda Sadeh }
2720dfc5606dSYehuda Sadeh 
2721dfc5606dSYehuda Sadeh static const struct attribute_group *rbd_snap_attr_groups[] = {
2722dfc5606dSYehuda Sadeh 	&rbd_snap_attr_group,
2723dfc5606dSYehuda Sadeh 	NULL
2724dfc5606dSYehuda Sadeh };
2725dfc5606dSYehuda Sadeh 
2726dfc5606dSYehuda Sadeh static struct device_type rbd_snap_device_type = {
2727dfc5606dSYehuda Sadeh 	.groups		= rbd_snap_attr_groups,
2728dfc5606dSYehuda Sadeh 	.release	= rbd_snap_dev_release,
2729dfc5606dSYehuda Sadeh };
2730dfc5606dSYehuda Sadeh 
27318b8fb99cSAlex Elder static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec)
27328b8fb99cSAlex Elder {
27338b8fb99cSAlex Elder 	kref_get(&spec->kref);
27348b8fb99cSAlex Elder 
27358b8fb99cSAlex Elder 	return spec;
27368b8fb99cSAlex Elder }
27378b8fb99cSAlex Elder 
27388b8fb99cSAlex Elder static void rbd_spec_free(struct kref *kref);
27398b8fb99cSAlex Elder static void rbd_spec_put(struct rbd_spec *spec)
27408b8fb99cSAlex Elder {
27418b8fb99cSAlex Elder 	if (spec)
27428b8fb99cSAlex Elder 		kref_put(&spec->kref, rbd_spec_free);
27438b8fb99cSAlex Elder }
27448b8fb99cSAlex Elder 
27458b8fb99cSAlex Elder static struct rbd_spec *rbd_spec_alloc(void)
27468b8fb99cSAlex Elder {
27478b8fb99cSAlex Elder 	struct rbd_spec *spec;
27488b8fb99cSAlex Elder 
27498b8fb99cSAlex Elder 	spec = kzalloc(sizeof (*spec), GFP_KERNEL);
27508b8fb99cSAlex Elder 	if (!spec)
27518b8fb99cSAlex Elder 		return NULL;
27528b8fb99cSAlex Elder 	kref_init(&spec->kref);
27538b8fb99cSAlex Elder 
27548b8fb99cSAlex Elder 	return spec;
27558b8fb99cSAlex Elder }
27568b8fb99cSAlex Elder 
27578b8fb99cSAlex Elder static void rbd_spec_free(struct kref *kref)
27588b8fb99cSAlex Elder {
27598b8fb99cSAlex Elder 	struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref);
27608b8fb99cSAlex Elder 
27618b8fb99cSAlex Elder 	kfree(spec->pool_name);
27628b8fb99cSAlex Elder 	kfree(spec->image_id);
27638b8fb99cSAlex Elder 	kfree(spec->image_name);
27648b8fb99cSAlex Elder 	kfree(spec->snap_name);
27658b8fb99cSAlex Elder 	kfree(spec);
27668b8fb99cSAlex Elder }
27678b8fb99cSAlex Elder 
2768cc344fa1SAlex Elder static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
2769c53d5893SAlex Elder 				struct rbd_spec *spec)
2770c53d5893SAlex Elder {
2771c53d5893SAlex Elder 	struct rbd_device *rbd_dev;
2772c53d5893SAlex Elder 
2773c53d5893SAlex Elder 	rbd_dev = kzalloc(sizeof (*rbd_dev), GFP_KERNEL);
2774c53d5893SAlex Elder 	if (!rbd_dev)
2775c53d5893SAlex Elder 		return NULL;
2776c53d5893SAlex Elder 
2777c53d5893SAlex Elder 	spin_lock_init(&rbd_dev->lock);
27786d292906SAlex Elder 	rbd_dev->flags = 0;
2779c53d5893SAlex Elder 	INIT_LIST_HEAD(&rbd_dev->node);
2780c53d5893SAlex Elder 	INIT_LIST_HEAD(&rbd_dev->snaps);
2781c53d5893SAlex Elder 	init_rwsem(&rbd_dev->header_rwsem);
2782c53d5893SAlex Elder 
2783c53d5893SAlex Elder 	rbd_dev->spec = spec;
2784c53d5893SAlex Elder 	rbd_dev->rbd_client = rbdc;
2785c53d5893SAlex Elder 
27860903e875SAlex Elder 	/* Initialize the layout used for all rbd requests */
27870903e875SAlex Elder 
27880903e875SAlex Elder 	rbd_dev->layout.fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
27890903e875SAlex Elder 	rbd_dev->layout.fl_stripe_count = cpu_to_le32(1);
27900903e875SAlex Elder 	rbd_dev->layout.fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
27910903e875SAlex Elder 	rbd_dev->layout.fl_pg_pool = cpu_to_le32((u32) spec->pool_id);
27920903e875SAlex Elder 
2793c53d5893SAlex Elder 	return rbd_dev;
2794c53d5893SAlex Elder }
2795c53d5893SAlex Elder 
2796c53d5893SAlex Elder static void rbd_dev_destroy(struct rbd_device *rbd_dev)
2797c53d5893SAlex Elder {
279886b00e0dSAlex Elder 	rbd_spec_put(rbd_dev->parent_spec);
2799c53d5893SAlex Elder 	kfree(rbd_dev->header_name);
2800c53d5893SAlex Elder 	rbd_put_client(rbd_dev->rbd_client);
2801c53d5893SAlex Elder 	rbd_spec_put(rbd_dev->spec);
2802c53d5893SAlex Elder 	kfree(rbd_dev);
2803c53d5893SAlex Elder }
2804c53d5893SAlex Elder 
2805304f6808SAlex Elder static bool rbd_snap_registered(struct rbd_snap *snap)
2806304f6808SAlex Elder {
2807304f6808SAlex Elder 	bool ret = snap->dev.type == &rbd_snap_device_type;
2808304f6808SAlex Elder 	bool reg = device_is_registered(&snap->dev);
2809304f6808SAlex Elder 
2810304f6808SAlex Elder 	rbd_assert(!ret ^ reg);
2811304f6808SAlex Elder 
2812304f6808SAlex Elder 	return ret;
2813304f6808SAlex Elder }
2814304f6808SAlex Elder 
281541f38c2bSAlex Elder static void rbd_remove_snap_dev(struct rbd_snap *snap)
2816dfc5606dSYehuda Sadeh {
2817dfc5606dSYehuda Sadeh 	list_del(&snap->node);
2818304f6808SAlex Elder 	if (device_is_registered(&snap->dev))
2819dfc5606dSYehuda Sadeh 		device_unregister(&snap->dev);
2820dfc5606dSYehuda Sadeh }
2821dfc5606dSYehuda Sadeh 
282214e7085dSAlex Elder static int rbd_register_snap_dev(struct rbd_snap *snap,
2823dfc5606dSYehuda Sadeh 				  struct device *parent)
2824dfc5606dSYehuda Sadeh {
2825dfc5606dSYehuda Sadeh 	struct device *dev = &snap->dev;
2826dfc5606dSYehuda Sadeh 	int ret;
2827dfc5606dSYehuda Sadeh 
2828dfc5606dSYehuda Sadeh 	dev->type = &rbd_snap_device_type;
2829dfc5606dSYehuda Sadeh 	dev->parent = parent;
2830dfc5606dSYehuda Sadeh 	dev->release = rbd_snap_dev_release;
2831d4b125e9SAlex Elder 	dev_set_name(dev, "%s%s", RBD_SNAP_DEV_NAME_PREFIX, snap->name);
2832304f6808SAlex Elder 	dout("%s: registering device for snapshot %s\n", __func__, snap->name);
2833304f6808SAlex Elder 
2834dfc5606dSYehuda Sadeh 	ret = device_register(dev);
2835dfc5606dSYehuda Sadeh 
2836dfc5606dSYehuda Sadeh 	return ret;
2837dfc5606dSYehuda Sadeh }
2838dfc5606dSYehuda Sadeh 
28394e891e0aSAlex Elder static struct rbd_snap *__rbd_add_snap_dev(struct rbd_device *rbd_dev,
2840c8d18425SAlex Elder 						const char *snap_name,
284134b13184SAlex Elder 						u64 snap_id, u64 snap_size,
284234b13184SAlex Elder 						u64 snap_features)
2843dfc5606dSYehuda Sadeh {
28444e891e0aSAlex Elder 	struct rbd_snap *snap;
2845dfc5606dSYehuda Sadeh 	int ret;
28464e891e0aSAlex Elder 
28474e891e0aSAlex Elder 	snap = kzalloc(sizeof (*snap), GFP_KERNEL);
2848dfc5606dSYehuda Sadeh 	if (!snap)
28494e891e0aSAlex Elder 		return ERR_PTR(-ENOMEM);
28504e891e0aSAlex Elder 
28514e891e0aSAlex Elder 	ret = -ENOMEM;
2852c8d18425SAlex Elder 	snap->name = kstrdup(snap_name, GFP_KERNEL);
28534e891e0aSAlex Elder 	if (!snap->name)
28544e891e0aSAlex Elder 		goto err;
28554e891e0aSAlex Elder 
2856c8d18425SAlex Elder 	snap->id = snap_id;
2857c8d18425SAlex Elder 	snap->size = snap_size;
285834b13184SAlex Elder 	snap->features = snap_features;
28594e891e0aSAlex Elder 
28604e891e0aSAlex Elder 	return snap;
28614e891e0aSAlex Elder 
2862dfc5606dSYehuda Sadeh err:
2863dfc5606dSYehuda Sadeh 	kfree(snap->name);
2864dfc5606dSYehuda Sadeh 	kfree(snap);
28654e891e0aSAlex Elder 
28664e891e0aSAlex Elder 	return ERR_PTR(ret);
2867dfc5606dSYehuda Sadeh }
2868dfc5606dSYehuda Sadeh 
2869cd892126SAlex Elder static char *rbd_dev_v1_snap_info(struct rbd_device *rbd_dev, u32 which,
2870cd892126SAlex Elder 		u64 *snap_size, u64 *snap_features)
2871cd892126SAlex Elder {
2872cd892126SAlex Elder 	char *snap_name;
2873cd892126SAlex Elder 
2874cd892126SAlex Elder 	rbd_assert(which < rbd_dev->header.snapc->num_snaps);
2875cd892126SAlex Elder 
2876cd892126SAlex Elder 	*snap_size = rbd_dev->header.snap_sizes[which];
2877cd892126SAlex Elder 	*snap_features = 0;	/* No features for v1 */
2878cd892126SAlex Elder 
2879cd892126SAlex Elder 	/* Skip over names until we find the one we are looking for */
2880cd892126SAlex Elder 
2881cd892126SAlex Elder 	snap_name = rbd_dev->header.snap_names;
2882cd892126SAlex Elder 	while (which--)
2883cd892126SAlex Elder 		snap_name += strlen(snap_name) + 1;
2884cd892126SAlex Elder 
2885cd892126SAlex Elder 	return snap_name;
2886cd892126SAlex Elder }
2887cd892126SAlex Elder 
2888dfc5606dSYehuda Sadeh /*
28899d475de5SAlex Elder  * Get the size and object order for an image snapshot, or if
28909d475de5SAlex Elder  * snap_id is CEPH_NOSNAP, gets this information for the base
28919d475de5SAlex Elder  * image.
28929d475de5SAlex Elder  */
28939d475de5SAlex Elder static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
28949d475de5SAlex Elder 				u8 *order, u64 *snap_size)
28959d475de5SAlex Elder {
28969d475de5SAlex Elder 	__le64 snapid = cpu_to_le64(snap_id);
28979d475de5SAlex Elder 	int ret;
28989d475de5SAlex Elder 	struct {
28999d475de5SAlex Elder 		u8 order;
29009d475de5SAlex Elder 		__le64 size;
29019d475de5SAlex Elder 	} __attribute__ ((packed)) size_buf = { 0 };
29029d475de5SAlex Elder 
290336be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
29049d475de5SAlex Elder 				"rbd", "get_size",
29059d475de5SAlex Elder 				(char *) &snapid, sizeof (snapid),
290607b2391fSAlex Elder 				(char *) &size_buf, sizeof (size_buf), NULL);
290736be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
29089d475de5SAlex Elder 	if (ret < 0)
29099d475de5SAlex Elder 		return ret;
29109d475de5SAlex Elder 
29119d475de5SAlex Elder 	*order = size_buf.order;
29129d475de5SAlex Elder 	*snap_size = le64_to_cpu(size_buf.size);
29139d475de5SAlex Elder 
29149d475de5SAlex Elder 	dout("  snap_id 0x%016llx order = %u, snap_size = %llu\n",
29159d475de5SAlex Elder 		(unsigned long long) snap_id, (unsigned int) *order,
29169d475de5SAlex Elder 		(unsigned long long) *snap_size);
29179d475de5SAlex Elder 
29189d475de5SAlex Elder 	return 0;
29199d475de5SAlex Elder }
29209d475de5SAlex Elder 
29219d475de5SAlex Elder static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
29229d475de5SAlex Elder {
29239d475de5SAlex Elder 	return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
29249d475de5SAlex Elder 					&rbd_dev->header.obj_order,
29259d475de5SAlex Elder 					&rbd_dev->header.image_size);
29269d475de5SAlex Elder }
29279d475de5SAlex Elder 
29281e130199SAlex Elder static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
29291e130199SAlex Elder {
29301e130199SAlex Elder 	void *reply_buf;
29311e130199SAlex Elder 	int ret;
29321e130199SAlex Elder 	void *p;
29331e130199SAlex Elder 
29341e130199SAlex Elder 	reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL);
29351e130199SAlex Elder 	if (!reply_buf)
29361e130199SAlex Elder 		return -ENOMEM;
29371e130199SAlex Elder 
293836be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
29391e130199SAlex Elder 				"rbd", "get_object_prefix",
29401e130199SAlex Elder 				NULL, 0,
294107b2391fSAlex Elder 				reply_buf, RBD_OBJ_PREFIX_LEN_MAX, NULL);
294236be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
29431e130199SAlex Elder 	if (ret < 0)
29441e130199SAlex Elder 		goto out;
29451e130199SAlex Elder 
29461e130199SAlex Elder 	p = reply_buf;
29471e130199SAlex Elder 	rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
29481e130199SAlex Elder 						p + RBD_OBJ_PREFIX_LEN_MAX,
29491e130199SAlex Elder 						NULL, GFP_NOIO);
29501e130199SAlex Elder 
29511e130199SAlex Elder 	if (IS_ERR(rbd_dev->header.object_prefix)) {
29521e130199SAlex Elder 		ret = PTR_ERR(rbd_dev->header.object_prefix);
29531e130199SAlex Elder 		rbd_dev->header.object_prefix = NULL;
29541e130199SAlex Elder 	} else {
29551e130199SAlex Elder 		dout("  object_prefix = %s\n", rbd_dev->header.object_prefix);
29561e130199SAlex Elder 	}
29571e130199SAlex Elder 
29581e130199SAlex Elder out:
29591e130199SAlex Elder 	kfree(reply_buf);
29601e130199SAlex Elder 
29611e130199SAlex Elder 	return ret;
29621e130199SAlex Elder }
29631e130199SAlex Elder 
2964b1b5402aSAlex Elder static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
2965b1b5402aSAlex Elder 		u64 *snap_features)
2966b1b5402aSAlex Elder {
2967b1b5402aSAlex Elder 	__le64 snapid = cpu_to_le64(snap_id);
2968b1b5402aSAlex Elder 	struct {
2969b1b5402aSAlex Elder 		__le64 features;
2970b1b5402aSAlex Elder 		__le64 incompat;
2971b1b5402aSAlex Elder 	} features_buf = { 0 };
2972d889140cSAlex Elder 	u64 incompat;
2973b1b5402aSAlex Elder 	int ret;
2974b1b5402aSAlex Elder 
297536be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
2976b1b5402aSAlex Elder 				"rbd", "get_features",
2977b1b5402aSAlex Elder 				(char *) &snapid, sizeof (snapid),
2978b1b5402aSAlex Elder 				(char *) &features_buf, sizeof (features_buf),
297907b2391fSAlex Elder 				NULL);
298036be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
2981b1b5402aSAlex Elder 	if (ret < 0)
2982b1b5402aSAlex Elder 		return ret;
2983d889140cSAlex Elder 
2984d889140cSAlex Elder 	incompat = le64_to_cpu(features_buf.incompat);
29855cbf6f12SAlex Elder 	if (incompat & ~RBD_FEATURES_SUPPORTED)
2986b8f5c6edSAlex Elder 		return -ENXIO;
2987d889140cSAlex Elder 
2988b1b5402aSAlex Elder 	*snap_features = le64_to_cpu(features_buf.features);
2989b1b5402aSAlex Elder 
2990b1b5402aSAlex Elder 	dout("  snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
2991b1b5402aSAlex Elder 		(unsigned long long) snap_id,
2992b1b5402aSAlex Elder 		(unsigned long long) *snap_features,
2993b1b5402aSAlex Elder 		(unsigned long long) le64_to_cpu(features_buf.incompat));
2994b1b5402aSAlex Elder 
2995b1b5402aSAlex Elder 	return 0;
2996b1b5402aSAlex Elder }
2997b1b5402aSAlex Elder 
2998b1b5402aSAlex Elder static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
2999b1b5402aSAlex Elder {
3000b1b5402aSAlex Elder 	return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
3001b1b5402aSAlex Elder 						&rbd_dev->header.features);
3002b1b5402aSAlex Elder }
3003b1b5402aSAlex Elder 
300486b00e0dSAlex Elder static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
300586b00e0dSAlex Elder {
300686b00e0dSAlex Elder 	struct rbd_spec *parent_spec;
300786b00e0dSAlex Elder 	size_t size;
300886b00e0dSAlex Elder 	void *reply_buf = NULL;
300986b00e0dSAlex Elder 	__le64 snapid;
301086b00e0dSAlex Elder 	void *p;
301186b00e0dSAlex Elder 	void *end;
301286b00e0dSAlex Elder 	char *image_id;
301386b00e0dSAlex Elder 	u64 overlap;
301486b00e0dSAlex Elder 	int ret;
301586b00e0dSAlex Elder 
301686b00e0dSAlex Elder 	parent_spec = rbd_spec_alloc();
301786b00e0dSAlex Elder 	if (!parent_spec)
301886b00e0dSAlex Elder 		return -ENOMEM;
301986b00e0dSAlex Elder 
302086b00e0dSAlex Elder 	size = sizeof (__le64) +				/* pool_id */
302186b00e0dSAlex Elder 		sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX +	/* image_id */
302286b00e0dSAlex Elder 		sizeof (__le64) +				/* snap_id */
302386b00e0dSAlex Elder 		sizeof (__le64);				/* overlap */
302486b00e0dSAlex Elder 	reply_buf = kmalloc(size, GFP_KERNEL);
302586b00e0dSAlex Elder 	if (!reply_buf) {
302686b00e0dSAlex Elder 		ret = -ENOMEM;
302786b00e0dSAlex Elder 		goto out_err;
302886b00e0dSAlex Elder 	}
302986b00e0dSAlex Elder 
303086b00e0dSAlex Elder 	snapid = cpu_to_le64(CEPH_NOSNAP);
303136be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
303286b00e0dSAlex Elder 				"rbd", "get_parent",
303386b00e0dSAlex Elder 				(char *) &snapid, sizeof (snapid),
303407b2391fSAlex Elder 				(char *) reply_buf, size, NULL);
303536be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
303686b00e0dSAlex Elder 	if (ret < 0)
303786b00e0dSAlex Elder 		goto out_err;
303886b00e0dSAlex Elder 
303986b00e0dSAlex Elder 	ret = -ERANGE;
304086b00e0dSAlex Elder 	p = reply_buf;
304186b00e0dSAlex Elder 	end = (char *) reply_buf + size;
304286b00e0dSAlex Elder 	ceph_decode_64_safe(&p, end, parent_spec->pool_id, out_err);
304386b00e0dSAlex Elder 	if (parent_spec->pool_id == CEPH_NOPOOL)
304486b00e0dSAlex Elder 		goto out;	/* No parent?  No problem. */
304586b00e0dSAlex Elder 
30460903e875SAlex Elder 	/* The ceph file layout needs to fit pool id in 32 bits */
30470903e875SAlex Elder 
30480903e875SAlex Elder 	ret = -EIO;
30490903e875SAlex Elder 	if (WARN_ON(parent_spec->pool_id > (u64) U32_MAX))
30500903e875SAlex Elder 		goto out;
30510903e875SAlex Elder 
3052979ed480SAlex Elder 	image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
305386b00e0dSAlex Elder 	if (IS_ERR(image_id)) {
305486b00e0dSAlex Elder 		ret = PTR_ERR(image_id);
305586b00e0dSAlex Elder 		goto out_err;
305686b00e0dSAlex Elder 	}
305786b00e0dSAlex Elder 	parent_spec->image_id = image_id;
305886b00e0dSAlex Elder 	ceph_decode_64_safe(&p, end, parent_spec->snap_id, out_err);
305986b00e0dSAlex Elder 	ceph_decode_64_safe(&p, end, overlap, out_err);
306086b00e0dSAlex Elder 
306186b00e0dSAlex Elder 	rbd_dev->parent_overlap = overlap;
306286b00e0dSAlex Elder 	rbd_dev->parent_spec = parent_spec;
306386b00e0dSAlex Elder 	parent_spec = NULL;	/* rbd_dev now owns this */
306486b00e0dSAlex Elder out:
306586b00e0dSAlex Elder 	ret = 0;
306686b00e0dSAlex Elder out_err:
306786b00e0dSAlex Elder 	kfree(reply_buf);
306886b00e0dSAlex Elder 	rbd_spec_put(parent_spec);
306986b00e0dSAlex Elder 
307086b00e0dSAlex Elder 	return ret;
307186b00e0dSAlex Elder }
307286b00e0dSAlex Elder 
30739e15b77dSAlex Elder static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
30749e15b77dSAlex Elder {
30759e15b77dSAlex Elder 	size_t image_id_size;
30769e15b77dSAlex Elder 	char *image_id;
30779e15b77dSAlex Elder 	void *p;
30789e15b77dSAlex Elder 	void *end;
30799e15b77dSAlex Elder 	size_t size;
30809e15b77dSAlex Elder 	void *reply_buf = NULL;
30819e15b77dSAlex Elder 	size_t len = 0;
30829e15b77dSAlex Elder 	char *image_name = NULL;
30839e15b77dSAlex Elder 	int ret;
30849e15b77dSAlex Elder 
30859e15b77dSAlex Elder 	rbd_assert(!rbd_dev->spec->image_name);
30869e15b77dSAlex Elder 
308769e7a02fSAlex Elder 	len = strlen(rbd_dev->spec->image_id);
308869e7a02fSAlex Elder 	image_id_size = sizeof (__le32) + len;
30899e15b77dSAlex Elder 	image_id = kmalloc(image_id_size, GFP_KERNEL);
30909e15b77dSAlex Elder 	if (!image_id)
30919e15b77dSAlex Elder 		return NULL;
30929e15b77dSAlex Elder 
30939e15b77dSAlex Elder 	p = image_id;
30949e15b77dSAlex Elder 	end = (char *) image_id + image_id_size;
309569e7a02fSAlex Elder 	ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32) len);
30969e15b77dSAlex Elder 
30979e15b77dSAlex Elder 	size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX;
30989e15b77dSAlex Elder 	reply_buf = kmalloc(size, GFP_KERNEL);
30999e15b77dSAlex Elder 	if (!reply_buf)
31009e15b77dSAlex Elder 		goto out;
31019e15b77dSAlex Elder 
310236be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, RBD_DIRECTORY,
31039e15b77dSAlex Elder 				"rbd", "dir_get_name",
31049e15b77dSAlex Elder 				image_id, image_id_size,
310507b2391fSAlex Elder 				(char *) reply_buf, size, NULL);
31069e15b77dSAlex Elder 	if (ret < 0)
31079e15b77dSAlex Elder 		goto out;
31089e15b77dSAlex Elder 	p = reply_buf;
31099e15b77dSAlex Elder 	end = (char *) reply_buf + size;
31109e15b77dSAlex Elder 	image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
31119e15b77dSAlex Elder 	if (IS_ERR(image_name))
31129e15b77dSAlex Elder 		image_name = NULL;
31139e15b77dSAlex Elder 	else
31149e15b77dSAlex Elder 		dout("%s: name is %s len is %zd\n", __func__, image_name, len);
31159e15b77dSAlex Elder out:
31169e15b77dSAlex Elder 	kfree(reply_buf);
31179e15b77dSAlex Elder 	kfree(image_id);
31189e15b77dSAlex Elder 
31199e15b77dSAlex Elder 	return image_name;
31209e15b77dSAlex Elder }
31219e15b77dSAlex Elder 
31229e15b77dSAlex Elder /*
31239e15b77dSAlex Elder  * When a parent image gets probed, we only have the pool, image,
31249e15b77dSAlex Elder  * and snapshot ids but not the names of any of them.  This call
31259e15b77dSAlex Elder  * is made later to fill in those names.  It has to be done after
31269e15b77dSAlex Elder  * rbd_dev_snaps_update() has completed because some of the
31279e15b77dSAlex Elder  * information (in particular, snapshot name) is not available
31289e15b77dSAlex Elder  * until then.
31299e15b77dSAlex Elder  */
31309e15b77dSAlex Elder static int rbd_dev_probe_update_spec(struct rbd_device *rbd_dev)
31319e15b77dSAlex Elder {
31329e15b77dSAlex Elder 	struct ceph_osd_client *osdc;
31339e15b77dSAlex Elder 	const char *name;
31349e15b77dSAlex Elder 	void *reply_buf = NULL;
31359e15b77dSAlex Elder 	int ret;
31369e15b77dSAlex Elder 
31379e15b77dSAlex Elder 	if (rbd_dev->spec->pool_name)
31389e15b77dSAlex Elder 		return 0;	/* Already have the names */
31399e15b77dSAlex Elder 
31409e15b77dSAlex Elder 	/* Look up the pool name */
31419e15b77dSAlex Elder 
31429e15b77dSAlex Elder 	osdc = &rbd_dev->rbd_client->client->osdc;
31439e15b77dSAlex Elder 	name = ceph_pg_pool_name_by_id(osdc->osdmap, rbd_dev->spec->pool_id);
3144935dc89fSAlex Elder 	if (!name) {
3145935dc89fSAlex Elder 		rbd_warn(rbd_dev, "there is no pool with id %llu",
3146935dc89fSAlex Elder 			rbd_dev->spec->pool_id);	/* Really a BUG() */
3147935dc89fSAlex Elder 		return -EIO;
3148935dc89fSAlex Elder 	}
31499e15b77dSAlex Elder 
31509e15b77dSAlex Elder 	rbd_dev->spec->pool_name = kstrdup(name, GFP_KERNEL);
31519e15b77dSAlex Elder 	if (!rbd_dev->spec->pool_name)
31529e15b77dSAlex Elder 		return -ENOMEM;
31539e15b77dSAlex Elder 
31549e15b77dSAlex Elder 	/* Fetch the image name; tolerate failure here */
31559e15b77dSAlex Elder 
31569e15b77dSAlex Elder 	name = rbd_dev_image_name(rbd_dev);
315769e7a02fSAlex Elder 	if (name)
31589e15b77dSAlex Elder 		rbd_dev->spec->image_name = (char *) name;
315969e7a02fSAlex Elder 	else
316006ecc6cbSAlex Elder 		rbd_warn(rbd_dev, "unable to get image name");
31619e15b77dSAlex Elder 
31629e15b77dSAlex Elder 	/* Look up the snapshot name. */
31639e15b77dSAlex Elder 
31649e15b77dSAlex Elder 	name = rbd_snap_name(rbd_dev, rbd_dev->spec->snap_id);
31659e15b77dSAlex Elder 	if (!name) {
3166935dc89fSAlex Elder 		rbd_warn(rbd_dev, "no snapshot with id %llu",
3167935dc89fSAlex Elder 			rbd_dev->spec->snap_id);	/* Really a BUG() */
31689e15b77dSAlex Elder 		ret = -EIO;
31699e15b77dSAlex Elder 		goto out_err;
31709e15b77dSAlex Elder 	}
31719e15b77dSAlex Elder 	rbd_dev->spec->snap_name = kstrdup(name, GFP_KERNEL);
31729e15b77dSAlex Elder 	if(!rbd_dev->spec->snap_name)
31739e15b77dSAlex Elder 		goto out_err;
31749e15b77dSAlex Elder 
31759e15b77dSAlex Elder 	return 0;
31769e15b77dSAlex Elder out_err:
31779e15b77dSAlex Elder 	kfree(reply_buf);
31789e15b77dSAlex Elder 	kfree(rbd_dev->spec->pool_name);
31799e15b77dSAlex Elder 	rbd_dev->spec->pool_name = NULL;
31809e15b77dSAlex Elder 
31819e15b77dSAlex Elder 	return ret;
31829e15b77dSAlex Elder }
31839e15b77dSAlex Elder 
31846e14b1a6SAlex Elder static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev, u64 *ver)
318535d489f9SAlex Elder {
318635d489f9SAlex Elder 	size_t size;
318735d489f9SAlex Elder 	int ret;
318835d489f9SAlex Elder 	void *reply_buf;
318935d489f9SAlex Elder 	void *p;
319035d489f9SAlex Elder 	void *end;
319135d489f9SAlex Elder 	u64 seq;
319235d489f9SAlex Elder 	u32 snap_count;
319335d489f9SAlex Elder 	struct ceph_snap_context *snapc;
319435d489f9SAlex Elder 	u32 i;
319535d489f9SAlex Elder 
319635d489f9SAlex Elder 	/*
319735d489f9SAlex Elder 	 * We'll need room for the seq value (maximum snapshot id),
319835d489f9SAlex Elder 	 * snapshot count, and array of that many snapshot ids.
319935d489f9SAlex Elder 	 * For now we have a fixed upper limit on the number we're
320035d489f9SAlex Elder 	 * prepared to receive.
320135d489f9SAlex Elder 	 */
320235d489f9SAlex Elder 	size = sizeof (__le64) + sizeof (__le32) +
320335d489f9SAlex Elder 			RBD_MAX_SNAP_COUNT * sizeof (__le64);
320435d489f9SAlex Elder 	reply_buf = kzalloc(size, GFP_KERNEL);
320535d489f9SAlex Elder 	if (!reply_buf)
320635d489f9SAlex Elder 		return -ENOMEM;
320735d489f9SAlex Elder 
320836be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
320935d489f9SAlex Elder 				"rbd", "get_snapcontext",
321035d489f9SAlex Elder 				NULL, 0,
321107b2391fSAlex Elder 				reply_buf, size, ver);
321236be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
321335d489f9SAlex Elder 	if (ret < 0)
321435d489f9SAlex Elder 		goto out;
321535d489f9SAlex Elder 
321635d489f9SAlex Elder 	ret = -ERANGE;
321735d489f9SAlex Elder 	p = reply_buf;
321835d489f9SAlex Elder 	end = (char *) reply_buf + size;
321935d489f9SAlex Elder 	ceph_decode_64_safe(&p, end, seq, out);
322035d489f9SAlex Elder 	ceph_decode_32_safe(&p, end, snap_count, out);
322135d489f9SAlex Elder 
322235d489f9SAlex Elder 	/*
322335d489f9SAlex Elder 	 * Make sure the reported number of snapshot ids wouldn't go
322435d489f9SAlex Elder 	 * beyond the end of our buffer.  But before checking that,
322535d489f9SAlex Elder 	 * make sure the computed size of the snapshot context we
322635d489f9SAlex Elder 	 * allocate is representable in a size_t.
322735d489f9SAlex Elder 	 */
322835d489f9SAlex Elder 	if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
322935d489f9SAlex Elder 				 / sizeof (u64)) {
323035d489f9SAlex Elder 		ret = -EINVAL;
323135d489f9SAlex Elder 		goto out;
323235d489f9SAlex Elder 	}
323335d489f9SAlex Elder 	if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
323435d489f9SAlex Elder 		goto out;
323535d489f9SAlex Elder 
323635d489f9SAlex Elder 	size = sizeof (struct ceph_snap_context) +
323735d489f9SAlex Elder 				snap_count * sizeof (snapc->snaps[0]);
323835d489f9SAlex Elder 	snapc = kmalloc(size, GFP_KERNEL);
323935d489f9SAlex Elder 	if (!snapc) {
324035d489f9SAlex Elder 		ret = -ENOMEM;
324135d489f9SAlex Elder 		goto out;
324235d489f9SAlex Elder 	}
324335d489f9SAlex Elder 
324435d489f9SAlex Elder 	atomic_set(&snapc->nref, 1);
324535d489f9SAlex Elder 	snapc->seq = seq;
324635d489f9SAlex Elder 	snapc->num_snaps = snap_count;
324735d489f9SAlex Elder 	for (i = 0; i < snap_count; i++)
324835d489f9SAlex Elder 		snapc->snaps[i] = ceph_decode_64(&p);
324935d489f9SAlex Elder 
325035d489f9SAlex Elder 	rbd_dev->header.snapc = snapc;
325135d489f9SAlex Elder 
325235d489f9SAlex Elder 	dout("  snap context seq = %llu, snap_count = %u\n",
325335d489f9SAlex Elder 		(unsigned long long) seq, (unsigned int) snap_count);
325435d489f9SAlex Elder 
325535d489f9SAlex Elder out:
325635d489f9SAlex Elder 	kfree(reply_buf);
325735d489f9SAlex Elder 
325835d489f9SAlex Elder 	return 0;
325935d489f9SAlex Elder }
326035d489f9SAlex Elder 
3261b8b1e2dbSAlex Elder static char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, u32 which)
3262b8b1e2dbSAlex Elder {
3263b8b1e2dbSAlex Elder 	size_t size;
3264b8b1e2dbSAlex Elder 	void *reply_buf;
3265b8b1e2dbSAlex Elder 	__le64 snap_id;
3266b8b1e2dbSAlex Elder 	int ret;
3267b8b1e2dbSAlex Elder 	void *p;
3268b8b1e2dbSAlex Elder 	void *end;
3269b8b1e2dbSAlex Elder 	char *snap_name;
3270b8b1e2dbSAlex Elder 
3271b8b1e2dbSAlex Elder 	size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
3272b8b1e2dbSAlex Elder 	reply_buf = kmalloc(size, GFP_KERNEL);
3273b8b1e2dbSAlex Elder 	if (!reply_buf)
3274b8b1e2dbSAlex Elder 		return ERR_PTR(-ENOMEM);
3275b8b1e2dbSAlex Elder 
3276b8b1e2dbSAlex Elder 	snap_id = cpu_to_le64(rbd_dev->header.snapc->snaps[which]);
327736be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
3278b8b1e2dbSAlex Elder 				"rbd", "get_snapshot_name",
3279b8b1e2dbSAlex Elder 				(char *) &snap_id, sizeof (snap_id),
328007b2391fSAlex Elder 				reply_buf, size, NULL);
328136be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
3282b8b1e2dbSAlex Elder 	if (ret < 0)
3283b8b1e2dbSAlex Elder 		goto out;
3284b8b1e2dbSAlex Elder 
3285b8b1e2dbSAlex Elder 	p = reply_buf;
3286b8b1e2dbSAlex Elder 	end = (char *) reply_buf + size;
3287e5c35534SAlex Elder 	snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
3288b8b1e2dbSAlex Elder 	if (IS_ERR(snap_name)) {
3289b8b1e2dbSAlex Elder 		ret = PTR_ERR(snap_name);
3290b8b1e2dbSAlex Elder 		goto out;
3291b8b1e2dbSAlex Elder 	} else {
3292b8b1e2dbSAlex Elder 		dout("  snap_id 0x%016llx snap_name = %s\n",
3293b8b1e2dbSAlex Elder 			(unsigned long long) le64_to_cpu(snap_id), snap_name);
3294b8b1e2dbSAlex Elder 	}
3295b8b1e2dbSAlex Elder 	kfree(reply_buf);
3296b8b1e2dbSAlex Elder 
3297b8b1e2dbSAlex Elder 	return snap_name;
3298b8b1e2dbSAlex Elder out:
3299b8b1e2dbSAlex Elder 	kfree(reply_buf);
3300b8b1e2dbSAlex Elder 
3301b8b1e2dbSAlex Elder 	return ERR_PTR(ret);
3302b8b1e2dbSAlex Elder }
3303b8b1e2dbSAlex Elder 
3304b8b1e2dbSAlex Elder static char *rbd_dev_v2_snap_info(struct rbd_device *rbd_dev, u32 which,
3305b8b1e2dbSAlex Elder 		u64 *snap_size, u64 *snap_features)
3306b8b1e2dbSAlex Elder {
3307e0b49868SAlex Elder 	u64 snap_id;
3308b8b1e2dbSAlex Elder 	u8 order;
3309b8b1e2dbSAlex Elder 	int ret;
3310b8b1e2dbSAlex Elder 
3311b8b1e2dbSAlex Elder 	snap_id = rbd_dev->header.snapc->snaps[which];
3312b8b1e2dbSAlex Elder 	ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, &order, snap_size);
3313b8b1e2dbSAlex Elder 	if (ret)
3314b8b1e2dbSAlex Elder 		return ERR_PTR(ret);
3315b8b1e2dbSAlex Elder 	ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, snap_features);
3316b8b1e2dbSAlex Elder 	if (ret)
3317b8b1e2dbSAlex Elder 		return ERR_PTR(ret);
3318b8b1e2dbSAlex Elder 
3319b8b1e2dbSAlex Elder 	return rbd_dev_v2_snap_name(rbd_dev, which);
3320b8b1e2dbSAlex Elder }
3321b8b1e2dbSAlex Elder 
3322b8b1e2dbSAlex Elder static char *rbd_dev_snap_info(struct rbd_device *rbd_dev, u32 which,
3323b8b1e2dbSAlex Elder 		u64 *snap_size, u64 *snap_features)
3324b8b1e2dbSAlex Elder {
3325b8b1e2dbSAlex Elder 	if (rbd_dev->image_format == 1)
3326b8b1e2dbSAlex Elder 		return rbd_dev_v1_snap_info(rbd_dev, which,
3327b8b1e2dbSAlex Elder 					snap_size, snap_features);
3328b8b1e2dbSAlex Elder 	if (rbd_dev->image_format == 2)
3329b8b1e2dbSAlex Elder 		return rbd_dev_v2_snap_info(rbd_dev, which,
3330b8b1e2dbSAlex Elder 					snap_size, snap_features);
3331b8b1e2dbSAlex Elder 	return ERR_PTR(-EINVAL);
3332b8b1e2dbSAlex Elder }
3333b8b1e2dbSAlex Elder 
3334117973fbSAlex Elder static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver)
3335117973fbSAlex Elder {
3336117973fbSAlex Elder 	int ret;
3337117973fbSAlex Elder 	__u8 obj_order;
3338117973fbSAlex Elder 
3339117973fbSAlex Elder 	down_write(&rbd_dev->header_rwsem);
3340117973fbSAlex Elder 
3341117973fbSAlex Elder 	/* Grab old order first, to see if it changes */
3342117973fbSAlex Elder 
3343117973fbSAlex Elder 	obj_order = rbd_dev->header.obj_order,
3344117973fbSAlex Elder 	ret = rbd_dev_v2_image_size(rbd_dev);
3345117973fbSAlex Elder 	if (ret)
3346117973fbSAlex Elder 		goto out;
3347117973fbSAlex Elder 	if (rbd_dev->header.obj_order != obj_order) {
3348117973fbSAlex Elder 		ret = -EIO;
3349117973fbSAlex Elder 		goto out;
3350117973fbSAlex Elder 	}
3351117973fbSAlex Elder 	rbd_update_mapping_size(rbd_dev);
3352117973fbSAlex Elder 
3353117973fbSAlex Elder 	ret = rbd_dev_v2_snap_context(rbd_dev, hver);
3354117973fbSAlex Elder 	dout("rbd_dev_v2_snap_context returned %d\n", ret);
3355117973fbSAlex Elder 	if (ret)
3356117973fbSAlex Elder 		goto out;
3357117973fbSAlex Elder 	ret = rbd_dev_snaps_update(rbd_dev);
3358117973fbSAlex Elder 	dout("rbd_dev_snaps_update returned %d\n", ret);
3359117973fbSAlex Elder 	if (ret)
3360117973fbSAlex Elder 		goto out;
3361117973fbSAlex Elder 	ret = rbd_dev_snaps_register(rbd_dev);
3362117973fbSAlex Elder 	dout("rbd_dev_snaps_register returned %d\n", ret);
3363117973fbSAlex Elder out:
3364117973fbSAlex Elder 	up_write(&rbd_dev->header_rwsem);
3365117973fbSAlex Elder 
3366117973fbSAlex Elder 	return ret;
3367117973fbSAlex Elder }
3368117973fbSAlex Elder 
33699d475de5SAlex Elder /*
337035938150SAlex Elder  * Scan the rbd device's current snapshot list and compare it to the
337135938150SAlex Elder  * newly-received snapshot context.  Remove any existing snapshots
337235938150SAlex Elder  * not present in the new snapshot context.  Add a new snapshot for
337335938150SAlex Elder  * any snaphots in the snapshot context not in the current list.
337435938150SAlex Elder  * And verify there are no changes to snapshots we already know
337535938150SAlex Elder  * about.
337635938150SAlex Elder  *
337735938150SAlex Elder  * Assumes the snapshots in the snapshot context are sorted by
337835938150SAlex Elder  * snapshot id, highest id first.  (Snapshots in the rbd_dev's list
337935938150SAlex Elder  * are also maintained in that order.)
3380dfc5606dSYehuda Sadeh  */
3381304f6808SAlex Elder static int rbd_dev_snaps_update(struct rbd_device *rbd_dev)
3382dfc5606dSYehuda Sadeh {
338335938150SAlex Elder 	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
338435938150SAlex Elder 	const u32 snap_count = snapc->num_snaps;
338535938150SAlex Elder 	struct list_head *head = &rbd_dev->snaps;
338635938150SAlex Elder 	struct list_head *links = head->next;
338735938150SAlex Elder 	u32 index = 0;
3388dfc5606dSYehuda Sadeh 
33899fcbb800SAlex Elder 	dout("%s: snap count is %u\n", __func__, (unsigned int) snap_count);
339035938150SAlex Elder 	while (index < snap_count || links != head) {
339135938150SAlex Elder 		u64 snap_id;
339235938150SAlex Elder 		struct rbd_snap *snap;
3393cd892126SAlex Elder 		char *snap_name;
3394cd892126SAlex Elder 		u64 snap_size = 0;
3395cd892126SAlex Elder 		u64 snap_features = 0;
3396dfc5606dSYehuda Sadeh 
339735938150SAlex Elder 		snap_id = index < snap_count ? snapc->snaps[index]
339835938150SAlex Elder 					     : CEPH_NOSNAP;
339935938150SAlex Elder 		snap = links != head ? list_entry(links, struct rbd_snap, node)
340035938150SAlex Elder 				     : NULL;
3401aafb230eSAlex Elder 		rbd_assert(!snap || snap->id != CEPH_NOSNAP);
3402dfc5606dSYehuda Sadeh 
340335938150SAlex Elder 		if (snap_id == CEPH_NOSNAP || (snap && snap->id > snap_id)) {
340435938150SAlex Elder 			struct list_head *next = links->next;
3405dfc5606dSYehuda Sadeh 
34066d292906SAlex Elder 			/*
34076d292906SAlex Elder 			 * A previously-existing snapshot is not in
34086d292906SAlex Elder 			 * the new snap context.
34096d292906SAlex Elder 			 *
34106d292906SAlex Elder 			 * If the now missing snapshot is the one the
34116d292906SAlex Elder 			 * image is mapped to, clear its exists flag
34126d292906SAlex Elder 			 * so we can avoid sending any more requests
34136d292906SAlex Elder 			 * to it.
34146d292906SAlex Elder 			 */
34150d7dbfceSAlex Elder 			if (rbd_dev->spec->snap_id == snap->id)
34166d292906SAlex Elder 				clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
341741f38c2bSAlex Elder 			rbd_remove_snap_dev(snap);
34189fcbb800SAlex Elder 			dout("%ssnap id %llu has been removed\n",
34190d7dbfceSAlex Elder 				rbd_dev->spec->snap_id == snap->id ?
34200d7dbfceSAlex Elder 							"mapped " : "",
34219fcbb800SAlex Elder 				(unsigned long long) snap->id);
3422dfc5606dSYehuda Sadeh 
342335938150SAlex Elder 			/* Done with this list entry; advance */
342435938150SAlex Elder 
342535938150SAlex Elder 			links = next;
342635938150SAlex Elder 			continue;
3427dfc5606dSYehuda Sadeh 		}
342835938150SAlex Elder 
3429b8b1e2dbSAlex Elder 		snap_name = rbd_dev_snap_info(rbd_dev, index,
3430cd892126SAlex Elder 					&snap_size, &snap_features);
3431cd892126SAlex Elder 		if (IS_ERR(snap_name))
3432cd892126SAlex Elder 			return PTR_ERR(snap_name);
3433cd892126SAlex Elder 
34349fcbb800SAlex Elder 		dout("entry %u: snap_id = %llu\n", (unsigned int) snap_count,
34359fcbb800SAlex Elder 			(unsigned long long) snap_id);
343635938150SAlex Elder 		if (!snap || (snap_id != CEPH_NOSNAP && snap->id < snap_id)) {
343735938150SAlex Elder 			struct rbd_snap *new_snap;
343835938150SAlex Elder 
343935938150SAlex Elder 			/* We haven't seen this snapshot before */
344035938150SAlex Elder 
3441c8d18425SAlex Elder 			new_snap = __rbd_add_snap_dev(rbd_dev, snap_name,
3442cd892126SAlex Elder 					snap_id, snap_size, snap_features);
34439fcbb800SAlex Elder 			if (IS_ERR(new_snap)) {
34449fcbb800SAlex Elder 				int err = PTR_ERR(new_snap);
34459fcbb800SAlex Elder 
34469fcbb800SAlex Elder 				dout("  failed to add dev, error %d\n", err);
34479fcbb800SAlex Elder 
34489fcbb800SAlex Elder 				return err;
34499fcbb800SAlex Elder 			}
345035938150SAlex Elder 
345135938150SAlex Elder 			/* New goes before existing, or at end of list */
345235938150SAlex Elder 
34539fcbb800SAlex Elder 			dout("  added dev%s\n", snap ? "" : " at end\n");
345435938150SAlex Elder 			if (snap)
345535938150SAlex Elder 				list_add_tail(&new_snap->node, &snap->node);
345635938150SAlex Elder 			else
3457523f3258SAlex Elder 				list_add_tail(&new_snap->node, head);
345835938150SAlex Elder 		} else {
345935938150SAlex Elder 			/* Already have this one */
346035938150SAlex Elder 
34619fcbb800SAlex Elder 			dout("  already present\n");
34629fcbb800SAlex Elder 
3463cd892126SAlex Elder 			rbd_assert(snap->size == snap_size);
3464aafb230eSAlex Elder 			rbd_assert(!strcmp(snap->name, snap_name));
3465cd892126SAlex Elder 			rbd_assert(snap->features == snap_features);
346635938150SAlex Elder 
346735938150SAlex Elder 			/* Done with this list entry; advance */
346835938150SAlex Elder 
346935938150SAlex Elder 			links = links->next;
3470dfc5606dSYehuda Sadeh 		}
347135938150SAlex Elder 
347235938150SAlex Elder 		/* Advance to the next entry in the snapshot context */
347335938150SAlex Elder 
347435938150SAlex Elder 		index++;
3475dfc5606dSYehuda Sadeh 	}
34769fcbb800SAlex Elder 	dout("%s: done\n", __func__);
3477dfc5606dSYehuda Sadeh 
3478dfc5606dSYehuda Sadeh 	return 0;
3479dfc5606dSYehuda Sadeh }
3480dfc5606dSYehuda Sadeh 
3481304f6808SAlex Elder /*
3482304f6808SAlex Elder  * Scan the list of snapshots and register the devices for any that
3483304f6808SAlex Elder  * have not already been registered.
3484304f6808SAlex Elder  */
3485304f6808SAlex Elder static int rbd_dev_snaps_register(struct rbd_device *rbd_dev)
3486304f6808SAlex Elder {
3487304f6808SAlex Elder 	struct rbd_snap *snap;
3488304f6808SAlex Elder 	int ret = 0;
3489304f6808SAlex Elder 
349037206ee5SAlex Elder 	dout("%s:\n", __func__);
349186ff77bbSAlex Elder 	if (WARN_ON(!device_is_registered(&rbd_dev->dev)))
349286ff77bbSAlex Elder 		return -EIO;
3493304f6808SAlex Elder 
3494304f6808SAlex Elder 	list_for_each_entry(snap, &rbd_dev->snaps, node) {
3495304f6808SAlex Elder 		if (!rbd_snap_registered(snap)) {
3496304f6808SAlex Elder 			ret = rbd_register_snap_dev(snap, &rbd_dev->dev);
3497304f6808SAlex Elder 			if (ret < 0)
3498304f6808SAlex Elder 				break;
3499304f6808SAlex Elder 		}
3500304f6808SAlex Elder 	}
3501304f6808SAlex Elder 	dout("%s: returning %d\n", __func__, ret);
3502304f6808SAlex Elder 
3503304f6808SAlex Elder 	return ret;
3504304f6808SAlex Elder }
3505304f6808SAlex Elder 
3506dfc5606dSYehuda Sadeh static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
3507dfc5606dSYehuda Sadeh {
3508dfc5606dSYehuda Sadeh 	struct device *dev;
3509cd789ab9SAlex Elder 	int ret;
3510dfc5606dSYehuda Sadeh 
3511dfc5606dSYehuda Sadeh 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
3512dfc5606dSYehuda Sadeh 
3513cd789ab9SAlex Elder 	dev = &rbd_dev->dev;
3514dfc5606dSYehuda Sadeh 	dev->bus = &rbd_bus_type;
3515dfc5606dSYehuda Sadeh 	dev->type = &rbd_device_type;
3516dfc5606dSYehuda Sadeh 	dev->parent = &rbd_root_dev;
3517dfc5606dSYehuda Sadeh 	dev->release = rbd_dev_release;
3518de71a297SAlex Elder 	dev_set_name(dev, "%d", rbd_dev->dev_id);
3519dfc5606dSYehuda Sadeh 	ret = device_register(dev);
3520dfc5606dSYehuda Sadeh 
3521dfc5606dSYehuda Sadeh 	mutex_unlock(&ctl_mutex);
3522cd789ab9SAlex Elder 
3523dfc5606dSYehuda Sadeh 	return ret;
3524602adf40SYehuda Sadeh }
3525602adf40SYehuda Sadeh 
3526dfc5606dSYehuda Sadeh static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
3527dfc5606dSYehuda Sadeh {
3528dfc5606dSYehuda Sadeh 	device_unregister(&rbd_dev->dev);
3529dfc5606dSYehuda Sadeh }
3530dfc5606dSYehuda Sadeh 
3531e2839308SAlex Elder static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0);
35321ddbe94eSAlex Elder 
35331ddbe94eSAlex Elder /*
3534499afd5bSAlex Elder  * Get a unique rbd identifier for the given new rbd_dev, and add
3535499afd5bSAlex Elder  * the rbd_dev to the global list.  The minimum rbd id is 1.
35361ddbe94eSAlex Elder  */
3537e2839308SAlex Elder static void rbd_dev_id_get(struct rbd_device *rbd_dev)
3538b7f23c36SAlex Elder {
3539e2839308SAlex Elder 	rbd_dev->dev_id = atomic64_inc_return(&rbd_dev_id_max);
3540499afd5bSAlex Elder 
3541499afd5bSAlex Elder 	spin_lock(&rbd_dev_list_lock);
3542499afd5bSAlex Elder 	list_add_tail(&rbd_dev->node, &rbd_dev_list);
3543499afd5bSAlex Elder 	spin_unlock(&rbd_dev_list_lock);
3544e2839308SAlex Elder 	dout("rbd_dev %p given dev id %llu\n", rbd_dev,
3545e2839308SAlex Elder 		(unsigned long long) rbd_dev->dev_id);
3546b7f23c36SAlex Elder }
3547b7f23c36SAlex Elder 
35481ddbe94eSAlex Elder /*
3549499afd5bSAlex Elder  * Remove an rbd_dev from the global list, and record that its
3550499afd5bSAlex Elder  * identifier is no longer in use.
35511ddbe94eSAlex Elder  */
3552e2839308SAlex Elder static void rbd_dev_id_put(struct rbd_device *rbd_dev)
35531ddbe94eSAlex Elder {
3554d184f6bfSAlex Elder 	struct list_head *tmp;
3555de71a297SAlex Elder 	int rbd_id = rbd_dev->dev_id;
3556d184f6bfSAlex Elder 	int max_id;
3557d184f6bfSAlex Elder 
3558aafb230eSAlex Elder 	rbd_assert(rbd_id > 0);
3559499afd5bSAlex Elder 
3560e2839308SAlex Elder 	dout("rbd_dev %p released dev id %llu\n", rbd_dev,
3561e2839308SAlex Elder 		(unsigned long long) rbd_dev->dev_id);
3562499afd5bSAlex Elder 	spin_lock(&rbd_dev_list_lock);
3563499afd5bSAlex Elder 	list_del_init(&rbd_dev->node);
3564d184f6bfSAlex Elder 
3565d184f6bfSAlex Elder 	/*
3566d184f6bfSAlex Elder 	 * If the id being "put" is not the current maximum, there
3567d184f6bfSAlex Elder 	 * is nothing special we need to do.
3568d184f6bfSAlex Elder 	 */
3569e2839308SAlex Elder 	if (rbd_id != atomic64_read(&rbd_dev_id_max)) {
3570d184f6bfSAlex Elder 		spin_unlock(&rbd_dev_list_lock);
3571d184f6bfSAlex Elder 		return;
3572d184f6bfSAlex Elder 	}
3573d184f6bfSAlex Elder 
3574d184f6bfSAlex Elder 	/*
3575d184f6bfSAlex Elder 	 * We need to update the current maximum id.  Search the
3576d184f6bfSAlex Elder 	 * list to find out what it is.  We're more likely to find
3577d184f6bfSAlex Elder 	 * the maximum at the end, so search the list backward.
3578d184f6bfSAlex Elder 	 */
3579d184f6bfSAlex Elder 	max_id = 0;
3580d184f6bfSAlex Elder 	list_for_each_prev(tmp, &rbd_dev_list) {
3581d184f6bfSAlex Elder 		struct rbd_device *rbd_dev;
3582d184f6bfSAlex Elder 
3583d184f6bfSAlex Elder 		rbd_dev = list_entry(tmp, struct rbd_device, node);
3584b213e0b1SAlex Elder 		if (rbd_dev->dev_id > max_id)
3585b213e0b1SAlex Elder 			max_id = rbd_dev->dev_id;
3586d184f6bfSAlex Elder 	}
3587499afd5bSAlex Elder 	spin_unlock(&rbd_dev_list_lock);
35881ddbe94eSAlex Elder 
35891ddbe94eSAlex Elder 	/*
3590e2839308SAlex Elder 	 * The max id could have been updated by rbd_dev_id_get(), in
3591d184f6bfSAlex Elder 	 * which case it now accurately reflects the new maximum.
3592d184f6bfSAlex Elder 	 * Be careful not to overwrite the maximum value in that
3593d184f6bfSAlex Elder 	 * case.
35941ddbe94eSAlex Elder 	 */
3595e2839308SAlex Elder 	atomic64_cmpxchg(&rbd_dev_id_max, rbd_id, max_id);
3596e2839308SAlex Elder 	dout("  max dev id has been reset\n");
3597b7f23c36SAlex Elder }
3598b7f23c36SAlex Elder 
/*
 * Advance *buf past any leading white space so that it points at
 * the first non-space character (or at the terminating '\0' if
 * there is none), and return the length of the run of non-space
 * characters that starts there.  *buf must be '\0'-terminated.
 */
static inline size_t next_token(const char **buf)
{
	/*
	 * The characters for which isspace() is nonzero in the "C"
	 * and "POSIX" locales.
	 */
	static const char whitespace[] = " \f\n\r\t\v";
	const char *start = *buf;

	start += strspn(start, whitespace);	/* skip to the token */
	*buf = start;

	return strcspn(start, whitespace);	/* measure the token */
}
3617e28fff26SAlex Elder 
/*
 * Locate the next token in *buf and, when it fits, copy it into
 * the caller's token buffer with a terminating '\0'.  *buf must be
 * '\0'-terminated on entry.
 *
 * Returns the token's length (not counting the '\0').  A return of
 * 0 means no token was found; a return >= token_size means the
 * token did not fit and the token buffer was not written.
 *
 * In every case *buf is left pointing just past the token found,
 * including when the token was too large to copy.
 */
static inline size_t copy_token(const char **buf,
				char *token,
				size_t token_size)
{
	const size_t len = next_token(buf);
	const char *start = *buf;

	if (len < token_size) {
		memcpy(token, start, len);
		token[len] = '\0';
	}
	*buf = start + len;

	return len;
}
3647e28fff26SAlex Elder 
3648e28fff26SAlex Elder /*
3649ea3352f4SAlex Elder  * Finds the next token in *buf, dynamically allocates a buffer big
3650ea3352f4SAlex Elder  * enough to hold a copy of it, and copies the token into the new
3651ea3352f4SAlex Elder  * buffer.  The copy is guaranteed to be terminated with '\0'.  Note
3652ea3352f4SAlex Elder  * that a duplicate buffer is created even for a zero-length token.
3653ea3352f4SAlex Elder  *
3654ea3352f4SAlex Elder  * Returns a pointer to the newly-allocated duplicate, or a null
3655ea3352f4SAlex Elder  * pointer if memory for the duplicate was not available.  If
3656ea3352f4SAlex Elder  * the lenp argument is a non-null pointer, the length of the token
3657ea3352f4SAlex Elder  * (not including the '\0') is returned in *lenp.
3658ea3352f4SAlex Elder  *
3659ea3352f4SAlex Elder  * If successful, the *buf pointer will be updated to point beyond
3660ea3352f4SAlex Elder  * the end of the found token.
3661ea3352f4SAlex Elder  *
3662ea3352f4SAlex Elder  * Note: uses GFP_KERNEL for allocation.
3663ea3352f4SAlex Elder  */
3664ea3352f4SAlex Elder static inline char *dup_token(const char **buf, size_t *lenp)
3665ea3352f4SAlex Elder {
3666ea3352f4SAlex Elder 	char *dup;
3667ea3352f4SAlex Elder 	size_t len;
3668ea3352f4SAlex Elder 
3669ea3352f4SAlex Elder 	len = next_token(buf);
36704caf35f9SAlex Elder 	dup = kmemdup(*buf, len + 1, GFP_KERNEL);
3671ea3352f4SAlex Elder 	if (!dup)
3672ea3352f4SAlex Elder 		return NULL;
3673ea3352f4SAlex Elder 	*(dup + len) = '\0';
3674ea3352f4SAlex Elder 	*buf += len;
3675ea3352f4SAlex Elder 
3676ea3352f4SAlex Elder 	if (lenp)
3677ea3352f4SAlex Elder 		*lenp = len;
3678ea3352f4SAlex Elder 
3679ea3352f4SAlex Elder 	return dup;
3680ea3352f4SAlex Elder }
3681ea3352f4SAlex Elder 
3682ea3352f4SAlex Elder /*
3683859c31dfSAlex Elder  * Parse the options provided for an "rbd add" (i.e., rbd image
3684859c31dfSAlex Elder  * mapping) request.  These arrive via a write to /sys/bus/rbd/add,
3685859c31dfSAlex Elder  * and the data written is passed here via a NUL-terminated buffer.
3686859c31dfSAlex Elder  * Returns 0 if successful or an error code otherwise.
3687d22f76e7SAlex Elder  *
3688859c31dfSAlex Elder  * The information extracted from these options is recorded in
3689859c31dfSAlex Elder  * the other parameters which return dynamically-allocated
3690859c31dfSAlex Elder  * structures:
3691859c31dfSAlex Elder  *  ceph_opts
3692859c31dfSAlex Elder  *      The address of a pointer that will refer to a ceph options
3693859c31dfSAlex Elder  *      structure.  Caller must release the returned pointer using
3694859c31dfSAlex Elder  *      ceph_destroy_options() when it is no longer needed.
3695859c31dfSAlex Elder  *  rbd_opts
3696859c31dfSAlex Elder  *	Address of an rbd options pointer.  Fully initialized by
3697859c31dfSAlex Elder  *	this function; caller must release with kfree().
3698859c31dfSAlex Elder  *  spec
3699859c31dfSAlex Elder  *	Address of an rbd image specification pointer.  Fully
3700859c31dfSAlex Elder  *	initialized by this function based on parsed options.
3701859c31dfSAlex Elder  *	Caller must release with rbd_spec_put().
3702859c31dfSAlex Elder  *
3703859c31dfSAlex Elder  * The options passed take this form:
3704859c31dfSAlex Elder  *  <mon_addrs> <options> <pool_name> <image_name> [<snap_id>]
3705859c31dfSAlex Elder  * where:
3706859c31dfSAlex Elder  *  <mon_addrs>
3707859c31dfSAlex Elder  *      A comma-separated list of one or more monitor addresses.
3708859c31dfSAlex Elder  *      A monitor address is an ip address, optionally followed
3709859c31dfSAlex Elder  *      by a port number (separated by a colon).
3710859c31dfSAlex Elder  *        I.e.:  ip1[:port1][,ip2[:port2]...]
3711859c31dfSAlex Elder  *  <options>
3712859c31dfSAlex Elder  *      A comma-separated list of ceph and/or rbd options.
3713859c31dfSAlex Elder  *  <pool_name>
3714859c31dfSAlex Elder  *      The name of the rados pool containing the rbd image.
3715859c31dfSAlex Elder  *  <image_name>
3716859c31dfSAlex Elder  *      The name of the image in that pool to map.
3717859c31dfSAlex Elder  *  <snap_id>
3718859c31dfSAlex Elder  *      An optional snapshot id.  If provided, the mapping will
3719859c31dfSAlex Elder  *      present data from the image at the time that snapshot was
3720859c31dfSAlex Elder  *      created.  The image head is used if no snapshot id is
3721859c31dfSAlex Elder  *      provided.  Snapshot mappings are always read-only.
3722a725f65eSAlex Elder  */
3723859c31dfSAlex Elder static int rbd_add_parse_args(const char *buf,
3724dc79b113SAlex Elder 				struct ceph_options **ceph_opts,
3725859c31dfSAlex Elder 				struct rbd_options **opts,
3726859c31dfSAlex Elder 				struct rbd_spec **rbd_spec)
3727a725f65eSAlex Elder {
3728e28fff26SAlex Elder 	size_t len;
3729859c31dfSAlex Elder 	char *options;
37300ddebc0cSAlex Elder 	const char *mon_addrs;
37310ddebc0cSAlex Elder 	size_t mon_addrs_size;
3732859c31dfSAlex Elder 	struct rbd_spec *spec = NULL;
37334e9afebaSAlex Elder 	struct rbd_options *rbd_opts = NULL;
3734859c31dfSAlex Elder 	struct ceph_options *copts;
3735dc79b113SAlex Elder 	int ret;
3736e28fff26SAlex Elder 
3737e28fff26SAlex Elder 	/* The first four tokens are required */
3738e28fff26SAlex Elder 
37397ef3214aSAlex Elder 	len = next_token(&buf);
37404fb5d671SAlex Elder 	if (!len) {
37414fb5d671SAlex Elder 		rbd_warn(NULL, "no monitor address(es) provided");
37424fb5d671SAlex Elder 		return -EINVAL;
37434fb5d671SAlex Elder 	}
37440ddebc0cSAlex Elder 	mon_addrs = buf;
3745f28e565aSAlex Elder 	mon_addrs_size = len + 1;
37467ef3214aSAlex Elder 	buf += len;
3747a725f65eSAlex Elder 
3748dc79b113SAlex Elder 	ret = -EINVAL;
3749f28e565aSAlex Elder 	options = dup_token(&buf, NULL);
3750f28e565aSAlex Elder 	if (!options)
3751dc79b113SAlex Elder 		return -ENOMEM;
37524fb5d671SAlex Elder 	if (!*options) {
37534fb5d671SAlex Elder 		rbd_warn(NULL, "no options provided");
37544fb5d671SAlex Elder 		goto out_err;
37554fb5d671SAlex Elder 	}
3756a725f65eSAlex Elder 
3757859c31dfSAlex Elder 	spec = rbd_spec_alloc();
3758859c31dfSAlex Elder 	if (!spec)
3759f28e565aSAlex Elder 		goto out_mem;
3760859c31dfSAlex Elder 
3761859c31dfSAlex Elder 	spec->pool_name = dup_token(&buf, NULL);
3762859c31dfSAlex Elder 	if (!spec->pool_name)
3763859c31dfSAlex Elder 		goto out_mem;
37644fb5d671SAlex Elder 	if (!*spec->pool_name) {
37654fb5d671SAlex Elder 		rbd_warn(NULL, "no pool name provided");
37664fb5d671SAlex Elder 		goto out_err;
37674fb5d671SAlex Elder 	}
3768e28fff26SAlex Elder 
376969e7a02fSAlex Elder 	spec->image_name = dup_token(&buf, NULL);
3770859c31dfSAlex Elder 	if (!spec->image_name)
3771f28e565aSAlex Elder 		goto out_mem;
37724fb5d671SAlex Elder 	if (!*spec->image_name) {
37734fb5d671SAlex Elder 		rbd_warn(NULL, "no image name provided");
37744fb5d671SAlex Elder 		goto out_err;
37754fb5d671SAlex Elder 	}
3776e28fff26SAlex Elder 
3777f28e565aSAlex Elder 	/*
3778f28e565aSAlex Elder 	 * Snapshot name is optional; default is to use "-"
3779f28e565aSAlex Elder 	 * (indicating the head/no snapshot).
3780f28e565aSAlex Elder 	 */
37813feeb894SAlex Elder 	len = next_token(&buf);
3782820a5f3eSAlex Elder 	if (!len) {
37833feeb894SAlex Elder 		buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
37843feeb894SAlex Elder 		len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
3785f28e565aSAlex Elder 	} else if (len > RBD_MAX_SNAP_NAME_LEN) {
3786dc79b113SAlex Elder 		ret = -ENAMETOOLONG;
3787f28e565aSAlex Elder 		goto out_err;
3788849b4260SAlex Elder 	}
37894caf35f9SAlex Elder 	spec->snap_name = kmemdup(buf, len + 1, GFP_KERNEL);
3790859c31dfSAlex Elder 	if (!spec->snap_name)
3791f28e565aSAlex Elder 		goto out_mem;
3792859c31dfSAlex Elder 	*(spec->snap_name + len) = '\0';
3793e5c35534SAlex Elder 
37940ddebc0cSAlex Elder 	/* Initialize all rbd options to the defaults */
3795e28fff26SAlex Elder 
37964e9afebaSAlex Elder 	rbd_opts = kzalloc(sizeof (*rbd_opts), GFP_KERNEL);
37974e9afebaSAlex Elder 	if (!rbd_opts)
37984e9afebaSAlex Elder 		goto out_mem;
37994e9afebaSAlex Elder 
38004e9afebaSAlex Elder 	rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;
3801d22f76e7SAlex Elder 
3802859c31dfSAlex Elder 	copts = ceph_parse_options(options, mon_addrs,
38030ddebc0cSAlex Elder 					mon_addrs + mon_addrs_size - 1,
38044e9afebaSAlex Elder 					parse_rbd_opts_token, rbd_opts);
3805859c31dfSAlex Elder 	if (IS_ERR(copts)) {
3806859c31dfSAlex Elder 		ret = PTR_ERR(copts);
3807dc79b113SAlex Elder 		goto out_err;
3808dc79b113SAlex Elder 	}
3809859c31dfSAlex Elder 	kfree(options);
3810859c31dfSAlex Elder 
3811859c31dfSAlex Elder 	*ceph_opts = copts;
38124e9afebaSAlex Elder 	*opts = rbd_opts;
3813859c31dfSAlex Elder 	*rbd_spec = spec;
38140ddebc0cSAlex Elder 
3815dc79b113SAlex Elder 	return 0;
3816f28e565aSAlex Elder out_mem:
3817dc79b113SAlex Elder 	ret = -ENOMEM;
3818d22f76e7SAlex Elder out_err:
3819859c31dfSAlex Elder 	kfree(rbd_opts);
3820859c31dfSAlex Elder 	rbd_spec_put(spec);
3821f28e565aSAlex Elder 	kfree(options);
3822d22f76e7SAlex Elder 
3823dc79b113SAlex Elder 	return ret;
3824a725f65eSAlex Elder }
3825a725f65eSAlex Elder 
3826589d30e0SAlex Elder /*
3827589d30e0SAlex Elder  * An rbd format 2 image has a unique identifier, distinct from the
3828589d30e0SAlex Elder  * name given to it by the user.  Internally, that identifier is
3829589d30e0SAlex Elder  * what's used to specify the names of objects related to the image.
3830589d30e0SAlex Elder  *
3831589d30e0SAlex Elder  * A special "rbd id" object is used to map an rbd image name to its
3832589d30e0SAlex Elder  * id.  If that object doesn't exist, then there is no v2 rbd image
3833589d30e0SAlex Elder  * with the supplied name.
3834589d30e0SAlex Elder  *
3835589d30e0SAlex Elder  * This function will record the given rbd_dev's image_id field if
3836589d30e0SAlex Elder  * it can be determined, and in that case will return 0.  If any
3837589d30e0SAlex Elder  * errors occur a negative errno will be returned and the rbd_dev's
3838589d30e0SAlex Elder  * image_id field will be unchanged (and should be NULL).
3839589d30e0SAlex Elder  */
3840589d30e0SAlex Elder static int rbd_dev_image_id(struct rbd_device *rbd_dev)
3841589d30e0SAlex Elder {
3842589d30e0SAlex Elder 	int ret;
3843589d30e0SAlex Elder 	size_t size;
3844589d30e0SAlex Elder 	char *object_name;
3845589d30e0SAlex Elder 	void *response;
3846589d30e0SAlex Elder 	void *p;
3847589d30e0SAlex Elder 
3848*2f82ee54SAlex Elder 	/* If we already have it we don't need to look it up */
3849*2f82ee54SAlex Elder 
3850*2f82ee54SAlex Elder 	if (rbd_dev->spec->image_id)
3851*2f82ee54SAlex Elder 		return 0;
3852*2f82ee54SAlex Elder 
3853589d30e0SAlex Elder 	/*
38542c0d0a10SAlex Elder 	 * When probing a parent image, the image id is already
38552c0d0a10SAlex Elder 	 * known (and the image name likely is not).  There's no
38562c0d0a10SAlex Elder 	 * need to fetch the image id again in this case.
38572c0d0a10SAlex Elder 	 */
38582c0d0a10SAlex Elder 	if (rbd_dev->spec->image_id)
38592c0d0a10SAlex Elder 		return 0;
38602c0d0a10SAlex Elder 
38612c0d0a10SAlex Elder 	/*
3862589d30e0SAlex Elder 	 * First, see if the format 2 image id file exists, and if
3863589d30e0SAlex Elder 	 * so, get the image's persistent id from it.
3864589d30e0SAlex Elder 	 */
386569e7a02fSAlex Elder 	size = sizeof (RBD_ID_PREFIX) + strlen(rbd_dev->spec->image_name);
3866589d30e0SAlex Elder 	object_name = kmalloc(size, GFP_NOIO);
3867589d30e0SAlex Elder 	if (!object_name)
3868589d30e0SAlex Elder 		return -ENOMEM;
38690d7dbfceSAlex Elder 	sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->spec->image_name);
3870589d30e0SAlex Elder 	dout("rbd id object name is %s\n", object_name);
3871589d30e0SAlex Elder 
3872589d30e0SAlex Elder 	/* Response will be an encoded string, which includes a length */
3873589d30e0SAlex Elder 
3874589d30e0SAlex Elder 	size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
3875589d30e0SAlex Elder 	response = kzalloc(size, GFP_NOIO);
3876589d30e0SAlex Elder 	if (!response) {
3877589d30e0SAlex Elder 		ret = -ENOMEM;
3878589d30e0SAlex Elder 		goto out;
3879589d30e0SAlex Elder 	}
3880589d30e0SAlex Elder 
388136be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, object_name,
3882589d30e0SAlex Elder 				"rbd", "get_id",
3883589d30e0SAlex Elder 				NULL, 0,
388407b2391fSAlex Elder 				response, RBD_IMAGE_ID_LEN_MAX, NULL);
388536be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
3886589d30e0SAlex Elder 	if (ret < 0)
3887589d30e0SAlex Elder 		goto out;
3888589d30e0SAlex Elder 
3889589d30e0SAlex Elder 	p = response;
38900d7dbfceSAlex Elder 	rbd_dev->spec->image_id = ceph_extract_encoded_string(&p,
3891589d30e0SAlex Elder 						p + RBD_IMAGE_ID_LEN_MAX,
3892979ed480SAlex Elder 						NULL, GFP_NOIO);
38930d7dbfceSAlex Elder 	if (IS_ERR(rbd_dev->spec->image_id)) {
38940d7dbfceSAlex Elder 		ret = PTR_ERR(rbd_dev->spec->image_id);
38950d7dbfceSAlex Elder 		rbd_dev->spec->image_id = NULL;
3896589d30e0SAlex Elder 	} else {
38970d7dbfceSAlex Elder 		dout("image_id is %s\n", rbd_dev->spec->image_id);
3898589d30e0SAlex Elder 	}
3899589d30e0SAlex Elder out:
3900589d30e0SAlex Elder 	kfree(response);
3901589d30e0SAlex Elder 	kfree(object_name);
3902589d30e0SAlex Elder 
3903589d30e0SAlex Elder 	return ret;
3904589d30e0SAlex Elder }
3905589d30e0SAlex Elder 
3906a30b71b9SAlex Elder static int rbd_dev_v1_probe(struct rbd_device *rbd_dev)
3907a30b71b9SAlex Elder {
3908a30b71b9SAlex Elder 	int ret;
3909a30b71b9SAlex Elder 	size_t size;
3910a30b71b9SAlex Elder 
3911a30b71b9SAlex Elder 	/* Version 1 images have no id; empty string is used */
3912a30b71b9SAlex Elder 
39130d7dbfceSAlex Elder 	rbd_dev->spec->image_id = kstrdup("", GFP_KERNEL);
39140d7dbfceSAlex Elder 	if (!rbd_dev->spec->image_id)
3915a30b71b9SAlex Elder 		return -ENOMEM;
3916a30b71b9SAlex Elder 
3917a30b71b9SAlex Elder 	/* Record the header object name for this rbd image. */
3918a30b71b9SAlex Elder 
391969e7a02fSAlex Elder 	size = strlen(rbd_dev->spec->image_name) + sizeof (RBD_SUFFIX);
3920a30b71b9SAlex Elder 	rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
3921a30b71b9SAlex Elder 	if (!rbd_dev->header_name) {
3922a30b71b9SAlex Elder 		ret = -ENOMEM;
3923a30b71b9SAlex Elder 		goto out_err;
3924a30b71b9SAlex Elder 	}
39250d7dbfceSAlex Elder 	sprintf(rbd_dev->header_name, "%s%s",
39260d7dbfceSAlex Elder 		rbd_dev->spec->image_name, RBD_SUFFIX);
3927a30b71b9SAlex Elder 
3928a30b71b9SAlex Elder 	/* Populate rbd image metadata */
3929a30b71b9SAlex Elder 
3930a30b71b9SAlex Elder 	ret = rbd_read_header(rbd_dev, &rbd_dev->header);
3931a30b71b9SAlex Elder 	if (ret < 0)
3932a30b71b9SAlex Elder 		goto out_err;
393386b00e0dSAlex Elder 
393486b00e0dSAlex Elder 	/* Version 1 images have no parent (no layering) */
393586b00e0dSAlex Elder 
393686b00e0dSAlex Elder 	rbd_dev->parent_spec = NULL;
393786b00e0dSAlex Elder 	rbd_dev->parent_overlap = 0;
393886b00e0dSAlex Elder 
3939a30b71b9SAlex Elder 	rbd_dev->image_format = 1;
3940a30b71b9SAlex Elder 
3941a30b71b9SAlex Elder 	dout("discovered version 1 image, header name is %s\n",
3942a30b71b9SAlex Elder 		rbd_dev->header_name);
3943a30b71b9SAlex Elder 
3944a30b71b9SAlex Elder 	return 0;
3945a30b71b9SAlex Elder 
3946a30b71b9SAlex Elder out_err:
3947a30b71b9SAlex Elder 	kfree(rbd_dev->header_name);
3948a30b71b9SAlex Elder 	rbd_dev->header_name = NULL;
39490d7dbfceSAlex Elder 	kfree(rbd_dev->spec->image_id);
39500d7dbfceSAlex Elder 	rbd_dev->spec->image_id = NULL;
3951a30b71b9SAlex Elder 
3952a30b71b9SAlex Elder 	return ret;
3953a30b71b9SAlex Elder }
3954a30b71b9SAlex Elder 
3955a30b71b9SAlex Elder static int rbd_dev_v2_probe(struct rbd_device *rbd_dev)
3956a30b71b9SAlex Elder {
3957a30b71b9SAlex Elder 	size_t size;
39589d475de5SAlex Elder 	int ret;
39596e14b1a6SAlex Elder 	u64 ver = 0;
3960a30b71b9SAlex Elder 
3961a30b71b9SAlex Elder 	/*
3962a30b71b9SAlex Elder 	 * Image id was filled in by the caller.  Record the header
3963a30b71b9SAlex Elder 	 * object name for this rbd image.
3964a30b71b9SAlex Elder 	 */
3965979ed480SAlex Elder 	size = sizeof (RBD_HEADER_PREFIX) + strlen(rbd_dev->spec->image_id);
3966a30b71b9SAlex Elder 	rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
3967a30b71b9SAlex Elder 	if (!rbd_dev->header_name)
3968a30b71b9SAlex Elder 		return -ENOMEM;
3969a30b71b9SAlex Elder 	sprintf(rbd_dev->header_name, "%s%s",
39700d7dbfceSAlex Elder 			RBD_HEADER_PREFIX, rbd_dev->spec->image_id);
39719d475de5SAlex Elder 
39729d475de5SAlex Elder 	/* Get the size and object order for the image */
39739d475de5SAlex Elder 
39749d475de5SAlex Elder 	ret = rbd_dev_v2_image_size(rbd_dev);
39759d475de5SAlex Elder 	if (ret < 0)
39769d475de5SAlex Elder 		goto out_err;
39771e130199SAlex Elder 
39781e130199SAlex Elder 	/* Get the object prefix (a.k.a. block_name) for the image */
39791e130199SAlex Elder 
39801e130199SAlex Elder 	ret = rbd_dev_v2_object_prefix(rbd_dev);
39811e130199SAlex Elder 	if (ret < 0)
39821e130199SAlex Elder 		goto out_err;
3983b1b5402aSAlex Elder 
3984d889140cSAlex Elder 	/* Get the and check features for the image */
3985b1b5402aSAlex Elder 
3986b1b5402aSAlex Elder 	ret = rbd_dev_v2_features(rbd_dev);
3987b1b5402aSAlex Elder 	if (ret < 0)
3988b1b5402aSAlex Elder 		goto out_err;
398935d489f9SAlex Elder 
399086b00e0dSAlex Elder 	/* If the image supports layering, get the parent info */
399186b00e0dSAlex Elder 
399286b00e0dSAlex Elder 	if (rbd_dev->header.features & RBD_FEATURE_LAYERING) {
399386b00e0dSAlex Elder 		ret = rbd_dev_v2_parent_info(rbd_dev);
399486b00e0dSAlex Elder 		if (ret < 0)
399586b00e0dSAlex Elder 			goto out_err;
399686b00e0dSAlex Elder 	}
399786b00e0dSAlex Elder 
39986e14b1a6SAlex Elder 	/* crypto and compression type aren't (yet) supported for v2 images */
399935d489f9SAlex Elder 
40006e14b1a6SAlex Elder 	rbd_dev->header.crypt_type = 0;
40016e14b1a6SAlex Elder 	rbd_dev->header.comp_type = 0;
40026e14b1a6SAlex Elder 
40036e14b1a6SAlex Elder 	/* Get the snapshot context, plus the header version */
40046e14b1a6SAlex Elder 
40056e14b1a6SAlex Elder 	ret = rbd_dev_v2_snap_context(rbd_dev, &ver);
400635d489f9SAlex Elder 	if (ret)
400735d489f9SAlex Elder 		goto out_err;
40086e14b1a6SAlex Elder 	rbd_dev->header.obj_version = ver;
40096e14b1a6SAlex Elder 
4010a30b71b9SAlex Elder 	rbd_dev->image_format = 2;
4011a30b71b9SAlex Elder 
4012a30b71b9SAlex Elder 	dout("discovered version 2 image, header name is %s\n",
4013a30b71b9SAlex Elder 		rbd_dev->header_name);
4014a30b71b9SAlex Elder 
401535152979SAlex Elder 	return 0;
40169d475de5SAlex Elder out_err:
401786b00e0dSAlex Elder 	rbd_dev->parent_overlap = 0;
401886b00e0dSAlex Elder 	rbd_spec_put(rbd_dev->parent_spec);
401986b00e0dSAlex Elder 	rbd_dev->parent_spec = NULL;
40209d475de5SAlex Elder 	kfree(rbd_dev->header_name);
40219d475de5SAlex Elder 	rbd_dev->header_name = NULL;
40221e130199SAlex Elder 	kfree(rbd_dev->header.object_prefix);
40231e130199SAlex Elder 	rbd_dev->header.object_prefix = NULL;
40249d475de5SAlex Elder 
40259d475de5SAlex Elder 	return ret;
4026a30b71b9SAlex Elder }
4027a30b71b9SAlex Elder 
402883a06263SAlex Elder static int rbd_dev_probe_finish(struct rbd_device *rbd_dev)
402983a06263SAlex Elder {
4030*2f82ee54SAlex Elder 	struct rbd_device *parent = NULL;
4031*2f82ee54SAlex Elder 	struct rbd_spec *parent_spec = NULL;
4032*2f82ee54SAlex Elder 	struct rbd_client *rbdc = NULL;
403383a06263SAlex Elder 	int ret;
403483a06263SAlex Elder 
403583a06263SAlex Elder 	/* no need to lock here, as rbd_dev is not registered yet */
403683a06263SAlex Elder 	ret = rbd_dev_snaps_update(rbd_dev);
403783a06263SAlex Elder 	if (ret)
403883a06263SAlex Elder 		return ret;
403983a06263SAlex Elder 
40409e15b77dSAlex Elder 	ret = rbd_dev_probe_update_spec(rbd_dev);
40419e15b77dSAlex Elder 	if (ret)
40429e15b77dSAlex Elder 		goto err_out_snaps;
40439e15b77dSAlex Elder 
404483a06263SAlex Elder 	ret = rbd_dev_set_mapping(rbd_dev);
404583a06263SAlex Elder 	if (ret)
404683a06263SAlex Elder 		goto err_out_snaps;
404783a06263SAlex Elder 
404883a06263SAlex Elder 	/* generate unique id: find highest unique id, add one */
404983a06263SAlex Elder 	rbd_dev_id_get(rbd_dev);
405083a06263SAlex Elder 
405183a06263SAlex Elder 	/* Fill in the device name, now that we have its id. */
405283a06263SAlex Elder 	BUILD_BUG_ON(DEV_NAME_LEN
405383a06263SAlex Elder 			< sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
405483a06263SAlex Elder 	sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);
405583a06263SAlex Elder 
405683a06263SAlex Elder 	/* Get our block major device number. */
405783a06263SAlex Elder 
405883a06263SAlex Elder 	ret = register_blkdev(0, rbd_dev->name);
405983a06263SAlex Elder 	if (ret < 0)
406083a06263SAlex Elder 		goto err_out_id;
406183a06263SAlex Elder 	rbd_dev->major = ret;
406283a06263SAlex Elder 
406383a06263SAlex Elder 	/* Set up the blkdev mapping. */
406483a06263SAlex Elder 
406583a06263SAlex Elder 	ret = rbd_init_disk(rbd_dev);
406683a06263SAlex Elder 	if (ret)
406783a06263SAlex Elder 		goto err_out_blkdev;
406883a06263SAlex Elder 
406983a06263SAlex Elder 	ret = rbd_bus_add_dev(rbd_dev);
407083a06263SAlex Elder 	if (ret)
407183a06263SAlex Elder 		goto err_out_disk;
407283a06263SAlex Elder 
407383a06263SAlex Elder 	/*
407483a06263SAlex Elder 	 * At this point cleanup in the event of an error is the job
407583a06263SAlex Elder 	 * of the sysfs code (initiated by rbd_bus_del_dev()).
407683a06263SAlex Elder 	 */
4077*2f82ee54SAlex Elder 	/* Probe the parent if there is one */
4078*2f82ee54SAlex Elder 
4079*2f82ee54SAlex Elder 	if (rbd_dev->parent_spec) {
4080*2f82ee54SAlex Elder 		/*
4081*2f82ee54SAlex Elder 		 * We need to pass a reference to the client and the
4082*2f82ee54SAlex Elder 		 * parent spec when creating the parent rbd_dev.
4083*2f82ee54SAlex Elder 		 * Images related by parent/child relationships
4084*2f82ee54SAlex Elder 		 * always share both.
4085*2f82ee54SAlex Elder 		 */
4086*2f82ee54SAlex Elder 		parent_spec = rbd_spec_get(rbd_dev->parent_spec);
4087*2f82ee54SAlex Elder 		rbdc = __rbd_get_client(rbd_dev->rbd_client);
4088*2f82ee54SAlex Elder 
4089*2f82ee54SAlex Elder 		parent = rbd_dev_create(rbdc, parent_spec);
4090*2f82ee54SAlex Elder 		if (!parent) {
4091*2f82ee54SAlex Elder 			ret = -ENOMEM;
4092*2f82ee54SAlex Elder 			goto err_out_spec;
4093*2f82ee54SAlex Elder 		}
4094*2f82ee54SAlex Elder 		rbdc = NULL;		/* parent now owns reference */
4095*2f82ee54SAlex Elder 		parent_spec = NULL;	/* parent now owns reference */
4096*2f82ee54SAlex Elder 		ret = rbd_dev_probe(parent);
4097*2f82ee54SAlex Elder 		if (ret < 0)
4098*2f82ee54SAlex Elder 			goto err_out_parent;
4099*2f82ee54SAlex Elder 		rbd_dev->parent = parent;
4100*2f82ee54SAlex Elder 	}
4101*2f82ee54SAlex Elder 
410283a06263SAlex Elder 	down_write(&rbd_dev->header_rwsem);
410383a06263SAlex Elder 	ret = rbd_dev_snaps_register(rbd_dev);
410483a06263SAlex Elder 	up_write(&rbd_dev->header_rwsem);
410583a06263SAlex Elder 	if (ret)
410683a06263SAlex Elder 		goto err_out_bus;
410783a06263SAlex Elder 
41089969ebc5SAlex Elder 	ret = rbd_dev_header_watch_sync(rbd_dev, 1);
410983a06263SAlex Elder 	if (ret)
411083a06263SAlex Elder 		goto err_out_bus;
411183a06263SAlex Elder 
411283a06263SAlex Elder 	/* Everything's ready.  Announce the disk to the world. */
411383a06263SAlex Elder 
411483a06263SAlex Elder 	add_disk(rbd_dev->disk);
411583a06263SAlex Elder 
411683a06263SAlex Elder 	pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name,
411783a06263SAlex Elder 		(unsigned long long) rbd_dev->mapping.size);
411883a06263SAlex Elder 
411983a06263SAlex Elder 	return ret;
4120*2f82ee54SAlex Elder 
4121*2f82ee54SAlex Elder err_out_parent:
4122*2f82ee54SAlex Elder 	rbd_dev_destroy(parent);
4123*2f82ee54SAlex Elder err_out_spec:
4124*2f82ee54SAlex Elder 	rbd_spec_put(parent_spec);
4125*2f82ee54SAlex Elder 	rbd_put_client(rbdc);
412683a06263SAlex Elder err_out_bus:
412783a06263SAlex Elder 	/* this will also clean up rest of rbd_dev stuff */
412883a06263SAlex Elder 
412983a06263SAlex Elder 	rbd_bus_del_dev(rbd_dev);
413083a06263SAlex Elder 
413183a06263SAlex Elder 	return ret;
413283a06263SAlex Elder err_out_disk:
413383a06263SAlex Elder 	rbd_free_disk(rbd_dev);
413483a06263SAlex Elder err_out_blkdev:
413583a06263SAlex Elder 	unregister_blkdev(rbd_dev->major, rbd_dev->name);
413683a06263SAlex Elder err_out_id:
413783a06263SAlex Elder 	rbd_dev_id_put(rbd_dev);
413883a06263SAlex Elder err_out_snaps:
413983a06263SAlex Elder 	rbd_remove_all_snaps(rbd_dev);
414083a06263SAlex Elder 
414183a06263SAlex Elder 	return ret;
414283a06263SAlex Elder }
414383a06263SAlex Elder 
4144a30b71b9SAlex Elder /*
4145a30b71b9SAlex Elder  * Probe for the existence of the header object for the given rbd
4146a30b71b9SAlex Elder  * device.  For format 2 images this includes determining the image
4147a30b71b9SAlex Elder  * id.
4148a30b71b9SAlex Elder  */
4149a30b71b9SAlex Elder static int rbd_dev_probe(struct rbd_device *rbd_dev)
4150a30b71b9SAlex Elder {
4151a30b71b9SAlex Elder 	int ret;
4152a30b71b9SAlex Elder 
4153a30b71b9SAlex Elder 	/*
4154a30b71b9SAlex Elder 	 * Get the id from the image id object.  If it's not a
4155a30b71b9SAlex Elder 	 * format 2 image, we'll get ENOENT back, and we'll assume
4156a30b71b9SAlex Elder 	 * it's a format 1 image.
4157a30b71b9SAlex Elder 	 */
4158a30b71b9SAlex Elder 	ret = rbd_dev_image_id(rbd_dev);
4159a30b71b9SAlex Elder 	if (ret)
4160a30b71b9SAlex Elder 		ret = rbd_dev_v1_probe(rbd_dev);
4161a30b71b9SAlex Elder 	else
4162a30b71b9SAlex Elder 		ret = rbd_dev_v2_probe(rbd_dev);
416383a06263SAlex Elder 	if (ret) {
4164a30b71b9SAlex Elder 		dout("probe failed, returning %d\n", ret);
4165a30b71b9SAlex Elder 
4166a30b71b9SAlex Elder 		return ret;
4167a30b71b9SAlex Elder 	}
4168a30b71b9SAlex Elder 
416983a06263SAlex Elder 	ret = rbd_dev_probe_finish(rbd_dev);
417083a06263SAlex Elder 	if (ret)
417183a06263SAlex Elder 		rbd_header_free(&rbd_dev->header);
417283a06263SAlex Elder 
417383a06263SAlex Elder 	return ret;
417483a06263SAlex Elder }
417583a06263SAlex Elder 
417659c2be1eSYehuda Sadeh static ssize_t rbd_add(struct bus_type *bus,
417759c2be1eSYehuda Sadeh 		       const char *buf,
417859c2be1eSYehuda Sadeh 		       size_t count)
4179602adf40SYehuda Sadeh {
4180cb8627c7SAlex Elder 	struct rbd_device *rbd_dev = NULL;
4181dc79b113SAlex Elder 	struct ceph_options *ceph_opts = NULL;
41824e9afebaSAlex Elder 	struct rbd_options *rbd_opts = NULL;
4183859c31dfSAlex Elder 	struct rbd_spec *spec = NULL;
41849d3997fdSAlex Elder 	struct rbd_client *rbdc;
418527cc2594SAlex Elder 	struct ceph_osd_client *osdc;
418627cc2594SAlex Elder 	int rc = -ENOMEM;
4187602adf40SYehuda Sadeh 
4188602adf40SYehuda Sadeh 	if (!try_module_get(THIS_MODULE))
4189602adf40SYehuda Sadeh 		return -ENODEV;
4190602adf40SYehuda Sadeh 
4191a725f65eSAlex Elder 	/* parse add command */
4192859c31dfSAlex Elder 	rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec);
4193dc79b113SAlex Elder 	if (rc < 0)
4194bd4ba655SAlex Elder 		goto err_out_module;
4195a725f65eSAlex Elder 
41969d3997fdSAlex Elder 	rbdc = rbd_get_client(ceph_opts);
41979d3997fdSAlex Elder 	if (IS_ERR(rbdc)) {
41989d3997fdSAlex Elder 		rc = PTR_ERR(rbdc);
41990ddebc0cSAlex Elder 		goto err_out_args;
42009d3997fdSAlex Elder 	}
4201c53d5893SAlex Elder 	ceph_opts = NULL;	/* rbd_dev client now owns this */
4202602adf40SYehuda Sadeh 
4203602adf40SYehuda Sadeh 	/* pick the pool */
42049d3997fdSAlex Elder 	osdc = &rbdc->client->osdc;
4205859c31dfSAlex Elder 	rc = ceph_pg_poolid_by_name(osdc->osdmap, spec->pool_name);
4206602adf40SYehuda Sadeh 	if (rc < 0)
4207602adf40SYehuda Sadeh 		goto err_out_client;
4208859c31dfSAlex Elder 	spec->pool_id = (u64) rc;
4209859c31dfSAlex Elder 
42100903e875SAlex Elder 	/* The ceph file layout needs to fit pool id in 32 bits */
42110903e875SAlex Elder 
42120903e875SAlex Elder 	if (WARN_ON(spec->pool_id > (u64) U32_MAX)) {
42130903e875SAlex Elder 		rc = -EIO;
42140903e875SAlex Elder 		goto err_out_client;
42150903e875SAlex Elder 	}
42160903e875SAlex Elder 
4217c53d5893SAlex Elder 	rbd_dev = rbd_dev_create(rbdc, spec);
4218bd4ba655SAlex Elder 	if (!rbd_dev)
4219bd4ba655SAlex Elder 		goto err_out_client;
4220c53d5893SAlex Elder 	rbdc = NULL;		/* rbd_dev now owns this */
4221c53d5893SAlex Elder 	spec = NULL;		/* rbd_dev now owns this */
4222602adf40SYehuda Sadeh 
4223bd4ba655SAlex Elder 	rbd_dev->mapping.read_only = rbd_opts->read_only;
4224c53d5893SAlex Elder 	kfree(rbd_opts);
4225c53d5893SAlex Elder 	rbd_opts = NULL;	/* done with this */
4226bd4ba655SAlex Elder 
4227a30b71b9SAlex Elder 	rc = rbd_dev_probe(rbd_dev);
4228a30b71b9SAlex Elder 	if (rc < 0)
4229c53d5893SAlex Elder 		goto err_out_rbd_dev;
423005fd6f6fSAlex Elder 
4231602adf40SYehuda Sadeh 	return count;
4232c53d5893SAlex Elder err_out_rbd_dev:
4233c53d5893SAlex Elder 	rbd_dev_destroy(rbd_dev);
4234bd4ba655SAlex Elder err_out_client:
42359d3997fdSAlex Elder 	rbd_put_client(rbdc);
42360ddebc0cSAlex Elder err_out_args:
423778cea76eSAlex Elder 	if (ceph_opts)
423878cea76eSAlex Elder 		ceph_destroy_options(ceph_opts);
42394e9afebaSAlex Elder 	kfree(rbd_opts);
4240859c31dfSAlex Elder 	rbd_spec_put(spec);
4241bd4ba655SAlex Elder err_out_module:
4242bd4ba655SAlex Elder 	module_put(THIS_MODULE);
424327cc2594SAlex Elder 
4244602adf40SYehuda Sadeh 	dout("Error adding device %s\n", buf);
424527cc2594SAlex Elder 
424627cc2594SAlex Elder 	return (ssize_t) rc;
4247602adf40SYehuda Sadeh }
4248602adf40SYehuda Sadeh 
4249de71a297SAlex Elder static struct rbd_device *__rbd_get_dev(unsigned long dev_id)
4250602adf40SYehuda Sadeh {
4251602adf40SYehuda Sadeh 	struct list_head *tmp;
4252602adf40SYehuda Sadeh 	struct rbd_device *rbd_dev;
4253602adf40SYehuda Sadeh 
4254e124a82fSAlex Elder 	spin_lock(&rbd_dev_list_lock);
4255602adf40SYehuda Sadeh 	list_for_each(tmp, &rbd_dev_list) {
4256602adf40SYehuda Sadeh 		rbd_dev = list_entry(tmp, struct rbd_device, node);
4257de71a297SAlex Elder 		if (rbd_dev->dev_id == dev_id) {
4258e124a82fSAlex Elder 			spin_unlock(&rbd_dev_list_lock);
4259602adf40SYehuda Sadeh 			return rbd_dev;
4260602adf40SYehuda Sadeh 		}
4261e124a82fSAlex Elder 	}
4262e124a82fSAlex Elder 	spin_unlock(&rbd_dev_list_lock);
4263602adf40SYehuda Sadeh 	return NULL;
4264602adf40SYehuda Sadeh }
4265602adf40SYehuda Sadeh 
4266dfc5606dSYehuda Sadeh static void rbd_dev_release(struct device *dev)
4267602adf40SYehuda Sadeh {
4268593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4269602adf40SYehuda Sadeh 
427059c2be1eSYehuda Sadeh 	if (rbd_dev->watch_event)
42719969ebc5SAlex Elder 		rbd_dev_header_watch_sync(rbd_dev, 0);
4272602adf40SYehuda Sadeh 
4273602adf40SYehuda Sadeh 	/* clean up and free blkdev */
4274602adf40SYehuda Sadeh 	rbd_free_disk(rbd_dev);
4275602adf40SYehuda Sadeh 	unregister_blkdev(rbd_dev->major, rbd_dev->name);
427632eec68dSAlex Elder 
42772ac4e75dSAlex Elder 	/* release allocated disk header fields */
42782ac4e75dSAlex Elder 	rbd_header_free(&rbd_dev->header);
42792ac4e75dSAlex Elder 
428032eec68dSAlex Elder 	/* done with the id, and with the rbd_dev */
4281e2839308SAlex Elder 	rbd_dev_id_put(rbd_dev);
4282c53d5893SAlex Elder 	rbd_assert(rbd_dev->rbd_client != NULL);
4283c53d5893SAlex Elder 	rbd_dev_destroy(rbd_dev);
4284602adf40SYehuda Sadeh 
4285602adf40SYehuda Sadeh 	/* release module ref */
4286602adf40SYehuda Sadeh 	module_put(THIS_MODULE);
4287602adf40SYehuda Sadeh }
4288602adf40SYehuda Sadeh 
/*
 * Remove one rbd device: drop all of its snapshots, then delete the
 * device from the rbd bus.
 */
static void __rbd_remove(struct rbd_device *rbd_dev)
{
	rbd_remove_all_snaps(rbd_dev);

	rbd_bus_del_dev(rbd_dev);
}
4294*2f82ee54SAlex Elder 
4295dfc5606dSYehuda Sadeh static ssize_t rbd_remove(struct bus_type *bus,
4296602adf40SYehuda Sadeh 			  const char *buf,
4297602adf40SYehuda Sadeh 			  size_t count)
4298602adf40SYehuda Sadeh {
4299602adf40SYehuda Sadeh 	struct rbd_device *rbd_dev = NULL;
4300602adf40SYehuda Sadeh 	int target_id, rc;
4301602adf40SYehuda Sadeh 	unsigned long ul;
4302602adf40SYehuda Sadeh 	int ret = count;
4303602adf40SYehuda Sadeh 
4304602adf40SYehuda Sadeh 	rc = strict_strtoul(buf, 10, &ul);
4305602adf40SYehuda Sadeh 	if (rc)
4306602adf40SYehuda Sadeh 		return rc;
4307602adf40SYehuda Sadeh 
4308602adf40SYehuda Sadeh 	/* convert to int; abort if we lost anything in the conversion */
4309602adf40SYehuda Sadeh 	target_id = (int) ul;
4310602adf40SYehuda Sadeh 	if (target_id != ul)
4311602adf40SYehuda Sadeh 		return -EINVAL;
4312602adf40SYehuda Sadeh 
4313602adf40SYehuda Sadeh 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
4314602adf40SYehuda Sadeh 
4315602adf40SYehuda Sadeh 	rbd_dev = __rbd_get_dev(target_id);
4316602adf40SYehuda Sadeh 	if (!rbd_dev) {
4317602adf40SYehuda Sadeh 		ret = -ENOENT;
4318602adf40SYehuda Sadeh 		goto done;
4319602adf40SYehuda Sadeh 	}
4320602adf40SYehuda Sadeh 
4321a14ea269SAlex Elder 	spin_lock_irq(&rbd_dev->lock);
4322b82d167bSAlex Elder 	if (rbd_dev->open_count)
432342382b70SAlex Elder 		ret = -EBUSY;
4324b82d167bSAlex Elder 	else
4325b82d167bSAlex Elder 		set_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags);
4326a14ea269SAlex Elder 	spin_unlock_irq(&rbd_dev->lock);
4327b82d167bSAlex Elder 	if (ret < 0)
432842382b70SAlex Elder 		goto done;
432942382b70SAlex Elder 
4330*2f82ee54SAlex Elder 	while (rbd_dev->parent_spec) {
4331*2f82ee54SAlex Elder 		struct rbd_device *first = rbd_dev;
4332*2f82ee54SAlex Elder 		struct rbd_device *second = first->parent;
4333*2f82ee54SAlex Elder 		struct rbd_device *third;
4334*2f82ee54SAlex Elder 
4335*2f82ee54SAlex Elder 		/*
4336*2f82ee54SAlex Elder 		 * Follow to the parent with no grandparent and
4337*2f82ee54SAlex Elder 		 * remove it.
4338*2f82ee54SAlex Elder 		 */
4339*2f82ee54SAlex Elder 		while (second && (third = second->parent)) {
4340*2f82ee54SAlex Elder 			first = second;
4341*2f82ee54SAlex Elder 			second = third;
4342*2f82ee54SAlex Elder 		}
4343*2f82ee54SAlex Elder 		__rbd_remove(second);
4344*2f82ee54SAlex Elder 		rbd_spec_put(first->parent_spec);
4345*2f82ee54SAlex Elder 		first->parent_spec = NULL;
4346*2f82ee54SAlex Elder 		first->parent_overlap = 0;
4347*2f82ee54SAlex Elder 		first->parent = NULL;
4348*2f82ee54SAlex Elder 	}
4349*2f82ee54SAlex Elder 	__rbd_remove(rbd_dev);
4350602adf40SYehuda Sadeh 
4351602adf40SYehuda Sadeh done:
4352602adf40SYehuda Sadeh 	mutex_unlock(&ctl_mutex);
4353aafb230eSAlex Elder 
4354602adf40SYehuda Sadeh 	return ret;
4355602adf40SYehuda Sadeh }
4356602adf40SYehuda Sadeh 
4357602adf40SYehuda Sadeh /*
4358602adf40SYehuda Sadeh  * create control files in sysfs
4359dfc5606dSYehuda Sadeh  * /sys/bus/rbd/...
4360602adf40SYehuda Sadeh  */
4361602adf40SYehuda Sadeh static int rbd_sysfs_init(void)
4362602adf40SYehuda Sadeh {
4363dfc5606dSYehuda Sadeh 	int ret;
4364602adf40SYehuda Sadeh 
4365fed4c143SAlex Elder 	ret = device_register(&rbd_root_dev);
4366dfc5606dSYehuda Sadeh 	if (ret < 0)
4367dfc5606dSYehuda Sadeh 		return ret;
4368602adf40SYehuda Sadeh 
4369fed4c143SAlex Elder 	ret = bus_register(&rbd_bus_type);
4370fed4c143SAlex Elder 	if (ret < 0)
4371fed4c143SAlex Elder 		device_unregister(&rbd_root_dev);
4372602adf40SYehuda Sadeh 
4373602adf40SYehuda Sadeh 	return ret;
4374602adf40SYehuda Sadeh }
4375602adf40SYehuda Sadeh 
4376602adf40SYehuda Sadeh static void rbd_sysfs_cleanup(void)
4377602adf40SYehuda Sadeh {
4378dfc5606dSYehuda Sadeh 	bus_unregister(&rbd_bus_type);
4379fed4c143SAlex Elder 	device_unregister(&rbd_root_dev);
4380602adf40SYehuda Sadeh }
4381602adf40SYehuda Sadeh 
4382cc344fa1SAlex Elder static int __init rbd_init(void)
4383602adf40SYehuda Sadeh {
4384602adf40SYehuda Sadeh 	int rc;
4385602adf40SYehuda Sadeh 
43861e32d34cSAlex Elder 	if (!libceph_compatible(NULL)) {
43871e32d34cSAlex Elder 		rbd_warn(NULL, "libceph incompatibility (quitting)");
43881e32d34cSAlex Elder 
43891e32d34cSAlex Elder 		return -EINVAL;
43901e32d34cSAlex Elder 	}
4391602adf40SYehuda Sadeh 	rc = rbd_sysfs_init();
4392602adf40SYehuda Sadeh 	if (rc)
4393602adf40SYehuda Sadeh 		return rc;
4394f0f8cef5SAlex Elder 	pr_info("loaded " RBD_DRV_NAME_LONG "\n");
4395602adf40SYehuda Sadeh 	return 0;
4396602adf40SYehuda Sadeh }
4397602adf40SYehuda Sadeh 
4398cc344fa1SAlex Elder static void __exit rbd_exit(void)
4399602adf40SYehuda Sadeh {
4400602adf40SYehuda Sadeh 	rbd_sysfs_cleanup();
4401602adf40SYehuda Sadeh }
4402602adf40SYehuda Sadeh 
4403602adf40SYehuda Sadeh module_init(rbd_init);
4404602adf40SYehuda Sadeh module_exit(rbd_exit);
4405602adf40SYehuda Sadeh 
4406602adf40SYehuda Sadeh MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
4407602adf40SYehuda Sadeh MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
4408602adf40SYehuda Sadeh MODULE_DESCRIPTION("rados block device");
4409602adf40SYehuda Sadeh 
4410602adf40SYehuda Sadeh /* following authorship retained from original osdblk.c */
4411602adf40SYehuda Sadeh MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");
4412602adf40SYehuda Sadeh 
4413602adf40SYehuda Sadeh MODULE_LICENSE("GPL");
4414